From f16a110f8776398ef23a2a6b7b57522c2471637a Mon Sep 17 00:00:00 2001
From: leejet <leejet714@gmail.com>
Date: Mon, 30 Mar 2026 00:19:25 +0800
Subject: [PATCH] refactor: migrate generation pipeline to sd::Tensor (#1373)

---
 examples/cli/main.cpp         |    2 +-
 src/anima.hpp                 |   37 +-
 src/auto_encoder_kl.hpp       |  261 +--
 src/cache_dit.hpp             |   52 +-
 src/clip.hpp                  |   29 +-
 src/common_dit.hpp            |   58 +-
 src/condition_cache_utils.hpp |   64 +
 src/conditioner.hpp           |  951 ++++------
 src/control.hpp               |   98 +-
 src/denoiser.hpp              | 1329 ++++---------
 src/diffusion_model.hpp       |  204 +-
 src/easycache.hpp             |   72 +-
 src/esrgan.hpp                |   19 +-
 src/flux.hpp                  |  119 +-
 src/ggml_extend.hpp           |  469 +++--
 src/latent-preview.h          |   66 +
 src/llm.hpp                   |  156 +-
 src/lora.hpp                  |    2 +-
 src/mmdit.hpp                 |   68 +-
 src/pmid.hpp                  |   43 +-
 src/preprocessing.hpp         |  362 ++--
 src/qwen_image.hpp            |   78 +-
 src/sample-cache.cpp          |  361 ++++
 src/sample-cache.h            |   61 +
 src/spectrum.hpp              |   20 +-
 src/stable-diffusion.cpp      | 3297 ++++++++++++++-------------------
 src/t5.hpp                    | 2074 +++++++++++----------
 src/tae.hpp                   |   56 +-
 src/tensor.hpp                | 1249 +++++++++++++
 src/tensor_ggml.hpp           |  127 ++
 src/tokenize_util.cpp         | 1986 ++++++++++----------
 src/ucache.hpp                |   67 +-
 src/unet.hpp                  |   97 +-
 src/upscaler.cpp              |   73 +-
 src/util.cpp                  |  194 +-
 src/util.h                    |   19 +-
 src/vae.hpp                   |  260 +--
 src/wan.hpp                   |  308 ++-
 src/z_image.hpp               |   78 +-
 39 files changed, 7768 insertions(+), 7098 deletions(-)
 create mode 100644 src/condition_cache_utils.hpp
 create mode 100644 src/sample-cache.cpp
 create mode 100644 src/sample-cache.h
 create mode 100644 src/tensor.hpp
 create mode 100644 src/tensor_ggml.hpp
diff --git a/examples/cli/main.cpp b/examples/cli/main.cpp
index f9e4928..ddb88c9 100644
--- a/examples/cli/main.cpp
+++ b/examples/cli/main.cpp
@@ -601,7 +601,7 @@ int main(int argc, const char* argv[]) {
 
     if (gen_params.end_image_path.size() > 0) {
         vae_decode_only = false;
-        if (!load_image_and_update_size(gen_params.init_image_path, end_image)) {
+        if (!load_image_and_update_size(gen_params.end_image_path, end_image)) {
             return 1;
         }
     }
diff --git a/src/anima.hpp b/src/anima.hpp
index 81dbefe..5850cc3 100644
--- a/src/anima.hpp
+++ b/src/anima.hpp
@@ -602,20 +602,19 @@ namespace Anima {
             return Rope::embed_nd(ids, bs, axis_thetas, axes_dim);
         }
 
-        ggml_cgraph* build_graph(ggml_tensor* x,
-                                 ggml_tensor* timesteps,
-                                 ggml_tensor* context,
-                                 ggml_tensor* t5_ids     = nullptr,
-                                 ggml_tensor* t5_weights = nullptr) {
+        ggml_cgraph* build_graph(const sd::Tensor<float>& x_tensor,
+                                 const sd::Tensor<float>& timesteps_tensor,
+                                 const sd::Tensor<float>& context_tensor    = {},
+                                 const sd::Tensor<int32_t>& t5_ids_tensor   = {},
+                                 const sd::Tensor<float>& t5_weights_tensor = {}) {
+            ggml_tensor* x          = make_input(x_tensor);
+            ggml_tensor* timesteps  = make_input(timesteps_tensor);
+            ggml_tensor* context    = make_optional_input(context_tensor);
+            ggml_tensor* t5_ids     = make_optional_input(t5_ids_tensor);
+            ggml_tensor* t5_weights = make_optional_input(t5_weights_tensor);
             GGML_ASSERT(x->ne[3] == 1);
             ggml_cgraph* gf = new_graph_custom(ANIMA_GRAPH_SIZE);
 
-            x          = to_backend(x);
-            timesteps  = to_backend(timesteps);
-            context    = to_backend(context);
-            t5_ids     = to_backend(t5_ids);
-            t5_weights = to_backend(t5_weights);
-
             int64_t pad_h = (net.patch_size - x->ne[1] % net.patch_size) % net.patch_size;
             int64_t pad_w = (net.patch_size - x->ne[0] % net.patch_size) % net.patch_size;
             int64_t h_pad = x->ne[1] + pad_h;
@@ -667,18 +666,16 @@ namespace Anima {
             return gf;
         }
 
-        bool compute(int n_threads,
-                     ggml_tensor* x,
-                     ggml_tensor* timesteps,
-                     ggml_tensor* context,
-                     ggml_tensor* t5_ids      = nullptr,
-                     ggml_tensor* t5_weights  = nullptr,
-                     ggml_tensor** output     = nullptr,
-                     ggml_context* output_ctx = nullptr) {
+        sd::Tensor<float> compute(int n_threads,
+                                  const sd::Tensor<float>& x,
+                                  const sd::Tensor<float>& timesteps,
+                                  const sd::Tensor<float>& context    = {},  // context/t5_ids/t5_weights are optional (default {}); build_graph wraps them with make_optional_input
+                                  const sd::Tensor<int32_t>& t5_ids   = {},
+                                  const sd::Tensor<float>& t5_weights = {}) {
             auto get_graph = [&]() -> ggml_cgraph* {
                 return build_graph(x, timesteps, context, t5_ids, t5_weights);
             };
-            return GGMLRunner::compute(get_graph, n_threads, false, output, output_ctx);
+            return restore_trailing_singleton_dims(GGMLRunner::compute<float>(get_graph, n_threads, false), x.dim());  // NOTE(review): x.dim() is the rank handed to the helper, presumably so the result keeps x's rank - confirm
         }
     };
 }  // namespace Anima
diff --git a/src/auto_encoder_kl.hpp b/src/auto_encoder_kl.hpp
index 6efdb41..039fb9d 100644
--- a/src/auto_encoder_kl.hpp
+++ b/src/auto_encoder_kl.hpp
@@ -1,4 +1,4 @@
-#ifndef __AUTO_ENCODER_KL_HPP__
+#ifndef __AUTO_ENCODER_KL_HPP__
 #define __AUTO_ENCODER_KL_HPP__
 
 #include "vae.hpp"
@@ -685,10 +685,9 @@ struct AutoEncoderKL : public VAE {
         ae.get_param_tensors(tensors, prefix);
     }
 
-    ggml_cgraph* build_graph(ggml_tensor* z, bool decode_graph) {
+    ggml_cgraph* build_graph(const sd::Tensor<float>& z_tensor, bool decode_graph) {
         ggml_cgraph* gf = ggml_new_graph(compute_ctx);
-
-        z = to_backend(z);
+        ggml_tensor* z  = make_input(z_tensor);
 
         auto runner_ctx = get_context();
 
@@ -699,184 +698,100 @@ struct AutoEncoderKL : public VAE {
         return gf;
     }
 
-    bool _compute(const int n_threads,
-                  ggml_tensor* z,
-                  bool decode_graph,
-                  ggml_tensor** output,
-                  ggml_context* output_ctx = nullptr) override {
+    sd::Tensor<float> _compute(const int n_threads,
+                               const sd::Tensor<float>& z,
+                               bool decode_graph) override {  // decode_graph is forwarded to build_graph; it must be true when decode_only is set (asserted below)
         GGML_ASSERT(!decode_only || decode_graph);
         auto get_graph = [&]() -> ggml_cgraph* {
             return build_graph(z, decode_graph);
         };
-        // ggml_set_f32(z, 0.5f);
-        // print_ggml_tensor(z);
-        return GGMLRunner::compute(get_graph, n_threads, false, output, output_ctx);
+        return restore_trailing_singleton_dims(GGMLRunner::compute<float>(get_graph, n_threads, false), z.dim());  // NOTE(review): z.dim() is the rank handed to the helper, presumably so the result keeps z's rank - confirm
     }
 
-    ggml_tensor* gaussian_latent_sample(ggml_context* work_ctx, ggml_tensor* moments, std::shared_ptr<RNG> rng) {
+    sd::Tensor<float> gaussian_latent_sample(const sd::Tensor<float>& moments, std::shared_ptr<RNG> rng) {  // returns mean + stddev * noise computed from moments
         // ldm.modules.distributions.distributions.DiagonalGaussianDistribution.sample
-        ggml_tensor* latents = ggml_new_tensor_4d(work_ctx, moments->type, moments->ne[0], moments->ne[1], moments->ne[2] / 2, moments->ne[3]);
-        ggml_tensor* noise   = ggml_dup_tensor(work_ctx, latents);
-        ggml_ext_im_set_randn_f32(noise, rng);
-        {
-            float mean   = 0;
-            float logvar = 0;
-            float value  = 0;
-            float std_   = 0;
-            for (int i = 0; i < latents->ne[3]; i++) {
-                for (int j = 0; j < latents->ne[2]; j++) {
-                    for (int k = 0; k < latents->ne[1]; k++) {
-                        for (int l = 0; l < latents->ne[0]; l++) {
-                            mean   = ggml_ext_tensor_get_f32(moments, l, k, j, i);
-                            logvar = ggml_ext_tensor_get_f32(moments, l, k, j + (int)latents->ne[2], i);
-                            logvar = std::max(-30.0f, std::min(logvar, 20.0f));
-                            std_   = std::exp(0.5f * logvar);
-                            value  = mean + std_ * ggml_ext_tensor_get_f32(noise, l, k, j, i);
-                            // printf("%d %d %d %d -> %f\n", i, j, k, l, value);
-                            ggml_ext_tensor_set_f32(latents, value, l, k, j, i);
-                        }
-                    }
-                }
-            }
-        }
+        auto chunks               = sd::ops::chunk(moments, 2, 2);  // two chunks along dim 2: [0] is the mean half, [1] the logvar half
+        const auto& mean          = chunks[0];
+        const auto& logvar        = chunks[1];
+        sd::Tensor<float> stddev  = sd::ops::exp(0.5f * sd::ops::clamp(logvar, -30.0f, 20.0f));  // stddev = exp(0.5 * logvar), logvar clamped to [-30, 20] first
+        sd::Tensor<float> noise   = sd::Tensor<float>::randn_like(mean, rng);  // random noise shaped like mean, drawn through rng
+        sd::Tensor<float> latents = mean + stddev * noise;
         return latents;
     }
 
-    ggml_tensor* vae_output_to_latents(ggml_context* work_ctx, ggml_tensor* vae_output, std::shared_ptr<RNG> rng) {
+    sd::Tensor<float> vae_output_to_latents(const sd::Tensor<float>& vae_output, std::shared_ptr<RNG> rng) override {  // flux2: pass-through; pix2pix: first chunk only; otherwise: gaussian sample
         if (sd_version_is_flux2(version)) {
             return vae_output;
         } else if (version == VERSION_SD1_PIX2PIX) {
-            return ggml_view_3d(work_ctx,
-                                vae_output,
-                                vae_output->ne[0],
-                                vae_output->ne[1],
-                                vae_output->ne[2] / 2,
-                                vae_output->nb[1],
-                                vae_output->nb[2],
-                                0);
+            return sd::ops::chunk(vae_output, 2, 2)[0];  // keep only the first of the two chunks along dim 2; rng is not used on this path
         } else {
-            return gaussian_latent_sample(work_ctx, vae_output, rng);
+            return gaussian_latent_sample(vae_output, rng);  // treat vae_output as moments and sample from them
         }
     }
 
-    void get_latents_mean_std_vec(ggml_tensor* latents, int channel_dim, std::vector<float>& latents_mean_vec, std::vector<float>& latents_std_vec) {
-        // flux2
+    std::pair<sd::Tensor<float>, sd::Tensor<float>> get_latents_mean_std(const sd::Tensor<float>& latents, int channel_dim) {  // returns {mean, std} tables; only implemented for flux2, aborts otherwise
+        GGML_ASSERT(channel_dim >= 0 && static_cast<size_t>(channel_dim) < static_cast<size_t>(latents.dim()));  // channel_dim must index an existing dimension of latents
         if (sd_version_is_flux2(version)) {
-            GGML_ASSERT(latents->ne[channel_dim] == 128);
-            latents_mean_vec = {-0.0676f, -0.0715f, -0.0753f, -0.0745f, 0.0223f, 0.0180f, 0.0142f, 0.0184f,
-                                -0.0001f, -0.0063f, -0.0002f, -0.0031f, -0.0272f, -0.0281f, -0.0276f, -0.0290f,
-                                -0.0769f, -0.0672f, -0.0902f, -0.0892f, 0.0168f, 0.0152f, 0.0079f, 0.0086f,
-                                0.0083f, 0.0015f, 0.0003f, -0.0043f, -0.0439f, -0.0419f, -0.0438f, -0.0431f,
-                                -0.0102f, -0.0132f, -0.0066f, -0.0048f, -0.0311f, -0.0306f, -0.0279f, -0.0180f,
-                                0.0030f, 0.0015f, 0.0126f, 0.0145f, 0.0347f, 0.0338f, 0.0337f, 0.0283f,
-                                0.0020f, 0.0047f, 0.0047f, 0.0050f, 0.0123f, 0.0081f, 0.0081f, 0.0146f,
-                                0.0681f, 0.0679f, 0.0767f, 0.0732f, -0.0462f, -0.0474f, -0.0392f, -0.0511f,
-                                -0.0528f, -0.0477f, -0.0470f, -0.0517f, -0.0317f, -0.0316f, -0.0345f, -0.0283f,
-                                0.0510f, 0.0445f, 0.0578f, 0.0458f, -0.0412f, -0.0458f, -0.0487f, -0.0467f,
-                                -0.0088f, -0.0106f, -0.0088f, -0.0046f, -0.0376f, -0.0432f, -0.0436f, -0.0499f,
-                                0.0118f, 0.0166f, 0.0203f, 0.0279f, 0.0113f, 0.0129f, 0.0016f, 0.0072f,
-                                -0.0118f, -0.0018f, -0.0141f, -0.0054f, -0.0091f, -0.0138f, -0.0145f, -0.0187f,
-                                0.0323f, 0.0305f, 0.0259f, 0.0300f, 0.0540f, 0.0614f, 0.0495f, 0.0590f,
-                                -0.0511f, -0.0603f, -0.0478f, -0.0524f, -0.0227f, -0.0274f, -0.0154f, -0.0255f,
-                                -0.0572f, -0.0565f, -0.0518f, -0.0496f, 0.0116f, 0.0054f, 0.0163f, 0.0104f};
-            latents_std_vec  = {
-                 1.8029f, 1.7786f, 1.7868f, 1.7837f, 1.7717f, 1.7590f, 1.7610f, 1.7479f,
-                 1.7336f, 1.7373f, 1.7340f, 1.7343f, 1.8626f, 1.8527f, 1.8629f, 1.8589f,
-                 1.7593f, 1.7526f, 1.7556f, 1.7583f, 1.7363f, 1.7400f, 1.7355f, 1.7394f,
-                 1.7342f, 1.7246f, 1.7392f, 1.7304f, 1.7551f, 1.7513f, 1.7559f, 1.7488f,
-                 1.8449f, 1.8454f, 1.8550f, 1.8535f, 1.8240f, 1.7813f, 1.7854f, 1.7945f,
-                 1.8047f, 1.7876f, 1.7695f, 1.7676f, 1.7782f, 1.7667f, 1.7925f, 1.7848f,
-                 1.7579f, 1.7407f, 1.7483f, 1.7368f, 1.7961f, 1.7998f, 1.7920f, 1.7925f,
-                 1.7780f, 1.7747f, 1.7727f, 1.7749f, 1.7526f, 1.7447f, 1.7657f, 1.7495f,
-                 1.7775f, 1.7720f, 1.7813f, 1.7813f, 1.8162f, 1.8013f, 1.8023f, 1.8033f,
-                 1.7527f, 1.7331f, 1.7563f, 1.7482f, 1.7610f, 1.7507f, 1.7681f, 1.7613f,
-                 1.7665f, 1.7545f, 1.7828f, 1.7726f, 1.7896f, 1.7999f, 1.7864f, 1.7760f,
-                 1.7613f, 1.7625f, 1.7560f, 1.7577f, 1.7783f, 1.7671f, 1.7810f, 1.7799f,
-                 1.7201f, 1.7068f, 1.7265f, 1.7091f, 1.7793f, 1.7578f, 1.7502f, 1.7455f,
-                 1.7587f, 1.7500f, 1.7525f, 1.7362f, 1.7616f, 1.7572f, 1.7444f, 1.7430f,
-                 1.7509f, 1.7610f, 1.7634f, 1.7612f, 1.7254f, 1.7135f, 1.7321f, 1.7226f,
-                 1.7664f, 1.7624f, 1.7718f, 1.7664f, 1.7457f, 1.7441f, 1.7569f, 1.7530f};
+            GGML_ASSERT(latents.shape()[channel_dim] == 128);  // the hard-coded tables below hold 128 entries each
+            std::vector<int64_t> stats_shape(static_cast<size_t>(latents.dim()), 1);  // rank of latents, every extent 1 ...
+            stats_shape[static_cast<size_t>(channel_dim)] = latents.shape()[channel_dim];  // ... except the channel axis, which keeps its size
+
+            auto mean_tensor = sd::Tensor<float>::from_vector({-0.0676f, -0.0715f, -0.0753f, -0.0745f, 0.0223f, 0.0180f, 0.0142f, 0.0184f,
+                                                               -0.0001f, -0.0063f, -0.0002f, -0.0031f, -0.0272f, -0.0281f, -0.0276f, -0.0290f,
+                                                               -0.0769f, -0.0672f, -0.0902f, -0.0892f, 0.0168f, 0.0152f, 0.0079f, 0.0086f,
+                                                               0.0083f, 0.0015f, 0.0003f, -0.0043f, -0.0439f, -0.0419f, -0.0438f, -0.0431f,
+                                                               -0.0102f, -0.0132f, -0.0066f, -0.0048f, -0.0311f, -0.0306f, -0.0279f, -0.0180f,
+                                                               0.0030f, 0.0015f, 0.0126f, 0.0145f, 0.0347f, 0.0338f, 0.0337f, 0.0283f,
+                                                               0.0020f, 0.0047f, 0.0047f, 0.0050f, 0.0123f, 0.0081f, 0.0081f, 0.0146f,
+                                                               0.0681f, 0.0679f, 0.0767f, 0.0732f, -0.0462f, -0.0474f, -0.0392f, -0.0511f,
+                                                               -0.0528f, -0.0477f, -0.0470f, -0.0517f, -0.0317f, -0.0316f, -0.0345f, -0.0283f,
+                                                               0.0510f, 0.0445f, 0.0578f, 0.0458f, -0.0412f, -0.0458f, -0.0487f, -0.0467f,
+                                                               -0.0088f, -0.0106f, -0.0088f, -0.0046f, -0.0376f, -0.0432f, -0.0436f, -0.0499f,
+                                                               0.0118f, 0.0166f, 0.0203f, 0.0279f, 0.0113f, 0.0129f, 0.0016f, 0.0072f,
+                                                               -0.0118f, -0.0018f, -0.0141f, -0.0054f, -0.0091f, -0.0138f, -0.0145f, -0.0187f,
+                                                               0.0323f, 0.0305f, 0.0259f, 0.0300f, 0.0540f, 0.0614f, 0.0495f, 0.0590f,
+                                                               -0.0511f, -0.0603f, -0.0478f, -0.0524f, -0.0227f, -0.0274f, -0.0154f, -0.0255f,
+                                                               -0.0572f, -0.0565f, -0.0518f, -0.0496f, 0.0116f, 0.0054f, 0.0163f, 0.0104f});
+            mean_tensor.reshape_(stats_shape);  // lay the 128 values out along channel_dim; presumably so they broadcast against latents - confirm
+            auto std_tensor = sd::Tensor<float>::from_vector({1.8029f, 1.7786f, 1.7868f, 1.7837f, 1.7717f, 1.7590f, 1.7610f, 1.7479f,
+                                                              1.7336f, 1.7373f, 1.7340f, 1.7343f, 1.8626f, 1.8527f, 1.8629f, 1.8589f,
+                                                              1.7593f, 1.7526f, 1.7556f, 1.7583f, 1.7363f, 1.7400f, 1.7355f, 1.7394f,
+                                                              1.7342f, 1.7246f, 1.7392f, 1.7304f, 1.7551f, 1.7513f, 1.7559f, 1.7488f,
+                                                              1.8449f, 1.8454f, 1.8550f, 1.8535f, 1.8240f, 1.7813f, 1.7854f, 1.7945f,
+                                                              1.8047f, 1.7876f, 1.7695f, 1.7676f, 1.7782f, 1.7667f, 1.7925f, 1.7848f,
+                                                              1.7579f, 1.7407f, 1.7483f, 1.7368f, 1.7961f, 1.7998f, 1.7920f, 1.7925f,
+                                                              1.7780f, 1.7747f, 1.7727f, 1.7749f, 1.7526f, 1.7447f, 1.7657f, 1.7495f,
+                                                              1.7775f, 1.7720f, 1.7813f, 1.7813f, 1.8162f, 1.8013f, 1.8023f, 1.8033f,
+                                                              1.7527f, 1.7331f, 1.7563f, 1.7482f, 1.7610f, 1.7507f, 1.7681f, 1.7613f,
+                                                              1.7665f, 1.7545f, 1.7828f, 1.7726f, 1.7896f, 1.7999f, 1.7864f, 1.7760f,
+                                                              1.7613f, 1.7625f, 1.7560f, 1.7577f, 1.7783f, 1.7671f, 1.7810f, 1.7799f,
+                                                              1.7201f, 1.7068f, 1.7265f, 1.7091f, 1.7793f, 1.7578f, 1.7502f, 1.7455f,
+                                                              1.7587f, 1.7500f, 1.7525f, 1.7362f, 1.7616f, 1.7572f, 1.7444f, 1.7430f,
+                                                              1.7509f, 1.7610f, 1.7634f, 1.7612f, 1.7254f, 1.7135f, 1.7321f, 1.7226f,
+                                                              1.7664f, 1.7624f, 1.7718f, 1.7664f, 1.7457f, 1.7441f, 1.7569f, 1.7530f});
+            std_tensor.reshape_(stats_shape);  // same layout as mean_tensor
+            return {std::move(mean_tensor), std::move(std_tensor)};  // pair order is {mean, std}
         } else {
             GGML_ABORT("unknown version %d", version);
         }
     }
 
-    ggml_tensor* diffusion_to_vae_latents(ggml_context* work_ctx, ggml_tensor* latents) {
-        ggml_tensor* vae_latents = ggml_dup(work_ctx, latents);
+    sd::Tensor<float> diffusion_to_vae_latents(const sd::Tensor<float>& latents) override {  // returns a new tensor; latents itself is not modified
         if (sd_version_is_flux2(version)) {
-            int channel_dim = 2;
-            std::vector<float> latents_mean_vec;
-            std::vector<float> latents_std_vec;
-            get_latents_mean_std_vec(latents, channel_dim, latents_mean_vec, latents_std_vec);
-
-            float mean;
-            float std_;
-            for (int i = 0; i < latents->ne[3]; i++) {
-                if (channel_dim == 3) {
-                    mean = latents_mean_vec[i];
-                    std_ = latents_std_vec[i];
-                }
-                for (int j = 0; j < latents->ne[2]; j++) {
-                    if (channel_dim == 2) {
-                        mean = latents_mean_vec[j];
-                        std_ = latents_std_vec[j];
-                    }
-                    for (int k = 0; k < latents->ne[1]; k++) {
-                        for (int l = 0; l < latents->ne[0]; l++) {
-                            float value = ggml_ext_tensor_get_f32(latents, l, k, j, i);
-                            value       = value * std_ / scale_factor + mean;
-                            ggml_ext_tensor_set_f32(vae_latents, value, l, k, j, i);
-                        }
-                    }
-                }
-            }
-        } else {
-            ggml_ext_tensor_iter(latents, [&](ggml_tensor* latents, int64_t i0, int64_t i1, int64_t i2, int64_t i3) {
-                float value = ggml_ext_tensor_get_f32(latents, i0, i1, i2, i3);
-                value       = (value / scale_factor) + shift_factor;
-                ggml_ext_tensor_set_f32(vae_latents, value, i0, i1, i2, i3);
-            });
+            int channel_dim                = 2;  // the mean/std tables are laid out along dim 2
+            auto [mean_tensor, std_tensor] = get_latents_mean_std(latents, channel_dim);
+            return (latents * std_tensor) / scale_factor + mean_tensor;  // flux2: latents * std / scale_factor + mean, using the table entry for each index along dim 2
         }
-        return vae_latents;
+        return (latents / scale_factor) + shift_factor;  // other versions: scalar scale_factor and shift_factor only
     }
 
-    ggml_tensor* vae_to_diffuison_latents(ggml_context* work_ctx, ggml_tensor* latents) {
-        ggml_tensor* diffusion_latents = ggml_dup(work_ctx, latents);
+    sd::Tensor<float> vae_to_diffusion_latents(const sd::Tensor<float>& latents) override {  // returns a new tensor; latents itself is not modified
         if (sd_version_is_flux2(version)) {
-            int channel_dim = 2;
-            std::vector<float> latents_mean_vec;
-            std::vector<float> latents_std_vec;
-            get_latents_mean_std_vec(latents, channel_dim, latents_mean_vec, latents_std_vec);
-
-            float mean;
-            float std_;
-            for (int i = 0; i < latents->ne[3]; i++) {
-                if (channel_dim == 3) {
-                    mean = latents_mean_vec[i];
-                    std_ = latents_std_vec[i];
-                }
-                for (int j = 0; j < latents->ne[2]; j++) {
-                    if (channel_dim == 2) {
-                        mean = latents_mean_vec[j];
-                        std_ = latents_std_vec[j];
-                    }
-                    for (int k = 0; k < latents->ne[1]; k++) {
-                        for (int l = 0; l < latents->ne[0]; l++) {
-                            float value = ggml_ext_tensor_get_f32(latents, l, k, j, i);
-                            value       = (value - mean) * scale_factor / std_;
-                            ggml_ext_tensor_set_f32(diffusion_latents, value, l, k, j, i);
-                        }
-                    }
-                }
-            }
-        } else {
-            ggml_ext_tensor_iter(latents, [&](ggml_tensor* latents, int64_t i0, int64_t i1, int64_t i2, int64_t i3) {
-                float value = ggml_ext_tensor_get_f32(latents, i0, i1, i2, i3);
-                value       = (value - shift_factor) * scale_factor;
-                ggml_ext_tensor_set_f32(diffusion_latents, value, i0, i1, i2, i3);
-            });
+            int channel_dim                = 2;  // the mean/std tables are laid out along dim 2
+            auto [mean_tensor, std_tensor] = get_latents_mean_std(latents, channel_dim);
+            return ((latents - mean_tensor) * scale_factor) / std_tensor;  // flux2: (latents - mean) * scale_factor / std, using the table entry for each index along dim 2
         }
-        return diffusion_latents;
+        return (latents - shift_factor) * scale_factor;  // other versions: scalar shift_factor and scale_factor only
     }
 
     int get_encoder_output_channels(int input_channels) {
@@ -889,24 +804,26 @@ struct AutoEncoderKL : public VAE {
         params.mem_buffer = nullptr;
         params.no_alloc   = false;
 
-        ggml_context* work_ctx = ggml_init(params);
-        GGML_ASSERT(work_ctx != nullptr);
+        ggml_context* ctx = ggml_init(params);
+        GGML_ASSERT(ctx != nullptr);
 
         {
             // CPU, x{1, 3, 64, 64}: Pass
             // CUDA, x{1, 3, 64, 64}: Pass, but sill get wrong result for some image, may be due to interlnal nan
             // CPU, x{2, 3, 64, 64}: Wrong result
             // CUDA, x{2, 3, 64, 64}: Wrong result, and different from CPU result
-            auto x = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, 64, 64, 3, 2);
-            ggml_set_f32(x, 0.5f);
-            print_ggml_tensor(x);
-            ggml_tensor* out = nullptr;
+            sd::Tensor<float> x({64, 64, 3, 2});
+            x.fill_(0.5f);
+            print_sd_tensor(x);
+            sd::Tensor<float> out;
 
-            int64_t t0 = ggml_time_ms();
-            _compute(8, x, false, &out, work_ctx);
-            int64_t t1 = ggml_time_ms();
+            int64_t t0   = ggml_time_ms();
+            auto out_opt = _compute(8, x, false);
+            int64_t t1   = ggml_time_ms();
 
-            print_ggml_tensor(out);
+            GGML_ASSERT(!out_opt.empty());
+            out = std::move(out_opt);
+            print_sd_tensor(out);
             LOG_DEBUG("encode test done in %lldms", t1 - t0);
         }
 
@@ -915,16 +832,18 @@ struct AutoEncoderKL : public VAE {
             // CUDA, z{1, 4, 8, 8}: Pass
             // CPU, z{3, 4, 8, 8}: Wrong result
             // CUDA, z{3, 4, 8, 8}: Wrong result, and different from CPU result
-            auto z = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, 8, 8, 4, 1);
-            ggml_set_f32(z, 0.5f);
-            print_ggml_tensor(z);
-            ggml_tensor* out = nullptr;
+            sd::Tensor<float> z({8, 8, 4, 1});
+            z.fill_(0.5f);
+            print_sd_tensor(z);
+            sd::Tensor<float> out;
 
-            int64_t t0 = ggml_time_ms();
-            _compute(8, z, true, &out, work_ctx);
-            int64_t t1 = ggml_time_ms();
+            int64_t t0   = ggml_time_ms();
+            auto out_opt = _compute(8, z, true);
+            int64_t t1   = ggml_time_ms();
 
-            print_ggml_tensor(out);
+            GGML_ASSERT(!out_opt.empty());
+            out = std::move(out_opt);
+            print_sd_tensor(out);
             LOG_DEBUG("decode test done in %lldms", t1 - t0);
         }
     };
diff --git a/src/cache_dit.hpp b/src/cache_dit.hpp
index 9af627f..dad67d4 100644
--- a/src/cache_dit.hpp
+++ b/src/cache_dit.hpp
@@ -8,7 +8,9 @@
 #include <unordered_map>
 #include <vector>
 
+#include "condition_cache_utils.hpp"
 #include "ggml_extend.hpp"
+#include "tensor.hpp"
 
 struct DBCacheConfig {
     bool enabled                        = false;
@@ -771,35 +773,37 @@ struct CacheDitConditionState {
         return it != cache_diffs.end() && !it->second.diff.empty();
     }
 
-    void update_cache(const void* cond, const float* input, const float* output, size_t size) {
+    void update_cache(const void* cond, const sd::Tensor<float>& input, const sd::Tensor<float>& output) {  // stores the diff for cond and a copy of this call's input/output
         CacheEntry& entry = cache_diffs[cond];
-        entry.diff.resize(size);
-        for (size_t i = 0; i < size; i++) {
-            entry.diff[i] = output[i] - input[i];
+        if (!sd::store_condition_cache_diff(&entry.diff, input, output)) {  // helper returned false: drop the previous-step snapshot and bail out
+            entry.prev_input.clear();
+            entry.prev_output.clear();
+            entry.has_prev = false;
+            return;
         }
 
+        size_t size              = static_cast<size_t>(output.numel());  // NOTE(review): count comes from output but also bounds the reads from input - assumes equal numel, confirm the helper enforces it
+        const float* input_data  = input.data();
+        const float* output_data = output.data();
         entry.prev_input.resize(size);
         entry.prev_output.resize(size);
         for (size_t i = 0; i < size; i++) {
-            entry.prev_input[i]  = input[i];
-            entry.prev_output[i] = output[i];
+            entry.prev_input[i]  = input_data[i];
+            entry.prev_output[i] = output_data[i];
         }
         entry.has_prev = true;
     }
 
-    void apply_cache(const void* cond, const float* input, float* output, size_t size) {
+    void apply_cache(const void* cond,  // no-op when cond has no entry or its stored diff is empty
+                     const sd::Tensor<float>& input,
+                     sd::Tensor<float>* output) {
         auto it = cache_diffs.find(cond);
         if (it == cache_diffs.end() || it->second.diff.empty())
             return;
-        if (it->second.diff.size() != size)
-            return;
-
-        for (size_t i = 0; i < size; i++) {
-            output[i] = input[i] + it->second.diff[i];
-        }
+        sd::apply_condition_cache_diff(it->second.diff, input, output);  // NOTE(review): no size check at this call site; presumably the helper validates diff against input - confirm
     }
 
-    bool before_condition(const void* cond, ggml_tensor* input, ggml_tensor* output, float sigma, int step_index) {
+    bool before_condition(const void* cond, const sd::Tensor<float>& input, sd::Tensor<float>* output, float sigma, int step_index) {
         if (!enabled() || step_index < 0)
             return false;
 
@@ -819,8 +823,7 @@ struct CacheDitConditionState {
 
         if (skip_current_step) {
             if (has_cache(cond)) {
-                apply_cache(cond, (float*)input->data, (float*)output->data,
-                            static_cast<size_t>(ggml_nelements(output)));
+                apply_cache(cond, input, output);
                 return true;
             }
             return false;
@@ -833,13 +836,13 @@ struct CacheDitConditionState {
         if (it == cache_diffs.end() || !it->second.has_prev)
             return false;
 
-        size_t ne = static_cast<size_t>(ggml_nelements(input));
+        size_t ne = static_cast<size_t>(input.numel());
         if (it->second.prev_input.size() != ne)
             return false;
 
-        float* input_data = (float*)input->data;
-        float diff        = CacheDitState::calculate_residual_diff(
-                   it->second.prev_input.data(), input_data, ne);
+        const float* input_data = input.data();
+        float diff              = CacheDitState::calculate_residual_diff(
+                         it->second.prev_input.data(), input_data, ne);
 
         float effective_threshold = config.residual_diff_threshold;
         if (config.Fn_compute_blocks > 0) {
@@ -859,7 +862,7 @@ struct CacheDitConditionState {
             cached_steps.push_back(current_step_index);
             continuous_cached_steps++;
             accumulated_residual_diff += diff;
-            apply_cache(cond, input_data, (float*)output->data, ne);
+            apply_cache(cond, input, output);
             return true;
         }
 
@@ -867,15 +870,14 @@ struct CacheDitConditionState {
         return false;
     }
 
-    void after_condition(const void* cond, ggml_tensor* input, ggml_tensor* output) {
+    void after_condition(const void* cond, const sd::Tensor<float>& input, const sd::Tensor<float>& output) {  // does nothing unless step_is_active()
         if (!step_is_active())
             return;
 
-        size_t ne = static_cast<size_t>(ggml_nelements(output));
-        update_cache(cond, (float*)input->data, (float*)output->data, ne);
+        update_cache(cond, input, output);  // refresh the stored diff and input/output snapshot for cond
 
         if (cond == anchor_condition && taylor_config.enabled) {
-            taylor_state.update_derivatives((float*)output->data, ne, current_step_index);
+            taylor_state.update_derivatives(output.data(), static_cast<size_t>(output.numel()), current_step_index);  // only the anchor condition's output is passed to the Taylor state
         }
     }
 
diff --git a/src/clip.hpp b/src/clip.hpp
index f4e5ef7..8f2ac06 100644
--- a/src/clip.hpp
+++ b/src/clip.hpp
@@ -957,15 +957,14 @@ struct CLIPTextModelRunner : public GGMLRunner {
         return model.forward(ctx, input_ids, embeddings, mask, max_token_idx, return_pooled, clip_skip);
     }
 
-    ggml_cgraph* build_graph(ggml_tensor* input_ids,
+    ggml_cgraph* build_graph(const sd::Tensor<int32_t>& input_ids_tensor,
                              int num_custom_embeddings    = 0,
                              void* custom_embeddings_data = nullptr,
                              size_t max_token_idx         = 0,
                              bool return_pooled           = false,
                              int clip_skip                = -1) {
-        ggml_cgraph* gf = new_graph_custom(2048);
-
-        input_ids = to_backend(input_ids);
+        ggml_cgraph* gf        = new_graph_custom(2048);
+        ggml_tensor* input_ids = make_input(input_ids_tensor);
 
         ggml_tensor* embeddings = nullptr;
 
@@ -1004,19 +1003,21 @@ struct CLIPTextModelRunner : public GGMLRunner {
         return gf;
     }
 
-    bool compute(const int n_threads,
-                 ggml_tensor* input_ids,
-                 int num_custom_embeddings,
-                 void* custom_embeddings_data,
-                 size_t max_token_idx,
-                 bool return_pooled,
-                 int clip_skip,
-                 ggml_tensor** output,
-                 ggml_context* output_ctx = nullptr) {
+    sd::Tensor<float> compute(const int n_threads,
+                              const sd::Tensor<int32_t>& input_ids,
+                              int num_custom_embeddings,
+                              void* custom_embeddings_data,
+                              size_t max_token_idx,
+                              bool return_pooled,
+                              int clip_skip) {
         auto get_graph = [&]() -> ggml_cgraph* {
             return build_graph(input_ids, num_custom_embeddings, custom_embeddings_data, max_token_idx, return_pooled, clip_skip);
         };
-        return GGMLRunner::compute(get_graph, n_threads, true, output, output_ctx);
+        auto result = GGMLRunner::compute<float>(get_graph, n_threads, true);
+        if (return_pooled) {
+            return take_or_empty(std::move(result));
+        }
+        return restore_trailing_singleton_dims(std::move(result), 3);
     }
 };
 
diff --git a/src/common_dit.hpp b/src/common_dit.hpp
index 0e6f0f0..30141d4 100644
--- a/src/common_dit.hpp
+++ b/src/common_dit.hpp
@@ -4,11 +4,11 @@
 #include "ggml_extend.hpp"
 
 namespace DiT {
-    ggml_tensor* patchify(ggml_context* ctx,
-                          ggml_tensor* x,
-                          int pw,
-                          int ph,
-                          bool patch_last = true) {
+    inline ggml_tensor* patchify(ggml_context* ctx,
+                                 ggml_tensor* x,
+                                 int pw,
+                                 int ph,
+                                 bool patch_last = true) {
         // x: [N, C, H, W]
         // return: [N, h*w, C*ph*pw] if patch_last else [N, h*w, ph*pw*C]
         int64_t N = x->ne[3];
@@ -33,13 +33,13 @@ namespace DiT {
         return x;
     }
 
-    ggml_tensor* unpatchify(ggml_context* ctx,
-                            ggml_tensor* x,
-                            int64_t h,
-                            int64_t w,
-                            int ph,
-                            int pw,
-                            bool patch_last = true) {
+    inline ggml_tensor* unpatchify(ggml_context* ctx,
+                                   ggml_tensor* x,
+                                   int64_t h,
+                                   int64_t w,
+                                   int ph,
+                                   int pw,
+                                   bool patch_last = true) {
         // x: [N, h*w, C*ph*pw] if patch_last else [N, h*w, ph*pw*C]
         // return: [N, C, H, W]
         int64_t N = x->ne[2];
@@ -64,10 +64,10 @@ namespace DiT {
         return x;
     }
 
-    ggml_tensor* pad_to_patch_size(GGMLRunnerContext* ctx,
-                                   ggml_tensor* x,
-                                   int ph,
-                                   int pw) {
+    inline ggml_tensor* pad_to_patch_size(GGMLRunnerContext* ctx,
+                                          ggml_tensor* x,
+                                          int ph,
+                                          int pw) {
         int64_t W = x->ne[0];
         int64_t H = x->ne[1];
 
@@ -77,23 +77,23 @@ namespace DiT {
         return x;
     }
 
-    ggml_tensor* pad_and_patchify(GGMLRunnerContext* ctx,
-                                  ggml_tensor* x,
-                                  int ph,
-                                  int pw,
-                                  bool patch_last = true) {
+    inline ggml_tensor* pad_and_patchify(GGMLRunnerContext* ctx,
+                                         ggml_tensor* x,
+                                         int ph,
+                                         int pw,
+                                         bool patch_last = true) {
         x = pad_to_patch_size(ctx, x, ph, pw);
         x = patchify(ctx->ggml_ctx, x, ph, pw, patch_last);
         return x;
     }
 
-    ggml_tensor* unpatchify_and_crop(ggml_context* ctx,
-                                     ggml_tensor* x,
-                                     int64_t H,
-                                     int64_t W,
-                                     int ph,
-                                     int pw,
-                                     bool patch_last = true) {
+    inline ggml_tensor* unpatchify_and_crop(ggml_context* ctx,
+                                            ggml_tensor* x,
+                                            int64_t H,
+                                            int64_t W,
+                                            int ph,
+                                            int pw,
+                                            bool patch_last = true) {
         int pad_h = (ph - H % ph) % ph;
         int pad_w = (pw - W % pw) % pw;
         int64_t h = ((H + pad_h) / ph);
@@ -105,4 +105,4 @@ namespace DiT {
     }
 }  // namespace DiT
 
-#endif  // __COMMON_DIT_HPP__
\ No newline at end of file
+#endif  // __COMMON_DIT_HPP__
diff --git a/src/condition_cache_utils.hpp b/src/condition_cache_utils.hpp
new file mode 100644
index 0000000..903d64e
--- /dev/null
+++ b/src/condition_cache_utils.hpp
@@ -0,0 +1,64 @@
+#ifndef __CONDITION_CACHE_UTILS_HPP__
+#define __CONDITION_CACHE_UTILS_HPP__
+
+#include <vector>
+
+#include "tensor.hpp"
+
+namespace sd {
+
+    // Stores the element-wise delta (output - input) in *diff and returns true.
+    // Returns false when diff is null, or when the tensors are empty, differ in
+    // element count or expose no data; in those cases a non-null *diff is cleared
+    // so a delta from an earlier call cannot be reused by mistake.
+    inline bool store_condition_cache_diff(std::vector<float>* diff,
+                                           const sd::Tensor<float>& input,
+                                           const sd::Tensor<float>& output) {
+        if (diff == nullptr) {
+            return false;
+        }
+
+        const bool sizes_match = !input.empty() && !output.empty() &&
+                                 input.numel() > 0 && input.numel() == output.numel();
+        const float* input_data  = sizes_match ? input.data() : nullptr;
+        const float* output_data = sizes_match ? output.data() : nullptr;
+        if (input_data == nullptr || output_data == nullptr) {
+            diff->clear();
+            return false;
+        }
+
+        const size_t count = static_cast<size_t>(output.numel());
+        diff->resize(count);
+        for (size_t i = 0; i < count; ++i) {
+            (*diff)[i] = output_data[i] - input_data[i];
+        }
+        return true;
+    }
+
+    // Sets *output to input + diff (element-wise) and returns true.
+    // Returns false when output is null, input or diff is empty, or diff.size()
+    // differs from input.numel().
+    inline bool apply_condition_cache_diff(const std::vector<float>& diff,
+                                           const sd::Tensor<float>& input,
+                                           sd::Tensor<float>* output) {
+        if (output == nullptr || input.empty() || diff.empty()) {
+            return false;
+        }
+        const size_t input_size = static_cast<size_t>(input.numel());
+        if (input_size == 0 || diff.size() != input_size) {
+            return false;
+        }
+        *output            = input;
+        float* output_data = output->data();
+        if (output_data == nullptr) {
+            return false;
+        }
+        for (size_t i = 0; i < input_size; ++i) {
+            output_data[i] += diff[i];
+        }
+        return true;
+    }
+
+}  // namespace sd
+
+#endif  // __CONDITION_CACHE_UTILS_HPP__
diff --git a/src/conditioner.hpp b/src/conditioner.hpp
index 534a2f1..05167cf 100644
--- a/src/conditioner.hpp
+++ b/src/conditioner.hpp
@@ -1,39 +1,85 @@
 #ifndef __CONDITIONER_HPP__
 #define __CONDITIONER_HPP__
 
+#include <optional>
+
 #include "clip.hpp"
 #include "llm.hpp"
 #include "t5.hpp"
+#include "tensor_ggml.hpp"
 
 struct SDCondition {
-    ggml_tensor* c_crossattn = nullptr;  // aka context
-    ggml_tensor* c_vector    = nullptr;  // aka y
-    ggml_tensor* c_concat    = nullptr;
+    sd::Tensor<float> c_crossattn;
+    sd::Tensor<float> c_vector;
+    sd::Tensor<float> c_concat;
+    sd::Tensor<int32_t> c_t5_ids;
+    sd::Tensor<float> c_t5_weights;
 
-    std::vector<ggml_tensor*> extra_c_crossattns;
+    std::vector<sd::Tensor<float>> extra_c_crossattns;
 
     SDCondition() = default;
-    SDCondition(ggml_tensor* c_crossattn,
-                ggml_tensor* c_vector,
-                ggml_tensor* c_concat,
-                const std::vector<ggml_tensor*>& extra_c_crossattns = {})
-        : c_crossattn(c_crossattn), c_vector(c_vector), c_concat(c_concat), extra_c_crossattns(extra_c_crossattns) {}
+
+    SDCondition(sd::Tensor<float> c_crossattn,
+                sd::Tensor<float> c_vector,
+                sd::Tensor<float> c_concat)
+        : c_crossattn(std::move(c_crossattn)), c_vector(std::move(c_vector)), c_concat(std::move(c_concat)) {}
+
+    // Reports whether every tensor held by this condition is empty: the three
+    // c_* tensors, both T5 tensors and each entry of extra_c_crossattns.
+    bool empty() const {
+        const bool has_core = !c_crossattn.empty() || !c_vector.empty() || !c_concat.empty();
+        const bool has_t5   = !c_t5_ids.empty() || !c_t5_weights.empty();
+        if (has_core || has_t5) {
+            return false;
+        }
+        bool extras_empty = true;
+        for (size_t i = 0; extras_empty && i < extra_c_crossattns.size(); ++i) {
+            extras_empty = extra_c_crossattns[i].empty();
+        }
+        return extras_empty;
+    }
 };
 
+// Multiplies hidden_states by the per-token weights (reshaped to
+// {1, weights.size()}) and then rescales the result so its overall mean equals
+// the mean before weighting. An empty tensor is returned unchanged.
+static inline sd::Tensor<float> apply_token_weights(sd::Tensor<float> hidden_states,
+                                                    const std::vector<float>& weights) {
+    if (hidden_states.empty()) {
+        return hidden_states;
+    }
+    if (hidden_states.dim() == 1) {
+        hidden_states.unsqueeze_(1);  // a 1-D tensor gets the axis the assert below inspects
+    }
+    GGML_ASSERT(static_cast<size_t>(hidden_states.shape()[1]) == weights.size());
+
+    const float mean_before = hidden_states.mean();
+    auto weight_row         = sd::Tensor<float>::from_vector(weights);
+    weight_row.reshape_({1, static_cast<int64_t>(weights.size())});
+    hidden_states *= weight_row;
+    const float mean_after = hidden_states.mean();
+    if (mean_after != 0.0f) {  // skip the rescale rather than divide by zero
+        hidden_states *= (mean_before / mean_after);
+    }
+    return hidden_states;
+}
+
 struct ConditionerParams {
     std::string text;
-    int clip_skip                       = -1;
-    int width                           = -1;
-    int height                          = -1;
-    int adm_in_channels                 = -1;
-    bool zero_out_masked                = false;
-    int num_input_imgs                  = 0;   // for photomaker
-    std::vector<sd_image_t*> ref_images = {};  // for qwen image edit
+    int clip_skip                                    = -1;
+    int width                                        = -1;
+    int height                                       = -1;
+    int adm_in_channels                              = -1;
+    bool zero_out_masked                             = false;
+    int num_input_imgs                               = 0;        // for photomaker
+    const std::vector<sd::Tensor<float>>* ref_images = nullptr;  // for qwen image edit
 };
 
 struct Conditioner {
-    virtual SDCondition get_learned_condition(ggml_context* work_ctx,
-                                              int n_threads,
+    virtual ~Conditioner() = default;
+
+public:
+    virtual SDCondition get_learned_condition(int n_threads,
                                               const ConditionerParams& conditioner_params) = 0;
     virtual void alloc_params_buffer()                                                     = 0;
     virtual void free_params_buffer()                                                      = 0;
@@ -41,13 +87,11 @@ struct Conditioner {
     virtual size_t get_params_buffer_size()                                                = 0;
     virtual void set_flash_attention_enabled(bool enabled)                                 = 0;
     virtual void set_weight_adapter(const std::shared_ptr<WeightAdapter>& adapter) {}
-    virtual std::tuple<SDCondition, std::vector<bool>> get_learned_condition_with_trigger(ggml_context* work_ctx,
-                                                                                          int n_threads,
+    virtual std::tuple<SDCondition, std::vector<bool>> get_learned_condition_with_trigger(int n_threads,
                                                                                           const ConditionerParams& conditioner_params) {
         GGML_ABORT("Not implemented yet!");
     }
-    virtual std::string remove_trigger_from_prompt(ggml_context* work_ctx,
-                                                   const std::string& prompt) {
+    virtual std::string remove_trigger_from_prompt(const std::string& prompt) {
         GGML_ABORT("Not implemented yet!");
     }
 };
@@ -426,8 +470,7 @@ struct FrozenCLIPEmbedderWithCustomWords : public Conditioner {
         return {tokens, weights};
     }
 
-    SDCondition get_learned_condition_common(ggml_context* work_ctx,
-                                             int n_threads,
+    SDCondition get_learned_condition_common(int n_threads,
                                              std::vector<int>& tokens,
                                              std::vector<float>& weights,
                                              int clip_skip,
@@ -435,13 +478,9 @@ struct FrozenCLIPEmbedderWithCustomWords : public Conditioner {
                                              int height,
                                              int adm_in_channels  = -1,
                                              bool zero_out_masked = false) {
-        int64_t t0                        = ggml_time_ms();
-        ggml_tensor* hidden_states        = nullptr;  // [N, n_token, hidden_size]
-        ggml_tensor* chunk_hidden_states  = nullptr;  // [n_token, hidden_size] or [n_token, hidden_size + hidden_size2]
-        ggml_tensor* chunk_hidden_states1 = nullptr;  // [n_token, hidden_size]
-        ggml_tensor* chunk_hidden_states2 = nullptr;  // [n_token, hidden_size2]
-        ggml_tensor* pooled               = nullptr;
-        std::vector<float> hidden_states_vec;
+        int64_t t0 = ggml_time_ms();
+        sd::Tensor<float> hidden_states;  // [n_token, hidden_size] or [n_token, hidden_size + hidden_size2]
+        sd::Tensor<float> pooled;
 
         if (clip_skip <= 0) {
             clip_skip = (sd_version_is_sd2(version) || sd_version_is_sdxl(version)) ? 2 : 1;
@@ -455,9 +494,9 @@ struct FrozenCLIPEmbedderWithCustomWords : public Conditioner {
             std::vector<float> chunk_weights(weights.begin() + chunk_idx * chunk_len,
                                              weights.begin() + (chunk_idx + 1) * chunk_len);
 
-            auto input_ids          = vector_to_ggml_tensor_i32(work_ctx, chunk_tokens);
-            ggml_tensor* input_ids2 = nullptr;
-            size_t max_token_idx    = 0;
+            sd::Tensor<int32_t> input_ids({static_cast<int64_t>(chunk_tokens.size())}, chunk_tokens);
+            sd::Tensor<int32_t> input_ids2;
+            size_t max_token_idx = 0;
             if (sd_version_is_sdxl(version)) {
                 auto it = std::find(chunk_tokens.begin(), chunk_tokens.end(), tokenizer.EOS_TOKEN_ID);
                 if (it != chunk_tokens.end()) {
@@ -466,7 +505,7 @@ struct FrozenCLIPEmbedderWithCustomWords : public Conditioner {
 
                 max_token_idx = std::min<size_t>(std::distance(chunk_tokens.begin(), it), chunk_tokens.size() - 1);
 
-                input_ids2 = vector_to_ggml_tensor_i32(work_ctx, chunk_tokens);
+                input_ids2 = sd::Tensor<int32_t>({static_cast<int64_t>(chunk_tokens.size())}, chunk_tokens);
 
                 // for (int i = 0; i < chunk_tokens.size(); i++) {
                 //     printf("%d ", chunk_tokens[i]);
@@ -475,118 +514,87 @@ struct FrozenCLIPEmbedderWithCustomWords : public Conditioner {
             }
 
             {
-                text_model->compute(n_threads,
-                                    input_ids,
-                                    num_custom_embeddings,
-                                    token_embed_custom.data(),
-                                    max_token_idx,
-                                    false,
-                                    clip_skip,
-                                    &chunk_hidden_states1,
-                                    work_ctx);
+                auto chunk_hidden_states = text_model->compute(n_threads,
+                                                               input_ids,
+                                                               num_custom_embeddings,
+                                                               token_embed_custom.data(),
+                                                               max_token_idx,
+                                                               false,
+                                                               clip_skip);
+                GGML_ASSERT(!chunk_hidden_states.empty());
                 if (sd_version_is_sdxl(version)) {
-                    text_model2->compute(n_threads,
-                                         input_ids2,
-                                         num_custom_embeddings,
-                                         token_embed_custom.data(),
-                                         max_token_idx,
-                                         false,
-                                         clip_skip,
-                                         &chunk_hidden_states2, work_ctx);
-                    // concat
-                    chunk_hidden_states = ggml_ext_tensor_concat(work_ctx, chunk_hidden_states1, chunk_hidden_states2, 0);
+                    auto chunk_hidden_states2 = text_model2->compute(n_threads,
+                                                                     input_ids2,
+                                                                     num_custom_embeddings,
+                                                                     token_embed_custom.data(),
+                                                                     max_token_idx,
+                                                                     false,
+                                                                     clip_skip);
+                    GGML_ASSERT(!chunk_hidden_states2.empty());
+                    chunk_hidden_states = sd::ops::concat(chunk_hidden_states, chunk_hidden_states2, 0);
 
                     if (chunk_idx == 0) {
-                        text_model2->compute(n_threads,
-                                             input_ids2,
-                                             num_custom_embeddings,
-                                             token_embed_custom.data(),
-                                             max_token_idx,
-                                             true,
-                                             clip_skip,
-                                             &pooled,
-                                             work_ctx);
+                        pooled = text_model2->compute(n_threads,
+                                                      input_ids2,
+                                                      num_custom_embeddings,
+                                                      token_embed_custom.data(),
+                                                      max_token_idx,
+                                                      true,
+                                                      clip_skip);
+                        GGML_ASSERT(!pooled.empty());
                     }
-                } else {
-                    chunk_hidden_states = chunk_hidden_states1;
                 }
-            }
+                int64_t t1 = ggml_time_ms();
+                LOG_DEBUG("computing condition graph completed, taking %" PRId64 " ms", t1 - t0);
 
-            int64_t t1 = ggml_time_ms();
-            LOG_DEBUG("computing condition graph completed, taking %" PRId64 " ms", t1 - t0);
-            ggml_tensor* result = ggml_dup_tensor(work_ctx, chunk_hidden_states);
-            {
-                float original_mean = ggml_ext_tensor_mean(chunk_hidden_states);
-                for (int i2 = 0; i2 < chunk_hidden_states->ne[2]; i2++) {
-                    for (int i1 = 0; i1 < chunk_hidden_states->ne[1]; i1++) {
-                        for (int i0 = 0; i0 < chunk_hidden_states->ne[0]; i0++) {
-                            float value = ggml_ext_tensor_get_f32(chunk_hidden_states, i0, i1, i2);
-                            value *= chunk_weights[i1];
-                            ggml_ext_tensor_set_f32(result, value, i0, i1, i2);
-                        }
-                    }
+                chunk_hidden_states = apply_token_weights(std::move(chunk_hidden_states), chunk_weights);
+
+                if (zero_out_masked) {
+                    chunk_hidden_states.fill_(0.0f);
                 }
-                float new_mean = ggml_ext_tensor_mean(result);
-                ggml_ext_tensor_scale_inplace(result, (original_mean / new_mean));
-            }
-            if (zero_out_masked) {
-                float* vec = (float*)result->data;
-                for (int i = 0; i < ggml_nelements(result); i++) {
-                    vec[i] = 0;
+                if (!hidden_states.empty()) {
+                    hidden_states = sd::ops::concat(hidden_states, chunk_hidden_states, 1);
+                } else {
+                    hidden_states = std::move(chunk_hidden_states);
                 }
             }
-            hidden_states_vec.insert(hidden_states_vec.end(), (float*)result->data, ((float*)result->data) + ggml_nelements(result));
         }
 
-        hidden_states = vector_to_ggml_tensor(work_ctx, hidden_states_vec);
-        hidden_states = ggml_reshape_2d(work_ctx,
-                                        hidden_states,
-                                        chunk_hidden_states->ne[0],
-                                        ggml_nelements(hidden_states) / chunk_hidden_states->ne[0]);
-
-        ggml_tensor* vec = nullptr;
+        sd::Tensor<float> vec;
         if (sd_version_is_sdxl(version)) {
             int out_dim = 256;
-            vec         = ggml_new_tensor_1d(work_ctx, GGML_TYPE_F32, adm_in_channels);
-            // [0:1280]
+            GGML_ASSERT(!pooled.empty());
+            vec = sd::Tensor<float>({adm_in_channels});
+            vec.fill_(0.0f);
             size_t offset = 0;
-            memcpy(vec->data, pooled->data, ggml_nbytes(pooled));
-            offset += ggml_nbytes(pooled);
+            std::copy(pooled.values().begin(), pooled.values().end(), vec.values().begin());
+            offset += pooled.values().size();
 
-            // original_size_as_tuple
-            float orig_width             = (float)width;
-            float orig_height            = (float)height;
-            std::vector<float> timesteps = {orig_height, orig_width};
+            auto append_embedding = [&](const std::vector<float>& timesteps) {
+                sd::Tensor<float> embedding;
+                set_timestep_embedding(timesteps, &embedding, out_dim);
+                std::copy(embedding.values().begin(), embedding.values().end(), vec.values().begin() + static_cast<int64_t>(offset));
+                offset += embedding.values().size();
+            };
 
-            ggml_tensor* embed_view = ggml_view_2d(work_ctx, vec, out_dim, 2, ggml_type_size(GGML_TYPE_F32) * out_dim, offset);
-            offset += ggml_nbytes(embed_view);
-            set_timestep_embedding(timesteps, embed_view, out_dim);
-            // print_ggml_tensor(ggml_reshape_1d(work_ctx, embed_view, out_dim * 2));
-            // crop_coords_top_left
-            float crop_coord_top  = 0.f;
-            float crop_coord_left = 0.f;
-            timesteps             = {crop_coord_top, crop_coord_left};
-            embed_view            = ggml_view_2d(work_ctx, vec, out_dim, 2, ggml_type_size(GGML_TYPE_F32) * out_dim, offset);
-            offset += ggml_nbytes(embed_view);
-            set_timestep_embedding(timesteps, embed_view, out_dim);
-            // print_ggml_tensor(ggml_reshape_1d(work_ctx, embed_view, out_dim * 2));
-            // target_size_as_tuple
-            float target_width  = (float)width;
-            float target_height = (float)height;
-            timesteps           = {target_height, target_width};
-            embed_view          = ggml_view_2d(work_ctx, vec, out_dim, 2, ggml_type_size(GGML_TYPE_F32) * out_dim, offset);
-            offset += ggml_nbytes(embed_view);
-            set_timestep_embedding(timesteps, embed_view, out_dim);
-            // print_ggml_tensor(ggml_reshape_1d(work_ctx, embed_view, out_dim * 2));
-            GGML_ASSERT(offset == ggml_nbytes(vec));
+            append_embedding({static_cast<float>(height), static_cast<float>(width)});
+            append_embedding({0.0f, 0.0f});
+            append_embedding({static_cast<float>(height), static_cast<float>(width)});
+            GGML_ASSERT(offset == vec.values().size());
         }
-        // print_ggml_tensor(result);
-        return {hidden_states, vec, nullptr};
+        SDCondition result;
+        if (!hidden_states.empty()) {
+            result.c_crossattn = std::move(hidden_states);
+        }
+
+        if (!vec.empty()) {
+            result.c_vector = std::move(vec);
+        }
+        return result;
     }
 
     std::tuple<SDCondition, std::vector<bool>>
-    get_learned_condition_with_trigger(ggml_context* work_ctx,
-                                       int n_threads,
+    get_learned_condition_with_trigger(int n_threads,
                                        const ConditionerParams& conditioner_params) override {
         auto image_tokens = convert_token_to_id(trigger_word);
         // if(image_tokens.size() == 1){
@@ -608,8 +616,7 @@ struct FrozenCLIPEmbedderWithCustomWords : public Conditioner {
         // for(int i = 0; i < clsm.size(); ++i)
         //    printf("%d ", clsm[i]?1:0);
         // printf("\n");
-        auto cond = get_learned_condition_common(work_ctx,
-                                                 n_threads,
+        auto cond = get_learned_condition_common(n_threads,
                                                  tokens,
                                                  weights,
                                                  conditioner_params.clip_skip,
@@ -620,8 +627,7 @@ struct FrozenCLIPEmbedderWithCustomWords : public Conditioner {
         return std::make_tuple(cond, clsm);
     }
 
-    std::string remove_trigger_from_prompt(ggml_context* work_ctx,
-                                           const std::string& prompt) override {
+    std::string remove_trigger_from_prompt(const std::string& prompt) override {
         auto image_tokens = convert_token_to_id(trigger_word);
         GGML_ASSERT(image_tokens.size() == 1);
         auto tokens_and_weights  = tokenize(prompt, false);
@@ -632,14 +638,12 @@ struct FrozenCLIPEmbedderWithCustomWords : public Conditioner {
         return decode(tokens);
     }
 
-    SDCondition get_learned_condition(ggml_context* work_ctx,
-                                      int n_threads,
+    SDCondition get_learned_condition(int n_threads,
                                       const ConditionerParams& conditioner_params) override {
         auto tokens_and_weights     = tokenize(conditioner_params.text, true);
         std::vector<int>& tokens    = tokens_and_weights.first;
         std::vector<float>& weights = tokens_and_weights.second;
-        return get_learned_condition_common(work_ctx,
-                                            n_threads,
+        return get_learned_condition_common(n_threads,
                                             tokens,
                                             weights,
                                             conditioner_params.clip_skip,
@@ -680,10 +684,9 @@ struct FrozenCLIPVisionEmbedder : public GGMLRunner {
         vision_model.get_param_tensors(tensors, "cond_stage_model.transformer");
     }
 
-    ggml_cgraph* build_graph(ggml_tensor* pixel_values, bool return_pooled, int clip_skip) {
-        ggml_cgraph* gf = ggml_new_graph(compute_ctx);
-
-        pixel_values = to_backend(pixel_values);
+    ggml_cgraph* build_graph(const sd::Tensor<float>& pixel_values_tensor, bool return_pooled, int clip_skip) {
+        ggml_cgraph* gf           = ggml_new_graph(compute_ctx);
+        ggml_tensor* pixel_values = make_input(pixel_values_tensor);
 
         auto runner_ctx = get_context();
 
@@ -694,16 +697,14 @@ struct FrozenCLIPVisionEmbedder : public GGMLRunner {
         return gf;
     }
 
-    bool compute(const int n_threads,
-                 ggml_tensor* pixel_values,
-                 bool return_pooled,
-                 int clip_skip,
-                 ggml_tensor** output,
-                 ggml_context* output_ctx) {
+    sd::Tensor<float> compute(const int n_threads,
+                              const sd::Tensor<float>& pixel_values,
+                              bool return_pooled,
+                              int clip_skip) {
         auto get_graph = [&]() -> ggml_cgraph* {
             return build_graph(pixel_values, return_pooled, clip_skip);
         };
-        return GGMLRunner::compute(get_graph, n_threads, true, output, output_ctx);
+        return take_or_empty(GGMLRunner::compute<float>(get_graph, n_threads, true));
     }
 };
 
@@ -893,8 +894,7 @@ struct SD3CLIPEmbedder : public Conditioner {
         return {{clip_l_tokens, clip_l_weights}, {clip_g_tokens, clip_g_weights}, {t5_tokens, t5_weights}};
     }
 
-    SDCondition get_learned_condition_common(ggml_context* work_ctx,
-                                             int n_threads,
+    SDCondition get_learned_condition_common(int n_threads,
                                              std::vector<std::pair<std::vector<int>, std::vector<float>>> token_and_weights,
                                              int clip_skip,
                                              bool zero_out_masked = false) {
@@ -909,232 +909,155 @@ struct SD3CLIPEmbedder : public Conditioner {
             clip_skip = 2;
         }
 
-        int64_t t0                          = ggml_time_ms();
-        ggml_tensor* hidden_states          = nullptr;  // [N, n_token*2, 4096]
-        ggml_tensor* chunk_hidden_states    = nullptr;  // [n_token*2, 4096]
-        ggml_tensor* chunk_hidden_states_l  = nullptr;  // [n_token, hidden_size_l]
-        ggml_tensor* chunk_hidden_states_g  = nullptr;  // [n_token, hidden_size_g]
-        ggml_tensor* chunk_hidden_states_t5 = nullptr;  // [n_token, hidden_size_t5]
-        ggml_tensor* pooled                 = nullptr;
-        ggml_tensor* pooled_l               = nullptr;  // [768,]
-        ggml_tensor* pooled_g               = nullptr;  // [1280,]
-        std::vector<float> hidden_states_vec;
+        size_t chunk_len = 77;
+        int64_t t0       = ggml_time_ms();
+        sd::Tensor<float> hidden_states;
+        sd::Tensor<float> pooled;
 
-        size_t chunk_len   = 77;
         size_t chunk_count = std::max(std::max(clip_l_tokens.size(), clip_g_tokens.size()), t5_tokens.size()) / chunk_len;
+
         for (int chunk_idx = 0; chunk_idx < chunk_count; chunk_idx++) {
             // clip_l
+            sd::Tensor<float> chunk_hidden_states_l;
+            sd::Tensor<float> pooled_l;
             if (clip_l) {
                 std::vector<int> chunk_tokens(clip_l_tokens.begin() + chunk_idx * chunk_len,
                                               clip_l_tokens.begin() + (chunk_idx + 1) * chunk_len);
                 std::vector<float> chunk_weights(clip_l_weights.begin() + chunk_idx * chunk_len,
                                                  clip_l_weights.begin() + (chunk_idx + 1) * chunk_len);
 
-                auto input_ids       = vector_to_ggml_tensor_i32(work_ctx, chunk_tokens);
+                sd::Tensor<int32_t> input_ids({static_cast<int64_t>(chunk_tokens.size())}, chunk_tokens);
                 size_t max_token_idx = 0;
 
-                clip_l->compute(n_threads,
-                                input_ids,
-                                0,
-                                nullptr,
-                                max_token_idx,
-                                false,
-                                clip_skip,
-                                &chunk_hidden_states_l,
-                                work_ctx);
-                {
-                    auto tensor         = chunk_hidden_states_l;
-                    float original_mean = ggml_ext_tensor_mean(tensor);
-                    for (int i2 = 0; i2 < tensor->ne[2]; i2++) {
-                        for (int i1 = 0; i1 < tensor->ne[1]; i1++) {
-                            for (int i0 = 0; i0 < tensor->ne[0]; i0++) {
-                                float value = ggml_ext_tensor_get_f32(tensor, i0, i1, i2);
-                                value *= chunk_weights[i1];
-                                ggml_ext_tensor_set_f32(tensor, value, i0, i1, i2);
-                            }
-                        }
-                    }
-                    float new_mean = ggml_ext_tensor_mean(tensor);
-                    ggml_ext_tensor_scale_inplace(tensor, (original_mean / new_mean));
-                }
+                chunk_hidden_states_l = clip_l->compute(n_threads,
+                                                        input_ids,
+                                                        0,
+                                                        nullptr,
+                                                        max_token_idx,
+                                                        false,
+                                                        clip_skip);
+                GGML_ASSERT(!chunk_hidden_states_l.empty());
+                chunk_hidden_states_l = ::apply_token_weights(std::move(chunk_hidden_states_l), chunk_weights);
 
                 if (chunk_idx == 0) {
                     auto it       = std::find(chunk_tokens.begin(), chunk_tokens.end(), clip_l_tokenizer.EOS_TOKEN_ID);
                     max_token_idx = std::min<size_t>(std::distance(chunk_tokens.begin(), it), chunk_tokens.size() - 1);
-                    clip_l->compute(n_threads,
-                                    input_ids,
-                                    0,
-                                    nullptr,
-                                    max_token_idx,
-                                    true,
-                                    clip_skip,
-                                    &pooled_l,
-                                    work_ctx);
+                    pooled_l      = clip_l->compute(n_threads,
+                                                    input_ids,
+                                                    0,
+                                                    nullptr,
+                                                    max_token_idx,
+                                                    true,
+                                                    clip_skip);
+                    GGML_ASSERT(!pooled_l.empty());
                 }
             } else {
-                chunk_hidden_states_l = ggml_new_tensor_2d(work_ctx, GGML_TYPE_F32, 768, chunk_len);
-                ggml_set_f32(chunk_hidden_states_l, 0.f);
+                chunk_hidden_states_l = sd::Tensor<float>::zeros({768, static_cast<int64_t>(chunk_len), 1});
                 if (chunk_idx == 0) {
-                    pooled_l = ggml_new_tensor_1d(work_ctx, GGML_TYPE_F32, 768);
-                    ggml_set_f32(pooled_l, 0.f);
+                    pooled_l = sd::Tensor<float>::zeros({768, 1});
                 }
             }
 
             // clip_g
+            sd::Tensor<float> chunk_hidden_states_g;
+            sd::Tensor<float> pooled_g;
             if (clip_g) {
                 std::vector<int> chunk_tokens(clip_g_tokens.begin() + chunk_idx * chunk_len,
                                               clip_g_tokens.begin() + (chunk_idx + 1) * chunk_len);
                 std::vector<float> chunk_weights(clip_g_weights.begin() + chunk_idx * chunk_len,
                                                  clip_g_weights.begin() + (chunk_idx + 1) * chunk_len);
 
-                auto input_ids       = vector_to_ggml_tensor_i32(work_ctx, chunk_tokens);
+                sd::Tensor<int32_t> input_ids({static_cast<int64_t>(chunk_tokens.size())}, chunk_tokens);
                 size_t max_token_idx = 0;
 
-                clip_g->compute(n_threads,
-                                input_ids,
-                                0,
-                                nullptr,
-                                max_token_idx,
-                                false,
-                                clip_skip,
-                                &chunk_hidden_states_g,
-                                work_ctx);
-
-                {
-                    auto tensor         = chunk_hidden_states_g;
-                    float original_mean = ggml_ext_tensor_mean(tensor);
-                    for (int i2 = 0; i2 < tensor->ne[2]; i2++) {
-                        for (int i1 = 0; i1 < tensor->ne[1]; i1++) {
-                            for (int i0 = 0; i0 < tensor->ne[0]; i0++) {
-                                float value = ggml_ext_tensor_get_f32(tensor, i0, i1, i2);
-                                value *= chunk_weights[i1];
-                                ggml_ext_tensor_set_f32(tensor, value, i0, i1, i2);
-                            }
-                        }
-                    }
-                    float new_mean = ggml_ext_tensor_mean(tensor);
-                    ggml_ext_tensor_scale_inplace(tensor, (original_mean / new_mean));
-                }
+                chunk_hidden_states_g = clip_g->compute(n_threads,
+                                                        input_ids,
+                                                        0,
+                                                        nullptr,
+                                                        max_token_idx,
+                                                        false,
+                                                        clip_skip);
+                GGML_ASSERT(!chunk_hidden_states_g.empty());
+                chunk_hidden_states_g = ::apply_token_weights(std::move(chunk_hidden_states_g), chunk_weights);
 
                 if (chunk_idx == 0) {
                     auto it       = std::find(chunk_tokens.begin(), chunk_tokens.end(), clip_g_tokenizer.EOS_TOKEN_ID);
                     max_token_idx = std::min<size_t>(std::distance(chunk_tokens.begin(), it), chunk_tokens.size() - 1);
-                    clip_g->compute(n_threads,
-                                    input_ids,
-                                    0,
-                                    nullptr,
-                                    max_token_idx,
-                                    true,
-                                    clip_skip,
-                                    &pooled_g,
-                                    work_ctx);
+                    pooled_g      = clip_g->compute(n_threads,
+                                                    input_ids,
+                                                    0,
+                                                    nullptr,
+                                                    max_token_idx,
+                                                    true,
+                                                    clip_skip);
+                    GGML_ASSERT(!pooled_g.empty());
                 }
             } else {
-                chunk_hidden_states_g = ggml_new_tensor_2d(work_ctx, GGML_TYPE_F32, 1280, chunk_len);
-                ggml_set_f32(chunk_hidden_states_g, 0.f);
+                chunk_hidden_states_g = sd::Tensor<float>::zeros({1280, static_cast<int64_t>(chunk_len), 1});
                 if (chunk_idx == 0) {
-                    pooled_g = ggml_new_tensor_1d(work_ctx, GGML_TYPE_F32, 1280);
-                    ggml_set_f32(pooled_g, 0.f);
+                    pooled_g = sd::Tensor<float>::zeros({1280, 1});
                 }
             }
 
             // t5
+            sd::Tensor<float> chunk_hidden_states_t5;
             if (t5) {
                 std::vector<int> chunk_tokens(t5_tokens.begin() + chunk_idx * chunk_len,
                                               t5_tokens.begin() + (chunk_idx + 1) * chunk_len);
                 std::vector<float> chunk_weights(t5_weights.begin() + chunk_idx * chunk_len,
                                                  t5_weights.begin() + (chunk_idx + 1) * chunk_len);
 
-                auto input_ids = vector_to_ggml_tensor_i32(work_ctx, chunk_tokens);
+                sd::Tensor<int32_t> input_ids({static_cast<int64_t>(chunk_tokens.size())}, chunk_tokens);
 
-                t5->compute(n_threads,
-                            input_ids,
-                            nullptr,
-                            &chunk_hidden_states_t5,
-                            work_ctx);
-                {
-                    auto tensor         = chunk_hidden_states_t5;
-                    float original_mean = ggml_ext_tensor_mean(tensor);
-                    for (int i2 = 0; i2 < tensor->ne[2]; i2++) {
-                        for (int i1 = 0; i1 < tensor->ne[1]; i1++) {
-                            for (int i0 = 0; i0 < tensor->ne[0]; i0++) {
-                                float value = ggml_ext_tensor_get_f32(tensor, i0, i1, i2);
-                                value *= chunk_weights[i1];
-                                ggml_ext_tensor_set_f32(tensor, value, i0, i1, i2);
-                            }
-                        }
-                    }
-                    float new_mean = ggml_ext_tensor_mean(tensor);
-                    ggml_ext_tensor_scale_inplace(tensor, (original_mean / new_mean));
-                }
+                chunk_hidden_states_t5 = t5->compute(n_threads,
+                                                     input_ids,
+                                                     sd::Tensor<float>());
+                GGML_ASSERT(!chunk_hidden_states_t5.empty());
+                chunk_hidden_states_t5 = ::apply_token_weights(std::move(chunk_hidden_states_t5), chunk_weights);
             } else {
-                chunk_hidden_states_t5 = ggml_new_tensor_2d(work_ctx, GGML_TYPE_F32, 4096, chunk_len);
-                ggml_set_f32(chunk_hidden_states_t5, 0.f);
+                chunk_hidden_states_t5 = sd::Tensor<float>::zeros({4096, static_cast<int64_t>(chunk_len), 1});
             }
 
-            auto chunk_hidden_states_lg_pad = ggml_new_tensor_3d(work_ctx,
-                                                                 chunk_hidden_states_l->type,
-                                                                 4096,
-                                                                 chunk_hidden_states_l->ne[1],
-                                                                 chunk_hidden_states_l->ne[2]);  // [n_token, 4096]
-
-            for (int i2 = 0; i2 < chunk_hidden_states_lg_pad->ne[2]; i2++) {
-                for (int i1 = 0; i1 < chunk_hidden_states_lg_pad->ne[1]; i1++) {
-                    for (int i0 = 0; i0 < chunk_hidden_states_lg_pad->ne[0]; i0++) {
-                        float value = 0.f;
-                        if (i0 < chunk_hidden_states_l->ne[0]) {
-                            value = ggml_ext_tensor_get_f32(chunk_hidden_states_l, i0, i1, i2);
-                        } else if (i0 < chunk_hidden_states_l->ne[0] + chunk_hidden_states_g->ne[0]) {
-                            value = ggml_ext_tensor_get_f32(chunk_hidden_states_g, i0 - chunk_hidden_states_l->ne[0], i1, i2);
-                        }
-                        ggml_ext_tensor_set_f32(chunk_hidden_states_lg_pad, value, i0, i1, i2);
-                    }
-                }
+            sd::Tensor<float> chunk_hidden_states_lg = sd::ops::concat(chunk_hidden_states_l, chunk_hidden_states_g, 0);
+            if (chunk_hidden_states_lg.shape()[0] < 4096) {
+                auto pad_shape         = chunk_hidden_states_lg.shape();
+                pad_shape[0]           = 4096 - chunk_hidden_states_lg.shape()[0];
+                chunk_hidden_states_lg = sd::ops::concat(chunk_hidden_states_lg,
+                                                         sd::Tensor<float>::zeros(pad_shape),
+                                                         0);
             }
 
-            chunk_hidden_states = ggml_ext_tensor_concat(work_ctx, chunk_hidden_states_lg_pad, chunk_hidden_states_t5, 1);  // [n_token*2, 4096]
+            sd::Tensor<float> chunk_hidden_states = sd::ops::concat(chunk_hidden_states_lg,
+                                                                    chunk_hidden_states_t5,
+                                                                    1);  // [n_token*2, 4096]
 
             if (chunk_idx == 0) {
-                pooled = ggml_ext_tensor_concat(work_ctx, pooled_l, pooled_g, 0);  // [768 + 1280]
+                pooled = sd::ops::concat(pooled_l, pooled_g, 0);  // [768 + 1280]
             }
 
             int64_t t1 = ggml_time_ms();
             LOG_DEBUG("computing condition graph completed, taking %" PRId64 " ms", t1 - t0);
             if (zero_out_masked) {
-                float* vec = (float*)chunk_hidden_states->data;
-                for (int i = 0; i < ggml_nelements(chunk_hidden_states); i++) {
-                    vec[i] = 0;
-                }
+                chunk_hidden_states.fill_(0.0f);
             }
 
-            hidden_states_vec.insert(hidden_states_vec.end(),
-                                     (float*)chunk_hidden_states->data,
-                                     ((float*)chunk_hidden_states->data) + ggml_nelements(chunk_hidden_states));
+            if (!hidden_states.empty()) {
+                hidden_states = sd::ops::concat(hidden_states, chunk_hidden_states, 1);
+            } else {
+                hidden_states = std::move(chunk_hidden_states);
+            }
         }
 
-        if (hidden_states_vec.size() > 0) {
-            hidden_states = vector_to_ggml_tensor(work_ctx, hidden_states_vec);
-            hidden_states = ggml_reshape_2d(work_ctx,
-                                            hidden_states,
-                                            chunk_hidden_states->ne[0],
-                                            ggml_nelements(hidden_states) / chunk_hidden_states->ne[0]);
-        } else {
-            hidden_states = ggml_new_tensor_2d(work_ctx, GGML_TYPE_F32, 4096, 256);
-            ggml_set_f32(hidden_states, 0.f);
-        }
-        if (pooled == nullptr) {
-            pooled = ggml_new_tensor_1d(work_ctx, GGML_TYPE_F32, 2048);
-            ggml_set_f32(pooled, 0.f);
-        }
-        return {hidden_states, pooled, nullptr};
+        SDCondition result;
+        result.c_crossattn = std::move(hidden_states);
+        result.c_vector    = std::move(pooled);
+        return result;
     }
 
-    SDCondition get_learned_condition(ggml_context* work_ctx,
-                                      int n_threads,
+    SDCondition get_learned_condition(int n_threads,
                                       const ConditionerParams& conditioner_params) override {
         auto tokens_and_weights = tokenize(conditioner_params.text, 77, true);
-        return get_learned_condition_common(work_ctx,
-                                            n_threads,
+        return get_learned_condition_common(n_threads,
                                             tokens_and_weights,
                                             conditioner_params.clip_skip,
                                             conditioner_params.zero_out_masked);
@@ -1292,8 +1215,7 @@ struct FluxCLIPEmbedder : public Conditioner {
         return {{clip_l_tokens, clip_l_weights}, {t5_tokens, t5_weights}};
     }
 
-    SDCondition get_learned_condition_common(ggml_context* work_ctx,
-                                             int n_threads,
+    SDCondition get_learned_condition_common(int n_threads,
                                              std::vector<std::pair<std::vector<int>, std::vector<float>>> token_and_weights,
                                              int clip_skip,
                                              bool zero_out_masked = false) {
@@ -1306,11 +1228,9 @@ struct FluxCLIPEmbedder : public Conditioner {
             clip_skip = 2;
         }
 
-        int64_t t0                       = ggml_time_ms();
-        ggml_tensor* hidden_states       = nullptr;  // [N, n_token, 4096]
-        ggml_tensor* chunk_hidden_states = nullptr;  // [n_token, 4096]
-        ggml_tensor* pooled              = nullptr;  // [768,]
-        std::vector<float> hidden_states_vec;
+        int64_t t0 = ggml_time_ms();
+        sd::Tensor<float> hidden_states;  // [N, n_token, 4096]
+        sd::Tensor<float> pooled;         // [768,]
 
         size_t chunk_count = std::max(clip_l_tokens.size() > 0 ? chunk_len : 0, t5_tokens.size()) / chunk_len;
         for (int chunk_idx = 0; chunk_idx < chunk_count; chunk_idx++) {
@@ -1323,95 +1243,65 @@ struct FluxCLIPEmbedder : public Conditioner {
                     std::vector<float> chunk_weights(clip_l_weights.begin(),
                                                      clip_l_weights.begin() + chunk_len_l);
 
-                    auto input_ids       = vector_to_ggml_tensor_i32(work_ctx, chunk_tokens);
+                    sd::Tensor<int32_t> input_ids({static_cast<int64_t>(chunk_tokens.size())}, chunk_tokens);
                     size_t max_token_idx = 0;
 
                     auto it       = std::find(chunk_tokens.begin(), chunk_tokens.end(), clip_l_tokenizer.EOS_TOKEN_ID);
                     max_token_idx = std::min<size_t>(std::distance(chunk_tokens.begin(), it), chunk_tokens.size() - 1);
 
-                    clip_l->compute(n_threads,
-                                    input_ids,
-                                    0,
-                                    nullptr,
-                                    max_token_idx,
-                                    true,
-                                    clip_skip,
-                                    &pooled,
-                                    work_ctx);
+                    pooled = clip_l->compute(n_threads,
+                                             input_ids,
+                                             0,
+                                             nullptr,
+                                             max_token_idx,
+                                             true,
+                                             clip_skip);
+                    GGML_ASSERT(!pooled.empty());
+                } else {
+                    pooled = sd::Tensor<float>::zeros({768});
                 }
             }
 
             // t5
+            sd::Tensor<float> chunk_hidden_states;
             if (t5) {
                 std::vector<int> chunk_tokens(t5_tokens.begin() + chunk_idx * chunk_len,
                                               t5_tokens.begin() + (chunk_idx + 1) * chunk_len);
                 std::vector<float> chunk_weights(t5_weights.begin() + chunk_idx * chunk_len,
                                                  t5_weights.begin() + (chunk_idx + 1) * chunk_len);
 
-                auto input_ids = vector_to_ggml_tensor_i32(work_ctx, chunk_tokens);
-
-                t5->compute(n_threads,
-                            input_ids,
-                            nullptr,
-                            &chunk_hidden_states,
-                            work_ctx);
-                {
-                    auto tensor         = chunk_hidden_states;
-                    float original_mean = ggml_ext_tensor_mean(tensor);
-                    for (int i2 = 0; i2 < tensor->ne[2]; i2++) {
-                        for (int i1 = 0; i1 < tensor->ne[1]; i1++) {
-                            for (int i0 = 0; i0 < tensor->ne[0]; i0++) {
-                                float value = ggml_ext_tensor_get_f32(tensor, i0, i1, i2);
-                                value *= chunk_weights[i1];
-                                ggml_ext_tensor_set_f32(tensor, value, i0, i1, i2);
-                            }
-                        }
-                    }
-                    float new_mean = ggml_ext_tensor_mean(tensor);
-                    ggml_ext_tensor_scale_inplace(tensor, (original_mean / new_mean));
+                sd::Tensor<int32_t> input_ids({static_cast<int64_t>(chunk_tokens.size())}, chunk_tokens);
+                chunk_hidden_states = t5->compute(n_threads,
+                                                  input_ids,
+                                                  sd::Tensor<float>());
+                GGML_ASSERT(!chunk_hidden_states.empty());
+                chunk_hidden_states = ::apply_token_weights(std::move(chunk_hidden_states), chunk_weights);
+                if (zero_out_masked) {
+                    chunk_hidden_states.fill_(0.0f);
                 }
             } else {
-                chunk_hidden_states = ggml_new_tensor_2d(work_ctx, GGML_TYPE_F32, 4096, chunk_len);
-                ggml_set_f32(chunk_hidden_states, 0.f);
+                chunk_hidden_states = sd::Tensor<float>::zeros({4096, static_cast<int64_t>(chunk_len)});
             }
 
             int64_t t1 = ggml_time_ms();
             LOG_DEBUG("computing condition graph completed, taking %" PRId64 " ms", t1 - t0);
-            if (zero_out_masked) {
-                float* vec = (float*)chunk_hidden_states->data;
-                for (int i = 0; i < ggml_nelements(chunk_hidden_states); i++) {
-                    vec[i] = 0;
-                }
+            if (!hidden_states.empty()) {
+                hidden_states = sd::ops::concat(hidden_states, chunk_hidden_states, 1);
+            } else {
+                hidden_states = std::move(chunk_hidden_states);
             }
-
-            hidden_states_vec.insert(hidden_states_vec.end(),
-                                     (float*)chunk_hidden_states->data,
-                                     ((float*)chunk_hidden_states->data) + ggml_nelements(chunk_hidden_states));
         }
 
-        if (hidden_states_vec.size() > 0) {
-            hidden_states = vector_to_ggml_tensor(work_ctx, hidden_states_vec);
-            hidden_states = ggml_reshape_2d(work_ctx,
-                                            hidden_states,
-                                            chunk_hidden_states->ne[0],
-                                            ggml_nelements(hidden_states) / chunk_hidden_states->ne[0]);
-        } else {
-            hidden_states = ggml_new_tensor_2d(work_ctx, GGML_TYPE_F32, 4096, 256);
-            ggml_set_f32(hidden_states, 0.f);
-        }
-        if (pooled == nullptr) {
-            pooled = ggml_new_tensor_1d(work_ctx, GGML_TYPE_F32, 768);
-            ggml_set_f32(pooled, 0.f);
-        }
-        return {hidden_states, pooled, nullptr};
+        SDCondition result;
+        result.c_crossattn = std::move(hidden_states);
+        result.c_vector    = std::move(pooled);
+        return result;
     }
 
-    SDCondition get_learned_condition(ggml_context* work_ctx,
-                                      int n_threads,
+    SDCondition get_learned_condition(int n_threads,
                                       const ConditionerParams& conditioner_params) override {
         auto tokens_and_weights = tokenize(conditioner_params.text, chunk_len, true);
-        return get_learned_condition_common(work_ctx,
-                                            n_threads,
+        return get_learned_condition_common(n_threads,
                                             tokens_and_weights,
                                             conditioner_params.clip_skip,
                                             conditioner_params.zero_out_masked);
@@ -1523,8 +1413,9 @@ struct T5CLIPEmbedder : public Conditioner {
         return {t5_tokens, t5_weights, t5_mask};
     }
 
-    void modify_mask_to_attend_padding(ggml_tensor* mask, int max_seq_length, int num_extra_padding = 8) {
-        float* mask_data = (float*)mask->data;
+    void modify_mask_to_attend_padding(sd::Tensor<float>* mask, int max_seq_length, int num_extra_padding = 8) {
+        GGML_ASSERT(mask != nullptr);
+        float* mask_data = mask->data();
         int num_pad      = 0;
         for (int64_t i = 0; i < max_seq_length; i++) {
             if (num_pad >= num_extra_padding) {
@@ -1538,29 +1429,23 @@ struct T5CLIPEmbedder : public Conditioner {
         // LOG_DEBUG("PAD: %d", num_pad);
     }
 
-    SDCondition get_learned_condition_common(ggml_context* work_ctx,
-                                             int n_threads,
+    SDCondition get_learned_condition_common(int n_threads,
                                              std::tuple<std::vector<int>, std::vector<float>, std::vector<float>> token_and_weights,
                                              int clip_skip,
                                              bool zero_out_masked = false) {
         if (!t5) {
-            auto hidden_states = ggml_new_tensor_2d(work_ctx, GGML_TYPE_F32, 4096, 256);
-            ggml_set_f32(hidden_states, 0.f);
-            auto t5_attn_mask = ggml_new_tensor_1d(work_ctx, GGML_TYPE_F32, 256);
-            ggml_set_f32(t5_attn_mask, -HUGE_VALF);
-            return {hidden_states, t5_attn_mask, nullptr};
+            SDCondition result;
+            result.c_crossattn = sd::Tensor<float>::zeros({4096, 256});
+            result.c_vector    = sd::Tensor<float>::full({256}, -HUGE_VALF);
+            return result;
         }
         auto& t5_tokens        = std::get<0>(token_and_weights);
         auto& t5_weights       = std::get<1>(token_and_weights);
         auto& t5_attn_mask_vec = std::get<2>(token_and_weights);
 
-        int64_t t0                       = ggml_time_ms();
-        ggml_tensor* hidden_states       = nullptr;  // [N, n_token, 4096]
-        ggml_tensor* chunk_hidden_states = nullptr;  // [n_token, 4096]
-        ggml_tensor* pooled              = nullptr;
-        ggml_tensor* t5_attn_mask        = vector_to_ggml_tensor(work_ctx, t5_attn_mask_vec);  // [n_token]
-
-        std::vector<float> hidden_states_vec;
+        int64_t t0                     = ggml_time_ms();
+        sd::Tensor<float> t5_attn_mask = sd::Tensor<float>::from_vector(t5_attn_mask_vec);
+        sd::Tensor<float> hidden_states;
 
         size_t chunk_count = t5_tokens.size() / chunk_len;
 
@@ -1573,68 +1458,46 @@ struct T5CLIPEmbedder : public Conditioner {
             std::vector<float> chunk_mask(t5_attn_mask_vec.begin() + chunk_idx * chunk_len,
                                           t5_attn_mask_vec.begin() + (chunk_idx + 1) * chunk_len);
 
-            auto input_ids          = vector_to_ggml_tensor_i32(work_ctx, chunk_tokens);
-            auto t5_attn_mask_chunk = use_mask ? vector_to_ggml_tensor(work_ctx, chunk_mask) : nullptr;
+            sd::Tensor<int32_t> input_ids({static_cast<int64_t>(chunk_tokens.size())}, chunk_tokens);
+            sd::Tensor<float> t5_attn_mask_chunk;
+            if (use_mask) {
+                t5_attn_mask_chunk = sd::Tensor<float>({static_cast<int64_t>(chunk_mask.size())}, chunk_mask);
+            }
 
-            t5->compute(n_threads,
-                        input_ids,
-                        t5_attn_mask_chunk,
-                        &chunk_hidden_states,
-                        work_ctx);
-            {
-                auto tensor         = chunk_hidden_states;
-                float original_mean = ggml_ext_tensor_mean(tensor);
-                for (int i2 = 0; i2 < tensor->ne[2]; i2++) {
-                    for (int i1 = 0; i1 < tensor->ne[1]; i1++) {
-                        for (int i0 = 0; i0 < tensor->ne[0]; i0++) {
-                            float value = ggml_ext_tensor_get_f32(tensor, i0, i1, i2);
-                            value *= chunk_weights[i1];
-                            ggml_ext_tensor_set_f32(tensor, value, i0, i1, i2);
-                        }
-                    }
-                }
-                float new_mean = ggml_ext_tensor_mean(tensor);
-                ggml_ext_tensor_scale_inplace(tensor, (original_mean / new_mean));
+            auto chunk_hidden_states = t5->compute(n_threads,
+                                                   input_ids,
+                                                   t5_attn_mask_chunk);
+            GGML_ASSERT(!chunk_hidden_states.empty());
+            chunk_hidden_states = apply_token_weights(std::move(chunk_hidden_states), chunk_weights);
+
+            if (zero_out_masked) {
+                auto chunk_mask_tensor = sd::Tensor<float>::from_vector(chunk_mask)
+                                             .reshape_({1, static_cast<int64_t>(chunk_mask.size())});
+                chunk_hidden_states.masked_fill_(chunk_mask_tensor < 0.0f, 0.0f);
             }
 
             int64_t t1 = ggml_time_ms();
             LOG_DEBUG("computing condition graph completed, taking %" PRId64 " ms", t1 - t0);
-            if (zero_out_masked) {
-                auto tensor = chunk_hidden_states;
-                for (int i2 = 0; i2 < tensor->ne[2]; i2++) {
-                    for (int i1 = 0; i1 < tensor->ne[1]; i1++) {
-                        for (int i0 = 0; i0 < tensor->ne[0]; i0++) {
-                            if (chunk_mask[i1] < 0.f) {
-                                ggml_ext_tensor_set_f32(tensor, 0.f, i0, i1, i2);
-                            }
-                        }
-                    }
-                }
-            }
 
-            hidden_states_vec.insert(hidden_states_vec.end(),
-                                     (float*)chunk_hidden_states->data,
-                                     ((float*)chunk_hidden_states->data) + ggml_nelements(chunk_hidden_states));
+            if (!hidden_states.empty()) {
+                hidden_states = sd::ops::concat(hidden_states, chunk_hidden_states, 1);
+            } else {
+                hidden_states = std::move(chunk_hidden_states);
+            }
         }
 
-        GGML_ASSERT(hidden_states_vec.size() > 0);
-        hidden_states = vector_to_ggml_tensor(work_ctx, hidden_states_vec);
-        hidden_states = ggml_reshape_2d(work_ctx,
-                                        hidden_states,
-                                        chunk_hidden_states->ne[0],
-                                        ggml_nelements(hidden_states) / chunk_hidden_states->ne[0]);
+        modify_mask_to_attend_padding(&t5_attn_mask, static_cast<int>(t5_attn_mask.numel()), mask_pad);
 
-        modify_mask_to_attend_padding(t5_attn_mask, static_cast<int>(ggml_nelements(t5_attn_mask)), mask_pad);
-
-        return {hidden_states, t5_attn_mask, nullptr};
+        SDCondition result;
+        result.c_crossattn = std::move(hidden_states);
+        result.c_vector    = std::move(t5_attn_mask);
+        return result;
     }
 
-    SDCondition get_learned_condition(ggml_context* work_ctx,
-                                      int n_threads,
+    SDCondition get_learned_condition(int n_threads,
                                       const ConditionerParams& conditioner_params) override {
         auto tokens_and_weights = tokenize(conditioner_params.text, chunk_len, true);
-        return get_learned_condition_common(work_ctx,
-                                            n_threads,
+        return get_learned_condition_common(n_threads,
                                             tokens_and_weights,
                                             conditioner_params.clip_skip,
                                             conditioner_params.zero_out_masked);
@@ -1723,8 +1586,7 @@ struct AnimaConditioner : public Conditioner {
         return {qwen_tokens, qwen_weights, t5_tokens, t5_weights};
     }
 
-    SDCondition get_learned_condition(ggml_context* work_ctx,
-                                      int n_threads,
+    SDCondition get_learned_condition(int n_threads,
                                       const ConditionerParams& conditioner_params) override {
         int64_t t0 = ggml_time_ms();
 
@@ -1734,46 +1596,25 @@ struct AnimaConditioner : public Conditioner {
         auto& t5_tokens    = std::get<2>(tokenized);
         auto& t5_weights   = std::get<3>(tokenized);
 
-        auto input_ids = vector_to_ggml_tensor_i32(work_ctx, qwen_tokens);
-
-        ggml_tensor* hidden_states = nullptr;  // [N, n_token, 1024]
-        llm->compute(n_threads,
-                     input_ids,
-                     nullptr,
-                     {},
-                     {},
-                     &hidden_states,
-                     work_ctx);
-
-        {
-            auto tensor         = hidden_states;
-            float original_mean = ggml_ext_tensor_mean(tensor);
-            for (int i2 = 0; i2 < tensor->ne[2]; i2++) {
-                for (int i1 = 0; i1 < tensor->ne[1]; i1++) {
-                    for (int i0 = 0; i0 < tensor->ne[0]; i0++) {
-                        float value = ggml_ext_tensor_get_f32(tensor, i0, i1, i2);
-                        value *= qwen_weights[i1];
-                        ggml_ext_tensor_set_f32(tensor, value, i0, i1, i2);
-                    }
-                }
-            }
-            float new_mean = ggml_ext_tensor_mean(tensor);
-            if (new_mean != 0.f) {
-                ggml_ext_tensor_scale_inplace(tensor, (original_mean / new_mean));
-            }
-        }
-
-        ggml_tensor* t5_ids_tensor    = nullptr;
-        ggml_tensor* t5_weight_tensor = nullptr;
-        if (!t5_tokens.empty()) {
-            t5_ids_tensor    = vector_to_ggml_tensor_i32(work_ctx, t5_tokens);
-            t5_weight_tensor = vector_to_ggml_tensor(work_ctx, t5_weights);
-        }
+        sd::Tensor<int32_t> input_ids({static_cast<int64_t>(qwen_tokens.size()), 1}, qwen_tokens);
+        auto hidden_states = llm->compute(n_threads,
+                                          input_ids,
+                                          sd::Tensor<float>(),
+                                          {},
+                                          {});
+        GGML_ASSERT(!hidden_states.empty());
+        hidden_states         = apply_token_weights(std::move(hidden_states), qwen_weights);
+        auto t5_ids_tensor    = sd::Tensor<int32_t>::from_vector(t5_tokens);
+        auto t5_weight_tensor = sd::Tensor<float>::from_vector(t5_weights);
 
         int64_t t1 = ggml_time_ms();
         LOG_DEBUG("computing condition graph completed, taking %" PRId64 " ms", t1 - t0);
 
-        return {hidden_states, t5_weight_tensor, t5_ids_tensor};
+        SDCondition result;
+        result.c_crossattn  = std::move(hidden_states);
+        result.c_t5_ids     = std::move(t5_ids_tensor);
+        result.c_t5_weights = std::move(t5_weight_tensor);
+        return result;
     }
 };
 
@@ -1884,15 +1725,14 @@ struct LLMEmbedder : public Conditioner {
         return {tokens, weights};
     }
 
-    ggml_tensor* encode_prompt(ggml_context* work_ctx,
-                               int n_threads,
-                               const std::string prompt,
-                               const std::pair<int, int>& prompt_attn_range,
-                               int max_length,
-                               int min_length,
-                               std::vector<std::pair<int, ggml_tensor*>> image_embeds,
-                               const std::set<int>& out_layers,
-                               int prompt_template_encode_start_idx) {
+    sd::Tensor<float> encode_prompt(int n_threads,
+                                    const std::string prompt,
+                                    const std::pair<int, int>& prompt_attn_range,
+                                    int max_length,
+                                    int min_length,
+                                    const std::vector<std::pair<int, sd::Tensor<float>>>& image_embeds,
+                                    const std::set<int>& out_layers,
+                                    int prompt_template_encode_start_idx) {
         auto tokens_and_weights = tokenize(prompt, prompt_attn_range);
         auto& tokens            = std::get<0>(tokens_and_weights);
         auto& weights           = std::get<1>(tokens_and_weights);
@@ -1904,81 +1744,59 @@ struct LLMEmbedder : public Conditioner {
             tokenizer->pad_tokens(tokens, weights, max_length, true);
         }
 
-        ggml_tensor* hidden_states = nullptr;  // [N, n_token, hidden_size]
-
-        auto input_ids = vector_to_ggml_tensor_i32(work_ctx, tokens);
-
-        ggml_tensor* attention_mask = nullptr;
+        sd::Tensor<int32_t> input_ids({static_cast<int64_t>(tokens.size())}, tokens);
+        sd::Tensor<float> attention_mask;
         if (!mask.empty()) {
-            attention_mask = ggml_new_tensor_2d(work_ctx, GGML_TYPE_F32, mask.size(), mask.size());
-            ggml_ext_tensor_iter(attention_mask, [&](ggml_tensor* attention_mask, int64_t i0, int64_t i1, int64_t i2, int64_t i3) {
-                float value = 0.f;
-                if (mask[i0] == 0.f) {
-                    value = -INFINITY;
-                } else if (i0 > i1) {
-                    value = -INFINITY;
-                }
-                ggml_ext_tensor_set_f32(attention_mask, value, i0, i1, i2, i3);
-            });
-        }
-
-        llm->compute(n_threads,
-                     input_ids,
-                     attention_mask,
-                     image_embeds,
-                     out_layers,
-                     &hidden_states,
-                     work_ctx);
-        {
-            auto tensor         = hidden_states;
-            float original_mean = ggml_ext_tensor_mean(tensor);
-            for (int i2 = 0; i2 < tensor->ne[2]; i2++) {
-                for (int i1 = 0; i1 < tensor->ne[1]; i1++) {
-                    for (int i0 = 0; i0 < tensor->ne[0]; i0++) {
-                        float value = ggml_ext_tensor_get_f32(tensor, i0, i1, i2);
-                        value *= weights[i1];
-                        ggml_ext_tensor_set_f32(tensor, value, i0, i1, i2);
+            attention_mask = sd::Tensor<float>({static_cast<int64_t>(mask.size()), static_cast<int64_t>(mask.size())});
+            for (size_t i1 = 0; i1 < mask.size(); ++i1) {
+                for (size_t i0 = 0; i0 < mask.size(); ++i0) {
+                    float value = 0.0f;
+                    if (mask[i0] == 0.0f || i0 > i1) {
+                        value = -INFINITY;
                     }
+                    attention_mask[static_cast<int64_t>(i0 + mask.size() * i1)] = value;
                 }
             }
-            float new_mean = ggml_ext_tensor_mean(tensor);
-            ggml_ext_tensor_scale_inplace(tensor, (original_mean / new_mean));
         }
 
-        GGML_ASSERT(hidden_states->ne[1] > prompt_template_encode_start_idx);
+        auto hidden_states = llm->compute(n_threads,
+                                          input_ids,
+                                          attention_mask,
+                                          image_embeds,
+                                          out_layers);
+        GGML_ASSERT(!hidden_states.empty());
+        hidden_states = apply_token_weights(std::move(hidden_states), weights);
+        GGML_ASSERT(hidden_states.shape()[1] > prompt_template_encode_start_idx);
 
         int64_t zero_pad_len = 0;
         if (min_length > 0) {
-            if (hidden_states->ne[1] - prompt_template_encode_start_idx < min_length) {
-                zero_pad_len = min_length - hidden_states->ne[1] + prompt_template_encode_start_idx;
+            if (hidden_states.shape()[1] - prompt_template_encode_start_idx < min_length) {
+                zero_pad_len = min_length - hidden_states.shape()[1] + prompt_template_encode_start_idx;
             }
         }
 
-        ggml_tensor* new_hidden_states = ggml_new_tensor_3d(work_ctx,
-                                                            GGML_TYPE_F32,
-                                                            hidden_states->ne[0],
-                                                            hidden_states->ne[1] - prompt_template_encode_start_idx + zero_pad_len,
-                                                            hidden_states->ne[2]);
-
-        ggml_ext_tensor_iter(new_hidden_states, [&](ggml_tensor* new_hidden_states, int64_t i0, int64_t i1, int64_t i2, int64_t i3) {
-            float value = 0.f;
-            if (i1 + prompt_template_encode_start_idx < hidden_states->ne[1]) {
-                value = ggml_ext_tensor_get_f32(hidden_states, i0, i1 + prompt_template_encode_start_idx, i2, i3);
-            }
-            ggml_ext_tensor_set_f32(new_hidden_states, value, i0, i1, i2, i3);
-        });
+        sd::Tensor<float> new_hidden_states = sd::ops::slice(hidden_states,
+                                                             1,
+                                                             prompt_template_encode_start_idx,
+                                                             hidden_states.shape()[1]);
+        if (zero_pad_len > 0) {
+            auto pad_shape    = new_hidden_states.shape();
+            pad_shape[1]      = zero_pad_len;
+            new_hidden_states = sd::ops::concat(new_hidden_states,
+                                                sd::Tensor<float>::zeros(std::move(pad_shape)),
+                                                1);
+        }
 
         return new_hidden_states;
     }
 
-    SDCondition get_learned_condition(ggml_context* work_ctx,
-                                      int n_threads,
+    SDCondition get_learned_condition(int n_threads,
                                       const ConditionerParams& conditioner_params) override {
         std::string prompt;
         std::pair<int, int> prompt_attn_range;
         std::vector<std::string> extra_prompts;
         std::vector<std::pair<int, int>> extra_prompts_attn_range;
-        std::vector<std::pair<int, ggml_tensor*>> image_embeds;
+        std::vector<std::pair<int, sd::Tensor<float>>> image_embeds;
         int prompt_template_encode_start_idx = 34;
         int max_length                       = 0;  // pad tokens
         int min_length                       = 0;  // zero pad hidden_states
@@ -1987,7 +1805,7 @@ struct LLMEmbedder : public Conditioner {
         int64_t t0 = ggml_time_ms();
 
         if (sd_version_is_qwen_image(version)) {
-            if (llm->enable_vision && !conditioner_params.ref_images.empty()) {
+            if (llm->enable_vision && conditioner_params.ref_images != nullptr && !conditioner_params.ref_images->empty()) {
                 LOG_INFO("QwenImageEditPlusPipeline");
                 prompt_template_encode_start_idx = 64;
                 int image_embed_idx              = 64 + 6;
@@ -1997,13 +1815,13 @@ struct LLMEmbedder : public Conditioner {
                 std::string placeholder = "<|image_pad|>";
                 std::string img_prompt;
 
-                for (int i = 0; i < conditioner_params.ref_images.size(); i++) {
-                    sd_image_f32_t image = sd_image_t_to_sd_image_f32_t(*conditioner_params.ref_images[i]);
-                    double factor        = llm->params.vision.patch_size * llm->params.vision.spatial_merge_size;
-                    int height           = image.height;
-                    int width            = image.width;
-                    int h_bar            = static_cast<int>(std::round(height / factor) * factor);
-                    int w_bar            = static_cast<int>(std::round(width / factor) * factor);
+                for (int i = 0; i < conditioner_params.ref_images->size(); i++) {
+                    const auto& image = (*conditioner_params.ref_images)[i];
+                    double factor     = llm->params.vision.patch_size * llm->params.vision.spatial_merge_size;
+                    int height        = static_cast<int>(image.shape()[1]);
+                    int width         = static_cast<int>(image.shape()[0]);
+                    int h_bar         = static_cast<int>(std::round(height / factor) * factor);
+                    int w_bar         = static_cast<int>(std::round(width / factor) * factor);
 
                     if (static_cast<double>(h_bar) * w_bar > max_pixels) {
                         double beta = std::sqrt((height * width) / static_cast<double>(max_pixels));
@@ -2017,24 +1835,17 @@ struct LLMEmbedder : public Conditioner {
                         w_bar       = static_cast<int>(std::ceil(width * beta / factor)) * static_cast<int>(factor);
                     }
 
-                    LOG_DEBUG("resize conditioner ref image %d from %dx%d to %dx%d", i, image.height, image.width, h_bar, w_bar);
+                    LOG_DEBUG("resize conditioner ref image %d from %dx%d to %dx%d", i, height, width, h_bar, w_bar);
 
-                    sd_image_f32_t resized_image = clip_preprocess(image, w_bar, h_bar);
-                    free(image.data);
-                    image.data = nullptr;
+                    auto resized_image = clip_preprocess(image, w_bar, h_bar);
 
-                    ggml_tensor* image_tensor = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, resized_image.width, resized_image.height, 3, 1);
-                    sd_image_f32_to_ggml_tensor(resized_image, image_tensor, false);
-                    free(resized_image.data);
-                    resized_image.data = nullptr;
-
-                    ggml_tensor* image_embed = nullptr;
-                    llm->encode_image(n_threads, image_tensor, &image_embed, work_ctx);
+                    auto image_embed = llm->encode_image(n_threads, resized_image);
+                    GGML_ASSERT(!image_embed.empty());
                     image_embeds.emplace_back(image_embed_idx, image_embed);
-                    image_embed_idx += 1 + static_cast<int>(image_embed->ne[1]) + 6;
+                    image_embed_idx += 1 + static_cast<int>(image_embed.shape()[1]) + 6;
 
                     img_prompt += "Picture " + std::to_string(i + 1) + ": <|vision_start|>";  // [24669, 220, index, 25, 220, 151652]
-                    int64_t num_image_tokens = image_embed->ne[1];
+                    int64_t num_image_tokens = image_embed.shape()[1];
                     img_prompt.reserve(num_image_tokens * placeholder.size());
                     for (int j = 0; j < num_image_tokens; j++) {
                         img_prompt += placeholder;
@@ -2077,10 +1888,10 @@ struct LLMEmbedder : public Conditioner {
             prompt_template_encode_start_idx = 0;
             out_layers                       = {35};  // -2
 
-            if (!conditioner_params.ref_images.empty()) {
+            if (conditioner_params.ref_images != nullptr && !conditioner_params.ref_images->empty()) {
                 LOG_INFO("ZImageOmniPipeline");
                 prompt = "<|im_start|>user\n<|vision_start|>";
-                for (int i = 0; i < conditioner_params.ref_images.size() - 1; i++) {
+                for (int i = 0; i < conditioner_params.ref_images->size() - 1; i++) {
                     extra_prompts.push_back("<|vision_end|><|vision_start|>");
                 }
                 extra_prompts.push_back("<|vision_end|>" + conditioner_params.text + "<|im_end|>\n<|im_start|>assistant\n<|vision_start|>");
@@ -2121,8 +1932,7 @@ struct LLMEmbedder : public Conditioner {
             GGML_ABORT("unknown version %d", version);
         }
 
-        auto hidden_states = encode_prompt(work_ctx,
-                                           n_threads,
+        auto hidden_states = encode_prompt(n_threads,
                                            prompt,
                                            prompt_attn_range,
                                            max_length,
@@ -2130,11 +1940,9 @@ struct LLMEmbedder : public Conditioner {
                                            image_embeds,
                                            out_layers,
                                            prompt_template_encode_start_idx);
-
-        std::vector<ggml_tensor*> extra_hidden_states_vec;
+        std::vector<sd::Tensor<float>> extra_hidden_states_vec;
         for (int i = 0; i < extra_prompts.size(); i++) {
-            auto extra_hidden_states = encode_prompt(work_ctx,
-                                                     n_threads,
+            auto extra_hidden_states = encode_prompt(n_threads,
                                                      extra_prompts[i],
                                                      extra_prompts_attn_range[i],
                                                      max_length,
@@ -2142,12 +1950,15 @@ struct LLMEmbedder : public Conditioner {
                                                      image_embeds,
                                                      out_layers,
                                                      prompt_template_encode_start_idx);
-            extra_hidden_states_vec.push_back(extra_hidden_states);
+            extra_hidden_states_vec.push_back(std::move(extra_hidden_states));
         }
 
         int64_t t1 = ggml_time_ms();
         LOG_DEBUG("computing condition graph completed, taking %" PRId64 " ms", t1 - t0);
-        return {hidden_states, nullptr, nullptr, extra_hidden_states_vec};
+        SDCondition result;
+        result.c_crossattn        = std::move(hidden_states);
+        result.extra_c_crossattns = std::move(extra_hidden_states_vec);
+        return result;
     }
 };
 
diff --git a/src/control.hpp b/src/control.hpp
index 93df10a..d227ec9 100644
--- a/src/control.hpp
+++ b/src/control.hpp
@@ -310,11 +310,13 @@ struct ControlNet : public GGMLRunner {
     SDVersion version = VERSION_SD1;
     ControlNetBlock control_net;
 
-    ggml_backend_buffer_t control_buffer = nullptr;  // keep control output tensors in backend memory
+    ggml_backend_buffer_t control_buffer = nullptr;  // backend buffer allocated for the tensors created in control_ctx
     ggml_context* control_ctx            = nullptr;
-    std::vector<ggml_tensor*> controls;  // (12 input block outputs, 1 middle block output) SD 1.5
-    ggml_tensor* guided_hint = nullptr;  // guided_hint cache, for faster inference
-    bool guided_hint_cached  = false;
+    std::vector<ggml_tensor*> control_outputs_ggml;  // ggml_cpy destinations for outs[1..] in build_graph
+    ggml_tensor* guided_hint_output_ggml = nullptr;  // ggml_cpy destination for outs[0] in build_graph
+    std::vector<sd::Tensor<float>> controls;  // sd::Tensor copies of control_outputs_ggml, rebuilt by each successful compute()
+    sd::Tensor<float> guided_hint;  // sd::Tensor copy of guided_hint_output_ggml; reused as a graph input once cached
+    bool guided_hint_cached = false;  // set by compute() on success; reset when the control context is freed
 
     ControlNet(ggml_backend_t backend,
                bool offload_params_to_cpu,
@@ -335,16 +337,16 @@ struct ControlNet : public GGMLRunner {
         params.no_alloc   = true;
         control_ctx       = ggml_init(params);
 
-        controls.resize(outs.size() - 1);
+        control_outputs_ggml.resize(outs.size() - 1);
 
         size_t control_buffer_size = 0;
 
-        guided_hint = ggml_dup_tensor(control_ctx, outs[0]);
-        control_buffer_size += ggml_nbytes(guided_hint);
+        guided_hint_output_ggml = ggml_dup_tensor(control_ctx, outs[0]);
+        control_buffer_size += ggml_nbytes(guided_hint_output_ggml);
 
         for (int i = 0; i < outs.size() - 1; i++) {
-            controls[i] = ggml_dup_tensor(control_ctx, outs[i + 1]);
-            control_buffer_size += ggml_nbytes(controls[i]);
+            control_outputs_ggml[i] = ggml_dup_tensor(control_ctx, outs[i + 1]);
+            control_buffer_size += ggml_nbytes(control_outputs_ggml[i]);
         }
 
         control_buffer = ggml_backend_alloc_ctx_tensors(control_ctx, runtime_backend);
@@ -361,8 +363,10 @@ struct ControlNet : public GGMLRunner {
             ggml_free(control_ctx);
             control_ctx = nullptr;
         }
-        guided_hint        = nullptr;
-        guided_hint_cached = false;
+        guided_hint_output_ggml = nullptr;
+        guided_hint_cached      = false;
+        guided_hint             = {};
+        control_outputs_ggml.clear();
         controls.clear();
     }
 
@@ -374,29 +378,33 @@ struct ControlNet : public GGMLRunner {
         control_net.get_param_tensors(tensors, prefix);
     }
 
-    ggml_cgraph* build_graph(ggml_tensor* x,
-                             ggml_tensor* hint,
-                             ggml_tensor* timesteps,
-                             ggml_tensor* context,
-                             ggml_tensor* y = nullptr) {
+    ggml_cgraph* build_graph(const sd::Tensor<float>& x_tensor,
+                             const sd::Tensor<float>& hint_tensor,
+                             const sd::Tensor<float>& timesteps_tensor,
+                             const sd::Tensor<float>& context_tensor = {},
+                             const sd::Tensor<float>& y_tensor       = {}) {
         ggml_cgraph* gf = new_graph_custom(CONTROL_NET_GRAPH_SIZE);
 
-        x = to_backend(x);
-        if (guided_hint_cached) {
-            hint = nullptr;
+        ggml_tensor* x         = make_input(x_tensor);
+        ggml_tensor* hint      = nullptr;
+        ggml_tensor* timesteps = make_input(timesteps_tensor);
+        ggml_tensor* context   = make_optional_input(context_tensor);
+        ggml_tensor* y         = make_optional_input(y_tensor);
+
+        ggml_tensor* guided_hint_input = nullptr;
+        if (guided_hint_cached && !guided_hint.empty()) {
+            guided_hint_input = make_input(guided_hint);
+            hint              = nullptr;
         } else {
-            hint = to_backend(hint);
+            hint = make_input(hint_tensor);
         }
-        context   = to_backend(context);
-        y         = to_backend(y);
-        timesteps = to_backend(timesteps);
 
         auto runner_ctx = get_context();
 
         auto outs = control_net.forward(&runner_ctx,
                                         x,
                                         hint,
-                                        guided_hint_cached ? guided_hint : nullptr,
+                                        guided_hint_input,
                                         timesteps,
                                         context,
                                         y);
@@ -405,22 +413,20 @@ struct ControlNet : public GGMLRunner {
             alloc_control_ctx(outs);
         }
 
-        ggml_build_forward_expand(gf, ggml_cpy(compute_ctx, outs[0], guided_hint));
+        ggml_build_forward_expand(gf, ggml_cpy(compute_ctx, outs[0], guided_hint_output_ggml));
         for (int i = 0; i < outs.size() - 1; i++) {
-            ggml_build_forward_expand(gf, ggml_cpy(compute_ctx, outs[i + 1], controls[i]));
+            ggml_build_forward_expand(gf, ggml_cpy(compute_ctx, outs[i + 1], control_outputs_ggml[i]));
         }
 
         return gf;
     }
 
-    bool compute(int n_threads,
-                 ggml_tensor* x,
-                 ggml_tensor* hint,
-                 ggml_tensor* timesteps,
-                 ggml_tensor* context,
-                 ggml_tensor* y,
-                 ggml_tensor** output     = nullptr,
-                 ggml_context* output_ctx = nullptr) {
+    std::optional<std::vector<sd::Tensor<float>>> compute(int n_threads,
+                                                          const sd::Tensor<float>& x,
+                                                          const sd::Tensor<float>& hint,
+                                                          const sd::Tensor<float>& timesteps,
+                                                          const sd::Tensor<float>& context = {},
+                                                          const sd::Tensor<float>& y       = {}) {
         // x: [N, in_channels, h, w]
         // timesteps: [N, ]
         // context: [N, max_position, hidden_size]([N, 77, 768]) or [1, max_position, hidden_size]
@@ -429,12 +435,24 @@ struct ControlNet : public GGMLRunner {
             return build_graph(x, hint, timesteps, context, y);
         };
 
-        bool res = GGMLRunner::compute(get_graph, n_threads, false, output, output_ctx);
-        if (res) {
-            // cache guided_hint
-            guided_hint_cached = true;
+        auto compute_result = GGMLRunner::compute<float>(get_graph, n_threads, false);
+        if (!compute_result.has_value()) {
+            return std::nullopt;
         }
-        return res;
+
+        if (guided_hint_output_ggml != nullptr) {
+            guided_hint = restore_trailing_singleton_dims(sd::make_sd_tensor_from_ggml<float>(guided_hint_output_ggml),
+                                                          4);
+        }
+        controls.clear();
+        controls.reserve(control_outputs_ggml.size());
+        for (ggml_tensor* control : control_outputs_ggml) {
+            auto control_host = restore_trailing_singleton_dims(sd::make_sd_tensor_from_ggml<float>(control), 4);
+            GGML_ASSERT(!control_host.empty());
+            controls.push_back(std::move(control_host));
+        }
+        guided_hint_cached = true;
+        return controls;
     }
 
     bool load_from_file(const std::string& file_path, int n_threads) {
@@ -462,4 +480,4 @@ struct ControlNet : public GGMLRunner {
     }
 };
 
-#endif  // __CONTROL_HPP__
\ No newline at end of file
+#endif  // __CONTROL_HPP__
diff --git a/src/denoiser.hpp b/src/denoiser.hpp
index b92ca4e..077a1b7 100644
--- a/src/denoiser.hpp
+++ b/src/denoiser.hpp
@@ -5,6 +5,7 @@
 
 #include "ggml_extend.hpp"
 #include "gits_noise.inl"
+#include "tensor.hpp"
 
 /*================================================= CompVisDenoiser ==================================================*/
 
@@ -73,9 +74,9 @@ constexpr double interp(double left, double right, double perc) noexcept {
 /* This will make the assumption that the reference x and y values are
  * already sorted in ascending order because they are being generated as
  * such in the calling function */
-std::vector<double> linear_interp(std::vector<float> new_x,
-                                  const std::vector<float> ref_x,
-                                  const std::vector<float> ref_y) {
+inline std::vector<double> linear_interp(std::vector<float> new_x,
+                                         const std::vector<float> ref_x,
+                                         const std::vector<float> ref_y) {
     const size_t len_x = new_x.size();
     size_t i           = 0;
     size_t j           = 0;
@@ -109,7 +110,7 @@ std::vector<double> linear_interp(std::vector<float> new_x,
     return new_y;
 }
 
-std::vector<float> linear_space(const float start, const float end, const size_t num_points) {
+inline std::vector<float> linear_space(const float start, const float end, const size_t num_points) {
     std::vector<float> result(num_points);
     const float inc = (end - start) / (static_cast<float>(num_points - 1));
 
@@ -124,8 +125,8 @@ std::vector<float> linear_space(const float start, const float end, const size_t
     return result;
 }
 
-std::vector<float> log_linear_interpolation(std::vector<float> sigma_in,
-                                            const size_t new_len) {
+inline std::vector<float> log_linear_interpolation(std::vector<float> sigma_in,
+                                                   const size_t new_len) {
     const size_t s_len        = sigma_in.size();
     std::vector<float> x_vals = linear_space(0.f, 1.f, s_len);
     std::vector<float> y_vals(s_len);
@@ -478,13 +479,16 @@ struct KLOptimalScheduler : SigmaScheduler {
 };
 
 struct Denoiser {
-    virtual float sigma_min()                                                                = 0;
-    virtual float sigma_max()                                                                = 0;
-    virtual float sigma_to_t(float sigma)                                                    = 0;
-    virtual float t_to_sigma(float t)                                                        = 0;
-    virtual std::vector<float> get_scalings(float sigma)                                     = 0;
-    virtual ggml_tensor* noise_scaling(float sigma, ggml_tensor* noise, ggml_tensor* latent) = 0;
-    virtual ggml_tensor* inverse_noise_scaling(float sigma, ggml_tensor* latent)             = 0;
+    virtual float sigma_min()                                                        = 0;
+    virtual float sigma_max()                                                        = 0;
+    virtual float sigma_to_t(float sigma)                                            = 0;
+    virtual float t_to_sigma(float t)                                                = 0;
+    virtual std::vector<float> get_scalings(float sigma)                             = 0;
+    virtual sd::Tensor<float> noise_scaling(float sigma,
+                                            const sd::Tensor<float>& noise,
+                                            const sd::Tensor<float>& latent)         = 0;
+    virtual sd::Tensor<float> inverse_noise_scaling(float sigma,
+                                                    const sd::Tensor<float>& latent) = 0;
 
     virtual std::vector<float> get_sigmas(uint32_t n, int /*image_seq_len*/, scheduler_t scheduler_type, SDVersion version) {
         auto bound_t_to_sigma = std::bind(&Denoiser::t_to_sigma, this, std::placeholders::_1);
@@ -598,14 +602,15 @@ struct CompVisDenoiser : public Denoiser {
         return {c_skip, c_out, c_in};
     }
 
-    // this function will modify noise/latent
-    ggml_tensor* noise_scaling(float sigma, ggml_tensor* noise, ggml_tensor* latent) override {
-        ggml_ext_tensor_scale_inplace(noise, sigma);
-        ggml_ext_tensor_add_inplace(latent, noise);
-        return latent;
+    sd::Tensor<float> noise_scaling(float sigma,
+                                    const sd::Tensor<float>& noise,
+                                    const sd::Tensor<float>& latent) override {
+        GGML_ASSERT(noise.numel() == latent.numel());  // only the element counts are compared here
+        return latent + noise * sigma;  // result is returned by value; both inputs are const references
     }
 
-    ggml_tensor* inverse_noise_scaling(float sigma, ggml_tensor* latent) override {
+    sd::Tensor<float> inverse_noise_scaling(float sigma, const sd::Tensor<float>& latent) override {
+        SD_UNUSED(sigma);  // sigma plays no part here: the latent is returned as-is
         return latent;
     }
 };
@@ -644,7 +649,7 @@ struct EDMVDenoiser : public CompVisVDenoiser {
     }
 };
 
-float time_snr_shift(float alpha, float t) {
+inline float time_snr_shift(float alpha, float t) {
     if (alpha == 1.0f) {
         return t;
     }
@@ -696,21 +701,18 @@ struct DiscreteFlowDenoiser : public Denoiser {
         return {c_skip, c_out, c_in};
     }
 
-    // this function will modify noise/latent
-    ggml_tensor* noise_scaling(float sigma, ggml_tensor* noise, ggml_tensor* latent) override {
-        ggml_ext_tensor_scale_inplace(noise, sigma);
-        ggml_ext_tensor_scale_inplace(latent, 1.0f - sigma);
-        ggml_ext_tensor_add_inplace(latent, noise);
-        return latent;
+    sd::Tensor<float> noise_scaling(float sigma,
+                                    const sd::Tensor<float>& noise,
+                                    const sd::Tensor<float>& latent) override {
+        GGML_ASSERT(noise.numel() == latent.numel());  // only the element counts are compared here
+        return latent * (1.0f - sigma) + noise * sigma;  // blend of latent and noise weighted by (1 - sigma) and sigma; returned by value
     }
-
-    ggml_tensor* inverse_noise_scaling(float sigma, ggml_tensor* latent) override {
-        ggml_ext_tensor_scale_inplace(latent, 1.0f / (1.0f - sigma));
-        return latent;
+    sd::Tensor<float> inverse_noise_scaling(float sigma, const sd::Tensor<float>& latent) override {
+        return latent * (1.0f / (1.0f - sigma));  // NOTE(review): divisor is (1 - sigma); sigma == 1.0f yields a non-finite scale — confirm callers never pass it
     }
 };
 
-float flux_time_shift(float mu, float sigma, float t) {
+inline float flux_time_shift(float mu, float sigma, float t) {  // exp(mu) / (exp(mu) + (1/t - 1)^sigma); NOTE(review): t == 0 divides by zero — confirm callers
     return ::expf(mu) / (::expf(mu) + ::powf((1.0f / t - 1.0f), sigma));
 }
 
@@ -759,938 +761,289 @@ struct Flux2FlowDenoiser : public FluxFlowDenoiser {
     }
 };
 
-typedef std::function<ggml_tensor*(ggml_tensor*, float, int)> denoise_cb_t;
+typedef std::function<sd::Tensor<float>(const sd::Tensor<float>&, float, int)> denoise_cb_t;  // invoked as model(x, sigma, step); samplers treat an empty() result as failure and return {}
 
 // k diffusion reverse ODE: dx = (x - D(x;\sigma)) / \sigma dt; \sigma(t) = t
-static bool sample_k_diffusion(sample_method_t method,
-                               denoise_cb_t model,
-                               ggml_context* work_ctx,
-                               ggml_tensor* x,
-                               std::vector<float> sigmas,
-                               std::shared_ptr<RNG> rng,
-                               float eta) {
+static sd::Tensor<float> sample_k_diffusion(sample_method_t method,
+                                            denoise_cb_t model,
+                                            sd::Tensor<float> x,
+                                            std::vector<float> sigmas,
+                                            std::shared_ptr<RNG> rng,
+                                            float eta) {
     size_t steps = sigmas.size() - 1;
-    // sample_euler_ancestral
     switch (method) {
         case EULER_A_SAMPLE_METHOD: {
-            ggml_tensor* noise = ggml_dup_tensor(work_ctx, x);
-            ggml_tensor* d     = ggml_dup_tensor(work_ctx, x);
-
             for (int i = 0; i < steps; i++) {
-                float sigma = sigmas[i];
-
-                // denoise
-                ggml_tensor* denoised = model(x, sigma, i + 1);
-                if (denoised == nullptr) {
-                    return false;
+                float sigma       = sigmas[i];
+                auto denoised_opt = model(x, sigma, i + 1);
+                if (denoised_opt.empty()) {
+                    return {};
                 }
-
-                // d = (x - denoised) / sigma
-                {
-                    float* vec_d        = (float*)d->data;
-                    float* vec_x        = (float*)x->data;
-                    float* vec_denoised = (float*)denoised->data;
-
-                    for (int i = 0; i < ggml_nelements(d); i++) {
-                        vec_d[i] = (vec_x[i] - vec_denoised[i]) / sigma;
-                    }
-                }
-
-                // get_ancestral_step
-                float sigma_up   = std::min(sigmas[i + 1],
-                                            std::sqrt(sigmas[i + 1] * sigmas[i + 1] * (sigmas[i] * sigmas[i] - sigmas[i + 1] * sigmas[i + 1]) / (sigmas[i] * sigmas[i])));
-                float sigma_down = std::sqrt(sigmas[i + 1] * sigmas[i + 1] - sigma_up * sigma_up);
-
-                // Euler method
-                float dt = sigma_down - sigmas[i];
-                // x = x + d * dt
-                {
-                    float* vec_d = (float*)d->data;
-                    float* vec_x = (float*)x->data;
-
-                    for (int i = 0; i < ggml_nelements(x); i++) {
-                        vec_x[i] = vec_x[i] + vec_d[i] * dt;
-                    }
-                }
-
+                sd::Tensor<float> denoised = std::move(denoised_opt);
+                sd::Tensor<float> d        = (x - denoised) / sigma;
+                float sigma_up             = std::min(sigmas[i + 1],
+                                                      std::sqrt(sigmas[i + 1] * sigmas[i + 1] * (sigmas[i] * sigmas[i] - sigmas[i + 1] * sigmas[i + 1]) / (sigmas[i] * sigmas[i])));
+                float sigma_down           = std::sqrt(sigmas[i + 1] * sigmas[i + 1] - sigma_up * sigma_up);
+                float dt                   = sigma_down - sigmas[i];
+                x += d * dt;
                 if (sigmas[i + 1] > 0) {
-                    // x = x + noise_sampler(sigmas[i], sigmas[i + 1]) * s_noise * sigma_up
-                    ggml_ext_im_set_randn_f32(noise, rng);
-                    // noise = load_tensor_from_file(work_ctx, "./rand" + std::to_string(i+1) + ".bin");
-                    {
-                        float* vec_x     = (float*)x->data;
-                        float* vec_noise = (float*)noise->data;
-
-                        for (int i = 0; i < ggml_nelements(x); i++) {
-                            vec_x[i] = vec_x[i] + vec_noise[i] * sigma_up;
-                        }
-                    }
+                    x += sd::Tensor<float>::randn_like(x, rng) * sigma_up;
                 }
             }
-        } break;
-        case EULER_SAMPLE_METHOD:  // Implemented without any sigma churn
-        {
-            ggml_tensor* d = ggml_dup_tensor(work_ctx, x);
-
+            return x;
+        }
+        case EULER_SAMPLE_METHOD: {
             for (int i = 0; i < steps; i++) {
-                float sigma = sigmas[i];
-
-                // denoise
-                ggml_tensor* denoised = model(x, sigma, i + 1);
-                if (denoised == nullptr) {
-                    return false;
-                }
-
-                // d = (x - denoised) / sigma
-                {
-                    float* vec_d        = (float*)d->data;
-                    float* vec_x        = (float*)x->data;
-                    float* vec_denoised = (float*)denoised->data;
-
-                    for (int j = 0; j < ggml_nelements(d); j++) {
-                        vec_d[j] = (vec_x[j] - vec_denoised[j]) / sigma;
-                    }
-                }
-
-                float dt = sigmas[i + 1] - sigma;
-                // x = x + d * dt
-                {
-                    float* vec_d = (float*)d->data;
-                    float* vec_x = (float*)x->data;
-
-                    for (int j = 0; j < ggml_nelements(x); j++) {
-                        vec_x[j] = vec_x[j] + vec_d[j] * dt;
-                    }
+                float sigma       = sigmas[i];
+                auto denoised_opt = model(x, sigma, i + 1);
+                if (denoised_opt.empty()) {
+                    return {};
                 }
+                sd::Tensor<float> denoised = std::move(denoised_opt);
+                sd::Tensor<float> d        = (x - denoised) / sigma;
+                float dt                   = sigmas[i + 1] - sigma;
+                x += d * dt;
             }
-        } break;
+            return x;
+        }
         case HEUN_SAMPLE_METHOD: {
-            ggml_tensor* d  = ggml_dup_tensor(work_ctx, x);
-            ggml_tensor* x2 = ggml_dup_tensor(work_ctx, x);
-
             for (int i = 0; i < steps; i++) {
-                // denoise
-                ggml_tensor* denoised = model(x, sigmas[i], -(i + 1));
-                if (denoised == nullptr) {
-                    return false;
+                auto denoised_opt = model(x, sigmas[i], -(i + 1));
+                if (denoised_opt.empty()) {
+                    return {};
                 }
-
-                // d = (x - denoised) / sigma
-                {
-                    float* vec_d        = (float*)d->data;
-                    float* vec_x        = (float*)x->data;
-                    float* vec_denoised = (float*)denoised->data;
-
-                    for (int j = 0; j < ggml_nelements(x); j++) {
-                        vec_d[j] = (vec_x[j] - vec_denoised[j]) / sigmas[i];
-                    }
-                }
-
-                float dt = sigmas[i + 1] - sigmas[i];
+                sd::Tensor<float> denoised = std::move(denoised_opt);
+                sd::Tensor<float> d        = (x - denoised) / sigmas[i];
+                float dt                   = sigmas[i + 1] - sigmas[i];
                 if (sigmas[i + 1] == 0) {
-                    // Euler step
-                    // x = x + d * dt
-                    float* vec_d = (float*)d->data;
-                    float* vec_x = (float*)x->data;
-
-                    for (int j = 0; j < ggml_nelements(x); j++) {
-                        vec_x[j] = vec_x[j] + vec_d[j] * dt;
-                    }
+                    x += d * dt;
                 } else {
-                    // Heun step
-                    float* vec_d  = (float*)d->data;
-                    float* vec_d2 = (float*)d->data;
-                    float* vec_x  = (float*)x->data;
-                    float* vec_x2 = (float*)x2->data;
-
-                    for (int j = 0; j < ggml_nelements(x); j++) {
-                        vec_x2[j] = vec_x[j] + vec_d[j] * dt;
-                    }
-
-                    ggml_tensor* denoised = model(x2, sigmas[i + 1], i + 1);
-                    if (denoised == nullptr) {
-                        return false;
-                    }
-                    float* vec_denoised = (float*)denoised->data;
-                    for (int j = 0; j < ggml_nelements(x); j++) {
-                        float d2 = (vec_x2[j] - vec_denoised[j]) / sigmas[i + 1];
-                        vec_d[j] = (vec_d[j] + d2) / 2;
-                        vec_x[j] = vec_x[j] + vec_d[j] * dt;
+                    sd::Tensor<float> x2 = x + d * dt;
+                    auto denoised2_opt   = model(x2, sigmas[i + 1], i + 1);
+                    if (denoised2_opt.empty()) {
+                        return {};
                     }
+                    sd::Tensor<float> denoised2 = std::move(denoised2_opt);
+                    d                           = (d + (x2 - denoised2) / sigmas[i + 1]) / 2.0f;
+                    x += d * dt;
                 }
             }
-        } break;
+            return x;
+        }
         case DPM2_SAMPLE_METHOD: {
-            ggml_tensor* d  = ggml_dup_tensor(work_ctx, x);
-            ggml_tensor* x2 = ggml_dup_tensor(work_ctx, x);
-
             for (int i = 0; i < steps; i++) {
-                // denoise
-                ggml_tensor* denoised = model(x, sigmas[i], -(i + 1));
-                if (denoised == nullptr) {
-                    return false;
+                auto denoised_opt = model(x, sigmas[i], -(i + 1));
+                if (denoised_opt.empty()) {
+                    return {};
                 }
-
-                // d = (x - denoised) / sigma
-                {
-                    float* vec_d        = (float*)d->data;
-                    float* vec_x        = (float*)x->data;
-                    float* vec_denoised = (float*)denoised->data;
-
-                    for (int j = 0; j < ggml_nelements(x); j++) {
-                        vec_d[j] = (vec_x[j] - vec_denoised[j]) / sigmas[i];
-                    }
-                }
-
+                sd::Tensor<float> denoised = std::move(denoised_opt);
+                sd::Tensor<float> d        = (x - denoised) / sigmas[i];
                 if (sigmas[i + 1] == 0) {
-                    // Euler step
-                    // x = x + d * dt
-                    float dt     = sigmas[i + 1] - sigmas[i];
-                    float* vec_d = (float*)d->data;
-                    float* vec_x = (float*)x->data;
-
-                    for (int j = 0; j < ggml_nelements(x); j++) {
-                        vec_x[j] = vec_x[j] + vec_d[j] * dt;
-                    }
+                    float dt = sigmas[i + 1] - sigmas[i];
+                    x += d * dt;
                 } else {
-                    // DPM-Solver-2
-                    float sigma_mid = exp(0.5f * (log(sigmas[i]) + log(sigmas[i + 1])));
-                    float dt_1      = sigma_mid - sigmas[i];
-                    float dt_2      = sigmas[i + 1] - sigmas[i];
-
-                    float* vec_d  = (float*)d->data;
-                    float* vec_x  = (float*)x->data;
-                    float* vec_x2 = (float*)x2->data;
-                    for (int j = 0; j < ggml_nelements(x); j++) {
-                        vec_x2[j] = vec_x[j] + vec_d[j] * dt_1;
-                    }
-
-                    ggml_tensor* denoised = model(x2, sigma_mid, i + 1);
-                    if (denoised == nullptr) {
-                        return false;
-                    }
-                    float* vec_denoised = (float*)denoised->data;
-                    for (int j = 0; j < ggml_nelements(x); j++) {
-                        float d2 = (vec_x2[j] - vec_denoised[j]) / sigma_mid;
-                        vec_x[j] = vec_x[j] + d2 * dt_2;
+                    float sigma_mid      = exp(0.5f * (log(sigmas[i]) + log(sigmas[i + 1])));
+                    float dt_1           = sigma_mid - sigmas[i];
+                    float dt_2           = sigmas[i + 1] - sigmas[i];
+                    sd::Tensor<float> x2 = x + d * dt_1;
+                    auto denoised2_opt   = model(x2, sigma_mid, i + 1);
+                    if (denoised2_opt.empty()) {
+                        return {};
                     }
+                    sd::Tensor<float> denoised2 = std::move(denoised2_opt);
+                    x += ((x2 - denoised2) / sigma_mid) * dt_2;
                 }
             }
-
-        } break;
+            return x;
+        }
         case DPMPP2S_A_SAMPLE_METHOD: {
-            ggml_tensor* noise = ggml_dup_tensor(work_ctx, x);
-            ggml_tensor* x2    = ggml_dup_tensor(work_ctx, x);
-
             for (int i = 0; i < steps; i++) {
-                // denoise
-                ggml_tensor* denoised = model(x, sigmas[i], -(i + 1));
-                if (denoised == nullptr) {
-                    return false;
+                auto denoised_opt = model(x, sigmas[i], -(i + 1));
+                if (denoised_opt.empty()) {
+                    return {};
                 }
-
-                // get_ancestral_step
-                float sigma_up   = std::min(sigmas[i + 1],
-                                            std::sqrt(sigmas[i + 1] * sigmas[i + 1] * (sigmas[i] * sigmas[i] - sigmas[i + 1] * sigmas[i + 1]) / (sigmas[i] * sigmas[i])));
-                float sigma_down = std::sqrt(sigmas[i + 1] * sigmas[i + 1] - sigma_up * sigma_up);
-                auto t_fn        = [](float sigma) -> float { return -log(sigma); };
-                auto sigma_fn    = [](float t) -> float { return exp(-t); };
+                sd::Tensor<float> denoised = std::move(denoised_opt);
+                float sigma_up             = std::min(sigmas[i + 1],
+                                                      std::sqrt(sigmas[i + 1] * sigmas[i + 1] * (sigmas[i] * sigmas[i] - sigmas[i + 1] * sigmas[i + 1]) / (sigmas[i] * sigmas[i])));
+                float sigma_down           = std::sqrt(sigmas[i + 1] * sigmas[i + 1] - sigma_up * sigma_up);
+                auto t_fn                  = [](float sigma) -> float { return -log(sigma); };
+                auto sigma_fn              = [](float t) -> float { return exp(-t); };
 
                 if (sigma_down == 0) {
-                    // d = (x - denoised) / sigmas[i];
-                    // dt = sigma_down - sigmas[i];
-                    // x += d * dt;
-                    // => x = denoised
-                    float* vec_x        = (float*)x->data;
-                    float* vec_denoised = (float*)denoised->data;
-
-                    for (int j = 0; j < ggml_nelements(x); j++) {
-                        vec_x[j] = vec_denoised[j];
-                    }
+                    x = denoised;
                 } else {
-                    // DPM-Solver++(2S)
-                    float t      = t_fn(sigmas[i]);
-                    float t_next = t_fn(sigma_down);
-                    float h      = t_next - t;
-                    float s      = t + 0.5f * h;
-
-                    float* vec_x        = (float*)x->data;
-                    float* vec_x2       = (float*)x2->data;
-                    float* vec_denoised = (float*)denoised->data;
-
-                    // First half-step
-                    for (int j = 0; j < ggml_nelements(x); j++) {
-                        vec_x2[j] = (sigma_fn(s) / sigma_fn(t)) * vec_x[j] - (exp(-h * 0.5f) - 1) * vec_denoised[j];
-                    }
-
-                    ggml_tensor* denoised = model(x2, sigmas[i + 1], i + 1);
-                    if (denoised == nullptr) {
-                        return false;
-                    }
-
-                    // Second half-step
-                    for (int j = 0; j < ggml_nelements(x); j++) {
-                        vec_x[j] = (sigma_fn(t_next) / sigma_fn(t)) * vec_x[j] - (exp(-h) - 1) * vec_denoised[j];
+                    float t              = t_fn(sigmas[i]);
+                    float t_next         = t_fn(sigma_down);
+                    float h              = t_next - t;
+                    float s              = t + 0.5f * h;
+                    sd::Tensor<float> x2 = (sigma_fn(s) / sigma_fn(t)) * x - (exp(-h * 0.5f) - 1) * denoised;
+                    auto denoised2_opt   = model(x2, sigmas[i + 1], i + 1);
+                    if (denoised2_opt.empty()) {
+                        return {};
                     }
+                    sd::Tensor<float> denoised2 = std::move(denoised2_opt);
+                    x                           = (sigma_fn(t_next) / sigma_fn(t)) * (x) - (exp(-h) - 1) * denoised2;
                 }
 
-                // Noise addition
                 if (sigmas[i + 1] > 0) {
-                    ggml_ext_im_set_randn_f32(noise, rng);
-                    {
-                        float* vec_x     = (float*)x->data;
-                        float* vec_noise = (float*)noise->data;
-
-                        for (int i = 0; i < ggml_nelements(x); i++) {
-                            vec_x[i] = vec_x[i] + vec_noise[i] * sigma_up;
-                        }
-                    }
+                    x += sd::Tensor<float>::randn_like(x, rng) * sigma_up;
                 }
             }
-        } break;
-        case DPMPP2M_SAMPLE_METHOD:  // DPM++ (2M) from Karras et al (2022)
-        {
-            ggml_tensor* old_denoised = ggml_dup_tensor(work_ctx, x);
-
-            auto t_fn = [](float sigma) -> float { return -log(sigma); };
-
+            return x;
+        }
+        case DPMPP2M_SAMPLE_METHOD: {
+            sd::Tensor<float> old_denoised = x;
+            auto t_fn                      = [](float sigma) -> float { return -log(sigma); };
             for (int i = 0; i < steps; i++) {
-                // denoise
-                ggml_tensor* denoised = model(x, sigmas[i], i + 1);
-                if (denoised == nullptr) {
-                    return false;
+                auto denoised_opt = model(x, sigmas[i], i + 1);
+                if (denoised_opt.empty()) {
+                    return {};
                 }
-
-                float t                 = t_fn(sigmas[i]);
-                float t_next            = t_fn(sigmas[i + 1]);
-                float h                 = t_next - t;
-                float a                 = sigmas[i + 1] / sigmas[i];
-                float b                 = exp(-h) - 1.f;
-                float* vec_x            = (float*)x->data;
-                float* vec_denoised     = (float*)denoised->data;
-                float* vec_old_denoised = (float*)old_denoised->data;
+                sd::Tensor<float> denoised = std::move(denoised_opt);
+                float t                    = t_fn(sigmas[i]);
+                float t_next               = t_fn(sigmas[i + 1]);
+                float h                    = t_next - t;
+                float a                    = sigmas[i + 1] / sigmas[i];
+                float b                    = exp(-h) - 1.f;
 
                 if (i == 0 || sigmas[i + 1] == 0) {
-                    // Simpler step for the edge cases
-                    for (int j = 0; j < ggml_nelements(x); j++) {
-                        vec_x[j] = a * vec_x[j] - b * vec_denoised[j];
-                    }
+                    x = a * (x)-b * denoised;
                 } else {
-                    float h_last = t - t_fn(sigmas[i - 1]);
-                    float r      = h_last / h;
-                    for (int j = 0; j < ggml_nelements(x); j++) {
-                        float denoised_d = (1.f + 1.f / (2.f * r)) * vec_denoised[j] - (1.f / (2.f * r)) * vec_old_denoised[j];
-                        vec_x[j]         = a * vec_x[j] - b * denoised_d;
-                    }
-                }
-
-                // old_denoised = denoised
-                for (int j = 0; j < ggml_nelements(x); j++) {
-                    vec_old_denoised[j] = vec_denoised[j];
+                    float h_last                 = t - t_fn(sigmas[i - 1]);
+                    float r                      = h_last / h;
+                    sd::Tensor<float> denoised_d = (1.f + 1.f / (2.f * r)) * denoised - (1.f / (2.f * r)) * old_denoised;
+                    x                            = a * (x)-b * denoised_d;
                 }
+                old_denoised = denoised;
             }
-        } break;
-        case DPMPP2Mv2_SAMPLE_METHOD:  // Modified DPM++ (2M) from https://github.com/AUTOMATIC1111/stable-diffusion-webui/discussions/8457
-        {
-            ggml_tensor* old_denoised = ggml_dup_tensor(work_ctx, x);
-
-            auto t_fn = [](float sigma) -> float { return -log(sigma); };
-
+            return x;
+        }
+        case DPMPP2Mv2_SAMPLE_METHOD: {
+            sd::Tensor<float> old_denoised = x;
+            auto t_fn                      = [](float sigma) -> float { return -log(sigma); };
             for (int i = 0; i < steps; i++) {
-                // denoise
-                ggml_tensor* denoised = model(x, sigmas[i], i + 1);
-                if (denoised == nullptr) {
-                    return false;
+                auto denoised_opt = model(x, sigmas[i], i + 1);
+                if (denoised_opt.empty()) {
+                    return {};
                 }
-
-                float t                 = t_fn(sigmas[i]);
-                float t_next            = t_fn(sigmas[i + 1]);
-                float h                 = t_next - t;
-                float a                 = sigmas[i + 1] / sigmas[i];
-                float* vec_x            = (float*)x->data;
-                float* vec_denoised     = (float*)denoised->data;
-                float* vec_old_denoised = (float*)old_denoised->data;
-
+                sd::Tensor<float> denoised = std::move(denoised_opt);
+                float t                    = t_fn(sigmas[i]);
+                float t_next               = t_fn(sigmas[i + 1]);
+                float h                    = t_next - t;
+                float a                    = sigmas[i + 1] / sigmas[i];
                 if (i == 0 || sigmas[i + 1] == 0) {
-                    // Simpler step for the edge cases
                     float b = exp(-h) - 1.f;
-                    for (int j = 0; j < ggml_nelements(x); j++) {
-                        vec_x[j] = a * vec_x[j] - b * vec_denoised[j];
-                    }
+                    x       = a * (x)-b * denoised;
                 } else {
-                    float h_last = t - t_fn(sigmas[i - 1]);
-                    float h_min  = std::min(h_last, h);
-                    float h_max  = std::max(h_last, h);
-                    float r      = h_max / h_min;
-                    float h_d    = (h_max + h_min) / 2.f;
-                    float b      = exp(-h_d) - 1.f;
-                    for (int j = 0; j < ggml_nelements(x); j++) {
-                        float denoised_d = (1.f + 1.f / (2.f * r)) * vec_denoised[j] - (1.f / (2.f * r)) * vec_old_denoised[j];
-                        vec_x[j]         = a * vec_x[j] - b * denoised_d;
-                    }
+                    float h_last                 = t - t_fn(sigmas[i - 1]);
+                    float h_min                  = std::min(h_last, h);
+                    float h_max                  = std::max(h_last, h);
+                    float r                      = h_max / h_min;
+                    float h_d                    = (h_max + h_min) / 2.f;
+                    float b                      = exp(-h_d) - 1.f;
+                    sd::Tensor<float> denoised_d = (1.f + 1.f / (2.f * r)) * denoised - (1.f / (2.f * r)) * old_denoised;
+                    x                            = a * (x)-b * denoised_d;
                 }
+                old_denoised = denoised;
+            }
+            return x;
+        }
+        case LCM_SAMPLE_METHOD: {
+            for (int i = 0; i < steps; i++) {
+                auto denoised_opt = model(x, sigmas[i], i + 1);
+                if (denoised_opt.empty()) {
+                    return {};
+                }
+                sd::Tensor<float> denoised = std::move(denoised_opt);
 
-                // old_denoised = denoised
-                for (int j = 0; j < ggml_nelements(x); j++) {
-                    vec_old_denoised[j] = vec_denoised[j];
+                x = denoised;
+                if (sigmas[i + 1] > 0) {
+                    x += sd::Tensor<float>::randn_like(x, rng) * sigmas[i + 1];
                 }
             }
-        } break;
-        case IPNDM_SAMPLE_METHOD:  // iPNDM sampler from https://github.com/zju-pi/diff-sampler/tree/main/diff-solvers-main
-        {
-            int max_order       = 4;
-            ggml_tensor* x_next = x;
-            std::vector<ggml_tensor*> buffer_model;
-
+            return x;
+        }
+        case IPNDM_SAMPLE_METHOD: {
+            int max_order                       = 4;
+            std::vector<sd::Tensor<float>> hist = {};
             for (int i = 0; i < steps; i++) {
                 float sigma      = sigmas[i];
                 float sigma_next = sigmas[i + 1];
 
-                ggml_tensor* x_cur = x_next;
-                float* vec_x_cur   = (float*)x_cur->data;
-                float* vec_x_next  = (float*)x_next->data;
-
-                // Denoising step
-                ggml_tensor* denoised = model(x_cur, sigma, i + 1);
-                if (denoised == nullptr) {
-                    return false;
+                auto denoised_opt = model(x, sigma, i + 1);
+                if (denoised_opt.empty()) {
+                    return {};
                 }
-                float* vec_denoised = (float*)denoised->data;
-                // d_cur = (x_cur - denoised) / sigma
-                ggml_tensor* d_cur = ggml_dup_tensor(work_ctx, x_cur);
-                float* vec_d_cur   = (float*)d_cur->data;
+                sd::Tensor<float> denoised = std::move(denoised_opt);
 
-                for (int j = 0; j < ggml_nelements(d_cur); j++) {
-                    vec_d_cur[j] = (vec_x_cur[j] - vec_denoised[j]) / sigma;
-                }
+                sd::Tensor<float> d_cur = (x - denoised) / sigma;
+                int order               = std::min(max_order, i + 1);
+                float dt                = sigma_next - sigma;
 
-                int order = std::min(max_order, i + 1);
-
-                // Calculate vec_x_next based on the order
                 switch (order) {
-                    case 1:  // First Euler step
-                        for (int j = 0; j < ggml_nelements(x_next); j++) {
-                            vec_x_next[j] = vec_x_cur[j] + (sigma_next - sigma) * vec_d_cur[j];
-                        }
+                    case 1:
+                        x += d_cur * dt;
+                        break;
+                    case 2:
+                        x += ((3.f * d_cur - hist.back()) / 2.f) * dt;
+                        break;
+                    case 3:
+                        x += ((23.f * d_cur - 16.f * hist[hist.size() - 1] + 5.f * hist[hist.size() - 2]) / 12.f) * dt;
+                        break;
+                    case 4:
+                        x += ((55.f * d_cur - 59.f * hist[hist.size() - 1] + 37.f * hist[hist.size() - 2] - 9.f * hist[hist.size() - 3]) / 24.f) * dt;
                         break;
-
-                    case 2:  // Use one history point
-                    {
-                        float* vec_d_prev1 = (float*)buffer_model.back()->data;
-                        for (int j = 0; j < ggml_nelements(x_next); j++) {
-                            vec_x_next[j] = vec_x_cur[j] + (sigma_next - sigma) * (3 * vec_d_cur[j] - vec_d_prev1[j]) / 2;
-                        }
-                    } break;
-
-                    case 3:  // Use two history points
-                    {
-                        float* vec_d_prev1 = (float*)buffer_model.back()->data;
-                        float* vec_d_prev2 = (float*)buffer_model[buffer_model.size() - 2]->data;
-                        for (int j = 0; j < ggml_nelements(x_next); j++) {
-                            vec_x_next[j] = vec_x_cur[j] + (sigma_next - sigma) * (23 * vec_d_cur[j] - 16 * vec_d_prev1[j] + 5 * vec_d_prev2[j]) / 12;
-                        }
-                    } break;
-
-                    case 4:  // Use three history points
-                    {
-                        float* vec_d_prev1 = (float*)buffer_model.back()->data;
-                        float* vec_d_prev2 = (float*)buffer_model[buffer_model.size() - 2]->data;
-                        float* vec_d_prev3 = (float*)buffer_model[buffer_model.size() - 3]->data;
-                        for (int j = 0; j < ggml_nelements(x_next); j++) {
-                            vec_x_next[j] = vec_x_cur[j] + (sigma_next - sigma) * (55 * vec_d_cur[j] - 59 * vec_d_prev1[j] + 37 * vec_d_prev2[j] - 9 * vec_d_prev3[j]) / 24;
-                        }
-                    } break;
                 }
 
-                // Manage buffer_model
-                if (buffer_model.size() == max_order - 1) {
-                    // Shift elements to the left
-                    for (int k = 0; k < max_order - 2; k++) {
-                        buffer_model[k] = buffer_model[k + 1];
-                    }
-                    buffer_model.back() = d_cur;  // Replace the last element with d_cur
-                } else {
-                    buffer_model.push_back(d_cur);
+                if (hist.size() == static_cast<size_t>(max_order - 1)) {
+                    hist.erase(hist.begin());
                 }
+                hist.push_back(std::move(d_cur));
             }
-        } break;
-        case IPNDM_V_SAMPLE_METHOD:  // iPNDM_v sampler from https://github.com/zju-pi/diff-sampler/tree/main/diff-solvers-main
-        {
-            int max_order = 4;
-            std::vector<ggml_tensor*> buffer_model;
-            ggml_tensor* x_next = x;
-
+            return x;
+        }
+        case IPNDM_V_SAMPLE_METHOD: {
+            int max_order                       = 4;
+            std::vector<sd::Tensor<float>> hist = {};
             for (int i = 0; i < steps; i++) {
                 float sigma  = sigmas[i];
                 float t_next = sigmas[i + 1];
 
-                // Denoising step
-                ggml_tensor* denoised = model(x, sigma, i + 1);
-                float* vec_denoised   = (float*)denoised->data;
-                ggml_tensor* d_cur    = ggml_dup_tensor(work_ctx, x);
-                float* vec_d_cur      = (float*)d_cur->data;
-                float* vec_x          = (float*)x->data;
-
-                // d_cur = (x - denoised) / sigma
-                for (int j = 0; j < ggml_nelements(d_cur); j++) {
-                    vec_d_cur[j] = (vec_x[j] - vec_denoised[j]) / sigma;
+                auto denoised_opt = model(x, sigma, i + 1);
+                if (denoised_opt.empty()) {
+                    return {};
                 }
+                sd::Tensor<float> denoised = std::move(denoised_opt);
 
-                int order   = std::min(max_order, i + 1);
-                float h_n   = t_next - sigma;
-                float h_n_1 = (i > 0) ? (sigma - sigmas[i - 1]) : h_n;
+                sd::Tensor<float> d_cur = (x - denoised) / sigma;
+                int order               = std::min(max_order, i + 1);
+                float h_n               = t_next - sigma;
+                float h_n_1             = (i > 0) ? (sigma - sigmas[i - 1]) : h_n;
 
                 switch (order) {
-                    case 1:  // First Euler step
-                        for (int j = 0; j < ggml_nelements(x_next); j++) {
-                            vec_x[j] += vec_d_cur[j] * h_n;
-                        }
+                    case 1:
+                        x += d_cur * h_n;
                         break;
-
-                    case 2: {
-                        float* vec_d_prev1 = (float*)buffer_model.back()->data;
-                        for (int j = 0; j < ggml_nelements(x_next); j++) {
-                            vec_x[j] += h_n * ((2 + (h_n / h_n_1)) * vec_d_cur[j] - (h_n / h_n_1) * vec_d_prev1[j]) / 2;
-                        }
+                    case 2:
+                        x += (((2.f + (h_n / h_n_1)) * d_cur - (h_n / h_n_1) * hist.back()) / 2.f) * h_n;
                         break;
-                    }
-
-                    case 3: {
-                        float h_n_2        = (i > 1) ? (sigmas[i - 1] - sigmas[i - 2]) : h_n_1;
-                        float* vec_d_prev1 = (float*)buffer_model.back()->data;
-                        float* vec_d_prev2 = (buffer_model.size() > 1) ? (float*)buffer_model[buffer_model.size() - 2]->data : vec_d_prev1;
-                        for (int j = 0; j < ggml_nelements(x_next); j++) {
-                            vec_x[j] += h_n * ((23 * vec_d_cur[j] - 16 * vec_d_prev1[j] + 5 * vec_d_prev2[j]) / 12);
-                        }
+                    case 3:
+                        x += ((23.f * d_cur - 16.f * hist[hist.size() - 1] + 5.f * hist[hist.size() - 2]) / 12.f) * h_n;
                         break;
-                    }
-
-                    case 4: {
-                        float h_n_2        = (i > 1) ? (sigmas[i - 1] - sigmas[i - 2]) : h_n_1;
-                        float h_n_3        = (i > 2) ? (sigmas[i - 2] - sigmas[i - 3]) : h_n_2;
-                        float* vec_d_prev1 = (float*)buffer_model.back()->data;
-                        float* vec_d_prev2 = (buffer_model.size() > 1) ? (float*)buffer_model[buffer_model.size() - 2]->data : vec_d_prev1;
-                        float* vec_d_prev3 = (buffer_model.size() > 2) ? (float*)buffer_model[buffer_model.size() - 3]->data : vec_d_prev2;
-                        for (int j = 0; j < ggml_nelements(x_next); j++) {
-                            vec_x[j] += h_n * ((55 * vec_d_cur[j] - 59 * vec_d_prev1[j] + 37 * vec_d_prev2[j] - 9 * vec_d_prev3[j]) / 24);
-                        }
+                    case 4:
+                        x += ((55.f * d_cur - 59.f * hist[hist.size() - 1] + 37.f * hist[hist.size() - 2] - 9.f * hist[hist.size() - 3]) / 24.f) * h_n;
                         break;
-                    }
                 }
 
-                // Manage buffer_model
-                if (buffer_model.size() == max_order - 1) {
-                    buffer_model.erase(buffer_model.begin());
+                if (hist.size() == static_cast<size_t>(max_order - 1)) {
+                    hist.erase(hist.begin());
                 }
-                buffer_model.push_back(d_cur);
-
-                // Prepare the next d tensor
-                d_cur = ggml_dup_tensor(work_ctx, x_next);
+                hist.push_back(std::move(d_cur));
             }
-        } break;
-        case LCM_SAMPLE_METHOD:  // Latent Consistency Models
-        {
-            ggml_tensor* noise = ggml_dup_tensor(work_ctx, x);
-            ggml_tensor* d     = ggml_dup_tensor(work_ctx, x);
-
-            for (int i = 0; i < steps; i++) {
-                float sigma = sigmas[i];
-
-                // denoise
-                ggml_tensor* denoised = model(x, sigma, i + 1);
-                if (denoised == nullptr) {
-                    return false;
-                }
-
-                // x = denoised
-                {
-                    float* vec_x        = (float*)x->data;
-                    float* vec_denoised = (float*)denoised->data;
-                    for (int j = 0; j < ggml_nelements(x); j++) {
-                        vec_x[j] = vec_denoised[j];
-                    }
-                }
-
-                if (sigmas[i + 1] > 0) {
-                    // x += sigmas[i + 1] * noise_sampler(sigmas[i], sigmas[i + 1])
-                    ggml_ext_im_set_randn_f32(noise, rng);
-                    // noise = load_tensor_from_file(res_ctx, "./rand" + std::to_string(i+1) + ".bin");
-                    {
-                        float* vec_x     = (float*)x->data;
-                        float* vec_noise = (float*)noise->data;
-
-                        for (int j = 0; j < ggml_nelements(x); j++) {
-                            vec_x[j] = vec_x[j] + sigmas[i + 1] * vec_noise[j];
-                        }
-                    }
-                }
-            }
-        } break;
-        case DDIM_TRAILING_SAMPLE_METHOD:  // Denoising Diffusion Implicit Models
-                                           // with the "trailing" timestep spacing
-        {
-            // See J. Song et al., "Denoising Diffusion Implicit
-            // Models", arXiv:2010.02502 [cs.LG]
-            //
-            // DDIM itself needs alphas_cumprod (DDPM, J. Ho et al.,
-            // arXiv:2006.11239 [cs.LG] with k-diffusion's start and
-            // end beta) (which unfortunately k-diffusion's data
-            // structure hides from the denoiser), and the sigmas are
-            // also needed to invert the behavior of CompVisDenoiser
-            // (k-diffusion's LMSDiscreteSchedulerr)
-            float beta_start = 0.00085f;
-            float beta_end   = 0.0120f;
-            std::vector<double> alphas_cumprod;
-            std::vector<double> compvis_sigmas;
-
-            alphas_cumprod.reserve(TIMESTEPS);
-            compvis_sigmas.reserve(TIMESTEPS);
-            for (int i = 0; i < TIMESTEPS; i++) {
-                alphas_cumprod[i] =
-                    (i == 0 ? 1.0f : alphas_cumprod[i - 1]) *
-                    (1.0f -
-                     std::pow(sqrtf(beta_start) +
-                                  (sqrtf(beta_end) - sqrtf(beta_start)) *
-                                      ((float)i / (TIMESTEPS - 1)),
-                              2));
-                compvis_sigmas[i] =
-                    std::sqrt((1 - alphas_cumprod[i]) /
-                              alphas_cumprod[i]);
-            }
-
-            ggml_tensor* pred_original_sample =
-                ggml_dup_tensor(work_ctx, x);
-            ggml_tensor* variance_noise =
-                ggml_dup_tensor(work_ctx, x);
-
-            for (int i = 0; i < steps; i++) {
-                // The "trailing" DDIM timestep, see S. Lin et al.,
-                // "Common Diffusion Noise Schedulers and Sample Steps
-                // are Flawed", arXiv:2305.08891 [cs], p. 4, Table
-                // 2. Most variables below follow Diffusers naming
-                //
-                // Diffuser naming vs. Song et al. (2010), p. 5, (12)
-                // and p. 16, (16) (<variable name> -> <name in
-                // paper>):
-                //
-                // - pred_noise_t -> epsilon_theta^(t)(x_t)
-                // - pred_original_sample -> f_theta^(t)(x_t) or x_0
-                // - std_dev_t -> sigma_t (not the LMS sigma)
-                // - eta -> eta (set to 0 at the moment)
-                // - pred_sample_direction -> "direction pointing to
-                //   x_t"
-                // - pred_prev_sample -> "x_t-1"
-                int timestep = static_cast<int>(roundf(TIMESTEPS - i * ((float)TIMESTEPS / steps))) - 1;
-                // 1. get previous step value (=t-1)
-                int prev_timestep = timestep - TIMESTEPS / static_cast<int>(steps);
-                // The sigma here is chosen to cause the
-                // CompVisDenoiser to produce t = timestep
-                float sigma = static_cast<float>(compvis_sigmas[timestep]);
-                if (i == 0) {
-                    // The function add_noise intializes x to
-                    // Diffusers' latents * sigma (as in Diffusers'
-                    // pipeline) or sample * sigma (Diffusers'
-                    // scheduler), where this sigma = init_noise_sigma
-                    // in Diffusers. For DDPM and DDIM however,
-                    // init_noise_sigma = 1. But the k-diffusion
-                    // model() also evaluates F_theta(c_in(sigma) x;
-                    // ...) instead of the bare U-net F_theta, with
-                    // c_in = 1 / sqrt(sigma^2 + 1), as defined in
-                    // T. Karras et al., "Elucidating the Design Space
-                    // of Diffusion-Based Generative Models",
-                    // arXiv:2206.00364 [cs.CV], p. 3, Table 1. Hence
-                    // the first call has to be prescaled as x <- x /
-                    // (c_in * sigma) with the k-diffusion pipeline
-                    // and CompVisDenoiser.
-                    float* vec_x = (float*)x->data;
-                    for (int j = 0; j < ggml_nelements(x); j++) {
-                        vec_x[j] *= std::sqrt(sigma * sigma + 1) /
-                                    sigma;
-                    }
-                } else {
-                    // For the subsequent steps after the first one,
-                    // at this point x = latents or x = sample, and
-                    // needs to be prescaled with x <- sample / c_in
-                    // to compensate for model() applying the scale
-                    // c_in before the U-net F_theta
-                    float* vec_x = (float*)x->data;
-                    for (int j = 0; j < ggml_nelements(x); j++) {
-                        vec_x[j] *= std::sqrt(sigma * sigma + 1);
-                    }
-                }
-                // Note (also noise_pred in Diffuser's pipeline)
-                // model_output = model() is the D(x, sigma) as
-                // defined in Karras et al. (2022), p. 3, Table 1 and
-                // p. 8 (7), compare also p. 38 (226) therein.
-                ggml_tensor* model_output =
-                    model(x, sigma, i + 1);
-                // Here model_output is still the k-diffusion denoiser
-                // output, not the U-net output F_theta(c_in(sigma) x;
-                // ...) in Karras et al. (2022), whereas Diffusers'
-                // model_output is F_theta(...). Recover the actual
-                // model_output, which is also referred to as the
-                // "Karras ODE derivative" d or d_cur in several
-                // samplers above.
-                {
-                    float* vec_x = (float*)x->data;
-                    float* vec_model_output =
-                        (float*)model_output->data;
-                    for (int j = 0; j < ggml_nelements(x); j++) {
-                        vec_model_output[j] =
-                            (vec_x[j] - vec_model_output[j]) *
-                            (1 / sigma);
-                    }
-                }
-                // 2. compute alphas, betas
-                float alpha_prod_t = static_cast<float>(alphas_cumprod[timestep]);
-                // Note final_alpha_cumprod = alphas_cumprod[0] due to
-                // trailing timestep spacing
-                float alpha_prod_t_prev = static_cast<float>(prev_timestep >= 0 ? alphas_cumprod[prev_timestep] : alphas_cumprod[0]);
-                float beta_prod_t       = 1 - alpha_prod_t;
-                // 3. compute predicted original sample from predicted
-                // noise also called "predicted x_0" of formula (12)
-                // from https://arxiv.org/pdf/2010.02502.pdf
-                {
-                    float* vec_x = (float*)x->data;
-                    float* vec_model_output =
-                        (float*)model_output->data;
-                    float* vec_pred_original_sample =
-                        (float*)pred_original_sample->data;
-                    // Note the substitution of latents or sample = x
-                    // * c_in = x / sqrt(sigma^2 + 1)
-                    for (int j = 0; j < ggml_nelements(x); j++) {
-                        vec_pred_original_sample[j] =
-                            (vec_x[j] / std::sqrt(sigma * sigma + 1) -
-                             std::sqrt(beta_prod_t) *
-                                 vec_model_output[j]) *
-                            (1 / std::sqrt(alpha_prod_t));
-                    }
-                }
-                // Assuming the "epsilon" prediction type, where below
-                // pred_epsilon = model_output is inserted, and is not
-                // defined/copied explicitly.
-                //
-                // 5. compute variance: "sigma_t(eta)" -> see formula
-                // (16)
-                //
-                // sigma_t = sqrt((1 - alpha_t-1)/(1 - alpha_t)) *
-                // sqrt(1 - alpha_t/alpha_t-1)
-                float beta_prod_t_prev = 1 - alpha_prod_t_prev;
-                float variance         = (beta_prod_t_prev / beta_prod_t) *
-                                 (1 - alpha_prod_t / alpha_prod_t_prev);
-                float std_dev_t = eta * std::sqrt(variance);
-                // 6. compute "direction pointing to x_t" of formula
-                // (12) from https://arxiv.org/pdf/2010.02502.pdf
-                // 7. compute x_t without "random noise" of formula
-                // (12) from https://arxiv.org/pdf/2010.02502.pdf
-                {
-                    float* vec_model_output = (float*)model_output->data;
-                    float* vec_pred_original_sample =
-                        (float*)pred_original_sample->data;
-                    float* vec_x = (float*)x->data;
-                    for (int j = 0; j < ggml_nelements(x); j++) {
-                        // Two step inner loop without an explicit
-                        // tensor
-                        float pred_sample_direction =
-                            ::sqrtf(1 - alpha_prod_t_prev -
-                                    ::powf(std_dev_t, 2)) *
-                            vec_model_output[j];
-                        vec_x[j] = std::sqrt(alpha_prod_t_prev) *
-                                       vec_pred_original_sample[j] +
-                                   pred_sample_direction;
-                    }
-                }
-                if (eta > 0) {
-                    ggml_ext_im_set_randn_f32(variance_noise, rng);
-                    float* vec_variance_noise =
-                        (float*)variance_noise->data;
-                    float* vec_x = (float*)x->data;
-                    for (int j = 0; j < ggml_nelements(x); j++) {
-                        vec_x[j] += std_dev_t * vec_variance_noise[j];
-                    }
-                }
-                // See the note above: x = latents or sample here, and
-                // is not scaled by the c_in. For the final output
-                // this is correct, but for subsequent iterations, x
-                // needs to be prescaled again, since k-diffusion's
-                // model() differes from the bare U-net F_theta by the
-                // factor c_in.
-            }
-        } break;
-        case TCD_SAMPLE_METHOD:  // Strategic Stochastic Sampling (Algorithm 4) in
-                                 // Trajectory Consistency Distillation
-        {
-            // See J. Zheng et al., "Trajectory Consistency
-            // Distillation: Improved Latent Consistency Distillation
-            // by Semi-Linear Consistency Function with Trajectory
-            // Mapping", arXiv:2402.19159 [cs.CV]
-            float beta_start = 0.00085f;
-            float beta_end   = 0.0120f;
-            std::vector<double> alphas_cumprod;
-            std::vector<double> compvis_sigmas;
-
-            alphas_cumprod.reserve(TIMESTEPS);
-            compvis_sigmas.reserve(TIMESTEPS);
-            for (int i = 0; i < TIMESTEPS; i++) {
-                alphas_cumprod[i] =
-                    (i == 0 ? 1.0f : alphas_cumprod[i - 1]) *
-                    (1.0f -
-                     std::pow(sqrtf(beta_start) +
-                                  (sqrtf(beta_end) - sqrtf(beta_start)) *
-                                      ((float)i / (TIMESTEPS - 1)),
-                              2));
-                compvis_sigmas[i] =
-                    std::sqrt((1 - alphas_cumprod[i]) /
-                              alphas_cumprod[i]);
-            }
-            int original_steps = 50;
-
-            ggml_tensor* pred_original_sample =
-                ggml_dup_tensor(work_ctx, x);
-            ggml_tensor* noise =
-                ggml_dup_tensor(work_ctx, x);
-
-            for (int i = 0; i < steps; i++) {
-                // Analytic form for TCD timesteps
-                int timestep = TIMESTEPS - 1 -
-                               (TIMESTEPS / original_steps) *
-                                   (int)floor(i * ((float)original_steps / steps));
-                // 1. get previous step value
-                int prev_timestep = i >= steps - 1 ? 0 : TIMESTEPS - 1 - (TIMESTEPS / original_steps) * (int)floor((i + 1) * ((float)original_steps / steps));
-                // Here timestep_s is tau_n' in Algorithm 4. The _s
-                // notation appears to be that from C. Lu,
-                // "DPM-Solver: A Fast ODE Solver for Diffusion
-                // Probabilistic Model Sampling in Around 10 Steps",
-                // arXiv:2206.00927 [cs.LG], but this notation is not
-                // continued in Algorithm 4, where _n' is used.
-                int timestep_s =
-                    (int)floor((1 - eta) * prev_timestep);
-                // Begin k-diffusion specific workaround for
-                // evaluating F_theta(x; ...) from D(x, sigma), same
-                // as in DDIM (and see there for detailed comments)
-                float sigma = static_cast<float>(compvis_sigmas[timestep]);
-                if (i == 0) {
-                    float* vec_x = (float*)x->data;
-                    for (int j = 0; j < ggml_nelements(x); j++) {
-                        vec_x[j] *= std::sqrt(sigma * sigma + 1) /
-                                    sigma;
-                    }
-                } else {
-                    float* vec_x = (float*)x->data;
-                    for (int j = 0; j < ggml_nelements(x); j++) {
-                        vec_x[j] *= std::sqrt(sigma * sigma + 1);
-                    }
-                }
-                ggml_tensor* model_output =
-                    model(x, sigma, i + 1);
-                {
-                    float* vec_x = (float*)x->data;
-                    float* vec_model_output =
-                        (float*)model_output->data;
-                    for (int j = 0; j < ggml_nelements(x); j++) {
-                        vec_model_output[j] =
-                            (vec_x[j] - vec_model_output[j]) *
-                            (1 / sigma);
-                    }
-                }
-                // 2. compute alphas, betas
-                //
-                // When comparing TCD with DDPM/DDIM note that Zheng
-                // et al. (2024) follows the DPM-Solver notation for
-                // alpha. One can find the following comment in the
-                // original DPM-Solver code
-                // (https://github.com/LuChengTHU/dpm-solver/):
-                // "**Important**: Please pay special attention for
-                // the args for `alphas_cumprod`: The `alphas_cumprod`
-                // is the \hat{alpha_n} arrays in the notations of
-                // DDPM. [...] Therefore, the notation \hat{alpha_n}
-                // is different from the notation alpha_t in
-                // DPM-Solver. In fact, we have alpha_{t_n} =
-                // \sqrt{\hat{alpha_n}}, [...]"
-                float alpha_prod_t = static_cast<float>(alphas_cumprod[timestep]);
-                float beta_prod_t  = 1 - alpha_prod_t;
-                // Note final_alpha_cumprod = alphas_cumprod[0] since
-                // TCD is always "trailing"
-                float alpha_prod_t_prev = static_cast<float>(prev_timestep >= 0 ? alphas_cumprod[prev_timestep] : alphas_cumprod[0]);
-                // The subscript _s are the only portion in this
-                // section (2) unique to TCD
-                float alpha_prod_s = static_cast<float>(alphas_cumprod[timestep_s]);
-                float beta_prod_s  = 1 - alpha_prod_s;
-                // 3. Compute the predicted noised sample x_s based on
-                // the model parameterization
-                //
-                // This section is also exactly the same as DDIM
-                {
-                    float* vec_x = (float*)x->data;
-                    float* vec_model_output =
-                        (float*)model_output->data;
-                    float* vec_pred_original_sample =
-                        (float*)pred_original_sample->data;
-                    for (int j = 0; j < ggml_nelements(x); j++) {
-                        vec_pred_original_sample[j] =
-                            (vec_x[j] / std::sqrt(sigma * sigma + 1) -
-                             std::sqrt(beta_prod_t) *
-                                 vec_model_output[j]) *
-                            (1 / std::sqrt(alpha_prod_t));
-                    }
-                }
-                // This consistency function step can be difficult to
-                // decipher from Algorithm 4, as it is simply stated
-                // using a consistency function. This step is the
-                // modified DDIM, i.e. p. 8 (32) in Zheng et
-                // al. (2024), with eta set to 0 (see the paragraph
-                // immediately thereafter that states this somewhat
-                // obliquely).
-                {
-                    float* vec_pred_original_sample =
-                        (float*)pred_original_sample->data;
-                    float* vec_model_output =
-                        (float*)model_output->data;
-                    float* vec_x = (float*)x->data;
-                    for (int j = 0; j < ggml_nelements(x); j++) {
-                        // Substituting x = pred_noised_sample and
-                        // pred_epsilon = model_output
-                        vec_x[j] =
-                            std::sqrt(alpha_prod_s) *
-                                vec_pred_original_sample[j] +
-                            std::sqrt(beta_prod_s) *
-                                vec_model_output[j];
-                    }
-                }
-                // 4. Sample and inject noise z ~ N(0, I) for
-                // MultiStep Inference Noise is not used on the final
-                // timestep of the timestep schedule. This also means
-                // that noise is not used for one-step sampling. Eta
-                // (referred to as "gamma" in the paper) was
-                // introduced to control the stochasticity in every
-                // step. When eta = 0, it represents deterministic
-                // sampling, whereas eta = 1 indicates full stochastic
-                // sampling.
-                if (eta > 0 && i != steps - 1) {
-                    // In this case, x is still pred_noised_sample,
-                    // continue in-place
-                    ggml_ext_im_set_randn_f32(noise, rng);
-                    float* vec_x     = (float*)x->data;
-                    float* vec_noise = (float*)noise->data;
-                    for (int j = 0; j < ggml_nelements(x); j++) {
-                        // Corresponding to (35) in Zheng et
-                        // al. (2024), substituting x =
-                        // pred_noised_sample
-                        vec_x[j] =
-                            std::sqrt(alpha_prod_t_prev /
-                                      alpha_prod_s) *
-                                vec_x[j] +
-                            std::sqrt(1 - alpha_prod_t_prev /
-                                              alpha_prod_s) *
-                                vec_noise[j];
-                    }
-                }
-            }
-        } break;
-        case RES_MULTISTEP_SAMPLE_METHOD:  // Res Multistep sampler
-        {
-            ggml_tensor* noise        = ggml_dup_tensor(work_ctx, x);
-            ggml_tensor* old_denoised = ggml_dup_tensor(work_ctx, x);
+            return x;
+        }
+        case RES_MULTISTEP_SAMPLE_METHOD: {
+            sd::Tensor<float> old_denoised = x;
 
             bool have_old_sigma  = false;
             float old_sigma_down = 0.0f;
@@ -1712,10 +1065,11 @@ static bool sample_k_diffusion(sample_method_t method,
             };
 
             for (int i = 0; i < steps; i++) {
-                ggml_tensor* denoised = model(x, sigmas[i], i + 1);
-                if (denoised == nullptr) {
-                    return false;
+                auto denoised_opt = model(x, sigmas[i], i + 1);
+                if (denoised_opt.empty()) {
+                    return {};
                 }
+                sd::Tensor<float> denoised = std::move(denoised_opt);
 
                 float sigma_from = sigmas[i];
                 float sigma_to   = sigmas[i + 1];
@@ -1737,14 +1091,7 @@ static bool sample_k_diffusion(sample_method_t method,
                 }
 
                 if (sigma_down == 0.0f || !have_old_sigma) {
-                    float dt            = sigma_down - sigma_from;
-                    float* vec_x        = (float*)x->data;
-                    float* vec_denoised = (float*)denoised->data;
-
-                    for (int j = 0; j < ggml_nelements(x); j++) {
-                        float d  = (vec_x[j] - vec_denoised[j]) / sigma_from;
-                        vec_x[j] = vec_x[j] + d * dt;
-                    }
+                    x += ((x - denoised) / sigma_from) * (sigma_down - sigma_from);
                 } else {
                     float t      = t_fn(sigma_from);
                     float t_old  = t_fn(old_sigma_down);
@@ -1765,42 +1112,20 @@ static bool sample_k_diffusion(sample_method_t method,
                         b2 = 0.0f;
                     }
 
-                    float sigma_h           = sigma_fn(h);
-                    float* vec_x            = (float*)x->data;
-                    float* vec_denoised     = (float*)denoised->data;
-                    float* vec_old_denoised = (float*)old_denoised->data;
-
-                    for (int j = 0; j < ggml_nelements(x); j++) {
-                        vec_x[j] = sigma_h * vec_x[j] + h * (b1 * vec_denoised[j] + b2 * vec_old_denoised[j]);
-                    }
+                    x = sigma_fn(h) * (x) + h * (b1 * denoised + b2 * old_denoised);
                 }
 
                 if (sigmas[i + 1] > 0 && sigma_up > 0.0f) {
-                    ggml_ext_im_set_randn_f32(noise, rng);
-                    float* vec_x     = (float*)x->data;
-                    float* vec_noise = (float*)noise->data;
-
-                    for (int j = 0; j < ggml_nelements(x); j++) {
-                        vec_x[j] = vec_x[j] + vec_noise[j] * sigma_up;
-                    }
-                }
-
-                float* vec_old_denoised = (float*)old_denoised->data;
-                float* vec_denoised     = (float*)denoised->data;
-                for (int j = 0; j < ggml_nelements(x); j++) {
-                    vec_old_denoised[j] = vec_denoised[j];
+                    x += sd::Tensor<float>::randn_like(x, rng) * sigma_up;
                 }
 
+                old_denoised   = denoised;
                 old_sigma_down = sigma_down;
                 have_old_sigma = true;
             }
-        } break;
-        case RES_2S_SAMPLE_METHOD:  // Res 2s sampler
-        {
-            ggml_tensor* noise = ggml_dup_tensor(work_ctx, x);
-            ggml_tensor* x0    = ggml_dup_tensor(work_ctx, x);
-            ggml_tensor* x2    = ggml_dup_tensor(work_ctx, x);
-
+            return x;
+        }
+        case RES_2S_SAMPLE_METHOD: {
             const float c2 = 0.5f;
             auto t_fn      = [](float sigma) -> float { return -logf(sigma); };
             auto phi1_fn   = [](float t) -> float {
@@ -1821,10 +1146,11 @@ static bool sample_k_diffusion(sample_method_t method,
                 float sigma_from = sigmas[i];
                 float sigma_to   = sigmas[i + 1];
 
-                ggml_tensor* denoised = model(x, sigma_from, -(i + 1));
-                if (denoised == nullptr) {
-                    return false;
+                auto denoised_opt = model(x, sigma_from, -(i + 1));
+                if (denoised_opt.empty()) {
+                    return {};
                 }
+                sd::Tensor<float> denoised = std::move(denoised_opt);
 
                 float sigma_up   = 0.0f;
                 float sigma_down = sigma_to;
@@ -1842,17 +1168,9 @@ static bool sample_k_diffusion(sample_method_t method,
                     sigma_down          = sigma_down_sq > 0.0f ? std::sqrt(sigma_down_sq) : 0.0f;
                 }
 
-                float* vec_x  = (float*)x->data;
-                float* vec_x0 = (float*)x0->data;
-                for (int j = 0; j < ggml_nelements(x); j++) {
-                    vec_x0[j] = vec_x[j];
-                }
-
+                sd::Tensor<float> x0 = x;
                 if (sigma_down == 0.0f || sigma_from == 0.0f) {
-                    float* vec_denoised = (float*)denoised->data;
-                    for (int j = 0; j < ggml_nelements(x); j++) {
-                        vec_x[j] = vec_denoised[j];
-                    }
+                    x = denoised;
                 } else {
                     float t      = t_fn(sigma_from);
                     float t_next = t_fn(sigma_down);
@@ -1864,45 +1182,140 @@ static bool sample_k_diffusion(sample_method_t method,
                     float b2       = phi2_val / c2;
                     float b1       = phi1_val - b2;
 
-                    float sigma_c2 = expf(-(t + h * c2));
+                    float sigma_c2         = expf(-(t + h * c2));
+                    sd::Tensor<float> eps1 = denoised - x0;
+                    sd::Tensor<float> x2   = x0 + eps1 * (h * a21);
 
-                    float* vec_denoised = (float*)denoised->data;
-                    float* vec_x2       = (float*)x2->data;
-                    for (int j = 0; j < ggml_nelements(x); j++) {
-                        float eps1 = vec_denoised[j] - vec_x0[j];
-                        vec_x2[j]  = vec_x0[j] + h * a21 * eps1;
-                    }
-
-                    ggml_tensor* denoised2 = model(x2, sigma_c2, i + 1);
-                    if (denoised2 == nullptr) {
-                        return false;
-                    }
-                    float* vec_denoised2 = (float*)denoised2->data;
-
-                    for (int j = 0; j < ggml_nelements(x); j++) {
-                        float eps1 = vec_denoised[j] - vec_x0[j];
-                        float eps2 = vec_denoised2[j] - vec_x0[j];
-                        vec_x[j]   = vec_x0[j] + h * (b1 * eps1 + b2 * eps2);
+                    auto denoised2_opt = model(x2, sigma_c2, i + 1);
+                    if (denoised2_opt.empty()) {
+                        return {};
                     }
+                    sd::Tensor<float> denoised2 = std::move(denoised2_opt);
+                    sd::Tensor<float> eps2      = denoised2 - x0;
+                    x                           = x0 + h * (b1 * eps1 + b2 * eps2);
                 }
 
                 if (sigmas[i + 1] > 0 && sigma_up > 0.0f) {
-                    ggml_ext_im_set_randn_f32(noise, rng);
-                    float* vec_x     = (float*)x->data;
-                    float* vec_noise = (float*)noise->data;
-
-                    for (int j = 0; j < ggml_nelements(x); j++) {
-                        vec_x[j] = vec_x[j] + vec_noise[j] * sigma_up;
-                    }
+                    x += sd::Tensor<float>::randn_like(x, rng) * sigma_up;
                 }
             }
-        } break;
+            return x;
+        }
+        case DDIM_TRAILING_SAMPLE_METHOD: {  // returns the final x, or an empty tensor as soon as model() returns an empty result
+            float beta_start = 0.00085f;  // beta_i below is interpolated between sqrt(beta_start) and sqrt(beta_end), then squared
+            float beta_end   = 0.0120f;
+            std::vector<double> alphas_cumprod(TIMESTEPS);
+            std::vector<double> compvis_sigmas(TIMESTEPS);
+            for (int i = 0; i < TIMESTEPS; i++) {
+                alphas_cumprod[i] =  // running product of (1 - beta_i)
+                    (i == 0 ? 1.0f : alphas_cumprod[i - 1]) *
+                    (1.0f -
+                     std::pow(sqrtf(beta_start) +
+                                  (sqrtf(beta_end) - sqrtf(beta_start)) *
+                                      ((float)i / (TIMESTEPS - 1)),
+                              2));
+                compvis_sigmas[i] =  // sigma_i = sqrt((1 - alphas_cumprod_i) / alphas_cumprod_i)
+                    std::sqrt((1 - alphas_cumprod[i]) / alphas_cumprod[i]);
+            }
 
+            for (int i = 0; i < steps; i++) {
+                int timestep      = static_cast<int>(roundf(TIMESTEPS - i * ((float)TIMESTEPS / steps))) - 1;  // TIMESTEPS - 1 at i == 0, then decreasing
+                int prev_timestep = timestep - TIMESTEPS / static_cast<int>(steps);  // may be negative; the lookup below then falls back to alphas_cumprod[0]
+                float sigma       = static_cast<float>(compvis_sigmas[timestep]);
+                if (i == 0) {
+                    x *= std::sqrt(sigma * sigma + 1) / sigma;  // first step also divides by sigma; presumably the incoming x is sigma-scaled noise - TODO confirm at call site
+                } else {
+                    x *= std::sqrt(sigma * sigma + 1);
+                }
+
+                auto model_output_opt = model(x, sigma, i + 1);
+                if (model_output_opt.empty()) {
+                    return {};  // propagate model failure as an empty tensor
+                }
+                sd::Tensor<float> model_output = std::move(model_output_opt);
+                model_output                   = (x - model_output) * (1.0f / sigma);  // variable is reused: now holds (x - model(x)) / sigma
+
+                float alpha_prod_t      = static_cast<float>(alphas_cumprod[timestep]);
+                float alpha_prod_t_prev = static_cast<float>(prev_timestep >= 0 ? alphas_cumprod[prev_timestep] : alphas_cumprod[0]);
+                float beta_prod_t       = 1.0f - alpha_prod_t;
+
+                sd::Tensor<float> pred_original_sample = ((x / std::sqrt(sigma * sigma + 1)) -  // divides x by the sqrt(sigma^2 + 1) factor applied at the top of the iteration
+                                                          std::sqrt(beta_prod_t) * model_output) *
+                                                         (1.0f / std::sqrt(alpha_prod_t));
+
+                float beta_prod_t_prev = 1.0f - alpha_prod_t_prev;
+                float variance         = (beta_prod_t_prev / beta_prod_t) *
+                                 (1.0f - alpha_prod_t / alpha_prod_t_prev);
+                float std_dev_t = eta * std::sqrt(variance);  // eta scales the noise re-injected below; eta == 0 adds none
+
+                x = std::sqrt(alpha_prod_t_prev) * pred_original_sample +
+                    std::sqrt(1.0f - alpha_prod_t_prev - std::pow(std_dev_t, 2)) * model_output;
+
+                if (eta > 0) {
+                    x += std_dev_t * sd::Tensor<float>::randn_like(x, rng);  // fresh random tensor shaped like x, drawn from rng
+                }
+            }
+            return x;
+        }
+        case TCD_SAMPLE_METHOD: {  // returns the final x, or an empty tensor as soon as model() returns an empty result
+            float beta_start = 0.00085f;  // beta_i below is interpolated between sqrt(beta_start) and sqrt(beta_end), then squared
+            float beta_end   = 0.0120f;
+            std::vector<double> alphas_cumprod(TIMESTEPS);
+            std::vector<double> compvis_sigmas(TIMESTEPS);
+            for (int i = 0; i < TIMESTEPS; i++) {
+                alphas_cumprod[i] =  // running product of (1 - beta_i)
+                    (i == 0 ? 1.0f : alphas_cumprod[i - 1]) *
+                    (1.0f -
+                     std::pow(sqrtf(beta_start) +
+                                  (sqrtf(beta_end) - sqrtf(beta_start)) *
+                                      ((float)i / (TIMESTEPS - 1)),
+                              2));
+                compvis_sigmas[i] =  // sigma_i = sqrt((1 - alphas_cumprod_i) / alphas_cumprod_i)
+                    std::sqrt((1 - alphas_cumprod[i]) / alphas_cumprod[i]);
+            }
+            int original_steps = 50;  // timesteps below are snapped onto a grid of stride TIMESTEPS / original_steps
+            for (int i = 0; i < steps; i++) {
+                int timestep      = TIMESTEPS - 1 - (TIMESTEPS / original_steps) * (int)floor(i * ((float)original_steps / steps));  // counts down from TIMESTEPS - 1
+                int prev_timestep = i >= steps - 1 ? 0 : TIMESTEPS - 1 - (TIMESTEPS / original_steps) * (int)floor((i + 1) * ((float)original_steps / steps));  // grid point of step i + 1, or 0 on the last step
+                int timestep_s    = (int)floor((1 - eta) * prev_timestep);  // NOTE(review): used as an index below with no clamp, so this relies on eta <= 1 - confirm callers bound eta
+                float sigma       = static_cast<float>(compvis_sigmas[timestep]);
+
+                if (i == 0) {
+                    x *= std::sqrt(sigma * sigma + 1) / sigma;  // first step also divides by sigma; presumably the incoming x is sigma-scaled noise - TODO confirm at call site
+                } else {
+                    x *= std::sqrt(sigma * sigma + 1);
+                }
+
+                auto model_output_opt = model(x, sigma, i + 1);
+                if (model_output_opt.empty()) {
+                    return {};  // propagate model failure as an empty tensor
+                }
+                sd::Tensor<float> model_output = std::move(model_output_opt);
+                model_output                   = (x - model_output) * (1.0f / sigma);  // variable is reused: now holds (x - model(x)) / sigma
+
+                float alpha_prod_t      = static_cast<float>(alphas_cumprod[timestep]);
+                float beta_prod_t       = 1.0f - alpha_prod_t;
+                float alpha_prod_t_prev = static_cast<float>(prev_timestep >= 0 ? alphas_cumprod[prev_timestep] : alphas_cumprod[0]);
+                float alpha_prod_s      = static_cast<float>(alphas_cumprod[timestep_s]);
+                float beta_prod_s       = 1.0f - alpha_prod_s;
+
+                sd::Tensor<float> pred_original_sample = ((x / std::sqrt(sigma * sigma + 1)) -  // divides x by the sqrt(sigma^2 + 1) factor applied at the top of the iteration
+                                                          std::sqrt(beta_prod_t) * model_output) *
+                                                         (1.0f / std::sqrt(alpha_prod_t));
+
+                x = std::sqrt(alpha_prod_s) * pred_original_sample +  // recombine at the intermediate index timestep_s
+                    std::sqrt(beta_prod_s) * model_output;
+
+                if (eta > 0 && i != steps - 1) {  // noise is mixed back in on every step except the last, and only when eta > 0
+                    x = std::sqrt(alpha_prod_t_prev / alpha_prod_s) * (x) +
+                        std::sqrt(1.0f - alpha_prod_t_prev / alpha_prod_s) * sd::Tensor<float>::randn_like(x, rng);
+                }
+            }
+            return x;
+        }
         default:
-            LOG_ERROR("Attempting to sample with nonexisting sample method %i", method);
-            return false;
+            return {};
     }
-    return true;
 }
 
 #endif  // __DENOISER_HPP__
diff --git a/src/diffusion_model.hpp b/src/diffusion_model.hpp
index 07d9df8..eb0debf 100644
--- a/src/diffusion_model.hpp
+++ b/src/diffusion_model.hpp
@@ -1,37 +1,45 @@
 #ifndef __DIFFUSION_MODEL_H__
 #define __DIFFUSION_MODEL_H__
 
+#include <optional>
 #include "anima.hpp"
 #include "flux.hpp"
 #include "mmdit.hpp"
 #include "qwen_image.hpp"
+#include "tensor_ggml.hpp"
 #include "unet.hpp"
 #include "wan.hpp"
 #include "z_image.hpp"
 
 struct DiffusionParams {
-    ggml_tensor* x                        = nullptr;
-    ggml_tensor* timesteps                = nullptr;
-    ggml_tensor* context                  = nullptr;
-    ggml_tensor* c_concat                 = nullptr;
-    ggml_tensor* y                        = nullptr;
-    ggml_tensor* guidance                 = nullptr;
-    std::vector<ggml_tensor*> ref_latents = {};
-    bool increase_ref_index               = false;
-    int num_video_frames                  = -1;
-    std::vector<ggml_tensor*> controls    = {};
-    float control_strength                = 0.f;
-    ggml_tensor* vace_context             = nullptr;
-    float vace_strength                   = 1.f;
-    std::vector<int> skip_layers          = {};
+    const sd::Tensor<float>* x                        = nullptr;  // asserted non-null by the compute() overrides visible in this file
+    const sd::Tensor<float>* timesteps                = nullptr;  // asserted non-null alongside x
+    const sd::Tensor<float>* context                  = nullptr;  // optional: nullptr is passed on as an empty tensor via tensor_or_empty()
+    const sd::Tensor<float>* c_concat                 = nullptr;  // optional, same nullptr handling as context
+    const sd::Tensor<float>* y                        = nullptr;  // optional, same nullptr handling as context
+    const sd::Tensor<int32_t>* t5_ids                 = nullptr;  // among the visible overrides, read only by AnimaModel::compute
+    const sd::Tensor<float>* t5_weights               = nullptr;  // among the visible overrides, read only by AnimaModel::compute
+    const sd::Tensor<float>* guidance                 = nullptr;  // among the visible overrides, read only by FluxModel::compute
+    const std::vector<sd::Tensor<float>>* ref_latents = nullptr;  // nullptr is treated as an empty list by the overrides that read it
+    bool increase_ref_index                           = false;
+    int num_video_frames                              = -1;
+    const std::vector<sd::Tensor<float>>* controls    = nullptr;  // read by UNetModel::compute; nullptr is treated as an empty list
+    float control_strength                            = 0.f;
+    const sd::Tensor<float>* vace_context             = nullptr;  // among the visible overrides, read only by WanModel::compute
+    float vace_strength                               = 1.f;
+    const std::vector<int>* skip_layers               = nullptr;  // read by MMDiTModel/FluxModel::compute; nullptr is treated as an empty list
 };
 
+template <typename T>  // T is the tensor element type
+static inline const sd::Tensor<T>& tensor_or_empty(const sd::Tensor<T>* tensor) {  // turns a nullable pointer into a reference that is always valid
+    static const sd::Tensor<T> kEmpty;  // one default-constructed tensor per T (presumably empty() - confirm in tensor.hpp); static storage keeps the returned reference valid
+    return tensor != nullptr ? *tensor : kEmpty;  // never dereferences a null pointer
+}
+
 struct DiffusionModel {
     virtual std::string get_desc()                                               = 0;
-    virtual bool compute(int n_threads,
-                         DiffusionParams diffusion_params,
-                         ggml_tensor** output     = nullptr,
-                         ggml_context* output_ctx = nullptr)                     = 0;
+    virtual sd::Tensor<float> compute(int n_threads,
+                                      const DiffusionParams& diffusion_params)   = 0;
     virtual void alloc_params_buffer()                                           = 0;
     virtual void free_params_buffer()                                            = 0;
     virtual void free_compute_buffer()                                           = 0;
@@ -93,19 +101,20 @@ struct UNetModel : public DiffusionModel {
         unet.set_circular_axes(circular_x, circular_y);
     }
 
-    bool compute(int n_threads,
-                 DiffusionParams diffusion_params,
-                 ggml_tensor** output     = nullptr,
-                 ggml_context* output_ctx = nullptr) override {
+    sd::Tensor<float> compute(int n_threads,  // unpacks diffusion_params, forwards to unet.compute and returns its result
+                              const DiffusionParams& diffusion_params) override {
+        GGML_ASSERT(diffusion_params.x != nullptr);  // x and timesteps are dereferenced unconditionally below
+        GGML_ASSERT(diffusion_params.timesteps != nullptr);
+        static const std::vector<sd::Tensor<float>> empty_controls;  // stand-in list used when diffusion_params.controls is nullptr
         return unet.compute(n_threads,
-                            diffusion_params.x,
-                            diffusion_params.timesteps,
-                            diffusion_params.context,
-                            diffusion_params.c_concat,
-                            diffusion_params.y,
+                            *diffusion_params.x,
+                            *diffusion_params.timesteps,
+                            tensor_or_empty(diffusion_params.context),  // optional inputs: nullptr becomes an empty tensor
+                            tensor_or_empty(diffusion_params.c_concat),
+                            tensor_or_empty(diffusion_params.y),
                             diffusion_params.num_video_frames,
-                            diffusion_params.controls,
-                            diffusion_params.control_strength, output, output_ctx);
+                            diffusion_params.controls ? *diffusion_params.controls : empty_controls,
+                            diffusion_params.control_strength);
     }
 };
 
@@ -158,18 +167,17 @@ struct MMDiTModel : public DiffusionModel {
         mmdit.set_circular_axes(circular_x, circular_y);
     }
 
-    bool compute(int n_threads,
-                 DiffusionParams diffusion_params,
-                 ggml_tensor** output     = nullptr,
-                 ggml_context* output_ctx = nullptr) override {
+    sd::Tensor<float> compute(int n_threads,  // unpacks diffusion_params, forwards to mmdit.compute and returns its result
+                              const DiffusionParams& diffusion_params) override {
+        GGML_ASSERT(diffusion_params.x != nullptr);  // x and timesteps are dereferenced unconditionally below
+        GGML_ASSERT(diffusion_params.timesteps != nullptr);
+        static const std::vector<int> empty_skip_layers;  // stand-in list used when diffusion_params.skip_layers is nullptr
         return mmdit.compute(n_threads,
-                             diffusion_params.x,
-                             diffusion_params.timesteps,
-                             diffusion_params.context,
-                             diffusion_params.y,
-                             output,
-                             output_ctx,
-                             diffusion_params.skip_layers);
+                             *diffusion_params.x,
+                             *diffusion_params.timesteps,
+                             tensor_or_empty(diffusion_params.context),  // optional inputs: nullptr becomes an empty tensor
+                             tensor_or_empty(diffusion_params.y),
+                             diffusion_params.skip_layers ? *diffusion_params.skip_layers : empty_skip_layers);
     }
 };
 
@@ -224,22 +232,22 @@ struct FluxModel : public DiffusionModel {
         flux.set_circular_axes(circular_x, circular_y);
     }
 
-    bool compute(int n_threads,
-                 DiffusionParams diffusion_params,
-                 ggml_tensor** output     = nullptr,
-                 ggml_context* output_ctx = nullptr) override {
+    sd::Tensor<float> compute(int n_threads,  // unpacks diffusion_params, forwards to flux.compute and returns its result
+                              const DiffusionParams& diffusion_params) override {
+        GGML_ASSERT(diffusion_params.x != nullptr);  // x and timesteps are dereferenced unconditionally below
+        GGML_ASSERT(diffusion_params.timesteps != nullptr);
+        static const std::vector<sd::Tensor<float>> empty_ref_latents;  // stand-in lists used when the matching pointer is nullptr
+        static const std::vector<int> empty_skip_layers;
         return flux.compute(n_threads,
-                            diffusion_params.x,
-                            diffusion_params.timesteps,
-                            diffusion_params.context,
-                            diffusion_params.c_concat,
-                            diffusion_params.y,
-                            diffusion_params.guidance,
-                            diffusion_params.ref_latents,
+                            *diffusion_params.x,
+                            *diffusion_params.timesteps,
+                            tensor_or_empty(diffusion_params.context),  // optional inputs: nullptr becomes an empty tensor
+                            tensor_or_empty(diffusion_params.c_concat),
+                            tensor_or_empty(diffusion_params.y),
+                            tensor_or_empty(diffusion_params.guidance),
+                            diffusion_params.ref_latents ? *diffusion_params.ref_latents : empty_ref_latents,
                             diffusion_params.increase_ref_index,
-                            output,
-                            output_ctx,
-                            diffusion_params.skip_layers);
+                            diffusion_params.skip_layers ? *diffusion_params.skip_layers : empty_skip_layers);
     }
 };
 
@@ -294,18 +302,16 @@ struct AnimaModel : public DiffusionModel {
         anima.set_circular_axes(circular_x, circular_y);
     }
 
-    bool compute(int n_threads,
-                 DiffusionParams diffusion_params,
-                 ggml_tensor** output     = nullptr,
-                 ggml_context* output_ctx = nullptr) override {
+    sd::Tensor<float> compute(int n_threads,  // unpacks diffusion_params, forwards to anima.compute and returns its result
+                              const DiffusionParams& diffusion_params) override {
+        GGML_ASSERT(diffusion_params.x != nullptr);  // x and timesteps are dereferenced unconditionally below
+        GGML_ASSERT(diffusion_params.timesteps != nullptr);
         return anima.compute(n_threads,
-                             diffusion_params.x,
-                             diffusion_params.timesteps,
-                             diffusion_params.context,
-                             diffusion_params.c_concat,
-                             diffusion_params.y,
-                             output,
-                             output_ctx);
+                             *diffusion_params.x,
+                             *diffusion_params.timesteps,
+                             tensor_or_empty(diffusion_params.context),  // optional inputs: nullptr becomes an empty tensor
+                             tensor_or_empty(diffusion_params.t5_ids),  // t5_ids / t5_weights take the argument slots that c_concat / y occupied before this change
+                             tensor_or_empty(diffusion_params.t5_weights));
     }
 };
 
@@ -361,21 +367,19 @@ struct WanModel : public DiffusionModel {
         wan.set_circular_axes(circular_x, circular_y);
     }
 
-    bool compute(int n_threads,
-                 DiffusionParams diffusion_params,
-                 ggml_tensor** output     = nullptr,
-                 ggml_context* output_ctx = nullptr) override {
+    sd::Tensor<float> compute(int n_threads,  // unpacks diffusion_params, forwards to wan.compute and returns its result
+                              const DiffusionParams& diffusion_params) override {
+        GGML_ASSERT(diffusion_params.x != nullptr);  // x and timesteps are dereferenced unconditionally below
+        GGML_ASSERT(diffusion_params.timesteps != nullptr);
         return wan.compute(n_threads,
-                           diffusion_params.x,
-                           diffusion_params.timesteps,
-                           diffusion_params.context,
-                           diffusion_params.y,
-                           diffusion_params.c_concat,
-                           nullptr,
-                           diffusion_params.vace_context,
-                           diffusion_params.vace_strength,
-                           output,
-                           output_ctx);
+                           *diffusion_params.x,
+                           *diffusion_params.timesteps,
+                           tensor_or_empty(diffusion_params.context),  // optional inputs: nullptr becomes an empty tensor
+                           tensor_or_empty(diffusion_params.y),  // note: y is passed before c_concat here, unlike the UNet/Flux wrappers
+                           tensor_or_empty(diffusion_params.c_concat),
+                           sd::Tensor<float>(),  // default-constructed tensor in the slot that previously received a literal nullptr
+                           tensor_or_empty(diffusion_params.vace_context),
+                           diffusion_params.vace_strength);
     }
 };
 
@@ -432,18 +436,17 @@ struct QwenImageModel : public DiffusionModel {
         qwen_image.set_circular_axes(circular_x, circular_y);
     }
 
-    bool compute(int n_threads,
-                 DiffusionParams diffusion_params,
-                 ggml_tensor** output     = nullptr,
-                 ggml_context* output_ctx = nullptr) override {
+    sd::Tensor<float> compute(int n_threads,  // unpacks diffusion_params, forwards to qwen_image.compute and returns its result
+                              const DiffusionParams& diffusion_params) override {
+        GGML_ASSERT(diffusion_params.x != nullptr);  // x and timesteps are dereferenced unconditionally below
+        GGML_ASSERT(diffusion_params.timesteps != nullptr);
+        static const std::vector<sd::Tensor<float>> empty_ref_latents;  // stand-in list used when diffusion_params.ref_latents is nullptr
         return qwen_image.compute(n_threads,
-                                  diffusion_params.x,
-                                  diffusion_params.timesteps,
-                                  diffusion_params.context,
-                                  diffusion_params.ref_latents,
-                                  true,  // increase_ref_index
-                                  output,
-                                  output_ctx);
+                                  *diffusion_params.x,
+                                  *diffusion_params.timesteps,
+                                  tensor_or_empty(diffusion_params.context),  // optional input: nullptr becomes an empty tensor
+                                  diffusion_params.ref_latents ? *diffusion_params.ref_latents : empty_ref_latents,
+                                  true);  // increase_ref_index is always true here; diffusion_params.increase_ref_index is not read
     }
 };
 
@@ -499,18 +502,17 @@ struct ZImageModel : public DiffusionModel {
         z_image.set_circular_axes(circular_x, circular_y);
     }
 
-    bool compute(int n_threads,
-                 DiffusionParams diffusion_params,
-                 ggml_tensor** output     = nullptr,
-                 ggml_context* output_ctx = nullptr) override {
+    sd::Tensor<float> compute(int n_threads,  // unpacks diffusion_params, forwards to z_image.compute and returns its result
+                              const DiffusionParams& diffusion_params) override {
+        GGML_ASSERT(diffusion_params.x != nullptr);  // x and timesteps are dereferenced unconditionally below
+        GGML_ASSERT(diffusion_params.timesteps != nullptr);
+        static const std::vector<sd::Tensor<float>> empty_ref_latents;  // stand-in list used when diffusion_params.ref_latents is nullptr
         return z_image.compute(n_threads,
-                               diffusion_params.x,
-                               diffusion_params.timesteps,
-                               diffusion_params.context,
-                               diffusion_params.ref_latents,
-                               true,  // increase_ref_index
-                               output,
-                               output_ctx);
+                               *diffusion_params.x,
+                               *diffusion_params.timesteps,
+                               tensor_or_empty(diffusion_params.context),  // optional input: nullptr becomes an empty tensor
+                               diffusion_params.ref_latents ? *diffusion_params.ref_latents : empty_ref_latents,
+                               true);  // increase_ref_index is always true here; diffusion_params.increase_ref_index is not read
     }
 };
 
diff --git a/src/easycache.hpp b/src/easycache.hpp
index 3f0287a..409a464 100644
--- a/src/easycache.hpp
+++ b/src/easycache.hpp
@@ -1,10 +1,15 @@
+#ifndef __EASYCACHE_HPP__
+#define __EASYCACHE_HPP__
+
 #include <cmath>
 #include <limits>
 #include <unordered_map>
 #include <vector>
 
+#include "condition_cache_utils.hpp"
 #include "denoiser.hpp"
 #include "ggml_extend.hpp"
+#include "tensor.hpp"
 
 struct EasyCacheConfig {
     bool enabled          = false;
@@ -19,15 +24,15 @@ struct EasyCacheCacheEntry {
 
 struct EasyCacheState {
     EasyCacheConfig config;
-    Denoiser* denoiser                  = nullptr;
-    float start_sigma                   = std::numeric_limits<float>::max();
-    float end_sigma                     = 0.0f;
-    bool initialized                    = false;
-    bool initial_step                   = true;
-    bool skip_current_step              = false;
-    bool step_active                    = false;
-    const SDCondition* anchor_condition = nullptr;
-    std::unordered_map<const SDCondition*, EasyCacheCacheEntry> cache_diffs;
+    Denoiser* denoiser           = nullptr;
+    float start_sigma            = std::numeric_limits<float>::max();
+    float end_sigma              = 0.0f;
+    bool initialized             = false;
+    bool initial_step            = true;
+    bool skip_current_step       = false;
+    bool step_active             = false;
+    const void* anchor_condition = nullptr;
+    std::unordered_map<const void*, EasyCacheCacheEntry> cache_diffs;
     std::vector<float> prev_input;
     std::vector<float> prev_output;
     float output_prev_norm                = 0.0f;
@@ -120,41 +125,30 @@ struct EasyCacheState {
         return enabled() && step_active && skip_current_step;
     }
 
-    bool has_cache(const SDCondition* cond) const {
+    bool has_cache(const void* cond) const {  // true only when cache_diffs has an entry for cond and that entry's diff is non-empty; cond is used purely as a map key
         auto it = cache_diffs.find(cond);
         return it != cache_diffs.end() && !it->second.diff.empty();
     }
 
-    void update_cache(const SDCondition* cond, ggml_tensor* input, ggml_tensor* output) {
+    void update_cache(const void* cond, const sd::Tensor<float>& input, const sd::Tensor<float>& output) {  // records the diff for cond; operator[] creates the entry if it is missing
         EasyCacheCacheEntry& entry = cache_diffs[cond];
-        size_t ne                  = static_cast<size_t>(ggml_nelements(output));
-        entry.diff.resize(ne);
-        float* out_data = (float*)output->data;
-        float* in_data  = (float*)input->data;
-        for (size_t i = 0; i < ne; ++i) {
-            entry.diff[i] = out_data[i] - in_data[i];
-        }
+        sd::store_condition_cache_diff(&entry.diff, input, output);  // replaces the removed loop (diff[i] = output[i] - input[i]); presumably the helper computes the same - confirm in condition_cache_utils.hpp
     }
 
-    void apply_cache(const SDCondition* cond, ggml_tensor* input, ggml_tensor* output) {
+    void apply_cache(const void* cond, const sd::Tensor<float>& input, sd::Tensor<float>* output) {  // no-op when cond has no entry or its stored diff is empty
         auto it = cache_diffs.find(cond);
         if (it == cache_diffs.end() || it->second.diff.empty()) {
             return;
         }
-        copy_ggml_tensor(output, input);
-        float* out_data                = (float*)output->data;
-        const std::vector<float>& diff = it->second.diff;
-        for (size_t i = 0; i < diff.size(); ++i) {
-            out_data[i] += diff[i];
-        }
+        sd::apply_condition_cache_diff(it->second.diff, input, output);  // replaces the removed code (output = input, then output[i] += diff[i]); presumably equivalent - confirm in condition_cache_utils.hpp
     }
 
-    bool before_condition(const SDCondition* cond,
-                          ggml_tensor* input,
-                          ggml_tensor* output,
+    bool before_condition(const void* cond,
+                          const sd::Tensor<float>& input,
+                          sd::Tensor<float>* output,
                           float sigma,
                           int step_index) {
-        if (!enabled() || step_index < 0) {
+        if (!enabled() || step_index < 0 || output == nullptr) {
             return false;
         }
         if (step_index != current_step_index) {
@@ -181,12 +175,12 @@ struct EasyCacheState {
         if (!has_prev_input || !has_prev_output || !has_cache(cond)) {
             return false;
         }
-        size_t ne = static_cast<size_t>(ggml_nelements(input));
+        size_t ne = static_cast<size_t>(input.numel());
         if (prev_input.size() != ne) {
             return false;
         }
-        float* input_data = (float*)input->data;
-        last_input_change = 0.0f;
+        const float* input_data = input.data();
+        last_input_change       = 0.0f;
         for (size_t i = 0; i < ne; ++i) {
             last_input_change += std::fabs(input_data[i] - prev_input[i]);
         }
@@ -211,7 +205,7 @@ struct EasyCacheState {
         return false;
     }
 
-    void after_condition(const SDCondition* cond, ggml_tensor* input, ggml_tensor* output) {
+    void after_condition(const void* cond, const sd::Tensor<float>& input, const sd::Tensor<float>& output) {
         if (!step_is_active()) {
             return;
         }
@@ -220,16 +214,16 @@ struct EasyCacheState {
             return;
         }
 
-        size_t ne      = static_cast<size_t>(ggml_nelements(input));
-        float* in_data = (float*)input->data;
+        size_t ne            = static_cast<size_t>(input.numel());
+        const float* in_data = input.data();
         prev_input.resize(ne);
         for (size_t i = 0; i < ne; ++i) {
             prev_input[i] = in_data[i];
         }
         has_prev_input = true;
 
-        float* out_data     = (float*)output->data;
-        float output_change = 0.0f;
+        const float* out_data = output.data();
+        float output_change   = 0.0f;
         if (has_prev_output && prev_output.size() == ne) {
             for (size_t i = 0; i < ne; ++i) {
                 output_change += std::fabs(out_data[i] - prev_output[i]);
@@ -262,4 +256,6 @@ struct EasyCacheState {
         cumulative_change_rate = 0.0f;
         has_last_input_change  = false;
     }
-};
\ No newline at end of file
+};
+
+#endif
diff --git a/src/esrgan.hpp b/src/esrgan.hpp
index efb3aed..26c46f5 100644
--- a/src/esrgan.hpp
+++ b/src/esrgan.hpp
@@ -341,12 +341,12 @@ struct ESRGAN : public GGMLRunner {
         return success;
     }
 
-    ggml_cgraph* build_graph(ggml_tensor* x) {
+    ggml_cgraph* build_graph(const sd::Tensor<float>& x_tensor) {
         if (!rrdb_net)
             return nullptr;
         constexpr int kGraphNodes = 1 << 16;  // 65k
         ggml_cgraph* gf           = new_graph_custom(kGraphNodes);
-        x                         = to_backend(x);
+        ggml_tensor* x            = make_input(x_tensor);
 
         auto runner_ctx  = get_context();
         ggml_tensor* out = rrdb_net->forward(&runner_ctx, x);
@@ -354,15 +354,12 @@ struct ESRGAN : public GGMLRunner {
         return gf;
     }
 
-    bool compute(const int n_threads,
-                 ggml_tensor* x,
-                 ggml_tensor** output,
-                 ggml_context* output_ctx = nullptr) {
-        auto get_graph = [&]() -> ggml_cgraph* {
-            return build_graph(x);
-        };
-        return GGMLRunner::compute(get_graph, n_threads, false, output, output_ctx);
+    sd::Tensor<float> compute(const int n_threads,  // runs the graph built from x and returns the result as a tensor instead of writing through an out-pointer
+                              const sd::Tensor<float>& x) {
+        auto get_graph = [&]() -> ggml_cgraph* { return build_graph(x); };  // captures x by reference; the lambda is only used within this call
+        auto result    = restore_trailing_singleton_dims(GGMLRunner::compute<float>(get_graph, n_threads, false), x.dim());  // presumably restores the result to x.dim() dimensions - confirm in the helper's definition
+        return result;
     }
 };
 
-#endif  // __ESRGAN_HPP__
\ No newline at end of file
+#endif  // __ESRGAN_HPP__
diff --git a/src/flux.hpp b/src/flux.hpp
index 93b9350..e6bf002 100644
--- a/src/flux.hpp
+++ b/src/flux.hpp
@@ -1178,6 +1178,7 @@ namespace Flux {
         std::vector<float> pe_vec;
         std::vector<float> mod_index_arange_vec;
         std::vector<float> dct_vec;
+        sd::Tensor<float> guidance_tensor;
         SDVersion version;
         bool use_mask = false;
 
@@ -1353,29 +1354,42 @@ namespace Flux {
             return dct;
         }
 
-        ggml_cgraph* build_graph(ggml_tensor* x,
-                                 ggml_tensor* timesteps,
-                                 ggml_tensor* context,
-                                 ggml_tensor* c_concat,
-                                 ggml_tensor* y,
-                                 ggml_tensor* guidance,
-                                 std::vector<ggml_tensor*> ref_latents = {},
-                                 bool increase_ref_index               = false,
-                                 std::vector<int> skip_layers          = {}) {
+        ggml_cgraph* build_graph(const sd::Tensor<float>& x_tensor,
+                                 const sd::Tensor<float>& timesteps_tensor,
+                                 const sd::Tensor<float>& context_tensor                  = {},
+                                 const sd::Tensor<float>& c_concat_tensor                 = {},
+                                 const sd::Tensor<float>& y_tensor                        = {},
+                                 const sd::Tensor<float>& guidance_tensor                 = {},
+                                 const std::vector<sd::Tensor<float>>& ref_latents_tensor = {},
+                                 bool increase_ref_index                                  = false,
+                                 std::vector<int> skip_layers                             = {}) {
+            ggml_tensor* x         = make_input(x_tensor);
+            ggml_tensor* timesteps = make_input(timesteps_tensor);
+            ggml_tensor* context   = make_optional_input(context_tensor);
+            ggml_tensor* c_concat  = make_optional_input(c_concat_tensor);
+            ggml_tensor* y         = make_optional_input(y_tensor);
+            if (flux_params.guidance_embed || flux_params.is_chroma) {
+                if (!guidance_tensor.empty()) {
+                    this->guidance_tensor = guidance_tensor;
+                    if (flux_params.is_chroma) {
+                        this->guidance_tensor.fill_(0.f);
+                    }
+                }
+            }
+            ggml_tensor* guidance = make_optional_input(this->guidance_tensor);
+            std::vector<ggml_tensor*> ref_latents;
+            ref_latents.reserve(ref_latents_tensor.size());
+            for (const auto& ref_latent_tensor : ref_latents_tensor) {
+                ref_latents.push_back(make_input(ref_latent_tensor));
+            }
+
             GGML_ASSERT(x->ne[3] == 1);
             ggml_cgraph* gf = new_graph_custom(FLUX_GRAPH_SIZE);
 
             ggml_tensor* mod_index_arange = nullptr;
             ggml_tensor* dct              = nullptr;  // for chroma radiance
 
-            x       = to_backend(x);
-            context = to_backend(context);
-            if (c_concat != nullptr) {
-                c_concat = to_backend(c_concat);
-            }
             if (flux_params.is_chroma) {
-                guidance = ggml_set_f32(guidance, 0);
-
                 if (!use_mask) {
                     y = nullptr;
                 }
@@ -1385,16 +1399,6 @@ namespace Flux {
                 mod_index_arange     = ggml_new_tensor_1d(compute_ctx, GGML_TYPE_F32, mod_index_arange_vec.size());
                 set_backend_tensor_data(mod_index_arange, mod_index_arange_vec.data());
             }
-            y = to_backend(y);
-
-            timesteps = to_backend(timesteps);
-            if (flux_params.guidance_embed || flux_params.is_chroma) {
-                guidance = to_backend(guidance);
-            }
-            for (int i = 0; i < ref_latents.size(); i++) {
-                ref_latents[i] = to_backend(ref_latents[i]);
-            }
-
             std::set<int> txt_arange_dims;
             if (sd_version_is_flux2(version)) {
                 txt_arange_dims    = {3};
@@ -1455,18 +1459,16 @@ namespace Flux {
             return gf;
         }
 
-        bool compute(int n_threads,
-                     ggml_tensor* x,
-                     ggml_tensor* timesteps,
-                     ggml_tensor* context,
-                     ggml_tensor* c_concat,
-                     ggml_tensor* y,
-                     ggml_tensor* guidance,
-                     std::vector<ggml_tensor*> ref_latents = {},
-                     bool increase_ref_index               = false,
-                     ggml_tensor** output                  = nullptr,
-                     ggml_context* output_ctx              = nullptr,
-                     std::vector<int> skip_layers          = std::vector<int>()) {
+        sd::Tensor<float> compute(int n_threads,
+                                  const sd::Tensor<float>& x,
+                                  const sd::Tensor<float>& timesteps,
+                                  const sd::Tensor<float>& context                  = {},
+                                  const sd::Tensor<float>& c_concat                 = {},
+                                  const sd::Tensor<float>& y                        = {},
+                                  const sd::Tensor<float>& guidance                 = {},
+                                  const std::vector<sd::Tensor<float>>& ref_latents = {},
+                                  bool increase_ref_index                           = false,
+                                  std::vector<int> skip_layers                      = std::vector<int>()) {
             // x: [N, in_channels, h, w]
             // timesteps: [N, ]
             // context: [N, max_position, hidden_size]
@@ -1476,7 +1478,8 @@ namespace Flux {
                 return build_graph(x, timesteps, context, c_concat, y, guidance, ref_latents, increase_ref_index, skip_layers);
             };
 
-            return GGMLRunner::compute(get_graph, n_threads, false, output, output_ctx);
+            auto result = restore_trailing_singleton_dims(GGMLRunner::compute<float>(get_graph, n_threads, false), x.dim());
+            return result;
         }
 
         void test() {
@@ -1485,41 +1488,51 @@ namespace Flux {
             params.mem_buffer = nullptr;
             params.no_alloc   = false;
 
-            ggml_context* work_ctx = ggml_init(params);
-            GGML_ASSERT(work_ctx != nullptr);
+            ggml_context* ctx = ggml_init(params);
+            GGML_ASSERT(ctx != nullptr);
 
             {
                 // cpu f16:
                 // cuda f16: nan
                 // cuda q8_0: pass
-                auto x = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, 16, 16, 128, 1);
+                sd::Tensor<float> x({16, 16, 128, 1});
                 // ggml_set_f32(x, 0.01f);
-                // auto x = load_tensor_from_file(work_ctx, "chroma_x.bin");
+                // auto x = load_tensor_from_file(ctx, "chroma_x.bin");
                 // print_ggml_tensor(x);
 
                 std::vector<float> timesteps_vec(1, 1.f);
-                auto timesteps = vector_to_ggml_tensor(work_ctx, timesteps_vec);
+                auto timesteps = sd::Tensor<float>::from_vector(timesteps_vec);
 
                 std::vector<float> guidance_vec(1, 0.f);
-                auto guidance = vector_to_ggml_tensor(work_ctx, guidance_vec);
+                auto guidance = sd::Tensor<float>::from_vector(guidance_vec);
 
-                auto context = ggml_new_tensor_3d(work_ctx, GGML_TYPE_F32, 15360, 256, 1);
+                sd::Tensor<float> context({15360, 256, 1});
                 // ggml_set_f32(context, 0.01f);
-                // auto context = load_tensor_from_file(work_ctx, "chroma_context.bin");
+                // auto context = load_tensor_from_file(ctx, "chroma_context.bin");
                 // print_ggml_tensor(context);
 
-                // auto y = ggml_new_tensor_2d(work_ctx, GGML_TYPE_F32, 768, 1);
+                // auto y = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 768, 1);
                 // ggml_set_f32(y, 0.01f);
                 auto y = nullptr;
                 // print_ggml_tensor(y);
 
-                ggml_tensor* out = nullptr;
+                sd::Tensor<float> out;
 
-                int64_t t0 = ggml_time_ms();
-                compute(8, x, timesteps, context, nullptr, y, guidance, {}, false, &out, work_ctx);
-                int64_t t1 = ggml_time_ms();
+                int64_t t0   = ggml_time_ms();
+                auto out_opt = compute(8,
+                                       x,
+                                       timesteps,
+                                       context,
+                                       {},
+                                       {},
+                                       guidance,
+                                       {},
+                                       false);
+                int64_t t1   = ggml_time_ms();
 
-                print_ggml_tensor(out);
+                GGML_ASSERT(!out_opt.empty());
+                out = std::move(out_opt);
+                print_sd_tensor(out);
                 LOG_DEBUG("flux test done in %lldms", t1 - t0);
             }
         }
diff --git a/src/ggml_extend.hpp b/src/ggml_extend.hpp
index e6b27cc..859270c 100644
--- a/src/ggml_extend.hpp
+++ b/src/ggml_extend.hpp
@@ -13,6 +13,7 @@
 #include <iterator>
 #include <map>
 #include <memory>
+#include <optional>
 #include <random>
 #include <regex>
 #include <set>
@@ -27,6 +28,7 @@
 #include "ggml.h"
 
 #include "model.h"
+#include "tensor.hpp"
 
 #ifdef SD_USE_CUDA
 #include "ggml-cuda.h"
@@ -49,6 +51,7 @@
 #endif
 
 #include "rng.hpp"
+#include "tensor_ggml.hpp"
 #include "util.h"
 
 #define EPS 1e-05f
@@ -205,14 +208,6 @@ __STATIC_INLINE__ float sd_image_get_f32(sd_image_t image, int64_t iw, int64_t i
     return value;
 }
 
-__STATIC_INLINE__ float sd_image_get_f32(sd_image_f32_t image, int64_t iw, int64_t ih, int64_t ic, bool scale = true) {
-    float value = *(image.data + ih * image.width * image.channel + iw * image.channel + ic);
-    if (scale) {
-        value /= 255.f;
-    }
-    return value;
-}
-
 __STATIC_INLINE__ void print_ggml_tensor(ggml_tensor* tensor, bool shape_only = false, const char* mark = "") {
     printf("%s (%s): shape(%zu, %zu, %zu, %zu)\n", mark, ggml_type_name(tensor->type), tensor->ne[0], tensor->ne[1], tensor->ne[2], tensor->ne[3]);
     fflush(stdout);
@@ -250,6 +245,56 @@ __STATIC_INLINE__ void print_ggml_tensor(ggml_tensor* tensor, bool shape_only =
     }
 }
 
+// Debug dump: prints the shape, then (unless shape_only) up to `range` leading and trailing entries of each of the first four axes.
+template <typename T>
+__STATIC_INLINE__ void print_sd_tensor(const sd::Tensor<T>& tensor, bool shape_only = false, const char* mark = "") {
+    printf("%s: shape(", mark);
+    for (size_t i = 0; i < static_cast<size_t>(tensor.dim()); ++i) {
+        printf("%s%lld", i == 0 ? "" : ", ", static_cast<long long>(tensor.shape()[i]));
+    }
+    printf(")\n");
+    fflush(stdout);
+    if (shape_only || tensor.empty()) {  // an empty tensor has no element 0 to read
+        return;
+    }
+    const int64_t range        = 3;
+    std::vector<int64_t> shape = tensor.shape();
+    while (shape.size() < 4) {
+        shape.push_back(1);  // treat missing trailing axes as length 1
+    }
+    auto elided = [&](int64_t i, int64_t extent) { return i >= range && i + range < extent; };  // true in the skipped middle of an axis
+    for (int64_t i3 = 0; i3 < shape[3]; i3++) {
+        if (elided(i3, shape[3]))
+            continue;
+        for (int64_t i2 = 0; i2 < shape[2]; i2++) {
+            if (elided(i2, shape[2]))
+                continue;
+            for (int64_t i1 = 0; i1 < shape[1]; i1++) {
+                if (elided(i1, shape[1]))
+                    continue;
+                for (int64_t i0 = 0; i0 < shape[0]; i0++) {
+                    if (elided(i0, shape[0]))
+                        continue;
+                    const int64_t offset = i0 + shape[0] * (i1 + shape[1] * (i2 + shape[2] * i3));
+                    printf("  [%lld, %lld, %lld, %lld] = ", static_cast<long long>(i3), static_cast<long long>(i2), static_cast<long long>(i1), static_cast<long long>(i0));
+                    if constexpr (std::is_same_v<T, float>) {
+                        printf("%f\n", tensor[offset]);
+                    } else if constexpr (std::is_same_v<T, ggml_fp16_t>) {
+                        printf("%f\n", ggml_fp16_to_fp32(tensor[offset]));
+                    } else if constexpr (std::is_same_v<T, int32_t>) {
+                        printf("%d\n", tensor[offset]);
+                    } else if constexpr (std::is_same_v<T, int64_t>) {
+                        printf("%lld\n", static_cast<long long>(tensor[offset]));
+                    } else {
+                        printf("<unsupported element type>\n");  // still terminate the line for other T
+                    }
+                    fflush(stdout);
+                }
+            }
+        }
+    }
+}
+
 __STATIC_INLINE__ void ggml_ext_tensor_iter(
     ggml_tensor* tensor,
     const std::function<void(ggml_tensor*, int64_t, int64_t, int64_t, int64_t)>& fn) {
@@ -475,99 +520,6 @@ __STATIC_INLINE__ void ggml_ext_tensor_apply_mask(ggml_tensor* image_data,
     }
 }
 
-__STATIC_INLINE__ void sd_image_f32_to_ggml_tensor(sd_image_f32_t image,
-                                                   ggml_tensor* tensor,
-                                                   bool scale = true) {
-    GGML_ASSERT(image.width == tensor->ne[0]);
-    GGML_ASSERT(image.height == tensor->ne[1]);
-    GGML_ASSERT(image.channel == tensor->ne[2]);
-    GGML_ASSERT(1 == tensor->ne[3]);
-    GGML_ASSERT(tensor->type == GGML_TYPE_F32);
-    ggml_ext_tensor_iter(tensor, [&](ggml_tensor* tensor, int64_t i0, int64_t i1, int64_t i2, int64_t i3) {
-        float value = sd_image_get_f32(image, i0, i1, i2, scale);
-        ggml_ext_tensor_set_f32(tensor, value, i0, i1, i2, i3);
-    });
-}
-
-__STATIC_INLINE__ void ggml_ext_tensor_split_2d(ggml_tensor* input,
-                                                ggml_tensor* output,
-                                                int x,
-                                                int y) {
-    int64_t width    = output->ne[0];
-    int64_t height   = output->ne[1];
-    int64_t channels = output->ne[2];
-    int64_t ne3      = output->ne[3];
-
-    int64_t input_width  = input->ne[0];
-    int64_t input_height = input->ne[1];
-
-    GGML_ASSERT(input->type == GGML_TYPE_F32 && output->type == GGML_TYPE_F32);
-    for (int iy = 0; iy < height; iy++) {
-        for (int ix = 0; ix < width; ix++) {
-            for (int k = 0; k < channels; k++) {
-                for (int l = 0; l < ne3; l++) {
-                    float value = ggml_ext_tensor_get_f32(input, (ix + x) % input_width, (iy + y) % input_height, k, l);
-                    ggml_ext_tensor_set_f32(output, value, ix, iy, k, l);
-                }
-            }
-        }
-    }
-}
-
-// unclamped -> expects x in the range [0-1]
-__STATIC_INLINE__ float smootherstep_f32(const float x) {
-    GGML_ASSERT(x >= 0.f && x <= 1.f);
-    return x * x * x * (x * (6.0f * x - 15.0f) + 10.0f);
-}
-
-__STATIC_INLINE__ void ggml_ext_tensor_merge_2d(ggml_tensor* input,
-                                                ggml_tensor* output,
-                                                int x,
-                                                int y,
-                                                int overlap_x,
-                                                int overlap_y,
-                                                bool circular_x,
-                                                bool circular_y,
-                                                int x_skip = 0,
-                                                int y_skip = 0) {
-    int64_t width    = input->ne[0];
-    int64_t height   = input->ne[1];
-    int64_t channels = input->ne[2];
-    int64_t ne3      = input->ne[3];
-
-    int64_t img_width  = output->ne[0];
-    int64_t img_height = output->ne[1];
-
-    GGML_ASSERT(input->type == GGML_TYPE_F32 && output->type == GGML_TYPE_F32);
-    for (int iy = y_skip; iy < height; iy++) {
-        for (int ix = x_skip; ix < width; ix++) {
-            for (int k = 0; k < channels; k++) {
-                for (int l = 0; l < ne3; l++) {
-                    float new_value = ggml_ext_tensor_get_f32(input, ix, iy, k, l);
-                    if (overlap_x > 0 || overlap_y > 0) {  // blend colors in overlapped area
-                        float old_value = ggml_ext_tensor_get_f32(output, (x + ix) % img_width, (y + iy) % img_height, k, l);
-
-                        const float x_f_0 = (circular_x || (overlap_x > 0 && x > 0)) ? (ix - x_skip) / float(overlap_x) : 1;
-                        const float x_f_1 = (circular_x || (overlap_x > 0 && x < (img_width - width))) ? (width - ix) / float(overlap_x) : 1;
-                        const float y_f_0 = (circular_y || (overlap_y > 0 && y > 0)) ? (iy - y_skip) / float(overlap_y) : 1;
-                        const float y_f_1 = (circular_y || (overlap_y > 0 && y < (img_height - height))) ? (height - iy) / float(overlap_y) : 1;
-
-                        const float x_f = std::min(std::min(x_f_0, x_f_1), 1.f);
-                        const float y_f = std::min(std::min(y_f_0, y_f_1), 1.f);
-
-                        ggml_ext_tensor_set_f32(
-                            output,
-                            old_value + new_value * smootherstep_f32(y_f) * smootherstep_f32(x_f),
-                            (x + ix) % img_width, (y + iy) % img_height, k, l);
-                    } else {
-                        ggml_ext_tensor_set_f32(output, new_value, (x + ix) % img_width, (y + iy) % img_height, k, l);
-                    }
-                }
-            }
-        }
-    }
-}
-
 __STATIC_INLINE__ float ggml_ext_tensor_mean(ggml_tensor* src) {
     float mean        = 0.0f;
     int64_t nelements = ggml_nelements(src);
@@ -832,22 +784,102 @@ __STATIC_INLINE__ void sd_tiling_calc_tiles(int& num_tiles_dim,
 }
 
 // Tiling
-__STATIC_INLINE__ void sd_tiling_non_square(ggml_tensor* input,
-                                            ggml_tensor* output,
-                                            const int scale,
-                                            const int p_tile_size_x,
-                                            const int p_tile_size_y,
-                                            const float tile_overlap_factor,
-                                            const bool circular_x,
-                                            const bool circular_y,
-                                            on_tile_process on_processing,
-                                            bool slient = false) {
-    output = ggml_set_f32(output, 0);
 
-    int input_width   = (int)input->ne[0];
-    int input_height  = (int)input->ne[1];
-    int output_width  = (int)output->ne[0];
-    int output_height = (int)output->ne[1];
+__STATIC_INLINE__ int64_t sd_tensor_plane_size(const sd::Tensor<float>& tensor) {  // element count of one shape[0] x shape[1] plane
+    GGML_ASSERT(tensor.dim() >= 2);  // both axis 0 and axis 1 must exist
+    return tensor.shape()[0] * tensor.shape()[1];
+}
+
+// Copies a width x height window starting at (x, y) out of every 2D plane of input; source coordinates wrap at the plane edges.
+__STATIC_INLINE__ sd::Tensor<float> sd_tensor_split_2d(const sd::Tensor<float>& input, int width, int height, int x, int y) {
+    GGML_ASSERT(input.dim() >= 4);
+    std::vector<int64_t> output_shape = input.shape();
+    output_shape[0]                   = width;
+    output_shape[1]                   = height;
+    sd::Tensor<float> output(std::move(output_shape));
+    const int64_t input_width  = input.shape()[0];
+    const int64_t input_height = input.shape()[1];
+    const int64_t input_plane  = sd_tensor_plane_size(input);
+    const int64_t plane_count  = input.numel() / input_plane;
+    for (int64_t plane = 0; plane < plane_count; ++plane) {  // planes outermost: writes are sequential, reads stay in one row
+        for (int64_t iy = 0; iy < height; iy++) {
+            const int64_t src_row = plane * input_plane + input_width * ((iy + y) % input_height);
+            const int64_t dst_row = (plane * height + iy) * width;  // the output plane holds width * height elements
+            for (int64_t ix = 0; ix < width; ix++) {
+                output[dst_row + ix] = input[src_row + (ix + x) % input_width];
+            }
+        }
+    }
+    return output;
+}
+
+// Writes tile `input` into `*output` at offset (x, y), wrapping at the image edges; when any overlap is positive the tile is added with smootherstep edge weights.
+__STATIC_INLINE__ void sd_tensor_merge_2d(const sd::Tensor<float>& input,
+                                          sd::Tensor<float>* output,
+                                          int x,
+                                          int y,
+                                          int overlap_x,
+                                          int overlap_y,
+                                          bool circular_x,
+                                          bool circular_y,
+                                          int x_skip = 0,
+                                          int y_skip = 0) {
+    GGML_ASSERT(output != nullptr);
+    const int64_t width        = input.shape()[0];
+    const int64_t height       = input.shape()[1];
+    const int64_t img_width    = output->shape()[0];
+    const int64_t img_height   = output->shape()[1];
+    const int64_t input_plane  = sd_tensor_plane_size(input);
+    const int64_t output_plane = sd_tensor_plane_size(*output);
+    const int64_t plane_count  = input.numel() / input_plane;
+    GGML_ASSERT(output->numel() / output_plane == plane_count);
+
+    // unclamped -> expects x in the range [0-1]
+    auto smootherstep_f32 = [](const float x) -> float {
+        GGML_ASSERT(x >= 0.f && x <= 1.f);
+        return x * x * x * (x * (6.0f * x - 15.0f) + 10.0f);
+    };
+
+    const bool blend = overlap_x > 0 || overlap_y > 0;
+    // An edge only ramps when its axis has a real overlap: dividing by a zero overlap would feed NaN/inf into smootherstep_f32.
+    const bool ramp_x0 = overlap_x > 0 && (circular_x || x > 0);
+    const bool ramp_x1 = overlap_x > 0 && (circular_x || x < (img_width - width));
+    const bool ramp_y0 = overlap_y > 0 && (circular_y || y > 0);
+    const bool ramp_y1 = overlap_y > 0 && (circular_y || y < (img_height - height));
+    for (int64_t iy = y_skip; iy < height; iy++) {
+        const float y_f_0 = ramp_y0 ? (iy - y_skip) / float(overlap_y) : 1.f;
+        const float y_f_1 = ramp_y1 ? (height - iy) / float(overlap_y) : 1.f;
+        const float w_y   = smootherstep_f32(std::min(std::min(y_f_0, y_f_1), 1.f));  // same for every pixel of this row
+        for (int64_t ix = x_skip; ix < width; ix++) {
+            const float x_f_0    = ramp_x0 ? (ix - x_skip) / float(overlap_x) : 1.f;
+            const float x_f_1    = ramp_x1 ? (width - ix) / float(overlap_x) : 1.f;
+            const float w_x      = smootherstep_f32(std::min(std::min(x_f_0, x_f_1), 1.f));
+            const int64_t src_xy = ix + width * iy;
+            const int64_t dst_xy = (x + ix) % img_width + img_width * ((y + iy) % img_height);
+            for (int64_t plane = 0; plane < plane_count; ++plane) {
+                const float new_value   = input[plane * input_plane + src_xy];
+                const int64_t dst_index = plane * output_plane + dst_xy;
+                (*output)[dst_index]    = blend ? (*output)[dst_index] + new_value * w_y * w_x : new_value;
+            }
+        }
+    }
+}
+
+template <typename Fn>
+__STATIC_INLINE__ sd::Tensor<float> process_tiles_2d(const sd::Tensor<float>& input,
+                                                     int output_width,
+                                                     int output_height,
+                                                     int scale,
+                                                     int p_tile_size_x,
+                                                     int p_tile_size_y,
+                                                     float tile_overlap_factor,
+                                                     bool circular_x,
+                                                     bool circular_y,
+                                                     Fn&& on_processing,
+                                                     bool silent = false) {
+    sd::Tensor<float> output;
+    int input_width  = static_cast<int>(input.shape()[0]);
+    int input_height = static_cast<int>(input.shape()[1]);
 
     GGML_ASSERT(((input_width / output_width) == (input_height / output_height)) &&
                 ((output_width / input_width) == (output_height / input_height)));
@@ -856,8 +888,7 @@ __STATIC_INLINE__ void sd_tiling_non_square(ggml_tensor* input,
 
     int small_width  = output_width;
     int small_height = output_height;
-
-    bool decode = output_width > input_width;
+    bool decode      = output_width > input_width;
     if (decode) {
         small_width  = input_width;
         small_height = input_height;
@@ -871,25 +902,16 @@ __STATIC_INLINE__ void sd_tiling_non_square(ggml_tensor* input,
     float tile_overlap_factor_y;
     sd_tiling_calc_tiles(num_tiles_y, tile_overlap_factor_y, small_height, p_tile_size_y, tile_overlap_factor, circular_y);
 
-    if (!slient) {
-        LOG_DEBUG("num tiles : %d, %d ", num_tiles_x, num_tiles_y);
-        LOG_DEBUG("optimal overlap : %f, %f (targeting %f)", tile_overlap_factor_x, tile_overlap_factor_y, tile_overlap_factor);
-    }
-
-    int tile_overlap_x     = (int32_t)(p_tile_size_x * tile_overlap_factor_x);
+    int tile_overlap_x     = static_cast<int32_t>(p_tile_size_x * tile_overlap_factor_x);
     int non_tile_overlap_x = p_tile_size_x - tile_overlap_x;
-
-    int tile_overlap_y     = (int32_t)(p_tile_size_y * tile_overlap_factor_y);
+    int tile_overlap_y     = static_cast<int32_t>(p_tile_size_y * tile_overlap_factor_y);
     int non_tile_overlap_y = p_tile_size_y - tile_overlap_y;
-
-    int tile_size_x = p_tile_size_x < small_width ? p_tile_size_x : small_width;
-    int tile_size_y = p_tile_size_y < small_height ? p_tile_size_y : small_height;
-
+    int tile_size_x        = p_tile_size_x < small_width ? p_tile_size_x : small_width;
+    int tile_size_y        = p_tile_size_y < small_height ? p_tile_size_y : small_height;
     int input_tile_size_x  = tile_size_x;
     int input_tile_size_y  = tile_size_y;
     int output_tile_size_x = tile_size_x;
     int output_tile_size_y = tile_size_y;
-
     if (decode) {
         output_tile_size_x *= scale;
         output_tile_size_y *= scale;
@@ -898,41 +920,23 @@ __STATIC_INLINE__ void sd_tiling_non_square(ggml_tensor* input,
         input_tile_size_y *= scale;
     }
 
-    ggml_init_params params = {};
-    params.mem_size += input_tile_size_x * input_tile_size_y * input->ne[2] * input->ne[3] * sizeof(float);      // input chunk
-    params.mem_size += output_tile_size_x * output_tile_size_y * output->ne[2] * output->ne[3] * sizeof(float);  // output chunk
-    params.mem_size += 3 * ggml_tensor_overhead();
-    params.mem_buffer = nullptr;
-    params.no_alloc   = false;
-
-    if (!slient) {
-        LOG_DEBUG("tile work buffer size: %.2f MB", params.mem_size / 1024.f / 1024.f);
-    }
-
-    // draft context
-    ggml_context* tiles_ctx = ggml_init(params);
-    if (!tiles_ctx) {
-        LOG_ERROR("ggml_init() failed");
-        return;
-    }
-
-    // tiling
-    ggml_tensor* input_tile  = ggml_new_tensor_4d(tiles_ctx, GGML_TYPE_F32, input_tile_size_x, input_tile_size_y, input->ne[2], input->ne[3]);
-    ggml_tensor* output_tile = ggml_new_tensor_4d(tiles_ctx, GGML_TYPE_F32, output_tile_size_x, output_tile_size_y, output->ne[2], output->ne[3]);
-    int num_tiles            = num_tiles_x * num_tiles_y;
-    if (!slient) {
+    int num_tiles   = num_tiles_x * num_tiles_y;
+    int tile_count  = 1;
+    bool last_y     = false;
+    bool last_x     = false;
+    float last_time = 0.0f;
+    if (!silent) {
+        LOG_DEBUG("num tiles : %d, %d ", num_tiles_x, num_tiles_y);
+        LOG_DEBUG("optimal overlap : %f, %f (targeting %f)", tile_overlap_factor_x, tile_overlap_factor_y, tile_overlap_factor);
         LOG_DEBUG("processing %i tiles", num_tiles);
         pretty_progress(0, num_tiles, 0.0f);
     }
-    int tile_count = 1;
-    bool last_y = false, last_x = false;
-    float last_time = 0.0f;
     for (int y = 0; y < small_height && !last_y; y += non_tile_overlap_y) {
         int dy = 0;
         if (!circular_y && y + tile_size_y >= small_height) {
-            int _y = y;
-            y      = small_height - tile_size_y;
-            dy     = _y - y;
+            int original_y = y;
+            y              = small_height - tile_size_y;
+            dy             = original_y - y;
             if (decode) {
                 dy *= scale;
             }
@@ -941,9 +945,9 @@ __STATIC_INLINE__ void sd_tiling_non_square(ggml_tensor* input,
         for (int x = 0; x < small_width && !last_x; x += non_tile_overlap_x) {
             int dx = 0;
             if (!circular_x && x + tile_size_x >= small_width) {
-                int _x = x;
-                x      = small_width - tile_size_x;
-                dx     = _x - x;
+                int original_x = x;
+                x              = small_width - tile_size_x;
+                dx             = original_x - x;
                 if (decode) {
                     dx *= scale;
                 }
@@ -958,38 +962,37 @@ __STATIC_INLINE__ void sd_tiling_non_square(ggml_tensor* input,
             int overlap_x_out = decode ? tile_overlap_x * scale : tile_overlap_x;
             int overlap_y_out = decode ? tile_overlap_y * scale : tile_overlap_y;
 
-            int64_t t1 = ggml_time_ms();
-            ggml_ext_tensor_split_2d(input, input_tile, x_in, y_in);
-            if (on_processing(input_tile, output_tile, false)) {
-                ggml_ext_tensor_merge_2d(output_tile, output, x_out, y_out, overlap_x_out, overlap_y_out, circular_x, circular_y, dx, dy);
+            int64_t t1       = ggml_time_ms();
+            auto input_tile  = sd_tensor_split_2d(input, input_tile_size_x, input_tile_size_y, x_in, y_in);
+            auto output_tile = on_processing(input_tile);
+            if (output_tile.empty()) {
+                return {};
+            }
+            GGML_ASSERT(output_tile.shape()[0] == output_tile_size_x && output_tile.shape()[1] == output_tile_size_y);
+            if (output.empty()) {
+                std::vector<int64_t> output_shape = output_tile.shape();
+                output_shape[0]                   = output_width;
+                output_shape[1]                   = output_height;
+                output                            = sd::Tensor<float>::zeros(std::move(output_shape));
+            }
+            sd_tensor_merge_2d(output_tile, &output, x_out, y_out, overlap_x_out, overlap_y_out, circular_x, circular_y, dx, dy);
 
+            if (!silent) {
                 int64_t t2 = ggml_time_ms();
                 last_time  = (t2 - t1) / 1000.0f;
                 pretty_progress(tile_count, num_tiles, last_time);
-            } else {
-                LOG_ERROR("Failed to process patch %d at (%d, %d)", tile_count, x, y);
             }
             tile_count++;
         }
         last_x = false;
     }
-    if (!slient) {
-        if (tile_count < num_tiles) {
-            pretty_progress(num_tiles, num_tiles, last_time);
-        }
+    if (!silent && tile_count < num_tiles) {
+        pretty_progress(num_tiles, num_tiles, last_time);
     }
-    ggml_free(tiles_ctx);
-}
-
-__STATIC_INLINE__ void sd_tiling(ggml_tensor* input,
-                                 ggml_tensor* output,
-                                 const int scale,
-                                 const int tile_size,
-                                 const float tile_overlap_factor,
-                                 const bool circular_x,
-                                 const bool circular_y,
-                                 on_tile_process on_processing) {
-    sd_tiling_non_square(input, output, scale, tile_size, tile_size, tile_overlap_factor, circular_x, circular_y, on_processing);
+    if (output.empty()) {
+        return {};
+    }
+    return output;
 }
 
 __STATIC_INLINE__ ggml_tensor* ggml_ext_group_norm_32(ggml_context* ctx,
@@ -1588,6 +1591,18 @@ __STATIC_INLINE__ void set_timestep_embedding(std::vector<float> timesteps,
     memcpy(((char*)embedding->data), ((char*)embedding_vec.data()), ggml_nbytes(embedding));
 }
 
+__STATIC_INLINE__ void set_timestep_embedding(std::vector<float> timesteps,  // values handed to timestep_embedding()
+                                              sd::Tensor<float>* embedding,  // destination; must not be null, resized when its element count does not match
+                                              int dim,
+                                              int max_period = 10000) {
+    GGML_ASSERT(embedding != nullptr);
+    std::vector<float> embedding_vec = timestep_embedding(timesteps, dim, max_period);
+    if (embedding->numel() != static_cast<int64_t>(embedding_vec.size())) {  // row width comes from the data actually produced so the copy below always fits (NOTE(review): timestep_embedding may pad odd dims - confirm)
+        embedding->resize({timesteps.empty() ? static_cast<int64_t>(dim) : static_cast<int64_t>(embedding_vec.size() / timesteps.size()), static_cast<int64_t>(timesteps.size())});
+    }
+    std::copy(embedding_vec.begin(), embedding_vec.end(), embedding->values().begin());
+}
+
 __STATIC_INLINE__ ggml_tensor* new_timestep_embedding(ggml_context* ctx,
                                                       std::vector<float> timesteps,
                                                       int dim,
@@ -1705,6 +1720,32 @@ protected:
     bool circular_x_enabled    = false;
     bool circular_y_enabled    = false;
 
+    template <typename T>
+    static sd::Tensor<T> take_or_empty(std::optional<sd::Tensor<T>> tensor) {  // unwraps an optional tensor result
+        if (tensor.has_value()) {
+            return std::move(*tensor);  // move the payload out instead of copying it
+        }
+        return {};  // nothing held: hand back a default-constructed tensor
+    }
+
+    template <typename T>
+    static sd::Tensor<T> restore_trailing_singleton_dims(std::optional<sd::Tensor<T>> tensor, size_t expected_dim) {
+        sd::Tensor<T> unwrapped = take_or_empty(std::move(tensor));  // default-constructed tensor when the optional holds nothing
+        return restore_trailing_singleton_dims(std::move(unwrapped), expected_dim);  // defer to the sd::Tensor<T> overload
+    }
+
+    template <typename T>
+    static sd::Tensor<T> restore_trailing_singleton_dims(sd::Tensor<T> tensor,
+                                                         size_t expected_dim) {
+        // An empty tensor is handed back untouched; otherwise unsqueeze_ is applied at
+        // index dim() (one past the last axis) until the rank reaches expected_dim.
+        const bool pad_rank = !tensor.empty();
+        while (pad_rank && static_cast<size_t>(tensor.dim()) < expected_dim) {
+            tensor.unsqueeze_(tensor.dim());
+        }
+        return tensor;
+    }
+
     void alloc_params_ctx() {
         ggml_init_params params;
         params.mem_size   = static_cast<size_t>(MAX_PARAMS_TENSOR_NUM * ggml_tensor_overhead());
@@ -2042,6 +2083,29 @@ public:
         backend_tensor_data_map[tensor] = data;
     }
 
+    template <typename T>
+    ggml_tensor* make_input(const sd::Tensor<T>& tensor) {  // creates a ggml tensor in compute_ctx for `tensor` and registers its data
+        ggml_tensor* const graph_input = sd::make_ggml_tensor(compute_ctx, tensor, false);
+        set_backend_tensor_data(graph_input, tensor.data());  // only the pointer is registered - presumably `tensor` must stay alive until it is consumed; verify against callers
+        return graph_input;
+    }
+
+    template <typename T>
+    ggml_tensor* make_optional_input(const sd::Tensor<T>& tensor) {  // like make_input(), but an empty tensor maps to nullptr
+        if (!tensor.empty()) {
+            return make_input(tensor);
+        }
+        return nullptr;  // empty tensor: no graph input is created
+    }
+
+    template <typename T>
+    ggml_tensor* make_optional_input(const sd::Tensor<T>* tensor) {  // pointer flavour: nullptr in, nullptr out
+        if (tensor != nullptr) {
+            return make_input(*tensor);  // note: a non-null pointer is forwarded even if the tensor itself is empty
+        }
+        return nullptr;
+    }
+
     ggml_tensor* to_backend(ggml_tensor* tensor) {
         GGML_ASSERT(compute_ctx != nullptr);
         if (tensor == nullptr) {
@@ -2070,24 +2134,24 @@ public:
         return ggml_get_tensor(cache_ctx, name.c_str());
     }
 
-    bool compute(get_graph_cb_t get_graph,
-                 int n_threads,
-                 bool free_compute_buffer_immediately = true,
-                 ggml_tensor** output                 = nullptr,
-                 ggml_context* output_ctx             = nullptr) {
+    template <typename T>
+    std::optional<sd::Tensor<T>> compute(get_graph_cb_t get_graph,
+                                         int n_threads,
+                                         bool free_compute_buffer_immediately,
+                                         bool no_return = false) {
         if (!offload_params_to_runtime_backend()) {
             LOG_ERROR("%s offload params to runtime backend failed", get_desc().c_str());
-            return false;
+            return std::nullopt;
         }
         if (!alloc_compute_buffer(get_graph)) {
             LOG_ERROR("%s alloc compute buffer failed", get_desc().c_str());
-            return false;
+            return std::nullopt;
         }
         reset_compute_ctx();
         ggml_cgraph* gf = get_compute_graph(get_graph);
         if (!ggml_gallocr_alloc_graph(compute_allocr, gf)) {
             LOG_ERROR("%s alloc compute graph failed", get_desc().c_str());
-            return false;
+            return std::nullopt;
         }
         copy_data_to_backend_tensor();
         if (ggml_backend_is_cpu(runtime_backend)) {
@@ -2097,26 +2161,19 @@ public:
         ggml_status status = ggml_backend_graph_compute(runtime_backend, gf);
         if (status != GGML_STATUS_SUCCESS) {
             LOG_ERROR("%s compute failed: %s", get_desc().c_str(), ggml_status_to_string(status));
-            return false;
+            return std::nullopt;
         }
-#ifdef GGML_PERF
-        ggml_graph_print(gf);
-#endif
         copy_cache_tensors_to_cache_buffer();
-        if (output != nullptr) {
-            auto result = ggml_get_tensor(compute_ctx, final_result_name.c_str());
-            if (*output == nullptr && output_ctx != nullptr) {
-                *output = ggml_dup_tensor(output_ctx, result);
-            }
-            if (*output != nullptr) {
-                ggml_ext_backend_tensor_get_and_sync(runtime_backend, result, (*output)->data, 0, ggml_nbytes(*output));
-            }
+        auto result = ggml_get_tensor(compute_ctx, final_result_name.c_str());
+        std::optional<sd::Tensor<T>> output;
+        if (!no_return) {
+            output = sd::make_sd_tensor_from_ggml<T>(result);
         }
 
         if (free_compute_buffer_immediately) {
             free_compute_buffer();
         }
-        return true;
+        return output;
     }
 
     void set_flash_attention_enabled(bool enabled) {
diff --git a/src/latent-preview.h b/src/latent-preview.h
index 5078a6b..7f30734 100644
--- a/src/latent-preview.h
+++ b/src/latent-preview.h
@@ -1,6 +1,8 @@
+#include <algorithm>
 #include <cstddef>
 #include <cstdint>
 #include "ggml.h"
+#include "tensor.hpp"
 
 const float wan_21_latent_rgb_proj[16][3] = {
     {0.015123f, -0.148418f, 0.479828f},
@@ -232,3 +234,67 @@ void preview_latent_video(uint8_t* buffer, ggml_tensor* latents, const float (*l
         }
     }
 }
+
+static inline bool preview_latent_tensor_is_video(const sd::Tensor<float>& latents) {
+    return latents.dim() == 5;  // rank-5 latents are classified as video; every other rank is not
+}
+
+void preview_latent_video(uint8_t* buffer, const sd::Tensor<float>& latents, const float (*latent_rgb_proj)[3], const float latent_rgb_bias[3], int patch_size) {  // writes 8-bit RGB frames into buffer (frames * rgb_width * rgb_height * 3 bytes)
+    uint32_t latent_width  = static_cast<uint32_t>(latents.shape()[0]);
+    uint32_t latent_height = static_cast<uint32_t>(latents.shape()[1]);
+    bool is_video          = preview_latent_tensor_is_video(latents);
+    uint32_t frames        = is_video ? static_cast<uint32_t>(latents.shape()[2]) : 1;  // non-video latents are rendered as a single frame
+    uint32_t dim           = is_video ? static_cast<uint32_t>(latents.shape()[3]) : static_cast<uint32_t>(latents.shape()[2]);
+
+    uint32_t rgb_width     = latent_width * patch_size;
+    uint32_t rgb_height    = latent_height * patch_size;
+    uint32_t unpatched_dim = dim / (patch_size * patch_size);  // channels that remain once patch_size^2 sub-pixels are unpacked
+
+    for (uint32_t k = 0; k < frames; k++) {
+        for (uint32_t rgb_x = 0; rgb_x < rgb_width; rgb_x++) {
+            for (uint32_t rgb_y = 0; rgb_y < rgb_height; rgb_y++) {
+                uint32_t latent_x = rgb_x / patch_size;
+                uint32_t latent_y = rgb_y / patch_size;
+
+                uint32_t channel_offset = 0;  // which sub-pixel of the patch this output pixel corresponds to
+                if (patch_size > 1) {
+                    channel_offset = ((rgb_y % patch_size) * patch_size + (rgb_x % patch_size));
+                }
+
+                size_t pixel_id   = (static_cast<size_t>(k) * rgb_height + rgb_y) * rgb_width + rgb_x;  // widened first so the product cannot wrap in 32 bits
+                auto latent_value = [&](uint32_t latent_channel) -> float {
+                    return is_video
+                               ? latents.values()[latent_x + static_cast<size_t>(latent_width) * (latent_y + static_cast<size_t>(latent_height) * (k + static_cast<size_t>(frames) * latent_channel))]
+                               : latents.values()[latent_x + static_cast<size_t>(latent_width) * (latent_y + static_cast<size_t>(latent_height) * latent_channel)];
+                };
+
+                float r = 0.f, g = 0.f, b = 0.f;
+                if (latent_rgb_proj != nullptr) {  // project every unpatched channel onto RGB
+                    for (uint32_t d = 0; d < unpatched_dim; d++) {
+                        uint32_t latent_channel = d * patch_size * patch_size + channel_offset;
+                        float value             = latent_value(latent_channel);
+                        r += value * latent_rgb_proj[d][0];
+                        g += value * latent_rgb_proj[d][1];
+                        b += value * latent_rgb_proj[d][2];
+                    }
+                } else {  // no projection: channels 0..2 are read directly as RGB (the tensor must provide at least three)
+                    r = latent_value(0);
+                    g = latent_value(1);
+                    b = latent_value(2);
+                }
+                if (latent_rgb_bias != nullptr) {
+                    r += latent_rgb_bias[0];
+                    g += latent_rgb_bias[1];
+                    b += latent_rgb_bias[2];
+                }
+                r = std::min(1.0f, std::max(0.0f, r * .5f + .5f));  // map [-1, 1] to [0, 1] and clamp
+                g = std::min(1.0f, std::max(0.0f, g * .5f + .5f));
+                b = std::min(1.0f, std::max(0.0f, b * .5f + .5f));
+
+                buffer[pixel_id * 3 + 0] = static_cast<uint8_t>(r * 255);
+                buffer[pixel_id * 3 + 1] = static_cast<uint8_t>(g * 255);
+                buffer[pixel_id * 3 + 2] = static_cast<uint8_t>(b * 255);
+            }
+        }
+    }
+}
diff --git a/src/llm.hpp b/src/llm.hpp
index 5a9c25c..c6c2961 100644
--- a/src/llm.hpp
+++ b/src/llm.hpp
@@ -194,6 +194,7 @@ namespace LLM {
                         bool padding      = false) {
             if (add_bos_token) {
                 tokens.insert(tokens.begin(), BOS_TOKEN_ID);
+                weights.insert(weights.begin(), 1.f);
             }
             if (max_length > 0 && padding) {
                 size_t n = static_cast<size_t>(std::ceil(tokens.size() * 1.f / max_length));
@@ -1180,16 +1181,17 @@ namespace LLM {
             return hidden_states;
         }
 
-        ggml_cgraph* build_graph(ggml_tensor* input_ids,
-                                 ggml_tensor* attention_mask,
-                                 std::vector<std::pair<int, ggml_tensor*>> image_embeds,
+        ggml_cgraph* build_graph(const sd::Tensor<int32_t>& input_ids_tensor,
+                                 const sd::Tensor<float>& attention_mask_tensor,
+                                 const std::vector<std::pair<int, sd::Tensor<float>>>& image_embeds_tensor,
                                  std::set<int> out_layers) {
-            ggml_cgraph* gf = ggml_new_graph(compute_ctx);
-
-            input_ids = to_backend(input_ids);
-
-            for (auto& image_embed : image_embeds) {
-                image_embed.second = to_backend(image_embed.second);
+            ggml_cgraph* gf        = ggml_new_graph(compute_ctx);
+            ggml_tensor* input_ids = make_input(input_ids_tensor);
+            std::vector<std::pair<int, ggml_tensor*>> image_embeds;
+            image_embeds.reserve(image_embeds_tensor.size());
+            for (const auto& [idx, embed_tensor] : image_embeds_tensor) {
+                ggml_tensor* embed = make_input(embed_tensor);
+                image_embeds.emplace_back(idx, embed);
             }
 
             int64_t n_tokens = input_ids->ne[0];
@@ -1213,8 +1215,9 @@ namespace LLM {
                                                 input_pos_vec.size());
             set_backend_tensor_data(input_pos, input_pos_vec.data());
 
-            if (attention_mask != nullptr) {
-                attention_mask = to_backend(attention_mask);
+            ggml_tensor* attention_mask = nullptr;
+            if (!attention_mask_tensor.empty()) {
+                attention_mask = make_input(attention_mask_tensor);
             } else {
                 attention_mask_vec.resize(n_tokens * n_tokens);
                 for (int i0 = 0; i0 < n_tokens; i0++) {
@@ -1239,17 +1242,15 @@ namespace LLM {
             return gf;
         }
 
-        bool compute(const int n_threads,
-                     ggml_tensor* input_ids,
-                     ggml_tensor* attention_mask,
-                     std::vector<std::pair<int, ggml_tensor*>> image_embeds,
-                     std::set<int> out_layers,
-                     ggml_tensor** output,
-                     ggml_context* output_ctx = nullptr) {
+        sd::Tensor<float> compute(const int n_threads,  // builds the graph via build_graph() and runs it with GGMLRunner::compute<float>
+                                  const sd::Tensor<int32_t>& input_ids,
+                                  const sd::Tensor<float>& attention_mask,  // may be empty; build_graph() then fills in a mask itself
+                                  const std::vector<std::pair<int, sd::Tensor<float>>>& image_embeds,
+                                  std::set<int> out_layers) {
             auto get_graph = [&]() -> ggml_cgraph* {
                 return build_graph(input_ids, attention_mask, image_embeds, out_layers);
             };
-            return GGMLRunner::compute(get_graph, n_threads, true, output, output_ctx);
+            return take_or_empty(GGMLRunner::compute<float>(get_graph, n_threads, true));  // `true` frees the compute buffer right away; an empty tensor comes back when compute yields no value
         }
 
         int64_t get_num_image_tokens(int64_t t, int64_t h, int64_t w) {
@@ -1288,8 +1289,9 @@ namespace LLM {
             return image;
         }
 
-        ggml_cgraph* build_encode_image_graph(ggml_tensor* image) {
-            ggml_cgraph* gf = new_graph_custom(LLM_GRAPH_SIZE);
+        ggml_cgraph* build_encode_image_graph(const sd::Tensor<float>& image_tensor) {
+            ggml_cgraph* gf    = new_graph_custom(LLM_GRAPH_SIZE);
+            ggml_tensor* image = make_input(image_tensor);
 
             GGML_ASSERT(image->ne[1] % (params.vision.patch_size * params.vision.spatial_merge_size) == 0);
             GGML_ASSERT(image->ne[0] % (params.vision.patch_size * params.vision.spatial_merge_size) == 0);
@@ -1301,8 +1303,6 @@ namespace LLM {
             int llm_grid_w             = grid_w / params.vision.spatial_merge_size;
             int vit_merger_window_size = params.vision.window_size / params.vision.patch_size / params.vision.spatial_merge_size;
 
-            image = to_backend(image);
-
             auto pixel_values = process_image(compute_ctx, image);
 
             // window index
@@ -1411,14 +1411,12 @@ namespace LLM {
             return gf;
         }
 
-        void encode_image(const int n_threads,
-                          ggml_tensor* image,
-                          ggml_tensor** output,
-                          ggml_context* output_ctx = nullptr) {
+        sd::Tensor<float> encode_image(const int n_threads,  // runs build_encode_image_graph() on `image` and returns the result tensor
+                                       const sd::Tensor<float>& image) {
             auto get_graph = [&]() -> ggml_cgraph* {
                 return build_encode_image_graph(image);
             };
-            GGMLRunner::compute(get_graph, n_threads, false, output, output_ctx);
+            return take_or_empty(GGMLRunner::compute<float>(get_graph, n_threads, false));  // `false` keeps the compute buffer allocated; an empty tensor comes back when compute yields no value
         }
     };
 
@@ -1497,39 +1495,41 @@ namespace LLM {
             params.mem_buffer = nullptr;
             params.no_alloc   = false;
 
-            ggml_context* work_ctx = ggml_init(params);
-            GGML_ASSERT(work_ctx != nullptr);
+            ggml_context* ctx = ggml_init(params);
+            GGML_ASSERT(ctx != nullptr);
             bool test_mistral          = false;
             bool test_qwen3            = true;
             bool test_vit              = false;
             bool test_decoder_with_vit = false;
 
             if (test_decoder_with_vit) {
-                ggml_tensor* image_embed = nullptr;
+                sd::Tensor<float> image_embed;
                 {
-                    auto image = load_tensor_from_file(work_ctx, "qwen2vl_normalized.bin");
-                    print_ggml_tensor(image, false, "image");
-                    ggml_tensor* out = nullptr;
+                    auto image = sd::load_tensor_from_file_as_tensor<float>("qwen2vl_normalized.bin");
+                    print_sd_tensor(image, false, "image");
+                    sd::Tensor<float> out;
 
-                    int64_t t0 = ggml_time_ms();
-                    model.encode_image(8, image, &out, work_ctx);
-                    int64_t t1 = ggml_time_ms();
+                    int64_t t0   = ggml_time_ms();
+                    auto out_opt = model.encode_image(8, image);
+                    int64_t t1   = ggml_time_ms();
 
-                    print_ggml_tensor(out, false, "image_embed");
+                    GGML_ASSERT(!out_opt.empty());
+                    out = std::move(out_opt);
+                    print_sd_tensor(out, false, "image_embed");
                     image_embed = out;
                     LOG_DEBUG("llm encode_image test done in %lldms", t1 - t0);
                 }
 
                 std::string placeholder  = "<|image_pad|>";
                 std::string img_prompt   = "Picture 1: <|vision_start|>";  // [24669, 220, 16, 25, 220, 151652]
-                int64_t num_image_tokens = image_embed->ne[1];
+                int64_t num_image_tokens = image_embed.shape()[1];
                 img_prompt.reserve(num_image_tokens * placeholder.size());
                 for (int i = 0; i < num_image_tokens; i++) {
                     img_prompt += placeholder;
                 }
                 img_prompt += "<|vision_end|>";
 
-                std::vector<std::pair<int, ggml_tensor*>> image_embeds;
+                std::vector<std::pair<int, sd::Tensor<float>>> image_embeds;
                 image_embeds.emplace_back(64, image_embed);
 
                 std::pair<int, int> prompt_attn_range;
@@ -1547,29 +1547,33 @@ namespace LLM {
                     printf("%d ", token);
                 }
                 printf("\n");
-                auto input_ids   = vector_to_ggml_tensor_i32(work_ctx, tokens);
-                ggml_tensor* out = nullptr;
+                auto input_ids = sd::Tensor<int32_t>::from_vector(tokens);
+                sd::Tensor<float> out;
 
-                int64_t t0 = ggml_time_ms();
-                model.compute(8, input_ids, nullptr, image_embeds, {}, &out, work_ctx);
-                int64_t t1 = ggml_time_ms();
+                int64_t t0   = ggml_time_ms();
+                auto out_opt = model.compute(8, input_ids, sd::Tensor<float>(), image_embeds, {});
+                int64_t t1   = ggml_time_ms();
 
-                print_ggml_tensor(out);
+                GGML_ASSERT(!out_opt.empty());
+                out = std::move(out_opt);
+                print_sd_tensor(out);
                 LOG_DEBUG("llm test done in %lldms", t1 - t0);
             } else if (test_vit) {
-                // auto image = ggml_new_tensor_3d(work_ctx, GGML_TYPE_F32, 280, 280, 3);
+                // auto image = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, 280, 280, 3);
                 // ggml_set_f32(image, 0.f);
-                auto image = load_tensor_from_file(work_ctx, "qwen2vl_normalized.bin");
-                print_ggml_tensor(image, false, "image");
-                ggml_tensor* out = nullptr;
+                auto image = sd::load_tensor_from_file_as_tensor<float>("qwen2vl_normalized.bin");
+                print_sd_tensor(image, false, "image");
+                sd::Tensor<float> out;
 
-                int64_t t0 = ggml_time_ms();
-                model.encode_image(8, image, &out, work_ctx);
-                int64_t t1 = ggml_time_ms();
+                int64_t t0   = ggml_time_ms();
+                auto out_opt = model.encode_image(8, image);
+                int64_t t1   = ggml_time_ms();
 
-                print_ggml_tensor(out, false, "out");
+                GGML_ASSERT(!out_opt.empty());
+                out = std::move(out_opt);
+                print_sd_tensor(out, false, "out");
 
-                // auto ref_out = load_tensor_from_file(work_ctx, "qwen2vl.bin");
+                // auto ref_out = load_tensor_from_file(ctx, "qwen2vl.bin");
                 // ggml_ext_tensor_diff(ref_out, out, 0.01f);
 
                 LOG_DEBUG("llm test done in %lldms", t1 - t0);
@@ -1587,14 +1591,16 @@ namespace LLM {
                     printf("%d ", token);
                 }
                 printf("\n");
-                auto input_ids   = vector_to_ggml_tensor_i32(work_ctx, tokens);
-                ggml_tensor* out = nullptr;
+                auto input_ids = sd::Tensor<int32_t>::from_vector(tokens);
+                sd::Tensor<float> out;
 
-                int64_t t0 = ggml_time_ms();
-                model.compute(8, input_ids, nullptr, {}, {10, 20, 30}, &out, work_ctx);
-                int64_t t1 = ggml_time_ms();
+                int64_t t0   = ggml_time_ms();
+                auto out_opt = model.compute(8, input_ids, sd::Tensor<float>(), {}, {10, 20, 30});
+                int64_t t1   = ggml_time_ms();
 
-                print_ggml_tensor(out);
+                GGML_ASSERT(!out_opt.empty());
+                out = std::move(out_opt);
+                print_sd_tensor(out);
                 LOG_DEBUG("llm test done in %lldms", t1 - t0);
             } else if (test_qwen3) {
                 std::pair<int, int> prompt_attn_range;
@@ -1610,14 +1616,16 @@ namespace LLM {
                     printf("%d ", token);
                 }
                 printf("\n");
-                auto input_ids   = vector_to_ggml_tensor_i32(work_ctx, tokens);
-                ggml_tensor* out = nullptr;
+                auto input_ids = sd::Tensor<int32_t>::from_vector(tokens);
+                sd::Tensor<float> out;
 
-                int64_t t0 = ggml_time_ms();
-                model.compute(8, input_ids, nullptr, {}, {35}, &out, work_ctx);
-                int64_t t1 = ggml_time_ms();
+                int64_t t0   = ggml_time_ms();
+                auto out_opt = model.compute(8, input_ids, sd::Tensor<float>(), {}, {35});
+                int64_t t1   = ggml_time_ms();
 
-                print_ggml_tensor(out);
+                GGML_ASSERT(!out_opt.empty());
+                out = std::move(out_opt);
+                print_sd_tensor(out);
                 LOG_DEBUG("llm test done in %lldms", t1 - t0);
             } else {
                 std::pair<int, int> prompt_attn_range;
@@ -1633,14 +1641,16 @@ namespace LLM {
                     printf("%d ", token);
                 }
                 printf("\n");
-                auto input_ids   = vector_to_ggml_tensor_i32(work_ctx, tokens);
-                ggml_tensor* out = nullptr;
+                auto input_ids = sd::Tensor<int32_t>::from_vector(tokens);
+                sd::Tensor<float> out;
 
-                int64_t t0 = ggml_time_ms();
-                model.compute(8, input_ids, nullptr, {}, {}, &out, work_ctx);
-                int64_t t1 = ggml_time_ms();
+                int64_t t0   = ggml_time_ms();
+                auto out_opt = model.compute(8, input_ids, sd::Tensor<float>(), {}, {});
+                int64_t t1   = ggml_time_ms();
 
-                print_ggml_tensor(out);
+                GGML_ASSERT(!out_opt.empty());
+                out = std::move(out_opt);
+                print_sd_tensor(out);
                 LOG_DEBUG("llm test done in %lldms", t1 - t0);
             }
         }
diff --git a/src/lora.hpp b/src/lora.hpp
index 7df04ea..d4a749e 100644
--- a/src/lora.hpp
+++ b/src/lora.hpp
@@ -792,7 +792,7 @@ struct LoraModel : public GGMLRunner {
         auto get_graph = [&]() -> ggml_cgraph* {
             return build_lora_graph(model_tensors, version);
         };
-        GGMLRunner::compute(get_graph, n_threads, false);
+        GGMLRunner::compute<float>(get_graph, n_threads, false, true);
         stat();
         for (auto item : original_tensor_to_final_tensor) {
             ggml_tensor* original_tensor = item.first;
diff --git a/src/mmdit.hpp b/src/mmdit.hpp
index 7fbb2b2..e75736c 100644
--- a/src/mmdit.hpp
+++ b/src/mmdit.hpp
@@ -836,17 +836,17 @@ struct MMDiTRunner : public GGMLRunner {
         mmdit.get_param_tensors(tensors, prefix);
     }
 
-    ggml_cgraph* build_graph(ggml_tensor* x,
-                             ggml_tensor* timesteps,
-                             ggml_tensor* context,
-                             ggml_tensor* y,
-                             std::vector<int> skip_layers = std::vector<int>()) {
+    ggml_cgraph* build_graph(const sd::Tensor<float>& x_tensor,
+                             const sd::Tensor<float>& timesteps_tensor,
+                             const sd::Tensor<float>& context_tensor = {},
+                             const sd::Tensor<float>& y_tensor       = {},
+                             std::vector<int> skip_layers            = std::vector<int>()) {
         ggml_cgraph* gf = new_graph_custom(MMDIT_GRAPH_SIZE);
 
-        x         = to_backend(x);
-        context   = to_backend(context);
-        y         = to_backend(y);
-        timesteps = to_backend(timesteps);
+        ggml_tensor* x         = make_input(x_tensor);
+        ggml_tensor* timesteps = make_input(timesteps_tensor);
+        ggml_tensor* context   = make_optional_input(context_tensor);
+        ggml_tensor* y         = make_optional_input(y_tensor);
 
         auto runner_ctx  = get_context();
         ggml_tensor* out = mmdit.forward(&runner_ctx,
@@ -861,14 +861,12 @@ struct MMDiTRunner : public GGMLRunner {
         return gf;
     }
 
-    bool compute(int n_threads,
-                 ggml_tensor* x,
-                 ggml_tensor* timesteps,
-                 ggml_tensor* context,
-                 ggml_tensor* y,
-                 ggml_tensor** output         = nullptr,
-                 ggml_context* output_ctx     = nullptr,
-                 std::vector<int> skip_layers = std::vector<int>()) {
+    sd::Tensor<float> compute(int n_threads,
+                              const sd::Tensor<float>& x,
+                              const sd::Tensor<float>& timesteps,
+                              const sd::Tensor<float>& context = {},
+                              const sd::Tensor<float>& y       = {},
+                              std::vector<int> skip_layers     = std::vector<int>()) {
         // x: [N, in_channels, h, w]
         // timesteps: [N, ]
         // context: [N, max_position, hidden_size]([N, 154, 4096]) or [1, max_position, hidden_size]
@@ -877,7 +875,7 @@ struct MMDiTRunner : public GGMLRunner {
             return build_graph(x, timesteps, context, y, skip_layers);
         };
 
-        return GGMLRunner::compute(get_graph, n_threads, false, output, output_ctx);
+        return restore_trailing_singleton_dims(GGMLRunner::compute<float>(get_graph, n_threads, false), x.dim());
     }
 
     void test() {
@@ -886,35 +884,41 @@ struct MMDiTRunner : public GGMLRunner {
         params.mem_buffer = nullptr;
         params.no_alloc   = false;
 
-        ggml_context* work_ctx = ggml_init(params);
-        GGML_ASSERT(work_ctx != nullptr);
+        ggml_context* ctx = ggml_init(params);
+        GGML_ASSERT(ctx != nullptr);
 
         {
             // cpu f16: pass
             // cpu f32: pass
             // cuda f16: pass
             // cuda f32: pass
-            auto x = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, 128, 128, 16, 1);
+            sd::Tensor<float> x({128, 128, 16, 1});
             std::vector<float> timesteps_vec(1, 999.f);
-            auto timesteps = vector_to_ggml_tensor(work_ctx, timesteps_vec);
-            ggml_set_f32(x, 0.01f);
+            auto timesteps = sd::Tensor<float>::from_vector(timesteps_vec);
+            x.fill_(0.01f);
             // print_ggml_tensor(x);
 
-            auto context = ggml_new_tensor_3d(work_ctx, GGML_TYPE_F32, 4096, 154, 1);
-            ggml_set_f32(context, 0.01f);
+            sd::Tensor<float> context({4096, 154, 1});
+            context.fill_(0.01f);
             // print_ggml_tensor(context);
 
-            auto y = ggml_new_tensor_2d(work_ctx, GGML_TYPE_F32, 2048, 1);
-            ggml_set_f32(y, 0.01f);
+            sd::Tensor<float> y({2048, 1});
+            y.fill_(0.01f);
             // print_ggml_tensor(y);
 
-            ggml_tensor* out = nullptr;
+            sd::Tensor<float> out;
 
-            int64_t t0 = ggml_time_ms();
-            compute(8, x, timesteps, context, y, &out, work_ctx);
-            int64_t t1 = ggml_time_ms();
+            int64_t t0   = ggml_time_ms();
+            auto out_opt = compute(8,
+                                   x,
+                                   timesteps,
+                                   context,
+                                   y);
+            int64_t t1   = ggml_time_ms();
 
-            print_ggml_tensor(out);
+            GGML_ASSERT(!out_opt.empty());
+            out = std::move(out_opt);
+            print_sd_tensor(out);
             LOG_DEBUG("mmdit test done in %lldms", t1 - t0);
         }
     }
diff --git a/src/pmid.hpp b/src/pmid.hpp
index 30c4732..f19a8c3 100644
--- a/src/pmid.hpp
+++ b/src/pmid.hpp
@@ -443,11 +443,10 @@ public:
             id_encoder2.get_param_tensors(tensors, prefix);
     }
 
-    ggml_cgraph* build_graph(  // ggml_allocr* allocr,
-        ggml_tensor* id_pixel_values,
-        ggml_tensor* prompt_embeds,
-        std::vector<bool>& class_tokens_mask,
-        ggml_tensor* id_embeds) {
+    ggml_cgraph* build_graph(const sd::Tensor<float>& id_pixel_values_tensor,
+                             const sd::Tensor<float>& prompt_embeds_tensor,
+                             std::vector<bool>& class_tokens_mask,
+                             const sd::Tensor<float>& id_embeds_tensor = {}) {
         ctm.clear();
         ctmf16.clear();
         ctmpos.clear();
@@ -460,16 +459,16 @@ public:
 
         ggml_cgraph* gf = ggml_new_graph(compute_ctx);
 
+        ggml_tensor* id_pixel_values = make_input(id_pixel_values_tensor);
+        ggml_tensor* prompt_embeds   = make_input(prompt_embeds_tensor);
+        ggml_tensor* id_embeds       = make_optional_input(id_embeds_tensor);
+
         int64_t hidden_size = prompt_embeds->ne[0];
         int64_t seq_length  = prompt_embeds->ne[1];
         ggml_type type      = GGML_TYPE_F32;
 
         ggml_tensor* class_tokens_mask_d = ggml_new_tensor_1d(runner_ctx.ggml_ctx, type, class_tokens_mask.size());
 
-        ggml_tensor* id_pixel_values_d = to_backend(id_pixel_values);
-        ggml_tensor* prompt_embeds_d   = to_backend(prompt_embeds);
-        ggml_tensor* id_embeds_d       = to_backend(id_embeds);
-
         ggml_tensor* left  = nullptr;
         ggml_tensor* right = nullptr;
         for (int i = 0; i < class_tokens_mask.size(); i++) {
@@ -529,18 +528,18 @@ public:
         ggml_tensor* updated_prompt_embeds = nullptr;
         if (pm_version == PM_VERSION_1)
             updated_prompt_embeds = id_encoder.forward(&runner_ctx,
-                                                       id_pixel_values_d,
-                                                       prompt_embeds_d,
+                                                       id_pixel_values,
+                                                       prompt_embeds,
                                                        class_tokens_mask_d,
                                                        class_tokens_mask_pos,
                                                        left, right);
         else if (pm_version == PM_VERSION_2)
             updated_prompt_embeds = id_encoder2.forward(&runner_ctx,
-                                                        id_pixel_values_d,
-                                                        prompt_embeds_d,
+                                                        id_pixel_values,
+                                                        prompt_embeds,
                                                         class_tokens_mask_d,
                                                         class_tokens_mask_pos,
-                                                        id_embeds_d,
+                                                        id_embeds,
                                                         left, right);
 
         ggml_build_forward_expand(gf, updated_prompt_embeds);
@@ -548,20 +547,16 @@ public:
         return gf;
     }
 
-    bool compute(const int n_threads,
-                 ggml_tensor* id_pixel_values,
-                 ggml_tensor* prompt_embeds,
-                 ggml_tensor* id_embeds,
-                 std::vector<bool>& class_tokens_mask,
-                 ggml_tensor** updated_prompt_embeds,
-                 ggml_context* output_ctx) {
+    sd::Tensor<float> compute(const int n_threads,
+                              const sd::Tensor<float>& id_pixel_values,
+                              const sd::Tensor<float>& prompt_embeds,
+                              const sd::Tensor<float>& id_embeds,
+                              std::vector<bool>& class_tokens_mask) {
         auto get_graph = [&]() -> ggml_cgraph* {
-            // return build_graph(compute_allocr, id_pixel_values, prompt_embeds, class_tokens_mask);
             return build_graph(id_pixel_values, prompt_embeds, class_tokens_mask, id_embeds);
         };
 
-        // GGMLRunner::compute(get_graph, n_threads, updated_prompt_embeds);
-        return GGMLRunner::compute(get_graph, n_threads, true, updated_prompt_embeds, output_ctx);
+        return take_or_empty(GGMLRunner::compute<float>(get_graph, n_threads, true));
     }
 };
 
diff --git a/src/preprocessing.hpp b/src/preprocessing.hpp
index ca05ca2..7c83a28 100644
--- a/src/preprocessing.hpp
+++ b/src/preprocessing.hpp
@@ -1,179 +1,241 @@
 #ifndef __PREPROCESSING_HPP__
 #define __PREPROCESSING_HPP__
 
+#include <cmath>
+#include <limits>
+
 #include "ggml_extend.hpp"
+
 #define M_PI_ 3.14159265358979323846f
 
-void convolve(ggml_tensor* input, ggml_tensor* output, ggml_tensor* kernel, int padding) {
-    ggml_init_params params;
-    params.mem_size          = 80 * input->ne[0] * input->ne[1];  // 20M for 512x512
-    params.mem_buffer        = nullptr;
-    params.no_alloc          = false;
-    ggml_context* ctx0       = ggml_init(params);
-    ggml_tensor* kernel_fp16 = ggml_new_tensor_4d(ctx0, GGML_TYPE_F16, kernel->ne[0], kernel->ne[1], 1, 1);
-    ggml_fp32_to_fp16_row((float*)kernel->data, (ggml_fp16_t*)kernel_fp16->data, ggml_nelements(kernel));
-    ggml_tensor* h  = ggml_conv_2d(ctx0, kernel_fp16, input, 1, 1, padding, padding, 1, 1);
-    ggml_cgraph* gf = ggml_new_graph(ctx0);
-    ggml_build_forward_expand(gf, ggml_cpy(ctx0, h, output));
-    ggml_graph_compute_with_ctx(ctx0, gf, 1);
-    ggml_free(ctx0);
+static inline int64_t preprocessing_offset_4d(const sd::Tensor<float>& tensor, int64_t i0, int64_t i1 = 0, int64_t i2 = 0, int64_t i3 = 0) {
+    const auto& shape = tensor.shape();
+    int64_t n0        = shape.size() > 0 ? shape[0] : 1;
+    int64_t n1        = shape.size() > 1 ? shape[1] : 1;
+    int64_t n2        = shape.size() > 2 ? shape[2] : 1;
+    return ((i3 * n2 + i2) * n1 + i1) * n0 + i0;
 }
 
-void gaussian_kernel(ggml_tensor* kernel) {
-    int ks_mid   = static_cast<int>(kernel->ne[0] / 2);
+static inline float preprocessing_get_4d(const sd::Tensor<float>& tensor, int64_t i0, int64_t i1 = 0, int64_t i2 = 0, int64_t i3 = 0) {
+    return tensor.values()[static_cast<size_t>(preprocessing_offset_4d(tensor, i0, i1, i2, i3))];
+}
+
+static inline void preprocessing_set_4d(sd::Tensor<float>& tensor, float value, int64_t i0, int64_t i1 = 0, int64_t i2 = 0, int64_t i3 = 0) {
+    tensor.values()[static_cast<size_t>(preprocessing_offset_4d(tensor, i0, i1, i2, i3))] = value;
+}
+
+static inline sd::Tensor<float> sd_image_to_preprocessing_tensor(sd_image_t image) {
+    sd::Tensor<float> tensor({static_cast<int64_t>(image.width), static_cast<int64_t>(image.height), static_cast<int64_t>(image.channel), 1});
+    for (uint32_t y = 0; y < image.height; ++y) {
+        for (uint32_t x = 0; x < image.width; ++x) {
+            for (uint32_t c = 0; c < image.channel; ++c) {
+                preprocessing_set_4d(tensor, sd_image_get_f32(image, x, y, c), x, y, c, 0);
+            }
+        }
+    }
+    return tensor;
+}
+
+static inline void preprocessing_tensor_to_sd_image(const sd::Tensor<float>& tensor, uint8_t* image_data) {
+    GGML_ASSERT(tensor.dim() == 4);
+    GGML_ASSERT(tensor.shape()[3] == 1);
+    GGML_ASSERT(image_data != nullptr);
+
+    int width   = static_cast<int>(tensor.shape()[0]);
+    int height  = static_cast<int>(tensor.shape()[1]);
+    int channel = static_cast<int>(tensor.shape()[2]);
+    for (int y = 0; y < height; ++y) {
+        for (int x = 0; x < width; ++x) {
+            for (int c = 0; c < channel; ++c) {
+                float value                               = preprocessing_get_4d(tensor, x, y, c, 0);
+                value                                     = std::min(1.0f, std::max(0.0f, value));
+                image_data[(y * width + x) * channel + c] = static_cast<uint8_t>(std::round(value * 255.0f));
+            }
+        }
+    }
+}
+
+static inline sd::Tensor<float> gaussian_kernel_tensor(int kernel_size) {
+    sd::Tensor<float> kernel({kernel_size, kernel_size, 1, 1});
+    int ks_mid   = kernel_size / 2;
     float sigma  = 1.4f;
-    float normal = 1.f / (2.0f * M_PI_ * powf(sigma, 2.0f));
-    for (int y = 0; y < kernel->ne[0]; y++) {
+    float normal = 1.f / (2.0f * M_PI_ * std::pow(sigma, 2.0f));
+    for (int y = 0; y < kernel_size; ++y) {
         float gx = static_cast<float>(-ks_mid + y);
-        for (int x = 0; x < kernel->ne[1]; x++) {
+        for (int x = 0; x < kernel_size; ++x) {
             float gy = static_cast<float>(-ks_mid + x);
-            float k_ = expf(-((gx * gx + gy * gy) / (2.0f * powf(sigma, 2.0f)))) * normal;
-            ggml_ext_tensor_set_f32(kernel, k_, x, y);
+            float k  = std::exp(-((gx * gx + gy * gy) / (2.0f * std::pow(sigma, 2.0f)))) * normal;
+            preprocessing_set_4d(kernel, k, x, y, 0, 0);
         }
     }
+    return kernel;
 }
 
-void grayscale(ggml_tensor* rgb_img, ggml_tensor* grayscale) {
-    for (int iy = 0; iy < rgb_img->ne[1]; iy++) {
-        for (int ix = 0; ix < rgb_img->ne[0]; ix++) {
-            float r    = ggml_ext_tensor_get_f32(rgb_img, ix, iy);
-            float g    = ggml_ext_tensor_get_f32(rgb_img, ix, iy, 1);
-            float b    = ggml_ext_tensor_get_f32(rgb_img, ix, iy, 2);
+static inline sd::Tensor<float> convolve_tensor(const sd::Tensor<float>& input, const sd::Tensor<float>& kernel, int padding) {
+    GGML_ASSERT(input.dim() == 4);
+    GGML_ASSERT(kernel.dim() == 4);
+    GGML_ASSERT(input.shape()[3] == 1);
+    GGML_ASSERT(kernel.shape()[2] == 1);
+    GGML_ASSERT(kernel.shape()[3] == 1);
+
+    sd::Tensor<float> output(input.shape());
+    int64_t width    = input.shape()[0];
+    int64_t height   = input.shape()[1];
+    int64_t channels = input.shape()[2];
+    int64_t kernel_w = kernel.shape()[0];
+    int64_t kernel_h = kernel.shape()[1];
+
+    for (int64_t c = 0; c < channels; ++c) {
+        for (int64_t y = 0; y < height; ++y) {
+            for (int64_t x = 0; x < width; ++x) {
+                float sum = 0.0f;
+                for (int64_t ky = 0; ky < kernel_h; ++ky) {
+                    int64_t iy = y + ky - padding;
+                    if (iy < 0 || iy >= height) {
+                        continue;
+                    }
+                    for (int64_t kx = 0; kx < kernel_w; ++kx) {
+                        int64_t ix = x + kx - padding;
+                        if (ix < 0 || ix >= width) {
+                            continue;
+                        }
+                        sum += preprocessing_get_4d(input, ix, iy, c, 0) * preprocessing_get_4d(kernel, kx, ky, 0, 0);
+                    }
+                }
+                preprocessing_set_4d(output, sum, x, y, c, 0);
+            }
+        }
+    }
+    return output;
+}
+
+static inline sd::Tensor<float> grayscale_tensor(const sd::Tensor<float>& rgb_img) {
+    GGML_ASSERT(rgb_img.dim() == 4);
+    GGML_ASSERT(rgb_img.shape()[2] >= 3);
+    sd::Tensor<float> grayscale({rgb_img.shape()[0], rgb_img.shape()[1], 1, rgb_img.shape()[3]});
+    for (int64_t iy = 0; iy < rgb_img.shape()[1]; ++iy) {
+        for (int64_t ix = 0; ix < rgb_img.shape()[0]; ++ix) {
+            float r    = preprocessing_get_4d(rgb_img, ix, iy, 0, 0);
+            float g    = preprocessing_get_4d(rgb_img, ix, iy, 1, 0);
+            float b    = preprocessing_get_4d(rgb_img, ix, iy, 2, 0);
             float gray = 0.2989f * r + 0.5870f * g + 0.1140f * b;
-            ggml_ext_tensor_set_f32(grayscale, gray, ix, iy);
+            preprocessing_set_4d(grayscale, gray, ix, iy, 0, 0);
         }
     }
+    return grayscale;
 }
 
-void prop_hypot(ggml_tensor* x, ggml_tensor* y, ggml_tensor* h) {
-    int n_elements = static_cast<int>(ggml_nelements(h));
-    float* dx      = (float*)x->data;
-    float* dy      = (float*)y->data;
-    float* dh      = (float*)h->data;
-    for (int i = 0; i < n_elements; i++) {
-        dh[i] = sqrtf(dx[i] * dx[i] + dy[i] * dy[i]);
+static inline sd::Tensor<float> tensor_hypot(const sd::Tensor<float>& x, const sd::Tensor<float>& y) {
+    sd::tensor_check_same_shape(x, y);
+    sd::Tensor<float> out(x.shape());
+    for (int64_t i = 0; i < out.numel(); ++i) {
+        out[i] = std::sqrt(x[i] * x[i] + y[i] * y[i]);
     }
+    return out;
 }
 
-void prop_arctan2(ggml_tensor* x, ggml_tensor* y, ggml_tensor* h) {
-    int n_elements = static_cast<int>(ggml_nelements(h));
-    float* dx      = (float*)x->data;
-    float* dy      = (float*)y->data;
-    float* dh      = (float*)h->data;
-    for (int i = 0; i < n_elements; i++) {
-        dh[i] = atan2f(dy[i], dx[i]);
+static inline sd::Tensor<float> tensor_arctan2(const sd::Tensor<float>& x, const sd::Tensor<float>& y) {
+    sd::tensor_check_same_shape(x, y);
+    sd::Tensor<float> out(x.shape());
+    for (int64_t i = 0; i < out.numel(); ++i) {
+        out[i] = std::atan2(y[i], x[i]);
     }
+    return out;
 }
 
-void normalize_tensor(ggml_tensor* g) {
-    int n_elements = static_cast<int>(ggml_nelements(g));
-    float* dg      = (float*)g->data;
-    float max      = -INFINITY;
-    for (int i = 0; i < n_elements; i++) {
-        max = dg[i] > max ? dg[i] : max;
+static inline void normalize_tensor(sd::Tensor<float>* g) {
+    GGML_ASSERT(g != nullptr);
+    if (g->empty()) {
+        return;
     }
-    max = 1.0f / max;
-    for (int i = 0; i < n_elements; i++) {
-        dg[i] *= max;
+    float max_value = -std::numeric_limits<float>::infinity();
+    for (int64_t i = 0; i < g->numel(); ++i) {
+        max_value = std::max(max_value, (*g)[i]);
     }
+    if (max_value == 0.0f || !std::isfinite(max_value)) {
+        return;
+    }
+    *g *= (1.0f / max_value);
 }
 
-void non_max_supression(ggml_tensor* result, ggml_tensor* G, ggml_tensor* D) {
-    for (int iy = 1; iy < result->ne[1] - 1; iy++) {
-        for (int ix = 1; ix < result->ne[0] - 1; ix++) {
-            float angle = ggml_ext_tensor_get_f32(D, ix, iy) * 180.0f / M_PI_;
-            angle       = angle < 0.0f ? angle += 180.0f : angle;
+static inline sd::Tensor<float> non_max_supression(const sd::Tensor<float>& G, const sd::Tensor<float>& D) {
+    GGML_ASSERT(G.shape() == D.shape());
+    sd::Tensor<float> result = sd::Tensor<float>::zeros(G.shape());
+    for (int64_t iy = 1; iy < result.shape()[1] - 1; ++iy) {
+        for (int64_t ix = 1; ix < result.shape()[0] - 1; ++ix) {
+            float angle = preprocessing_get_4d(D, ix, iy, 0, 0) * 180.0f / M_PI_;
+            angle       = angle < 0.0f ? angle + 180.0f : angle;
             float q     = 1.0f;
             float r     = 1.0f;
 
-            // angle 0
-            if ((0 >= angle && angle < 22.5f) || (157.5f >= angle && angle <= 180)) {
-                q = ggml_ext_tensor_get_f32(G, ix, iy + 1);
-                r = ggml_ext_tensor_get_f32(G, ix, iy - 1);
-            }
-            // angle 45
-            else if (22.5f >= angle && angle < 67.5f) {
-                q = ggml_ext_tensor_get_f32(G, ix + 1, iy - 1);
-                r = ggml_ext_tensor_get_f32(G, ix - 1, iy + 1);
-            }
-            // angle 90
-            else if (67.5f >= angle && angle < 112.5) {
-                q = ggml_ext_tensor_get_f32(G, ix + 1, iy);
-                r = ggml_ext_tensor_get_f32(G, ix - 1, iy);
-            }
-            // angle 135
-            else if (112.5 >= angle && angle < 157.5f) {
-                q = ggml_ext_tensor_get_f32(G, ix - 1, iy - 1);
-                r = ggml_ext_tensor_get_f32(G, ix + 1, iy + 1);
+            if ((angle >= 0.0f && angle < 22.5f) || (angle >= 157.5f && angle <= 180.0f)) {  // ~0 deg: gradient along x, compare left/right
+                q = preprocessing_get_4d(G, ix + 1, iy, 0, 0);
+                r = preprocessing_get_4d(G, ix - 1, iy, 0, 0);
+            } else if (angle >= 22.5f && angle < 67.5f) {  // ~45 deg diagonal
+                q = preprocessing_get_4d(G, ix + 1, iy - 1, 0, 0);
+                r = preprocessing_get_4d(G, ix - 1, iy + 1, 0, 0);
+            } else if (angle >= 67.5f && angle < 112.5f) {  // ~90 deg: gradient along y, compare up/down
+                q = preprocessing_get_4d(G, ix, iy + 1, 0, 0);
+                r = preprocessing_get_4d(G, ix, iy - 1, 0, 0);
+            } else if (angle >= 112.5f && angle < 157.5f) {  // ~135 deg diagonal
+                q = preprocessing_get_4d(G, ix - 1, iy - 1, 0, 0);
+                r = preprocessing_get_4d(G, ix + 1, iy + 1, 0, 0);
             }
 
-            float cur = ggml_ext_tensor_get_f32(G, ix, iy);
-            if ((cur >= q) && (cur >= r)) {
-                ggml_ext_tensor_set_f32(result, cur, ix, iy);
-            } else {
-                ggml_ext_tensor_set_f32(result, 0.0f, ix, iy);
-            }
+            float cur = preprocessing_get_4d(G, ix, iy, 0, 0);
+            preprocessing_set_4d(result, (cur >= q && cur >= r) ? cur : 0.0f, ix, iy, 0, 0);
         }
     }
+    return result;
 }
 
-void threshold_hystersis(ggml_tensor* img, float high_threshold, float low_threshold, float weak, float strong) {
-    int n_elements = static_cast<int>(ggml_nelements(img));
-    float* imd     = (float*)img->data;
-    float max      = -INFINITY;
-    for (int i = 0; i < n_elements; i++) {
-        max = imd[i] > max ? imd[i] : max;
+static inline void threshold_hystersis(sd::Tensor<float>* img, float high_threshold, float low_threshold, float weak, float strong) {
+    GGML_ASSERT(img != nullptr);
+    if (img->empty()) {
+        return;
     }
-    float ht = max * high_threshold;
+    float max_value = -std::numeric_limits<float>::infinity();
+    for (int64_t i = 0; i < img->numel(); ++i) {
+        max_value = std::max(max_value, (*img)[i]);
+    }
+
+    float ht = max_value * high_threshold;
     float lt = ht * low_threshold;
-    for (int i = 0; i < n_elements; i++) {
-        float img_v = imd[i];
-        if (img_v >= ht) {  // strong pixel
-            imd[i] = strong;
-        } else if (img_v <= ht && img_v >= lt) {  // strong pixel
-            imd[i] = weak;
+    for (int64_t i = 0; i < img->numel(); ++i) {
+        float img_v = (*img)[i];
+        if (img_v >= ht) {
+            (*img)[i] = strong;
+        } else if (img_v <= ht && img_v >= lt) {
+            (*img)[i] = weak;
         }
     }
 
-    for (int iy = 0; iy < img->ne[1]; iy++) {
-        for (int ix = 0; ix < img->ne[0]; ix++) {
-            if (ix >= 3 && ix <= img->ne[0] - 3 && iy >= 3 && iy <= img->ne[1] - 3) {
-                ggml_ext_tensor_set_f32(img, ggml_ext_tensor_get_f32(img, ix, iy), ix, iy);
-            } else {
-                ggml_ext_tensor_set_f32(img, 0.0f, ix, iy);
+    for (int64_t iy = 0; iy < img->shape()[1]; ++iy) {
+        for (int64_t ix = 0; ix < img->shape()[0]; ++ix) {
+            if (!(ix >= 3 && ix <= img->shape()[0] - 3 && iy >= 3 && iy <= img->shape()[1] - 3)) {
+                preprocessing_set_4d(*img, 0.0f, ix, iy, 0, 0);
             }
         }
     }
 
-    // hysteresis
-    for (int iy = 1; iy < img->ne[1] - 1; iy++) {
-        for (int ix = 1; ix < img->ne[0] - 1; ix++) {
-            float imd_v = ggml_ext_tensor_get_f32(img, ix, iy);
+    for (int64_t iy = 1; iy < img->shape()[1] - 1; ++iy) {
+        for (int64_t ix = 1; ix < img->shape()[0] - 1; ++ix) {
+            float imd_v = preprocessing_get_4d(*img, ix, iy, 0, 0);
             if (imd_v == weak) {
-                if (ggml_ext_tensor_get_f32(img, ix + 1, iy - 1) == strong || ggml_ext_tensor_get_f32(img, ix + 1, iy) == strong ||
-                    ggml_ext_tensor_get_f32(img, ix, iy - 1) == strong || ggml_ext_tensor_get_f32(img, ix, iy + 1) == strong ||
-                    ggml_ext_tensor_get_f32(img, ix - 1, iy - 1) == strong || ggml_ext_tensor_get_f32(img, ix - 1, iy) == strong) {
-                    ggml_ext_tensor_set_f32(img, strong, ix, iy);
-                } else {
-                    ggml_ext_tensor_set_f32(img, 0.0f, ix, iy);
-                }
+                // Keep a weak pixel only if any of its 8 neighbours is strong (centre excluded).
+                bool has_strong_neighbor = false;
+                for (int64_t dy = -1; dy <= 1 && !has_strong_neighbor; ++dy) {
+                    for (int64_t dx = -1; dx <= 1 && !has_strong_neighbor; ++dx) {
+                        has_strong_neighbor = (dx != 0 || dy != 0) && preprocessing_get_4d(*img, ix + dx, iy + dy, 0, 0) == strong;
+                    }
+                }
+                preprocessing_set_4d(*img, has_strong_neighbor ? strong : 0.0f, ix, iy, 0, 0);
             }
         }
     }
 }
 
 bool preprocess_canny(sd_image_t img, float high_threshold, float low_threshold, float weak, float strong, bool inverse) {
-    ggml_init_params params;
-    params.mem_size        = static_cast<size_t>(40 * img.width * img.height);  // 10MB for 512x512
-    params.mem_buffer      = nullptr;
-    params.no_alloc        = false;
-    ggml_context* work_ctx = ggml_init(params);
-
-    if (!work_ctx) {
-        LOG_ERROR("ggml_init() failed");
-        return false;
-    }
-
     float kX[9] = {
         -1, 0, 1,
         -2, 0, 2,
@@ -184,43 +246,33 @@ bool preprocess_canny(sd_image_t img, float high_threshold, float low_threshold,
         0, 0, 0,
         -1, -2, -1};
 
-    // generate kernel
-    int kernel_size      = 5;
-    ggml_tensor* gkernel = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, kernel_size, kernel_size, 1, 1);
-    ggml_tensor* sf_kx   = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, 3, 3, 1, 1);
-    memcpy(sf_kx->data, kX, ggml_nbytes(sf_kx));
-    ggml_tensor* sf_ky = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, 3, 3, 1, 1);
-    memcpy(sf_ky->data, kY, ggml_nbytes(sf_ky));
-    gaussian_kernel(gkernel);
-    ggml_tensor* image      = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, img.width, img.height, 3, 1);
-    ggml_tensor* image_gray = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, img.width, img.height, 1, 1);
-    ggml_tensor* iX         = ggml_dup_tensor(work_ctx, image_gray);
-    ggml_tensor* iY         = ggml_dup_tensor(work_ctx, image_gray);
-    ggml_tensor* G          = ggml_dup_tensor(work_ctx, image_gray);
-    ggml_tensor* tetha      = ggml_dup_tensor(work_ctx, image_gray);
-    sd_image_to_ggml_tensor(img, image);
-    grayscale(image, image_gray);
-    convolve(image_gray, image_gray, gkernel, 2);
-    convolve(image_gray, iX, sf_kx, 1);
-    convolve(image_gray, iY, sf_ky, 1);
-    prop_hypot(iX, iY, G);
-    normalize_tensor(G);
-    prop_arctan2(iX, iY, tetha);
-    non_max_supression(image_gray, G, tetha);
-    threshold_hystersis(image_gray, high_threshold, low_threshold, weak, strong);
-    // to RGB channels
-    for (uint32_t iy = 0; iy < img.height; iy++) {
-        for (uint32_t ix = 0; ix < img.width; ix++) {
-            float gray = ggml_ext_tensor_get_f32(image_gray, ix, iy);
+    sd::Tensor<float> gkernel = gaussian_kernel_tensor(5);
+    sd::Tensor<float> sf_kx({3, 3, 1, 1}, std::vector<float>(kX, kX + 9));
+    sd::Tensor<float> sf_ky({3, 3, 1, 1}, std::vector<float>(kY, kY + 9));
+
+    sd::Tensor<float> image      = sd_image_to_preprocessing_tensor(img);
+    sd::Tensor<float> image_gray = grayscale_tensor(image);
+    image_gray                   = convolve_tensor(image_gray, gkernel, 2);
+    sd::Tensor<float> iX         = convolve_tensor(image_gray, sf_kx, 1);
+    sd::Tensor<float> iY         = convolve_tensor(image_gray, sf_ky, 1);
+    sd::Tensor<float> G          = tensor_hypot(iX, iY);
+    normalize_tensor(&G);
+    sd::Tensor<float> theta = tensor_arctan2(iX, iY);
+    image_gray              = non_max_supression(G, theta);
+    threshold_hystersis(&image_gray, high_threshold, low_threshold, weak, strong);
+
+    for (uint32_t iy = 0; iy < img.height; ++iy) {
+        for (uint32_t ix = 0; ix < img.width; ++ix) {
+            float gray = preprocessing_get_4d(image_gray, ix, iy, 0, 0);
             gray       = inverse ? 1.0f - gray : gray;
-            ggml_ext_tensor_set_f32(image, gray, ix, iy);
-            ggml_ext_tensor_set_f32(image, gray, ix, iy, 1);
-            ggml_ext_tensor_set_f32(image, gray, ix, iy, 2);
+            for (uint32_t c = 0; c < img.channel; ++c) {
+                preprocessing_set_4d(image, gray, ix, iy, c, 0);
+            }
         }
     }
-    ggml_tensor_to_sd_image(image, img.data);
-    ggml_free(work_ctx);
+
+    preprocessing_tensor_to_sd_image(image, img.data);
     return true;
 }
 
-#endif  // __PREPROCESSING_HPP__
\ No newline at end of file
+#endif  // __PREPROCESSING_HPP__
diff --git a/src/qwen_image.hpp b/src/qwen_image.hpp
index 68af0e8..83c8cec 100644
--- a/src/qwen_image.hpp
+++ b/src/qwen_image.hpp
@@ -525,20 +525,21 @@ namespace Qwen {
             qwen_image.get_param_tensors(tensors, prefix);
         }
 
-        ggml_cgraph* build_graph(ggml_tensor* x,
-                                 ggml_tensor* timesteps,
-                                 ggml_tensor* context,
-                                 std::vector<ggml_tensor*> ref_latents = {},
-                                 bool increase_ref_index               = false) {
+        ggml_cgraph* build_graph(const sd::Tensor<float>& x_tensor,
+                                 const sd::Tensor<float>& timesteps_tensor,
+                                 const sd::Tensor<float>& context_tensor,
+                                 const std::vector<sd::Tensor<float>>& ref_latents_tensor = {},
+                                 bool increase_ref_index                                  = false) {
+            ggml_cgraph* gf        = new_graph_custom(QWEN_IMAGE_GRAPH_SIZE);
+            ggml_tensor* x         = make_input(x_tensor);
+            ggml_tensor* timesteps = make_input(timesteps_tensor);
             GGML_ASSERT(x->ne[3] == 1);
-            ggml_cgraph* gf = new_graph_custom(QWEN_IMAGE_GRAPH_SIZE);
-
-            x         = to_backend(x);
-            context   = to_backend(context);
-            timesteps = to_backend(timesteps);
-
-            for (int i = 0; i < ref_latents.size(); i++) {
-                ref_latents[i] = to_backend(ref_latents[i]);
+            GGML_ASSERT(!context_tensor.empty());
+            ggml_tensor* context = make_input(context_tensor);
+            std::vector<ggml_tensor*> ref_latents;
+            ref_latents.reserve(ref_latents_tensor.size());
+            for (const auto& ref_latent_tensor : ref_latents_tensor) {
+                ref_latents.push_back(make_input(ref_latent_tensor));
             }
 
             pe_vec      = Rope::gen_qwen_image_pe(static_cast<int>(x->ne[1]),
@@ -600,14 +601,12 @@ namespace Qwen {
             return gf;
         }
 
-        bool compute(int n_threads,
-                     ggml_tensor* x,
-                     ggml_tensor* timesteps,
-                     ggml_tensor* context,
-                     std::vector<ggml_tensor*> ref_latents = {},
-                     bool increase_ref_index               = false,
-                     ggml_tensor** output                  = nullptr,
-                     ggml_context* output_ctx              = nullptr) {
+        sd::Tensor<float> compute(int n_threads,
+                                  const sd::Tensor<float>& x,
+                                  const sd::Tensor<float>& timesteps,
+                                  const sd::Tensor<float>& context,
+                                  const std::vector<sd::Tensor<float>>& ref_latents = {},
+                                  bool increase_ref_index                           = false) {
             // x: [N, in_channels, h, w]
             // timesteps: [N, ]
             // context: [N, max_position, hidden_size]
@@ -615,7 +614,7 @@ namespace Qwen {
                 return build_graph(x, timesteps, context, ref_latents, increase_ref_index);
             };
 
-            return GGMLRunner::compute(get_graph, n_threads, false, output, output_ctx);
+            return restore_trailing_singleton_dims(GGMLRunner::compute<float>(get_graph, n_threads, false), x.dim());
         }
 
         void test() {
@@ -624,30 +623,37 @@ namespace Qwen {
             params.mem_buffer = nullptr;
             params.no_alloc   = false;
 
-            ggml_context* work_ctx = ggml_init(params);
-            GGML_ASSERT(work_ctx != nullptr);
+            ggml_context* ctx = ggml_init(params);
+            GGML_ASSERT(ctx != nullptr);
 
             {
-                // auto x = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, 16, 16, 16, 1);
+                // auto x = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, 16, 16, 16, 1);
                 // ggml_set_f32(x, 0.01f);
-                auto x = load_tensor_from_file(work_ctx, "./qwen_image_x.bin");
-                print_ggml_tensor(x);
+                auto x = sd::load_tensor_from_file_as_tensor<float>("./qwen_image_x.bin");
+                print_sd_tensor(x);
 
                 std::vector<float> timesteps_vec(1, 1000.f);
-                auto timesteps = vector_to_ggml_tensor(work_ctx, timesteps_vec);
+                auto timesteps = sd::Tensor<float>::from_vector(timesteps_vec);
 
-                // auto context = ggml_new_tensor_3d(work_ctx, GGML_TYPE_F32, 3584, 256, 1);
+                // auto context = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, 3584, 256, 1);
                 // ggml_set_f32(context, 0.01f);
-                auto context = load_tensor_from_file(work_ctx, "./qwen_image_context.bin");
-                print_ggml_tensor(context);
+                auto context = sd::load_tensor_from_file_as_tensor<float>("./qwen_image_context.bin");
+                print_sd_tensor(context);
 
-                ggml_tensor* out = nullptr;
+                sd::Tensor<float> out;
 
-                int64_t t0 = ggml_time_ms();
-                compute(8, x, timesteps, context, {}, false, &out, work_ctx);
-                int64_t t1 = ggml_time_ms();
+                int64_t t0   = ggml_time_ms();
+                auto out_opt = compute(8,
+                                       x,
+                                       timesteps,
+                                       context,
+                                       {},
+                                       false);
+                int64_t t1   = ggml_time_ms();
 
-                print_ggml_tensor(out);
+                GGML_ASSERT(!out_opt.empty());
+                out = std::move(out_opt);
+                print_sd_tensor(out);
                 LOG_DEBUG("qwen_image test done in %lldms", t1 - t0);
             }
         }
diff --git a/src/sample-cache.cpp b/src/sample-cache.cpp
new file mode 100644
index 0000000..5739178
--- /dev/null
+++ b/src/sample-cache.cpp
@@ -0,0 +1,361 @@
+#include "sample-cache.h"
+
+namespace sd_sample {
+
+    // Reuse threshold for the selected cache: an INFINITY value is replaced by the per-mode default.
+    static float get_cache_reuse_threshold(const sd_cache_params_t& params) {
+        float threshold = params.reuse_threshold;
+        if (threshold == INFINITY && params.mode == SD_CACHE_EASYCACHE) {
+            threshold = 0.2f;
+        } else if (threshold == INFINITY && params.mode == SD_CACHE_UCACHE) {
+            threshold = 1.0f;
+        }
+        // Negative values are clamped up to zero.
+        return std::max(0.0f, threshold);
+    }
+
+    bool SampleCacheRuntime::easycache_enabled() const {
+        return SampleCacheMode::EASYCACHE == mode;  // true while EasyCache is the selected step cache
+    }
+
+    bool SampleCacheRuntime::ucache_enabled() const {
+        return SampleCacheMode::UCACHE == mode;  // true while UCache is the selected step cache
+    }
+
+    bool SampleCacheRuntime::cachedit_enabled() const {
+        return SampleCacheMode::CACHEDIT == mode;  // true while CacheDIT is the selected step cache
+    }
+
+    // Percent-range check; only EasyCache/UCache are checked, every other mode is reported as valid.
+    static bool has_valid_cache_percent_range(const sd_cache_params_t& cache_params) {
+        const bool uses_percent_range = cache_params.mode == SD_CACHE_EASYCACHE ||
+                                        cache_params.mode == SD_CACHE_UCACHE;
+        if (!uses_percent_range) {
+            return true;
+        }
+        const auto first = cache_params.start_percent;
+        const auto last  = cache_params.end_percent;
+        return first >= 0.0f && first < 1.0f && last > 0.0f && last <= 1.0f && first < last;
+    }
+
+    // Configure EasyCache on `runtime` from `cache_params`; `runtime.mode` is switched only on success.
+    static void init_easycache_runtime(SampleCacheRuntime& runtime,
+                                       SDVersion version,
+                                       const sd_cache_params_t& cache_params,
+                                       Denoiser* denoiser) {
+        if (!sd_version_is_dit(version)) {
+            LOG_WARN("EasyCache requested but not supported for this model type");
+            return;
+        }
+
+        EasyCacheConfig easy_cfg;
+        easy_cfg.enabled         = true;
+        easy_cfg.reuse_threshold = get_cache_reuse_threshold(cache_params);
+        easy_cfg.start_percent   = cache_params.start_percent;
+        easy_cfg.end_percent     = cache_params.end_percent;
+        runtime.easycache.init(easy_cfg, denoiser);
+
+        if (runtime.easycache.enabled()) {
+            runtime.mode = SampleCacheMode::EASYCACHE;
+            LOG_INFO("EasyCache enabled - threshold: %.3f, start: %.2f, end: %.2f",
+                     easy_cfg.reuse_threshold,
+                     easy_cfg.start_percent,
+                     easy_cfg.end_percent);
+        } else {
+            LOG_WARN("EasyCache requested but could not be initialized for this run");
+        }
+    }
+
+    // Configure UCache on `runtime` from `cache_params`; `runtime.mode` is switched only on success.
+    static void init_ucache_runtime(SampleCacheRuntime& runtime,
+                                    SDVersion version,
+                                    const sd_cache_params_t& cache_params,
+                                    Denoiser* denoiser,
+                                    const std::vector<float>& sigmas) {
+        if (!sd_version_is_unet(version)) {
+            LOG_WARN("UCache requested but not supported for this model type (only UNET models)");
+            return;
+        }
+
+        UCacheConfig ucfg;
+        ucfg.enabled                = true;
+        ucfg.reuse_threshold        = get_cache_reuse_threshold(cache_params);
+        ucfg.start_percent          = cache_params.start_percent;
+        ucfg.end_percent            = cache_params.end_percent;
+        // The decay rate is clamped into [0, 1].
+        ucfg.error_decay_rate       = std::max(0.0f, std::min(1.0f, cache_params.error_decay_rate));
+        ucfg.use_relative_threshold = cache_params.use_relative_threshold;
+        ucfg.reset_error_on_compute = cache_params.reset_error_on_compute;
+        runtime.ucache.init(ucfg, denoiser);
+
+        if (!runtime.ucache.enabled()) {
+            LOG_WARN("UCache requested but could not be initialized for this run");
+            return;
+        }
+        runtime.ucache.set_sigmas(sigmas);
+        runtime.mode = SampleCacheMode::UCACHE;
+
+        const char* relative_str = ucfg.use_relative_threshold ? "true" : "false";
+        const char* reset_str    = ucfg.reset_error_on_compute ? "true" : "false";
+        LOG_INFO("UCache enabled - threshold: %.3f, start: %.2f, end: %.2f, decay: %.2f, relative: %s, reset: %s",
+                 ucfg.reuse_threshold, ucfg.start_percent, ucfg.end_percent, ucfg.error_decay_rate,
+                 relative_str, reset_str);
+    }
+
+    // Configure DBCache and/or TaylorSeer on `runtime`; `runtime.mode` is switched only on success.
+    static void init_cachedit_runtime(SampleCacheRuntime& runtime,
+                                      SDVersion version,
+                                      const sd_cache_params_t& cache_params,
+                                      const std::vector<float>& sigmas) {
+        if (!sd_version_is_dit(version)) {
+            LOG_WARN("CacheDIT requested but not supported for this model type (only DiT models)");
+            return;
+        }
+
+        const bool combined       = cache_params.mode == SD_CACHE_CACHE_DIT;
+        const bool use_dbcache    = combined || cache_params.mode == SD_CACHE_DBCACHE;
+        const bool use_taylorseer = combined || cache_params.mode == SD_CACHE_TAYLORSEER;
+
+        DBCacheConfig db;
+        db.enabled                     = use_dbcache;
+        db.Fn_compute_blocks           = cache_params.Fn_compute_blocks;
+        db.Bn_compute_blocks           = cache_params.Bn_compute_blocks;
+        db.residual_diff_threshold     = cache_params.residual_diff_threshold;
+        db.max_warmup_steps            = cache_params.max_warmup_steps;
+        db.max_cached_steps            = cache_params.max_cached_steps;
+        db.max_continuous_cached_steps = cache_params.max_continuous_cached_steps;
+        db.scm_policy_dynamic          = cache_params.scm_policy_dynamic;
+        if (cache_params.scm_mask != nullptr && cache_params.scm_mask[0] != '\0') {
+            db.steps_computation_mask = parse_scm_mask(cache_params.scm_mask);
+        }
+
+        TaylorSeerConfig taylor;
+        taylor.enabled             = use_taylorseer;
+        taylor.n_derivatives       = cache_params.taylorseer_n_derivatives;
+        taylor.skip_interval_steps = cache_params.taylorseer_skip_interval;
+        runtime.cachedit.init(db, taylor);
+        if (!runtime.cachedit.enabled()) {
+            LOG_WARN("CacheDIT requested but could not be initialized for this run");
+            return;
+        }
+        runtime.cachedit.set_sigmas(sigmas);
+        runtime.mode = SampleCacheMode::CACHEDIT;
+        const char* label = combined ? "DBCache+TaylorSeer" : (use_dbcache ? "DBCache" : "TaylorSeer");
+        LOG_INFO("CacheDIT enabled - mode: %s, Fn: %d, Bn: %d, threshold: %.3f, warmup: %d",
+                 label, db.Fn_compute_blocks, db.Bn_compute_blocks, db.residual_diff_threshold, db.max_warmup_steps);
+    }
+
+    // Configure Spectrum on `runtime`; sets `runtime.spectrum_enabled` and does not touch `runtime.mode`.
+    static void init_spectrum_runtime(SampleCacheRuntime& runtime,
+                                      SDVersion version,
+                                      const sd_cache_params_t& cache_params,
+                                      const std::vector<float>& sigmas) {
+        const bool supported = sd_version_is_unet(version) || sd_version_is_dit(version);
+        if (!supported) {
+            LOG_WARN("Spectrum requested but not supported for this model type (only UNET and DiT models)");
+            return;
+        }
+
+        SpectrumConfig spec;
+        spec.w            = cache_params.spectrum_w;
+        spec.m            = cache_params.spectrum_m;
+        spec.lam          = cache_params.spectrum_lam;
+        spec.window_size  = cache_params.spectrum_window_size;
+        spec.flex_window  = cache_params.spectrum_flex_window;
+        spec.warmup_steps = cache_params.spectrum_warmup_steps;
+        spec.stop_percent = cache_params.spectrum_stop_percent;
+
+        const size_t step_count = sigmas.empty() ? 0 : sigmas.size() - 1;
+        runtime.spectrum.init(spec, step_count);
+        runtime.spectrum_enabled = true;
+        LOG_INFO("Spectrum enabled - w: %.2f, m: %d, lam: %.2f, window: %d, flex: %.2f, warmup: %d, stop: %.0f%%",
+                 spec.w, spec.m, spec.lam, spec.window_size,
+                 spec.flex_window, spec.warmup_steps, spec.stop_percent * 100.0f);
+    }
+
+    // Builds the per-run cache state. The returned runtime keeps mode NONE when `cache_params` is
+    // null, caching is disabled, or the EasyCache/UCache percent range is invalid.
+    SampleCacheRuntime init_sample_cache_runtime(SDVersion version,
+                                                 const sd_cache_params_t* cache_params,
+                                                 Denoiser* denoiser,
+                                                 const std::vector<float>& sigmas) {
+        SampleCacheRuntime runtime;
+        if (cache_params == nullptr) {
+            return runtime;
+        }
+
+        const sd_cache_params_t& params = *cache_params;
+        if (params.mode == SD_CACHE_DISABLED) {
+            return runtime;
+        }
+
+        if (!has_valid_cache_percent_range(params)) {
+            LOG_WARN("Cache disabled due to invalid percent range (start=%.3f, end=%.3f)",
+                     params.start_percent,
+                     params.end_percent);
+            return runtime;
+        }
+
+        // Hand over to the initializer of the requested cache; other modes leave `runtime` as is.
+        const auto mode = params.mode;
+        if (mode == SD_CACHE_EASYCACHE) {
+            init_easycache_runtime(runtime, version, params, denoiser);
+        } else if (mode == SD_CACHE_UCACHE) {
+            init_ucache_runtime(runtime, version, params, denoiser, sigmas);
+        } else if (mode == SD_CACHE_DBCACHE || mode == SD_CACHE_TAYLORSEER || mode == SD_CACHE_CACHE_DIT) {
+            init_cachedit_runtime(runtime, version, params, sigmas);
+        } else if (mode == SD_CACHE_SPECTRUM) {
+            init_spectrum_runtime(runtime, version, params, sigmas);
+        }
+
+        return runtime;
+    }
+
+    // Binds the dispatcher to `runtime` for one sampling step. step_index becomes `step - 1` for
+    // a positive `step` and -1 otherwise; with a negative index the constructor returns without
+    // calling into any cache.
+    SampleStepCacheDispatcher::SampleStepCacheDispatcher(SampleCacheRuntime& runtime, int step, float sigma)
+        : runtime(runtime), step(step), sigma(sigma), step_index(step > 0 ? (step - 1) : -1) {
+        if (step_index < 0) {
+            return;
+        }
+
+        // Call begin_step() on whichever step cache is selected by runtime.mode;
+        // SampleCacheMode::NONE has no cache to call.
+        const SampleCacheMode active = runtime.mode;
+        if (active == SampleCacheMode::EASYCACHE) {
+            runtime.easycache.begin_step(step_index, sigma);
+        } else if (active == SampleCacheMode::UCACHE) {
+            runtime.ucache.begin_step(step_index, sigma);
+        } else if (active == SampleCacheMode::CACHEDIT) {
+            runtime.cachedit.begin_step(step_index, sigma);
+        }
+    }
+
+    // Forwards to the selected step cache's before_condition() and returns its result. Returns false
+    // without calling any cache when step_index < 0, `condition` or `output` is null, or mode is NONE.
+    bool SampleStepCacheDispatcher::before_condition(const void* condition,
+                                                     const sd::Tensor<float>& input,
+                                                     sd::Tensor<float>* output) {
+        const bool usable = step_index >= 0 && condition != nullptr && output != nullptr;
+        if (!usable) {
+            return false;
+        }
+        if (runtime.mode == SampleCacheMode::EASYCACHE) {
+            return runtime.easycache.before_condition(condition, input, output, sigma, step_index);
+        }
+        if (runtime.mode == SampleCacheMode::UCACHE) {
+            return runtime.ucache.before_condition(condition, input, output, sigma, step_index);
+        }
+        if (runtime.mode == SampleCacheMode::CACHEDIT) {
+            return runtime.cachedit.before_condition(condition, input, output, sigma, step_index);
+        }
+        return false;
+    }
+
+    // Forwards `condition`, `input` and `output` to the selected step cache's after_condition().
+    // Does nothing when step_index < 0, `condition` is null, or mode is NONE.
+    void SampleStepCacheDispatcher::after_condition(const void* condition,
+                                                    const sd::Tensor<float>& input,
+                                                    const sd::Tensor<float>& output) {
+        if (step_index < 0) {
+            return;
+        }
+        if (condition == nullptr) {
+            return;
+        }
+
+        const SampleCacheMode active = runtime.mode;
+        if (active == SampleCacheMode::EASYCACHE) {
+            runtime.easycache.after_condition(condition, input, output);
+        } else if (active == SampleCacheMode::UCACHE) {
+            runtime.ucache.after_condition(condition, input, output);
+        } else if (active == SampleCacheMode::CACHEDIT) {
+            runtime.cachedit.after_condition(condition, input, output);
+        }
+    }
+
+    // Returns the selected step cache's is_step_skipped(); false when mode is NONE.
+    // NOTE(review): step_index is not checked here, unlike before_condition() - confirm intended.
+    bool SampleStepCacheDispatcher::is_step_skipped() const {
+        if (runtime.mode == SampleCacheMode::EASYCACHE) {
+            return runtime.easycache.is_step_skipped();
+        }
+        if (runtime.mode == SampleCacheMode::UCACHE) {
+            return runtime.ucache.is_step_skipped();
+        }
+        if (runtime.mode == SampleCacheMode::CACHEDIT) {
+            return runtime.cachedit.is_step_skipped();
+        }
+        return false;
+    }
+
+    // Logs the skip statistics of one cache. `name` is the label used in the message, `skipped`
+    // the number of skipped steps and `total_steps` the number of sampling steps. The speedup
+    // estimate total / (total - skipped) is printed only while skipped < total_steps, so the
+    // division can neither hit zero nor operate on a wrapped unsigned difference. When nothing
+    // was skipped, the "completed" line is printed only if `report_no_skip` is true.
+    static void log_cache_skip_summary(const char* name,
+                                       int skipped,
+                                       size_t total_steps,
+                                       bool report_no_skip) {
+        if (total_steps == 0) {
+            return;
+        }
+        if (skipped <= 0) {
+            if (report_no_skip) {
+                LOG_INFO("%s completed without skipping steps", name);
+            }
+            return;
+        }
+
+        if (static_cast<size_t>(skipped) < total_steps) {
+            const size_t computed = total_steps - static_cast<size_t>(skipped);
+            const double speedup  = static_cast<double>(total_steps) / static_cast<double>(computed);
+            LOG_INFO("%s skipped %d/%zu steps (%.2fx estimated speedup)",
+                     name,
+                     skipped,
+                     total_steps,
+                     speedup);
+        } else {
+            LOG_INFO("%s skipped %d/%zu steps",
+                     name,
+                     skipped,
+                     total_steps);
+        }
+    }
+
+    // Logs, for every cache enabled in `runtime`, how many of the `total_steps` sampling steps
+    // were skipped. Nothing is logged when `total_steps` is 0.
+    void log_sample_cache_summary(const SampleCacheRuntime& runtime, size_t total_steps) {
+        if (runtime.easycache_enabled()) {
+            log_cache_skip_summary("EasyCache",
+                                   runtime.easycache.total_steps_skipped,
+                                   total_steps,
+                                   true);
+        }
+
+        if (runtime.ucache_enabled()) {
+            log_cache_skip_summary("UCache",
+                                   runtime.ucache.total_steps_skipped,
+                                   total_steps,
+                                   true);
+        }
+
+        if (runtime.cachedit_enabled()) {
+            log_cache_skip_summary("CacheDIT",
+                                   runtime.cachedit.total_steps_skipped,
+                                   total_steps,
+                                   true);
+        }
+
+        // Spectrum stays silent when it skipped nothing.
+        if (runtime.spectrum_enabled) {
+            log_cache_skip_summary("Spectrum",
+                                   runtime.spectrum.total_steps_skipped,
+                                   total_steps,
+                                   false);
+        }
+    }
+
+}  // namespace sd_sample
diff --git a/src/sample-cache.h b/src/sample-cache.h
new file mode 100644
index 0000000..398ad06
--- /dev/null
+++ b/src/sample-cache.h
@@ -0,0 +1,61 @@
+#ifndef __SAMPLE_CACHE_H__
+#define __SAMPLE_CACHE_H__
+
+#include <vector>
+
+#include "cache_dit.hpp"
+#include "denoiser.hpp"
+#include "easycache.hpp"
+#include "model.h"
+#include "spectrum.hpp"
+#include "tensor.hpp"
+#include "ucache.hpp"
+#include "util.h"
+
+namespace sd_sample {
+
+    enum class SampleCacheMode {  // selects which step cache SampleStepCacheDispatcher forwards to
+        NONE,       // no step cache; dispatcher calls are no-ops / return false
+        EASYCACHE,  // forward to SampleCacheRuntime::easycache
+        UCACHE,     // forward to SampleCacheRuntime::ucache
+        CACHEDIT,   // forward to SampleCacheRuntime::cachedit
+    };
+
+    struct SampleCacheRuntime {  // cache state for one run, as returned by init_sample_cache_runtime()
+        SampleCacheMode mode = SampleCacheMode::NONE;  // selected step cache; NONE by default
+
+        EasyCacheState easycache;         // state used when mode == EASYCACHE
+        UCacheState ucache;               // state used when mode == UCACHE
+        CacheDitConditionState cachedit;  // state used when mode == CACHEDIT
+        SpectrumState spectrum;           // state used when spectrum_enabled is true
+
+        bool spectrum_enabled = false;  // Spectrum flag, kept separately from `mode`
+
+        bool easycache_enabled() const;  // mode == SampleCacheMode::EASYCACHE
+        bool ucache_enabled() const;     // mode == SampleCacheMode::UCACHE
+        bool cachedit_enabled() const;   // mode == SampleCacheMode::CACHEDIT
+    };
+
+    struct SampleStepCacheDispatcher {  // per-step front end for the cache selected by runtime.mode
+        SampleCacheRuntime& runtime;  // referenced, not owned
+        int step;                     // step value as passed to the constructor
+        float sigma;                  // sigma value as passed to the constructor
+        int step_index;               // step - 1 for step > 0, otherwise -1
+        // Calls begin_step() on the selected cache when step_index >= 0.
+        SampleStepCacheDispatcher(SampleCacheRuntime& runtime, int step, float sigma);
+        // The hooks below forward to the cache selected by runtime.mode.
+        bool before_condition(const void* condition, const sd::Tensor<float>& input, sd::Tensor<float>* output);
+        void after_condition(const void* condition, const sd::Tensor<float>& input, const sd::Tensor<float>& output);
+        bool is_step_skipped() const;
+    };
+
+    SampleCacheRuntime init_sample_cache_runtime(SDVersion version,  // model version, checked per cache type
+                                                 const sd_cache_params_t* cache_params,  // may be null (no cache)
+                                                 Denoiser* denoiser,  // passed on to EasyCache/UCache init
+                                                 const std::vector<float>& sigmas);  // sigma schedule of the run
+
+    void log_sample_cache_summary(const SampleCacheRuntime& runtime, size_t total_steps);  // logs skipped-step statistics
+
+}  // namespace sd_sample
+
+#endif  // __SAMPLE_CACHE_H__
diff --git a/src/spectrum.hpp b/src/spectrum.hpp
index 9542a8f..add1796 100644
--- a/src/spectrum.hpp
+++ b/src/spectrum.hpp
@@ -6,6 +6,7 @@
 #include <vector>
 
 #include "ggml_extend.hpp"
+#include "tensor.hpp"
 
 struct SpectrumConfig {
     float w            = 0.40f;
@@ -57,11 +58,8 @@ struct SpectrumState {
         return (num_cached + 1) % ws != 0;
     }
 
-    void update(const ggml_tensor* denoised) {
-        int64_t ne        = ggml_nelements(denoised);
-        const float* data = (const float*)denoised->data;
-
-        H_buf.emplace_back(data, data + ne);
+    void update(const sd::Tensor<float>& denoised) {
+        H_buf.emplace_back(denoised.data(), denoised.data() + denoised.numel());
         T_buf.push_back(taus(cnt));
 
         while ((int)H_buf.size() > K) {
@@ -76,13 +74,13 @@ struct SpectrumState {
         cnt++;
     }
 
-    void predict(ggml_tensor* denoised) {
+    void predict(sd::Tensor<float>* denoised) {
+        GGML_ASSERT(denoised != nullptr);
         int64_t F    = (int64_t)H_buf[0].size();
         int K_curr   = (int)H_buf.size();
         int M1       = config.m + 1;
         float tau_at = taus(cnt);
 
-        // Design matrix X: K_curr x M1 (Chebyshev basis)
         std::vector<float> X(K_curr * M1);
         for (int i = 0; i < K_curr; i++) {
             X[i * M1] = 1.0f;
@@ -92,7 +90,6 @@ struct SpectrumState {
                 X[i * M1 + j] = 2.0f * T_buf[i] * X[i * M1 + j - 1] - X[i * M1 + j - 2];
         }
 
-        // x_star: Chebyshev basis at current tau
         std::vector<float> x_star(M1);
         x_star[0] = 1.0f;
         if (M1 > 1)
@@ -100,7 +97,6 @@ struct SpectrumState {
         for (int j = 2; j < M1; j++)
             x_star[j] = 2.0f * tau_at * x_star[j - 1] - x_star[j - 2];
 
-        // XtX = X^T X + lambda I
         std::vector<float> XtX(M1 * M1, 0.0f);
         for (int i = 0; i < M1; i++) {
             for (int j = 0; j < M1; j++) {
@@ -111,7 +107,6 @@ struct SpectrumState {
             }
         }
 
-        // Cholesky decomposition
         std::vector<float> L(M1 * M1, 0.0f);
         if (!cholesky_decompose(XtX.data(), L.data(), M1)) {
             float trace = 0.0f;
@@ -122,18 +117,15 @@ struct SpectrumState {
             cholesky_decompose(XtX.data(), L.data(), M1);
         }
 
-        // Solve XtX v = x_star
         std::vector<float> v(M1);
         cholesky_solve(L.data(), x_star.data(), v.data(), M1);
 
-        // Prediction weights per history entry
         std::vector<float> weights(K_curr, 0.0f);
         for (int k = 0; k < K_curr; k++)
             for (int j = 0; j < M1; j++)
                 weights[k] += X[k * M1 + j] * v[j];
 
-        // Blend Chebyshev and Taylor predictions
-        float* out          = (float*)denoised->data;
+        float* out          = denoised->data();
         float w_cheb        = config.w;
         float w_taylor      = 1.0f - w_cheb;
         const float* h_last = H_buf.back().data();
diff --git a/src/stable-diffusion.cpp b/src/stable-diffusion.cpp
index bbf2f97..a59ff23 100644
--- a/src/stable-diffusion.cpp
+++ b/src/stable-diffusion.cpp
@@ -8,18 +8,15 @@
 #include "util.h"
 
 #include "auto_encoder_kl.hpp"
-#include "cache_dit.hpp"
 #include "conditioner.hpp"
 #include "control.hpp"
 #include "denoiser.hpp"
 #include "diffusion_model.hpp"
-#include "easycache.hpp"
 #include "esrgan.hpp"
 #include "lora.hpp"
 #include "pmid.hpp"
-#include "spectrum.hpp"
+#include "sample-cache.h"
 #include "tae.hpp"
-#include "ucache.hpp"
 #include "vae.hpp"
 
 #include "latent-preview.h"
@@ -78,7 +75,7 @@ const char* sampling_methods_str[] = {
 
 void calculate_alphas_cumprod(float* alphas_cumprod,
                               float linear_start = 0.00085f,
-                              float linear_end   = 0.0120,
+                              float linear_end   = 0.0120f,
                               int timesteps      = TIMESTEPS) {
     float ls_sqrt = sqrtf(linear_start);
     float le_sqrt = sqrtf(linear_end);
@@ -95,387 +92,14 @@ static float get_cache_reuse_threshold(const sd_cache_params_t& params) {
     float reuse_threshold = params.reuse_threshold;
     if (reuse_threshold == INFINITY) {
         if (params.mode == SD_CACHE_EASYCACHE) {
-            reuse_threshold = 0.2;
+            reuse_threshold = 0.2f;
         } else if (params.mode == SD_CACHE_UCACHE) {
-            reuse_threshold = 1.0;
+            reuse_threshold = 1.0f;
         }
     }
     return std::max(0.0f, reuse_threshold);
 }
 
-enum class SampleCacheMode {
-    NONE,
-    EASYCACHE,
-    UCACHE,
-    CACHEDIT,
-};
-
-struct SampleCacheRuntime {
-    SampleCacheMode mode = SampleCacheMode::NONE;
-
-    EasyCacheState easycache;
-    UCacheState ucache;
-    CacheDitConditionState cachedit;
-    SpectrumState spectrum;
-
-    bool spectrum_enabled = false;
-
-    bool has_step_cache() const {
-        return mode != SampleCacheMode::NONE;
-    }
-
-    bool easycache_enabled() const {
-        return mode == SampleCacheMode::EASYCACHE;
-    }
-
-    bool ucache_enabled() const {
-        return mode == SampleCacheMode::UCACHE;
-    }
-
-    bool cachedit_enabled() const {
-        return mode == SampleCacheMode::CACHEDIT;
-    }
-};
-
-static bool has_valid_cache_percent_range(const sd_cache_params_t& cache_params) {
-    if (cache_params.mode != SD_CACHE_EASYCACHE && cache_params.mode != SD_CACHE_UCACHE) {
-        return true;
-    }
-
-    return cache_params.start_percent >= 0.0f &&
-           cache_params.start_percent < 1.0f &&
-           cache_params.end_percent > 0.0f &&
-           cache_params.end_percent <= 1.0f &&
-           cache_params.start_percent < cache_params.end_percent;
-}
-
-static void init_easycache_runtime(SampleCacheRuntime& runtime,
-                                   SDVersion version,
-                                   const sd_cache_params_t& cache_params,
-                                   Denoiser* denoiser) {
-    if (!sd_version_is_dit(version)) {
-        LOG_WARN("EasyCache requested but not supported for this model type");
-        return;
-    }
-
-    EasyCacheConfig config;
-    config.enabled         = true;
-    config.reuse_threshold = get_cache_reuse_threshold(cache_params);
-    config.start_percent   = cache_params.start_percent;
-    config.end_percent     = cache_params.end_percent;
-
-    runtime.easycache.init(config, denoiser);
-    if (!runtime.easycache.enabled()) {
-        LOG_WARN("EasyCache requested but could not be initialized for this run");
-        return;
-    }
-
-    runtime.mode = SampleCacheMode::EASYCACHE;
-    LOG_INFO("EasyCache enabled - threshold: %.3f, start: %.2f, end: %.2f",
-             config.reuse_threshold,
-             config.start_percent,
-             config.end_percent);
-}
-
-static void init_ucache_runtime(SampleCacheRuntime& runtime,
-                                SDVersion version,
-                                const sd_cache_params_t& cache_params,
-                                Denoiser* denoiser,
-                                const std::vector<float>& sigmas) {
-    if (!sd_version_is_unet(version)) {
-        LOG_WARN("UCache requested but not supported for this model type (only UNET models)");
-        return;
-    }
-
-    UCacheConfig config;
-    config.enabled                = true;
-    config.reuse_threshold        = get_cache_reuse_threshold(cache_params);
-    config.start_percent          = cache_params.start_percent;
-    config.end_percent            = cache_params.end_percent;
-    config.error_decay_rate       = std::max(0.0f, std::min(1.0f, cache_params.error_decay_rate));
-    config.use_relative_threshold = cache_params.use_relative_threshold;
-    config.reset_error_on_compute = cache_params.reset_error_on_compute;
-
-    runtime.ucache.init(config, denoiser);
-    if (!runtime.ucache.enabled()) {
-        LOG_WARN("UCache requested but could not be initialized for this run");
-        return;
-    }
-
-    runtime.ucache.set_sigmas(sigmas);
-    runtime.mode = SampleCacheMode::UCACHE;
-    LOG_INFO("UCache enabled - threshold: %.3f, start: %.2f, end: %.2f, decay: %.2f, relative: %s, reset: %s",
-             config.reuse_threshold,
-             config.start_percent,
-             config.end_percent,
-             config.error_decay_rate,
-             config.use_relative_threshold ? "true" : "false",
-             config.reset_error_on_compute ? "true" : "false");
-}
-
-static void init_cachedit_runtime(SampleCacheRuntime& runtime,
-                                  SDVersion version,
-                                  const sd_cache_params_t& cache_params,
-                                  const std::vector<float>& sigmas) {
-    if (!sd_version_is_dit(version)) {
-        LOG_WARN("CacheDIT requested but not supported for this model type (only DiT models)");
-        return;
-    }
-
-    DBCacheConfig dbcfg;
-    dbcfg.enabled                     = (cache_params.mode == SD_CACHE_DBCACHE ||
-                     cache_params.mode == SD_CACHE_CACHE_DIT);
-    dbcfg.Fn_compute_blocks           = cache_params.Fn_compute_blocks;
-    dbcfg.Bn_compute_blocks           = cache_params.Bn_compute_blocks;
-    dbcfg.residual_diff_threshold     = cache_params.residual_diff_threshold;
-    dbcfg.max_warmup_steps            = cache_params.max_warmup_steps;
-    dbcfg.max_cached_steps            = cache_params.max_cached_steps;
-    dbcfg.max_continuous_cached_steps = cache_params.max_continuous_cached_steps;
-    if (cache_params.scm_mask != nullptr && strlen(cache_params.scm_mask) > 0) {
-        dbcfg.steps_computation_mask = parse_scm_mask(cache_params.scm_mask);
-    }
-    dbcfg.scm_policy_dynamic = cache_params.scm_policy_dynamic;
-
-    TaylorSeerConfig tcfg;
-    tcfg.enabled             = (cache_params.mode == SD_CACHE_TAYLORSEER ||
-                    cache_params.mode == SD_CACHE_CACHE_DIT);
-    tcfg.n_derivatives       = cache_params.taylorseer_n_derivatives;
-    tcfg.skip_interval_steps = cache_params.taylorseer_skip_interval;
-
-    runtime.cachedit.init(dbcfg, tcfg);
-    if (!runtime.cachedit.enabled()) {
-        LOG_WARN("CacheDIT requested but could not be initialized for this run");
-        return;
-    }
-
-    runtime.cachedit.set_sigmas(sigmas);
-    runtime.mode = SampleCacheMode::CACHEDIT;
-    LOG_INFO("CacheDIT enabled - mode: %s, Fn: %d, Bn: %d, threshold: %.3f, warmup: %d",
-             cache_params.mode == SD_CACHE_CACHE_DIT ? "DBCache+TaylorSeer" : (cache_params.mode == SD_CACHE_DBCACHE ? "DBCache" : "TaylorSeer"),
-             dbcfg.Fn_compute_blocks,
-             dbcfg.Bn_compute_blocks,
-             dbcfg.residual_diff_threshold,
-             dbcfg.max_warmup_steps);
-}
-
-static void init_spectrum_runtime(SampleCacheRuntime& runtime,
-                                  SDVersion version,
-                                  const sd_cache_params_t& cache_params,
-                                  const std::vector<float>& sigmas) {
-    if (!sd_version_is_unet(version) && !sd_version_is_dit(version)) {
-        LOG_WARN("Spectrum requested but not supported for this model type (only UNET and DiT models)");
-        return;
-    }
-
-    SpectrumConfig config;
-    config.w            = cache_params.spectrum_w;
-    config.m            = cache_params.spectrum_m;
-    config.lam          = cache_params.spectrum_lam;
-    config.window_size  = cache_params.spectrum_window_size;
-    config.flex_window  = cache_params.spectrum_flex_window;
-    config.warmup_steps = cache_params.spectrum_warmup_steps;
-    config.stop_percent = cache_params.spectrum_stop_percent;
-
-    size_t total_steps = sigmas.size() > 0 ? sigmas.size() - 1 : 0;
-    runtime.spectrum.init(config, total_steps);
-    runtime.spectrum_enabled = true;
-
-    LOG_INFO("Spectrum enabled - w: %.2f, m: %d, lam: %.2f, window: %d, flex: %.2f, warmup: %d, stop: %.0f%%",
-             config.w, config.m, config.lam,
-             config.window_size, config.flex_window,
-             config.warmup_steps, config.stop_percent * 100.0f);
-}
-
-static SampleCacheRuntime init_sample_cache_runtime(SDVersion version,
-                                                    const sd_cache_params_t* cache_params,
-                                                    Denoiser* denoiser,
-                                                    const std::vector<float>& sigmas) {
-    SampleCacheRuntime runtime;
-    if (cache_params == nullptr || cache_params->mode == SD_CACHE_DISABLED) {
-        return runtime;
-    }
-
-    if (!has_valid_cache_percent_range(*cache_params)) {
-        LOG_WARN("Cache disabled due to invalid percent range (start=%.3f, end=%.3f)",
-                 cache_params->start_percent,
-                 cache_params->end_percent);
-        return runtime;
-    }
-
-    switch (cache_params->mode) {
-        case SD_CACHE_EASYCACHE:
-            init_easycache_runtime(runtime, version, *cache_params, denoiser);
-            break;
-        case SD_CACHE_UCACHE:
-            init_ucache_runtime(runtime, version, *cache_params, denoiser, sigmas);
-            break;
-        case SD_CACHE_DBCACHE:
-        case SD_CACHE_TAYLORSEER:
-        case SD_CACHE_CACHE_DIT:
-            init_cachedit_runtime(runtime, version, *cache_params, sigmas);
-            break;
-        case SD_CACHE_SPECTRUM:
-            init_spectrum_runtime(runtime, version, *cache_params, sigmas);
-            break;
-        default:
-            break;
-    }
-
-    return runtime;
-}
-
-struct SampleStepCacheDispatcher {
-    SampleCacheRuntime& runtime;
-    int step;
-    float sigma;
-    int step_index;
-
-    SampleStepCacheDispatcher(SampleCacheRuntime& runtime, int step, float sigma)
-        : runtime(runtime), step(step), sigma(sigma), step_index(step > 0 ? (step - 1) : -1) {
-        if (step_index < 0) {
-            return;
-        }
-
-        switch (runtime.mode) {
-            case SampleCacheMode::EASYCACHE:
-                runtime.easycache.begin_step(step_index, sigma);
-                break;
-            case SampleCacheMode::UCACHE:
-                runtime.ucache.begin_step(step_index, sigma);
-                break;
-            case SampleCacheMode::CACHEDIT:
-                runtime.cachedit.begin_step(step_index, sigma);
-                break;
-            case SampleCacheMode::NONE:
-                break;
-        }
-    }
-
-    bool before_condition(const SDCondition* condition, ggml_tensor* input, ggml_tensor* output) {
-        if (step_index < 0 || condition == nullptr || input == nullptr || output == nullptr) {
-            return false;
-        }
-
-        switch (runtime.mode) {
-            case SampleCacheMode::EASYCACHE:
-                return runtime.easycache.before_condition(condition, input, output, sigma, step_index);
-            case SampleCacheMode::UCACHE:
-                return runtime.ucache.before_condition(condition, input, output, sigma, step_index);
-            case SampleCacheMode::CACHEDIT:
-                return runtime.cachedit.before_condition(condition, input, output, sigma, step_index);
-            case SampleCacheMode::NONE:
-                return false;
-        }
-
-        return false;
-    }
-
-    void after_condition(const SDCondition* condition, ggml_tensor* input, ggml_tensor* output) {
-        if (step_index < 0 || condition == nullptr || input == nullptr || output == nullptr) {
-            return;
-        }
-
-        switch (runtime.mode) {
-            case SampleCacheMode::EASYCACHE:
-                runtime.easycache.after_condition(condition, input, output);
-                break;
-            case SampleCacheMode::UCACHE:
-                runtime.ucache.after_condition(condition, input, output);
-                break;
-            case SampleCacheMode::CACHEDIT:
-                runtime.cachedit.after_condition(condition, input, output);
-                break;
-            case SampleCacheMode::NONE:
-                break;
-        }
-    }
-
-    bool is_step_skipped() const {
-        switch (runtime.mode) {
-            case SampleCacheMode::EASYCACHE:
-                return runtime.easycache.is_step_skipped();
-            case SampleCacheMode::UCACHE:
-                return runtime.ucache.is_step_skipped();
-            case SampleCacheMode::CACHEDIT:
-                return runtime.cachedit.is_step_skipped();
-            case SampleCacheMode::NONE:
-                return false;
-        }
-
-        return false;
-    }
-};
-
-static void log_sample_cache_summary(const SampleCacheRuntime& runtime, size_t total_steps) {
-    if (runtime.easycache_enabled()) {
-        if (runtime.easycache.total_steps_skipped > 0 && total_steps > 0) {
-            if (runtime.easycache.total_steps_skipped < static_cast<int>(total_steps)) {
-                double speedup = static_cast<double>(total_steps) /
-                                 static_cast<double>(total_steps - runtime.easycache.total_steps_skipped);
-                LOG_INFO("EasyCache skipped %d/%zu steps (%.2fx estimated speedup)",
-                         runtime.easycache.total_steps_skipped,
-                         total_steps,
-                         speedup);
-            } else {
-                LOG_INFO("EasyCache skipped %d/%zu steps",
-                         runtime.easycache.total_steps_skipped,
-                         total_steps);
-            }
-        } else if (total_steps > 0) {
-            LOG_INFO("EasyCache completed without skipping steps");
-        }
-    }
-
-    if (runtime.ucache_enabled()) {
-        if (runtime.ucache.total_steps_skipped > 0 && total_steps > 0) {
-            if (runtime.ucache.total_steps_skipped < static_cast<int>(total_steps)) {
-                double speedup = static_cast<double>(total_steps) /
-                                 static_cast<double>(total_steps - runtime.ucache.total_steps_skipped);
-                LOG_INFO("UCache skipped %d/%zu steps (%.2fx estimated speedup)",
-                         runtime.ucache.total_steps_skipped,
-                         total_steps,
-                         speedup);
-            } else {
-                LOG_INFO("UCache skipped %d/%zu steps",
-                         runtime.ucache.total_steps_skipped,
-                         total_steps);
-            }
-        } else if (total_steps > 0) {
-            LOG_INFO("UCache completed without skipping steps");
-        }
-    }
-
-    if (runtime.cachedit_enabled()) {
-        if (runtime.cachedit.total_steps_skipped > 0 && total_steps > 0) {
-            if (runtime.cachedit.total_steps_skipped < static_cast<int>(total_steps)) {
-                double speedup = static_cast<double>(total_steps) /
-                                 static_cast<double>(total_steps - runtime.cachedit.total_steps_skipped);
-                LOG_INFO("CacheDIT skipped %d/%zu steps (%.2fx estimated speedup), accum_diff: %.4f",
-                         runtime.cachedit.total_steps_skipped,
-                         total_steps,
-                         speedup,
-                         runtime.cachedit.accumulated_residual_diff);
-            } else {
-                LOG_INFO("CacheDIT skipped %d/%zu steps, accum_diff: %.4f",
-                         runtime.cachedit.total_steps_skipped,
-                         total_steps,
-                         runtime.cachedit.accumulated_residual_diff);
-            }
-        } else if (total_steps > 0) {
-            LOG_INFO("CacheDIT completed without skipping steps");
-        }
-    }
-
-    if (runtime.spectrum_enabled && runtime.spectrum.total_steps_skipped > 0 && total_steps > 0) {
-        double speedup = static_cast<double>(total_steps) /
-                         static_cast<double>(total_steps - runtime.spectrum.total_steps_skipped);
-        LOG_INFO("Spectrum skipped %d/%zu steps (%.2fx estimated speedup)",
-                 runtime.spectrum.total_steps_skipped,
-                 total_steps,
-                 speedup);
-    }
-}
-
 /*=============================================== StableDiffusionGGML ================================================*/
 
 class StableDiffusionGGML {
@@ -1279,7 +903,7 @@ public:
             if (pred_type == PREDICTION_COUNT) {
                 if (sd_version_is_sd2(version)) {
                     // check is_using_v_parameterization_for_sd2
-                    if (is_using_v_parameterization_for_sd2(ctx, sd_version_is_inpaint(version))) {
+                    if (is_using_v_parameterization_for_sd2(sd_version_is_inpaint(version))) {
                         pred_type = V_PRED;
                     } else {
                         pred_type = EPS_PRED;
@@ -1369,43 +993,31 @@ public:
         return true;
     }
 
-    bool is_using_v_parameterization_for_sd2(ggml_context* work_ctx, bool is_inpaint = false) {
-        ggml_tensor* x_t = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, 8, 8, 4, 1);
-        ggml_set_f32(x_t, 0.5);
-        ggml_tensor* c = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, 1024, 2, 1, 1);
-        ggml_set_f32(c, 0.5);
-
-        ggml_tensor* timesteps = ggml_new_tensor_1d(work_ctx, GGML_TYPE_F32, 1);
-        ggml_set_f32(timesteps, 999);
-
-        ggml_tensor* concat = is_inpaint ? ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, 8, 8, 5, 1) : nullptr;
-        if (concat != nullptr) {
-            ggml_set_f32(concat, 0);
+    bool is_using_v_parameterization_for_sd2(bool is_inpaint = false) {
+        sd::Tensor<float> x_t   = sd::full<float>({8, 8, 4, 1}, 0.5f);
+        sd::Tensor<float> c     = sd::full<float>({1024, 2, 1, 1}, 0.5f);
+        sd::Tensor<float> steps = sd::full<float>({1}, 999.0f);
+        sd::Tensor<float> concat;
+        if (is_inpaint) {
+            concat = sd::zeros<float>({8, 8, 5, 1});
         }
 
-        int64_t t0       = ggml_time_ms();
-        ggml_tensor* out = ggml_dup_tensor(work_ctx, x_t);
+        int64_t t0 = ggml_time_ms();
+        sd::Tensor<float> out;
         DiffusionParams diffusion_params;
-        diffusion_params.x         = x_t;
-        diffusion_params.timesteps = timesteps;
-        diffusion_params.context   = c;
-        diffusion_params.c_concat  = concat;
-        diffusion_model->compute(n_threads, diffusion_params, &out);
+        diffusion_params.x         = &x_t;
+        diffusion_params.timesteps = &steps;
+        diffusion_params.context   = &c;
+        if (!concat.empty()) {
+            diffusion_params.c_concat = &concat;
+        }
+        auto out_opt = diffusion_model->compute(n_threads, diffusion_params);
+        GGML_ASSERT(!out_opt.empty());
+        out = std::move(out_opt);
         diffusion_model->free_compute_buffer();
 
-        double result = 0.f;
-        {
-            float* vec_x   = (float*)x_t->data;
-            float* vec_out = (float*)out->data;
-
-            int64_t n = ggml_nelements(out);
-
-            for (int i = 0; i < n; i++) {
-                result += ((double)vec_out[i] - (double)vec_x[i]);
-            }
-            result /= n;
-        }
-        int64_t t1 = ggml_time_ms();
+        double result = static_cast<double>((out - x_t).mean());
+        int64_t t1    = ggml_time_ms();
         LOG_DEBUG("check is_using_v_parameterization_for_sd2, taking %.2fs", (t1 - t0) * 1.0f / 1000);
         return result < -1;
     }
@@ -1643,8 +1255,7 @@ public:
         }
     }
 
-    SDCondition get_pmid_conditon(ggml_context* work_ctx,
-                                  sd_pm_params_t pm_params,
+    SDCondition get_pmid_conditon(sd_pm_params_t pm_params,
                                   ConditionerParams& condition_params) {
         SDCondition id_cond;
         if (use_pmid) {
@@ -1663,60 +1274,60 @@ public:
             if (pm_params.id_images_count > 0) {
                 int clip_image_size        = 224;
                 pmid_model->style_strength = pm_params.style_strength;
-
-                auto id_image_tensor = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, clip_image_size, clip_image_size, 3, pm_params.id_images_count);
-
-                std::vector<sd_image_f32_t> processed_id_images;
+                sd::Tensor<float> id_image_tensor;
                 for (int i = 0; i < pm_params.id_images_count; i++) {
-                    sd_image_f32_t id_image           = sd_image_t_to_sd_image_f32_t(pm_params.id_images[i]);
-                    sd_image_f32_t processed_id_image = clip_preprocess(id_image, clip_image_size, clip_image_size);
-                    free(id_image.data);
-                    id_image.data = nullptr;
-                    processed_id_images.push_back(processed_id_image);
+                    auto id_image           = sd_image_to_tensor(pm_params.id_images[i]);
+                    auto processed_id_image = clip_preprocess(id_image, clip_image_size, clip_image_size);
+                    if (id_image_tensor.empty()) {
+                        id_image_tensor = processed_id_image;
+                    } else {
+                        id_image_tensor = sd::ops::concat(id_image_tensor, processed_id_image, 3);
+                    }
                 }
 
-                ggml_ext_tensor_iter(id_image_tensor, [&](ggml_tensor* id_image_tensor, int64_t i0, int64_t i1, int64_t i2, int64_t i3) {
-                    float value = sd_image_get_f32(processed_id_images[i3], i0, i1, i2, false);
-                    ggml_ext_tensor_set_f32(id_image_tensor, value, i0, i1, i2, i3);
-                });
-
-                for (auto& image : processed_id_images) {
-                    free(image.data);
-                    image.data = nullptr;
-                }
-                processed_id_images.clear();
-
                 int64_t t0                      = ggml_time_ms();
                 condition_params.num_input_imgs = pm_params.id_images_count;
-                auto cond_tup                   = cond_stage_model->get_learned_condition_with_trigger(work_ctx,
-                                                                                                       n_threads,
+                auto cond_tup                   = cond_stage_model->get_learned_condition_with_trigger(n_threads,
                                                                                                        condition_params);
                 id_cond                         = std::get<0>(cond_tup);
                 auto class_tokens_mask          = std::get<1>(cond_tup);
-                ggml_tensor* id_embeds          = nullptr;
+                sd::Tensor<float> id_embeds;
                 if (pmv2 && pm_params.id_embed_path != nullptr) {
-                    id_embeds = load_tensor_from_file(work_ctx, pm_params.id_embed_path);
+                    try {
+                        id_embeds = sd::load_tensor_from_file_as_tensor<float>(pm_params.id_embed_path);
+                    } catch (const std::exception&) {
+                        id_embeds = {};
+                    }
                 }
-                if (pmv2 && id_embeds == nullptr) {
+                if (pmv2 && id_embeds.empty()) {
                     LOG_WARN("Provided PhotoMaker images, but NO valid ID embeds file for PM v2");
                     LOG_WARN("Turn off PhotoMaker");
                     use_pmid = false;
                 } else {
-                    if (pmv2 && pm_params.id_images_count != id_embeds->ne[1]) {
-                        LOG_WARN("PhotoMaker image count (%d) does NOT match ID embeds (%d). You should run face_detect.py again.", pm_params.id_images_count, id_embeds->ne[1]);
+                    if (pmv2 && pm_params.id_images_count != id_embeds.shape()[1]) {
+                        LOG_WARN("PhotoMaker image count (%d) does NOT match ID embeds (%d). You should run face_detect.py again.", pm_params.id_images_count, static_cast<int>(id_embeds.shape()[1]));
                         LOG_WARN("Turn off PhotoMaker");
                         use_pmid = false;
                     } else {
-                        ggml_tensor* res = nullptr;
-                        pmid_model->compute(n_threads, id_image_tensor, id_cond.c_crossattn, id_embeds, class_tokens_mask, &res, work_ctx);
-                        id_cond.c_crossattn = res;
-                        int64_t t1          = ggml_time_ms();
-                        LOG_INFO("Photomaker ID Stacking, taking %" PRId64 " ms", t1 - t0);
+                        auto res = pmid_model->compute(n_threads,
+                                                       id_image_tensor,
+                                                       id_cond.c_crossattn,
+                                                       id_embeds,
+                                                       class_tokens_mask);
+                        if (res.empty()) {
+                            LOG_ERROR("Photomaker ID Stacking failed");
+                            LOG_WARN("Turn off PhotoMaker");
+                            use_pmid = false;
+                        } else {
+                            id_cond.c_crossattn = std::move(res);
+                            int64_t t1          = ggml_time_ms();
+                            LOG_INFO("Photomaker ID Stacking, taking %" PRId64 " ms", t1 - t0);
+                            // Encode input prompt without the trigger word for delayed conditioning
+                            condition_params.text = cond_stage_model->remove_trigger_from_prompt(condition_params.text);
+                        }
                         if (free_params_immediately) {
                             pmid_model->free_params_buffer();
                         }
-                        // Encode input prompt without the trigger word for delayed conditioning
-                        condition_params.text = cond_stage_model->remove_trigger_from_prompt(work_ctx, condition_params.text);
                     }
                 }
             } else {
@@ -1728,108 +1339,37 @@ public:
         return id_cond;
     }
 
-    ggml_tensor* get_clip_vision_output(ggml_context* work_ctx,
-                                        sd_image_t init_image,
-                                        bool return_pooled   = true,
-                                        int clip_skip        = -1,
-                                        bool zero_out_masked = false) {
-        ggml_tensor* output = nullptr;
+    sd::Tensor<float> get_clip_vision_output(const sd::Tensor<float>& image,
+                                             bool return_pooled   = true,
+                                             int clip_skip        = -1,
+                                             bool zero_out_masked = false) {
+        sd::Tensor<float> output;
         if (zero_out_masked) {
             if (return_pooled) {
-                output = ggml_new_tensor_1d(work_ctx,
-                                            GGML_TYPE_F32,
-                                            clip_vision->vision_model.projection_dim);
+                output = sd::zeros<float>({clip_vision->vision_model.projection_dim});
             } else {
-                output = ggml_new_tensor_2d(work_ctx,
-                                            GGML_TYPE_F32,
-                                            clip_vision->vision_model.hidden_size,
-                                            257);
+                output = sd::zeros<float>({clip_vision->vision_model.hidden_size, 257});
             }
-
-            ggml_set_f32(output, 0.f);
         } else {
-            sd_image_f32_t image         = sd_image_t_to_sd_image_f32_t(init_image);
-            sd_image_f32_t resized_image = clip_preprocess(image, clip_vision->vision_model.image_size, clip_vision->vision_model.image_size);
-            free(image.data);
-            image.data = nullptr;
-
-            ggml_tensor* pixel_values = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, resized_image.width, resized_image.height, 3, 1);
-            sd_image_f32_to_ggml_tensor(resized_image, pixel_values, false);
-            free(resized_image.data);
-            resized_image.data = nullptr;
-
-            // print_ggml_tensor(pixel_values);
-            clip_vision->compute(n_threads, pixel_values, return_pooled, clip_skip, &output, work_ctx);
-            // print_ggml_tensor(c_crossattn);
+            auto pixel_values = clip_preprocess(image, clip_vision->vision_model.image_size, clip_vision->vision_model.image_size);
+            auto output_opt   = clip_vision->compute(n_threads, pixel_values, return_pooled, clip_skip);
+            if (output_opt.empty()) {
+                LOG_ERROR("clip_vision compute failed");
+                return {};
+            }
+            output = std::move(output_opt);
         }
         return output;
     }
 
-    SDCondition get_svd_condition(ggml_context* work_ctx,
-                                  sd_image_t init_image,
-                                  int width,
-                                  int height,
-                                  int fps                  = 6,
-                                  int motion_bucket_id     = 127,
-                                  float augmentation_level = 0.f,
-                                  bool zero_out_masked     = false) {
-        // c_crossattn
-        int64_t t0               = ggml_time_ms();
-        ggml_tensor* c_crossattn = get_clip_vision_output(work_ctx, init_image, true, -1, zero_out_masked);
-
-        // c_concat
-        ggml_tensor* c_concat = nullptr;
-        {
-            if (zero_out_masked) {
-                c_concat = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, width / get_vae_scale_factor(), height / get_vae_scale_factor(), 4, 1);
-                ggml_set_f32(c_concat, 0.f);
-            } else {
-                ggml_tensor* init_img = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, width, height, 3, 1);
-
-                if (width != init_image.width || height != init_image.height) {
-                    sd_image_f32_t image         = sd_image_t_to_sd_image_f32_t(init_image);
-                    sd_image_f32_t resized_image = resize_sd_image_f32_t(image, width, height);
-                    free(image.data);
-                    image.data = nullptr;
-                    sd_image_f32_to_ggml_tensor(resized_image, init_img, false);
-                    free(resized_image.data);
-                    resized_image.data = nullptr;
-                } else {
-                    sd_image_to_ggml_tensor(init_image, init_img);
-                }
-                if (augmentation_level > 0.f) {
-                    ggml_tensor* noise = ggml_dup_tensor(work_ctx, init_img);
-                    ggml_ext_im_set_randn_f32(noise, rng);
-                    // encode_pixels += torch.randn_like(pixels) * augmentation_level
-                    ggml_ext_tensor_scale_inplace(noise, augmentation_level);
-                    ggml_ext_tensor_add_inplace(init_img, noise);
-                }
-                c_concat = encode_first_stage(work_ctx, init_img);
-            }
-        }
-
-        // y
-        ggml_tensor* y = nullptr;
-        {
-            y                            = ggml_new_tensor_1d(work_ctx, GGML_TYPE_F32, diffusion_model->get_adm_in_channels());
-            int out_dim                  = 256;
-            int fps_id                   = fps - 1;
-            std::vector<float> timesteps = {(float)fps_id, (float)motion_bucket_id, augmentation_level};
-            set_timestep_embedding(timesteps, y, out_dim);
-        }
-        int64_t t1 = ggml_time_ms();
-        LOG_DEBUG("computing svd condition graph completed, taking %" PRId64 " ms", t1 - t0);
-        return {c_crossattn, y, c_concat};
-    }
-
     std::vector<float> process_timesteps(const std::vector<float>& timesteps,
-                                         ggml_tensor* init_latent,
-                                         ggml_tensor* denoise_mask) {
+                                         const sd::Tensor<float>& init_latent,
+                                         const sd::Tensor<float>& denoise_mask) {
         if (diffusion_model->get_desc() == "Wan2.2-TI2V-5B") {
-            auto new_timesteps = std::vector<float>(init_latent->ne[2], timesteps[0]);
+            auto new_timesteps = std::vector<float>(static_cast<size_t>(init_latent.shape()[2]), timesteps[0]);
 
-            if (denoise_mask != nullptr) {
-                float value = ggml_ext_tensor_get_f32(denoise_mask, 0, 0, 0, 0);
+            if (!denoise_mask.empty()) {
+                float value = denoise_mask.dim() == 5 ? denoise_mask.index(0, 0, 0, 0, 0) : denoise_mask.index(0, 0, 0, 0);
                 if (value == 0.f) {
                     new_timesteps[0] = 0.f;
                 }
@@ -1840,40 +1380,19 @@ public:
         }
     }
 
-    // a = a * mask + b * (1 - mask)
-    void apply_mask(ggml_tensor* a, ggml_tensor* b, ggml_tensor* mask) {
-        for (int64_t i0 = 0; i0 < a->ne[0]; i0++) {
-            for (int64_t i1 = 0; i1 < a->ne[1]; i1++) {
-                for (int64_t i2 = 0; i2 < a->ne[2]; i2++) {
-                    for (int64_t i3 = 0; i3 < a->ne[3]; i3++) {
-                        float a_value    = ggml_ext_tensor_get_f32(a, i0, i1, i2, i3);
-                        float b_value    = ggml_ext_tensor_get_f32(b, i0, i1, i2, i3);
-                        float mask_value = ggml_ext_tensor_get_f32(mask, i0 % mask->ne[0], i1 % mask->ne[1], i2 % mask->ne[2], i3 % mask->ne[3]);
-                        ggml_ext_tensor_set_f32(a, a_value * mask_value + b_value * (1 - mask_value), i0, i1, i2, i3);
-                    }
-                }
-            }
-        }
-    }
-
-    void preview_image(ggml_context* work_ctx,
-                       int step,
-                       ggml_tensor* latents,
+    void preview_image(int step,
+                       const sd::Tensor<float>& latents,
                        enum SDVersion version,
                        preview_t preview_mode,
-                       ggml_tensor* result,
                        std::function<void(int, int, sd_image_t*, bool, void*)> step_callback,
                        void* step_callback_data,
                        bool is_noisy) {
-        const uint32_t channel = 3;
-        uint32_t width         = static_cast<uint32_t>(latents->ne[0]);
-        uint32_t height        = static_cast<uint32_t>(latents->ne[1]);
-        uint32_t dim           = static_cast<uint32_t>(latents->ne[ggml_n_dims(latents) - 1]);
-
         if (preview_mode == PREVIEW_PROJ) {
-            int patch_sz                           = 1;
-            const float(*latent_rgb_proj)[channel] = nullptr;
-            float* latent_rgb_bias                 = nullptr;
+            int patch_sz                     = 1;
+            const float(*latent_rgb_proj)[3] = nullptr;
+            float* latent_rgb_bias           = nullptr;
+            bool is_video                    = preview_latent_tensor_is_video(latents);
+            uint32_t dim                     = is_video ? static_cast<uint32_t>(latents.shape()[3]) : static_cast<uint32_t>(latents.shape()[2]);
 
             if (dim == 128) {
                 if (sd_version_is_flux2(version)) {
@@ -1887,12 +1406,9 @@ public:
                     latent_rgb_bias = wan_22_latent_rgb_bias;
                 } else {
                     LOG_WARN("No latent to RGB projection known for this model");
-                    // unknown model
                     return;
                 }
             } else if (dim == 16) {
-                // 16 channels VAE -> Flux or SD3
-
                 if (sd_version_is_sd3(version)) {
                     latent_rgb_proj = sd3_latent_rgb_proj;
                     latent_rgb_bias = sd3_latent_rgb_bias;
@@ -1904,12 +1420,9 @@ public:
                     latent_rgb_bias = wan_21_latent_rgb_bias;
                 } else {
                     LOG_WARN("No latent to RGB projection known for this model");
-                    // unknown model
                     return;
                 }
-
             } else if (dim == 4) {
-                // 4 channels VAE
                 if (sd_version_is_sdxl(version)) {
                     latent_rgb_proj = sdxl_latent_rgb_proj;
                     latent_rgb_bias = sdxl_latent_rgb_bias;
@@ -1917,459 +1430,394 @@ public:
                     latent_rgb_proj = sd_latent_rgb_proj;
                     latent_rgb_bias = sd_latent_rgb_bias;
                 } else {
-                    // unknown model
                     LOG_WARN("No latent to RGB projection known for this model");
                     return;
                 }
-            } else if (dim == 3) {
-                // Do nothing, assuming already RGB latents
-            } else {
+            } else if (dim != 3) {
                 LOG_WARN("No latent to RGB projection known for this model");
-                // unknown latent space
                 return;
             }
 
-            uint32_t frames = 1;
-            if (ggml_n_dims(latents) == 4) {
-                frames = static_cast<uint32_t>(latents->ne[2]);
-            }
-
-            uint32_t img_width  = width * patch_sz;
-            uint32_t img_height = height * patch_sz;
-
-            uint8_t* data = (uint8_t*)malloc(frames * img_width * img_height * channel * sizeof(uint8_t));
+            uint32_t frames     = is_video ? static_cast<uint32_t>(latents.shape()[2]) : 1;
+            uint32_t img_width  = static_cast<uint32_t>(latents.shape()[0]) * patch_sz;
+            uint32_t img_height = static_cast<uint32_t>(latents.shape()[1]) * patch_sz;
 
+            uint8_t* data = (uint8_t*)malloc(frames * img_width * img_height * 3 * sizeof(uint8_t));
+            GGML_ASSERT(data != nullptr);
             preview_latent_video(data, latents, latent_rgb_proj, latent_rgb_bias, patch_sz);
             sd_image_t* images = (sd_image_t*)malloc(frames * sizeof(sd_image_t));
+            GGML_ASSERT(images != nullptr);
             for (uint32_t i = 0; i < frames; i++) {
-                images[i] = {img_width, img_height, channel, data + i * img_width * img_height * channel};
+                images[i] = {img_width, img_height, 3, data + i * img_width * img_height * 3};
             }
             step_callback(step, frames, images, is_noisy, step_callback_data);
             free(data);
             free(images);
-        } else {
-            if (preview_mode == PREVIEW_VAE || preview_mode == PREVIEW_TAE) {
-                if (preview_vae) {
-                    latents = preview_vae->diffusion_to_vae_latents(work_ctx, latents);
-                    result  = preview_vae->decode(n_threads, work_ctx, latents, vae_tiling_params, false, circular_x, circular_y, result, true);
-                } else {
-                    latents = first_stage_model->diffusion_to_vae_latents(work_ctx, latents);
-                    result  = first_stage_model->decode(n_threads, work_ctx, latents, vae_tiling_params, false, circular_x, circular_y, result, true);
-                }
+            return;
+        }
+
+        if (preview_mode == PREVIEW_VAE || preview_mode == PREVIEW_TAE) {
+            sd::Tensor<float> vae_latents;
+            sd::Tensor<float> decoded;
+            bool is_video = preview_latent_tensor_is_video(latents);
+            if (preview_vae) {
+                vae_latents = preview_vae->diffusion_to_vae_latents(latents);
+                decoded     = preview_vae->decode(n_threads, vae_latents, vae_tiling_params, is_video, circular_x, circular_y, true);
             } else {
+                vae_latents = first_stage_model->diffusion_to_vae_latents(latents);
+                decoded     = first_stage_model->decode(n_threads, vae_latents, vae_tiling_params, is_video, circular_x, circular_y, true);
+            }
+            if (decoded.empty()) {
+                LOG_ERROR("preview decode failed at step %d", step);
                 return;
             }
 
-            ggml_ext_tensor_clamp_inplace(result, 0.0f, 1.0f);
-            uint32_t frames = 1;
-            if (ggml_n_dims(latents) == 4) {
-                frames = static_cast<uint32_t>(result->ne[2]);
-            }
-
+            is_video           = preview_latent_tensor_is_video(decoded);
+            uint32_t frames    = is_video ? static_cast<uint32_t>(decoded.shape()[2]) : 1;
             sd_image_t* images = (sd_image_t*)malloc(frames * sizeof(sd_image_t));
-            // print_ggml_tensor(result,true);
-            for (size_t i = 0; i < frames; i++) {
-                images[i].width   = static_cast<uint32_t>(result->ne[0]);
-                images[i].height  = static_cast<uint32_t>(result->ne[1]);
-                images[i].channel = 3;
-                images[i].data    = ggml_tensor_to_sd_image(result, static_cast<int>(i), ggml_n_dims(latents) == 4);
+            GGML_ASSERT(images != nullptr);
+            for (uint32_t i = 0; i < frames; ++i) {
+                images[i] = tensor_to_sd_image(decoded, static_cast<int>(i));
             }
 
             step_callback(step, frames, images, is_noisy, step_callback_data);
-
-            ggml_ext_tensor_scale_inplace(result, 0);
-            for (uint32_t i = 0; i < frames; i++) {
+            for (uint32_t i = 0; i < frames; ++i) {
                 free(images[i].data);
             }
-
             free(images);
+            return;
+        }
+
+        if (preview_mode != PREVIEW_NONE) {
+            LOG_WARN("Unsupported preview mode: %d", static_cast<int>(preview_mode));
         }
     }
 
-    ggml_tensor* sample(ggml_context* work_ctx,
-                        std::shared_ptr<DiffusionModel> work_diffusion_model,
-                        bool inverse_noise_scaling,
-                        ggml_tensor* init_latent,
-                        ggml_tensor* noise,
-                        SDCondition cond,
-                        SDCondition uncond,
-                        SDCondition img_cond,
-                        ggml_tensor* control_hint,
-                        float control_strength,
-                        sd_guidance_params_t guidance,
-                        float eta,
-                        int shifted_timestep,
-                        sample_method_t method,
-                        const std::vector<float>& sigmas,
-                        int start_merge_step,
-                        SDCondition id_cond,
-                        std::vector<ggml_tensor*> ref_latents = {},
-                        bool increase_ref_index               = false,
-                        ggml_tensor* denoise_mask             = nullptr,
-                        ggml_tensor* vace_context             = nullptr,
-                        float vace_strength                   = 1.f,
-                        const sd_cache_params_t* cache_params = nullptr) {
-        if (shifted_timestep > 0 && !sd_version_is_sdxl(version)) {
-            LOG_WARN("timestep shifting is only supported for SDXL models!");
-            shifted_timestep = 0;
+    std::vector<float> prepare_sample_timesteps(float sigma,
+                                                int shifted_timestep) {
+        float t = denoiser->sigma_to_t(sigma);
+        if (shifted_timestep > 0) {
+            float shifted_t_float = t * (float(shifted_timestep) / float(TIMESTEPS));
+            int64_t shifted_t     = static_cast<int64_t>(roundf(shifted_t_float));
+            shifted_t             = std::max((int64_t)0, std::min((int64_t)(TIMESTEPS - 1), shifted_t));
+            LOG_DEBUG("shifting timestep from %.2f to %" PRId64 " (sigma: %.4f)", t, shifted_t, sigma);
+            return std::vector<float>{(float)shifted_t};
         }
+        if (sd_version_is_anima(version)) {
+            return std::vector<float>{t / static_cast<float>(TIMESTEPS)};
+        }
+        if (sd_version_is_z_image(version)) {
+            return std::vector<float>{1000.f - t};
+        }
+        return std::vector<float>{t};
+    }
+
+    void adjust_sample_step_scalings(int shifted_timestep,
+                                     const std::vector<float>& timesteps_vec,
+                                     float c_in,
+                                     float* c_skip,
+                                     float* c_out) {
+        GGML_ASSERT(c_skip != nullptr);
+        GGML_ASSERT(c_out != nullptr);
+        if (shifted_timestep <= 0) {
+            return;
+        }
+
+        int64_t shifted_t_idx              = static_cast<int64_t>(roundf(timesteps_vec[0]));
+        float shifted_sigma                = denoiser->t_to_sigma((float)shifted_t_idx);
+        std::vector<float> shifted_scaling = denoiser->get_scalings(shifted_sigma);
+        float shifted_c_skip               = shifted_scaling[0];
+        float shifted_c_out                = shifted_scaling[1];
+        float shifted_c_in                 = shifted_scaling[2];
+
+        *c_skip = shifted_c_skip * c_in / shifted_c_in;
+        *c_out  = shifted_c_out;
+    }
+
+    struct SamplePreviewContext {
+        sd_preview_cb_t callback = nullptr;
+        void* data               = nullptr;
+        preview_t mode           = PREVIEW_NONE;
+    };
+
+    SamplePreviewContext prepare_sample_preview_context() {
+        return SamplePreviewContext{sd_get_preview_callback(),
+                                    sd_get_preview_callback_data(),
+                                    sd_get_preview_mode()};
+    }
+
+    void report_sample_progress(int step, size_t total_steps, int64_t t0) {
+        int64_t t1 = ggml_time_us();
+        if (step > 0 || step == -(int)total_steps) {
+            int showstep = std::abs(step);
+            pretty_progress(showstep, (int)total_steps, (t1 - t0) / 1000000.f / showstep);
+        }
+    }
+
+    void compute_sample_controls(const sd::Tensor<float>& control_image,
+                                 const sd::Tensor<float>& noised_input,
+                                 const sd::Tensor<float>& timesteps_tensor,
+                                 const SDCondition& condition,
+                                 std::vector<sd::Tensor<float>>* controls) {
+        GGML_ASSERT(controls != nullptr);
+        controls->clear();
+        if (control_image.empty() || control_net == nullptr) {
+            return;
+        }
+
+        auto control_result = control_net->compute(n_threads,
+                                                   noised_input,
+                                                   control_image,
+                                                   timesteps_tensor,
+                                                   condition.c_crossattn,
+                                                   condition.c_vector);
+        if (!control_result.has_value()) {
+            LOG_ERROR("controlnet compute failed");
+            return;
+        }
+
+        *controls = std::move(*control_result);
+    }
+
+    sd::Tensor<float> sample(const std::shared_ptr<DiffusionModel>& work_diffusion_model,
+                             bool inverse_noise_scaling,
+                             const sd::Tensor<float>& init_latent,
+                             sd::Tensor<float> noise,
+                             const SDCondition& cond,
+                             const SDCondition& uncond,
+                             const SDCondition& img_cond,
+                             const SDCondition& id_cond,
+                             const sd::Tensor<float>& control_image,
+                             float control_strength,
+                             const sd_guidance_params_t& guidance,
+                             float eta,
+                             int shifted_timestep,
+                             sample_method_t method,
+                             const std::vector<float>& sigmas,
+                             int start_merge_step,
+                             const std::vector<sd::Tensor<float>>& ref_latents,
+                             bool increase_ref_index,
+                             const sd::Tensor<float>& denoise_mask,
+                             const sd::Tensor<float>& vace_context,
+                             float vace_strength,
+                             const sd_cache_params_t* cache_params) {
         std::vector<int> skip_layers(guidance.slg.layers, guidance.slg.layers + guidance.slg.layer_count);
-
-        float cfg_scale = guidance.txt_cfg;
-        if (cfg_scale < 1.f) {
-            if (cfg_scale == 0.f) {
-                // Diffusers follow the convention from the original paper
-                // (https://arxiv.org/abs/2207.12598v1), so many distilled model docs
-                // recommend 0 as guidance; warn the user that it'll disable prompt folowing
-                LOG_WARN("unconditioned mode, images won't follow the prompt (use cfg-scale=1 for distilled models)");
-            } else {
-                LOG_WARN("cfg value out of expected range may produce unexpected results");
-            }
-        }
-
-        float img_cfg_scale = std::isfinite(guidance.img_cfg) ? guidance.img_cfg : guidance.txt_cfg;
+        float cfg_scale     = guidance.txt_cfg;
+        float img_cfg_scale = guidance.img_cfg;
         float slg_scale     = guidance.slg.scale;
 
-        if (img_cfg_scale != cfg_scale && !sd_version_is_inpaint_or_unet_edit(version)) {
-            LOG_WARN("2-conditioning CFG is not supported with this model, disabling it for better performance...");
-            img_cfg_scale = cfg_scale;
+        sd_sample::SampleCacheRuntime cache_runtime = sd_sample::init_sample_cache_runtime(version,
+                                                                                           cache_params,
+                                                                                           denoiser.get(),
+                                                                                           sigmas);
+        size_t steps                                = sigmas.size() - 1;
+        bool has_skiplayer                          = slg_scale != 0.0f && !skip_layers.empty();
+        if (has_skiplayer && !sd_version_is_dit(version)) {
+            has_skiplayer = false;
+            LOG_WARN("SLG is incompatible with this model type");
         }
 
-        SampleCacheRuntime cache_runtime = init_sample_cache_runtime(version, cache_params, denoiser.get(), sigmas);
+        int64_t t0                   = ggml_time_us();
+        sd::Tensor<float> x_t        = !noise.empty()
+                                           ? denoiser->noise_scaling(sigmas[0], noise, init_latent)
+                                           : init_latent;
+        sd::Tensor<float> denoised   = x_t;
+        SamplePreviewContext preview = prepare_sample_preview_context();
 
-        size_t steps   = sigmas.size() - 1;
-        ggml_tensor* x = ggml_ext_dup_and_cpy_tensor(work_ctx, init_latent);
-
-        if (noise) {
-            x = denoiser->noise_scaling(sigmas[0], noise, x);
-        }
-
-        ggml_tensor* noised_input = ggml_dup_tensor(work_ctx, x);
-
-        bool has_unconditioned = img_cfg_scale != 1.0 && uncond.c_crossattn != nullptr;
-        bool has_img_cond      = cfg_scale != img_cfg_scale && img_cond.c_crossattn != nullptr;
-        bool has_skiplayer     = slg_scale != 0.0 && skip_layers.size() > 0;
-
-        // denoise wrapper
-        ggml_tensor* out_cond     = ggml_dup_tensor(work_ctx, x);
-        ggml_tensor* out_uncond   = nullptr;
-        ggml_tensor* out_skip     = nullptr;
-        ggml_tensor* out_img_cond = nullptr;
-
-        if (has_unconditioned) {
-            out_uncond = ggml_dup_tensor(work_ctx, x);
-        }
-        if (has_skiplayer) {
-            if (sd_version_is_dit(version)) {
-                out_skip = ggml_dup_tensor(work_ctx, x);
-            } else {
-                has_skiplayer = false;
-                LOG_WARN("SLG is incompatible with %s models", model_version_to_str[version]);
-            }
-        }
-        if (has_img_cond) {
-            out_img_cond = ggml_dup_tensor(work_ctx, x);
-        }
-        ggml_tensor* denoised = ggml_dup_tensor(work_ctx, x);
-
-        int64_t t0 = ggml_time_us();
-
-        ggml_tensor* preview_tensor = nullptr;
-        auto sd_preview_mode        = sd_get_preview_mode();
-        if (sd_preview_mode != PREVIEW_NONE && sd_preview_mode != PREVIEW_PROJ) {
-            int64_t W = x->ne[0] * get_vae_scale_factor();
-            int64_t H = x->ne[1] * get_vae_scale_factor();
-            if (ggml_n_dims(x) == 4) {
-                // assuming video mode (if batch processing gets implemented this will break)
-                int64_t T = x->ne[2];
-                if (sd_version_is_wan(version)) {
-                    T = ((T - 1) * 4) + 1;
-                }
-                preview_tensor = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32,
-                                                    W,
-                                                    H,
-                                                    T,
-                                                    3);
-            } else {
-                preview_tensor = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32,
-                                                    W,
-                                                    H,
-                                                    3,
-                                                    x->ne[3]);
-            }
-        }
-
-        auto denoise = [&](ggml_tensor* input, float sigma, int step) -> ggml_tensor* {
-            auto sd_preview_cb      = sd_get_preview_callback();
-            auto sd_preview_cb_data = sd_get_preview_callback_data();
-            auto sd_preview_mode    = sd_get_preview_mode();
+        auto denoise = [&](const sd::Tensor<float>& x, float sigma, int step) -> sd::Tensor<float> {
             if (step == 1 || step == -1) {
                 pretty_progress(0, (int)steps, 0);
             }
 
-            DiffusionParams diffusion_params;
-            SampleStepCacheDispatcher step_cache(cache_runtime, step, sigma);
-
             std::vector<float> scaling = denoiser->get_scalings(sigma);
             GGML_ASSERT(scaling.size() == 3);
             float c_skip = scaling[0];
             float c_out  = scaling[1];
             float c_in   = scaling[2];
 
-            float t = denoiser->sigma_to_t(sigma);
-            std::vector<float> timesteps_vec;
-            if (shifted_timestep > 0 && sd_version_is_sdxl(version)) {
-                float shifted_t_float = t * (float(shifted_timestep) / float(TIMESTEPS));
-                int64_t shifted_t     = static_cast<int64_t>(roundf(shifted_t_float));
-                shifted_t             = std::max((int64_t)0, std::min((int64_t)(TIMESTEPS - 1), shifted_t));
-                LOG_DEBUG("shifting timestep from %.2f to %" PRId64 " (sigma: %.4f)", t, shifted_t, sigma);
-                timesteps_vec.assign(1, (float)shifted_t);
-            } else if (sd_version_is_anima(version)) {
-                // Anima uses normalized flow timesteps.
-                timesteps_vec.assign(1, t / static_cast<float>(TIMESTEPS));
-            } else if (sd_version_is_z_image(version)) {
-                timesteps_vec.assign(1, 1000.f - t);
-            } else {
-                timesteps_vec.assign(1, t);
+            std::vector<float> timesteps_vec = prepare_sample_timesteps(sigma, shifted_timestep);
+            timesteps_vec                    = process_timesteps(timesteps_vec, init_latent, denoise_mask);
+            adjust_sample_step_scalings(shifted_timestep, timesteps_vec, c_in, &c_skip, &c_out);
+
+            sd::Tensor<float> timesteps_tensor({static_cast<int64_t>(timesteps_vec.size())}, timesteps_vec);
+            sd::Tensor<float> guidance_tensor({1}, std::vector<float>{guidance.distilled_guidance});
+            sd::Tensor<float> noised_input = x * c_in;
+            if (!denoise_mask.empty() && version == VERSION_WAN2_2_TI2V) {
+                noised_input = noised_input * denoise_mask + init_latent * (1.0f - denoise_mask);
             }
 
-            timesteps_vec = process_timesteps(timesteps_vec, init_latent, denoise_mask);
-
             if (cache_runtime.spectrum_enabled && cache_runtime.spectrum.should_predict()) {
-                cache_runtime.spectrum.predict(denoised);
-
-                if (denoise_mask != nullptr) {
-                    apply_mask(denoised, init_latent, denoise_mask);
+                cache_runtime.spectrum.predict(&denoised);
+                if (!denoise_mask.empty()) {
+                    denoised = denoised * denoise_mask + init_latent * (1.0f - denoise_mask);
                 }
-
-                if (sd_preview_cb != nullptr && sd_should_preview_denoised()) {
-                    if (step % sd_get_preview_interval() == 0) {
-                        preview_image(work_ctx, step, denoised, version, sd_preview_mode, preview_tensor, sd_preview_cb, sd_preview_cb_data, false);
-                    }
-                }
-
-                int64_t t1 = ggml_time_us();
-                if (step > 0 || step == -(int)steps) {
-                    int showstep = std::abs(step);
-                    pretty_progress(showstep, (int)steps, (t1 - t0) / 1000000.f / showstep);
+                if (sd_should_preview_denoised() && preview.callback != nullptr) {
+                    preview_image(step, denoised, version, preview.mode, preview.callback, preview.data, false);
                 }
+                report_sample_progress(step, steps, t0);
                 return denoised;
             }
 
-            auto timesteps = vector_to_ggml_tensor(work_ctx, timesteps_vec);
-            std::vector<float> guidance_vec(1, guidance.distilled_guidance);
-            auto guidance_tensor = vector_to_ggml_tensor(work_ctx, guidance_vec);
-
-            copy_ggml_tensor(noised_input, input);
-            // noised_input = noised_input * c_in
-            ggml_ext_tensor_scale_inplace(noised_input, c_in);
-
-            if (denoise_mask != nullptr && version == VERSION_WAN2_2_TI2V) {
-                apply_mask(noised_input, init_latent, denoise_mask);
-            }
-            if (sd_preview_cb != nullptr && sd_should_preview_noisy()) {
-                if (step % sd_get_preview_interval() == 0) {
-                    preview_image(work_ctx, step, noised_input, version, sd_preview_mode, preview_tensor, sd_preview_cb, sd_preview_cb_data, true);
-                }
+            if (sd_should_preview_noisy() && preview.callback != nullptr) {
+                preview_image(step, noised_input, version, preview.mode, preview.callback, preview.data, true);
             }
 
-            std::vector<ggml_tensor*> controls;
-
-            if (control_hint != nullptr && control_net != nullptr) {
-                if (control_net->compute(n_threads, noised_input, control_hint, timesteps, cond.c_crossattn, cond.c_vector)) {
-                    controls = control_net->controls;
-                } else {
-                    LOG_ERROR("controlnet compute failed");
-                }
-                // print_ggml_tensor(controls[12]);
-                // GGML_ASSERT(0);
-            }
-
-            diffusion_params.x                  = noised_input;
-            diffusion_params.timesteps          = timesteps;
-            diffusion_params.guidance           = guidance_tensor;
-            diffusion_params.ref_latents        = ref_latents;
+            sd::Tensor<float> cond_out;
+            sd::Tensor<float> uncond_out;
+            sd::Tensor<float> img_cond_out;
+            sd::Tensor<float> skip_cond_out;
+            sd_sample::SampleStepCacheDispatcher step_cache(cache_runtime, step, sigma);
+            std::vector<sd::Tensor<float>> controls;
+            DiffusionParams diffusion_params;
+            diffusion_params.x                  = &noised_input;
+            diffusion_params.timesteps          = &timesteps_tensor;
+            diffusion_params.guidance           = &guidance_tensor;
+            diffusion_params.ref_latents        = &ref_latents;
             diffusion_params.increase_ref_index = increase_ref_index;
-            diffusion_params.controls           = controls;
+            diffusion_params.controls           = &controls;
             diffusion_params.control_strength   = control_strength;
-            diffusion_params.vace_context       = vace_context;
+            diffusion_params.vace_context       = vace_context.empty() ? nullptr : &vace_context;
             diffusion_params.vace_strength      = vace_strength;
+            diffusion_params.skip_layers        = nullptr;
 
-            auto run_diffusion_condition = [&](const SDCondition* condition, ggml_tensor** output_tensor) -> bool {
-                if (step_cache.before_condition(condition, diffusion_params.x, *output_tensor)) {
-                    return true;
+            compute_sample_controls(control_image,
+                                    noised_input,
+                                    timesteps_tensor,
+                                    cond,
+                                    &controls);
+
+            auto run_condition = [&](const SDCondition& condition,
+                                     const sd::Tensor<float>* c_concat_override = nullptr,
+                                     const std::vector<int>* local_skip_layers  = nullptr) -> sd::Tensor<float> {
+                diffusion_params.context     = condition.c_crossattn.empty() ? nullptr : &condition.c_crossattn;
+                diffusion_params.c_concat    = c_concat_override != nullptr ? c_concat_override : (condition.c_concat.empty() ? nullptr : &condition.c_concat);
+                diffusion_params.y           = condition.c_vector.empty() ? nullptr : &condition.c_vector;
+                diffusion_params.t5_ids      = condition.c_t5_ids.empty() ? nullptr : &condition.c_t5_ids;
+                diffusion_params.t5_weights  = condition.c_t5_weights.empty() ? nullptr : &condition.c_t5_weights;
+                diffusion_params.skip_layers = local_skip_layers;
+
+                sd::Tensor<float> cached_output;
+                if (step_cache.before_condition(&condition, noised_input, &cached_output)) {
+                    return std::move(cached_output);
                 }
 
-                if (!work_diffusion_model->compute(n_threads,
-                                                   diffusion_params,
-                                                   output_tensor)) {
+                auto output_opt = work_diffusion_model->compute(n_threads, diffusion_params);
+                if (output_opt.empty()) {
                     LOG_ERROR("diffusion model compute failed");
-                    return false;
+                    return sd::Tensor<float>();
                 }
 
-                step_cache.after_condition(condition, diffusion_params.x, *output_tensor);
-                return true;
+                step_cache.after_condition(&condition, noised_input, output_opt);
+                return output_opt;
             };
 
-            const SDCondition* active_condition = nullptr;
-            ggml_tensor** active_output         = &out_cond;
             if (start_merge_step == -1 || step <= start_merge_step) {
-                // cond
-                diffusion_params.context  = cond.c_crossattn;
-                diffusion_params.c_concat = cond.c_concat;
-                diffusion_params.y        = cond.c_vector;
-                active_condition          = &cond;
+                cond_out = run_condition(cond);
+                if (cond_out.empty()) {
+                    return {};
+                }
             } else {
-                diffusion_params.context  = id_cond.c_crossattn;
-                diffusion_params.c_concat = cond.c_concat;
-                diffusion_params.y        = id_cond.c_vector;
-                active_condition          = &id_cond;
-            }
-
-            if (!run_diffusion_condition(active_condition, active_output)) {
-                return nullptr;
-            }
-
-            bool current_step_skipped = step_cache.is_step_skipped();
-
-            float* negative_data = nullptr;
-            if (has_unconditioned) {
-                // uncond
-                if (!current_step_skipped && control_hint != nullptr && control_net != nullptr) {
-                    if (control_net->compute(n_threads, noised_input, control_hint, timesteps, uncond.c_crossattn, uncond.c_vector)) {
-                        controls = control_net->controls;
-                    } else {
-                        LOG_ERROR("controlnet compute failed");
-                    }
+                GGML_ASSERT(!id_cond.empty());
+                cond_out = run_condition(id_cond,
+                                         cond.c_concat.empty() ? nullptr : &cond.c_concat);
+                if (cond_out.empty()) {
+                    return {};
                 }
-                current_step_skipped      = step_cache.is_step_skipped();
-                diffusion_params.controls = controls;
-                diffusion_params.context  = uncond.c_crossattn;
-                diffusion_params.c_concat = uncond.c_concat;
-                diffusion_params.y        = uncond.c_vector;
-                if (!run_diffusion_condition(&uncond, &out_uncond)) {
-                    return nullptr;
-                }
-                negative_data = (float*)out_uncond->data;
             }
 
-            float* img_cond_data = nullptr;
-            if (has_img_cond) {
-                diffusion_params.context  = img_cond.c_crossattn;
-                diffusion_params.c_concat = img_cond.c_concat;
-                diffusion_params.y        = img_cond.c_vector;
-                if (!run_diffusion_condition(&img_cond, &out_img_cond)) {
-                    return nullptr;
+            if (!uncond.empty()) {
+                if (!step_cache.is_step_skipped()) {
+                    compute_sample_controls(control_image,
+                                            noised_input,
+                                            timesteps_tensor,
+                                            uncond,
+                                            &controls);
+                }
+                uncond_out = run_condition(uncond);
+                if (uncond_out.empty()) {
+                    return {};
                 }
-                img_cond_data = (float*)out_img_cond->data;
             }
-
-            int step_count         = static_cast<int>(sigmas.size());
-            bool is_skiplayer_step = has_skiplayer && step > (int)(guidance.slg.layer_start * step_count) && step < (int)(guidance.slg.layer_end * step_count);
-            float* skip_layer_data = has_skiplayer ? (float*)out_skip->data : nullptr;
+            if (!img_cond.empty()) {
+                img_cond_out = run_condition(img_cond,
+                                             cond.c_concat.empty() ? nullptr : &cond.c_concat);
+                if (img_cond_out.empty()) {
+                    return {};
+                }
+            }
+            bool is_skiplayer_step = has_skiplayer &&
+                                     step > (int)(guidance.slg.layer_start * static_cast<int>(sigmas.size())) &&
+                                     step < (int)(guidance.slg.layer_end * static_cast<int>(sigmas.size()));
             if (is_skiplayer_step) {
                 LOG_DEBUG("Skipping layers at step %d\n", step);
                 if (!step_cache.is_step_skipped()) {
-                    // skip layer (same as conditioned)
-                    diffusion_params.context     = cond.c_crossattn;
-                    diffusion_params.c_concat    = cond.c_concat;
-                    diffusion_params.y           = cond.c_vector;
-                    diffusion_params.skip_layers = skip_layers;
-                    if (!work_diffusion_model->compute(n_threads,
-                                                       diffusion_params,
-                                                       &out_skip)) {
-                        LOG_ERROR("diffusion model compute failed");
-                        return nullptr;
+                    skip_cond_out = run_condition(cond,
+                                                  cond.c_concat.empty() ? nullptr : &cond.c_concat,
+                                                  &skip_layers);
+                    if (skip_cond_out.empty()) {
+                        return {};
                     }
                 }
-                skip_layer_data = (float*)out_skip->data;
-            }
-            float* vec_denoised  = (float*)denoised->data;
-            float* vec_input     = (float*)input->data;
-            float* positive_data = (float*)out_cond->data;
-            int ne_elements      = (int)ggml_nelements(denoised);
-
-            if (shifted_timestep > 0 && sd_version_is_sdxl(version)) {
-                int64_t shifted_t_idx              = static_cast<int64_t>(roundf(timesteps_vec[0]));
-                float shifted_sigma                = denoiser->t_to_sigma((float)shifted_t_idx);
-                std::vector<float> shifted_scaling = denoiser->get_scalings(shifted_sigma);
-                float shifted_c_skip               = shifted_scaling[0];
-                float shifted_c_out                = shifted_scaling[1];
-                float shifted_c_in                 = shifted_scaling[2];
-
-                c_skip = shifted_c_skip * c_in / shifted_c_in;
-                c_out  = shifted_c_out;
             }
 
-            for (int i = 0; i < ne_elements; i++) {
-                float latent_result = positive_data[i];
-                if (has_unconditioned) {
-                    // out_uncond + cfg_scale * (out_cond - out_uncond)
-                    if (has_img_cond) {
-                        // out_uncond + text_cfg_scale * (out_cond - out_img_cond) + image_cfg_scale * (out_img_cond - out_uncond)
-                        latent_result = negative_data[i] + img_cfg_scale * (img_cond_data[i] - negative_data[i]) + cfg_scale * (positive_data[i] - img_cond_data[i]);
-                    } else {
-                        // img_cfg_scale == cfg_scale
-                        latent_result = negative_data[i] + cfg_scale * (positive_data[i] - negative_data[i]);
-                    }
-                } else if (has_img_cond) {
-                    // img_cfg_scale == 1
-                    latent_result = img_cond_data[i] + cfg_scale * (positive_data[i] - img_cond_data[i]);
+            GGML_ASSERT(!cond_out.empty());
+            sd::Tensor<float> latent_result = cond_out;
+            if (!uncond_out.empty()) {
+                if (!img_cond_out.empty()) {
+                    latent_result = uncond_out +
+                                    img_cfg_scale * (img_cond_out - uncond_out) +
+                                    cfg_scale * (cond_out - img_cond_out);
+                } else {
+                    latent_result = uncond_out + cfg_scale * (cond_out - uncond_out);
                 }
-                if (is_skiplayer_step) {
-                    latent_result = latent_result + (positive_data[i] - skip_layer_data[i]) * slg_scale;
-                }
-                // v = latent_result, eps = latent_result
-                // denoised = (v * c_out + input * c_skip) or (input + eps * c_out)
-                vec_denoised[i] = latent_result * c_out + vec_input[i] * c_skip;
+            } else if (!img_cond_out.empty()) {
+                latent_result = img_cond_out + cfg_scale * (cond_out - img_cond_out);
             }
 
+            if (is_skiplayer_step && !skip_cond_out.empty()) {
+                latent_result += (cond_out - skip_cond_out) * slg_scale;
+            }
+            denoised = latent_result * c_out + x * c_skip;
             if (cache_runtime.spectrum_enabled) {
                 cache_runtime.spectrum.update(denoised);
             }
-
-            if (denoise_mask != nullptr) {
-                apply_mask(denoised, init_latent, denoise_mask);
+            if (!denoise_mask.empty()) {
+                denoised = denoised * denoise_mask + init_latent * (1.0f - denoise_mask);
             }
-
-            if (sd_preview_cb != nullptr && sd_should_preview_denoised()) {
-                if (step % sd_get_preview_interval() == 0) {
-                    preview_image(work_ctx, step, denoised, version, sd_preview_mode, preview_tensor, sd_preview_cb, sd_preview_cb_data, false);
-                }
-            }
-
-            int64_t t1 = ggml_time_us();
-            if (step > 0 || step == -(int)steps) {
-                int showstep = std::abs(step);
-                pretty_progress(showstep, (int)steps, (t1 - t0) / 1000000.f / showstep);
-                // LOG_INFO("step %d sampling completed taking %.2fs", step, (t1 - t0) * 1.0f / 1000000);
+            if (sd_should_preview_denoised() && preview.callback != nullptr) {
+                preview_image(step, denoised, version, preview.mode, preview.callback, preview.data, false);
             }
+            report_sample_progress(step, steps, t0);
             return denoised;
         };
 
-        if (!sample_k_diffusion(method, denoise, work_ctx, x, sigmas, sampler_rng, eta)) {
+        auto x0_opt = sample_k_diffusion(method, denoise, x_t, sigmas, sampler_rng, eta);
+        if (x0_opt.empty()) {
             LOG_ERROR("Diffusion model sampling failed");
             if (control_net) {
                 control_net->free_control_ctx();
                 control_net->free_compute_buffer();
             }
-            diffusion_model->free_compute_buffer();
-            return NULL;
+            if (work_diffusion_model) {
+                work_diffusion_model->free_compute_buffer();
+            }
+            return {};
         }
 
-        size_t total_steps = sigmas.size() > 0 ? sigmas.size() - 1 : 0;
-        log_sample_cache_summary(cache_runtime, total_steps);
-
+        auto x0 = std::move(x0_opt);
+        sd_sample::log_sample_cache_summary(cache_runtime, steps);
         if (inverse_noise_scaling) {
-            x = denoiser->inverse_noise_scaling(sigmas[sigmas.size() - 1], x);
+            x0 = denoiser->inverse_noise_scaling(sigmas[sigmas.size() - 1], x0);
         }
 
         if (control_net) {
             control_net->free_control_ctx();
             control_net->free_compute_buffer();
         }
-        work_diffusion_model->free_compute_buffer();
-        return x;
+        if (work_diffusion_model) {
+            work_diffusion_model->free_compute_buffer();
+        }
+        return x0;
     }
 
     int get_vae_scale_factor() {
@@ -2409,11 +1857,10 @@ public:
         return (h / vae_scale_factor) * (w / vae_scale_factor);
     }
 
-    ggml_tensor* generate_init_latent(ggml_context* work_ctx,
-                                      int width,
-                                      int height,
-                                      int frames = 1,
-                                      bool video = false) {
+    sd::Tensor<float> generate_init_latent(int width,  // returns the initial latent by value; W and H below are width/height divided by the VAE scale factor
+                                           int height,
+                                           int frames = 1,
+                                           bool video = false) {  // video selects the shape that includes the temporal extent T
         int vae_scale_factor = get_vae_scale_factor();
         int W                = width / vae_scale_factor;
         int H                = height / vae_scale_factor;
@@ -2422,34 +1869,35 @@ public:
             T = ((T - 1) / 4) + 1;
         }
         int C = get_latent_channel();
-        ggml_tensor* init_latent;
         if (video) {
-            init_latent = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, W, H, T, C);
-        } else {
-            init_latent = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, W, H, C, 1);
+            return sd::zeros<float>({W, H, T, C, 1});  // NOTE(review): removed code allocated a 4-D (W, H, T, C) tensor; this passes five extents -- confirm the trailing 1 is intended
         }
-        ggml_set_f32(init_latent, 0.f);
-        return init_latent;
+        return sd::zeros<float>({W, H, C, 1});  // image case: same (W, H, C, 1) extents as the removed allocation
     }
 
-    ggml_tensor* encode_to_vae_latents(ggml_context* work_ctx, ggml_tensor* x) {
-        ggml_tensor* vae_output = first_stage_model->encode(n_threads, work_ctx, x, vae_tiling_params, circular_x, circular_y);
-        ggml_tensor* latents    = first_stage_model->vae_output_to_latents(work_ctx, vae_output, rng);
+    sd::Tensor<float> encode_to_vae_latents(const sd::Tensor<float>& x) {  // encodes x with first_stage_model, then maps the encoder output to latents
+        auto latents = first_stage_model->encode(n_threads, x, vae_tiling_params, circular_x, circular_y);  // holds the raw encoder output until it is converted below
+        if (latents.empty()) {  // an empty encode result is passed on to the caller as an empty tensor
+            return {};
+        }
+        latents = first_stage_model->vae_output_to_latents(latents, rng);  // rng is handed to the conversion; presumably used for sampling -- TODO confirm
         return latents;
     }
 
-    ggml_tensor* encode_first_stage(ggml_context* work_ctx, ggml_tensor* x) {
-        ggml_tensor* latents = encode_to_vae_latents(work_ctx, x);
+    sd::Tensor<float> encode_first_stage(const sd::Tensor<float>& x) {  // encodes x to VAE latents and, except for VERSION_SD1_PIX2PIX, converts them to diffusion latents
+        auto latents = encode_to_vae_latents(x);
+        if (latents.empty()) {  // propagate an empty result from encode_to_vae_latents unchanged in meaning
+            return {};
+        }
         if (version != VERSION_SD1_PIX2PIX) {
-            latents = first_stage_model->vae_to_diffuison_latents(work_ctx, latents);
+            latents = first_stage_model->vae_to_diffusion_latents(latents);  // skipped for VERSION_SD1_PIX2PIX, which returns the VAE latents as-is
         }
         return latents;
     }
 
-    ggml_tensor* decode_first_stage(ggml_context* work_ctx, ggml_tensor* x, bool decode_video = false) {
-        x = first_stage_model->diffusion_to_vae_latents(work_ctx, x);
-        x = first_stage_model->decode(n_threads, work_ctx, x, vae_tiling_params, decode_video, circular_x, circular_y);
-        return x;
+    sd::Tensor<float> decode_first_stage(const sd::Tensor<float>& x, bool decode_video = false) {  // converts diffusion latents to VAE latents and decodes them; x itself is not modified
+        auto latents = first_stage_model->diffusion_to_vae_latents(x);  // result is not checked for empty() before decoding
+        return first_stage_model->decode(n_threads, latents, vae_tiling_params, decode_video, circular_x, circular_y);  // decode's return value is forwarded unchanged
     }
 
     void set_flow_shift(float flow_shift = INFINITY) {
@@ -2966,667 +2414,216 @@ enum scheduler_t sd_get_default_scheduler(const sd_ctx_t* sd_ctx, enum sample_me
     return DISCRETE_SCHEDULER;
 }
 
-sd_image_t* generate_image_internal(sd_ctx_t* sd_ctx,
-                                    ggml_context* work_ctx,
-                                    ggml_tensor* init_latent,
-                                    std::string prompt,
-                                    std::string negative_prompt,
-                                    int clip_skip,
-                                    sd_guidance_params_t guidance,
-                                    float eta,
-                                    int shifted_timestep,
-                                    int width,
-                                    int height,
-                                    enum sample_method_t sample_method,
-                                    const std::vector<float>& sigmas,
-                                    int64_t seed,
-                                    int batch_count,
-                                    sd_image_t control_image,
-                                    float control_strength,
-                                    sd_pm_params_t pm_params,
-                                    std::vector<sd_image_t*> ref_images,
-                                    std::vector<ggml_tensor*> ref_latents,
-                                    bool increase_ref_index,
-                                    ggml_tensor* concat_latent            = nullptr,
-                                    ggml_tensor* denoise_mask             = nullptr,
-                                    const sd_cache_params_t* cache_params = nullptr) {
-    if (seed < 0) {
-        // Generally, when using the provided command line, the seed is always >0.
-        // However, to prevent potential issues if 'stable-diffusion.cpp' is invoked as a library
-        // by a third party with a seed <0, let's incorporate randomization here.
-        srand((int)time(nullptr));
-        seed = rand();
+static int64_t resolve_seed(int64_t seed) {  // returns seed unchanged when it is >= 0; otherwise falls through to the srand(time)/rand() fallback at the end of this function
+    if (seed >= 0) {  // non-negative seeds are taken as explicit and passed through
+        return seed;
     }
-
-    if (!std::isfinite(guidance.img_cfg)) {
-        guidance.img_cfg = guidance.txt_cfg;
-    }
-
-    int sample_steps = static_cast<int>(sigmas.size() - 1);
-
-    int64_t t0 = ggml_time_ms();
-
-    ConditionerParams condition_params;
-    condition_params.text            = prompt;
-    condition_params.clip_skip       = clip_skip;
-    condition_params.width           = width;
-    condition_params.height          = height;
-    condition_params.ref_images      = ref_images;
-    condition_params.adm_in_channels = static_cast<int>(sd_ctx->sd->diffusion_model->get_adm_in_channels());
-
-    // Photo Maker
-    SDCondition id_cond = sd_ctx->sd->get_pmid_conditon(work_ctx, pm_params, condition_params);
-
-    // Get learned condition
-    condition_params.zero_out_masked = false;
-    SDCondition cond                 = sd_ctx->sd->cond_stage_model->get_learned_condition(work_ctx,
-                                                                                           sd_ctx->sd->n_threads,
-                                                                                           condition_params);
-
-    SDCondition uncond;
-    if (guidance.txt_cfg != 1.0 ||
-        (sd_version_is_inpaint_or_unet_edit(sd_ctx->sd->version) && guidance.txt_cfg != guidance.img_cfg)) {
-        bool zero_out_masked = false;
-        if (sd_version_is_sdxl(sd_ctx->sd->version) && negative_prompt.size() == 0 && !sd_ctx->sd->is_using_edm_v_parameterization) {
-            zero_out_masked = true;
-        }
-        condition_params.text            = negative_prompt;
-        condition_params.zero_out_masked = zero_out_masked;
-        uncond                           = sd_ctx->sd->cond_stage_model->get_learned_condition(work_ctx,
-                                                                                               sd_ctx->sd->n_threads,
-                                                                                               condition_params);
-    }
-    int64_t t1 = ggml_time_ms();
-    LOG_INFO("get_learned_condition completed, taking %" PRId64 " ms", t1 - t0);
-
-    if (sd_ctx->sd->free_params_immediately) {
-        sd_ctx->sd->cond_stage_model->free_params_buffer();
-    }
-
-    // Control net hint
-    ggml_tensor* image_hint = nullptr;
-    if (control_image.data != nullptr) {
-        image_hint = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, width, height, 3, 1);
-        sd_image_to_ggml_tensor(control_image, image_hint);
-    }
-
-    // Sample
-    std::vector<ggml_tensor*> final_latents;  // collect latents to decode
-    int C = sd_ctx->sd->get_latent_channel();
-    int W = width / sd_ctx->sd->get_vae_scale_factor();
-    int H = height / sd_ctx->sd->get_vae_scale_factor();
-
-    ggml_tensor* control_latent = nullptr;
-    if (sd_version_is_control(sd_ctx->sd->version) && image_hint != nullptr) {
-        control_latent = sd_ctx->sd->encode_first_stage(work_ctx, image_hint);
-        ggml_ext_tensor_scale_inplace(control_latent, control_strength);
-    }
-
-    if (sd_version_is_inpaint(sd_ctx->sd->version)) {
-        int64_t mask_channels = 1;
-        if (sd_ctx->sd->version == VERSION_FLUX_FILL) {
-            mask_channels = 8 * 8;  // flatten the whole mask
-        } else if (sd_ctx->sd->version == VERSION_FLEX_2) {
-            mask_channels = 1 + init_latent->ne[2];
-        }
-        auto empty_latent = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, init_latent->ne[0], init_latent->ne[1], mask_channels + init_latent->ne[2], 1);
-        // no mask, set the whole image as masked
-        for (int64_t x = 0; x < empty_latent->ne[0]; x++) {
-            for (int64_t y = 0; y < empty_latent->ne[1]; y++) {
-                if (sd_ctx->sd->version == VERSION_FLUX_FILL) {
-                    // TODO: this might be wrong
-                    for (int64_t c = 0; c < init_latent->ne[2]; c++) {
-                        ggml_ext_tensor_set_f32(empty_latent, 0, x, y, c);
-                    }
-                    for (int64_t c = init_latent->ne[2]; c < empty_latent->ne[2]; c++) {
-                        ggml_ext_tensor_set_f32(empty_latent, 1, x, y, c);
-                    }
-                } else if (sd_ctx->sd->version == VERSION_FLEX_2) {
-                    for (int64_t c = 0; c < empty_latent->ne[2]; c++) {
-                        // 0x16,1x1,0x16
-                        ggml_ext_tensor_set_f32(empty_latent, c == init_latent->ne[2], x, y, c);
-                    }
-                } else {
-                    ggml_ext_tensor_set_f32(empty_latent, 1, x, y, 0);
-                    for (int64_t c = 1; c < empty_latent->ne[2]; c++) {
-                        ggml_ext_tensor_set_f32(empty_latent, 0, x, y, c);
-                    }
-                }
-            }
-        }
-
-        if (sd_ctx->sd->version == VERSION_FLEX_2 && control_latent != nullptr && sd_ctx->sd->control_net == nullptr) {
-            bool no_inpaint = concat_latent == nullptr;
-            if (no_inpaint) {
-                concat_latent = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, init_latent->ne[0], init_latent->ne[1], mask_channels + init_latent->ne[2], 1);
-            }
-            // fill in the control image here
-            for (int64_t x = 0; x < control_latent->ne[0]; x++) {
-                for (int64_t y = 0; y < control_latent->ne[1]; y++) {
-                    if (no_inpaint) {
-                        for (int64_t c = 0; c < concat_latent->ne[2] - control_latent->ne[2]; c++) {
-                            // 0x16,1x1,0x16
-                            ggml_ext_tensor_set_f32(concat_latent, c == init_latent->ne[2], x, y, c);
-                        }
-                    }
-                    for (int64_t c = 0; c < control_latent->ne[2]; c++) {
-                        float v = ggml_ext_tensor_get_f32(control_latent, x, y, c);
-                        ggml_ext_tensor_set_f32(concat_latent, v, x, y, concat_latent->ne[2] - control_latent->ne[2] + c);
-                    }
-                }
-            }
-        } else if (concat_latent == nullptr) {
-            concat_latent = empty_latent;
-        }
-        cond.c_concat   = concat_latent;
-        uncond.c_concat = empty_latent;
-        denoise_mask    = nullptr;
-    } else if (sd_version_is_unet_edit(sd_ctx->sd->version)) {
-        auto empty_latent = ggml_dup_tensor(work_ctx, init_latent);
-        ggml_set_f32(empty_latent, 0);
-        uncond.c_concat = empty_latent;
-        cond.c_concat   = ref_latents[0];
-        if (cond.c_concat == nullptr) {
-            cond.c_concat = empty_latent;
-        }
-    } else if (sd_version_is_control(sd_ctx->sd->version)) {
-        auto empty_latent = ggml_dup_tensor(work_ctx, init_latent);
-        ggml_set_f32(empty_latent, 0);
-        uncond.c_concat = empty_latent;
-        if (sd_ctx->sd->control_net == nullptr) {
-            cond.c_concat = control_latent;
-        }
-        if (cond.c_concat == nullptr) {
-            cond.c_concat = empty_latent;
-        }
-    }
-    SDCondition img_cond;
-    if (uncond.c_crossattn != nullptr &&
-        (sd_version_is_inpaint_or_unet_edit(sd_ctx->sd->version) && guidance.txt_cfg != guidance.img_cfg)) {
-        img_cond = SDCondition(uncond.c_crossattn, uncond.c_vector, cond.c_concat);
-    }
-    for (int b = 0; b < batch_count; b++) {
-        int64_t sampling_start = ggml_time_ms();
-        int64_t cur_seed       = seed + b;
-        LOG_INFO("generating image: %i/%i - seed %" PRId64, b + 1, batch_count, cur_seed);
-
-        sd_ctx->sd->rng->manual_seed(cur_seed);
-        sd_ctx->sd->sampler_rng->manual_seed(cur_seed);
-        ggml_tensor* x_t   = init_latent;
-        ggml_tensor* noise = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, W, H, C, 1);
-        ggml_ext_im_set_randn_f32(noise, sd_ctx->sd->rng);
-
-        int start_merge_step = -1;
-        if (sd_ctx->sd->use_pmid) {
-            start_merge_step = int(sd_ctx->sd->pmid_model->style_strength / 100.f * sample_steps);
-            // if (start_merge_step > 30)
-            //     start_merge_step = 30;
-            LOG_INFO("PHOTOMAKER: start_merge_step: %d", start_merge_step);
-        }
-
-        ggml_tensor* x_0     = sd_ctx->sd->sample(work_ctx,
-                                                  sd_ctx->sd->diffusion_model,
-                                                  true,
-                                                  x_t,
-                                                  noise,
-                                                  cond,
-                                                  uncond,
-                                                  img_cond,
-                                                  image_hint,
-                                                  control_strength,
-                                                  guidance,
-                                                  eta,
-                                                  shifted_timestep,
-                                                  sample_method,
-                                                  sigmas,
-                                                  start_merge_step,
-                                                  id_cond,
-                                                  ref_latents,
-                                                  increase_ref_index,
-                                                  denoise_mask,
-                                                  nullptr,
-                                                  1.0f,
-                                                  cache_params);
-        int64_t sampling_end = ggml_time_ms();
-        if (x_0 != nullptr) {
-            // print_ggml_tensor(x_0);
-            LOG_INFO("sampling completed, taking %.2fs", (sampling_end - sampling_start) * 1.0f / 1000);
-            final_latents.push_back(x_0);
-        } else {
-            LOG_ERROR("sampling for image %d/%d failed after %.2fs", b + 1, batch_count, (sampling_end - sampling_start) * 1.0f / 1000);
-        }
-    }
-
-    if (sd_ctx->sd->free_params_immediately) {
-        sd_ctx->sd->diffusion_model->free_params_buffer();
-    }
-    int64_t t3 = ggml_time_ms();
-    LOG_INFO("generating %" PRId64 " latent images completed, taking %.2fs", final_latents.size(), (t3 - t1) * 1.0f / 1000);
-
-    // Decode to image
-    LOG_INFO("decoding %zu latents", final_latents.size());
-    std::vector<ggml_tensor*> decoded_images;  // collect decoded images
-    for (size_t i = 0; i < final_latents.size(); i++) {
-        t1               = ggml_time_ms();
-        ggml_tensor* img = sd_ctx->sd->decode_first_stage(work_ctx, final_latents[i] /* x_0 */);
-        // print_ggml_tensor(img);
-        if (img != nullptr) {
-            decoded_images.push_back(img);
-        }
-        int64_t t2 = ggml_time_ms();
-        LOG_INFO("latent %" PRId64 " decoded, taking %.2fs", i + 1, (t2 - t1) * 1.0f / 1000);
-    }
-
-    int64_t t4 = ggml_time_ms();
-    LOG_INFO("decode_first_stage completed, taking %.2fs", (t4 - t3) * 1.0f / 1000);
-    if (sd_ctx->sd->free_params_immediately) {
-        sd_ctx->sd->first_stage_model->free_params_buffer();
-    }
-
-    sd_ctx->sd->lora_stat();
-
-    sd_image_t* result_images = (sd_image_t*)calloc(batch_count, sizeof(sd_image_t));
-    if (result_images == nullptr) {
-        ggml_free(work_ctx);
-        return nullptr;
-    }
-    memset(result_images, 0, batch_count * sizeof(sd_image_t));
-
-    for (size_t i = 0; i < decoded_images.size(); i++) {
-        result_images[i].width   = width;
-        result_images[i].height  = height;
-        result_images[i].channel = 3;
-        result_images[i].data    = ggml_tensor_to_sd_image(decoded_images[i]);
-    }
-    ggml_free(work_ctx);
-
-    return result_images;
+    srand((int)time(nullptr));
+    return rand();
 }
 
-sd_image_t* generate_image(sd_ctx_t* sd_ctx, const sd_img_gen_params_t* sd_img_gen_params) {
-    sd_ctx->sd->vae_tiling_params = sd_img_gen_params->vae_tiling_params;
+static enum sample_method_t resolve_sample_method(sd_ctx_t* sd_ctx, enum sample_method_t sample_method) {
+    // When sample_method equals SAMPLE_METHOD_COUNT, substitute the default
+    // reported for this context; any other value is returned unchanged.
+    const bool use_default = (sample_method == SAMPLE_METHOD_COUNT);
+    return use_default ? sd_get_default_sample_method(sd_ctx) : sample_method;
+}
 
-    int width  = sd_img_gen_params->width;
-    int height = sd_img_gen_params->height;
+static scheduler_t resolve_scheduler(sd_ctx_t* sd_ctx,
+                                     scheduler_t scheduler,
+                                     enum sample_method_t sample_method) {
+    // When scheduler equals SCHEDULER_COUNT, ask the context for the default
+    // scheduler of the given sample method; otherwise keep the caller's value.
+    const bool use_default = (scheduler == SCHEDULER_COUNT);
+    return use_default ? sd_get_default_scheduler(sd_ctx, sample_method) : scheduler;
+}
 
-    int vae_scale_factor            = sd_ctx->sd->get_vae_scale_factor();
-    int diffusion_model_down_factor = sd_ctx->sd->get_diffusion_model_down_factor();
-    int spatial_multiple            = vae_scale_factor * diffusion_model_down_factor;
+struct GenerationRequest {
+    std::string prompt;
+    std::string negative_prompt;
+    int width                                = -1;
+    int height                               = -1;
+    int clip_skip                            = -1;
+    int vae_scale_factor                     = -1;
+    int diffusion_model_down_factor          = -1;
+    int64_t seed                             = -1;
+    bool use_uncond                          = false;
+    bool use_img_cond                        = false;
+    bool use_high_noise_uncond               = false;
+    bool use_high_noise_img_cond             = false;
+    const sd_cache_params_t* cache_params    = nullptr;
+    int batch_count                          = 1;
+    int shifted_timestep                     = 0;
+    float strength                           = 1.f;
+    float control_strength                   = 0.f;
+    float eta                                = 0.f;
+    bool increase_ref_index                  = false;
+    bool auto_resize_ref_image               = false;
+    sd_guidance_params_t guidance            = {};
+    sd_guidance_params_t high_noise_guidance = {};
+    sd_pm_params_t pm_params                 = {};
+    int frames                               = -1;
+    float vace_strength                      = 1.f;
+
+    GenerationRequest(sd_ctx_t* sd_ctx, const sd_img_gen_params_t* sd_img_gen_params) {
+        prompt                      = SAFE_STR(sd_img_gen_params->prompt);
+        negative_prompt             = SAFE_STR(sd_img_gen_params->negative_prompt);
+        width                       = sd_img_gen_params->width;
+        height                      = sd_img_gen_params->height;
+        vae_scale_factor            = sd_ctx->sd->get_vae_scale_factor();
+        diffusion_model_down_factor = sd_ctx->sd->get_diffusion_model_down_factor();
+        seed                        = sd_img_gen_params->seed;
+        batch_count                 = sd_img_gen_params->batch_count;
+        clip_skip                   = sd_img_gen_params->clip_skip;
+        shifted_timestep            = sd_img_gen_params->sample_params.shifted_timestep;
+        strength                    = sd_img_gen_params->strength;
+        control_strength            = sd_img_gen_params->control_strength;
+        eta                         = sd_img_gen_params->sample_params.eta;
+        increase_ref_index          = sd_img_gen_params->increase_ref_index;
+        auto_resize_ref_image       = sd_img_gen_params->auto_resize_ref_image;
+        guidance                    = sd_img_gen_params->sample_params.guidance;
+        pm_params                   = sd_img_gen_params->pm_params;
+        cache_params                = &sd_img_gen_params->cache;
+        resolve(sd_ctx);
+    }
+
+    GenerationRequest(sd_ctx_t* sd_ctx, const sd_vid_gen_params_t* sd_vid_gen_params) {
+        prompt                      = SAFE_STR(sd_vid_gen_params->prompt);
+        negative_prompt             = SAFE_STR(sd_vid_gen_params->negative_prompt);
+        width                       = sd_vid_gen_params->width;
+        height                      = sd_vid_gen_params->height;
+        frames                      = (sd_vid_gen_params->video_frames - 1) / 4 * 4 + 1;
+        clip_skip                   = sd_vid_gen_params->clip_skip;
+        vae_scale_factor            = sd_ctx->sd->get_vae_scale_factor();
+        diffusion_model_down_factor = sd_ctx->sd->get_diffusion_model_down_factor();
+        seed                        = sd_vid_gen_params->seed;
+        cache_params                = &sd_vid_gen_params->cache;
+        vace_strength               = sd_vid_gen_params->vace_strength;
+        guidance                    = sd_vid_gen_params->sample_params.guidance;
+        high_noise_guidance         = sd_vid_gen_params->high_noise_sample_params.guidance;
+        resolve(sd_ctx);
+    }
+
+    void align_generation_request_size() {
+        int spatial_multiple = vae_scale_factor * diffusion_model_down_factor;
+        int width_offset     = align_up_offset(width, spatial_multiple);
+        int height_offset    = align_up_offset(height, spatial_multiple);
+        if (width_offset <= 0 && height_offset <= 0) {
+            return;
+        }
+
+        int original_width  = width;
+        int original_height = height;
 
-    int width_offset  = align_up_offset(width, spatial_multiple);
-    int height_offset = align_up_offset(height, spatial_multiple);
-    if (width_offset > 0 || height_offset > 0) {
         width += width_offset;
         height += height_offset;
-        LOG_WARN("align up %dx%d to %dx%d (multiple=%d)", sd_img_gen_params->width, sd_img_gen_params->height, width, height, spatial_multiple);
+        LOG_WARN("align up %dx%d to %dx%d (multiple=%d)",
+                 original_width,
+                 original_height,
+                 width,
+                 height,
+                 spatial_multiple);
     }
 
-    bool circular_x = sd_ctx->sd->circular_x;
-    bool circular_y = sd_ctx->sd->circular_y;
-
-    if (!sd_img_gen_params->vae_tiling_params.enabled) {
-        if (sd_ctx->sd->first_stage_model) {
-            sd_ctx->sd->first_stage_model->set_circular_axes(sd_ctx->sd->circular_x, sd_ctx->sd->circular_y);
-        }
-        if (sd_ctx->sd->preview_vae) {
-            sd_ctx->sd->preview_vae->set_circular_axes(sd_ctx->sd->circular_x, sd_ctx->sd->circular_y);
-        }
-    } else {
-        int tile_size_x, tile_size_y;
-        float _overlap;
-        int latent_size_x = width / sd_ctx->sd->get_vae_scale_factor();
-        int latent_size_y = height / sd_ctx->sd->get_vae_scale_factor();
-        sd_ctx->sd->first_stage_model->get_tile_sizes(tile_size_x, tile_size_y, _overlap, sd_img_gen_params->vae_tiling_params, latent_size_x, latent_size_y);
-
-        // force disable circular padding for vae if tiling is enabled unless latent is smaller than tile size
-        // otherwise it will cause artifacts at the edges of the tiles
-        sd_ctx->sd->circular_x = sd_ctx->sd->circular_x && (tile_size_x >= latent_size_x);
-        sd_ctx->sd->circular_y = sd_ctx->sd->circular_y && (tile_size_y >= latent_size_y);
-
-        if (sd_ctx->sd->first_stage_model) {
-            sd_ctx->sd->first_stage_model->set_circular_axes(sd_ctx->sd->circular_x, sd_ctx->sd->circular_y);
-        }
-        if (sd_ctx->sd->preview_vae) {
-            sd_ctx->sd->preview_vae->set_circular_axes(sd_ctx->sd->circular_x, sd_ctx->sd->circular_y);
+    static void resolve_guidance(sd_ctx_t* sd_ctx,  // Normalizes *guidance in place and raises the use_* flags; sd_ctx supplies sd->version
+                                 sd_guidance_params_t* guidance,  // in/out, must not be null: img_cfg may be overwritten with txt_cfg
+                                 bool* use_uncond,  // out, must not be null: only ever set to true here, never cleared
+                                 bool* use_img_cond,  // out, must not be null: only ever set to true here, never cleared
+                                 const char* stage_name = nullptr) {  // optional text placed at the start of the warnings below
+        GGML_ASSERT(guidance != nullptr);
+        GGML_ASSERT(use_uncond != nullptr);
+        GGML_ASSERT(use_img_cond != nullptr);
+        // out_uncond + text_cfg_scale * (out_cond - out_img_cond) + image_cfg_scale * (out_img_cond - out_uncond)
+        // img_cfg == txt_cfg means that img_cfg is not used
+        if (!std::isfinite(guidance->img_cfg)) {  // a non-finite img_cfg falls back to txt_cfg
+            guidance->img_cfg = guidance->txt_cfg;
         }
 
-        // disable circular tiling if it's enabled for the VAE
-        sd_ctx->sd->circular_x = circular_x && (tile_size_x < latent_size_x);
-        sd_ctx->sd->circular_y = circular_y && (tile_size_y < latent_size_y);
-    }
-
-    LOG_DEBUG("generate_image %dx%d", width, height);
-    if (sd_ctx == nullptr || sd_img_gen_params == nullptr) {
-        return nullptr;
-    }
-
-    ggml_init_params params;
-    params.mem_size   = static_cast<size_t>(1024 * 1024) * 1024;  // 1G
-    params.mem_buffer = nullptr;
-    params.no_alloc   = false;
-    // LOG_DEBUG("mem_size %u ", params.mem_size);
-
-    ggml_context* work_ctx = ggml_init(params);
-    if (!work_ctx) {
-        LOG_ERROR("ggml_init() failed");
-        return nullptr;
-    }
-
-    int64_t seed = sd_img_gen_params->seed;
-    if (seed < 0) {
-        srand((int)time(nullptr));
-        seed = rand();
-    }
-    sd_ctx->sd->rng->manual_seed(seed);
-    sd_ctx->sd->sampler_rng->manual_seed(seed);
-
-    size_t t0 = ggml_time_ms();
-
-    sd_ctx->sd->set_flow_shift(sd_img_gen_params->sample_params.flow_shift);
-
-    // Apply lora
-    sd_ctx->sd->apply_loras(sd_img_gen_params->loras, sd_img_gen_params->lora_count);
-
-    enum sample_method_t sample_method = sd_img_gen_params->sample_params.sample_method;
-    if (sample_method == SAMPLE_METHOD_COUNT) {
-        sample_method = sd_get_default_sample_method(sd_ctx);
-    }
-    LOG_INFO("sampling using %s method", sampling_methods_str[sample_method]);
-
-    int sample_steps = sd_img_gen_params->sample_params.sample_steps;
-    std::vector<float> sigmas;
-    if (sd_img_gen_params->sample_params.custom_sigmas_count > 0) {
-        sigmas = std::vector<float>(sd_img_gen_params->sample_params.custom_sigmas,
-                                    sd_img_gen_params->sample_params.custom_sigmas + sd_img_gen_params->sample_params.custom_sigmas_count);
-        if (sample_steps != sigmas.size() - 1) {
-            sample_steps = static_cast<int>(sigmas.size()) - 1;
-            LOG_WARN("sample_steps != custom_sigmas_count - 1, set sample_steps to %d", sample_steps);
+        if (!sd_version_is_inpaint_or_unet_edit(sd_ctx->sd->version)) {  // other versions always get img_cfg == txt_cfg
+            guidance->img_cfg = guidance->txt_cfg;
         }
-    } else {
-        scheduler_t scheduler = sd_img_gen_params->sample_params.scheduler;
-        if (scheduler == SCHEDULER_COUNT) {
-            scheduler = sd_get_default_scheduler(sd_ctx, sample_method);
+
+        if (guidance->txt_cfg != 1.f) {  // any txt_cfg other than exactly 1 requests the uncond branch
+            *use_uncond = true;
         }
-        sigmas = sd_ctx->sd->denoiser->get_sigmas(sample_steps,
-                                                  sd_ctx->sd->get_image_seq_len(height, width),
-                                                  scheduler,
-                                                  sd_ctx->sd->version);
-    }
 
-    ggml_tensor* init_latent   = nullptr;
-    ggml_tensor* concat_latent = nullptr;
-    ggml_tensor* denoise_mask  = nullptr;
-    if (sd_img_gen_params->init_image.data) {
-        LOG_INFO("IMG2IMG");
+        if (guidance->img_cfg != guidance->txt_cfg) {  // a distinct img_cfg requests both img_cond and uncond
+            *use_img_cond = true;
+            *use_uncond   = true;
+        }
 
-        size_t t_enc = static_cast<size_t>(sample_steps * sd_img_gen_params->strength);
-        if (t_enc == sample_steps)
-            t_enc--;
-        LOG_INFO("target t_enc is %zu steps", t_enc);
-        std::vector<float> sigma_sched;
-        sigma_sched.assign(sigmas.begin() + sample_steps - t_enc - 1, sigmas.end());
-        sigmas = sigma_sched;
-
-        ggml_tensor* init_img = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, width, height, 3, 1);
-        ggml_tensor* mask_img = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, width, height, 1, 1);
-
-        sd_image_to_ggml_tensor(sd_img_gen_params->mask_image, mask_img);
-        sd_image_to_ggml_tensor(sd_img_gen_params->init_image, init_img);
-
-        if (sd_version_is_inpaint(sd_ctx->sd->version)) {
-            int64_t mask_channels = 1;
-            if (sd_ctx->sd->version == VERSION_FLUX_FILL) {
-                mask_channels = vae_scale_factor * vae_scale_factor;  // flatten the whole mask
-            } else if (sd_ctx->sd->version == VERSION_FLEX_2) {
-                mask_channels = 1 + sd_ctx->sd->get_latent_channel();
-            }
-            ggml_tensor* masked_latent = nullptr;
-
-            if (sd_ctx->sd->version != VERSION_FLEX_2) {
-                // most inpaint models mask before vae
-                ggml_tensor* masked_img = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, width, height, 3, 1);
-                ggml_ext_tensor_apply_mask(init_img, mask_img, masked_img);
-                masked_latent = sd_ctx->sd->encode_first_stage(work_ctx, masked_img);
-                init_latent   = sd_ctx->sd->encode_first_stage(work_ctx, init_img);
+        if (guidance->txt_cfg < 1.f) {  // warn only; txt_cfg itself is left as given
+            const char* prefix = stage_name == nullptr ? "" : stage_name;  // null stage_name -> empty prefix
+            if (guidance->txt_cfg == 0.f) {
+                LOG_WARN("%sunconditioned mode, images won't follow the prompt (use cfg-scale=1 for distilled models)",
+                         prefix);
             } else {
-                // mask after vae
-                init_latent   = sd_ctx->sd->encode_first_stage(work_ctx, init_img);
-                masked_latent = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, init_latent->ne[0], init_latent->ne[1], init_latent->ne[2], 1);
-                ggml_ext_tensor_apply_mask(init_latent, mask_img, masked_latent, 0.);
-            }
-            concat_latent = ggml_new_tensor_4d(work_ctx,
-                                               GGML_TYPE_F32,
-                                               masked_latent->ne[0],
-                                               masked_latent->ne[1],
-                                               mask_channels + masked_latent->ne[2],
-                                               1);
-            for (int ix = 0; ix < masked_latent->ne[0]; ix++) {
-                for (int iy = 0; iy < masked_latent->ne[1]; iy++) {
-                    int mx = ix * vae_scale_factor;
-                    int my = iy * vae_scale_factor;
-                    if (sd_ctx->sd->version == VERSION_FLUX_FILL) {
-                        for (int k = 0; k < masked_latent->ne[2]; k++) {
-                            float v = ggml_ext_tensor_get_f32(masked_latent, ix, iy, k);
-                            ggml_ext_tensor_set_f32(concat_latent, v, ix, iy, k);
-                        }
-                        // "Encode" 8x8 mask chunks into a flattened 1x64 vector, and concatenate to masked image
-                        for (int x = 0; x < vae_scale_factor; x++) {
-                            for (int y = 0; y < vae_scale_factor; y++) {
-                                float m = ggml_ext_tensor_get_f32(mask_img, mx + x, my + y);
-                                // TODO: check if the way the mask is flattened is correct (is it supposed to be x*vae_scale_factor+y or x+vae_scale_factor*y?)
-                                // python code was using "b (h vae_scale_factor) (w vae_scale_factor) -> b (vae_scale_factor vae_scale_factor) h w"
-                                ggml_ext_tensor_set_f32(concat_latent, m, ix, iy, masked_latent->ne[2] + x * vae_scale_factor + y);
-                            }
-                        }
-                    } else if (sd_ctx->sd->version == VERSION_FLEX_2) {
-                        float m = ggml_ext_tensor_get_f32(mask_img, mx, my);
-                        // masked image
-                        for (int k = 0; k < masked_latent->ne[2]; k++) {
-                            float v = ggml_ext_tensor_get_f32(masked_latent, ix, iy, k);
-                            ggml_ext_tensor_set_f32(concat_latent, v, ix, iy, k);
-                        }
-                        // downsampled mask
-                        ggml_ext_tensor_set_f32(concat_latent, m, ix, iy, masked_latent->ne[2]);
-                        // control (todo: support this)
-                        for (int k = 0; k < masked_latent->ne[2]; k++) {
-                            ggml_ext_tensor_set_f32(concat_latent, 0, ix, iy, masked_latent->ne[2] + 1 + k);
-                        }
-                    } else {
-                        float m = ggml_ext_tensor_get_f32(mask_img, mx, my);
-                        ggml_ext_tensor_set_f32(concat_latent, m, ix, iy, 0);
-                        for (int k = 0; k < masked_latent->ne[2]; k++) {
-                            float v = ggml_ext_tensor_get_f32(masked_latent, ix, iy, k);
-                            ggml_ext_tensor_set_f32(concat_latent, v, ix, iy, k + mask_channels);
-                        }
-                    }
-                }
-            }
-        } else {
-            init_latent = sd_ctx->sd->encode_first_stage(work_ctx, init_img);
-        }
-
-        {
-            // LOG_WARN("Inpainting with a base model is not great");
-            denoise_mask = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, width / vae_scale_factor, height / vae_scale_factor, 1, 1);
-            for (int ix = 0; ix < denoise_mask->ne[0]; ix++) {
-                for (int iy = 0; iy < denoise_mask->ne[1]; iy++) {
-                    int mx  = ix * vae_scale_factor;
-                    int my  = iy * vae_scale_factor;
-                    float m = ggml_ext_tensor_get_f32(mask_img, mx, my);
-                    ggml_ext_tensor_set_f32(denoise_mask, m, ix, iy);
-                }
+                LOG_WARN("%scfg value out of expected range may produce unexpected results", prefix);
             }
         }
-    } else {
-        LOG_INFO("TXT2IMG");
-        if (sd_version_is_inpaint(sd_ctx->sd->version)) {
-            LOG_WARN("This is an inpainting model, this should only be used in img2img mode with a mask");
-        }
-        init_latent = sd_ctx->sd->generate_init_latent(work_ctx, width, height);
     }
 
-    sd_guidance_params_t guidance = sd_img_gen_params->sample_params.guidance;
-    std::vector<sd_image_t*> ref_images;
-    for (int i = 0; i < sd_img_gen_params->ref_images_count; i++) {
-        ref_images.push_back(&sd_img_gen_params->ref_images[i]);
-    }
+    void resolve(sd_ctx_t* sd_ctx) {  // Runs, in order: size alignment, seed resolution, guidance resolution, and the shifted_timestep check
+        align_generation_request_size();
+        seed = resolve_seed(seed);
 
-    std::vector<uint8_t> empty_image_data;
-    sd_image_t empty_image = {(uint32_t)width, (uint32_t)height, 3, nullptr};
-    if (ref_images.empty() && sd_version_is_unet_edit(sd_ctx->sd->version)) {
-        LOG_WARN("This model needs at least one reference image; using an empty reference");
-        empty_image_data.resize(width * height * 3);
-        ref_images.push_back(&empty_image);
-        empty_image.data = empty_image_data.data();
-        guidance.img_cfg = 0.f;
-    }
-
-    if (ref_images.size() > 0) {
-        LOG_INFO("EDIT mode");
-    }
-
-    std::vector<ggml_tensor*> ref_latents;
-    for (int i = 0; i < ref_images.size(); i++) {
-        ggml_tensor* img;
-        if (sd_img_gen_params->auto_resize_ref_image) {
-            LOG_DEBUG("auto resize ref images");
-            sd_image_f32_t ref_image = sd_image_t_to_sd_image_f32_t(*ref_images[i]);
-            int VAE_IMAGE_SIZE       = std::min(1024 * 1024, width * height);
-            double vae_width         = sqrt(VAE_IMAGE_SIZE * ref_image.width / ref_image.height);
-            double vae_height        = vae_width * ref_image.height / ref_image.width;
-
-            int factor = 16;
-            if (sd_version_is_qwen_image(sd_ctx->sd->version)) {
-                factor = 32;
-            }
-
-            vae_height = round(vae_height / factor) * factor;
-            vae_width  = round(vae_width / factor) * factor;
-
-            sd_image_f32_t resized_image = resize_sd_image_f32_t(ref_image, static_cast<int>(vae_width), static_cast<int>(vae_height));
-            free(ref_image.data);
-            ref_image.data = nullptr;
-
-            LOG_DEBUG("resize vae ref image %d from %dx%d to %dx%d", i, ref_image.height, ref_image.width, resized_image.height, resized_image.width);
-
-            img = ggml_new_tensor_4d(work_ctx,
-                                     GGML_TYPE_F32,
-                                     resized_image.width,
-                                     resized_image.height,
-                                     3,
-                                     1);
-            sd_image_f32_to_ggml_tensor(resized_image, img);
-            free(resized_image.data);
-            resized_image.data = nullptr;
-        } else {
-            img = ggml_new_tensor_4d(work_ctx,
-                                     GGML_TYPE_F32,
-                                     ref_images[i]->width,
-                                     ref_images[i]->height,
-                                     3,
-                                     1);
-            sd_image_to_ggml_tensor(*ref_images[i], img);
+        resolve_guidance(sd_ctx, &guidance, &use_uncond, &use_img_cond);
+        if (sd_ctx->sd->high_noise_diffusion_model) {  // high-noise guidance is resolved only when that model is present
+            resolve_guidance(sd_ctx,
+                             &high_noise_guidance,
+                             &use_high_noise_uncond,
+                             &use_high_noise_img_cond,
+                             "high noise: ");  // prefix for warnings emitted for this stage
         }
 
-        // print_ggml_tensor(img, false, "img");
-
-        ggml_tensor* latent = sd_ctx->sd->encode_first_stage(work_ctx, img);
-        ref_latents.push_back(latent);
-    }
-
-    if (sd_img_gen_params->init_image.data != nullptr || sd_img_gen_params->ref_images_count > 0) {
-        size_t t1 = ggml_time_ms();
-        LOG_INFO("encode_first_stage completed, taking %.2fs", (t1 - t0) * 1.0f / 1000);
-    }
-
-    sd_image_t* result_images = generate_image_internal(sd_ctx,
-                                                        work_ctx,
-                                                        init_latent,
-                                                        SAFE_STR(sd_img_gen_params->prompt),
-                                                        SAFE_STR(sd_img_gen_params->negative_prompt),
-                                                        sd_img_gen_params->clip_skip,
-                                                        guidance,
-                                                        sd_img_gen_params->sample_params.eta,
-                                                        sd_img_gen_params->sample_params.shifted_timestep,
-                                                        width,
-                                                        height,
-                                                        sample_method,
-                                                        sigmas,
-                                                        seed,
-                                                        sd_img_gen_params->batch_count,
-                                                        sd_img_gen_params->control_image,
-                                                        sd_img_gen_params->control_strength,
-                                                        sd_img_gen_params->pm_params,
-                                                        ref_images,
-                                                        ref_latents,
-                                                        sd_img_gen_params->increase_ref_index,
-                                                        concat_latent,
-                                                        denoise_mask,
-                                                        &sd_img_gen_params->cache);
-
-    // restore circular params
-    sd_ctx->sd->circular_x = circular_x;
-    sd_ctx->sd->circular_y = circular_y;
-
-    size_t t2 = ggml_time_ms();
-
-    LOG_INFO("generate_image completed in %.2fs", (t2 - t0) * 1.0f / 1000);
-
-    return result_images;
-}
-
-SD_API sd_image_t* generate_video(sd_ctx_t* sd_ctx, const sd_vid_gen_params_t* sd_vid_gen_params, int* num_frames_out) {
-    if (sd_ctx == nullptr || sd_vid_gen_params == nullptr) {
-        return nullptr;
-    }
-    sd_ctx->sd->vae_tiling_params = sd_vid_gen_params->vae_tiling_params;
-
-    std::string prompt          = SAFE_STR(sd_vid_gen_params->prompt);
-    std::string negative_prompt = SAFE_STR(sd_vid_gen_params->negative_prompt);
-
-    int width        = sd_vid_gen_params->width;
-    int height       = sd_vid_gen_params->height;
-    int frames       = sd_vid_gen_params->video_frames;
-    frames           = (frames - 1) / 4 * 4 + 1;
-    int sample_steps = sd_vid_gen_params->sample_params.sample_steps;
-
-    int vae_scale_factor            = sd_ctx->sd->get_vae_scale_factor();
-    int diffusion_model_down_factor = sd_ctx->sd->get_diffusion_model_down_factor();
-    int spatial_multiple            = vae_scale_factor * diffusion_model_down_factor;
-
-    int width_offset  = align_up_offset(width, spatial_multiple);
-    int height_offset = align_up_offset(height, spatial_multiple);
-    if (width_offset > 0 || height_offset > 0) {
-        width += width_offset;
-        height += height_offset;
-        LOG_WARN("align up %dx%d to %dx%d (multiple=%d)", sd_vid_gen_params->width, sd_vid_gen_params->height, width, height, spatial_multiple);
-    }
-    LOG_INFO("generate_video %dx%dx%d", width, height, frames);
-
-    sd_ctx->sd->set_flow_shift(sd_vid_gen_params->sample_params.flow_shift);
-
-    enum sample_method_t sample_method = sd_vid_gen_params->sample_params.sample_method;
-    if (sample_method == SAMPLE_METHOD_COUNT) {
-        sample_method = sd_get_default_sample_method(sd_ctx);
-    }
-    LOG_INFO("sampling using %s method", sampling_methods_str[sample_method]);
-
-    int high_noise_sample_steps = 0;
-    if (sd_ctx->sd->high_noise_diffusion_model) {
-        high_noise_sample_steps = sd_vid_gen_params->high_noise_sample_params.sample_steps;
-    }
-
-    int total_steps = sample_steps;
-
-    if (high_noise_sample_steps > 0) {
-        total_steps += high_noise_sample_steps;
+        if (shifted_timestep > 0 && !sd_version_is_sdxl(sd_ctx->sd->version)) {  // non-SDXL versions: warn and reset shifted_timestep to 0
+            LOG_WARN("timestep shifting is only supported for SDXL models!");
+            shifted_timestep = 0;
+        }
     }
+};
 
+struct SamplePlan {
+    enum sample_method_t sample_method            = SAMPLE_METHOD_COUNT;
+    enum sample_method_t high_noise_sample_method = SAMPLE_METHOD_COUNT;
+    int sample_steps                              = 0;
+    int high_noise_sample_steps                   = 0;
+    int total_steps                               = 0;
+    float moe_boundary                            = 0.f;
+    int start_merge_step                          = -1;
     std::vector<float> sigmas;
-    if (sd_vid_gen_params->sample_params.custom_sigmas_count > 0) {
-        sigmas = std::vector<float>(sd_vid_gen_params->sample_params.custom_sigmas,
-                                    sd_vid_gen_params->sample_params.custom_sigmas + sd_vid_gen_params->sample_params.custom_sigmas_count);
-        if (total_steps != sigmas.size() - 1) {
+
+    SamplePlan(sd_ctx_t* sd_ctx,
+               const sd_img_gen_params_t* sd_img_gen_params,
+               const GenerationRequest& request) {
+        sample_method = sd_img_gen_params->sample_params.sample_method;
+        sample_steps  = sd_img_gen_params->sample_params.sample_steps;
+        resolve(sd_ctx, &request, &sd_img_gen_params->sample_params);
+    }
+
+    SamplePlan(sd_ctx_t* sd_ctx,
+               const sd_vid_gen_params_t* sd_vid_gen_params,
+               const GenerationRequest& request) {
+        sample_method = sd_vid_gen_params->sample_params.sample_method;
+        sample_steps  = sd_vid_gen_params->sample_params.sample_steps;
+        if (sd_ctx->sd->high_noise_diffusion_model) {
+            high_noise_sample_steps  = sd_vid_gen_params->high_noise_sample_params.sample_steps;
+            high_noise_sample_method = sd_vid_gen_params->high_noise_sample_params.sample_method;
+        }
+        moe_boundary = sd_vid_gen_params->moe_boundary;
+        resolve(sd_ctx, &request, &sd_vid_gen_params->sample_params);
+    }
+
+    void resolve(sd_ctx_t* sd_ctx,
+                 const GenerationRequest* request,
+                 const sd_sample_params_t* sample_params) {
+        sample_method = resolve_sample_method(sd_ctx, sample_method);
+
+        total_steps = sample_steps + std::max(0, high_noise_sample_steps);
+
+        if (sample_params->custom_sigmas_count > 0) {
+            sigmas      = std::vector<float>(sample_params->custom_sigmas,
+                                        sample_params->custom_sigmas + sample_params->custom_sigmas_count);
             total_steps = static_cast<int>(sigmas.size()) - 1;
             LOG_WARN("total_steps != custom_sigmas_count - 1, set total_steps to %d", total_steps);
             if (sample_steps >= total_steps) {
@@ -3637,60 +2634,559 @@ SD_API sd_image_t* generate_video(sd_ctx_t* sd_ctx, const sd_vid_gen_params_t* s
                 high_noise_sample_steps = total_steps - sample_steps;
                 LOG_WARN("total_steps != custom_sigmas_count - 1, set high_noise_sample_steps to %d", high_noise_sample_steps);
             }
+        } else {
+            scheduler_t scheduler = resolve_scheduler(sd_ctx,
+                                                      sample_params->scheduler,
+                                                      sample_method);
+            sigmas                = sd_ctx->sd->denoiser->get_sigmas(total_steps,
+                                                                     sd_ctx->sd->get_image_seq_len(request->height, request->width),
+                                                                     scheduler,
+                                                                     sd_ctx->sd->version);
         }
-    } else {
-        scheduler_t scheduler = sd_vid_gen_params->sample_params.scheduler;
-        if (scheduler == SCHEDULER_COUNT) {
-            scheduler = sd_get_default_scheduler(sd_ctx, sample_method);
-        }
-        sigmas = sd_ctx->sd->denoiser->get_sigmas(total_steps,
-                                                  0,
-                                                  scheduler,
-                                                  sd_ctx->sd->version);
-    }
 
-    if (high_noise_sample_steps < 0) {
-        // timesteps ∝ sigmas for Flow models (like wan2.2 a14b)
-        for (size_t i = 0; i < sigmas.size(); ++i) {
-            if (sigmas[i] < sd_vid_gen_params->moe_boundary) {
-                high_noise_sample_steps = static_cast<int>(i);
-                break;
+        if (high_noise_sample_steps < 0) {
+            for (size_t i = 0; i < sigmas.size(); ++i) {
+                if (sigmas[i] < moe_boundary) {
+                    high_noise_sample_steps = static_cast<int>(i);
+                    break;
+                }
             }
+            LOG_DEBUG("switching from high noise model at step %d", high_noise_sample_steps);
         }
-        LOG_DEBUG("switching from high noise model at step %d", high_noise_sample_steps);
+
+        LOG_INFO("sampling using %s method", sampling_methods_str[sample_method]);
+        if (high_noise_sample_steps > 0) {
+            high_noise_sample_method = resolve_sample_method(sd_ctx,
+                                                             high_noise_sample_method);
+            LOG_INFO("sampling(high noise) using %s method", sampling_methods_str[high_noise_sample_method]);
+        }
+
+        if (sd_ctx->sd->use_pmid) {
+            start_merge_step = int(sd_ctx->sd->pmid_model->style_strength / 100.f * total_steps);
+            LOG_INFO("PHOTOMAKER: start_merge_step: %d", start_merge_step);
+        }
+    }
+};
+
+struct ImageGenerationLatents {  // Bundle of float tensors for one image-generation run; field roles below are read from their names — TODO confirm against the code that fills them
+    sd::Tensor<float> init_latent;
+    sd::Tensor<float> concat_latent;
+    sd::Tensor<float> uncond_concat_latent;
+    sd::Tensor<float> control_image;
+    std::vector<sd::Tensor<float>> ref_images;  // presumably one tensor per reference image — verify
+    std::vector<sd::Tensor<float>> ref_latents;  // presumably one encoded latent per reference image — verify
+    sd::Tensor<float> denoise_mask;
+    sd::Tensor<float> clip_vision_output;
+    sd::Tensor<float> vace_context;
+    int64_t ref_image_num = 0;  // the only member with an explicit initializer
+};
+
+struct ImageGenerationEmbeds {  // Groups four SDCondition values; roles are read from the member names — TODO confirm against the code that fills them
+    SDCondition cond;
+    SDCondition uncond;
+    SDCondition img_cond;
+    SDCondition id_cond;
+};
+
+struct CircularAxesState {  // Pair of per-axis circular flags; both default to false
+    bool circular_x = false;  // flag for the x axis
+    bool circular_y = false;  // flag for the y axis
+};
+
+static CircularAxesState configure_image_vae_axes(sd_ctx_t* sd_ctx,  // Pushes circular-axis flags to the VAE object(s), adjusts sd_ctx->sd's flags when VAE tiling is enabled, and returns the flags as they were on entry
+                                                  const sd_img_gen_params_t* sd_img_gen_params,  // only vae_tiling_params is read
+                                                  const GenerationRequest& request) {  // width, height and vae_scale_factor are read
+    CircularAxesState original_axes = {sd_ctx->sd->circular_x, sd_ctx->sd->circular_y};  // snapshot taken before any modification
+
+    if (!sd_img_gen_params->vae_tiling_params.enabled) {  // tiling disabled: forward the current flags unchanged and return
+        if (sd_ctx->sd->first_stage_model) {
+            sd_ctx->sd->first_stage_model->set_circular_axes(sd_ctx->sd->circular_x, sd_ctx->sd->circular_y);
+        }
+        if (sd_ctx->sd->preview_vae) {
+            sd_ctx->sd->preview_vae->set_circular_axes(sd_ctx->sd->circular_x, sd_ctx->sd->circular_y);
+        }
+        return original_axes;
     }
 
-    ggml_init_params params;
-    params.mem_size   = static_cast<size_t>(1024 * 1024) * 1024;  // 1G
-    params.mem_buffer = nullptr;
-    params.no_alloc   = false;
-    // LOG_DEBUG("mem_size %u ", params.mem_size);
+    int tile_size_x, tile_size_y;
+    float overlap;  // filled by get_tile_sizes; not read afterwards in this function
+    int latent_size_x = request.width / request.vae_scale_factor;
+    int latent_size_y = request.height / request.vae_scale_factor;
+    sd_ctx->sd->first_stage_model->get_tile_sizes(tile_size_x,  // NOTE(review): first_stage_model is dereferenced here but null-checked further down — confirm it cannot be null when tiling is enabled
+                                                  tile_size_y,
+                                                  overlap,
+                                                  sd_img_gen_params->vae_tiling_params,
+                                                  latent_size_x,
+                                                  latent_size_y);
 
-    ggml_context* work_ctx = ggml_init(params);
-    if (!work_ctx) {
-        LOG_ERROR("ggml_init() failed");
+    sd_ctx->sd->circular_x = sd_ctx->sd->circular_x && (tile_size_x >= latent_size_x);  // value handed to the VAE: kept only when the tile is at least as large as the latent on this axis
+    sd_ctx->sd->circular_y = sd_ctx->sd->circular_y && (tile_size_y >= latent_size_y);
+
+    if (sd_ctx->sd->first_stage_model) {
+        sd_ctx->sd->first_stage_model->set_circular_axes(sd_ctx->sd->circular_x, sd_ctx->sd->circular_y);
+    }
+    if (sd_ctx->sd->preview_vae) {
+        sd_ctx->sd->preview_vae->set_circular_axes(sd_ctx->sd->circular_x, sd_ctx->sd->circular_y);
+    }
+
+    sd_ctx->sd->circular_x = original_axes.circular_x && (tile_size_x < latent_size_x);  // value left on sd_ctx->sd: the complementary case, original flag kept only when the tile is smaller than the latent
+    sd_ctx->sd->circular_y = original_axes.circular_y && (tile_size_y < latent_size_y);
+
+    return original_axes;
+}
+
+static void restore_image_vae_axes(sd_ctx_t* sd_ctx, const CircularAxesState& original_axes) {  // Writes the saved flags back to sd_ctx->sd; set_circular_axes is not called on any VAE object here
+    sd_ctx->sd->circular_x = original_axes.circular_x;
+    sd_ctx->sd->circular_y = original_axes.circular_y;
+}
+
+class ImageVaeAxesGuard {  // Scope guard: the constructor runs configure_image_vae_axes, the destructor runs restore_image_vae_axes
+private:
+    sd_ctx_t* sd_ctx = nullptr;  // stored as a raw pointer and used again in the destructor
+    CircularAxesState original_axes;  // flags returned by configure_image_vae_axes at construction
+
+public:
+    ImageVaeAxesGuard(sd_ctx_t* sd_ctx,
+                      const sd_img_gen_params_t* sd_img_gen_params,
+                      const GenerationRequest& request)
+        : sd_ctx(sd_ctx),
+          original_axes(configure_image_vae_axes(sd_ctx, sd_img_gen_params, request)) {}  // configuration happens as part of member initialization
+
+    ~ImageVaeAxesGuard() {
+        restore_image_vae_axes(sd_ctx, original_axes);  // puts sd_ctx->sd->circular_x / circular_y back to the saved values
+    }
+
+    ImageVaeAxesGuard(const ImageVaeAxesGuard&)            = delete;  // copying is disabled
+    ImageVaeAxesGuard& operator=(const ImageVaeAxesGuard&) = delete;
+};
+
+static std::optional<ImageGenerationLatents> prepare_image_generation_latents(sd_ctx_t* sd_ctx,
+                                                                              const sd_img_gen_params_t* sd_img_gen_params,
+                                                                              GenerationRequest* request,
+                                                                              SamplePlan* plan) {
+    int64_t prepare_start_ms = ggml_time_ms();
+
+    sd::Tensor<float> init_image_tensor;
+    sd::Tensor<float> control_image_tensor;
+    sd::Tensor<float> mask_image_tensor;
+
+    if (sd_img_gen_params->init_image.data != nullptr) {
+        LOG_INFO("IMG2IMG");
+
+        if (request->strength < 1.f) {
+            size_t t_enc = static_cast<size_t>(plan->sample_steps * request->strength);
+            if (t_enc == static_cast<size_t>(plan->sample_steps)) {
+                t_enc--;
+            }
+            LOG_INFO("target t_enc is %zu steps", t_enc);
+            std::vector<float> sigma_sched;
+            sigma_sched.assign(plan->sigmas.begin() + plan->sample_steps - t_enc - 1, plan->sigmas.end());
+            plan->sigmas       = std::move(sigma_sched);
+            plan->sample_steps = static_cast<int>(plan->sigmas.size() - 1);
+        }
+
+        init_image_tensor = sd_image_to_tensor(sd_img_gen_params->init_image, request->width, request->height);
+    }
+
+    if (sd_img_gen_params->mask_image.data != nullptr) {
+        mask_image_tensor = sd_image_to_tensor(sd_img_gen_params->mask_image, request->width, request->height);
+        mask_image_tensor = sd::ops::round(mask_image_tensor);
+    }
+
+    if (sd_img_gen_params->control_image.data != nullptr) {
+        control_image_tensor = sd_image_to_tensor(sd_img_gen_params->control_image, request->width, request->height);
+    }
+
+    if (init_image_tensor.empty() || mask_image_tensor.empty()) {
+        if (sd_version_is_inpaint(sd_ctx->sd->version)) {
+            LOG_WARN("inpainting model requires both an init image and a mask image.");
+        }
+    }
+
+    if (mask_image_tensor.empty()) {
+        mask_image_tensor = sd::full<float>({request->width, request->height, 1, 1}, 1.f);
+    }
+
+    sd::Tensor<float> latent_mask = sd::ops::interpolate(mask_image_tensor,
+                                                         {request->width / request->vae_scale_factor,
+                                                          request->height / request->vae_scale_factor,
+                                                          1,
+                                                          1});
+
+    sd::Tensor<float> init_latent;
+    sd::Tensor<float> control_latent;
+    if (init_image_tensor.empty()) {
+        init_latent = sd_ctx->sd->generate_init_latent(request->width, request->height);
+    } else {
+        init_latent = sd_ctx->sd->encode_first_stage(init_image_tensor);
+        if (init_latent.empty()) {
+            LOG_ERROR("failed to encode init image");
+            return std::nullopt;
+        }
+    }
+
+    if (!control_image_tensor.empty() && !sd_ctx->sd->vae_decode_only) {
+        control_latent = sd_ctx->sd->encode_first_stage(control_image_tensor);
+        if (control_latent.empty()) {
+            LOG_ERROR("failed to encode control image");
+            return std::nullopt;
+        }
+    }
+
+    std::vector<sd::Tensor<float>> ref_images;
+    for (int i = 0; i < sd_img_gen_params->ref_images_count; i++) {
+        ref_images.push_back(sd_image_to_tensor(sd_img_gen_params->ref_images[i]));
+    }
+
+    if (ref_images.empty() && sd_version_is_unet_edit(sd_ctx->sd->version)) {
+        LOG_WARN("This model needs at least one reference image; using an empty reference");
+        ref_images.push_back(sd::zeros<float>({request->width, request->height, 3, 1}));
+        request->guidance.img_cfg = request->guidance.txt_cfg;
+    }
+
+    if (!ref_images.empty()) {
+        LOG_INFO("EDIT mode");
+    }
+
+    std::vector<sd::Tensor<float>> ref_latents;
+    for (size_t i = 0; i < ref_images.size(); i++) {
+        sd::Tensor<float> ref_latent;
+        if (request->auto_resize_ref_image) {
+            LOG_DEBUG("auto resize ref images");
+            int vae_image_size = std::min(1024 * 1024, request->width * request->height);
+            double vae_width   = sqrt(vae_image_size * ref_images[i].shape()[0] / ref_images[i].shape()[1]);
+            double vae_height  = vae_width * ref_images[i].shape()[1] / ref_images[i].shape()[0];
+
+            int factor = sd_version_is_qwen_image(sd_ctx->sd->version) ? 32 : 16;
+            vae_height = round(vae_height / factor) * factor;
+            vae_width  = round(vae_width / factor) * factor;
+
+            auto resized_ref_img = sd::ops::interpolate(ref_images[i],
+                                                        {static_cast<int>(vae_width), static_cast<int>(vae_height), 3, 1});
+
+            LOG_DEBUG("resize vae ref image %d from %" PRId64 "x%" PRId64 " to %" PRId64 "x%" PRId64,
+                      static_cast<int>(i),
+                      ref_images[i].shape()[1],
+                      ref_images[i].shape()[0],
+                      resized_ref_img.shape()[1],
+                      resized_ref_img.shape()[0]);
+
+            ref_latent = sd_ctx->sd->encode_first_stage(resized_ref_img);
+        } else {
+            ref_latent = sd_ctx->sd->encode_first_stage(ref_images[i]);
+        }
+        if (ref_latent.empty()) {
+            LOG_ERROR("failed to encode reference image %d", static_cast<int>(i));
+            return std::nullopt;
+        }
+
+        ref_latents.push_back(std::move(ref_latent));
+    }
+
+    sd::Tensor<float> concat_latent;
+    sd::Tensor<float> uncond_concat_latent;
+    if (sd_version_is_inpaint(sd_ctx->sd->version)) {
+        sd::Tensor<float> masked_init_latent;
+
+        if (sd_ctx->sd->version != VERSION_FLEX_2) {
+            if (!init_image_tensor.empty()) {
+                auto masked_image  = ((1.0f - mask_image_tensor) * (init_image_tensor - 0.5f)) + 0.5f;
+                masked_init_latent = sd_ctx->sd->encode_first_stage(masked_image);
+                if (masked_init_latent.empty()) {
+                    LOG_ERROR("failed to encode masked init image");
+                    return std::nullopt;
+                }
+            } else {
+                masked_init_latent = sd::Tensor<float>::zeros_like(init_latent);
+            }
+        } else {
+            masked_init_latent = ((1.0f - latent_mask) * init_latent);
+        }
+
+        auto uncond_masked_init_latent = sd::Tensor<float>::zeros_like(masked_init_latent);
+
+        if (sd_ctx->sd->version == VERSION_FLUX_FILL) {
+            auto mask = mask_image_tensor.reshape({request->vae_scale_factor,
+                                                   request->width / request->vae_scale_factor,
+                                                   request->vae_scale_factor,
+                                                   request->height / request->vae_scale_factor});
+            mask      = mask.permute({1, 3, 0, 2}).reshape({request->width / request->vae_scale_factor, request->height / request->vae_scale_factor, request->vae_scale_factor * request->vae_scale_factor, 1});
+
+            concat_latent        = sd::ops::concat(masked_init_latent, mask, 2);
+            uncond_concat_latent = sd::ops::concat(uncond_masked_init_latent, mask, 2);
+        } else if (sd_ctx->sd->version == VERSION_FLEX_2) {
+            concat_latent = sd::ops::concat(masked_init_latent, latent_mask, 2);
+            if (!control_latent.empty()) {
+                concat_latent = sd::ops::concat(concat_latent, control_latent, 2);
+            } else {
+                concat_latent = sd::ops::concat(concat_latent, sd::Tensor<float>::zeros_like(masked_init_latent), 2);
+            }
+
+            uncond_concat_latent = sd::ops::concat(uncond_masked_init_latent, latent_mask, 2);
+            uncond_concat_latent = sd::ops::concat(uncond_concat_latent, sd::Tensor<float>::zeros_like(masked_init_latent), 2);
+        } else {  // SD1.x SD2.x SDXL inpaint
+            concat_latent        = sd::ops::concat(latent_mask, masked_init_latent, 2);
+            uncond_concat_latent = sd::ops::concat(latent_mask, uncond_masked_init_latent, 2);
+        }
+    }
+    if (sd_version_is_unet_edit(sd_ctx->sd->version)) {
+        concat_latent        = sd::ops::interpolate<float>(ref_latents[0], init_latent.shape());
+        uncond_concat_latent = sd::Tensor<float>::zeros_like(concat_latent);
+    }
+    if (sd_version_is_control(sd_ctx->sd->version)) {
+        if (!control_latent.empty()) {
+            concat_latent = control_latent;
+        } else {
+            concat_latent = sd::Tensor<float>::zeros_like(init_latent);
+        }
+        uncond_concat_latent = sd::Tensor<float>::zeros_like(concat_latent);
+    }
+
+    if (sd_img_gen_params->init_image.data != nullptr || sd_img_gen_params->ref_images_count > 0) {
+        int64_t t1 = ggml_time_ms();
+        LOG_INFO("encode_first_stage completed, taking %.2fs", (t1 - prepare_start_ms) * 1.0f / 1000);
+    }
+
+    ImageGenerationLatents latents;
+    latents.init_latent          = std::move(init_latent);
+    latents.concat_latent        = std::move(concat_latent);
+    latents.uncond_concat_latent = std::move(uncond_concat_latent);
+    latents.control_image        = std::move(control_image_tensor);
+    latents.ref_images           = std::move(ref_images);
+    latents.ref_latents          = std::move(ref_latents);
+
+    if (sd_version_is_inpaint(sd_ctx->sd->version)) {
+        latents.denoise_mask = std::move(latent_mask);
+    }
+
+    return latents;
+}
+
+// Computes the conditioning for one image request: cond from the prompt, uncond from the
+// negative prompt (only when the request uses it), img_cond and the id condition.
+// In this body sd_img_gen_params and plan are not read, and std::nullopt is never returned.
+static std::optional<ImageGenerationEmbeds> prepare_image_generation_embeds(sd_ctx_t* sd_ctx,
+                                                                            const sd_img_gen_params_t* sd_img_gen_params,
+                                                                            GenerationRequest* request,
+                                                                            SamplePlan* plan,
+                                                                            ImageGenerationLatents* latents) {
+    auto& sd_impl     = sd_ctx->sd;
+    auto& conditioner = sd_impl->cond_stage_model;
+
+    ConditionerParams cond_params;
+    cond_params.text            = request->prompt;
+    cond_params.clip_skip       = request->clip_skip;
+    cond_params.width           = request->width;
+    cond_params.height          = request->height;
+    cond_params.ref_images      = &latents->ref_images;
+    cond_params.adm_in_channels = static_cast<int>(sd_impl->diffusion_model->get_adm_in_channels());
+
+    // The id condition is requested before the timer starts and before zero_out_masked is assigned.
+    auto id_cond                = sd_impl->get_pmid_conditon(request->pm_params, cond_params);
+    const int64_t cond_start_ms = ggml_time_ms();
+    cond_params.zero_out_masked = false;
+    auto cond                   = conditioner->get_learned_condition(sd_impl->n_threads, cond_params);
+    if (cond.c_concat.empty()) {
+        cond.c_concat = latents->concat_latent;  // TODO: optimize
+    }
+
+    SDCondition uncond;
+    if (request->use_uncond || request->use_high_noise_uncond) {
+        // zero_out_masked is set only for SDXL with an empty negative prompt and no EDM-v parameterization.
+        cond_params.text            = request->negative_prompt;
+        cond_params.zero_out_masked = sd_version_is_sdxl(sd_impl->version) &&
+                                      request->negative_prompt.empty() &&
+                                      !sd_impl->is_using_edm_v_parameterization;
+        uncond                      = conditioner->get_learned_condition(sd_impl->n_threads, cond_params);
+        if (uncond.c_concat.empty()) {
+            uncond.c_concat = latents->uncond_concat_latent;  // TODO: optimize
+        }
+    }
+
+    LOG_INFO("get_learned_condition completed, taking %.2fs", (ggml_time_ms() - cond_start_ms) * 1.0f / 1000);
+
+    if (sd_impl->free_params_immediately) {
+        conditioner->free_params_buffer();
+    }
+
+    ImageGenerationEmbeds embeds;
+    if (request->use_img_cond) {
+        embeds.img_cond = SDCondition(uncond.c_crossattn, uncond.c_vector, cond.c_concat);
+    }
+    embeds.cond    = std::move(cond);
+    embeds.uncond  = std::move(uncond);
+    embeds.id_cond = std::move(id_cond);
+    return embeds;
+}
+
+// Decodes each sampled latent with decode_first_stage and returns the images as a calloc'ed
+// array of request.batch_count sd_image_t entries. Returns nullptr when the latent count does
+// not match the batch count, when a decode yields an empty tensor, or when calloc fails.
+static sd_image_t* decode_image_outputs(sd_ctx_t* sd_ctx,
+                                        const GenerationRequest& request,
+                                        const std::vector<sd::Tensor<float>>& final_latents) {
+    if (final_latents.size() != static_cast<size_t>(request.batch_count)) {
+        LOG_ERROR("expected %d latents, got %zu", request.batch_count, final_latents.size());
+        return nullptr;
+    }
+    LOG_INFO("decoding %zu latents", final_latents.size());
+    std::vector<sd::Tensor<float>> decoded_images;
+    int64_t t0 = ggml_time_ms();
+
+    // The loop index is a size_t, so the log calls below format it with %zu instead of PRId64.
+    for (size_t i = 0; i < final_latents.size(); i++) {
+        int64_t t1              = ggml_time_ms();
+        sd::Tensor<float> image = sd_ctx->sd->decode_first_stage(final_latents[i]);
+        if (image.empty()) {
+            LOG_ERROR("decode_first_stage failed for latent %zu", i + 1);
+            if (sd_ctx->sd->free_params_immediately) {
+                sd_ctx->sd->first_stage_model->free_params_buffer();
+            }
+            return nullptr;
+        }
+        decoded_images.push_back(std::move(image));
+        LOG_INFO("latent %zu decoded, taking %.2fs", i + 1, (ggml_time_ms() - t1) * 1.0f / 1000);
+    }
+    LOG_INFO("decode_first_stage completed, taking %.2fs", (ggml_time_ms() - t0) * 1.0f / 1000);
+    if (sd_ctx->sd->free_params_immediately) {
+        sd_ctx->sd->first_stage_model->free_params_buffer();
+    }
+
+    // calloc already zero-fills the array, so no separate memset is needed.
+    sd_image_t* result_images = (sd_image_t*)calloc(request.batch_count, sizeof(sd_image_t));
+    if (result_images == nullptr) {
+        LOG_ERROR("failed to allocate %d output images", request.batch_count);
+        return nullptr;
+    }
+    for (size_t i = 0; i < decoded_images.size(); i++) {
+        result_images[i] = tensor_to_sd_image(decoded_images[i]);
+    }
+    return result_images;
+}
+
+// Public entry point for image generation. Seeds the RNGs, applies LoRAs, prepares latents and
+// conditioning, samples one latent per batch entry (seed + batch index) and decodes them.
+// Returns the array produced by decode_image_outputs, or nullptr if any stage fails.
+SD_API sd_image_t* generate_image(sd_ctx_t* sd_ctx, const sd_img_gen_params_t* sd_img_gen_params) {
+    if (sd_ctx == nullptr || sd_img_gen_params == nullptr) {
         return nullptr;
     }
 
-    int64_t seed = sd_vid_gen_params->seed;
-    if (seed < 0) {
-        seed = (int)time(nullptr);
+    int64_t t0                    = ggml_time_ms();
+    sd_ctx->sd->vae_tiling_params = sd_img_gen_params->vae_tiling_params;
+    GenerationRequest request(sd_ctx, sd_img_gen_params);
+    LOG_INFO("generate_image %dx%d", request.width, request.height);
+
+    sd_ctx->sd->rng->manual_seed(request.seed);
+    sd_ctx->sd->sampler_rng->manual_seed(request.seed);
+    sd_ctx->sd->set_flow_shift(sd_img_gen_params->sample_params.flow_shift);
+    sd_ctx->sd->apply_loras(sd_img_gen_params->loras, sd_img_gen_params->lora_count);
+
+    // NOTE(review): axes_guard looks like a scope guard for VAE axis settings - confirm.
+    ImageVaeAxesGuard axes_guard(sd_ctx, sd_img_gen_params, request);
+
+    SamplePlan plan(sd_ctx, sd_img_gen_params, request);
+    auto latents_opt = prepare_image_generation_latents(sd_ctx, sd_img_gen_params, &request, &plan);
+    if (!latents_opt.has_value()) {
+        return nullptr;
+    }
+    ImageGenerationLatents latents = std::move(*latents_opt);
+
+    auto embeds_opt = prepare_image_generation_embeds(sd_ctx, sd_img_gen_params, &request, &plan, &latents);
+    if (!embeds_opt.has_value()) {
+        return nullptr;
+    }
+    ImageGenerationEmbeds embeds = std::move(*embeds_opt);
+
+    std::vector<sd::Tensor<float>> final_latents;
+    int64_t denoise_start = ggml_time_ms();
+    for (int b = 0; b < request.batch_count; b++) {
+        int64_t sampling_start = ggml_time_ms();
+        int64_t cur_seed       = request.seed + b;
+        LOG_INFO("generating image: %i/%i - seed %" PRId64, b + 1, request.batch_count, cur_seed);
+
+        // Each batch entry re-seeds both RNGs with seed + index before its noise is drawn.
+        sd_ctx->sd->rng->manual_seed(cur_seed);
+        sd_ctx->sd->sampler_rng->manual_seed(cur_seed);
+        sd::Tensor<float> noise = sd::randn_like<float>(latents.init_latent, sd_ctx->sd->rng);
+
+        sd::Tensor<float> x_0 = sd_ctx->sd->sample(sd_ctx->sd->diffusion_model,
+                                                   true,
+                                                   latents.init_latent,
+                                                   std::move(noise),
+                                                   embeds.cond,
+                                                   embeds.uncond,
+                                                   embeds.img_cond,
+                                                   embeds.id_cond,
+                                                   latents.control_image,
+                                                   request.control_strength,
+                                                   request.guidance,
+                                                   request.eta,
+                                                   request.shifted_timestep,
+                                                   plan.sample_method,
+                                                   plan.sigmas,
+                                                   plan.start_merge_step,
+                                                   latents.ref_latents,
+                                                   request.increase_ref_index,
+                                                   latents.denoise_mask,
+                                                   sd::Tensor<float>(),
+                                                   1.f,
+                                                   request.cache_params);
+        int64_t sampling_end  = ggml_time_ms();
+        if (!x_0.empty()) {
+            LOG_INFO("sampling completed, taking %.2fs", (sampling_end - sampling_start) * 1.0f / 1000);
+            final_latents.push_back(std::move(x_0));
+            continue;
+        }
+
+        // An empty result is treated as a sampling failure and aborts the whole batch.
+        LOG_ERROR("sampling for image %d/%d failed after %.2fs",
+                  b + 1,
+                  request.batch_count,
+                  (sampling_end - sampling_start) * 1.0f / 1000);
+        if (sd_ctx->sd->free_params_immediately) {
+            sd_ctx->sd->diffusion_model->free_params_buffer();
+        }
+        return nullptr;
+    }
+    if (sd_ctx->sd->free_params_immediately) {
+        sd_ctx->sd->diffusion_model->free_params_buffer();
+    }
+    int64_t denoise_end = ggml_time_ms();
+    // final_latents.size() is a size_t, so it is printed with %zu rather than PRId64.
+    LOG_INFO("generating %zu latent images completed, taking %.2fs",
+             final_latents.size(),
+             (denoise_end - denoise_start) * 1.0f / 1000);
+
+    auto result = decode_image_outputs(sd_ctx, request, final_latents);
+    if (result == nullptr) {
+        return nullptr;
     }
 
-    sd_ctx->sd->rng->manual_seed(seed);
-    sd_ctx->sd->sampler_rng->manual_seed(seed);
+    sd_ctx->sd->lora_stat();
 
-    int64_t t0 = ggml_time_ms();
+    int64_t t1 = ggml_time_ms();
+    LOG_INFO("generate_image completed in %.2fs", (t1 - t0) * 1.0f / 1000);
+    return result;
+}
 
-    // Apply lora
-    sd_ctx->sd->apply_loras(sd_vid_gen_params->loras, sd_vid_gen_params->lora_count);
+static std::optional<ImageGenerationLatents> prepare_video_generation_latents(sd_ctx_t* sd_ctx,
+                                                                              const sd_vid_gen_params_t* sd_vid_gen_params,
+                                                                              GenerationRequest* request) {
+    ImageGenerationLatents latents;
+    int64_t prepare_start_ms = ggml_time_ms();
+
+    sd::Tensor<float> start_image;
+    sd::Tensor<float> end_image;
+
+    if (sd_vid_gen_params->init_image.data) {
+        start_image = sd_image_to_tensor(sd_vid_gen_params->init_image, request->width, request->height);
+    }
+
+    if (sd_vid_gen_params->end_image.data) {
+        end_image = sd_image_to_tensor(sd_vid_gen_params->end_image, request->width, request->height);
+    }
 
-    ggml_tensor* init_latent        = nullptr;
-    ggml_tensor* clip_vision_output = nullptr;
-    ggml_tensor* concat_latent      = nullptr;
-    ggml_tensor* denoise_mask       = nullptr;
-    ggml_tensor* vace_context       = nullptr;
-    int64_t ref_image_num           = 0;  // for vace
     if (sd_ctx->sd->diffusion_model->get_desc() == "Wan2.1-I2V-14B" ||
         sd_ctx->sd->diffusion_model->get_desc() == "Wan2.2-I2V-14B" ||
         sd_ctx->sd->diffusion_model->get_desc() == "Wan2.1-I2V-1.3B" ||
@@ -3700,331 +3196,370 @@ SD_API sd_image_t* generate_video(sd_ctx_t* sd_ctx, const sd_vid_gen_params_t* s
         if (sd_ctx->sd->diffusion_model->get_desc() == "Wan2.1-I2V-14B" ||
             sd_ctx->sd->diffusion_model->get_desc() == "Wan2.1-I2V-1.3B" ||
             sd_ctx->sd->diffusion_model->get_desc() == "Wan2.1-FLF2V-14B") {
-            if (sd_vid_gen_params->init_image.data) {
-                clip_vision_output = sd_ctx->sd->get_clip_vision_output(work_ctx, sd_vid_gen_params->init_image, false, -2);
+            if (!start_image.empty()) {
+                auto clip_vision_output = sd_ctx->sd->get_clip_vision_output(start_image, false, -2);
+                if (clip_vision_output.empty()) {
+                    LOG_ERROR("failed to compute clip vision output for init image");
+                    return std::nullopt;
+                }
+                latents.clip_vision_output = std::move(clip_vision_output);
             } else {
-                clip_vision_output = sd_ctx->sd->get_clip_vision_output(work_ctx, sd_vid_gen_params->init_image, false, -2, true);
+                latents.clip_vision_output = sd_ctx->sd->get_clip_vision_output(start_image, false, -2, true);
             }
 
             if (sd_ctx->sd->diffusion_model->get_desc() == "Wan2.1-FLF2V-14B") {
-                ggml_tensor* end_image_clip_vision_output = nullptr;
-                if (sd_vid_gen_params->end_image.data) {
-                    end_image_clip_vision_output = sd_ctx->sd->get_clip_vision_output(work_ctx, sd_vid_gen_params->end_image, false, -2);
+                sd::Tensor<float> end_image_clip_vision_output;
+                if (!end_image.empty()) {
+                    end_image_clip_vision_output = sd_ctx->sd->get_clip_vision_output(end_image, false, -2);
+                    if (end_image_clip_vision_output.empty()) {
+                        LOG_ERROR("failed to compute clip vision output for end image");
+                        return std::nullopt;
+                    }
                 } else {
-                    end_image_clip_vision_output = sd_ctx->sd->get_clip_vision_output(work_ctx, sd_vid_gen_params->end_image, false, -2, true);
+                    end_image_clip_vision_output = sd_ctx->sd->get_clip_vision_output(end_image, false, -2, true);
                 }
-                clip_vision_output = ggml_ext_tensor_concat(work_ctx, clip_vision_output, end_image_clip_vision_output, 1);
+                latents.clip_vision_output = sd::ops::concat(latents.clip_vision_output, end_image_clip_vision_output, 1);
             }
 
             int64_t t1 = ggml_time_ms();
-            LOG_INFO("get_clip_vision_output completed, taking %" PRId64 " ms", t1 - t0);
+            LOG_INFO("get_clip_vision_output completed, taking %" PRId64 " ms", t1 - prepare_start_ms);
         }
 
-        int64_t t1         = ggml_time_ms();
-        ggml_tensor* image = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, width, height, frames, 3);
-        ggml_ext_tensor_iter(image, [&](ggml_tensor* image, int64_t i0, int64_t i1, int64_t i2, int64_t i3) {
-            float value = 0.5f;
-            if (i2 == 0 && sd_vid_gen_params->init_image.data) {  // start image
-                value = *(sd_vid_gen_params->init_image.data + i1 * width * 3 + i0 * 3 + i3);
-                value /= 255.f;
-            } else if (i2 == frames - 1 && sd_vid_gen_params->end_image.data) {
-                value = *(sd_vid_gen_params->end_image.data + i1 * width * 3 + i0 * 3 + i3);
-                value /= 255.f;
-            }
-            ggml_ext_tensor_set_f32(image, value, i0, i1, i2, i3);
-        });
+        int64_t t1              = ggml_time_ms();
+        sd::Tensor<float> image = sd::full<float>({request->width, request->height, request->frames, 3, 1}, 0.5f);
+        if (!start_image.empty()) {
+            sd::ops::slice_assign(&image, 2, 0, 1, start_image.unsqueeze(2));
+        }
+        if (!end_image.empty()) {
+            sd::ops::slice_assign(&image, 2, request->frames - 1, request->frames, end_image.unsqueeze(2));
+        }
 
-        concat_latent = sd_ctx->sd->encode_first_stage(work_ctx, image);  // [b*c, t, h/vae_scale_factor, w/vae_scale_factor]
+        auto concat_latent = sd_ctx->sd->encode_first_stage(image);  // [b, c, t, h/vae_scale_factor, w/vae_scale_factor]
+        if (concat_latent.empty()) {
+            LOG_ERROR("failed to encode video conditioning frames");
+            return std::nullopt;
+        }
+        latents.concat_latent = std::move(concat_latent);
 
         int64_t t2 = ggml_time_ms();
         LOG_INFO("encode_first_stage completed, taking %" PRId64 " ms", t2 - t1);
 
-        ggml_tensor* concat_mask = ggml_new_tensor_4d(work_ctx,
-                                                      GGML_TYPE_F32,
-                                                      concat_latent->ne[0],
-                                                      concat_latent->ne[1],
-                                                      concat_latent->ne[2],
-                                                      4);  // [b*4, t, w/vae_scale_factor, h/vae_scale_factor]
-        ggml_ext_tensor_iter(concat_mask, [&](ggml_tensor* concat_mask, int64_t i0, int64_t i1, int64_t i2, int64_t i3) {
-            float value = 0.0f;
-            if (i2 == 0 && sd_vid_gen_params->init_image.data) {  // start image
-                value = 1.0f;
-            } else if (i2 == frames - 1 && sd_vid_gen_params->end_image.data && i3 == 3) {
-                value = 1.0f;
-            }
-            ggml_ext_tensor_set_f32(concat_mask, value, i0, i1, i2, i3);
-        });
-
-        concat_latent = ggml_ext_tensor_concat(work_ctx, concat_mask, concat_latent, 3);  // [b*(c+4), t, h/vae_scale_factor, w/vae_scale_factor]
-    } else if (sd_ctx->sd->diffusion_model->get_desc() == "Wan2.2-TI2V-5B" && sd_vid_gen_params->init_image.data) {
+        sd::Tensor<float> concat_mask = sd::zeros<float>({latents.concat_latent.shape()[0],
+                                                          latents.concat_latent.shape()[1],
+                                                          latents.concat_latent.shape()[2],
+                                                          4,
+                                                          1});  // [b, 4, t, h/vae_scale_factor, w/vae_scale_factor]
+        if (!start_image.empty()) {
+            sd::ops::fill_slice(&concat_mask, 2, 0, 1, 1.0f);
+        }
+        if (!end_image.empty()) {
+            auto last_channel = sd::ops::slice(concat_mask, 3, 3, 4);
+            sd::ops::fill_slice(&last_channel, 2, last_channel.shape()[2] - 1, last_channel.shape()[2], 1.0f);
+            sd::ops::slice_assign(&concat_mask, 3, 3, 4, last_channel);
+        }
+        latents.concat_latent = sd::ops::concat(concat_mask, latents.concat_latent, 3);  // [b, 4+c, t, h/vae_scale_factor, w/vae_scale_factor]
+    } else if (sd_ctx->sd->diffusion_model->get_desc() == "Wan2.2-TI2V-5B" && !start_image.empty()) {
         LOG_INFO("IMG2VID");
 
-        int64_t t1            = ggml_time_ms();
-        ggml_tensor* init_img = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, width, height, 3, 1);
-        sd_image_to_ggml_tensor(sd_vid_gen_params->init_image, init_img);
-        init_img = ggml_reshape_4d(work_ctx, init_img, width, height, 1, 3);
+        int64_t t1             = ggml_time_ms();
+        auto init_img          = start_image.reshape({start_image.shape()[0], start_image.shape()[1], 1, start_image.shape()[2], 1});
+        auto init_image_latent = sd_ctx->sd->encode_first_stage(init_img);  // [b, c, 1, h/vae_scale_factor, w/vae_scale_factor]
+        if (init_image_latent.empty()) {
+            LOG_ERROR("failed to encode init video frame");
+            return std::nullopt;
+        }
 
-        auto init_image_latent = sd_ctx->sd->encode_to_vae_latents(work_ctx, init_img);  // [b*c, 1, h/16, w/16]
+        latents.init_latent = sd_ctx->sd->generate_init_latent(request->width, request->height, request->frames, true);  // [b, c, t, h/vae_scale_factor, w/vae_scale_factor]
+        sd::ops::slice_assign(&latents.init_latent, 2, 0, init_image_latent.shape()[2], init_image_latent);
 
-        init_latent  = sd_ctx->sd->generate_init_latent(work_ctx, width, height, frames, true);
-        denoise_mask = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, init_latent->ne[0], init_latent->ne[1], init_latent->ne[2], 1);
-        ggml_set_f32(denoise_mask, 1.f);
-
-        init_latent = sd_ctx->sd->first_stage_model->diffusion_to_vae_latents(work_ctx, init_latent);
-
-        ggml_ext_tensor_iter(init_image_latent, [&](ggml_tensor* t, int64_t i0, int64_t i1, int64_t i2, int64_t i3) {
-            float value = ggml_ext_tensor_get_f32(t, i0, i1, i2, i3);
-            ggml_ext_tensor_set_f32(init_latent, value, i0, i1, i2, i3);
-            if (i3 == 0) {
-                ggml_ext_tensor_set_f32(denoise_mask, 0.f, i0, i1, i2, i3);
-            }
-        });
-
-        init_latent = sd_ctx->sd->first_stage_model->vae_to_diffuison_latents(work_ctx, init_latent);
+        latents.denoise_mask = sd::full<float>({latents.init_latent.shape()[0], latents.init_latent.shape()[1], latents.init_latent.shape()[2], 1, 1}, 1.f);
+        sd::ops::fill_slice(&latents.denoise_mask, 2, 0, init_image_latent.shape()[2], 0.0f);
 
         int64_t t2 = ggml_time_ms();
         LOG_INFO("encode_first_stage completed, taking %" PRId64 " ms", t2 - t1);
     } else if (sd_ctx->sd->diffusion_model->get_desc() == "Wan2.1-VACE-1.3B" ||
                sd_ctx->sd->diffusion_model->get_desc() == "Wan2.x-VACE-14B") {
         LOG_INFO("VACE");
-        int64_t t1                    = ggml_time_ms();
-        ggml_tensor* ref_image_latent = nullptr;
-        if (sd_vid_gen_params->init_image.data) {
-            ggml_tensor* ref_img = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, width, height, 3, 1);
-            sd_image_to_ggml_tensor(sd_vid_gen_params->init_image, ref_img);
-            ref_img = ggml_reshape_4d(work_ctx, ref_img, width, height, 1, 3);
-
-            ref_image_latent = sd_ctx->sd->encode_first_stage(work_ctx, ref_img);  // [b*c, 1, h/16, w/16]
-            auto zero_latent = ggml_dup_tensor(work_ctx, ref_image_latent);
-            ggml_set_f32(zero_latent, 0.f);
-            ref_image_latent = ggml_ext_tensor_concat(work_ctx, ref_image_latent, zero_latent, 3);  // [b*2*c, 1, h/16, w/16]
+        int64_t t1 = ggml_time_ms();
+        sd::Tensor<float> ref_image_latent;
+        if (!start_image.empty()) {
+            auto ref_img     = start_image.reshape({start_image.shape()[0], start_image.shape()[1], 1, start_image.shape()[2], 1});
+            auto encoded_ref = sd_ctx->sd->encode_first_stage(ref_img);  // [b, c, 1, h/vae_scale_factor, w/vae_scale_factor]
+            if (encoded_ref.empty()) {
+                LOG_ERROR("failed to encode VACE reference image");
+                return std::nullopt;
+            }
+            ref_image_latent = sd::ops::concat(encoded_ref, sd::zeros<float>(encoded_ref.shape()), 3);  // [b, 2*c, 1, h/vae_scale_factor, w/vae_scale_factor]
         }
 
-        ggml_tensor* control_video = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, width, height, frames, 3);
-        ggml_ext_tensor_iter(control_video, [&](ggml_tensor* control_video, int64_t i0, int64_t i1, int64_t i2, int64_t i3) {
-            float value = 0.5f;
-            if (i2 < sd_vid_gen_params->control_frames_size) {
-                value = sd_image_get_f32(sd_vid_gen_params->control_frames[i2], i0, i1, i3);
-            }
-            ggml_ext_tensor_set_f32(control_video, value, i0, i1, i2, i3);
-        });
-        ggml_tensor* mask = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, width, height, frames, 1);
-        ggml_set_f32(mask, 1.0f);
-        ggml_tensor* inactive = ggml_dup_tensor(work_ctx, control_video);
-        ggml_tensor* reactive = ggml_dup_tensor(work_ctx, control_video);
+        sd::Tensor<float> control_video = sd::full<float>({request->width, request->height, request->frames, 3, 1}, 0.5f);
+        int64_t control_frame_count     = std::min<int64_t>(request->frames, sd_vid_gen_params->control_frames_size);
+        for (int64_t i = 0; i < control_frame_count; ++i) {
+            auto control_frame = sd_image_to_tensor(sd_vid_gen_params->control_frames[i], request->width, request->height);
+            sd::ops::slice_assign(&control_video, 2, i, i + 1, control_frame.unsqueeze(2));
+        }
 
-        ggml_ext_tensor_iter(control_video, [&](ggml_tensor* t, int64_t i0, int64_t i1, int64_t i2, int64_t i3) {
-            float control_video_value = ggml_ext_tensor_get_f32(t, i0, i1, i2, i3) - 0.5f;
-            float mask_value          = ggml_ext_tensor_get_f32(mask, i0, i1, i2, 0);
-            float inactive_value      = (control_video_value * (1.f - mask_value)) + 0.5f;
-            float reactive_value      = (control_video_value * mask_value) + 0.5f;
+        sd::Tensor<float> mask = sd::full<float>({request->width, request->height, request->frames, 1, 1}, 1.0f);
 
-            ggml_ext_tensor_set_f32(inactive, inactive_value, i0, i1, i2, i3);
-            ggml_ext_tensor_set_f32(reactive, reactive_value, i0, i1, i2, i3);
-        });
+        control_video              = control_video - 0.5f;
+        sd::Tensor<float> inactive = control_video * (1.0f - mask) + 0.5f;
+        sd::Tensor<float> reactive = control_video * mask + 0.5f;
 
-        inactive = sd_ctx->sd->encode_first_stage(work_ctx, inactive);  // [b*c, t, h/vae_scale_factor, w/vae_scale_factor]
-        reactive = sd_ctx->sd->encode_first_stage(work_ctx, reactive);  // [b*c, t, h/vae_scale_factor, w/vae_scale_factor]
+        inactive = sd_ctx->sd->encode_first_stage(inactive);  // [b, c, t, h/vae_scale_factor, w/vae_scale_factor]
+        if (inactive.empty()) {
+            LOG_ERROR("failed to encode VACE inactive context");
+            return std::nullopt;
+        }
 
-        int64_t length = inactive->ne[2];
-        if (ref_image_latent) {
+        reactive = sd_ctx->sd->encode_first_stage(reactive);  // [b, c, t, h/vae_scale_factor, w/vae_scale_factor]
+        if (reactive.empty()) {
+            LOG_ERROR("failed to encode VACE reactive context");
+            return std::nullopt;
+        }
+
+        int64_t length = inactive.shape()[2];
+        if (!ref_image_latent.empty()) {
             length += 1;
-            frames        = static_cast<int>((length - 1) * 4 + 1);
-            ref_image_num = 1;
+            request->frames       = static_cast<int>((length - 1) * 4 + 1);
+            latents.ref_image_num = 1;
         }
-        vace_context = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, inactive->ne[0], inactive->ne[1], length, 96);  // [b*96, t, h/vae_scale_factor, w/vae_scale_factor]
-        ggml_ext_tensor_iter(vace_context, [&](ggml_tensor* vace_context, int64_t i0, int64_t i1, int64_t i2, int64_t i3) {
-            float value;
-            if (i3 < 32) {
-                if (ref_image_latent && i2 == 0) {
-                    value = ggml_ext_tensor_get_f32(ref_image_latent, i0, i1, 0, i3);
-                } else {
-                    if (i3 < 16) {
-                        value = ggml_ext_tensor_get_f32(inactive, i0, i1, i2 - ref_image_num, i3);
-                    } else {
-                        value = ggml_ext_tensor_get_f32(reactive, i0, i1, i2 - ref_image_num, i3 - 16);
-                    }
-                }
-            } else {  // mask
-                if (ref_image_latent && i2 == 0) {
-                    value = 0.f;
-                } else {
-                    int64_t vae_stride        = vae_scale_factor;
-                    int64_t mask_height_index = i1 * vae_stride + (i3 - 32) / vae_stride;
-                    int64_t mask_width_index  = i0 * vae_stride + (i3 - 32) % vae_stride;
-                    value                     = ggml_ext_tensor_get_f32(mask, mask_width_index, mask_height_index, i2 - ref_image_num, 0);
-                }
-            }
-            ggml_ext_tensor_set_f32(vace_context, value, i0, i1, i2, i3);
-        });
-        int64_t t2 = ggml_time_ms();
+        auto vace_context = sd::ops::concat(inactive, reactive, 3);  // [b, 2*c, t, h/vae_scale_factor, w/vae_scale_factor]
+
+        mask              = sd::full<float>({request->width, request->height, inactive.shape()[2], 1, 1}, 1.0f);
+        auto mask_context = mask.reshape({request->vae_scale_factor,
+                                          inactive.shape()[0],
+                                          request->vae_scale_factor,
+                                          inactive.shape()[1],
+                                          inactive.shape()[2]});   // [t, h/vae_scale_factor, vae_scale_factor, w/vae_scale_factor, vae_scale_factor]
+        mask_context      = mask_context.permute({1, 3, 4, 0, 2})  // [vae_scale_factor, vae_scale_factor, t, h/vae_scale_factor, w/vae_scale_factor]
+                           .reshape({inactive.shape()[0],
+                                     inactive.shape()[1],
+                                     inactive.shape()[2],
+                                     request->vae_scale_factor * request->vae_scale_factor});  // [vae_scale_factor*vae_scale_factor, t, h/vae_scale_factor, w/vae_scale_factor]
+
+        if (!ref_image_latent.empty()) {
+            vace_context  = sd::ops::concat(ref_image_latent, vace_context, 2);  // [b, 2*c, t+1, h/vae_scale_factor, w/vae_scale_factor]
+            auto mask_pad = sd::zeros<float>({mask_context.shape()[0],
+                                              mask_context.shape()[1],
+                                              1,
+                                              mask_context.shape()[3]});  // [vae_scale_factor*vae_scale_factor, 1, h/vae_scale_factor, w/vae_scale_factor]
+            mask_context  = sd::ops::concat(mask_pad, mask_context, 2);   // [vae_scale_factor*vae_scale_factor, t + 1, h/vae_scale_factor, w/vae_scale_factor]
+        }
+
+        mask_context.unsqueeze_(mask_context.dim());  // [b, vae_scale_factor*vae_scale_factor, t + 1 or t, h/vae_scale_factor, w/vae_scale_factor]
+
+        latents.vace_context = sd::ops::concat(vace_context, mask_context, 3);  // [b, 2*c + vae_scale_factor*vae_scale_factor, t + 1 or t, h/vae_scale_factor, w/vae_scale_factor]
+        int64_t t2           = ggml_time_ms();
         LOG_INFO("encode_first_stage completed, taking %" PRId64 " ms", t2 - t1);
     }
 
-    if (init_latent == nullptr) {
-        init_latent = sd_ctx->sd->generate_init_latent(work_ctx, width, height, frames, true);
+    if (latents.init_latent.empty()) {
+        latents.init_latent = sd_ctx->sd->generate_init_latent(request->width, request->height, request->frames, true);
     }
 
-    // Get learned condition
+    return latents;
+}
+
+static ImageGenerationEmbeds prepare_video_generation_embeds(sd_ctx_t* sd_ctx,  // Computes the prompt (and optional negative-prompt) conditioning for one video request.
+                                                             const sd_vid_gen_params_t* sd_vid_gen_params,  // not read by this function body
+                                                             const GenerationRequest& request,  // supplies clip_skip, prompt, negative_prompt and use_uncond
+                                                             const ImageGenerationLatents& latents) {  // supplies concat_latent and clip_vision_output
+    ImageGenerationEmbeds embeds;  // returned by value; only cond/uncond are filled in here
     ConditionerParams condition_params;
-    condition_params.clip_skip       = sd_vid_gen_params->clip_skip;
+    condition_params.clip_skip       = request.clip_skip;
+    condition_params.text            = request.prompt;  // positive prompt first; the text is swapped below for the uncond pass
     condition_params.zero_out_masked = true;
-    condition_params.text            = prompt;
 
-    int64_t t1       = ggml_time_ms();
-    SDCondition cond = sd_ctx->sd->cond_stage_model->get_learned_condition(work_ctx,
-                                                                           sd_ctx->sd->n_threads,
-                                                                           condition_params);
-    cond.c_concat    = concat_latent;
-    cond.c_vector    = clip_vision_output;
-    SDCondition uncond;
-    if (sd_vid_gen_params->sample_params.guidance.txt_cfg != 1.0 || sd_vid_gen_params->high_noise_sample_params.guidance.txt_cfg != 1.0) {
-        condition_params.text = negative_prompt;
-        uncond                = sd_ctx->sd->cond_stage_model->get_learned_condition(work_ctx,
-                                                                                    sd_ctx->sd->n_threads,
-                                                                                    condition_params);
-        uncond.c_concat       = concat_latent;
-        uncond.c_vector       = clip_vision_output;
+    int64_t prepare_start_ms = ggml_time_ms();  // start of the interval reported by the log line below
+    embeds.cond              = sd_ctx->sd->cond_stage_model->get_learned_condition(sd_ctx->sd->n_threads,
+                                                                                   condition_params);
+    embeds.cond.c_concat     = latents.concat_latent;  // cond and uncond receive the same concat latent
+    embeds.cond.c_vector     = latents.clip_vision_output;  // cond and uncond receive the same CLIP-vision output
+    if (request.use_uncond) {  // the negative-prompt pass runs only when the request enables it
+        condition_params.text  = request.negative_prompt;  // same params object, only the text changes
+        embeds.uncond          = sd_ctx->sd->cond_stage_model->get_learned_condition(sd_ctx->sd->n_threads,
+                                                                                     condition_params);
+        embeds.uncond.c_concat = latents.concat_latent;
+        embeds.uncond.c_vector = latents.clip_vision_output;
     }
-    int64_t t2 = ggml_time_ms();
-    LOG_INFO("get_learned_condition completed, taking %" PRId64 " ms", t2 - t1);
+
+    int64_t t1 = ggml_time_ms();  // end of the conditioning interval
+    LOG_INFO("get_learned_condition completed, taking %.2fs", (t1 - prepare_start_ms) * 1.0f / 1000);  // milliseconds converted to seconds
 
     if (sd_ctx->sd->free_params_immediately) {
         sd_ctx->sd->cond_stage_model->free_params_buffer();
     }
+    return embeds;  // embeds.uncond is left default-constructed when use_uncond is false
+}
 
-    int W = width / vae_scale_factor;
-    int H = height / vae_scale_factor;
-    int T = static_cast<int>(init_latent->ne[2]);
-    int C = sd_ctx->sd->get_latent_channel();
-
-    ggml_tensor* final_latent;
-    ggml_tensor* x_t   = init_latent;
-    ggml_tensor* noise = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, W, H, T, C);
-    ggml_ext_im_set_randn_f32(noise, sd_ctx->sd->rng);
-    // High Noise Sample
-    if (high_noise_sample_steps > 0) {
-        LOG_DEBUG("sample(high noise) %dx%dx%d", W, H, T);
-        enum sample_method_t high_noise_sample_method = sd_vid_gen_params->high_noise_sample_params.sample_method;
-        if (high_noise_sample_method == SAMPLE_METHOD_COUNT) {
-            high_noise_sample_method = sd_get_default_sample_method(sd_ctx);
-        }
-        LOG_INFO("sampling(high noise) using %s method", sampling_methods_str[high_noise_sample_method]);
-
-        int64_t sampling_start = ggml_time_ms();
-
-        std::vector<float> high_noise_sigmas = std::vector<float>(sigmas.begin(), sigmas.begin() + high_noise_sample_steps + 1);
-        sigmas                               = std::vector<float>(sigmas.begin() + high_noise_sample_steps, sigmas.end());
-
-        x_t = sd_ctx->sd->sample(work_ctx,
-                                 sd_ctx->sd->high_noise_diffusion_model,
-                                 false,
-                                 x_t,
-                                 noise,
-                                 cond,
-                                 uncond,
-                                 {},
-                                 nullptr,
-                                 0,
-                                 sd_vid_gen_params->high_noise_sample_params.guidance,
-                                 sd_vid_gen_params->high_noise_sample_params.eta,
-                                 sd_vid_gen_params->high_noise_sample_params.shifted_timestep,
-                                 high_noise_sample_method,
-                                 high_noise_sigmas,
-                                 -1,
-                                 {},
-                                 {},
-                                 false,
-                                 denoise_mask,
-                                 vace_context,
-                                 sd_vid_gen_params->vace_strength,
-                                 &sd_vid_gen_params->cache);
-
-        int64_t sampling_end = ggml_time_ms();
-        LOG_INFO("sampling(high noise) completed, taking %.2fs", (sampling_end - sampling_start) * 1.0f / 1000);
-        if (sd_ctx->sd->free_params_immediately) {
-            sd_ctx->sd->high_noise_diffusion_model->free_params_buffer();
-        }
-        noise = nullptr;
+static sd_image_t* decode_video_outputs(sd_ctx_t* sd_ctx,  // Decodes a latent video into a calloc'ed array of per-frame images; returns nullptr on any failure.
+                                        const sd::Tensor<float>& final_latent,
+                                        int* num_frames_out) {  // optional; written only after decode and allocation have both succeeded
+    if (final_latent.empty()) {  // an empty latent means there is nothing to decode
+        LOG_ERROR("no latent video to decode");
+        return nullptr;
     }
-
-    // Sample
-    {
-        LOG_DEBUG("sample %dx%dx%d", W, H, T);
-        int64_t sampling_start = ggml_time_ms();
-
-        final_latent = sd_ctx->sd->sample(work_ctx,
-                                          sd_ctx->sd->diffusion_model,
-                                          true,
-                                          x_t,
-                                          noise,
-                                          cond,
-                                          uncond,
-                                          {},
-                                          nullptr,
-                                          0,
-                                          sd_vid_gen_params->sample_params.guidance,
-                                          sd_vid_gen_params->sample_params.eta,
-                                          sd_vid_gen_params->sample_params.shifted_timestep,
-                                          sample_method,
-                                          sigmas,
-                                          -1,
-                                          {},
-                                          {},
-                                          false,
-                                          denoise_mask,
-                                          vace_context,
-                                          sd_vid_gen_params->vace_strength,
-                                          &sd_vid_gen_params->cache);
-
-        int64_t sampling_end = ggml_time_ms();
-        LOG_INFO("sampling completed, taking %.2fs", (sampling_end - sampling_start) * 1.0f / 1000);
-        if (sd_ctx->sd->free_params_immediately) {
-            sd_ctx->sd->diffusion_model->free_params_buffer();
-        }
-    }
-
-    if (ref_image_num > 0) {
-        ggml_tensor* trim_latent = ggml_new_tensor_4d(work_ctx,
-                                                      GGML_TYPE_F32,
-                                                      final_latent->ne[0],
-                                                      final_latent->ne[1],
-                                                      final_latent->ne[2] - ref_image_num,
-                                                      final_latent->ne[3]);
-        ggml_ext_tensor_iter(trim_latent, [&](ggml_tensor* trim_latent, int64_t i0, int64_t i1, int64_t i2, int64_t i3) {
-            float value = ggml_ext_tensor_get_f32(final_latent, i0, i1, i2 + ref_image_num, i3);
-            ggml_ext_tensor_set_f32(trim_latent, value, i0, i1, i2, i3);
-        });
-        final_latent = trim_latent;
-    }
-
-    int64_t t4 = ggml_time_ms();
-    LOG_INFO("generating latent video completed, taking %.2fs", (t4 - t2) * 1.0f / 1000);
-    ggml_tensor* vid = sd_ctx->sd->decode_first_stage(work_ctx, final_latent, true);
-    int64_t t5       = ggml_time_ms();
+    int64_t t4            = ggml_time_ms();  // decode timer start
+    sd::Tensor<float> vid = sd_ctx->sd->decode_first_stage(final_latent, true);  // NOTE(review): the `true` flag presumably selects video decoding; confirm against decode_first_stage
+    int64_t t5            = ggml_time_ms();  // decode timer end
     LOG_INFO("decode_first_stage completed, taking %.2fs", (t5 - t4) * 1.0f / 1000);
     if (sd_ctx->sd->free_params_immediately) {
         sd_ctx->sd->first_stage_model->free_params_buffer();
     }
-
-    sd_ctx->sd->lora_stat();
-
-    sd_image_t* result_images = (sd_image_t*)calloc(vid->ne[2], sizeof(sd_image_t));
-    if (result_images == nullptr) {
-        ggml_free(work_ctx);
+    if (vid.empty()) {  // an empty tensor is treated as a decode failure
+        LOG_ERROR("decode_first_stage failed for video");
         return nullptr;
     }
-    *num_frames_out = static_cast<int>(vid->ne[2]);
 
-    for (int64_t i = 0; i < vid->ne[2]; i++) {
-        result_images[i].width   = static_cast<uint32_t>(vid->ne[0]);
-        result_images[i].height  = static_cast<uint32_t>(vid->ne[1]);
-        result_images[i].channel = 3;
-        result_images[i].data    = ggml_tensor_to_sd_image(vid, static_cast<int>(i), true);
+    sd_image_t* result_images = (sd_image_t*)calloc(vid.shape()[2], sizeof(sd_image_t));  // one zeroed entry per index of dim 2; NOTE(review): caller presumably releases it with free() — confirm
+    if (result_images == nullptr) {
+        return nullptr;  // allocation failed; *num_frames_out is not modified on this path
+    }
+    if (num_frames_out != nullptr) {  // the out-parameter may be omitted by the caller
+        *num_frames_out = static_cast<int>(vid.shape()[2]);
     }
-    ggml_free(work_ctx);
 
-    LOG_INFO("generate_video completed in %.2fs", (t5 - t0) * 1.0f / 1000);
+    for (int64_t i = 0; i < vid.shape()[2]; i++) {
+        result_images[i] = tensor_to_sd_image(vid, static_cast<int>(i));  // presumably converts frame i of the decoded tensor; helper is defined outside this chunk
+    }
 
     return result_images;
 }
+
+SD_API sd_image_t* generate_video(sd_ctx_t* sd_ctx, const sd_vid_gen_params_t* sd_vid_gen_params, int* num_frames_out) {  // Video entry point: prepares latents and conditioning, samples, then decodes; returns nullptr on any failure.
+    if (sd_ctx == nullptr || sd_vid_gen_params == nullptr) {  // num_frames_out is not touched when the required arguments are missing
+        return nullptr;
+    }
+    if (num_frames_out != nullptr) {
+        *num_frames_out = 0;  // stays 0 on failure; decode_video_outputs overwrites it on success
+    }
+    int64_t t0                    = ggml_time_ms();  // start of the total wall time logged at the end
+    sd_ctx->sd->vae_tiling_params = sd_vid_gen_params->vae_tiling_params;  // copies the caller's tiling settings into the context
+    GenerationRequest request(sd_ctx, sd_vid_gen_params);
+    sd_ctx->sd->rng->manual_seed(request.seed);  // both RNGs are seeded from the same request seed
+    sd_ctx->sd->sampler_rng->manual_seed(request.seed);
+    sd_ctx->sd->set_flow_shift(sd_vid_gen_params->sample_params.flow_shift);
+    sd_ctx->sd->apply_loras(sd_vid_gen_params->loras, sd_vid_gen_params->lora_count);
+
+    SamplePlan plan(sd_ctx, sd_vid_gen_params, request);  // provides sigmas, sample methods and high_noise_sample_steps used below
+    auto latent_inputs_opt = prepare_video_generation_latents(sd_ctx, sd_vid_gen_params, &request);  // request is passed by pointer: the helper may rewrite request->frames
+    if (!latent_inputs_opt.has_value()) {  // an empty optional means latent preparation failed
+        return nullptr;
+    }
+    ImageGenerationLatents latents = std::move(*latent_inputs_opt);
+    ImageGenerationEmbeds embeds   = prepare_video_generation_embeds(sd_ctx,
+                                                                     sd_vid_gen_params,
+                                                                     request,
+                                                                     latents);
+    LOG_INFO("generate_video %dx%dx%d",
+             request.width,
+             request.height,
+             request.frames);  // logged after latent preparation, so it reflects any frame-count adjustment made there
+
+    int64_t latent_start = ggml_time_ms();  // start of the "generating latent video" interval
+    int W                = request.width / request.vae_scale_factor;  // W, H and T are used only by the debug logs below
+    int H                = request.height / request.vae_scale_factor;
+    int T                = static_cast<int>(latents.init_latent.shape()[2]);
+
+    sd::Tensor<float> x_t   = latents.init_latent;
+    sd::Tensor<float> noise = sd::Tensor<float>::randn_like(x_t, sd_ctx->sd->rng);  // random tensor shaped like x_t, drawn from the context RNG
+
+    if (plan.high_noise_sample_steps > 0) {  // optional first stage that runs on the high-noise diffusion model
+        LOG_DEBUG("sample(high noise) %dx%dx%d", W, H, T);
+
+        int64_t sampling_start = ggml_time_ms();
+        std::vector<float> high_noise_sigmas(plan.sigmas.begin(), plan.sigmas.begin() + plan.high_noise_sample_steps + 1);  // first high_noise_sample_steps + 1 sigmas
+        plan.sigmas = std::vector<float>(plan.sigmas.begin() + plan.high_noise_sample_steps, plan.sigmas.end());  // the rest; the boundary sigma appears in both schedules
+
+        sd::Tensor<float> x_t_sampled = sd_ctx->sd->sample(sd_ctx->sd->high_noise_diffusion_model,
+                                                           false,
+                                                           x_t,
+                                                           std::move(noise),  // noise is consumed by this stage and reset to empty afterwards
+                                                           embeds.cond,
+                                                           request.use_high_noise_uncond ? embeds.uncond : SDCondition(),  // default SDCondition when the high-noise uncond is disabled
+                                                           embeds.img_cond,
+                                                           embeds.id_cond,
+                                                           sd::Tensor<float>(),
+                                                           0.f,
+                                                           request.high_noise_guidance,
+                                                           sd_vid_gen_params->high_noise_sample_params.eta,
+                                                           request.shifted_timestep,  // NOTE(review): the code this replaces passed high_noise_sample_params.shifted_timestep here — confirm request.shifted_timestep is the intended source
+                                                           plan.high_noise_sample_method,
+                                                           high_noise_sigmas,
+                                                           -1,
+                                                           std::vector<sd::Tensor<float>>{},
+                                                           false,
+                                                           latents.denoise_mask,
+                                                           latents.vace_context,
+                                                           request.vace_strength,
+                                                           request.cache_params);
+        int64_t sampling_end          = ggml_time_ms();
+        if (x_t_sampled.empty()) {  // an empty result is treated as a sampling failure
+            LOG_ERROR("sampling(high noise) failed after %.2fs", (sampling_end - sampling_start) * 1.0f / 1000);
+            if (sd_ctx->sd->free_params_immediately) {
+                sd_ctx->sd->high_noise_diffusion_model->free_params_buffer();  // the failure path also releases the high-noise model params
+            }
+            return nullptr;
+        }
+
+        x_t   = std::move(x_t_sampled);  // the high-noise output becomes the starting latent for the main stage
+        noise = {};  // the main stage receives an empty noise tensor after a high-noise stage
+        LOG_INFO("sampling(high noise) completed, taking %.2fs", (sampling_end - sampling_start) * 1.0f / 1000);
+        if (sd_ctx->sd->free_params_immediately) {
+            sd_ctx->sd->high_noise_diffusion_model->free_params_buffer();
+        }
+    }
+
+    LOG_DEBUG("sample %dx%dx%d", W, H, T);
+    int64_t sampling_start         = ggml_time_ms();
+    sd::Tensor<float> final_latent = sd_ctx->sd->sample(sd_ctx->sd->diffusion_model,  // main stage on the regular diffusion model
+                                                        true,
+                                                        x_t,
+                                                        std::move(noise),  // fresh noise when no high-noise stage ran, empty otherwise
+                                                        embeds.cond,
+                                                        request.use_uncond ? embeds.uncond : SDCondition(),  // default SDCondition when uncond is disabled
+                                                        embeds.img_cond,
+                                                        embeds.id_cond,
+                                                        sd::Tensor<float>(),
+                                                        0.f,
+                                                        sd_vid_gen_params->sample_params.guidance,  // read from the raw params here, whereas the high-noise stage reads request.high_noise_guidance
+                                                        sd_vid_gen_params->sample_params.eta,
+                                                        sd_vid_gen_params->sample_params.shifted_timestep,
+                                                        plan.sample_method,
+                                                        plan.sigmas,  // trimmed above when a high-noise stage ran
+                                                        -1,
+                                                        std::vector<sd::Tensor<float>>{},
+                                                        false,
+                                                        latents.denoise_mask,
+                                                        latents.vace_context,
+                                                        request.vace_strength,
+                                                        request.cache_params);
+
+    int64_t sampling_end = ggml_time_ms();
+    if (sd_ctx->sd->free_params_immediately) {  // runs before the failure check, so params are released on success and on failure
+        sd_ctx->sd->diffusion_model->free_params_buffer();
+    }
+    if (final_latent.empty()) {  // an empty result is treated as a sampling failure
+        LOG_ERROR("sampling failed after %.2fs", (sampling_end - sampling_start) * 1.0f / 1000);
+        return nullptr;
+    }
+    LOG_INFO("sampling completed, taking %.2fs", (sampling_end - sampling_start) * 1.0f / 1000);
+
+    if (latents.ref_image_num > 0) {  // reference-image entries were prepended along dim 2 and must not be decoded
+        final_latent = sd::ops::slice(final_latent, 2, latents.ref_image_num, final_latent.shape()[2]);  // presumably slice(tensor, dim, begin, end): keeps entries from ref_image_num onward — confirm against sd::ops::slice
+    }
+
+    int64_t latent_end = ggml_time_ms();
+    LOG_INFO("generating latent video completed, taking %.2fs", (latent_end - latent_start) * 1.0f / 1000);  // interval covers both sampling stages, not conditioning
+
+    auto result = decode_video_outputs(sd_ctx, final_latent, num_frames_out);
+    if (result == nullptr) {  // decode failed; lora_stat() and the completion log below are skipped
+        return nullptr;
+    }
+
+    sd_ctx->sd->lora_stat();
+
+    int64_t t1 = ggml_time_ms();
+    LOG_INFO("generate_video completed in %.2fs", (t1 - t0) * 1.0f / 1000);  // total wall time since t0, in seconds
+    return result;
+}
diff --git a/src/t5.hpp b/src/t5.hpp
index 5f8c99d..f64d0b6 100644
--- a/src/t5.hpp
+++ b/src/t5.hpp
@@ -1,1038 +1,1036 @@
-#ifndef __T5_HPP__
-#define __T5_HPP__
-
-#include <cfloat>
-#include <limits>
-#include <map>
-#include <memory>
-#include <regex>
-#include <sstream>
-#include <string>
-#include <unordered_map>
-
-#include "darts.h"
-#include "ggml_extend.hpp"
-#include "json.hpp"
-#include "model.h"
-#include "vocab/vocab.h"
-
-// Port from: https://github.com/google/sentencepiece/blob/master/src/unigram_model.h
-// and https://github.com/google/sentencepiece/blob/master/src/unigram_model.h.
-// Original License: https://github.com/google/sentencepiece/blob/master/LICENSE
-//
-// Since tokenization is not the bottleneck in SD, performance was not a major consideration
-// during the migration.
-class MetaspacePreTokenizer {
-private:
-    std::string replacement;
-    bool add_prefix_space;
-
-public:
-    MetaspacePreTokenizer(const std::string replacement = " ", bool add_prefix_space = true)
-        : replacement(replacement), add_prefix_space(add_prefix_space) {}
-
-    std::string tokenize(const std::string& input) const {
-        std::string tokens;
-        std::stringstream ss(input);
-
-        if (add_prefix_space) {
-            tokens += replacement;
-        }
-
-        std::string token;
-        bool firstToken = true;
-        while (std::getline(ss, token, ' ')) {
-            if (!firstToken)
-                tokens += replacement + token;
-            else
-                tokens += token;
-
-            firstToken = false;
-        }
-
-        return tokens;
-    }
-};
-
-using EncodeResult = std::vector<std::pair<std::string, int>>;
-class T5UniGramTokenizer {
-public:
-    enum Status {
-        OK,
-        NO_PIECES_LOADED,
-        NO_ENTRY_FOUND,
-        BUILD_DOUBLE_ARRAY_FAILED,
-        PIECE_ALREADY_DEFINED,
-        INVLIAD_JSON
-    };
-
-protected:
-    MetaspacePreTokenizer pre_tokenizer;
-
-    // all <piece, score> pairs
-    std::vector<std::pair<std::string, float>> piece_score_pairs;
-
-    float min_score_ = 0.0;
-    float max_score_ = 0.0;
-    std::unique_ptr<Darts::DoubleArray> trie_;
-
-    // Maximum size of the return value of Trie, which corresponds
-    // to the maximum size of shared common prefix in the sentence pieces.
-    int trie_results_size_;
-    // unknown id.
-    int unk_id_            = 2;
-    std::string eos_token_ = "</s>";
-    int eos_id_            = 1;
-    int pad_id_            = 0;
-    // status.
-    Status status_ = OK;
-
-    float kUnkPenalty = 10.0;
-
-    std::string replacement;
-    bool add_prefix_space = true;
-
-    void InitializePieces(const std::string& json_str) {
-        nlohmann::json data;
-
-        try {
-            data = nlohmann::json::parse(json_str);
-        } catch (const nlohmann::json::parse_error&) {
-            status_ = INVLIAD_JSON;
-            return;
-        }
-        if (!data.contains("model")) {
-            status_ = INVLIAD_JSON;
-            return;
-        }
-        nlohmann::json model = data["model"];
-        if (!model.contains("vocab")) {
-            status_ = INVLIAD_JSON;
-            return;
-        }
-        if (model.contains("unk_id")) {
-            unk_id_ = model["unk_id"];
-        }
-
-        replacement      = data["pre_tokenizer"]["replacement"];
-        add_prefix_space = data["pre_tokenizer"]["add_prefix_space"];
-
-        pre_tokenizer = MetaspacePreTokenizer(replacement, add_prefix_space);
-
-        for (const auto& item : model["vocab"]) {
-            if (item.size() != 2 || !item[0].is_string() || !item[1].is_number_float()) {
-                status_ = INVLIAD_JSON;
-                return;
-            }
-            std::string piece = item[0];
-            if (piece.empty()) {
-                piece = "<empty_token>";
-            }
-            float score = item[1];
-            piece_score_pairs.emplace_back(piece, score);
-        }
-    }
-
-    // Builds a Trie index.
-    void BuildTrie(std::vector<std::pair<std::string, int>>* pieces) {
-        if (status_ != OK)
-            return;
-
-        if (pieces->empty()) {
-            status_ = NO_PIECES_LOADED;
-            return;
-        }
-
-        // sort by sentencepiece since DoubleArray::build()
-        // only accepts sorted strings.
-        sort(pieces->begin(), pieces->end());
-
-        // Makes key/value set for DoubleArrayTrie.
-        std::vector<const char*> key(pieces->size());
-        std::vector<int> value(pieces->size());
-        for (size_t i = 0; i < pieces->size(); ++i) {
-            // LOG_DEBUG("%s %d", (*pieces)[i].first.c_str(), (*pieces)[i].second);
-            key[i]   = (*pieces)[i].first.data();  // sorted piece.
-            value[i] = (*pieces)[i].second;        // vocab_id
-        }
-
-        trie_ = std::unique_ptr<Darts::DoubleArray>(new Darts::DoubleArray());
-        if (trie_->build(key.size(), const_cast<char**>(&key[0]), nullptr,
-                         &value[0]) != 0) {
-            status_ = BUILD_DOUBLE_ARRAY_FAILED;
-            return;
-        }
-
-        // Computes the maximum number of shared prefixes in the trie.
-        const int kMaxTrieResultsSize = 1024;
-        std::vector<Darts::DoubleArray::result_pair_type> results(
-            kMaxTrieResultsSize);
-        trie_results_size_ = 0;
-        for (const auto& p : *pieces) {
-            const size_t num_nodes = trie_->commonPrefixSearch(
-                p.first.data(), results.data(), results.size(), p.first.size());
-            trie_results_size_ = std::max(trie_results_size_, static_cast<int>(num_nodes));
-        }
-
-        if (trie_results_size_ == 0)
-            status_ = NO_ENTRY_FOUND;
-    }
-
-    // Non-virtual (inlined) implementation for faster execution.
-    inline float GetScoreInlined(int id) const {
-        return piece_score_pairs[id].second;
-    }
-
-    inline bool IsUnusedInlined(int id) const {
-        return false;  // TODO
-    }
-
-    inline bool IsUserDefinedInlined(int id) const {
-        return false;  // TODO
-    }
-
-    inline size_t OneCharLen(const char* src) const {
-        return "\1\1\1\1\1\1\1\1\1\1\1\1\2\2\3\4"[(*src & 0xFF) >> 4];
-    }
-
-    // The optimized Viterbi encode.
-    // Main differences from the original function:
-    // 1. Memorizes the best path at each postion so far,
-    // 2. No need to store the Lattice nodes,
-    // 3. Works in utf-8 directly,
-    // 4. Defines a new struct with fewer fields than Lattice,
-    // 5. Does not depend on `class Lattice` nor call `SetSentence()`,
-    // `PopulateNodes()`, or `Viterbi()`. It does everything in one function.
-    // For detailed explanations please see the comments inside the function body.
-    EncodeResult EncodeOptimized(const std::string& normalized) const {
-        // An optimized Viterbi algorithm for unigram language models. Benchmarking
-        // results show that it generates almost identical outputs and achieves 2.1x
-        // speedup on average for 102 languages compared to the original
-        // implementation. It's based on the following three ideas:
-        //
-        // 1. Because it uses the *unigram* model:
-        //     best_score(x1, x2, �? xt) = best_score(x1, x2, �? x{t-1}) + score(xt)
-        // Deciding the best path (and score) can be decoupled into two isolated
-        // terms: (a) the best path ended before the last token `best_score(x1, x2, �?
-        // x{t-1})`, and (b) the last token and its `score(xt)`. The two terms are
-        // not related to each other at all.
-        //
-        // Therefore, we can compute once and store the *best_path ending at
-        // each character position*. In this way, when we know best_path_ends_at[M],
-        // we can reuse it to compute all the best_path_ends_at_[...] where the last
-        // token starts at the same character position M.
-        //
-        // This improves the time complexity from O(n*k*k) to O(n*k) because it
-        // eliminates the extra loop of recomputing the best path ending at the same
-        // position, where n is the input length and k is the maximum number of tokens
-        // that can be recognized starting at each position.
-        //
-        // 2. Again, because it uses the *unigram* model, we don’t need to actually
-        // store the lattice nodes. We still recognize all the tokens and lattice
-        // nodes from the input, but along identifying them, we use and discard them
-        // on the fly. There is no need to actually store them for best path Viterbi
-        // decoding. The only thing we need to store is the best_path ending at
-        // each character position.
-        //
-        // This improvement reduces the things needed to store in memory from O(n*k)
-        // to O(n), where n is the input length and k is the maximum number of tokens
-        // that can be recognized starting at each position.
-        //
-        // It also avoids the need of dynamic-size lattice node pool, because the
-        // number of things to store is fixed as n.
-        //
-        // 3. SentencePiece is designed to work with unicode, taking utf-8 encoding
-        // inputs. In the original implementation, the lattice positions are based on
-        // unicode positions. A mapping from unicode position to the utf-8 position is
-        // maintained to recover the utf-8 string piece.
-        //
-        // We found that it is sufficient and beneficial to directly work with utf-8
-        // positions:
-        //
-        // Firstly, it saves the conversion and mapping between unicode positions and
-        // utf-8 positions.
-        //
-        // Secondly, it reduces the number of fields we need to maintain in the
-        // node/path structure. Specifically, there are 8 fields defined in
-        // `Lattice::Node` used by the original encoder, but here in the optimized
-        // encoder we only need to define 3 fields in `BestPathNode`.
-
-        if (status() != OK || normalized.empty()) {
-            return {};
-        }
-        // Represents the last node of the best path.
-        struct BestPathNode {
-            int id = -1;  // The vocab id. (maybe -1 for UNK)
-            float best_path_score =
-                0;  // The total score of the best path ending at this node.
-            int starts_at =
-                -1;  // The starting position (in utf-8) of this node. The entire best
-                     // path can be constructed by backtracking along this link.
-        };
-        const int size        = static_cast<int>(normalized.size());
-        const float unk_score = min_score() - kUnkPenalty;
-        // The ends are exclusive.
-        std::vector<BestPathNode> best_path_ends_at(size + 1);
-        // Generate lattice on-the-fly (not stored) and update best_path_ends_at.
-        int starts_at = 0;
-        while (starts_at < size) {
-            std::size_t node_pos = 0;
-            std::size_t key_pos  = starts_at;
-            const auto best_path_score_till_here =
-                best_path_ends_at[starts_at].best_path_score;
-            bool has_single_node = false;
-            const int mblen =
-                std::min<int>(static_cast<int>(OneCharLen(normalized.data() + starts_at)),
-                              size - starts_at);
-            while (key_pos < size) {
-                const int ret =
-                    trie_->traverse(normalized.data(), node_pos, key_pos, key_pos + 1);
-                if (ret == -2)
-                    break;
-                if (ret >= 0) {
-                    if (IsUnusedInlined(ret))
-                        continue;
-                    // Update the best path node.
-                    auto& target_node = best_path_ends_at[key_pos];
-                    const auto length = (key_pos - starts_at);
-                    // User defined symbol receives extra bonus to always be selected.
-                    const auto score = IsUserDefinedInlined(ret)
-                                           ? (length * max_score_ - 0.1)
-                                           : GetScoreInlined(ret);
-                    const auto candidate_best_path_score =
-                        score + best_path_score_till_here;
-                    if (target_node.starts_at == -1 ||
-                        candidate_best_path_score > target_node.best_path_score) {
-                        target_node.best_path_score = static_cast<float>(candidate_best_path_score);
-                        target_node.starts_at       = starts_at;
-                        target_node.id              = ret;
-                    }
-                    if (!has_single_node && length == mblen) {
-                        has_single_node = true;
-                    }
-                }
-            }
-            if (!has_single_node) {
-                auto& target_node = best_path_ends_at[starts_at + mblen];
-                const auto candidate_best_path_score =
-                    unk_score + best_path_score_till_here;
-                if (target_node.starts_at == -1 ||
-                    candidate_best_path_score > target_node.best_path_score) {
-                    target_node.best_path_score = candidate_best_path_score;
-                    target_node.starts_at       = starts_at;
-                    target_node.id              = unk_id_;
-                }
-            }
-            // Move by one unicode character.
-            starts_at += mblen;
-        }
-        // Backtrack to identify the best path.
-        EncodeResult results;
-        int ends_at = size;
-        while (ends_at > 0) {
-            const auto& node = best_path_ends_at[ends_at];
-            results.emplace_back(
-                normalized.substr(node.starts_at, ends_at - node.starts_at), node.id);
-            ends_at = node.starts_at;
-        }
-        std::reverse(results.begin(), results.end());
-        return results;
-    }
-
-public:
-    explicit T5UniGramTokenizer(bool is_umt5 = false) {
-        if (is_umt5) {
-            InitializePieces(load_umt5_tokenizer_json());
-        } else {
-            InitializePieces(load_t5_tokenizer_json());
-        }
-
-        min_score_ = FLT_MAX;
-        max_score_ = FLT_MIN;
-
-        std::vector<std::pair<std::string, int>> pieces;
-        for (int i = 0; i < piece_score_pairs.size(); i++) {
-            const auto& sp = piece_score_pairs[i];
-
-            min_score_ = std::min(min_score_, sp.second);
-            max_score_ = std::max(max_score_, sp.second);
-
-            pieces.emplace_back(sp.first, i);
-        }
-
-        BuildTrie(&pieces);
-    }
-    ~T5UniGramTokenizer(){};
-
-    std::string Normalize(const std::string& input) const {
-        // Ref: https://github.com/huggingface/tokenizers/blob/1ff56c0c70b045f0cd82da1af9ac08cd4c7a6f9f/bindings/python/py_src/tokenizers/implementations/sentencepiece_unigram.py#L29
-        // TODO: nmt-nfkc
-        std::string normalized = std::regex_replace(input, std::regex(" {2,}"), " ");
-        return normalized;
-    }
-
-    std::vector<int> Encode(const std::string& input, bool append_eos_if_not_present = true) const {
-        std::string normalized = Normalize(input);
-        normalized             = pre_tokenizer.tokenize(normalized);
-        EncodeResult result    = EncodeOptimized(normalized);
-        if (result.size() > 0 && append_eos_if_not_present) {
-            auto item = result[result.size() - 1];
-            if (item.first != eos_token_) {
-                result.emplace_back(eos_token_, eos_id_);
-            }
-        }
-        std::vector<int> tokens;
-        for (auto item : result) {
-            tokens.push_back(item.second);
-        }
-        return tokens;
-    }
-
-    void pad_tokens(std::vector<int>& tokens,
-                    std::vector<float>& weights,
-                    std::vector<float>* attention_mask,
-                    size_t max_length = 0,
-                    bool padding      = false) {
-        if (max_length > 0 && padding) {
-            size_t orig_token_num = tokens.size() - 1;
-            size_t n              = static_cast<size_t>(std::ceil(orig_token_num * 1.0 / (max_length - 1)));
-            if (n == 0) {
-                n = 1;
-            }
-            size_t length = max_length * n;
-            LOG_DEBUG("token length: %llu", length);
-            std::vector<int> new_tokens;
-            std::vector<float> new_weights;
-            std::vector<float> new_attention_mask;
-            int token_idx = 0;
-            for (int i = 0; i < length; i++) {
-                if (token_idx >= orig_token_num) {
-                    break;
-                }
-                if (attention_mask != nullptr) {
-                    new_attention_mask.push_back(0.0);
-                }
-                if (i % max_length == max_length - 1) {
-                    new_tokens.push_back(eos_id_);
-                    new_weights.push_back(1.0);
-                } else {
-                    new_tokens.push_back(tokens[token_idx]);
-                    new_weights.push_back(weights[token_idx]);
-                    token_idx++;
-                }
-            }
-
-            new_tokens.push_back(eos_id_);
-            new_weights.push_back(1.0);
-            if (attention_mask != nullptr) {
-                new_attention_mask.push_back(0.0);
-            }
-
-            tokens  = new_tokens;
-            weights = new_weights;
-            if (attention_mask != nullptr) {
-                *attention_mask = new_attention_mask;
-            }
-
-            if (padding) {
-                int pad_token_id = pad_id_;
-                tokens.insert(tokens.end(), length - tokens.size(), pad_token_id);
-                weights.insert(weights.end(), length - weights.size(), 1.0);
-                if (attention_mask != nullptr) {
-                    // maybe keep some padding tokens unmasked?
-                    attention_mask->insert(attention_mask->end(), length - attention_mask->size(), -HUGE_VALF);
-                }
-            }
-        }
-    }
-
-    // Returns the minimum score in sentence pieces.
-    // min_score() - 10 is used for the cost of unknown sentence.
-    float min_score() const { return min_score_; }
-
-    // Returns the maximum score in sentence pieces.
-    // max_score() is used for the cost of user defined symbols.
-    float max_score() const { return max_score_; }
-
-    Status status() const { return status_; }
-};
-
-class T5LayerNorm : public UnaryBlock {
-protected:
-    int64_t hidden_size;
-    float eps;
-
-    void init_params(ggml_context* ctx, const String2TensorStorage& tensor_storage_map = {}, const std::string prefix = "") override {
-        enum ggml_type wtype = GGML_TYPE_F32;
-        params["weight"]     = ggml_new_tensor_1d(ctx, wtype, hidden_size);
-    }
-
-public:
-    T5LayerNorm(int64_t hidden_size,
-                float eps = 1e-06f)
-        : hidden_size(hidden_size),
-          eps(eps) {}
-
-    ggml_tensor* forward(GGMLRunnerContext* ctx, ggml_tensor* x) override {
-        ggml_tensor* w = params["weight"];
-        x              = ggml_rms_norm(ctx->ggml_ctx, x, eps);
-        x              = ggml_mul(ctx->ggml_ctx, x, w);
-        return x;
-    }
-};
-
-struct T5DenseActDense : public UnaryBlock {
-public:
-    T5DenseActDense(int64_t model_dim, int64_t ff_dim) {
-        blocks["wi"] = std::shared_ptr<GGMLBlock>(new Linear(model_dim, ff_dim, false));
-        blocks["wo"] = std::shared_ptr<GGMLBlock>(new Linear(ff_dim, model_dim, false));
-    }
-
-    ggml_tensor* forward(GGMLRunnerContext* ctx, ggml_tensor* x) override {
-        // x: [N, n_token, model_dim]
-        auto wi = std::dynamic_pointer_cast<Linear>(blocks["wi"]);
-        auto wo = std::dynamic_pointer_cast<Linear>(blocks["wo"]);
-
-        x = wi->forward(ctx, x);
-        x = ggml_relu_inplace(ctx->ggml_ctx, x);
-        x = wo->forward(ctx, x);
-        return x;
-    }
-};
-
-struct T5DenseGatedActDense : public UnaryBlock {
-public:
-    T5DenseGatedActDense(int64_t model_dim, int64_t ff_dim) {
-        blocks["wi_0"] = std::shared_ptr<GGMLBlock>(new Linear(model_dim, ff_dim, false));
-        blocks["wi_1"] = std::shared_ptr<GGMLBlock>(new Linear(model_dim, ff_dim, false));
-        float scale    = 1.f / 32.f;
-        // The purpose of the scale here is to prevent NaN issues on some backends(CUDA, ...).
-        blocks["wo"] = std::shared_ptr<GGMLBlock>(new Linear(ff_dim, model_dim, false, false, false, scale));
-    }
-
-    ggml_tensor* forward(GGMLRunnerContext* ctx, ggml_tensor* x) override {
-        // x: [N, n_token, model_dim]
-        auto wi_0 = std::dynamic_pointer_cast<Linear>(blocks["wi_0"]);
-        auto wi_1 = std::dynamic_pointer_cast<Linear>(blocks["wi_1"]);
-        auto wo   = std::dynamic_pointer_cast<Linear>(blocks["wo"]);
-
-        auto hidden_gelu   = ggml_ext_gelu(ctx->ggml_ctx, wi_0->forward(ctx, x), true);
-        auto hidden_linear = wi_1->forward(ctx, x);
-        x                  = ggml_mul_inplace(ctx->ggml_ctx, hidden_gelu, hidden_linear);
-        x                  = wo->forward(ctx, x);
-        return x;
-    }
-};
-
-struct T5LayerFF : public UnaryBlock {
-public:
-    T5LayerFF(int64_t model_dim, int64_t ff_dim) {
-        blocks["DenseReluDense"] = std::shared_ptr<GGMLBlock>(new T5DenseGatedActDense(model_dim, ff_dim));
-        blocks["layer_norm"]     = std::shared_ptr<GGMLBlock>(new T5LayerNorm(model_dim));
-    }
-
-    ggml_tensor* forward(GGMLRunnerContext* ctx, ggml_tensor* x) override {
-        // x: [N, n_token, model_dim]
-        auto DenseReluDense = std::dynamic_pointer_cast<T5DenseGatedActDense>(blocks["DenseReluDense"]);
-        auto layer_norm     = std::dynamic_pointer_cast<T5LayerNorm>(blocks["layer_norm"]);
-
-        auto forwarded_states = layer_norm->forward(ctx, x);
-        forwarded_states      = DenseReluDense->forward(ctx, forwarded_states);
-        x                     = ggml_add_inplace(ctx->ggml_ctx, forwarded_states, x);
-        return x;
-    }
-};
-
-class T5Attention : public GGMLBlock {
-protected:
-    int64_t model_dim;
-    int64_t inner_dim;
-    int64_t num_heads;
-    bool using_relative_attention_bias;
-    int64_t relative_attention_num_buckets  = 32;
-    int64_t relative_attention_max_distance = 128;
-
-public:
-    T5Attention(int64_t model_dim,
-                int64_t inner_dim,
-                int64_t num_heads,
-                bool using_relative_attention_bias = false)
-        : model_dim(model_dim),
-          inner_dim(inner_dim),
-          num_heads(num_heads),
-          using_relative_attention_bias(using_relative_attention_bias) {
-        blocks["q"] = std::shared_ptr<GGMLBlock>(new Linear(model_dim, inner_dim, false));
-        blocks["k"] = std::shared_ptr<GGMLBlock>(new Linear(model_dim, inner_dim, false));
-        blocks["v"] = std::shared_ptr<GGMLBlock>(new Linear(model_dim, inner_dim, false));
-        blocks["o"] = std::shared_ptr<GGMLBlock>(new Linear(inner_dim, model_dim, false));
-        if (using_relative_attention_bias) {
-            blocks["relative_attention_bias"] = std::shared_ptr<GGMLBlock>(new Embedding(relative_attention_num_buckets, num_heads));
-        }
-    }
-
-    ggml_tensor* compute_bias(GGMLRunnerContext* ctx,
-                              ggml_tensor* relative_position_bucket) {
-        auto relative_attention_bias = std::dynamic_pointer_cast<Embedding>(blocks["relative_attention_bias"]);
-
-        auto values = relative_attention_bias->forward(ctx, relative_position_bucket);            // shape (query_length, key_length, num_heads)
-        values      = ggml_cont(ctx->ggml_ctx, ggml_permute(ctx->ggml_ctx, values, 2, 0, 1, 3));  // shape (1, num_heads, query_length, key_length)
-        return values;
-    }
-
-    // x: [N, n_token, model_dim]
-    std::pair<ggml_tensor*, ggml_tensor*> forward(GGMLRunnerContext* ctx,
-                                                  ggml_tensor* x,
-                                                  ggml_tensor* past_bias                = nullptr,
-                                                  ggml_tensor* mask                     = nullptr,
-                                                  ggml_tensor* relative_position_bucket = nullptr) {
-        auto q_proj   = std::dynamic_pointer_cast<Linear>(blocks["q"]);
-        auto k_proj   = std::dynamic_pointer_cast<Linear>(blocks["k"]);
-        auto v_proj   = std::dynamic_pointer_cast<Linear>(blocks["v"]);
-        auto out_proj = std::dynamic_pointer_cast<Linear>(blocks["o"]);
-
-        int64_t n_head = num_heads;
-        int64_t d_head = inner_dim / n_head;
-
-        auto q = q_proj->forward(ctx, x);
-        auto k = k_proj->forward(ctx, x);
-        auto v = v_proj->forward(ctx, x);
-
-        if (using_relative_attention_bias && relative_position_bucket != nullptr) {
-            past_bias = compute_bias(ctx, relative_position_bucket);
-        }
-        if (past_bias != nullptr) {
-            if (mask != nullptr) {
-                mask = ggml_repeat(ctx->ggml_ctx, mask, past_bias);
-                mask = ggml_add(ctx->ggml_ctx, mask, past_bias);
-            } else {
-                mask = past_bias;
-            }
-        }
-
-        k = ggml_ext_scale(ctx->ggml_ctx, k, ::sqrtf(static_cast<float>(d_head)), true);
-
-        x = ggml_ext_attention_ext(ctx->ggml_ctx, ctx->backend, q, k, v, num_heads, mask);  // [N, n_token, d_head * n_head]
-
-        x = out_proj->forward(ctx, x);  // [N, n_token, model_dim]
-        return {x, past_bias};
-    }
-};
-
-struct T5LayerSelfAttention : public GGMLBlock {
-public:
-    T5LayerSelfAttention(int64_t model_dim,
-                         int64_t inner_dim,
-                         int64_t ff_dim,
-                         int64_t num_heads,
-                         bool using_relative_attention_bias) {
-        blocks["SelfAttention"] = std::shared_ptr<GGMLBlock>(new T5Attention(model_dim, inner_dim, num_heads, using_relative_attention_bias));
-        blocks["layer_norm"]    = std::shared_ptr<GGMLBlock>(new T5LayerNorm(model_dim));
-    }
-
-    std::pair<ggml_tensor*, ggml_tensor*> forward(GGMLRunnerContext* ctx,
-                                                  ggml_tensor* x,
-                                                  ggml_tensor* past_bias                = nullptr,
-                                                  ggml_tensor* mask                     = nullptr,
-                                                  ggml_tensor* relative_position_bucket = nullptr) {
-        // x: [N, n_token, model_dim]
-        auto SelfAttention = std::dynamic_pointer_cast<T5Attention>(blocks["SelfAttention"]);
-        auto layer_norm    = std::dynamic_pointer_cast<T5LayerNorm>(blocks["layer_norm"]);
-
-        auto normed_hidden_state = layer_norm->forward(ctx, x);
-        auto ret                 = SelfAttention->forward(ctx, normed_hidden_state, past_bias, mask, relative_position_bucket);
-        auto output              = ret.first;
-        past_bias                = ret.second;
-
-        x = ggml_add_inplace(ctx->ggml_ctx, output, x);
-        return {x, past_bias};
-    }
-};
-
-struct T5Block : public GGMLBlock {
-public:
-    T5Block(int64_t model_dim, int64_t inner_dim, int64_t ff_dim, int64_t num_heads, bool using_relative_attention_bias) {
-        blocks["layer.0"] = std::shared_ptr<GGMLBlock>(new T5LayerSelfAttention(model_dim, inner_dim, ff_dim, num_heads, using_relative_attention_bias));
-        blocks["layer.1"] = std::shared_ptr<GGMLBlock>(new T5LayerFF(model_dim, ff_dim));
-    }
-
-    std::pair<ggml_tensor*, ggml_tensor*> forward(GGMLRunnerContext* ctx,
-                                                  ggml_tensor* x,
-                                                  ggml_tensor* past_bias                = nullptr,
-                                                  ggml_tensor* mask                     = nullptr,
-                                                  ggml_tensor* relative_position_bucket = nullptr) {
-        // x: [N, n_token, model_dim]
-        auto layer_0 = std::dynamic_pointer_cast<T5LayerSelfAttention>(blocks["layer.0"]);
-        auto layer_1 = std::dynamic_pointer_cast<T5LayerFF>(blocks["layer.1"]);
-
-        auto ret  = layer_0->forward(ctx, x, past_bias, mask, relative_position_bucket);
-        x         = ret.first;
-        past_bias = ret.second;
-        x         = layer_1->forward(ctx, x);
-        return {x, past_bias};
-    }
-};
-
-struct T5Stack : public GGMLBlock {
-    int64_t num_layers;
-
-public:
-    T5Stack(int64_t num_layers,
-            int64_t model_dim,
-            int64_t inner_dim,
-            int64_t ff_dim,
-            int64_t num_heads,
-            bool relative_attention = true)
-        : num_layers(num_layers) {
-        for (int i = 0; i < num_layers; i++) {
-            blocks["block." + std::to_string(i)] = std::shared_ptr<GGMLBlock>(new T5Block(model_dim, inner_dim, ff_dim, num_heads, (!relative_attention || i == 0)));
-        }
-
-        blocks["final_layer_norm"] = std::shared_ptr<GGMLBlock>(new T5LayerNorm(model_dim));
-    }
-
-    ggml_tensor* forward(GGMLRunnerContext* ctx,
-                         ggml_tensor* x,
-                         ggml_tensor* past_bias                = nullptr,
-                         ggml_tensor* attention_mask           = nullptr,
-                         ggml_tensor* relative_position_bucket = nullptr) {
-        // x: [N, n_token, model_dim]
-        for (int i = 0; i < num_layers; i++) {
-            auto block = std::dynamic_pointer_cast<T5Block>(blocks["block." + std::to_string(i)]);
-
-            auto ret  = block->forward(ctx, x, past_bias, attention_mask, relative_position_bucket);
-            x         = ret.first;
-            past_bias = ret.second;
-        }
-
-        auto final_layer_norm = std::dynamic_pointer_cast<T5LayerNorm>(blocks["final_layer_norm"]);
-
-        x = final_layer_norm->forward(ctx, x);
-        return x;
-    }
-};
-
-struct T5Params {
-    int64_t num_layers      = 24;
-    int64_t model_dim       = 4096;
-    int64_t ff_dim          = 10240;
-    int64_t num_heads       = 64;
-    int64_t vocab_size      = 32128;
-    bool relative_attention = true;
-};
-
-struct T5 : public GGMLBlock {
-    T5Params params;
-
-public:
-    T5() {}
-    T5(T5Params params)
-        : params(params) {
-        blocks["encoder"] = std::shared_ptr<GGMLBlock>(new T5Stack(params.num_layers,
-                                                                   params.model_dim,
-                                                                   params.model_dim,
-                                                                   params.ff_dim,
-                                                                   params.num_heads,
-                                                                   params.relative_attention));
-        blocks["shared"]  = std::shared_ptr<GGMLBlock>(new Embedding(params.vocab_size,
-                                                                     params.model_dim));
-    }
-
-    ggml_tensor* forward(GGMLRunnerContext* ctx,
-                         ggml_tensor* input_ids,
-                         ggml_tensor* past_bias                = nullptr,
-                         ggml_tensor* attention_mask           = nullptr,
-                         ggml_tensor* relative_position_bucket = nullptr) {
-        // input_ids: [N, n_token]
-
-        auto shared  = std::dynamic_pointer_cast<Embedding>(blocks["shared"]);
-        auto encoder = std::dynamic_pointer_cast<T5Stack>(blocks["encoder"]);
-
-        auto x = shared->forward(ctx, input_ids);
-        x      = encoder->forward(ctx, x, past_bias, attention_mask, relative_position_bucket);
-        return x;
-    }
-};
-
-struct T5Runner : public GGMLRunner {
-    T5Params params;
-    T5 model;
-    std::vector<int> relative_position_bucket_vec;
-
-    T5Runner(ggml_backend_t backend,
-             bool offload_params_to_cpu,
-             const String2TensorStorage& tensor_storage_map,
-             const std::string prefix,
-             bool is_umt5 = false)
-        : GGMLRunner(backend, offload_params_to_cpu) {
-        if (is_umt5) {
-            params.vocab_size         = 256384;
-            params.relative_attention = false;
-        }
-        model = T5(params);
-        model.init(params_ctx, tensor_storage_map, prefix);
-    }
-
-    std::string get_desc() override {
-        return "t5";
-    }
-
-    void get_param_tensors(std::map<std::string, ggml_tensor*>& tensors, const std::string prefix) {
-        model.get_param_tensors(tensors, prefix);
-    }
-
-    ggml_tensor* forward(GGMLRunnerContext* ctx,
-                         ggml_tensor* input_ids,
-                         ggml_tensor* relative_position_bucket,
-                         ggml_tensor* attention_mask = nullptr) {
-        size_t N       = input_ids->ne[1];
-        size_t n_token = input_ids->ne[0];
-
-        auto hidden_states = model.forward(ctx, input_ids, nullptr, attention_mask, relative_position_bucket);  // [N, n_token, model_dim]
-        return hidden_states;
-    }
-
-    ggml_cgraph* build_graph(ggml_tensor* input_ids,
-                             ggml_tensor* attention_mask = nullptr) {
-        ggml_cgraph* gf = ggml_new_graph(compute_ctx);
-
-        input_ids      = to_backend(input_ids);
-        attention_mask = to_backend(attention_mask);
-
-        relative_position_bucket_vec = compute_relative_position_bucket(static_cast<int>(input_ids->ne[0]), static_cast<int>(input_ids->ne[0]));
-
-        // for (int i = 0; i < relative_position_bucket_vec.size(); i++) {
-        //     if (i % 77 == 0) {
-        //         printf("\n");
-        //     }
-        //     printf("%d ", relative_position_bucket_vec[i]);
-        // }
-
-        auto relative_position_bucket = ggml_new_tensor_2d(compute_ctx,
-                                                           GGML_TYPE_I32,
-                                                           input_ids->ne[0],
-                                                           input_ids->ne[0]);
-        set_backend_tensor_data(relative_position_bucket, relative_position_bucket_vec.data());
-
-        auto runner_ctx            = get_context();
-        ggml_tensor* hidden_states = forward(&runner_ctx, input_ids, relative_position_bucket, attention_mask);
-
-        ggml_build_forward_expand(gf, hidden_states);
-
-        return gf;
-    }
-
-    bool compute(const int n_threads,
-                 ggml_tensor* input_ids,
-                 ggml_tensor* attention_mask,
-                 ggml_tensor** output,
-                 ggml_context* output_ctx = nullptr) {
-        auto get_graph = [&]() -> ggml_cgraph* {
-            return build_graph(input_ids, attention_mask);
-        };
-        return GGMLRunner::compute(get_graph, n_threads, true, output, output_ctx);
-    }
-
-    static std::vector<int> _relative_position_bucket(const std::vector<int>& relative_position,
-                                                      bool bidirectional = true,
-                                                      int num_buckets    = 32,
-                                                      int max_distance   = 128) {
-        std::vector<int> relative_buckets(relative_position.size(), 0);
-        std::vector<int> abs_relative_position = relative_position;
-
-        if (bidirectional) {
-            num_buckets = num_buckets / 2;
-            for (size_t i = 0; i < relative_position.size(); ++i) {
-                if (relative_position[i] > 0) {
-                    relative_buckets[i] += num_buckets;
-                }
-                abs_relative_position[i] = std::abs(relative_position[i]);
-            }
-        } else {
-            for (size_t i = 0; i < relative_position.size(); ++i) {
-                abs_relative_position[i] = std::max(-relative_position[i], 0);
-            }
-        }
-
-        int max_exact = num_buckets / 2;
-        std::vector<int> relative_position_if_large(relative_position.size(), 0);
-
-        for (size_t i = 0; i < relative_position.size(); ++i) {
-            if (abs_relative_position[i] < max_exact) {
-                relative_buckets[i] += abs_relative_position[i];
-            } else {
-                float log_pos                 = std::log(static_cast<float>(abs_relative_position[i]) / max_exact);
-                float log_base                = std::log(static_cast<float>(max_distance) / max_exact);
-                relative_position_if_large[i] = max_exact + static_cast<int>((log_pos / log_base) * (num_buckets - max_exact));
-                relative_position_if_large[i] = std::min(relative_position_if_large[i], num_buckets - 1);
-                relative_buckets[i] += relative_position_if_large[i];
-            }
-        }
-
-        return relative_buckets;
-    }
-
-    std::vector<int> compute_relative_position_bucket(int query_length,
-                                                      int key_length) {
-        std::vector<int> context_position(query_length);
-        std::vector<int> memory_position(key_length);
-
-        for (int i = 0; i < query_length; ++i) {
-            context_position[i] = i;
-        }
-        for (int i = 0; i < key_length; ++i) {
-            memory_position[i] = i;
-        }
-
-        std::vector<std::vector<int>> relative_position(query_length, std::vector<int>(key_length, 0));
-        for (int i = 0; i < query_length; ++i) {
-            for (int j = 0; j < key_length; ++j) {
-                relative_position[i][j] = memory_position[j] - context_position[i];
-            }
-        }
-
-        std::vector<int> relative_position_bucket;
-        for (int i = 0; i < query_length; ++i) {
-            std::vector<int> result = _relative_position_bucket(relative_position[i], true);
-            relative_position_bucket.insert(relative_position_bucket.end(), result.begin(), result.end());
-        }
-
-        return relative_position_bucket;
-    }
-};
-
-struct T5Embedder {
-    T5UniGramTokenizer tokenizer;
-    T5Runner model;
-
-    T5Embedder(ggml_backend_t backend,
-               bool offload_params_to_cpu,
-               const String2TensorStorage& tensor_storage_map = {},
-               const std::string prefix                       = "",
-               bool is_umt5                                   = false)
-        : model(backend, offload_params_to_cpu, tensor_storage_map, prefix, is_umt5), tokenizer(is_umt5) {
-    }
-
-    void get_param_tensors(std::map<std::string, ggml_tensor*>& tensors, const std::string prefix) {
-        model.get_param_tensors(tensors, prefix);
-    }
-
-    void alloc_params_buffer() {
-        model.alloc_params_buffer();
-    }
-
-    std::tuple<std::vector<int>, std::vector<float>, std::vector<float>> tokenize(std::string text,
-                                                                                  size_t max_length = 0,
-                                                                                  bool padding      = false) {
-        auto parsed_attention = parse_prompt_attention(text);
-
-        {
-            std::stringstream ss;
-            ss << "[";
-            for (const auto& item : parsed_attention) {
-                ss << "['" << item.first << "', " << item.second << "], ";
-            }
-            ss << "]";
-            LOG_DEBUG("parse '%s' to %s", text.c_str(), ss.str().c_str());
-        }
-
-        std::vector<int> tokens;
-        std::vector<float> weights;
-        for (const auto& item : parsed_attention) {
-            const std::string& curr_text = item.first;
-            float curr_weight            = item.second;
-            std::vector<int> curr_tokens = tokenizer.Encode(curr_text, false);
-            tokens.insert(tokens.end(), curr_tokens.begin(), curr_tokens.end());
-            weights.insert(weights.end(), curr_tokens.size(), curr_weight);
-        }
-
-        int EOS_TOKEN_ID = 1;
-        tokens.push_back(EOS_TOKEN_ID);
-        weights.push_back(1.0);
-
-        std::vector<float> attention_mask;
-
-        tokenizer.pad_tokens(tokens, weights, &attention_mask, max_length, padding);
-
-        // for (int i = 0; i < tokens.size(); i++) {
-        //     std::cout << tokens[i] << ":" << weights[i] << ", ";
-        // }
-        // std::cout << std::endl;
-
-        return {tokens, weights, attention_mask};
-    }
-
-    void test() {
-        ggml_init_params params;
-        params.mem_size   = static_cast<size_t>(10 * 1024 * 1024);  // 10 MB
-        params.mem_buffer = nullptr;
-        params.no_alloc   = false;
-
-        ggml_context* work_ctx = ggml_init(params);
-        GGML_ASSERT(work_ctx != nullptr);
-
-        {
-            std::string text("a lovely cat");
-            // std::string text("一只可爱的�?); // umt5 chinease test
-            auto tokens_and_weights     = tokenize(text, 512, true);
-            std::vector<int>& tokens    = std::get<0>(tokens_and_weights);
-            std::vector<float>& weights = std::get<1>(tokens_and_weights);
-            std::vector<float>& masks   = std::get<2>(tokens_and_weights);
-            for (auto token : tokens) {
-                printf("%d ", token);
-            }
-            printf("\n");
-            auto input_ids      = vector_to_ggml_tensor_i32(work_ctx, tokens);
-            auto attention_mask = vector_to_ggml_tensor(work_ctx, masks);
-            ggml_tensor* out    = nullptr;
-
-            int64_t t0 = ggml_time_ms();
-            model.compute(8, input_ids, attention_mask, &out, work_ctx);
-            int64_t t1 = ggml_time_ms();
-
-            print_ggml_tensor(out);
-            LOG_DEBUG("t5 test done in %lldms", t1 - t0);
-        }
-    }
-
-    static void load_from_file_and_test(const std::string& file_path) {
-        // cpu f16: pass
-        // cpu f32: pass
-        // cuda f16: pass
-        // cuda f32: pass
-        // cuda q8_0: pass
-        // ggml_backend_t backend = ggml_backend_cuda_init(0);
-        ggml_backend_t backend    = ggml_backend_cpu_init();
-        ggml_type model_data_type = GGML_TYPE_F16;
-
-        ModelLoader model_loader;
-        if (!model_loader.init_from_file_and_convert_name(file_path)) {
-            LOG_ERROR("init model loader from file failed: '%s'", file_path.c_str());
-            return;
-        }
-
-        auto& tensor_storage_map = model_loader.get_tensor_storage_map();
-        for (auto& [name, tensor_storage] : tensor_storage_map) {
-            if (ends_with(name, "weight")) {
-                tensor_storage.expected_type = model_data_type;
-            }
-        }
-
-        std::shared_ptr<T5Embedder> t5 = std::make_shared<T5Embedder>(backend, false, tensor_storage_map, "", true);
-
-        t5->alloc_params_buffer();
-        std::map<std::string, ggml_tensor*> tensors;
-        t5->get_param_tensors(tensors, "");
-
-        bool success = model_loader.load_tensors(tensors);
-
-        if (!success) {
-            LOG_ERROR("load tensors from model loader failed");
-            return;
-        }
-
-        LOG_INFO("t5 model loaded");
-        t5->test();
-    }
-};
-
-#endif  // __T5_HPP__
\ No newline at end of file
+﻿#ifndef __T5_HPP__
+#define __T5_HPP__
+
+#include <cfloat>
+#include <limits>
+#include <map>
+#include <memory>
+#include <regex>
+#include <sstream>
+#include <string>
+#include <unordered_map>
+
+#include "darts.h"
+#include "ggml_extend.hpp"
+#include "json.hpp"
+#include "model.h"
+#include "vocab/vocab.h"
+
+// Port from: https://github.com/google/sentencepiece/blob/master/src/unigram_model.h
+// and https://github.com/google/sentencepiece/blob/master/src/unigram_model.cc.
+// Original License: https://github.com/google/sentencepiece/blob/master/LICENSE
+//
+// Since tokenization is not the bottleneck in SD, performance was not a major consideration
+// during the migration.
+// Metaspace-style pre-tokenizer: swaps the single spaces between words for
+// `replacement` and can prepend one `replacement` to the whole string.
+class MetaspacePreTokenizer {
+private:
+    std::string replacement;
+    bool add_prefix_space;
+
+public:
+    MetaspacePreTokenizer(const std::string replacement = " ", bool add_prefix_space = true)
+        : replacement(replacement), add_prefix_space(add_prefix_space) {}
+
+    // Splits `input` on ' ' and re-joins the pieces with `replacement`.
+    // Empty pieces are kept (e.g. from consecutive spaces), except that a single
+    // trailing space yields no extra piece because std::getline stops at EOF.
+    std::string tokenize(const std::string& input) const {
+        std::string joined = add_prefix_space ? replacement : std::string();
+        std::stringstream words(input);
+
+        std::string word;
+        std::size_t index = 0;
+        for (; std::getline(words, word, ' '); ++index) {
+            // Every piece after the first is preceded by the replacement marker.
+            if (index != 0) {
+                joined += replacement;
+            }
+            joined += word;
+        }
+
+        return joined;
+    }
+};
+
+using EncodeResult = std::vector<std::pair<std::string, int>>;  // (piece text, vocab id) pairs in input order
+// Unigram (SentencePiece-style) tokenizer for T5 / UMT5. The vocabulary comes from
+// load_t5_tokenizer_json() / load_umt5_tokenizer_json(); check status() after construction.
+class T5UniGramTokenizer {
+public:
+    enum Status {
+        OK,
+        NO_PIECES_LOADED,
+        NO_ENTRY_FOUND,
+        BUILD_DOUBLE_ARRAY_FAILED,
+        PIECE_ALREADY_DEFINED,
+        INVLIAD_JSON
+    };
+
+protected:
+    MetaspacePreTokenizer pre_tokenizer;
+
+    // all <piece, score> pairs; a pair's index is used as its vocab id
+    std::vector<std::pair<std::string, float>> piece_score_pairs;
+
+    float min_score_ = 0.0;
+    float max_score_ = 0.0;
+    std::unique_ptr<Darts::DoubleArray> trie_;
+
+    // Maximum size of the return value of Trie, which corresponds
+    // to the maximum size of shared common prefix in the sentence pieces.
+    int trie_results_size_ = 0;
+    // unknown id.
+    int unk_id_            = 2;
+    std::string eos_token_ = "</s>";
+    int eos_id_            = 1;
+    int pad_id_            = 0;
+    // status.
+    Status status_ = OK;
+
+    float kUnkPenalty = 10.0;  // unknown pieces are scored min_score() - kUnkPenalty
+    // Metaspace pre-tokenizer settings read from the tokenizer JSON.
+    std::string replacement;
+    bool add_prefix_space = true;
+
+    // Parses the tokenizer JSON: optional model.unk_id, the Metaspace pre-tokenizer
+    // settings and every [piece, score] entry of model.vocab. A parse error, a missing
+    // model/vocab section or a malformed vocab entry sets status_ to INVLIAD_JSON.
+    void InitializePieces(const std::string& json_str) {
+        nlohmann::json data;
+
+        try {
+            data = nlohmann::json::parse(json_str);
+        } catch (const nlohmann::json::parse_error&) {
+            status_ = INVLIAD_JSON;
+            return;
+        }
+        if (!data.contains("model") || !data["model"].contains("vocab")) {
+            status_ = INVLIAD_JSON;
+            return;
+        }
+
+        replacement      = data["pre_tokenizer"]["replacement"];
+        add_prefix_space = data["pre_tokenizer"]["add_prefix_space"];
+        pre_tokenizer    = MetaspacePreTokenizer(replacement, add_prefix_space);
+
+        // Bind by reference: copying "model" would duplicate the whole vocabulary.
+        const nlohmann::json& model = data["model"];
+        if (model.contains("unk_id")) {
+            unk_id_ = model["unk_id"];
+        }
+
+        for (const auto& item : model["vocab"]) {
+            if (item.size() != 2 || !item[0].is_string() || !item[1].is_number_float()) {
+                status_ = INVLIAD_JSON;
+                return;
+            }
+            std::string piece = item[0];
+            if (piece.empty()) {
+                piece = "<empty_token>";
+            }
+            float score = item[1];
+            piece_score_pairs.emplace_back(piece, score);
+        }
+    }
+
+    // Builds a Trie index over `pieces` (sorted in place) mapping piece -> vocab id,
+    // then records the largest number of prefix matches any piece produces.
+    void BuildTrie(std::vector<std::pair<std::string, int>>* pieces) {
+        if (status_ != OK)
+            return;
+
+        if (pieces->empty()) {
+            status_ = NO_PIECES_LOADED;
+            return;
+        }
+
+        // sort by sentencepiece since DoubleArray::build()
+        // only accepts sorted strings.
+        std::sort(pieces->begin(), pieces->end());
+
+        // Makes key/value set for DoubleArrayTrie.
+        std::vector<const char*> key(pieces->size());
+        std::vector<int> value(pieces->size());
+        for (size_t i = 0; i < pieces->size(); ++i) {
+            key[i]   = (*pieces)[i].first.data();  // sorted piece.
+            value[i] = (*pieces)[i].second;        // vocab_id
+        }
+
+        trie_ = std::unique_ptr<Darts::DoubleArray>(new Darts::DoubleArray());
+        if (trie_->build(key.size(), const_cast<char**>(&key[0]), nullptr,
+                         &value[0]) != 0) {
+            status_ = BUILD_DOUBLE_ARRAY_FAILED;
+            return;
+        }
+
+        // Computes the maximum number of shared prefixes in the trie.
+        const int kMaxTrieResultsSize = 1024;
+        std::vector<Darts::DoubleArray::result_pair_type> results(
+            kMaxTrieResultsSize);
+        trie_results_size_ = 0;
+        for (const auto& p : *pieces) {
+            const size_t num_nodes = trie_->commonPrefixSearch(
+                p.first.data(), results.data(), results.size(), p.first.size());
+            trie_results_size_ = std::max(trie_results_size_, static_cast<int>(num_nodes));
+        }
+
+        if (trie_results_size_ == 0)
+            status_ = NO_ENTRY_FOUND;
+    }
+
+    // Non-virtual (inlined) implementation for faster execution.
+    inline float GetScoreInlined(int id) const {
+        return piece_score_pairs[id].second;
+    }
+
+    inline bool IsUnusedInlined(int id) const {
+        return false;  // TODO
+    }
+
+    inline bool IsUserDefinedInlined(int id) const {
+        return false;  // TODO
+    }
+    // Byte length of a UTF-8 character, looked up from the high nibble of its first byte.
+    inline size_t OneCharLen(const char* src) const {
+        return "\1\1\1\1\1\1\1\1\1\1\1\1\2\2\3\4"[(*src & 0xFF) >> 4];
+    }
+
+    // The optimized Viterbi encode.
+    // Main differences from the original function:
+    // 1. Memorizes the best path at each position so far,
+    // 2. No need to store the Lattice nodes,
+    // 3. Works in utf-8 directly,
+    // 4. Defines a new struct with fewer fields than Lattice,
+    // 5. Does not depend on `class Lattice` nor call `SetSentence()`,
+    // `PopulateNodes()`, or `Viterbi()`. It does everything in one function.
+    // For detailed explanations please see the comments inside the function body.
+    EncodeResult EncodeOptimized(const std::string& normalized) const {
+        // An optimized Viterbi algorithm for unigram language models. Benchmarking
+        // results show that it generates almost identical outputs and achieves 2.1x
+        // speedup on average for 102 languages compared to the original
+        // implementation. It's based on the following three ideas:
+        //
+        // 1. Because it uses the *unigram* model:
+        //     best_score(x1, x2, ... xt) = best_score(x1, x2, ... x{t-1}) + score(xt)
+        // Deciding the best path (and score) can be decoupled into two isolated
+        // terms: (a) the best path ended before the last token `best_score(x1, x2, ...,
+        // x{t-1})`, and (b) the last token and its `score(xt)`. The two terms are
+        // not related to each other at all.
+        //
+        // Therefore, we can compute once and store the *best_path ending at
+        // each character position*. In this way, when we know best_path_ends_at[M],
+        // we can reuse it to compute all the best_path_ends_at_[...] where the last
+        // token starts at the same character position M.
+        //
+        // This improves the time complexity from O(n*k*k) to O(n*k) because it
+        // eliminates the extra loop of recomputing the best path ending at the same
+        // position, where n is the input length and k is the maximum number of tokens
+        // that can be recognized starting at each position.
+        //
+        // 2. Again, because it uses the *unigram* model, we don't need to actually
+        // store the lattice nodes. We still recognize all the tokens and lattice
+        // nodes from the input, but along identifying them, we use and discard them
+        // on the fly. There is no need to actually store them for best path Viterbi
+        // decoding. The only thing we need to store is the best_path ending at
+        // each character position.
+        //
+        // This improvement reduces the things needed to store in memory from O(n*k)
+        // to O(n), where n is the input length and k is the maximum number of tokens
+        // that can be recognized starting at each position.
+        //
+        // It also avoids the need of dynamic-size lattice node pool, because the
+        // number of things to store is fixed as n.
+        //
+        // 3. SentencePiece is designed to work with unicode, taking utf-8 encoding
+        // inputs. In the original implementation, the lattice positions are based on
+        // unicode positions. A mapping from unicode position to the utf-8 position is
+        // maintained to recover the utf-8 string piece.
+        //
+        // We found that it is sufficient and beneficial to directly work with utf-8
+        // positions:
+        //
+        // Firstly, it saves the conversion and mapping between unicode positions and
+        // utf-8 positions.
+        //
+        // Secondly, it reduces the number of fields we need to maintain in the
+        // node/path structure. Specifically, there are 8 fields defined in
+        // `Lattice::Node` used by the original encoder, but here in the optimized
+        // encoder we only need to define 3 fields in `BestPathNode`.
+
+        if (status() != OK || normalized.empty()) {
+            return {};
+        }
+        // Represents the last node of the best path.
+        struct BestPathNode {
+            int id = -1;  // The vocab id. (maybe -1 for UNK)
+            float best_path_score =
+                0;  // The total score of the best path ending at this node.
+            int starts_at =
+                -1;  // The starting position (in utf-8) of this node. The entire best
+                     // path can be constructed by backtracking along this link.
+        };
+        const int size        = static_cast<int>(normalized.size());
+        const float unk_score = min_score() - kUnkPenalty;
+        // The ends are exclusive.
+        std::vector<BestPathNode> best_path_ends_at(size + 1);
+        // Generate lattice on-the-fly (not stored) and update best_path_ends_at.
+        int starts_at = 0;
+        while (starts_at < size) {
+            std::size_t node_pos = 0;
+            std::size_t key_pos  = starts_at;
+            const auto best_path_score_till_here =
+                best_path_ends_at[starts_at].best_path_score;
+            bool has_single_node = false;
+            const int mblen =
+                std::min<int>(static_cast<int>(OneCharLen(normalized.data() + starts_at)),
+                              size - starts_at);
+            while (key_pos < size) {
+                const int ret =
+                    trie_->traverse(normalized.data(), node_pos, key_pos, key_pos + 1);
+                if (ret == -2)
+                    break;
+                if (ret >= 0) {
+                    if (IsUnusedInlined(ret))
+                        continue;
+                    // Update the best path node.
+                    auto& target_node = best_path_ends_at[key_pos];
+                    const auto length = (key_pos - starts_at);
+                    // User defined symbol receives extra bonus to always be selected.
+                    const auto score = IsUserDefinedInlined(ret)
+                                           ? (length * max_score_ - 0.1)
+                                           : GetScoreInlined(ret);
+                    const auto candidate_best_path_score =
+                        score + best_path_score_till_here;
+                    if (target_node.starts_at == -1 ||
+                        candidate_best_path_score > target_node.best_path_score) {
+                        target_node.best_path_score = static_cast<float>(candidate_best_path_score);
+                        target_node.starts_at       = starts_at;
+                        target_node.id              = ret;
+                    }
+                    if (!has_single_node && length == mblen) {
+                        has_single_node = true;
+                    }
+                }
+            }
+            if (!has_single_node) {
+                auto& target_node = best_path_ends_at[starts_at + mblen];
+                const auto candidate_best_path_score =
+                    unk_score + best_path_score_till_here;
+                if (target_node.starts_at == -1 ||
+                    candidate_best_path_score > target_node.best_path_score) {
+                    target_node.best_path_score = candidate_best_path_score;
+                    target_node.starts_at       = starts_at;
+                    target_node.id              = unk_id_;
+                }
+            }
+            // Move by one unicode character.
+            starts_at += mblen;
+        }
+        // Backtrack to identify the best path.
+        EncodeResult results;
+        int ends_at = size;
+        while (ends_at > 0) {
+            const auto& node = best_path_ends_at[ends_at];
+            results.emplace_back(
+                normalized.substr(node.starts_at, ends_at - node.starts_at), node.id);
+            ends_at = node.starts_at;
+        }
+        std::reverse(results.begin(), results.end());
+        return results;
+    }
+
+public:
+    // Loads the UMT5 or T5 vocabulary, records the score range and builds the trie.
+    explicit T5UniGramTokenizer(bool is_umt5 = false) {
+        if (is_umt5) {
+            InitializePieces(load_umt5_tokenizer_json());
+        } else {
+            InitializePieces(load_t5_tokenizer_json());
+        }
+
+        // Seed the maximum with -FLT_MAX, not FLT_MIN: FLT_MIN is the smallest positive
+        // float and would beat every zero or negative piece score in std::max().
+        min_score_ = FLT_MAX;
+        max_score_ = -FLT_MAX;
+
+        std::vector<std::pair<std::string, int>> pieces;
+        for (size_t i = 0; i < piece_score_pairs.size(); i++) {
+            const auto& sp = piece_score_pairs[i];
+            min_score_     = std::min(min_score_, sp.second);
+            max_score_     = std::max(max_score_, sp.second);
+            pieces.emplace_back(sp.first, static_cast<int>(i));
+        }
+        BuildTrie(&pieces);
+    }
+    ~T5UniGramTokenizer(){};
+
+    std::string Normalize(const std::string& input) const {
+        // Ref: https://github.com/huggingface/tokenizers/blob/1ff56c0c70b045f0cd82da1af9ac08cd4c7a6f9f/bindings/python/py_src/tokenizers/implementations/sentencepiece_unigram.py#L29
+        // TODO: nmt-nfkc
+        static const std::regex multi_space(" {2,}");  // compiled once; collapses runs of spaces
+        return std::regex_replace(input, multi_space, " ");
+    }
+
+    // Normalizes and pre-tokenizes `input`, runs the Viterbi encoder and returns the
+    // vocab ids; a non-empty result gets eos_id_ appended unless it already ends with EOS.
+    std::vector<int> Encode(const std::string& input, bool append_eos_if_not_present = true) const {
+        std::string normalized = pre_tokenizer.tokenize(Normalize(input));
+        EncodeResult result    = EncodeOptimized(normalized);
+        if (!result.empty() && append_eos_if_not_present && result.back().first != eos_token_) {
+            result.emplace_back(eos_token_, eos_id_);
+        }
+        std::vector<int> tokens;
+        for (const auto& item : result) {
+            tokens.push_back(item.second);
+        }
+        return tokens;
+    }
+
+    // Re-chunks `tokens`/`weights` (the last entry is assumed to be a trailing EOS and is
+    // dropped) into blocks of max_length that end in eos_id_, then pads with pad_id_. The
+    // mask, when requested, is 0 for kept positions and -HUGE_VALF for padding.
+    void pad_tokens(std::vector<int>& tokens,
+                    std::vector<float>& weights,
+                    std::vector<float>* attention_mask,
+                    size_t max_length = 0,
+                    bool padding      = false) {
+        if (max_length > 1 && padding) {  // max_length == 1 would divide by zero below
+            size_t orig_token_num = tokens.empty() ? 0 : tokens.size() - 1;  // no wrap-around on empty input
+            size_t n              = static_cast<size_t>(std::ceil(orig_token_num * 1.0 / (max_length - 1)));
+            if (n == 0) {
+                n = 1;
+            }
+            size_t length = max_length * n;
+            LOG_DEBUG("token length: %llu", static_cast<unsigned long long>(length));
+            std::vector<int> new_tokens;
+            std::vector<float> new_weights;
+            std::vector<float> new_attention_mask;
+            size_t token_idx = 0;
+            for (size_t i = 0; i < length; i++) {
+                if (token_idx >= orig_token_num) {
+                    break;
+                }
+                if (attention_mask != nullptr) {
+                    new_attention_mask.push_back(0.0);
+                }
+                if (i % max_length == max_length - 1) {
+                    new_tokens.push_back(eos_id_);
+                    new_weights.push_back(1.0);
+                } else {
+                    new_tokens.push_back(tokens[token_idx]);
+                    new_weights.push_back(weights[token_idx]);
+                    token_idx++;
+                }
+            }
+
+            new_tokens.push_back(eos_id_);
+            new_weights.push_back(1.0);
+            if (attention_mask != nullptr) {
+                new_attention_mask.push_back(0.0);
+            }
+
+            tokens  = new_tokens;
+            weights = new_weights;
+            if (attention_mask != nullptr) {
+                *attention_mask = new_attention_mask;
+            }
+
+            tokens.insert(tokens.end(), length - tokens.size(), pad_id_);
+            weights.insert(weights.end(), length - weights.size(), 1.0);
+            if (attention_mask != nullptr) {
+                // maybe keep some padding tokens unmasked?
+                attention_mask->insert(attention_mask->end(), length - attention_mask->size(), -HUGE_VALF);
+            }
+        }
+    }
+
+    // Returns the minimum score in sentence pieces.
+    // min_score() - 10 is used for the cost of unknown sentence.
+    float min_score() const { return min_score_; }
+
+    // Returns the maximum score in sentence pieces.
+    // max_score() is used for the cost of user defined symbols.
+    float max_score() const { return max_score_; }
+
+    Status status() const { return status_; }
+};
+
+// T5-style layer norm: RMS normalization followed by a multiplication with "weight".
+class T5LayerNorm : public UnaryBlock {
+protected:
+    int64_t hidden_size;
+    float eps;
+
+    // Registers the F32 "weight" vector of length hidden_size.
+    void init_params(ggml_context* ctx, const String2TensorStorage& tensor_storage_map = {}, const std::string prefix = "") override {
+        params["weight"] = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, hidden_size);
+    }
+
+public:
+    T5LayerNorm(int64_t hidden_size,
+                float eps = 1e-06f)
+        : hidden_size(hidden_size),
+          eps(eps) {}
+
+    ggml_tensor* forward(GGMLRunnerContext* ctx, ggml_tensor* x) override {
+        ggml_tensor* normed = ggml_rms_norm(ctx->ggml_ctx, x, eps);
+        ggml_tensor* scale  = params["weight"];
+        return ggml_mul(ctx->ggml_ctx, normed, scale);
+    }
+};
+
+// Feed-forward block: wi (model_dim -> ff_dim), in-place ReLU, wo (ff_dim -> model_dim).
+struct T5DenseActDense : public UnaryBlock {
+public:
+    T5DenseActDense(int64_t model_dim, int64_t ff_dim) {
+        blocks["wi"] = std::make_shared<Linear>(model_dim, ff_dim, false);
+        blocks["wo"] = std::make_shared<Linear>(ff_dim, model_dim, false);
+    }
+
+    ggml_tensor* forward(GGMLRunnerContext* ctx, ggml_tensor* x) override {
+        // x: [N, n_token, model_dim]
+        auto up_proj   = std::dynamic_pointer_cast<Linear>(blocks["wi"]);
+        auto down_proj = std::dynamic_pointer_cast<Linear>(blocks["wo"]);
+
+        auto hidden = up_proj->forward(ctx, x);
+        hidden      = ggml_relu_inplace(ctx->ggml_ctx, hidden);
+        return down_proj->forward(ctx, hidden);
+    }
+};
+
+// Gated feed-forward block: gelu(wi_0(x)) * wi_1(x), projected back by wo.
+struct T5DenseGatedActDense : public UnaryBlock {
+public:
+    T5DenseGatedActDense(int64_t model_dim, int64_t ff_dim) {
+        blocks["wi_0"] = std::make_shared<Linear>(model_dim, ff_dim, false);
+        blocks["wi_1"] = std::make_shared<Linear>(model_dim, ff_dim, false);
+        // The purpose of the scale here is to prevent NaN issues on some backends(CUDA, ...).
+        const float wo_scale = 1.f / 32.f;
+        blocks["wo"]         = std::make_shared<Linear>(ff_dim, model_dim, false, false, false, wo_scale);
+    }
+
+    ggml_tensor* forward(GGMLRunnerContext* ctx, ggml_tensor* x) override {
+        // x: [N, n_token, model_dim]
+        auto gate_proj = std::dynamic_pointer_cast<Linear>(blocks["wi_0"]);
+        auto up_proj   = std::dynamic_pointer_cast<Linear>(blocks["wi_1"]);
+        auto down_proj = std::dynamic_pointer_cast<Linear>(blocks["wo"]);
+
+        auto gated  = ggml_ext_gelu(ctx->ggml_ctx, gate_proj->forward(ctx, x), true);
+        auto linear = up_proj->forward(ctx, x);
+        auto hidden = ggml_mul_inplace(ctx->ggml_ctx, gated, linear);
+        return down_proj->forward(ctx, hidden);
+    }
+};
+
+// Pre-norm feed-forward sub-layer with a residual connection: x + FF(layer_norm(x)).
+struct T5LayerFF : public UnaryBlock {
+public:
+    T5LayerFF(int64_t model_dim, int64_t ff_dim) {
+        blocks["DenseReluDense"] = std::make_shared<T5DenseGatedActDense>(model_dim, ff_dim);
+        blocks["layer_norm"]     = std::make_shared<T5LayerNorm>(model_dim);
+    }
+
+    ggml_tensor* forward(GGMLRunnerContext* ctx, ggml_tensor* x) override {
+        // x: [N, n_token, model_dim]
+        auto norm = std::dynamic_pointer_cast<T5LayerNorm>(blocks["layer_norm"]);
+        auto ffn  = std::dynamic_pointer_cast<T5DenseGatedActDense>(blocks["DenseReluDense"]);
+
+        auto ff_out = ffn->forward(ctx, norm->forward(ctx, x));
+        // Residual add (in-place variant; the feed-forward output is the first operand).
+        return ggml_add_inplace(ctx->ggml_ctx, ff_out, x);
+    }
+};
+
+// T5 self-attention. Scores are biased by a relative-position bias that is either
+// computed here (when this layer owns the embedding) or passed in as past_bias.
+class T5Attention : public GGMLBlock {
+protected:
+    int64_t model_dim;
+    int64_t inner_dim;
+    int64_t num_heads;
+    bool using_relative_attention_bias;
+    int64_t relative_attention_num_buckets  = 32;
+    int64_t relative_attention_max_distance = 128;
+
+public:
+    T5Attention(int64_t model_dim,
+                int64_t inner_dim,
+                int64_t num_heads,
+                bool using_relative_attention_bias = false)
+        : model_dim(model_dim),
+          inner_dim(inner_dim),
+          num_heads(num_heads),
+          using_relative_attention_bias(using_relative_attention_bias) {
+        blocks["q"] = std::make_shared<Linear>(model_dim, inner_dim, false);
+        blocks["k"] = std::make_shared<Linear>(model_dim, inner_dim, false);
+        blocks["v"] = std::make_shared<Linear>(model_dim, inner_dim, false);
+        blocks["o"] = std::make_shared<Linear>(inner_dim, model_dim, false);
+        if (using_relative_attention_bias) {
+            blocks["relative_attention_bias"] = std::make_shared<Embedding>(relative_attention_num_buckets, num_heads);
+        }
+    }
+
+    // Looks up the bias for every (query, key) bucket and moves the head axis in front.
+    ggml_tensor* compute_bias(GGMLRunnerContext* ctx,
+                              ggml_tensor* relative_position_bucket) {
+        auto bias_table = std::dynamic_pointer_cast<Embedding>(blocks["relative_attention_bias"]);
+        auto bias       = bias_table->forward(ctx, relative_position_bucket);  // shape (query_length, key_length, num_heads)
+        auto permuted   = ggml_permute(ctx->ggml_ctx, bias, 2, 0, 1, 3);
+        return ggml_cont(ctx->ggml_ctx, permuted);  // shape (1, num_heads, query_length, key_length)
+    }
+
+    // x: [N, n_token, model_dim]
+    std::pair<ggml_tensor*, ggml_tensor*> forward(GGMLRunnerContext* ctx,
+                                                  ggml_tensor* x,
+                                                  ggml_tensor* past_bias                = nullptr,
+                                                  ggml_tensor* mask                     = nullptr,
+                                                  ggml_tensor* relative_position_bucket = nullptr) {
+        auto to_q   = std::dynamic_pointer_cast<Linear>(blocks["q"]);
+        auto to_k   = std::dynamic_pointer_cast<Linear>(blocks["k"]);
+        auto to_v   = std::dynamic_pointer_cast<Linear>(blocks["v"]);
+        auto to_out = std::dynamic_pointer_cast<Linear>(blocks["o"]);
+
+        auto query = to_q->forward(ctx, x);
+        auto key   = to_k->forward(ctx, x);
+        auto value = to_v->forward(ctx, x);
+
+        // A layer that owns the bias embedding recomputes it; other layers reuse past_bias.
+        if (using_relative_attention_bias && relative_position_bucket != nullptr) {
+            past_bias = compute_bias(ctx, relative_position_bucket);
+        }
+        // Fold the position bias into the additive attention mask.
+        if (past_bias != nullptr && mask != nullptr) {
+            mask = ggml_repeat(ctx->ggml_ctx, mask, past_bias);
+            mask = ggml_add(ctx->ggml_ctx, mask, past_bias);
+        } else if (past_bias != nullptr) {
+            mask = past_bias;
+        }
+
+        // NOTE(review): presumably cancels a 1/sqrt(d_head) factor applied inside ggml_ext_attention_ext - confirm.
+        const int64_t head_dim = inner_dim / num_heads;
+        key                    = ggml_ext_scale(ctx->ggml_ctx, key, ::sqrtf(static_cast<float>(head_dim)), true);
+
+        auto attn = ggml_ext_attention_ext(ctx->ggml_ctx, ctx->backend, query, key, value, num_heads, mask);  // [N, n_token, d_head * n_head]
+        auto out  = to_out->forward(ctx, attn);  // [N, n_token, model_dim]
+        return {out, past_bias};
+    }
+};
+
+// Pre-norm self-attention sub-layer with a residual connection; forwards the
+// relative-position bias returned by the attention module.
+struct T5LayerSelfAttention : public GGMLBlock {
+public:
+    T5LayerSelfAttention(int64_t model_dim,
+                         int64_t inner_dim,
+                         int64_t ff_dim,
+                         int64_t num_heads,
+                         bool using_relative_attention_bias) {
+        blocks["SelfAttention"] = std::make_shared<T5Attention>(model_dim, inner_dim, num_heads, using_relative_attention_bias);
+        blocks["layer_norm"]    = std::make_shared<T5LayerNorm>(model_dim);
+    }
+
+    std::pair<ggml_tensor*, ggml_tensor*> forward(GGMLRunnerContext* ctx,
+                                                  ggml_tensor* x,
+                                                  ggml_tensor* past_bias                = nullptr,
+                                                  ggml_tensor* mask                     = nullptr,
+                                                  ggml_tensor* relative_position_bucket = nullptr) {
+        // x: [N, n_token, model_dim]
+        auto attention = std::dynamic_pointer_cast<T5Attention>(blocks["SelfAttention"]);
+        auto norm      = std::dynamic_pointer_cast<T5LayerNorm>(blocks["layer_norm"]);
+
+        auto attn_result = attention->forward(ctx, norm->forward(ctx, x), past_bias, mask, relative_position_bucket);
+
+        // Residual add (in-place variant; the attention output is the first operand).
+        auto residual = ggml_add_inplace(ctx->ggml_ctx, attn_result.first, x);
+        return {residual, attn_result.second};
+    }
+};
+
+// One encoder block: self-attention sub-layer ("layer.0") then feed-forward ("layer.1").
+struct T5Block : public GGMLBlock {
+public:
+    T5Block(int64_t model_dim, int64_t inner_dim, int64_t ff_dim, int64_t num_heads, bool using_relative_attention_bias) {
+        blocks["layer.0"] = std::make_shared<T5LayerSelfAttention>(model_dim, inner_dim, ff_dim, num_heads, using_relative_attention_bias);
+        blocks["layer.1"] = std::make_shared<T5LayerFF>(model_dim, ff_dim);
+    }
+
+    // Returns {hidden states, position bias coming out of the attention sub-layer}.
+    std::pair<ggml_tensor*, ggml_tensor*> forward(GGMLRunnerContext* ctx,
+                                                  ggml_tensor* x,
+                                                  ggml_tensor* past_bias                = nullptr,
+                                                  ggml_tensor* mask                     = nullptr,
+                                                  ggml_tensor* relative_position_bucket = nullptr) {
+        // x: [N, n_token, model_dim]
+        auto attn_layer = std::dynamic_pointer_cast<T5LayerSelfAttention>(blocks["layer.0"]);
+        auto ff_layer   = std::dynamic_pointer_cast<T5LayerFF>(blocks["layer.1"]);
+
+        auto attn_result = attn_layer->forward(ctx, x, past_bias, mask, relative_position_bucket);
+        auto hidden      = ff_layer->forward(ctx, attn_result.first);
+        return {hidden, attn_result.second};
+    }
+};
+
+// Stack of T5 blocks followed by a final layer norm. With relative_attention set only
+// block 0 owns the position-bias embedding; otherwise every block owns one.
+struct T5Stack : public GGMLBlock {
+    int64_t num_layers;
+
+public:
+    T5Stack(int64_t num_layers,
+            int64_t model_dim,
+            int64_t inner_dim,
+            int64_t ff_dim,
+            int64_t num_heads,
+            bool relative_attention = true)
+        : num_layers(num_layers) {
+        for (int i = 0; i < num_layers; i++) {
+            const bool owns_bias = !relative_attention || i == 0;
+            blocks["block." + std::to_string(i)] = std::make_shared<T5Block>(model_dim, inner_dim, ff_dim, num_heads, owns_bias);
+        }
+        blocks["final_layer_norm"] = std::make_shared<T5LayerNorm>(model_dim);
+    }
+
+    ggml_tensor* forward(GGMLRunnerContext* ctx,
+                         ggml_tensor* x,
+                         ggml_tensor* past_bias                = nullptr,
+                         ggml_tensor* attention_mask           = nullptr,
+                         ggml_tensor* relative_position_bucket = nullptr) {
+        // x: [N, n_token, model_dim]
+        ggml_tensor* hidden = x;
+        ggml_tensor* bias   = past_bias;
+        for (int i = 0; i < num_layers; i++) {
+            auto block  = std::dynamic_pointer_cast<T5Block>(blocks["block." + std::to_string(i)]);
+            auto result = block->forward(ctx, hidden, bias, attention_mask, relative_position_bucket);
+            hidden      = result.first;
+            bias        = result.second;  // handed to the next block
+        }
+        auto final_norm = std::dynamic_pointer_cast<T5LayerNorm>(blocks["final_layer_norm"]);
+        return final_norm->forward(ctx, hidden);
+    }
+};
+
+struct T5Params {  // Hyperparameters read by the T5 constructor; T5Runner overrides two of them when is_umt5 is set.
+    int64_t num_layers      = 24;     // number of "block.<i>" entries T5Stack creates
+    int64_t model_dim       = 4096;   // T5 passes this as both model_dim and inner_dim of T5Stack
+    int64_t ff_dim          = 10240;
+    int64_t num_heads       = 64;
+    int64_t vocab_size      = 32128;  // T5Runner sets 256384 when is_umt5
+    bool relative_attention = true;   // T5Runner sets false when is_umt5
+};
+
+struct T5 : public GGMLBlock {
+    T5Params params;
+
+public:
+    T5() {}
+    T5(T5Params params)
+        : params(params) {
+        // The encoder receives model_dim for both its model_dim and inner_dim arguments.
+        blocks["encoder"] = std::make_shared<T5Stack>(params.num_layers,
+                                                      params.model_dim,
+                                                      params.model_dim,
+                                                      params.ff_dim,
+                                                      params.num_heads,
+                                                      params.relative_attention);
+        blocks["shared"]  = std::make_shared<Embedding>(params.vocab_size, params.model_dim);
+    }
+
+    ggml_tensor* forward(GGMLRunnerContext* ctx,
+                         ggml_tensor* input_ids,
+                         ggml_tensor* past_bias                = nullptr,
+                         ggml_tensor* attention_mask           = nullptr,
+                         ggml_tensor* relative_position_bucket = nullptr) {
+        // input_ids: [N, n_token]
+
+        auto token_embedding = std::dynamic_pointer_cast<Embedding>(blocks["shared"]);
+        auto encoder_stack   = std::dynamic_pointer_cast<T5Stack>(blocks["encoder"]);
+
+        // Embed the token ids, then feed the embeddings through the encoder stack.
+        auto embedded = token_embedding->forward(ctx, input_ids);
+        return encoder_stack->forward(ctx, embedded, past_bias, attention_mask, relative_position_bucket);
+    }
+};
+
+struct T5Runner : public GGMLRunner {  // Builds and runs the ggml graph for the T5 encoder defined by `model`.
+    T5Params params;
+    T5 model;
+    std::vector<int> relative_position_bucket_vec;  // presumably a member so the data given to set_backend_tensor_data outlives build_graph() - confirm
+
+    T5Runner(ggml_backend_t backend,
+             bool offload_params_to_cpu,
+             const String2TensorStorage& tensor_storage_map,
+             const std::string prefix,
+             bool is_umt5 = false)
+        : GGMLRunner(backend, offload_params_to_cpu) {
+        if (is_umt5) {
+            params.vocab_size         = 256384;
+            params.relative_attention = false;
+        }
+        model = T5(params);
+        model.init(params_ctx, tensor_storage_map, prefix);
+    }
+
+    std::string get_desc() override {
+        return "t5";
+    }
+
+    void get_param_tensors(std::map<std::string, ggml_tensor*>& tensors, const std::string prefix) {
+        model.get_param_tensors(tensors, prefix);
+    }
+
+    ggml_tensor* forward(GGMLRunnerContext* ctx,
+                         ggml_tensor* input_ids,
+                         ggml_tensor* relative_position_bucket,
+                         ggml_tensor* attention_mask = nullptr) {
+        // input_ids: ne[0] = n_token, ne[1] = N
+        // past_bias is not supplied by the runner, so nullptr is forwarded for it.
+
+        auto hidden_states = model.forward(ctx, input_ids, nullptr, attention_mask, relative_position_bucket);  // [N, n_token, model_dim]
+        return hidden_states;
+    }
+
+    ggml_cgraph* build_graph(const sd::Tensor<int32_t>& input_ids_tensor,
+                             const sd::Tensor<float>& attention_mask_tensor = {}) {
+        ggml_cgraph* gf             = ggml_new_graph(compute_ctx);
+        ggml_tensor* input_ids      = make_input(input_ids_tensor);
+        ggml_tensor* attention_mask = attention_mask_tensor.empty() ? nullptr : make_input(attention_mask_tensor);  // an empty mask tensor means "no mask"
+
+        relative_position_bucket_vec = compute_relative_position_bucket(static_cast<int>(input_ids->ne[0]), static_cast<int>(input_ids->ne[0]));
+
+        // for (int i = 0; i < relative_position_bucket_vec.size(); i++) {
+        //     if (i % 77 == 0) {
+        //         printf("\n");
+        //     }
+        //     printf("%d ", relative_position_bucket_vec[i]);
+        // }
+
+        auto relative_position_bucket = ggml_new_tensor_2d(compute_ctx,
+                                                           GGML_TYPE_I32,
+                                                           input_ids->ne[0],
+                                                           input_ids->ne[0]);
+        set_backend_tensor_data(relative_position_bucket, relative_position_bucket_vec.data());
+
+        auto runner_ctx            = get_context();
+        ggml_tensor* hidden_states = forward(&runner_ctx, input_ids, relative_position_bucket, attention_mask);
+
+        ggml_build_forward_expand(gf, hidden_states);
+
+        return gf;
+    }
+
+    sd::Tensor<float> compute(const int n_threads,
+                              const sd::Tensor<int32_t>& input_ids,
+                              const sd::Tensor<float>& attention_mask) {
+        auto get_graph = [&]() -> ggml_cgraph* {
+            return build_graph(input_ids, attention_mask);
+        };
+        return restore_trailing_singleton_dims(GGMLRunner::compute<float>(get_graph, n_threads, true), 3);
+    }
+
+    static std::vector<int> _relative_position_bucket(const std::vector<int>& relative_position,
+                                                      bool bidirectional = true,
+                                                      int num_buckets    = 32,
+                                                      int max_distance   = 128) {  // maps each relative offset to a bucket index, element by element
+        std::vector<int> relative_buckets(relative_position.size(), 0);
+        std::vector<int> abs_relative_position = relative_position;
+
+        if (bidirectional) {
+            num_buckets = num_buckets / 2;  // half of the buckets for each sign of the offset
+            for (size_t i = 0; i < relative_position.size(); ++i) {
+                if (relative_position[i] > 0) {
+                    relative_buckets[i] += num_buckets;
+                }
+                abs_relative_position[i] = std::abs(relative_position[i]);
+            }
+        } else {
+            for (size_t i = 0; i < relative_position.size(); ++i) {
+                abs_relative_position[i] = std::max(-relative_position[i], 0);
+            }
+        }
+
+        int max_exact = num_buckets / 2;
+        std::vector<int> relative_position_if_large(relative_position.size(), 0);
+        const float log_base = std::log(static_cast<float>(max_distance) / max_exact);  // loop-invariant, so computed once
+        for (size_t i = 0; i < relative_position.size(); ++i) {
+            if (abs_relative_position[i] < max_exact) {
+                relative_buckets[i] += abs_relative_position[i];
+            } else {
+                // Distances >= max_exact are bucketed on a log scale and clamped to the last bucket.
+                float log_pos                 = std::log(static_cast<float>(abs_relative_position[i]) / max_exact);
+                relative_position_if_large[i] = max_exact + static_cast<int>((log_pos / log_base) * (num_buckets - max_exact));
+                relative_position_if_large[i] = std::min(relative_position_if_large[i], num_buckets - 1);
+                relative_buckets[i] += relative_position_if_large[i];
+            }
+        }
+
+        return relative_buckets;
+    }
+
+    std::vector<int> compute_relative_position_bucket(int query_length,
+                                                      int key_length) {  // returns query_length * key_length bucket ids, one query row after another
+        std::vector<int> context_position(query_length);
+        std::vector<int> memory_position(key_length);
+
+        for (int i = 0; i < query_length; ++i) {
+            context_position[i] = i;
+        }
+        for (int i = 0; i < key_length; ++i) {
+            memory_position[i] = i;
+        }
+
+        std::vector<std::vector<int>> relative_position(query_length, std::vector<int>(key_length, 0));
+        for (int i = 0; i < query_length; ++i) {
+            for (int j = 0; j < key_length; ++j) {
+                relative_position[i][j] = memory_position[j] - context_position[i];
+            }
+        }
+
+        std::vector<int> relative_position_bucket;
+        for (int i = 0; i < query_length; ++i) {
+            std::vector<int> result = _relative_position_bucket(relative_position[i], true);
+            relative_position_bucket.insert(relative_position_bucket.end(), result.begin(), result.end());
+        }
+
+        return relative_position_bucket;
+    }
+};
+
+struct T5Embedder {  // Pairs the T5 tokenizer with a T5Runner and offers a small self-test.
+    T5UniGramTokenizer tokenizer;
+    T5Runner model;
+
+    T5Embedder(ggml_backend_t backend,
+               bool offload_params_to_cpu,
+               const String2TensorStorage& tensor_storage_map = {},
+               const std::string prefix                       = "",
+               bool is_umt5                                   = false)
+        : tokenizer(is_umt5), model(backend, offload_params_to_cpu, tensor_storage_map, prefix, is_umt5) {  // listed in declaration order
+    }
+
+    void get_param_tensors(std::map<std::string, ggml_tensor*>& tensors, const std::string prefix) {
+        model.get_param_tensors(tensors, prefix);
+    }
+
+    void alloc_params_buffer() {
+        model.alloc_params_buffer();
+    }
+
+    std::tuple<std::vector<int>, std::vector<float>, std::vector<float>> tokenize(std::string text,
+                                                                                  size_t max_length = 0,
+                                                                                  bool padding      = false) {  // returns {tokens, weights, attention_mask}
+        auto parsed_attention = parse_prompt_attention(text);
+
+        {
+            std::stringstream ss;
+            ss << "[";
+            for (const auto& item : parsed_attention) {
+                ss << "['" << item.first << "', " << item.second << "], ";
+            }
+            ss << "]";
+            LOG_DEBUG("parse '%s' to %s", text.c_str(), ss.str().c_str());
+        }
+
+        std::vector<int> tokens;
+        std::vector<float> weights;
+        for (const auto& item : parsed_attention) {
+            const std::string& curr_text = item.first;
+            float curr_weight            = item.second;
+            std::vector<int> curr_tokens = tokenizer.Encode(curr_text, false);
+            tokens.insert(tokens.end(), curr_tokens.begin(), curr_tokens.end());
+            weights.insert(weights.end(), curr_tokens.size(), curr_weight);  // every token of a segment shares that segment's weight
+        }
+
+        int EOS_TOKEN_ID = 1;
+        tokens.push_back(EOS_TOKEN_ID);
+        weights.push_back(1.0);
+
+        std::vector<float> attention_mask;
+
+        tokenizer.pad_tokens(tokens, weights, &attention_mask, max_length, padding);
+
+        // for (int i = 0; i < tokens.size(); i++) {
+        //     std::cout << tokens[i] << ":" << weights[i] << ", ";
+        // }
+        // std::cout << std::endl;
+
+        return {tokens, weights, attention_mask};
+    }
+
+    void test() {
+        ggml_init_params params;
+        params.mem_size   = static_cast<size_t>(10 * 1024 * 1024);  // 10 MB
+        params.mem_buffer = nullptr;
+        params.no_alloc   = false;
+
+        ggml_context* ctx = ggml_init(params);
+        GGML_ASSERT(ctx != nullptr);
+
+        {
+            std::string text("a lovely cat");
+            auto tokens_and_weights     = tokenize(text, 512, true);
+            std::vector<int>& tokens    = std::get<0>(tokens_and_weights);
+            // std::get<1> (the per-token weights) is not needed by this smoke test.
+            std::vector<float>& masks   = std::get<2>(tokens_and_weights);
+            for (auto token : tokens) {
+                printf("%d ", token);
+            }
+            printf("\n");
+            auto input_ids      = sd::Tensor<int32_t>::from_vector(tokens);
+            auto attention_mask = sd::Tensor<float>::from_vector(masks);
+
+            int64_t t0   = ggml_time_ms();
+            auto out_opt = model.compute(8, input_ids, attention_mask);
+            int64_t t1   = ggml_time_ms();
+
+            GGML_ASSERT(!out_opt.empty());
+            sd::Tensor<float> out = std::move(out_opt);
+            print_sd_tensor(out);
+            LOG_DEBUG("t5 test done in %lldms", static_cast<long long>(t1 - t0));  // cast so the argument matches %lld on every platform
+        }
+        ggml_free(ctx);  // release the context created by ggml_init above
+    }
+
+    static void load_from_file_and_test(const std::string& file_path) {
+        // cpu f16: pass
+        // cpu f32: pass
+        // cuda f16: pass
+        // cuda f32: pass
+        // cuda q8_0: pass
+        // ggml_backend_t backend = ggml_backend_cuda_init(0);
+        ggml_backend_t backend    = ggml_backend_cpu_init();  // NOTE(review): not freed in this function - confirm whether GGMLRunner takes ownership
+        ggml_type model_data_type = GGML_TYPE_F16;
+
+        ModelLoader model_loader;
+        if (!model_loader.init_from_file_and_convert_name(file_path)) {
+            LOG_ERROR("init model loader from file failed: '%s'", file_path.c_str());
+            return;
+        }
+
+        auto& tensor_storage_map = model_loader.get_tensor_storage_map();
+        for (auto& [name, tensor_storage] : tensor_storage_map) {
+            if (ends_with(name, "weight")) {
+                tensor_storage.expected_type = model_data_type;
+            }
+        }
+
+        std::shared_ptr<T5Embedder> t5 = std::make_shared<T5Embedder>(backend, false, tensor_storage_map, "", true);
+
+        t5->alloc_params_buffer();
+        std::map<std::string, ggml_tensor*> tensors;
+        t5->get_param_tensors(tensors, "");
+
+        bool success = model_loader.load_tensors(tensors);
+
+        if (!success) {
+            LOG_ERROR("load tensors from model loader failed");
+            return;
+        }
+
+        LOG_INFO("t5 model loaded");
+        t5->test();
+    }
+};
+
+#endif  // __T5_HPP__
diff --git a/src/tae.hpp b/src/tae.hpp
index 3df09e4..0a0ca68 100644
--- a/src/tae.hpp
+++ b/src/tae.hpp
@@ -562,41 +562,40 @@ struct TinyImageAutoEncoder : public VAE {
         taesd.get_param_tensors(tensors, prefix);
     }
 
-    ggml_tensor* vae_output_to_latents(ggml_context* work_ctx, ggml_tensor* vae_output, std::shared_ptr<RNG> rng) {
+    sd::Tensor<float> vae_output_to_latents(const sd::Tensor<float>& vae_output, std::shared_ptr<RNG> rng) override {
+        SD_UNUSED(rng);
         return vae_output;
     }
 
-    ggml_tensor* diffusion_to_vae_latents(ggml_context* work_ctx, ggml_tensor* latents) {
-        return ggml_ext_dup_and_cpy_tensor(work_ctx, latents);
+    sd::Tensor<float> diffusion_to_vae_latents(const sd::Tensor<float>& latents) override {
+        return latents;
     }
 
-    ggml_tensor* vae_to_diffuison_latents(ggml_context* work_ctx, ggml_tensor* latents) {
-        return ggml_ext_dup_and_cpy_tensor(work_ctx, latents);
+    sd::Tensor<float> vae_to_diffusion_latents(const sd::Tensor<float>& latents) override {
+        return latents;
     }
 
     int get_encoder_output_channels(int input_channels) {
         return taesd.z_channels;
     }
 
-    ggml_cgraph* build_graph(ggml_tensor* z, bool decode_graph) {
+    ggml_cgraph* build_graph(const sd::Tensor<float>& z_tensor, bool decode_graph) {
         ggml_cgraph* gf  = ggml_new_graph(compute_ctx);
-        z                = to_backend(z);
+        ggml_tensor* z   = make_input(z_tensor);
         auto runner_ctx  = get_context();
         ggml_tensor* out = decode_graph ? taesd.decode(&runner_ctx, z) : taesd.encode(&runner_ctx, z);
         ggml_build_forward_expand(gf, out);
         return gf;
     }
 
-    bool _compute(const int n_threads,
-                  ggml_tensor* z,
-                  bool decode_graph,
-                  ggml_tensor** output,
-                  ggml_context* output_ctx = nullptr) {
+    sd::Tensor<float> _compute(const int n_threads,
+                               const sd::Tensor<float>& z_tensor,
+                               bool decode_graph) override {
         auto get_graph = [&]() -> ggml_cgraph* {
-            return build_graph(z, decode_graph);
+            return build_graph(z_tensor, decode_graph);
         };
 
-        return GGMLRunner::compute(get_graph, n_threads, false, output, output_ctx);
+        return restore_trailing_singleton_dims(GGMLRunner::compute<float>(get_graph, n_threads, false), z_tensor.dim());
     }
 };
 
@@ -625,42 +624,41 @@ struct TinyVideoAutoEncoder : public VAE {
         taehv.get_param_tensors(tensors, prefix);
     }
 
-    ggml_tensor* vae_output_to_latents(ggml_context* work_ctx, ggml_tensor* vae_output, std::shared_ptr<RNG> rng) {
+    sd::Tensor<float> vae_output_to_latents(const sd::Tensor<float>& vae_output, std::shared_ptr<RNG> rng) override {
+        SD_UNUSED(rng);
         return vae_output;
     }
 
-    ggml_tensor* diffusion_to_vae_latents(ggml_context* work_ctx, ggml_tensor* latents) {
-        return ggml_ext_dup_and_cpy_tensor(work_ctx, latents);
+    sd::Tensor<float> diffusion_to_vae_latents(const sd::Tensor<float>& latents) override {
+        return latents;
     }
 
-    ggml_tensor* vae_to_diffuison_latents(ggml_context* work_ctx, ggml_tensor* latents) {
-        return ggml_ext_dup_and_cpy_tensor(work_ctx, latents);
+    sd::Tensor<float> vae_to_diffusion_latents(const sd::Tensor<float>& latents) override {
+        return latents;
     }
 
     int get_encoder_output_channels(int input_channels) {
         return taehv.z_channels;
     }
 
-    ggml_cgraph* build_graph(ggml_tensor* z, bool decode_graph) {
+    ggml_cgraph* build_graph(const sd::Tensor<float>& z_tensor, bool decode_graph) {
         ggml_cgraph* gf  = ggml_new_graph(compute_ctx);
-        z                = to_backend(z);
+        ggml_tensor* z   = make_input(z_tensor);
         auto runner_ctx  = get_context();
         ggml_tensor* out = decode_graph ? taehv.decode(&runner_ctx, z) : taehv.encode(&runner_ctx, z);
         ggml_build_forward_expand(gf, out);
         return gf;
     }
 
-    bool _compute(const int n_threads,
-                  ggml_tensor* z,
-                  bool decode_graph,
-                  ggml_tensor** output,
-                  ggml_context* output_ctx = nullptr) {
+    sd::Tensor<float> _compute(const int n_threads,
+                               const sd::Tensor<float>& z_tensor,
+                               bool decode_graph) override {
         auto get_graph = [&]() -> ggml_cgraph* {
-            return build_graph(z, decode_graph);
+            return build_graph(z_tensor, decode_graph);
         };
 
-        return GGMLRunner::compute(get_graph, n_threads, false, output, output_ctx);
+        return restore_trailing_singleton_dims(GGMLRunner::compute<float>(get_graph, n_threads, false), z_tensor.dim());
     }
 };
 
-#endif  // __TAE_HPP__
\ No newline at end of file
+#endif  // __TAE_HPP__
diff --git a/src/tensor.hpp b/src/tensor.hpp
new file mode 100644
index 0000000..33a2bde
--- /dev/null
+++ b/src/tensor.hpp
@@ -0,0 +1,1249 @@
+#ifndef __SD_TENSOR_HPP__
+#define __SD_TENSOR_HPP__
+
+#include <algorithm>
+#include <cmath>
+#include <cstdint>
+#include <cstdio>
+#include <initializer_list>
+#include <memory>
+#include <numeric>
+#include <optional>
+#include <sstream>
+#include <stdexcept>
+#include <string>
+#include <type_traits>
+#include <utility>
+#include <vector>
+
+#include "rng.hpp"
+
+namespace sd {
+
+    template <typename T>
+    class Tensor;
+
+    inline std::vector<int64_t> tensor_unravel_index(int64_t flat, const std::vector<int64_t>& shape);
+
+    [[noreturn]] inline void tensor_throw_invalid_argument(const std::string& message) {  // prints message to stderr, then throws std::invalid_argument(message)
+        std::fprintf(stderr, "sd::Tensor error: %s\n", message.c_str());
+        std::fflush(stderr);  // flush before the exception starts propagating
+        throw std::invalid_argument(message);
+    }
+
+    inline std::string tensor_shape_to_string(const std::vector<int64_t>& shape) {
+        // Renders the shape as "[d0, d1, ...]"; an empty shape yields "[]".
+        std::ostringstream text;
+        const char* separator = "[";
+        for (int64_t extent : shape) {
+            text << separator << extent;
+            separator = ", ";
+        }
+        // The opening bracket is still pending when the loop never ran.
+        text << (shape.empty() ? "[]" : "]");
+        return text.str();
+    }
+
+    inline int64_t tensor_numel(const std::vector<int64_t>& shape) {
+        // An empty (rank-0) shape is treated as holding no elements.
+        if (shape.empty()) {
+            return 0;
+        }
+        // Reject negative extents before any multiplication happens.
+        if (std::any_of(shape.begin(), shape.end(), [](int64_t extent) { return extent < 0; })) {
+            tensor_throw_invalid_argument("Tensor shape must be non-negative, got shape=" +
+                                          tensor_shape_to_string(shape));
+        }
+        // The element count is the product of all extents.
+        const auto product = [](int64_t acc, int64_t extent) { return acc * extent; };
+        return std::accumulate(shape.begin(), shape.end(), int64_t{1}, product);
+    }
+
+    template <typename T>
+    class Tensor {  // Flat std::vector<T> storage plus a shape; shape_[0] is the fastest-varying axis (see offset_of).
+    public:
+        Tensor() = default;
+
+        explicit Tensor(std::vector<int64_t> shape)
+            : data_(static_cast<size_t>(tensor_numel(shape))), shape_(std::move(shape)) {  // value-initialized elements
+        }
+
+        Tensor(std::vector<int64_t> shape, std::vector<T> data)
+            : data_(std::move(data)), shape_(std::move(shape)) {
+            if (static_cast<int64_t>(data_.size()) != tensor_numel(shape_)) {
+                tensor_throw_invalid_argument("Tensor data size does not match shape: data.size()=" +
+                                              std::to_string(data_.size()) + ", shape=" +
+                                              tensor_shape_to_string(shape_) + ", numel=" +
+                                              std::to_string(tensor_numel(shape_)));
+            }
+        }
+
+        const std::vector<int64_t>& shape() const {
+            return shape_;
+        }
+
+        int64_t dim() const {  // number of axes
+            return static_cast<int64_t>(shape_.size());
+        }
+
+        int64_t numel() const {  // taken from the storage size, not recomputed from shape_
+            return static_cast<int64_t>(data_.size());
+        }
+
+        bool empty() const {
+            return data_.empty();
+        }
+
+        T* data() {
+            return data_.data();
+        }
+
+        const T* data() const {
+            return data_.data();
+        }
+
+        std::vector<T>& values() {
+            return data_;
+        }
+
+        const std::vector<T>& values() const {
+            return data_;
+        }
+
+        void resize(std::vector<int64_t> shape) {
+            data_.resize(static_cast<size_t>(tensor_numel(shape)));  // validate and allocate first: a throw here leaves *this unchanged
+            shape_ = std::move(shape);
+        }
+
+        Tensor& reshape_(std::vector<int64_t> shape) {  // in place; throws when the element count would change
+            if (tensor_numel(shape) != numel()) {
+                tensor_throw_invalid_argument("Tensor reshape changes element count: from shape=" +
+                                              tensor_shape_to_string(shape_) + " (numel=" +
+                                              std::to_string(numel()) + ") to shape=" +
+                                              tensor_shape_to_string(shape) + " (numel=" +
+                                              std::to_string(tensor_numel(shape)) + ")");
+            }
+            shape_ = std::move(shape);
+            return *this;
+        }
+
+        Tensor reshape(std::vector<int64_t> shape) const {
+            Tensor result = *this;
+            result.reshape_(std::move(shape));
+            return result;
+        }
+
+        Tensor& squeeze_() {  // drops every axis whose extent is 1
+            std::vector<int64_t> new_shape;
+            new_shape.reserve(shape_.size());
+            for (int64_t dim : shape_) {
+                if (dim != 1) {
+                    new_shape.push_back(dim);
+                }
+            }
+            shape_ = std::move(new_shape);
+            return *this;
+        }
+
+        Tensor& squeeze_(size_t dim) {  // drops one axis; throws unless that axis exists and has extent 1
+            if (dim >= shape_.size()) {
+                tensor_throw_invalid_argument("Tensor squeeze dimension out of range: dim=" +
+                                              std::to_string(dim) + ", shape=" +
+                                              tensor_shape_to_string(shape_));
+            }
+            if (shape_[dim] != 1) {
+                tensor_throw_invalid_argument("Tensor squeeze requires dimension size 1: dim=" +
+                                              std::to_string(dim) + ", shape=" +
+                                              tensor_shape_to_string(shape_));
+            }
+            shape_.erase(shape_.begin() + static_cast<std::ptrdiff_t>(dim));
+            return *this;
+        }
+
+        Tensor squeeze() const {
+            Tensor result = *this;
+            result.squeeze_();
+            return result;
+        }
+
+        Tensor squeeze(size_t dim) const {
+            Tensor result = *this;
+            result.squeeze_(dim);
+            return result;
+        }
+
+        Tensor& unsqueeze_(size_t dim) {  // inserts an extent-1 axis at position dim (dim == rank appends)
+            if (dim > shape_.size()) {
+                tensor_throw_invalid_argument("Tensor unsqueeze dimension out of range: dim=" +
+                                              std::to_string(dim) + ", shape=" +
+                                              tensor_shape_to_string(shape_));
+            }
+            shape_.insert(shape_.begin() + static_cast<std::ptrdiff_t>(dim), 1);
+            return *this;
+        }
+
+        Tensor unsqueeze(size_t dim) const {
+            Tensor result = *this;
+            result.unsqueeze_(dim);
+            return result;
+        }
+
+        Tensor permute(const std::vector<size_t>& dims) const {  // output axis i takes source axis dims[i]; the data is copied
+            if (dims.size() != static_cast<size_t>(dim())) {
+                tensor_throw_invalid_argument("Tensor permute requires one dimension index per axis: tensor_shape=" +
+                                              tensor_shape_to_string(shape_) + ", dims_size=" +
+                                              std::to_string(dims.size()));
+            }
+
+            std::vector<bool> seen(dims.size(), false);
+            std::vector<int64_t> out_shape(dims.size(), 1);
+            for (size_t i = 0; i < dims.size(); ++i) {
+                size_t dim_index = dims[i];
+                if (dim_index >= dims.size() || seen[dim_index]) {
+                    tensor_throw_invalid_argument("Tensor permute dimensions must be a valid permutation: tensor_shape=" +
+                                                  tensor_shape_to_string(shape_));
+                }
+                seen[dim_index] = true;
+                out_shape[i]    = shape_[dim_index];
+            }
+
+            Tensor result(out_shape);
+            if (result.numel() == 0) {
+                return result;
+            }
+
+            for (int64_t flat = 0; flat < result.numel(); ++flat) {
+                std::vector<int64_t> out_coord = tensor_unravel_index(flat, out_shape);
+                std::vector<int64_t> src_coord(static_cast<size_t>(dim()), 0);
+                for (size_t i = 0; i < dims.size(); ++i) {
+                    src_coord[dims[i]] = out_coord[i];
+                }
+                result[flat] = index(src_coord);
+            }
+
+            return result;
+        }
+
+        Tensor& permute_(const std::vector<size_t>& dims) {
+            *this = permute(dims);
+            return *this;
+        }
+
+        void fill_(const T& value) {
+            std::fill(data_.begin(), data_.end(), value);
+        }
+
+        Tensor& masked_fill_(const Tensor<uint8_t>& mask, const T& value);
+
+        T mean() const;
+
+        static Tensor zeros(std::vector<int64_t> shape) {
+            return Tensor(std::move(shape));
+        }
+
+        static Tensor zeros_like(const Tensor& other) {
+            return zeros(other.shape());
+        }
+
+        static Tensor ones(std::vector<int64_t> shape) {
+            return full(std::move(shape), static_cast<T>(1));
+        }
+
+        static Tensor ones_like(const Tensor& other) {
+            return ones(other.shape());
+        }
+
+        static Tensor full(std::vector<int64_t> shape, const T& value) {
+            Tensor tensor(std::move(shape));
+            tensor.fill_(value);
+            return tensor;
+        }
+
+        static Tensor randn(std::vector<int64_t> shape, const std::shared_ptr<RNG>& rng) {
+            static_assert(std::is_same_v<T, float>, "Tensor::randn currently requires Tensor<float>");
+            if (!rng) {
+                tensor_throw_invalid_argument("Tensor randn requires a valid RNG");
+            }
+            const uint32_t size = static_cast<uint32_t>(tensor_numel(shape));  // NOTE(review): counts above UINT32_MAX are truncated by this cast
+            return Tensor(std::move(shape), rng->randn(size));
+        }
+
+        static Tensor randn_like(const Tensor& other, const std::shared_ptr<RNG>& rng) {
+            return randn(other.shape(), rng);
+        }
+
+        static Tensor from_vector(std::vector<T> data) {  // one axis whose extent is data.size()
+            const int64_t size = static_cast<int64_t>(data.size());
+            return Tensor({size}, std::move(data));
+        }
+
+        T& index(const std::vector<int64_t>& coord) {
+            return data_.at(offset_of(coord));
+        }
+
+        const T& index(const std::vector<int64_t>& coord) const {
+            return data_.at(offset_of(coord));
+        }
+
+        template <typename... Indices, typename = std::enable_if_t<(std::is_convertible_v<Indices, int64_t> && ...)>>
+        T& index(Indices... indices) {
+            return index(std::vector<int64_t>{static_cast<int64_t>(indices)...});
+        }
+
+        template <typename... Indices, typename = std::enable_if_t<(std::is_convertible_v<Indices, int64_t> && ...)>>
+        const T& index(Indices... indices) const {
+            return index(std::vector<int64_t>{static_cast<int64_t>(indices)...});
+        }
+
+        T& operator[](int64_t index) {  // flat offset into the storage, bounds-checked by at()
+            return data_.at(static_cast<size_t>(index));
+        }
+
+        const T& operator[](int64_t index) const {
+            return data_.at(static_cast<size_t>(index));
+        }
+
+    private:
+        size_t offset_of(const std::vector<int64_t>& coord) const {  // flat offset of coord; throws on rank mismatch or out-of-range entry
+            if (coord.size() != shape_.size()) {
+                tensor_throw_invalid_argument("Tensor index rank mismatch: coord_rank=" +
+                                              std::to_string(coord.size()) + ", shape=" +
+                                              tensor_shape_to_string(shape_));
+            }
+            size_t offset = 0;
+            size_t stride = 1;
+            for (size_t i = 0; i < shape_.size(); ++i) {
+                if (coord[i] < 0 || coord[i] >= shape_[i]) {
+                    tensor_throw_invalid_argument("Tensor index out of range: shape=" +
+                                                  tensor_shape_to_string(shape_));
+                }
+                offset += static_cast<size_t>(coord[i]) * stride;
+                stride *= static_cast<size_t>(shape_[i]);
+            }
+            return offset;
+        }
+
+        std::vector<T> data_;
+        std::vector<int64_t> shape_;
+    };
+
+    template <typename T>
+    inline T Tensor<T>::mean() const {
+        // Sum and division both stay in T, so an integral T truncates the result.
+        if (data_.empty()) {
+            return T{};
+        }
+        // Seeding with T{} keeps the accumulator type equal to the element type.
+        const T total = std::accumulate(data_.begin(), data_.end(), T{});
+        const T count = static_cast<T>(numel());
+        return total / count;
+    }
+
+    template <>
+    inline float Tensor<float>::mean() const {
+        // The float specialization sums in double and narrows only the final result.
+        if (data_.empty()) {
+            return 0.0f;
+        }
+        // A double seed makes std::accumulate widen each float before adding it.
+        const double total = std::accumulate(data_.begin(), data_.end(), 0.0);
+        const double count = static_cast<double>(numel());
+        return static_cast<float>(total / count);
+    }
+
+    template <typename T>
+    inline void tensor_check_same_shape(const Tensor<T>& lhs, const Tensor<T>& rhs) {
+        if (lhs.shape() == rhs.shape()) {
+            return;  // identical shapes: nothing to report
+        }
+        tensor_throw_invalid_argument("Tensor shapes must match: lhs_shape=" + tensor_shape_to_string(lhs.shape()) +
+                                      ", rhs_shape=" + tensor_shape_to_string(rhs.shape()));
+    }
+
+    inline std::vector<int64_t> tensor_broadcast_shape(const std::vector<int64_t>& lhs, const std::vector<int64_t>& rhs) {
+        // Axes missing from the shorter shape (the higher indices) count as extent 1.
+        const size_t rank = std::max(lhs.size(), rhs.size());
+        std::vector<int64_t> merged(rank, 1);
+        for (size_t axis = 0; axis < rank; ++axis) {
+            const int64_t a = axis < lhs.size() ? lhs[axis] : 1;
+            const int64_t b = axis < rhs.size() ? rhs[axis] : 1;
+            if (!(a == b || a == 1 || b == 1)) {
+                tensor_throw_invalid_argument("Tensor shapes are not broadcastable: lhs_shape=" + tensor_shape_to_string(lhs) +
+                                              ", rhs_shape=" + tensor_shape_to_string(rhs));
+            }
+            merged[axis] = std::max(a, b);
+        }
+        return merged;
+    }
+
+    inline std::vector<int64_t> tensor_unravel_index(int64_t flat, const std::vector<int64_t>& shape) {
+        std::vector<int64_t> coord(shape.size(), 0);
+        size_t axis = 0;  // dimension 0 is peeled off the flat index first
+        for (const int64_t extent : shape) {
+            if (extent <= 0) {
+                tensor_throw_invalid_argument("Tensor unravel_index requires positive shape: shape=" + tensor_shape_to_string(shape));
+            }
+            coord[axis++] = flat % extent;
+            flat /= extent;
+        }
+        return coord;
+    }
+
+    inline std::vector<int64_t> tensor_compute_strides(const std::vector<int64_t>& shape) {
+        // strides[i] is the product of shape[0] .. shape[i - 1], so dimension 0 always has stride 1
+        // and an empty shape produces an empty result.
+        std::vector<int64_t> strides(shape.size(), 1);
+        for (size_t axis = 1; axis < shape.size(); ++axis) {
+            strides[axis] = strides[axis - 1] * shape[axis - 1];
+        }
+        return strides;
+    }
+
+    template <typename F>
+    inline void tensor_for_each_broadcast_offset(const std::vector<int64_t>& out_shape,
+                                                 const std::vector<int64_t>& lhs_shape_raw,
+                                                 const std::vector<int64_t>& lhs_strides_raw,
+                                                 const std::vector<int64_t>& rhs_shape_raw,
+                                                 const std::vector<int64_t>& rhs_strides_raw,
+                                                 F&& fn) {
+        // Calls fn(flat, lhs_offset, rhs_offset) for every element of out_shape in flat order (dimension 0 fastest).
+        // An operand dimension of size 1, or one beyond the operand's rank, is broadcast. Operands that do not fit
+        // out_shape are rejected: in-place callers pass the destination shape here, so a wider rhs must not slip through.
+        const size_t ndim = out_shape.size();
+        std::vector<int64_t> lhs_strides(ndim, 0);  // stride 0 pins a broadcast dimension to its only element
+        std::vector<int64_t> rhs_strides(ndim, 0);
+        const auto bind_operand = [&](const std::vector<int64_t>& shape, const std::vector<int64_t>& strides, std::vector<int64_t>& effective) {
+            bool fits = shape.size() <= ndim && strides.size() >= shape.size();
+            for (size_t i = 0; fits && i < shape.size(); ++i) {
+                fits         = shape[i] == 1 || shape[i] == out_shape[i];
+                effective[i] = shape[i] == 1 ? 0 : strides[i];
+            }
+            if (!fits) {
+                tensor_throw_invalid_argument("Tensor operand is not broadcastable to output shape: operand_shape=" +
+                                              tensor_shape_to_string(shape) + ", out_shape=" + tensor_shape_to_string(out_shape));
+            }
+        };
+        bind_operand(lhs_shape_raw, lhs_strides_raw, lhs_strides);
+        bind_operand(rhs_shape_raw, rhs_strides_raw, rhs_strides);
+        const std::vector<int64_t> out_strides = tensor_compute_strides(out_shape);
+        const int64_t numel                    = tensor_numel(out_shape);
+        for (int64_t flat = 0; flat < numel; ++flat) {
+            int64_t remaining  = flat;
+            int64_t lhs_offset = 0;
+            int64_t rhs_offset = 0;
+            for (size_t i = ndim; i-- > 0;) {
+                const int64_t coord = remaining / out_strides[i];
+                remaining %= out_strides[i];
+                lhs_offset += coord * lhs_strides[i];
+                rhs_offset += coord * rhs_strides[i];
+            }
+            fn(flat, lhs_offset, rhs_offset);
+        }
+    }
+
+    template <typename T>
+    inline Tensor<T>& Tensor<T>::masked_fill_(const Tensor<uint8_t>& mask, const T& value) {
+        // Overwrites in place every element whose (broadcast) mask entry is non-zero; an empty tensor is left untouched.
+        if (empty()) {
+            return *this;
+        }
+        tensor_broadcast_shape(shape_, mask.shape());  // called for its compatibility check only; the result is unused
+        const uint8_t* const mask_values        = mask.data();
+        const std::vector<int64_t> self_strides = tensor_compute_strides(shape_);
+        const std::vector<int64_t> mask_strides = tensor_compute_strides(mask.shape());
+        const auto fill_if_masked = [&](int64_t, int64_t self_offset, int64_t mask_offset) {
+            if (mask_values[mask_offset] != 0) {
+                data_[static_cast<size_t>(self_offset)] = value;
+            }
+        };
+        tensor_for_each_broadcast_offset(shape_,
+                                         shape_, self_strides,
+                                         mask.shape(), mask_strides,
+                                         fill_if_masked);
+        return *this;
+    }
+
+    template <typename T, typename Scalar, typename = std::enable_if_t<std::is_arithmetic<Scalar>::value>>
+    inline Tensor<uint8_t> operator<(const Tensor<T>& lhs, Scalar rhs) {
+        const T threshold = static_cast<T>(rhs);  // the scalar is converted to T once, before any comparison
+        Tensor<uint8_t> mask(lhs.shape());
+        for (int64_t idx = 0, count = lhs.numel(); idx < count; ++idx) {
+            mask[idx] = lhs[idx] < threshold ? 1 : 0;
+        }
+        return mask;
+    }
+
+    template <typename T, typename Scalar, typename = std::enable_if_t<std::is_arithmetic<Scalar>::value>>
+    inline Tensor<uint8_t> operator<(Scalar lhs, const Tensor<T>& rhs) {
+        const T bound = static_cast<T>(lhs);  // the scalar is converted to T once, before any comparison
+        Tensor<uint8_t> mask(rhs.shape());
+        for (int64_t idx = 0, count = rhs.numel(); idx < count; ++idx) {
+            mask[idx] = bound < rhs[idx] ? 1 : 0;
+        }
+        return mask;
+    }
+
+    template <typename T>
+    inline Tensor<uint8_t> operator<(const Tensor<T>& lhs, const Tensor<T>& rhs) {
+        // Element-wise lhs < rhs with broadcasting; every result element is 1 (true) or 0 (false).
+        const std::vector<int64_t> out_shape   = tensor_broadcast_shape(lhs.shape(), rhs.shape());
+        const std::vector<int64_t> lhs_strides = tensor_compute_strides(lhs.shape());
+        const std::vector<int64_t> rhs_strides = tensor_compute_strides(rhs.shape());
+        const T* const lhs_values              = lhs.data();
+        const T* const rhs_values              = rhs.data();
+        Tensor<uint8_t> mask(out_shape);
+        const auto compare_at = [&](int64_t flat, int64_t lhs_offset, int64_t rhs_offset) {
+            mask[flat] = lhs_values[lhs_offset] < rhs_values[rhs_offset] ? 1 : 0;
+        };
+        tensor_for_each_broadcast_offset(out_shape,
+                                         lhs.shape(), lhs_strides,
+                                         rhs.shape(), rhs_strides,
+                                         compare_at);
+        return mask;
+    }
+
+    template <typename T>
+    inline Tensor<T>& operator+=(Tensor<T>& lhs, const Tensor<T>& rhs) {
+        if (lhs.shape() != rhs.shape()) {
+            // Shapes differ: rhs is broadcast onto lhs, and the shape of lhs drives the iteration.
+            tensor_broadcast_shape(lhs.shape(), rhs.shape());  // compatibility check only; the result is unused
+            const std::vector<int64_t> lhs_strides = tensor_compute_strides(lhs.shape());
+            const std::vector<int64_t> rhs_strides = tensor_compute_strides(rhs.shape());
+            const T* const rhs_values              = rhs.data();
+            const auto add_at = [&](int64_t, int64_t lhs_offset, int64_t rhs_offset) {
+                lhs[lhs_offset] += rhs_values[rhs_offset];
+            };
+            tensor_for_each_broadcast_offset(lhs.shape(),
+                                             lhs.shape(), lhs_strides,
+                                             rhs.shape(), rhs_strides,
+                                             add_at);
+            return lhs;
+        }
+        for (int64_t idx = 0, count = lhs.numel(); idx < count; ++idx) {
+            lhs[idx] += rhs[idx];  // identical shapes: plain element-wise update
+        }
+        return lhs;
+    }
+
+    template <typename T, typename Scalar, typename = std::enable_if_t<std::is_arithmetic<Scalar>::value>>
+    inline Tensor<T>& operator+=(Tensor<T>& lhs, Scalar rhs) {
+        const T addend = static_cast<T>(rhs);  // converted to T once; every element then uses T arithmetic
+        for (int64_t idx = 0, count = lhs.numel(); idx < count; ++idx) {
+            lhs[idx] += addend;
+        }
+        return lhs;
+    }
+
+    template <typename T>
+    inline Tensor<T>& operator-=(Tensor<T>& lhs, const Tensor<T>& rhs) {
+        if (lhs.shape() != rhs.shape()) {
+            // Shapes differ: rhs is broadcast onto lhs, and the shape of lhs drives the iteration.
+            tensor_broadcast_shape(lhs.shape(), rhs.shape());  // compatibility check only; the result is unused
+            const std::vector<int64_t> lhs_strides = tensor_compute_strides(lhs.shape());
+            const std::vector<int64_t> rhs_strides = tensor_compute_strides(rhs.shape());
+            const T* const rhs_values              = rhs.data();
+            const auto subtract_at = [&](int64_t, int64_t lhs_offset, int64_t rhs_offset) {
+                lhs[lhs_offset] -= rhs_values[rhs_offset];
+            };
+            tensor_for_each_broadcast_offset(lhs.shape(),
+                                             lhs.shape(), lhs_strides,
+                                             rhs.shape(), rhs_strides,
+                                             subtract_at);
+            return lhs;
+        }
+        for (int64_t idx = 0, count = lhs.numel(); idx < count; ++idx) {
+            lhs[idx] -= rhs[idx];  // identical shapes: plain element-wise update
+        }
+        return lhs;
+    }
+
+    template <typename T, typename Scalar, typename = std::enable_if_t<std::is_arithmetic<Scalar>::value>>
+    inline Tensor<T>& operator-=(Tensor<T>& lhs, Scalar rhs) {
+        const T subtrahend = static_cast<T>(rhs);  // converted to T once; every element then uses T arithmetic
+        for (int64_t idx = 0, count = lhs.numel(); idx < count; ++idx) {
+            lhs[idx] -= subtrahend;
+        }
+        return lhs;
+    }
+
+    template <typename T>
+    inline Tensor<T>& operator*=(Tensor<T>& lhs, const Tensor<T>& rhs) {
+        if (lhs.shape() != rhs.shape()) {
+            // Shapes differ: rhs is broadcast onto lhs, and the shape of lhs drives the iteration.
+            tensor_broadcast_shape(lhs.shape(), rhs.shape());  // compatibility check only; the result is unused
+            const std::vector<int64_t> lhs_strides = tensor_compute_strides(lhs.shape());
+            const std::vector<int64_t> rhs_strides = tensor_compute_strides(rhs.shape());
+            const T* const rhs_values              = rhs.data();
+            const auto scale_at = [&](int64_t, int64_t lhs_offset, int64_t rhs_offset) {
+                lhs[lhs_offset] *= rhs_values[rhs_offset];
+            };
+            tensor_for_each_broadcast_offset(lhs.shape(),
+                                             lhs.shape(), lhs_strides,
+                                             rhs.shape(), rhs_strides,
+                                             scale_at);
+            return lhs;
+        }
+        for (int64_t idx = 0, count = lhs.numel(); idx < count; ++idx) {
+            lhs[idx] *= rhs[idx];  // identical shapes: plain element-wise update
+        }
+        return lhs;
+    }
+
+    template <typename T, typename Scalar, typename = std::enable_if_t<std::is_arithmetic<Scalar>::value>>
+    inline Tensor<T>& operator*=(Tensor<T>& lhs, Scalar rhs) {
+        const T factor = static_cast<T>(rhs);  // converted to T once; every element then uses T arithmetic
+        for (int64_t idx = 0, count = lhs.numel(); idx < count; ++idx) {
+            lhs[idx] *= factor;
+        }
+        return lhs;
+    }
+
+    template <typename T>
+    inline Tensor<T>& operator/=(Tensor<T>& lhs, const Tensor<T>& rhs) {
+        if (lhs.shape() != rhs.shape()) {
+            // Shapes differ: rhs is broadcast onto lhs, and the shape of lhs drives the iteration.
+            tensor_broadcast_shape(lhs.shape(), rhs.shape());  // compatibility check only; the result is unused
+            const std::vector<int64_t> lhs_strides = tensor_compute_strides(lhs.shape());
+            const std::vector<int64_t> rhs_strides = tensor_compute_strides(rhs.shape());
+            const T* const rhs_values              = rhs.data();
+            const auto divide_at = [&](int64_t, int64_t lhs_offset, int64_t rhs_offset) {
+                lhs[lhs_offset] /= rhs_values[rhs_offset];
+            };
+            tensor_for_each_broadcast_offset(lhs.shape(),
+                                             lhs.shape(), lhs_strides,
+                                             rhs.shape(), rhs_strides,
+                                             divide_at);
+            return lhs;
+        }
+        for (int64_t idx = 0, count = lhs.numel(); idx < count; ++idx) {
+            lhs[idx] /= rhs[idx];  // identical shapes: plain element-wise update
+        }
+        return lhs;
+    }
+
+    template <typename T, typename Scalar, typename = std::enable_if_t<std::is_arithmetic<Scalar>::value>>
+    inline Tensor<T>& operator/=(Tensor<T>& lhs, Scalar rhs) {
+        const T divisor = static_cast<T>(rhs);  // converted to T once; every element then uses T arithmetic
+        for (int64_t idx = 0, count = lhs.numel(); idx < count; ++idx) {
+            lhs[idx] /= divisor;
+        }
+        return lhs;
+    }
+
+    template <typename T>
+    inline Tensor<T> operator+(Tensor<T> lhs, const Tensor<T>& rhs) {
+        if (lhs.shape() == rhs.shape()) {
+            lhs += rhs;  // same shape: the by-value copy of lhs doubles as the result
+            return lhs;
+        }
+        // Shapes differ: build a fresh tensor with the broadcast shape.
+        const std::vector<int64_t> out_shape   = tensor_broadcast_shape(lhs.shape(), rhs.shape());
+        const std::vector<int64_t> lhs_strides = tensor_compute_strides(lhs.shape());
+        const std::vector<int64_t> rhs_strides = tensor_compute_strides(rhs.shape());
+        const T* const lhs_values              = lhs.data();
+        const T* const rhs_values              = rhs.data();
+        Tensor<T> sum(out_shape);
+        const auto add_at = [&](int64_t flat, int64_t lhs_offset, int64_t rhs_offset) {
+            sum[flat] = lhs_values[lhs_offset] + rhs_values[rhs_offset];
+        };
+        tensor_for_each_broadcast_offset(out_shape,
+                                         lhs.shape(), lhs_strides,
+                                         rhs.shape(), rhs_strides,
+                                         add_at);
+        return sum;
+    }
+
+    template <typename T, typename Scalar, typename = std::enable_if_t<std::is_arithmetic<Scalar>::value>>
+    inline Tensor<T> operator+(Tensor<T> lhs, Scalar rhs) {  // tensor + scalar; lhs arrives by value and is returned as the result
+        lhs += rhs;  // delegates to the in-place scalar overload
+        return lhs;
+    }
+
+    template <typename T, typename Scalar, typename = std::enable_if_t<std::is_arithmetic<Scalar>::value>>
+    inline Tensor<T> operator+(Scalar lhs, Tensor<T> rhs) {  // scalar + tensor; rhs arrives by value and is returned as the result
+        rhs += lhs;  // operands are swapped onto the in-place scalar overload
+        return rhs;
+    }
+
+    template <typename T>
+    inline Tensor<T> operator-(Tensor<T> lhs, const Tensor<T>& rhs) {
+        if (lhs.shape() == rhs.shape()) {
+            lhs -= rhs;  // same shape: the by-value copy of lhs doubles as the result
+            return lhs;
+        }
+        // Shapes differ: build a fresh tensor with the broadcast shape.
+        const std::vector<int64_t> out_shape   = tensor_broadcast_shape(lhs.shape(), rhs.shape());
+        const std::vector<int64_t> lhs_strides = tensor_compute_strides(lhs.shape());
+        const std::vector<int64_t> rhs_strides = tensor_compute_strides(rhs.shape());
+        const T* const lhs_values              = lhs.data();
+        const T* const rhs_values              = rhs.data();
+        Tensor<T> difference(out_shape);
+        const auto subtract_at = [&](int64_t flat, int64_t lhs_offset, int64_t rhs_offset) {
+            difference[flat] = lhs_values[lhs_offset] - rhs_values[rhs_offset];
+        };
+        tensor_for_each_broadcast_offset(out_shape,
+                                         lhs.shape(), lhs_strides,
+                                         rhs.shape(), rhs_strides,
+                                         subtract_at);
+        return difference;
+    }
+
+    template <typename T, typename Scalar, typename = std::enable_if_t<std::is_arithmetic<Scalar>::value>>
+    inline Tensor<T> operator-(Tensor<T> lhs, Scalar rhs) {  // tensor - scalar; lhs arrives by value and is returned as the result
+        lhs -= rhs;  // delegates to the in-place scalar overload
+        return lhs;
+    }
+
+    template <typename T, typename Scalar, typename = std::enable_if_t<std::is_arithmetic<Scalar>::value>>
+    inline Tensor<T> operator-(Scalar lhs, const Tensor<T>& rhs) {
+        const T minuend      = static_cast<T>(lhs);
+        Tensor<T> difference = rhs;  // start from a copy of rhs, then overwrite each element with minuend - element
+        for (int64_t idx = 0, count = difference.numel(); idx < count; ++idx) {
+            difference[idx] = minuend - difference[idx];
+        }
+        return difference;
+    }
+
+    template <typename T>
+    inline Tensor<T> operator*(Tensor<T> lhs, const Tensor<T>& rhs) {
+        if (lhs.shape() == rhs.shape()) {
+            lhs *= rhs;  // same shape: the by-value copy of lhs doubles as the result
+            return lhs;
+        }
+        // Shapes differ: build a fresh tensor with the broadcast shape.
+        const std::vector<int64_t> out_shape   = tensor_broadcast_shape(lhs.shape(), rhs.shape());
+        const std::vector<int64_t> lhs_strides = tensor_compute_strides(lhs.shape());
+        const std::vector<int64_t> rhs_strides = tensor_compute_strides(rhs.shape());
+        const T* const lhs_values              = lhs.data();
+        const T* const rhs_values              = rhs.data();
+        Tensor<T> product(out_shape);
+        const auto multiply_at = [&](int64_t flat, int64_t lhs_offset, int64_t rhs_offset) {
+            product[flat] = lhs_values[lhs_offset] * rhs_values[rhs_offset];
+        };
+        tensor_for_each_broadcast_offset(out_shape,
+                                         lhs.shape(), lhs_strides,
+                                         rhs.shape(), rhs_strides,
+                                         multiply_at);
+        return product;
+    }
+
+    template <typename T, typename Scalar, typename = std::enable_if_t<std::is_arithmetic<Scalar>::value>>
+    inline Tensor<T> operator*(Tensor<T> lhs, Scalar rhs) {  // tensor * scalar; lhs arrives by value and is returned as the result
+        lhs *= rhs;  // delegates to the in-place scalar overload
+        return lhs;
+    }
+
+    template <typename T, typename Scalar, typename = std::enable_if_t<std::is_arithmetic<Scalar>::value>>
+    inline Tensor<T> operator*(Scalar lhs, Tensor<T> rhs) {  // scalar * tensor; rhs arrives by value and is returned as the result
+        rhs *= lhs;  // operands are swapped onto the in-place scalar overload
+        return rhs;
+    }
+
+    template <typename T>
+    inline Tensor<T> operator/(Tensor<T> lhs, const Tensor<T>& rhs) {
+        if (lhs.shape() == rhs.shape()) {
+            lhs /= rhs;  // same shape: the by-value copy of lhs doubles as the result
+            return lhs;
+        }
+        // Shapes differ: build a fresh tensor with the broadcast shape.
+        const std::vector<int64_t> out_shape   = tensor_broadcast_shape(lhs.shape(), rhs.shape());
+        const std::vector<int64_t> lhs_strides = tensor_compute_strides(lhs.shape());
+        const std::vector<int64_t> rhs_strides = tensor_compute_strides(rhs.shape());
+        const T* const lhs_values              = lhs.data();
+        const T* const rhs_values              = rhs.data();
+        Tensor<T> quotient(out_shape);
+        const auto divide_at = [&](int64_t flat, int64_t lhs_offset, int64_t rhs_offset) {
+            quotient[flat] = lhs_values[lhs_offset] / rhs_values[rhs_offset];
+        };
+        tensor_for_each_broadcast_offset(out_shape,
+                                         lhs.shape(), lhs_strides,
+                                         rhs.shape(), rhs_strides,
+                                         divide_at);
+        return quotient;
+    }
+
+    template <typename T, typename Scalar, typename = std::enable_if_t<std::is_arithmetic<Scalar>::value>>
+    inline Tensor<T> operator/(Tensor<T> lhs, Scalar rhs) {  // tensor / scalar; lhs arrives by value and is returned as the result
+        lhs /= rhs;  // delegates to the in-place scalar overload
+        return lhs;
+    }
+
+    template <typename T, typename Scalar, typename = std::enable_if_t<std::is_arithmetic<Scalar>::value>>
+    inline Tensor<T> operator/(Scalar lhs, const Tensor<T>& rhs) {
+        const T dividend   = static_cast<T>(lhs);
+        Tensor<T> quotient = rhs;  // start from a copy of rhs, then overwrite each element with dividend / element
+        for (int64_t idx = 0, count = quotient.numel(); idx < count; ++idx) {
+            quotient[idx] = dividend / quotient[idx];
+        }
+        return quotient;
+    }
+
+    template <typename T>
+    inline Tensor<T> operator-(const Tensor<T>& tensor) {
+        Tensor<T> negated = tensor;  // copy first, then flip the sign of every element of the copy
+        for (int64_t idx = 0, count = negated.numel(); idx < count; ++idx) {
+            negated[idx] = -negated[idx];
+        }
+        return negated;
+    }
+
+    template <typename T>
+    inline Tensor<T> zeros(std::vector<int64_t> shape) {  // free-function shorthand for Tensor<T>::zeros
+        return Tensor<T>::zeros(std::move(shape));  // shape was taken by value, so it is moved on
+    }
+
+    template <typename T>
+    inline Tensor<T> full(std::vector<int64_t> shape, const T& value) {  // free-function shorthand for Tensor<T>::full
+        return Tensor<T>::full(std::move(shape), value);  // shape was taken by value, so it is moved on
+    }
+
+    template <typename T>
+    inline Tensor<T> randn(std::vector<int64_t> shape, const std::shared_ptr<RNG>& rng) {  // free-function shorthand for Tensor<T>::randn
+        return Tensor<T>::randn(std::move(shape), rng);  // rng is passed through unchanged
+    }
+
+    template <typename T>
+    inline Tensor<T> randn_like(const Tensor<T>& tensor, const std::shared_ptr<RNG>& rng) {  // only the shape of `tensor` is used
+        return Tensor<T>::randn(tensor.shape(), rng);  // same call as randn(), with the shape taken from `tensor`
+    }
+
+    template <typename T>
+    inline std::vector<T> tensor_to_vector(const Tensor<T>& tensor) {  // hands back the tensor's elements as a std::vector<T>
+        return tensor.values();  // whatever values() exposes is returned by value here
+    }
+
+    namespace ops {
+        enum class InterpolateMode {  // resampling strategies accepted by ops::interpolate
+            Nearest,  // the only mode declared here; interpolate() rejects any other value
+        };
+
+        inline int64_t normalize_slice_bound(int64_t index, int64_t dim_size) {
+            // A negative index is shifted by dim_size exactly once (counting from the end); the result is
+            // not range-checked here. Non-negative indices pass through unchanged.
+            const int64_t shift = index < 0 ? dim_size : 0;
+            return index + shift;
+        }
+
+        template <typename T>
+        inline std::pair<int64_t, int64_t> resolve_slice_bounds(const Tensor<T>& input,
+                                                                size_t dim,
+                                                                int64_t start,
+                                                                int64_t end) {
+            // Checks dim, wraps negative bounds once, and returns the validated half-open range [start, end).
+            if (static_cast<size_t>(input.dim()) <= dim) {
+                tensor_throw_invalid_argument("Tensor slice dimension out of range: dim=" +
+                                              std::to_string(dim) + ", rank=" +
+                                              std::to_string(input.dim()) + ", input_shape=" +
+                                              tensor_shape_to_string(input.shape()));
+            }
+            const int64_t extent = input.shape()[dim];
+            const int64_t first  = normalize_slice_bound(start, extent);
+            const int64_t last   = normalize_slice_bound(end, extent);
+            const bool in_range  = 0 <= first && first <= last && last <= extent;
+            if (!in_range) {
+                tensor_throw_invalid_argument("Tensor slice bounds out of range: dim=" +
+                                              std::to_string(dim) + ", start=" +
+                                              std::to_string(first) + ", end=" +
+                                              std::to_string(last) + ", input_shape=" +
+                                              tensor_shape_to_string(input.shape()));
+            }
+
+            return {first, last};
+        }
+
+        template <typename T>
+        inline Tensor<T> exp(const Tensor<T>& input) {
+            Tensor<T> result(input.shape());
+            for (int64_t idx = 0, count = input.numel(); idx < count; ++idx) {
+                result[idx] = static_cast<T>(std::exp(static_cast<double>(input[idx])));  // evaluated in double, narrowed to T
+            }
+            return result;
+        }
+
+        template <typename T>
+        inline Tensor<T> clamp(const Tensor<T>& input, const T& min_value, const T& max_value) {
+            if (max_value < min_value) {  // an inverted range is reported before any element is touched
+                tensor_throw_invalid_argument("Tensor clamp requires min_value <= max_value");
+            }
+            Tensor<T> result(input.shape());
+            for (int64_t idx = 0, count = input.numel(); idx < count; ++idx) {
+                result[idx] = std::clamp(input[idx], min_value, max_value);
+            }
+            return result;
+        }
+
+        template <typename T>
+        inline Tensor<T> round(const Tensor<T>& input) {
+            Tensor<T> result(input.shape());
+            for (int64_t idx = 0, count = input.numel(); idx < count; ++idx) {
+                result[idx] = static_cast<T>(std::round(static_cast<double>(input[idx])));  // evaluated in double, narrowed to T
+            }
+            return result;
+        }
+
+        template <typename T>
+        inline Tensor<T> slice(const Tensor<T>& input,
+                               size_t dim,
+                               int64_t start,
+                               int64_t end) {  // copies the half-open range [start, end) along dim into a new tensor
+            auto [resolved_start, resolved_end] = resolve_slice_bounds(input, dim, start, end);  // wraps negatives and validates
+            std::vector<int64_t> out_shape      = input.shape();
+            out_shape[dim]                      = resolved_end - resolved_start;  // only the sliced dimension changes
+
+            Tensor<T> output(out_shape);
+            if (output.numel() == 0) {
+                return output;  // nothing to copy
+            }
+
+            int64_t inner = 1;  // product of the extents of the dimensions before dim
+            for (size_t i = 0; i < dim; ++i) {
+                inner *= input.shape()[i];
+            }
+
+            int64_t outer = 1;  // product of the extents of the dimensions after dim
+            for (size_t i = dim + 1; i < static_cast<size_t>(input.dim()); ++i) {
+                outer *= input.shape()[i];
+            }
+
+            int64_t src_chunk  = (resolved_end - resolved_start) * inner;  // elements copied per outer step
+            int64_t src_stride = input.shape()[dim] * inner;               // elements between outer steps in the input
+            for (int64_t i = 0; i < outer; ++i) {
+                const int64_t src_offset = i * src_stride + resolved_start * inner;  // skip to the first selected index
+                const int64_t dst_offset = i * src_chunk;                            // chunks are packed back to back
+                std::copy_n(input.data() + src_offset, src_chunk, output.data() + dst_offset);
+            }
+
+            return output;
+        }
+
+        template <typename T>
+        inline Tensor<T> narrow(const Tensor<T>& input, size_t dim, int64_t start, int64_t length) {
+            // Returns `length` elements beginning at `start` along `dim`; a negative start counts from the end.
+            if (length < 0) {
+                tensor_throw_invalid_argument("Tensor narrow requires non-negative length: length=" +
+                                              std::to_string(length) + ", input_shape=" + tensor_shape_to_string(input.shape()));
+            }
+            if (start < 0 && dim < static_cast<size_t>(input.dim()) && start + input.shape()[dim] >= 0) {
+                start += input.shape()[dim];  // wrap now: slice() would re-read end = start + length <= 0 as a wrapped or invalid bound
+            }
+            return slice(input, dim, start, start + length);
+        }
+
+        template <typename T>
+        inline void slice_assign(Tensor<T>* dst,
+                                 size_t dim,
+                                 int64_t start,
+                                 int64_t end,
+                                 const Tensor<T>& src) {  // writes src into the range [start, end) of *dst along dim
+            if (dst == nullptr) {
+                tensor_throw_invalid_argument("Tensor slice_assign requires non-null dst");
+            }
+
+            auto [resolved_start, resolved_end] = resolve_slice_bounds(*dst, dim, start, end);  // wraps negatives and validates
+            if (src.dim() != dst->dim()) {
+                tensor_throw_invalid_argument("Tensor slice_assign requires matching rank: dst_shape=" +
+                                              tensor_shape_to_string(dst->shape()) + ", src_shape=" +
+                                              tensor_shape_to_string(src.shape()));
+            }
+
+            std::vector<int64_t> expected_shape = dst->shape();  // src must equal dst's shape except along dim
+            expected_shape[dim]                 = resolved_end - resolved_start;
+            if (src.shape() != expected_shape) {
+                tensor_throw_invalid_argument("Tensor slice_assign requires matching source shape: dst_shape=" +
+                                              tensor_shape_to_string(dst->shape()) + ", src_shape=" +
+                                              tensor_shape_to_string(src.shape()) + ", expected_src_shape=" +
+                                              tensor_shape_to_string(expected_shape));
+            }
+
+            if (src.numel() == 0) {
+                return;  // nothing to copy
+            }
+
+            int64_t inner = 1;  // product of the extents of the dimensions before dim
+            for (size_t i = 0; i < dim; ++i) {
+                inner *= dst->shape()[i];
+            }
+
+            int64_t outer = 1;  // product of the extents of the dimensions after dim
+            for (size_t i = dim + 1; i < static_cast<size_t>(dst->dim()); ++i) {
+                outer *= dst->shape()[i];
+            }
+
+            int64_t dst_chunk  = (resolved_end - resolved_start) * inner;  // elements written per outer step
+            int64_t dst_stride = dst->shape()[dim] * inner;                // elements between outer steps in dst
+            for (int64_t i = 0; i < outer; ++i) {
+                const int64_t dst_offset = i * dst_stride + resolved_start * inner;  // skip to the first selected index
+                const int64_t src_offset = i * dst_chunk;                            // src chunks are packed back to back
+                std::copy_n(src.data() + src_offset, dst_chunk, dst->data() + dst_offset);
+            }
+        }
+
+        template <typename T>
+        inline void fill_slice(Tensor<T>* dst,
+                               size_t dim,
+                               int64_t start,
+                               int64_t end,
+                               const T& value) {  // sets every element in the range [start, end) of *dst along dim to value
+            if (dst == nullptr) {
+                tensor_throw_invalid_argument("Tensor fill_slice requires non-null dst");
+            }
+
+            auto [resolved_start, resolved_end] = resolve_slice_bounds(*dst, dim, start, end);  // wraps negatives and validates
+            int64_t inner                       = 1;  // product of the extents of the dimensions before dim
+            for (size_t i = 0; i < dim; ++i) {
+                inner *= dst->shape()[i];
+            }
+
+            int64_t outer = 1;  // product of the extents of the dimensions after dim
+            for (size_t i = dim + 1; i < static_cast<size_t>(dst->dim()); ++i) {
+                outer *= dst->shape()[i];
+            }
+
+            int64_t chunk  = (resolved_end - resolved_start) * inner;  // elements filled per outer step
+            int64_t stride = dst->shape()[dim] * inner;                // elements between outer steps in dst
+            for (int64_t i = 0; i < outer; ++i) {
+                const int64_t offset = i * stride + resolved_start * inner;  // skip to the first selected index
+                std::fill_n(dst->data() + offset, chunk, value);
+            }
+        }
+
+        template <typename T>
+        inline Tensor<T> interpolate(const Tensor<T>& input,
+                                     std::vector<int64_t> output_shape,
+                                     InterpolateMode mode = InterpolateMode::Nearest,
+                                     bool align_corners   = false) {  // resamples input to output_shape (nearest mode only)
+            if (mode != InterpolateMode::Nearest) {
+                tensor_throw_invalid_argument("Only nearest interpolate mode is implemented, got mode=" +
+                                              std::to_string(static_cast<int>(mode)));
+            }
+            if (align_corners) {
+                tensor_throw_invalid_argument("align_corners is not supported for nearest interpolate: input_shape=" +
+                                              tensor_shape_to_string(input.shape()) + ", output_shape=" +
+                                              tensor_shape_to_string(output_shape));
+            }
+            if (input.shape() == output_shape) {
+                return input;  // shapes already match: a copy of the input is returned, before the checks below run
+            }
+            if (input.dim() != static_cast<int64_t>(output_shape.size())) {
+                tensor_throw_invalid_argument("Tensor interpolate requires matching rank: input_dim=" +
+                                              std::to_string(input.dim()) + ", output_dim=" +
+                                              std::to_string(output_shape.size()) + ", input_shape=" +
+                                              tensor_shape_to_string(input.shape()) + ", output_shape=" +
+                                              tensor_shape_to_string(output_shape));
+            }
+            for (size_t i = 0; i < output_shape.size(); ++i) {  // every extent on both sides must be strictly positive
+                if (output_shape[i] <= 0) {
+                    tensor_throw_invalid_argument("Tensor interpolate output shape must be positive: input_shape=" +
+                                                  tensor_shape_to_string(input.shape()) + ", output_shape=" +
+                                                  tensor_shape_to_string(output_shape));
+                }
+                if (input.shape()[i] <= 0) {
+                    tensor_throw_invalid_argument("Tensor interpolate input shape must be positive: input_shape=" +
+                                                  tensor_shape_to_string(input.shape()) + ", output_shape=" +
+                                                  tensor_shape_to_string(output_shape));
+                }
+            }
+
+            Tensor<T> output(std::move(output_shape));  // output_shape is moved from here; use output.shape() below
+            for (int64_t flat = 0; flat < output.numel(); ++flat) {
+                std::vector<int64_t> output_coord = tensor_unravel_index(flat, output.shape());
+                std::vector<int64_t> input_coord(static_cast<size_t>(input.dim()), 0);
+                for (size_t i = 0; i < static_cast<size_t>(input.dim()); ++i) {
+                    input_coord[i] = output_coord[i] * input.shape()[i] / output.shape()[i];  // source index via integer division
+                }
+                output[flat] = input.index(input_coord);
+            }
+
+            return output;
+        }
+
+        // Nearest-neighbour resize: exactly one of `size` (target extents) or `scale_factor` (per-axis
+        // multipliers, result floored) must be set. Entry i applies to axis i of `input`; remaining axes
+        // keep their extent. The resulting shape is forwarded to the explicit-shape interpolate overload.
+        template <typename T>
+        inline Tensor<T> interpolate(const Tensor<T>& input,
+                                     const std::optional<std::vector<int64_t>>& size,
+                                     const std::optional<std::vector<double>>& scale_factor,
+                                     InterpolateMode mode = InterpolateMode::Nearest,
+                                     bool align_corners   = false) {
+            if (mode != InterpolateMode::Nearest) {
+                tensor_throw_invalid_argument("Only nearest interpolate mode is implemented, got mode=" +
+                                              std::to_string(static_cast<int>(mode)));
+            }
+            if (align_corners) {
+                tensor_throw_invalid_argument("align_corners is not supported for nearest interpolate: input_shape=" +
+                                              tensor_shape_to_string(input.shape()));
+            }
+            if (size.has_value() == scale_factor.has_value()) {
+                tensor_throw_invalid_argument("Tensor interpolate requires exactly one of size or scale_factor: input_shape=" +
+                                              tensor_shape_to_string(input.shape()));
+            }
+
+            std::vector<int64_t> output_shape = input.shape();
+            if (size.has_value()) {
+                if (size->empty() || size->size() > output_shape.size()) {
+                    tensor_throw_invalid_argument("Tensor interpolate size must target low dimensions: input_shape=" +
+                                                  tensor_shape_to_string(input.shape()) + ", size_rank=" + std::to_string(size->size()));
+                }
+                for (size_t i = 0; i < size->size(); ++i) {
+                    if ((*size)[i] <= 0) {
+                        tensor_throw_invalid_argument("Tensor interpolate size must be positive: input_shape=" +
+                                                      tensor_shape_to_string(input.shape()) + ", size=" + tensor_shape_to_string(*size));
+                    }
+                    output_shape[i] = (*size)[i];
+                }
+            } else {
+                if (scale_factor->empty() || scale_factor->size() > output_shape.size()) {
+                    tensor_throw_invalid_argument("Tensor interpolate scale_factor must target low dimensions: input_shape=" +
+                                                  tensor_shape_to_string(input.shape()) + ", scale_factor_rank=" + std::to_string(scale_factor->size()));
+                }
+                for (size_t i = 0; i < scale_factor->size(); ++i) {
+                    const double factor = (*scale_factor)[i];
+                    // Written to reject NaN and infinity as well: converting a non-finite product to int64_t is undefined.
+                    if (!(factor > 0.0) || !std::isfinite(factor)) {
+                        tensor_throw_invalid_argument("Tensor interpolate scale_factor must be positive: input_shape=" +
+                                                      tensor_shape_to_string(input.shape()));
+                    }
+                    output_shape[i] = static_cast<int64_t>(std::floor(static_cast<double>(output_shape[i]) * factor));
+                    if (output_shape[i] <= 0) {
+                        tensor_throw_invalid_argument("Tensor interpolate output shape must be positive: input_shape=" +
+                                                      tensor_shape_to_string(input.shape()) + ", output_shape=" + tensor_shape_to_string(output_shape));
+                    }
+                }
+            }
+
+            return interpolate(input, std::move(output_shape), mode, align_corners);
+        }
+
+        // Scalar-factor convenience overload: `scale_factor` is replicated into a per-axis vector (one entry per
+        // `size` element when `size` is set, else one per axis of `input`) and forwarded. NOTE(review): with `size` set, both optionals arrive populated and the optional-based overload throws - confirm intent.
+        template <typename T>
+        inline Tensor<T> interpolate(const Tensor<T>& input,
+                                     const std::optional<std::vector<int64_t>>& size,
+                                     double scale_factor,
+                                     InterpolateMode mode = InterpolateMode::Nearest,
+                                     bool align_corners   = false) {
+            const auto factor_count = size.has_value() ? size->size() : input.dim();
+            std::vector<double> per_axis_factors(factor_count, scale_factor);
+            return interpolate(input, size, std::move(per_axis_factors), mode, align_corners);
+        }
+
+        // Joins `lhs` and `rhs` along axis `dim` into a newly allocated tensor.
+        // Both inputs must have the same rank and agree on every axis except `dim`;
+        // violations are reported through tensor_throw_invalid_argument.
+        template <typename T>
+        inline Tensor<T> concat(const Tensor<T>& lhs, const Tensor<T>& rhs, size_t dim) {
+            if (lhs.dim() != rhs.dim()) {
+                tensor_throw_invalid_argument("Tensor concat requires same rank: lhs_dim=" + std::to_string(lhs.dim()) +
+                                              ", rhs_dim=" + std::to_string(rhs.dim()) + ", lhs_shape=" + tensor_shape_to_string(lhs.shape()) +
+                                              ", rhs_shape=" + tensor_shape_to_string(rhs.shape()));
+            }
+            const size_t rank = static_cast<size_t>(lhs.dim());
+            if (dim >= rank) {
+                tensor_throw_invalid_argument("Tensor concat dimension out of range: dim=" + std::to_string(dim) +
+                                              ", rank=" + std::to_string(lhs.dim()) +
+                                              ", lhs_shape=" + tensor_shape_to_string(lhs.shape()));
+            }
+
+            const auto& lhs_shape = lhs.shape();
+            const auto& rhs_shape = rhs.shape();
+            for (size_t axis = 0; axis < rank; ++axis) {
+                if (axis != dim && lhs_shape[axis] != rhs_shape[axis]) {
+                    tensor_throw_invalid_argument("Tensor concat requires matching non-concat dimensions: dim=" + std::to_string(dim) +
+                                                  ", lhs_shape=" + tensor_shape_to_string(lhs_shape) +
+                                                  ", rhs_shape=" + tensor_shape_to_string(rhs_shape));
+                }
+            }
+            std::vector<int64_t> joined_shape = lhs_shape;
+            joined_shape[dim] += rhs_shape[dim];
+            Tensor<T> out(joined_shape);
+
+            // Axes below `dim` form one contiguous run of elements; axes above it count how often that run repeats.
+            int64_t inner = 1;
+            int64_t outer = 1;
+            for (size_t axis = 0; axis < rank; ++axis) {
+                if (axis < dim) {
+                    inner *= lhs_shape[axis];
+                } else if (axis > dim) {
+                    outer *= lhs_shape[axis];
+                }
+            }
+
+            const int64_t lhs_run = lhs_shape[dim] * inner;
+            const int64_t rhs_run = rhs_shape[dim] * inner;
+            auto lhs_src          = lhs.data();
+            auto rhs_src          = rhs.data();
+            auto dst              = out.data();
+            for (int64_t block = 0; block < outer; ++block) {
+                dst = std::copy_n(lhs_src, lhs_run, dst);
+                dst = std::copy_n(rhs_src, rhs_run, dst);
+                lhs_src += lhs_run;
+                rhs_src += rhs_run;
+            }
+            return out;
+        }
+
+        // Splits `tensor` along axis `dim` into `chunks` equally sized tensors, returned in order of
+        // increasing offset. An axis of extent 0 yields an empty vector; otherwise the extent must be
+        // divisible by `chunks`. Invalid arguments are reported through tensor_throw_invalid_argument.
+        template <typename T>
+        inline std::vector<Tensor<T>> chunk(const Tensor<T>& tensor, int64_t chunks, size_t dim) {
+            if (chunks <= 0) {
+                tensor_throw_invalid_argument("Tensor chunk requires chunks > 0: chunks=" + std::to_string(chunks) +
+                                              ", tensor_shape=" + tensor_shape_to_string(tensor.shape()));
+            }
+            const size_t rank = static_cast<size_t>(tensor.dim());
+            if (dim >= rank) {
+                tensor_throw_invalid_argument("Tensor chunk dimension out of range: dim=" + std::to_string(dim) +
+                                              ", rank=" + std::to_string(tensor.dim()) +
+                                              ", tensor_shape=" + tensor_shape_to_string(tensor.shape()));
+            }
+
+            const auto& full_shape = tensor.shape();
+            const int64_t extent   = full_shape[dim];
+            if (extent == 0) {
+                return {};
+            }
+            if (extent % chunks != 0) {
+                tensor_throw_invalid_argument("Tensor chunk requires the dimension size to be divisible by chunks: dim=" + std::to_string(dim) +
+                                              ", dim_size=" + std::to_string(extent) + ", chunks=" + std::to_string(chunks) +
+                                              ", tensor_shape=" + tensor_shape_to_string(full_shape));
+            }
+
+            // Axes below `dim` form one contiguous run of elements; axes above it count how often that run repeats.
+            int64_t inner = 1;
+            int64_t outer = 1;
+            for (size_t axis = 0; axis < rank; ++axis) {
+                if (axis < dim) {
+                    inner *= full_shape[axis];
+                } else if (axis > dim) {
+                    outer *= full_shape[axis];
+                }
+            }
+
+            const int64_t part_extent       = extent / chunks;
+            const int64_t run_elems         = part_extent * inner;
+            std::vector<int64_t> part_shape = full_shape;
+            part_shape[dim]                 = part_extent;
+
+            std::vector<Tensor<T>> parts;
+            parts.reserve(static_cast<size_t>(chunks));
+            for (int64_t first = 0; first < extent; first += part_extent) {
+                Tensor<T> part(part_shape);
+                auto dst = part.data();
+                for (int64_t block = 0; block < outer; ++block) {
+                    // Start of this part's slice inside repetition `block` of the source.
+                    const int64_t src_offset = (block * extent + first) * inner;
+                    dst                      = std::copy_n(tensor.data() + src_offset, run_elems, dst);
+                }
+                parts.push_back(std::move(part));
+            }
+
+            return parts;
+        }
+
+    }  // namespace ops
+
+}  // namespace sd
+
+#endif
diff --git a/src/tensor_ggml.hpp b/src/tensor_ggml.hpp
new file mode 100644
index 0000000..493a958
--- /dev/null
+++ b/src/tensor_ggml.hpp
@@ -0,0 +1,161 @@
+#ifndef __SD_TENSOR_GGML_HPP__
+#define __SD_TENSOR_GGML_HPP__
+
+#include <algorithm>
+#include <array>
+#include <cstdint>
+#include <cstring>
+#include <fstream>
+#include <stdexcept>
+#include <string>
+#include <type_traits>
+#include <vector>
+
+#include "ggml-backend.h"
+#include "ggml.h"
+#include "tensor.hpp"
+
+namespace sd {
+
+    // Compile-time map from a C++ element type to the matching ggml_type tag.
+    // The primary template is only declared here, so an element type without one of the
+    // specializations below cannot be used with the conversion helpers in this header.
+    template <typename T>
+    struct GGMLTypeTraits;
+
+    template <>
+    struct GGMLTypeTraits<float> {
+        static constexpr ggml_type type = GGML_TYPE_F32;
+    };
+
+    template <>
+    struct GGMLTypeTraits<ggml_fp16_t> {
+        static constexpr ggml_type type = GGML_TYPE_F16;
+    };
+
+    template <>
+    struct GGMLTypeTraits<int32_t> {
+        static constexpr ggml_type type = GGML_TYPE_I32;
+    };
+
+    template <>
+    struct GGMLTypeTraits<int64_t> {
+        static constexpr ggml_type type = GGML_TYPE_I64;
+    };
+
+    // Returns the extents of `tensor` (ne[0] first); the result has ggml_n_dims(tensor) entries.
+    // `tensor` is dereferenced without a check and must not be null.
+    inline std::vector<int64_t> shape_from_ggml(const ggml_tensor* tensor) {
+        const int n_dims = ggml_n_dims(tensor);
+        std::vector<int64_t> shape;
+        shape.reserve(static_cast<size_t>(n_dims));
+        for (int i = 0; i < n_dims; ++i) {
+            shape.push_back(tensor->ne[i]);
+        }
+        return shape;
+    }
+
+    // Copies a ggml tensor into a newly allocated sd::Tensor<T> shaped by shape_from_ggml.
+    // A null `tensor` yields a default-constructed Tensor; an element type different from
+    // GGMLTypeTraits<T>::type aborts. Exactly ggml_nbytes(tensor) bytes are copied.
+    template <typename T>
+    inline Tensor<T> make_sd_tensor_from_ggml(const ggml_tensor* tensor) {
+        if (tensor == nullptr) {
+            return {};
+        }
+        if (tensor->type != GGMLTypeTraits<T>::type) {
+            GGML_ABORT("ggml tensor type does not match sd::Tensor type");
+        }
+        Tensor<T> result(shape_from_ggml(tensor));
+        const size_t n_bytes = ggml_nbytes(tensor);
+        if (tensor->buffer != nullptr) {
+            // Attached to a backend buffer: read the data through the backend API.
+            ggml_backend_tensor_get(tensor, result.data(), 0, n_bytes);
+        } else if (n_bytes > 0) {
+            // No backend buffer: the bytes are read straight from tensor->data, which must be set.
+            GGML_ASSERT(tensor->data != nullptr);
+            std::memcpy(result.data(), tensor->data, n_bytes);
+        }
+        return result;
+    }
+
+    // Creates a tensor in `ctx` with the shape of `tensor`; when `copy_data` is true and the
+    // tensor is non-empty its elements are copied into the new tensor. Ranks 1 to 5 are accepted:
+    // at most GGML_MAX_DIMS axes are passed to ggml, so a fifth axis is folded into ne[3].
+    template <typename T>
+    inline ggml_tensor* make_ggml_tensor(ggml_context* ctx, const Tensor<T>& tensor, bool copy_data = true) {
+        GGML_ASSERT(tensor.dim() > 0 && tensor.dim() <= 5);
+
+        const int n_dims = std::min(static_cast<int>(tensor.dim()), GGML_MAX_DIMS);
+
+        std::array<int64_t, GGML_MAX_DIMS> ne = {1, 1, 1, 1};
+        for (int i = 0; i < n_dims; ++i) {
+            ne[static_cast<size_t>(i)] = tensor.shape()[static_cast<size_t>(i)];
+        }
+
+        if (tensor.dim() == 5) {
+            ne[3] *= tensor.shape()[4];
+        }
+
+        ggml_tensor* result = ggml_new_tensor(ctx, GGMLTypeTraits<T>::type, n_dims, ne.data());
+        if (copy_data && tensor.numel() > 0) {
+            // Copying needs host storage in the new tensor; fail loudly instead of writing through null.
+            GGML_ASSERT(result->data != nullptr);
+            std::memcpy(result->data, tensor.data(), static_cast<size_t>(ggml_nbytes(result)));
+        }
+        return result;
+    }
+
+    // Loads a tensor from a binary dump file laid out as: int32 n_dims, int32 name length,
+    // int32 ggml type, n_dims int32 extents, the name bytes, then the raw element data.
+    // The stored name is read and discarded. Throws std::runtime_error when the file cannot be
+    // opened, is truncated, or carries an out-of-range header or extent, and
+    // std::invalid_argument when the stored type is not GGMLTypeTraits<T>::type.
+    template <typename T>
+    inline Tensor<T> load_tensor_from_file_as_tensor(const std::string& file_path) {
+        std::ifstream file(file_path, std::ios::binary);
+        if (!file.is_open()) {
+            throw std::runtime_error("failed to open tensor file: " + file_path);
+        }
+
+        int32_t n_dims = 0;
+        int32_t length = 0;
+        int32_t ttype  = 0;
+        file.read(reinterpret_cast<char*>(&n_dims), sizeof(n_dims));
+        file.read(reinterpret_cast<char*>(&length), sizeof(length));
+        file.read(reinterpret_cast<char*>(&ttype), sizeof(ttype));
+        if (!file.good()) {
+            throw std::runtime_error("incomplete tensor file header: " + file_path);
+        }
+        if (static_cast<ggml_type>(ttype) != GGMLTypeTraits<T>::type) {
+            throw std::invalid_argument("tensor file type does not match requested sd::Tensor type");
+        }
+        // The header comes from the file and is not trusted: bound it before it sizes or indexes anything.
+        if (n_dims < 0 || n_dims > GGML_MAX_DIMS || length < 0) {
+            throw std::runtime_error("invalid tensor file header: " + file_path);
+        }
+
+        std::vector<int64_t> shape;
+        shape.reserve(static_cast<size_t>(n_dims));
+        for (int i = 0; i < n_dims; ++i) {
+            int32_t dim = 1;
+            file.read(reinterpret_cast<char*>(&dim), sizeof(dim));
+            if (!file.good() || dim < 0) {
+                throw std::runtime_error("invalid tensor file shape: " + file_path);
+            }
+            shape.push_back(dim);
+        }
+        std::string name(static_cast<size_t>(length), '\0');
+        file.read(name.data(), length);
+
+        Tensor<T> tensor(shape);
+        file.read(reinterpret_cast<char*>(tensor.data()), static_cast<std::streamsize>(tensor.numel() * sizeof(T)));
+        if (!file.good()) {
+            throw std::runtime_error("incomplete tensor file data: " + file_path);
+        }
+        return tensor;
+    }
+
+}  // namespace sd
+
+#endif
diff --git a/src/tokenize_util.cpp b/src/tokenize_util.cpp
index 22cf8ae..33fdad2 100644
--- a/src/tokenize_util.cpp
+++ b/src/tokenize_util.cpp
@@ -1,993 +1,993 @@
-#include <algorithm>
-#include <iostream>
-#include <string>
-#include <vector>
-
-#include "tokenize_util.h"
-
-bool is_number(char32_t ch) {
-    return (ch >= U'0' && ch <= U'9');
-}
-
-bool is_letter(char32_t ch) {
-    static const struct { char32_t start, end; } ranges[] = {
-        {0x41, 0x5A},
-        {0x61, 0x7A},
-        {0xAA, 0xAA},
-        {0xB5, 0xB5},
-        {0xBA, 0xBA},
-        {0xC0, 0xD6},
-        {0xD8, 0xF6},
-        {0xF8, 0x2C1},
-        {0x2C6, 0x2D1},
-        {0x2E0, 0x2E4},
-        {0x2EC, 0x2EC},
-        {0x2EE, 0x2EE},
-        {0x370, 0x374},
-        {0x376, 0x377},
-        {0x37A, 0x37D},
-        {0x37F, 0x37F},
-        {0x386, 0x386},
-        {0x388, 0x38A},
-        {0x38C, 0x38C},
-        {0x38E, 0x3A1},
-        {0x3A3, 0x3F5},
-        {0x3F7, 0x481},
-        {0x48A, 0x52F},
-        {0x531, 0x556},
-        {0x559, 0x559},
-        {0x560, 0x588},
-        {0x5D0, 0x5EA},
-        {0x5EF, 0x5F2},
-        {0x620, 0x64A},
-        {0x66E, 0x66F},
-        {0x671, 0x6D3},
-        {0x6D5, 0x6D5},
-        {0x6E5, 0x6E6},
-        {0x6EE, 0x6EF},
-        {0x6FA, 0x6FC},
-        {0x6FF, 0x6FF},
-        {0x710, 0x710},
-        {0x712, 0x72F},
-        {0x74D, 0x7A5},
-        {0x7B1, 0x7B1},
-        {0x7CA, 0x7EA},
-        {0x7F4, 0x7F5},
-        {0x7FA, 0x7FA},
-        {0x800, 0x815},
-        {0x81A, 0x81A},
-        {0x824, 0x824},
-        {0x828, 0x828},
-        {0x840, 0x858},
-        {0x860, 0x86A},
-        {0x870, 0x887},
-        {0x889, 0x88F},
-        {0x8A0, 0x8C9},
-        {0x904, 0x939},
-        {0x93D, 0x93D},
-        {0x950, 0x950},
-        {0x958, 0x961},
-        {0x971, 0x980},
-        {0x985, 0x98C},
-        {0x98F, 0x990},
-        {0x993, 0x9A8},
-        {0x9AA, 0x9B0},
-        {0x9B2, 0x9B2},
-        {0x9B6, 0x9B9},
-        {0x9BD, 0x9BD},
-        {0x9CE, 0x9CE},
-        {0x9DC, 0x9DD},
-        {0x9DF, 0x9E1},
-        {0x9F0, 0x9F1},
-        {0x9FC, 0x9FC},
-        {0xA05, 0xA0A},
-        {0xA0F, 0xA10},
-        {0xA13, 0xA28},
-        {0xA2A, 0xA30},
-        {0xA32, 0xA33},
-        {0xA35, 0xA36},
-        {0xA38, 0xA39},
-        {0xA59, 0xA5C},
-        {0xA5E, 0xA5E},
-        {0xA72, 0xA74},
-        {0xA85, 0xA8D},
-        {0xA8F, 0xA91},
-        {0xA93, 0xAA8},
-        {0xAAA, 0xAB0},
-        {0xAB2, 0xAB3},
-        {0xAB5, 0xAB9},
-        {0xABD, 0xABD},
-        {0xAD0, 0xAD0},
-        {0xAE0, 0xAE1},
-        {0xAF9, 0xAF9},
-        {0xB05, 0xB0C},
-        {0xB0F, 0xB10},
-        {0xB13, 0xB28},
-        {0xB2A, 0xB30},
-        {0xB32, 0xB33},
-        {0xB35, 0xB39},
-        {0xB3D, 0xB3D},
-        {0xB5C, 0xB5D},
-        {0xB5F, 0xB61},
-        {0xB71, 0xB71},
-        {0xB83, 0xB83},
-        {0xB85, 0xB8A},
-        {0xB8E, 0xB90},
-        {0xB92, 0xB95},
-        {0xB99, 0xB9A},
-        {0xB9C, 0xB9C},
-        {0xB9E, 0xB9F},
-        {0xBA3, 0xBA4},
-        {0xBA8, 0xBAA},
-        {0xBAE, 0xBB9},
-        {0xBD0, 0xBD0},
-        {0xC05, 0xC0C},
-        {0xC0E, 0xC10},
-        {0xC12, 0xC28},
-        {0xC2A, 0xC39},
-        {0xC3D, 0xC3D},
-        {0xC58, 0xC5A},
-        {0xC5C, 0xC5D},
-        {0xC60, 0xC61},
-        {0xC80, 0xC80},
-        {0xC85, 0xC8C},
-        {0xC8E, 0xC90},
-        {0xC92, 0xCA8},
-        {0xCAA, 0xCB3},
-        {0xCB5, 0xCB9},
-        {0xCBD, 0xCBD},
-        {0xCDC, 0xCDE},
-        {0xCE0, 0xCE1},
-        {0xCF1, 0xCF2},
-        {0xD04, 0xD0C},
-        {0xD0E, 0xD10},
-        {0xD12, 0xD3A},
-        {0xD3D, 0xD3D},
-        {0xD4E, 0xD4E},
-        {0xD54, 0xD56},
-        {0xD5F, 0xD61},
-        {0xD7A, 0xD7F},
-        {0xD85, 0xD96},
-        {0xD9A, 0xDB1},
-        {0xDB3, 0xDBB},
-        {0xDBD, 0xDBD},
-        {0xDC0, 0xDC6},
-        {0xE01, 0xE30},
-        {0xE32, 0xE33},
-        {0xE40, 0xE46},
-        {0xE81, 0xE82},
-        {0xE84, 0xE84},
-        {0xE86, 0xE8A},
-        {0xE8C, 0xEA3},
-        {0xEA5, 0xEA5},
-        {0xEA7, 0xEB0},
-        {0xEB2, 0xEB3},
-        {0xEBD, 0xEBD},
-        {0xEC0, 0xEC4},
-        {0xEC6, 0xEC6},
-        {0xEDC, 0xEDF},
-        {0xF00, 0xF00},
-        {0xF40, 0xF47},
-        {0xF49, 0xF6C},
-        {0xF88, 0xF8C},
-        {0x1000, 0x102A},
-        {0x103F, 0x103F},
-        {0x1050, 0x1055},
-        {0x105A, 0x105D},
-        {0x1061, 0x1061},
-        {0x1065, 0x1066},
-        {0x106E, 0x1070},
-        {0x1075, 0x1081},
-        {0x108E, 0x108E},
-        {0x10A0, 0x10C5},
-        {0x10C7, 0x10C7},
-        {0x10CD, 0x10CD},
-        {0x10D0, 0x10FA},
-        {0x10FC, 0x1248},
-        {0x124A, 0x124D},
-        {0x1250, 0x1256},
-        {0x1258, 0x1258},
-        {0x125A, 0x125D},
-        {0x1260, 0x1288},
-        {0x128A, 0x128D},
-        {0x1290, 0x12B0},
-        {0x12B2, 0x12B5},
-        {0x12B8, 0x12BE},
-        {0x12C0, 0x12C0},
-        {0x12C2, 0x12C5},
-        {0x12C8, 0x12D6},
-        {0x12D8, 0x1310},
-        {0x1312, 0x1315},
-        {0x1318, 0x135A},
-        {0x1380, 0x138F},
-        {0x13A0, 0x13F5},
-        {0x13F8, 0x13FD},
-        {0x1401, 0x166C},
-        {0x166F, 0x167F},
-        {0x1681, 0x169A},
-        {0x16A0, 0x16EA},
-        {0x16F1, 0x16F8},
-        {0x1700, 0x1711},
-        {0x171F, 0x1731},
-        {0x1740, 0x1751},
-        {0x1760, 0x176C},
-        {0x176E, 0x1770},
-        {0x1780, 0x17B3},
-        {0x17D7, 0x17D7},
-        {0x17DC, 0x17DC},
-        {0x1820, 0x1878},
-        {0x1880, 0x1884},
-        {0x1887, 0x18A8},
-        {0x18AA, 0x18AA},
-        {0x18B0, 0x18F5},
-        {0x1900, 0x191E},
-        {0x1950, 0x196D},
-        {0x1970, 0x1974},
-        {0x1980, 0x19AB},
-        {0x19B0, 0x19C9},
-        {0x1A00, 0x1A16},
-        {0x1A20, 0x1A54},
-        {0x1AA7, 0x1AA7},
-        {0x1B05, 0x1B33},
-        {0x1B45, 0x1B4C},
-        {0x1B83, 0x1BA0},
-        {0x1BAE, 0x1BAF},
-        {0x1BBA, 0x1BE5},
-        {0x1C00, 0x1C23},
-        {0x1C4D, 0x1C4F},
-        {0x1C5A, 0x1C7D},
-        {0x1C80, 0x1C8A},
-        {0x1C90, 0x1CBA},
-        {0x1CBD, 0x1CBF},
-        {0x1CE9, 0x1CEC},
-        {0x1CEE, 0x1CF3},
-        {0x1CF5, 0x1CF6},
-        {0x1CFA, 0x1CFA},
-        {0x1D00, 0x1DBF},
-        {0x1E00, 0x1F15},
-        {0x1F18, 0x1F1D},
-        {0x1F20, 0x1F45},
-        {0x1F48, 0x1F4D},
-        {0x1F50, 0x1F57},
-        {0x1F59, 0x1F59},
-        {0x1F5B, 0x1F5B},
-        {0x1F5D, 0x1F5D},
-        {0x1F5F, 0x1F7D},
-        {0x1F80, 0x1FB4},
-        {0x1FB6, 0x1FBC},
-        {0x1FBE, 0x1FBE},
-        {0x1FC2, 0x1FC4},
-        {0x1FC6, 0x1FCC},
-        {0x1FD0, 0x1FD3},
-        {0x1FD6, 0x1FDB},
-        {0x1FE0, 0x1FEC},
-        {0x1FF2, 0x1FF4},
-        {0x1FF6, 0x1FFC},
-        {0x2071, 0x2071},
-        {0x207F, 0x207F},
-        {0x2090, 0x209C},
-        {0x2102, 0x2102},
-        {0x2107, 0x2107},
-        {0x210A, 0x2113},
-        {0x2115, 0x2115},
-        {0x2119, 0x211D},
-        {0x2124, 0x2124},
-        {0x2126, 0x2126},
-        {0x2128, 0x2128},
-        {0x212A, 0x212D},
-        {0x212F, 0x2139},
-        {0x213C, 0x213F},
-        {0x2145, 0x2149},
-        {0x214E, 0x214E},
-        {0x2183, 0x2184},
-        {0x2C00, 0x2CE4},
-        {0x2CEB, 0x2CEE},
-        {0x2CF2, 0x2CF3},
-        {0x2D00, 0x2D25},
-        {0x2D27, 0x2D27},
-        {0x2D2D, 0x2D2D},
-        {0x2D30, 0x2D67},
-        {0x2D6F, 0x2D6F},
-        {0x2D80, 0x2D96},
-        {0x2DA0, 0x2DA6},
-        {0x2DA8, 0x2DAE},
-        {0x2DB0, 0x2DB6},
-        {0x2DB8, 0x2DBE},
-        {0x2DC0, 0x2DC6},
-        {0x2DC8, 0x2DCE},
-        {0x2DD0, 0x2DD6},
-        {0x2DD8, 0x2DDE},
-        {0x2E2F, 0x2E2F},
-        {0x3005, 0x3006},
-        {0x3031, 0x3035},
-        {0x303B, 0x303C},
-        {0x3041, 0x3096},
-        {0x309D, 0x309F},
-        {0x30A1, 0x30FA},
-        {0x30FC, 0x30FF},
-        {0x3105, 0x312F},
-        {0x3131, 0x318E},
-        {0x31A0, 0x31BF},
-        {0x31F0, 0x31FF},
-        {0x3400, 0x4DBF},
-        {0x4E00, 0xA48C},
-        {0xA4D0, 0xA4FD},
-        {0xA500, 0xA60C},
-        {0xA610, 0xA61F},
-        {0xA62A, 0xA62B},
-        {0xA640, 0xA66E},
-        {0xA67F, 0xA69D},
-        {0xA6A0, 0xA6E5},
-        {0xA717, 0xA71F},
-        {0xA722, 0xA788},
-        {0xA78B, 0xA7DC},
-        {0xA7F1, 0xA801},
-        {0xA803, 0xA805},
-        {0xA807, 0xA80A},
-        {0xA80C, 0xA822},
-        {0xA840, 0xA873},
-        {0xA882, 0xA8B3},
-        {0xA8F2, 0xA8F7},
-        {0xA8FB, 0xA8FB},
-        {0xA8FD, 0xA8FE},
-        {0xA90A, 0xA925},
-        {0xA930, 0xA946},
-        {0xA960, 0xA97C},
-        {0xA984, 0xA9B2},
-        {0xA9CF, 0xA9CF},
-        {0xA9E0, 0xA9E4},
-        {0xA9E6, 0xA9EF},
-        {0xA9FA, 0xA9FE},
-        {0xAA00, 0xAA28},
-        {0xAA40, 0xAA42},
-        {0xAA44, 0xAA4B},
-        {0xAA60, 0xAA76},
-        {0xAA7A, 0xAA7A},
-        {0xAA7E, 0xAAAF},
-        {0xAAB1, 0xAAB1},
-        {0xAAB5, 0xAAB6},
-        {0xAAB9, 0xAABD},
-        {0xAAC0, 0xAAC0},
-        {0xAAC2, 0xAAC2},
-        {0xAADB, 0xAADD},
-        {0xAAE0, 0xAAEA},
-        {0xAAF2, 0xAAF4},
-        {0xAB01, 0xAB06},
-        {0xAB09, 0xAB0E},
-        {0xAB11, 0xAB16},
-        {0xAB20, 0xAB26},
-        {0xAB28, 0xAB2E},
-        {0xAB30, 0xAB5A},
-        {0xAB5C, 0xAB69},
-        {0xAB70, 0xABE2},
-        {0xAC00, 0xD7A3},
-        {0xD7B0, 0xD7C6},
-        {0xD7CB, 0xD7FB},
-        {0xF900, 0xFA6D},
-        {0xFA70, 0xFAD9},
-        {0xFB00, 0xFB06},
-        {0xFB13, 0xFB17},
-        {0xFB1D, 0xFB1D},
-        {0xFB1F, 0xFB28},
-        {0xFB2A, 0xFB36},
-        {0xFB38, 0xFB3C},
-        {0xFB3E, 0xFB3E},
-        {0xFB40, 0xFB41},
-        {0xFB43, 0xFB44},
-        {0xFB46, 0xFBB1},
-        {0xFBD3, 0xFD3D},
-        {0xFD50, 0xFD8F},
-        {0xFD92, 0xFDC7},
-        {0xFDF0, 0xFDFB},
-        {0xFE70, 0xFE74},
-        {0xFE76, 0xFEFC},
-        {0xFF21, 0xFF3A},
-        {0xFF41, 0xFF5A},
-        {0xFF66, 0xFFBE},
-        {0xFFC2, 0xFFC7},
-        {0xFFCA, 0xFFCF},
-        {0xFFD2, 0xFFD7},
-        {0xFFDA, 0xFFDC},
-        {0x10000, 0x1000B},
-        {0x1000D, 0x10026},
-        {0x10028, 0x1003A},
-        {0x1003C, 0x1003D},
-        {0x1003F, 0x1004D},
-        {0x10050, 0x1005D},
-        {0x10080, 0x100FA},
-        {0x10280, 0x1029C},
-        {0x102A0, 0x102D0},
-        {0x10300, 0x1031F},
-        {0x1032D, 0x10340},
-        {0x10342, 0x10349},
-        {0x10350, 0x10375},
-        {0x10380, 0x1039D},
-        {0x103A0, 0x103C3},
-        {0x103C8, 0x103CF},
-        {0x10400, 0x1049D},
-        {0x104B0, 0x104D3},
-        {0x104D8, 0x104FB},
-        {0x10500, 0x10527},
-        {0x10530, 0x10563},
-        {0x10570, 0x1057A},
-        {0x1057C, 0x1058A},
-        {0x1058C, 0x10592},
-        {0x10594, 0x10595},
-        {0x10597, 0x105A1},
-        {0x105A3, 0x105B1},
-        {0x105B3, 0x105B9},
-        {0x105BB, 0x105BC},
-        {0x105C0, 0x105F3},
-        {0x10600, 0x10736},
-        {0x10740, 0x10755},
-        {0x10760, 0x10767},
-        {0x10780, 0x10785},
-        {0x10787, 0x107B0},
-        {0x107B2, 0x107BA},
-        {0x10800, 0x10805},
-        {0x10808, 0x10808},
-        {0x1080A, 0x10835},
-        {0x10837, 0x10838},
-        {0x1083C, 0x1083C},
-        {0x1083F, 0x10855},
-        {0x10860, 0x10876},
-        {0x10880, 0x1089E},
-        {0x108E0, 0x108F2},
-        {0x108F4, 0x108F5},
-        {0x10900, 0x10915},
-        {0x10920, 0x10939},
-        {0x10940, 0x10959},
-        {0x10980, 0x109B7},
-        {0x109BE, 0x109BF},
-        {0x10A00, 0x10A00},
-        {0x10A10, 0x10A13},
-        {0x10A15, 0x10A17},
-        {0x10A19, 0x10A35},
-        {0x10A60, 0x10A7C},
-        {0x10A80, 0x10A9C},
-        {0x10AC0, 0x10AC7},
-        {0x10AC9, 0x10AE4},
-        {0x10B00, 0x10B35},
-        {0x10B40, 0x10B55},
-        {0x10B60, 0x10B72},
-        {0x10B80, 0x10B91},
-        {0x10C00, 0x10C48},
-        {0x10C80, 0x10CB2},
-        {0x10CC0, 0x10CF2},
-        {0x10D00, 0x10D23},
-        {0x10D4A, 0x10D65},
-        {0x10D6F, 0x10D85},
-        {0x10E80, 0x10EA9},
-        {0x10EB0, 0x10EB1},
-        {0x10EC2, 0x10EC7},
-        {0x10F00, 0x10F1C},
-        {0x10F27, 0x10F27},
-        {0x10F30, 0x10F45},
-        {0x10F70, 0x10F81},
-        {0x10FB0, 0x10FC4},
-        {0x10FE0, 0x10FF6},
-        {0x11003, 0x11037},
-        {0x11071, 0x11072},
-        {0x11075, 0x11075},
-        {0x11083, 0x110AF},
-        {0x110D0, 0x110E8},
-        {0x11103, 0x11126},
-        {0x11144, 0x11144},
-        {0x11147, 0x11147},
-        {0x11150, 0x11172},
-        {0x11176, 0x11176},
-        {0x11183, 0x111B2},
-        {0x111C1, 0x111C4},
-        {0x111DA, 0x111DA},
-        {0x111DC, 0x111DC},
-        {0x11200, 0x11211},
-        {0x11213, 0x1122B},
-        {0x1123F, 0x11240},
-        {0x11280, 0x11286},
-        {0x11288, 0x11288},
-        {0x1128A, 0x1128D},
-        {0x1128F, 0x1129D},
-        {0x1129F, 0x112A8},
-        {0x112B0, 0x112DE},
-        {0x11305, 0x1130C},
-        {0x1130F, 0x11310},
-        {0x11313, 0x11328},
-        {0x1132A, 0x11330},
-        {0x11332, 0x11333},
-        {0x11335, 0x11339},
-        {0x1133D, 0x1133D},
-        {0x11350, 0x11350},
-        {0x1135D, 0x11361},
-        {0x11380, 0x11389},
-        {0x1138B, 0x1138B},
-        {0x1138E, 0x1138E},
-        {0x11390, 0x113B5},
-        {0x113B7, 0x113B7},
-        {0x113D1, 0x113D1},
-        {0x113D3, 0x113D3},
-        {0x11400, 0x11434},
-        {0x11447, 0x1144A},
-        {0x1145F, 0x11461},
-        {0x11480, 0x114AF},
-        {0x114C4, 0x114C5},
-        {0x114C7, 0x114C7},
-        {0x11580, 0x115AE},
-        {0x115D8, 0x115DB},
-        {0x11600, 0x1162F},
-        {0x11644, 0x11644},
-        {0x11680, 0x116AA},
-        {0x116B8, 0x116B8},
-        {0x11700, 0x1171A},
-        {0x11740, 0x11746},
-        {0x11800, 0x1182B},
-        {0x118A0, 0x118DF},
-        {0x118FF, 0x11906},
-        {0x11909, 0x11909},
-        {0x1190C, 0x11913},
-        {0x11915, 0x11916},
-        {0x11918, 0x1192F},
-        {0x1193F, 0x1193F},
-        {0x11941, 0x11941},
-        {0x119A0, 0x119A7},
-        {0x119AA, 0x119D0},
-        {0x119E1, 0x119E1},
-        {0x119E3, 0x119E3},
-        {0x11A00, 0x11A00},
-        {0x11A0B, 0x11A32},
-        {0x11A3A, 0x11A3A},
-        {0x11A50, 0x11A50},
-        {0x11A5C, 0x11A89},
-        {0x11A9D, 0x11A9D},
-        {0x11AB0, 0x11AF8},
-        {0x11BC0, 0x11BE0},
-        {0x11C00, 0x11C08},
-        {0x11C0A, 0x11C2E},
-        {0x11C40, 0x11C40},
-        {0x11C72, 0x11C8F},
-        {0x11D00, 0x11D06},
-        {0x11D08, 0x11D09},
-        {0x11D0B, 0x11D30},
-        {0x11D46, 0x11D46},
-        {0x11D60, 0x11D65},
-        {0x11D67, 0x11D68},
-        {0x11D6A, 0x11D89},
-        {0x11D98, 0x11D98},
-        {0x11DB0, 0x11DDB},
-        {0x11EE0, 0x11EF2},
-        {0x11F02, 0x11F02},
-        {0x11F04, 0x11F10},
-        {0x11F12, 0x11F33},
-        {0x11FB0, 0x11FB0},
-        {0x12000, 0x12399},
-        {0x12480, 0x12543},
-        {0x12F90, 0x12FF0},
-        {0x13000, 0x1342F},
-        {0x13441, 0x13446},
-        {0x13460, 0x143FA},
-        {0x14400, 0x14646},
-        {0x16100, 0x1611D},
-        {0x16800, 0x16A38},
-        {0x16A40, 0x16A5E},
-        {0x16A70, 0x16ABE},
-        {0x16AD0, 0x16AED},
-        {0x16B00, 0x16B2F},
-        {0x16B40, 0x16B43},
-        {0x16B63, 0x16B77},
-        {0x16B7D, 0x16B8F},
-        {0x16D40, 0x16D6C},
-        {0x16E40, 0x16E7F},
-        {0x16EA0, 0x16EB8},
-        {0x16EBB, 0x16ED3},
-        {0x16F00, 0x16F4A},
-        {0x16F50, 0x16F50},
-        {0x16F93, 0x16F9F},
-        {0x16FE0, 0x16FE1},
-        {0x16FE3, 0x16FE3},
-        {0x16FF2, 0x16FF3},
-        {0x17000, 0x18CD5},
-        {0x18CFF, 0x18D1E},
-        {0x18D80, 0x18DF2},
-        {0x1AFF0, 0x1AFF3},
-        {0x1AFF5, 0x1AFFB},
-        {0x1AFFD, 0x1AFFE},
-        {0x1B000, 0x1B122},
-        {0x1B132, 0x1B132},
-        {0x1B150, 0x1B152},
-        {0x1B155, 0x1B155},
-        {0x1B164, 0x1B167},
-        {0x1B170, 0x1B2FB},
-        {0x1BC00, 0x1BC6A},
-        {0x1BC70, 0x1BC7C},
-        {0x1BC80, 0x1BC88},
-        {0x1BC90, 0x1BC99},
-        {0x1D400, 0x1D454},
-        {0x1D456, 0x1D49C},
-        {0x1D49E, 0x1D49F},
-        {0x1D4A2, 0x1D4A2},
-        {0x1D4A5, 0x1D4A6},
-        {0x1D4A9, 0x1D4AC},
-        {0x1D4AE, 0x1D4B9},
-        {0x1D4BB, 0x1D4BB},
-        {0x1D4BD, 0x1D4C3},
-        {0x1D4C5, 0x1D505},
-        {0x1D507, 0x1D50A},
-        {0x1D50D, 0x1D514},
-        {0x1D516, 0x1D51C},
-        {0x1D51E, 0x1D539},
-        {0x1D53B, 0x1D53E},
-        {0x1D540, 0x1D544},
-        {0x1D546, 0x1D546},
-        {0x1D54A, 0x1D550},
-        {0x1D552, 0x1D6A5},
-        {0x1D6A8, 0x1D6C0},
-        {0x1D6C2, 0x1D6DA},
-        {0x1D6DC, 0x1D6FA},
-        {0x1D6FC, 0x1D714},
-        {0x1D716, 0x1D734},
-        {0x1D736, 0x1D74E},
-        {0x1D750, 0x1D76E},
-        {0x1D770, 0x1D788},
-        {0x1D78A, 0x1D7A8},
-        {0x1D7AA, 0x1D7C2},
-        {0x1D7C4, 0x1D7CB},
-        {0x1DF00, 0x1DF1E},
-        {0x1DF25, 0x1DF2A},
-        {0x1E030, 0x1E06D},
-        {0x1E100, 0x1E12C},
-        {0x1E137, 0x1E13D},
-        {0x1E14E, 0x1E14E},
-        {0x1E290, 0x1E2AD},
-        {0x1E2C0, 0x1E2EB},
-        {0x1E4D0, 0x1E4EB},
-        {0x1E5D0, 0x1E5ED},
-        {0x1E5F0, 0x1E5F0},
-        {0x1E6C0, 0x1E6DE},
-        {0x1E6E0, 0x1E6E2},
-        {0x1E6E4, 0x1E6E5},
-        {0x1E6E7, 0x1E6ED},
-        {0x1E6F0, 0x1E6F4},
-        {0x1E6FE, 0x1E6FF},
-        {0x1E7E0, 0x1E7E6},
-        {0x1E7E8, 0x1E7EB},
-        {0x1E7ED, 0x1E7EE},
-        {0x1E7F0, 0x1E7FE},
-        {0x1E800, 0x1E8C4},
-        {0x1E900, 0x1E943},
-        {0x1E94B, 0x1E94B},
-        {0x1EE00, 0x1EE03},
-        {0x1EE05, 0x1EE1F},
-        {0x1EE21, 0x1EE22},
-        {0x1EE24, 0x1EE24},
-        {0x1EE27, 0x1EE27},
-        {0x1EE29, 0x1EE32},
-        {0x1EE34, 0x1EE37},
-        {0x1EE39, 0x1EE39},
-        {0x1EE3B, 0x1EE3B},
-        {0x1EE42, 0x1EE42},
-        {0x1EE47, 0x1EE47},
-        {0x1EE49, 0x1EE49},
-        {0x1EE4B, 0x1EE4B},
-        {0x1EE4D, 0x1EE4F},
-        {0x1EE51, 0x1EE52},
-        {0x1EE54, 0x1EE54},
-        {0x1EE57, 0x1EE57},
-        {0x1EE59, 0x1EE59},
-        {0x1EE5B, 0x1EE5B},
-        {0x1EE5D, 0x1EE5D},
-        {0x1EE5F, 0x1EE5F},
-        {0x1EE61, 0x1EE62},
-        {0x1EE64, 0x1EE64},
-        {0x1EE67, 0x1EE6A},
-        {0x1EE6C, 0x1EE72},
-        {0x1EE74, 0x1EE77},
-        {0x1EE79, 0x1EE7C},
-        {0x1EE7E, 0x1EE7E},
-        {0x1EE80, 0x1EE89},
-        {0x1EE8B, 0x1EE9B},
-        {0x1EEA1, 0x1EEA3},
-        {0x1EEA5, 0x1EEA9},
-        {0x1EEAB, 0x1EEBB},
-        {0x20000, 0x2A6DF},
-        {0x2A700, 0x2B81D},
-        {0x2B820, 0x2CEAD},
-        {0x2CEB0, 0x2EBE0},
-        {0x2EBF0, 0x2EE5D},
-        {0x2F800, 0x2FA1D},
-        {0x30000, 0x3134A},
-        {0x31350, 0x33479},
-    };
-
-    for (const auto& r : ranges) {
-        if (ch >= r.start && ch <= r.end)
-            return true;
-    }
-    return false;
-}
-
-bool is_space(char32_t cp) {
-    switch (cp) {
-        case 0x0009:  // TAB \t
-        case 0x000A:  // LF \n
-        case 0x000B:  // VT
-        case 0x000C:  // FF
-        case 0x000D:  // CR \r
-        case 0x0020:  // Space
-        case 0x00A0:  // No-Break Space
-        case 0x1680:  // Ogham Space Mark
-        case 0x2000:  // En Quad
-        case 0x2001:  // Em Quad
-        case 0x2002:  // En Space
-        case 0x2003:  // Em Space
-        case 0x2004:  // Three-Per-Em Space
-        case 0x2005:  // Four-Per-Em Space
-        case 0x2006:  // Six-Per-Em Space
-        case 0x2007:  // Figure Space
-        case 0x2008:  // Punctuation Space
-        case 0x2009:  // Thin Space
-        case 0x200A:  // Hair Space
-        case 0x202F:  // Narrow No-Break Space
-        case 0x205F:  // Medium Mathematical Space
-        case 0x3000:  // Ideographic Space
-            return true;
-        default:
-            return false;
-    }
-}
-
-std::string str_to_lower(const std::string& input) {
-    std::string result = input;
-    std::transform(result.begin(), result.end(), result.begin(),
-                   [](unsigned char c) { return std::tolower(c); });
-    return result;
-}
-
-// UTF-8 -> Unicode code points
-std::vector<char32_t> utf8_to_codepoints(const std::string& str) {
-    std::vector<char32_t> codepoints;
-    size_t i = 0;
-    while (i < str.size()) {
-        unsigned char c    = str[i];
-        char32_t cp        = 0;
-        size_t extra_bytes = 0;
-
-        if ((c & 0x80) == 0)
-            cp = c;
-        else if ((c & 0xE0) == 0xC0) {
-            cp          = c & 0x1F;
-            extra_bytes = 1;
-        } else if ((c & 0xF0) == 0xE0) {
-            cp          = c & 0x0F;
-            extra_bytes = 2;
-        } else if ((c & 0xF8) == 0xF0) {
-            cp          = c & 0x07;
-            extra_bytes = 3;
-        } else {
-            ++i;
-            continue;
-        }  // Invalid UTF-8
-
-        if (i + extra_bytes >= str.size())
-            break;
-
-        for (size_t j = 1; j <= extra_bytes; ++j)
-            cp = (cp << 6) | (str[i + j] & 0x3F);
-
-        codepoints.push_back(cp);
-        i += 1 + extra_bytes;
-    }
-    return codepoints;
-}
-
-// Unicode code point -> UTF-8
-std::string codepoint_to_utf8(char32_t cp) {
-    std::string out;
-    if (cp <= 0x7F)
-        out.push_back(static_cast<char>(cp));
-    else if (cp <= 0x7FF) {
-        out.push_back(static_cast<char>(0xC0 | (cp >> 6)));
-        out.push_back(static_cast<char>(0x80 | (cp & 0x3F)));
-    } else if (cp <= 0xFFFF) {
-        out.push_back(static_cast<char>(0xE0 | (cp >> 12)));
-        out.push_back(static_cast<char>(0x80 | ((cp >> 6) & 0x3F)));
-        out.push_back(static_cast<char>(0x80 | (cp & 0x3F)));
-    } else {
-        out.push_back(static_cast<char>(0xF0 | (cp >> 18)));
-        out.push_back(static_cast<char>(0x80 | ((cp >> 12) & 0x3F)));
-        out.push_back(static_cast<char>(0x80 | ((cp >> 6) & 0x3F)));
-        out.push_back(static_cast<char>(0x80 | (cp & 0x3F)));
-    }
-    return out;
-}
-
-bool starts_with(const std::vector<char32_t>& text,
-                 const std::vector<char32_t>& prefix,
-                 std::size_t index) {
-    if (index > text.size()) {
-        return false;
-    }
-    if (prefix.size() > text.size() - index) {
-        return false;
-    }
-    return std::equal(prefix.begin(), prefix.end(), text.begin() + index);
-}
-
-// mistral: [^\r\n\p{L}\p{N}]?[\p{Lu}\p{Lt}\p{Lm}\p{Lo}\p{M}]*[\p{Ll}\p{Lm}\p{Lo}\p{M}]+|[^\r\n\p{L}\p{N}]?[\p{Lu}\p{Lt}\p{Lm}\p{Lo}\p{M}]+[\p{Ll}\p{Lm}\p{Lo}\p{M}]*|\p{N}| ?[^\s\p{L}\p{N}]+[\r\n/]*|\s*[\r\n]+|\s+(?!\S)|\s+
-// qwen2: (?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+
-std::vector<std::string> token_split(const std::string& text) {
-    std::vector<std::string> tokens;
-    auto cps = utf8_to_codepoints(text);
-    size_t i = 0;
-
-    while (i < cps.size()) {
-        char32_t cp = cps[i];
-
-        // `(?i:'s|'t|'re|'ve|'m|'ll|'d)`
-        if (cp == U'\'' && i + 1 < cps.size()) {
-            std::string next = str_to_lower(codepoint_to_utf8(cps[i + 1]));
-            if (next == "s" || next == "t" || next == "m") {
-                tokens.push_back("'" + next);
-                i += 2;
-                continue;
-            }
-            if (i + 2 < cps.size()) {
-                next += str_to_lower(codepoint_to_utf8(cps[i + 2]));
-                if (next == "re" || next == "ve" || next == "ll" || next == "d") {
-                    tokens.push_back("'" + next);
-                    i += 3;
-                    continue;
-                }
-            }
-        }
-
-        // `\p{N}`
-        if (is_number(cp)) {
-            tokens.push_back(codepoint_to_utf8(cp));
-            ++i;
-            continue;
-        }
-
-        // `[^\r\n\p{L}\p{N}]?\p{L}+`
-        {
-            // `[^\r\n\p{L}\p{N}]\p{L}+`
-            if (!is_letter(cp) && cp != U'\r' && cp != U'\n' && i + 1 < cps.size() && is_letter(cps[i + 1])) {
-                std::string token = codepoint_to_utf8(cp);
-                ++i;
-
-                while (i < cps.size() && is_letter(cps[i])) {
-                    token += codepoint_to_utf8(cps[i]);
-                    ++i;
-                }
-                tokens.push_back(token);
-                continue;
-            }
-
-            // `\p{L}+`
-            if (is_letter(cp)) {
-                std::string token = codepoint_to_utf8(cp);
-                ++i;
-                while (i < cps.size() && is_letter(cps[i])) {
-                    token += codepoint_to_utf8(cps[i]);
-                    ++i;
-                }
-                tokens.push_back(token);
-                continue;
-            }
-        }
-
-        // ` ?[^\s\p{L}\p{N}]+[\r\n]*`
-        {
-            // ` [^\s\p{L}\p{N}]+[\r\n]*`
-            if (cp == U' ' && i + 1 < cps.size() && !isspace(cps[i + 1]) && !is_letter(cps[i + 1]) && !is_number(cps[i + 1])) {
-                std::string token = codepoint_to_utf8(cp);
-                token += codepoint_to_utf8(cps[i + 1]);
-                i += 2;
-
-                while (i < cps.size() && !is_letter(cps[i]) && !is_number(cps[i]) && !isspace(cps[i])) {
-                    token += codepoint_to_utf8(cps[i]);
-                    ++i;
-                }
-
-                while (i < cps.size() && (cps[i] == U'\r' || cps[i] == U'\n')) {
-                    token += codepoint_to_utf8(cps[i]);
-                    ++i;
-                }
-
-                tokens.push_back(token);
-                continue;
-            }
-
-            // `[^\s\p{L}\p{N}]+[\r\n]*`
-            std::string token;
-            if (!is_letter(cps[i]) && !is_number(cps[i]) && !isspace(cps[i])) {
-                std::string token = codepoint_to_utf8(cp);
-                ++i;
-
-                while (i < cps.size() && !is_letter(cps[i]) && !is_number(cps[i]) && !isspace(cps[i])) {
-                    token += codepoint_to_utf8(cps[i]);
-                    ++i;
-                }
-
-                while (i < cps.size() && (cps[i] == U'\r' || cps[i] == U'\n')) {
-                    token += codepoint_to_utf8(cps[i]);
-                    ++i;
-                }
-
-                tokens.push_back(token);
-                continue;
-            }
-        }
-
-        // `\s*[\r\n]+|\s+(?!\S)|\s+`
-        if (is_space(cp)) {
-            std::string token;
-            bool saw_new_line = false;
-
-            while (i < cps.size() && is_space(cps[i])) {
-                token += codepoint_to_utf8(cps[i]);
-
-                if (cps[i] == U'\r' || cps[i] == U'\n') {
-                    saw_new_line = true;
-                } else {
-                    if (saw_new_line) {
-                        break;
-                    }
-                }
-
-                ++i;
-            }
-
-            tokens.push_back(token);
-            continue;
-        }
-
-        // skip
-        ++i;
-    }
-
-    return tokens;
-}
-
-std::vector<std::string> split_with_special_tokens(
-    const std::string& text,
-    const std::vector<std::string>& special_tokens) {
-    std::vector<std::string> result;
-    size_t pos      = 0;
-    size_t text_len = text.size();
-
-    while (pos < text_len) {
-        size_t next_pos = text_len;
-        std::string matched_token;
-
-        for (const auto& token : special_tokens) {
-            size_t token_pos = text.find(token, pos);
-            if (token_pos != std::string::npos && token_pos < next_pos) {
-                next_pos      = token_pos;
-                matched_token = token;
-            }
-        }
-
-        if (next_pos > pos) {
-            result.push_back(text.substr(pos, next_pos - pos));
-        }
-
-        if (!matched_token.empty()) {
-            result.push_back(matched_token);
-            pos = next_pos + matched_token.size();
-        } else {
-            break;
-        }
-    }
-
-    return result;
-}
-
-// int main() {
-//     std::string text = "I'm testing C++ token_split function. 你好，世界! 123";
-//     auto tokens = token_split(text);
-
-//     for (const auto& t : tokens) {
-//         std::cout << "[" << t << "] ";
-//     }
-//     std::cout << "\n";
-//     return 0;
-// }
+﻿#include <algorithm>
+#include <iostream>
+#include <string>
+#include <vector>
+
+#include "tokenize_util.h"
+
+bool is_number(char32_t ch) {  // True only for the ASCII digits U+0030..U+0039; every other code point yields false.
+    return (ch >= U'0' && ch <= U'9');
+}
+
+bool is_letter(char32_t ch) {  // True when ch falls inside any of the inclusive code point ranges tabulated below.
+    static const struct { char32_t start, end; } ranges[] = {  // Each entry is an inclusive [start, end] pair; presumably the Unicode letter ranges - confirm against the table's origin.
+        {0x41, 0x5A},
+        {0x61, 0x7A},
+        {0xAA, 0xAA},
+        {0xB5, 0xB5},
+        {0xBA, 0xBA},
+        {0xC0, 0xD6},
+        {0xD8, 0xF6},
+        {0xF8, 0x2C1},
+        {0x2C6, 0x2D1},
+        {0x2E0, 0x2E4},
+        {0x2EC, 0x2EC},
+        {0x2EE, 0x2EE},
+        {0x370, 0x374},
+        {0x376, 0x377},
+        {0x37A, 0x37D},
+        {0x37F, 0x37F},
+        {0x386, 0x386},
+        {0x388, 0x38A},
+        {0x38C, 0x38C},
+        {0x38E, 0x3A1},
+        {0x3A3, 0x3F5},
+        {0x3F7, 0x481},
+        {0x48A, 0x52F},
+        {0x531, 0x556},
+        {0x559, 0x559},
+        {0x560, 0x588},
+        {0x5D0, 0x5EA},
+        {0x5EF, 0x5F2},
+        {0x620, 0x64A},
+        {0x66E, 0x66F},
+        {0x671, 0x6D3},
+        {0x6D5, 0x6D5},
+        {0x6E5, 0x6E6},
+        {0x6EE, 0x6EF},
+        {0x6FA, 0x6FC},
+        {0x6FF, 0x6FF},
+        {0x710, 0x710},
+        {0x712, 0x72F},
+        {0x74D, 0x7A5},
+        {0x7B1, 0x7B1},
+        {0x7CA, 0x7EA},
+        {0x7F4, 0x7F5},
+        {0x7FA, 0x7FA},
+        {0x800, 0x815},
+        {0x81A, 0x81A},
+        {0x824, 0x824},
+        {0x828, 0x828},
+        {0x840, 0x858},
+        {0x860, 0x86A},
+        {0x870, 0x887},
+        {0x889, 0x88F},
+        {0x8A0, 0x8C9},
+        {0x904, 0x939},
+        {0x93D, 0x93D},
+        {0x950, 0x950},
+        {0x958, 0x961},
+        {0x971, 0x980},
+        {0x985, 0x98C},
+        {0x98F, 0x990},
+        {0x993, 0x9A8},
+        {0x9AA, 0x9B0},
+        {0x9B2, 0x9B2},
+        {0x9B6, 0x9B9},
+        {0x9BD, 0x9BD},
+        {0x9CE, 0x9CE},
+        {0x9DC, 0x9DD},
+        {0x9DF, 0x9E1},
+        {0x9F0, 0x9F1},
+        {0x9FC, 0x9FC},
+        {0xA05, 0xA0A},
+        {0xA0F, 0xA10},
+        {0xA13, 0xA28},
+        {0xA2A, 0xA30},
+        {0xA32, 0xA33},
+        {0xA35, 0xA36},
+        {0xA38, 0xA39},
+        {0xA59, 0xA5C},
+        {0xA5E, 0xA5E},
+        {0xA72, 0xA74},
+        {0xA85, 0xA8D},
+        {0xA8F, 0xA91},
+        {0xA93, 0xAA8},
+        {0xAAA, 0xAB0},
+        {0xAB2, 0xAB3},
+        {0xAB5, 0xAB9},
+        {0xABD, 0xABD},
+        {0xAD0, 0xAD0},
+        {0xAE0, 0xAE1},
+        {0xAF9, 0xAF9},
+        {0xB05, 0xB0C},
+        {0xB0F, 0xB10},
+        {0xB13, 0xB28},
+        {0xB2A, 0xB30},
+        {0xB32, 0xB33},
+        {0xB35, 0xB39},
+        {0xB3D, 0xB3D},
+        {0xB5C, 0xB5D},
+        {0xB5F, 0xB61},
+        {0xB71, 0xB71},
+        {0xB83, 0xB83},
+        {0xB85, 0xB8A},
+        {0xB8E, 0xB90},
+        {0xB92, 0xB95},
+        {0xB99, 0xB9A},
+        {0xB9C, 0xB9C},
+        {0xB9E, 0xB9F},
+        {0xBA3, 0xBA4},
+        {0xBA8, 0xBAA},
+        {0xBAE, 0xBB9},
+        {0xBD0, 0xBD0},
+        {0xC05, 0xC0C},
+        {0xC0E, 0xC10},
+        {0xC12, 0xC28},
+        {0xC2A, 0xC39},
+        {0xC3D, 0xC3D},
+        {0xC58, 0xC5A},
+        {0xC5C, 0xC5D},
+        {0xC60, 0xC61},
+        {0xC80, 0xC80},
+        {0xC85, 0xC8C},
+        {0xC8E, 0xC90},
+        {0xC92, 0xCA8},
+        {0xCAA, 0xCB3},
+        {0xCB5, 0xCB9},
+        {0xCBD, 0xCBD},
+        {0xCDC, 0xCDE},
+        {0xCE0, 0xCE1},
+        {0xCF1, 0xCF2},
+        {0xD04, 0xD0C},
+        {0xD0E, 0xD10},
+        {0xD12, 0xD3A},
+        {0xD3D, 0xD3D},
+        {0xD4E, 0xD4E},
+        {0xD54, 0xD56},
+        {0xD5F, 0xD61},
+        {0xD7A, 0xD7F},
+        {0xD85, 0xD96},
+        {0xD9A, 0xDB1},
+        {0xDB3, 0xDBB},
+        {0xDBD, 0xDBD},
+        {0xDC0, 0xDC6},
+        {0xE01, 0xE30},
+        {0xE32, 0xE33},
+        {0xE40, 0xE46},
+        {0xE81, 0xE82},
+        {0xE84, 0xE84},
+        {0xE86, 0xE8A},
+        {0xE8C, 0xEA3},
+        {0xEA5, 0xEA5},
+        {0xEA7, 0xEB0},
+        {0xEB2, 0xEB3},
+        {0xEBD, 0xEBD},
+        {0xEC0, 0xEC4},
+        {0xEC6, 0xEC6},
+        {0xEDC, 0xEDF},
+        {0xF00, 0xF00},
+        {0xF40, 0xF47},
+        {0xF49, 0xF6C},
+        {0xF88, 0xF8C},
+        {0x1000, 0x102A},
+        {0x103F, 0x103F},
+        {0x1050, 0x1055},
+        {0x105A, 0x105D},
+        {0x1061, 0x1061},
+        {0x1065, 0x1066},
+        {0x106E, 0x1070},
+        {0x1075, 0x1081},
+        {0x108E, 0x108E},
+        {0x10A0, 0x10C5},
+        {0x10C7, 0x10C7},
+        {0x10CD, 0x10CD},
+        {0x10D0, 0x10FA},
+        {0x10FC, 0x1248},
+        {0x124A, 0x124D},
+        {0x1250, 0x1256},
+        {0x1258, 0x1258},
+        {0x125A, 0x125D},
+        {0x1260, 0x1288},
+        {0x128A, 0x128D},
+        {0x1290, 0x12B0},
+        {0x12B2, 0x12B5},
+        {0x12B8, 0x12BE},
+        {0x12C0, 0x12C0},
+        {0x12C2, 0x12C5},
+        {0x12C8, 0x12D6},
+        {0x12D8, 0x1310},
+        {0x1312, 0x1315},
+        {0x1318, 0x135A},
+        {0x1380, 0x138F},
+        {0x13A0, 0x13F5},
+        {0x13F8, 0x13FD},
+        {0x1401, 0x166C},
+        {0x166F, 0x167F},
+        {0x1681, 0x169A},
+        {0x16A0, 0x16EA},
+        {0x16F1, 0x16F8},
+        {0x1700, 0x1711},
+        {0x171F, 0x1731},
+        {0x1740, 0x1751},
+        {0x1760, 0x176C},
+        {0x176E, 0x1770},
+        {0x1780, 0x17B3},
+        {0x17D7, 0x17D7},
+        {0x17DC, 0x17DC},
+        {0x1820, 0x1878},
+        {0x1880, 0x1884},
+        {0x1887, 0x18A8},
+        {0x18AA, 0x18AA},
+        {0x18B0, 0x18F5},
+        {0x1900, 0x191E},
+        {0x1950, 0x196D},
+        {0x1970, 0x1974},
+        {0x1980, 0x19AB},
+        {0x19B0, 0x19C9},
+        {0x1A00, 0x1A16},
+        {0x1A20, 0x1A54},
+        {0x1AA7, 0x1AA7},
+        {0x1B05, 0x1B33},
+        {0x1B45, 0x1B4C},
+        {0x1B83, 0x1BA0},
+        {0x1BAE, 0x1BAF},
+        {0x1BBA, 0x1BE5},
+        {0x1C00, 0x1C23},
+        {0x1C4D, 0x1C4F},
+        {0x1C5A, 0x1C7D},
+        {0x1C80, 0x1C8A},
+        {0x1C90, 0x1CBA},
+        {0x1CBD, 0x1CBF},
+        {0x1CE9, 0x1CEC},
+        {0x1CEE, 0x1CF3},
+        {0x1CF5, 0x1CF6},
+        {0x1CFA, 0x1CFA},
+        {0x1D00, 0x1DBF},
+        {0x1E00, 0x1F15},
+        {0x1F18, 0x1F1D},
+        {0x1F20, 0x1F45},
+        {0x1F48, 0x1F4D},
+        {0x1F50, 0x1F57},
+        {0x1F59, 0x1F59},
+        {0x1F5B, 0x1F5B},
+        {0x1F5D, 0x1F5D},
+        {0x1F5F, 0x1F7D},
+        {0x1F80, 0x1FB4},
+        {0x1FB6, 0x1FBC},
+        {0x1FBE, 0x1FBE},
+        {0x1FC2, 0x1FC4},
+        {0x1FC6, 0x1FCC},
+        {0x1FD0, 0x1FD3},
+        {0x1FD6, 0x1FDB},
+        {0x1FE0, 0x1FEC},
+        {0x1FF2, 0x1FF4},
+        {0x1FF6, 0x1FFC},
+        {0x2071, 0x2071},
+        {0x207F, 0x207F},
+        {0x2090, 0x209C},
+        {0x2102, 0x2102},
+        {0x2107, 0x2107},
+        {0x210A, 0x2113},
+        {0x2115, 0x2115},
+        {0x2119, 0x211D},
+        {0x2124, 0x2124},
+        {0x2126, 0x2126},
+        {0x2128, 0x2128},
+        {0x212A, 0x212D},
+        {0x212F, 0x2139},
+        {0x213C, 0x213F},
+        {0x2145, 0x2149},
+        {0x214E, 0x214E},
+        {0x2183, 0x2184},
+        {0x2C00, 0x2CE4},
+        {0x2CEB, 0x2CEE},
+        {0x2CF2, 0x2CF3},
+        {0x2D00, 0x2D25},
+        {0x2D27, 0x2D27},
+        {0x2D2D, 0x2D2D},
+        {0x2D30, 0x2D67},
+        {0x2D6F, 0x2D6F},
+        {0x2D80, 0x2D96},
+        {0x2DA0, 0x2DA6},
+        {0x2DA8, 0x2DAE},
+        {0x2DB0, 0x2DB6},
+        {0x2DB8, 0x2DBE},
+        {0x2DC0, 0x2DC6},
+        {0x2DC8, 0x2DCE},
+        {0x2DD0, 0x2DD6},
+        {0x2DD8, 0x2DDE},
+        {0x2E2F, 0x2E2F},
+        {0x3005, 0x3006},
+        {0x3031, 0x3035},
+        {0x303B, 0x303C},
+        {0x3041, 0x3096},
+        {0x309D, 0x309F},
+        {0x30A1, 0x30FA},
+        {0x30FC, 0x30FF},
+        {0x3105, 0x312F},
+        {0x3131, 0x318E},
+        {0x31A0, 0x31BF},
+        {0x31F0, 0x31FF},
+        {0x3400, 0x4DBF},
+        {0x4E00, 0xA48C},
+        {0xA4D0, 0xA4FD},
+        {0xA500, 0xA60C},
+        {0xA610, 0xA61F},
+        {0xA62A, 0xA62B},
+        {0xA640, 0xA66E},
+        {0xA67F, 0xA69D},
+        {0xA6A0, 0xA6E5},
+        {0xA717, 0xA71F},
+        {0xA722, 0xA788},
+        {0xA78B, 0xA7DC},
+        {0xA7F1, 0xA801},
+        {0xA803, 0xA805},
+        {0xA807, 0xA80A},
+        {0xA80C, 0xA822},
+        {0xA840, 0xA873},
+        {0xA882, 0xA8B3},
+        {0xA8F2, 0xA8F7},
+        {0xA8FB, 0xA8FB},
+        {0xA8FD, 0xA8FE},
+        {0xA90A, 0xA925},
+        {0xA930, 0xA946},
+        {0xA960, 0xA97C},
+        {0xA984, 0xA9B2},
+        {0xA9CF, 0xA9CF},
+        {0xA9E0, 0xA9E4},
+        {0xA9E6, 0xA9EF},
+        {0xA9FA, 0xA9FE},
+        {0xAA00, 0xAA28},
+        {0xAA40, 0xAA42},
+        {0xAA44, 0xAA4B},
+        {0xAA60, 0xAA76},
+        {0xAA7A, 0xAA7A},
+        {0xAA7E, 0xAAAF},
+        {0xAAB1, 0xAAB1},
+        {0xAAB5, 0xAAB6},
+        {0xAAB9, 0xAABD},
+        {0xAAC0, 0xAAC0},
+        {0xAAC2, 0xAAC2},
+        {0xAADB, 0xAADD},
+        {0xAAE0, 0xAAEA},
+        {0xAAF2, 0xAAF4},
+        {0xAB01, 0xAB06},
+        {0xAB09, 0xAB0E},
+        {0xAB11, 0xAB16},
+        {0xAB20, 0xAB26},
+        {0xAB28, 0xAB2E},
+        {0xAB30, 0xAB5A},
+        {0xAB5C, 0xAB69},
+        {0xAB70, 0xABE2},
+        {0xAC00, 0xD7A3},
+        {0xD7B0, 0xD7C6},
+        {0xD7CB, 0xD7FB},
+        {0xF900, 0xFA6D},
+        {0xFA70, 0xFAD9},
+        {0xFB00, 0xFB06},
+        {0xFB13, 0xFB17},
+        {0xFB1D, 0xFB1D},
+        {0xFB1F, 0xFB28},
+        {0xFB2A, 0xFB36},
+        {0xFB38, 0xFB3C},
+        {0xFB3E, 0xFB3E},
+        {0xFB40, 0xFB41},
+        {0xFB43, 0xFB44},
+        {0xFB46, 0xFBB1},
+        {0xFBD3, 0xFD3D},
+        {0xFD50, 0xFD8F},
+        {0xFD92, 0xFDC7},
+        {0xFDF0, 0xFDFB},
+        {0xFE70, 0xFE74},
+        {0xFE76, 0xFEFC},
+        {0xFF21, 0xFF3A},
+        {0xFF41, 0xFF5A},
+        {0xFF66, 0xFFBE},
+        {0xFFC2, 0xFFC7},
+        {0xFFCA, 0xFFCF},
+        {0xFFD2, 0xFFD7},
+        {0xFFDA, 0xFFDC},
+        {0x10000, 0x1000B},
+        {0x1000D, 0x10026},
+        {0x10028, 0x1003A},
+        {0x1003C, 0x1003D},
+        {0x1003F, 0x1004D},
+        {0x10050, 0x1005D},
+        {0x10080, 0x100FA},
+        {0x10280, 0x1029C},
+        {0x102A0, 0x102D0},
+        {0x10300, 0x1031F},
+        {0x1032D, 0x10340},
+        {0x10342, 0x10349},
+        {0x10350, 0x10375},
+        {0x10380, 0x1039D},
+        {0x103A0, 0x103C3},
+        {0x103C8, 0x103CF},
+        {0x10400, 0x1049D},
+        {0x104B0, 0x104D3},
+        {0x104D8, 0x104FB},
+        {0x10500, 0x10527},
+        {0x10530, 0x10563},
+        {0x10570, 0x1057A},
+        {0x1057C, 0x1058A},
+        {0x1058C, 0x10592},
+        {0x10594, 0x10595},
+        {0x10597, 0x105A1},
+        {0x105A3, 0x105B1},
+        {0x105B3, 0x105B9},
+        {0x105BB, 0x105BC},
+        {0x105C0, 0x105F3},
+        {0x10600, 0x10736},
+        {0x10740, 0x10755},
+        {0x10760, 0x10767},
+        {0x10780, 0x10785},
+        {0x10787, 0x107B0},
+        {0x107B2, 0x107BA},
+        {0x10800, 0x10805},
+        {0x10808, 0x10808},
+        {0x1080A, 0x10835},
+        {0x10837, 0x10838},
+        {0x1083C, 0x1083C},
+        {0x1083F, 0x10855},
+        {0x10860, 0x10876},
+        {0x10880, 0x1089E},
+        {0x108E0, 0x108F2},
+        {0x108F4, 0x108F5},
+        {0x10900, 0x10915},
+        {0x10920, 0x10939},
+        {0x10940, 0x10959},
+        {0x10980, 0x109B7},
+        {0x109BE, 0x109BF},
+        {0x10A00, 0x10A00},
+        {0x10A10, 0x10A13},
+        {0x10A15, 0x10A17},
+        {0x10A19, 0x10A35},
+        {0x10A60, 0x10A7C},
+        {0x10A80, 0x10A9C},
+        {0x10AC0, 0x10AC7},
+        {0x10AC9, 0x10AE4},
+        {0x10B00, 0x10B35},
+        {0x10B40, 0x10B55},
+        {0x10B60, 0x10B72},
+        {0x10B80, 0x10B91},
+        {0x10C00, 0x10C48},
+        {0x10C80, 0x10CB2},
+        {0x10CC0, 0x10CF2},
+        {0x10D00, 0x10D23},
+        {0x10D4A, 0x10D65},
+        {0x10D6F, 0x10D85},
+        {0x10E80, 0x10EA9},
+        {0x10EB0, 0x10EB1},
+        {0x10EC2, 0x10EC7},
+        {0x10F00, 0x10F1C},
+        {0x10F27, 0x10F27},
+        {0x10F30, 0x10F45},
+        {0x10F70, 0x10F81},
+        {0x10FB0, 0x10FC4},
+        {0x10FE0, 0x10FF6},
+        {0x11003, 0x11037},
+        {0x11071, 0x11072},
+        {0x11075, 0x11075},
+        {0x11083, 0x110AF},
+        {0x110D0, 0x110E8},
+        {0x11103, 0x11126},
+        {0x11144, 0x11144},
+        {0x11147, 0x11147},
+        {0x11150, 0x11172},
+        {0x11176, 0x11176},
+        {0x11183, 0x111B2},
+        {0x111C1, 0x111C4},
+        {0x111DA, 0x111DA},
+        {0x111DC, 0x111DC},
+        {0x11200, 0x11211},
+        {0x11213, 0x1122B},
+        {0x1123F, 0x11240},
+        {0x11280, 0x11286},
+        {0x11288, 0x11288},
+        {0x1128A, 0x1128D},
+        {0x1128F, 0x1129D},
+        {0x1129F, 0x112A8},
+        {0x112B0, 0x112DE},
+        {0x11305, 0x1130C},
+        {0x1130F, 0x11310},
+        {0x11313, 0x11328},
+        {0x1132A, 0x11330},
+        {0x11332, 0x11333},
+        {0x11335, 0x11339},
+        {0x1133D, 0x1133D},
+        {0x11350, 0x11350},
+        {0x1135D, 0x11361},
+        {0x11380, 0x11389},
+        {0x1138B, 0x1138B},
+        {0x1138E, 0x1138E},
+        {0x11390, 0x113B5},
+        {0x113B7, 0x113B7},
+        {0x113D1, 0x113D1},
+        {0x113D3, 0x113D3},
+        {0x11400, 0x11434},
+        {0x11447, 0x1144A},
+        {0x1145F, 0x11461},
+        {0x11480, 0x114AF},
+        {0x114C4, 0x114C5},
+        {0x114C7, 0x114C7},
+        {0x11580, 0x115AE},
+        {0x115D8, 0x115DB},
+        {0x11600, 0x1162F},
+        {0x11644, 0x11644},
+        {0x11680, 0x116AA},
+        {0x116B8, 0x116B8},
+        {0x11700, 0x1171A},
+        {0x11740, 0x11746},
+        {0x11800, 0x1182B},
+        {0x118A0, 0x118DF},
+        {0x118FF, 0x11906},
+        {0x11909, 0x11909},
+        {0x1190C, 0x11913},
+        {0x11915, 0x11916},
+        {0x11918, 0x1192F},
+        {0x1193F, 0x1193F},
+        {0x11941, 0x11941},
+        {0x119A0, 0x119A7},
+        {0x119AA, 0x119D0},
+        {0x119E1, 0x119E1},
+        {0x119E3, 0x119E3},
+        {0x11A00, 0x11A00},
+        {0x11A0B, 0x11A32},
+        {0x11A3A, 0x11A3A},
+        {0x11A50, 0x11A50},
+        {0x11A5C, 0x11A89},
+        {0x11A9D, 0x11A9D},
+        {0x11AB0, 0x11AF8},
+        {0x11BC0, 0x11BE0},
+        {0x11C00, 0x11C08},
+        {0x11C0A, 0x11C2E},
+        {0x11C40, 0x11C40},
+        {0x11C72, 0x11C8F},
+        {0x11D00, 0x11D06},
+        {0x11D08, 0x11D09},
+        {0x11D0B, 0x11D30},
+        {0x11D46, 0x11D46},
+        {0x11D60, 0x11D65},
+        {0x11D67, 0x11D68},
+        {0x11D6A, 0x11D89},
+        {0x11D98, 0x11D98},
+        {0x11DB0, 0x11DDB},
+        {0x11EE0, 0x11EF2},
+        {0x11F02, 0x11F02},
+        {0x11F04, 0x11F10},
+        {0x11F12, 0x11F33},
+        {0x11FB0, 0x11FB0},
+        {0x12000, 0x12399},
+        {0x12480, 0x12543},
+        {0x12F90, 0x12FF0},
+        {0x13000, 0x1342F},
+        {0x13441, 0x13446},
+        {0x13460, 0x143FA},
+        {0x14400, 0x14646},
+        {0x16100, 0x1611D},
+        {0x16800, 0x16A38},
+        {0x16A40, 0x16A5E},
+        {0x16A70, 0x16ABE},
+        {0x16AD0, 0x16AED},
+        {0x16B00, 0x16B2F},
+        {0x16B40, 0x16B43},
+        {0x16B63, 0x16B77},
+        {0x16B7D, 0x16B8F},
+        {0x16D40, 0x16D6C},
+        {0x16E40, 0x16E7F},
+        {0x16EA0, 0x16EB8},
+        {0x16EBB, 0x16ED3},
+        {0x16F00, 0x16F4A},
+        {0x16F50, 0x16F50},
+        {0x16F93, 0x16F9F},
+        {0x16FE0, 0x16FE1},
+        {0x16FE3, 0x16FE3},
+        {0x16FF2, 0x16FF3},
+        {0x17000, 0x18CD5},
+        {0x18CFF, 0x18D1E},
+        {0x18D80, 0x18DF2},
+        {0x1AFF0, 0x1AFF3},
+        {0x1AFF5, 0x1AFFB},
+        {0x1AFFD, 0x1AFFE},
+        {0x1B000, 0x1B122},
+        {0x1B132, 0x1B132},
+        {0x1B150, 0x1B152},
+        {0x1B155, 0x1B155},
+        {0x1B164, 0x1B167},
+        {0x1B170, 0x1B2FB},
+        {0x1BC00, 0x1BC6A},
+        {0x1BC70, 0x1BC7C},
+        {0x1BC80, 0x1BC88},
+        {0x1BC90, 0x1BC99},
+        {0x1D400, 0x1D454},
+        {0x1D456, 0x1D49C},
+        {0x1D49E, 0x1D49F},
+        {0x1D4A2, 0x1D4A2},
+        {0x1D4A5, 0x1D4A6},
+        {0x1D4A9, 0x1D4AC},
+        {0x1D4AE, 0x1D4B9},
+        {0x1D4BB, 0x1D4BB},
+        {0x1D4BD, 0x1D4C3},
+        {0x1D4C5, 0x1D505},
+        {0x1D507, 0x1D50A},
+        {0x1D50D, 0x1D514},
+        {0x1D516, 0x1D51C},
+        {0x1D51E, 0x1D539},
+        {0x1D53B, 0x1D53E},
+        {0x1D540, 0x1D544},
+        {0x1D546, 0x1D546},
+        {0x1D54A, 0x1D550},
+        {0x1D552, 0x1D6A5},
+        {0x1D6A8, 0x1D6C0},
+        {0x1D6C2, 0x1D6DA},
+        {0x1D6DC, 0x1D6FA},
+        {0x1D6FC, 0x1D714},
+        {0x1D716, 0x1D734},
+        {0x1D736, 0x1D74E},
+        {0x1D750, 0x1D76E},
+        {0x1D770, 0x1D788},
+        {0x1D78A, 0x1D7A8},
+        {0x1D7AA, 0x1D7C2},
+        {0x1D7C4, 0x1D7CB},
+        {0x1DF00, 0x1DF1E},
+        {0x1DF25, 0x1DF2A},
+        {0x1E030, 0x1E06D},
+        {0x1E100, 0x1E12C},
+        {0x1E137, 0x1E13D},
+        {0x1E14E, 0x1E14E},
+        {0x1E290, 0x1E2AD},
+        {0x1E2C0, 0x1E2EB},
+        {0x1E4D0, 0x1E4EB},
+        {0x1E5D0, 0x1E5ED},
+        {0x1E5F0, 0x1E5F0},
+        {0x1E6C0, 0x1E6DE},
+        {0x1E6E0, 0x1E6E2},
+        {0x1E6E4, 0x1E6E5},
+        {0x1E6E7, 0x1E6ED},
+        {0x1E6F0, 0x1E6F4},
+        {0x1E6FE, 0x1E6FF},
+        {0x1E7E0, 0x1E7E6},
+        {0x1E7E8, 0x1E7EB},
+        {0x1E7ED, 0x1E7EE},
+        {0x1E7F0, 0x1E7FE},
+        {0x1E800, 0x1E8C4},
+        {0x1E900, 0x1E943},
+        {0x1E94B, 0x1E94B},
+        {0x1EE00, 0x1EE03},
+        {0x1EE05, 0x1EE1F},
+        {0x1EE21, 0x1EE22},
+        {0x1EE24, 0x1EE24},
+        {0x1EE27, 0x1EE27},
+        {0x1EE29, 0x1EE32},
+        {0x1EE34, 0x1EE37},
+        {0x1EE39, 0x1EE39},
+        {0x1EE3B, 0x1EE3B},
+        {0x1EE42, 0x1EE42},
+        {0x1EE47, 0x1EE47},
+        {0x1EE49, 0x1EE49},
+        {0x1EE4B, 0x1EE4B},
+        {0x1EE4D, 0x1EE4F},
+        {0x1EE51, 0x1EE52},
+        {0x1EE54, 0x1EE54},
+        {0x1EE57, 0x1EE57},
+        {0x1EE59, 0x1EE59},
+        {0x1EE5B, 0x1EE5B},
+        {0x1EE5D, 0x1EE5D},
+        {0x1EE5F, 0x1EE5F},
+        {0x1EE61, 0x1EE62},
+        {0x1EE64, 0x1EE64},
+        {0x1EE67, 0x1EE6A},
+        {0x1EE6C, 0x1EE72},
+        {0x1EE74, 0x1EE77},
+        {0x1EE79, 0x1EE7C},
+        {0x1EE7E, 0x1EE7E},
+        {0x1EE80, 0x1EE89},
+        {0x1EE8B, 0x1EE9B},
+        {0x1EEA1, 0x1EEA3},
+        {0x1EEA5, 0x1EEA9},
+        {0x1EEAB, 0x1EEBB},
+        {0x20000, 0x2A6DF},
+        {0x2A700, 0x2B81D},
+        {0x2B820, 0x2CEAD},
+        {0x2CEB0, 0x2EBE0},
+        {0x2EBF0, 0x2EE5D},
+        {0x2F800, 0x2FA1D},
+        {0x30000, 0x3134A},
+        {0x31350, 0x33479},
+    };
+
+    for (const auto& r : ranges) {  // Linear scan over the whole table; returns on the first range that contains ch.
+        if (ch >= r.start && ch <= r.end)
+            return true;
+    }
+    return false;
+}
+
+bool is_space(char32_t cp) {  // True only for the whitespace code points enumerated in the switch below.
+    switch (cp) {  // NOTE(review): U+0085 (NEL), U+2028 and U+2029 are not listed here - confirm that omission is intended.
+        case 0x0009:  // TAB \t
+        case 0x000A:  // LF \n
+        case 0x000B:  // VT
+        case 0x000C:  // FF
+        case 0x000D:  // CR \r
+        case 0x0020:  // Space
+        case 0x00A0:  // No-Break Space
+        case 0x1680:  // Ogham Space Mark
+        case 0x2000:  // En Quad
+        case 0x2001:  // Em Quad
+        case 0x2002:  // En Space
+        case 0x2003:  // Em Space
+        case 0x2004:  // Three-Per-Em Space
+        case 0x2005:  // Four-Per-Em Space
+        case 0x2006:  // Six-Per-Em Space
+        case 0x2007:  // Figure Space
+        case 0x2008:  // Punctuation Space
+        case 0x2009:  // Thin Space
+        case 0x200A:  // Hair Space
+        case 0x202F:  // Narrow No-Break Space
+        case 0x205F:  // Medium Mathematical Space
+        case 0x3000:  // Ideographic Space
+            return true;
+        default:  // Every code point not listed above is treated as non-space.
+            return false;
+    }
+}
+
+// Returns a copy of `input` with every byte passed through std::tolower (current C locale).
+std::string str_to_lower(const std::string& input) {
+    std::string lowered(input.size(), '\0');
+    std::transform(input.begin(), input.end(), lowered.begin(), [](unsigned char c) { return static_cast<char>(std::tolower(c)); });
+    return lowered;
+}
+
+// UTF-8 -> Unicode code points. Invalid lead bytes and sequences with malformed continuation
+// bytes are skipped one byte at a time; a sequence truncated by the end of `str` stops decoding.
+std::vector<char32_t> utf8_to_codepoints(const std::string& str) {
+    std::vector<char32_t> codepoints;
+    for (size_t i = 0; i < str.size();) {
+        const unsigned char lead = static_cast<unsigned char>(str[i]);
+        char32_t cp              = 0;
+        size_t extra_bytes       = 0;
+        if ((lead & 0x80) == 0)
+            cp = lead;
+        else if ((lead & 0xE0) == 0xC0) {
+            cp          = lead & 0x1F;
+            extra_bytes = 1;
+        } else if ((lead & 0xF0) == 0xE0) {
+            cp          = lead & 0x0F;
+            extra_bytes = 2;
+        } else if ((lead & 0xF8) == 0xF0) {
+            cp          = lead & 0x07;
+            extra_bytes = 3;
+        } else {
+            ++i;  // Invalid UTF-8: stray continuation byte or impossible lead byte
+            continue;
+        }
+        if (i + extra_bytes >= str.size())
+            break;  // the final sequence is cut off by the end of the string
+        bool well_formed = true;
+        for (size_t j = 1; j <= extra_bytes; ++j) {
+            well_formed = well_formed && (str[i + j] & 0xC0) == 0x80;  // must be 10xxxxxx
+            cp          = (cp << 6) | (str[i + j] & 0x3F);
+        }
+        if (well_formed)
+            codepoints.push_back(cp);
+        i += well_formed ? 1 + extra_bytes : 1;  // on a bad tail, resume right after the lead byte
+    }
+    return codepoints;
+}
+
+// Unicode code point -> UTF-8
+// The encoded length (1-4 bytes) is chosen from the magnitude of `cp` alone; surrogates and
+// values above U+10FFFF are not rejected.
+std::string codepoint_to_utf8(char32_t cp) {
+    // Lead byte: length marker OR-ed with the top payload bits.
+    const auto lead = [cp](unsigned marker, unsigned shift) { return static_cast<char>(marker | (cp >> shift)); };
+    // Continuation byte: 10xxxxxx carrying the six payload bits that start at `shift`.
+    const auto tail = [cp](unsigned shift) { return static_cast<char>(0x80 | ((cp >> shift) & 0x3F)); };
+
+    if (cp <= 0x7F) {
+        return std::string(1, static_cast<char>(cp));
+    }
+    if (cp <= 0x7FF) {
+        return std::string{lead(0xC0, 6), tail(0)};
+    }
+    if (cp <= 0xFFFF) {
+        return std::string{lead(0xE0, 12), tail(6), tail(0)};
+    }
+    return std::string{lead(0xF0, 18), tail(12), tail(6), tail(0)};
+}
+
+// True when `prefix` occurs in `text` beginning at offset `index`; an offset past the end, or a prefix longer than what remains, yields false.
+bool starts_with(const std::vector<char32_t>& text,
+                 const std::vector<char32_t>& prefix,
+                 std::size_t index) {
+    const bool fits = index <= text.size() && prefix.size() <= text.size() - index;
+    if (!fits) {
+        return false;
+    }
+    // The check above guarantees that the compared window lies entirely inside `text`.
+    return std::equal(prefix.begin(), prefix.end(), text.begin() + index);
+}
+
+// mistral: [^\r\n\p{L}\p{N}]?[\p{Lu}\p{Lt}\p{Lm}\p{Lo}\p{M}]*[\p{Ll}\p{Lm}\p{Lo}\p{M}]+|[^\r\n\p{L}\p{N}]?[\p{Lu}\p{Lt}\p{Lm}\p{Lo}\p{M}]+[\p{Ll}\p{Lm}\p{Lo}\p{M}]*|\p{N}| ?[^\s\p{L}\p{N}]+[\r\n/]*|\s*[\r\n]+|\s+(?!\S)|\s+
+// qwen2: (?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+
+//
+// Splits UTF-8 `text` into pre-tokens following the qwen2 pattern above. Each alternative of
+// the pattern is one branch of the loop below, tried in an order that yields the same result
+// as the regex alternation. Tokens are returned in input order, encoded as UTF-8, and keep
+// the input's spelling: contraction suffixes are matched case-insensitively, not lowered.
+// The input is decoded with utf8_to_codepoints() before it is split.
+//
+// Examples (one token per bracket pair):
+//   "I'm here"   -> [I] ['m] [ here]
+//   "a  b"       -> [a] [ ] [ b]
+//   "x = 1\n\ny" -> [x] [ =] [ ] [1] [\n\n] [y]
+std::vector<std::string> token_split(const std::string& text) {
+    std::vector<std::string> tokens;
+    const auto cps = utf8_to_codepoints(text);
+    const size_t n = cps.size();
+    size_t i       = 0;
+
+    const auto is_newline = [](char32_t c) { return c == U'\r' || c == U'\n'; };
+    const auto is_alpha   = [](char32_t c) { return is_letter(c); };
+    // `[^\s\p{L}\p{N}]`. Whitespace is tested with is_space(): ::isspace() has undefined
+    // behaviour for values that do not fit in unsigned char, which most code points do not.
+    const auto is_symbol = [](char32_t c) {
+        return !is_letter(c) && !is_number(c) && !is_space(c);
+    };
+    // Appends code points to `token`, advancing `i`, for as long as `accept` holds.
+    const auto take_while = [&](std::string& token, const auto& accept) {
+        while (i < n && accept(cps[i])) {
+            token += codepoint_to_utf8(cps[i]);
+            ++i;
+        }
+    };
+
+    while (i < n) {
+        const char32_t cp = cps[i];
+
+        // `(?i:'s|'t|'re|'ve|'m|'ll|'d)`
+        // Single-letter suffixes (including 'd) are tried first, then the two-letter ones.
+        if (cp == U'\'' && i + 1 < n) {
+            const std::string first = codepoint_to_utf8(cps[i + 1]);
+            std::string folded      = str_to_lower(first);
+            if (folded == "s" || folded == "t" || folded == "m" || folded == "d") {
+                tokens.push_back("'" + first);
+                i += 2;
+                continue;
+            }
+            if (i + 2 < n) {
+                const std::string second = codepoint_to_utf8(cps[i + 2]);
+                folded += str_to_lower(second);
+                if (folded == "re" || folded == "ve" || folded == "ll") {
+                    tokens.push_back("'" + first + second);
+                    i += 3;
+                    continue;
+                }
+            }
+        }
+
+        // `\p{N}`
+        if (is_number(cp)) {
+            tokens.push_back(codepoint_to_utf8(cp));
+            ++i;
+            continue;
+        }
+
+        // `[^\r\n\p{L}\p{N}]?\p{L}+`
+        {
+            // Numbers were consumed above, so the optional one-character prefix only has to
+            // rule out letters and line breaks.
+            const bool has_prefix = !is_letter(cp) && !is_newline(cp) && i + 1 < n && is_letter(cps[i + 1]);
+            if (has_prefix || is_letter(cp)) {
+                std::string token = codepoint_to_utf8(cp);
+                ++i;
+                take_while(token, is_alpha);
+                tokens.push_back(token);
+                continue;
+            }
+        }
+
+        // ` ?[^\s\p{L}\p{N}]+[\r\n]*`
+        {
+            const bool has_space = cp == U' ' && i + 1 < n && is_symbol(cps[i + 1]);
+            if (has_space || is_symbol(cp)) {
+                std::string token;
+                if (has_space) {
+                    token += codepoint_to_utf8(cp);  // the optional leading space
+                    ++i;
+                }
+                take_while(token, is_symbol);
+                take_while(token, is_newline);
+                tokens.push_back(token);
+                continue;
+            }
+        }
+
+        // `\s*[\r\n]+|\s+(?!\S)|\s+`
+        if (is_space(cp)) {
+            // [i, run_end) is the maximal whitespace run starting here; line_end is one past
+            // the last line break inside it, or i when the run contains none.
+            size_t run_end  = i;
+            size_t line_end = i;
+            while (run_end < n && is_space(cps[run_end])) {
+                if (is_newline(cps[run_end])) {
+                    line_end = run_end + 1;
+                }
+                ++run_end;
+            }
+
+            // Default is `\s+`: the whole run.
+            size_t stop = run_end;
+            if (line_end > i) {
+                // `\s*[\r\n]+` is greedy: it ends right after the last line break of the run.
+                stop = line_end;
+            } else if (run_end < n && run_end - i > 1) {
+                // `\s+(?!\S)`: text follows, so the last blank is left to prefix the next
+                // token, which is what keeps " word" tokens intact after repeated spaces.
+                stop = run_end - 1;
+            }
+
+            std::string token;
+            for (; i < stop; ++i) {
+                token += codepoint_to_utf8(cps[i]);
+            }
+            tokens.push_back(token);
+            continue;
+        }
+
+        // skip
+        // Not expected to be reached: every code point is a number, a letter, a symbol or
+        // whitespace. Kept so that the loop always makes progress.
+        ++i;
+    }
+
+    return tokens;
+}
+
+// Splits `text` at every occurrence of a special token, returning the special tokens and the
+// plain-text stretches between them in input order. Among tokens found at the same offset the
+// one listed first wins. Empty entries in `special_tokens` are ignored: an empty needle is
+// found at every offset with zero length and could never advance the scan.
+std::vector<std::string> split_with_special_tokens(
+    const std::string& text,
+    const std::vector<std::string>& special_tokens) {
+    std::vector<std::string> result;
+    size_t pos = 0;
+    while (pos < text.size()) {
+        size_t next_pos                  = text.size();
+        const std::string* matched_token = nullptr;
+
+        for (const auto& token : special_tokens) {
+            const size_t token_pos = token.empty() ? std::string::npos : text.find(token, pos);
+            if (token_pos < next_pos) {  // npos compares greater than any valid offset
+                next_pos      = token_pos;
+                matched_token = &token;
+            }
+        }
+
+        if (next_pos > pos) {
+            result.push_back(text.substr(pos, next_pos - pos));
+        }
+        if (matched_token == nullptr) {
+            break;  // no special token ahead: the remaining text was pushed just above
+        }
+        result.push_back(*matched_token);
+        pos = next_pos + matched_token->size();
+    }
+
+    return result;
+}
+
+// int main() {
+//     std::string text = "I'm testing C++ token_split function. Hello world 123";
+//     auto tokens = token_split(text);
+
+//     for (const auto& t : tokens) {
+//         std::cout << "[" << t << "] ";
+//     }
+//     std::cout << "\n";
+//     return 0;
+// }
diff --git a/src/ucache.hpp b/src/ucache.hpp
index d324761..3d785c5 100644
--- a/src/ucache.hpp
+++ b/src/ucache.hpp
@@ -6,8 +6,10 @@
 #include <unordered_map>
 #include <vector>
 
+#include "condition_cache_utils.hpp"
 #include "denoiser.hpp"
 #include "ggml_extend.hpp"
+#include "tensor.hpp"
 
 struct UCacheConfig {
     bool enabled                = false;
@@ -29,15 +31,15 @@ struct UCacheCacheEntry {
 
 struct UCacheState {
     UCacheConfig config;
-    Denoiser* denoiser                  = nullptr;
-    float start_sigma                   = std::numeric_limits<float>::max();
-    float end_sigma                     = 0.0f;
-    bool initialized                    = false;
-    bool initial_step                   = true;
-    bool skip_current_step              = false;
-    bool step_active                    = false;
-    const SDCondition* anchor_condition = nullptr;
-    std::unordered_map<const SDCondition*, UCacheCacheEntry> cache_diffs;
+    Denoiser* denoiser           = nullptr;
+    float start_sigma            = std::numeric_limits<float>::max();
+    float end_sigma              = 0.0f;
+    bool initialized             = false;
+    bool initial_step            = true;
+    bool skip_current_step       = false;
+    bool step_active             = false;
+    const void* anchor_condition = nullptr;
+    std::unordered_map<const void*, UCacheCacheEntry> cache_diffs;
     std::vector<float> prev_input;
     std::vector<float> prev_output;
     float output_prev_norm                = 0.0f;
@@ -233,43 +235,30 @@ struct UCacheState {
         return base_threshold * multiplier;
     }
 
-    bool has_cache(const SDCondition* cond) const {
+    bool has_cache(const void* cond) const {
         auto it = cache_diffs.find(cond);
         return it != cache_diffs.end() && !it->second.diff.empty();
     }
 
-    void update_cache(const SDCondition* cond, ggml_tensor* input, ggml_tensor* output) {
+    void update_cache(const void* cond, const sd::Tensor<float>& input, const sd::Tensor<float>& output) {
         UCacheCacheEntry& entry = cache_diffs[cond];
-        size_t ne               = static_cast<size_t>(ggml_nelements(output));
-        entry.diff.resize(ne);
-        float* out_data = (float*)output->data;
-        float* in_data  = (float*)input->data;
-
-        for (size_t i = 0; i < ne; ++i) {
-            entry.diff[i] = out_data[i] - in_data[i];
-        }
+        sd::store_condition_cache_diff(&entry.diff, input, output);
     }
 
-    void apply_cache(const SDCondition* cond, ggml_tensor* input, ggml_tensor* output) {
+    void apply_cache(const void* cond, const sd::Tensor<float>& input, sd::Tensor<float>* output) {
         auto it = cache_diffs.find(cond);
         if (it == cache_diffs.end() || it->second.diff.empty()) {
             return;
         }
-
-        copy_ggml_tensor(output, input);
-        float* out_data                = (float*)output->data;
-        const std::vector<float>& diff = it->second.diff;
-        for (size_t i = 0; i < diff.size(); ++i) {
-            out_data[i] += diff[i];
-        }
+        sd::apply_condition_cache_diff(it->second.diff, input, output);
     }
 
-    bool before_condition(const SDCondition* cond,
-                          ggml_tensor* input,
-                          ggml_tensor* output,
+    bool before_condition(const void* cond,
+                          const sd::Tensor<float>& input,
+                          sd::Tensor<float>* output,
                           float sigma,
                           int step_index) {
-        if (!enabled() || step_index < 0) {
+        if (!enabled() || step_index < 0 || output == nullptr) {
             return false;
         }
         if (step_index != current_step_index) {
@@ -302,13 +291,13 @@ struct UCacheState {
             return false;
         }
 
-        size_t ne = static_cast<size_t>(ggml_nelements(input));
+        size_t ne = static_cast<size_t>(input.numel());
         if (prev_input.size() != ne) {
             return false;
         }
 
-        float* input_data = (float*)input->data;
-        last_input_change = 0.0f;
+        const float* input_data = input.data();
+        last_input_change       = 0.0f;
         for (size_t i = 0; i < ne; ++i) {
             last_input_change += std::fabs(input_data[i] - prev_input[i]);
         }
@@ -354,7 +343,7 @@ struct UCacheState {
         return false;
     }
 
-    void after_condition(const SDCondition* cond, ggml_tensor* input, ggml_tensor* output) {
+    void after_condition(const void* cond, const sd::Tensor<float>& input, const sd::Tensor<float>& output) {
         if (!step_is_active()) {
             return;
         }
@@ -367,16 +356,16 @@ struct UCacheState {
         steps_computed_since_active++;
         consecutive_skipped_steps = 0;
 
-        size_t ne      = static_cast<size_t>(ggml_nelements(input));
-        float* in_data = (float*)input->data;
+        size_t ne            = static_cast<size_t>(input.numel());
+        const float* in_data = input.data();
         prev_input.resize(ne);
         for (size_t i = 0; i < ne; ++i) {
             prev_input[i] = in_data[i];
         }
         has_prev_input = true;
 
-        float* out_data     = (float*)output->data;
-        float output_change = 0.0f;
+        const float* out_data = output.data();
+        float output_change   = 0.0f;
         if (has_prev_output && prev_output.size() == ne) {
             for (size_t i = 0; i < ne; ++i) {
                 output_change += std::fabs(out_data[i] - prev_output[i]);
diff --git a/src/unet.hpp b/src/unet.hpp
index f7aa3f0..63e23eb 100644
--- a/src/unet.hpp
+++ b/src/unet.hpp
@@ -609,30 +609,31 @@ struct UNetModelRunner : public GGMLRunner {
         unet.get_param_tensors(tensors, prefix);
     }
 
-    ggml_cgraph* build_graph(ggml_tensor* x,
-                             ggml_tensor* timesteps,
-                             ggml_tensor* context,
-                             ggml_tensor* c_concat              = nullptr,
-                             ggml_tensor* y                     = nullptr,
-                             int num_video_frames               = -1,
-                             std::vector<ggml_tensor*> controls = {},
-                             float control_strength             = 0.f) {
+    ggml_cgraph* build_graph(const sd::Tensor<float>& x_tensor,
+                             const sd::Tensor<float>& timesteps_tensor,
+                             const sd::Tensor<float>& context_tensor               = {},
+                             const sd::Tensor<float>& c_concat_tensor              = {},
+                             const sd::Tensor<float>& y_tensor                     = {},
+                             int num_video_frames                                  = -1,
+                             const std::vector<sd::Tensor<float>>& controls_tensor = {},
+                             float control_strength                                = 0.f) {
         ggml_cgraph* gf = new_graph_custom(UNET_GRAPH_SIZE);
 
+        ggml_tensor* x         = make_input(x_tensor);
+        ggml_tensor* timesteps = make_input(timesteps_tensor);
+        ggml_tensor* context   = make_optional_input(context_tensor);
+        ggml_tensor* c_concat  = make_optional_input(c_concat_tensor);
+        ggml_tensor* y         = make_optional_input(y_tensor);
+        std::vector<ggml_tensor*> controls;
+        controls.reserve(controls_tensor.size());
+        for (const auto& control_tensor : controls_tensor) {
+            controls.push_back(make_input(control_tensor));
+        }
+
         if (num_video_frames == -1) {
             num_video_frames = static_cast<int>(x->ne[3]);
         }
 
-        x         = to_backend(x);
-        context   = to_backend(context);
-        y         = to_backend(y);
-        timesteps = to_backend(timesteps);
-        c_concat  = to_backend(c_concat);
-
-        for (int i = 0; i < controls.size(); i++) {
-            controls[i] = to_backend(controls[i]);
-        }
-
         auto runner_ctx = get_context();
 
         ggml_tensor* out = unet.forward(&runner_ctx,
@@ -650,17 +651,15 @@ struct UNetModelRunner : public GGMLRunner {
         return gf;
     }
 
-    bool compute(int n_threads,
-                 ggml_tensor* x,
-                 ggml_tensor* timesteps,
-                 ggml_tensor* context,
-                 ggml_tensor* c_concat,
-                 ggml_tensor* y,
-                 int num_video_frames               = -1,
-                 std::vector<ggml_tensor*> controls = {},
-                 float control_strength             = 0.f,
-                 ggml_tensor** output               = nullptr,
-                 ggml_context* output_ctx           = nullptr) {
+    sd::Tensor<float> compute(int n_threads,
+                              const sd::Tensor<float>& x,
+                              const sd::Tensor<float>& timesteps,
+                              const sd::Tensor<float>& context               = {},
+                              const sd::Tensor<float>& c_concat              = {},
+                              const sd::Tensor<float>& y                     = {},
+                              int num_video_frames                           = -1,
+                              const std::vector<sd::Tensor<float>>& controls = {},
+                              float control_strength                         = 0.f) {
         // x: [N, in_channels, h, w]
         // timesteps: [N, ]
         // context: [N, max_position, hidden_size]([N, 77, 768]) or [1, max_position, hidden_size]
@@ -670,7 +669,7 @@ struct UNetModelRunner : public GGMLRunner {
             return build_graph(x, timesteps, context, c_concat, y, num_video_frames, controls, control_strength);
         };
 
-        return GGMLRunner::compute(get_graph, n_threads, false, output, output_ctx);
+        return restore_trailing_singleton_dims(GGMLRunner::compute<float>(get_graph, n_threads, false), x.dim());
     }
 
     void test() {
@@ -679,8 +678,8 @@ struct UNetModelRunner : public GGMLRunner {
         params.mem_buffer = nullptr;
         params.no_alloc   = false;
 
-        ggml_context* work_ctx = ggml_init(params);
-        GGML_ASSERT(work_ctx != nullptr);
+        ggml_context* ctx = ggml_init(params);
+        GGML_ASSERT(ctx != nullptr);
 
         {
             // CPU, num_video_frames = 1, x{num_video_frames, 8, 8, 8}: Pass
@@ -689,27 +688,37 @@ struct UNetModelRunner : public GGMLRunner {
             // CUDA, num_video_frames = 3, x{num_video_frames, 8, 8, 8}: nan
             int num_video_frames = 3;
 
-            auto x = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, 8, 8, 8, num_video_frames);
+            sd::Tensor<float> x({8, 8, 8, num_video_frames});
             std::vector<float> timesteps_vec(num_video_frames, 999.f);
-            auto timesteps = vector_to_ggml_tensor(work_ctx, timesteps_vec);
-            ggml_set_f32(x, 0.5f);
+            auto timesteps = sd::Tensor<float>::from_vector(timesteps_vec);
+            x.fill_(0.5f);
             // print_ggml_tensor(x);
 
-            auto context = ggml_new_tensor_3d(work_ctx, GGML_TYPE_F32, 1024, 1, num_video_frames);
-            ggml_set_f32(context, 0.5f);
+            sd::Tensor<float> context({1024, 1, num_video_frames});
+            context.fill_(0.5f);
             // print_ggml_tensor(context);
 
-            auto y = ggml_new_tensor_2d(work_ctx, GGML_TYPE_F32, 768, num_video_frames);
-            ggml_set_f32(y, 0.5f);
+            sd::Tensor<float> y({768, num_video_frames});
+            y.fill_(0.5f);
             // print_ggml_tensor(y);
 
-            ggml_tensor* out = nullptr;
+            sd::Tensor<float> out;
 
-            int64_t t0 = ggml_time_ms();
-            compute(8, x, timesteps, context, nullptr, y, num_video_frames, {}, 0.f, &out, work_ctx);
-            int64_t t1 = ggml_time_ms();
+            int64_t t0   = ggml_time_ms();
+            auto out_opt = compute(8,
+                                   x,
+                                   timesteps,
+                                   context,
+                                   {},
+                                   y,
+                                   num_video_frames,
+                                   {},
+                                   0.f);
+            int64_t t1   = ggml_time_ms();
 
-            print_ggml_tensor(out);
+            GGML_ASSERT(!out_opt.empty());
+            out = std::move(out_opt);
+            print_sd_tensor(out);
             LOG_DEBUG("unet test done in %lldms", t1 - t0);
         }
     }
diff --git a/src/upscaler.cpp b/src/upscaler.cpp
index 18e185d..03f7714 100644
--- a/src/upscaler.cpp
+++ b/src/upscaler.cpp
@@ -2,6 +2,7 @@
 #include "ggml_extend.hpp"
 #include "model.h"
 #include "stable-diffusion.h"
+#include "util.h"
 
 struct UpscalerGGML {
     ggml_backend_t backend    = nullptr;  // general backend
@@ -64,6 +65,39 @@ struct UpscalerGGML {
         return true;
     }
 
+    // Upscales `input_tensor` with ESRGAN and returns the result, or an empty tensor on failure.
+    // The whole input is processed at once when tiling is off (tile_size <= 0) or when its first
+    // two dimensions both fit in one tile; otherwise it is processed tile by tile.
+    sd::Tensor<float> upscale_tensor(const sd::Tensor<float>& input_tensor) {
+        const bool single_pass = tile_size <= 0 || (input_tensor.shape()[0] <= tile_size && input_tensor.shape()[1] <= tile_size);
+        sd::Tensor<float> upscaled;
+        if (single_pass) {
+            upscaled = esrgan_upscaler->compute(n_threads, input_tensor);
+        } else {
+            auto on_processing = [&](const sd::Tensor<float>& input_tile) -> sd::Tensor<float> {
+                auto output_tile = esrgan_upscaler->compute(n_threads, input_tile);
+                if (!output_tile.empty()) {
+                    return output_tile;
+                }
+                LOG_ERROR("esrgan compute failed while processing a tile");
+                return {};
+            };
+
+            // Output extent along the first two dimensions, scaled by the model's factor.
+            const int out_dim0 = static_cast<int>(input_tensor.shape()[0] * esrgan_upscaler->scale);
+            const int out_dim1 = static_cast<int>(input_tensor.shape()[1] * esrgan_upscaler->scale);
+            upscaled           = process_tiles_2d(input_tensor, out_dim0, out_dim1, esrgan_upscaler->scale,
+                                                  tile_size, tile_size, 0.25f, false, false, on_processing);
+        }
+        // Released on both paths, before the result is inspected.
+        esrgan_upscaler->free_compute_buffer();
+        if (!upscaled.empty()) {
+            return upscaled;
+        }
+        LOG_ERROR("esrgan compute failed");
+        return {};
+    }
+
     sd_image_t upscale(sd_image_t input_image, uint32_t upscale_factor) {
         // upscale_factor, unused for RealESRGAN_x4plus_anime_6B.pth
         sd_image_t upscaled_image = {0, 0, 0, nullptr};
@@ -72,40 +106,17 @@ struct UpscalerGGML {
         LOG_INFO("upscaling from (%i x %i) to (%i x %i)",
                  input_image.width, input_image.height, output_width, output_height);
 
-        ggml_init_params params;
-        params.mem_size   = static_cast<size_t>(1024 * 1024) * 1024;  // 1G
-        params.mem_buffer = nullptr;
-        params.no_alloc   = false;
-
-        // draft context
-        ggml_context* upscale_ctx = ggml_init(params);
-        if (!upscale_ctx) {
-            LOG_ERROR("ggml_init() failed");
+        sd::Tensor<float> input_tensor = sd_image_to_tensor(input_image);
+        sd::Tensor<float> upscaled;
+        int64_t t0 = ggml_time_ms();
+        upscaled   = upscale_tensor(input_tensor);
+        if (upscaled.empty()) {
             return upscaled_image;
         }
-        // LOG_DEBUG("upscale work buffer size: %.2f MB", params.mem_size / 1024.f / 1024.f);
-        ggml_tensor* input_image_tensor = ggml_new_tensor_4d(upscale_ctx, GGML_TYPE_F32, input_image.width, input_image.height, 3, 1);
-        sd_image_to_ggml_tensor(input_image, input_image_tensor);
-
-        ggml_tensor* upscaled = ggml_new_tensor_4d(upscale_ctx, GGML_TYPE_F32, output_width, output_height, 3, 1);
-        auto on_tiling        = [&](ggml_tensor* in, ggml_tensor* out, bool init) {
-            return esrgan_upscaler->compute(n_threads, in, &out);
-        };
-        int64_t t0 = ggml_time_ms();
-        // TODO: circular upscaling?
-        sd_tiling(input_image_tensor, upscaled, esrgan_upscaler->scale, esrgan_upscaler->tile_size, 0.25f, false, false, on_tiling);
-        esrgan_upscaler->free_compute_buffer();
-        ggml_ext_tensor_clamp_inplace(upscaled, 0.f, 1.f);
-        uint8_t* upscaled_data = ggml_tensor_to_sd_image(upscaled);
-        ggml_free(upscale_ctx);
-        int64_t t3 = ggml_time_ms();
+        sd_image_t upscaled_data = tensor_to_sd_image(upscaled);
+        int64_t t3               = ggml_time_ms();
         LOG_INFO("input_image_tensor upscaled, taking %.2fs", (t3 - t0) / 1000.0f);
-        upscaled_image = {
-            (uint32_t)output_width,
-            (uint32_t)output_height,
-            3,
-            upscaled_data,
-        };
+        upscaled_image = upscaled_data;
         return upscaled_image;
     }
 };
diff --git a/src/util.cpp b/src/util.cpp
index a94cfd9..2d330a2 100644
--- a/src/util.cpp
+++ b/src/util.cpp
@@ -479,158 +479,96 @@ const char* sd_get_system_info() {
     return buffer;
 }
 
-sd_image_f32_t sd_image_t_to_sd_image_f32_t(sd_image_t image) {
-    sd_image_f32_t converted_image;
-    converted_image.width   = image.width;
-    converted_image.height  = image.height;
-    converted_image.channel = image.channel;
+sd_image_t tensor_to_sd_image(const sd::Tensor<float>& tensor, int frame_index) {  // Quantise one frame of a float tensor into a malloc'd, channel-interleaved 8-bit image.
+    const auto& shape = tensor.shape();
+    GGML_ASSERT(shape.size() == 4 || shape.size() == 5);  // 4-D is read as index(w, h, c, frame); 5-D as index(w, h, frame, c, 0).
+    int64_t width   = static_cast<int64_t>(shape[0]);  // 64-bit so the byte count and pixel offsets below cannot overflow int.
+    int64_t height  = static_cast<int64_t>(shape[1]);
+    int64_t channel = static_cast<int64_t>(shape[shape.size() == 5 ? 3 : 2]);
+    uint8_t* data   = (uint8_t*)malloc(static_cast<size_t>(width * height * channel));  // Handed back inside the sd_image_t; never freed here.
+    GGML_ASSERT(data != nullptr);
 
-    // Allocate memory for float data
-    converted_image.data = (float*)malloc(image.width * image.height * image.channel * sizeof(float));
-
-    for (uint32_t i = 0; i < image.width * image.height * image.channel; i++) {
-        // Convert uint8_t to float
-        converted_image.data[i] = (float)image.data[i];
-    }
-
-    return converted_image;
-}
-
-// Function to perform double linear interpolation
-float interpolate(float v1, float v2, float v3, float v4, float x_ratio, float y_ratio) {
-    return v1 * (1 - x_ratio) * (1 - y_ratio) + v2 * x_ratio * (1 - y_ratio) + v3 * (1 - x_ratio) * y_ratio + v4 * x_ratio * y_ratio;
-}
-
-sd_image_f32_t resize_sd_image_f32_t(sd_image_f32_t image, int target_width, int target_height) {
-    sd_image_f32_t resized_image;
-    resized_image.width   = target_width;
-    resized_image.height  = target_height;
-    resized_image.channel = image.channel;
-
-    // Allocate memory for resized float data
-    resized_image.data = (float*)malloc(target_width * target_height * image.channel * sizeof(float));
-
-    for (int y = 0; y < target_height; y++) {
-        for (int x = 0; x < target_width; x++) {
-            float original_x = (float)x * image.width / target_width;
-            float original_y = (float)y * image.height / target_height;
-
-            uint32_t x1 = (uint32_t)original_x;
-            uint32_t y1 = (uint32_t)original_y;
-            uint32_t x2 = std::min(x1 + 1, image.width - 1);
-            uint32_t y2 = std::min(y1 + 1, image.height - 1);
-
-            for (uint32_t k = 0; k < image.channel; k++) {
-                float v1 = *(image.data + y1 * image.width * image.channel + x1 * image.channel + k);
-                float v2 = *(image.data + y1 * image.width * image.channel + x2 * image.channel + k);
-                float v3 = *(image.data + y2 * image.width * image.channel + x1 * image.channel + k);
-                float v4 = *(image.data + y2 * image.width * image.channel + x2 * image.channel + k);
-
-                float x_ratio = original_x - x1;
-                float y_ratio = original_y - y1;
-
-                float value = interpolate(v1, v2, v3, v4, x_ratio, y_ratio);
-
-                *(resized_image.data + y * target_width * image.channel + x * image.channel + k) = value;
+    for (int iw = 0; iw < width; ++iw) {
+        for (int ih = 0; ih < height; ++ih) {
+            for (int ic = 0; ic < channel; ++ic) {
+                float value                            = shape.size() == 5 ? tensor.index(iw, ih, frame_index, ic, 0)
+                                                                           : tensor.index(iw, ih, ic, frame_index);
+                value                                  = std::clamp(value, 0.0f, 1.0f);  // Saturate before scaling so the uint8_t cast stays in range.
+                data[(ih * width + iw) * channel + ic] = static_cast<uint8_t>(std::round(value * 255.0f));  // Row-major, channels interleaved; offset is computed in 64-bit.
             }
         }
     }
-
-    return resized_image;
+    return {
+        static_cast<uint32_t>(width),
+        static_cast<uint32_t>(height),
+        static_cast<uint32_t>(channel),
+        data,
+    };
 }
 
-void normalize_sd_image_f32_t(sd_image_f32_t image, float means[3], float stds[3]) {
-    for (uint32_t y = 0; y < image.height; y++) {
-        for (uint32_t x = 0; x < image.width; x++) {
-            for (uint32_t k = 0; k < image.channel; k++) {
-                int index         = (y * image.width + x) * image.channel + k;
-                image.data[index] = (image.data[index] - means[k]) / stds[k];
+sd::Tensor<float> sd_image_to_tensor(sd_image_t image,  // Convert an sd_image_t into a float tensor, optionally resized.
+                                     int target_width,  // Resize happens only when both targets are non-negative.
+                                     int target_height,
+                                     bool scale) {  // Forwarded unchanged to sd_image_get_f32 for every sample.
+    sd::Tensor<float> tensor = sd::zeros<float>({static_cast<int64_t>(image.width),  // Tensor shape is {width, height, channel, 1}.
+                                                 static_cast<int64_t>(image.height),
+                                                 static_cast<int64_t>(image.channel),
+                                                 1});
+    for (uint32_t iw = 0; iw < image.width; ++iw) {
+        for (uint32_t ih = 0; ih < image.height; ++ih) {
+            for (uint32_t ic = 0; ic < image.channel; ++ic) {
+                tensor.index(iw, ih, ic, 0) = sd_image_get_f32(image, iw, ih, ic, scale);  // One sample per (x, y, channel).
             }
         }
     }
+    if (target_width >= 0 && target_height >= 0 &&  // Skip the resize when either target is negative...
+        (tensor.shape()[0] != target_width || tensor.shape()[1] != target_height)) {  // ...or when the size already matches.
+        tensor = sd::ops::interpolate(tensor,  // Only the first two dimensions change; the last two are carried over.
+                                      {target_width,
+                                       target_height,
+                                       tensor.shape()[2],
+                                       tensor.shape()[3]});
+    }
+    return tensor;
 }
 
 // Constants for means and std
 float means[3] = {0.48145466f, 0.4578275f, 0.40821073f};
 float stds[3]  = {0.26862954f, 0.26130258f, 0.27577711f};
 
-// Function to clip and preprocess sd_image_f32_t
-sd_image_f32_t clip_preprocess(sd_image_f32_t image, int target_width, int target_height) {
-    float width_scale  = (float)target_width / image.width;
-    float height_scale = (float)target_height / image.height;
+sd::Tensor<float> clip_preprocess(const sd::Tensor<float>& image, int target_width, int target_height) {  // Resize to cover the target, centre-crop, clamp to [0, 1], then normalise with means/stds.
+    GGML_ASSERT(image.dim() == 4);
+    GGML_ASSERT(image.shape()[2] == 3);  // means/stds below hold exactly three entries.
+    GGML_ASSERT(image.shape()[3] == 1);
+    GGML_ASSERT(target_width > 0 && target_height > 0);
 
-    float scale = std::fmax(width_scale, height_scale);
+    float width_scale  = static_cast<float>(target_width) / static_cast<float>(image.shape()[0]);
+    float height_scale = static_cast<float>(target_height) / static_cast<float>(image.shape()[1]);
+    float scale        = std::fmax(width_scale, height_scale);  // Larger ratio, so the resized image covers the target on both axes.
 
-    // Interpolation
-    int resized_width   = (int)(scale * image.width);
-    int resized_height  = (int)(scale * image.height);
-    float* resized_data = (float*)malloc(resized_width * resized_height * image.channel * sizeof(float));
+    int64_t resized_width  = static_cast<int64_t>(scale * static_cast<float>(image.shape()[0]));  // Truncating cast: float rounding can land one below the target.
+    int64_t resized_height = static_cast<int64_t>(scale * static_cast<float>(image.shape()[1]));
 
-    for (int y = 0; y < resized_height; y++) {
-        for (int x = 0; x < resized_width; x++) {
-            float original_x = (float)x * image.width / resized_width;
-            float original_y = (float)y * image.height / resized_height;
+    sd::Tensor<float> resized = sd::ops::interpolate(
+        image,
+        {resized_width, resized_height, image.shape()[2], image.shape()[3]});
 
-            uint32_t x1 = (uint32_t)original_x;
-            uint32_t y1 = (uint32_t)original_y;
-            uint32_t x2 = std::min(x1 + 1, image.width - 1);
-            uint32_t y2 = std::min(y1 + 1, image.height - 1);
+    int64_t h_offset = std::max<int64_t>((resized_height - target_height) / 2, 0);  // Centre the crop window; never negative.
+    int64_t w_offset = std::max<int64_t>((resized_width - target_width) / 2, 0);
 
-            for (uint32_t k = 0; k < image.channel; k++) {
-                float v1 = *(image.data + y1 * image.width * image.channel + x1 * image.channel + k);
-                float v2 = *(image.data + y1 * image.width * image.channel + x2 * image.channel + k);
-                float v3 = *(image.data + y2 * image.width * image.channel + x1 * image.channel + k);
-                float v4 = *(image.data + y2 * image.width * image.channel + x2 * image.channel + k);
-
-                float x_ratio = original_x - x1;
-                float y_ratio = original_y - y1;
-
-                float value = interpolate(v1, v2, v3, v4, x_ratio, y_ratio);
-
-                *(resized_data + y * resized_width * image.channel + x * image.channel + k) = value;
+    sd::Tensor<float> cropped({target_width, target_height, image.shape()[2], image.shape()[3]});
+    for (int64_t y = 0; y < target_height; ++y) {  // Source indices are clamped below because resized_* may be one short of the target.
+        for (int64_t x = 0; x < target_width; ++x) {
+            for (int64_t c = 0; c < image.shape()[2]; ++c) {
+                cropped.index(x, y, c, 0) = resized.index(std::min(x + w_offset, resized_width - 1), std::min(y + h_offset, resized_height - 1), c, 0);
             }
         }
     }
 
-    // Clip and preprocess
-    int h_offset = std::max((int)(resized_height - target_height) / 2, 0);
-    int w_offset = std::max((int)(resized_width - target_width) / 2, 0);
-
-    sd_image_f32_t result;
-    result.width   = target_width;
-    result.height  = target_height;
-    result.channel = image.channel;
-    result.data    = (float*)malloc(target_height * target_width * image.channel * sizeof(float));
-
-    for (uint32_t k = 0; k < image.channel; k++) {
-        for (uint32_t i = 0; i < result.height; i++) {
-            for (uint32_t j = 0; j < result.width; j++) {
-                int src_y = std::min(static_cast<int>(i + h_offset), resized_height - 1);
-                int src_x = std::min(static_cast<int>(j + w_offset), resized_width - 1);
-                *(result.data + i * result.width * image.channel + j * image.channel + k) =
-                    fmin(fmax(*(resized_data + src_y * resized_width * image.channel + src_x * image.channel + k), 0.0f), 255.0f) / 255.0f;
-            }
-        }
-    }
-
-    // Free allocated memory
-    free(resized_data);
-
-    // Normalize
-    for (uint32_t k = 0; k < image.channel; k++) {
-        for (uint32_t i = 0; i < result.height; i++) {
-            for (uint32_t j = 0; j < result.width; j++) {
-                // *(result.data + i * size * image.channel + j * image.channel + k) = 0.5f;
-                int offset  = i * result.width * image.channel + j * image.channel + k;
-                float value = *(result.data + offset);
-                value       = (value - means[k]) / stds[k];
-                // value = 0.5f;
-                *(result.data + offset) = value;
-            }
-        }
-    }
-
-    return result;
+    sd::Tensor<float> normalized = sd::ops::clamp(cropped, 0.0f, 1.0f);
+    sd::Tensor<float> mean({1, 1, 3, 1}, {means[0], means[1], means[2]});  // Shape {1, 1, 3, 1}: one value per channel.
+    sd::Tensor<float> std({1, 1, 3, 1}, {stds[0], stds[1], stds[2]});
+    return (normalized - mean) / std;
 }
 
 // Ref: https://github.com/AUTOMATIC1111/stable-diffusion-webui/blob/cad87bf4e3e0b0a759afa94e933527c3123d59bc/modules/prompt_parser.py#L345
diff --git a/src/util.h b/src/util.h
index 7dee7bf..24ce4cf 100644
--- a/src/util.h
+++ b/src/util.h
@@ -7,6 +7,7 @@
 #include <vector>
 
 #include "stable-diffusion.h"
+#include "tensor.hpp"
 
 #define SAFE_STR(s) ((s) ? (s) : "")
 #define BOOL_STR(b) ((b) ? "true" : "false")
@@ -29,20 +30,14 @@ std::string utf32_to_utf8(const std::u32string& utf32_str);
 std::u32string unicode_value_to_utf32(int unicode_value);
 // std::string sd_basename(const std::string& path);
 
-typedef struct {
-    uint32_t width;
-    uint32_t height;
-    uint32_t channel;
-    float* data;
-} sd_image_f32_t;
+sd_image_t tensor_to_sd_image(const sd::Tensor<float>& tensor, int frame_index = 0);
 
-void normalize_sd_image_f32_t(sd_image_f32_t image, float means[3], float stds[3]);
+sd::Tensor<float> sd_image_to_tensor(sd_image_t image,
+                                     int target_width  = -1,
+                                     int target_height = -1,
+                                     bool scale        = true);
 
-sd_image_f32_t sd_image_t_to_sd_image_f32_t(sd_image_t image);
-
-sd_image_f32_t resize_sd_image_f32_t(sd_image_f32_t image, int target_width, int target_height);
-
-sd_image_f32_t clip_preprocess(sd_image_f32_t image, int target_width, int target_height);
+sd::Tensor<float> clip_preprocess(const sd::Tensor<float>& image, int target_width, int target_height);
 
 class MmapWrapper {
 public:
diff --git a/src/vae.hpp b/src/vae.hpp
index dafc0d4..22be886 100644
--- a/src/vae.hpp
+++ b/src/vae.hpp
@@ -2,16 +2,64 @@
 #define __VAE_HPP__
 
 #include "common_block.hpp"
+#include "tensor_ggml.hpp"
 
 struct VAE : public GGMLRunner {
 protected:
     SDVersion version;
-    bool scale_input                                = true;
-    virtual bool _compute(const int n_threads,
-                          ggml_tensor* z,
-                          bool decode_graph,
-                          ggml_tensor** output,
-                          ggml_context* output_ctx) = 0;
+    bool scale_input                                      = true;
+    virtual sd::Tensor<float> _compute(const int n_threads,
+                                       const sd::Tensor<float>& z,
+                                       bool decode_graph) = 0;
+
+    static inline void scale_tensor_to_minus1_1(sd::Tensor<float>* tensor) {  // In-place affine map v -> 2*v - 1 over every element.
+        GGML_ASSERT(tensor != nullptr);  // A null tensor aborts rather than being silently ignored.
+        for (int64_t i = 0; i < tensor->numel(); ++i) {
+            (*tensor)[i] = (*tensor)[i] * 2.0f - 1.0f;  // Sends 0 to -1 and 1 to +1; no clamping is applied.
+        }
+    }
+
+    static inline void scale_tensor_to_0_1(sd::Tensor<float>* tensor) {  // In-place map v -> (v + 1) / 2, then saturate to [0, 1].
+        GGML_ASSERT(tensor != nullptr);  // A null tensor aborts rather than being silently ignored.
+        for (int64_t i = 0; i < tensor->numel(); ++i) {
+            float value  = ((*tensor)[i] + 1.0f) * 0.5f;  // Sends -1 to 0 and +1 to 1.
+            (*tensor)[i] = std::max(0.0f, std::min(1.0f, value));  // Values outside [0, 1] are pinned to the nearest bound.
+        }
+    }
+
+    sd::Tensor<float> tiled_compute(const sd::Tensor<float>& input,  // Run _compute tile by tile through ::process_tiles_2d.
+                                    int n_threads,  // Passed to _compute for every tile.
+                                    int output_width,
+                                    int output_height,
+                                    int scale,
+                                    int p_tile_size_x,
+                                    int p_tile_size_y,
+                                    float tile_overlap_factor,
+                                    bool circular_x,
+                                    bool circular_y,
+                                    bool decode_graph,  // Passed to _compute for every tile.
+                                    const char* error_message,  // Logged via LOG_ERROR when a tile comes back empty.
+                                    bool silent = false) {
+        auto on_processing = [&](const sd::Tensor<float>& input_tile) {  // Per-tile callback given to process_tiles_2d.
+            auto output_tile = _compute(n_threads, input_tile, decode_graph);
+            if (output_tile.empty()) {  // An empty result from _compute is treated as failure.
+                LOG_ERROR("%s", error_message);
+                return sd::Tensor<float>();  // The failure is reported to process_tiles_2d as an empty tensor.
+            }
+            return output_tile;
+        };
+        return ::process_tiles_2d(input,  // All geometry arguments are forwarded unchanged.
+                                  output_width,
+                                  output_height,
+                                  scale,
+                                  p_tile_size_x,
+                                  p_tile_size_y,
+                                  tile_overlap_factor,
+                                  circular_x,
+                                  circular_y,
+                                  on_processing,
+                                  silent);
+    }
 
 public:
     VAE(SDVersion version, ggml_backend_t backend, bool offload_params_to_cpu)
@@ -60,133 +108,109 @@ public:
         tile_size_y = get_tile_size(params.tile_size_y, params.rel_size_y, latent_y);
     }
 
-    ggml_tensor* encode(int n_threads,
-                        ggml_context* work_ctx,
-                        ggml_tensor* x,
-                        sd_tiling_params_t tiling_params,
-                        bool circular_x = false,
-                        bool circular_y = false) {
-        int64_t t0             = ggml_time_ms();
-        ggml_tensor* result    = nullptr;
-        const int scale_factor = get_scale_factor();
-        int64_t W              = x->ne[0] / scale_factor;
-        int64_t H              = x->ne[1] / scale_factor;
-        int channel_dim        = sd_version_is_wan(version) ? 3 : 2;
-        int64_t C              = get_encoder_output_channels(static_cast<int>(x->ne[channel_dim]));
-        int64_t ne2;
-        int64_t ne3;
-        if (sd_version_is_wan(version)) {
-            int64_t T = x->ne[2];
-            ne2       = (T - 1) / 4 + 1;
-            ne3       = C;
-        } else {
-            ne2 = C;
-            ne3 = x->ne[3];
-        }
-        result = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, W, H, ne2, ne3);
-
+    sd::Tensor<float> encode(int n_threads,  // Encode x with the VAE; returns an empty tensor on failure.
+                             const sd::Tensor<float>& x,
+                             sd_tiling_params_t tiling_params,  // tiling_params.enabled selects the tiled path.
+                             bool circular_x = false,
+                             bool circular_y = false) {
+        int64_t t0              = ggml_time_ms();
+        sd::Tensor<float> input = x;  // Working copy: it is rescaled in place below, so x stays untouched.
+        sd::Tensor<float> output;
         if (scale_input) {
-            scale_to_minus1_1(x);
-        }
-
-        if (sd_version_is_qwen_image(version) || sd_version_is_anima(version)) {
-            x = ggml_reshape_4d(work_ctx, x, x->ne[0], x->ne[1], 1, x->ne[2] * x->ne[3]);
+            scale_tensor_to_minus1_1(&input);
         }
 
         if (tiling_params.enabled) {
+            const int scale_factor = get_scale_factor();
+            int64_t W              = input.shape()[0] / scale_factor;  // Encoded size is the input size divided by the scale factor.
+            int64_t H              = input.shape()[1] / scale_factor;
             float tile_overlap;
             int tile_size_x, tile_size_y;
-            // multiply tile size for encode to keep the compute buffer size consistent
             get_tile_sizes(tile_size_x, tile_size_y, tile_overlap, tiling_params, W, H, 1.30539f);
-
             LOG_DEBUG("VAE Tile size: %dx%d", tile_size_x, tile_size_y);
-
-            auto on_tiling = [&](ggml_tensor* in, ggml_tensor* out, bool init) {
-                return _compute(n_threads, in, false, &out, work_ctx);
-            };
-            sd_tiling_non_square(x, result, scale_factor, tile_size_x, tile_size_y, tile_overlap, circular_x, circular_y, on_tiling);
+            output = tiled_compute(input,
+                                   n_threads,
+                                   static_cast<int>(W),
+                                   static_cast<int>(H),
+                                   scale_factor,
+                                   tile_size_x,
+                                   tile_size_y,
+                                   tile_overlap,
+                                   circular_x,
+                                   circular_y,
+                                   false,
+                                   "vae encode compute failed while processing a tile");
         } else {
+            output = _compute(n_threads, input, false);
         }
         free_compute_buffer();
 
+        if (output.empty()) {  // The compute buffer has already been released on both paths, as decode does.
+            LOG_ERROR("vae encode compute failed");
+            return {};
+        }
         int64_t t1 = ggml_time_ms();
         LOG_DEBUG("computing vae encode graph completed, taking %.2fs", (t1 - t0) * 1.0f / 1000);
-        return result;
+        return output;  // Plain return of the local allows copy elision; std::move would block it.
     }
 
-    ggml_tensor* decode(int n_threads,
-                        ggml_context* work_ctx,
-                        ggml_tensor* x,
-                        sd_tiling_params_t tiling_params,
-                        bool decode_video   = false,
-                        bool circular_x     = false,
-                        bool circular_y     = false,
-                        ggml_tensor* result = nullptr,
-                        bool silent         = false) {
-        const int scale_factor = get_scale_factor();
-        int64_t W              = x->ne[0] * scale_factor;
-        int64_t H              = x->ne[1] * scale_factor;
-        int64_t C              = 3;
-        if (result == nullptr) {
-            if (decode_video) {
-                int64_t T = x->ne[2];
-                if (sd_version_is_wan(version)) {
-                    T = ((T - 1) * 4) + 1;
-                }
-                result = ggml_new_tensor_4d(work_ctx,
-                                            GGML_TYPE_F32,
-                                            W,
-                                            H,
-                                            T,
-                                            3);
-            } else {
-                result = ggml_new_tensor_4d(work_ctx,
-                                            GGML_TYPE_F32,
-                                            W,
-                                            H,
-                                            C,
-                                            x->ne[3]);
-            }
-        }
-        int64_t t0 = ggml_time_ms();
-        if (sd_version_is_qwen_image(version) || sd_version_is_anima(version)) {
-            x = ggml_reshape_4d(work_ctx, x, x->ne[0], x->ne[1], 1, x->ne[2] * x->ne[3]);
-        }
+    sd::Tensor<float> decode(int n_threads,  // Decode latents x with the VAE; returns an empty tensor on failure.
+                             const sd::Tensor<float>& x,
+                             sd_tiling_params_t tiling_params,  // tiling_params.enabled selects the tiled path.
+                             bool decode_video = false,  // Not read anywhere in this body.
+                             bool circular_x   = false,
+                             bool circular_y   = false,
+                             bool silent       = false) {
+        int64_t t0                     = ggml_time_ms();
+        const sd::Tensor<float>& input = x;  // Alias only: decode never mutates its input, so no copy is needed.
+        sd::Tensor<float> output;
+
         if (tiling_params.enabled) {
+            const int scale_factor = get_scale_factor();
+            int64_t W              = input.shape()[0] * scale_factor;  // Decoded size is the latent size times the scale factor.
+            int64_t H              = input.shape()[1] * scale_factor;
             float tile_overlap;
             int tile_size_x, tile_size_y;
-            get_tile_sizes(tile_size_x, tile_size_y, tile_overlap, tiling_params, x->ne[0], x->ne[1]);
-
+            get_tile_sizes(tile_size_x, tile_size_y, tile_overlap, tiling_params, input.shape()[0], input.shape()[1]);
             if (!silent) {
                 LOG_DEBUG("VAE Tile size: %dx%d", tile_size_x, tile_size_y);
             }
-
-            auto on_tiling = [&](ggml_tensor* in, ggml_tensor* out, bool init) {
-                return _compute(n_threads, in, true, &out, nullptr);
-            };
-            sd_tiling_non_square(x, result, scale_factor, tile_size_x, tile_size_y, tile_overlap, circular_x, circular_y, on_tiling, silent);
+            output = tiled_compute(
+                input,
+                n_threads,
+                static_cast<int>(W),
+                static_cast<int>(H),
+                scale_factor,
+                tile_size_x,
+                tile_size_y,
+                tile_overlap,
+                circular_x,
+                circular_y,
+                true,
+                "vae decode compute failed while processing a tile",
+                silent);
         } else {
-            if (!_compute(n_threads, x, true, &result, work_ctx)) {
-                LOG_ERROR("Failed to decode latetnts");
-                free_compute_buffer();
-                return nullptr;
-            }
+            output = _compute(n_threads, input, true);
         }
+
         free_compute_buffer();
+
+        if (output.empty()) {  // Checked after the buffer release, so the failure path frees it too.
+            LOG_ERROR("vae decode compute failed");
+            return {};
+        }
         if (scale_input) {
-            scale_to_0_1(result);
+            scale_tensor_to_0_1(&output);
         }
         int64_t t1 = ggml_time_ms();
         LOG_DEBUG("computing vae decode graph completed, taking %.2fs", (t1 - t0) * 1.0f / 1000);
-        ggml_ext_tensor_clamp_inplace(result, 0.0f, 1.0f);
-        return result;
+        return output;  // Plain return of the local allows copy elision; std::move would block it.
     }
 
-    virtual ggml_tensor* vae_output_to_latents(ggml_context* work_ctx, ggml_tensor* vae_output, std::shared_ptr<RNG> rng) = 0;
-    virtual ggml_tensor* diffusion_to_vae_latents(ggml_context* work_ctx, ggml_tensor* latents)                           = 0;
-    virtual ggml_tensor* vae_to_diffuison_latents(ggml_context* work_ctx, ggml_tensor* latents)                           = 0;
-    virtual void get_param_tensors(std::map<std::string, ggml_tensor*>& tensors, const std::string prefix)                = 0;
+    virtual sd::Tensor<float> vae_output_to_latents(const sd::Tensor<float>& vae_output, std::shared_ptr<RNG> rng) = 0;
+    virtual sd::Tensor<float> diffusion_to_vae_latents(const sd::Tensor<float>& latents)                           = 0;
+    virtual sd::Tensor<float> vae_to_diffusion_latents(const sd::Tensor<float>& latents)                           = 0;
+    virtual void get_param_tensors(std::map<std::string, ggml_tensor*>& tensors, const std::string prefix)         = 0;
     virtual void set_conv2d_scale(float scale) { SD_UNUSED(scale); };
 };
 
@@ -198,31 +222,25 @@ struct FakeVAE : public VAE {
         return input_channels;
     }
 
-    bool _compute(const int n_threads,
-                  ggml_tensor* z,
-                  bool decode_graph,
-                  ggml_tensor** output,
-                  ggml_context* output_ctx) override {
-        if (*output == nullptr && output_ctx != nullptr) {
-            *output = ggml_dup_tensor(output_ctx, z);
-        }
-        ggml_ext_tensor_iter(z, [&](ggml_tensor* z, int64_t i0, int64_t i1, int64_t i2, int64_t i3) {
-            float value = ggml_ext_tensor_get_f32(z, i0, i1, i2, i3);
-            ggml_ext_tensor_set_f32(*output, value, i0, i1, i2, i3);
-        });
-        return true;
+    sd::Tensor<float> _compute(const int n_threads,  // Identity pass: the result is a copy of z.
+                               const sd::Tensor<float>& z,
+                               bool decode_graph) override {
+        SD_UNUSED(n_threads);  // Neither argument affects the result.
+        SD_UNUSED(decode_graph);
+        return z;
     }
 
-    ggml_tensor* vae_output_to_latents(ggml_context* work_ctx, ggml_tensor* vae_output, std::shared_ptr<RNG> rng) {
+    sd::Tensor<float> vae_output_to_latents(const sd::Tensor<float>& vae_output, std::shared_ptr<RNG> rng) override {  // Returns vae_output unchanged.
+        SD_UNUSED(rng);  // No sampling is done here, so the RNG is not consulted.
         return vae_output;
     }
 
-    ggml_tensor* diffusion_to_vae_latents(ggml_context* work_ctx, ggml_tensor* latents) {
-        return ggml_ext_dup_and_cpy_tensor(work_ctx, latents);
+    sd::Tensor<float> diffusion_to_vae_latents(const sd::Tensor<float>& latents) override {  // No conversion: the result is a copy of latents.
+        return latents;
     }
 
-    ggml_tensor* vae_to_diffuison_latents(ggml_context* work_ctx, ggml_tensor* latents) {
-        return ggml_ext_dup_and_cpy_tensor(work_ctx, latents);
+    sd::Tensor<float> vae_to_diffusion_latents(const sd::Tensor<float>& latents) override {  // No conversion: the result is a copy of latents.
+        return latents;
     }
 
     void get_param_tensors(std::map<std::string, ggml_tensor*>& tensors, const std::string prefix) override {}
diff --git a/src/wan.hpp b/src/wan.hpp
index af8acbf..6860262 100644
--- a/src/wan.hpp
+++ b/src/wan.hpp
@@ -1131,105 +1131,66 @@ namespace WAN {
             ae.get_param_tensors(tensors, prefix);
         }
 
-        ggml_tensor* vae_output_to_latents(ggml_context* work_ctx, ggml_tensor* vae_output, std::shared_ptr<RNG> rng) {
+        sd::Tensor<float> vae_output_to_latents(const sd::Tensor<float>& vae_output, std::shared_ptr<RNG> rng) override {
+            SD_UNUSED(rng);
             return vae_output;
         }
 
-        void get_latents_mean_std_vec(ggml_tensor* latents, int channel_dim, std::vector<float>& latents_mean_vec, std::vector<float>& latents_std_vec) {
-            GGML_ASSERT(latents->ne[channel_dim] == 16 || latents->ne[channel_dim] == 48);
-            if (latents->ne[channel_dim] == 16) {  // Wan2.1 VAE
-                latents_mean_vec = {-0.7571f, -0.7089f, -0.9113f, 0.1075f, -0.1745f, 0.9653f, -0.1517f, 1.5508f,
-                                    0.4134f, -0.0715f, 0.5517f, -0.3632f, -0.1922f, -0.9497f, 0.2503f, -0.2921f};
-                latents_std_vec  = {2.8184f, 1.4541f, 2.3275f, 2.6558f, 1.2196f, 1.7708f, 2.6052f, 2.0743f,
-                                    3.2687f, 2.1526f, 2.8652f, 1.5579f, 1.6382f, 1.1253f, 2.8251f, 1.9160f};
-            } else if (latents->ne[channel_dim] == 48) {  // Wan2.2 VAE
-                latents_mean_vec = {-0.2289f, -0.0052f, -0.1323f, -0.2339f, -0.2799f, 0.0174f, 0.1838f, 0.1557f,
-                                    -0.1382f, 0.0542f, 0.2813f, 0.0891f, 0.1570f, -0.0098f, 0.0375f, -0.1825f,
-                                    -0.2246f, -0.1207f, -0.0698f, 0.5109f, 0.2665f, -0.2108f, -0.2158f, 0.2502f,
-                                    -0.2055f, -0.0322f, 0.1109f, 0.1567f, -0.0729f, 0.0899f, -0.2799f, -0.1230f,
-                                    -0.0313f, -0.1649f, 0.0117f, 0.0723f, -0.2839f, -0.2083f, -0.0520f, 0.3748f,
-                                    0.0152f, 0.1957f, 0.1433f, -0.2944f, 0.3573f, -0.0548f, -0.1681f, -0.0667f};
-                latents_std_vec  = {
-                     0.4765f, 1.0364f, 0.4514f, 1.1677f, 0.5313f, 0.4990f, 0.4818f, 0.5013f,
-                     0.8158f, 1.0344f, 0.5894f, 1.0901f, 0.6885f, 0.6165f, 0.8454f, 0.4978f,
-                     0.5759f, 0.3523f, 0.7135f, 0.6804f, 0.5833f, 1.4146f, 0.8986f, 0.5659f,
-                     0.7069f, 0.5338f, 0.4889f, 0.4917f, 0.4069f, 0.4999f, 0.6866f, 0.4093f,
-                     0.5709f, 0.6065f, 0.6415f, 0.4944f, 0.5726f, 1.2042f, 0.5458f, 1.6887f,
-                     0.3971f, 1.0600f, 0.3943f, 0.5537f, 0.5444f, 0.4089f, 0.7468f, 0.7744f};
+        std::pair<sd::Tensor<float>, sd::Tensor<float>> get_latents_mean_std(const sd::Tensor<float>& latents) {
+            int channel_dim = latents.dim() == 5 ? 3 : 2;
+            std::vector<int64_t> stats_shape(static_cast<size_t>(latents.dim()), 1);
+            if (latents.shape()[channel_dim] == 16) {  // Wan2.1 VAE
+                stats_shape[static_cast<size_t>(channel_dim)] = 16;
+
+                auto mean_tensor = sd::Tensor<float>::from_vector({-0.7571f, -0.7089f, -0.9113f, 0.1075f, -0.1745f, 0.9653f, -0.1517f, 1.5508f,
+                                                                   0.4134f, -0.0715f, 0.5517f, -0.3632f, -0.1922f, -0.9497f, 0.2503f, -0.2921f});
+                mean_tensor.reshape_(stats_shape);
+                auto std_tensor = sd::Tensor<float>::from_vector({2.8184f, 1.4541f, 2.3275f, 2.6558f, 1.2196f, 1.7708f, 2.6052f, 2.0743f,
+                                                                  3.2687f, 2.1526f, 2.8652f, 1.5579f, 1.6382f, 1.1253f, 2.8251f, 1.9160f});
+                std_tensor.reshape_(stats_shape);
+                return {std::move(mean_tensor), std::move(std_tensor)};
             }
+            if (latents.shape()[channel_dim] == 48) {  // Wan2.2 VAE
+                stats_shape[static_cast<size_t>(channel_dim)] = 48;
+
+                auto mean_tensor = sd::Tensor<float>::from_vector({-0.2289f, -0.0052f, -0.1323f, -0.2339f, -0.2799f, 0.0174f, 0.1838f, 0.1557f,
+                                                                   -0.1382f, 0.0542f, 0.2813f, 0.0891f, 0.1570f, -0.0098f, 0.0375f, -0.1825f,
+                                                                   -0.2246f, -0.1207f, -0.0698f, 0.5109f, 0.2665f, -0.2108f, -0.2158f, 0.2502f,
+                                                                   -0.2055f, -0.0322f, 0.1109f, 0.1567f, -0.0729f, 0.0899f, -0.2799f, -0.1230f,
+                                                                   -0.0313f, -0.1649f, 0.0117f, 0.0723f, -0.2839f, -0.2083f, -0.0520f, 0.3748f,
+                                                                   0.0152f, 0.1957f, 0.1433f, -0.2944f, 0.3573f, -0.0548f, -0.1681f, -0.0667f});
+                mean_tensor.reshape_(stats_shape);
+                auto std_tensor = sd::Tensor<float>::from_vector({0.4765f, 1.0364f, 0.4514f, 1.1677f, 0.5313f, 0.4990f, 0.4818f, 0.5013f,
+                                                                  0.8158f, 1.0344f, 0.5894f, 1.0901f, 0.6885f, 0.6165f, 0.8454f, 0.4978f,
+                                                                  0.5759f, 0.3523f, 0.7135f, 0.6804f, 0.5833f, 1.4146f, 0.8986f, 0.5659f,
+                                                                  0.7069f, 0.5338f, 0.4889f, 0.4917f, 0.4069f, 0.4999f, 0.6866f, 0.4093f,
+                                                                  0.5709f, 0.6065f, 0.6415f, 0.4944f, 0.5726f, 1.2042f, 0.5458f, 1.6887f,
+                                                                  0.3971f, 1.0600f, 0.3943f, 0.5537f, 0.5444f, 0.4089f, 0.7468f, 0.7744f});
+                std_tensor.reshape_(stats_shape);
+                return {std::move(mean_tensor), std::move(std_tensor)};
+            }
+            GGML_ABORT("unexpected latent channel dimension %lld for version %d",
+                       (long long)latents.shape()[channel_dim],
+                       version);
         }
 
-        ggml_tensor* diffusion_to_vae_latents(ggml_context* work_ctx, ggml_tensor* latents) {
-            ggml_tensor* vae_latents = ggml_dup(work_ctx, latents);
-            int channel_dim          = sd_version_is_wan(version) ? 3 : 2;
-            std::vector<float> latents_mean_vec;
-            std::vector<float> latents_std_vec;
-            get_latents_mean_std_vec(latents, channel_dim, latents_mean_vec, latents_std_vec);
-
-            float mean;
-            float std_;
-            for (int i = 0; i < latents->ne[3]; i++) {
-                if (channel_dim == 3) {
-                    mean = latents_mean_vec[i];
-                    std_ = latents_std_vec[i];
-                }
-                for (int j = 0; j < latents->ne[2]; j++) {
-                    if (channel_dim == 2) {
-                        mean = latents_mean_vec[j];
-                        std_ = latents_std_vec[j];
-                    }
-                    for (int k = 0; k < latents->ne[1]; k++) {
-                        for (int l = 0; l < latents->ne[0]; l++) {
-                            float value = ggml_ext_tensor_get_f32(latents, l, k, j, i);
-                            value       = value * std_ / scale_factor + mean;
-                            ggml_ext_tensor_set_f32(vae_latents, value, l, k, j, i);
-                        }
-                    }
-                }
-            }
-
-            return vae_latents;
+        sd::Tensor<float> diffusion_to_vae_latents(const sd::Tensor<float>& latents) override {
+            auto [mean_tensor, std_tensor] = get_latents_mean_std(latents);
+            return (latents * std_tensor) / scale_factor + mean_tensor;
         }
 
-        ggml_tensor* vae_to_diffuison_latents(ggml_context* work_ctx, ggml_tensor* latents) {
-            ggml_tensor* diffusion_latents = ggml_dup(work_ctx, latents);
-            int channel_dim                = sd_version_is_wan(version) ? 3 : 2;
-            std::vector<float> latents_mean_vec;
-            std::vector<float> latents_std_vec;
-            get_latents_mean_std_vec(latents, channel_dim, latents_mean_vec, latents_std_vec);
-
-            float mean;
-            float std_;
-            for (int i = 0; i < latents->ne[3]; i++) {
-                if (channel_dim == 3) {
-                    mean = latents_mean_vec[i];
-                    std_ = latents_std_vec[i];
-                }
-                for (int j = 0; j < latents->ne[2]; j++) {
-                    if (channel_dim == 2) {
-                        mean = latents_mean_vec[j];
-                        std_ = latents_std_vec[j];
-                    }
-                    for (int k = 0; k < latents->ne[1]; k++) {
-                        for (int l = 0; l < latents->ne[0]; l++) {
-                            float value = ggml_ext_tensor_get_f32(latents, l, k, j, i);
-                            value       = (value - mean) * scale_factor / std_;
-                            ggml_ext_tensor_set_f32(diffusion_latents, value, l, k, j, i);
-                        }
-                    }
-                }
-            }
-            return diffusion_latents;
+        sd::Tensor<float> vae_to_diffusion_latents(const sd::Tensor<float>& latents) override {
+            auto [mean_tensor, std_tensor] = get_latents_mean_std(latents);
+            return ((latents - mean_tensor) * scale_factor) / std_tensor;
         }
 
         int get_encoder_output_channels(int input_channels) {
             return static_cast<int>(ae.z_dim);
         }
 
-        ggml_cgraph* build_graph(ggml_tensor* z, bool decode_graph) {
-            ggml_cgraph* gf = new_graph_custom(10240 * z->ne[2]);
-
-            z = to_backend(z);
+        ggml_cgraph* build_graph(const sd::Tensor<float>& z_tensor, bool decode_graph) {
+            ggml_cgraph* gf = new_graph_custom(10240 * z_tensor.shape()[2]);
+            ggml_tensor* z  = make_input(z_tensor);
 
             auto runner_ctx = get_context();
 
@@ -1240,7 +1201,7 @@ namespace WAN {
             return gf;
         }
 
-        ggml_cgraph* build_graph_partial(ggml_tensor* z, bool decode_graph, int i) {
+        ggml_cgraph* build_graph_partial(const sd::Tensor<float>& z_tensor, bool decode_graph, int i) {
             ggml_cgraph* gf = new_graph_custom(20480);
 
             ae.clear_cache();
@@ -1250,7 +1211,7 @@ namespace WAN {
                 ae._feat_map[feat_idx] = feat_cache;
             }
 
-            z = to_backend(z);
+            ggml_tensor* z = make_input(z_tensor);
 
             auto runner_ctx = get_context();
 
@@ -1269,58 +1230,57 @@ namespace WAN {
             return gf;
         }
 
-        bool _compute(const int n_threads,
-                      ggml_tensor* z,
-                      bool decode_graph,
-                      ggml_tensor** output,
-                      ggml_context* output_ctx = nullptr) override {
+        sd::Tensor<float> _compute(const int n_threads,
+                                   const sd::Tensor<float>& z,
+                                   bool decode_graph) override {
             if (true) {
+                sd::Tensor<float> input;
+                if (z.dim() == 4) {
+                    input = z.unsqueeze(2);
+                }
                 auto get_graph = [&]() -> ggml_cgraph* {
-                    return build_graph(z, decode_graph);
+                    if (input.empty()) {
+                        return build_graph(z, decode_graph);
+                    } else {
+                        return build_graph(input, decode_graph);
+                    }
                 };
-                return GGMLRunner::compute(get_graph, n_threads, true, output, output_ctx);
+                auto result = restore_trailing_singleton_dims(GGMLRunner::compute<float>(get_graph, n_threads, true),
+                                                              input.empty() ? z.dim() : input.dim());
+                if (!result.empty() && z.dim() == 4) {
+                    result.squeeze_(2);
+                }
+                return result;
             } else {  // chunk 1 result is weird
                 ae.clear_cache();
-                int64_t t      = z->ne[2];
+                int64_t t      = z.shape()[2];
                 int i          = 0;
                 auto get_graph = [&]() -> ggml_cgraph* {
                     return build_graph_partial(z, decode_graph, i);
                 };
-                ggml_tensor* out = nullptr;
-                bool res         = GGMLRunner::compute(get_graph, n_threads, true, &out, output_ctx);
+                auto out_opt = GGMLRunner::compute<float>(get_graph, n_threads, true);
+                if (!out_opt.has_value()) {
+                    return {};
+                }
+                sd::Tensor<float> out = std::move(*out_opt);
                 ae.clear_cache();
                 if (t == 1) {
-                    *output = out;
-                    return res;
+                    return out;
                 }
 
-                *output = ggml_new_tensor_4d(output_ctx, GGML_TYPE_F32, out->ne[0], out->ne[1], (t - 1) * 4 + 1, out->ne[3]);
-
-                auto copy_to_output = [&]() {
-                    for (int64_t i3 = 0; i3 < out->ne[3]; i3++) {
-                        for (int64_t i2 = 0; i2 < out->ne[2]; i2++) {
-                            for (int64_t i1 = 0; i1 < out->ne[1]; i1++) {
-                                for (int64_t i0 = 0; i0 < out->ne[0]; i0++) {
-                                    float value    = ggml_ext_tensor_get_f32(out, i0, i1, i2, i3);
-                                    int64_t offset = (i == 0) ? 0 : (1 + (i - 1) * 4);
-                                    ggml_ext_tensor_set_f32(*output, value, i0, i1, offset + i2, i3);
-                                }
-                            }
-                        }
-                    }
-                };
-
-                copy_to_output();
-
-                out = ggml_new_tensor_4d(output_ctx, GGML_TYPE_F32, out->ne[0], out->ne[1], 4, out->ne[3]);
+                sd::Tensor<float> output = std::move(out);
 
                 for (i = 1; i < t; i++) {
-                    res = res || GGMLRunner::compute(get_graph, n_threads, true, &out);
+                    auto chunk_opt = GGMLRunner::compute<float>(get_graph, n_threads, true);
+                    if (!chunk_opt.has_value()) {
+                        return {};
+                    }
+                    out = std::move(*chunk_opt);
                     ae.clear_cache();
-                    copy_to_output();
+                    output = sd::ops::concat(output, out, 2);
                 }
                 free_cache_ctx_and_buffer();
-                return res;
+                return output;
             }
         }
 
@@ -1330,25 +1290,25 @@ namespace WAN {
             params.mem_buffer = nullptr;
             params.no_alloc   = false;
 
-            ggml_context* work_ctx = ggml_init(params);
-            GGML_ASSERT(work_ctx != nullptr);
+            std::unique_ptr<ggml_context, decltype(&ggml_free)> ctx(ggml_init(params), &ggml_free);  // owning handle: ggml_free runs at scope exit, the tensor-based test no longer leaks the context
+            GGML_ASSERT(ctx != nullptr);
 
             if (true) {
                 // cpu f32, pass
                 // cpu f16, pass
                 // cuda f16, pass
                 // cuda f32, pass
-                auto z = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, 104, 60, 2, 16);
-                ggml_set_f32(z, 0.5f);
-                z = load_tensor_from_file(work_ctx, "wan_vae_z.bin");
-                print_ggml_tensor(z);
-                ggml_tensor* out = nullptr;
+                auto z = sd::load_tensor_from_file_as_tensor<float>("wan_vae_z.bin");
+                print_sd_tensor(z);
+                sd::Tensor<float> out;
 
-                int64_t t0 = ggml_time_ms();
-                _compute(8, z, true, &out, work_ctx);
-                int64_t t1 = ggml_time_ms();
+                int64_t t0   = ggml_time_ms();
+                auto out_opt = _compute(8, z, true);
+                int64_t t1   = ggml_time_ms();
 
-                print_ggml_tensor(out);
+                GGML_ASSERT(!out_opt.empty());
+                out = std::move(out_opt);
+                print_sd_tensor(out);
-                LOG_DEBUG("decode test done in %ldms", t1 - t0);
+                LOG_DEBUG("decode test done in %lldms", static_cast<long long>(t1 - t0));  // int64_t is not `long` on LLP64 targets; print via long long
             }
         };
@@ -2229,23 +2189,23 @@ namespace WAN {
             wan.get_param_tensors(tensors, prefix);
         }
 
-        ggml_cgraph* build_graph(ggml_tensor* x,
-                                 ggml_tensor* timesteps,
-                                 ggml_tensor* context,
-                                 ggml_tensor* clip_fea        = nullptr,
-                                 ggml_tensor* c_concat        = nullptr,
-                                 ggml_tensor* time_dim_concat = nullptr,
-                                 ggml_tensor* vace_context    = nullptr,
-                                 float vace_strength          = 1.f) {
+        ggml_cgraph* build_graph(const sd::Tensor<float>& x_tensor,
+                                 const sd::Tensor<float>& timesteps_tensor,
+                                 const sd::Tensor<float>& context_tensor         = {},
+                                 const sd::Tensor<float>& clip_fea_tensor        = {},
+                                 const sd::Tensor<float>& c_concat_tensor        = {},
+                                 const sd::Tensor<float>& time_dim_concat_tensor = {},
+                                 const sd::Tensor<float>& vace_context_tensor    = {},
+                                 float vace_strength                             = 1.f) {
             ggml_cgraph* gf = new_graph_custom(WAN_GRAPH_SIZE);
 
-            x               = to_backend(x);
-            timesteps       = to_backend(timesteps);
-            context         = to_backend(context);
-            clip_fea        = to_backend(clip_fea);
-            c_concat        = to_backend(c_concat);
-            time_dim_concat = to_backend(time_dim_concat);
-            vace_context    = to_backend(vace_context);
+            ggml_tensor* x               = make_input(x_tensor);
+            ggml_tensor* timesteps       = make_input(timesteps_tensor);
+            ggml_tensor* context         = make_optional_input(context_tensor);
+            ggml_tensor* clip_fea        = make_optional_input(clip_fea_tensor);
+            ggml_tensor* c_concat        = make_optional_input(c_concat_tensor);
+            ggml_tensor* time_dim_concat = make_optional_input(time_dim_concat_tensor);
+            ggml_tensor* vace_context    = make_optional_input(vace_context_tensor);
 
             pe_vec      = Rope::gen_wan_pe(static_cast<int>(x->ne[2]),
                                            static_cast<int>(x->ne[1]),
@@ -2285,22 +2245,20 @@ namespace WAN {
             return gf;
         }
 
-        bool compute(int n_threads,
-                     ggml_tensor* x,
-                     ggml_tensor* timesteps,
-                     ggml_tensor* context,
-                     ggml_tensor* clip_fea        = nullptr,
-                     ggml_tensor* c_concat        = nullptr,
-                     ggml_tensor* time_dim_concat = nullptr,
-                     ggml_tensor* vace_context    = nullptr,
-                     float vace_strength          = 1.f,
-                     ggml_tensor** output         = nullptr,
-                     ggml_context* output_ctx     = nullptr) {
+        sd::Tensor<float> compute(int n_threads,
+                                  const sd::Tensor<float>& x,
+                                  const sd::Tensor<float>& timesteps,
+                                  const sd::Tensor<float>& context         = {},
+                                  const sd::Tensor<float>& clip_fea        = {},
+                                  const sd::Tensor<float>& c_concat        = {},
+                                  const sd::Tensor<float>& time_dim_concat = {},
+                                  const sd::Tensor<float>& vace_context    = {},
+                                  float vace_strength                      = 1.f) {
             auto get_graph = [&]() -> ggml_cgraph* {
                 return build_graph(x, timesteps, context, clip_fea, c_concat, time_dim_concat, vace_context, vace_strength);
             };
 
-            return GGMLRunner::compute(get_graph, n_threads, false, output, output_ctx);
+            return restore_trailing_singleton_dims(GGMLRunner::compute<float>(get_graph, n_threads, false), x.dim());
         }
 
         void test() {
@@ -2309,36 +2267,38 @@ namespace WAN {
             params.mem_buffer = nullptr;
             params.no_alloc   = false;
 
-            ggml_context* work_ctx = ggml_init(params);
-            GGML_ASSERT(work_ctx != nullptr);
+            std::unique_ptr<ggml_context, decltype(&ggml_free)> ctx(ggml_init(params), &ggml_free);  // owning handle: ggml_free runs at scope exit, the tensor-based test no longer leaks the context
+            GGML_ASSERT(ctx != nullptr);
 
             {
                 // cpu f16: pass
                 // cuda f16: pass
                 // cpu q8_0: pass
-                // auto x = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, 104, 60, 1, 16);
+                // auto x = ggml_new_tensor_4d(ctx.get(), GGML_TYPE_F32, 104, 60, 1, 16);
                 // ggml_set_f32(x, 0.01f);
-                auto x = load_tensor_from_file(work_ctx, "wan_dit_x.bin");
-                print_ggml_tensor(x);
+                auto x = sd::load_tensor_from_file_as_tensor<float>("wan_dit_x.bin");
+                print_sd_tensor(x);
 
                 std::vector<float> timesteps_vec(3, 1000.f);
                 timesteps_vec[0] = 0.f;
-                auto timesteps   = vector_to_ggml_tensor(work_ctx, timesteps_vec);
+                auto timesteps   = sd::Tensor<float>::from_vector(timesteps_vec);
 
-                // auto context = ggml_new_tensor_3d(work_ctx, GGML_TYPE_F32, 4096, 512, 1);
+                // auto context = ggml_new_tensor_3d(ctx.get(), GGML_TYPE_F32, 4096, 512, 1);
                 // ggml_set_f32(context, 0.01f);
-                auto context = load_tensor_from_file(work_ctx, "wan_dit_context.bin");
-                print_ggml_tensor(context);
-                // auto clip_fea = load_tensor_from_file(work_ctx, "wan_dit_clip_fea.bin");
+                auto context = sd::load_tensor_from_file_as_tensor<float>("wan_dit_context.bin");
+                print_sd_tensor(context);
+                // auto clip_fea = load_tensor_from_file(ctx.get(), "wan_dit_clip_fea.bin");
                 // print_ggml_tensor(clip_fea);
 
-                ggml_tensor* out = nullptr;
+                sd::Tensor<float> out;
 
-                int64_t t0 = ggml_time_ms();
-                compute(8, x, timesteps, context, nullptr, nullptr, nullptr, nullptr, 1.f, &out, work_ctx);
-                int64_t t1 = ggml_time_ms();
+                int64_t t0   = ggml_time_ms();
+                auto out_opt = compute(8, x, timesteps, context, {}, {}, {}, {}, 1.f);
+                int64_t t1   = ggml_time_ms();
 
-                print_ggml_tensor(out);
+                GGML_ASSERT(!out_opt.empty());
+                out = std::move(out_opt);
+                print_sd_tensor(out);
                 LOG_DEBUG("wan test done in %lldms", t1 - t0);
             }
         }
diff --git a/src/z_image.hpp b/src/z_image.hpp
index 53a7cf8..363ce5f 100644
--- a/src/z_image.hpp
+++ b/src/z_image.hpp
@@ -481,20 +481,21 @@ namespace ZImage {
             z_image.get_param_tensors(tensors, prefix);
         }
 
-        ggml_cgraph* build_graph(ggml_tensor* x,
-                                 ggml_tensor* timesteps,
-                                 ggml_tensor* context,
-                                 std::vector<ggml_tensor*> ref_latents = {},
-                                 bool increase_ref_index               = false) {
+        ggml_cgraph* build_graph(const sd::Tensor<float>& x_tensor,
+                                 const sd::Tensor<float>& timesteps_tensor,
+                                 const sd::Tensor<float>& context_tensor,
+                                 const std::vector<sd::Tensor<float>>& ref_latents_tensor = {},
+                                 bool increase_ref_index                                  = false) {
+            ggml_cgraph* gf        = new_graph_custom(Z_IMAGE_GRAPH_SIZE);
+            ggml_tensor* x         = make_input(x_tensor);
+            ggml_tensor* timesteps = make_input(timesteps_tensor);
             GGML_ASSERT(x->ne[3] == 1);
-            ggml_cgraph* gf = new_graph_custom(Z_IMAGE_GRAPH_SIZE);
-
-            x         = to_backend(x);
-            context   = to_backend(context);
-            timesteps = to_backend(timesteps);
-
-            for (int i = 0; i < ref_latents.size(); i++) {
-                ref_latents[i] = to_backend(ref_latents[i]);
+            GGML_ASSERT(!context_tensor.empty());
+            ggml_tensor* context = make_input(context_tensor);
+            std::vector<ggml_tensor*> ref_latents;
+            ref_latents.reserve(ref_latents_tensor.size());
+            for (const auto& ref_latent_tensor : ref_latents_tensor) {
+                ref_latents.push_back(make_input(ref_latent_tensor));
             }
 
             pe_vec      = Rope::gen_z_image_pe(static_cast<int>(x->ne[1]),
@@ -530,14 +531,12 @@ namespace ZImage {
             return gf;
         }
 
-        bool compute(int n_threads,
-                     ggml_tensor* x,
-                     ggml_tensor* timesteps,
-                     ggml_tensor* context,
-                     std::vector<ggml_tensor*> ref_latents = {},
-                     bool increase_ref_index               = false,
-                     ggml_tensor** output                  = nullptr,
-                     ggml_context* output_ctx              = nullptr) {
+        sd::Tensor<float> compute(int n_threads,
+                                  const sd::Tensor<float>& x,
+                                  const sd::Tensor<float>& timesteps,
+                                  const sd::Tensor<float>& context,
+                                  const std::vector<sd::Tensor<float>>& ref_latents = {},
+                                  bool increase_ref_index                           = false) {
             // x: [N, in_channels, h, w]
             // timesteps: [N, ]
             // context: [N, max_position, hidden_size]
@@ -545,7 +544,7 @@ namespace ZImage {
                 return build_graph(x, timesteps, context, ref_latents, increase_ref_index);
             };
 
-            return GGMLRunner::compute(get_graph, n_threads, false, output, output_ctx);
+            return restore_trailing_singleton_dims(GGMLRunner::compute<float>(get_graph, n_threads, false), x.dim());
         }
 
         void test() {
@@ -554,30 +553,37 @@ namespace ZImage {
             params.mem_buffer = nullptr;
             params.no_alloc   = false;
 
-            ggml_context* work_ctx = ggml_init(params);
-            GGML_ASSERT(work_ctx != nullptr);
+            std::unique_ptr<ggml_context, decltype(&ggml_free)> ctx(ggml_init(params), &ggml_free);  // owning handle: ggml_free runs at scope exit, the tensor-based test no longer leaks the context
+            GGML_ASSERT(ctx != nullptr);
 
             {
-                // auto x = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, 16, 16, 16, 1);
+                // auto x = ggml_new_tensor_4d(ctx.get(), GGML_TYPE_F32, 16, 16, 16, 1);
                 // ggml_set_f32(x, 0.01f);
-                auto x = load_tensor_from_file(work_ctx, "./z_image_x.bin");
-                print_ggml_tensor(x);
+                auto x = sd::load_tensor_from_file_as_tensor<float>("./z_image_x.bin");
+                print_sd_tensor(x);
 
                 std::vector<float> timesteps_vec(1, 0.f);
-                auto timesteps = vector_to_ggml_tensor(work_ctx, timesteps_vec);
+                auto timesteps = sd::Tensor<float>::from_vector(timesteps_vec);
 
-                // auto context = ggml_new_tensor_3d(work_ctx, GGML_TYPE_F32, 2560, 256, 1);
+                // auto context = ggml_new_tensor_3d(ctx.get(), GGML_TYPE_F32, 2560, 256, 1);
                 // ggml_set_f32(context, 0.01f);
-                auto context = load_tensor_from_file(work_ctx, "./z_image_context.bin");
-                print_ggml_tensor(context);
+                auto context = sd::load_tensor_from_file_as_tensor<float>("./z_image_context.bin");
+                print_sd_tensor(context);
 
-                ggml_tensor* out = nullptr;
+                sd::Tensor<float> out;
 
-                int64_t t0 = ggml_time_ms();
-                compute(8, x, timesteps, context, {}, false, &out, work_ctx);
-                int64_t t1 = ggml_time_ms();
+                int64_t t0   = ggml_time_ms();
+                auto out_opt = compute(8,
+                                       x,
+                                       timesteps,
+                                       context,
+                                       {},
+                                       false);
+                int64_t t1   = ggml_time_ms();
 
-                print_ggml_tensor(out);
+                GGML_ASSERT(!out_opt.empty());
+                out = std::move(out_opt);
+                print_sd_tensor(out);
                 LOG_DEBUG("z_image test done in %lldms", t1 - t0);
             }
         }