diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml
index 666887d9..1fbcbf94 100644
--- a/.github/workflows/build.yml
+++ b/.github/workflows/build.yml
@@ -21,11 +21,13 @@ on:
"**/*.c",
"**/*.cpp",
"**/*.cu",
+ "examples/server/frontend/**",
]
pull_request:
types: [opened, synchronize, reopened]
paths:
[
+ ".github/workflows/**",
"**/CMakeLists.txt",
"**/Makefile",
"**/*.h",
@@ -33,6 +35,7 @@ on:
"**/*.c",
"**/*.cpp",
"**/*.cu",
+ "examples/server/frontend/**",
]
env:
@@ -53,6 +56,16 @@ jobs:
with:
submodules: recursive
+ - name: Setup Node
+ uses: actions/setup-node@v4
+ with:
+ node-version: 22
+
+ - name: Setup pnpm
+ uses: pnpm/action-setup@v4
+ with:
+ version: 10
+
- name: Dependencies
id: depends
run: |
@@ -70,7 +83,7 @@ jobs:
- name: Get commit hash
id: commit
if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
- uses: pr-mpt/actions-commit-hash@v2
+ uses: prompt/actions-commit-hash@v2
- name: Fetch system info
id: system-info
@@ -106,6 +119,16 @@ jobs:
with:
submodules: recursive
+ - name: Setup Node
+ uses: actions/setup-node@v4
+ with:
+ node-version: 22
+
+ - name: Setup pnpm
+ uses: pnpm/action-setup@v4
+ with:
+ version: 10
+
- name: Dependencies
id: depends
run: |
@@ -123,7 +146,7 @@ jobs:
- name: Get commit hash
id: commit
if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
- uses: pr-mpt/actions-commit-hash@v2
+ uses: prompt/actions-commit-hash@v2
- name: Fetch system info
id: system-info
@@ -162,7 +185,7 @@ jobs:
strategy:
matrix:
- variant: [musa, sycl, vulkan]
+ variant: [musa, sycl, vulkan, cuda]
env:
REGISTRY: ghcr.io
@@ -174,10 +197,20 @@ jobs:
with:
submodules: recursive
+ - name: Setup Node
+ uses: actions/setup-node@v4
+ with:
+ node-version: 22
+
+ - name: Setup pnpm
+ uses: pnpm/action-setup@v4
+ with:
+ version: 10
+
- name: Get commit hash
id: commit
if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
- uses: pr-mpt/actions-commit-hash@v2
+ uses: prompt/actions-commit-hash@v2
- name: Set up Docker Buildx
uses: docker/setup-buildx-action@v3
@@ -223,6 +256,16 @@ jobs:
with:
submodules: recursive
+ - name: Setup Node
+ uses: actions/setup-node@v4
+ with:
+ node-version: 22
+
+ - name: Setup pnpm
+ uses: pnpm/action-setup@v4
+ with:
+ version: 10
+
- name: Dependencies
id: depends
run: |
@@ -240,7 +283,7 @@ jobs:
- name: Get commit hash
id: commit
if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
- uses: pr-mpt/actions-commit-hash@v2
+ uses: prompt/actions-commit-hash@v2
- name: Fetch system info
id: system-info
@@ -294,6 +337,16 @@ jobs:
with:
submodules: recursive
+ - name: Setup Node
+ uses: actions/setup-node@v4
+ with:
+ node-version: 22
+
+ - name: Setup pnpm
+ uses: pnpm/action-setup@v4
+ with:
+ version: 10
+
- name: Install cuda-toolkit
id: cuda-toolkit
if: ${{ matrix.build == 'cuda12' }}
@@ -340,7 +393,7 @@ jobs:
- name: Get commit hash
id: commit
if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
- uses: pr-mpt/actions-commit-hash@v2
+ uses: prompt/actions-commit-hash@v2
- name: Pack artifacts
id: pack_artifacts
@@ -399,6 +452,16 @@ jobs:
with:
submodules: recursive
+ - name: Setup Node
+ uses: actions/setup-node@v4
+ with:
+ node-version: 22
+
+ - name: Setup pnpm
+ uses: pnpm/action-setup@v4
+ with:
+ version: 10
+
- name: Cache ROCm Installation
id: cache-rocm
uses: actions/cache@v4
@@ -463,7 +526,7 @@ jobs:
- name: Get commit hash
id: commit
if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
- uses: pr-mpt/actions-commit-hash@v2
+ uses: prompt/actions-commit-hash@v2
- name: Pack artifacts
if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
@@ -502,6 +565,16 @@ jobs:
with:
submodules: recursive
+ - name: Setup Node
+ uses: actions/setup-node@v4
+ with:
+ node-version: 20
+
+ - name: Setup pnpm
+ uses: pnpm/action-setup@v4
+ with:
+ version: 9
+
- name: Free disk space
run: |
# Remove preinstalled SDKs and caches not needed for this job
@@ -581,7 +654,7 @@ jobs:
- name: Get commit hash
id: commit
if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
- uses: pr-mpt/actions-commit-hash@v2
+ uses: prompt/actions-commit-hash@v2
- name: Prepare artifacts
id: prepare_artifacts
@@ -660,7 +733,7 @@ jobs:
- name: Get commit hash
id: commit
- uses: pr-mpt/actions-commit-hash@v2
+ uses: prompt/actions-commit-hash@v2
- name: Create release
id: create_release
diff --git a/.gitmodules b/.gitmodules
index 5a785197..5d66c879 100644
--- a/.gitmodules
+++ b/.gitmodules
@@ -1,3 +1,6 @@
[submodule "ggml"]
path = ggml
url = https://github.com/ggml-org/ggml.git
+[submodule "examples/server/frontend"]
+ path = examples/server/frontend
+ url = https://github.com/leejet/stable-ui.git
diff --git a/CMakeLists.txt b/CMakeLists.txt
index b90086ea..bad1ba4c 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -36,7 +36,6 @@ option(SD_VULKAN "sd: vulkan backend" OFF)
option(SD_OPENCL "sd: opencl backend" OFF)
option(SD_SYCL "sd: sycl backend" OFF)
option(SD_MUSA "sd: musa backend" OFF)
-option(SD_FAST_SOFTMAX "sd: x1.5 faster softmax, indeterministic (sometimes, same seed don't generate same image), cuda only" OFF)
option(SD_BUILD_SHARED_LIBS "sd: build shared libs" OFF)
option(SD_BUILD_SHARED_GGML_LIB "sd: build ggml as a separate shared lib" OFF)
option(SD_USE_SYSTEM_GGML "sd: use system-installed GGML library" OFF)
@@ -70,18 +69,12 @@ if (SD_HIPBLAS)
message("-- Use HIPBLAS as backend stable-diffusion")
set(GGML_HIP ON)
add_definitions(-DSD_USE_CUDA)
- if(SD_FAST_SOFTMAX)
- set(GGML_CUDA_FAST_SOFTMAX ON)
- endif()
endif ()
if(SD_MUSA)
message("-- Use MUSA as backend stable-diffusion")
set(GGML_MUSA ON)
add_definitions(-DSD_USE_CUDA)
- if(SD_FAST_SOFTMAX)
- set(GGML_CUDA_FAST_SOFTMAX ON)
- endif()
endif()
set(SD_LIB stable-diffusion)
diff --git a/Dockerfile.cuda b/Dockerfile.cuda
new file mode 100644
index 00000000..4deb7247
--- /dev/null
+++ b/Dockerfile.cuda
@@ -0,0 +1,25 @@
+ARG CUDA_VERSION=12.6.3
+ARG UBUNTU_VERSION=24.04
+
+FROM nvidia/cuda:${CUDA_VERSION}-cudnn-devel-ubuntu${UBUNTU_VERSION} AS build
+
+RUN apt-get update && apt-get install -y --no-install-recommends build-essential git ccache cmake
+
+WORKDIR /sd.cpp
+
+COPY . .
+
+ARG CUDACXX=/usr/local/cuda/bin/nvcc
+RUN cmake . -B ./build -DSD_CUDA=ON
+RUN cmake --build ./build --config Release -j$(nproc)
+
+FROM nvidia/cuda:${CUDA_VERSION}-cudnn-runtime-ubuntu${UBUNTU_VERSION} AS runtime
+
+RUN apt-get update && \
+ apt-get install --yes --no-install-recommends libgomp1 && \
+ apt-get clean
+
+COPY --from=build /sd.cpp/build/bin/sd-cli /sd-cli
+COPY --from=build /sd.cpp/build/bin/sd-server /sd-server
+
+ENTRYPOINT [ "/sd-cli" ]
diff --git a/docs/anima.md b/docs/anima.md
index 9c941785..debc370b 100644
--- a/docs/anima.md
+++ b/docs/anima.md
@@ -5,6 +5,7 @@
- Download Anima
- safetensors: https://huggingface.co/circlestone-labs/Anima/tree/main/split_files/diffusion_models
- gguf: https://huggingface.co/Bedovyy/Anima-GGUF/tree/main
+ - gguf Anima2: https://huggingface.co/JusteLeo/Anima2-GGUF/tree/main
- Download vae
- safetensors: https://huggingface.co/circlestone-labs/Anima/tree/main/split_files/vae
- Download Qwen3-0.6B-Base
@@ -17,4 +18,4 @@
.\bin\Release\sd-cli.exe --diffusion-model ..\..\ComfyUI\models\diffusion_models\anima-preview.safetensors --vae ..\..\ComfyUI\models\vae\qwen_image_vae.safetensors --llm ..\..\ComfyUI\models\text_encoders\qwen_3_06b_base.safetensors -p "a lovely cat holding a sign says 'anima.cpp'" --cfg-scale 6.0 --sampling-method euler -v --offload-to-cpu --diffusion-fa
```
-
\ No newline at end of file
+
diff --git a/docs/caching.md b/docs/caching.md
index 7b4be3ce..b02a541b 100644
--- a/docs/caching.md
+++ b/docs/caching.md
@@ -11,6 +11,7 @@ Caching methods accelerate diffusion inference by reusing intermediate computati
| `dbcache` | DiT models | Block-level L1 residual threshold |
| `taylorseer` | DiT models | Taylor series approximation |
| `cache-dit` | DiT models | Combined DBCache + TaylorSeer |
+| `spectrum` | UNET and DiT models | Chebyshev + Taylor output forecasting |
### UCache (UNET Models)
@@ -79,7 +80,7 @@ Uses Taylor series approximation to predict block outputs:
Combines DBCache and TaylorSeer:
```bash
---cache-mode cache-dit --cache-preset fast
+--cache-mode cache-dit
```
#### Parameters
@@ -91,14 +92,6 @@ Combines DBCache and TaylorSeer:
| `threshold` | L1 residual difference threshold | 0.08 |
| `warmup` | Steps before caching starts | 8 |
-#### Presets
-
-Available presets: `slow`, `medium`, `fast`, `ultra` (or `s`, `m`, `f`, `u`).
-
-```bash
---cache-mode cache-dit --cache-preset fast
-```
-
#### SCM Options
Steps Computation Mask controls which steps can be cached:
@@ -118,6 +111,28 @@ Mask values: `1` = compute, `0` = can cache.
--scm-policy dynamic
```
+### Spectrum (UNET and DiT Models)
+
+Spectrum uses Chebyshev polynomial fitting blended with Taylor extrapolation to predict denoised outputs, skipping entire forward passes. Based on the paper [Spectrum: Adaptive Spectral Feature Forecasting for Efficient Diffusion Sampling](https://github.com/tingyu215/Spectrum).
+
+```bash
+sd-cli -m model.safetensors -p "a cat" --cache-mode spectrum
+```
+
+#### Parameters
+
+| Parameter | Description | Default |
+|-----------|-------------|---------|
+| `w` | Chebyshev vs Taylor blend weight (0=Taylor, 1=Chebyshev) | 0.40 |
+| `m` | Chebyshev polynomial degree | 3 |
+| `lam` | Ridge regression regularization | 1.0 |
+| `window` | Initial window size (compute every N steps) | 2 |
+| `flex` | Window growth per computed step after warmup | 0.50 |
+| `warmup` | Steps to always compute before caching starts | 4 |
+| `stop` | Stop caching at this fraction of total steps | 0.9 |
+
+
+
### Performance Tips
- Start with default thresholds and adjust based on output quality
diff --git a/examples/cli/README.md b/examples/cli/README.md
index 564e5ce0..904f3c44 100644
--- a/examples/cli/README.md
+++ b/examples/cli/README.md
@@ -138,11 +138,12 @@ Generation Options:
--skip-layers layers to skip for SLG steps (default: [7,8,9])
--high-noise-skip-layers (high noise) layers to skip for SLG steps (default: [7,8,9])
-r, --ref-image reference image for Flux Kontext models (can be used multiple times)
- --cache-mode caching method: 'easycache' (DiT), 'ucache' (UNET), 'dbcache'/'taylorseer'/'cache-dit' (DiT block-level)
+ --cache-mode caching method: 'easycache' (DiT), 'ucache' (UNET), 'dbcache'/'taylorseer'/'cache-dit' (DiT block-level),
+ 'spectrum' (UNET/DiT Chebyshev+Taylor forecasting)
--cache-option named cache params (key=value format, comma-separated). easycache/ucache:
- threshold=,start=,end=,decay=,relative=,reset=; dbcache/taylorseer/cache-dit: Fn=,Bn=,threshold=,warmup=. Examples:
- "threshold=0.25" or "threshold=1.5,reset=0"
- --cache-preset cache-dit preset: 'slow'/'s', 'medium'/'m', 'fast'/'f', 'ultra'/'u'
+ threshold=,start=,end=,decay=,relative=,reset=; dbcache/taylorseer/cache-dit: Fn=,Bn=,threshold=,warmup=;
+ spectrum: w=,m=,lam=,window=,flex=,warmup=,stop=. Examples:
+ "threshold=0.25" or "threshold=1.5,reset=0" or "w=0.4,window=2"
--scm-mask SCM steps mask for cache-dit: comma-separated 0/1 (e.g., "1,1,1,0,0,1,0,0,1,0") - 1=compute, 0=can cache
--scm-policy SCM policy: 'dynamic' (default) or 'static'
```
diff --git a/examples/cli/main.cpp b/examples/cli/main.cpp
index f9e4928e..ddb88c97 100644
--- a/examples/cli/main.cpp
+++ b/examples/cli/main.cpp
@@ -601,7 +601,7 @@ int main(int argc, const char* argv[]) {
if (gen_params.end_image_path.size() > 0) {
vae_decode_only = false;
- if (!load_image_and_update_size(gen_params.init_image_path, end_image)) {
+ if (!load_image_and_update_size(gen_params.end_image_path, end_image)) {
return 1;
}
}
diff --git a/examples/common/common.hpp b/examples/common/common.hpp
index 369c1f07..9389b03a 100644
--- a/examples/common/common.hpp
+++ b/examples/common/common.hpp
@@ -1047,7 +1047,6 @@ struct SDGenerationParams {
std::string cache_mode;
std::string cache_option;
- std::string cache_preset;
std::string scm_mask;
bool scm_policy_dynamic = true;
sd_cache_params_t cache_params{};
@@ -1422,8 +1421,8 @@ struct SDGenerationParams {
}
cache_mode = argv_to_utf8(index, argv);
if (cache_mode != "easycache" && cache_mode != "ucache" &&
- cache_mode != "dbcache" && cache_mode != "taylorseer" && cache_mode != "cache-dit") {
- fprintf(stderr, "error: invalid cache mode '%s', must be 'easycache', 'ucache', 'dbcache', 'taylorseer', or 'cache-dit'\n", cache_mode.c_str());
+ cache_mode != "dbcache" && cache_mode != "taylorseer" && cache_mode != "cache-dit" && cache_mode != "spectrum") {
+ fprintf(stderr, "error: invalid cache mode '%s', must be 'easycache', 'ucache', 'dbcache', 'taylorseer', 'cache-dit', or 'spectrum'\n", cache_mode.c_str());
return -1;
}
return 1;
@@ -1461,21 +1460,6 @@ struct SDGenerationParams {
return 1;
};
- auto on_cache_preset_arg = [&](int argc, const char** argv, int index) {
- if (++index >= argc) {
- return -1;
- }
- cache_preset = argv_to_utf8(index, argv);
- if (cache_preset != "slow" && cache_preset != "s" && cache_preset != "S" &&
- cache_preset != "medium" && cache_preset != "m" && cache_preset != "M" &&
- cache_preset != "fast" && cache_preset != "f" && cache_preset != "F" &&
- cache_preset != "ultra" && cache_preset != "u" && cache_preset != "U") {
- fprintf(stderr, "error: invalid cache preset '%s', must be 'slow'/'s', 'medium'/'m', 'fast'/'f', or 'ultra'/'u'\n", cache_preset.c_str());
- return -1;
- }
- return 1;
- };
-
options.manual_options = {
{"-s",
"--seed",
@@ -1513,16 +1497,12 @@ struct SDGenerationParams {
on_ref_image_arg},
{"",
"--cache-mode",
- "caching method: 'easycache' (DiT), 'ucache' (UNET), 'dbcache'/'taylorseer'/'cache-dit' (DiT block-level)",
+ "caching method: 'easycache' (DiT), 'ucache' (UNET), 'dbcache'/'taylorseer'/'cache-dit' (DiT block-level), 'spectrum' (UNET/DiT Chebyshev+Taylor forecasting)",
on_cache_mode_arg},
{"",
"--cache-option",
- "named cache params (key=value format, comma-separated). easycache/ucache: threshold=,start=,end=,decay=,relative=,reset=; dbcache/taylorseer/cache-dit: Fn=,Bn=,threshold=,warmup=. Examples: \"threshold=0.25\" or \"threshold=1.5,reset=0\"",
+ "named cache params (key=value format, comma-separated). easycache/ucache: threshold=,start=,end=,decay=,relative=,reset=; dbcache/taylorseer/cache-dit: Fn=,Bn=,threshold=,warmup=; spectrum: w=,m=,lam=,window=,flex=,warmup=,stop=. Examples: \"threshold=0.25\" or \"threshold=1.5,reset=0\"",
on_cache_option_arg},
- {"",
- "--cache-preset",
- "cache-dit preset: 'slow'/'s', 'medium'/'m', 'fast'/'f', 'ultra'/'u'",
- on_cache_preset_arg},
{"",
"--scm-mask",
"SCM steps mask for cache-dit: comma-separated 0/1 (e.g., \"1,1,1,0,0,1,0,0,1,0\") - 1=compute, 0=can cache",
@@ -1575,7 +1555,6 @@ struct SDGenerationParams {
load_if_exists("negative_prompt", negative_prompt);
load_if_exists("cache_mode", cache_mode);
load_if_exists("cache_option", cache_option);
- load_if_exists("cache_preset", cache_preset);
load_if_exists("scm_mask", scm_mask);
load_if_exists("clip_skip", clip_skip);
@@ -1779,7 +1758,23 @@ struct SDGenerationParams {
} else if (key == "Bn" || key == "bn") {
cache_params.Bn_compute_blocks = std::stoi(val);
} else if (key == "warmup") {
- cache_params.max_warmup_steps = std::stoi(val);
+ if (cache_mode == "spectrum") {
+ cache_params.spectrum_warmup_steps = std::stoi(val);
+ } else {
+ cache_params.max_warmup_steps = std::stoi(val);
+ }
+ } else if (key == "w") {
+ cache_params.spectrum_w = std::stof(val);
+ } else if (key == "m") {
+ cache_params.spectrum_m = std::stoi(val);
+ } else if (key == "lam") {
+ cache_params.spectrum_lam = std::stof(val);
+ } else if (key == "window") {
+ cache_params.spectrum_window_size = std::stoi(val);
+ } else if (key == "flex") {
+ cache_params.spectrum_flex_window = std::stof(val);
+ } else if (key == "stop") {
+ cache_params.spectrum_stop_percent = std::stof(val);
} else {
LOG_ERROR("error: unknown cache parameter '%s'", key.c_str());
return false;
@@ -1794,39 +1789,17 @@ struct SDGenerationParams {
if (!cache_mode.empty()) {
if (cache_mode == "easycache") {
- cache_params.mode = SD_CACHE_EASYCACHE;
- cache_params.reuse_threshold = 0.2f;
- cache_params.start_percent = 0.15f;
- cache_params.end_percent = 0.95f;
- cache_params.error_decay_rate = 1.0f;
- cache_params.use_relative_threshold = true;
- cache_params.reset_error_on_compute = true;
+ cache_params.mode = SD_CACHE_EASYCACHE;
} else if (cache_mode == "ucache") {
- cache_params.mode = SD_CACHE_UCACHE;
- cache_params.reuse_threshold = 1.0f;
- cache_params.start_percent = 0.15f;
- cache_params.end_percent = 0.95f;
- cache_params.error_decay_rate = 1.0f;
- cache_params.use_relative_threshold = true;
- cache_params.reset_error_on_compute = true;
+ cache_params.mode = SD_CACHE_UCACHE;
} else if (cache_mode == "dbcache") {
- cache_params.mode = SD_CACHE_DBCACHE;
- cache_params.Fn_compute_blocks = 8;
- cache_params.Bn_compute_blocks = 0;
- cache_params.residual_diff_threshold = 0.08f;
- cache_params.max_warmup_steps = 8;
+ cache_params.mode = SD_CACHE_DBCACHE;
} else if (cache_mode == "taylorseer") {
- cache_params.mode = SD_CACHE_TAYLORSEER;
- cache_params.Fn_compute_blocks = 8;
- cache_params.Bn_compute_blocks = 0;
- cache_params.residual_diff_threshold = 0.08f;
- cache_params.max_warmup_steps = 8;
+ cache_params.mode = SD_CACHE_TAYLORSEER;
} else if (cache_mode == "cache-dit") {
- cache_params.mode = SD_CACHE_CACHE_DIT;
- cache_params.Fn_compute_blocks = 8;
- cache_params.Bn_compute_blocks = 0;
- cache_params.residual_diff_threshold = 0.08f;
- cache_params.max_warmup_steps = 8;
+ cache_params.mode = SD_CACHE_CACHE_DIT;
+ } else if (cache_mode == "spectrum") {
+ cache_params.mode = SD_CACHE_SPECTRUM;
}
if (!cache_option.empty()) {
diff --git a/examples/server/CMakeLists.txt b/examples/server/CMakeLists.txt
index d1912608..8f5beba8 100644
--- a/examples/server/CMakeLists.txt
+++ b/examples/server/CMakeLists.txt
@@ -1,6 +1,73 @@
set(TARGET sd-server)
+option(SD_SERVER_BUILD_FRONTEND "Build server frontend with pnpm" ON)
+
+set(FRONTEND_DIR "${CMAKE_CURRENT_SOURCE_DIR}/frontend")
+set(GENERATED_HTML_HEADER "${FRONTEND_DIR}/dist/gen_index_html.h")
+
+set(HAVE_FRONTEND_BUILD OFF)
+
+if(SD_SERVER_BUILD_FRONTEND AND EXISTS "${FRONTEND_DIR}")
+ if(WIN32)
+ find_program(PNPM_EXECUTABLE NAMES pnpm.cmd pnpm)
+ else()
+ find_program(PNPM_EXECUTABLE NAMES pnpm)
+ endif()
+
+ if(PNPM_EXECUTABLE)
+ message(STATUS "Frontend dir found: ${FRONTEND_DIR}")
+ message(STATUS "pnpm found: ${PNPM_EXECUTABLE}")
+
+ set(HAVE_FRONTEND_BUILD ON)
+
+ add_custom_target(${TARGET}_frontend_install
+ COMMAND "${PNPM_EXECUTABLE}" -C "${FRONTEND_DIR}" install
+ WORKING_DIRECTORY "${FRONTEND_DIR}"
+ COMMENT "Installing frontend dependencies"
+ VERBATIM
+ )
+
+ add_custom_target(${TARGET}_frontend_build
+ COMMAND "${PNPM_EXECUTABLE}" -C "${FRONTEND_DIR}" run build
+ WORKING_DIRECTORY "${FRONTEND_DIR}"
+ COMMENT "Building frontend"
+ VERBATIM
+ )
+
+ add_custom_target(${TARGET}_frontend_header
+ COMMAND "${PNPM_EXECUTABLE}" -C "${FRONTEND_DIR}" run build:header
+ WORKING_DIRECTORY "${FRONTEND_DIR}"
+ COMMENT "Generating gen_index_html.h"
+ VERBATIM
+ )
+
+ add_dependencies(${TARGET}_frontend_build ${TARGET}_frontend_install)
+ add_dependencies(${TARGET}_frontend_header ${TARGET}_frontend_build)
+
+ add_custom_target(${TARGET}_frontend
+ DEPENDS ${TARGET}_frontend_header
+ )
+
+ set_source_files_properties("${GENERATED_HTML_HEADER}" PROPERTIES GENERATED TRUE)
+ else()
+ message(WARNING "pnpm not found, frontend build disabled")
+ endif()
+else()
+ message(STATUS "Frontend disabled or directory not found: ${FRONTEND_DIR}")
+endif()
+
add_executable(${TARGET} main.cpp)
+
+if(HAVE_FRONTEND_BUILD)
+ add_dependencies(${TARGET} ${TARGET}_frontend)
+ target_sources(${TARGET} PRIVATE "${GENERATED_HTML_HEADER}")
+ target_include_directories(${TARGET} PRIVATE "${FRONTEND_DIR}/dist")
+ target_compile_definitions(${TARGET} PRIVATE HAVE_INDEX_HTML)
+ message(STATUS "HAVE_INDEX_HTML enabled")
+else()
+ message(STATUS "HAVE_INDEX_HTML disabled")
+endif()
+
install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE stable-diffusion ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PUBLIC c_std_11 cxx_std_17)
\ No newline at end of file
diff --git a/examples/server/README.md b/examples/server/README.md
index 75544364..8aa2158f 100644
--- a/examples/server/README.md
+++ b/examples/server/README.md
@@ -1,3 +1,92 @@
+# Frontend
+
+## Build with Frontend
+
+The server can optionally build the web frontend and embed it into the binary as `gen_index_html.h`.
+
+### Requirements
+
+Install the following tools:
+
+* **Node.js** ≥ 22.18
+ https://nodejs.org/
+
+* **pnpm** ≥ 10
+ Install via npm:
+
+```bash
+npm install -g pnpm
+```
+
+Verify installation:
+
+```bash
+node -v
+pnpm -v
+```
+
+### Install frontend dependencies
+
+Go to the frontend directory and install dependencies:
+
+```bash
+cd examples/server/frontend
+pnpm install
+```
+
+### Build the server with CMake
+
+The frontend build is enabled by default (`SD_SERVER_BUILD_FRONTEND=ON`). Configure and build with CMake:
+
+```bash
+cmake -B build -DSD_SERVER_BUILD_FRONTEND=ON
+cmake --build build --config Release
+```
+
+If `pnpm` is available, the build system will automatically run:
+
+```
+pnpm run build
+pnpm run build:header
+```
+
+and embed the generated frontend into the server binary.
+
+## Frontend Repository
+
+The web frontend is maintained in a **separate repository**, https://github.com/leejet/stable-ui.
+
+If you want to modify the UI or frontend logic, please submit pull requests to the **frontend repository**.
+
+This repository (`stable-diffusion.cpp`) only vendors the frontend periodically. Changes from the frontend repo are synchronized:
+
+* approximately **every 1–2 weeks**, or
+* when there are **major frontend updates**
+
+Because of this, frontend changes will **not appear here immediately** after being merged upstream.
+
+## Using an external frontend
+
+By default, the server uses the **embedded frontend** generated during the build (`gen_index_html.h`).
+
+You can also serve a custom frontend file instead of the embedded one by using:
+
+```bash
+--serve-html-path
+```
+
+For example:
+
+```bash
+sd-server --serve-html-path ./index.html
+```
+
+In this case, the server will load and serve the specified `index.html` file instead of the embedded frontend. This is useful when:
+
+* developing or testing frontend changes
+* using a custom UI
+* avoiding rebuilding the binary after frontend modifications
+
# Run
```
@@ -129,11 +218,10 @@ Default Generation Options:
--skip-layers layers to skip for SLG steps (default: [7,8,9])
--high-noise-skip-layers (high noise) layers to skip for SLG steps (default: [7,8,9])
-r, --ref-image reference image for Flux Kontext models (can be used multiple times)
- --cache-mode caching method: 'easycache' (DiT), 'ucache' (UNET), 'dbcache'/'taylorseer'/'cache-dit' (DiT block-level)
+ --cache-mode caching method: 'easycache' (DiT), 'ucache' (UNET), 'dbcache'/'taylorseer'/'cache-dit' (DiT block-level), 'spectrum' (UNET/DiT Chebyshev+Taylor forecasting)
--cache-option named cache params (key=value format, comma-separated). easycache/ucache:
threshold=,start=,end=,decay=,relative=,reset=; dbcache/taylorseer/cache-dit: Fn=,Bn=,threshold=,warmup=. Examples:
"threshold=0.25" or "threshold=1.5,reset=0"
- --cache-preset cache-dit preset: 'slow'/'s', 'medium'/'m', 'fast'/'f', 'ultra'/'u'
--scm-mask SCM steps mask for cache-dit: comma-separated 0/1 (e.g., "1,1,1,0,0,1,0,0,1,0") - 1=compute, 0=can cache
--scm-policy SCM policy: 'dynamic' (default) or 'static'
```
diff --git a/examples/server/frontend b/examples/server/frontend
new file mode 160000
index 00000000..1a34176c
--- /dev/null
+++ b/examples/server/frontend
@@ -0,0 +1 @@
+Subproject commit 1a34176cd6d39ad3a226b2b69047e71f6797f6bc
diff --git a/examples/server/main.cpp b/examples/server/main.cpp
index cc9e66cc..6e4340a6 100644
--- a/examples/server/main.cpp
+++ b/examples/server/main.cpp
@@ -13,6 +13,10 @@
#include "common/common.hpp"
+#ifdef HAVE_INDEX_HTML
+#include "frontend/dist/gen_index_html.h"
+#endif
+
namespace fs = std::filesystem;
// ----------------------- helpers -----------------------
@@ -380,7 +384,13 @@ int main(int argc, const char** argv) {
return httplib::Server::HandlerResponse::Unhandled;
});
- // root
+ // index html
+ std::string index_html;
+#ifdef HAVE_INDEX_HTML
+ index_html.assign(reinterpret_cast<const char*>(index_html_bytes), index_html_size);
+#else
+ index_html = "Stable Diffusion Server is running";
+#endif
svr.Get("/", [&](const httplib::Request&, httplib::Response& res) {
if (!svr_params.serve_html_path.empty()) {
std::ifstream file(svr_params.serve_html_path);
@@ -392,7 +402,7 @@ int main(int argc, const char** argv) {
res.set_content("Error: Unable to read HTML file", "text/plain");
}
} else {
- res.set_content("Stable Diffusion Server is running", "text/plain");
+ res.set_content(index_html, "text/html");
}
});
diff --git a/include/stable-diffusion.h b/include/stable-diffusion.h
index 51b2b329..029c2ab1 100644
--- a/include/stable-diffusion.h
+++ b/include/stable-diffusion.h
@@ -251,6 +251,7 @@ enum sd_cache_mode_t {
SD_CACHE_DBCACHE,
SD_CACHE_TAYLORSEER,
SD_CACHE_CACHE_DIT,
+ SD_CACHE_SPECTRUM,
};
typedef struct {
@@ -271,6 +272,13 @@ typedef struct {
int taylorseer_skip_interval;
const char* scm_mask;
bool scm_policy_dynamic;
+ float spectrum_w;
+ int spectrum_m;
+ float spectrum_lam;
+ int spectrum_window_size;
+ float spectrum_flex_window;
+ int spectrum_warmup_steps;
+ float spectrum_stop_percent;
} sd_cache_params_t;
typedef struct {
diff --git a/src/anima.hpp b/src/anima.hpp
index 191a096d..5850cc3e 100644
--- a/src/anima.hpp
+++ b/src/anima.hpp
@@ -13,9 +13,9 @@
namespace Anima {
constexpr int ANIMA_GRAPH_SIZE = 65536;
- __STATIC_INLINE__ struct ggml_tensor* apply_gate(struct ggml_context* ctx,
- struct ggml_tensor* x,
- struct ggml_tensor* gate) {
+ __STATIC_INLINE__ ggml_tensor* apply_gate(ggml_context* ctx,
+ ggml_tensor* x,
+ ggml_tensor* gate) {
gate = ggml_reshape_3d(ctx, gate, gate->ne[0], 1, gate->ne[1]); // [N, 1, C]
return ggml_mul(ctx, x, gate);
}
@@ -26,7 +26,7 @@ namespace Anima {
blocks["proj.1"] = std::make_shared(in_dim, out_dim, false);
}
- struct ggml_tensor* forward(GGMLRunnerContext* ctx, struct ggml_tensor* x) {
+ ggml_tensor* forward(GGMLRunnerContext* ctx, ggml_tensor* x) {
auto proj = std::dynamic_pointer_cast(blocks["proj.1"]);
return proj->forward(ctx, x);
}
@@ -39,7 +39,7 @@ namespace Anima {
blocks["1.linear_2"] = std::make_shared(in_dim, out_dim, false);
}
- struct ggml_tensor* forward(GGMLRunnerContext* ctx, struct ggml_tensor* x) {
+ ggml_tensor* forward(GGMLRunnerContext* ctx, ggml_tensor* x) {
auto linear_1 = std::dynamic_pointer_cast(blocks["1.linear_1"]);
auto linear_2 = std::dynamic_pointer_cast(blocks["1.linear_2"]);
@@ -62,10 +62,10 @@ namespace Anima {
blocks["2"] = std::make_shared(hidden_features, 3 * in_features, false);
}
- std::pair forward(GGMLRunnerContext* ctx,
- struct ggml_tensor* hidden_states,
- struct ggml_tensor* embedded_timestep,
- struct ggml_tensor* temb = nullptr) {
+ std::pair forward(GGMLRunnerContext* ctx,
+ ggml_tensor* hidden_states,
+ ggml_tensor* embedded_timestep,
+ ggml_tensor* temb = nullptr) {
auto norm = std::dynamic_pointer_cast(blocks["norm"]);
auto linear_1 = std::dynamic_pointer_cast(blocks["1"]);
auto linear_2 = std::dynamic_pointer_cast(blocks["2"]);
@@ -102,10 +102,10 @@ namespace Anima {
blocks["2"] = std::make_shared(hidden_features, 2 * in_features, false);
}
- struct ggml_tensor* forward(GGMLRunnerContext* ctx,
- struct ggml_tensor* hidden_states,
- struct ggml_tensor* embedded_timestep,
- struct ggml_tensor* temb = nullptr) {
+ ggml_tensor* forward(GGMLRunnerContext* ctx,
+ ggml_tensor* hidden_states,
+ ggml_tensor* embedded_timestep,
+ ggml_tensor* temb = nullptr) {
auto norm = std::dynamic_pointer_cast(blocks["norm"]);
auto linear_1 = std::dynamic_pointer_cast(blocks["1"]);
auto linear_2 = std::dynamic_pointer_cast(blocks["2"]);
@@ -152,11 +152,11 @@ namespace Anima {
blocks[this->out_proj_name] = std::make_shared(inner_dim, query_dim, false);
}
- struct ggml_tensor* forward(GGMLRunnerContext* ctx,
- struct ggml_tensor* hidden_states,
- struct ggml_tensor* encoder_hidden_states = nullptr,
- struct ggml_tensor* pe_q = nullptr,
- struct ggml_tensor* pe_k = nullptr) {
+ ggml_tensor* forward(GGMLRunnerContext* ctx,
+ ggml_tensor* hidden_states,
+ ggml_tensor* encoder_hidden_states = nullptr,
+ ggml_tensor* pe_q = nullptr,
+ ggml_tensor* pe_k = nullptr) {
if (encoder_hidden_states == nullptr) {
encoder_hidden_states = hidden_states;
}
@@ -183,7 +183,7 @@ namespace Anima {
q4 = q_norm->forward(ctx, q4);
k4 = k_norm->forward(ctx, k4);
- struct ggml_tensor* attn_out = nullptr;
+ ggml_tensor* attn_out = nullptr;
if (pe_q != nullptr || pe_k != nullptr) {
if (pe_q == nullptr) {
pe_q = pe_k;
@@ -227,7 +227,7 @@ namespace Anima {
blocks["layer2"] = std::make_shared(hidden_dim, dim, false);
}
- struct ggml_tensor* forward(GGMLRunnerContext* ctx, struct ggml_tensor* x) {
+ ggml_tensor* forward(GGMLRunnerContext* ctx, ggml_tensor* x) {
auto layer1 = std::dynamic_pointer_cast(blocks["layer1"]);
auto layer2 = std::dynamic_pointer_cast(blocks["layer2"]);
@@ -245,7 +245,7 @@ namespace Anima {
blocks["2"] = std::make_shared(hidden_dim, dim, true);
}
- struct ggml_tensor* forward(GGMLRunnerContext* ctx, struct ggml_tensor* x) {
+ ggml_tensor* forward(GGMLRunnerContext* ctx, ggml_tensor* x) {
auto layer0 = std::dynamic_pointer_cast(blocks["0"]);
auto layer2 = std::dynamic_pointer_cast(blocks["2"]);
@@ -267,11 +267,11 @@ namespace Anima {
blocks["mlp"] = std::make_shared(model_dim, model_dim * 4);
}
- struct ggml_tensor* forward(GGMLRunnerContext* ctx,
- struct ggml_tensor* x,
- struct ggml_tensor* context,
- struct ggml_tensor* target_pe,
- struct ggml_tensor* context_pe) {
+ ggml_tensor* forward(GGMLRunnerContext* ctx,
+ ggml_tensor* x,
+ ggml_tensor* context,
+ ggml_tensor* target_pe,
+ ggml_tensor* context_pe) {
auto norm_self_attn = std::dynamic_pointer_cast(blocks["norm_self_attn"]);
auto self_attn = std::dynamic_pointer_cast(blocks["self_attn"]);
auto norm_cross_attn = std::dynamic_pointer_cast(blocks["norm_cross_attn"]);
@@ -317,11 +317,11 @@ namespace Anima {
blocks["norm"] = std::make_shared(target_dim, 1e-6f);
}
- struct ggml_tensor* forward(GGMLRunnerContext* ctx,
- struct ggml_tensor* source_hidden_states,
- struct ggml_tensor* target_input_ids,
- struct ggml_tensor* target_pe,
- struct ggml_tensor* source_pe) {
+ ggml_tensor* forward(GGMLRunnerContext* ctx,
+ ggml_tensor* source_hidden_states,
+ ggml_tensor* target_input_ids,
+ ggml_tensor* target_pe,
+ ggml_tensor* source_pe) {
GGML_ASSERT(target_input_ids != nullptr);
if (ggml_n_dims(target_input_ids) == 1) {
target_input_ids = ggml_reshape_2d(ctx->ggml_ctx, target_input_ids, target_input_ids->ne[0], 1);
@@ -360,12 +360,12 @@ namespace Anima {
blocks["mlp"] = std::make_shared(hidden_size, hidden_size * mlp_ratio);
}
- struct ggml_tensor* forward(GGMLRunnerContext* ctx,
- struct ggml_tensor* hidden_states,
- struct ggml_tensor* encoder_hidden_states,
- struct ggml_tensor* embedded_timestep,
- struct ggml_tensor* temb,
- struct ggml_tensor* image_pe) {
+ ggml_tensor* forward(GGMLRunnerContext* ctx,
+ ggml_tensor* hidden_states,
+ ggml_tensor* encoder_hidden_states,
+ ggml_tensor* embedded_timestep,
+ ggml_tensor* temb,
+ ggml_tensor* image_pe) {
auto norm1 = std::dynamic_pointer_cast(blocks["adaln_modulation_self_attn"]);
auto attn1 = std::dynamic_pointer_cast(blocks["self_attn"]);
auto norm2 = std::dynamic_pointer_cast(blocks["adaln_modulation_cross_attn"]);
@@ -402,10 +402,10 @@ namespace Anima {
blocks["linear"] = std::make_shared(hidden_size, patch_size * patch_size * out_channels, false);
}
- struct ggml_tensor* forward(GGMLRunnerContext* ctx,
- struct ggml_tensor* hidden_states,
- struct ggml_tensor* embedded_timestep,
- struct ggml_tensor* temb) {
+ ggml_tensor* forward(GGMLRunnerContext* ctx,
+ ggml_tensor* hidden_states,
+ ggml_tensor* embedded_timestep,
+ ggml_tensor* temb) {
auto adaln = std::dynamic_pointer_cast(blocks["adaln_modulation"]);
auto linear = std::dynamic_pointer_cast(blocks["linear"]);
@@ -445,15 +445,15 @@ namespace Anima {
blocks["llm_adapter"] = std::make_shared(1024, 1024, 1024, 6, 16);
}
- struct ggml_tensor* forward(GGMLRunnerContext* ctx,
- struct ggml_tensor* x,
- struct ggml_tensor* timestep,
- struct ggml_tensor* encoder_hidden_states,
- struct ggml_tensor* image_pe,
- struct ggml_tensor* t5_ids = nullptr,
- struct ggml_tensor* t5_weights = nullptr,
- struct ggml_tensor* adapter_q_pe = nullptr,
- struct ggml_tensor* adapter_k_pe = nullptr) {
+ ggml_tensor* forward(GGMLRunnerContext* ctx,
+ ggml_tensor* x,
+ ggml_tensor* timestep,
+ ggml_tensor* encoder_hidden_states,
+ ggml_tensor* image_pe,
+ ggml_tensor* t5_ids = nullptr,
+ ggml_tensor* t5_weights = nullptr,
+ ggml_tensor* adapter_q_pe = nullptr,
+ ggml_tensor* adapter_k_pe = nullptr) {
GGML_ASSERT(x->ne[3] == 1);
auto x_embedder = std::dynamic_pointer_cast(blocks["x_embedder"]);
@@ -553,7 +553,7 @@ namespace Anima {
return "anima";
}
- void get_param_tensors(std::map& tensors, const std::string prefix) {
+ void get_param_tensors(std::map& tensors, const std::string prefix) {
net.get_param_tensors(tensors, prefix + ".net");
}
@@ -602,19 +602,18 @@ namespace Anima {
return Rope::embed_nd(ids, bs, axis_thetas, axes_dim);
}
- struct ggml_cgraph* build_graph(struct ggml_tensor* x,
- struct ggml_tensor* timesteps,
- struct ggml_tensor* context,
- struct ggml_tensor* t5_ids = nullptr,
- struct ggml_tensor* t5_weights = nullptr) {
+ ggml_cgraph* build_graph(const sd::Tensor& x_tensor,
+ const sd::Tensor& timesteps_tensor,
+ const sd::Tensor& context_tensor = {},
+ const sd::Tensor& t5_ids_tensor = {},
+ const sd::Tensor& t5_weights_tensor = {}) {
+ ggml_tensor* x = make_input(x_tensor);
+ ggml_tensor* timesteps = make_input(timesteps_tensor);
+ ggml_tensor* context = make_optional_input(context_tensor);
+ ggml_tensor* t5_ids = make_optional_input(t5_ids_tensor);
+ ggml_tensor* t5_weights = make_optional_input(t5_weights_tensor);
GGML_ASSERT(x->ne[3] == 1);
- struct ggml_cgraph* gf = new_graph_custom(ANIMA_GRAPH_SIZE);
-
- x = to_backend(x);
- timesteps = to_backend(timesteps);
- context = to_backend(context);
- t5_ids = to_backend(t5_ids);
- t5_weights = to_backend(t5_weights);
+ ggml_cgraph* gf = new_graph_custom(ANIMA_GRAPH_SIZE);
int64_t pad_h = (net.patch_size - x->ne[1] % net.patch_size) % net.patch_size;
int64_t pad_w = (net.patch_size - x->ne[0] % net.patch_size) % net.patch_size;
@@ -667,18 +666,16 @@ namespace Anima {
return gf;
}
- bool compute(int n_threads,
- struct ggml_tensor* x,
- struct ggml_tensor* timesteps,
- struct ggml_tensor* context,
- struct ggml_tensor* t5_ids = nullptr,
- struct ggml_tensor* t5_weights = nullptr,
- struct ggml_tensor** output = nullptr,
- struct ggml_context* output_ctx = nullptr) {
- auto get_graph = [&]() -> struct ggml_cgraph* {
+ sd::Tensor compute(int n_threads,
+ const sd::Tensor& x,
+ const sd::Tensor& timesteps,
+ const sd::Tensor& context = {},
+ const sd::Tensor& t5_ids = {},
+ const sd::Tensor& t5_weights = {}) {
+ auto get_graph = [&]() -> ggml_cgraph* {
return build_graph(x, timesteps, context, t5_ids, t5_weights);
};
- return GGMLRunner::compute(get_graph, n_threads, false, output, output_ctx);
+ return restore_trailing_singleton_dims(GGMLRunner::compute(get_graph, n_threads, false), x.dim());
}
};
} // namespace Anima
diff --git a/src/auto_encoder_kl.hpp b/src/auto_encoder_kl.hpp
new file mode 100644
index 00000000..039fb9df
--- /dev/null
+++ b/src/auto_encoder_kl.hpp
@@ -0,0 +1,852 @@
+#ifndef __AUTO_ENCODER_KL_HPP__
+#define __AUTO_ENCODER_KL_HPP__
+
+#include "vae.hpp"
+
+/*================================================== AutoEncoderKL ===================================================*/
+
+#define VAE_GRAPH_SIZE 20480
+
+class ResnetBlock : public UnaryBlock {
+protected:
+ int64_t in_channels;
+ int64_t out_channels;
+
+public:
+ ResnetBlock(int64_t in_channels,
+ int64_t out_channels)
+ : in_channels(in_channels),
+ out_channels(out_channels) {
+ // temb_channels is always 0
+ blocks["norm1"] = std::shared_ptr(new GroupNorm32(in_channels));
+ blocks["conv1"] = std::shared_ptr(new Conv2d(in_channels, out_channels, {3, 3}, {1, 1}, {1, 1}));
+
+ blocks["norm2"] = std::shared_ptr(new GroupNorm32(out_channels));
+ blocks["conv2"] = std::shared_ptr(new Conv2d(out_channels, out_channels, {3, 3}, {1, 1}, {1, 1}));
+
+ if (out_channels != in_channels) {
+ blocks["nin_shortcut"] = std::shared_ptr(new Conv2d(in_channels, out_channels, {1, 1}));
+ }
+ }
+
+ ggml_tensor* forward(GGMLRunnerContext* ctx, ggml_tensor* x) override {
+ // x: [N, in_channels, h, w]
+ // t_emb is always None
+ auto norm1 = std::dynamic_pointer_cast(blocks["norm1"]);
+ auto conv1 = std::dynamic_pointer_cast(blocks["conv1"]);
+ auto norm2 = std::dynamic_pointer_cast(blocks["norm2"]);
+ auto conv2 = std::dynamic_pointer_cast(blocks["conv2"]);
+
+ auto h = x;
+ h = norm1->forward(ctx, h);
+ h = ggml_silu_inplace(ctx->ggml_ctx, h); // swish
+ h = conv1->forward(ctx, h);
+ // return h;
+
+ h = norm2->forward(ctx, h);
+ h = ggml_silu_inplace(ctx->ggml_ctx, h); // swish
+ // dropout, skip for inference
+ h = conv2->forward(ctx, h);
+
+ // skip connection
+ if (out_channels != in_channels) {
+ auto nin_shortcut = std::dynamic_pointer_cast(blocks["nin_shortcut"]);
+
+ x = nin_shortcut->forward(ctx, x); // [N, out_channels, h, w]
+ }
+
+ h = ggml_add(ctx->ggml_ctx, h, x);
+ return h; // [N, out_channels, h, w]
+ }
+};
+
+class AttnBlock : public UnaryBlock {
+protected:
+ int64_t in_channels;
+ bool use_linear;
+
+ void init_params(ggml_context* ctx, const String2TensorStorage& tensor_storage_map = {}, const std::string prefix = "") {
+ auto iter = tensor_storage_map.find(prefix + "proj_out.weight");
+ if (iter != tensor_storage_map.end()) {
+ if (iter->second.n_dims == 4 && use_linear) {
+ use_linear = false;
+ blocks["q"] = std::make_shared(in_channels, in_channels, std::pair{1, 1});
+ blocks["k"] = std::make_shared(in_channels, in_channels, std::pair{1, 1});
+ blocks["v"] = std::make_shared(in_channels, in_channels, std::pair{1, 1});
+ blocks["proj_out"] = std::make_shared(in_channels, in_channels, std::pair{1, 1});
+ } else if (iter->second.n_dims == 2 && !use_linear) {
+ use_linear = true;
+ blocks["q"] = std::make_shared(in_channels, in_channels);
+ blocks["k"] = std::make_shared(in_channels, in_channels);
+ blocks["v"] = std::make_shared(in_channels, in_channels);
+ blocks["proj_out"] = std::make_shared(in_channels, in_channels);
+ }
+ }
+ }
+
+public:
+ AttnBlock(int64_t in_channels, bool use_linear)
+ : in_channels(in_channels), use_linear(use_linear) {
+ blocks["norm"] = std::shared_ptr(new GroupNorm32(in_channels));
+ if (use_linear) {
+ blocks["q"] = std::shared_ptr(new Linear(in_channels, in_channels));
+ blocks["k"] = std::shared_ptr(new Linear(in_channels, in_channels));
+ blocks["v"] = std::shared_ptr(new Linear(in_channels, in_channels));
+ blocks["proj_out"] = std::shared_ptr(new Linear(in_channels, in_channels));
+ } else {
+ blocks["q"] = std::shared_ptr(new Conv2d(in_channels, in_channels, {1, 1}));
+ blocks["k"] = std::shared_ptr(new Conv2d(in_channels, in_channels, {1, 1}));
+ blocks["v"] = std::shared_ptr(new Conv2d(in_channels, in_channels, {1, 1}));
+ blocks["proj_out"] = std::shared_ptr(new Conv2d(in_channels, in_channels, {1, 1}));
+ }
+ }
+
+ ggml_tensor* forward(GGMLRunnerContext* ctx, ggml_tensor* x) override {
+ // x: [N, in_channels, h, w]
+ auto norm = std::dynamic_pointer_cast(blocks["norm"]);
+ auto q_proj = std::dynamic_pointer_cast(blocks["q"]);
+ auto k_proj = std::dynamic_pointer_cast(blocks["k"]);
+ auto v_proj = std::dynamic_pointer_cast(blocks["v"]);
+ auto proj_out = std::dynamic_pointer_cast(blocks["proj_out"]);
+
+ auto h_ = norm->forward(ctx, x);
+
+ const int64_t n = h_->ne[3];
+ const int64_t c = h_->ne[2];
+ const int64_t h = h_->ne[1];
+ const int64_t w = h_->ne[0];
+
+ ggml_tensor* q;
+ ggml_tensor* k;
+ ggml_tensor* v;
+ if (use_linear) {
+ h_ = ggml_cont(ctx->ggml_ctx, ggml_permute(ctx->ggml_ctx, h_, 1, 2, 0, 3)); // [N, h, w, in_channels]
+ h_ = ggml_reshape_3d(ctx->ggml_ctx, h_, c, h * w, n); // [N, h * w, in_channels]
+
+ q = q_proj->forward(ctx, h_); // [N, h * w, in_channels]
+ k = k_proj->forward(ctx, h_); // [N, h * w, in_channels]
+ v = v_proj->forward(ctx, h_); // [N, h * w, in_channels]
+ } else {
+ q = q_proj->forward(ctx, h_); // [N, in_channels, h, w]
+ q = ggml_cont(ctx->ggml_ctx, ggml_permute(ctx->ggml_ctx, q, 1, 2, 0, 3)); // [N, h, w, in_channels]
+ q = ggml_reshape_3d(ctx->ggml_ctx, q, c, h * w, n); // [N, h * w, in_channels]
+
+ k = k_proj->forward(ctx, h_); // [N, in_channels, h, w]
+ k = ggml_cont(ctx->ggml_ctx, ggml_permute(ctx->ggml_ctx, k, 1, 2, 0, 3)); // [N, h, w, in_channels]
+ k = ggml_reshape_3d(ctx->ggml_ctx, k, c, h * w, n); // [N, h * w, in_channels]
+
+ v = v_proj->forward(ctx, h_); // [N, in_channels, h, w]
+ v = ggml_cont(ctx->ggml_ctx, ggml_permute(ctx->ggml_ctx, v, 1, 2, 0, 3)); // [N, h, w, in_channels]
+ v = ggml_reshape_3d(ctx->ggml_ctx, v, c, h * w, n); // [N, h * w, in_channels]
+ }
+
+ h_ = ggml_ext_attention_ext(ctx->ggml_ctx, ctx->backend, q, k, v, 1, nullptr, false, ctx->flash_attn_enabled);
+
+ if (use_linear) {
+ h_ = proj_out->forward(ctx, h_); // [N, h * w, in_channels]
+
+ h_ = ggml_cont(ctx->ggml_ctx, ggml_permute(ctx->ggml_ctx, h_, 1, 0, 2, 3)); // [N, in_channels, h * w]
+ h_ = ggml_reshape_4d(ctx->ggml_ctx, h_, w, h, c, n); // [N, in_channels, h, w]
+ } else {
+ h_ = ggml_cont(ctx->ggml_ctx, ggml_permute(ctx->ggml_ctx, h_, 1, 0, 2, 3)); // [N, in_channels, h * w]
+ h_ = ggml_reshape_4d(ctx->ggml_ctx, h_, w, h, c, n); // [N, in_channels, h, w]
+
+ h_ = proj_out->forward(ctx, h_); // [N, in_channels, h, w]
+ }
+
+ h_ = ggml_add(ctx->ggml_ctx, h_, x);
+ return h_;
+ }
+};
+
+class AE3DConv : public Conv2d {
+public:
+ AE3DConv(int64_t in_channels,
+ int64_t out_channels,
+ std::pair kernel_size,
+ int video_kernel_size = 3,
+ std::pair stride = {1, 1},
+ std::pair padding = {0, 0},
+ std::pair dilation = {1, 1},
+ bool bias = true)
+ : Conv2d(in_channels, out_channels, kernel_size, stride, padding, dilation, bias) {
+ int kernel_padding = video_kernel_size / 2;
+ blocks["time_mix_conv"] = std::shared_ptr(new Conv3d(out_channels,
+ out_channels,
+ {video_kernel_size, 1, 1},
+ {1, 1, 1},
+ {kernel_padding, 0, 0}));
+ }
+
+ ggml_tensor* forward(GGMLRunnerContext* ctx,
+ ggml_tensor* x) override {
+ // timesteps always None
+ // skip_video always False
+ // x: [N, IC, IH, IW]
+ // result: [N, OC, OH, OW]
+ auto time_mix_conv = std::dynamic_pointer_cast(blocks["time_mix_conv"]);
+
+ x = Conv2d::forward(ctx, x);
+ // timesteps = x.shape[0]
+ // x = rearrange(x, "(b t) c h w -> b c t h w", t=timesteps)
+ // x = conv3d(x)
+ // return rearrange(x, "b c t h w -> (b t) c h w")
+ int64_t T = x->ne[3];
+ int64_t B = x->ne[3] / T;
+ int64_t C = x->ne[2];
+ int64_t H = x->ne[1];
+ int64_t W = x->ne[0];
+
+ x = ggml_reshape_4d(ctx->ggml_ctx, x, W * H, C, T, B); // (b t) c h w -> b t c (h w)
+ x = ggml_cont(ctx->ggml_ctx, ggml_permute(ctx->ggml_ctx, x, 0, 2, 1, 3)); // b t c (h w) -> b c t (h w)
+ x = time_mix_conv->forward(ctx, x); // [B, OC, T, OH * OW]
+ x = ggml_cont(ctx->ggml_ctx, ggml_permute(ctx->ggml_ctx, x, 0, 2, 1, 3)); // b c t (h w) -> b t c (h w)
+ x = ggml_reshape_4d(ctx->ggml_ctx, x, W, H, C, T * B); // b t c (h w) -> (b t) c h w
+ return x; // [B*T, OC, OH, OW]
+ }
+};
+
+class VideoResnetBlock : public ResnetBlock {
+protected:
+ void init_params(ggml_context* ctx, const String2TensorStorage& tensor_storage_map = {}, const std::string prefix = "") override {
+ enum ggml_type wtype = get_type(prefix + "mix_factor", tensor_storage_map, GGML_TYPE_F32);
+ params["mix_factor"] = ggml_new_tensor_1d(ctx, wtype, 1);
+ }
+
+ float get_alpha() {
+ float alpha = ggml_ext_backend_tensor_get_f32(params["mix_factor"]);
+ return sigmoid(alpha);
+ }
+
+public:
+ VideoResnetBlock(int64_t in_channels,
+ int64_t out_channels,
+ int video_kernel_size = 3)
+ : ResnetBlock(in_channels, out_channels) {
+ // merge_strategy is always learned
+ blocks["time_stack"] = std::shared_ptr(new ResBlock(out_channels, 0, out_channels, {video_kernel_size, 1}, 3, false, true));
+ }
+
+ ggml_tensor* forward(GGMLRunnerContext* ctx, ggml_tensor* x) override {
+ // x: [N, in_channels, h, w] aka [b*t, in_channels, h, w]
+ // return: [N, out_channels, h, w] aka [b*t, out_channels, h, w]
+ // t_emb is always None
+ // skip_video is always False
+ // timesteps is always None
+ auto time_stack = std::dynamic_pointer_cast(blocks["time_stack"]);
+
+ x = ResnetBlock::forward(ctx, x); // [N, out_channels, h, w]
+ // return x;
+
+ int64_t T = x->ne[3];
+ int64_t B = x->ne[3] / T;
+ int64_t C = x->ne[2];
+ int64_t H = x->ne[1];
+ int64_t W = x->ne[0];
+
+ x = ggml_reshape_4d(ctx->ggml_ctx, x, W * H, C, T, B); // (b t) c h w -> b t c (h w)
+ x = ggml_cont(ctx->ggml_ctx, ggml_permute(ctx->ggml_ctx, x, 0, 2, 1, 3)); // b t c (h w) -> b c t (h w)
+ auto x_mix = x;
+
+ x = time_stack->forward(ctx, x); // b t c (h w)
+
+ float alpha = get_alpha();
+ x = ggml_add(ctx->ggml_ctx,
+ ggml_ext_scale(ctx->ggml_ctx, x, alpha),
+ ggml_ext_scale(ctx->ggml_ctx, x_mix, 1.0f - alpha));
+
+ x = ggml_cont(ctx->ggml_ctx, ggml_permute(ctx->ggml_ctx, x, 0, 2, 1, 3)); // b c t (h w) -> b t c (h w)
+ x = ggml_reshape_4d(ctx->ggml_ctx, x, W, H, C, T * B); // b t c (h w) -> (b t) c h w
+
+ return x;
+ }
+};
+
+// ldm.modules.diffusionmodules.model.Encoder
+class Encoder : public GGMLBlock {
+protected:
+ int ch = 128;
+ std::vector ch_mult = {1, 2, 4, 4};
+ int num_res_blocks = 2;
+ int in_channels = 3;
+ int z_channels = 4;
+ bool double_z = true;
+
+public:
+ Encoder(int ch,
+ std::vector ch_mult,
+ int num_res_blocks,
+ int in_channels,
+ int z_channels,
+ bool double_z = true,
+ bool use_linear_projection = false)
+ : ch(ch),
+ ch_mult(ch_mult),
+ num_res_blocks(num_res_blocks),
+ in_channels(in_channels),
+ z_channels(z_channels),
+ double_z(double_z) {
+ blocks["conv_in"] = std::shared_ptr(new Conv2d(in_channels, ch, {3, 3}, {1, 1}, {1, 1}));
+
+ size_t num_resolutions = ch_mult.size();
+
+ int block_in = 1;
+ for (int i = 0; i < num_resolutions; i++) {
+ if (i == 0) {
+ block_in = ch;
+ } else {
+ block_in = ch * ch_mult[i - 1];
+ }
+ int block_out = ch * ch_mult[i];
+ for (int j = 0; j < num_res_blocks; j++) {
+ std::string name = "down." + std::to_string(i) + ".block." + std::to_string(j);
+ blocks[name] = std::shared_ptr(new ResnetBlock(block_in, block_out));
+ block_in = block_out;
+ }
+ if (i != num_resolutions - 1) {
+ std::string name = "down." + std::to_string(i) + ".downsample";
+ blocks[name] = std::shared_ptr(new DownSampleBlock(block_in, block_in, true));
+ }
+ }
+
+ blocks["mid.block_1"] = std::shared_ptr(new ResnetBlock(block_in, block_in));
+ blocks["mid.attn_1"] = std::shared_ptr(new AttnBlock(block_in, use_linear_projection));
+ blocks["mid.block_2"] = std::shared_ptr(new ResnetBlock(block_in, block_in));
+
+ blocks["norm_out"] = std::shared_ptr(new GroupNorm32(block_in));
+ blocks["conv_out"] = std::shared_ptr(new Conv2d(block_in, double_z ? z_channels * 2 : z_channels, {3, 3}, {1, 1}, {1, 1}));
+ }
+
+ virtual ggml_tensor* forward(GGMLRunnerContext* ctx, ggml_tensor* x) {
+ // x: [N, in_channels, h, w]
+
+ auto conv_in = std::dynamic_pointer_cast(blocks["conv_in"]);
+ auto mid_block_1 = std::dynamic_pointer_cast(blocks["mid.block_1"]);
+ auto mid_attn_1 = std::dynamic_pointer_cast(blocks["mid.attn_1"]);
+ auto mid_block_2 = std::dynamic_pointer_cast(blocks["mid.block_2"]);
+ auto norm_out = std::dynamic_pointer_cast(blocks["norm_out"]);
+ auto conv_out = std::dynamic_pointer_cast(blocks["conv_out"]);
+
+ auto h = conv_in->forward(ctx, x); // [N, ch, h, w]
+
+ // downsampling
+ size_t num_resolutions = ch_mult.size();
+ for (int i = 0; i < num_resolutions; i++) {
+ for (int j = 0; j < num_res_blocks; j++) {
+ std::string name = "down." + std::to_string(i) + ".block." + std::to_string(j);
+ auto down_block = std::dynamic_pointer_cast(blocks[name]);
+
+ h = down_block->forward(ctx, h);
+ }
+ if (i != num_resolutions - 1) {
+ std::string name = "down." + std::to_string(i) + ".downsample";
+ auto down_sample = std::dynamic_pointer_cast(blocks[name]);
+
+ h = down_sample->forward(ctx, h);
+ }
+ }
+
+ // middle
+ h = mid_block_1->forward(ctx, h);
+ h = mid_attn_1->forward(ctx, h);
+ h = mid_block_2->forward(ctx, h); // [N, block_in, h, w]
+
+ // end
+ h = norm_out->forward(ctx, h);
+ h = ggml_silu_inplace(ctx->ggml_ctx, h); // nonlinearity/swish
+ h = conv_out->forward(ctx, h); // [N, z_channels*2, h, w]
+ return h;
+ }
+};
+
+// ldm.modules.diffusionmodules.model.Decoder
+class Decoder : public GGMLBlock {
+protected:
+ int ch = 128;
+ int out_ch = 3;
+ std::vector ch_mult = {1, 2, 4, 4};
+ int num_res_blocks = 2;
+ int z_channels = 4;
+ bool video_decoder = false;
+ int video_kernel_size = 3;
+
+ virtual std::shared_ptr get_conv_out(int64_t in_channels,
+ int64_t out_channels,
+ std::pair kernel_size,
+ std::pair stride = {1, 1},
+ std::pair padding = {0, 0}) {
+ if (video_decoder) {
+ return std::shared_ptr(new AE3DConv(in_channels, out_channels, kernel_size, video_kernel_size, stride, padding));
+ } else {
+ return std::shared_ptr(new Conv2d(in_channels, out_channels, kernel_size, stride, padding));
+ }
+ }
+
+ virtual std::shared_ptr get_resnet_block(int64_t in_channels,
+ int64_t out_channels) {
+ if (video_decoder) {
+ return std::shared_ptr(new VideoResnetBlock(in_channels, out_channels, video_kernel_size));
+ } else {
+ return std::shared_ptr(new ResnetBlock(in_channels, out_channels));
+ }
+ }
+
+public:
+ Decoder(int ch,
+ int out_ch,
+ std::vector ch_mult,
+ int num_res_blocks,
+ int z_channels,
+ bool use_linear_projection = false,
+ bool video_decoder = false,
+ int video_kernel_size = 3)
+ : ch(ch),
+ out_ch(out_ch),
+ ch_mult(ch_mult),
+ num_res_blocks(num_res_blocks),
+ z_channels(z_channels),
+ video_decoder(video_decoder),
+ video_kernel_size(video_kernel_size) {
+ int num_resolutions = static_cast(ch_mult.size());
+ int block_in = ch * ch_mult[num_resolutions - 1];
+
+ blocks["conv_in"] = std::shared_ptr(new Conv2d(z_channels, block_in, {3, 3}, {1, 1}, {1, 1}));
+
+ blocks["mid.block_1"] = get_resnet_block(block_in, block_in);
+ blocks["mid.attn_1"] = std::shared_ptr(new AttnBlock(block_in, use_linear_projection));
+ blocks["mid.block_2"] = get_resnet_block(block_in, block_in);
+
+ for (int i = num_resolutions - 1; i >= 0; i--) {
+ int mult = ch_mult[i];
+ int block_out = ch * mult;
+ for (int j = 0; j < num_res_blocks + 1; j++) {
+ std::string name = "up." + std::to_string(i) + ".block." + std::to_string(j);
+ blocks[name] = get_resnet_block(block_in, block_out);
+
+ block_in = block_out;
+ }
+ if (i != 0) {
+ std::string name = "up." + std::to_string(i) + ".upsample";
+ blocks[name] = std::shared_ptr(new UpSampleBlock(block_in, block_in));
+ }
+ }
+
+ blocks["norm_out"] = std::shared_ptr(new GroupNorm32(block_in));
+ blocks["conv_out"] = get_conv_out(block_in, out_ch, {3, 3}, {1, 1}, {1, 1});
+ }
+
+ virtual ggml_tensor* forward(GGMLRunnerContext* ctx, ggml_tensor* z) {
+ // z: [N, z_channels, h, w]
+ // alpha is always 0
+ // merge_strategy is always learned
+ // time_mode is always conv-only, so we need to replace conv_out_op/resnet_op to AE3DConv/VideoResBlock
+ // AttnVideoBlock will not be used
+ auto conv_in = std::dynamic_pointer_cast(blocks["conv_in"]);
+ auto mid_block_1 = std::dynamic_pointer_cast(blocks["mid.block_1"]);
+ auto mid_attn_1 = std::dynamic_pointer_cast(blocks["mid.attn_1"]);
+ auto mid_block_2 = std::dynamic_pointer_cast(blocks["mid.block_2"]);
+ auto norm_out = std::dynamic_pointer_cast(blocks["norm_out"]);
+ auto conv_out = std::dynamic_pointer_cast(blocks["conv_out"]);
+
+ // conv_in
+ auto h = conv_in->forward(ctx, z); // [N, block_in, h, w]
+
+ // middle
+ h = mid_block_1->forward(ctx, h);
+ // return h;
+
+ h = mid_attn_1->forward(ctx, h);
+ h = mid_block_2->forward(ctx, h); // [N, block_in, h, w]
+
+ // upsampling
+ int num_resolutions = static_cast(ch_mult.size());
+ for (int i = num_resolutions - 1; i >= 0; i--) {
+ for (int j = 0; j < num_res_blocks + 1; j++) {
+ std::string name = "up." + std::to_string(i) + ".block." + std::to_string(j);
+ auto up_block = std::dynamic_pointer_cast(blocks[name]);
+
+ h = up_block->forward(ctx, h);
+ }
+ if (i != 0) {
+ std::string name = "up." + std::to_string(i) + ".upsample";
+ auto up_sample = std::dynamic_pointer_cast(blocks[name]);
+
+ h = up_sample->forward(ctx, h);
+ }
+ }
+
+ h = norm_out->forward(ctx, h);
+ h = ggml_silu_inplace(ctx->ggml_ctx, h); // nonlinearity/swish
+ h = conv_out->forward(ctx, h); // [N, out_ch, h*8, w*8]
+ return h;
+ }
+};
+
+// ldm.models.autoencoder.AutoencoderKL
+class AutoEncoderKLModel : public GGMLBlock {
+protected:
+ SDVersion version;
+ bool decode_only = true;
+ bool use_video_decoder = false;
+ bool use_quant = true;
+ int embed_dim = 4;
+ struct {
+ int z_channels = 4;
+ int resolution = 256;
+ int in_channels = 3;
+ int out_ch = 3;
+ int ch = 128;
+ std::vector ch_mult = {1, 2, 4, 4};
+ int num_res_blocks = 2;
+ bool double_z = true;
+ } dd_config;
+
+public:
+ AutoEncoderKLModel(SDVersion version = VERSION_SD1,
+ bool decode_only = true,
+ bool use_linear_projection = false,
+ bool use_video_decoder = false)
+ : version(version), decode_only(decode_only), use_video_decoder(use_video_decoder) {
+ if (sd_version_is_dit(version)) {
+ if (sd_version_is_flux2(version)) {
+ dd_config.z_channels = 32;
+ embed_dim = 32;
+ } else {
+ use_quant = false;
+ dd_config.z_channels = 16;
+ }
+ }
+ if (use_video_decoder) {
+ use_quant = false;
+ }
+ blocks["decoder"] = std::shared_ptr(new Decoder(dd_config.ch,
+ dd_config.out_ch,
+ dd_config.ch_mult,
+ dd_config.num_res_blocks,
+ dd_config.z_channels,
+ use_linear_projection,
+ use_video_decoder));
+ if (use_quant) {
+ blocks["post_quant_conv"] = std::shared_ptr(new Conv2d(dd_config.z_channels,
+ embed_dim,
+ {1, 1}));
+ }
+ if (!decode_only) {
+ blocks["encoder"] = std::shared_ptr(new Encoder(dd_config.ch,
+ dd_config.ch_mult,
+ dd_config.num_res_blocks,
+ dd_config.in_channels,
+ dd_config.z_channels,
+ dd_config.double_z,
+ use_linear_projection));
+ if (use_quant) {
+ int factor = dd_config.double_z ? 2 : 1;
+
+ blocks["quant_conv"] = std::shared_ptr(new Conv2d(embed_dim * factor,
+ dd_config.z_channels * factor,
+ {1, 1}));
+ }
+ }
+ }
+
+ ggml_tensor* decode(GGMLRunnerContext* ctx, ggml_tensor* z) {
+ // z: [N, z_channels, h, w]
+ if (sd_version_is_flux2(version)) {
+ // [N, C*p*p, h, w] -> [N, C, h*p, w*p]
+ int64_t p = 2;
+
+ int64_t N = z->ne[3];
+ int64_t C = z->ne[2] / p / p;
+ int64_t h = z->ne[1];
+ int64_t w = z->ne[0];
+ int64_t H = h * p;
+ int64_t W = w * p;
+
+ z = ggml_reshape_4d(ctx->ggml_ctx, z, w * h, p * p, C, N); // [N, C, p*p, h*w]
+ z = ggml_cont(ctx->ggml_ctx, ggml_ext_torch_permute(ctx->ggml_ctx, z, 1, 0, 2, 3)); // [N, C, h*w, p*p]
+ z = ggml_reshape_4d(ctx->ggml_ctx, z, p, p, w, h * C * N); // [N*C*h, w, p, p]
+ z = ggml_cont(ctx->ggml_ctx, ggml_ext_torch_permute(ctx->ggml_ctx, z, 0, 2, 1, 3)); // [N*C*h, p, w, p]
+ z = ggml_reshape_4d(ctx->ggml_ctx, z, W, H, C, N); // [N, C, h*p, w*p]
+ }
+
+ if (use_quant) {
+ auto post_quant_conv = std::dynamic_pointer_cast(blocks["post_quant_conv"]);
+ z = post_quant_conv->forward(ctx, z); // [N, z_channels, h, w]
+ }
+ auto decoder = std::dynamic_pointer_cast(blocks["decoder"]);
+
+ ggml_set_name(z, "bench-start");
+ auto h = decoder->forward(ctx, z);
+ ggml_set_name(h, "bench-end");
+ return h;
+ }
+
+ ggml_tensor* encode(GGMLRunnerContext* ctx, ggml_tensor* x) {
+ // x: [N, in_channels, h, w]
+ auto encoder = std::dynamic_pointer_cast(blocks["encoder"]);
+
+ auto z = encoder->forward(ctx, x); // [N, 2*z_channels, h/8, w/8]
+ if (use_quant) {
+ auto quant_conv = std::dynamic_pointer_cast(blocks["quant_conv"]);
+ z = quant_conv->forward(ctx, z); // [N, 2*embed_dim, h/8, w/8]
+ }
+ if (sd_version_is_flux2(version)) {
+ z = ggml_ext_chunk(ctx->ggml_ctx, z, 2, 2)[0];
+
+ // [N, C, H, W] -> [N, C*p*p, H/p, W/p]
+ int64_t p = 2;
+ int64_t N = z->ne[3];
+ int64_t C = z->ne[2];
+ int64_t H = z->ne[1];
+ int64_t W = z->ne[0];
+ int64_t h = H / p;
+ int64_t w = W / p;
+
+ z = ggml_reshape_4d(ctx->ggml_ctx, z, p, w, p, h * C * N); // [N*C*h, p, w, p]
+ z = ggml_cont(ctx->ggml_ctx, ggml_permute(ctx->ggml_ctx, z, 0, 2, 1, 3)); // [N*C*h, w, p, p]
+ z = ggml_reshape_4d(ctx->ggml_ctx, z, p * p, w * h, C, N); // [N, C, h*w, p*p]
+ z = ggml_cont(ctx->ggml_ctx, ggml_permute(ctx->ggml_ctx, z, 1, 0, 2, 3)); // [N, C, p*p, h*w]
+ z = ggml_reshape_4d(ctx->ggml_ctx, z, w, h, p * p * C, N); // [N, C*p*p, h*w]
+ }
+ return z;
+ }
+
+ int get_encoder_output_channels() {
+ int factor = dd_config.double_z ? 2 : 1;
+ if (sd_version_is_flux2(version)) {
+ return dd_config.z_channels * 4;
+ }
+ return dd_config.z_channels * factor;
+ }
+};
+
+struct AutoEncoderKL : public VAE {
+ float scale_factor = 1.f;
+ float shift_factor = 0.f;
+ bool decode_only = true;
+ AutoEncoderKLModel ae;
+
+ AutoEncoderKL(ggml_backend_t backend,
+ bool offload_params_to_cpu,
+ const String2TensorStorage& tensor_storage_map,
+ const std::string prefix,
+ bool decode_only = false,
+ bool use_video_decoder = false,
+ SDVersion version = VERSION_SD1)
+ : decode_only(decode_only), VAE(version, backend, offload_params_to_cpu) {
+ if (sd_version_is_sd1(version) || sd_version_is_sd2(version)) {
+ scale_factor = 0.18215f;
+ shift_factor = 0.f;
+ } else if (sd_version_is_sdxl(version)) {
+ scale_factor = 0.13025f;
+ shift_factor = 0.f;
+ } else if (sd_version_is_sd3(version)) {
+ scale_factor = 1.5305f;
+ shift_factor = 0.0609f;
+ } else if (sd_version_is_flux(version) || sd_version_is_z_image(version)) {
+ scale_factor = 0.3611f;
+ shift_factor = 0.1159f;
+ } else if (sd_version_is_flux2(version)) {
+ scale_factor = 1.0f;
+ shift_factor = 0.f;
+ }
+ bool use_linear_projection = false;
+ for (const auto& [name, tensor_storage] : tensor_storage_map) {
+ if (!starts_with(name, prefix)) {
+ continue;
+ }
+ if (ends_with(name, "attn_1.proj_out.weight")) {
+ if (tensor_storage.n_dims == 2) {
+ use_linear_projection = true;
+ }
+ break;
+ }
+ }
+ ae = AutoEncoderKLModel(version, decode_only, use_linear_projection, use_video_decoder);
+ ae.init(params_ctx, tensor_storage_map, prefix);
+ }
+
+ void set_conv2d_scale(float scale) override {
+ std::vector blocks;
+ ae.get_all_blocks(blocks);
+ for (auto block : blocks) {
+ if (block->get_desc() == "Conv2d") {
+ auto conv_block = (Conv2d*)block;
+ conv_block->set_scale(scale);
+ }
+ }
+ }
+
+ std::string get_desc() override {
+ return "vae";
+ }
+
+ void get_param_tensors(std::map& tensors, const std::string prefix) override {
+ ae.get_param_tensors(tensors, prefix);
+ }
+
+ ggml_cgraph* build_graph(const sd::Tensor& z_tensor, bool decode_graph) {
+ ggml_cgraph* gf = ggml_new_graph(compute_ctx);
+ ggml_tensor* z = make_input(z_tensor);
+
+ auto runner_ctx = get_context();
+
+ ggml_tensor* out = decode_graph ? ae.decode(&runner_ctx, z) : ae.encode(&runner_ctx, z);
+
+ ggml_build_forward_expand(gf, out);
+
+ return gf;
+ }
+
+ sd::Tensor _compute(const int n_threads,
+ const sd::Tensor& z,
+ bool decode_graph) override {
+ GGML_ASSERT(!decode_only || decode_graph);
+ auto get_graph = [&]() -> ggml_cgraph* {
+ return build_graph(z, decode_graph);
+ };
+ return restore_trailing_singleton_dims(GGMLRunner::compute(get_graph, n_threads, false), z.dim());
+ }
+
+ sd::Tensor gaussian_latent_sample(const sd::Tensor& moments, std::shared_ptr rng) {
+ // ldm.modules.distributions.distributions.DiagonalGaussianDistribution.sample
+ auto chunks = sd::ops::chunk(moments, 2, 2);
+ const auto& mean = chunks[0];
+ const auto& logvar = chunks[1];
+ sd::Tensor stddev = sd::ops::exp(0.5f * sd::ops::clamp(logvar, -30.0f, 20.0f));
+ sd::Tensor noise = sd::Tensor::randn_like(mean, rng);
+ sd::Tensor latents = mean + stddev * noise;
+ return latents;
+ }
+
+ sd::Tensor vae_output_to_latents(const sd::Tensor& vae_output, std::shared_ptr rng) override {
+ if (sd_version_is_flux2(version)) {
+ return vae_output;
+ } else if (version == VERSION_SD1_PIX2PIX) {
+ return sd::ops::chunk(vae_output, 2, 2)[0];
+ } else {
+ return gaussian_latent_sample(vae_output, rng);
+ }
+ }
+
+ std::pair, sd::Tensor> get_latents_mean_std(const sd::Tensor& latents, int channel_dim) {
+ GGML_ASSERT(channel_dim >= 0 && static_cast(channel_dim) < static_cast(latents.dim()));
+ if (sd_version_is_flux2(version)) {
+ GGML_ASSERT(latents.shape()[channel_dim] == 128);
+ std::vector stats_shape(static_cast(latents.dim()), 1);
+ stats_shape[static_cast(channel_dim)] = latents.shape()[channel_dim];
+
+ auto mean_tensor = sd::Tensor::from_vector({-0.0676f, -0.0715f, -0.0753f, -0.0745f, 0.0223f, 0.0180f, 0.0142f, 0.0184f,
+ -0.0001f, -0.0063f, -0.0002f, -0.0031f, -0.0272f, -0.0281f, -0.0276f, -0.0290f,
+ -0.0769f, -0.0672f, -0.0902f, -0.0892f, 0.0168f, 0.0152f, 0.0079f, 0.0086f,
+ 0.0083f, 0.0015f, 0.0003f, -0.0043f, -0.0439f, -0.0419f, -0.0438f, -0.0431f,
+ -0.0102f, -0.0132f, -0.0066f, -0.0048f, -0.0311f, -0.0306f, -0.0279f, -0.0180f,
+ 0.0030f, 0.0015f, 0.0126f, 0.0145f, 0.0347f, 0.0338f, 0.0337f, 0.0283f,
+ 0.0020f, 0.0047f, 0.0047f, 0.0050f, 0.0123f, 0.0081f, 0.0081f, 0.0146f,
+ 0.0681f, 0.0679f, 0.0767f, 0.0732f, -0.0462f, -0.0474f, -0.0392f, -0.0511f,
+ -0.0528f, -0.0477f, -0.0470f, -0.0517f, -0.0317f, -0.0316f, -0.0345f, -0.0283f,
+ 0.0510f, 0.0445f, 0.0578f, 0.0458f, -0.0412f, -0.0458f, -0.0487f, -0.0467f,
+ -0.0088f, -0.0106f, -0.0088f, -0.0046f, -0.0376f, -0.0432f, -0.0436f, -0.0499f,
+ 0.0118f, 0.0166f, 0.0203f, 0.0279f, 0.0113f, 0.0129f, 0.0016f, 0.0072f,
+ -0.0118f, -0.0018f, -0.0141f, -0.0054f, -0.0091f, -0.0138f, -0.0145f, -0.0187f,
+ 0.0323f, 0.0305f, 0.0259f, 0.0300f, 0.0540f, 0.0614f, 0.0495f, 0.0590f,
+ -0.0511f, -0.0603f, -0.0478f, -0.0524f, -0.0227f, -0.0274f, -0.0154f, -0.0255f,
+ -0.0572f, -0.0565f, -0.0518f, -0.0496f, 0.0116f, 0.0054f, 0.0163f, 0.0104f});
+ mean_tensor.reshape_(stats_shape);
+ auto std_tensor = sd::Tensor::from_vector({1.8029f, 1.7786f, 1.7868f, 1.7837f, 1.7717f, 1.7590f, 1.7610f, 1.7479f,
+ 1.7336f, 1.7373f, 1.7340f, 1.7343f, 1.8626f, 1.8527f, 1.8629f, 1.8589f,
+ 1.7593f, 1.7526f, 1.7556f, 1.7583f, 1.7363f, 1.7400f, 1.7355f, 1.7394f,
+ 1.7342f, 1.7246f, 1.7392f, 1.7304f, 1.7551f, 1.7513f, 1.7559f, 1.7488f,
+ 1.8449f, 1.8454f, 1.8550f, 1.8535f, 1.8240f, 1.7813f, 1.7854f, 1.7945f,
+ 1.8047f, 1.7876f, 1.7695f, 1.7676f, 1.7782f, 1.7667f, 1.7925f, 1.7848f,
+ 1.7579f, 1.7407f, 1.7483f, 1.7368f, 1.7961f, 1.7998f, 1.7920f, 1.7925f,
+ 1.7780f, 1.7747f, 1.7727f, 1.7749f, 1.7526f, 1.7447f, 1.7657f, 1.7495f,
+ 1.7775f, 1.7720f, 1.7813f, 1.7813f, 1.8162f, 1.8013f, 1.8023f, 1.8033f,
+ 1.7527f, 1.7331f, 1.7563f, 1.7482f, 1.7610f, 1.7507f, 1.7681f, 1.7613f,
+ 1.7665f, 1.7545f, 1.7828f, 1.7726f, 1.7896f, 1.7999f, 1.7864f, 1.7760f,
+ 1.7613f, 1.7625f, 1.7560f, 1.7577f, 1.7783f, 1.7671f, 1.7810f, 1.7799f,
+ 1.7201f, 1.7068f, 1.7265f, 1.7091f, 1.7793f, 1.7578f, 1.7502f, 1.7455f,
+ 1.7587f, 1.7500f, 1.7525f, 1.7362f, 1.7616f, 1.7572f, 1.7444f, 1.7430f,
+ 1.7509f, 1.7610f, 1.7634f, 1.7612f, 1.7254f, 1.7135f, 1.7321f, 1.7226f,
+ 1.7664f, 1.7624f, 1.7718f, 1.7664f, 1.7457f, 1.7441f, 1.7569f, 1.7530f});
+ std_tensor.reshape_(stats_shape);
+ return {std::move(mean_tensor), std::move(std_tensor)};
+ } else {
+ GGML_ABORT("unknown version %d", version);
+ }
+ }
+
+ sd::Tensor diffusion_to_vae_latents(const sd::Tensor& latents) override {
+ if (sd_version_is_flux2(version)) {
+ int channel_dim = 2;
+ auto [mean_tensor, std_tensor] = get_latents_mean_std(latents, channel_dim);
+ return (latents * std_tensor) / scale_factor + mean_tensor;
+ }
+ return (latents / scale_factor) + shift_factor;
+ }
+
+ sd::Tensor vae_to_diffusion_latents(const sd::Tensor& latents) override {
+ if (sd_version_is_flux2(version)) {
+ int channel_dim = 2;
+ auto [mean_tensor, std_tensor] = get_latents_mean_std(latents, channel_dim);
+ return ((latents - mean_tensor) * scale_factor) / std_tensor;
+ }
+ return (latents - shift_factor) * scale_factor;
+ }
+
+ int get_encoder_output_channels(int input_channels) {
+ return ae.get_encoder_output_channels();
+ }
+
+ void test() {
+ ggml_init_params params;
+ params.mem_size = static_cast(10 * 1024 * 1024); // 10 MB
+ params.mem_buffer = nullptr;
+ params.no_alloc = false;
+
+ ggml_context* ctx = ggml_init(params);
+ GGML_ASSERT(ctx != nullptr);
+
+ {
+ // CPU, x{1, 3, 64, 64}: Pass
+ // CUDA, x{1, 3, 64, 64}: Pass, but sill get wrong result for some image, may be due to interlnal nan
+ // CPU, x{2, 3, 64, 64}: Wrong result
+ // CUDA, x{2, 3, 64, 64}: Wrong result, and different from CPU result
+ sd::Tensor x({64, 64, 3, 2});
+ x.fill_(0.5f);
+ print_sd_tensor(x);
+ sd::Tensor out;
+
+ int64_t t0 = ggml_time_ms();
+ auto out_opt = _compute(8, x, false);
+ int64_t t1 = ggml_time_ms();
+
+ GGML_ASSERT(!out_opt.empty());
+ out = std::move(out_opt);
+ print_sd_tensor(out);
+ LOG_DEBUG("encode test done in %lldms", t1 - t0);
+ }
+
+ if (false) {
+ // CPU, z{1, 4, 8, 8}: Pass
+ // CUDA, z{1, 4, 8, 8}: Pass
+ // CPU, z{3, 4, 8, 8}: Wrong result
+ // CUDA, z{3, 4, 8, 8}: Wrong result, and different from CPU result
+ sd::Tensor z({8, 8, 4, 1});
+ z.fill_(0.5f);
+ print_sd_tensor(z);
+ sd::Tensor out;
+
+ int64_t t0 = ggml_time_ms();
+ auto out_opt = _compute(8, z, true);
+ int64_t t1 = ggml_time_ms();
+
+ GGML_ASSERT(!out_opt.empty());
+ out = std::move(out_opt);
+ print_sd_tensor(out);
+ LOG_DEBUG("decode test done in %lldms", t1 - t0);
+ }
+ };
+};
+
+#endif // __AUTO_ENCODER_KL_HPP__
diff --git a/src/cache_dit.hpp b/src/cache_dit.hpp
index 6fe104da..dad67d45 100644
--- a/src/cache_dit.hpp
+++ b/src/cache_dit.hpp
@@ -8,7 +8,9 @@
#include
#include
+#include "condition_cache_utils.hpp"
#include "ggml_extend.hpp"
+#include "tensor.hpp"
struct DBCacheConfig {
bool enabled = false;
@@ -603,87 +605,6 @@ inline std::vector generate_scm_mask(
return mask;
}
-inline std::vector get_scm_preset(const std::string& preset, int total_steps) {
- struct Preset {
- std::vector<int> compute_bins;
- std::vector<int> cache_bins;
- };
-
- Preset slow = {{8, 3, 3, 2, 1, 1}, {1, 2, 2, 2, 3}};
- Preset medium = {{6, 2, 2, 2, 2, 1}, {1, 3, 3, 3, 3}};
- Preset fast = {{6, 1, 1, 1, 1, 1}, {1, 3, 4, 5, 4}};
- Preset ultra = {{4, 1, 1, 1, 1}, {2, 5, 6, 7}};
-
- Preset* p = nullptr;
- if (preset == "slow" || preset == "s" || preset == "S")
- p = &slow;
- else if (preset == "medium" || preset == "m" || preset == "M")
- p = &medium;
- else if (preset == "fast" || preset == "f" || preset == "F")
- p = &fast;
- else if (preset == "ultra" || preset == "u" || preset == "U")
- p = &ultra;
- else
- return {};
-
- if (total_steps != 28 && total_steps > 0) {
- float scale = static_cast<float>(total_steps) / 28.0f;
- std::vector<int> scaled_compute, scaled_cache;
-
- for (int v : p->compute_bins) {
- scaled_compute.push_back(std::max(1, static_cast<int>(v * scale + 0.5f)));
- }
- for (int v : p->cache_bins) {
- scaled_cache.push_back(std::max(1, static_cast<int>(v * scale + 0.5f)));
- }
-
- return generate_scm_mask(scaled_compute, scaled_cache, total_steps);
- }
-
- return generate_scm_mask(p->compute_bins, p->cache_bins, total_steps);
-}
-
-inline float get_preset_threshold(const std::string& preset) {
- if (preset == "slow" || preset == "s" || preset == "S")
- return 0.20f;
- if (preset == "medium" || preset == "m" || preset == "M")
- return 0.25f;
- if (preset == "fast" || preset == "f" || preset == "F")
- return 0.30f;
- if (preset == "ultra" || preset == "u" || preset == "U")
- return 0.34f;
- return 0.08f;
-}
-
-inline int get_preset_warmup(const std::string& preset) {
- if (preset == "slow" || preset == "s" || preset == "S")
- return 8;
- if (preset == "medium" || preset == "m" || preset == "M")
- return 6;
- if (preset == "fast" || preset == "f" || preset == "F")
- return 6;
- if (preset == "ultra" || preset == "u" || preset == "U")
- return 4;
- return 8;
-}
-
-inline int get_preset_Fn(const std::string& preset) {
- if (preset == "slow" || preset == "s" || preset == "S")
- return 8;
- if (preset == "medium" || preset == "m" || preset == "M")
- return 8;
- if (preset == "fast" || preset == "f" || preset == "F")
- return 6;
- if (preset == "ultra" || preset == "u" || preset == "U")
- return 4;
- return 8;
-}
-
-inline int get_preset_Bn(const std::string& preset) {
- (void)preset;
- return 0;
-}
-
inline void parse_dbcache_options(const std::string& opts, DBCacheConfig& cfg) {
if (opts.empty())
return;
@@ -852,35 +773,37 @@ struct CacheDitConditionState {
return it != cache_diffs.end() && !it->second.diff.empty();
}
- void update_cache(const void* cond, const float* input, const float* output, size_t size) {
+ void update_cache(const void* cond, const sd::Tensor& input, const sd::Tensor& output) {
CacheEntry& entry = cache_diffs[cond];
- entry.diff.resize(size);
- for (size_t i = 0; i < size; i++) {
- entry.diff[i] = output[i] - input[i];
+ if (!sd::store_condition_cache_diff(&entry.diff, input, output)) {
+ entry.prev_input.clear();
+ entry.prev_output.clear();
+ entry.has_prev = false;
+ return;
}
+ size_t size = static_cast<size_t>(output.numel());
+ const float* input_data = input.data();
+ const float* output_data = output.data();
entry.prev_input.resize(size);
entry.prev_output.resize(size);
for (size_t i = 0; i < size; i++) {
- entry.prev_input[i] = input[i];
- entry.prev_output[i] = output[i];
+ entry.prev_input[i] = input_data[i];
+ entry.prev_output[i] = output_data[i];
}
entry.has_prev = true;
}
- void apply_cache(const void* cond, const float* input, float* output, size_t size) {
+ void apply_cache(const void* cond,
+ const sd::Tensor& input,
+ sd::Tensor* output) {
auto it = cache_diffs.find(cond);
if (it == cache_diffs.end() || it->second.diff.empty())
return;
- if (it->second.diff.size() != size)
- return;
-
- for (size_t i = 0; i < size; i++) {
- output[i] = input[i] + it->second.diff[i];
- }
+ sd::apply_condition_cache_diff(it->second.diff, input, output);
}
- bool before_condition(const void* cond, struct ggml_tensor* input, struct ggml_tensor* output, float sigma, int step_index) {
+ bool before_condition(const void* cond, const sd::Tensor& input, sd::Tensor* output, float sigma, int step_index) {
if (!enabled() || step_index < 0)
return false;
@@ -900,8 +823,7 @@ struct CacheDitConditionState {
if (skip_current_step) {
if (has_cache(cond)) {
- apply_cache(cond, (float*)input->data, (float*)output->data,
- static_cast<size_t>(ggml_nelements(output)));
+ apply_cache(cond, input, output);
return true;
}
return false;
@@ -914,13 +836,13 @@ struct CacheDitConditionState {
if (it == cache_diffs.end() || !it->second.has_prev)
return false;
- size_t ne = static_cast<size_t>(ggml_nelements(input));
+ size_t ne = static_cast<size_t>(input.numel());
if (it->second.prev_input.size() != ne)
return false;
- float* input_data = (float*)input->data;
- float diff = CacheDitState::calculate_residual_diff(
- it->second.prev_input.data(), input_data, ne);
+ const float* input_data = input.data();
+ float diff = CacheDitState::calculate_residual_diff(
+ it->second.prev_input.data(), input_data, ne);
float effective_threshold = config.residual_diff_threshold;
if (config.Fn_compute_blocks > 0) {
@@ -940,7 +862,7 @@ struct CacheDitConditionState {
cached_steps.push_back(current_step_index);
continuous_cached_steps++;
accumulated_residual_diff += diff;
- apply_cache(cond, input_data, (float*)output->data, ne);
+ apply_cache(cond, input, output);
return true;
}
@@ -948,15 +870,14 @@ struct CacheDitConditionState {
return false;
}
- void after_condition(const void* cond, struct ggml_tensor* input, struct ggml_tensor* output) {
+ void after_condition(const void* cond, const sd::Tensor& input, const sd::Tensor& output) {
if (!step_is_active())
return;
- size_t ne = static_cast<size_t>(ggml_nelements(output));
- update_cache(cond, (float*)input->data, (float*)output->data, ne);
+ update_cache(cond, input, output);
if (cond == anchor_condition && taylor_config.enabled) {
- taylor_state.update_derivatives((float*)output->data, ne, current_step_index);
+ taylor_state.update_derivatives(output.data(), static_cast<size_t>(output.numel()), current_step_index);
}
}
diff --git a/src/clip.hpp b/src/clip.hpp
index adecd4d2..8f2ac064 100644
--- a/src/clip.hpp
+++ b/src/clip.hpp
@@ -473,7 +473,7 @@ public:
}
}
- struct ggml_tensor* forward(GGMLRunnerContext* ctx, struct ggml_tensor* x) {
+ ggml_tensor* forward(GGMLRunnerContext* ctx, ggml_tensor* x) {
// x: [N, n_token, d_model]
auto fc1 = std::dynamic_pointer_cast(blocks["fc1"]);
auto fc2 = std::dynamic_pointer_cast(blocks["fc2"]);
@@ -511,7 +511,7 @@ public:
blocks["mlp"] = std::shared_ptr(new CLIPMLP(d_model, intermediate_size));
}
- struct ggml_tensor* forward(GGMLRunnerContext* ctx, struct ggml_tensor* x, struct ggml_tensor* mask = nullptr) {
+ ggml_tensor* forward(GGMLRunnerContext* ctx, ggml_tensor* x, ggml_tensor* mask = nullptr) {
// x: [N, n_token, d_model]
auto self_attn = std::dynamic_pointer_cast(blocks["self_attn"]);
auto layer_norm1 = std::dynamic_pointer_cast(blocks["layer_norm1"]);
@@ -541,10 +541,10 @@ public:
}
}
- struct ggml_tensor* forward(GGMLRunnerContext* ctx,
- struct ggml_tensor* x,
- struct ggml_tensor* mask = nullptr,
- int clip_skip = -1) {
+ ggml_tensor* forward(GGMLRunnerContext* ctx,
+ ggml_tensor* x,
+ ggml_tensor* mask = nullptr,
+ int clip_skip = -1) {
// x: [N, n_token, d_model]
int layer_idx = n_layer - 1;
// LOG_DEBUG("clip_skip %d", clip_skip);
@@ -573,7 +573,7 @@ protected:
int64_t num_positions;
bool force_clip_f32;
- void init_params(struct ggml_context* ctx, const String2TensorStorage& tensor_storage_map = {}, const std::string prefix = "") override {
+ void init_params(ggml_context* ctx, const String2TensorStorage& tensor_storage_map = {}, const std::string prefix = "") override {
enum ggml_type token_wtype = GGML_TYPE_F32;
if (!force_clip_f32) {
token_wtype = get_type(prefix + "token_embedding.weight", tensor_storage_map, GGML_TYPE_F32);
@@ -597,13 +597,13 @@ public:
force_clip_f32(force_clip_f32) {
}
- struct ggml_tensor* get_token_embed_weight() {
+ ggml_tensor* get_token_embed_weight() {
return params["token_embedding.weight"];
}
- struct ggml_tensor* forward(GGMLRunnerContext* ctx,
- struct ggml_tensor* input_ids,
- struct ggml_tensor* custom_embed_weight) {
+ ggml_tensor* forward(GGMLRunnerContext* ctx,
+ ggml_tensor* input_ids,
+ ggml_tensor* custom_embed_weight) {
// input_ids: [N, n_token]
auto token_embed_weight = params["token_embedding.weight"];
auto position_embed_weight = params["position_embedding.weight"];
@@ -630,7 +630,7 @@ protected:
int num_patches;
int64_t num_positions;
- void init_params(struct ggml_context* ctx, const String2TensorStorage& tensor_storage_map = {}, const std::string prefix = "") override {
+ void init_params(ggml_context* ctx, const String2TensorStorage& tensor_storage_map = {}, const std::string prefix = "") override {
enum ggml_type patch_wtype = GGML_TYPE_F16;
enum ggml_type class_wtype = GGML_TYPE_F32;
enum ggml_type position_wtype = GGML_TYPE_F32;
@@ -653,7 +653,7 @@ public:
num_positions = num_patches + 1;
}
- struct ggml_tensor* forward(GGMLRunnerContext* ctx, struct ggml_tensor* pixel_values) {
+ ggml_tensor* forward(GGMLRunnerContext* ctx, ggml_tensor* pixel_values) {
// pixel_values: [N, num_channels, image_size, image_size]
// return: [N, num_positions, embed_dim]
GGML_ASSERT(pixel_values->ne[0] == image_size && pixel_values->ne[1] == image_size && pixel_values->ne[2] == num_channels);
@@ -663,20 +663,20 @@ public:
auto position_embed_weight = params["position_embedding.weight"];
// concat(patch_embedding, class_embedding) + position_embedding
- struct ggml_tensor* patch_embedding;
+ ggml_tensor* patch_embedding;
int64_t N = pixel_values->ne[3];
patch_embedding = ggml_ext_conv_2d(ctx->ggml_ctx, pixel_values, patch_embed_weight, nullptr, patch_size, patch_size); // [N, embed_dim, image_size // pacht_size, image_size // pacht_size]
patch_embedding = ggml_reshape_3d(ctx->ggml_ctx, patch_embedding, num_patches, embed_dim, N); // [N, embed_dim, num_patches]
patch_embedding = ggml_cont(ctx->ggml_ctx, ggml_permute(ctx->ggml_ctx, patch_embedding, 1, 0, 2, 3)); // [N, num_patches, embed_dim]
patch_embedding = ggml_reshape_4d(ctx->ggml_ctx, patch_embedding, 1, embed_dim, num_patches, N); // [N, num_patches, embed_dim, 1]
- struct ggml_tensor* class_embedding = ggml_new_tensor_2d(ctx->ggml_ctx, GGML_TYPE_F32, embed_dim, N);
- class_embedding = ggml_repeat(ctx->ggml_ctx, class_embed_weight, class_embedding); // [N, embed_dim]
- class_embedding = ggml_reshape_4d(ctx->ggml_ctx, class_embedding, 1, embed_dim, 1, N); // [N, 1, embed_dim, 1]
+ ggml_tensor* class_embedding = ggml_new_tensor_2d(ctx->ggml_ctx, GGML_TYPE_F32, embed_dim, N);
+ class_embedding = ggml_repeat(ctx->ggml_ctx, class_embed_weight, class_embedding); // [N, embed_dim]
+ class_embedding = ggml_reshape_4d(ctx->ggml_ctx, class_embedding, 1, embed_dim, 1, N); // [N, 1, embed_dim, 1]
- struct ggml_tensor* x = ggml_concat(ctx->ggml_ctx, class_embedding, patch_embedding, 2); // [N, num_positions, embed_dim, 1]
- x = ggml_reshape_3d(ctx->ggml_ctx, x, embed_dim, num_positions, N); // [N, num_positions, embed_dim]
- x = ggml_add(ctx->ggml_ctx, x, position_embed_weight);
+ ggml_tensor* x = ggml_concat(ctx->ggml_ctx, class_embedding, patch_embedding, 2); // [N, num_positions, embed_dim, 1]
+ x = ggml_reshape_3d(ctx->ggml_ctx, x, embed_dim, num_positions, N); // [N, num_positions, embed_dim]
+ x = ggml_add(ctx->ggml_ctx, x, position_embed_weight);
return x; // [N, num_positions, embed_dim]
}
};
@@ -693,7 +693,7 @@ enum CLIPVersion {
class CLIPTextModel : public GGMLBlock {
protected:
- void init_params(struct ggml_context* ctx, const String2TensorStorage& tensor_storage_map = {}, const std::string prefix = "") override {
+ void init_params(ggml_context* ctx, const String2TensorStorage& tensor_storage_map = {}, const std::string prefix = "") override {
if (version == OPEN_CLIP_VIT_BIGG_14) {
enum ggml_type wtype = GGML_TYPE_F32;
params["text_projection"] = ggml_new_tensor_2d(ctx, wtype, projection_dim, hidden_size);
@@ -734,18 +734,18 @@ public:
blocks["final_layer_norm"] = std::shared_ptr(new LayerNorm(hidden_size));
}
- struct ggml_tensor* get_token_embed_weight() {
+ ggml_tensor* get_token_embed_weight() {
auto embeddings = std::dynamic_pointer_cast(blocks["embeddings"]);
return embeddings->get_token_embed_weight();
}
- struct ggml_tensor* forward(GGMLRunnerContext* ctx,
- struct ggml_tensor* input_ids,
- struct ggml_tensor* tkn_embeddings,
- struct ggml_tensor* mask = nullptr,
- size_t max_token_idx = 0,
- bool return_pooled = false,
- int clip_skip = -1) {
+ ggml_tensor* forward(GGMLRunnerContext* ctx,
+ ggml_tensor* input_ids,
+ ggml_tensor* tkn_embeddings,
+ ggml_tensor* mask = nullptr,
+ size_t max_token_idx = 0,
+ bool return_pooled = false,
+ int clip_skip = -1) {
// input_ids: [N, n_token]
auto embeddings = std::dynamic_pointer_cast(blocks["embeddings"]);
auto encoder = std::dynamic_pointer_cast(blocks["encoder"]);
@@ -804,10 +804,10 @@ public:
blocks["post_layernorm"] = std::shared_ptr(new LayerNorm(hidden_size));
}
- struct ggml_tensor* forward(GGMLRunnerContext* ctx,
- struct ggml_tensor* pixel_values,
- bool return_pooled = true,
- int clip_skip = -1) {
+ ggml_tensor* forward(GGMLRunnerContext* ctx,
+ ggml_tensor* pixel_values,
+ bool return_pooled = true,
+ int clip_skip = -1) {
// pixel_values: [N, num_channels, image_size, image_size]
auto embeddings = std::dynamic_pointer_cast(blocks["embeddings"]);
auto pre_layernorm = std::dynamic_pointer_cast(blocks["pre_layernorm"]);
@@ -839,7 +839,7 @@ protected:
int64_t out_features;
bool transpose_weight;
- void init_params(struct ggml_context* ctx, const String2TensorStorage& tensor_storage_map = {}, const std::string prefix = "") override {
+ void init_params(ggml_context* ctx, const String2TensorStorage& tensor_storage_map = {}, const std::string prefix = "") override {
enum ggml_type wtype = get_type(prefix + "weight", tensor_storage_map, GGML_TYPE_F32);
if (transpose_weight) {
params["weight"] = ggml_new_tensor_2d(ctx, wtype, out_features, in_features);
@@ -856,8 +856,8 @@ public:
out_features(out_features),
transpose_weight(transpose_weight) {}
- struct ggml_tensor* forward(GGMLRunnerContext* ctx, struct ggml_tensor* x) override {
- struct ggml_tensor* w = params["weight"];
+ ggml_tensor* forward(GGMLRunnerContext* ctx, ggml_tensor* x) override {
+ ggml_tensor* w = params["weight"];
if (transpose_weight) {
w = ggml_cont(ctx->ggml_ctx, ggml_transpose(ctx->ggml_ctx, w));
}
@@ -886,10 +886,10 @@ public:
blocks["visual_projection"] = std::shared_ptr(new CLIPProjection(hidden_size, projection_dim, transpose_proj_w));
}
- struct ggml_tensor* forward(GGMLRunnerContext* ctx,
- struct ggml_tensor* pixel_values,
- bool return_pooled = true,
- int clip_skip = -1) {
+ ggml_tensor* forward(GGMLRunnerContext* ctx,
+ ggml_tensor* pixel_values,
+ bool return_pooled = true,
+ int clip_skip = -1) {
// pixel_values: [N, num_channels, image_size, image_size]
// return: [N, projection_dim] if return_pooled else [N, n_token, hidden_size]
auto vision_model = std::dynamic_pointer_cast(blocks["vision_model"]);
@@ -936,17 +936,17 @@ struct CLIPTextModelRunner : public GGMLRunner {
return "clip";
}
- void get_param_tensors(std::map<std::string, struct ggml_tensor*>& tensors, const std::string prefix) {
+ void get_param_tensors(std::map<std::string, ggml_tensor*>& tensors, const std::string prefix) {
model.get_param_tensors(tensors, prefix);
}
- struct ggml_tensor* forward(GGMLRunnerContext* ctx,
- struct ggml_tensor* input_ids,
- struct ggml_tensor* embeddings,
- struct ggml_tensor* mask,
- size_t max_token_idx = 0,
- bool return_pooled = false,
- int clip_skip = -1) {
+ ggml_tensor* forward(GGMLRunnerContext* ctx,
+ ggml_tensor* input_ids,
+ ggml_tensor* embeddings,
+ ggml_tensor* mask,
+ size_t max_token_idx = 0,
+ bool return_pooled = false,
+ int clip_skip = -1) {
size_t N = input_ids->ne[1];
size_t n_token = input_ids->ne[0];
if (input_ids->ne[0] > model.n_token) {
@@ -957,17 +957,16 @@ struct CLIPTextModelRunner : public GGMLRunner {
return model.forward(ctx, input_ids, embeddings, mask, max_token_idx, return_pooled, clip_skip);
}
- struct ggml_cgraph* build_graph(struct ggml_tensor* input_ids,
- int num_custom_embeddings = 0,
- void* custom_embeddings_data = nullptr,
- size_t max_token_idx = 0,
- bool return_pooled = false,
- int clip_skip = -1) {
- struct ggml_cgraph* gf = new_graph_custom(2048);
+ ggml_cgraph* build_graph(const sd::Tensor& input_ids_tensor,
+ int num_custom_embeddings = 0,
+ void* custom_embeddings_data = nullptr,
+ size_t max_token_idx = 0,
+ bool return_pooled = false,
+ int clip_skip = -1) {
+ ggml_cgraph* gf = new_graph_custom(2048);
+ ggml_tensor* input_ids = make_input(input_ids_tensor);
- input_ids = to_backend(input_ids);
-
- struct ggml_tensor* embeddings = nullptr;
+ ggml_tensor* embeddings = nullptr;
if (num_custom_embeddings > 0 && custom_embeddings_data != nullptr) {
auto token_embed_weight = model.get_token_embed_weight();
@@ -997,26 +996,28 @@ struct CLIPTextModelRunner : public GGMLRunner {
auto runner_ctx = get_context();
- struct ggml_tensor* hidden_states = forward(&runner_ctx, input_ids, embeddings, attention_mask, max_token_idx, return_pooled, clip_skip);
+ ggml_tensor* hidden_states = forward(&runner_ctx, input_ids, embeddings, attention_mask, max_token_idx, return_pooled, clip_skip);
ggml_build_forward_expand(gf, hidden_states);
return gf;
}
- bool compute(const int n_threads,
- struct ggml_tensor* input_ids,
- int num_custom_embeddings,
- void* custom_embeddings_data,
- size_t max_token_idx,
- bool return_pooled,
- int clip_skip,
- ggml_tensor** output,
- ggml_context* output_ctx = nullptr) {
- auto get_graph = [&]() -> struct ggml_cgraph* {
+ sd::Tensor compute(const int n_threads,
+ const sd::Tensor& input_ids,
+ int num_custom_embeddings,
+ void* custom_embeddings_data,
+ size_t max_token_idx,
+ bool return_pooled,
+ int clip_skip) {
+ auto get_graph = [&]() -> ggml_cgraph* {
return build_graph(input_ids, num_custom_embeddings, custom_embeddings_data, max_token_idx, return_pooled, clip_skip);
};
- return GGMLRunner::compute(get_graph, n_threads, true, output, output_ctx);
+ auto result = GGMLRunner::compute(get_graph, n_threads, true);
+ if (return_pooled) {
+ return take_or_empty(std::move(result));
+ }
+ return restore_trailing_singleton_dims(std::move(result), 3);
}
};
diff --git a/src/common_block.hpp b/src/common_block.hpp
index 435afa4f..2cef389a 100644
--- a/src/common_block.hpp
+++ b/src/common_block.hpp
@@ -23,7 +23,7 @@ public:
}
}
- struct ggml_tensor* forward(GGMLRunnerContext* ctx, struct ggml_tensor* x) {
+ ggml_tensor* forward(GGMLRunnerContext* ctx, ggml_tensor* x) {
// x: [N, channels, h, w]
if (vae_downsample) {
auto conv = std::dynamic_pointer_cast(blocks["conv"]);
@@ -52,7 +52,7 @@ public:
blocks["conv"] = std::shared_ptr(new Conv2d(channels, out_channels, {3, 3}, {1, 1}, {1, 1}));
}
- struct ggml_tensor* forward(GGMLRunnerContext* ctx, struct ggml_tensor* x) {
+ ggml_tensor* forward(GGMLRunnerContext* ctx, ggml_tensor* x) {
// x: [N, channels, h, w]
auto conv = std::dynamic_pointer_cast(blocks["conv"]);
@@ -121,7 +121,7 @@ public:
}
}
- virtual struct ggml_tensor* forward(GGMLRunnerContext* ctx, struct ggml_tensor* x, struct ggml_tensor* emb = nullptr) {
+ virtual ggml_tensor* forward(GGMLRunnerContext* ctx, ggml_tensor* x, ggml_tensor* emb = nullptr) {
// For dims==3, we reduce dimension from 5d to 4d by merging h and w, in order not to change ggml
// [N, c, t, h, w] => [N, c, t, h * w]
// x: [N, channels, h, w] if dims == 2 else [N, channels, t, h, w]
@@ -188,7 +188,7 @@ public:
blocks["proj"] = std::shared_ptr(new Linear(dim_in, dim_out * 2));
}
- struct ggml_tensor* forward(GGMLRunnerContext* ctx, struct ggml_tensor* x) override {
+ ggml_tensor* forward(GGMLRunnerContext* ctx, ggml_tensor* x) override {
// x: [ne3, ne2, ne1, dim_in]
// return: [ne3, ne2, ne1, dim_out]
auto proj = std::dynamic_pointer_cast(blocks["proj"]);
@@ -214,7 +214,7 @@ public:
blocks["proj"] = std::shared_ptr(new Linear(dim_in, dim_out, bias));
}
- struct ggml_tensor* forward(GGMLRunnerContext* ctx, struct ggml_tensor* x) override {
+ ggml_tensor* forward(GGMLRunnerContext* ctx, ggml_tensor* x) override {
// x: [ne3, ne2, ne1, dim_in]
// return: [ne3, ne2, ne1, dim_out]
auto proj = std::dynamic_pointer_cast(blocks["proj"]);
@@ -258,7 +258,7 @@ public:
blocks["net.2"] = std::shared_ptr(new Linear(inner_dim, dim_out, true, false, force_prec_f32, scale));
}
- struct ggml_tensor* forward(GGMLRunnerContext* ctx, struct ggml_tensor* x) {
+ ggml_tensor* forward(GGMLRunnerContext* ctx, ggml_tensor* x) {
// x: [ne3, ne2, ne1, dim]
// return: [ne3, ne2, ne1, dim_out]
@@ -297,9 +297,9 @@ public:
// to_out_1 is nn.Dropout(), skip for inference
}
- struct ggml_tensor* forward(GGMLRunnerContext* ctx,
- struct ggml_tensor* x,
- struct ggml_tensor* context) {
+ ggml_tensor* forward(GGMLRunnerContext* ctx,
+ ggml_tensor* x,
+ ggml_tensor* context) {
// x: [N, n_token, query_dim]
// context: [N, n_context, context_dim]
// return: [N, n_token, query_dim]
@@ -355,9 +355,9 @@ public:
}
}
- struct ggml_tensor* forward(GGMLRunnerContext* ctx,
- struct ggml_tensor* x,
- struct ggml_tensor* context) {
+ ggml_tensor* forward(GGMLRunnerContext* ctx,
+ ggml_tensor* x,
+ ggml_tensor* context) {
// x: [N, n_token, query_dim]
// context: [N, n_context, context_dim]
// return: [N, n_token, query_dim]
@@ -406,7 +406,7 @@ protected:
int64_t context_dim = 768; // hidden_size, 1024 for VERSION_SD2
bool use_linear = false;
- void init_params(struct ggml_context* ctx, const String2TensorStorage& tensor_storage_map = {}, const std::string prefix = "") {
+ void init_params(ggml_context* ctx, const String2TensorStorage& tensor_storage_map = {}, const std::string prefix = "") {
auto iter = tensor_storage_map.find(prefix + "proj_out.weight");
if (iter != tensor_storage_map.end()) {
int64_t inner_dim = n_head * d_head;
@@ -456,9 +456,9 @@ public:
}
}
- virtual struct ggml_tensor* forward(GGMLRunnerContext* ctx,
- struct ggml_tensor* x,
- struct ggml_tensor* context) {
+ virtual ggml_tensor* forward(GGMLRunnerContext* ctx,
+ ggml_tensor* x,
+ ggml_tensor* context) {
// x: [N, in_channels, h, w]
// context: [N, max_position(aka n_token), hidden_size(aka context_dim)]
auto norm = std::dynamic_pointer_cast(blocks["norm"]);
@@ -510,7 +510,7 @@ public:
class AlphaBlender : public GGMLBlock {
protected:
- void init_params(struct ggml_context* ctx, const String2TensorStorage& tensor_storage_map = {}, std::string prefix = "") override {
+ void init_params(ggml_context* ctx, const String2TensorStorage& tensor_storage_map = {}, std::string prefix = "") override {
// Get the type of the "mix_factor" tensor from the input tensors map with the specified prefix
enum ggml_type wtype = GGML_TYPE_F32;
params["mix_factor"] = ggml_new_tensor_1d(ctx, wtype, 1);
@@ -530,9 +530,9 @@ public:
// since mix_factor.shape is [1,], we don't need rearrange using rearrange_pattern
}
- struct ggml_tensor* forward(GGMLRunnerContext* ctx,
- struct ggml_tensor* x_spatial,
- struct ggml_tensor* x_temporal) {
+ ggml_tensor* forward(GGMLRunnerContext* ctx,
+ ggml_tensor* x_spatial,
+ ggml_tensor* x_temporal) {
// image_only_indicator is always tensor([0.])
float alpha = get_alpha();
auto x = ggml_add(ctx->ggml_ctx,
@@ -555,10 +555,10 @@ public:
blocks["time_mixer"] = std::shared_ptr(new AlphaBlender());
}
- struct ggml_tensor* forward(GGMLRunnerContext* ctx,
- struct ggml_tensor* x,
- struct ggml_tensor* emb,
- int num_video_frames) {
+ ggml_tensor* forward(GGMLRunnerContext* ctx,
+ ggml_tensor* x,
+ ggml_tensor* emb,
+ int num_video_frames) {
// x: [N, channels, h, w] aka [b*t, channels, h, w]
// emb: [N, emb_channels] aka [b*t, emb_channels]
// image_only_indicator is always tensor([0.])
diff --git a/src/common_dit.hpp b/src/common_dit.hpp
index 0e6f0f08..30141d42 100644
--- a/src/common_dit.hpp
+++ b/src/common_dit.hpp
@@ -4,11 +4,11 @@
#include "ggml_extend.hpp"
namespace DiT {
- ggml_tensor* patchify(ggml_context* ctx,
- ggml_tensor* x,
- int pw,
- int ph,
- bool patch_last = true) {
+ inline ggml_tensor* patchify(ggml_context* ctx,
+ ggml_tensor* x,
+ int pw,
+ int ph,
+ bool patch_last = true) {
// x: [N, C, H, W]
// return: [N, h*w, C*ph*pw] if patch_last else [N, h*w, ph*pw*C]
int64_t N = x->ne[3];
@@ -33,13 +33,13 @@ namespace DiT {
return x;
}
- ggml_tensor* unpatchify(ggml_context* ctx,
- ggml_tensor* x,
- int64_t h,
- int64_t w,
- int ph,
- int pw,
- bool patch_last = true) {
+ inline ggml_tensor* unpatchify(ggml_context* ctx,
+ ggml_tensor* x,
+ int64_t h,
+ int64_t w,
+ int ph,
+ int pw,
+ bool patch_last = true) {
// x: [N, h*w, C*ph*pw] if patch_last else [N, h*w, ph*pw*C]
// return: [N, C, H, W]
int64_t N = x->ne[2];
@@ -64,10 +64,10 @@ namespace DiT {
return x;
}
- ggml_tensor* pad_to_patch_size(GGMLRunnerContext* ctx,
- ggml_tensor* x,
- int ph,
- int pw) {
+ inline ggml_tensor* pad_to_patch_size(GGMLRunnerContext* ctx,
+ ggml_tensor* x,
+ int ph,
+ int pw) {
int64_t W = x->ne[0];
int64_t H = x->ne[1];
@@ -77,23 +77,23 @@ namespace DiT {
return x;
}
- ggml_tensor* pad_and_patchify(GGMLRunnerContext* ctx,
- ggml_tensor* x,
- int ph,
- int pw,
- bool patch_last = true) {
+ inline ggml_tensor* pad_and_patchify(GGMLRunnerContext* ctx,
+ ggml_tensor* x,
+ int ph,
+ int pw,
+ bool patch_last = true) {
x = pad_to_patch_size(ctx, x, ph, pw);
x = patchify(ctx->ggml_ctx, x, ph, pw, patch_last);
return x;
}
- ggml_tensor* unpatchify_and_crop(ggml_context* ctx,
- ggml_tensor* x,
- int64_t H,
- int64_t W,
- int ph,
- int pw,
- bool patch_last = true) {
+ inline ggml_tensor* unpatchify_and_crop(ggml_context* ctx,
+ ggml_tensor* x,
+ int64_t H,
+ int64_t W,
+ int ph,
+ int pw,
+ bool patch_last = true) {
int pad_h = (ph - H % ph) % ph;
int pad_w = (pw - W % pw) % pw;
int64_t h = ((H + pad_h) / ph);
@@ -105,4 +105,4 @@ namespace DiT {
}
} // namespace DiT
-#endif // __COMMON_DIT_HPP__
\ No newline at end of file
+#endif // __COMMON_DIT_HPP__
diff --git a/src/condition_cache_utils.hpp b/src/condition_cache_utils.hpp
new file mode 100644
index 00000000..903d64e3
--- /dev/null
+++ b/src/condition_cache_utils.hpp
@@ -0,0 +1,64 @@
+#ifndef __CONDITION_CACHE_UTILS_HPP__
+#define __CONDITION_CACHE_UTILS_HPP__
+
+#include <vector>
+
+#include "tensor.hpp"
+
+namespace sd {
+
+ inline bool store_condition_cache_diff(std::vector<float>* diff,
+ const sd::Tensor& input,
+ const sd::Tensor& output) {
+ if (diff == nullptr || input.empty() || output.empty()) {
+ return false;
+ }
+
+ size_t input_size = static_cast<size_t>(input.numel());
+ size_t output_size = static_cast<size_t>(output.numel());
+ if (input_size == 0 || input_size != output_size) {
+ diff->clear();
+ return false;
+ }
+
+ const float* input_data = input.data();
+ const float* output_data = output.data();
+ if (input_data == nullptr || output_data == nullptr) {
+ diff->clear();
+ return false;
+ }
+
+ diff->resize(output_size);
+ for (size_t i = 0; i < output_size; ++i) {
+ (*diff)[i] = output_data[i] - input_data[i];
+ }
+ return true;
+ }
+
+ inline bool apply_condition_cache_diff(const std::vector<float>& diff,
+ const sd::Tensor& input,
+ sd::Tensor* output) {
+ if (output == nullptr || input.empty() || diff.empty()) {
+ return false;
+ }
+
+ size_t input_size = static_cast<size_t>(input.numel());
+ if (input_size == 0 || diff.size() != input_size) {
+ return false;
+ }
+
+ *output = input;
+ float* output_data = output->data();
+ if (output_data == nullptr) {
+ return false;
+ }
+
+ for (size_t i = 0; i < input_size; ++i) {
+ output_data[i] += diff[i];
+ }
+ return true;
+ }
+
+} // namespace sd
+
+#endif // __CONDITION_CACHE_UTILS_HPP__
diff --git a/src/conditioner.hpp b/src/conditioner.hpp
index d4a3146b..05167cfd 100644
--- a/src/conditioner.hpp
+++ b/src/conditioner.hpp
@@ -1,53 +1,97 @@
#ifndef __CONDITIONER_HPP__
#define __CONDITIONER_HPP__
+#include
+
#include "clip.hpp"
#include "llm.hpp"
#include "t5.hpp"
+#include "tensor_ggml.hpp"
struct SDCondition {
- struct ggml_tensor* c_crossattn = nullptr; // aka context
- struct ggml_tensor* c_vector = nullptr; // aka y
- struct ggml_tensor* c_concat = nullptr;
+ sd::Tensor c_crossattn;
+ sd::Tensor c_vector;
+ sd::Tensor c_concat;
+ sd::Tensor c_t5_ids;
+ sd::Tensor c_t5_weights;
- std::vector extra_c_crossattns;
+ std::vector> extra_c_crossattns;
SDCondition() = default;
- SDCondition(struct ggml_tensor* c_crossattn,
- struct ggml_tensor* c_vector,
- struct ggml_tensor* c_concat,
- const std::vector& extra_c_crossattns = {})
- : c_crossattn(c_crossattn), c_vector(c_vector), c_concat(c_concat), extra_c_crossattns(extra_c_crossattns) {}
+
+ SDCondition(sd::Tensor c_crossattn,
+ sd::Tensor c_vector,
+ sd::Tensor