2026-06-24 15:16:38 +00:00
82 changed files with 3493 additions and 6254 deletions
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@ -204,12 +204,6 @@ if(SD_WEBM)
    endif()
 endif()

-if (SD_RPC)
-    message("-- Use RPC as backend stable-diffusion")
-    set(GGML_RPC ON)
-    add_definitions(-DSD_USE_RPC)
-endif ()
-
 set(SD_LIB stable-diffusion)

 file(GLOB SD_LIB_SOURCES CONFIGURE_DEPENDS
--- a/README.md
+++ b/README.md
@ -34,8 +34,8 @@ API and command-line option may change frequently.***
 - Super lightweight and without external dependencies
 - Supported models
  - Image Models
-    - [SD1.x, SD2.x, SD-Turbo](./docs/sd.md)
-    - [SDXL, SDXL-Turbo](./docs/sd.md)
+    - SD1.x, SD2.x, [SD-Turbo](https://huggingface.co/stabilityai/sd-turbo)
+    - SDXL, [SDXL-Turbo](https://huggingface.co/stabilityai/sdxl-turbo)
    - [Some SD1.x and SDXL distilled models](./docs/distilled_sd.md)
    - [SD3/SD3.5](./docs/sd3.md)
    - [FLUX.1-dev/FLUX.1-schnell](./docs/flux.md)
@ -50,23 +50,21 @@ API and command-line option may change frequently.***
    - [Ovis-Image](./docs/ovis_image.md)
    - [Anima](./docs/anima.md)
    - [ERNIE-Image](./docs/ernie_image.md)
-    - [Boogu Image](./docs/boogu_image.md)
    - [HiDream-O1-Image](./docs/hidream_o1_image.md)
    - [Ideogram4](./docs/ideogram4.md)
  - Image Edit Models
    - [FLUX.1-Kontext-dev](./docs/kontext.md)
    - [Qwen Image Edit series](./docs/qwen_image_edit.md)
    - [LongCat Image Edit](./docs/longcat_image.md)
-    - [Boogu Image Edit](./docs/boogu_image.md)
  - Video Models
    - [Wan2.1/Wan2.2](./docs/wan.md)
    - [LTX-2.3](./docs/ltx2.md)
-  - [PhotoMaker](./docs/photo_maker.md) support.
+  - [PhotoMaker](https://github.com/TencentARC/PhotoMaker) support.
  - Control Net support with SD 1.5
  - LoRA support, same as [stable-diffusion-webui](https://github.com/AUTOMATIC1111/stable-diffusion-webui/wiki/Features#lora)
  - Latent Consistency Models support (LCM/LCM-LoRA)
-  - Faster and memory efficient latent decoding with [TAESD](./docs/taesd.md)
-  - Upscale images generated with [ESRGAN](./docs/esrgan.md)
+  - Faster and memory efficient latent decoding with [TAESD](https://github.com/madebyollin/taesd)
+  - Upscale images generated with [ESRGAN](https://github.com/xinntao/Real-ESRGAN)
 - Supported backends
  - CPU (AVX, AVX2 and AVX512 support for x86 architectures)
  - CUDA
@ -135,9 +133,28 @@ For runtime and parameter backend placement, see the [backend selection guide](.
 ## More Guides

 - [Backend selection](./docs/backend.md)
- [RPC](./docs/rpc.md)
+- [SD1.x/SD2.x/SDXL](./docs/sd.md)
+- [SD3/SD3.5](./docs/sd3.md)
+- [FLUX.1-dev/FLUX.1-schnell](./docs/flux.md)
+- [FLUX.2-dev/FLUX.2-klein](./docs/flux2.md)
+- [FLUX.1-Kontext-dev](./docs/kontext.md)
+- [Chroma](./docs/chroma.md)
+- [🔥Qwen Image](./docs/qwen_image.md)
+- [🔥Qwen Image Edit series](./docs/qwen_image_edit.md)
+- [🔥Wan2.1/Wan2.2](./docs/wan.md)
+- [🔥LTX-2.3](./docs/ltx2.md)
+- [🔥Z-Image](./docs/z_image.md)
+- [Ovis-Image](./docs/ovis_image.md)
+- [Anima](./docs/anima.md)
+- [ERNIE-Image](./docs/ernie_image.md)
+- [HiDream-O1-Image](./docs/hidream_o1_image.md)
+- [Lens](./docs/lens.md)
+- [LongCat Image / LongCat Image Edit](./docs/longcat_image.md)
 - [LoRA](./docs/lora.md)
 - [LCM/LCM-LoRA](./docs/lcm.md)
+- [Using PhotoMaker to personalize image generation](./docs/photo_maker.md)
+- [Using ESRGAN to upscale results](./docs/esrgan.md)
+- [Using TAESD for faster decoding](./docs/taesd.md)
 - [Docker](./docs/docker.md)
 - [Quantization and GGUF](./docs/quantization_and_gguf.md)
 - [Inference acceleration via caching](./docs/caching.md)
--- a/assets/boogu/edit_example.png
+++ b/assets/boogu/edit_example.png
--- a/assets/boogu/example.png
+++ b/assets/boogu/example.png
--- a/docs/backend.md
+++ b/docs/backend.md
@ -3,7 +3,7 @@
 `stable-diffusion.cpp` has two backend assignments:

 - `--backend` selects the runtime backend used to execute model graphs.
- `--params-backend` selects where model parameters are kept.
+- `--params-backend` selects the backend used to allocate model parameters.

 If `--params-backend` is not set, parameters use the same backend as their module runtime backend.

@ -29,20 +29,6 @@ The same syntax is used for parameter placement:
 sd-cli -m model.safetensors -p "a cat" --backend cuda0 --params-backend te=cpu,vae=cpu
 ```

-`--params-backend` also accepts the special value `disk`:
-
-```shell
-sd-cli -m model.safetensors -p "a cat" --backend cuda0 --params-backend disk
-```
-
-`--max-vram` can target resolved backend/device names:
-
-```shell
-sd-cli -m model.safetensors -p "a cat" --backend diffusion=cuda0,vae=vulkan0 --max-vram cuda0=6,vulkan0=2
-```
-
-The budget applies to every module running on that backend.
-
 Module names are case-insensitive. Hyphens and underscores in module names are ignored, so `clip_vision`, `clip-vision`, and `clipvision` are equivalent.

 `all=`, `default=`, and `*=` can be used to set the default backend inside a mixed assignment:
@ -78,11 +64,9 @@ The special values `auto`, `default`, and an empty backend name select the defau

 The special value `gpu` selects the first GPU backend, falling back to the first integrated GPU backend.

-The special value `disk` is accepted only by `--params-backend`. `--backend disk` is invalid because `disk` is a parameter residency mode, not a runtime compute backend.
-
 ## Runtime backend vs. parameter backend

-The runtime backend controls where graph execution runs. The parameter backend controls where model weights are allocated or whether they are reloaded from disk on demand.
+The runtime backend controls where graph execution runs. The parameter backend controls where model weights are allocated.

 For example:

@ -92,16 +76,6 @@ sd-cli -m model.safetensors -p "a cat" --backend cuda0 --params-backend cpu

 This runs all modules on `cuda0`, but stores parameters in CPU RAM. During execution, parameters are moved to the runtime backend as needed.

-For example:
-
-```shell
-sd-cli -m model.safetensors -p "a cat" --backend cuda0 --params-backend disk
-```
-
-This runs all modules on `cuda0`, reloads parameters from the model file as needed, and releases those parameter buffers after use.
-
-`disk` is never selected implicitly. If `--params-backend` is not set, parameters use the runtime backend.
-
 Per-module assignments can be mixed:

 ```shell
@ -126,27 +100,23 @@ uses one shared CPU backend for both `te` and `vae` runtime execution.

 Runtime and parameter assignments also share the same backend cache. If `--backend diffusion=cuda0` and `--params-backend diffusion=cuda0` resolve to the same device, both use the same backend instance.

-`--params-backend disk` does not create a separate backend instance. Parameters are loaded lazily using the module runtime backend.
-
 `SDBackendManager` owns the backend instances and frees them when the context or upscaler is destroyed. Model runners receive non-owning runtime and parameter backend pointers and do not free them.

 ## Compatibility flags

-The example CLI/server still accepts these older CPU placement flags as compatibility aliases:
+The older CPU placement flags are still supported:

 - `--clip-on-cpu`
 - `--vae-on-cpu`
 - `--control-net-cpu`
 - `--offload-to-cpu`

-`--clip-on-cpu`, `--vae-on-cpu`, and `--control-net-cpu` are deprecated. The example argument layer prepends `te=cpu`, `vae=cpu`, and `controlnet=cpu` to `--backend` before creating the context.
+`--clip-on-cpu`, `--vae-on-cpu`, and `--control-net-cpu` affect runtime backend assignment only when `--backend` is not set. They map to `te=cpu`, `vae=cpu`, and `controlnet=cpu`.

-`--offload-to-cpu` prepends a CPU default to the parameter assignment in the caller before creating the context:
+`--offload-to-cpu` affects parameter backend assignment only when `--params-backend` is not set. It is equivalent to:

 ```shell
--params-backend '*=cpu'
+--params-backend cpu
 ```

-Because this default is inserted first, later explicit `--params-backend` entries can still override it, for example `--offload-to-cpu --params-backend te=disk` keeps non-TE parameters on CPU and reloads TE parameters from disk.
-
-Library callers should set `backend` and `params_backend` directly. The old CPU/offload fields are no longer part of the C API. Explicit `--backend` and `--params-backend` assignments are preferred for new commands.
+Explicit `--backend` and `--params-backend` assignments are preferred for new commands.
--- a/docs/boogu_image.md
+++ b/docs/boogu_image.md
@ -1,31 +0,0 @@
-# How to Use
-
-Boogu Image uses a Boogu diffusion transformer, the FLUX VAE, and Qwen3-VL as the LLM text and vision encoder.
-
-## Download weights
-
- Download Boogu Image
-    - safetensors: https://huggingface.co/Comfy-Org/Boogu-Image/tree/main/diffusion_models
- Download vae
-    - safetensors: https://huggingface.co/black-forest-labs/FLUX.1-dev/blob/main/ae.safetensors
- Download Qwen3-VL 8B
-    - gguf: https://huggingface.co/unsloth/Qwen3-VL-8B-Instruct-GGUF/tree/main
-        - For image editing with GGUF text encoders, also download the matching mmproj file and pass it with `--llm_vision`.
-
-## Examples
-
-### Boogu Image Base
-
-```
-.\bin\Release\sd-cli.exe --diffusion-model ..\..\ComfyUI\models\diffusion_models\boogu_image_base_bf16.safetensors --llm ..\..\llm\Qwen3VL-8B-Instruct-Q4_K_M.gguf --vae ..\..\ComfyUI\models\vae\ae.sft -p "a lovely cat" --diffusion-fa -v --offload-to-cpu
-```
-
-<img width="256" alt="Boogu Image Base example" src="../assets/boogu/example.png" />
-
-### Boogu Image Edit
-
-```
-.\bin\Release\sd-cli.exe --diffusion-model ..\..\ComfyUI\models\diffusion_models\boogu_image_edit_bf16.safetensors --llm ..\..\llm\Qwen3VL-8B-Instruct-Q4_K_M.gguf --llm_vision ..\..\llm\mmproj-Qwen3VL-8B-Instruct-F16.gguf --vae ..\..\ComfyUI\models\vae\ae.sft --diffusion-fa -v --offload-to-cpu -r ..\assets\flux\flux1-dev-q8_0.png -p "change 'flux.cpp' to 'boogu.cpp'"
-```
-
-<img width="256" alt="Boogu Image Edit example" src="../assets/boogu/edit_example.png" />
--- a/docs/performance.md
+++ b/docs/performance.md
@ -21,38 +21,6 @@ and the compute buffer shrink in the debug log:

 Using `--offload-to-cpu` allows you to offload weights to the CPU, saving VRAM without reducing generation speed.

-## Use params backend to reduce VRAM or RAM usage.
-
-`--params-backend` controls where model parameters are kept. If it is not set, parameters use the same backend as `--backend`, so a GPU runtime backend also keeps parameters in VRAM.
-
-Use CPU params to reduce VRAM usage:
-
-```shell
--backend cuda0 --params-backend cpu
-```
-
-This keeps model weights in system RAM and moves them to the runtime backend when needed. In the example CLI/server, `--offload-to-cpu` is a compatibility shortcut that prepends `*=cpu` to `--params-backend` before creating the context, so explicit module assignments can still override it:
-
-```shell
--offload-to-cpu --params-backend te=disk
-```
-
-Use disk params to reduce both VRAM and RAM usage:
-
-```shell
--backend cuda0 --params-backend disk
-```
-
-This reloads parameters from the model file on demand and releases them after use. It has the lowest memory residency, but can be slower because weights must be read again. `disk` is never selected implicitly; set it explicitly when RAM usage matters more than reload cost.
-
-Per-module assignments can target only the largest modules:
-
-```shell
--backend cuda0 --params-backend diffusion=disk,te=cpu,vae=cpu
-```
-
-See [backend selection](./backend.md) for full syntax.
-
 ## Use quantization to reduce memory usage.

 [quantization](./quantization_and_gguf.md)
--- a/docs/pulid.md
+++ b/docs/pulid.md
@ -1,196 +0,0 @@
-# PuLID-Flux face-identity preservation
-
-stable-diffusion.cpp supports the [PuLID-Flux](https://github.com/ToTheBeginning/PuLID)
-identity-injection technique on top of Flux.1 (schnell or dev) models.
-Given a single source portrait, PuLID-Flux produces new generations that
-preserve the source person's face across arbitrary scenes, poses, and
-prompts.
-
-Unlike PhotoMaker (which extracts the identity inside the inference
-process from a directory of images), PuLID-Flux's identity extractor is
-a heavy stack (insightface ArcFace + EVA-CLIP-L + IDFormer encoder) that
-is impractical to port to C++/ggml. To keep this implementation small and
-cross-vendor, **stable-diffusion.cpp consumes a precomputed identity
-embedding** produced by an external Python tool that runs once per source
-portrait. Everything downstream of that one-shot extraction is C++ and
-runs on any backend (Vulkan, CUDA, Metal, ROCm, CPU).
-
-## Architecture summary
-
-The PuLID-Flux contribution to the Flux denoise loop is a stack of 20
-small cross-attention modules (`PerceiverAttentionCA`) inserted between
-the Flux transformer blocks:
-
- After every 2nd of the 19 double-stream blocks (10 hook points)
- After every 4th of the 38 single-stream blocks (10 hook points)
-
-Each cross-attention layer takes the current image tokens as query, the
-32-token / 2048-dim identity embedding as key+value, and adds its output
-(scaled by `id_weight`, typically 1.0) back to the image tokens.
-
-## Required weights
-
-Three files in addition to the standard Flux weight set:
-
-1. **Flux base** (transformer + VAE + clip_l + t5xxl) -- exactly as
-   [docs/flux.md](flux.md) describes.
-2. **PuLID weights** -- download from
-   [guozinan/PuLID](https://huggingface.co/guozinan/PuLID):
-   - `pulid_flux_v0.9.0.safetensors` or `pulid_flux_v0.9.1.safetensors`
-     (recommended; this implementation is verified against v0.9.1)
-   - **v1.1 (`pulid_v1.1.safetensors`) is NOT yet supported** -- it uses
-     renamed keys (`id_adapter_attn_layers.*` instead of `pulid_ca.*`)
-     and possibly different module structure. Future PR.
-3. **Identity embedding (.pulidembd)** -- produced by the precompute
-   tool below.
-
-## Precompute the identity embedding
-
-The precompute tool runs the PyTorch identity-extraction stack on a
-single portrait image and writes the resulting `(32, 2048)` embedding
-to a `.pulidembd` binary file (about 131 KB). Run it once per source
-person; the same file is reused for any number of generations.
-
-A reference Python script is provided alongside this docs file at
-[`script/pulid_extract_id.py`](../script/pulid_extract_id.py). It
-requires:
- A working CUDA / CPU PyTorch stack
- `insightface`, `facexlib`, `eva-clip`, `torchvision`, `opencv-python`,
-  `huggingface_hub`, `gguf`
- The PuLID weights file (same one stable-diffusion.cpp will load below)
- The ToTheBeginning/PuLID repo's `pulid/` package (including
-  `pulid/pipeline_flux.py`) and `eva_clip/` package on `PYTHONPATH`; `flux/`
-  is not needed for embedding extraction
-
-Run it as:
-
-```
-python pulid_extract_id.py \
-  --portrait /path/to/source-photo.jpg \
-  --pulid-weights /path/to/pulid_flux_v0.9.1.safetensors \
-  --out /path/to/source.pulidembd
-```
-
-## Format (gguf)
-
-The embedding is a standard **gguf** container holding a single tensor:
-
-```
-tensor name : "pulid_id"
-shape       : [token_dim, num_tokens]   (ggml order; typically [2048, 32])
-type        : F16 (also accepts F32 / BF16)
-metadata    : general.architecture = "pulid", pulid.version = 1
-```
-
-stable-diffusion.cpp loads it with the normal gguf reader
-(`gguf_init_from_file`) and converts to fp32 at load time -- no bespoke
-parser. Total file size for the typical (32, 2048, fp16) case is ~131 KB.
-
-## Command-line usage
-
-```
-.\bin\Release\sd-cli.exe \
-  --diffusion-model     models\flux1-schnell-Q4_K_S.gguf \
-  --vae                 models\ae.safetensors \
-  --clip_l              models\clip_l.safetensors \
-  --t5xxl               models\t5xxl_fp16.safetensors \
-  --pulid-weights       models\pulid_flux_v0.9.1.safetensors \
-  --pulid-id-embedding  source.pulidembd \
-  --pulid-id-weight     1.0 \
-  -p "candid photograph of a young woman on a beach at sunset" \
-  --cfg-scale 1.0 --sampling-method euler --steps 4 -W 512 -H 512 \
-  --seed 42 --clip-on-cpu \
-  -o out.png
-```
-
-For Flux Dev (instead of Schnell), add `--guidance 3.5` and `--steps 20`.
-
-## Flags
-
-| Flag                       | Purpose                                                           |
-|----------------------------|-------------------------------------------------------------------|
-| `--pulid-weights <path>`   | Path to `pulid_flux_v0.9.x.safetensors`. Loaded with the model.   |
-| `--pulid-id-embedding <p>` | Path to a `.pulidembd` binary produced by the precompute tool.    |
-| `--pulid-id-weight <f>`    | Identity-injection strength. Typical 0.7-1.2; default 1.0.        |
-
-All three flags must be set together to activate PuLID. Setting only
-`--pulid-weights` (no embedding) loads the weights but disables injection
-at runtime. Setting `--pulid-id-weight 0` zeros out the contribution
-(useful for falsification testing: outputs should be byte-identical to
-a no-PuLID run with the same seed).
-
-## Memory budget
-
-At 512x512, 4 steps (Schnell), the 20 cross-attention layers add roughly
-10% to denoise time and almost nothing to peak VRAM. Tested on a 12 GB
-consumer card alongside Flux Schnell Q4 GGUF + CPU-offloaded clip_l and
-t5xxl + GPU-resident VAE.
-
-At 1024x1024 with Flux Dev Q4 + 20 steps + PuLID, the VAE decode compute
-buffer doesn't fit on a 12 GB card even with `--vae-on-cpu`. Workaround:
-explicitly route VAE to the CPU backend instead of the offload flag:
-
-```
--backend "diffusion=vulkan0,vae=cpu"
-```
-
-The `--vae-on-cpu` flag offloads VAE weights but leaves the compute graph
-on the default backend; this is existing stable-diffusion.cpp behavior,
-not a PuLID-specific issue. Documented here because anyone running PuLID
-at 1024 will hit it.
-
-## Backend selection
-
-The standard `--backend` flag works as documented. Common patterns:
-
-```
-# AMD Vulkan
--backend "diffusion=vulkan0,vae=cpu"
-
-# NVIDIA Vulkan
--backend "diffusion=vulkan1,vae=cpu"
-
-# CUDA
--backend "diffusion=cuda0,vae=cpu"
-```
-
-The PuLID cross-attention layers run on the same backend as the main
-diffusion model. They have not yet been independently profiled on every
-backend; only Vulkan and CPU have been tested by the original contributor.
-
-## Verification
-
-A three-way SHA-256 check is the recommended sanity test when bringing up
-a new combination of model + backend + hardware:
-
-| Run                                          | Expected hash relation             |
-|----------------------------------------------|------------------------------------|
-| A: no `--pulid-*` flags                      | baseline                           |
-| B: PuLID flags, `--pulid-id-weight 0.0`      | **byte-identical to A**            |
-| C: PuLID flags, `--pulid-id-weight 1.0`      | **different from A,B**, preserves source identity |
-
-If A and C differ but A and B differ too, the injection is allocating
-or computing something even at zero weight -- likely a bug.
-
-## Limitations / not yet supported
-
- **`--skip-layers` (skip-layer-guidance / SLG) combined with PuLID** is not
-  supported. The `pulid_ca` index advances per non-skipped block, so a
-  skipped block silently misaligns the cross-attention weight assignment
-  vs. the trained intervals. The reference PyTorch implementation does
-  not have SLG either, so there is no well-defined behavior to emulate.
-  Use either feature alone.
- **PuLID v1.1 weights** (`pulid_v1.1.safetensors`, renamed key layout).
- **Multiple ID images.** The reference PyTorch implementation can fuse
-  several portraits into one embedding for stronger identity. This
-  implementation accepts a single embedding produced from one or more
-  images by the external precompute tool.
- **Negative-prompt branch of CFG.** PuLID only injects on the positive
-  conditioning path in the published reference, and the implementation
-  here follows that. Flux's distilled guidance doesn't run a separate
-  uncond branch in normal use, so this matters only for `--true-cfg`
-  workflows that aren't standard for Flux.
- **Backends other than Vulkan and CPU** are untested by the original
-  contributor. The implementation is pure-ggml and should work on CUDA,
-  ROCm, and Metal, but verification by users on those backends is
-  welcomed.
--- a/docs/rpc.md
+++ b/docs/rpc.md
@ -1,220 +0,0 @@
-# Building and Using the RPC Server with `stable-diffusion.cpp`
-
-This guide covers how to build a version of [the RPC server from `llama.cpp`](https://github.com/ggml-org/llama.cpp/blob/master/tools/rpc/README.md) that is compatible with your version of `stable-diffusion.cpp` to manage multi-backends setups. RPC allows you to offload specific model components to a remote server.
-
-> **Note on Model Location:** The model files (e.g., `.safetensors` or `.gguf`) remain on the **Client** machine. The client parses the file and transmits the necessary tensor data and computational graphs to the server. The server does not need to store the model files locally.
-
-## 1. Building `stable-diffusion.cpp` with RPC client
-
-First, you should build the client application from source. It requires `SD_RPC=ON` to include the RPC backend to your client.
-
-```bash
-mkdir build
-cd build
-cmake .. \
-    -DSD_RPC=ON \
-    # Add other build flags here (e.g., -DSD_VULKAN=ON)
-cmake --build . --config Release -j $(nproc)
-```
-
-> **Note:** Ensure you add the other flags you would normally use (e.g., `-DSD_VULKAN=ON`, `-DSD_CUDA=ON`, `-DSD_HIPBLAS=ON`, or `-DGGML_METAL=ON`), for more information about building `stable-diffusion.cpp` from source, please refer to the [build.md](build.md) documentation.
-
-## 2. Ensure `llama.cpp` is at the correct commit
-
-`stable-diffusion.cpp`'s RPC client is designed to work with a specific version of `llama.cpp` (compatible with the `ggml` submodule) to ensure API compatibility. The commit hash for `llama.cpp` is stored in `ggml/scripts/sync-llama.last`.
-
-> **Start from Root:** Perform these steps from the root of your `stable-diffusion.cpp` directory.
-
-1.  Read the target commit hash from the submodule tracker:
-
-    ```bash
-    # Linux / WSL / MacOS
-    HASH=$(cat ggml/scripts/sync-llama.last)
-
-    # Windows (PowerShell)
-    $HASH = Get-Content -Path "ggml\scripts\sync-llama.last"
-    ```
-
-2.  Clone `llama.cpp` at the target commit.
-    ```bash
-    git clone https://github.com/ggml-org/llama.cpp.git
-    cd llama.cpp
-    git checkout $HASH
-    ```
-    To save on download time and storage, you can use a shallow clone to download only the target commit:
-    ```bash
-    mkdir -p llama.cpp
-    cd llama.cpp
-    git init
-    git remote add origin https://github.com/ggml-org/llama.cpp.git
-    git fetch --depth 1 origin $HASH
-    git checkout FETCH_HEAD
-    ```
-
-## 3. Build `llama.cpp` (RPC Server)
-
-The RPC server acts as the worker. You must explicitly enable the **backend** (the hardware interface, such as CUDA for Nvidia, Metal for Apple Silicon, or Vulkan) when building, otherwise the server will default to using only the CPU.
-
-To find the correct flags for your system, refer to the official documentation for the [`llama.cpp`](https://github.com/ggml-org/llama.cpp/blob/master/docs/build.md) repository.
-
-> **Crucial:** You must include the compiler flags required to satisfy the API compatibility with `stable-diffusion.cpp` (`-DGGML_MAX_NAME=128`). Without this flag, `GGML_MAX_NAME` will default to `64` for the server, and data transfers between the client and server will fail. Of course, `-DGGML_RPC` must also be enabled.
->
-> I recommend disabling the `LLAMA_CURL` flag to avoid unnecessary dependencies, and disabling shared library builds to avoid potential conflicts.
-
-> **Build Target:** We are specifically building the `rpc-server` target. This prevents the build system from compiling the entire `llama.cpp` suite (like `llama-server`), making the build significantly faster.
-
-### Linux / WSL (Vulkan)
-
-```bash
-mkdir build
-cd build
-cmake .. -DGGML_RPC=ON \
-    -DGGML_VULKAN=ON \        # Ensure backend is enabled
-    -DGGML_BUILD_SHARED_LIBS=OFF \
-    -DLLAMA_CURL=OFF \
-    -DCMAKE_C_FLAGS=-DGGML_MAX_NAME=128 \
-    -DCMAKE_CXX_FLAGS=-DGGML_MAX_NAME=128
-cmake --build . --config Release --target rpc-server -j $(nproc)
-```
-
-### macOS (Metal)
-
-```bash
-mkdir build
-cd build
-cmake .. -DGGML_RPC=ON \
-    -DGGML_METAL=ON \
-    -DGGML_BUILD_SHARED_LIBS=OFF \
-    -DLLAMA_CURL=OFF \
-    -DCMAKE_C_FLAGS=-DGGML_MAX_NAME=128 \
-    -DCMAKE_CXX_FLAGS=-DGGML_MAX_NAME=128
-cmake --build . --config Release --target rpc-server
-```
-
-### Windows (Visual Studio 2022, Vulkan)
-
-```powershell
-mkdir build
-cd build
-cmake .. -G "Visual Studio 17 2022" -A x64 `
-    -DGGML_RPC=ON `
-    -DGGML_VULKAN=ON `
-    -DGGML_BUILD_SHARED_LIBS=OFF `
-    -DLLAMA_CURL=OFF `
-    -DCMAKE_C_FLAGS=-DGGML_MAX_NAME=128 `
-    -DCMAKE_CXX_FLAGS=-DGGML_MAX_NAME=128
-cmake --build . --config Release --target rpc-server
-```
-
-## 4. Usage
-
-Once both applications are built, you can run the server and the client to manage your GPU allocation.
-
-### Step A: Run the RPC Server
-
-Start the server. It listens for connections on the default address (usually `localhost:50052`). If your server is on a different machine, ensure the server binds to the correct interface and your firewall allows the connection.
-
-**On the Server :**
-If running on the same machine, you can use the default address:
-
-```bash
-./rpc-server
-```
-
-If you want to allow connections from other machines on the network:
-
-```bash
-./rpc-server --host 0.0.0.0
-```
-
-> **Security Warning:** The RPC server does not currently support authentication or encryption. **Only run the server on trusted local networks**. Never expose the RPC server directly to the open internet.
-
-> **Drivers & Hardware:** Ensure the Server machine has the necessary drivers installed and functional (e.g., Nvidia Drivers for CUDA, Vulkan SDK, or Metal). If no devices are found, the server will simply fallback to CPU usage.
-
-<!-- ### Step B: Check if the client is able to connect to the server and see the available devices
-
-We're assuming the server is running on your local machine, and listening on the default port `50052`. If it's running on a different machine, you can replace `localhost` with the IP address of the server.
-
-**On the Client:**
-
-```bash
-./sd-cli --rpc-servers localhost:50052 --list-devices
-```
-
-If the server is running and the client is able to connect, you should see `RPC0    localhost:50052` in the list of devices.
-
-Example output:
-(Client built without GPU acceleration, two GPUs available on the server)
-
-```
-List of available GGML devices:
-Name    Description
-------------------
-CPU     AMD Ryzen 9 5900X 12-Core Processor
-RPC0    localhost:50052
-RPC1    localhost:50052
-``` -->
-
-### Step B: Run with RPC device
-
-If everything is working correctly, you can now run the client while offloading some or all of the work to the RPC server.
-
-Example: Setting the main backend to the RPC0 device for doing all the work on the server.
-
-```bash
-./sd-cli -m models/sd1.5.safetensors -p "A cat" --rpc-servers localhost:50052  --backend RPC0
-```
-
---
-
-## 5. Scaling: Multiple RPC Servers
-
-You can connect the client to multiple RPC servers simultaneously to scale out your hardware usage.
-
-Example: A main machine (192.168.1.10) with 3 GPUs, with one GPU running CUDA and the other two running Vulkan, and a second machine (192.168.1.11) with only one GPU.
-
-**On the first machine (Running two server instances):**
-
-**Terminal 1 (CUDA):**
-
-```bash
-# Linux / WSL
-export CUDA_VISIBLE_DEVICES=0
-cd ./build_cuda/bin/Release
-./rpc-server --host 0.0.0.0
-
-# Windows PowerShell
-$env:CUDA_VISIBLE_DEVICES="0"
-cd .\build_cuda\bin\Release
-./rpc-server --host 0.0.0.0
-```
-
-**Terminal 2 (Vulkan):**
-
-```bash
-cd ./build_vulkan/bin/Release
-# ignore the first GPU (used by CUDA server)
-./rpc-server --host 0.0.0.0 --port 50053 -d Vulkan1,Vulkan2
-```
-
-**On the second machine:**
-
-```bash
-cd ./build/bin/Release
-./rpc-server --host 0.0.0.0
-```
-
-**On the Client:**
-Pass multiple server addresses separated by commas.
-
-```bash
-./sd-cli --rpc-servers 192.168.1.10:50052,192.168.1.10:50053,192.168.1.11:50052 [...]
-```
-
-The client will map these servers to sequential device IDs (e.g., RPC0 from the first server, RPC1 and RPC2 from the second, and RPC3 from the third). With this setup, you could for example use RPC0 for the main backend, RPC1 and RPC2 for the text encoders, and RPC3 for the VAE.
-
---
-
-## 6. Performance Considerations
-
-RPC performance is heavily dependent on network bandwidth, as large weights and activations must be transferred back and forth over the network, especially for large models, or when using high resolutions. For best results, ensure your network connection is stable and has sufficient bandwidth (>1Gbps recommended). This should not be a concern if you are running the server and client on the same machine, as the data transfer will happen over the loopback interface.
--- a/examples/cli/README.md
+++ b/examples/cli/README.md
@ -1,9 +1,204 @@
-# Usage
+# Run

-For detailed command-line arguments, run:
+```
+usage: ./bin/sd-cli  [options]

-```bash
-./bin/sd-cli -h
+CLI Options:
+  -o, --output <string>         path to write result image to. you can use printf-style %d format specifiers for image
+                                sequences (default: ./output.png) (eg. output_%03d.png). Single-file video outputs
+                                support .avi, .webm, and animated .webp
+  --image <string>              path to the image to inspect (for metadata mode)
+  --metadata-format <string>    metadata output format, one of [text, json] (default: text)
+  --preview-path <string>       path to write preview image to (default: ./preview.png). Multi-frame previews support
+                                .avi, .webm, and animated .webp
+  --preview-interval <int>      interval in denoising steps between consecutive updates of the image preview file
+                                (default is 1, meaning updating at every step)
+  --output-begin-idx <int>      starting index for output image sequence, must be non-negative (default 0 if specified
+                                %d in output path, 1 otherwise)
+  --canny                       apply canny preprocessor (edge detection)
+  --convert-name                convert tensor name (for convert mode)
+  -v, --verbose                 print extra info
+  --color                       colors the logging tags according to level
+  --taesd-preview-only          prevents usage of taesd for decoding the final image. (for use with --preview tae)
+  --preview-noisy               enables previewing noisy inputs of the models rather than the denoised outputs
+  --metadata-raw                include raw hex previews for unparsed metadata payloads
+  --metadata-brief              truncate long metadata text values in text output
+  --metadata-all                include structural/container entries such as IHDR, IDAT, and non-metadata JPEG segments
+  -M, --mode                    run mode, one of [img_gen, vid_gen, upscale, convert, metadata], default: img_gen
+  --preview                     preview method. must be one of the following [none, proj, tae, vae] (default is none)
+  -h, --help                    show this help message and exit
+
+Context Options:
+  -m, --model <string>                     path to full model
+  --clip_l <string>                        path to the clip-l text encoder
+  --clip_g <string>                        path to the clip-g text encoder
+  --clip_vision <string>                   path to the clip-vision encoder
+  --t5xxl <string>                         path to the t5xxl text encoder
+  --llm <string>                           path to the llm text encoder. For example: (qwenvl2.5 for qwen-image,
+                                           mistral-small3.2 for flux2, ...)
+  --llm_vision <string>                    path to the llm vit
+  --qwen2vl <string>                       alias of --llm. Deprecated.
+  --qwen2vl_vision <string>                alias of --llm_vision. Deprecated.
+  --diffusion-model <string>               path to the standalone diffusion model
+  --high-noise-diffusion-model <string>    path to the standalone high noise diffusion model
+  --uncond-diffusion-model <string>        path to the standalone unconditional diffusion model, currently used by
+                                           Ideogram4 CFG
+  --vae <string>                           path to standalone vae model
+  --taesd <string>                         path to taesd. Using Tiny AutoEncoder for fast decoding (low quality)
+  --tae <string>                           alias of --taesd
+  --control-net <string>                   path to control net model
+  --embd-dir <string>                      embeddings directory
+  --lora-model-dir <string>                lora model directory
+  --hires-upscalers-dir <string>           highres fix upscaler model directory
+  --tensor-type-rules <string>             weight type per tensor pattern (example: "^vae\.=f16,model\.=q8_0")
+  --photo-maker <string>                   path to PHOTOMAKER model
+  --upscale-model <string>                 path to esrgan model.
+  -t, --threads <int>                      number of threads to use during computation (default: -1). If threads <= 0,
+                                           then threads will be set to the number of CPU physical cores
+  --chroma-t5-mask-pad <int>               t5 mask pad size of chroma
+  --max-vram <float>                       maximum VRAM budget in GiB for graph-cut segmented execution. 0 disables
+                                           graph splitting; a negative value auto-detects free VRAM, sparing the
+                                           specified value (e.g. -0.5 will keep at least 0.5 GiB free)
+  --force-sdxl-vae-conv-scale              force use of conv scale on sdxl vae
+  --offload-to-cpu                         place the weights in RAM to save VRAM, and automatically load them into VRAM
+                                           when needed
+  --mmap                                   whether to memory-map model
+  --control-net-cpu                        keep controlnet in cpu (for low vram)
+  --clip-on-cpu                            keep clip in cpu (for low vram)
+  --vae-on-cpu                             keep vae in cpu (for low vram)
+  --fa                                     use flash attention
+  --diffusion-fa                           use flash attention in the diffusion model only
+  --diffusion-conv-direct                  use ggml_conv2d_direct in the diffusion model
+  --vae-conv-direct                        use ggml_conv2d_direct in the vae model
+  --circular                               enable circular padding for convolutions
+  --circularx                              enable circular RoPE wrapping on x-axis (width) only
+  --circulary                              enable circular RoPE wrapping on y-axis (height) only
+  --chroma-disable-dit-mask                disable dit mask for chroma
+  --qwen-image-zero-cond-t                 enable zero_cond_t for qwen image
+  --chroma-enable-t5-mask                  enable t5 mask for chroma
+  --type                                   weight type (examples: f32, f16, q4_0, q4_1, q5_0, q5_1, q8_0, q2_K, q3_K,
+                                           q4_K). If not specified, the default is the type of the weight file
+  --rng                                    RNG, one of [std_default, cuda, cpu], default: cuda(sd-webui), cpu(comfyui)
+  --sampler-rng                            sampler RNG, one of [std_default, cuda, cpu]. If not specified, use --rng
+  --prediction                             prediction type override, one of [eps, v, edm_v, sd3_flow, flux_flow,
+                                           flux2_flow]
+  --lora-apply-mode                        the way to apply LoRA, one of [auto, immediately, at_runtime], default is
+                                           auto. In auto mode, if the model weights contain any quantized parameters,
+                                           the at_runtime mode will be used; otherwise, immediately will be used. The
+                                           immediately mode may have precision and compatibility issues with quantized
+                                           parameters, but it usually offers faster inference speed and, in some cases,
+                                           lower memory usage. The at_runtime mode, on the other hand, is exactly the
+                                           opposite.
+
+Generation Options:
+  -p, --prompt <string>                    the prompt to render
+  -n, --negative-prompt <string>           the negative prompt (default: "")
+  -i, --init-img <string>                  path to the init image
+  --end-img <string>                       path to the end image, required by flf2v
+  --mask <string>                          path to the mask image
+  --control-image <string>                 path to control image, control net
+  --control-video <string>                 path to control video frames. It must be a directory path. The video frames
+                                           inside should be stored as images in lexicographical (character) order. For
+                                           example, if the control video path is `frames`, the directory contains images
+                                           such as 00.png, 01.png, ... etc.
+  --pm-id-images-dir <string>              path to PHOTOMAKER input id images dir
+  --pm-id-embed-path <string>              path to PHOTOMAKER v2 id embed
+  --hires-upscaler <string>                highres fix upscaler, Lanczos, Nearest, Latent, Latent (nearest), Latent
+                                           (nearest-exact), Latent (antialiased), Latent (bicubic), Latent (bicubic
+                                           antialiased), or a model name under --hires-upscalers-dir (default: Latent)
+  --extra-sample-args <string>             extra sampler/scheduler/guidance args, key=value list. APG supports apg_eta,
+                                           apg_momentum, apg_norm_threshold, apg_norm_threshold_smoothing; SLG supports
+                                           slg_uncond; lcm supports noise_clip_std, noise_scale_start, noise_scale_end;
+                                           ltx2 supports max_shift, base_shift, stretch, terminal; euler_ge supports gamma
+  --extra-tiling-args <string>             extra VAE tiling args, key=value list. LTX video VAE supports
+                                           temporal_tile_frames (default: 4), temporal_tile_overlap (default: 1)
+  -H, --height <int>                       image height, in pixel space (default: 512)
+  -W, --width <int>                        image width, in pixel space (default: 512)
+  --steps <int>                            number of sample steps (default: 20)
+  --high-noise-steps <int>                 (high noise) number of sample steps (default: -1 = auto)
+  --clip-skip <int>                        ignore last layers of CLIP network; 1 ignores none, 2 ignores one layer
+                                           (default: -1). <= 0 represents unspecified, will be 1 for SD1.x, 2 for SD2.x
+  -b, --batch-count <int>                  batch count
+  --video-frames <int>                     video frames (default: 1)
+  --fps <int>                              fps (default: 24)
+  --timestep-shift <int>                   shift timestep for NitroFusion models (default: 0). recommended N for
+                                           NitroSD-Realism around 250 and 500 for NitroSD-Vibrant
+  --upscale-repeats <int>                  Run the ESRGAN upscaler this many times (default: 1)
+  --upscale-tile-size <int>                tile size for ESRGAN upscaling (default: 128)
+  --hires-width <int>                      highres fix target width, 0 to use --hires-scale (default: 0)
+  --hires-height <int>                     highres fix target height, 0 to use --hires-scale (default: 0)
+  --hires-steps <int>                      highres fix second pass sample steps, 0 to reuse --steps (default: 0)
+  --hires-upscale-tile-size <int>          highres fix upscaler tile size, reserved for model-backed upscalers (default:
+                                           128)
+  --cfg-scale <float>                      unconditional guidance scale: (default: 7.0)
+  --img-cfg-scale <float>                  image guidance scale for inpaint or image edit models: (default: same as
+                                           --cfg-scale)
+  --guidance <float>                       distilled guidance scale for models with guidance input (default: 3.5)
+  --slg-scale <float>                      skip layer guidance (SLG) scale, only for DiT models: (default: 0). 0 means
+                                           disabled, a value of 2.5 is nice for sd3.5 medium
+  --skip-layer-start <float>               SLG enabling point (default: 0.01)
+  --skip-layer-end <float>                 SLG disabling point (default: 0.2)
+  --eta <float>                            noise multiplier (default: 0 for ddim_trailing, tcd, res_multistep and
+                                           res_2s; 1 for euler_a, er_sde and dpm++2s_a)
+  --flow-shift <float>                     shift value for Flow models like SD3.x or WAN (default: auto)
+  --high-noise-cfg-scale <float>           (high noise) unconditional guidance scale: (default: 7.0)
+  --high-noise-img-cfg-scale <float>       (high noise) image guidance scale for inpaint or image edit models (default:
+                                           same as --cfg-scale)
+  --high-noise-guidance <float>            (high noise) distilled guidance scale for models with guidance input
+                                           (default: 3.5)
+  --high-noise-slg-scale <float>           (high noise) skip layer guidance (SLG) scale, only for DiT models: (default:
+                                           0)
+  --high-noise-skip-layer-start <float>    (high noise) SLG enabling point (default: 0.01)
+  --high-noise-skip-layer-end <float>      (high noise) SLG disabling point (default: 0.2)
+  --high-noise-eta <float>                 (high noise) noise multiplier (default: 0 for ddim_trailing, tcd,
+                                           res_multistep and res_2s; 1 for euler_a, er_sde and dpm++2s_a)
+  --strength <float>                       strength for noising/unnoising (default: 0.75)
+  --pm-style-strength <float>
+  --control-strength <float>               strength to apply Control Net (default: 0.9). 1.0 corresponds to full
+                                           destruction of information in init image
+  --moe-boundary <float>                   timestep boundary for Wan2.2 MoE model. (default: 0.875). Only enabled if
+                                           `--high-noise-steps` is set to -1
+  --vace-strength <float>                  wan vace strength
+  --vae-tile-overlap <float>               tile overlap for vae tiling, in fraction of tile size (default: 0.5)
+  --hires-scale <float>                    highres fix scale when target size is not set (default: 2.0)
+  --hires-denoising-strength <float>       highres fix second pass denoising strength (default: 0.7)
+  --increase-ref-index                     automatically increase the indices of reference images based on the order
+                                           they are listed (starting with 1).
+  --disable-auto-resize-ref-image          disable auto resize of ref images
+  --disable-image-metadata                 do not embed generation metadata on image files
+  --vae-tiling                             process vae in tiles to reduce memory usage
+  --temporal-tiling                        enable temporal tiling for LTX video VAE decode
+  --hires                                  enable highres fix
+  -s, --seed                               RNG seed (default: 42, use random seed for < 0)
+  --sampling-method                        sampling method, one of [euler, euler_a, heun, dpm2, dpm++2s_a, dpm++2m,
+                                           dpm++2mv2, ipndm, ipndm_v, lcm, ddim_trailing, tcd, res_multistep, res_2s,
+                                           er_sde, euler_cfg_pp, euler_a_cfg_pp] (default: euler for Flux/SD3/Wan, euler_a otherwise)
+  --high-noise-sampling-method             (high noise) sampling method, one of [euler, euler_a, heun, dpm2, dpm++2s_a,
+                                           dpm++2m, dpm++2mv2, ipndm, ipndm_v, lcm, ddim_trailing, tcd, res_multistep,
+                                           res_2s, er_sde, euler_cfg_pp, euler_a_cfg_pp] (default: euler for Flux/SD3/Wan, euler_a otherwise)
+  --scheduler                              denoiser sigma scheduler, one of [discrete, karras, exponential, ays, gits,
+                                           smoothstep, sgm_uniform, simple, kl_optimal, lcm, bong_tangent, ltx2], default:
+                                           model-specific
+  --sigmas                                 custom sigma values for the sampler, comma-separated (e.g.,
+                                           "14.61,7.8,3.5,0.0").
+  --hires-sigmas                           custom sigma values for the highres fix second pass, comma-separated (e.g.,
+                                           "0.85,0.725,0.421875,0.0").
+  --skip-layers                            layers to skip for SLG steps (default: [7,8,9])
+  --high-noise-skip-layers                 (high noise) layers to skip for SLG steps (default: [7,8,9])
+  -r, --ref-image                          reference image for Flux Kontext models (can be used multiple times)
+  --cache-mode                             caching method: 'easycache' (DiT), 'ucache' (UNET),
+                                           'dbcache'/'taylorseer'/'cache-dit' (DiT block-level), 'spectrum' (UNET/DiT
+                                           Chebyshev+Taylor forecasting)
+  --cache-option                           named cache params (key=value format, comma-separated). easycache/ucache:
+                                           threshold=,start=,end=,decay=,relative=,reset=; dbcache/taylorseer/cache-dit:
+                                           Fn=,Bn=,threshold=,warmup=; spectrum: w=,m=,lam=,window=,flex=,warmup=,stop=.
+                                           Examples: "threshold=0.25" or "threshold=1.5,reset=0"
+  --scm-mask                               SCM steps mask for cache-dit: comma-separated 0/1 (e.g.,
+                                           "1,1,1,0,0,1,0,0,1,0") - 1=compute, 0=can cache
+  --scm-policy                             SCM policy: 'dynamic' (default) or 'static'
+  --vae-tile-size                          tile size for vae tiling, format [X]x[Y] (default: 32x32)
+  --vae-relative-tile-size                 relative tile size for vae tiling, format [X]x[Y], in fraction of image size
+                                           if < 1, in number of tiles per dim if >=1 (overrides --vae-tile-size)
 ```

 Metadata mode inspects PNG/JPEG container metadata without loading any model:
--- a/examples/cli/main.cpp
+++ b/examples/cli/main.cpp
@ -62,22 +62,18 @@ struct SDCliParams {
            {"-o",
             "--output",
             "path to write result image to. you can use printf-style %d format specifiers for image sequences (default: ./output.png) (eg. output_%03d.png). Single-file video outputs support .avi, .webm, and animated .webp",
-             0,
             &output_path},
            {"",
             "--image",
             "path to the image to inspect (for metadata mode)",
-             0,
             &image_path},
            {"",
             "--metadata-format",
             "metadata output format, one of [text, json] (default: text)",
-             0,
             &metadata_format},
            {"",
             "--preview-path",
             "path to write preview image to (default: ./preview.png). Multi-frame previews support .avi, .webm, and animated .webp",
-             0,
             &preview_path},
        };

@ -627,6 +623,8 @@ int main(int argc, const char* argv[]) {
        }
    }

+    bool vae_decode_only = true;
+
    auto load_image_and_update_size = [&](const std::string& path,
                                          SDImageOwner& image,
                                          bool resize_image    = true,
@ -648,18 +646,21 @@ int main(int argc, const char* argv[]) {
    };

    if (gen_params.init_image_path.size() > 0) {
+        vae_decode_only = false;
        if (!load_image_and_update_size(gen_params.init_image_path, gen_params.init_image)) {
            return 1;
        }
    }

    if (gen_params.end_image_path.size() > 0) {
+        vae_decode_only = false;
        if (!load_image_and_update_size(gen_params.end_image_path, gen_params.end_image)) {
            return 1;
        }
    }

    if (gen_params.ref_image_paths.size() > 0) {
+        vae_decode_only = false;
        gen_params.ref_images.clear();
        for (auto& path : gen_params.ref_image_paths) {
            SDImageOwner ref_image({0, 0, 3, nullptr});
@ -734,7 +735,18 @@ int main(int argc, const char* argv[]) {
        }
    }

-    sd_ctx_params_t sd_ctx_params = ctx_params.to_sd_ctx_params_t(cli_params.taesd_preview);
+    if (cli_params.mode == VID_GEN) {
+        vae_decode_only = false;
+    }
+
+    if (gen_params.hires_enabled &&
+        (gen_params.resolved_hires_upscaler == SD_HIRES_UPSCALER_MODEL ||
+         gen_params.resolved_hires_upscaler == SD_HIRES_UPSCALER_LANCZOS ||
+         gen_params.resolved_hires_upscaler == SD_HIRES_UPSCALER_NEAREST)) {
+        vae_decode_only = false;
+    }
+
+    sd_ctx_params_t sd_ctx_params = ctx_params.to_sd_ctx_params_t(vae_decode_only, true, cli_params.taesd_preview);

    SDImageVec results;
    int num_results             = 0;
@ -786,11 +798,12 @@ int main(int argc, const char* argv[]) {
    int upscale_factor = 4;  // unused for RealESRGAN_x4plus_anime_6B.pth
    if (ctx_params.esrgan_path.size() > 0 && gen_params.upscale_repeats > 0) {
        UpscalerCtxPtr upscaler_ctx(new_upscaler_ctx(ctx_params.esrgan_path.c_str(),
+                                                     ctx_params.offload_params_to_cpu,
                                                     ctx_params.diffusion_conv_direct,
                                                     ctx_params.n_threads,
                                                     gen_params.upscale_tile_size,
-                                                     sd_ctx_params.backend,
-                                                     sd_ctx_params.params_backend));
+                                                     ctx_params.backend.c_str(),
+                                                     ctx_params.params_backend.c_str()));

        if (upscaler_ctx == nullptr) {
            LOG_ERROR("new_upscaler_ctx failed");
--- a/examples/common/common.cpp
+++ b/examples/common/common.cpp
@ -6,7 +6,6 @@
 #include <cstdlib>
 #include <ctime>
 #include <filesystem>
-#include <fstream>
 #include <iomanip>
 #include <iostream>
 #include <regex>
@ -52,10 +51,6 @@ static sd_vae_format_t str_to_vae_format(const std::string& value) {
    return SD_VAE_FORMAT_COUNT;
 }

-static void prepend_backend_assignment(std::string& spec, const char* assignment) {
-    spec = spec.empty() ? assignment : std::string(assignment) + "," + spec;
-}
-
 #if defined(_WIN32)
 static std::string utf16_to_utf8(const std::wstring& wstr) {
    if (wstr.empty())
@ -261,15 +256,8 @@ bool parse_options(int argc, const char** argv, const std::vector<ArgOptions>& o
                        invalid_arg = true;
                        return;
                    }
-                    if (option.concat && !option.target->empty()) {
-                        if (option.concat > 0 && option.concat <= 0xff) {
-                            *option.target += static_cast<char>(option.concat);
-                        }
-                        *option.target += argv_to_utf8(i, argv);
-                    } else {
-                        *option.target = argv_to_utf8(i, argv);
-                    }
-                    found_arg = true;
+                    *option.target = argv_to_utf8(i, argv);
+                    found_arg      = true;
                }))
                break;

@ -332,152 +320,109 @@ ArgOptions SDContextParams::get_options() {
        {"-m",
         "--model",
         "path to full model",
-         0,
         &model_path},
        {"",
         "--clip_l",
-         "path to the clip-l text encoder",
-         0,
-         &clip_l_path},
+         "path to the clip-l text encoder", &clip_l_path},
        {"", "--clip_g",
         "path to the clip-g text encoder",
-         0,
         &clip_g_path},
        {"",
         "--clip_vision",
         "path to the clip-vision encoder",
-         0,
         &clip_vision_path},
        {"",
         "--t5xxl",
         "path to the t5xxl text encoder",
-         0,
         &t5xxl_path},
        {"",
         "--llm",
         "path to the llm text encoder. For example: (qwenvl2.5 for qwen-image, mistral-small3.2 for flux2, ...)",
-         0,
         &llm_path},
        {"",
         "--llm_vision",
         "path to the llm vit",
-         0,
         &llm_vision_path},
        {"",
         "--qwen2vl",
         "alias of --llm. Deprecated.",
-         0,
         &llm_path},
        {"",
         "--qwen2vl_vision",
         "alias of --llm_vision. Deprecated.",
-         0,
         &llm_vision_path},
        {"",
         "--diffusion-model",
         "path to the standalone diffusion model",
-         0,
         &diffusion_model_path},
        {"",
         "--high-noise-diffusion-model",
         "path to the standalone high noise diffusion model",
-         0,
         &high_noise_diffusion_model_path},
        {"",
         "--uncond-diffusion-model",
         "path to the standalone unconditional diffusion model, currently used by Ideogram4 CFG",
-         0,
         &uncond_diffusion_model_path},
        {"",
         "--embeddings-connectors",
         "path to LTXAV embeddings connectors",
-         0,
         &embeddings_connectors_path},
        {"",
         "--vae",
         "path to standalone vae model",
-         0,
         &vae_path},
        {"",
         "--vae-format",
         "VAE latent format override: auto, flux, sd3, or flux2 (default: auto)",
-         0,
         &vae_format},
        {"",
         "--audio-vae",
         "path to standalone LTX audio vae model",
-         0,
         &audio_vae_path},
        {"",
         "--taesd",
         "path to taesd. Using Tiny AutoEncoder for fast decoding (low quality)",
-         0,
         &taesd_path},
        {"",
         "--tae",
         "alias of --taesd",
-         0,
         &taesd_path},
        {"",
         "--control-net",
         "path to control net model",
-         0,
         &control_net_path},
        {"",
         "--embd-dir",
         "embeddings directory",
-         0,
         &embedding_dir},
        {"",
         "--lora-model-dir",
         "lora model directory",
-         0,
         &lora_model_dir},
        {"",
         "--hires-upscalers-dir",
         "highres fix upscaler model directory",
-         0,
         &hires_upscalers_dir},
        {"",
         "--tensor-type-rules",
         "weight type per tensor pattern (example: \"^vae\\.=f16,model\\.=q8_0\")",
-         (int)',',
         &tensor_type_rules},
        {"",
         "--photo-maker",
         "path to PHOTOMAKER model",
-         0,
         &photo_maker_path},
-        {"",
-         "--pulid-weights",
-         "path to PuLID Flux weights",
-         0,
-         &pulid_weights_path},
        {"",
         "--upscale-model",
         "path to esrgan model.",
-         0,
         &esrgan_path},
        {"",
         "--backend",
         "runtime backend assignment, e.g. cpu or clip=cpu,vae=cuda0,diffusion=vulkan0",
-         (int)',',
         &backend},
        {"",
         "--params-backend",
-         "parameter backend assignment, e.g. disk, cpu, or diffusion=disk,clip=cpu",
-         (int)',',
+         "parameter backend assignment, e.g. cpu or diffusion=cpu,clip=cpu",
         &params_backend},
-        {"",
-         "--rpc-servers",
-         "comma-separated list of RPC servers to connect to for offloading, in the format host:port, e.g. localhost:50052,192.168.1.3:50052",
-         (int)',',
-         &rpc_servers},
-        {"",
-         "--max-vram",
-         "maximum VRAM budget in GiB for graph-cut segmented execution. Accepts a single value or assignments by backend/device, e.g. 6 or cuda0=6,vulkan0=4. 0 disables graph splitting; a negative value auto-detects free VRAM, sparing the specified value",
-         0,
-         &max_vram},
    };

    options.int_options = {
@ -492,15 +437,18 @@ ArgOptions SDContextParams::get_options() {
         &chroma_t5_mask_pad},
    };

+    options.float_options = {
+        {"",
+         "--max-vram",
+         "maximum VRAM budget in GiB for graph-cut segmented execution. 0 disables graph splitting; a negative value auto-detects free VRAM, sparing the specified value (e.g. -0.5 will keep at least 0.5 GiB free)",
+         &max_vram},
+    };
+
    options.bool_options = {
        {"",
         "--stream-layers",
         "enable residency+prefetch streaming on top of --max-vram (no effect without --max-vram; defaults to false)",
         true, &stream_layers},
-        {"",
-         "--eager-load",
-         "load all params into the params backend at model-load time instead of lazily on first use (defaults to false)",
-         true, &eager_load},
        {"",
         "--force-sdxl-vae-conv-scale",
         "force use of conv scale on sdxl vae",
@ -515,15 +463,15 @@ ArgOptions SDContextParams::get_options() {
         true, &enable_mmap},
        {"",
         "--control-net-cpu",
-         "deprecated; use --backend controlnet=cpu",
+         "keep controlnet in cpu (for low vram)",
         true, &control_net_cpu},
        {"",
         "--clip-on-cpu",
-         "deprecated; use --backend te=cpu",
+         "keep clip in cpu (for low vram)",
         true, &clip_on_cpu},
        {"",
         "--vae-on-cpu",
-         "deprecated; use --backend vae=cpu",
+         "keep vae in cpu (for low vram)",
         true, &vae_on_cpu},
        {"",
         "--fa",
@ -740,25 +688,6 @@ bool SDContextParams::resolve_and_validate(SDMode mode) {
    return true;
 }

-void SDContextParams::prepare_backend_assignments() {
-    effective_backend        = backend;
-    effective_params_backend = params_backend;
-
-    if (offload_params_to_cpu) {
-        prepend_backend_assignment(effective_params_backend, "*=cpu");
-    }
-
-    if (clip_on_cpu) {
-        prepend_backend_assignment(effective_backend, "te=cpu");
-    }
-    if (vae_on_cpu) {
-        prepend_backend_assignment(effective_backend, "vae=cpu");
-    }
-    if (control_net_cpu) {
-        prepend_backend_assignment(effective_backend, "controlnet=cpu");
-    }
-}
-
 std::string SDContextParams::to_string() const {
    std::ostringstream emb_ss;
    emb_ss << "{\n";
@ -802,9 +731,8 @@ std::string SDContextParams::to_string() const {
        << "  rng_type: " << sd_rng_type_name(rng_type) << ",\n"
        << "  sampler_rng_type: " << sd_rng_type_name(sampler_rng_type) << ",\n"
        << "  offload_params_to_cpu: " << (offload_params_to_cpu ? "true" : "false") << ",\n"
-        << "  max_vram: \"" << max_vram << "\",\n"
+        << "  max_vram: " << max_vram << ",\n"
        << "  stream_layers: " << (stream_layers ? "true" : "false") << ",\n"
-        << "  eager_load: " << (eager_load ? "true" : "false") << ",\n"
        << "  backend: \"" << backend << "\",\n"
        << "  params_backend: \"" << params_backend << "\",\n"
        << "  enable_mmap: " << (enable_mmap ? "true" : "false") << ",\n"
@ -829,8 +757,7 @@ std::string SDContextParams::to_string() const {
    return oss.str();
 }

-sd_ctx_params_t SDContextParams::to_sd_ctx_params_t(bool taesd_preview) {
-    prepare_backend_assignments();
+sd_ctx_params_t SDContextParams::to_sd_ctx_params_t(bool vae_decode_only, bool free_params_immediately, bool taesd_preview) {
    embedding_vec.clear();
    embedding_vec.reserve(embedding_map.size());
    for (const auto& kv : embedding_map) {
@ -840,54 +767,57 @@ sd_ctx_params_t SDContextParams::to_sd_ctx_params_t(bool taesd_preview) {
        embedding_vec.emplace_back(item);
    }

-    sd_ctx_params_t sd_ctx_params;
-    sd_ctx_params_init(&sd_ctx_params);
-    sd_ctx_params.model_path                      = model_path.c_str();
-    sd_ctx_params.clip_l_path                     = clip_l_path.c_str();
-    sd_ctx_params.clip_g_path                     = clip_g_path.c_str();
-    sd_ctx_params.clip_vision_path                = clip_vision_path.c_str();
-    sd_ctx_params.t5xxl_path                      = t5xxl_path.c_str();
-    sd_ctx_params.llm_path                        = llm_path.c_str();
-    sd_ctx_params.llm_vision_path                 = llm_vision_path.c_str();
-    sd_ctx_params.diffusion_model_path            = diffusion_model_path.c_str();
-    sd_ctx_params.high_noise_diffusion_model_path = high_noise_diffusion_model_path.c_str();
-    sd_ctx_params.uncond_diffusion_model_path     = uncond_diffusion_model_path.c_str();
-    sd_ctx_params.embeddings_connectors_path      = embeddings_connectors_path.c_str();
-    sd_ctx_params.vae_path                        = vae_path.c_str();
-    sd_ctx_params.audio_vae_path                  = audio_vae_path.c_str();
-    sd_ctx_params.taesd_path                      = taesd_path.c_str();
-    sd_ctx_params.control_net_path                = control_net_path.c_str();
-    sd_ctx_params.embeddings                      = embedding_vec.data();
-    sd_ctx_params.embedding_count                 = static_cast<uint32_t>(embedding_vec.size());
-    sd_ctx_params.photo_maker_path                = photo_maker_path.c_str();
-    sd_ctx_params.pulid_weights_path              = pulid_weights_path.c_str();
-    sd_ctx_params.tensor_type_rules               = tensor_type_rules.c_str();
-    sd_ctx_params.n_threads                       = n_threads;
-    sd_ctx_params.wtype                           = wtype;
-    sd_ctx_params.rng_type                        = rng_type;
-    sd_ctx_params.sampler_rng_type                = sampler_rng_type;
-    sd_ctx_params.prediction                      = prediction;
-    sd_ctx_params.lora_apply_mode                 = lora_apply_mode;
-    sd_ctx_params.enable_mmap                     = enable_mmap;
-    sd_ctx_params.flash_attn                      = flash_attn;
-    sd_ctx_params.diffusion_flash_attn            = diffusion_flash_attn;
-    sd_ctx_params.tae_preview_only                = taesd_preview;
-    sd_ctx_params.diffusion_conv_direct           = diffusion_conv_direct;
-    sd_ctx_params.vae_conv_direct                 = vae_conv_direct;
-    sd_ctx_params.circular_x                      = circular || circular_x;
-    sd_ctx_params.circular_y                      = circular || circular_y;
-    sd_ctx_params.force_sdxl_vae_conv_scale       = force_sdxl_vae_conv_scale;
-    sd_ctx_params.chroma_use_dit_mask             = chroma_use_dit_mask;
-    sd_ctx_params.chroma_use_t5_mask              = chroma_use_t5_mask;
-    sd_ctx_params.chroma_t5_mask_pad              = chroma_t5_mask_pad;
-    sd_ctx_params.qwen_image_zero_cond_t          = qwen_image_zero_cond_t;
-    sd_ctx_params.vae_format                      = str_to_vae_format(vae_format);
-    sd_ctx_params.max_vram                        = max_vram.c_str();
-    sd_ctx_params.stream_layers                   = stream_layers;
-    sd_ctx_params.eager_load                      = eager_load;
-    sd_ctx_params.backend                         = effective_backend.c_str();
-    sd_ctx_params.params_backend                  = effective_params_backend.c_str();
-    sd_ctx_params.rpc_servers                     = rpc_servers.c_str();
+    sd_ctx_params_t sd_ctx_params = {
+        model_path.c_str(),
+        clip_l_path.c_str(),
+        clip_g_path.c_str(),
+        clip_vision_path.c_str(),
+        t5xxl_path.c_str(),
+        llm_path.c_str(),
+        llm_vision_path.c_str(),
+        diffusion_model_path.c_str(),
+        high_noise_diffusion_model_path.c_str(),
+        uncond_diffusion_model_path.c_str(),
+        embeddings_connectors_path.c_str(),
+        vae_path.c_str(),
+        audio_vae_path.c_str(),
+        taesd_path.c_str(),
+        control_net_path.c_str(),
+        embedding_vec.data(),
+        static_cast<uint32_t>(embedding_vec.size()),
+        photo_maker_path.c_str(),
+        tensor_type_rules.c_str(),
+        vae_decode_only,
+        free_params_immediately,
+        n_threads,
+        wtype,
+        rng_type,
+        sampler_rng_type,
+        prediction,
+        lora_apply_mode,
+        offload_params_to_cpu,
+        enable_mmap,
+        clip_on_cpu,
+        control_net_cpu,
+        vae_on_cpu,
+        flash_attn,
+        diffusion_flash_attn,
+        taesd_preview,
+        diffusion_conv_direct,
+        vae_conv_direct,
+        circular || circular_x,
+        circular || circular_y,
+        force_sdxl_vae_conv_scale,
+        chroma_use_dit_mask,
+        chroma_use_t5_mask,
+        chroma_t5_mask_pad,
+        qwen_image_zero_cond_t,
+        str_to_vae_format(vae_format),
+        max_vram,
+        stream_layers,
+        backend.c_str(),
+        params_backend.c_str(),
+    };
    return sd_ctx_params;
 }

@ -902,71 +832,54 @@ ArgOptions SDGenerationParams::get_options() {
        {"-p",
         "--prompt",
         "the prompt to render",
-         0,
         &prompt},
        {"-n",
         "--negative-prompt",
         "the negative prompt (default: \"\")",
-         0,
         &negative_prompt},
        {"-i",
         "--init-img",
         "path to the init image",
-         0,
         &init_image_path},
        {"",
         "--end-img",
         "path to the end image, required by flf2v",
-         0,
         &end_image_path},
        {"",
         "--mask",
         "path to the mask image",
-         0,
         &mask_image_path},
        {"",
         "--control-image",
         "path to control image, control net",
-         0,
         &control_image_path},
        {"",
         "--control-video",
         "path to control video frames, It must be a directory path. The video frames inside should be stored as images in "
         "lexicographical (character) order. For example, if the control video path is `frames`, the directory contain images "
         "such as 00.png, 01.png, ... etc.",
-         0,
         &control_video_path},
        {"",
         "--pm-id-images-dir",
         "path to PHOTOMAKER input id images dir",
-         0,
         &pm_id_images_dir},
        {"",
         "--pm-id-embed-path",
         "path to PHOTOMAKER v2 id embed",
-         0,
         &pm_id_embed_path},
-        {"",
-         "--pulid-id-embedding",
-         "path to PuLID id embedding",
-         0,
-         &pulid_id_embedding_path},
        {"",
         "--hires-upscaler",
         "highres fix upscaler, Lanczos, Nearest, Latent, Latent (nearest), Latent (nearest-exact), "
         "Latent (antialiased), Latent (bicubic), Latent (bicubic antialiased), or a model name "
         "under --hires-upscalers-dir (default: Latent)",
-         0,
         &hires_upscaler},
        {"",
         "--extra-sample-args",
-         "extra sampler/scheduler/guidance args, key=value list. CFG supports guidance_schedule; APG supports apg_eta, apg_momentum, apg_norm_threshold, apg_norm_threshold_smoothing; SLG supports slg_uncond; lcm supports noise_clip_std, noise_scale_start, noise_scale_end; ltx2 supports max_shift, base_shift, stretch, terminal; euler_ge supports gamma;",
-         (int)',',
+         "extra sampler/scheduler/guidance args, key=value list. APG supports apg_eta, apg_momentum, apg_norm_threshold, apg_norm_threshold_smoothing; SLG supports slg_uncond; lcm supports noise_clip_std, noise_scale_start, noise_scale_end; ltx2 supports max_shift, base_shift, stretch, terminal; euler_ge supports gamma",
         &extra_sample_args},
        {"",
         "--extra-tiling-args",
         "extra VAE tiling args, key=value list. LTX video VAE supports temporal_tile_frames (default: 4), temporal_tile_overlap (default: 1)",
-         (int)',',
         &extra_tiling_args},
    };

@ -1104,10 +1017,6 @@ ArgOptions SDGenerationParams::get_options() {
         "--pm-style-strength",
         "",
         &pm_style_strength},
-        {"",
-         "--pulid-id-weight",
-         "strength of PuLID identity injection",
-         &pulid_id_weight},
        {"",
         "--control-strength",
         "strength to apply Control Net (default: 0.9). 1.0 corresponds to full destruction of information in init image",
@ -1422,42 +1331,6 @@ ArgOptions SDGenerationParams::get_options() {
        return 1;
    };

-    auto on_prompt_file_arg = [&](int argc, const char** argv, int index) {
-        if (++index >= argc) {
-            return -1;
-        }
-        const char* arg = argv[index];
-        std::ifstream f(arg, std::ios::binary);
-        try {
-            prompt = std::string(std::istreambuf_iterator<char>{f}, {});
-        } catch (const std::ios_base::failure&) {
-            f.setstate(std::ios_base::failbit);
-        }
-        if (f.fail()) {
-            LOG_ERROR("error: failed to read prompt file '%s'\n", arg);
-            return -1;
-        }
-        return 1;
-    };
-
-    auto on_negative_prompt_file_arg = [&](int argc, const char** argv, int index) {
-        if (++index >= argc) {
-            return -1;
-        }
-        const char* arg = argv[index];
-        std::ifstream f(arg, std::ios::binary);
-        try {
-            negative_prompt = std::string(std::istreambuf_iterator<char>{f}, {});
-        } catch (const std::ios_base::failure&) {
-            f.setstate(std::ios_base::failbit);
-        }
-        if (f.fail()) {
-            LOG_ERROR("error: failed to read negative prompt file '%s'\n", arg);
-            return -1;
-        }
-        return 1;
-    };
-
    options.manual_options = {
        {"-s",
         "--seed",
@ -1521,14 +1394,6 @@ ArgOptions SDGenerationParams::get_options() {
         "--vae-relative-tile-size",
         "relative tile size for vae tiling, format [X]x[Y], in fraction of image size if < 1, in number of tiles per dim if >=1 (overrides --vae-tile-size)",
         on_relative_tile_size_arg},
-        {"",
-         "--prompt-file",
-         "path to the file containing the prompt to render",
-         on_prompt_file_arg},
-        {"",
-         "--negative-prompt-file",
-         "path to the file containing the negative prompt",
-         on_negative_prompt_file_arg},

    };

@ -2384,11 +2249,6 @@ sd_img_gen_params_t SDGenerationParams::to_sd_img_gen_params_t() {
        pm_style_strength,
    };

-    sd_pulid_params_t pulid_params = {
-        pulid_id_embedding_path.empty() ? nullptr : pulid_id_embedding_path.c_str(),
-        pulid_id_weight,
-    };
-
    params.loras                 = lora_vec.empty() ? nullptr : lora_vec.data();
    params.lora_count            = static_cast<uint32_t>(lora_vec.size());
    params.prompt                = prompt.c_str();
@ -2409,7 +2269,6 @@ sd_img_gen_params_t SDGenerationParams::to_sd_img_gen_params_t() {
    params.control_image         = control_image.get();
    params.control_strength      = control_strength;
    params.pm_params             = pm_params;
-    params.pulid_params          = pulid_params;
    params.vae_tiling_params     = vae_tiling_params;
    params.cache                 = cache_params;

--- a/examples/common/common.h
+++ b/examples/common/common.h
@ -31,7 +31,6 @@ struct StringOption {
    std::string short_name;
    std::string long_name;
    std::string desc;
-    int concat;
    std::string* target;
 };

@ -134,7 +133,6 @@ struct SDContextParams {
    std::string control_net_path;
    std::string embedding_dir;
    std::string photo_maker_path;
-    std::string pulid_weights_path;
    sd_type_t wtype = SD_TYPE_COUNT;
    std::string tensor_type_rules;
    std::string lora_model_dir = ".";
@ -146,14 +144,10 @@ struct SDContextParams {
    rng_type_t rng_type         = CUDA_RNG;
    rng_type_t sampler_rng_type = RNG_TYPE_COUNT;
    bool offload_params_to_cpu  = false;
-    std::string max_vram        = "0";
+    float max_vram              = 0.f;
    bool stream_layers          = false;
-    bool eager_load             = false;
    std::string backend;
    std::string params_backend;
-    std::string rpc_servers;
-    std::string effective_backend;
-    std::string effective_params_backend;
    bool enable_mmap           = false;
    bool control_net_cpu       = false;
    bool clip_on_cpu           = false;
@ -181,12 +175,11 @@ struct SDContextParams {
    float flow_shift = INFINITY;
    ArgOptions get_options();
    void build_embedding_map();
-    void prepare_backend_assignments();
    bool resolve(SDMode mode);
    bool validate(SDMode mode);
    bool resolve_and_validate(SDMode mode);
    std::string to_string() const;
-    sd_ctx_params_t to_sd_ctx_params_t(bool taesd_preview);
+    sd_ctx_params_t to_sd_ctx_params_t(bool vae_decode_only, bool free_params_immediately, bool taesd_preview);
 };

 struct SDGenerationParams {
@ -237,9 +230,6 @@ struct SDGenerationParams {
    std::string pm_id_embed_path;
    float pm_style_strength = 20.f;

-    std::string pulid_id_embedding_path;
-    float pulid_id_weight = 1.0f;
-
    int upscale_repeats   = 1;
    int upscale_tile_size = 128;

--- a/examples/server/README.md
+++ b/examples/server/README.md
@ -117,10 +117,188 @@ In this case, the server will load and serve the specified `index.html` file ins
 * using a custom UI
 * avoiding rebuilding the binary after frontend modifications

-# Usage
+# Run

-For detailed command-line arguments, run:
-
-```bash
-./bin/sd-server -h
+```
+usage: ./bin/sd-server  [options]
+
+Svr Options:
+  -l, --listen-ip <string>      server listen ip (default: 127.0.0.1)
+  --serve-html-path <string>    path to HTML file to serve at root (optional)
+  --listen-port <int>           server listen port (default: 1234)
+  -v, --verbose                 print extra info
+  --color                       colors the logging tags according to level
+  -h, --help                    show this help message and exit
+
+Context Options:
+  -m, --model <string>                     path to full model
+  --clip_l <string>                        path to the clip-l text encoder
+  --clip_g <string>                        path to the clip-g text encoder
+  --clip_vision <string>                   path to the clip-vision encoder
+  --t5xxl <string>                         path to the t5xxl text encoder
+  --llm <string>                           path to the llm text encoder. For example: (qwenvl2.5 for qwen-image,
+                                           mistral-small3.2 for flux2, ...)
+  --llm_vision <string>                    path to the llm vit
+  --qwen2vl <string>                       alias of --llm. Deprecated.
+  --qwen2vl_vision <string>                alias of --llm_vision. Deprecated.
+  --diffusion-model <string>               path to the standalone diffusion model
+  --high-noise-diffusion-model <string>    path to the standalone high noise diffusion model
+  --uncond-diffusion-model <string>        path to the standalone unconditional diffusion model, currently used by
+                                           Ideogram4 CFG
+  --vae <string>                           path to standalone vae model
+  --taesd <string>                         path to taesd. Using Tiny AutoEncoder for fast decoding (low quality)
+  --tae <string>                           alias of --taesd
+  --control-net <string>                   path to control net model
+  --embd-dir <string>                      embeddings directory
+  --lora-model-dir <string>                lora model directory
+  --hires-upscalers-dir <string>           highres fix upscaler model directory
+  --tensor-type-rules <string>             weight type per tensor pattern (example: "^vae\.=f16,model\.=q8_0")
+  --photo-maker <string>                   path to PHOTOMAKER model
+  --upscale-model <string>                 path to esrgan model.
+  -t, --threads <int>                      number of threads to use during computation (default: -1). If threads <= 0,
+                                           then threads will be set to the number of CPU physical cores
+  --chroma-t5-mask-pad <int>               t5 mask pad size of chroma
+  --max-vram <float>                       maximum VRAM budget in GiB for graph-cut segmented execution. 0 disables
+                                           graph splitting; a negative value auto-detects free VRAM, sparing the
+                                           specified value (e.g. -0.5 will keep at least 0.5 GiB free)
+  --force-sdxl-vae-conv-scale              force use of conv scale on sdxl vae
+  --offload-to-cpu                         place the weights in RAM to save VRAM, and automatically load them into VRAM
+                                           when needed
+  --mmap                                   whether to memory-map model
+  --control-net-cpu                        keep controlnet in cpu (for low vram)
+  --clip-on-cpu                            keep clip in cpu (for low vram)
+  --vae-on-cpu                             keep vae in cpu (for low vram)
+  --fa                                     use flash attention
+  --diffusion-fa                           use flash attention in the diffusion model only
+  --diffusion-conv-direct                  use ggml_conv2d_direct in the diffusion model
+  --vae-conv-direct                        use ggml_conv2d_direct in the vae model
+  --circular                               enable circular padding for convolutions
+  --circularx                              enable circular RoPE wrapping on x-axis (width) only
+  --circulary                              enable circular RoPE wrapping on y-axis (height) only
+  --chroma-disable-dit-mask                disable dit mask for chroma
+  --qwen-image-zero-cond-t                 enable zero_cond_t for qwen image
+  --chroma-enable-t5-mask                  enable t5 mask for chroma
+  --type                                   weight type (examples: f32, f16, q4_0, q4_1, q5_0, q5_1, q8_0, q2_K, q3_K,
+                                           q4_K). If not specified, the default is the type of the weight file
+  --rng                                    RNG, one of [std_default, cuda, cpu], default: cuda(sd-webui), cpu(comfyui)
+  --sampler-rng                            sampler RNG, one of [std_default, cuda, cpu]. If not specified, use --rng
+  --prediction                             prediction type override, one of [eps, v, edm_v, sd3_flow, flux_flow,
+                                           flux2_flow]
+  --lora-apply-mode                        the way to apply LoRA, one of [auto, immediately, at_runtime], default is
+                                           auto. In auto mode, if the model weights contain any quantized parameters,
+                                           the at_runtime mode will be used; otherwise, immediately will be used. The
+                                           immediately mode may have precision and compatibility issues with quantized
+                                           parameters, but it usually offers faster inference speed and, in some cases,
+                                           lower memory usage. The at_runtime mode, on the other hand, is exactly the
+                                           opposite.
+
+Default Generation Options:
+  -p, --prompt <string>                    the prompt to render
+  -n, --negative-prompt <string>           the negative prompt (default: "")
+  -i, --init-img <string>                  path to the init image
+  --end-img <string>                       path to the end image, required by flf2v
+  --mask <string>                          path to the mask image
+  --control-image <string>                 path to control image, control net
+  --control-video <string>                 path to control video frames. It must be a directory path. The video frames
+                                           inside should be stored as images in lexicographical (character) order. For
+                                           example, if the control video path is `frames`, the directory contains images
+                                           such as 00.png, 01.png, etc.
+  --pm-id-images-dir <string>              path to PHOTOMAKER input id images dir
+  --pm-id-embed-path <string>              path to PHOTOMAKER v2 id embed
+  --hires-upscaler <string>                highres fix upscaler, Lanczos, Nearest, Latent, Latent (nearest), Latent
+                                           (nearest-exact), Latent (antialiased), Latent (bicubic), Latent (bicubic
+                                           antialiased), or a model name under --hires-upscalers-dir (default: Latent)
+  --extra-sample-args <string>             extra sampler/scheduler/guidance args, key=value list. APG supports apg_eta,
+                                           apg_momentum, apg_norm_threshold, apg_norm_threshold_smoothing; SLG supports
+                                           slg_uncond; lcm supports noise_clip_std, noise_scale_start, noise_scale_end;
+                                           ltx2 supports max_shift, base_shift, stretch, terminal; euler_ge supports gamma
+  --extra-tiling-args <string>             extra VAE tiling args, key=value list. LTX video VAE supports
+                                           temporal_tile_frames (default: 4), temporal_tile_overlap (default: 1)
+  -H, --height <int>                       image height, in pixel space (default: 512)
+  -W, --width <int>                        image width, in pixel space (default: 512)
+  --steps <int>                            number of sample steps (default: 20)
+  --high-noise-steps <int>                 (high noise) number of sample steps (default: -1 = auto)
+  --clip-skip <int>                        ignore last layers of CLIP network; 1 ignores none, 2 ignores one layer
+                                           (default: -1). <= 0 represents unspecified, will be 1 for SD1.x, 2 for SD2.x
+  -b, --batch-count <int>                  batch count
+  --video-frames <int>                     video frames (default: 1)
+  --fps <int>                              fps (default: 24)
+  --timestep-shift <int>                   shift timestep for NitroFusion models (default: 0). recommended N for
+                                           NitroSD-Realism around 250 and 500 for NitroSD-Vibrant
+  --upscale-repeats <int>                  Run the ESRGAN upscaler this many times (default: 1)
+  --upscale-tile-size <int>                tile size for ESRGAN upscaling (default: 128)
+  --hires-width <int>                      highres fix target width, 0 to use --hires-scale (default: 0)
+  --hires-height <int>                     highres fix target height, 0 to use --hires-scale (default: 0)
+  --hires-steps <int>                      highres fix second pass sample steps, 0 to reuse --steps (default: 0)
+  --hires-upscale-tile-size <int>          highres fix upscaler tile size, reserved for model-backed upscalers (default:
+                                           128)
+  --cfg-scale <float>                      unconditional guidance scale: (default: 7.0)
+  --img-cfg-scale <float>                  image guidance scale for inpaint or image edit models: (default: same as
+                                           --cfg-scale)
+  --guidance <float>                       distilled guidance scale for models with guidance input (default: 3.5)
+  --slg-scale <float>                      skip layer guidance (SLG) scale, only for DiT models: (default: 0). 0 means
+                                           disabled, a value of 2.5 is nice for sd3.5 medium
+  --skip-layer-start <float>               SLG enabling point (default: 0.01)
+  --skip-layer-end <float>                 SLG disabling point (default: 0.2)
+  --eta <float>                            noise multiplier (default: 0 for ddim_trailing, tcd, res_multistep and
+                                           res_2s; 1 for euler_a, er_sde and dpm++2s_a)
+  --flow-shift <float>                     shift value for Flow models like SD3.x or WAN (default: auto)
+  --high-noise-cfg-scale <float>           (high noise) unconditional guidance scale: (default: 7.0)
+  --high-noise-img-cfg-scale <float>       (high noise) image guidance scale for inpaint or image edit models (default:
+                                           same as --cfg-scale)
+  --high-noise-guidance <float>            (high noise) distilled guidance scale for models with guidance input
+                                           (default: 3.5)
+  --high-noise-slg-scale <float>           (high noise) skip layer guidance (SLG) scale, only for DiT models: (default:
+                                           0)
+  --high-noise-skip-layer-start <float>    (high noise) SLG enabling point (default: 0.01)
+  --high-noise-skip-layer-end <float>      (high noise) SLG disabling point (default: 0.2)
+  --high-noise-eta <float>                 (high noise) noise multiplier (default: 0 for ddim_trailing, tcd,
+                                           res_multistep and res_2s; 1 for euler_a, er_sde and dpm++2s_a)
+  --strength <float>                       strength for noising/unnoising (default: 0.75)
+  --pm-style-strength <float>
+  --control-strength <float>               strength to apply Control Net (default: 0.9). 1.0 corresponds to full
+                                           destruction of information in init image
+  --moe-boundary <float>                   timestep boundary for Wan2.2 MoE model. (default: 0.875). Only enabled if
+                                           `--high-noise-steps` is set to -1
+  --vace-strength <float>                  wan vace strength
+  --vae-tile-overlap <float>               tile overlap for vae tiling, in fraction of tile size (default: 0.5)
+  --hires-scale <float>                    highres fix scale when target size is not set (default: 2.0)
+  --hires-denoising-strength <float>       highres fix second pass denoising strength (default: 0.7)
+  --increase-ref-index                     automatically increase the indices of references images based on the order
+                                           they are listed (starting with 1).
+  --disable-auto-resize-ref-image          disable auto resize of ref images
+  --disable-image-metadata                 do not embed generation metadata on image files
+  --vae-tiling                             process vae in tiles to reduce memory usage
+  --temporal-tiling                        enable temporal tiling for LTX video VAE decode
+  --hires                                  enable highres fix
+  -s, --seed                               RNG seed (default: 42, use random seed for < 0)
+  --sampling-method                        sampling method, one of [euler, euler_a, heun, dpm2, dpm++2s_a, dpm++2m,
+                                           dpm++2mv2, ipndm, ipndm_v, lcm, ddim_trailing, tcd, res_multistep, res_2s,
+                                           er_sde, euler_cfg_pp, euler_a_cfg_pp] (default: euler for Flux/SD3/Wan, euler_a otherwise)
+  --high-noise-sampling-method             (high noise) sampling method, one of [euler, euler_a, heun, dpm2, dpm++2s_a,
+                                           dpm++2m, dpm++2mv2, ipndm, ipndm_v, lcm, ddim_trailing, tcd, res_multistep,
+                                           res_2s, er_sde, euler_cfg_pp, euler_a_cfg_pp] (default: euler for Flux/SD3/Wan, euler_a otherwise)
+  --scheduler                              denoiser sigma scheduler, one of [discrete, karras, exponential, ays, gits,
+                                           smoothstep, sgm_uniform, simple, kl_optimal, lcm, bong_tangent, ltx2], default:
+                                           model-specific
+  --sigmas                                 custom sigma values for the sampler, comma-separated (e.g.,
+                                           "14.61,7.8,3.5,0.0").
+  --hires-sigmas                           custom sigma values for the highres fix second pass, comma-separated (e.g.,
+                                           "0.85,0.725,0.421875,0.0").
+  --skip-layers                            layers to skip for SLG steps (default: [7,8,9])
+  --high-noise-skip-layers                 (high noise) layers to skip for SLG steps (default: [7,8,9])
+  -r, --ref-image                          reference image for Flux Kontext models (can be used multiple times)
+  --cache-mode                             caching method: 'easycache' (DiT), 'ucache' (UNET),
+                                           'dbcache'/'taylorseer'/'cache-dit' (DiT block-level), 'spectrum' (UNET/DiT
+                                           Chebyshev+Taylor forecasting)
+  --cache-option                           named cache params (key=value format, comma-separated). easycache/ucache:
+                                           threshold=,start=,end=,decay=,relative=,reset=; dbcache/taylorseer/cache-dit:
+                                           Fn=,Bn=,threshold=,warmup=; spectrum: w=,m=,lam=,window=,flex=,warmup=,stop=.
+                                           Examples: "threshold=0.25" or "threshold=1.5,reset=0"
+  --scm-mask                               SCM steps mask for cache-dit: comma-separated 0/1 (e.g.,
+                                           "1,1,1,0,0,1,0,0,1,0") - 1=compute, 0=can cache
+  --scm-policy                             SCM policy: 'dynamic' (default) or 'static'
+  --vae-tile-size                          tile size for vae tiling, format [X]x[Y] (default: 32x32)
+  --vae-relative-tile-size                 relative tile size for vae tiling, format [X]x[Y], in fraction of image size
+                                           if < 1, in number of tiles per dim if >=1 (overrides --vae-tile-size)
 ```
--- a/examples/server/frontend
+++ b/examples/server/frontend
@ -1 +1 @@
-Subproject commit c4bce3d6b3f236614cca21014f076083b7270ba8
+Subproject commit 797ccf80825cc035508ba9b599b2a21953e7f835
--- a/examples/server/main.cpp
+++ b/examples/server/main.cpp
@ -85,7 +85,7 @@ int main(int argc, const char** argv) {
    LOG_DEBUG("%s", ctx_params.to_string().c_str());
    LOG_DEBUG("%s", default_gen_params.to_string().c_str());

-    sd_ctx_params_t sd_ctx_params = ctx_params.to_sd_ctx_params_t(false);
+    sd_ctx_params_t sd_ctx_params = ctx_params.to_sd_ctx_params_t(false, false, false);
    SDCtxPtr sd_ctx(new_sd_ctx(&sd_ctx_params));

    if (sd_ctx == nullptr) {
--- a/examples/server/runtime.cpp
+++ b/examples/server/runtime.cpp
@ -190,8 +190,8 @@ ArgOptions SDSvrParams::get_options() {
    ArgOptions options;

    options.string_options = {
-        {"-l", "--listen-ip", "server listen ip (default: 127.0.0.1)", 0, &listen_ip},
-        {"", "--serve-html-path", "path to HTML file to serve at root (optional)", 0, &serve_html_path},
+        {"-l", "--listen-ip", "server listen ip (default: 127.0.0.1)", &listen_ip},
+        {"", "--serve-html-path", "path to HTML file to serve at root (optional)", &serve_html_path},
    };

    options.int_options = {
--- a/format-code.ps1
+++ b/format-code.ps1
@ -1,54 +0,0 @@
-$patterns = @(
-    "src/*.cpp"
-    "src/*.h"
-    "src/*.hpp"
-    "src/conditioning/*.cpp"
-    "src/conditioning/*.h"
-    "src/conditioning/*.hpp"
-    "src/core/*.cpp"
-    "src/core/*.h"
-    "src/core/*.hpp"
-    "src/extensions/*.cpp"
-    "src/extensions/*.h"
-    "src/extensions/*.hpp"
-    "src/runtime/*.cpp"
-    "src/runtime/*.h"
-    "src/runtime/*.hpp"
-    "src/model/*/*.cpp"
-    "src/model/*/*.h"
-    "src/model/*/*.hpp"
-    "src/tokenizers/*.h"
-    "src/tokenizers/*.cpp"
-    "src/tokenizers/vocab/*.h"
-    "src/tokenizers/vocab/*.cpp"
-    "src/model_io/*.h"
-    "src/model_io/*.cpp"
-    "examples/cli/*.cpp"
-    "examples/cli/*.h"
-    "examples/server/*.cpp"
-    "examples/common/*.hpp"
-    "examples/common/*.h"
-    "examples/common/*.cpp"
-)
-
-$root = (Get-Location).Path
-
-foreach ($pattern in $patterns) {
-    $files = Get-ChildItem -Path $pattern -File -ErrorAction SilentlyContinue | Sort-Object FullName
-
-    foreach ($file in $files) {
-        $relativePath = $file.FullName.Substring($root.Length).TrimStart('\', '/') -replace '\\', '/'
-
-        if ($relativePath -like "vocab*") {
-            continue
-        }
-
-        Write-Host "formatting '$relativePath'"
-
-        # if ($relativePath -ne "stable-diffusion.h") {
-        #     clang-tidy -fix -p build_linux/ "$relativePath"
-        # }
-
-        & clang-format -style=file -i $relativePath
-    }
-}
--- a/2
+++ b/2
@ -1 +1 @@
-Subproject commit 3af5f5760e19a96427f5f7a93b79cbdf3d4b265b
+Subproject commit 0ce7ad348a3151e1da9f65d962044546bcaad421
--- a/include/stable-diffusion.h
+++ b/include/stable-diffusion.h
@ -195,15 +195,20 @@ typedef struct {
    const sd_embedding_t* embeddings;
    uint32_t embedding_count;
    const char* photo_maker_path;
-    const char* pulid_weights_path;
    const char* tensor_type_rules;
+    bool vae_decode_only;
+    bool free_params_immediately;
    int n_threads;
    enum sd_type_t wtype;
    enum rng_type_t rng_type;
    enum rng_type_t sampler_rng_type;
    enum prediction_t prediction;
    enum lora_apply_mode_t lora_apply_mode;
+    bool offload_params_to_cpu;
    bool enable_mmap;
+    bool keep_clip_on_cpu;
+    bool keep_control_net_on_cpu;
+    bool keep_vae_on_cpu;
    bool flash_attn;
    bool diffusion_flash_attn;
    bool tae_preview_only;
@ -217,12 +222,10 @@ typedef struct {
    int chroma_t5_mask_pad;
    bool qwen_image_zero_cond_t;
    enum sd_vae_format_t vae_format;
-    const char* max_vram;  // GiB budget or backend assignment spec for graph-cut segmented param offload (0 = disabled, -1 = auto)
+    float max_vram;  // GiB budget for graph-cut segmented param offload (0 = disabled, -1 = auto free VRAM minus 1 GiB)
    bool stream_layers;  // Enable residency+prefetch streaming on top of --max-vram (no effect without --max-vram)
-    bool eager_load;  // Load all params into the params backend at model-load time instead of lazily on first use
    const char* backend;
    const char* params_backend;
-    const char* rpc_servers;
 } sd_ctx_params_t;

 typedef struct {
@ -274,11 +277,6 @@ typedef struct {
    float style_strength;
 } sd_pm_params_t;  // photo maker

-typedef struct {
-    const char* id_embedding_path;
-    float id_weight;
-} sd_pulid_params_t;
-
 enum sd_cache_mode_t {
    SD_CACHE_DISABLED = 0,
    SD_CACHE_EASYCACHE,
@ -371,7 +369,6 @@ typedef struct {
    sd_image_t control_image;
    float control_strength;
    sd_pm_params_t pm_params;
-    sd_pulid_params_t pulid_params;
    sd_tiling_params_t vae_tiling_params;
    sd_cache_params_t cache;
    sd_hires_params_t hires;
@ -453,17 +450,6 @@ SD_API void sd_img_gen_params_init(sd_img_gen_params_t* sd_img_gen_params);
 SD_API char* sd_img_gen_params_to_str(const sd_img_gen_params_t* sd_img_gen_params);
 SD_API sd_image_t* generate_image(sd_ctx_t* sd_ctx, const sd_img_gen_params_t* sd_img_gen_params);

-enum sd_cancel_mode_t {
-    // Stop the current generation as soon as possible.
-    SD_CANCEL_ALL,
-    // Finish the current image sample, then skip additional batch latents and return completed images.
-    SD_CANCEL_NEW_LATENTS,
-    // Clear a pending cancellation request.
-    SD_CANCEL_RESET
-};
-
-SD_API void sd_cancel_generation(sd_ctx_t* sd_ctx, enum sd_cancel_mode_t mode);
-
 SD_API void sd_vid_gen_params_init(sd_vid_gen_params_t* sd_vid_gen_params);
 SD_API bool generate_video(sd_ctx_t* sd_ctx,
                           const sd_vid_gen_params_t* sd_vid_gen_params,
@ -474,6 +460,7 @@ SD_API bool generate_video(sd_ctx_t* sd_ctx,
 typedef struct upscaler_ctx_t upscaler_ctx_t;

 SD_API upscaler_ctx_t* new_upscaler_ctx(const char* esrgan_path,
+                                        bool offload_params_to_cpu,
                                        bool direct,
                                        int n_threads,
                                        int tile_size,
@ -504,10 +491,6 @@ SD_API bool preprocess_canny(sd_image_t image,
 SD_API const char* sd_commit(void);
 SD_API const char* sd_version(void);

-// for C API, caller needs to call free_sd_images to free the memory after use
-// This helps avoid CRT problems on Windows when memory is allocated in the library but freed in the caller, which may use a different CRT.
-SD_API void free_sd_images(sd_image_t* result_images, int num_images);
-
 #ifdef __cplusplus
 }
 #endif
--- a/script/pulid_extract_id.py
+++ b/script/pulid_extract_id.py
@ -1,134 +0,0 @@
-"""
-Precompute a PuLID-Flux identity embedding from a single source portrait.
-
-Writes a gguf file (a single tensor `pulid_id`) that stable-diffusion.cpp's
-`--pulid-id-embedding` flag consumes.
-
-Dependencies (recommended: vendor rather than pip-install due to upstream
-packaging quirks):
-  - torch + safetensors
-  - The ToTheBeginning/PuLID repository's `pulid/` package and `eva_clip/`.
-    Put them on PYTHONPATH or sys.path before running this script.
-  - insightface, facexlib, torchvision, opencv-python, huggingface_hub, gguf
-  - numpy, Pillow
-
-Usage:
-  python script/pulid_extract_id.py \\
-    --portrait /path/to/source-photo.jpg \\
-    --pulid-weights /path/to/pulid_flux_v0.9.1.safetensors \\
-    --out /path/to/source.pulidembd
-
-The portrait must contain a clearly visible face. insightface's antelopev2
-detector will be auto-downloaded on first run.
-"""
-
-from __future__ import annotations
-
-import argparse
-import os
-import sys
-from types import SimpleNamespace
-
-
-def extract(portrait_path: str, pulid_weights: str) -> "torch.Tensor":
-    import numpy as np
-    import torch
-    from PIL import Image
-    from pulid.pipeline_flux import PuLIDPipeline
-
-    if torch.cuda.is_available():
-        device, onnx_provider = "cuda", "gpu"
-    else:
-        device, onnx_provider = "cpu", "cpu"
-
-    print(f"device={device}", flush=True)
-
-    # PuLIDPipeline only attaches pulid_ca attributes to `dit` during
-    # construction; get_id_embedding() never runs Flux, so a dummy object is
-    # enough and avoids importing/building a Flux skeleton.
-    print("instantiating PuLIDPipeline with a dummy Flux object", flush=True)
-    dit = SimpleNamespace()
-    pulid = PuLIDPipeline(dit=dit,
-                          device=device,
-                          weight_dtype=torch.bfloat16,
-                          onnx_provider=onnx_provider)
-
-    print(f"loading PuLID weights from {pulid_weights}", flush=True)
-    pulid.load_pretrain(pretrain_path=pulid_weights, version="v0.9.1")
-
-    print(f"extracting ID embedding from {portrait_path}", flush=True)
-    face_img = np.array(Image.open(portrait_path).convert("RGB"))
-    id_embedding, _ = pulid.get_id_embedding(face_img)
-    print(f"id embedding shape={tuple(id_embedding.shape)} dtype={id_embedding.dtype}",
-          flush=True)
-
-    if id_embedding.ndim == 3 and id_embedding.shape[0] == 1:
-        id_embedding = id_embedding[0]
-    return id_embedding
-
-
-def write_embd(tensor, out_path: str, dtype_choice: str) -> None:
-    import gguf
-    import torch
-
-    if tensor.ndim != 2:
-        raise ValueError(f"expected (num_tokens, token_dim); got {tuple(tensor.shape)}")
-    num_tokens, token_dim = tensor.shape
-
-    os.makedirs(os.path.dirname(out_path) or ".", exist_ok=True)
-
-    writer = gguf.GGUFWriter(out_path, arch="pulid")
-    writer.add_uint32("pulid.version", 1)
-
-    if dtype_choice == "fp16":
-        arr = tensor.to(torch.float16).contiguous().cpu().numpy()
-        writer.add_tensor("pulid_id", arr)
-    elif dtype_choice == "fp32":
-        arr = tensor.to(torch.float32).contiguous().cpu().numpy()
-        writer.add_tensor("pulid_id", arr)
-    elif dtype_choice == "bf16":
-        raw = tensor.to(torch.bfloat16).contiguous().view(torch.uint16).cpu().numpy()
-        writer.add_tensor("pulid_id", raw,
-                          raw_shape=(int(num_tokens), int(token_dim)),
-                          raw_dtype=gguf.GGMLQuantizationType.BF16)
-    else:
-        raise ValueError(f"unknown --dtype {dtype_choice}")
-
-    writer.write_header_to_file()
-    writer.write_kv_data_to_file()
-    writer.write_tensors_to_file()
-    writer.close()
-
-    print(f"wrote {out_path}: gguf, tensor pulid_id [{token_dim}, {num_tokens}] {dtype_choice}",
-          flush=True)
-
-
-def main() -> int:
-    ap = argparse.ArgumentParser(
-        description=__doc__,
-        formatter_class=argparse.RawDescriptionHelpFormatter)
-    ap.add_argument("--portrait", required=True,
-                    help="Path to the source portrait image (JPG/PNG).")
-    ap.add_argument("--pulid-weights", required=True,
-                    help="Path to pulid_flux_v0.9.x.safetensors.")
-    ap.add_argument("--out", required=True,
-                    help="Output path for the .pulidembd binary.")
-    ap.add_argument("--dtype", default="fp16",
-                    choices=["fp16", "bf16", "fp32"],
-                    help="Storage dtype (default fp16; produces ~131 KB).")
-    args = ap.parse_args()
-
-    if not os.path.exists(args.portrait):
-        print(f"ERROR: portrait not found at {args.portrait}", file=sys.stderr)
-        return 2
-    if not os.path.exists(args.pulid_weights):
-        print(f"ERROR: PuLID weights not found at {args.pulid_weights}", file=sys.stderr)
-        return 3
-
-    embedding = extract(args.portrait, args.pulid_weights)
-    write_embd(embedding, args.out, args.dtype)
-    return 0
-
-
-if __name__ == "__main__":
-    raise SystemExit(main())
--- a/src/conditioning/conditioner.hpp
+++ b/src/conditioning/conditioner.hpp
@ -1,4 +1,4 @@
-#ifndef __SD_CONDITIONING_CONDITIONER_HPP__
+#ifndef __SD_CONDITIONING_CONDITIONER_HPP__
 #define __SD_CONDITIONING_CONDITIONER_HPP__

 #include <cmath>
@ -113,12 +113,14 @@ struct Conditioner {
 public:
    virtual SDCondition get_learned_condition(int n_threads,
                                              const ConditionerParams& conditioner_params) = 0;
+    virtual bool alloc_params_buffer()                                                     = 0;
+    virtual void free_params_buffer()                                                      = 0;
    virtual void get_param_tensors(std::map<std::string, ggml_tensor*>& tensors)           = 0;
+    virtual size_t get_params_buffer_size()                                                = 0;
    virtual void set_max_graph_vram_bytes(size_t max_vram_bytes) {}
    virtual void set_stream_layers_enabled(bool enabled) {}
    virtual void set_flash_attention_enabled(bool enabled) = 0;
    virtual void set_weight_adapter(const std::shared_ptr<WeightAdapter>& adapter) {}
-    virtual void runner_done() {}
 };

 // ldm.modules.encoders.modules.FrozenCLIPEmbedder
@ -136,24 +138,25 @@ struct FrozenCLIPEmbedderWithCustomWords : public Conditioner {
    std::map<std::string, std::pair<int, int>> embedding_pos_map;

    FrozenCLIPEmbedderWithCustomWords(ggml_backend_t backend,
+                                      ggml_backend_t params_backend,
                                      const String2TensorStorage& tensor_storage_map,
                                      const std::map<std::string, std::string>& orig_embedding_map,
-                                      SDVersion version                                   = VERSION_SD1,
-                                      std::shared_ptr<RunnerWeightManager> weight_manager = nullptr)
+                                      SDVersion version = VERSION_SD1)
        : version(version), tokenizer(sd_version_is_sd2(version) ? 0 : 49407) {
        for (const auto& kv : orig_embedding_map) {
-            std::string name    = normalize_embedding_name(kv.first);
+            std::string name = kv.first;
+            std::transform(name.begin(), name.end(), name.begin(), [](unsigned char c) { return std::tolower(c); });
            embedding_map[name] = kv.second;
            tokenizer.add_special_token(name);
        }
        bool force_clip_f32 = !embedding_map.empty();
        if (sd_version_is_sd1(version)) {
-            text_model = std::make_shared<CLIPTextModelRunner>(backend, tensor_storage_map, "cond_stage_model.transformer.text_model", OPENAI_CLIP_VIT_L_14, true, force_clip_f32, weight_manager);
+            text_model = std::make_shared<CLIPTextModelRunner>(backend, params_backend, tensor_storage_map, "cond_stage_model.transformer.text_model", OPENAI_CLIP_VIT_L_14, true, force_clip_f32);
        } else if (sd_version_is_sd2(version)) {
-            text_model = std::make_shared<CLIPTextModelRunner>(backend, tensor_storage_map, "cond_stage_model.transformer.text_model", OPEN_CLIP_VIT_H_14, true, force_clip_f32, weight_manager);
+            text_model = std::make_shared<CLIPTextModelRunner>(backend, params_backend, tensor_storage_map, "cond_stage_model.transformer.text_model", OPEN_CLIP_VIT_H_14, true, force_clip_f32);
        } else if (sd_version_is_sdxl(version)) {
-            text_model  = std::make_shared<CLIPTextModelRunner>(backend, tensor_storage_map, "cond_stage_model.transformer.text_model", OPENAI_CLIP_VIT_L_14, false, force_clip_f32, weight_manager);
-            text_model2 = std::make_shared<CLIPTextModelRunner>(backend, tensor_storage_map, "cond_stage_model.1.transformer.text_model", OPEN_CLIP_VIT_BIGG_14, false, force_clip_f32, weight_manager);
+            text_model  = std::make_shared<CLIPTextModelRunner>(backend, params_backend, tensor_storage_map, "cond_stage_model.transformer.text_model", OPENAI_CLIP_VIT_L_14, false, force_clip_f32);
+            text_model2 = std::make_shared<CLIPTextModelRunner>(backend, params_backend, tensor_storage_map, "cond_stage_model.1.transformer.text_model", OPEN_CLIP_VIT_BIGG_14, false, force_clip_f32);
        }
    }

@ -164,6 +167,33 @@ struct FrozenCLIPEmbedderWithCustomWords : public Conditioner {
        }
    }

+    bool alloc_params_buffer() override {
+        if (!text_model->alloc_params_buffer()) {
+            return false;
+        }
+        if (sd_version_is_sdxl(version)) {
+            if (!text_model2->alloc_params_buffer()) {
+                return false;
+            }
+        }
+        return true;
+    }
+
+    void free_params_buffer() override {
+        text_model->free_params_buffer();
+        if (sd_version_is_sdxl(version)) {
+            text_model2->free_params_buffer();
+        }
+    }
+
+    size_t get_params_buffer_size() override {
+        size_t buffer_size = text_model->get_params_buffer_size();
+        if (sd_version_is_sdxl(version)) {
+            buffer_size += text_model2->get_params_buffer_size();
+        }
+        return buffer_size;
+    }
+
    void set_max_graph_vram_bytes(size_t max_vram_bytes) override {
        text_model->set_max_graph_vram_bytes(max_vram_bytes);
        if (sd_version_is_sdxl(version)) {
@ -192,13 +222,6 @@ struct FrozenCLIPEmbedderWithCustomWords : public Conditioner {
        }
    }

-    void runner_done() override {
-        text_model->runner_done();
-        if (sd_version_is_sdxl(version)) {
-            text_model2->runner_done();
-        }
-    }
-
    bool load_embedding(std::string embd_name, std::string embd_path, std::vector<int32_t>& bpe_tokens) {
        ModelLoader model_loader;
        if (!model_loader.init_from_file_and_convert_name(embd_path)) {
@ -240,8 +263,7 @@ struct FrozenCLIPEmbedderWithCustomWords : public Conditioner {
            }
            return true;
        };
-        model_loader.set_n_threads(1);
-        model_loader.load_tensors(on_load);
+        model_loader.load_tensors(on_load, 1);
        int pos_start = num_custom_embeddings;
        if (embd) {
            int64_t hidden_size = text_model->model.hidden_size;
@ -277,23 +299,17 @@ struct FrozenCLIPEmbedderWithCustomWords : public Conditioner {
        return true;
    }

-    static std::string normalize_embedding_name(std::string name) {
-        std::transform(name.begin(), name.end(), name.begin(), [](unsigned char c) { return std::tolower(c); });
-        return name;
-    }
-
-    bool append_embedding_tokens(std::string str, std::vector<int32_t>& bpe_tokens) {
-        std::string name = normalize_embedding_name(std::move(str));
-        auto iter        = embedding_map.find(name);
-        if (iter == embedding_map.end()) {
-            return false;
-        }
-        return load_embedding(name, iter->second, bpe_tokens);
-    }
-
    std::vector<int> convert_token_to_id(std::string text) {
        auto on_new_token_cb = [&](std::string& str, std::vector<int32_t>& bpe_tokens) -> bool {
-            return append_embedding_tokens(str, bpe_tokens);
+            auto iter = embedding_map.find(str);
+            if (iter == embedding_map.end()) {
+                return false;
+            }
+            std::string embedding_path = iter->second;
+            if (load_embedding(str, embedding_path, bpe_tokens)) {
+                return true;
+            }
+            return false;
        };
        std::vector<int> curr_tokens = tokenizer.encode(text, on_new_token_cb);
        return curr_tokens;
@ -320,7 +336,15 @@ struct FrozenCLIPEmbedderWithCustomWords : public Conditioner {
        }

        auto on_new_token_cb = [&](std::string& str, std::vector<int32_t>& bpe_tokens) -> bool {
-            return append_embedding_tokens(str, bpe_tokens);
+            auto iter = embedding_map.find(str);
+            if (iter == embedding_map.end()) {
+                return false;
+            }
+            std::string embedding_path = iter->second;
+            if (load_embedding(str, embedding_path, bpe_tokens)) {
+                return true;
+            }
+            return false;
        };

        std::vector<int> tokens;
@ -408,10 +432,7 @@ struct FrozenCLIPEmbedderWithCustomWords : public Conditioner {
                                                               token_embed_custom.data(),
                                                               max_token_idx,
                                                               false,
-                                                               clip_skip,
-                                                               false,
-                                                               true,
-                                                               true);
+                                                               clip_skip);
                GGML_ASSERT(!chunk_hidden_states.empty());
                if (sd_version_is_sdxl(version)) {
                    auto chunk_hidden_states2 = text_model2->compute(n_threads,
@ -420,10 +441,7 @@ struct FrozenCLIPEmbedderWithCustomWords : public Conditioner {
                                                                     token_embed_custom.data(),
                                                                     max_token_idx,
                                                                     false,
-                                                                     clip_skip,
-                                                                     false,
-                                                                     true,
-                                                                     true);
+                                                                     clip_skip);
                    GGML_ASSERT(!chunk_hidden_states2.empty());
                    chunk_hidden_states = sd::ops::concat(chunk_hidden_states, chunk_hidden_states2, 0);

@ -434,10 +452,7 @@ struct FrozenCLIPEmbedderWithCustomWords : public Conditioner {
                                                      token_embed_custom.data(),
                                                      max_token_idx,
                                                      true,
-                                                      clip_skip,
-                                                      false,
-                                                      true,
-                                                      true);
+                                                      clip_skip);
                        GGML_ASSERT(!pooled.empty());
                    }
                }
@ -508,15 +523,15 @@ struct FrozenCLIPEmbedderWithCustomWords : public Conditioner {

 struct FrozenCLIPVisionEmbedder : public GGMLRunner {
    CLIPVisionModelProjection vision_model;
-    std::string weight_prefix = "cond_stage_model.transformer";

    FrozenCLIPVisionEmbedder(ggml_backend_t backend,
-                             const String2TensorStorage& tensor_storage_map      = {},
-                             std::shared_ptr<RunnerWeightManager> weight_manager = nullptr)
-        : GGMLRunner(backend, weight_manager) {
-        bool proj_in = false;
+                             ggml_backend_t params_backend,
+                             const String2TensorStorage& tensor_storage_map = {})
+        : GGMLRunner(backend, params_backend) {
+        std::string prefix = "cond_stage_model.transformer";
+        bool proj_in       = false;
        for (const auto& [name, tensor_storage] : tensor_storage_map) {
-            if (!starts_with(name, weight_prefix)) {
+            if (!starts_with(name, prefix)) {
                continue;
            }
            if (contains(name, "self_attn.in_proj")) {
@ -525,7 +540,7 @@ struct FrozenCLIPVisionEmbedder : public GGMLRunner {
            }
        }
        vision_model = CLIPVisionModelProjection(OPEN_CLIP_VIT_H_14, false, proj_in);
-        vision_model.init(params_ctx, tensor_storage_map, weight_prefix);
+        vision_model.init(params_ctx, tensor_storage_map, prefix);
    }

    std::string get_desc() override {
@ -533,7 +548,7 @@ struct FrozenCLIPVisionEmbedder : public GGMLRunner {
    }

    void get_param_tensors(std::map<std::string, ggml_tensor*>& tensors) {
-        vision_model.get_param_tensors(tensors, weight_prefix);
+        vision_model.get_param_tensors(tensors, "cond_stage_model.transformer");
    }

    ggml_cgraph* build_graph(const sd::Tensor<float>& pixel_values_tensor, bool return_pooled, int clip_skip) {
@ -556,7 +571,7 @@ struct FrozenCLIPVisionEmbedder : public GGMLRunner {
        auto get_graph = [&]() -> ggml_cgraph* {
            return build_graph(pixel_values, return_pooled, clip_skip);
        };
-        return take_or_empty(GGMLRunner::compute<float>(get_graph, n_threads, true, true, true));
+        return take_or_empty(GGMLRunner::compute<float>(get_graph, n_threads, true));
    }
 };

@ -569,8 +584,8 @@ struct SD3CLIPEmbedder : public Conditioner {
    std::shared_ptr<T5Runner> t5;

    SD3CLIPEmbedder(ggml_backend_t backend,
-                    const String2TensorStorage& tensor_storage_map      = {},
-                    std::shared_ptr<RunnerWeightManager> weight_manager = nullptr)
+                    ggml_backend_t params_backend,
+                    const String2TensorStorage& tensor_storage_map = {})
        : clip_g_tokenizer(0) {
        bool use_clip_l = false;
        bool use_clip_g = false;
@ -589,13 +604,13 @@ struct SD3CLIPEmbedder : public Conditioner {
            return;
        }
        if (use_clip_l) {
-            clip_l = std::make_shared<CLIPTextModelRunner>(backend, tensor_storage_map, "text_encoders.clip_l.transformer.text_model", OPENAI_CLIP_VIT_L_14, false, false, weight_manager);
+            clip_l = std::make_shared<CLIPTextModelRunner>(backend, params_backend, tensor_storage_map, "text_encoders.clip_l.transformer.text_model", OPENAI_CLIP_VIT_L_14, false);
        }
        if (use_clip_g) {
-            clip_g = std::make_shared<CLIPTextModelRunner>(backend, tensor_storage_map, "text_encoders.clip_g.transformer.text_model", OPEN_CLIP_VIT_BIGG_14, false, false, weight_manager);
+            clip_g = std::make_shared<CLIPTextModelRunner>(backend, params_backend, tensor_storage_map, "text_encoders.clip_g.transformer.text_model", OPEN_CLIP_VIT_BIGG_14, false);
        }
        if (use_t5) {
-            t5 = std::make_shared<T5Runner>(backend, tensor_storage_map, "text_encoders.t5xxl.transformer", false, weight_manager);
+            t5 = std::make_shared<T5Runner>(backend, params_backend, tensor_storage_map, "text_encoders.t5xxl.transformer");
        }
    }

@ -611,6 +626,51 @@ struct SD3CLIPEmbedder : public Conditioner {
        }
    }

+    bool alloc_params_buffer() override {
+        if (clip_l) {
+            if (!clip_l->alloc_params_buffer()) {
+                return false;
+            }
+        }
+        if (clip_g) {
+            if (!clip_g->alloc_params_buffer()) {
+                return false;
+            }
+        }
+        if (t5) {
+            if (!t5->alloc_params_buffer()) {
+                return false;
+            }
+        }
+        return true;
+    }
+
+    void free_params_buffer() override {
+        if (clip_l) {
+            clip_l->free_params_buffer();
+        }
+        if (clip_g) {
+            clip_g->free_params_buffer();
+        }
+        if (t5) {
+            t5->free_params_buffer();
+        }
+    }
+
+    size_t get_params_buffer_size() override {
+        size_t buffer_size = 0;
+        if (clip_l) {
+            buffer_size += clip_l->get_params_buffer_size();
+        }
+        if (clip_g) {
+            buffer_size += clip_g->get_params_buffer_size();
+        }
+        if (t5) {
+            buffer_size += t5->get_params_buffer_size();
+        }
+        return buffer_size;
+    }
+
    void set_max_graph_vram_bytes(size_t max_vram_bytes) override {
        if (clip_l) {
            clip_l->set_max_graph_vram_bytes(max_vram_bytes);
@ -659,18 +719,6 @@ struct SD3CLIPEmbedder : public Conditioner {
        }
    }

-    void runner_done() override {
-        if (clip_l) {
-            clip_l->runner_done();
-        }
-        if (clip_g) {
-            clip_g->runner_done();
-        }
-        if (t5) {
-            t5->runner_done();
-        }
-    }
-
    std::vector<std::pair<std::vector<int>, std::vector<float>>> tokenize(std::string text,
                                                                          size_t min_length          = 0,
                                                                          size_t max_length          = 0,
@ -786,10 +834,7 @@ struct SD3CLIPEmbedder : public Conditioner {
                                                        nullptr,
                                                        max_token_idx,
                                                        false,
-                                                        clip_skip,
-                                                        false,
-                                                        true,
-                                                        true);
+                                                        clip_skip);
                GGML_ASSERT(!chunk_hidden_states_l.empty());
                chunk_hidden_states_l = ::apply_token_weights(std::move(chunk_hidden_states_l), chunk_weights);

@ -802,16 +847,13 @@ struct SD3CLIPEmbedder : public Conditioner {
                                                    nullptr,
                                                    max_token_idx,
                                                    true,
-                                                    clip_skip,
-                                                    false,
-                                                    true,
-                                                    true);
+                                                    clip_skip);
                    GGML_ASSERT(!pooled_l.empty());
                }
            } else {
                chunk_hidden_states_l = sd::Tensor<float>::zeros({768, static_cast<int64_t>(chunk_len), 1});
                if (chunk_idx == 0) {
-                    pooled_l = sd::Tensor<float>::zeros({768, 1});
+                    pooled = sd::Tensor<float>::zeros({768, 1});
                }
            }

@ -833,10 +875,7 @@ struct SD3CLIPEmbedder : public Conditioner {
                                                        nullptr,
                                                        max_token_idx,
                                                        false,
-                                                        clip_skip,
-                                                        false,
-                                                        true,
-                                                        true);
+                                                        clip_skip);
                GGML_ASSERT(!chunk_hidden_states_g.empty());
                chunk_hidden_states_g = ::apply_token_weights(std::move(chunk_hidden_states_g), chunk_weights);

@ -849,10 +888,7 @@ struct SD3CLIPEmbedder : public Conditioner {
                                                    nullptr,
                                                    max_token_idx,
                                                    true,
-                                                    clip_skip,
-                                                    false,
-                                                    true,
-                                                    true);
+                                                    clip_skip);
                    GGML_ASSERT(!pooled_g.empty());
                }
            } else {
@ -874,10 +910,7 @@ struct SD3CLIPEmbedder : public Conditioner {

                chunk_hidden_states_t5 = t5->compute(n_threads,
                                                     input_ids,
-                                                     sd::Tensor<float>(),
-                                                     false,
-                                                     true,
-                                                     true);
+                                                     sd::Tensor<float>());
                GGML_ASSERT(!chunk_hidden_states_t5.empty());
                chunk_hidden_states_t5 = ::apply_token_weights(std::move(chunk_hidden_states_t5), chunk_weights);
            } else {
@ -938,8 +971,8 @@ struct FluxCLIPEmbedder : public Conditioner {
    size_t chunk_len = 256;

    FluxCLIPEmbedder(ggml_backend_t backend,
-                     const String2TensorStorage& tensor_storage_map      = {},
-                     std::shared_ptr<RunnerWeightManager> weight_manager = nullptr) {
+                     ggml_backend_t params_backend,
+                     const String2TensorStorage& tensor_storage_map = {}) {
        bool use_clip_l = false;
        bool use_t5     = false;
        for (auto pair : tensor_storage_map) {
@ -956,12 +989,12 @@ struct FluxCLIPEmbedder : public Conditioner {
        }

        if (use_clip_l) {
-            clip_l = std::make_shared<CLIPTextModelRunner>(backend, tensor_storage_map, "text_encoders.clip_l.transformer.text_model", OPENAI_CLIP_VIT_L_14, true, false, weight_manager);
+            clip_l = std::make_shared<CLIPTextModelRunner>(backend, params_backend, tensor_storage_map, "text_encoders.clip_l.transformer.text_model", OPENAI_CLIP_VIT_L_14, true);
        } else {
            LOG_WARN("clip_l text encoder not found! Prompt adherence might be degraded.");
        }
        if (use_t5) {
-            t5 = std::make_shared<T5Runner>(backend, tensor_storage_map, "text_encoders.t5xxl.transformer", false, weight_manager);
+            t5 = std::make_shared<T5Runner>(backend, params_backend, tensor_storage_map, "text_encoders.t5xxl.transformer");
        } else {
            LOG_WARN("t5xxl text encoder not found! Prompt adherence might be degraded.");
        }
@ -976,6 +1009,40 @@ struct FluxCLIPEmbedder : public Conditioner {
        }
    }

+    // Allocates the parameter buffers of whichever text encoders were
+    // constructed (clip_l first, then t5). Returns false as soon as one
+    // encoder's allocation fails, without attempting the remaining one;
+    // an encoder that is absent is simply skipped.
+    bool alloc_params_buffer() override {
+        if (clip_l && !clip_l->alloc_params_buffer()) {
+            return false;
+        }
+        if (t5 && !t5->alloc_params_buffer()) {
+            return false;
+        }
+        return true;
+    }
+
+    // Releases the parameter buffers of the encoders that exist, clip_l
+    // before t5. A null encoder pointer is skipped.
+    void free_params_buffer() override {
+        auto release = [](const auto& encoder) {
+            if (encoder != nullptr) {
+                encoder->free_params_buffer();
+            }
+        };
+        release(clip_l);
+        release(t5);
+    }
+
+    // Returns the combined parameter buffer size reported by clip_l and
+    // t5. An encoder that was not constructed contributes zero.
+    size_t get_params_buffer_size() override {
+        const size_t clip_l_bytes = clip_l ? clip_l->get_params_buffer_size() : 0;
+        const size_t t5_bytes     = t5 ? t5->get_params_buffer_size() : 0;
+        return clip_l_bytes + t5_bytes;
+    }
+
    void set_max_graph_vram_bytes(size_t max_vram_bytes) override {
        if (clip_l) {
            clip_l->set_max_graph_vram_bytes(max_vram_bytes);
@ -1003,7 +1070,7 @@ struct FluxCLIPEmbedder : public Conditioner {
        }
    }

-    void set_weight_adapter(const std::shared_ptr<WeightAdapter>& adapter) override {
+    void set_weight_adapter(const std::shared_ptr<WeightAdapter>& adapter) {
        if (clip_l) {
            clip_l->set_weight_adapter(adapter);
        }
@ -1012,15 +1079,6 @@ struct FluxCLIPEmbedder : public Conditioner {
        }
    }

-    void runner_done() override {
-        if (clip_l) {
-            clip_l->runner_done();
-        }
-        if (t5) {
-            t5->runner_done();
-        }
-    }
-
    std::vector<std::pair<std::vector<int>, std::vector<float>>> tokenize(std::string text,
                                                                          size_t min_length = 0,
                                                                          size_t max_length = 0) {
@ -1119,10 +1177,7 @@ struct FluxCLIPEmbedder : public Conditioner {
                                             nullptr,
                                             max_token_idx,
                                             true,
-                                             clip_skip,
-                                             false,
-                                             true,
-                                             true);
+                                             clip_skip);
                    GGML_ASSERT(!pooled.empty());
                } else {
                    pooled = sd::Tensor<float>::zeros({768});
@ -1140,10 +1195,7 @@ struct FluxCLIPEmbedder : public Conditioner {
                sd::Tensor<int32_t> input_ids({static_cast<int64_t>(chunk_tokens.size())}, chunk_tokens);
                chunk_hidden_states = t5->compute(n_threads,
                                                  input_ids,
-                                                  sd::Tensor<float>(),
-                                                  false,
-                                                  true,
-                                                  true);
+                                                  sd::Tensor<float>());
                GGML_ASSERT(!chunk_hidden_states.empty());
                chunk_hidden_states = ::apply_token_weights(std::move(chunk_hidden_states), chunk_weights);
                if (zero_out_masked) {
@ -1187,11 +1239,11 @@ struct T5CLIPEmbedder : public Conditioner {
    bool is_umt5     = false;

    T5CLIPEmbedder(ggml_backend_t backend,
-                   const String2TensorStorage& tensor_storage_map      = {},
-                   bool use_mask                                       = false,
-                   int mask_pad                                        = 0,
-                   bool is_umt5                                        = false,
-                   std::shared_ptr<RunnerWeightManager> weight_manager = nullptr)
+                   ggml_backend_t params_backend,
+                   const String2TensorStorage& tensor_storage_map = {},
+                   bool use_mask                                  = false,
+                   int mask_pad                                   = 0,
+                   bool is_umt5                                   = false)
        : use_mask(use_mask), mask_pad(mask_pad), t5_tokenizer(is_umt5) {
        bool use_t5 = false;
        for (auto pair : tensor_storage_map) {
@ -1204,7 +1256,7 @@ struct T5CLIPEmbedder : public Conditioner {
            LOG_WARN("IMPORTANT NOTICE: No text encoders provided, cannot process prompts!");
            return;
        } else {
-            t5 = std::make_shared<T5Runner>(backend, tensor_storage_map, "text_encoders.t5xxl.transformer", is_umt5, weight_manager);
+            t5 = std::make_shared<T5Runner>(backend, params_backend, tensor_storage_map, "text_encoders.t5xxl.transformer", is_umt5);
        }
    }

@ -1214,6 +1266,29 @@ struct T5CLIPEmbedder : public Conditioner {
        }
    }

+    // Allocates the T5 runner's parameter buffer and reports the runner's
+    // result. When t5 is null there is nothing to allocate and the call
+    // reports success.
+    bool alloc_params_buffer() override {
+        if (!t5) {
+            return true;
+        }
+        return t5->alloc_params_buffer();
+    }
+
+    // Releases the T5 runner's parameter buffer; does nothing when no
+    // runner exists.
+    void free_params_buffer() override {
+        if (!t5) {
+            return;
+        }
+        t5->free_params_buffer();
+    }
+
+    // Returns the parameter buffer size reported by the T5 runner, or zero
+    // when no runner exists.
+    size_t get_params_buffer_size() override {
+        return t5 ? t5->get_params_buffer_size() : 0;
+    }
+
    void set_max_graph_vram_bytes(size_t max_vram_bytes) override {
        if (t5) {
            t5->set_max_graph_vram_bytes(max_vram_bytes);
@ -1238,12 +1313,6 @@ struct T5CLIPEmbedder : public Conditioner {
        }
    }

-    void runner_done() override {
-        if (t5) {
-            t5->runner_done();
-        }
-    }
-
    std::tuple<std::vector<int>, std::vector<float>, std::vector<float>> tokenize(std::string text,
                                                                                  size_t min_length = 0,
                                                                                  size_t max_length = 0) {
@ -1337,10 +1406,7 @@ struct T5CLIPEmbedder : public Conditioner {

            auto chunk_hidden_states = t5->compute(n_threads,
                                                   input_ids,
-                                                   t5_attn_mask_chunk,
-                                                   false,
-                                                   true,
-                                                   true);
+                                                   t5_attn_mask_chunk);
            GGML_ASSERT(!chunk_hidden_states.empty());
            chunk_hidden_states = apply_token_weights(std::move(chunk_hidden_states), chunk_weights);

@ -1384,21 +1450,36 @@ struct AnimaConditioner : public Conditioner {
    std::shared_ptr<LLM::LLMRunner> llm;

    AnimaConditioner(ggml_backend_t backend,
-                     const String2TensorStorage& tensor_storage_map      = {},
-                     std::shared_ptr<RunnerWeightManager> weight_manager = nullptr) {
+                     ggml_backend_t params_backend,
+                     const String2TensorStorage& tensor_storage_map = {}) {
        qwen_tokenizer = std::make_shared<Qwen2Tokenizer>();
        llm            = std::make_shared<LLM::LLMRunner>(LLM::LLMArch::QWEN3,
                                               backend,
+                                               params_backend,
                                               tensor_storage_map,
                                               "text_encoders.llm",
-                                               false,
-                                               weight_manager);
+                                               false);
    }

    // Delegates to the LLM runner's get_param_tensors, passing the caller's
    // map together with the fixed prefix "text_encoders.llm".
    void get_param_tensors(std::map<std::string, ggml_tensor*>& tensors) override {
        auto& runner = *llm;
        runner.get_param_tensors(tensors, "text_encoders.llm");
    }

+    // Allocates the LLM runner's parameter buffer; the result is exactly
+    // what the runner reports.
+    bool alloc_params_buffer() override {
+        return llm->alloc_params_buffer();
+    }
+
+    // Hands the release of the parameter buffer over to the LLM runner.
+    void free_params_buffer() override {
+        auto& runner = *llm;
+        runner.free_params_buffer();
+    }
+
+    // Returns the parameter buffer size reported by the LLM runner.
+    size_t get_params_buffer_size() override {
+        const size_t llm_bytes = llm->get_params_buffer_size();
+        return llm_bytes;
+    }
+
    // Passes max_vram_bytes through unchanged to the LLM runner's setter
    // of the same name.
    void set_max_graph_vram_bytes(size_t max_vram_bytes) override {
        auto& runner = *llm;
        runner.set_max_graph_vram_bytes(max_vram_bytes);
    }
@ -1415,10 +1496,6 @@ struct AnimaConditioner : public Conditioner {
        llm->set_weight_adapter(adapter);
    }

-    void runner_done() override {
-        llm->runner_done();
-    }
-
    std::tuple<std::vector<int>, std::vector<float>, std::vector<int>, std::vector<float>> tokenize(std::string text) {
        auto parsed_attention = parse_prompt_attention(text);

@ -1476,11 +1553,7 @@ struct AnimaConditioner : public Conditioner {
                                          input_ids,
                                          sd::Tensor<float>(),
                                          {},
-                                          {},
-                                          false,
-                                          false,
-                                          true,
-                                          true);
+                                          {});
        GGML_ASSERT(!hidden_states.empty());
        hidden_states         = apply_token_weights(std::move(hidden_states), qwen_weights);
        auto t5_ids_tensor    = sd::Tensor<int32_t>::from_vector(t5_tokens);
@ -1503,11 +1576,11 @@ struct LLMEmbedder : public Conditioner {
    std::shared_ptr<LLM::LLMRunner> llm;

    LLMEmbedder(ggml_backend_t backend,
-                const String2TensorStorage& tensor_storage_map      = {},
-                SDVersion version                                   = VERSION_QWEN_IMAGE,
-                const std::string prefix                            = "",
-                bool enable_vision                                  = false,
-                std::shared_ptr<RunnerWeightManager> weight_manager = nullptr)
+                ggml_backend_t params_backend,
+                const String2TensorStorage& tensor_storage_map = {},
+                SDVersion version                              = VERSION_QWEN_IMAGE,
+                const std::string prefix                       = "",
+                bool enable_vision                             = false)
        : version(version) {
        LLM::LLMArch arch = LLM::LLMArch::QWEN2_5_VL;
        if (version == VERSION_FLUX2) {
@ -1518,7 +1591,7 @@ struct LLMEmbedder : public Conditioner {
            arch = LLM::LLMArch::GPT_OSS_20B;
        } else if (sd_version_is_pid(version)) {
            arch = LLM::LLMArch::GEMMA2_2B;
-        } else if (sd_version_is_ideogram4(version) || sd_version_is_boogu_image(version)) {
+        } else if (sd_version_is_ideogram4(version)) {
            arch = LLM::LLMArch::QWEN3_VL;
        } else if (sd_version_is_z_image(version) || version == VERSION_OVIS_IMAGE || version == VERSION_FLUX2_KLEIN) {
            arch = LLM::LLMArch::QWEN3;
@ -1534,16 +1607,33 @@ struct LLMEmbedder : public Conditioner {
        }
        llm = std::make_shared<LLM::LLMRunner>(arch,
                                               backend,
+                                               params_backend,
                                               tensor_storage_map,
                                               "text_encoders.llm",
-                                               enable_vision,
-                                               weight_manager);
+                                               enable_vision);
    }

    // Delegates to the LLM runner's get_param_tensors, passing the caller's
    // map together with the fixed prefix "text_encoders.llm".
    void get_param_tensors(std::map<std::string, ggml_tensor*>& tensors) override {
        auto& runner = *llm;
        runner.get_param_tensors(tensors, "text_encoders.llm");
    }

+    // Allocates the LLM runner's parameter buffer and reports whether the
+    // runner's allocation succeeded.
+    bool alloc_params_buffer() override {
+        const bool allocated = llm->alloc_params_buffer();
+        return allocated;
+    }
+
+    // Hands the release of the parameter buffer over to the LLM runner.
+    void free_params_buffer() override {
+        auto& runner = *llm;
+        runner.free_params_buffer();
+    }
+
+    // Returns the parameter buffer size reported by the LLM runner, which
+    // is the only runner this embedder queries here.
+    size_t get_params_buffer_size() override {
+        return llm->get_params_buffer_size();
+    }
+
    // Passes max_vram_bytes through unchanged to the LLM runner's setter
    // of the same name.
    void set_max_graph_vram_bytes(size_t max_vram_bytes) override {
        auto& runner = *llm;
        runner.set_max_graph_vram_bytes(max_vram_bytes);
    }
@ -1562,12 +1652,6 @@ struct LLMEmbedder : public Conditioner {
        }
    }

-    void runner_done() override {
-        if (llm) {
-            llm->runner_done();
-        }
-    }
-
    std::tuple<std::vector<int>, std::vector<float>, std::vector<float>> tokenize(std::string text,
                                                                                  const std::pair<int, int>& attn_range,
                                                                                  size_t min_length = 0,
@ -1663,11 +1747,7 @@ struct LLMEmbedder : public Conditioner {
                                          input_ids,
                                          attention_mask,
                                          image_embeds,
-                                          out_layers,
-                                          false,
-                                          false,
-                                          true,
-                                          true);
+                                          out_layers);
        GGML_ASSERT(!hidden_states.empty());
        hidden_states = apply_token_weights(std::move(hidden_states), weights);
        GGML_ASSERT(hidden_states.shape()[1] > prompt_template_encode_start_idx);
@ -1745,7 +1825,7 @@ struct LLMEmbedder : public Conditioner {

                    auto resized_image = clip_preprocess(image, w_bar, h_bar);

-                    auto image_embed = llm->encode_image(n_threads, resized_image, false, true, true);
+                    auto image_embed = llm->encode_image(n_threads, resized_image);
                    GGML_ASSERT(!image_embed.empty());
                    image_embeds.emplace_back(image_embed_idx, image_embed);
                    image_embed_idx += 1 + static_cast<int>(image_embed.shape()[1]) + 6;
@ -1778,65 +1858,6 @@ struct LLMEmbedder : public Conditioner {

                prompt += "<|im_end|>\n<|im_start|>assistant\n";
            }
-        } else if (sd_version_is_boogu_image(version)) {
-            prompt_template_encode_start_idx = 0;
-
-            const std::string t2i_system_prompt =
-                "You are a helpful assistant that generates high-quality images based on user instructions. The instructions are as follows.";
-            const std::string edit_system_prompt =
-                "Describe the key features of the input image (color, shape, size, texture, objects, background), then explain how the user's text instruction should alter or modify the image. Generate a new image that meets the user's requirements while maintaining consistency with the original input where appropriate.";
-            const bool has_ref_images = llm->enable_vision && conditioner_params.ref_images != nullptr && !conditioner_params.ref_images->empty();
-            const bool text_empty     = conditioner_params.text.find_first_not_of(" \t\r\n") == std::string::npos;
-
-            if (has_ref_images) {
-                LOG_INFO("BooguImageEditPipeline");
-                const std::string prompt_prefix = "<|im_start|>system\n" + edit_system_prompt + "<|im_end|>\n<|im_start|>user\n";
-                std::string img_prompt;
-                const std::string placeholder = "<|image_pad|>";
-
-                for (int i = 0; i < conditioner_params.ref_images->size(); i++) {
-                    const auto& image = (*conditioner_params.ref_images)[i];
-                    double factor     = llm->config.vision.patch_size * llm->config.vision.spatial_merge_size;
-                    int height        = static_cast<int>(image.shape()[1]);
-                    int width         = static_cast<int>(image.shape()[0]);
-                    double beta       = std::sqrt((384.0 * 384.0) / (static_cast<double>(height) * static_cast<double>(width)));
-                    int h_bar         = std::max(static_cast<int>(factor),
-                                                 static_cast<int>(std::round(height * beta / factor)) * static_cast<int>(factor));
-                    int w_bar         = std::max(static_cast<int>(factor),
-                                                 static_cast<int>(std::round(width * beta / factor)) * static_cast<int>(factor));
-
-                    LOG_DEBUG("resize conditioner ref image %d from %dx%d to %dx%d", i, height, width, h_bar, w_bar);
-
-                    auto resized_image = clip_preprocess(image, w_bar, h_bar);
-                    auto image_embed   = llm->encode_image(n_threads, resized_image, false, true, true);
-                    GGML_ASSERT(!image_embed.empty());
-
-                    std::string image_prefix = prompt_prefix + img_prompt + "<|vision_start|>";
-                    int image_embed_idx      = static_cast<int>(tokenizer->encode(image_prefix, nullptr).size());
-                    image_embeds.emplace_back(image_embed_idx, image_embed);
-
-                    img_prompt += "<|vision_start|>";
-                    int64_t num_image_tokens = image_embed.shape()[1];
-                    img_prompt.reserve(img_prompt.size() + static_cast<size_t>(num_image_tokens) * placeholder.size() + 32);
-                    for (int j = 0; j < num_image_tokens; j++) {
-                        img_prompt += placeholder;
-                    }
-                    img_prompt += "<|vision_end|>";
-                }
-
-                prompt                  = prompt_prefix + img_prompt;
-                prompt_attn_range.first = static_cast<int>(prompt.size());
-                prompt += conditioner_params.text;
-                prompt_attn_range.second = static_cast<int>(prompt.size());
-                prompt += "<|im_end|>\n";
-            } else {
-                const std::string& system_prompt = text_empty ? edit_system_prompt : t2i_system_prompt;
-                prompt                           = "<|im_start|>system\n" + system_prompt + "<|im_end|>\n<|im_start|>user\n";
-                prompt_attn_range.first          = static_cast<int>(prompt.size());
-                prompt += conditioner_params.text;
-                prompt_attn_range.second = static_cast<int>(prompt.size());
-                prompt += "<|im_end|>\n";
-            }
        } else if (sd_version_is_longcat(version)) {
            spell_quotes = true;

@ -1874,7 +1895,7 @@ struct LLMEmbedder : public Conditioner {
                    LOG_DEBUG("resize conditioner ref image %d from %dx%d to %dx%d", i, height, width, h_bar, w_bar);

                    auto resized_image = clip_preprocess(image, w_bar, h_bar);
-                    auto image_embed   = llm->encode_image(n_threads, resized_image, false, true, true);
+                    auto image_embed   = llm->encode_image(n_threads, resized_image);
                    GGML_ASSERT(!image_embed.empty());
                    image_embeds.emplace_back(image_embed_idx, image_embed);
                    image_embed_idx += 1 + static_cast<int>(image_embed.shape()[1]) + 6;
@ -2117,10 +2138,10 @@ struct LTXAVTextProjectionRunner : public GGMLRunner {
    LTXAVTextProjection model;

    LTXAVTextProjectionRunner(ggml_backend_t backend,
-                              const String2TensorStorage& tensor_storage_map      = {},
-                              const std::string& prefix                           = "",
-                              std::shared_ptr<RunnerWeightManager> weight_manager = nullptr)
-        : GGMLRunner(backend, weight_manager),
+                              ggml_backend_t params_backend,
+                              const String2TensorStorage& tensor_storage_map = {},
+                              const std::string& prefix                      = "")
+        : GGMLRunner(backend, params_backend),
          model(tensor_storage_map.find(prefix + ".video_aggregate_embed.weight") != tensor_storage_map.end()) {
        model.init(params_ctx, tensor_storage_map, prefix);
    }
@ -2142,15 +2163,11 @@ struct LTXAVTextProjectionRunner : public GGMLRunner {
        return gf;
    }

-    sd::Tensor<float> compute(int n_threads,
-                              const sd::Tensor<float>& x,
-                              bool auto_free           = true,
-                              bool free_compute_buffer = true,
-                              bool free_compute_params = true) {
+    sd::Tensor<float> compute(int n_threads, const sd::Tensor<float>& x) {
        auto get_graph = [&]() -> ggml_cgraph* {
            return build_graph(x);
        };
-        return take_or_empty(GGMLRunner::compute<float>(get_graph, n_threads, auto_free, free_compute_buffer, free_compute_params));
+        return take_or_empty(GGMLRunner::compute<float>(get_graph, n_threads, true));
    }
 };

@ -2165,22 +2182,22 @@ struct LTXAVEmbedder : public Conditioner {
    bool dual_projection = false;

    LTXAVEmbedder(ggml_backend_t backend,
-                  const String2TensorStorage& tensor_storage_map      = {},
-                  const std::string& llm_prefix                       = "text_encoders.llm",
-                  const std::string& projector_prefix                 = "text_embedding_projection",
-                  std::shared_ptr<RunnerWeightManager> weight_manager = nullptr) {
+                  ggml_backend_t params_backend,
+                  const String2TensorStorage& tensor_storage_map = {},
+                  const std::string& llm_prefix                  = "text_encoders.llm",
+                  const std::string& projector_prefix            = "text_embedding_projection") {
        tokenizer       = std::make_shared<GemmaTokenizer>();
        llm             = std::make_shared<LLM::LLMRunner>(LLM::LLMArch::GEMMA3_12B,
                                               backend,
+                                               params_backend,
                                               tensor_storage_map,
                                               llm_prefix,
-                                               false,
-                                               weight_manager);
+                                               false);
        dual_projection = tensor_storage_map.find(projector_prefix + ".video_aggregate_embed.weight") != tensor_storage_map.end();
        projector       = std::make_shared<LTXAVTextProjectionRunner>(backend,
+                                                                params_backend,
                                                                tensor_storage_map,
-                                                                projector_prefix,
-                                                                weight_manager);
+                                                                projector_prefix);
    }

    void get_param_tensors(std::map<std::string, ggml_tensor*>& tensors) override {
@ -2188,6 +2205,25 @@ struct LTXAVEmbedder : public Conditioner {
        projector->get_param_tensors(tensors, "text_embedding_projection");
    }

+    // Allocates the parameter buffers of the LLM runner and then of the
+    // projector. The projector's allocation is attempted only when the
+    // LLM runner's allocation succeeded; returns true only if both did.
+    bool alloc_params_buffer() override {
+        return llm->alloc_params_buffer() && projector->alloc_params_buffer();
+    }
+
+    // Releases the parameter buffers of the LLM runner and the projector,
+    // in that order.
+    void free_params_buffer() override {
+        auto& llm_runner       = *llm;
+        auto& projector_runner = *projector;
+        llm_runner.free_params_buffer();
+        projector_runner.free_params_buffer();
+    }
+
+    // Returns the sum of the parameter buffer sizes reported by the LLM
+    // runner and the projector.
+    size_t get_params_buffer_size() override {
+        const size_t llm_bytes       = llm->get_params_buffer_size();
+        const size_t projector_bytes = projector->get_params_buffer_size();
+        return llm_bytes + projector_bytes;
+    }
+
    void set_flash_attention_enabled(bool enabled) override {
        llm->set_flash_attention_enabled(enabled);
        projector->set_flash_attention_enabled(enabled);
@ -2203,11 +2239,6 @@ struct LTXAVEmbedder : public Conditioner {
        projector->set_weight_adapter(adapter);
    }

-    void runner_done() override {
-        llm->runner_done();
-        projector->runner_done();
-    }
-
    std::tuple<std::vector<int>, std::vector<float>, std::vector<float>> tokenize(std::string text,
                                                                                  const std::pair<int, int>& attn_range) {
        std::vector<std::pair<std::string, float>> parsed_attention;
@ -2271,9 +2302,6 @@ struct LTXAVEmbedder : public Conditioner {
                                          attention_mask,
                                          {},
                                          {},
-                                          true,
-                                          false,
-                                          true,
                                          true);
        GGML_ASSERT(!hidden_states.empty());
        hidden_states = apply_token_weights(std::move(hidden_states), weights);
@ -2333,7 +2361,7 @@ struct LTXAVEmbedder : public Conditioner {
        }

        hidden_states.reshape_({kNumStates * kHiddenSize, valid_tokens});
-        return projector->compute(n_threads, hidden_states, false, true, true);
+        return projector->compute(n_threads, hidden_states);
    }

    SDCondition get_learned_condition(int n_threads,
--- a/src/convert.cpp
+++ b/src/convert.cpp
@ -99,7 +99,7 @@ bool convert(const char* input_path,
        model_loader.convert_tensors_name();
    }

-    ggml_type type             = sd_type_to_ggml_type(output_type);
+    ggml_type type             = (ggml_type)output_type;
    bool output_is_safetensors = ends_with(output_path, ".safetensors");
    TensorTypeRules type_rules = parse_tensor_type_rules(tensor_type_rules);

--- a/src/core/ggml_extend.hpp
+++ b/src/core/ggml_extend.hpp
--- a/src/core/ggml_extend_backend.cpp
+++ b/src/core/ggml_extend_backend.cpp
@ -45,10 +45,6 @@ static bool is_default_backend_token(const std::string& name) {
    return lower.empty() || lower == "default" || lower == "auto";
 }

-static bool is_disk_backend_token(const std::string& name) {
-    return lower_copy(trim_copy(name)) == "disk";
-}
-
 static bool parse_backend_module(const std::string& raw_name, SDBackendModule* module) {
    std::string name = lower_copy(trim_copy(raw_name));
    name.erase(std::remove(name.begin(), name.end(), '-'), name.end());
@ -204,36 +200,6 @@ void ggml_ext_im_set_f32_1d(const struct ggml_tensor* tensor, int i, float value
    }
 }

-bool add_rpc_devices(const std::string& servers) {
-    const std::string in = trim_copy(servers);
-    if (in.empty()) {
-        return true;
-    }
-    auto rpc_servers = split_copy(in, ',');
-    if (rpc_servers.empty()) {
-        LOG_ERROR("invalid RPC servers specification: '%s'", servers.c_str());
-        return false;
-    }
-    ggml_backend_reg_t rpc_reg = ggml_backend_reg_by_name("RPC");
-    if (!rpc_reg) {
-        LOG_ERROR("RPC backend not found, cannot add RPC servers");
-        return false;
-    }
-    typedef ggml_backend_reg_t (*ggml_backend_rpc_add_server_t)(const char* endpoint);
-    ggml_backend_rpc_add_server_t ggml_backend_rpc_add_server_fn = (ggml_backend_rpc_add_server_t)ggml_backend_reg_get_proc_address(rpc_reg, "ggml_backend_rpc_add_server");
-    if (!ggml_backend_rpc_add_server_fn) {
-        LOG_ERROR("RPC backend does not have ggml_backend_rpc_add_server function, cannot add RPC servers");
-        return false;
-    }
-    for (const auto& server : rpc_servers) {
-        LOG_INFO("Adding RPC server: %s", server.c_str());
-        auto reg = ggml_backend_rpc_add_server_fn(server.c_str());
-        // no return value to check for success but should print errors from the RPC backend if it fails to add the server
-        ggml_backend_register(reg);
-    }
-    return true;
-}
-
 static void ggml_backend_load_all_once() {
    // If the registry already has devices and the CPU backend is present,
    // assume either static registration or explicit host-side preloading has
@ -280,7 +246,7 @@ static std::string get_default_backend_name() {
    return resolve_first_device_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
 }

-std::string sd_backend_resolve_name(const std::string& name) {
+static std::string sd_resolve_backend_name(const std::string& name) {
    ggml_backend_load_all_once();
    std::string requested = trim_copy(name);
    std::string lower     = lower_copy(requested);
@ -318,7 +284,7 @@ std::string sd_backend_resolve_name(const std::string& name) {
 }

 static bool backend_name_exists(const std::string& name) {
-    return !sd_backend_resolve_name(name).empty();
+    return !sd_resolve_backend_name(name).empty();
 }

 static ggml_backend_t init_named_backend(const std::string& name) {
@ -328,7 +294,7 @@ static ggml_backend_t init_named_backend(const std::string& name) {
        return ggml_backend_init_best();
    }

-    std::string resolved = sd_backend_resolve_name(name);
+    std::string resolved = sd_resolve_backend_name(name);
    if (resolved.empty()) {
        return nullptr;
    }
@ -538,9 +504,6 @@ ggml_backend_t SDBackendManager::params_backend(SDBackendModule module) {
    if (name.empty()) {
        return runtime_backend(module);
    }
-    if (is_disk_backend_token(name)) {
-        return runtime_backend(module);
-    }
    return init_cached_backend(name);
 }

@ -552,10 +515,6 @@ bool SDBackendManager::params_backend_is_cpu(SDBackendModule module) {
    return sd_backend_is_cpu(params_backend(module));
 }

-bool SDBackendManager::params_backend_is_disk(SDBackendModule module) const {
-    return is_disk_backend_token(params_assignment_.get(module));
-}
-
 bool SDBackendManager::runtime_backend_supports_host_buffer(SDBackendModule module) {
    ggml_backend_t backend = runtime_backend(module);
    if (backend == nullptr) {
@ -575,6 +534,10 @@ bool SDBackendManager::runtime_backend_supports_host_buffer(SDBackendModule modu

 bool SDBackendManager::init(const char* backend_spec,
                            const char* params_backend_spec,
+                            bool offload_params_to_cpu,
+                            bool keep_clip_on_cpu,
+                            bool keep_vae_on_cpu,
+                            bool keep_control_net_on_cpu,
                            std::string* error) {
    reset();

@ -585,21 +548,31 @@ bool SDBackendManager::init(const char* backend_spec,
        return false;
    }

+    if (runtime_assignment_.empty()) {
+        if (keep_clip_on_cpu) {
+            runtime_assignment_.set_module(SDBackendModule::TE, "cpu");
+        }
+        if (keep_vae_on_cpu) {
+            runtime_assignment_.set_module(SDBackendModule::VAE, "cpu");
+        }
+        if (keep_control_net_on_cpu) {
+            runtime_assignment_.set_module(SDBackendModule::CONTROL_NET, "cpu");
+        }
+    }
+
+    if (params_assignment_.empty() && offload_params_to_cpu) {
+        params_assignment_.set_default("cpu");
+    }
+
    return validate(error);
 }

 bool SDBackendManager::validate(std::string* error) const {
-    auto validate_runtime_name = [&](const std::string& name) -> bool {
+    auto validate_name = [&](const std::string& name) -> bool {
        if (is_default_backend_token(name)) {
            return true;
        }
-        if (is_disk_backend_token(name)) {
-            if (error != nullptr) {
-                *error = "backend 'disk' is only supported by params_backend";
-            }
-            return false;
-        }
-        if (!sd_backend_resolve_name(name).empty()) {
+        if (!sd_resolve_backend_name(name).empty()) {
            return true;
        }
        if (error != nullptr) {
@ -607,24 +580,18 @@ bool SDBackendManager::validate(std::string* error) const {
        }
        return false;
    };
-    auto validate_params_name = [&](const std::string& name) -> bool {
-        if (is_disk_backend_token(name)) {
-            return true;
-        }
-        return validate_runtime_name(name);
-    };

-    if (!validate_runtime_name(runtime_assignment_.default_name) ||
-        !validate_params_name(params_assignment_.default_name)) {
+    if (!validate_name(runtime_assignment_.default_name) ||
+        !validate_name(params_assignment_.default_name)) {
        return false;
    }
    for (const auto& kv : runtime_assignment_.module_names) {
-        if (!validate_runtime_name(kv.second)) {
+        if (!validate_name(kv.second)) {
            return false;
        }
    }
    for (const auto& kv : params_assignment_.module_names) {
-        if (!validate_params_name(kv.second)) {
+        if (!validate_name(kv.second)) {
            return false;
        }
    }
@ -632,7 +599,7 @@ bool SDBackendManager::validate(std::string* error) const {
 }

 ggml_backend_t SDBackendManager::init_cached_backend(const std::string& name) {
-    std::string resolved   = sd_backend_resolve_name(name);
+    std::string resolved   = sd_resolve_backend_name(name);
    std::string key        = lower_copy(resolved);
    ggml_backend_t backend = nullptr;

--- a/src/core/ggml_extend_backend.h
+++ b/src/core/ggml_extend_backend.h
@ -51,6 +51,10 @@ public:

    bool init(const char* backend_spec,
              const char* params_backend_spec,
+              bool offload_params_to_cpu,
+              bool keep_clip_on_cpu,
+              bool keep_vae_on_cpu,
+              bool keep_control_net_on_cpu,
              std::string* error);
    void reset();

@ -59,7 +63,6 @@ public:

    bool runtime_backend_is_cpu(SDBackendModule module);
    bool params_backend_is_cpu(SDBackendModule module);
-    bool params_backend_is_disk(SDBackendModule module) const;
    bool runtime_backend_supports_host_buffer(SDBackendModule module);

 private:
@ -71,8 +74,6 @@ bool sd_backend_is(ggml_backend_t backend, const std::string& name);
 bool sd_backend_is_cpu(ggml_backend_t backend);
 ggml_backend_t sd_backend_cpu_init();
 bool sd_backend_cpu_set_n_threads(ggml_backend_t backend_cpu, int n_threads);
-std::string sd_backend_resolve_name(const std::string& name);
 const char* sd_backend_module_name(SDBackendModule module);
 void ggml_ext_im_set_f32_1d(const struct ggml_tensor* tensor, int i, float value);
-bool add_rpc_devices(const std::string& servers);
 #endif  // __SD_CORE_GGML_EXTEND_BACKEND_H__
--- a/src/core/ggml_graph_cut.cpp
+++ b/src/core/ggml_graph_cut.cpp
@ -1,8 +1,6 @@
 #include "core/ggml_graph_cut.h"

 #include <algorithm>
-#include <cctype>
-#include <cmath>
 #include <cstring>
 #include <map>
 #include <set>
@ -10,7 +8,6 @@
 #include <stack>
 #include <unordered_map>

-#include "core/ggml_extend_backend.h"
 #include "core/util.h"
 #include "ggml-alloc.h"
 #include "ggml-backend.h"
@ -47,9 +44,7 @@ namespace sd::ggml_graph_cut {
        if (tensor == nullptr) {
            return false;
        }
-        return params_tensor_set.find(tensor) != params_tensor_set.end() ||
-               (tensor->view_src != nullptr &&
-                params_tensor_set.find(tensor->view_src) != params_tensor_set.end());
+        return params_tensor_set.find(tensor) != params_tensor_set.end();
    }

    static int graph_node_index_by_name(ggml_cgraph* gf, const char* name) {
@ -86,157 +81,6 @@ namespace sd::ggml_graph_cut {
               segment.output_bytes;
    }

-    static std::string lower_ascii_copy(std::string value) {
-        std::transform(value.begin(), value.end(), value.begin(), [](unsigned char c) {
-            return static_cast<char>(std::tolower(c));
-        });
-        return value;
-    }
-
-    static std::string normalize_backend_budget_key(const std::string& value) {
-        return lower_ascii_copy(trim(value));
-    }
-
-    static bool is_default_max_vram_key(const std::string& key) {
-        std::string normalized = normalize_backend_budget_key(key);
-        return normalized == "all" || normalized == "default" || normalized == "*";
-    }
-
-    static bool parse_max_vram_budget_value(const std::string& text, float* value, std::string* error) {
-        float parsed = 0.f;
-        if (!parse_strict_float(text, parsed) || !std::isfinite(parsed)) {
-            if (error != nullptr) {
-                *error = "invalid --max-vram value '" + text + "'";
-            }
-            return false;
-        }
-        *value = parsed;
-        return true;
-    }
-
-    static std::vector<std::string> backend_budget_keys(ggml_backend_t backend) {
-        std::vector<std::string> keys;
-        if (backend == nullptr) {
-            return keys;
-        }
-
-        ggml_backend_dev_t dev = ggml_backend_get_device(backend);
-        if (dev != nullptr) {
-            keys.push_back(normalize_backend_budget_key(ggml_backend_dev_name(dev)));
-        }
-        const char* backend_name = ggml_backend_name(backend);
-        if (backend_name != nullptr) {
-            keys.push_back(normalize_backend_budget_key(backend_name));
-        }
-        return keys;
-    }
-
-    void MaxVramAssignment::reset(float fallback_gib) {
-        default_gib = fallback_gib;
-        backend_gib.clear();
-        resolved_backend_bytes.clear();
-    }
-
-    bool MaxVramAssignment::parse(const std::string& raw_spec, std::string* error) {
-        const std::string in = trim(raw_spec);
-        if (in.empty()) {
-            return true;
-        }
-
-        for (const std::string& raw_part : split_string(in, ',')) {
-            const std::string part = trim(raw_part);
-            if (part.empty()) {
-                continue;
-            }
-
-            const size_t eq = part.find('=');
-            if (eq == std::string::npos) {
-                float value = 0.f;
-                if (!parse_max_vram_budget_value(part, &value, error)) {
-                    return false;
-                }
-                default_gib = value;
-                continue;
-            }
-
-            const std::string key        = trim(part.substr(0, eq));
-            const std::string value_text = trim(part.substr(eq + 1));
-            if (key.empty() || value_text.empty()) {
-                if (error != nullptr) {
-                    *error = "invalid --max-vram assignment '" + part + "'";
-                }
-                return false;
-            }
-
-            float value = 0.f;
-            if (!parse_max_vram_budget_value(value_text, &value, error)) {
-                return false;
-            }
-
-            if (is_default_max_vram_key(key)) {
-                default_gib = value;
-                continue;
-            }
-
-            const std::string backend_key = trim(key);
-            if (backend_key.empty()) {
-                if (error != nullptr) {
-                    *error = "invalid --max-vram backend key in '" + part + "'";
-                }
-                return false;
-            }
-            backend_gib[backend_key] = value;
-        }
-        resolved_backend_bytes.clear();
-        return true;
-    }
-
-    bool MaxVramAssignment::canonicalize_backend_keys(std::string* error) {
-        if (backend_gib.empty()) {
-            return true;
-        }
-
-        std::unordered_map<std::string, float> normalized;
-        for (const auto& kv : backend_gib) {
-            std::string resolved = sd_backend_resolve_name(kv.first);
-            if (resolved.empty()) {
-                if (error != nullptr) {
-                    *error = "unknown --max-vram backend '" + kv.first + "'";
-                }
-                return false;
-            }
-            normalized[normalize_backend_budget_key(resolved)] = kv.second;
-        }
-        backend_gib = std::move(normalized);
-        resolved_backend_bytes.clear();
-        return true;
-    }
-
-    size_t MaxVramAssignment::bytes_for_backend(ggml_backend_t backend) {
-        std::vector<std::string> keys = backend_budget_keys(backend);
-        const std::string cache_key   = keys.empty() ? std::string("<none>") : keys.front();
-        auto cached                   = resolved_backend_bytes.find(cache_key);
-        if (cached != resolved_backend_bytes.end()) {
-            return cached->second;
-        }
-
-        float budget_gib = default_gib;
-        if (!backend_gib.empty()) {
-            for (const std::string& key : keys) {
-                auto backend_it = backend_gib.find(key);
-                if (backend_it != backend_gib.end()) {
-                    budget_gib = backend_it->second;
-                    break;
-                }
-            }
-        }
-
-        const float resolved_gib          = resolve_max_vram_gib(budget_gib, backend);
-        const size_t bytes                = max_vram_gib_to_bytes(resolved_gib);
-        resolved_backend_bytes[cache_key] = bytes;
-        return bytes;
-    }
-
    size_t max_vram_gib_to_bytes(float max_vram) {
        if (max_vram <= 0.f) {
            return 0;
@ -291,24 +135,6 @@ namespace sd::ggml_graph_cut {
        return max_vram_bytes_to_gib(resolve_auto_max_vram_bytes(-max_vram, backend));
    }

-    static bool is_segment_output_needed_after(const Plan& plan,
-                                               size_t end_segment_index,
-                                               int output_node_index) {
-        if (end_segment_index + 1 >= plan.segments.size()) {
-            return false;
-        }
-        for (size_t seg_idx = end_segment_index + 1; seg_idx < plan.segments.size(); ++seg_idx) {
-            const auto& segment = plan.segments[seg_idx];
-            for (const auto& input_ref : segment.input_refs) {
-                if (input_ref.type == Segment::INPUT_PREVIOUS_CUT &&
-                    input_ref.node_index == output_node_index) {
-                    return true;
-                }
-            }
-        }
-        return false;
-    }
-
    static Segment make_segment_seed(const Plan& plan,
                                     size_t start_segment_index,
                                     size_t end_segment_index) {
@ -321,11 +147,8 @@ namespace sd::ggml_graph_cut {
        const auto& target_segment = plan.segments[end_segment_index];
        std::unordered_set<int> seen_output_node_indices;
        for (size_t seg_idx = start_segment_index; seg_idx <= end_segment_index; ++seg_idx) {
-            const bool is_boundary_segment = seg_idx == end_segment_index;
            for (int output_node_index : plan.segments[seg_idx].output_node_indices) {
-                if ((is_boundary_segment ||
-                     is_segment_output_needed_after(plan, end_segment_index, output_node_index)) &&
-                    seen_output_node_indices.insert(output_node_index).second) {
+                if (seen_output_node_indices.insert(output_node_index).second) {
                    seed.output_node_indices.push_back(output_node_index);
                }
            }
@ -577,6 +400,23 @@ namespace sd::ggml_graph_cut {
        return tensors;
    }

+    std::vector<ggml_tensor*> runtime_param_tensors(ggml_cgraph* gf, const Segment& segment, const char* log_desc) {  // param_tensors(gf, segment) minus entries without a buffer
+        std::vector<ggml_tensor*> tensors = param_tensors(gf, segment);
+        std::vector<ggml_tensor*> filtered_tensors;
+        filtered_tensors.reserve(tensors.size());  // upper bound: at most every tensor is kept
+        for (ggml_tensor* tensor : tensors) {
+            if (tensor_buffer(tensor) == nullptr) {  // NOTE(review): the warning below reads tensor->name, so this assumes tensor itself is never null — confirm
+                LOG_WARN("%s graph cut skipping param input without buffer: segment=%s tensor=%s",
+                         log_desc == nullptr ? "unknown" : log_desc,  // log_desc may be null; it is then logged as "unknown"
+                         segment.group_name.c_str(),
+                         tensor->name);
+                continue;  // dropped from the result, not treated as an error
+            }
+            filtered_tensors.push_back(tensor);
+        }
+        return filtered_tensors;
+    }
+
    std::unordered_set<std::string> collect_future_input_names(ggml_cgraph* gf,
                                                               const Plan& plan,
                                                               size_t current_segment_index) {
@ -647,44 +487,6 @@ namespace sd::ggml_graph_cut {
            return 0;
        }

-        struct TensorRuntimeBinding {
-            ggml_backend_buffer_t buffer = nullptr;
-            void* data                   = nullptr;
-            void* extra                  = nullptr;
-        };
-        std::unordered_map<ggml_tensor*, TensorRuntimeBinding> saved_bindings;
-        auto mark_measurement_external = [&](ggml_tensor* tensor) {
-            if (tensor == nullptr) {
-                return;
-            }
-            auto save_tensor = [&](ggml_tensor* t) {
-                if (t == nullptr || saved_bindings.find(t) != saved_bindings.end()) {
-                    return;
-                }
-                saved_bindings[t] = {t->buffer, t->data, t->extra};
-                // During real execution params and previous-cut inputs already
-                // have backend/cache buffers, so gallocr must not reserve them.
-                t->data = reinterpret_cast<void*>(static_cast<uintptr_t>(1));
-            };
-            save_tensor(tensor);
-            save_tensor(tensor->view_src);
-        };
-        for (const auto& input : segment.input_refs) {
-            if (input.type != Segment::INPUT_PARAM &&
-                input.type != Segment::INPUT_PREVIOUS_CUT) {
-                continue;
-            }
-            mark_measurement_external(input_tensor(gf, input));
-        }
-
-        std::unordered_map<ggml_tensor*, int32_t> saved_output_flags;
-        for (int output_node_index : segment.output_node_indices) {
-            ggml_tensor* output = ggml_graph_node(gf, output_node_index);
-            if (output != nullptr && saved_output_flags.find(output) == saved_output_flags.end()) {
-                saved_output_flags[output] = output->flags;
-            }
-        }
-
        ggml_context* graph_ctx    = nullptr;
        ggml_cgraph* segment_graph = build_segment_graph(gf, segment, &graph_ctx);
        ggml_gallocr_t allocr      = ggml_gallocr_new(ggml_backend_get_default_buffer_type(backend));
@ -700,14 +502,6 @@ namespace sd::ggml_graph_cut {

        ggml_gallocr_free(allocr);
        ggml_free(graph_ctx);
-        for (const auto& kv : saved_output_flags) {
-            kv.first->flags = kv.second;
-        }
-        for (const auto& kv : saved_bindings) {
-            kv.first->buffer = kv.second.buffer;
-            kv.first->data   = kv.second.data;
-            kv.first->extra  = kv.second.extra;
-        }
        return buffer_size;
    }

@ -875,8 +669,7 @@ namespace sd::ggml_graph_cut {
                GGML_ASSERT(!candidate_plan.segments.empty());

                const auto& candidate_segment = candidate_plan.segments.back();
-                const size_t candidate_bytes  = graph_cut_segment_vram_bytes(candidate_segment);
-                if (candidate_bytes > max_graph_vram_bytes) {
+                if (graph_cut_segment_vram_bytes(candidate_segment) > max_graph_vram_bytes) {
                    break;
                }

--- a/src/core/ggml_graph_cut.h
+++ b/src/core/ggml_graph_cut.h
@ -4,7 +4,6 @@
 #include <array>
 #include <cstdint>
 #include <string>
-#include <unordered_map>
 #include <unordered_set>
 #include <vector>

@ -69,17 +68,6 @@ namespace sd::ggml_graph_cut {

    static constexpr const char* GGML_RUNNER_CUT_PREFIX = "ggml_runner_cut:";

-    struct MaxVramAssignment {
-        float default_gib = 0.f;
-        std::unordered_map<std::string, float> backend_gib;
-        std::unordered_map<std::string, size_t> resolved_backend_bytes;
-
-        void reset(float fallback_gib);
-        bool parse(const std::string& raw_spec, std::string* error);
-        bool canonicalize_backend_keys(std::string* error);
-        size_t bytes_for_backend(ggml_backend_t backend);
-    };
-
    bool is_graph_cut_tensor(const ggml_tensor* tensor);
    std::string make_graph_cut_name(const std::string& group, const std::string& output);
    void mark_graph_cut(ggml_tensor* tensor, const std::string& group, const std::string& output);
@ -92,6 +80,7 @@ namespace sd::ggml_graph_cut {
    ggml_tensor* output_tensor(ggml_cgraph* gf, const Segment& segment, size_t output_index);
    ggml_tensor* input_tensor(ggml_cgraph* gf, const Segment::InputRef& input_ref);
    std::vector<ggml_tensor*> param_tensors(ggml_cgraph* gf, const Segment& segment);
+    std::vector<ggml_tensor*> runtime_param_tensors(ggml_cgraph* gf, const Segment& segment, const char* log_desc);
    std::unordered_set<std::string> collect_future_input_names(ggml_cgraph* gf,
                                                               const Plan& plan,
                                                               size_t current_segment_index);
--- a/src/core/layer_registry.cpp
+++ b/src/core/layer_registry.cpp
@ -0,0 +1,172 @@
+#include "core/layer_registry.h"
+
+#include <utility>
+
+#include "core/util.h"
+
+namespace sd::layer_registry {
+
+    // Adds `tensor` to the layer `name` (creating the layer on first use) and
+    // adds its ggml_nbytes() to the layer's byte total.
+    // A null tensor is ignored with a warning. If the layer is currently on the
+    // GPU it is moved back to the CPU first, so `tensors` and `gpu_twins` keep
+    // the same length; move_layer_to_cpu() refuses to run once they differ.
+    void LayerRegistry::register_layer(const std::string& name, ggml_tensor* tensor) {
+        if (tensor == nullptr) {
+            LOG_WARN("layer_registry: ignoring null tensor registered for '%s'",
+                     name.c_str());
+            return;
+        }
+
+        auto& info = layers_[name];
+        if (info.on_gpu) {
+            LOG_WARN("layer_registry: '%s' is on GPU; moving it back before adding a tensor",
+                     name.c_str());
+            move_layer_to_cpu(name);
+        }
+        info.tensors.push_back(tensor);
+        info.bytes += ggml_nbytes(tensor);
+    }
+
+    // Moves every tensor of layer `name` to the GPU backend: same-shaped twin
+    // tensors are allocated there, the data is copied over, and
+    // buffer/data/extra are swapped so the registered tensors point at the GPU
+    // copy while the twins hold the original bindings for move_layer_to_cpu().
+    // Returns true if the layer is marked on-GPU afterwards (also when it
+    // already was, or has no tensors); false if the layer is unknown, a backend
+    // is unset, a tensor has no buffer yet, or an allocation fails.
+    bool LayerRegistry::move_layer_to_gpu(const std::string& name) {
+        auto it = layers_.find(name);
+        if (it == layers_.end())
+            return false;
+
+        LayerInfo& info = it->second;
+        if (info.on_gpu)
+            return true;
+        if (gpu_backend_ == nullptr || cpu_backend_ == nullptr) {
+            LOG_ERROR("layer_registry: backends not set; cannot move '%s' to GPU",
+                      name.c_str());
+            return false;
+        }
+        if (info.tensors.empty()) {
+            info.on_gpu = true;
+            return true;
+        }
+
+        // ggml_backend_tensor_copy() reads src->buffer unconditionally, so bail
+        // out before allocating anything if a source tensor is not backed yet.
+        for (const ggml_tensor* cpu_t : info.tensors) {
+            if (cpu_t->buffer == nullptr) {
+                LOG_ERROR("layer_registry: tensor '%s' in '%s' has no buffer; cannot move to GPU",
+                          cpu_t->name, name.c_str());
+                return false;
+            }
+        }
+
+        // 1. Build a no_alloc context big enough to hold one twin tensor per CPU
+        //    tensor, plus a little overhead.
+        const size_t ctx_size = info.tensors.size() * ggml_tensor_overhead() + 1024;
+        ggml_init_params ctx_params{ctx_size, /*mem_buffer=*/nullptr, /*no_alloc=*/true};
+        ggml_context* twin_ctx = ggml_init(ctx_params);
+        if (twin_ctx == nullptr) {
+            LOG_ERROR("layer_registry: failed to allocate twin context for '%s'",
+                      name.c_str());
+            return false;
+        }
+
+        // 2. Create one GPU twin per CPU tensor. The twin shares the original
+        //    name so any name-based lookup keeps working.
+        std::vector<ggml_tensor*> gpu_twins;
+        gpu_twins.reserve(info.tensors.size());
+        for (ggml_tensor* cpu_t : info.tensors) {
+            ggml_tensor* twin = ggml_dup_tensor(twin_ctx, cpu_t);
+            if (cpu_t->name[0] != '\0') {
+                ggml_set_name(twin, cpu_t->name);
+            }
+            gpu_twins.push_back(twin);
+        }
+
+        // 3. Back the twins with a GPU buffer in one alloc call.
+        ggml_backend_buffer_t gpu_buffer = ggml_backend_alloc_ctx_tensors(twin_ctx, gpu_backend_);
+        if (gpu_buffer == nullptr) {
+            LOG_ERROR("layer_registry: failed to allocate GPU buffer for '%s'",
+                      name.c_str());
+            ggml_free(twin_ctx);
+            return false;
+        }
+
+        // 4. H2D copy + sync.
+        for (size_t i = 0; i < info.tensors.size(); ++i) {
+            ggml_backend_tensor_copy(info.tensors[i], gpu_twins[i]);
+        }
+        ggml_backend_synchronize(gpu_backend_);
+
+        // 5. Swap buffer/data/extra so the originals now point at GPU memory.
+        for (size_t i = 0; i < info.tensors.size(); ++i) {
+            std::swap(info.tensors[i]->buffer, gpu_twins[i]->buffer);
+            std::swap(info.tensors[i]->data, gpu_twins[i]->data);
+            std::swap(info.tensors[i]->extra, gpu_twins[i]->extra);
+        }
+
+        info.gpu_twins  = std::move(gpu_twins);
+        info.twin_ctx   = twin_ctx;
+        info.gpu_buffer = gpu_buffer;
+        info.on_gpu     = true;
+        return true;
+    }
+
+    // Undoes move_layer_to_gpu() for layer `name`: swaps the original
+    // buffer/data/extra back into the registered tensors, then frees the GPU
+    // buffer and the twin context. No tensor data is copied back.
+    // Returns true if the layer is off the GPU afterwards (also when it already
+    // was); false if the layer is unknown or the twin/tensor counts differ.
+    bool LayerRegistry::move_layer_to_cpu(const std::string& name) {
+        auto it = layers_.find(name);
+        if (it == layers_.end())
+            return false;
+
+        LayerInfo& info = it->second;
+        if (!info.on_gpu)
+            return true;
+        if (info.tensors.size() != info.gpu_twins.size()) {
+            LOG_ERROR("layer_registry: twin/tensor count mismatch for '%s'",
+                      name.c_str());
+            return false;
+        }
+
+        // 1. Swap back: originals point at CPU memory again.
+        for (size_t i = 0; i < info.tensors.size(); ++i) {
+            if (info.gpu_twins[i] == nullptr)
+                continue;
+            std::swap(info.tensors[i]->buffer, info.gpu_twins[i]->buffer);
+            std::swap(info.tensors[i]->data, info.gpu_twins[i]->data);
+            std::swap(info.tensors[i]->extra, info.gpu_twins[i]->extra);
+        }
+
+        // 2. Free the GPU buffer + twin context.
+        if (info.gpu_buffer != nullptr) {
+            ggml_backend_buffer_free(info.gpu_buffer);
+            info.gpu_buffer = nullptr;
+        }
+        if (info.twin_ctx != nullptr) {
+            ggml_free(info.twin_ctx);
+            info.twin_ctx = nullptr;
+        }
+        info.gpu_twins.clear();
+        info.on_gpu = false;
+        return true;
+    }
+
+    // True only when `name` is registered and currently marked as on the GPU.
+    bool LayerRegistry::is_layer_on_gpu(const std::string& name) const {
+        auto it = layers_.find(name);
+        return it != layers_.end() && it->second.on_gpu;
+    }
+
+    // Byte total accumulated by register_layer() for `name`, or 0 if unknown.
+    size_t LayerRegistry::get_layer_size(const std::string& name) const {
+        auto it = layers_.find(name);
+        return it != layers_.end() ? it->second.bytes : 0;
+    }
+
+}  // namespace sd::layer_registry
--- a/src/core/layer_registry.h
+++ b/src/core/layer_registry.h
@ -0,0 +1,50 @@
+#ifndef __SD_CORE_LAYER_REGISTRY_H__
+#define __SD_CORE_LAYER_REGISTRY_H__
+
+#include <map>
+#include <set>
+#include <string>
+#include <vector>
+
+#include "ggml-backend.h"
+#include "ggml.h"
+
+namespace sd::layer_registry {
+    // Bookkeeping for one named layer: its registered tensors plus the state needed to undo a move to the GPU.
+    struct LayerInfo {
+        std::vector<ggml_tensor*> tensors;             // tensors added through LayerRegistry::register_layer(); not owned here
+        std::vector<ggml_tensor*> gpu_twins;           // one twin per tensor, created by move_layer_to_gpu(); while on_gpu they hold the tensors' original buffer/data/extra
+        ggml_context* twin_ctx           = nullptr;  // ggml context the twins are created in; freed by move_layer_to_cpu()
+        ggml_backend_buffer_t gpu_buffer = nullptr;  // backend buffer allocated for the twins; freed by move_layer_to_cpu()
+        bool on_gpu                      = false;    // set by move_layer_to_gpu(), cleared by move_layer_to_cpu()
+        size_t bytes                     = 0;        // sum of ggml_nbytes() over the registered tensors
+    };
+    // Groups tensors into named layers and swaps a whole layer between its original memory and a GPU-backend copy.
+    class LayerRegistry {
+    public:
+        LayerRegistry() = default;  // backends stay null until set_backends(); move_layer_to_gpu() fails until then
+        LayerRegistry(ggml_backend_t gpu_backend, ggml_backend_t cpu_backend)
+            : gpu_backend_(gpu_backend), cpu_backend_(cpu_backend) {}
+
+        void set_backends(ggml_backend_t gpu_backend, ggml_backend_t cpu_backend) {  // stores the handles only; does not take ownership
+            gpu_backend_ = gpu_backend;
+            cpu_backend_ = cpu_backend;
+        }
+        void register_layer(const std::string& name, ggml_tensor* tensor);  // appends tensor to layer `name`, creating the layer if needed
+        bool move_layer_to_gpu(const std::string& name);                    // false if the layer is unknown, backends are unset, or allocation fails
+        bool move_layer_to_cpu(const std::string& name);                    // false if the layer is unknown or twin/tensor counts differ
+        bool is_layer_on_gpu(const std::string& name) const;                // false for unknown layers
+        size_t get_layer_size(const std::string& name) const;               // 0 for unknown layers
+        size_t get_layer_count() const { return layers_.size(); }
+
+        const std::map<std::string, LayerInfo>& layers() const { return layers_; }  // read-only view of every layer, keyed by name
+
+    private:  // NOTE(review): no destructor is declared, so twin_ctx/gpu_buffer of layers still on the GPU are released only via move_layer_to_cpu()
+        ggml_backend_t gpu_backend_ = nullptr;  // target backend for move_layer_to_gpu()
+        ggml_backend_t cpu_backend_ = nullptr;  // only null-checked by move_layer_to_gpu() in layer_registry.cpp
+        std::map<std::string, LayerInfo> layers_;
+    };
+
+}  // namespace sd::layer_registry
+
+#endif  // __SD_CORE_LAYER_REGISTRY_H__
--- a/src/core/util.cpp
+++ b/src/core/util.cpp
@ -406,15 +406,6 @@ std::vector<std::string> split_string(const std::string& str, char delimiter) {
    return result;
 }

-ggml_type sd_type_to_ggml_type(sd_type_t sdtype) {
-    const int type_value = static_cast<int>(sdtype);
-    if (type_value < std::min<int>(SD_TYPE_COUNT, GGML_TYPE_COUNT)) {
-        return static_cast<ggml_type>(type_value);
-    } else {
-        return GGML_TYPE_COUNT;
-    }
-}
-
 KeyValueArgs parse_key_value_args(const char* args, const char* context) {
    KeyValueArgs pairs;

@ -497,7 +488,7 @@ bool parse_strict_bool(const std::string& text, bool& value) {
    return false;
 }

-static std::string build_progress_bar(int step, int steps, char progress_char = '=', bool show_head = true) {
+static std::string build_progress_bar(int step, int steps) {
    std::string progress = "  |";
    int max_progress     = 50;
    int32_t current      = 0;
@ -507,21 +498,21 @@ static std::string build_progress_bar(int step, int steps, char progress_char =
    for (int i = 0; i < 50; i++) {
        if (i > current) {
            progress += " ";
-        } else if (show_head && i == current && i != max_progress - 1) {
+        } else if (i == current && i != max_progress - 1) {
            progress += ">";
        } else {
-            progress += progress_char;
+            progress += "=";
        }
    }
    progress += "|";
    return progress;
 }

-static void print_progress_line(int step, int steps, const std::string& speed_text, char progress_char = '=', bool show_head = true) {
+static void print_progress_line(int step, int steps, const std::string& speed_text) {
    if (step == 0) {
        return;
    }
-    std::string progress = build_progress_bar(step, steps, progress_char, show_head);
+    std::string progress = build_progress_bar(step, steps);
    const char* lf       = (step == steps ? "\n" : "");
    printf("\r%s %i/%i - %s\033[K%s", progress.c_str(), step, steps, speed_text.c_str(), lf);
    fflush(stdout);  // for linux
@ -561,9 +552,9 @@ void pretty_bytes_progress(int step, int steps, uint64_t bytes_processed, float

    double speed_mb = bytes_per_second / (1024.0 * 1024.0);
    if (speed_mb >= 1024.0) {
-        print_progress_line(step, steps, sd_format("%.2fGB/s", speed_mb / 1024.0), '#', false);
+        print_progress_line(step, steps, sd_format("%.2fGB/s", speed_mb / 1024.0));
    } else {
-        print_progress_line(step, steps, sd_format("%.2fMB/s", speed_mb), '#', false);
+        print_progress_line(step, steps, sd_format("%.2fMB/s", speed_mb));
    }
 }

--- a/src/core/util.h
+++ b/src/core/util.h
@ -80,8 +80,6 @@ void pretty_bytes_progress(int step, int steps, uint64_t bytes_processed, float

 void log_printf(sd_log_level_t level, const char* file, int line, const char* format, ...);

-ggml_type sd_type_to_ggml_type(sd_type_t sdtype);
-
 std::string trim(const std::string& s);

 std::vector<std::pair<std::string, float>> parse_prompt_attention(const std::string& text);
--- a/src/extensions/generation_extension.h
+++ b/src/extensions/generation_extension.h
@ -6,13 +6,10 @@
 #include <memory>
 #include <set>
 #include <string>
-#include <vector>

 #include "conditioning/conditioner.hpp"
 #include "core/ggml_extend_backend.h"
-#include "model/diffusion/model.hpp"
 #include "model_loader.h"
-#include "model_manager.h"
 #include "stable-diffusion.h"

 struct GenerationExtensionInitContext {
@ -20,20 +17,27 @@ struct GenerationExtensionInitContext {
    SDVersion version;
    const String2TensorStorage& tensor_storage_map;
    ModelLoader& model_loader;
-    std::shared_ptr<ModelManager> model_manager;
    int n_threads;
    std::function<bool(SDBackendModule)> ensure_backend_pair;
    std::function<ggml_backend_t(SDBackendModule)> backend_for;
    std::function<ggml_backend_t(SDBackendModule)> params_backend_for;
 };

+struct GenerationExtensionTensorContext {  // argument bundle for GenerationExtension::collect_param_tensors()
+    std::map<std::string, ggml_tensor*>& tensors;            // name -> tensor map, held by reference; presumably filled by the extension — TODO confirm
+    std::map<std::string, ggml_tensor*>& mmap_able_tensors;  // NOTE(review): looks like the subset of tensors that may be mmap-backed — confirm against the caller
+    std::function<bool(SDBackendModule)> module_can_mmap;    // per-module predicate; presumably decides which of the two maps a tensor goes into — verify
+};
+
 struct GenerationExtensionConditionContext {
    Conditioner* conditioner;
    ConditionerParams& condition_params;
    const sd_pm_params_t& pm_params;
-    const sd_pulid_params_t& pulid_params;
+    std::map<std::string, ggml_tensor*>& tensors;
+    SDVersion version;
    int n_threads;
    int total_steps;
+    bool free_params_immediately;
 };

 struct GenerationExtension {
@ -46,10 +50,14 @@ struct GenerationExtension {
    virtual bool init(const GenerationExtensionInitContext&) {
        return true;
    }
-    virtual void get_param_tensors(std::map<std::string, ggml_tensor*>&) {}
-    virtual void collect_loras(std::vector<ModelManager::LoraSpec>&) {}
+    virtual void collect_param_tensors(GenerationExtensionTensorContext&) {}
    virtual void add_ignore_tensors(std::set<std::string>&) const {}
-    virtual void runner_done() {}
+    virtual bool alloc_params_buffer() {
+        return true;
+    }
+    virtual size_t get_params_buffer_size() const {
+        return 0;
+    }
    virtual void reset_runtime_condition() {}
    virtual bool prepare_condition(GenerationExtensionConditionContext&) {
        return false;
@ -58,20 +66,8 @@ struct GenerationExtension {
                                                const SDCondition& condition) const {
        return condition;
    }
-
-    // Called in the denoise loop for each enabled extension, after the per-step
-    // DiffusionParams (including its version-specific `extra`) has been built,
-    // but before diffusion_model->compute(). Lets an extension feed data into
-    // the diffusion forward that the conditioning-side hooks can't reach -- it
-    // can set/override fields on `params` (typically the architecture-specific
-    // `params.extra`, e.g. a guidance tensor, control payload, or an identity
-    // embedding for an adapter that injects inside the model's blocks). The
-    // extension targets whichever `extra` variant matches the active model.
-    // Mutates `params` only, never the extension. Default no-op.
-    virtual void before_diffusion(DiffusionParams& /*params*/, int /*step*/) const {}
 };

 std::shared_ptr<GenerationExtension> create_photomaker_extension();
-std::shared_ptr<GenerationExtension> create_pulid_extension();

 #endif
--- a/src/extensions/photomaker_extension.cpp
+++ b/src/extensions/photomaker_extension.cpp
@ -7,6 +7,7 @@

 #include "core/tensor_ggml.hpp"
 #include "core/util.h"
+#include "model/adapter/lora.hpp"
 #include "model/adapter/pmid.hpp"

 static std::tuple<std::vector<int>, std::vector<float>, std::vector<bool>>
@ -102,6 +103,7 @@ static std::string remove_photomaker_trigger_from_prompt(FrozenCLIPEmbedderWithC

 struct PhotoMakerExtension : public GenerationExtension {
    std::shared_ptr<PhotoMakerIDEncoder> pmid_model;
+    std::shared_ptr<LoraModel> pmid_lora;
    bool enabled = false;
    std::string model_path;
    std::string trigger_word = "img";
@ -127,45 +129,54 @@ struct PhotoMakerExtension : public GenerationExtension {
        }

        PMVersion pm_version = std::strstr(model_path.c_str(), "v2") != nullptr ? PM_VERSION_2 : PM_VERSION_1;
+        pmid_model           = std::make_shared<PhotoMakerIDEncoder>(ctx.backend_for(SDBackendModule::PHOTOMAKER),
+                                                           ctx.params_backend_for(SDBackendModule::PHOTOMAKER),
+                                                           ctx.tensor_storage_map,
+                                                           "pmid",
+                                                           ctx.version,
+                                                           pm_version);
+        if (pm_version == PM_VERSION_2) {
+            LOG_INFO("using PhotoMaker Version 2");
+        }
+
+        pmid_lora               = std::make_shared<LoraModel>("pmid",
+                                                ctx.backend_for(SDBackendModule::PHOTOMAKER),
+                                                ctx.params_backend_for(SDBackendModule::PHOTOMAKER),
+                                                model_path,
+                                                "",
+                                                ctx.version);
+        auto lora_tensor_filter = [&](const std::string& tensor_name) {
+            return starts_with(tensor_name, "lora.model");
+        };
+        if (!pmid_lora->load_from_file(ctx.n_threads, lora_tensor_filter)) {
+            LOG_WARN("load photomaker lora tensors from %s failed", model_path.c_str());
+            return false;
+        }
+
        LOG_INFO("loading stacked ID embedding (PHOTOMAKER) model file from '%s'", model_path.c_str());
        if (!ctx.model_loader.init_from_file_and_convert_name(model_path, "pmid.")) {
            LOG_WARN("loading stacked ID embedding from '%s' failed", model_path.c_str());
            return true;
        }

-        pmid_model = std::make_shared<PhotoMakerIDEncoder>(ctx.backend_for(SDBackendModule::PHOTOMAKER),
-                                                           ctx.tensor_storage_map,
-                                                           "pmid",
-                                                           ctx.version,
-                                                           pm_version,
-                                                           20.f,
-                                                           ctx.model_manager);
-        if (pm_version == PM_VERSION_2) {
-            LOG_INFO("using PhotoMaker Version 2");
-        }
-
        enabled = true;
        return true;
    }

-    void get_param_tensors(std::map<std::string, ggml_tensor*>& tensors) override {
+    void collect_param_tensors(GenerationExtensionTensorContext& ctx) override {
        if (!enabled || pmid_model == nullptr) {
            return;
        }

-        pmid_model->get_param_tensors(tensors, "pmid");
-    }
-
-    void collect_loras(std::vector<ModelManager::LoraSpec>& loras) override {
-        if (!enabled || model_path.empty()) {
-            return;
+        std::map<std::string, ggml_tensor*> temp;
+        pmid_model->get_param_tensors(temp, "pmid");
+        bool do_mmap = ctx.module_can_mmap(SDBackendModule::PHOTOMAKER);
+        for (const auto& [key, tensor] : temp) {
+            ctx.tensors[key] = tensor;
+            if (do_mmap) {
+                ctx.mmap_able_tensors[key] = tensor;
+            }
        }
-        ModelManager::LoraSpec lora;
-        lora.path                      = model_path;
-        lora.multiplier                = 1.0f;
-        lora.tensor_name_prefix_filter = "lora.model";
-        lora.required                  = true;
-        loras.push_back(std::move(lora));
    }

    void add_ignore_tensors(std::set<std::string>& ignore_tensors) const override {
@ -175,10 +186,18 @@ struct PhotoMakerExtension : public GenerationExtension {
        ignore_tensors.insert("pmid.unet.");
    }

-    void runner_done() override {
-        if (pmid_model != nullptr) {
-            pmid_model->runner_done();
+    bool alloc_params_buffer() override {
+        if (!enabled || pmid_model == nullptr) {
+            return true;
        }
+        return pmid_model->alloc_params_buffer();
+    }
+
+    size_t get_params_buffer_size() const override {
+        if (!enabled || pmid_model == nullptr) {
+            return 0;
+        }
+        return pmid_model->get_params_buffer_size();
    }

    void reset_runtime_condition() override {
@ -188,10 +207,21 @@ struct PhotoMakerExtension : public GenerationExtension {

    bool prepare_condition(GenerationExtensionConditionContext& ctx) override {
        reset_runtime_condition();
-        if (!enabled || pmid_model == nullptr) {
+        if (!enabled || pmid_model == nullptr || pmid_lora == nullptr) {
            return false;
        }

+        if (!pmid_lora->applied) {
+            int64_t t0 = ggml_time_ms();
+            pmid_lora->apply(ctx.tensors, ctx.version, ctx.n_threads);
+            int64_t t1         = ggml_time_ms();
+            pmid_lora->applied = true;
+            LOG_INFO("pmid_lora apply completed, taking %.2fs", (t1 - t0) * 1.0f / 1000);
+            if (ctx.free_params_immediately) {
+                pmid_lora->free_params_buffer();
+            }
+        }
+
        bool pmv2 = pmid_model->get_version() == PM_VERSION_2;
        if (ctx.pm_params.id_images_count <= 0 || ctx.pm_params.id_images == nullptr) {
            LOG_WARN("Provided PhotoMaker model file, but NO input ID images");
@ -275,6 +305,9 @@ struct PhotoMakerExtension : public GenerationExtension {
        LOG_INFO("Photomaker ID Stacking, taking %" PRId64 " ms", t1 - t0);
        LOG_INFO("PHOTOMAKER: start_merge_step: %d", start_merge_step);

+        if (ctx.free_params_immediately) {
+            pmid_model->free_params_buffer();
+        }
        return true;
    }

--- a/src/extensions/pulid_extension.cpp
+++ b/src/extensions/pulid_extension.cpp
@ -1,123 +0,0 @@
-#include "extensions/generation_extension.h"
-
-#include <cstring>
-#include <variant>
-
-#include "core/tensor_ggml.hpp"
-#include "core/util.h"
-#include "gguf.h"
-
-static sd::Tensor<float> load_pulid_id_embedding(const char* path) {
-    sd::Tensor<float> empty;
-    if (path == nullptr || strlen(path) == 0) {
-        return empty;
-    }
-
-    struct ggml_context* ctx_data = nullptr;
-    struct gguf_init_params gp    = {/*.no_alloc =*/false, /*.ctx =*/&ctx_data};
-    struct gguf_context* gguf_ctx = gguf_init_from_file(path, gp);
-    if (gguf_ctx == nullptr || ctx_data == nullptr) {
-        LOG_WARN("PuLID id-embedding: cannot read gguf '%s'", path);
-        if (gguf_ctx != nullptr)
-            gguf_free(gguf_ctx);
-        if (ctx_data != nullptr)
-            ggml_free(ctx_data);
-        return empty;
-    }
-
-    struct ggml_tensor* t = ggml_get_tensor(ctx_data, "pulid_id");
-    if (t == nullptr) {
-        LOG_WARN("PuLID id-embedding: no 'pulid_id' tensor in '%s'", path);
-        gguf_free(gguf_ctx);
-        ggml_free(ctx_data);
-        return empty;
-    }
-
-    const int64_t token_dim  = t->ne[0];
-    const int64_t num_tokens = t->ne[1];
-    if (token_dim <= 0 || num_tokens <= 0 || token_dim > 65536 || num_tokens > 1024 ||
-        t->ne[2] != 1 || t->ne[3] != 1) {
-        LOG_WARN("PuLID id-embedding: implausible shape [%lld, %lld] in '%s'",
-                 (long long)token_dim, (long long)num_tokens, path);
-        gguf_free(gguf_ctx);
-        ggml_free(ctx_data);
-        return empty;
-    }
-
-    const size_t n_elem = (size_t)token_dim * (size_t)num_tokens;
-    sd::Tensor<float> out({token_dim, num_tokens, 1});
-    float* dst = out.data();
-    if (t->type == GGML_TYPE_F32) {
-        memcpy(dst, t->data, n_elem * sizeof(float));
-    } else if (t->type == GGML_TYPE_F16) {
-        const ggml_fp16_t* src = reinterpret_cast<const ggml_fp16_t*>(t->data);
-        for (size_t i = 0; i < n_elem; i++) {
-            dst[i] = ggml_fp16_to_fp32(src[i]);
-        }
-    } else if (t->type == GGML_TYPE_BF16) {
-        const ggml_bf16_t* src = reinterpret_cast<const ggml_bf16_t*>(t->data);
-        for (size_t i = 0; i < n_elem; i++) {
-            dst[i] = ggml_bf16_to_fp32(src[i]);
-        }
-    } else {
-        LOG_WARN("PuLID id-embedding: unsupported tensor type %s in '%s'",
-                 ggml_type_name(t->type), path);
-        gguf_free(gguf_ctx);
-        ggml_free(ctx_data);
-        return empty;
-    }
-
-    LOG_INFO("PuLID id-embedding: loaded [%lld, %lld] type=%s from '%s'",
-             (long long)token_dim, (long long)num_tokens, ggml_type_name(t->type), path);
-    gguf_free(gguf_ctx);
-    ggml_free(ctx_data);
-    return out;
-}
-
-struct PuLIDExtension : public GenerationExtension {
-    bool enabled = false;
-    sd::Tensor<float> id_embedding;
-    float id_weight = 1.0f;
-
-    const char* name() const override {
-        return "pulid";
-    }
-
-    bool is_enabled() const override {
-        return enabled;
-    }
-
-    bool init(const GenerationExtensionInitContext& ctx) override {
-        enabled = strlen(SAFE_STR(ctx.params->pulid_weights_path)) > 0;
-        return true;
-    }
-
-    void reset_runtime_condition() override {
-        id_embedding = {};
-        id_weight    = 1.0f;
-    }
-
-    bool prepare_condition(GenerationExtensionConditionContext& ctx) override {
-        reset_runtime_condition();
-        if (!enabled) {
-            return false;
-        }
-        id_embedding = load_pulid_id_embedding(ctx.pulid_params.id_embedding_path);
-        id_weight    = ctx.pulid_params.id_weight;
-        return false;  // PuLID does not modify the conditioning
-    }
-
-    void before_diffusion(DiffusionParams& params, int /*step*/) const override {
-        if (!enabled || id_embedding.empty()) {
-            return;
-        }
-        if (auto* flux_extra = std::get_if<FluxDiffusionExtra>(&params.extra)) {
-            flux_extra->pulid_id        = &id_embedding;
-            flux_extra->pulid_id_weight = id_weight;
-        }
-    }
-};
-
-std::shared_ptr<GenerationExtension> create_pulid_extension() {
-    return std::make_shared<PuLIDExtension>();
-}
--- a/src/model.h
+++ b/src/model.h
@ -42,14 +42,12 @@ enum SDVersion {
    VERSION_LTXAV,
    VERSION_HIDREAM_O1,
    VERSION_Z_IMAGE,
-    VERSION_BOOGU_IMAGE,
    VERSION_OVIS_IMAGE,
    VERSION_ERNIE_IMAGE,
    VERSION_LENS,
    VERSION_LONGCAT,
    VERSION_PID,
    VERSION_IDEOGRAM4,
-    VERSION_ESRGAN,
    VERSION_COUNT,
 };

@ -144,13 +142,6 @@ static inline bool sd_version_is_z_image(SDVersion version) {
    return false;
 }

-static inline bool sd_version_is_boogu_image(SDVersion version) {
-    if (version == VERSION_BOOGU_IMAGE) {
-        return true;
-    }
-    return false;
-}
-
 static inline bool sd_version_is_longcat(SDVersion version) {
    if (version == VERSION_LONGCAT) {
        return true;
@ -186,13 +177,6 @@ static inline bool sd_version_is_ideogram4(SDVersion version) {
    return false;
 }

-static inline bool sd_version_uses_flux_vae(SDVersion version) {
-    if (sd_version_is_flux(version) || sd_version_is_z_image(version) || sd_version_is_boogu_image(version) || sd_version_is_longcat(version)) {
-        return true;
-    }
-    return false;
-}
-
 static inline bool sd_version_uses_flux2_vae(SDVersion version) {
    if (sd_version_is_flux2(version) || sd_version_is_ernie_image(version) || sd_version_is_lens(version) || sd_version_is_ideogram4(version)) {
        return true;
@ -221,7 +205,6 @@ static inline bool sd_version_is_dit(SDVersion version) {
        version == VERSION_HIDREAM_O1 ||
        sd_version_is_anima(version) ||
        sd_version_is_z_image(version) ||
-        sd_version_is_boogu_image(version) ||
        sd_version_is_ernie_image(version) ||
        sd_version_is_lens(version) ||
        sd_version_is_longcat(version) ||
--- a/src/model/adapter/lora.hpp
+++ b/src/model/adapter/lora.hpp
@ -4,7 +4,6 @@
 #include <mutex>
 #include "core/ggml_extend.hpp"
 #include "model_loader.h"
-#include "model_manager.h"

 #define LORA_GRAPH_BASE_SIZE 10240

@ -15,24 +14,22 @@ struct LoraModel : public GGMLRunner {
    std::map<ggml_tensor*, ggml_tensor*> original_tensor_to_final_tensor;
    std::set<std::string> applied_lora_tensors;
    std::string file_path;
-    std::shared_ptr<ModelManager> model_manager;
-    ggml_backend_t params_backend = nullptr;
-    bool load_failed              = false;
-    bool applied                  = false;
-    bool tensor_preprocessed      = false;
+    ModelLoader model_loader;
+    bool load_failed         = false;
+    bool applied             = false;
+    bool tensor_preprocessed = false;

    typedef std::function<bool(const std::string&)> filter_t;

    LoraModel(const std::string& lora_id,
              ggml_backend_t backend,
-              ggml_backend_t params_backend_,
-              const std::string& file_path          = "",
-              std::string prefix                    = "",
-              SDVersion version                     = VERSION_COUNT,
-              std::shared_ptr<ModelManager> manager = std::make_shared<ModelManager>())
-        : GGMLRunner(backend, manager), lora_id(lora_id), file_path(file_path), model_manager(std::move(manager)), params_backend(params_backend_) {
+              ggml_backend_t params_backend,
+              const std::string& file_path = "",
+              std::string prefix           = "",
+              SDVersion version            = VERSION_COUNT)
+        : GGMLRunner(backend, params_backend), lora_id(lora_id), file_path(file_path) {
        prefix = "lora." + prefix;
-        if (model_manager == nullptr || !model_manager->loader().init_from_file_and_convert_name(file_path, prefix, version)) {
+        if (!model_loader.init_from_file_and_convert_name(file_path, prefix, version)) {
            load_failed = true;
        }
    }
@ -74,11 +71,7 @@ struct LoraModel : public GGMLRunner {
            return true;
        };

-        if (model_manager != nullptr) {
-            model_manager->set_n_threads(n_threads);
-        }
-        ModelLoader& model_loader = model_manager->loader();
-        model_loader.load_tensors(on_new_tensor_cb);
+        model_loader.load_tensors(on_new_tensor_cb, n_threads);

        if (tensors_to_create.empty()) {
            return true;
@ -94,64 +87,25 @@ struct LoraModel : public GGMLRunner {
            lora_tensors[name] = real;
        }

-        std::map<std::string, ggml_tensor*> tensors;
-        for (const auto& pair : lora_tensors) {
-            tensors[pair.first] = pair.second;
-        }
-        if (model_manager == nullptr ||
-            !model_manager->register_param_tensors("LoRA",
-                                                   std::move(tensors),
-                                                   ModelManager::ResidencyMode::ParamBackend,
-                                                   runtime_backend,
-                                                   params_backend) ||
-            !model_manager->validate_registered_tensors()) {
-            LOG_ERROR("lora model manager registration failed");
-            return false;
-        }
-        std::vector<ggml_tensor*> lora_params;
-        lora_params.reserve(lora_tensors.size());
-        for (const auto& pair : lora_tensors) {
-            lora_params.push_back(pair.second);
-        }
-        if (!model_manager->prepare_params(lora_params)) {
-            LOG_ERROR("lora model manager prepare params failed");
+        if (!alloc_params_buffer()) {
+            LOG_ERROR("lora model buffer allocation failed");
            return false;
        }

+        dry_run = false;
+        model_loader.load_tensors(on_new_tensor_cb, n_threads);
+
        LOG_DEBUG("finished loaded lora");
        return true;
    }

-    void release_loaded_tensors() {
-        runner_done();
-        free_compute_buffer();
-        model_manager.reset();
-        free_params_ctx();
-        alloc_params_ctx();
-        model_manager  = std::make_shared<ModelManager>();
-        weight_manager = model_manager;
-        lora_tensors.clear();
-        original_tensor_to_final_tensor.clear();
-        applied_lora_tensors.clear();
-        applied             = false;
-        tensor_preprocessed = false;
-    }
-
-    static std::set<std::string> tensor_names(const std::map<std::string, ggml_tensor*>& model_tensors) {
-        std::set<std::string> names;
-        for (const auto& item : model_tensors) {
-            names.insert(item.first);
-        }
-        return names;
-    }
-
-    void preprocess_lora_tensors(const std::set<std::string>& model_tensor_names) {
+    void preprocess_lora_tensors(const std::map<std::string, ggml_tensor*>& model_tensors) {
        if (tensor_preprocessed) {
            return;
        }
        tensor_preprocessed = true;
        // I really hate these hardcoded processes.
-        if (model_tensor_names.find("cond_stage_model.1.transformer.text_model.encoder.layers.0.self_attn.in_proj.weight") != model_tensor_names.end()) {
+        if (model_tensors.find("cond_stage_model.1.transformer.text_model.encoder.layers.0.self_attn.in_proj.weight") != model_tensors.end()) {
            std::unordered_map<std::string, ggml_tensor*> new_lora_tensors;
            for (auto& [old_name, tensor] : lora_tensors) {
                std::string new_name = old_name;
@ -658,7 +612,7 @@ struct LoraModel : public GGMLRunner {
                if (lokr_w2)
                    applied_lora_tensors.insert(lokr_w2_name);
                if (lokr_w2_a)
-                    applied_lora_tensors.insert(lokr_w2_a_name);
+                    applied_lora_tensors.insert(lokr_w2_a_name);  // record the w2_a factor itself, not w2
                if (lokr_w2_b)
                    applied_lora_tensors.insert(lokr_w2_b_name);
                applied_lora_tensors.insert(alpha_name);
@ -799,13 +753,11 @@ struct LoraModel : public GGMLRunner {
        return out_diff;
    }

-    ggml_cgraph* build_lora_graph(const std::map<std::string, ggml_tensor*>& model_tensors,
-                                  const std::set<std::string>& model_tensor_names,
-                                  SDVersion version) {
+    ggml_cgraph* build_lora_graph(const std::map<std::string, ggml_tensor*>& model_tensors, SDVersion version) {
        size_t lora_graph_size = LORA_GRAPH_BASE_SIZE + lora_tensors.size() * 10;
        ggml_cgraph* gf        = ggml_new_graph_custom(compute_ctx, lora_graph_size, false);

-        preprocess_lora_tensors(model_tensor_names);
+        preprocess_lora_tensors(model_tensors);

        original_tensor_to_final_tensor.clear();
        applied_lora_tensors.clear();
@ -842,16 +794,12 @@ struct LoraModel : public GGMLRunner {
        return gf;
    }

-    void apply(std::map<std::string, ggml_tensor*> model_tensors,
-               const std::set<std::string>& model_tensor_names,
-               SDVersion version,
-               int n_threads,
-               bool warn_unused = true) {
+    void apply(std::map<std::string, ggml_tensor*> model_tensors, SDVersion version, int n_threads) {
        auto get_graph = [&]() -> ggml_cgraph* {
-            return build_lora_graph(model_tensors, model_tensor_names, version);
+            return build_lora_graph(model_tensors, version);
        };
-        GGMLRunner::compute<float>(get_graph, n_threads, false, false, false, true);
-        stat(!warn_unused);
+        GGMLRunner::compute<float>(get_graph, n_threads, false, true);
+        stat();
        for (auto item : original_tensor_to_final_tensor) {
            ggml_tensor* original_tensor = item.first;
            ggml_tensor* final_tensor    = item.second;
@ -862,10 +810,6 @@ struct LoraModel : public GGMLRunner {
        GGMLRunner::free_compute_buffer();
    }

-    void apply(std::map<std::string, ggml_tensor*> model_tensors, SDVersion version, int n_threads, bool warn_unused = true) {
-        apply(model_tensors, tensor_names(model_tensors), version, n_threads, warn_unused);
-    }
-
    void stat(bool at_runntime = false) {
        size_t total_lora_tensors_count   = 0;
        size_t applied_lora_tensors_count = 0;
--- a/src/model/adapter/pmid.hpp
+++ b/src/model/adapter/pmid.hpp
@ -413,13 +413,13 @@ public:

 public:
    PhotoMakerIDEncoder(ggml_backend_t backend,
+                        ggml_backend_t params_backend,
                        const String2TensorStorage& tensor_storage_map,
                        const std::string prefix,
-                        SDVersion version                                   = VERSION_SDXL,
-                        PMVersion pm_v                                      = PM_VERSION_1,
-                        float sty                                           = 20.f,
-                        std::shared_ptr<RunnerWeightManager> weight_manager = nullptr)
-        : GGMLRunner(backend, weight_manager),
+                        SDVersion version = VERSION_SDXL,
+                        PMVersion pm_v    = PM_VERSION_1,
+                        float sty         = 20.f)
+        : GGMLRunner(backend, params_backend),
          version(version),
          pm_version(pm_v),
          style_strength(sty) {
@ -558,25 +558,24 @@ public:
            return build_graph(id_pixel_values, prompt_embeds, class_tokens_mask, id_embeds);
        };

-        return take_or_empty(GGMLRunner::compute<float>(get_graph, n_threads, true, true, true));
+        return take_or_empty(GGMLRunner::compute<float>(get_graph, n_threads, true));
    }
 };

 struct PhotoMakerIDEmbed : public GGMLRunner {
    std::map<std::string, ggml_tensor*> tensors;
    std::string file_path;
-    std::shared_ptr<ModelManager> model_manager;
-    ggml_backend_t params_backend = nullptr;
-    bool load_failed              = false;
-    bool applied                  = false;
+    ModelLoader* model_loader;
+    bool load_failed = false;
+    bool applied     = false;

    PhotoMakerIDEmbed(ggml_backend_t backend,
-                      ggml_backend_t params_backend_,
-                      std::shared_ptr<ModelManager> manager = std::make_shared<ModelManager>(),
-                      const std::string& file_path          = "",
-                      const std::string& prefix             = "")
-        : GGMLRunner(backend, manager), file_path(file_path), model_manager(std::move(manager)), params_backend(params_backend_) {
-        if (model_manager == nullptr || !model_manager->loader().init_from_file_and_convert_name(file_path, prefix)) {
+                      ggml_backend_t params_backend,
+                      ModelLoader* ml,
+                      const std::string& file_path = "",
+                      const std::string& prefix    = "")
+        : GGMLRunner(backend, params_backend), file_path(file_path), model_loader(ml) {
+        if (!model_loader->init_from_file_and_convert_name(file_path, prefix)) {
            load_failed = true;
        }
    }
@ -617,28 +616,15 @@ struct PhotoMakerIDEmbed : public GGMLRunner {
            return true;
        };

-        model_manager->set_n_threads(n_threads);
-        ModelLoader& model_loader = model_manager->loader();
-        model_loader.load_tensors(on_new_tensor_cb);
-        if (!model_manager->register_param_tensors("PhotoMaker ID embeds",
-                                                   tensors,
-                                                   ModelManager::ResidencyMode::ParamBackend,
-                                                   runtime_backend,
-                                                   params_backend) ||
-            !model_manager->validate_registered_tensors()) {
-            LOG_ERROR("PhotoMaker ID embeds model manager registration failed");
-            return false;
-        }
-        std::vector<ggml_tensor*> id_embed_params;
-        id_embed_params.reserve(tensors.size());
-        for (const auto& pair : tensors) {
-            id_embed_params.push_back(pair.second);
-        }
-        if (!model_manager->prepare_params(id_embed_params)) {
-            LOG_ERROR("PhotoMaker ID embeds model manager prepare params failed");
+        model_loader->load_tensors(on_new_tensor_cb, n_threads);
+        if (!alloc_params_buffer()) {
+            LOG_ERROR("PhotoMaker ID embeds buffer allocation failed");
            return false;
        }

+        dry_run = false;
+        model_loader->load_tensors(on_new_tensor_cb, n_threads);
+
        LOG_DEBUG("finished loading PhotoMaker ID Embeds ");
        return true;
    }
--- a/src/model/adapter/pulid.hpp
+++ b/src/model/adapter/pulid.hpp
@ -1,76 +0,0 @@
-#ifndef __PULID_HPP__
-#define __PULID_HPP__
-
-#include "core/ggml_extend.hpp"
-#include "model/common/block.hpp"
-
-class PuLIDPerceiverAttentionCA : public GGMLBlock {
-public:
-    static constexpr int64_t DEFAULT_DIM      = 3072;  // Flux hidden size
-    static constexpr int64_t DEFAULT_DIM_HEAD = 128;
-    static constexpr int64_t DEFAULT_HEADS    = 16;
-    static constexpr int64_t DEFAULT_KV_DIM   = 2048;  // PuLID ID-embedding dim
-
-protected:
-    int64_t dim;
-    int64_t dim_head;
-    int64_t heads;
-    int64_t kv_dim;
-    int64_t inner_dim;
-
-public:
-    PuLIDPerceiverAttentionCA(int64_t dim      = DEFAULT_DIM,
-                              int64_t dim_head = DEFAULT_DIM_HEAD,
-                              int64_t heads    = DEFAULT_HEADS,
-                              int64_t kv_dim   = DEFAULT_KV_DIM)
-        : dim(dim),
-          dim_head(dim_head),
-          heads(heads),
-          kv_dim(kv_dim),
-          inner_dim(dim_head * heads) {
-        blocks["norm1"]  = std::shared_ptr<GGMLBlock>(new LayerNorm(kv_dim));
-        blocks["norm2"]  = std::shared_ptr<GGMLBlock>(new LayerNorm(dim));
-        blocks["to_q"]   = std::shared_ptr<GGMLBlock>(new Linear(dim, inner_dim, /*bias=*/false));
-        blocks["to_kv"]  = std::shared_ptr<GGMLBlock>(new Linear(kv_dim, inner_dim * 2, /*bias=*/false));
-        blocks["to_out"] = std::shared_ptr<GGMLBlock>(new Linear(inner_dim, dim, /*bias=*/false));
-    }
-
-    ggml_tensor* forward(GGMLRunnerContext* ctx,
-                         ggml_tensor* id_embedding,
-                         ggml_tensor* image_tokens) {
-        auto norm1  = std::dynamic_pointer_cast<LayerNorm>(blocks["norm1"]);
-        auto norm2  = std::dynamic_pointer_cast<LayerNorm>(blocks["norm2"]);
-        auto to_q   = std::dynamic_pointer_cast<Linear>(blocks["to_q"]);
-        auto to_kv  = std::dynamic_pointer_cast<Linear>(blocks["to_kv"]);
-        auto to_out = std::dynamic_pointer_cast<Linear>(blocks["to_out"]);
-
-        ggml_tensor* x_normed   = norm1->forward(ctx, id_embedding);
-        ggml_tensor* lat_normed = norm2->forward(ctx, image_tokens);
-
-        ggml_tensor* q  = to_q->forward(ctx, lat_normed);  // [N, T_img, 2048]
-        ggml_tensor* kv = to_kv->forward(ctx, x_normed);   // [N, T_img, 3072]
-
-        ggml_tensor* k = ggml_view_3d(ctx->ggml_ctx, kv,
-                                      inner_dim, kv->ne[1], kv->ne[2],
-                                      kv->nb[1], kv->nb[2],
-                                      /*offset=*/0);
-        ggml_tensor* v = ggml_view_3d(ctx->ggml_ctx, kv,
-                                      inner_dim, kv->ne[1], kv->ne[2],
-                                      kv->nb[1], kv->nb[2],
-                                      /*offset=*/inner_dim * ggml_element_size(kv));
-        k              = ggml_cont(ctx->ggml_ctx, k);
-        v              = ggml_cont(ctx->ggml_ctx, v);
-
-        ggml_tensor* attn_out = ggml_ext_attention_ext(
-            ctx->ggml_ctx, ctx->backend,
-            q, k, v,
-            heads,
-            /*mask=*/nullptr,
-            /*diag_mask_inf=*/false);
-
-        ggml_tensor* out = to_out->forward(ctx, attn_out);
-        return out;
-    }
-};
-
-#endif  // __PULID_HPP__
--- a/src/model/common/block.hpp
+++ b/src/model/common/block.hpp
@ -560,11 +560,11 @@ protected:
        params["mix_factor"] = ggml_new_tensor_1d(ctx, wtype, 1);
    }

-    ggml_tensor* get_alpha(GGMLRunnerContext* ctx) {
+    float get_alpha() {
        // image_only_indicator is always tensor([0.]) and since mix_factor.shape is [1,]
        // so learned_with_images is same as learned
-        auto mix_factor = ggml_ext_cast_f32(ctx->ggml_ctx, ctx->backend, params["mix_factor"]);
-        return ggml_sigmoid(ctx->ggml_ctx, mix_factor);
+        float alpha = ggml_ext_backend_tensor_get_f32(params["mix_factor"]);
+        return sigmoid(alpha);
    }

 public:
@ -578,12 +578,11 @@ public:
                         ggml_tensor* x_spatial,
                         ggml_tensor* x_temporal) {
        // image_only_indicator is always tensor([0.])
-        auto alpha = get_alpha(ctx);
-        return ggml_add(ctx->ggml_ctx,
-                        x_temporal,
-                        ggml_mul(ctx->ggml_ctx,
-                                 ggml_sub(ctx->ggml_ctx, x_spatial, x_temporal),
-                                 alpha));
+        float alpha = get_alpha();
+        auto x      = ggml_add(ctx->ggml_ctx,
+                               ggml_ext_scale(ctx->ggml_ctx, x_spatial, alpha),
+                               ggml_ext_scale(ctx->ggml_ctx, x_temporal, 1.0f - alpha));
+        return x;
    }
 };

--- a/src/model/common/rope.hpp
+++ b/src/model/common/rope.hpp
@ -253,8 +253,7 @@ namespace Rope {
                                                                 int bs,
                                                                 float theta,
                                                                 int head_dim,
-                                                                 const std::vector<int>& mrope_section,
-                                                                 const std::vector<std::vector<int>>& axis_wrap_dims = {}) {
+                                                                 const std::vector<int>& mrope_section) {
        GGML_ASSERT(bs > 0);
        GGML_ASSERT(head_dim % 2 == 0);
        GGML_ASSERT(mrope_section.size() >= 3);
@ -266,11 +265,7 @@ namespace Rope {
        std::vector<std::vector<std::vector<float>>> axis_embs;
        axis_embs.reserve(3);
        for (int axis = 0; axis < 3; ++axis) {
-            std::vector<int> axis_wrap;
-            if (axis < static_cast<int>(axis_wrap_dims.size())) {
-                axis_wrap = axis_wrap_dims[axis];
-            }
-            axis_embs.push_back(rope(trans_ids[axis], head_dim, theta, axis_wrap));
+            axis_embs.push_back(rope(trans_ids[axis], head_dim, theta));
        }

        std::vector<std::vector<float>> emb = axis_embs[0];
@ -899,12 +894,10 @@ namespace Rope {
        // q,k,v: [N, L, n_head, d_head]
        // pe: [L, d_head/2, 2, 2]
        // return: [N, L, n_head*d_head]
-        int64_t n_head = q->ne[1];
-
        q = apply_rope(ctx->ggml_ctx, q, pe, rope_interleaved);  // [N*n_head, L, d_head]
        k = apply_rope(ctx->ggml_ctx, k, pe, rope_interleaved);  // [N*n_head, L, d_head]

-        auto x = ggml_ext_attention_ext(ctx->ggml_ctx, ctx->backend, q, k, v, n_head, mask, true, ctx->flash_attn_enabled, kv_scale);  // [N, L, n_head*d_head]
+        auto x = ggml_ext_attention_ext(ctx->ggml_ctx, ctx->backend, q, k, v, v->ne[1], mask, true, ctx->flash_attn_enabled, kv_scale);  // [N, L, n_head*d_head]
        return x;
    }
 };  // namespace Rope
--- a/src/model/diffusion/anima.hpp
+++ b/src/model/diffusion/anima.hpp
@ -227,7 +227,6 @@ namespace Anima {
            k4 = k_norm->forward(ctx, k4);

            ggml_tensor* attn_out = nullptr;
-            float scale           = (sd_backend_is(ctx->backend, "Vulkan") && ctx->flash_attn_enabled) ? 1.0f / 32.0f : 1.0f;
            if (pe_q != nullptr || pe_k != nullptr) {
                if (pe_q == nullptr) {
                    pe_q = pe_k;
@ -245,8 +244,7 @@ namespace Anima {
                                                     num_heads,
                                                     nullptr,
                                                     true,
-                                                     ctx->flash_attn_enabled,
-                                                     scale);
+                                                     ctx->flash_attn_enabled);
            } else {
                auto q_flat = ggml_reshape_3d(ctx->ggml_ctx, q4, head_dim * num_heads, L_q, N);
                auto k_flat = ggml_reshape_3d(ctx->ggml_ctx, k4, head_dim * num_heads, L_k, N);
@ -258,8 +256,7 @@ namespace Anima {
                                                     num_heads,
                                                     nullptr,
                                                     false,
-                                                     ctx->flash_attn_enabled,
-                                                     scale);
+                                                     ctx->flash_attn_enabled);
            }

            return out_proj->forward(ctx, attn_out);
@ -564,10 +561,10 @@ namespace Anima {
        AnimaNet net;

        AnimaRunner(ggml_backend_t backend,
-                    const String2TensorStorage& tensor_storage_map      = {},
-                    const std::string prefix                            = "model.diffusion_model",
-                    std::shared_ptr<RunnerWeightManager> weight_manager = nullptr)
-            : DiffusionModelRunner(backend, prefix, weight_manager),
+                    ggml_backend_t params_backend,
+                    const String2TensorStorage& tensor_storage_map = {},
+                    const std::string prefix                       = "model.diffusion_model")
+            : DiffusionModelRunner(backend, params_backend, prefix),
              config(AnimaConfig::detect_from_weights(tensor_storage_map, prefix + ".net")) {
            net = AnimaNet(config);
            net.init(params_ctx, tensor_storage_map, prefix + ".net");
@ -700,7 +697,7 @@ namespace Anima {
            auto get_graph = [&]() -> ggml_cgraph* {
                return build_graph(x, timesteps, context, t5_ids, t5_weights);
            };
-            return restore_trailing_singleton_dims(GGMLRunner::compute<float>(get_graph, n_threads, false, false, false), x.dim());
+            return restore_trailing_singleton_dims(GGMLRunner::compute<float>(get_graph, n_threads, false), x.dim());
        }

        sd::Tensor<float> compute(int n_threads,
--- a/src/model/diffusion/boogu.hpp
+++ b/src/model/diffusion/boogu.hpp
@ -1,835 +0,0 @@
-#ifndef __SD_MODEL_DIFFUSION_BOOGU_HPP__
-#define __SD_MODEL_DIFFUSION_BOOGU_HPP__
-
-#include <algorithm>
-#include <cmath>
-#include <tuple>
-#include <vector>
-
-#include "core/ggml_extend.hpp"
-#include "model/common/rope.hpp"
-#include "model/diffusion/dit.hpp"
-#include "model/diffusion/model.hpp"
-#include "model/diffusion/qwen_image.hpp"
-#include "model_loader.h"
-
-namespace Boogu {
-    constexpr int BOOGU_GRAPH_SIZE = 65536;
-
-    struct BooguConfig {
-        int patch_size                   = 2;
-        int64_t in_channels              = 16;
-        int64_t out_channels             = 16;
-        int64_t hidden_size              = 3360;
-        int64_t num_layers               = 32;
-        int64_t num_double_stream_layers = 8;
-        int64_t num_refiner_layers       = 2;
-        int64_t num_attention_heads      = 28;
-        int64_t num_kv_heads             = 7;
-        int64_t head_dim                 = 120;
-        int64_t multiple_of              = 256;
-        int64_t instruction_feat_dim     = 4096;
-        int64_t timestep_embed_dim       = 1024;
-        int theta                        = 10000;
-        float timestep_scale             = 1000.0f;
-        float norm_eps                   = 1e-5f;
-        std::vector<int> axes_dim        = {40, 40, 40};
-        int64_t axes_dim_sum             = 120;
-
-        static int64_t count_blocks(const String2TensorStorage& tensor_storage_map,
-                                    const std::string& prefix,
-                                    const std::string& block_prefix) {
-            int64_t count = 0;
-            for (const auto& [name, _] : tensor_storage_map) {
-                if (!starts_with(name, prefix)) {
-                    continue;
-                }
-                size_t pos = name.find(block_prefix);
-                if (pos == std::string::npos) {
-                    continue;
-                }
-                auto items = split_string(name.substr(pos), '.');
-                if (items.size() > 1) {
-                    count = std::max<int64_t>(count, atoi(items[1].c_str()) + 1);
-                }
-            }
-            return count;
-        }
-
-        static BooguConfig detect_from_weights(const String2TensorStorage& tensor_storage_map, const std::string& prefix) {
-            BooguConfig config;
-            int64_t detected_head_dim = 0;
-            int64_t detected_kv_dim   = 0;
-
-            for (const auto& [name, tensor_storage] : tensor_storage_map) {
-                if (!starts_with(name, prefix)) {
-                    continue;
-                }
-                if (ends_with(name, "x_embedder.weight") && tensor_storage.n_dims == 2) {
-                    int64_t patch_area = config.patch_size * config.patch_size;
-                    config.in_channels = tensor_storage.ne[0] / patch_area;
-                    config.hidden_size = tensor_storage.ne[1];
-                } else if (ends_with(name, "time_caption_embed.caption_embedder.1.weight") && tensor_storage.n_dims == 2) {
-                    config.instruction_feat_dim = tensor_storage.ne[0];
-                    config.hidden_size          = tensor_storage.ne[1];
-                } else if (ends_with(name, "single_stream_layers.0.attn.norm_q.weight") && tensor_storage.n_dims == 1) {
-                    detected_head_dim = tensor_storage.ne[0];
-                } else if (ends_with(name, "double_stream_layers.0.img_self_attn.norm_q.weight") && tensor_storage.n_dims == 1) {
-                    detected_head_dim = tensor_storage.ne[0];
-                } else if (ends_with(name, "single_stream_layers.0.attn.to_k.weight") && tensor_storage.n_dims == 2) {
-                    detected_kv_dim = tensor_storage.ne[1];
-                } else if (ends_with(name, "double_stream_layers.0.img_instruct_attn.processor.img_to_k.weight") && tensor_storage.n_dims == 2) {
-                    detected_kv_dim = tensor_storage.ne[1];
-                } else if (ends_with(name, "norm_out.linear_2.weight") && tensor_storage.n_dims == 2) {
-                    int64_t patch_area  = config.patch_size * config.patch_size;
-                    config.out_channels = tensor_storage.ne[1] / patch_area;
-                }
-            }
-
-            config.num_layers               = std::max<int64_t>(1, count_blocks(tensor_storage_map, prefix, "single_stream_layers."));
-            config.num_double_stream_layers = std::max<int64_t>(0, count_blocks(tensor_storage_map, prefix, "double_stream_layers."));
-            int64_t noise_refiner_layers    = count_blocks(tensor_storage_map, prefix, "noise_refiner.");
-            int64_t ref_refiner_layers      = count_blocks(tensor_storage_map, prefix, "ref_image_refiner.");
-            int64_t context_refiner_layers  = count_blocks(tensor_storage_map, prefix, "context_refiner.");
-            config.num_refiner_layers       = std::max<int64_t>(1, std::max(noise_refiner_layers, std::max(ref_refiner_layers, context_refiner_layers)));
-
-            if (detected_head_dim > 0) {
-                config.head_dim            = detected_head_dim;
-                config.num_attention_heads = config.hidden_size / config.head_dim;
-                config.axes_dim_sum        = config.head_dim;
-                if (detected_kv_dim > 0) {
-                    config.num_kv_heads = detected_kv_dim / config.head_dim;
-                }
-                if (config.axes_dim_sum == 120) {
-                    config.axes_dim = {40, 40, 40};
-                } else if (config.axes_dim_sum % 3 == 0) {
-                    int axis        = static_cast<int>(config.axes_dim_sum / 3);
-                    config.axes_dim = {axis, axis, axis};
-                }
-            }
-            config.timestep_embed_dim = std::min<int64_t>(config.hidden_size, 1024);
-
-            LOG_DEBUG("boogu_image: layers=%" PRId64 ", double_stream_layers=%" PRId64 ", refiner_layers=%" PRId64 ", hidden=%" PRId64 ", heads=%" PRId64 ", kv_heads=%" PRId64 ", head_dim=%" PRId64 ", in_channels=%" PRId64 ", out_channels=%" PRId64,
-                      config.num_layers,
-                      config.num_double_stream_layers,
-                      config.num_refiner_layers,
-                      config.hidden_size,
-                      config.num_attention_heads,
-                      config.num_kv_heads,
-                      config.head_dim,
-                      config.in_channels,
-                      config.out_channels);
-            return config;
-        }
-    };
-
-    __STATIC_INLINE__ ggml_tensor* scale_modulate(ggml_context* ctx, ggml_tensor* x, ggml_tensor* scale) {
-        scale = ggml_reshape_3d(ctx, scale, scale->ne[0], 1, scale->ne[1]);
-        return ggml_add(ctx, x, ggml_mul(ctx, x, scale));
-    }
-
-    __STATIC_INLINE__ ggml_tensor* gate_residual(ggml_context* ctx, ggml_tensor* residual, ggml_tensor* x, ggml_tensor* gate) {
-        gate = ggml_tanh(ctx, gate);
-        gate = ggml_reshape_3d(ctx, gate, gate->ne[0], 1, gate->ne[1]);
-        x    = ggml_mul(ctx, x, gate);
-        return ggml_add(ctx, residual, x);
-    }
-
-    struct LuminaCombinedTimestepCaptionEmbedding : public GGMLBlock {
-        int64_t frequency_embedding_size;
-        float timestep_scale;
-
-        LuminaCombinedTimestepCaptionEmbedding(int64_t hidden_size,
-                                               int64_t instruction_feat_dim,
-                                               int64_t frequency_embedding_size,
-                                               float norm_eps,
-                                               float timestep_scale)
-            : frequency_embedding_size(frequency_embedding_size),
-              timestep_scale(timestep_scale) {
-            blocks["timestep_embedder"]  = std::make_shared<Qwen::TimestepEmbedding>(frequency_embedding_size, std::min<int64_t>(hidden_size, 1024));
-            blocks["caption_embedder.0"] = std::make_shared<RMSNorm>(instruction_feat_dim, norm_eps);
-            blocks["caption_embedder.1"] = std::make_shared<Linear>(instruction_feat_dim, hidden_size, true);
-        }
-
-        std::pair<ggml_tensor*, ggml_tensor*> forward(GGMLRunnerContext* ctx, ggml_tensor* timestep, ggml_tensor* text_hidden_states) {
-            auto timestep_embedder  = std::dynamic_pointer_cast<Qwen::TimestepEmbedding>(blocks["timestep_embedder"]);
-            auto caption_embedder_0 = std::dynamic_pointer_cast<RMSNorm>(blocks["caption_embedder.0"]);
-            auto caption_embedder_1 = std::dynamic_pointer_cast<Linear>(blocks["caption_embedder.1"]);
-
-            auto timestep_proj = ggml_ext_timestep_embedding(ctx->ggml_ctx, timestep, static_cast<int>(frequency_embedding_size), 10000, timestep_scale);
-            auto time_embed    = timestep_embedder->forward(ctx, timestep_proj);
-            auto caption_embed = caption_embedder_1->forward(ctx, caption_embedder_0->forward(ctx, text_hidden_states));
-            return {time_embed, caption_embed};
-        }
-    };
-
-    struct LuminaRMSNormZero : public GGMLBlock {
-        LuminaRMSNormZero(int64_t embedding_dim, int64_t conditioning_embedding_dim, float norm_eps) {
-            blocks["linear"] = std::make_shared<Linear>(conditioning_embedding_dim, 4 * embedding_dim, true);
-            blocks["norm"]   = std::make_shared<RMSNorm>(embedding_dim, norm_eps);
-        }
-
-        std::tuple<ggml_tensor*, ggml_tensor*, ggml_tensor*, ggml_tensor*> forward(GGMLRunnerContext* ctx, ggml_tensor* x, ggml_tensor* emb) {
-            auto linear = std::dynamic_pointer_cast<Linear>(blocks["linear"]);
-            auto norm   = std::dynamic_pointer_cast<RMSNorm>(blocks["norm"]);
-
-            emb       = linear->forward(ctx, ggml_silu(ctx->ggml_ctx, emb));
-            auto mods = ggml_ext_chunk(ctx->ggml_ctx, emb, 4, 0);
-
-            auto scale_msa = mods[0];
-            auto gate_msa  = mods[1];
-            auto scale_mlp = mods[2];
-            auto gate_mlp  = mods[3];
-
-            x = scale_modulate(ctx->ggml_ctx, norm->forward(ctx, x), scale_msa);
-            return {x, gate_msa, scale_mlp, gate_mlp};
-        }
-    };
-
-    struct LuminaFeedForward : public GGMLBlock {
-        LuminaFeedForward(int64_t dim, int64_t inner_dim, int64_t multiple_of) {
-            inner_dim          = multiple_of * ((inner_dim + multiple_of - 1) / multiple_of);
-            blocks["linear_1"] = std::make_shared<Linear>(dim, inner_dim, false);
-            blocks["linear_2"] = std::make_shared<Linear>(inner_dim, dim, false);
-            blocks["linear_3"] = std::make_shared<Linear>(dim, inner_dim, false);
-        }
-
-        ggml_tensor* forward(GGMLRunnerContext* ctx, ggml_tensor* x) {
-            auto linear_1 = std::dynamic_pointer_cast<Linear>(blocks["linear_1"]);
-            auto linear_2 = std::dynamic_pointer_cast<Linear>(blocks["linear_2"]);
-            auto linear_3 = std::dynamic_pointer_cast<Linear>(blocks["linear_3"]);
-
-            if (sd_backend_is(ctx->backend, "Vulkan")) {
-                linear_2->set_force_prec_f32(true);
-            }
-
-            auto h1 = linear_1->forward(ctx, x);
-            auto h2 = linear_3->forward(ctx, x);
-            x       = ggml_swiglu_split(ctx->ggml_ctx, h1, h2);
-            x       = linear_2->forward(ctx, x);
-            return x;
-        }
-    };
-
-    struct LuminaLayerNormContinuous : public GGMLBlock {
-        LuminaLayerNormContinuous(int64_t embedding_dim,
-                                  int64_t conditioning_embedding_dim,
-                                  int64_t out_dim) {
-            blocks["linear_1"] = std::make_shared<Linear>(conditioning_embedding_dim, embedding_dim, true);
-            blocks["norm"]     = std::make_shared<LayerNorm>(embedding_dim, 1e-6f, false);
-            blocks["linear_2"] = std::make_shared<Linear>(embedding_dim, out_dim, true);
-        }
-
-        ggml_tensor* forward(GGMLRunnerContext* ctx, ggml_tensor* x, ggml_tensor* conditioning_embedding) {
-            auto linear_1 = std::dynamic_pointer_cast<Linear>(blocks["linear_1"]);
-            auto norm     = std::dynamic_pointer_cast<LayerNorm>(blocks["norm"]);
-            auto linear_2 = std::dynamic_pointer_cast<Linear>(blocks["linear_2"]);
-
-            auto emb = linear_1->forward(ctx, ggml_silu(ctx->ggml_ctx, conditioning_embedding));
-            x        = scale_modulate(ctx->ggml_ctx, norm->forward(ctx, x), emb);
-            x        = linear_2->forward(ctx, x);
-            return x;
-        }
-    };
-
-    struct Attention : public GGMLBlock {
-        int64_t dim_head;
-        int64_t heads;
-        int64_t kv_heads;
-
-        Attention(int64_t query_dim, int64_t dim_head, int64_t heads, int64_t kv_heads, float eps = 1e-5f)
-            : dim_head(dim_head), heads(heads), kv_heads(kv_heads) {
-            blocks["to_q"]     = std::make_shared<Linear>(query_dim, heads * dim_head, false);
-            blocks["to_k"]     = std::make_shared<Linear>(query_dim, kv_heads * dim_head, false);
-            blocks["to_v"]     = std::make_shared<Linear>(query_dim, kv_heads * dim_head, false);
-            blocks["norm_q"]   = std::make_shared<RMSNorm>(dim_head, eps);
-            blocks["norm_k"]   = std::make_shared<RMSNorm>(dim_head, eps);
-            blocks["to_out.0"] = std::make_shared<Linear>(heads * dim_head, query_dim, false);
-        }
-
-        ggml_tensor* forward(GGMLRunnerContext* ctx,
-                             ggml_tensor* hidden_states,
-                             ggml_tensor* encoder_hidden_states,
-                             ggml_tensor* rotary_emb,
-                             ggml_tensor* attention_mask = nullptr) {
-            auto to_q     = std::dynamic_pointer_cast<Linear>(blocks["to_q"]);
-            auto to_k     = std::dynamic_pointer_cast<Linear>(blocks["to_k"]);
-            auto to_v     = std::dynamic_pointer_cast<Linear>(blocks["to_v"]);
-            auto norm_q   = std::dynamic_pointer_cast<RMSNorm>(blocks["norm_q"]);
-            auto norm_k   = std::dynamic_pointer_cast<RMSNorm>(blocks["norm_k"]);
-            auto to_out_0 = std::dynamic_pointer_cast<Linear>(blocks["to_out.0"]);
-
-            if (sd_backend_is(ctx->backend, "Vulkan")) {
-                to_out_0->set_force_prec_f32(true);
-            }
-
-            int64_t N  = hidden_states->ne[2];
-            int64_t Lq = hidden_states->ne[1];
-            int64_t Lk = encoder_hidden_states->ne[1];
-
-            auto q = to_q->forward(ctx, hidden_states);
-            q      = ggml_reshape_4d(ctx->ggml_ctx, q, dim_head, heads, Lq, N);
-            auto k = to_k->forward(ctx, encoder_hidden_states);
-            k      = ggml_reshape_4d(ctx->ggml_ctx, k, dim_head, kv_heads, Lk, N);
-            auto v = to_v->forward(ctx, encoder_hidden_states);
-            v      = ggml_reshape_4d(ctx->ggml_ctx, v, dim_head, kv_heads, Lk, N);
-
-            q = norm_q->forward(ctx, q);
-            k = norm_k->forward(ctx, k);
-
-            auto out = Rope::attention(ctx, q, k, v, rotary_emb, attention_mask);
-            out      = to_out_0->forward(ctx, out);
-            return out;
-        }
-    };
-
-    struct BooguImageTransformerBlock : public GGMLBlock {
-        bool modulation;
-
-        BooguImageTransformerBlock(int64_t dim,
-                                   int64_t num_attention_heads,
-                                   int64_t num_kv_heads,
-                                   int64_t multiple_of,
-                                   float norm_eps,
-                                   bool modulation)
-            : modulation(modulation) {
-            int64_t head_dim       = dim / num_attention_heads;
-            blocks["attn"]         = std::make_shared<Attention>(dim, head_dim, num_attention_heads, num_kv_heads, 1e-5f);
-            blocks["feed_forward"] = std::make_shared<LuminaFeedForward>(dim, 4 * dim, multiple_of);
-            if (modulation) {
-                blocks["norm1"] = std::make_shared<LuminaRMSNormZero>(dim, std::min<int64_t>(dim, 1024), norm_eps);
-            } else {
-                blocks["norm1"] = std::make_shared<RMSNorm>(dim, norm_eps);
-            }
-            blocks["ffn_norm1"] = std::make_shared<RMSNorm>(dim, norm_eps);
-            blocks["norm2"]     = std::make_shared<RMSNorm>(dim, norm_eps);
-            blocks["ffn_norm2"] = std::make_shared<RMSNorm>(dim, norm_eps);
-        }
-
-        ggml_tensor* forward(GGMLRunnerContext* ctx,
-                             ggml_tensor* hidden_states,
-                             ggml_tensor* rotary_emb,
-                             ggml_tensor* temb           = nullptr,
-                             ggml_tensor* attention_mask = nullptr) {
-            auto attn         = std::dynamic_pointer_cast<Attention>(blocks["attn"]);
-            auto feed_forward = std::dynamic_pointer_cast<LuminaFeedForward>(blocks["feed_forward"]);
-            auto ffn_norm1    = std::dynamic_pointer_cast<RMSNorm>(blocks["ffn_norm1"]);
-            auto norm2        = std::dynamic_pointer_cast<RMSNorm>(blocks["norm2"]);
-            auto ffn_norm2    = std::dynamic_pointer_cast<RMSNorm>(blocks["ffn_norm2"]);
-
-            if (modulation) {
-                auto norm1 = std::dynamic_pointer_cast<LuminaRMSNormZero>(blocks["norm1"]);
-                auto mods  = norm1->forward(ctx, hidden_states, temb);
-
-                auto norm_hidden_states = std::get<0>(mods);
-                auto gate_msa           = std::get<1>(mods);
-                auto scale_mlp          = std::get<2>(mods);
-                auto gate_mlp           = std::get<3>(mods);
-
-                auto attn_output = attn->forward(ctx, norm_hidden_states, norm_hidden_states, rotary_emb, attention_mask);
-                hidden_states    = gate_residual(ctx->ggml_ctx, hidden_states, norm2->forward(ctx, attn_output), gate_msa);
-
-                auto mlp_input  = scale_modulate(ctx->ggml_ctx, ffn_norm1->forward(ctx, hidden_states), scale_mlp);
-                auto mlp_output = feed_forward->forward(ctx, mlp_input);
-                hidden_states   = gate_residual(ctx->ggml_ctx, hidden_states, ffn_norm2->forward(ctx, mlp_output), gate_mlp);
-            } else {
-                auto norm1 = std::dynamic_pointer_cast<RMSNorm>(blocks["norm1"]);
-
-                auto norm_hidden_states = norm1->forward(ctx, hidden_states);
-                auto attn_output        = attn->forward(ctx, norm_hidden_states, norm_hidden_states, rotary_emb, attention_mask);
-                hidden_states           = ggml_add(ctx->ggml_ctx, hidden_states, norm2->forward(ctx, attn_output));
-
-                auto mlp_output = feed_forward->forward(ctx, ffn_norm1->forward(ctx, hidden_states));
-                hidden_states   = ggml_add(ctx->ggml_ctx, hidden_states, ffn_norm2->forward(ctx, mlp_output));
-            }
-            return hidden_states;
-        }
-    };
-
-    struct BooguImageJointAttention : public GGMLBlock {
-        int64_t dim_head;
-        int64_t heads;
-        int64_t kv_heads;
-
-        BooguImageJointAttention(int64_t dim, int64_t dim_head, int64_t heads, int64_t kv_heads)
-            : dim_head(dim_head), heads(heads), kv_heads(kv_heads) {
-            blocks["norm_q"]                  = std::make_shared<RMSNorm>(dim_head, 1e-5f);
-            blocks["norm_k"]                  = std::make_shared<RMSNorm>(dim_head, 1e-5f);
-            blocks["to_out.0"]                = std::make_shared<Linear>(heads * dim_head, dim, false);
-            blocks["processor.img_to_q"]      = std::make_shared<Linear>(dim, heads * dim_head, false);
-            blocks["processor.img_to_k"]      = std::make_shared<Linear>(dim, kv_heads * dim_head, false);
-            blocks["processor.img_to_v"]      = std::make_shared<Linear>(dim, kv_heads * dim_head, false);
-            blocks["processor.instruct_to_q"] = std::make_shared<Linear>(dim, heads * dim_head, false);
-            blocks["processor.instruct_to_k"] = std::make_shared<Linear>(dim, kv_heads * dim_head, false);
-            blocks["processor.instruct_to_v"] = std::make_shared<Linear>(dim, kv_heads * dim_head, false);
-            blocks["processor.instruct_out"]  = std::make_shared<Linear>(heads * dim_head, dim, false);
-            blocks["processor.img_out"]       = std::make_shared<Linear>(heads * dim_head, dim, false);
-        }
-
-        ggml_tensor* forward(GGMLRunnerContext* ctx,
-                             ggml_tensor* img_hidden_states,
-                             ggml_tensor* instruct_hidden_states,
-                             ggml_tensor* rotary_emb,
-                             ggml_tensor* attention_mask = nullptr) {
-            auto norm_q        = std::dynamic_pointer_cast<RMSNorm>(blocks["norm_q"]);
-            auto norm_k        = std::dynamic_pointer_cast<RMSNorm>(blocks["norm_k"]);
-            auto to_out_0      = std::dynamic_pointer_cast<Linear>(blocks["to_out.0"]);
-            auto img_to_q      = std::dynamic_pointer_cast<Linear>(blocks["processor.img_to_q"]);
-            auto img_to_k      = std::dynamic_pointer_cast<Linear>(blocks["processor.img_to_k"]);
-            auto img_to_v      = std::dynamic_pointer_cast<Linear>(blocks["processor.img_to_v"]);
-            auto instruct_to_q = std::dynamic_pointer_cast<Linear>(blocks["processor.instruct_to_q"]);
-            auto instruct_to_k = std::dynamic_pointer_cast<Linear>(blocks["processor.instruct_to_k"]);
-            auto instruct_to_v = std::dynamic_pointer_cast<Linear>(blocks["processor.instruct_to_v"]);
-            auto instruct_out  = std::dynamic_pointer_cast<Linear>(blocks["processor.instruct_out"]);
-            auto img_out       = std::dynamic_pointer_cast<Linear>(blocks["processor.img_out"]);
-
-            if (sd_backend_is(ctx->backend, "Vulkan")) {
-                to_out_0->set_force_prec_f32(true);
-            }
-
-            int64_t N          = img_hidden_states->ne[2];
-            int64_t L_img      = img_hidden_states->ne[1];
-            int64_t L_instruct = instruct_hidden_states->ne[1];
-
-            auto img_q = img_to_q->forward(ctx, img_hidden_states);
-            img_q      = ggml_reshape_4d(ctx->ggml_ctx, img_q, dim_head, heads, L_img, N);
-            auto img_k = img_to_k->forward(ctx, img_hidden_states);
-            img_k      = ggml_reshape_4d(ctx->ggml_ctx, img_k, dim_head, kv_heads, L_img, N);
-            auto img_v = img_to_v->forward(ctx, img_hidden_states);
-            img_v      = ggml_reshape_4d(ctx->ggml_ctx, img_v, dim_head, kv_heads, L_img, N);
-
-            auto instruct_q = instruct_to_q->forward(ctx, instruct_hidden_states);
-            instruct_q      = ggml_reshape_4d(ctx->ggml_ctx, instruct_q, dim_head, heads, L_instruct, N);
-            auto instruct_k = instruct_to_k->forward(ctx, instruct_hidden_states);
-            instruct_k      = ggml_reshape_4d(ctx->ggml_ctx, instruct_k, dim_head, kv_heads, L_instruct, N);
-            auto instruct_v = instruct_to_v->forward(ctx, instruct_hidden_states);
-            instruct_v      = ggml_reshape_4d(ctx->ggml_ctx, instruct_v, dim_head, kv_heads, L_instruct, N);
-
-            auto q = ggml_concat(ctx->ggml_ctx, instruct_q, img_q, 2);
-            auto k = ggml_concat(ctx->ggml_ctx, instruct_k, img_k, 2);
-            auto v = ggml_concat(ctx->ggml_ctx, instruct_v, img_v, 2);
-            q      = norm_q->forward(ctx, q);
-            k      = norm_k->forward(ctx, k);
-
-            auto hidden_states = Rope::attention(ctx, q, k, v, rotary_emb, attention_mask);
-            auto instruct_attn = ggml_ext_slice(ctx->ggml_ctx, hidden_states, 1, 0, L_instruct);
-            auto img_attn      = ggml_ext_slice(ctx->ggml_ctx, hidden_states, 1, L_instruct, L_instruct + L_img);
-
-            instruct_attn = instruct_out->forward(ctx, instruct_attn);
-            img_attn      = img_out->forward(ctx, img_attn);
-            hidden_states = ggml_concat(ctx->ggml_ctx, instruct_attn, img_attn, 1);
-            hidden_states = to_out_0->forward(ctx, hidden_states);
-            return hidden_states;
-        }
-    };
-
-    struct BooguImageDoubleStreamBlock : public GGMLBlock {
-        BooguImageDoubleStreamBlock(int64_t dim,
-                                    int64_t num_attention_heads,
-                                    int64_t num_kv_heads,
-                                    int64_t multiple_of,
-                                    float norm_eps) {
-            int64_t head_dim                = dim / num_attention_heads;
-            blocks["img_instruct_attn"]     = std::make_shared<BooguImageJointAttention>(dim, head_dim, num_attention_heads, num_kv_heads);
-            blocks["img_self_attn"]         = std::make_shared<Attention>(dim, head_dim, num_attention_heads, num_kv_heads, 1e-5f);
-            blocks["img_feed_forward"]      = std::make_shared<LuminaFeedForward>(dim, 4 * dim, multiple_of);
-            blocks["instruct_feed_forward"] = std::make_shared<LuminaFeedForward>(dim, 4 * dim, multiple_of);
-            blocks["img_norm1"]             = std::make_shared<LuminaRMSNormZero>(dim, std::min<int64_t>(dim, 1024), norm_eps);
-            blocks["img_norm2"]             = std::make_shared<LuminaRMSNormZero>(dim, std::min<int64_t>(dim, 1024), norm_eps);
-            blocks["img_norm3"]             = std::make_shared<LuminaRMSNormZero>(dim, std::min<int64_t>(dim, 1024), norm_eps);
-            blocks["instruct_norm1"]        = std::make_shared<LuminaRMSNormZero>(dim, std::min<int64_t>(dim, 1024), norm_eps);
-            blocks["instruct_norm2"]        = std::make_shared<LuminaRMSNormZero>(dim, std::min<int64_t>(dim, 1024), norm_eps);
-            blocks["img_attn_norm"]         = std::make_shared<RMSNorm>(dim, norm_eps);
-            blocks["img_self_attn_norm"]    = std::make_shared<RMSNorm>(dim, norm_eps);
-            blocks["img_ffn_norm1"]         = std::make_shared<RMSNorm>(dim, norm_eps);
-            blocks["img_ffn_norm2"]         = std::make_shared<RMSNorm>(dim, norm_eps);
-            blocks["instruct_attn_norm"]    = std::make_shared<RMSNorm>(dim, norm_eps);
-            blocks["instruct_ffn_norm1"]    = std::make_shared<RMSNorm>(dim, norm_eps);
-            blocks["instruct_ffn_norm2"]    = std::make_shared<RMSNorm>(dim, norm_eps);
-        }
-
-        std::pair<ggml_tensor*, ggml_tensor*> forward(GGMLRunnerContext* ctx,
-                                                      ggml_tensor* img_hidden_states,
-                                                      ggml_tensor* instruct_hidden_states,
-                                                      ggml_tensor* joint_rotary_emb,
-                                                      ggml_tensor* img_rotary_emb,
-                                                      ggml_tensor* temb) {
-            auto img_instruct_attn     = std::dynamic_pointer_cast<BooguImageJointAttention>(blocks["img_instruct_attn"]);
-            auto img_self_attn         = std::dynamic_pointer_cast<Attention>(blocks["img_self_attn"]);
-            auto img_feed_forward      = std::dynamic_pointer_cast<LuminaFeedForward>(blocks["img_feed_forward"]);
-            auto instruct_feed_forward = std::dynamic_pointer_cast<LuminaFeedForward>(blocks["instruct_feed_forward"]);
-            auto img_norm1             = std::dynamic_pointer_cast<LuminaRMSNormZero>(blocks["img_norm1"]);
-            auto img_norm2             = std::dynamic_pointer_cast<LuminaRMSNormZero>(blocks["img_norm2"]);
-            auto img_norm3             = std::dynamic_pointer_cast<LuminaRMSNormZero>(blocks["img_norm3"]);
-            auto instruct_norm1        = std::dynamic_pointer_cast<LuminaRMSNormZero>(blocks["instruct_norm1"]);
-            auto instruct_norm2        = std::dynamic_pointer_cast<LuminaRMSNormZero>(blocks["instruct_norm2"]);
-            auto img_attn_norm         = std::dynamic_pointer_cast<RMSNorm>(blocks["img_attn_norm"]);
-            auto img_self_attn_norm    = std::dynamic_pointer_cast<RMSNorm>(blocks["img_self_attn_norm"]);
-            auto img_ffn_norm1         = std::dynamic_pointer_cast<RMSNorm>(blocks["img_ffn_norm1"]);
-            auto img_ffn_norm2         = std::dynamic_pointer_cast<RMSNorm>(blocks["img_ffn_norm2"]);
-            auto instruct_attn_norm    = std::dynamic_pointer_cast<RMSNorm>(blocks["instruct_attn_norm"]);
-            auto instruct_ffn_norm1    = std::dynamic_pointer_cast<RMSNorm>(blocks["instruct_ffn_norm1"]);
-            auto instruct_ffn_norm2    = std::dynamic_pointer_cast<RMSNorm>(blocks["instruct_ffn_norm2"]);
-
-            int64_t L_instruct = instruct_hidden_states->ne[1];
-
-            auto img_norm1_out_vec      = img_norm1->forward(ctx, img_hidden_states, temb);
-            auto img_norm2_out_vec      = img_norm2->forward(ctx, img_hidden_states, temb);
-            auto img_norm3_out_vec      = img_norm3->forward(ctx, img_hidden_states, temb);
-            auto instruct_norm1_out_vec = instruct_norm1->forward(ctx, instruct_hidden_states, temb);
-            auto instruct_norm2_out_vec = instruct_norm2->forward(ctx, instruct_hidden_states, temb);
-
-            auto img_norm1_out = std::get<0>(img_norm1_out_vec);
-            auto img_gate_msa  = std::get<1>(img_norm1_out_vec);
-            auto img_scale_mlp = std::get<2>(img_norm1_out_vec);
-            auto img_gate_mlp  = std::get<3>(img_norm1_out_vec);
-
-            auto img_norm2_out = std::get<0>(img_norm2_out_vec);
-            auto img_shift_mlp = std::get<1>(img_norm2_out_vec);
-
-            auto img_norm3_out = std::get<0>(img_norm3_out_vec);
-            auto img_gate_self = std::get<1>(img_norm3_out_vec);
-
-            auto instruct_norm1_out = std::get<0>(instruct_norm1_out_vec);
-            auto instruct_gate_msa  = std::get<1>(instruct_norm1_out_vec);
-            auto instruct_scale_mlp = std::get<2>(instruct_norm1_out_vec);
-            auto instruct_gate_mlp  = std::get<3>(instruct_norm1_out_vec);
-
-            auto instruct_norm2_out = std::get<0>(instruct_norm2_out_vec);
-            auto instruct_shift_mlp = std::get<1>(instruct_norm2_out_vec);
-
-            auto joint_attn_out    = img_instruct_attn->forward(ctx, img_norm1_out, instruct_norm1_out, joint_rotary_emb);
-            auto instruct_attn_out = ggml_ext_slice(ctx->ggml_ctx, joint_attn_out, 1, 0, L_instruct);
-            auto img_attn_out      = ggml_ext_slice(ctx->ggml_ctx, joint_attn_out, 1, L_instruct, joint_attn_out->ne[1]);
-
-            auto img_self_attn_out = img_self_attn->forward(ctx, img_norm3_out, img_norm3_out, img_rotary_emb);
-
-            img_hidden_states = gate_residual(ctx->ggml_ctx, img_hidden_states, img_attn_norm->forward(ctx, img_attn_out), img_gate_msa);
-            img_hidden_states = gate_residual(ctx->ggml_ctx, img_hidden_states, img_self_attn_norm->forward(ctx, img_self_attn_out), img_gate_self);
-
-            auto img_mlp_input = scale_modulate(ctx->ggml_ctx, img_norm2_out, img_scale_mlp);
-            img_shift_mlp      = ggml_reshape_3d(ctx->ggml_ctx, img_shift_mlp, img_shift_mlp->ne[0], 1, img_shift_mlp->ne[1]);
-            img_mlp_input      = ggml_add(ctx->ggml_ctx, img_mlp_input, img_shift_mlp);
-            auto img_mlp_out   = img_feed_forward->forward(ctx, img_ffn_norm1->forward(ctx, img_mlp_input));
-            img_hidden_states  = gate_residual(ctx->ggml_ctx, img_hidden_states, img_ffn_norm2->forward(ctx, img_mlp_out), img_gate_mlp);
-
-            instruct_hidden_states  = gate_residual(ctx->ggml_ctx, instruct_hidden_states, instruct_attn_norm->forward(ctx, instruct_attn_out), instruct_gate_msa);
-            auto instruct_mlp_input = scale_modulate(ctx->ggml_ctx, instruct_norm2_out, instruct_scale_mlp);
-            instruct_shift_mlp      = ggml_reshape_3d(ctx->ggml_ctx, instruct_shift_mlp, instruct_shift_mlp->ne[0], 1, instruct_shift_mlp->ne[1]);
-            instruct_mlp_input      = ggml_add(ctx->ggml_ctx, instruct_mlp_input, instruct_shift_mlp);
-            auto instruct_mlp_out   = instruct_feed_forward->forward(ctx, instruct_ffn_norm1->forward(ctx, instruct_mlp_input));
-            instruct_hidden_states  = gate_residual(ctx->ggml_ctx, instruct_hidden_states, instruct_ffn_norm2->forward(ctx, instruct_mlp_out), instruct_gate_mlp);
-
-            return {img_hidden_states, instruct_hidden_states};
-        }
-    };
-
-    struct BooguImageModel : public GGMLBlock {
-        BooguConfig config;
-
-        void init_params(ggml_context* ctx, const String2TensorStorage& tensor_storage_map = {}, const std::string prefix = "") override {
-            GGML_UNUSED(tensor_storage_map);
-            GGML_UNUSED(prefix);
-            params["image_index_embedding"] = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, config.hidden_size, 5);
-        }
-
-        BooguImageModel() = default;
-        BooguImageModel(BooguConfig config)
-            : config(std::move(config)) {
-            blocks["x_embedder"]               = std::make_shared<Linear>(this->config.patch_size * this->config.patch_size * this->config.in_channels, this->config.hidden_size, true);
-            blocks["ref_image_patch_embedder"] = std::make_shared<Linear>(this->config.patch_size * this->config.patch_size * this->config.in_channels, this->config.hidden_size, true);
-            blocks["time_caption_embed"]       = std::make_shared<LuminaCombinedTimestepCaptionEmbedding>(this->config.hidden_size,
-                                                                                                    this->config.instruction_feat_dim,
-                                                                                                    256,
-                                                                                                    this->config.norm_eps,
-                                                                                                    this->config.timestep_scale);
-
-            for (int i = 0; i < this->config.num_refiner_layers; i++) {
-                blocks["noise_refiner." + std::to_string(i)]     = std::make_shared<BooguImageTransformerBlock>(this->config.hidden_size,
-                                                                                                            this->config.num_attention_heads,
-                                                                                                            this->config.num_kv_heads,
-                                                                                                            this->config.multiple_of,
-                                                                                                            this->config.norm_eps,
-                                                                                                            true);
-                blocks["ref_image_refiner." + std::to_string(i)] = std::make_shared<BooguImageTransformerBlock>(this->config.hidden_size,
-                                                                                                                this->config.num_attention_heads,
-                                                                                                                this->config.num_kv_heads,
-                                                                                                                this->config.multiple_of,
-                                                                                                                this->config.norm_eps,
-                                                                                                                true);
-                blocks["context_refiner." + std::to_string(i)]   = std::make_shared<BooguImageTransformerBlock>(this->config.hidden_size,
-                                                                                                              this->config.num_attention_heads,
-                                                                                                              this->config.num_kv_heads,
-                                                                                                              this->config.multiple_of,
-                                                                                                              this->config.norm_eps,
-                                                                                                              false);
-            }
-
-            for (int i = 0; i < this->config.num_double_stream_layers; i++) {
-                blocks["double_stream_layers." + std::to_string(i)] = std::make_shared<BooguImageDoubleStreamBlock>(this->config.hidden_size,
-                                                                                                                    this->config.num_attention_heads,
-                                                                                                                    this->config.num_kv_heads,
-                                                                                                                    this->config.multiple_of,
-                                                                                                                    this->config.norm_eps);
-            }
-
-            for (int i = 0; i < this->config.num_layers; i++) {
-                blocks["single_stream_layers." + std::to_string(i)] = std::make_shared<BooguImageTransformerBlock>(this->config.hidden_size,
-                                                                                                                   this->config.num_attention_heads,
-                                                                                                                   this->config.num_kv_heads,
-                                                                                                                   this->config.multiple_of,
-                                                                                                                   this->config.norm_eps,
-                                                                                                                   true);
-            }
-
-            blocks["norm_out"] = std::make_shared<LuminaLayerNormContinuous>(this->config.hidden_size,
-                                                                             this->config.timestep_embed_dim,
-                                                                             this->config.patch_size * this->config.patch_size * this->config.out_channels);
-        }
-
-        ggml_tensor* image_index_embedding(GGMLRunnerContext* ctx, int index) {
-            GGML_ASSERT(index >= 0 && index < 5);
-            auto embedding = params["image_index_embedding"];
-            auto out       = ggml_view_1d(ctx->ggml_ctx,
-                                          embedding,
-                                          config.hidden_size,
-                                          index * config.hidden_size * ggml_element_size(embedding));
-            out            = ggml_reshape_3d(ctx->ggml_ctx, out, config.hidden_size, 1, 1);
-            return out;
-        }
-
-        ggml_tensor* embed_refs(GGMLRunnerContext* ctx, const std::vector<ggml_tensor*>& ref_latents) {
-            if (ref_latents.empty()) {
-                return nullptr;
-            }
-            auto ref_image_patch_embedder = std::dynamic_pointer_cast<Linear>(blocks["ref_image_patch_embedder"]);
-
-            ggml_tensor* ref_img = nullptr;
-            for (int i = 0; i < static_cast<int>(ref_latents.size()); i++) {
-                auto ref = DiT::pad_and_patchify(ctx, ref_latents[i], config.patch_size, config.patch_size, false);
-                ref      = ref_image_patch_embedder->forward(ctx, ref);
-                ref      = ggml_add(ctx->ggml_ctx, ref, image_index_embedding(ctx, std::min(i, 4)));
-                ref_img  = ref_img == nullptr ? ref : ggml_concat(ctx->ggml_ctx, ref_img, ref, 1);
-            }
-            return ref_img;
-        }
-
-        ggml_tensor* forward(GGMLRunnerContext* ctx,
-                             ggml_tensor* x,
-                             ggml_tensor* timesteps,
-                             ggml_tensor* context,
-                             ggml_tensor* pe,
-                             std::vector<ggml_tensor*> ref_latents = {}) {
-            int64_t W = x->ne[0];
-            int64_t H = x->ne[1];
-            int64_t N = x->ne[3];
-            GGML_ASSERT(N == 1);
-
-            auto x_embedder         = std::dynamic_pointer_cast<Linear>(blocks["x_embedder"]);
-            auto time_caption_embed = std::dynamic_pointer_cast<LuminaCombinedTimestepCaptionEmbedding>(blocks["time_caption_embed"]);
-            auto norm_out           = std::dynamic_pointer_cast<LuminaLayerNormContinuous>(blocks["norm_out"]);
-
-            auto timestep = ggml_sub(ctx->ggml_ctx, ggml_ext_ones_like(ctx->ggml_ctx, timesteps), timesteps);
-            auto embeds   = time_caption_embed->forward(ctx, timestep, context);
-            auto temb     = embeds.first;
-            auto txt      = embeds.second;
-
-            auto img        = DiT::pad_and_patchify(ctx, x, config.patch_size, config.patch_size, false);
-            int64_t img_len = img->ne[1];
-            img             = x_embedder->forward(ctx, img);
-            auto ref_img    = embed_refs(ctx, ref_latents);
-            int64_t ref_len = ref_img != nullptr ? ref_img->ne[1] : 0;
-            int64_t txt_len = txt->ne[1];
-
-            GGML_ASSERT(pe->ne[3] == txt_len + ref_len + img_len);
-            auto txt_pe   = ggml_ext_slice(ctx->ggml_ctx, pe, 3, 0, txt_len);
-            auto noise_pe = ggml_ext_slice(ctx->ggml_ctx, pe, 3, txt_len + ref_len, txt_len + ref_len + img_len);
-
-            for (int i = 0; i < config.num_refiner_layers; i++) {
-                auto block = std::dynamic_pointer_cast<BooguImageTransformerBlock>(blocks["context_refiner." + std::to_string(i)]);
-                txt        = block->forward(ctx, txt, txt_pe);
-                sd::ggml_graph_cut::mark_graph_cut(txt, "boogu.context_refiner." + std::to_string(i), "txt");
-            }
-
-            for (int i = 0; i < config.num_refiner_layers; i++) {
-                auto block = std::dynamic_pointer_cast<BooguImageTransformerBlock>(blocks["noise_refiner." + std::to_string(i)]);
-                img        = block->forward(ctx, img, noise_pe, temb);
-                sd::ggml_graph_cut::mark_graph_cut(img, "boogu.noise_refiner." + std::to_string(i), "img");
-            }
-
-            ggml_tensor* combined_img = img;
-            if (ref_img != nullptr) {
-                auto ref_pe = ggml_ext_slice(ctx->ggml_ctx, pe, 3, txt_len, txt_len + ref_len);
-                for (int i = 0; i < config.num_refiner_layers; i++) {
-                    auto block = std::dynamic_pointer_cast<BooguImageTransformerBlock>(blocks["ref_image_refiner." + std::to_string(i)]);
-                    ref_img    = block->forward(ctx, ref_img, ref_pe, temb);
-                    sd::ggml_graph_cut::mark_graph_cut(ref_img, "boogu.ref_image_refiner." + std::to_string(i), "ref_img");
-                }
-                combined_img = ggml_concat(ctx->ggml_ctx, ref_img, img, 1);
-            }
-
-            auto img_pe = ggml_ext_slice(ctx->ggml_ctx, pe, 3, txt_len, txt_len + combined_img->ne[1]);
-            for (int i = 0; i < config.num_double_stream_layers; i++) {
-                auto block   = std::dynamic_pointer_cast<BooguImageDoubleStreamBlock>(blocks["double_stream_layers." + std::to_string(i)]);
-                auto result  = block->forward(ctx, combined_img, txt, pe, img_pe, temb);
-                combined_img = result.first;
-                txt          = result.second;
-                sd::ggml_graph_cut::mark_graph_cut(combined_img, "boogu.double_stream_layers." + std::to_string(i), "img");
-                sd::ggml_graph_cut::mark_graph_cut(txt, "boogu.double_stream_layers." + std::to_string(i), "txt");
-            }
-
-            auto hidden_states = ggml_concat(ctx->ggml_ctx, txt, combined_img, 1);
-            for (int i = 0; i < config.num_layers; i++) {
-                auto block    = std::dynamic_pointer_cast<BooguImageTransformerBlock>(blocks["single_stream_layers." + std::to_string(i)]);
-                hidden_states = block->forward(ctx, hidden_states, pe, temb);
-                sd::ggml_graph_cut::mark_graph_cut(hidden_states, "boogu.single_stream_layers." + std::to_string(i), "hidden_states");
-            }
-
-            hidden_states = norm_out->forward(ctx, hidden_states, temb);
-            hidden_states = ggml_ext_slice(ctx->ggml_ctx, hidden_states, 1, hidden_states->ne[1] - img_len, hidden_states->ne[1]);
-            hidden_states = DiT::unpatchify_and_crop(ctx->ggml_ctx, hidden_states, H, W, config.patch_size, config.patch_size, false);
-            hidden_states = ggml_ext_scale(ctx->ggml_ctx, hidden_states, -1.f);
-            return hidden_states;
-        }
-    };
-
-    __STATIC_INLINE__ int patched_token_count(int64_t size, int patch_size) {
-        int pad = (patch_size - (static_cast<int>(size) % patch_size)) % patch_size;
-        return (static_cast<int>(size) + pad) / patch_size;
-    }
-
-    __STATIC_INLINE__ void append_spatial_ids(std::vector<std::vector<float>>& ids,
-                                              int bs,
-                                              int pe_shift,
-                                              int h_tokens,
-                                              int w_tokens) {
-        std::vector<std::vector<float>> image_ids(h_tokens * w_tokens, std::vector<float>(3, 0.0f));
-        for (int h = 0; h < h_tokens; h++) {
-            for (int w = 0; w < w_tokens; w++) {
-                image_ids[h * w_tokens + w][0] = static_cast<float>(pe_shift);
-                image_ids[h * w_tokens + w][1] = static_cast<float>(h);
-                image_ids[h * w_tokens + w][2] = static_cast<float>(w);
-            }
-        }
-        for (int b = 0; b < bs; b++) {
-            ids.insert(ids.end(), image_ids.begin(), image_ids.end());
-        }
-    }
-
-    __STATIC_INLINE__ std::vector<float> gen_boogu_pe(int h,
-                                                      int w,
-                                                      int patch_size,
-                                                      int bs,
-                                                      int context_len,
-                                                      const std::vector<ggml_tensor*>& ref_latents,
-                                                      int theta,
-                                                      const std::vector<int>& axes_dim) {
-        std::vector<std::vector<float>> ids;
-        ids.reserve(static_cast<size_t>(bs) * context_len);
-        for (int b = 0; b < bs; b++) {
-            for (int i = 0; i < context_len; i++) {
-                float pos = static_cast<float>(i);
-                ids.push_back({pos, pos, pos});
-            }
-        }
-
-        int pe_shift = context_len;
-        for (ggml_tensor* ref : ref_latents) {
-            int ref_h_tokens = patched_token_count(ref->ne[1], patch_size);
-            int ref_w_tokens = patched_token_count(ref->ne[0], patch_size);
-            append_spatial_ids(ids, bs, pe_shift, ref_h_tokens, ref_w_tokens);
-            pe_shift += std::max(ref_h_tokens, ref_w_tokens);
-        }
-
-        int h_tokens = patched_token_count(h, patch_size);
-        int w_tokens = patched_token_count(w, patch_size);
-        append_spatial_ids(ids, bs, pe_shift, h_tokens, w_tokens);
-
-        return Rope::embed_nd(ids, bs, static_cast<float>(theta), axes_dim);
-    }
-
-    struct BooguImageRunner : public DiffusionModelRunner {
-        BooguConfig config;
-        BooguImageModel boogu;
-        std::vector<float> pe_vec;
-
-        BooguImageRunner(ggml_backend_t backend,
-                         const String2TensorStorage& tensor_storage_map      = {},
-                         const std::string prefix                            = "",
-                         SDVersion version                                   = VERSION_BOOGU_IMAGE,
-                         std::shared_ptr<RunnerWeightManager> weight_manager = nullptr)
-            : DiffusionModelRunner(backend, prefix, weight_manager),
-              config(BooguConfig::detect_from_weights(tensor_storage_map, prefix)) {
-            boogu = BooguImageModel(config);
-            boogu.init(params_ctx, tensor_storage_map, prefix);
-        }
-
-        std::string get_desc() override {
-            return "boogu_image";
-        }
-
-        void get_param_tensors(std::map<std::string, ggml_tensor*>& tensors, const std::string& prefix) override {
-            boogu.get_param_tensors(tensors, prefix);
-        }
-
-        ggml_cgraph* build_graph(const sd::Tensor<float>& x_tensor,
-                                 const sd::Tensor<float>& timesteps_tensor,
-                                 const sd::Tensor<float>& context_tensor,
-                                 const std::vector<sd::Tensor<float>>& ref_latents_tensor = {}) {
-            ggml_cgraph* gf        = new_graph_custom(BOOGU_GRAPH_SIZE);
-            ggml_tensor* x         = make_input(x_tensor);
-            ggml_tensor* timesteps = make_input(timesteps_tensor);
-            GGML_ASSERT(x->ne[3] == 1);
-            GGML_ASSERT(!context_tensor.empty());
-            ggml_tensor* context = make_input(context_tensor);
-
-            std::vector<ggml_tensor*> ref_latents;
-            ref_latents.reserve(ref_latents_tensor.size());
-            for (const auto& ref_latent_tensor : ref_latents_tensor) {
-                ref_latents.push_back(make_input(ref_latent_tensor));
-            }
-
-            pe_vec      = gen_boogu_pe(static_cast<int>(x->ne[1]),
-                                       static_cast<int>(x->ne[0]),
-                                       config.patch_size,
-                                       static_cast<int>(x->ne[3]),
-                                       static_cast<int>(context->ne[1]),
-                                       ref_latents,
-                                       config.theta,
-                                       config.axes_dim);
-            int pos_len = static_cast<int>(pe_vec.size() / config.axes_dim_sum / 2);
-            auto pe     = ggml_new_tensor_4d(compute_ctx, GGML_TYPE_F32, 2, 2, config.axes_dim_sum / 2, pos_len);
-            set_backend_tensor_data(pe, pe_vec.data());
-
-            auto runner_ctx  = get_context();
-            ggml_tensor* out = boogu.forward(&runner_ctx, x, timesteps, context, pe, ref_latents);
-            ggml_build_forward_expand(gf, out);
-            return gf;
-        }
-
-        sd::Tensor<float> compute(int n_threads,
-                                  const sd::Tensor<float>& x,
-                                  const sd::Tensor<float>& timesteps,
-                                  const sd::Tensor<float>& context,
-                                  const std::vector<sd::Tensor<float>>& ref_latents = {}) {
-            auto get_graph = [&]() -> ggml_cgraph* {
-                return build_graph(x, timesteps, context, ref_latents);
-            };
-            return restore_trailing_singleton_dims(GGMLRunner::compute<float>(get_graph, n_threads, false, false, false), x.dim());
-        }
-
-        sd::Tensor<float> compute(int n_threads,
-                                  const DiffusionParams& diffusion_params) override {
-            GGML_ASSERT(diffusion_params.x != nullptr);
-            GGML_ASSERT(diffusion_params.timesteps != nullptr);
-            static const std::vector<sd::Tensor<float>> empty_ref_latents;
-            return compute(n_threads,
-                           *diffusion_params.x,
-                           *diffusion_params.timesteps,
-                           tensor_or_empty(diffusion_params.context),
-                           diffusion_params.ref_latents ? *diffusion_params.ref_latents : empty_ref_latents);
-        }
-    };
-}  // namespace Boogu
-
-#endif  // __SD_MODEL_DIFFUSION_BOOGU_HPP__
--- a/src/model/diffusion/control.hpp
+++ b/src/model/diffusion/control.hpp
@ -1,9 +1,8 @@
-#ifndef __SD_MODEL_DIFFUSION_CONTROL_HPP__
+#ifndef __SD_MODEL_DIFFUSION_CONTROL_HPP__
 #define __SD_MODEL_DIFFUSION_CONTROL_HPP__

 #include "model/common/block.hpp"
 #include "model_loader.h"
-#include "model_manager.h"

 #define CONTROL_NET_GRAPH_SIZE 1536

@ -310,47 +309,73 @@ public:
 struct ControlNet : public GGMLRunner {
    SDVersion version = VERSION_SD1;
    ControlNetBlock control_net;
-    std::string weight_prefix;

+    ggml_backend_buffer_t control_buffer = nullptr;
+    ggml_context* control_ctx            = nullptr;
    std::vector<ggml_tensor*> control_outputs_ggml;
    ggml_tensor* guided_hint_output_ggml = nullptr;
    std::vector<sd::Tensor<float>> controls;
+    sd::Tensor<float> guided_hint;
    bool guided_hint_cached = false;
-    std::shared_ptr<ModelManager> owned_model_manager;
-    ggml_backend_t params_backend = nullptr;
-
-    static const char* guided_hint_cache_name() {
-        return "controlnet.guided_hint";
-    }

    ControlNet(ggml_backend_t backend,
-               ggml_backend_t params_backend_,
-               const String2TensorStorage& tensor_storage_map      = {},
-               SDVersion version                                   = VERSION_SD1,
-               const std::string& prefix                           = "",
-               std::shared_ptr<RunnerWeightManager> weight_manager = nullptr)
-        : GGMLRunner(backend, weight_manager), version(version), control_net(version), weight_prefix(prefix), params_backend(params_backend_) {
-        control_net.init(params_ctx, tensor_storage_map, prefix);
+               ggml_backend_t params_backend,
+               const String2TensorStorage& tensor_storage_map = {},
+               SDVersion version                              = VERSION_SD1)
+        : GGMLRunner(backend, params_backend), control_net(version) {
+        control_net.init(params_ctx, tensor_storage_map, "");
    }

    ~ControlNet() override {
        free_control_ctx();
    }

+    // Create persistent tensors for the ControlNet outputs: outs[0] is the guided hint, outs[1..] the controls.
+    void alloc_control_ctx(std::vector<ggml_tensor*> outs) {
+        GGML_ASSERT(!outs.empty());  // outs.size() - 1 is unsigned and would wrap around on an empty vector
+        const size_t n_controls = outs.size() - 1;
+        ggml_init_params params;
+        params.mem_size   = static_cast<size_t>(outs.size() * ggml_tensor_overhead()) + 1024 * 1024;
+        params.mem_buffer = nullptr;
+        params.no_alloc   = true;  // tensor data is placed in control_buffer below, not in this context
+        control_ctx       = ggml_init(params);
+
+        guided_hint_output_ggml    = ggml_dup_tensor(control_ctx, outs[0]);
+        size_t control_buffer_size = ggml_nbytes(guided_hint_output_ggml);
+        control_outputs_ggml.resize(n_controls);
+        for (size_t i = 0; i < n_controls; i++) {
+            control_outputs_ggml[i] = ggml_dup_tensor(control_ctx, outs[i + 1]);
+            control_buffer_size += ggml_nbytes(control_outputs_ggml[i]);
+        }
+        control_buffer = ggml_backend_alloc_ctx_tensors(control_ctx, runtime_backend);
+        if (control_buffer == nullptr) {
+            LOG_ERROR("control buffer allocation failed");
+        }
+        LOG_DEBUG("control buffer size %.2fMB", control_buffer_size * 1.f / 1024.f / 1024.f);
+    }
+
    void free_control_ctx() {
+        if (control_buffer != nullptr) {
+            ggml_backend_buffer_free(control_buffer);
+            control_buffer = nullptr;
+        }
+        if (control_ctx != nullptr) {
+            ggml_free(control_ctx);
+            control_ctx = nullptr;
+        }
        guided_hint_output_ggml = nullptr;
        guided_hint_cached      = false;
+        guided_hint             = {};
        control_outputs_ggml.clear();
        controls.clear();
-        free_cache_ctx_and_buffer();
    }

    std::string get_desc() override {
        return "control_net";
    }

-    void get_param_tensors(std::map<std::string, ggml_tensor*>& tensors) {
-        control_net.get_param_tensors(tensors, weight_prefix);
+    void get_param_tensors(std::map<std::string, ggml_tensor*>& tensors, const std::string prefix) {
+        control_net.get_param_tensors(tensors, prefix);
    }

    ggml_cgraph* build_graph(const sd::Tensor<float>& x_tensor,
@ -366,17 +391,11 @@ struct ControlNet : public GGMLRunner {
        ggml_tensor* context   = make_optional_input(context_tensor);
        ggml_tensor* y         = make_optional_input(y_tensor);

-        guided_hint_output_ggml = nullptr;
-        control_outputs_ggml.clear();
-
        ggml_tensor* guided_hint_input = nullptr;
-        if (guided_hint_cached) {
-            guided_hint_input = get_cache_tensor_by_name(guided_hint_cache_name());
-            if (guided_hint_input == nullptr) {
-                guided_hint_cached = false;
-            }
-        }
-        if (guided_hint_input == nullptr) {
+        if (guided_hint_cached && !guided_hint.empty()) {
+            guided_hint_input = make_input(guided_hint);
+            hint              = nullptr;
+        } else {
            hint = make_input(hint_tensor);
        }

@ -390,19 +409,13 @@ struct ControlNet : public GGMLRunner {
                                        context,
                                        y);

-        if (guided_hint_input == nullptr && !outs.empty()) {
-            guided_hint_output_ggml = outs[0];
-            ggml_set_output(guided_hint_output_ggml);
-            cache(guided_hint_cache_name(), guided_hint_output_ggml);
-            ggml_build_forward_expand(gf, guided_hint_output_ggml);
+        if (control_ctx == nullptr) {
+            alloc_control_ctx(outs);
        }

-        control_outputs_ggml.reserve(outs.size() > 0 ? outs.size() - 1 : 0);
-        for (size_t i = 1; i < outs.size(); i++) {
-            ggml_tensor* control_output = outs[i];
-            ggml_set_output(control_output);
-            ggml_build_forward_expand(gf, control_output);
-            control_outputs_ggml.push_back(control_output);
+        ggml_build_forward_expand(gf, ggml_cpy(compute_ctx, outs[0], guided_hint_output_ggml));
+        for (int i = 0; i < outs.size() - 1; i++) {
+            ggml_build_forward_expand(gf, ggml_cpy(compute_ctx, outs[i + 1], control_outputs_ggml[i]));
        }

        return gf;
@ -422,12 +435,15 @@ struct ControlNet : public GGMLRunner {
            return build_graph(x, hint, timesteps, context, y);
        };

-        auto compute_result = GGMLRunner::compute<float>(get_graph, n_threads, false, false, false, true);
+        auto compute_result = GGMLRunner::compute<float>(get_graph, n_threads, false);
        if (!compute_result.has_value()) {
            return std::nullopt;
        }

-        guided_hint_cached = get_cache_tensor_by_name(guided_hint_cache_name()) != nullptr;
+        if (guided_hint_output_ggml != nullptr) {
+            guided_hint = restore_trailing_singleton_dims(sd::make_sd_tensor_from_ggml<float>(guided_hint_output_ggml),
+                                                          4);
+        }
        controls.clear();
        controls.reserve(control_outputs_ggml.size());
        for (ggml_tensor* control : control_outputs_ggml) {
@ -435,40 +451,36 @@ struct ControlNet : public GGMLRunner {
            GGML_ASSERT(!control_host.empty());
            controls.push_back(std::move(control_host));
        }
+        guided_hint_cached = true;
        return controls;
    }

    bool load_from_file(const std::string& file_path, int n_threads) {
        LOG_INFO("loading control net from '%s'", file_path.c_str());
-        std::map<std::string, ggml_tensor*> tensors;
-        control_net.get_param_tensors(tensors);
-
-        auto manager = std::dynamic_pointer_cast<ModelManager>(weight_manager.lock());
-        if (manager == nullptr) {
-            owned_model_manager = std::make_shared<ModelManager>();
-            weight_manager      = owned_model_manager;
-            manager             = owned_model_manager;
+        if (!alloc_params_buffer()) {
+            LOG_ERROR("control net model buffer allocation failed");
+            return false;
        }

-        ModelLoader& model_loader = manager->loader();
+        std::map<std::string, ggml_tensor*> tensors;
+        control_net.get_param_tensors(tensors);
+        std::set<std::string> ignore_tensors;
+
+        ModelLoader model_loader;
        if (!model_loader.init_from_file_and_convert_name(file_path)) {
            LOG_ERROR("init control net model loader from file failed: '%s'", file_path.c_str());
            return false;
        }

-        manager->set_n_threads(n_threads);
-        if (!manager->register_param_tensors("ControlNet",
-                                             std::move(tensors),
-                                             ModelManager::ResidencyMode::ParamBackend,
-                                             runtime_backend,
-                                             params_backend) ||
-            !manager->validate_registered_tensors()) {
-            LOG_ERROR("register control net tensors with model manager failed");
+        bool success = model_loader.load_tensors(tensors, ignore_tensors, n_threads);
+
+        if (!success) {
+            LOG_ERROR("load control net tensors from model loader failed");
            return false;
        }

        LOG_INFO("control net model loaded");
-        return true;
+        return success;
    }
 };

--- a/src/model/diffusion/ernie_image.hpp
+++ b/src/model/diffusion/ernie_image.hpp
@ -162,8 +162,6 @@ namespace ErnieImage {
            int64_t S = x->ne[1];
            int64_t N = x->ne[2];

-            float scale = (sd_backend_is(ctx->backend, "Vulkan") && ctx->flash_attn_enabled) ? 1.0f / 32.0f : 1.0f;
-
            auto q = to_q->forward(ctx, x);
            auto k = to_k->forward(ctx, x);
            auto v = to_v->forward(ctx, x);
@ -184,7 +182,7 @@ namespace ErnieImage {
            k = ggml_cont(ctx->ggml_ctx, ggml_permute(ctx->ggml_ctx, k, 0, 2, 1, 3));  // [N, heads, S, head_dim]
            k = ggml_reshape_3d(ctx->ggml_ctx, k, k->ne[0], k->ne[1], k->ne[2] * k->ne[3]);

-            x = ggml_ext_attention_ext(ctx->ggml_ctx, ctx->backend, q, k, v, num_heads, attention_mask, true, ctx->flash_attn_enabled, scale);  // [N, S, hidden_size]
+            x = ggml_ext_attention_ext(ctx->ggml_ctx, ctx->backend, q, k, v, num_heads, attention_mask, true, ctx->flash_attn_enabled);  // [N, S, hidden_size]
            x = to_out_0->forward(ctx, x);
            return x;
        }
@ -389,10 +387,10 @@ namespace ErnieImage {
        std::vector<float> pe_vec;

        ErnieImageRunner(ggml_backend_t backend,
-                         const String2TensorStorage& tensor_storage_map      = {},
-                         const std::string prefix                            = "",
-                         std::shared_ptr<RunnerWeightManager> weight_manager = nullptr)
-            : DiffusionModelRunner(backend, prefix, weight_manager),
+                         ggml_backend_t params_backend,
+                         const String2TensorStorage& tensor_storage_map = {},
+                         const std::string prefix                       = "")
+            : DiffusionModelRunner(backend, params_backend, prefix),
              config(ErnieImageConfig::detect_from_weights(tensor_storage_map, prefix)) {
            ernie_image = ErnieImageModel(config);
            ernie_image.init(params_ctx, tensor_storage_map, prefix);
@ -442,7 +440,7 @@ namespace ErnieImage {
            auto get_graph = [&]() -> ggml_cgraph* {
                return build_graph(x, timesteps, context);
            };
-            return restore_trailing_singleton_dims(GGMLRunner::compute<float>(get_graph, n_threads, false, false, false), x.dim());
+            return restore_trailing_singleton_dims(GGMLRunner::compute<float>(get_graph, n_threads, false), x.dim());
        }

        sd::Tensor<float> compute(int n_threads,
--- a/src/model/diffusion/flux.hpp
+++ b/src/model/diffusion/flux.hpp
@ -4,7 +4,6 @@
 #include <memory>
 #include <vector>

-#include "model/adapter/pulid.hpp"
 #include "model/common/rope.hpp"
 #include "model/diffusion/dit.hpp"
 #include "model/diffusion/model.hpp"
@ -50,10 +49,6 @@ namespace Flux {
        float ref_index_scale     = 1.f;
        ChromaRadianceConfig chroma_radiance_params;

-        bool pulid_enabled        = false;
-        int pulid_double_interval = 2;
-        int pulid_single_interval = 4;
-
        static FluxConfig detect_from_weights(const String2TensorStorage& tensor_storage_map,
                                              const std::string& prefix,
                                              SDVersion version = VERSION_FLUX) {
@ -143,9 +138,6 @@ namespace Flux {
                if (ends_with(name, "double_blocks.0.txt_attn.norm.key_norm.scale")) {
                    head_dim = tensor_storage.ne[0];
                }
-                if (name.find("pulid_ca.") != std::string::npos) {
-                    config.pulid_enabled = true;
-                }
            }
            if (actual_radiance_patch_size > 0 && actual_radiance_patch_size != config.patch_size) {
                GGML_ASSERT(config.patch_size == 2 * actual_radiance_patch_size);
@ -965,20 +957,6 @@ namespace Flux {
                blocks["double_stream_modulation_txt"] = std::make_shared<Modulation>(config.hidden_size, true, !config.disable_bias);
                blocks["single_stream_modulation"]     = std::make_shared<Modulation>(config.hidden_size, false, !config.disable_bias);
            }
-
-            if (config.pulid_enabled) {
-                int num_double_ca = (config.depth + config.pulid_double_interval - 1) / config.pulid_double_interval;
-                int num_single_ca = (config.depth_single_blocks + config.pulid_single_interval - 1) / config.pulid_single_interval;
-                int num_ca        = num_double_ca + num_single_ca;
-                for (int i = 0; i < num_ca; i++) {
-                    blocks["pulid_ca." + std::to_string(i)] =
-                        std::shared_ptr<GGMLBlock>(new PuLIDPerceiverAttentionCA(
-                            /*dim=*/config.hidden_size,
-                            /*dim_head=*/PuLIDPerceiverAttentionCA::DEFAULT_DIM_HEAD,
-                            /*heads=*/PuLIDPerceiverAttentionCA::DEFAULT_HEADS,
-                            /*kv_dim=*/PuLIDPerceiverAttentionCA::DEFAULT_KV_DIM));
-                }
-            }
        }

        ggml_tensor* forward_orig(GGMLRunnerContext* ctx,
@ -989,9 +967,7 @@ namespace Flux {
                                  ggml_tensor* guidance,
                                  ggml_tensor* pe,
                                  ggml_tensor* mod_index_arange = nullptr,
-                                  std::vector<int> skip_layers  = {},
-                                  ggml_tensor* pulid_id         = nullptr,
-                                  float pulid_id_weight         = 1.0f) {
+                                  std::vector<int> skip_layers  = {}) {
            auto img_in      = std::dynamic_pointer_cast<Linear>(blocks["img_in"]);
            auto txt_in      = std::dynamic_pointer_cast<Linear>(blocks["txt_in"]);
            auto final_layer = std::dynamic_pointer_cast<LastLayer>(blocks["final_layer"]);
@ -1068,13 +1044,6 @@ namespace Flux {
            sd::ggml_graph_cut::mark_graph_cut(txt, "flux.prelude", "txt");
            sd::ggml_graph_cut::mark_graph_cut(vec, "flux.prelude", "vec");

-            const bool pulid_active = config.pulid_enabled && pulid_id != nullptr;
-            if (pulid_active && !skip_layers.empty()) {
-                LOG_WARN("PuLID + skip_layers is not supported; disabling PuLID for this generation.");
-            }
-            const bool pulid_run = pulid_active && skip_layers.empty();
-            int ca_idx           = 0;
-
            for (int i = 0; i < config.depth; i++) {
                if (skip_layers.size() > 0 && std::find(skip_layers.begin(), skip_layers.end(), i) != skip_layers.end()) {
                    continue;
@ -1087,19 +1056,9 @@ namespace Flux {
                txt          = img_txt.second;  // [N, n_txt_token, hidden_size]
                sd::ggml_graph_cut::mark_graph_cut(img, "flux.double_blocks." + std::to_string(i), "img");
                sd::ggml_graph_cut::mark_graph_cut(txt, "flux.double_blocks." + std::to_string(i), "txt");
-
-                if (pulid_run && (i % config.pulid_double_interval == 0)) {
-                    auto pulid_ca = std::dynamic_pointer_cast<PuLIDPerceiverAttentionCA>(
-                        blocks["pulid_ca." + std::to_string(ca_idx)]);
-                    ggml_tensor* ca_out = pulid_ca->forward(ctx, pulid_id, img);  // [N, n_img_token, hidden_size]
-                    img                 = ggml_add(ctx->ggml_ctx, img, ggml_scale(ctx->ggml_ctx, ca_out, pulid_id_weight));
-                    sd::ggml_graph_cut::mark_graph_cut(img, "flux.pulid_ca." + std::to_string(ca_idx), "img");
-                    ca_idx++;
-                }
            }

-            auto txt_img            = ggml_concat(ctx->ggml_ctx, txt, img, 1);  // [N, n_txt_token + n_img_token, hidden_size]
-            const int64_t n_txt_tok = txt->ne[1];
+            auto txt_img = ggml_concat(ctx->ggml_ctx, txt, img, 1);  // [N, n_txt_token + n_img_token, hidden_size]
            for (int i = 0; i < config.depth_single_blocks; i++) {
                if (skip_layers.size() > 0 && std::find(skip_layers.begin(), skip_layers.end(), i + config.depth) != skip_layers.end()) {
                    continue;
@ -1108,29 +1067,6 @@ namespace Flux {

                txt_img = block->forward(ctx, txt_img, vec, pe, txt_img_mask, ss_mods);
                sd::ggml_graph_cut::mark_graph_cut(txt_img, "flux.single_blocks." + std::to_string(i), "txt_img");
-
-                if (pulid_run && (i % config.pulid_single_interval == 0)) {
-                    auto pulid_ca = std::dynamic_pointer_cast<PuLIDPerceiverAttentionCA>(
-                        blocks["pulid_ca." + std::to_string(ca_idx)]);
-                    ggml_tensor* txt_part = ggml_view_3d(ctx->ggml_ctx, txt_img,
-                                                         txt_img->ne[0], n_txt_tok, txt_img->ne[2],
-                                                         txt_img->nb[1], txt_img->nb[2],
-                                                         0);
-                    ggml_tensor* img_part = ggml_view_3d(ctx->ggml_ctx, txt_img,
-                                                         txt_img->ne[0],
-                                                         txt_img->ne[1] - n_txt_tok,
-                                                         txt_img->ne[2],
-                                                         txt_img->nb[1],
-                                                         txt_img->nb[2],
-                                                         n_txt_tok * txt_img->nb[1]);
-                    txt_part              = ggml_cont(ctx->ggml_ctx, txt_part);
-                    img_part              = ggml_cont(ctx->ggml_ctx, img_part);
-                    ggml_tensor* ca_out   = pulid_ca->forward(ctx, pulid_id, img_part);
-                    img_part              = ggml_add(ctx->ggml_ctx, img_part, ggml_scale(ctx->ggml_ctx, ca_out, pulid_id_weight));
-                    txt_img               = ggml_concat(ctx->ggml_ctx, txt_part, img_part, 1);
-                    sd::ggml_graph_cut::mark_graph_cut(txt_img, "flux.pulid_ca." + std::to_string(ca_idx), "txt_img");
-                    ca_idx++;
-                }
            }

            img = ggml_view_3d(ctx->ggml_ctx,
@ -1169,9 +1105,7 @@ namespace Flux {
                                             ggml_tensor* mod_index_arange         = nullptr,
                                             ggml_tensor* dct                      = nullptr,
                                             std::vector<ggml_tensor*> ref_latents = {},
-                                             std::vector<int> skip_layers          = {},
-                                             ggml_tensor* pulid_id                 = nullptr,
-                                             float pulid_id_weight                 = 1.0f) {
+                                             std::vector<int> skip_layers          = {}) {
            GGML_ASSERT(x->ne[3] == 1);

            int64_t W      = x->ne[0];
@ -1197,8 +1131,7 @@ namespace Flux {
            img = ggml_reshape_3d(ctx->ggml_ctx, img, img->ne[0] * img->ne[1], img->ne[2], img->ne[3]);  // [N, hidden_size, H/patch_size*W/patch_size]
            img = ggml_cont(ctx->ggml_ctx, ggml_ext_torch_permute(ctx->ggml_ctx, img, 1, 0, 2, 3));      // [N, H/patch_size*W/patch_size, hidden_size]

-            auto out = forward_orig(ctx, img, context, timestep, y, guidance, pe, mod_index_arange, skip_layers,
-                                    pulid_id, pulid_id_weight);  // [N, n_img_token, hidden_size]
+            auto out = forward_orig(ctx, img, context, timestep, y, guidance, pe, mod_index_arange, skip_layers);  // [N, n_img_token, hidden_size]

            // nerf decode
            auto nerf_image_embedder   = std::dynamic_pointer_cast<NerfEmbedder>(blocks["nerf_image_embedder"]);
@ -1246,9 +1179,7 @@ namespace Flux {
                                         ggml_tensor* mod_index_arange         = nullptr,
                                         ggml_tensor* dct                      = nullptr,
                                         std::vector<ggml_tensor*> ref_latents = {},
-                                         std::vector<int> skip_layers          = {},
-                                         ggml_tensor* pulid_id                 = nullptr,
-                                         float pulid_id_weight                 = 1.0f) {
+                                         std::vector<int> skip_layers          = {}) {
            GGML_ASSERT(x->ne[3] == 1);

            int64_t W      = x->ne[0];
@ -1295,8 +1226,7 @@ namespace Flux {
                }
            }

-            auto out = forward_orig(ctx, img, context, timestep, y, guidance, pe, mod_index_arange, skip_layers,
-                                    pulid_id, pulid_id_weight);  // [N, num_tokens, C * patch_size * patch_size]
+            auto out = forward_orig(ctx, img, context, timestep, y, guidance, pe, mod_index_arange, skip_layers);  // [N, num_tokens, C * patch_size * patch_size]

            if (out->ne[1] > img_tokens) {
                out = ggml_view_3d(ctx->ggml_ctx, out, out->ne[0], img_tokens, out->ne[2], out->nb[1], out->nb[2], 0);
@ -1318,9 +1248,7 @@ namespace Flux {
                             ggml_tensor* mod_index_arange         = nullptr,
                             ggml_tensor* dct                      = nullptr,
                             std::vector<ggml_tensor*> ref_latents = {},
-                             std::vector<int> skip_layers          = {},
-                             ggml_tensor* pulid_id                 = nullptr,
-                             float pulid_id_weight                 = 1.0f) {
+                             std::vector<int> skip_layers          = {}) {
            // Forward pass of DiT.
            // x: (N, C, H, W) tensor of spatial inputs (images or latent representations of images)
            // timestep: (N,) tensor of diffusion timesteps
@ -1343,9 +1271,7 @@ namespace Flux {
                                               mod_index_arange,
                                               dct,
                                               ref_latents,
-                                               skip_layers,
-                                               pulid_id,
-                                               pulid_id_weight);
+                                               skip_layers);
            } else {
                return forward_flux_chroma(ctx,
                                           x,
@ -1358,9 +1284,7 @@ namespace Flux {
                                           mod_index_arange,
                                           dct,
                                           ref_latents,
-                                           skip_layers,
-                                           pulid_id,
-                                           pulid_id_weight);
+                                           skip_layers);
            }
        }
    };
@ -1377,12 +1301,12 @@ namespace Flux {
        bool use_mask = false;

        FluxRunner(ggml_backend_t backend,
-                   const String2TensorStorage& tensor_storage_map      = {},
-                   const std::string prefix                            = "",
-                   SDVersion version                                   = VERSION_FLUX,
-                   bool use_mask                                       = false,
-                   std::shared_ptr<RunnerWeightManager> weight_manager = nullptr)
-            : DiffusionModelRunner(backend, prefix, weight_manager),
+                   ggml_backend_t params_backend,
+                   const String2TensorStorage& tensor_storage_map = {},
+                   const std::string prefix                       = "",
+                   SDVersion version                              = VERSION_FLUX,
+                   bool use_mask                                  = false)
+            : DiffusionModelRunner(backend, params_backend, prefix),
              config(FluxConfig::detect_from_weights(tensor_storage_map, prefix, version)),
              version(version),
              use_mask(use_mask) {
@ -1460,9 +1384,7 @@ namespace Flux {
                                 const sd::Tensor<float>& guidance_tensor                 = {},
                                 const std::vector<sd::Tensor<float>>& ref_latents_tensor = {},
                                 bool increase_ref_index                                  = false,
-                                 std::vector<int> skip_layers                             = {},
-                                 const sd::Tensor<float>& pulid_id_tensor                 = {},
-                                 float pulid_id_weight                                    = 1.0f) {
+                                 std::vector<int> skip_layers                             = {}) {
            ggml_tensor* x         = make_input(x_tensor);
            ggml_tensor* timesteps = make_input(timesteps_tensor);
            ggml_tensor* context   = make_optional_input(context_tensor);
@ -1539,10 +1461,6 @@ namespace Flux {
                set_backend_tensor_data(dct, dct_vec.data());
            }

-            ggml_tensor* pulid_id = pulid_id_tensor.empty()
-                                        ? nullptr
-                                        : make_input(pulid_id_tensor);
-
            auto runner_ctx = get_context();

            ggml_tensor* out = flux.forward(&runner_ctx,
@ -1556,9 +1474,7 @@ namespace Flux {
                                            mod_index_arange,
                                            dct,
                                            ref_latents,
-                                            skip_layers,
-                                            pulid_id,
-                                            pulid_id_weight);
+                                            skip_layers);

            ggml_build_forward_expand(gf, out);

@ -1574,20 +1490,17 @@ namespace Flux {
                                  const sd::Tensor<float>& guidance                 = {},
                                  const std::vector<sd::Tensor<float>>& ref_latents = {},
                                  bool increase_ref_index                           = false,
-                                  std::vector<int> skip_layers                      = std::vector<int>(),
-                                  const sd::Tensor<float>& pulid_id                 = {},
-                                  float pulid_id_weight                             = 1.0f) {
+                                  std::vector<int> skip_layers                      = std::vector<int>()) {
            // x: [N, in_channels, h, w]
            // timesteps: [N, ]
            // context: [N, max_position, hidden_size]
            // y: [N, adm_in_channels] or [1, adm_in_channels]
            // guidance: [N, ]
-            // pulid_id: empty (no injection) or [N, num_id_tokens=32, kv_dim=2048]
            auto get_graph = [&]() -> ggml_cgraph* {
-                return build_graph(x, timesteps, context, c_concat, y, guidance, ref_latents, increase_ref_index, skip_layers, pulid_id, pulid_id_weight);
+                return build_graph(x, timesteps, context, c_concat, y, guidance, ref_latents, increase_ref_index, skip_layers);
            };

-            auto result = restore_trailing_singleton_dims(GGMLRunner::compute<float>(get_graph, n_threads, false, false, false), x.dim());
+            auto result = restore_trailing_singleton_dims(GGMLRunner::compute<float>(get_graph, n_threads, false), x.dim());
            return result;
        }

@ -1607,9 +1520,7 @@ namespace Flux {
                           tensor_or_empty(extra->guidance),
                           diffusion_params.ref_latents ? *diffusion_params.ref_latents : empty_ref_latents,
                           diffusion_params.increase_ref_index,
-                           extra->skip_layers ? *extra->skip_layers : empty_skip_layers,
-                           tensor_or_empty(extra->pulid_id),
-                           extra->pulid_id_weight);
+                           extra->skip_layers ? *extra->skip_layers : empty_skip_layers);
        }

        void test() {
@ -1672,8 +1583,7 @@ namespace Flux {
            ggml_backend_t backend    = sd_backend_cpu_init();
            ggml_type model_data_type = GGML_TYPE_COUNT;

-            auto model_manager        = std::make_shared<ModelManager>();
-            ModelLoader& model_loader = model_manager->loader();
+            ModelLoader model_loader;
            if (!model_loader.init_from_file_and_convert_name(file_path, "model.diffusion_model.")) {
                LOG_ERROR("init model loader from file failed: '%s'", file_path.c_str());
                return;
@ -1689,20 +1599,24 @@ namespace Flux {
            }

            std::shared_ptr<FluxRunner> flux = std::make_shared<FluxRunner>(backend,
+                                                                            backend,
                                                                            tensor_storage_map,
                                                                            "model.diffusion_model",
                                                                            VERSION_FLUX2,
-                                                                            false,
-                                                                            model_manager);
+                                                                            false);

-            if (!model_manager->register_runner_params("Flux test",
-                                                       *flux,
-                                                       "model.diffusion_model",
-                                                       ModelManager::ResidencyMode::ParamBackend,
-                                                       backend,
-                                                       backend) ||
-                !model_manager->validate_registered_tensors()) {
-                LOG_ERROR("register flux tensors with model manager failed");
+            if (!flux->alloc_params_buffer()) {
+                LOG_ERROR("flux model allocation failed");
+                return;
+            }
+
+            std::map<std::string, ggml_tensor*> tensors;
+            flux->get_param_tensors(tensors, "model.diffusion_model");
+
+            bool success = model_loader.load_tensors(tensors);
+
+            if (!success) {
+                LOG_ERROR("load tensors from model loader failed");
                return;
            }

--- a/src/model/diffusion/hidream_o1.hpp
+++ b/src/model/diffusion/hidream_o1.hpp
@ -1,4 +1,4 @@
-#ifndef __SD_MODEL_DIFFUSION_HIDREAM_O1_HPP__
+#ifndef __SD_MODEL_DIFFUSION_HIDREAM_O1_HPP__
 #define __SD_MODEL_DIFFUSION_HIDREAM_O1_HPP__

 #include <algorithm>
@ -282,10 +282,10 @@ namespace HiDreamO1 {
        std::array<std::vector<float>, 4> pos_embed_weight_data_;

        HiDreamO1VisionRunner(ggml_backend_t backend,
-                              const String2TensorStorage& tensor_storage_map      = {},
-                              const std::string& prefix                           = "model.visual",
-                              std::shared_ptr<RunnerWeightManager> weight_manager = nullptr)
-            : GGMLRunner(backend, weight_manager),
+                              ggml_backend_t params_backend,
+                              const String2TensorStorage& tensor_storage_map = {},
+                              const std::string& prefix                      = "model.visual")
+            : GGMLRunner(backend, params_backend),
              config(HiDreamO1Config::detect_from_weights(tensor_storage_map, prefix)),
              model(std::make_shared<LLM::VisionModel>(false, config.llm.vision)) {
            model->init(params_ctx, tensor_storage_map, prefix);
@ -323,15 +323,11 @@ namespace HiDreamO1 {
            return gf;
        }

-        sd::Tensor<float> compute(int n_threads,
-                                  const sd::Tensor<float>& image,
-                                  bool auto_free           = true,
-                                  bool free_compute_buffer = true,
-                                  bool free_compute_params = true) {
+        sd::Tensor<float> compute(int n_threads, const sd::Tensor<float>& image) {
            auto get_graph = [&]() {
                return build_graph(image);
            };
-            auto output = GGMLRunner::compute<float>(get_graph, n_threads, auto_free, free_compute_buffer, free_compute_params);
+            auto output = GGMLRunner::compute<float>(get_graph, n_threads, false);
            return output.has_value() ? std::move(output.value()) : sd::Tensor<float>();
        }
    };
@ -343,10 +339,10 @@ namespace HiDreamO1 {
        std::vector<float> attention_mask_vec;

        HiDreamO1Runner(ggml_backend_t backend,
-                        const String2TensorStorage& tensor_storage_map      = {},
-                        const std::string& prefix                           = "model",
-                        std::shared_ptr<RunnerWeightManager> weight_manager = nullptr)
-            : DiffusionModelRunner(backend, prefix, weight_manager),
+                        ggml_backend_t params_backend,
+                        const String2TensorStorage& tensor_storage_map = {},
+                        const std::string& prefix                      = "model")
+            : DiffusionModelRunner(backend, params_backend, prefix),
              config(HiDreamO1Config::detect_from_weights(tensor_storage_map, prefix)) {
            model = HiDreamO1Model(config);
            model.init(params_ctx, tensor_storage_map, prefix);
@ -459,7 +455,7 @@ namespace HiDreamO1 {
            auto get_graph = [&]() {
                return build_graph(x, timestep, input_ids, input_pos, token_types, vinput_mask, image_embeds, ref_images);
            };
-            return restore_trailing_singleton_dims(GGMLRunner::compute<float>(get_graph, n_threads, false, false, false), x.dim());
+            return restore_trailing_singleton_dims(GGMLRunner::compute<float>(get_graph, n_threads, false), x.dim());
        }

        sd::Tensor<float> compute(int n_threads,
@ -490,14 +486,29 @@ namespace HiDreamO1 {
        std::shared_ptr<HiDreamO1VisionRunner> vision_runner;

        HiDreamO1Conditioner(ggml_backend_t backend,
-                             const String2TensorStorage& tensor_storage_map      = {},
-                             std::shared_ptr<RunnerWeightManager> weight_manager = nullptr)
-            : vision_runner(std::make_shared<HiDreamO1VisionRunner>(backend, tensor_storage_map, "model.visual", weight_manager)) {}
+                             ggml_backend_t params_backend,
+                             const String2TensorStorage& tensor_storage_map = {})
+            : vision_runner(std::make_shared<HiDreamO1VisionRunner>(backend, params_backend, tensor_storage_map)) {}

        void get_param_tensors(std::map<std::string, ggml_tensor*>& tensors) override {
            vision_runner->get_param_tensors(tensors);
        }

+        bool alloc_params_buffer() override {
+            if (!vision_runner->alloc_params_buffer()) {
+                return false;
+            }
+            return true;
+        }
+
+        void free_params_buffer() override {
+            vision_runner->free_params_buffer();
+        }
+
+        size_t get_params_buffer_size() override {
+            return vision_runner->get_params_buffer_size();
+        }
+
        void set_max_graph_vram_bytes(size_t max_graph_vram_bytes) override {
            vision_runner->set_max_graph_vram_bytes(max_graph_vram_bytes);
        }
@ -510,10 +521,6 @@ namespace HiDreamO1 {
            vision_runner->set_weight_adapter(adapter);
        }

-        void runner_done() override {
-            vision_runner->runner_done();
-        }
-
        SDCondition get_learned_condition(int n_threads,
                                          const ConditionerParams& conditioner_params) override {
            SDCondition result;
@ -659,7 +666,7 @@ namespace HiDreamO1 {
            result.c_vinput_mask  = sd::Tensor<int32_t>(vinput_mask_shape, std::move(vinput_mask));
            result.c_image_embeds.reserve(vlm_images.size());
            for (const auto& vlm_image : vlm_images) {
-                auto image_embed = vision_runner->compute(n_threads, vlm_image.second, false, true, true);
+                auto image_embed = vision_runner->compute(n_threads, vlm_image.second);
                if (image_embed.empty()) {
                    LOG_ERROR("hidream_o1 conditioner: encode VLM image failed");
                    return SDCondition();
--- a/src/model/diffusion/ideogram4.hpp
+++ b/src/model/diffusion/ideogram4.hpp
@ -151,9 +151,7 @@ namespace Ideogram4 {
                                                          int context_len,
                                                          int head_dim,
                                                          int rope_theta,
-                                                          const std::vector<int>& mrope_section,
-                                                          bool circular_x = false,
-                                                          bool circular_y = false) {
+                                                          const std::vector<int>& mrope_section) {
        GGML_ASSERT(bs == 1);
        std::vector<std::vector<float>> ids(static_cast<size_t>(bs) * (context_len + grid_h * grid_w),
                                            std::vector<float>(3, 0.f));
@ -171,29 +169,7 @@ namespace Ideogram4 {
            }
        }

-        std::vector<std::vector<int>> axis_wrap_dims(3);
-        if (circular_y || circular_x) {
-            size_t total_len = static_cast<size_t>(bs) * (context_len + grid_h * grid_w);
-            axis_wrap_dims[1].assign(total_len, 0);
-            axis_wrap_dims[2].assign(total_len, 0);
-            if (circular_y) {
-                for (size_t idx = static_cast<size_t>(context_len); idx < total_len; ++idx) {
-                    axis_wrap_dims[1][idx] = grid_h;
-                }
-            }
-            if (circular_x) {
-                for (size_t idx = static_cast<size_t>(context_len); idx < total_len; ++idx) {
-                    axis_wrap_dims[2][idx] = grid_w;
-                }
-            }
-        }
-
-        return Rope::embed_interleaved_mrope(ids,
-                                             bs,
-                                             static_cast<float>(rope_theta),
-                                             head_dim,
-                                             mrope_section,
-                                             axis_wrap_dims);
+        return Rope::embed_interleaved_mrope(ids, bs, static_cast<float>(rope_theta), head_dim, mrope_section);
    }

    class Ideogram4Attention : public GGMLBlock {
@ -449,10 +425,10 @@ namespace Ideogram4 {
        std::vector<int32_t> image_indicator_vec;

        Ideogram4Runner(ggml_backend_t backend,
-                        const String2TensorStorage& tensor_storage_map      = {},
-                        const std::string prefix                            = "",
-                        std::shared_ptr<RunnerWeightManager> weight_manager = nullptr)
-            : DiffusionModelRunner(backend, prefix, weight_manager),
+                        ggml_backend_t params_backend,
+                        const String2TensorStorage& tensor_storage_map = {},
+                        const std::string prefix                       = "")
+            : DiffusionModelRunner(backend, params_backend, prefix),
              config(Ideogram4Config::detect_from_weights(tensor_storage_map, prefix)),
              uncond_prefix(prefix + ".uncond") {
            model = Ideogram4Transformer(config);
@ -504,17 +480,14 @@ namespace Ideogram4 {
            int64_t pos_len  = context_len + grid_h * grid_w;
            int64_t head_dim = config.emb_dim / config.num_heads;

-            auto runner_ctx = get_context();
-            pe_vec          = gen_ideogram4_pe(static_cast<int>(grid_h),
-                                               static_cast<int>(grid_w),
-                                               static_cast<int>(x->ne[3]),
-                                               static_cast<int>(context_len),
-                                               static_cast<int>(head_dim),
-                                               static_cast<int>(config.rope_theta),
-                                               config.mrope_section,
-                                               runner_ctx.circular_x_enabled,
-                                               runner_ctx.circular_y_enabled);
-            auto pe         = ggml_new_tensor_4d(compute_ctx, GGML_TYPE_F32, 2, 2, head_dim / 2, pos_len);
+            pe_vec  = gen_ideogram4_pe(static_cast<int>(grid_h),
+                                       static_cast<int>(grid_w),
+                                       static_cast<int>(x->ne[3]),
+                                       static_cast<int>(context_len),
+                                       static_cast<int>(head_dim),
+                                       static_cast<int>(config.rope_theta),
+                                       config.mrope_section);
+            auto pe = ggml_new_tensor_4d(compute_ctx, GGML_TYPE_F32, 2, 2, head_dim / 2, pos_len);
            set_backend_tensor_data(pe, pe_vec.data());

            image_indicator_vec.assign(static_cast<size_t>(pos_len), 1);
@ -524,6 +497,7 @@ namespace Ideogram4 {
            auto indicator = ggml_new_tensor_2d(compute_ctx, GGML_TYPE_I32, pos_len, x->ne[3]);
            set_backend_tensor_data(indicator, image_indicator_vec.data());

+            auto runner_ctx  = get_context();
            ggml_tensor* out = active_model.forward(&runner_ctx, x, timesteps, context, pe, indicator);
            ggml_build_forward_expand(gf, out);
            return gf;
@ -537,7 +511,7 @@ namespace Ideogram4 {
            auto get_graph = [&]() -> ggml_cgraph* {
                return build_graph(x, timesteps, context, use_uncond_model);
            };
-            return restore_trailing_singleton_dims(GGMLRunner::compute<float>(get_graph, n_threads, false, false, false), x.dim());
+            return restore_trailing_singleton_dims(GGMLRunner::compute<float>(get_graph, n_threads, false), x.dim());
        }

        sd::Tensor<float> compute(int n_threads,
--- a/src/model/diffusion/lens.hpp
+++ b/src/model/diffusion/lens.hpp
@ -356,10 +356,10 @@ namespace Lens {
        std::vector<float> pe_vec;

        LensRunner(ggml_backend_t backend,
-                   const String2TensorStorage& tensor_storage_map      = {},
-                   const std::string prefix                            = "",
-                   std::shared_ptr<RunnerWeightManager> weight_manager = nullptr)
-            : DiffusionModelRunner(backend, prefix, weight_manager),
+                   ggml_backend_t params_backend,
+                   const String2TensorStorage& tensor_storage_map = {},
+                   const std::string prefix                       = "")
+            : DiffusionModelRunner(backend, params_backend, prefix),
              config(LensConfig::detect_from_weights(tensor_storage_map, prefix)) {
            lens = LensModel(config);
            lens.init(params_ctx, tensor_storage_map, prefix);
@ -408,7 +408,7 @@ namespace Lens {
            auto get_graph = [&]() -> ggml_cgraph* {
                return build_graph(x, timesteps, context);
            };
-            return restore_trailing_singleton_dims(GGMLRunner::compute<float>(get_graph, n_threads, false, false, false), x.dim());
+            return restore_trailing_singleton_dims(GGMLRunner::compute<float>(get_graph, n_threads, false), x.dim());
        }

        sd::Tensor<float> compute(int n_threads,
--- a/src/model/diffusion/ltxv.hpp
+++ b/src/model/diffusion/ltxv.hpp
@ -1686,10 +1686,10 @@ namespace LTXV {
        sd::Tensor<float> ax_input_cache;

        LTXAVRunner(ggml_backend_t backend,
-                    const String2TensorStorage& tensor_storage_map      = {},
-                    const std::string& prefix                           = "model.diffusion_model",
-                    std::shared_ptr<RunnerWeightManager> weight_manager = nullptr)
-            : DiffusionModelRunner(backend, prefix, weight_manager),
+                    ggml_backend_t params_backend,
+                    const String2TensorStorage& tensor_storage_map = {},
+                    const std::string& prefix                      = "model.diffusion_model")
+            : DiffusionModelRunner(backend, params_backend, prefix),
              config(LTXAVConfig::detect_from_weights(tensor_storage_map, prefix)),
              model(config) {
            model.init(params_ctx, tensor_storage_map, prefix);
@ -1939,7 +1939,7 @@ namespace LTXV {
            auto get_graph = [&]() -> ggml_cgraph* {
                return build_graph(x, timesteps, context, audio_x, audio_timesteps, audio_length, frame_rate, video_positions);
            };
-            auto out = restore_trailing_singleton_dims(GGMLRunner::compute<float>(get_graph, n_threads, false, false, false), x.dim());
+            auto out = restore_trailing_singleton_dims(GGMLRunner::compute<float>(get_graph, n_threads, false), x.dim());
            return out;
        }

@ -2025,8 +2025,7 @@ namespace LTXV {
            ggml_backend_t backend = sd_backend_cpu_init();
            LOG_INFO("loading ltxav from '%s'", model_path.c_str());

-            auto model_manager        = std::make_shared<ModelManager>();
-            ModelLoader& model_loader = model_manager->loader();
+            ModelLoader model_loader;
            if (!model_loader.init_from_file_and_convert_name(model_path, "model.diffusion_model.")) {
                LOG_ERROR("init model loader from file failed: '%s'", model_path.c_str());
                return;
@ -2041,18 +2040,19 @@ namespace LTXV {

            auto& tensor_storage_map           = model_loader.get_tensor_storage_map();
            std::shared_ptr<LTXAVRunner> ltxav = std::make_shared<LTXAVRunner>(backend,
+                                                                               backend,
                                                                               tensor_storage_map,
-                                                                               "model.diffusion_model",
-                                                                               model_manager);
+                                                                               "model.diffusion_model");

-            if (!model_manager->register_runner_params("LTXAV test",
-                                                       *ltxav,
-                                                       "model.diffusion_model",
-                                                       ModelManager::ResidencyMode::ParamBackend,
-                                                       backend,
-                                                       backend) ||
-                !model_manager->validate_registered_tensors()) {
-                LOG_ERROR("register ltxav tensors with model manager failed");
+            if (!ltxav->alloc_params_buffer()) {
+                LOG_ERROR("ltxav buffer allocation failed");
+                return;
+            }
+            std::map<std::string, ggml_tensor*> tensors;
+            ltxav->get_param_tensors(tensors, "model.diffusion_model");
+
+            if (!model_loader.load_tensors(tensors)) {
+                LOG_ERROR("load tensors from model loader failed");
                return;
            }

--- a/src/model/diffusion/mmdit.hpp
+++ b/src/model/diffusion/mmdit.hpp
@ -879,10 +879,10 @@ struct MMDiTRunner : public DiffusionModelRunner {
    MMDiT mmdit;

    MMDiTRunner(ggml_backend_t backend,
-                const String2TensorStorage& tensor_storage_map      = {},
-                const std::string prefix                            = "",
-                std::shared_ptr<RunnerWeightManager> weight_manager = nullptr)
-        : DiffusionModelRunner(backend, prefix, weight_manager),
+                ggml_backend_t params_backend,
+                const String2TensorStorage& tensor_storage_map = {},
+                const std::string prefix                       = "")
+        : DiffusionModelRunner(backend, params_backend, prefix),
          config(MMDiTConfig::detect_from_weights(tensor_storage_map, prefix)),
          mmdit(config) {
        mmdit.init(params_ctx, tensor_storage_map, prefix);
@ -935,7 +935,7 @@ struct MMDiTRunner : public DiffusionModelRunner {
            return build_graph(x, timesteps, context, y, skip_layers);
        };

-        return restore_trailing_singleton_dims(GGMLRunner::compute<float>(get_graph, n_threads, false, false, false), x.dim());
+        return restore_trailing_singleton_dims(GGMLRunner::compute<float>(get_graph, n_threads, false), x.dim());
    }

    sd::Tensor<float> compute(int n_threads,
@ -1001,25 +1001,28 @@ struct MMDiTRunner : public DiffusionModelRunner {
        // ggml_backend_t backend    = ggml_backend_cuda_init(0);
        ggml_backend_t backend             = sd_backend_cpu_init();
        ggml_type model_data_type          = GGML_TYPE_F16;
-        auto model_manager                 = std::make_shared<ModelManager>();
-        std::shared_ptr<MMDiTRunner> mmdit = std::make_shared<MMDiTRunner>(backend, String2TensorStorage{}, "", model_manager);
+        std::shared_ptr<MMDiTRunner> mmdit = std::make_shared<MMDiTRunner>(backend, backend);
        {
            LOG_INFO("loading from '%s'", file_path.c_str());

-            ModelLoader& model_loader = model_manager->loader();
+            if (!mmdit->alloc_params_buffer()) {
+                LOG_ERROR("mmdit embeds buffer allocation failed");
+                return;
+            }
+
+            std::map<std::string, ggml_tensor*> tensors;
+            mmdit->get_param_tensors(tensors, "model.diffusion_model");
+
+            ModelLoader model_loader;
            if (!model_loader.init_from_file_and_convert_name(file_path)) {
                LOG_ERROR("init model loader from file failed: '%s'", file_path.c_str());
                return;
            }

-            if (!model_manager->register_runner_params("MMDiT test",
-                                                       *mmdit,
-                                                       "model.diffusion_model",
-                                                       ModelManager::ResidencyMode::ParamBackend,
-                                                       backend,
-                                                       backend) ||
-                !model_manager->validate_registered_tensors()) {
-                LOG_ERROR("register mmdit tensors with model manager failed");
+            bool success = model_loader.load_tensors(tensors);
+
+            if (!success) {
+                LOG_ERROR("load tensors from model loader failed");
                return;
            }

--- a/src/model/diffusion/model.hpp
+++ b/src/model/diffusion/model.hpp
@ -1,4 +1,4 @@
-#ifndef __SD_MODEL_DIFFUSION_MODEL_HPP__
+#ifndef __SD_MODEL_DIFFUSION_MODEL_HPP__
 #define __SD_MODEL_DIFFUSION_MODEL_HPP__

 #include <string>
@ -7,7 +7,6 @@

 #include "core/ggml_extend.hpp"
 #include "core/tensor_ggml.hpp"
-#include "model_manager.h"

 struct UNetDiffusionExtra {
    int num_video_frames                           = -1;
@ -22,8 +21,6 @@ struct SkipLayerDiffusionExtra {
 struct FluxDiffusionExtra {
    const sd::Tensor<float>* guidance   = nullptr;
    const std::vector<int>* skip_layers = nullptr;
-    const sd::Tensor<float>* pulid_id   = nullptr;
-    float pulid_id_weight               = 1.0f;
 };

 struct AnimaDiffusionExtra {
@ -91,9 +88,9 @@ protected:

 public:
    DiffusionModelRunner(ggml_backend_t backend,
-                         const std::string& prefix,
-                         std::shared_ptr<RunnerWeightManager> weight_manager = nullptr)
-        : GGMLRunner(backend, weight_manager),
+                         ggml_backend_t params_backend,
+                         const std::string& prefix)
+        : GGMLRunner(backend, params_backend),
          prefix(prefix) {}

    virtual sd::Tensor<float> compute(int n_threads,
--- a/src/model/diffusion/pid.hpp
+++ b/src/model/diffusion/pid.hpp
@ -710,10 +710,10 @@ namespace Pid {
        std::vector<float> pixel_pos_comp_vec;

        PiDRunner(ggml_backend_t backend,
+                  ggml_backend_t params_backend,
                  const String2TensorStorage& tensor_storage_map,
-                  const std::string prefix                            = "model.diffusion_model",
-                  std::shared_ptr<RunnerWeightManager> weight_manager = nullptr)
-            : DiffusionModelRunner(backend, prefix, weight_manager),
+                  const std::string prefix = "model.diffusion_model")
+            : DiffusionModelRunner(backend, params_backend, prefix),
              config(PixelDiTConfig::detect_from_weights(tensor_storage_map, prefix)) {
            model = PixelDiT(config);
            model.init(params_ctx, tensor_storage_map, prefix);
@ -823,7 +823,7 @@ namespace Pid {
            auto get_graph = [&]() -> ggml_cgraph* {
                return build_graph(x, timesteps, context, lq_latent, degrade_sigma);
            };
-            return restore_trailing_singleton_dims(GGMLRunner::compute<float>(get_graph, n_threads, false, false, false), x.dim());
+            return restore_trailing_singleton_dims(GGMLRunner::compute<float>(get_graph, n_threads, false), x.dim());
        }

        sd::Tensor<float> compute(int n_threads,
--- a/src/model/diffusion/qwen_image.hpp
+++ b/src/model/diffusion/qwen_image.hpp
@ -518,12 +518,12 @@ namespace Qwen {
        SDVersion version;

        QwenImageRunner(ggml_backend_t backend,
-                        const String2TensorStorage& tensor_storage_map      = {},
-                        const std::string prefix                            = "",
-                        SDVersion version                                   = VERSION_QWEN_IMAGE,
-                        bool zero_cond_t                                    = false,
-                        std::shared_ptr<RunnerWeightManager> weight_manager = nullptr)
-            : DiffusionModelRunner(backend, prefix, weight_manager),
+                        ggml_backend_t params_backend,
+                        const String2TensorStorage& tensor_storage_map = {},
+                        const std::string prefix                       = "",
+                        SDVersion version                              = VERSION_QWEN_IMAGE,
+                        bool zero_cond_t                               = false)
+            : DiffusionModelRunner(backend, params_backend, prefix),
              config(QwenImageConfig::detect_from_weights(tensor_storage_map, prefix)) {
            config.zero_cond_t = config.zero_cond_t || zero_cond_t;
            qwen_image         = QwenImageModel(config);
@ -627,7 +627,7 @@ namespace Qwen {
                return build_graph(x, timesteps, context, ref_latents, increase_ref_index);
            };

-            return restore_trailing_singleton_dims(GGMLRunner::compute<float>(get_graph, n_threads, false, false, false), x.dim());
+            return restore_trailing_singleton_dims(GGMLRunner::compute<float>(get_graph, n_threads, false), x.dim());
        }

        sd::Tensor<float> compute(int n_threads,
@ -691,8 +691,7 @@ namespace Qwen {
            ggml_backend_t backend    = sd_backend_cpu_init();
            ggml_type model_data_type = GGML_TYPE_Q8_0;

-            auto model_manager        = std::make_shared<ModelManager>();
-            ModelLoader& model_loader = model_manager->loader();
+            ModelLoader model_loader;
            if (!model_loader.init_from_file_and_convert_name(file_path, "model.diffusion_model.")) {
                LOG_ERROR("init model loader from file failed: '%s'", file_path.c_str());
                return;
@ -706,20 +705,23 @@ namespace Qwen {
            }

            std::shared_ptr<QwenImageRunner> qwen_image = std::make_shared<QwenImageRunner>(backend,
+                                                                                            backend,
                                                                                            tensor_storage_map,
                                                                                            "model.diffusion_model",
-                                                                                            VERSION_QWEN_IMAGE,
-                                                                                            false,
-                                                                                            model_manager);
+                                                                                            VERSION_QWEN_IMAGE);

-            if (!model_manager->register_runner_params("Qwen image test",
-                                                       *qwen_image,
-                                                       "model.diffusion_model",
-                                                       ModelManager::ResidencyMode::ParamBackend,
-                                                       backend,
-                                                       backend) ||
-                !model_manager->validate_registered_tensors()) {
-                LOG_ERROR("register qwen_image tensors with model manager failed");
+            if (!qwen_image->alloc_params_buffer()) {
+                LOG_ERROR("qwen_image buffer allocation failed");
+                return;
+            }
+
+            std::map<std::string, ggml_tensor*> tensors;
+            qwen_image->get_param_tensors(tensors, "model.diffusion_model");
+
+            bool success = model_loader.load_tensors(tensors);
+
+            if (!success) {
+                LOG_ERROR("load tensors from model loader failed");
                return;
            }

--- a/src/model/diffusion/unet.hpp
+++ b/src/model/diffusion/unet.hpp
@ -694,11 +694,11 @@ struct UNetModelRunner : public DiffusionModelRunner {
    UnetModelBlock unet;

    UNetModelRunner(ggml_backend_t backend,
+                    ggml_backend_t params_backend,
                    const String2TensorStorage& tensor_storage_map,
                    const std::string prefix,
-                    SDVersion version                                   = VERSION_SD1,
-                    std::shared_ptr<RunnerWeightManager> weight_manager = nullptr)
-        : DiffusionModelRunner(backend, prefix, weight_manager),
+                    SDVersion version = VERSION_SD1)
+        : DiffusionModelRunner(backend, params_backend, prefix),
          config(UNetConfig::detect_from_weights(tensor_storage_map, prefix, version)),
          unet(config) {
        unet.init(params_ctx, tensor_storage_map, prefix);
@ -772,7 +772,7 @@ struct UNetModelRunner : public DiffusionModelRunner {
            return build_graph(x, timesteps, context, c_concat, y, num_video_frames, controls, control_strength);
        };

-        return restore_trailing_singleton_dims(GGMLRunner::compute<float>(get_graph, n_threads, false, false, false), x.dim());
+        return restore_trailing_singleton_dims(GGMLRunner::compute<float>(get_graph, n_threads, false), x.dim());
    }

    sd::Tensor<float> compute(int n_threads,
--- a/src/model/diffusion/wan.hpp
+++ b/src/model/diffusion/wan.hpp
@ -799,11 +799,11 @@ namespace WAN {
        SDVersion version;

        WanRunner(ggml_backend_t backend,
-                  const String2TensorStorage& tensor_storage_map      = {},
-                  const std::string prefix                            = "",
-                  SDVersion version                                   = VERSION_WAN2,
-                  std::shared_ptr<RunnerWeightManager> weight_manager = nullptr)
-            : DiffusionModelRunner(backend, prefix, weight_manager),
+                  ggml_backend_t params_backend,
+                  const String2TensorStorage& tensor_storage_map = {},
+                  const std::string prefix                       = "",
+                  SDVersion version                              = VERSION_WAN2)
+            : DiffusionModelRunner(backend, params_backend, prefix),
              config(WanConfig::detect_from_weights(tensor_storage_map, prefix)) {
            if (config.num_layers == 30) {
                if (version == VERSION_WAN2_2_TI2V) {
@ -950,7 +950,7 @@ namespace WAN {
                return build_graph(x, timesteps, context, clip_fea, c_concat, time_dim_concat, vace_context, vace_strength);
            };

-            return restore_trailing_singleton_dims(GGMLRunner::compute<float>(get_graph, n_threads, false, false, false), x.dim());
+            return restore_trailing_singleton_dims(GGMLRunner::compute<float>(get_graph, n_threads, false), x.dim());
        }

        sd::Tensor<float> compute(int n_threads,
@ -1017,8 +1017,7 @@ namespace WAN {
            ggml_type model_data_type = GGML_TYPE_F16;
            LOG_INFO("loading from '%s'", file_path.c_str());

-            auto model_manager        = std::make_shared<ModelManager>();
-            ModelLoader& model_loader = model_manager->loader();
+            ModelLoader model_loader;
            if (!model_loader.init_from_file_and_convert_name(file_path, "model.diffusion_model.")) {
                LOG_ERROR("init model loader from file failed: '%s'", file_path.c_str());
                return;
@ -1032,19 +1031,23 @@ namespace WAN {
            }

            std::shared_ptr<WanRunner> wan = std::make_shared<WanRunner>(backend,
+                                                                         backend,
                                                                         tensor_storage_map,
                                                                         "model.diffusion_model",
-                                                                         VERSION_WAN2_2_TI2V,
-                                                                         model_manager);
+                                                                         VERSION_WAN2_2_TI2V);

-            if (!model_manager->register_runner_params("Wan test",
-                                                       *wan,
-                                                       "model.diffusion_model",
-                                                       ModelManager::ResidencyMode::ParamBackend,
-                                                       backend,
-                                                       backend) ||
-                !model_manager->validate_registered_tensors()) {
-                LOG_ERROR("register wan tensors with model manager failed");
+            if (!wan->alloc_params_buffer()) {
+                LOG_ERROR("wan buffer allocation failed");
+                return;
+            }
+
+            std::map<std::string, ggml_tensor*> tensors;
+            wan->get_param_tensors(tensors, "model.diffusion_model");
+
+            bool success = model_loader.load_tensors(tensors);
+
+            if (!success) {
+                LOG_ERROR("load tensors from model loader failed");
                return;
            }

--- a/src/model/diffusion/z_image.hpp
+++ b/src/model/diffusion/z_image.hpp
@ -553,11 +553,11 @@ namespace ZImage {
        SDVersion version;

        ZImageRunner(ggml_backend_t backend,
-                     const String2TensorStorage& tensor_storage_map      = {},
-                     const std::string prefix                            = "",
-                     SDVersion version                                   = VERSION_Z_IMAGE,
-                     std::shared_ptr<RunnerWeightManager> weight_manager = nullptr)
-            : DiffusionModelRunner(backend, prefix, weight_manager),
+                     ggml_backend_t params_backend,
+                     const String2TensorStorage& tensor_storage_map = {},
+                     const std::string prefix                       = "",
+                     SDVersion version                              = VERSION_Z_IMAGE)
+            : DiffusionModelRunner(backend, params_backend, prefix),
              config(ZImageConfig::detect_from_weights(tensor_storage_map, prefix)) {
            z_image = ZImageModel(config);
            z_image.init(params_ctx, tensor_storage_map, prefix);
@ -634,7 +634,7 @@ namespace ZImage {
                return build_graph(x, timesteps, context, ref_latents, increase_ref_index);
            };

-            return restore_trailing_singleton_dims(GGMLRunner::compute<float>(get_graph, n_threads, false, false, false), x.dim());
+            return restore_trailing_singleton_dims(GGMLRunner::compute<float>(get_graph, n_threads, false), x.dim());
        }

        sd::Tensor<float> compute(int n_threads,
@ -698,8 +698,7 @@ namespace ZImage {
            ggml_backend_t backend    = sd_backend_cpu_init();
            ggml_type model_data_type = GGML_TYPE_Q8_0;

-            auto model_manager        = std::make_shared<ModelManager>();
-            ModelLoader& model_loader = model_manager->loader();
+            ModelLoader model_loader;
            if (!model_loader.init_from_file_and_convert_name(file_path, "model.diffusion_model.")) {
                LOG_ERROR("init model loader from file failed: '%s'", file_path.c_str());
                return;
@ -715,19 +714,22 @@ namespace ZImage {
            }

            std::shared_ptr<ZImageRunner> z_image = std::make_shared<ZImageRunner>(backend,
+                                                                                   backend,
                                                                                   tensor_storage_map,
                                                                                   "model.diffusion_model",
-                                                                                   VERSION_QWEN_IMAGE,
-                                                                                   model_manager);
+                                                                                   VERSION_QWEN_IMAGE);

-            if (!model_manager->register_runner_params("ZImage test",
-                                                       *z_image,
-                                                       "model.diffusion_model",
-                                                       ModelManager::ResidencyMode::ParamBackend,
-                                                       backend,
-                                                       backend) ||
-                !model_manager->validate_registered_tensors()) {
-                LOG_ERROR("register z_image tensors with model manager failed");
+            if (!z_image->alloc_params_buffer()) {
+                LOG_ERROR("z_image buffer allocation failed");
+                return;
+            }
+            std::map<std::string, ggml_tensor*> tensors;
+            z_image->get_param_tensors(tensors, "model.diffusion_model");
+
+            bool success = model_loader.load_tensors(tensors);
+
+            if (!success) {
+                LOG_ERROR("load tensors from model loader failed");
                return;
            }

--- a/src/model/te/clip.hpp
+++ b/src/model/te/clip.hpp
@ -1,4 +1,4 @@
-#ifndef __SD_MODEL_TE_CLIP_HPP__
+#ifndef __SD_MODEL_TE_CLIP_HPP__
 #define __SD_MODEL_TE_CLIP_HPP__

 #include "core/ggml_extend.hpp"
@ -469,13 +469,13 @@ struct CLIPTextModelRunner : public GGMLRunner {
    std::vector<float> attention_mask_vec;

    CLIPTextModelRunner(ggml_backend_t backend,
+                        ggml_backend_t params_backend,
                        const String2TensorStorage& tensor_storage_map,
                        const std::string prefix,
-                        CLIPVersion version                                 = OPENAI_CLIP_VIT_L_14,
-                        bool with_final_ln                                  = true,
-                        bool force_clip_f32                                 = false,
-                        std::shared_ptr<RunnerWeightManager> weight_manager = nullptr)
-        : GGMLRunner(backend, weight_manager) {
+                        CLIPVersion version = OPENAI_CLIP_VIT_L_14,
+                        bool with_final_ln  = true,
+                        bool force_clip_f32 = false)
+        : GGMLRunner(backend, params_backend) {
        bool proj_in = false;
        for (const auto& [name, tensor_storage] : tensor_storage_map) {
            if (!starts_with(name, prefix)) {
@ -567,14 +567,11 @@ struct CLIPTextModelRunner : public GGMLRunner {
                              void* custom_embeddings_data,
                              size_t max_token_idx,
                              bool return_pooled,
-                              int clip_skip,
-                              bool auto_free           = true,
-                              bool free_compute_buffer = true,
-                              bool free_compute_params = true) {
+                              int clip_skip) {
        auto get_graph = [&]() -> ggml_cgraph* {
            return build_graph(input_ids, num_custom_embeddings, custom_embeddings_data, max_token_idx, return_pooled, clip_skip);
        };
-        auto result = GGMLRunner::compute<float>(get_graph, n_threads, auto_free, free_compute_buffer, free_compute_params);
+        auto result = GGMLRunner::compute<float>(get_graph, n_threads, true);
        if (return_pooled) {
            return take_or_empty(std::move(result));
        }
--- a/src/model/te/llm.hpp
+++ b/src/model/te/llm.hpp
@ -1,4 +1,4 @@
-#ifndef __SD_MODEL_TE_LLM_HPP__
+#ifndef __SD_MODEL_TE_LLM_HPP__
 #define __SD_MODEL_TE_LLM_HPP__

 #include <algorithm>
@ -22,7 +22,6 @@
 #include "json.hpp"
 #include "model/common/rope.hpp"
 #include "model_loader.h"
-#include "model_manager.h"
 #include "tokenizers/bpe_tokenizer.h"
 #include "tokenizers/gemma_tokenizer.h"
 #include "tokenizers/gpt_oss_tokenizer.h"
@ -79,7 +78,6 @@ namespace LLM {
        int window_size                     = 112;
        int num_position_embeddings         = 0;
        std::set<int> fullatt_block_indexes = {7, 15, 23, 31};
-        bool split_patch_embed              = false;
    };

    struct LLMConfig {
@ -180,8 +178,7 @@ namespace LLM {
                config.num_experts_per_tok     = 4;
            }

-            config.num_layers          = 0;
-            int detected_vision_layers = 0;
+            config.num_layers = 0;
            for (const auto& [name, tensor_storage] : tensor_storage_map) {
                if (!starts_with(name, prefix)) {
                    continue;
@ -192,38 +189,6 @@ namespace LLM {
                    if (contains(name, "attn.q_proj")) {
                        config.llama_cpp_style = true;
                    }
-                    if (contains(name, "visual.patch_embed.proj.1.weight")) {
-                        config.vision.split_patch_embed = true;
-                    }
-                    if (contains(name, "visual.patch_embed.proj.0.weight")) {
-                        config.vision.patch_size  = static_cast<int>(tensor_storage.ne[0]);
-                        config.vision.in_channels = tensor_storage.ne[2];
-                        config.vision.hidden_size = tensor_storage.ne[3];
-                    }
-                    if (contains(name, "visual.patch_embed.bias")) {
-                        config.vision.hidden_size = tensor_storage.ne[0];
-                    }
-                    if (contains(name, "visual.pos_embed.weight")) {
-                        config.vision.hidden_size             = tensor_storage.ne[0];
-                        config.vision.num_position_embeddings = static_cast<int>(tensor_storage.ne[1]);
-                    }
-                    if (contains(name, "visual.blocks.")) {
-                        auto items = split_string(name.substr(pos), '.');
-                        if (items.size() > 2) {
-                            int block_index = atoi(items[2].c_str());
-                            if (block_index + 1 > detected_vision_layers) {
-                                detected_vision_layers = block_index + 1;
-                            }
-                        }
-                    }
-                    if (contains(name, "visual.blocks.0.mlp.linear_fc1.weight") ||
-                        contains(name, "visual.blocks.0.mlp.gate_proj.weight")) {
-                        config.vision.intermediate_size = tensor_storage.ne[1];
-                    }
-                    if (contains(name, "visual.merger.linear_fc2.weight") ||
-                        contains(name, "visual.merger.mlp.2.weight")) {
-                        config.vision.out_hidden_size = tensor_storage.ne[1];
-                    }
                    continue;
                }
                pos = name.find("layers.");
@ -253,9 +218,6 @@ namespace LLM {
            if (arch == LLMArch::QWEN3 && config.num_layers == 28) {
                config.num_heads = 16;
            }
-            if (detected_vision_layers > 0) {
-                config.vision.num_layers = detected_vision_layers;
-            }
            LOG_DEBUG("llm: num_layers = %" PRId64 ", vocab_size = %" PRId64 ", hidden_size = %" PRId64 ", intermediate_size = %" PRId64,
                      config.num_layers,
                      config.vocab_size,
@ -576,51 +538,40 @@ namespace LLM {

    struct VisionPatchEmbed : public GGMLBlock {
    protected:
-        bool split_patch_embed;
-        bool bias;
+        bool llama_cpp_style;
        int patch_size;
        int temporal_patch_size;
        int64_t in_channels;
        int64_t embed_dim;

-        void init_params(ggml_context* ctx,
-                         const String2TensorStorage& tensor_storage_map = {},
-                         const std::string prefix                       = "") override {
-            GGML_UNUSED(tensor_storage_map);
-            GGML_UNUSED(prefix);
-            if (split_patch_embed && bias) {
-                params["bias"] = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, embed_dim);
-            }
-        }
-
    public:
-        VisionPatchEmbed(bool split_patch_embed,
+        VisionPatchEmbed(bool llama_cpp_style,
                         LLMVisionArch arch,
                         int patch_size          = 14,
                         int temporal_patch_size = 2,
                         int64_t in_channels     = 3,
                         int64_t embed_dim       = 1152)
-            : split_patch_embed(split_patch_embed),
-              bias(arch == LLMVisionArch::QWEN3_VL),
+            : llama_cpp_style(llama_cpp_style),
              patch_size(patch_size),
              temporal_patch_size(temporal_patch_size),
              in_channels(in_channels),
              embed_dim(embed_dim) {
-            if (split_patch_embed) {
+            bool bias = arch == LLMVisionArch::QWEN3_VL;
+            if (llama_cpp_style) {
                blocks["proj.0"] = std::shared_ptr<GGMLBlock>(new Conv2d(in_channels,
                                                                         embed_dim,
                                                                         {patch_size, patch_size},
                                                                         {patch_size, patch_size},
                                                                         {0, 0},
                                                                         {1, 1},
-                                                                         false));
+                                                                         bias));
                blocks["proj.1"] = std::shared_ptr<GGMLBlock>(new Conv2d(in_channels,
                                                                         embed_dim,
                                                                         {patch_size, patch_size},
                                                                         {patch_size, patch_size},
                                                                         {0, 0},
                                                                         {1, 1},
-                                                                         false));
+                                                                         bias));
            } else {
                std::tuple<int, int, int> kernel_size = {(int)temporal_patch_size, (int)patch_size, (int)patch_size};
                blocks["proj"]                        = std::shared_ptr<GGMLBlock>(new Conv3d(in_channels,
@ -641,7 +592,7 @@ namespace LLM {
                                temporal_patch_size,
                                ggml_nelements(x) / (temporal_patch_size * patch_size * patch_size));

-            if (split_patch_embed) {
+            if (llama_cpp_style) {
                auto proj_0 = std::dynamic_pointer_cast<Conv2d>(blocks["proj.0"]);
                auto proj_1 = std::dynamic_pointer_cast<Conv2d>(blocks["proj.1"]);

@ -654,10 +605,6 @@ namespace LLM {
                x1      = proj_1->forward(ctx, x1);

                x = ggml_add(ctx->ggml_ctx, x0, x1);
-                if (bias) {
-                    auto b = ggml_reshape_4d(ctx->ggml_ctx, params["bias"], 1, 1, embed_dim, 1);
-                    x      = ggml_add_inplace(ctx->ggml_ctx, x, b);
-                }
            } else {
                auto proj = std::dynamic_pointer_cast<Conv3d>(blocks["proj"]);

@ -850,7 +797,7 @@ namespace LLM {
              spatial_merge_size(vision_params.spatial_merge_size),
              num_grid_per_side(vision_params.num_position_embeddings > 0 ? static_cast<int>(std::sqrt(vision_params.num_position_embeddings)) : 0),
              fullatt_block_indexes(vision_params.fullatt_block_indexes) {
-            blocks["patch_embed"] = std::shared_ptr<GGMLBlock>(new VisionPatchEmbed(vision_params.split_patch_embed,
+            blocks["patch_embed"] = std::shared_ptr<GGMLBlock>(new VisionPatchEmbed(llama_cpp_style,
                                                                                    arch_,
                                                                                    vision_params.patch_size,
                                                                                    vision_params.temporal_patch_size,
@ -1624,11 +1571,11 @@ namespace LLM {
    public:
        LLMRunner(LLMArch arch,
                  ggml_backend_t backend,
+                  ggml_backend_t params_backend,
                  const String2TensorStorage& tensor_storage_map,
                  const std::string prefix,
-                  bool enable_vision_                                 = false,
-                  std::shared_ptr<RunnerWeightManager> weight_manager = nullptr)
-            : GGMLRunner(backend, weight_manager),
+                  bool enable_vision_ = false)
+            : GGMLRunner(backend, params_backend),
              config(LLMConfig::detect_from_weights(tensor_storage_map, prefix, arch)),
              enable_vision(enable_vision_) {
            if (enable_vision && !config.have_vision_weight) {
@ -1786,10 +1733,7 @@ namespace LLM {
                                  const sd::Tensor<float>& attention_mask,
                                  const std::vector<std::pair<int, sd::Tensor<float>>>& image_embeds,
                                  std::set<int> out_layers,
-                                  bool return_all_hidden_states = false,
-                                  bool auto_free                = true,
-                                  bool free_compute_buffer      = true,
-                                  bool free_compute_params      = true) {
+                                  bool return_all_hidden_states = false) {
            auto get_graph = [&]() -> ggml_cgraph* {
                return build_graph(input_ids,
                                   attention_mask,
@ -1797,7 +1741,7 @@ namespace LLM {
                                   out_layers,
                                   return_all_hidden_states);
            };
-            return restore_trailing_singleton_dims(GGMLRunner::compute<float>(get_graph, n_threads, auto_free, free_compute_buffer, free_compute_params),
+            return restore_trailing_singleton_dims(GGMLRunner::compute<float>(get_graph, n_threads, true),
                                                   input_ids.dim() + 1);
        }

@ -1858,14 +1802,11 @@ namespace LLM {
        }

        sd::Tensor<float> encode_image(const int n_threads,
-                                       const sd::Tensor<float>& image,
-                                       bool auto_free           = false,
-                                       bool free_compute_buffer = false,
-                                       bool free_compute_params = false) {
+                                       const sd::Tensor<float>& image) {
            auto get_graph = [&]() -> ggml_cgraph* {
                return build_encode_image_graph(image);
            };
-            return take_or_empty(GGMLRunner::compute<float>(get_graph, n_threads, auto_free, free_compute_buffer, free_compute_params));
+            return take_or_empty(GGMLRunner::compute<float>(get_graph, n_threads, false));
        }
    };

@ -1875,11 +1816,11 @@ namespace LLM {

        LLMEmbedder(LLMArch arch,
                    ggml_backend_t backend,
-                    const String2TensorStorage& tensor_storage_map      = {},
-                    const std::string prefix                            = "",
-                    bool enable_vision                                  = false,
-                    std::shared_ptr<RunnerWeightManager> weight_manager = nullptr)
-            : model(arch, backend, tensor_storage_map, prefix, enable_vision, weight_manager) {
+                    ggml_backend_t params_backend,
+                    const String2TensorStorage& tensor_storage_map = {},
+                    const std::string prefix                       = "",
+                    bool enable_vision                             = false)
+            : model(arch, backend, params_backend, tensor_storage_map, prefix, enable_vision) {
            if (arch == LLMArch::MISTRAL_SMALL_3_2 || arch == LLMArch::MINISTRAL_3_3B) {
                tokenizer = std::make_shared<MistralTokenizer>();
            } else if (arch == LLMArch::GPT_OSS_20B) {
@ -1893,6 +1834,13 @@ namespace LLM {
            model.get_param_tensors(tensors, prefix);
        }

+        // Allocates the parameter buffer of the wrapped runner (`model`).
+        // Forwards the runner's own result: callers in this file treat a
+        // false return as an allocation failure and abort loading.
+        bool alloc_params_buffer() {
+            return model.alloc_params_buffer();
+        }
+
        std::tuple<std::vector<int>, std::vector<float>> tokenize(std::string text,
                                                                  std::pair<int, int> attn_range,
                                                                  size_t max_length = 0,
@ -2108,8 +2056,7 @@ namespace LLM {
            ggml_backend_t backend    = sd_backend_cpu_init();
            ggml_type model_data_type = GGML_TYPE_COUNT;

-            auto model_manager        = std::make_shared<ModelManager>();
-            ModelLoader& model_loader = model_manager->loader();
+            ModelLoader model_loader;
            if (!model_loader.init_from_file_and_convert_name(file_path, "text_encoders.llm.")) {
                LOG_ERROR("init model loader from file failed: '%s'", file_path.c_str());
                return;
@ -2127,20 +2074,24 @@ namespace LLM {
            LLMArch arch = LLMArch::QWEN3;

            std::shared_ptr<LLMEmbedder> llm = std::make_shared<LLMEmbedder>(arch,
+                                                                             backend,
                                                                             backend,
                                                                             tensor_storage_map,
                                                                             "text_encoders.llm",
-                                                                             true,
-                                                                             model_manager);
+                                                                             true);

-            if (!model_manager->register_runner_params("LLM test",
-                                                       *llm,
-                                                       "text_encoders.llm",
-                                                       ModelManager::ResidencyMode::ParamBackend,
-                                                       backend,
-                                                       backend) ||
-                !model_manager->validate_registered_tensors()) {
-                LOG_ERROR("register llm tensors with model manager failed");
+            if (!llm->alloc_params_buffer()) {
+                LOG_ERROR("llm model allocation failed");
+                return;
+            }
+
+            std::map<std::string, ggml_tensor*> tensors;
+            llm->get_param_tensors(tensors, "text_encoders.llm");
+
+            bool success = model_loader.load_tensors(tensors);
+
+            if (!success) {
+                LOG_ERROR("load tensors from model loader failed");
                return;
            }

--- a/src/model/te/t5.hpp
+++ b/src/model/te/t5.hpp
@ -1,4 +1,4 @@
-#ifndef __SD_MODEL_TE_T5_HPP__
+#ifndef __SD_MODEL_TE_T5_HPP__
 #define __SD_MODEL_TE_T5_HPP__

 #include <cfloat>
@ -12,7 +12,6 @@

 #include "core/ggml_extend.hpp"
 #include "model_loader.h"
-#include "model_manager.h"
 #include "tokenizers/t5_unigram_tokenizer.h"

 struct T5Config {
@ -335,11 +334,11 @@ struct T5Runner : public GGMLRunner {
    std::vector<int> relative_position_bucket_vec;

    T5Runner(ggml_backend_t backend,
+             ggml_backend_t params_backend,
             const String2TensorStorage& tensor_storage_map,
             const std::string prefix,
-             bool is_umt5                                        = false,
-             std::shared_ptr<RunnerWeightManager> weight_manager = nullptr)
-        : GGMLRunner(backend, weight_manager),
+             bool is_umt5 = false)
+        : GGMLRunner(backend, params_backend),
          config(T5Config::detect_from_weights(tensor_storage_map, prefix, is_umt5)) {
        model = T5(config);
        model.init(params_ctx, tensor_storage_map, prefix);
@ -395,14 +394,11 @@ struct T5Runner : public GGMLRunner {

    sd::Tensor<float> compute(const int n_threads,
                              const sd::Tensor<int32_t>& input_ids,
-                              const sd::Tensor<float>& attention_mask,
-                              bool auto_free           = true,
-                              bool free_compute_buffer = true,
-                              bool free_compute_params = true) {
+                              const sd::Tensor<float>& attention_mask) {
        auto get_graph = [&]() -> ggml_cgraph* {
            return build_graph(input_ids, attention_mask);
        };
-        return restore_trailing_singleton_dims(GGMLRunner::compute<float>(get_graph, n_threads, auto_free, free_compute_buffer, free_compute_params), 3);
+        return restore_trailing_singleton_dims(GGMLRunner::compute<float>(get_graph, n_threads, true), 3);
    }

    static std::vector<int> _relative_position_bucket(const std::vector<int>& relative_position,
@ -478,17 +474,24 @@ struct T5Embedder {
    T5Runner model;

    T5Embedder(ggml_backend_t backend,
-               const String2TensorStorage& tensor_storage_map      = {},
-               const std::string prefix                            = "",
-               bool is_umt5                                        = false,
-               std::shared_ptr<RunnerWeightManager> weight_manager = nullptr)
-        : model(backend, tensor_storage_map, prefix, is_umt5, weight_manager), tokenizer(is_umt5) {
+               ggml_backend_t params_backend,
+               const String2TensorStorage& tensor_storage_map = {},
+               const std::string prefix                       = "",
+               bool is_umt5                                   = false)
+        : model(backend, params_backend, tensor_storage_map, prefix, is_umt5), tokenizer(is_umt5) {
    }

    void get_param_tensors(std::map<std::string, ggml_tensor*>& tensors, const std::string prefix) {
        model.get_param_tensors(tensors, prefix);
    }

+    // Allocates the parameter buffer of the wrapped T5Runner (`model`).
+    // Forwards the runner's own result: callers in this file treat a
+    // false return as an allocation failure and abort loading.
+    bool alloc_params_buffer() {
+        return model.alloc_params_buffer();
+    }
+
    std::tuple<std::vector<int>, std::vector<float>, std::vector<float>> tokenize(std::string text,
                                                                                  size_t max_length = 0,
                                                                                  bool padding      = false) {
@ -573,8 +576,7 @@ struct T5Embedder {
        ggml_backend_t backend    = sd_backend_cpu_init();
        ggml_type model_data_type = GGML_TYPE_F16;

-        auto model_manager        = std::make_shared<ModelManager>();
-        ModelLoader& model_loader = model_manager->loader();
+        ModelLoader model_loader;
        if (!model_loader.init_from_file_and_convert_name(file_path)) {
            LOG_ERROR("init model loader from file failed: '%s'", file_path.c_str());
            return;
@ -587,16 +589,19 @@ struct T5Embedder {
            }
        }

-        std::shared_ptr<T5Embedder> t5 = std::make_shared<T5Embedder>(backend, tensor_storage_map, "", true, model_manager);
+        std::shared_ptr<T5Embedder> t5 = std::make_shared<T5Embedder>(backend, backend, tensor_storage_map, "", true);

-        if (!model_manager->register_runner_params("T5 test",
-                                                   *t5,
-                                                   "",
-                                                   ModelManager::ResidencyMode::ParamBackend,
-                                                   backend,
-                                                   backend) ||
-            !model_manager->validate_registered_tensors()) {
-            LOG_ERROR("register t5 tensors with model manager failed");
+        if (!t5->alloc_params_buffer()) {
+            LOG_ERROR("t5 params buffer allocation failed");
+            return;
+        }
+        std::map<std::string, ggml_tensor*> tensors;
+        t5->get_param_tensors(tensors, "");
+
+        bool success = model_loader.load_tensors(tensors);
+
+        if (!success) {
+            LOG_ERROR("load tensors from model loader failed");
            return;
        }

--- a/src/model/upscaler/esrgan.hpp
+++ b/src/model/upscaler/esrgan.hpp
@ -1,14 +1,8 @@
-#ifndef __SD_MODEL_UPSCALER_ESRGAN_HPP__
+#ifndef __SD_MODEL_UPSCALER_ESRGAN_HPP__
 #define __SD_MODEL_UPSCALER_ESRGAN_HPP__

-#include <algorithm>
-#include <map>
-#include <string>
-#include <utility>
-#include <vector>
-
 #include "core/ggml_extend.hpp"
-#include "core/util.h"
+#include "model_loader.h"

 /*
    ===================================    ESRGAN  ===================================
@ -18,74 +12,6 @@

 */

-struct ESRGANConfig {
-    int scale       = 4;
-    int num_block   = 23;
-    int num_in_ch   = 3;
-    int num_out_ch  = 3;
-    int num_feat    = 64;
-    int num_grow_ch = 32;
-
-    static ESRGANConfig detect_from_weights(const String2TensorStorage& tensor_storage_map,
-                                            const std::string& prefix = "") {
-        ESRGANConfig config;
-        auto find_weight = [&](const std::string& suffix) -> const TensorStorage* {
-            std::string name = prefix.empty() ? suffix : prefix + "." + suffix;
-            auto iter        = tensor_storage_map.find(name);
-            if (iter == tensor_storage_map.end()) {
-                return nullptr;
-            }
-            return &iter->second;
-        };
-
-        int detected_num_block        = 0;
-        const std::string body_prefix = prefix.empty() ? "body." : prefix + ".body.";
-        for (const auto& [name, _] : tensor_storage_map) {
-            if (!starts_with(name, body_prefix)) {
-                continue;
-            }
-            size_t pos = name.find('.', body_prefix.size());
-            if (pos == std::string::npos) {
-                continue;
-            }
-            try {
-                int idx            = std::stoi(name.substr(body_prefix.size(), pos - body_prefix.size()));
-                detected_num_block = std::max(detected_num_block, idx + 1);
-            } catch (...) {
-            }
-        }
-        if (detected_num_block > 0) {
-            config.num_block = detected_num_block;
-        }
-
-        bool has_conv_up2 = find_weight("conv_up2.weight") != nullptr;
-        bool has_conv_up1 = find_weight("conv_up1.weight") != nullptr;
-        bool has_model_tensor =
-            detected_num_block > 0 ||
-            find_weight("conv_first.weight") != nullptr ||
-            find_weight("conv_hr.weight") != nullptr ||
-            find_weight("conv_last.weight") != nullptr;
-        if (has_conv_up2) {
-            config.scale = 4;
-        } else if (has_conv_up1) {
-            config.scale = 2;
-        } else if (has_model_tensor) {
-            config.scale = 1;
-        }
-
-        if (has_model_tensor || has_conv_up1 || has_conv_up2) {
-            LOG_DEBUG("esrgan: scale = %d, num_block = %d, num_in_ch = %d, num_out_ch = %d, num_feat = %d, num_grow_ch = %d",
-                      config.scale,
-                      config.num_block,
-                      config.num_in_ch,
-                      config.num_out_ch,
-                      config.num_feat,
-                      config.num_grow_ch);
-        }
-        return config;
-    }
-};
-
 class ResidualDenseBlock : public GGMLBlock {
 protected:
    int num_feat;
@ -157,29 +83,34 @@ public:

 class RRDBNet : public GGMLBlock {
 protected:
-    ESRGANConfig config;
+    int scale       = 4;
+    int num_block   = 23;
+    int num_in_ch   = 3;
+    int num_out_ch  = 3;
+    int num_feat    = 64;
+    int num_grow_ch = 32;

 public:
-    explicit RRDBNet(ESRGANConfig config)
-        : config(std::move(config)) {
-        blocks["conv_first"] = std::shared_ptr<GGMLBlock>(new Conv2d(this->config.num_in_ch, this->config.num_feat, {3, 3}, {1, 1}, {1, 1}));
-        for (int i = 0; i < this->config.num_block; i++) {
+    RRDBNet(int scale, int num_block, int num_in_ch, int num_out_ch, int num_feat, int num_grow_ch)
+        : scale(scale), num_block(num_block), num_in_ch(num_in_ch), num_out_ch(num_out_ch), num_feat(num_feat), num_grow_ch(num_grow_ch) {
+        blocks["conv_first"] = std::shared_ptr<GGMLBlock>(new Conv2d(num_in_ch, num_feat, {3, 3}, {1, 1}, {1, 1}));
+        for (int i = 0; i < num_block; i++) {
            std::string name = "body." + std::to_string(i);
-            blocks[name]     = std::shared_ptr<GGMLBlock>(new RRDB(this->config.num_feat, this->config.num_grow_ch));
+            blocks[name]     = std::shared_ptr<GGMLBlock>(new RRDB(num_feat, num_grow_ch));
        }
-        blocks["conv_body"] = std::shared_ptr<GGMLBlock>(new Conv2d(this->config.num_feat, this->config.num_feat, {3, 3}, {1, 1}, {1, 1}));
-        if (this->config.scale >= 2) {
-            blocks["conv_up1"] = std::shared_ptr<GGMLBlock>(new Conv2d(this->config.num_feat, this->config.num_feat, {3, 3}, {1, 1}, {1, 1}));
+        blocks["conv_body"] = std::shared_ptr<GGMLBlock>(new Conv2d(num_feat, num_feat, {3, 3}, {1, 1}, {1, 1}));
+        if (scale >= 2) {
+            blocks["conv_up1"] = std::shared_ptr<GGMLBlock>(new Conv2d(num_feat, num_feat, {3, 3}, {1, 1}, {1, 1}));
        }
-        if (this->config.scale == 4) {
-            blocks["conv_up2"] = std::shared_ptr<GGMLBlock>(new Conv2d(this->config.num_feat, this->config.num_feat, {3, 3}, {1, 1}, {1, 1}));
+        if (scale == 4) {
+            blocks["conv_up2"] = std::shared_ptr<GGMLBlock>(new Conv2d(num_feat, num_feat, {3, 3}, {1, 1}, {1, 1}));
        }
-        blocks["conv_hr"]   = std::shared_ptr<GGMLBlock>(new Conv2d(this->config.num_feat, this->config.num_feat, {3, 3}, {1, 1}, {1, 1}));
-        blocks["conv_last"] = std::shared_ptr<GGMLBlock>(new Conv2d(this->config.num_feat, this->config.num_out_ch, {3, 3}, {1, 1}, {1, 1}));
+        blocks["conv_hr"]   = std::shared_ptr<GGMLBlock>(new Conv2d(num_feat, num_feat, {3, 3}, {1, 1}, {1, 1}));
+        blocks["conv_last"] = std::shared_ptr<GGMLBlock>(new Conv2d(num_feat, num_out_ch, {3, 3}, {1, 1}, {1, 1}));
    }

-    int get_scale() { return config.scale; }
-    int get_num_block() { return config.num_block; }
+    int get_scale() { return scale; }
+    int get_num_block() { return num_block; }

    ggml_tensor* lrelu(GGMLRunnerContext* ctx, ggml_tensor* x) {
        return ggml_leaky_relu(ctx->ggml_ctx, x, 0.2f, true);
@ -196,7 +127,7 @@ public:
        auto feat = conv_first->forward(ctx, x);
        sd::ggml_graph_cut::mark_graph_cut(feat, "esrgan.prelude", "feat");
        auto body_feat = feat;
-        for (int i = 0; i < config.num_block; i++) {
+        for (int i = 0; i < num_block; i++) {
            std::string name = "body." + std::to_string(i);
            auto block       = std::dynamic_pointer_cast<RRDB>(blocks[name]);

@ -207,11 +138,11 @@ public:
        feat      = ggml_add(ctx->ggml_ctx, feat, body_feat);
        sd::ggml_graph_cut::mark_graph_cut(feat, "esrgan.body.out", "feat");
        // upsample
-        if (config.scale >= 2) {
+        if (scale >= 2) {
            auto conv_up1 = std::dynamic_pointer_cast<Conv2d>(blocks["conv_up1"]);
            feat          = lrelu(ctx, conv_up1->forward(ctx, ggml_upscale(ctx->ggml_ctx, feat, 2, GGML_SCALE_MODE_NEAREST)));
            sd::ggml_graph_cut::mark_graph_cut(feat, "esrgan.up1", "feat");
-            if (config.scale == 4) {
+            if (scale == 4) {
                auto conv_up2 = std::dynamic_pointer_cast<Conv2d>(blocks["conv_up2"]);
                feat          = lrelu(ctx, conv_up2->forward(ctx, ggml_upscale(ctx->ggml_ctx, feat, 2, GGML_SCALE_MODE_NEAREST)));
                sd::ggml_graph_cut::mark_graph_cut(feat, "esrgan.up2", "feat");
@ -225,28 +156,199 @@ public:
 };

 struct ESRGAN : public GGMLRunner {
-    ESRGANConfig config;
    std::unique_ptr<RRDBNet> rrdb_net;
+    int scale     = 4;
+    int tile_size = 128;  // avoid cuda OOM for 4gb VRAM

    ESRGAN(ggml_backend_t backend,
-           const String2TensorStorage& tensor_storage_map      = {},
-           std::shared_ptr<RunnerWeightManager> weight_manager = nullptr)
-        : GGMLRunner(backend, weight_manager),
-          config(ESRGANConfig::detect_from_weights(tensor_storage_map)),
-          rrdb_net(std::make_unique<RRDBNet>(config)) {
-        rrdb_net->init(params_ctx, tensor_storage_map, "");
+           ggml_backend_t params_backend,
+           int tile_size                                  = 128,
+           const String2TensorStorage& tensor_storage_map = {})
+        : GGMLRunner(backend, params_backend) {
+        this->tile_size = tile_size;
    }

    std::string get_desc() override {
        return "esrgan";
    }

-    void get_param_tensors(std::map<std::string, ggml_tensor*>& tensors) {
-        if (!rrdb_net) {
-            return;
+    bool load_from_file(const std::string& file_path, int n_threads) {
+        LOG_INFO("loading esrgan from '%s'", file_path.c_str());
+
+        ModelLoader model_loader;
+        if (!model_loader.init_from_file_and_convert_name(file_path)) {
+            LOG_ERROR("init esrgan model loader from file failed: '%s'", file_path.c_str());
+            return false;
        }

-        rrdb_net->get_param_tensors(tensors);
+        // Get tensor names
+        auto tensor_names = model_loader.get_tensor_names();
+
+        // Detect if it's ESRGAN format
+        bool is_ESRGAN = std::find(tensor_names.begin(), tensor_names.end(), "model.0.weight") != tensor_names.end();
+
+        // Detect parameters from tensor names
+        int detected_num_block = 0;
+        if (is_ESRGAN) {
+            for (const auto& name : tensor_names) {
+                if (name.find("model.1.sub.") == 0) {
+                    size_t first_dot = name.find('.', 12);
+                    if (first_dot != std::string::npos) {
+                        size_t second_dot = name.find('.', first_dot + 1);
+                        if (second_dot != std::string::npos && name.substr(first_dot + 1, 3) == "RDB") {
+                            try {
+                                int idx            = std::stoi(name.substr(12, first_dot - 12));
+                                detected_num_block = std::max(detected_num_block, idx + 1);
+                            } catch (...) {
+                            }
+                        }
+                    }
+                }
+            }
+        } else {
+            // Original format
+            for (const auto& name : tensor_names) {
+                if (name.find("body.") == 0) {
+                    size_t pos = name.find('.', 5);
+                    if (pos != std::string::npos) {
+                        try {
+                            int idx            = std::stoi(name.substr(5, pos - 5));
+                            detected_num_block = std::max(detected_num_block, idx + 1);
+                        } catch (...) {
+                        }
+                    }
+                }
+            }
+        }
+
+        int detected_scale = 4;  // default
+        if (is_ESRGAN) {
+            // For ESRGAN format, detect scale by highest model number
+            int max_model_num = 0;
+            for (const auto& name : tensor_names) {
+                if (name.find("model.") == 0) {
+                    size_t dot_pos = name.find('.', 6);
+                    if (dot_pos != std::string::npos) {
+                        try {
+                            int num       = std::stoi(name.substr(6, dot_pos - 6));
+                            max_model_num = std::max(max_model_num, num);
+                        } catch (...) {
+                        }
+                    }
+                }
+            }
+            if (max_model_num <= 4) {
+                detected_scale = 1;
+            } else if (max_model_num <= 7) {
+                detected_scale = 2;
+            } else {
+                detected_scale = 4;
+            }
+        } else {
+            // Original format
+            bool has_conv_up2 = std::any_of(tensor_names.begin(), tensor_names.end(), [](const std::string& name) {
+                return name == "conv_up2.weight";
+            });
+            bool has_conv_up1 = std::any_of(tensor_names.begin(), tensor_names.end(), [](const std::string& name) {
+                return name == "conv_up1.weight";
+            });
+            if (has_conv_up2) {
+                detected_scale = 4;
+            } else if (has_conv_up1) {
+                detected_scale = 2;
+            } else {
+                detected_scale = 1;
+            }
+        }
+
+        int detected_num_in_ch   = 3;
+        int detected_num_out_ch  = 3;
+        int detected_num_feat    = 64;
+        int detected_num_grow_ch = 32;
+
+        // Create RRDBNet with detected parameters
+        rrdb_net = std::make_unique<RRDBNet>(detected_scale, detected_num_block, detected_num_in_ch, detected_num_out_ch, detected_num_feat, detected_num_grow_ch);
+        rrdb_net->init(params_ctx, {}, "");
+
+        if (!alloc_params_buffer()) {
+            LOG_ERROR("esrgan model buffer allocation failed");
+            return false;
+        }
+
+        std::map<std::string, ggml_tensor*> esrgan_tensors;
+        rrdb_net->get_param_tensors(esrgan_tensors);
+
+        bool success;
+        if (is_ESRGAN) {
+            // Build name mapping for ESRGAN format
+            std::map<std::string, std::string> expected_to_model;
+            expected_to_model["conv_first.weight"] = "model.0.weight";
+            expected_to_model["conv_first.bias"]   = "model.0.bias";
+
+            for (int i = 0; i < detected_num_block; i++) {
+                for (int j = 1; j <= 3; j++) {
+                    for (int k = 1; k <= 5; k++) {
+                        std::string expected_weight        = "body." + std::to_string(i) + ".rdb" + std::to_string(j) + ".conv" + std::to_string(k) + ".weight";
+                        std::string model_weight           = "model.1.sub." + std::to_string(i) + ".RDB" + std::to_string(j) + ".conv" + std::to_string(k) + ".0.weight";
+                        expected_to_model[expected_weight] = model_weight;
+
+                        std::string expected_bias        = "body." + std::to_string(i) + ".rdb" + std::to_string(j) + ".conv" + std::to_string(k) + ".bias";
+                        std::string model_bias           = "model.1.sub." + std::to_string(i) + ".RDB" + std::to_string(j) + ".conv" + std::to_string(k) + ".0.bias";
+                        expected_to_model[expected_bias] = model_bias;
+                    }
+                }
+            }
+
+            if (detected_scale == 1) {
+                expected_to_model["conv_body.weight"] = "model.1.sub." + std::to_string(detected_num_block) + ".weight";
+                expected_to_model["conv_body.bias"]   = "model.1.sub." + std::to_string(detected_num_block) + ".bias";
+                expected_to_model["conv_hr.weight"]   = "model.2.weight";
+                expected_to_model["conv_hr.bias"]     = "model.2.bias";
+                expected_to_model["conv_last.weight"] = "model.4.weight";
+                expected_to_model["conv_last.bias"]   = "model.4.bias";
+            } else {
+                expected_to_model["conv_body.weight"] = "model.1.sub." + std::to_string(detected_num_block) + ".weight";
+                expected_to_model["conv_body.bias"]   = "model.1.sub." + std::to_string(detected_num_block) + ".bias";
+                if (detected_scale >= 2) {
+                    expected_to_model["conv_up1.weight"] = "model.3.weight";
+                    expected_to_model["conv_up1.bias"]   = "model.3.bias";
+                }
+                if (detected_scale == 4) {
+                    expected_to_model["conv_up2.weight"]  = "model.6.weight";
+                    expected_to_model["conv_up2.bias"]    = "model.6.bias";
+                    expected_to_model["conv_hr.weight"]   = "model.8.weight";
+                    expected_to_model["conv_hr.bias"]     = "model.8.bias";
+                    expected_to_model["conv_last.weight"] = "model.10.weight";
+                    expected_to_model["conv_last.bias"]   = "model.10.bias";
+                } else if (detected_scale == 2) {
+                    expected_to_model["conv_hr.weight"]   = "model.5.weight";
+                    expected_to_model["conv_hr.bias"]     = "model.5.bias";
+                    expected_to_model["conv_last.weight"] = "model.7.weight";
+                    expected_to_model["conv_last.bias"]   = "model.7.bias";
+                }
+            }
+
+            std::map<std::string, ggml_tensor*> model_tensors;
+            for (auto& p : esrgan_tensors) {
+                auto it = expected_to_model.find(p.first);
+                if (it != expected_to_model.end()) {
+                    model_tensors[it->second] = p.second;
+                }
+            }
+
+            success = model_loader.load_tensors(model_tensors, {}, n_threads);
+        } else {
+            success = model_loader.load_tensors(esrgan_tensors, {}, n_threads);
+        }
+
+        if (!success) {
+            LOG_ERROR("load esrgan tensors from model loader failed");
+            return false;
+        }
+
+        scale = rrdb_net->get_scale();
+        LOG_INFO("esrgan model loaded with scale=%d, num_block=%d", scale, detected_num_block);
+        return success;
    }

    ggml_cgraph* build_graph(const sd::Tensor<float>& x_tensor) {
@ -265,7 +367,7 @@ struct ESRGAN : public GGMLRunner {
    sd::Tensor<float> compute(const int n_threads,
                              const sd::Tensor<float>& x) {
        auto get_graph = [&]() -> ggml_cgraph* { return build_graph(x); };
-        auto result    = restore_trailing_singleton_dims(GGMLRunner::compute<float>(get_graph, n_threads, false, false, false), x.dim());
+        auto result    = restore_trailing_singleton_dims(GGMLRunner::compute<float>(get_graph, n_threads, false), x.dim());
        return result;
    }
 };
--- a/src/model/upscaler/ltx_latent_upscaler.hpp
+++ b/src/model/upscaler/ltx_latent_upscaler.hpp
@ -1,9 +1,9 @@
-#ifndef __SD_MODEL_UPSCALER_LTX_LATENT_UPSCALER_HPP__
+#ifndef __SD_MODEL_UPSCALER_LTX_LATENT_UPSCALER_HPP__
 #define __SD_MODEL_UPSCALER_LTX_LATENT_UPSCALER_HPP__

-#include <algorithm>
 #include <cinttypes>
 #include <cmath>
+#include <cstdlib>
 #include <map>
 #include <memory>
 #include <set>
@ -32,101 +32,91 @@ namespace LTXVUpsampler {
        int spatial_up_num       = 2;
        int spatial_down_den     = 1;
        int temporal_up_factor   = 1;
-
-        static LatentUpsamplerConfig detect_from_weights(const String2TensorStorage& tensor_storage_map,
-                                                         const std::string& prefix = "") {
-            LatentUpsamplerConfig config;
-            auto find_weight = [&](const std::string& suffix) -> const TensorStorage* {
-                std::string name = prefix.empty() ? suffix : prefix + "." + suffix;
-                auto iter        = tensor_storage_map.find(name);
-                if (iter == tensor_storage_map.end()) {
-                    return nullptr;
-                }
-                return &iter->second;
-            };
-
-            bool inferred = false;
-
-            const TensorStorage* initial_norm = find_weight("initial_norm.weight");
-            if (initial_norm != nullptr) {
-                config.mid_channels = initial_norm->ne[0];
-                inferred            = true;
-            }
-
-            const TensorStorage* final_conv = find_weight("final_conv.bias");
-            if (final_conv != nullptr) {
-                config.in_channels = final_conv->ne[0];
-                inferred           = true;
-            }
-
-            int detected_blocks                 = 0;
-            const std::string res_blocks_prefix = prefix.empty() ? "res_blocks." : prefix + ".res_blocks.";
-            for (const auto& [name, _] : tensor_storage_map) {
-                if (!starts_with(name, res_blocks_prefix)) {
-                    continue;
-                }
-                size_t begin = res_blocks_prefix.size();
-                size_t end   = name.find('.', begin);
-                if (end == std::string::npos) {
-                    continue;
-                }
-                try {
-                    int idx         = std::stoi(name.substr(begin, end - begin));
-                    detected_blocks = std::max(detected_blocks, idx + 1);
-                } catch (...) {
-                }
-            }
-            if (detected_blocks > 0) {
-                config.num_blocks_per_stage = detected_blocks;
-                inferred                    = true;
-            }
-
-            const TensorStorage* rational_upsampler_weight = find_weight("upsampler.conv.weight");
-            const TensorStorage* upsampler_bias            = find_weight("upsampler.0.bias");
-            config.rational_resampler                      = rational_upsampler_weight != nullptr;
-            int64_t upsampler_out_channels                 = upsampler_bias == nullptr ? 0 : upsampler_bias->ne[0];
-            config.spatial_upsample                        = config.rational_resampler || upsampler_out_channels == 4 * config.mid_channels;
-            config.temporal_upsample                       = upsampler_out_channels == 2 * config.mid_channels;
-            if (config.rational_resampler || upsampler_out_channels > 0) {
-                inferred = true;
-            }
-            if (config.temporal_upsample) {
-                config.temporal_up_factor = 2;
-            }
-            if (rational_upsampler_weight != nullptr) {
-                int64_t out_channels = rational_upsampler_weight->ne[3];
-                if (config.mid_channels > 0 && out_channels % config.mid_channels == 0) {
-                    int64_t ratio = out_channels / config.mid_channels;
-                    int num       = static_cast<int>(std::round(std::sqrt(static_cast<double>(ratio))));
-                    if (num > 0 && static_cast<int64_t>(num) * num == ratio) {
-                        config.spatial_up_num = num;
-                    }
-                }
-                if (config.spatial_up_num == 3) {
-                    config.spatial_down_den = 2;
-                    config.spatial_scale    = 1.5f;
-                } else if (config.spatial_up_num == 4) {
-                    config.spatial_down_den = 1;
-                    config.spatial_scale    = 4.f;
-                } else {
-                    config.spatial_down_den = 1;
-                    config.spatial_scale    = static_cast<float>(config.spatial_up_num);
-                }
-            }
-
-            if (inferred) {
-                LOG_DEBUG("ltx latent upsampler: in_channels = %" PRId64 ", mid_channels = %" PRId64 ", num_blocks_per_stage = %d, spatial_scale = %.3f, temporal_up_factor = %d, rational_resampler = %d",
-                          config.in_channels,
-                          config.mid_channels,
-                          config.num_blocks_per_stage,
-                          config.spatial_scale,
-                          config.temporal_up_factor,
-                          config.rational_resampler);
-            }
-            return config;
-        }
    };

+    // Reports whether `tensor_storage_map` contains an entry keyed by `name`.
+    static inline bool has_tensor(const String2TensorStorage& tensor_storage_map,
+                                  const std::string& name) {
+        const auto entry   = tensor_storage_map.find(name);
+        const bool present = entry != tensor_storage_map.end();
+        return present;
+    }
+
+    // Returns the extent `ne[axis]` of the tensor stored under `name`.
+    // `fallback` is returned instead when the tensor is absent or when `axis`
+    // lies outside [0, GGML_MAX_DIMS).
+    static inline int64_t get_tensor_ne(const String2TensorStorage& tensor_storage_map,
+                                        const std::string& name,
+                                        int axis,
+                                        int64_t fallback) {
+        const bool axis_in_range = axis >= 0 && axis < GGML_MAX_DIMS;
+        const auto entry         = tensor_storage_map.find(name);
+        if (axis_in_range && entry != tensor_storage_map.end()) {
+            return entry->second.ne[axis];
+        }
+        return fallback;
+    }
+
+    // Convenience wrapper around get_tensor_ne() for axis 0: yields `ne[0]` of
+    // the tensor named `name`, or `fallback` when that lookup fails.
+    static inline int64_t get_tensor_ne0(const String2TensorStorage& tensor_storage_map,
+                                         const std::string& name,
+                                         int64_t fallback) {
+        const int first_axis = 0;
+        return get_tensor_ne(tensor_storage_map, name, first_axis, fallback);
+    }
+
+    // Counts the numbered sub-blocks of `module_name`.
+    //
+    // Scans every tensor name of the form "<module_name>.<index>.<rest>" and
+    // returns the highest <index> found plus one, or 0 when no tensor matches.
+    //
+    // Only segments made purely of decimal digits are treated as an index. A
+    // lenient atoi()-style parse would map a non-numeric segment such as
+    // "<module_name>.foo.weight" to index 0 and report a phantom block, and its
+    // behavior is undefined for values that do not fit in an int.
+    static inline int count_module_blocks(const String2TensorStorage& tensor_storage_map,
+                                          const std::string& module_name) {
+        // Nine decimal digits always fit in a 32-bit int, so the accumulation
+        // below cannot overflow; longer segments are not plausible block indices.
+        const size_t max_index_digits = 9;
+
+        int max_block            = -1;
+        const std::string prefix = module_name + ".";
+        for (const auto& pair : tensor_storage_map) {
+            const std::string& name = pair.first;
+            // Prefix test that only inspects the leading prefix.size() characters.
+            if (name.compare(0, prefix.size(), prefix) != 0) {
+                continue;
+            }
+            const size_t begin = prefix.size();
+            const size_t end   = name.find('.', begin);
+            if (end == std::string::npos || end == begin || end - begin > max_index_digits) {
+                continue;
+            }
+            int index    = 0;
+            bool numeric = true;
+            for (size_t i = begin; i < end; ++i) {
+                const char c = name[i];
+                if (c < '0' || c > '9') {
+                    numeric = false;
+                    break;
+                }
+                index = index * 10 + (c - '0');
+            }
+            if (!numeric) {
+                continue;
+            }
+            max_block = std::max(max_block, index);
+        }
+        return max_block + 1;
+    }
+
+    // Builds a LatentUpsamplerConfig from the tensors present in
+    // `tensor_storage_map`, starting from the struct's default values.
+    //
+    //  - mid_channels / in_channels: ne[0] of "initial_norm.weight" and
+    //    "final_conv.bias" (defaults kept when the tensor is missing).
+    //  - num_blocks_per_stage: number of "res_blocks.<i>" entries, when > 0.
+    //  - rational_resampler: set when "upsampler.conv.weight" exists.
+    //  - spatial / temporal upsampling: decided by comparing ne[0] of
+    //    "upsampler.0.bias" against 4x / 2x mid_channels.
+    //  - for the rational resampler, spatial_up_num is the integer square root
+    //    of (ne[3] of "upsampler.conv.weight") / mid_channels when that ratio is
+    //    a perfect square; spatial_down_den and spatial_scale follow from it.
+    static inline LatentUpsamplerConfig detect_config_from_weights(const String2TensorStorage& tensor_storage_map) {
+        LatentUpsamplerConfig cfg;
+
+        cfg.mid_channels = get_tensor_ne0(tensor_storage_map, "initial_norm.weight", cfg.mid_channels);
+        cfg.in_channels  = get_tensor_ne0(tensor_storage_map, "final_conv.bias", cfg.in_channels);
+
+        const int res_block_count = count_module_blocks(tensor_storage_map, "res_blocks");
+        if (res_block_count > 0) {
+            cfg.num_blocks_per_stage = res_block_count;
+        }
+
+        cfg.rational_resampler           = has_tensor(tensor_storage_map, "upsampler.conv.weight");
+        const int64_t upsampler_bias_len = get_tensor_ne0(tensor_storage_map, "upsampler.0.bias", 0);
+        cfg.spatial_upsample             = cfg.rational_resampler || upsampler_bias_len == 4 * cfg.mid_channels;
+        cfg.temporal_upsample            = upsampler_bias_len == 2 * cfg.mid_channels;
+        if (cfg.temporal_upsample) {
+            cfg.temporal_up_factor = 2;
+        }
+
+        // Everything below only concerns the rational resampler variant.
+        if (!cfg.rational_resampler) {
+            return cfg;
+        }
+
+        // Falls back to 9 * mid_channels (a ratio of 9, i.e. spatial_up_num 3)
+        // if the lookup of axis 3 fails.
+        const int64_t conv_out_channels = get_tensor_ne(tensor_storage_map,
+                                                        "upsampler.conv.weight",
+                                                        3,
+                                                        cfg.mid_channels * 9);
+        if (cfg.mid_channels > 0 && conv_out_channels % cfg.mid_channels == 0) {
+            const int64_t ratio = conv_out_channels / cfg.mid_channels;
+            const int root      = static_cast<int>(std::round(std::sqrt(static_cast<double>(ratio))));
+            // Accept the root only when the ratio is an exact perfect square.
+            if (root > 0 && static_cast<int64_t>(root) * root == ratio) {
+                cfg.spatial_up_num = root;
+            }
+        }
+
+        switch (cfg.spatial_up_num) {
+            case 3:
+                cfg.spatial_down_den = 2;
+                cfg.spatial_scale    = 1.5f;
+                break;
+            case 4:
+                cfg.spatial_down_den = 1;
+                cfg.spatial_scale    = 4.f;
+                break;
+            default:
+                cfg.spatial_down_den = 1;
+                cfg.spatial_scale    = static_cast<float>(cfg.spatial_up_num);
+                break;
+        }
+        return cfg;
+    }
+
    class VideoGroupNorm : public GGMLBlock {
    protected:
        int num_groups;
@ -250,25 +240,20 @@ namespace LTXVUpsampler {
    protected:
        int64_t channels;
        int stride;
+        ggml_tensor* kernel = nullptr;
        std::vector<float> kernel_data;
-        std::string kernel_name;

        void init_params(ggml_context* ctx,
                         const String2TensorStorage& tensor_storage_map = {},
                         const std::string prefix                       = "") override {
-            SD_UNUSED(ctx);
            SD_UNUSED(tensor_storage_map);
            if (stride == 1) {
                return;
            }
-            kernel_name = prefix + "kernel";
-        }
+            kernel           = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, 5, 5, 1, channels);
+            std::string name = prefix + "kernel";
+            ggml_set_name(kernel, name.c_str());

-    public:
-        BlurDownsample(int64_t channels, int stride)
-            : channels(channels),
-              stride(stride) {
-            GGML_ASSERT(stride >= 1);
            static const float binomial[5] = {1.f, 4.f, 6.f, 4.f, 1.f};
            kernel_data.resize(static_cast<size_t>(5 * 5 * channels));
            for (int64_t c = 0; c < channels; ++c) {
@ -281,16 +266,26 @@ namespace LTXVUpsampler {
            }
        }

+    public:
+        BlurDownsample(int64_t channels, int stride)
+            : channels(channels),
+              stride(stride) {
+            GGML_ASSERT(stride >= 1);
+        }
+
+        // Uploads the host-side `kernel_data` into the `kernel` tensor with
+        // ggml_backend_tensor_set(). Does nothing when `kernel` was never created
+        // or when `kernel_data` is empty.
+        void load_fixed_tensors() {
+            const bool has_kernel = kernel != nullptr && !kernel_data.empty();
+            if (has_kernel) {
+                const size_t n_bytes = kernel_data.size() * sizeof(float);
+                ggml_backend_tensor_set(kernel, kernel_data.data(), 0, n_bytes);
+            }
+        }
+
        ggml_tensor* forward(GGMLRunnerContext* ctx, ggml_tensor* x) {
            if (stride == 1) {
                return x;
            }
-            GGML_ASSERT(ctx != nullptr);
-            GGML_ASSERT(!kernel_data.empty());
+            GGML_ASSERT(kernel != nullptr);
            GGML_ASSERT(x->ne[2] == channels);
-            ggml_tensor* kernel = ggml_new_tensor_4d(ctx->ggml_ctx, GGML_TYPE_F32, 5, 5, 1, channels);
-            ggml_set_name(kernel, kernel_name.empty() ? "blur_down.kernel" : kernel_name.c_str());
-            ctx->bind_backend_tensor_data(kernel, kernel_data.data());
            if (ctx->conv2d_direct_enabled) {
                return ggml_conv_2d_dw_direct(ctx->ggml_ctx, kernel, x, stride, stride, 2, 2, 1, 1);
            }
@ -316,6 +311,11 @@ namespace LTXVUpsampler {
            blocks["blur_down"]     = std::shared_ptr<GGMLBlock>(new BlurDownsample(mid_channels, den));
        }

+        // Forwards to the "blur_down" child block so it can upload its fixed kernel.
+        void load_fixed_tensors() {
+            std::dynamic_pointer_cast<BlurDownsample>(blocks["blur_down"])->load_fixed_tensors();
+        }
+
        ggml_tensor* forward(GGMLRunnerContext* ctx, ggml_tensor* x) {
            auto conv          = std::dynamic_pointer_cast<Conv2d>(blocks["conv"]);
            auto pixel_shuffle = std::dynamic_pointer_cast<PixelShuffleND>(blocks["pixel_shuffle"]);
@ -426,17 +426,45 @@ namespace LTXVUpsampler {
            sd::ggml_graph_cut::mark_graph_cut(x, "ltx_latent_upsampler.final", "x");
            return x;
        }
+
+        // Uploads fixed (non-checkpoint) tensors. Only the rational resampler
+        // variant has any, so other configurations are a no-op.
+        void load_fixed_tensors() {
+            if (config.rational_resampler) {
+                auto resampler = std::dynamic_pointer_cast<SpatialRationalResampler>(blocks["upsampler"]);
+                resampler->load_fixed_tensors();
+            }
+        }
    };

    struct LatentUpsamplerRunner : public GGMLRunner {
-        LatentUpsamplerConfig config;
        std::unique_ptr<LatentUpsampler> model;

        LatentUpsamplerRunner(ggml_backend_t backend,
-                              const String2TensorStorage& tensor_storage_map,
-                              std::shared_ptr<RunnerWeightManager> weight_manager = nullptr)
-            : GGMLRunner(backend, weight_manager),
-              config(LatentUpsamplerConfig::detect_from_weights(tensor_storage_map)) {
+                              ggml_backend_t params_backend)
+            : GGMLRunner(backend, params_backend) {}
+
+        // Short identifier for this runner.
+        std::string get_desc() override {
+            return std::string("ltx_latent_upsampler");
+        }
+
+        bool load_from_file(const std::string& file_path, int n_threads) {
+            LOG_INFO("loading LTX latent upsampler from '%s'", file_path.c_str());
+            ModelLoader model_loader;
+            if (!model_loader.init_from_file(file_path)) {
+                LOG_ERROR("init LTX latent upsampler model loader from file failed: '%s'", file_path.c_str());
+                return false;
+            }
+
+            const auto& tensor_storage_map = model_loader.get_tensor_storage_map();
+            bool has_regular_upsampler     = has_tensor(tensor_storage_map, "upsampler.0.weight");
+            bool has_rational_spatial      = has_tensor(tensor_storage_map, "upsampler.conv.weight");
+            if (!has_tensor(tensor_storage_map, "post_upsample_res_blocks.0.conv2.bias") ||
+                (!has_regular_upsampler && !has_rational_spatial)) {
+                LOG_ERROR("unsupported LTX latent upsampler weights: expected upsampler tensors");
+                return false;
+            }
+
+            LatentUpsamplerConfig config = detect_config_from_weights(tensor_storage_map);
            if (config.dims != 3 || (!config.spatial_upsample && !config.temporal_upsample) ||
                config.spatial_up_num < 1 || config.spatial_down_den < 1 || config.temporal_up_factor < 1) {
                LOG_ERROR("unsupported LTX latent upsampler config: dims=%d spatial=%d temporal=%d rational=%d scale=%.3f temporal_factor=%d",
@ -446,21 +474,36 @@ namespace LTXVUpsampler {
                          config.rational_resampler,
                          config.spatial_scale,
                          config.temporal_up_factor);
-                return;
+                return false;
            }

            model = std::make_unique<LatentUpsampler>(config);
            model->init(params_ctx, tensor_storage_map, "");
-        }
-
-        std::string get_desc() override {
-            return "ltx_latent_upsampler";
-        }
-
-        void get_param_tensors(std::map<std::string, ggml_tensor*>& tensors) {
-            if (model) {
-                model->get_param_tensors(tensors);
+            if (!alloc_params_buffer()) {
+                LOG_ERROR("LTX latent upsampler params buffer allocation failed");
+                return false;
            }
+
+            std::map<std::string, ggml_tensor*> tensors;
+            model->get_param_tensors(tensors);
+            std::set<std::string> ignore_tensors;
+            if (config.rational_resampler) {
+                ignore_tensors.insert("upsampler.blur_down.kernel");
+            }
+            if (!model_loader.load_tensors(tensors, ignore_tensors, n_threads)) {
+                LOG_ERROR("load LTX latent upsampler tensors failed");
+                return false;
+            }
+            model->load_fixed_tensors();
+
+            LOG_INFO("LTX latent upsampler loaded: in_channels=%" PRId64 ", mid_channels=%" PRId64 ", blocks=%d, scale=%.3f, temporal_factor=%d, rational=%d",
+                     config.in_channels,
+                     config.mid_channels,
+                     config.num_blocks_per_stage,
+                     config.spatial_scale,
+                     config.temporal_up_factor,
+                     config.rational_resampler);
+            return true;
        }

        ggml_cgraph* build_graph(const sd::Tensor<float>& x_tensor) {
@ -491,15 +534,15 @@ namespace LTXVUpsampler {
                          (long long)x.shape()[4]);
                return {};
            }
-            if (x.shape()[3] != config.in_channels) {
+            if (x.shape()[3] != model->config.in_channels) {
                LOG_ERROR("LTX latent upsampler expected %" PRId64 " channels, got %lld",
-                          config.in_channels,
+                          model->config.in_channels,
                          (long long)x.shape()[3]);
                return {};
            }
            size_t expected_dim = static_cast<size_t>(x.dim());
            auto get_graph      = [&]() -> ggml_cgraph* { return build_graph(x); };
-            return restore_trailing_singleton_dims(GGMLRunner::compute<float>(get_graph, n_threads, false, false, false), expected_dim);
+            return restore_trailing_singleton_dims(GGMLRunner::compute<float>(get_graph, n_threads, false), expected_dim);
        }
    };

--- a/src/model/vae/auto_encoder_kl.hpp
+++ b/src/model/vae/auto_encoder_kl.hpp
@ -213,9 +213,9 @@ protected:
        params["mix_factor"] = ggml_new_tensor_1d(ctx, wtype, 1);
    }

-    ggml_tensor* get_alpha(GGMLRunnerContext* ctx) {
-        auto mix_factor = ggml_ext_cast_f32(ctx->ggml_ctx, ctx->backend, params["mix_factor"]);
-        return ggml_sigmoid(ctx->ggml_ctx, mix_factor);
+    float get_alpha() {
+        float alpha = ggml_ext_backend_tensor_get_f32(params["mix_factor"]);
+        return sigmoid(alpha);
    }

 public:
@ -250,12 +250,10 @@ public:

        x = time_stack->forward(ctx, x);  // b t c (h w)

-        auto alpha = get_alpha(ctx);
-        x          = ggml_add(ctx->ggml_ctx,
-                              x_mix,
-                              ggml_mul(ctx->ggml_ctx,
-                                       ggml_sub(ctx->ggml_ctx, x, x_mix),
-                                       alpha));
+        float alpha = get_alpha();
+        x           = ggml_add(ctx->ggml_ctx,
+                               ggml_ext_scale(ctx->ggml_ctx, x, alpha),
+                               ggml_ext_scale(ctx->ggml_ctx, x_mix, 1.0f - alpha));

        x = ggml_cont(ctx->ggml_ctx, ggml_permute(ctx->ggml_ctx, x, 0, 2, 1, 3));  // b c t (h w) -> b t c (h w)
        x = ggml_reshape_4d(ctx->ggml_ctx, x, W, H, C, T * B);                     // b t c (h w) -> (b t) c h w
@ -666,13 +664,13 @@ struct AutoEncoderKL : public VAE {
    AutoEncoderKLModel ae;

    AutoEncoderKL(ggml_backend_t backend,
+                  ggml_backend_t params_backend,
                  const String2TensorStorage& tensor_storage_map,
                  const std::string prefix,
-                  bool decode_only                                    = false,
-                  bool use_video_decoder                              = false,
-                  SDVersion version                                   = VERSION_SD1,
-                  std::shared_ptr<RunnerWeightManager> weight_manager = nullptr)
-        : VAE(version, backend, prefix, weight_manager), decode_only(decode_only) {
+                  bool decode_only       = false,
+                  bool use_video_decoder = false,
+                  SDVersion version      = VERSION_SD1)
+        : decode_only(decode_only), VAE(version, backend, params_backend) {
        if (sd_version_is_sd1(version) || sd_version_is_sd2(version)) {
            scale_factor = 0.18215f;
            shift_factor = 0.f;
@ -682,7 +680,7 @@ struct AutoEncoderKL : public VAE {
        } else if (sd_version_is_sd3(version)) {
            scale_factor = 1.5305f;
            shift_factor = 0.0609f;
-        } else if (sd_version_uses_flux_vae(version)) {
+        } else if (sd_version_is_flux(version) || sd_version_is_z_image(version) || sd_version_is_longcat(version)) {
            scale_factor = 0.3611f;
            shift_factor = 0.1159f;
        } else if (sd_version_uses_flux2_vae(version)) {
@ -720,8 +718,8 @@ struct AutoEncoderKL : public VAE {
        return "vae";
    }

-    void get_param_tensors(std::map<std::string, ggml_tensor*>& tensors) override {
-        ae.get_param_tensors(tensors, weight_prefix);
+    void get_param_tensors(std::map<std::string, ggml_tensor*>& tensors, const std::string prefix) override {
+        ae.get_param_tensors(tensors, prefix);
    }

    ggml_cgraph* build_graph(const sd::Tensor<float>& z_tensor, bool decode_graph) {
@ -744,7 +742,7 @@ struct AutoEncoderKL : public VAE {
        auto get_graph = [&]() -> ggml_cgraph* {
            return build_graph(z, decode_graph);
        };
-        return restore_trailing_singleton_dims(GGMLRunner::compute<float>(get_graph, n_threads, false, false, false), z.dim());
+        return restore_trailing_singleton_dims(GGMLRunner::compute<float>(get_graph, n_threads, false), z.dim());
    }

    sd::Tensor<float> gaussian_latent_sample(const sd::Tensor<float>& moments, std::shared_ptr<RNG> rng) {
--- a/src/model/vae/ltx_audio_vae.hpp
+++ b/src/model/vae/ltx_audio_vae.hpp
@ -1,4 +1,4 @@
-#ifndef __SD_MODEL_VAE_LTX_AUDIO_VAE_HPP__
+#ifndef __SD_MODEL_VAE_LTX_AUDIO_VAE_HPP__
 #define __SD_MODEL_VAE_LTX_AUDIO_VAE_HPP__

 #include <cmath>
@ -9,7 +9,6 @@

 #include "core/ggml_extend.hpp"
 #include "model_loader.h"
-#include "model_manager.h"

 namespace LTXV {

@ -998,15 +997,13 @@ namespace LTXV {
    struct LTXAudioVAERunner : public GGMLRunner {
        LTXAudioVAEConfig config;
        LTXAudioVAE model;
-        std::string weight_prefix;
        sd::Tensor<float> bwe_skip_filter_tensor;

        LTXAudioVAERunner(ggml_backend_t backend,
+                          ggml_backend_t params_backend,
                          const String2TensorStorage& tensor_storage_map,
-                          const std::string& prefix                           = "",
-                          std::shared_ptr<RunnerWeightManager> weight_manager = nullptr)
-            : GGMLRunner(backend, weight_manager),
-              weight_prefix(prefix),
+                          const std::string& prefix = "")
+            : GGMLRunner(backend, params_backend),
              config(LTXAudioVAEConfig::detect_from_weights(tensor_storage_map)),
              model(config) {
            model.init(params_ctx, tensor_storage_map, prefix);
@ -1016,11 +1013,11 @@ namespace LTXV {
            }
        }

-        void get_param_tensors(std::map<std::string, ggml_tensor*>& tensors) {
-            model.get_param_tensors(tensors, weight_prefix);
+        void get_param_tensors(std::map<std::string, ggml_tensor*>& tensors, const std::string prefix) {
+            model.get_param_tensors(tensors, prefix);
        }

-        size_t get_params_mem_size() {
+        size_t get_params_buffer_size() {
            return model.get_params_mem_size();
        }

@ -1040,7 +1037,7 @@ namespace LTXV {
                ggml_build_forward_expand(gf, waveform);
                return gf;
            };
-            auto result = restore_trailing_singleton_dims(GGMLRunner::compute<float>(get_graph, n_threads, false, false, false), 4);
+            auto result = restore_trailing_singleton_dims(GGMLRunner::compute<float>(get_graph, n_threads, false), 4);
            int64_t t1  = ggml_time_ms();
            LOG_INFO("ltx audio vae decode completed, taking %.2fs", (t1 - t0) * 1.0f / 1000);
            return result;
@ -1067,8 +1064,7 @@ namespace LTXV {
            // ggml_backend_t backend = ggml_backend_cuda_init(0);
            LOG_INFO("loading ltx audio vae from '%s'", model_path.c_str());

-            auto model_manager        = std::make_shared<ModelManager>();
-            ModelLoader& model_loader = model_manager->loader();
+            ModelLoader model_loader;
            if (!model_loader.init_from_file(model_path)) {
                LOG_ERROR("init model loader from file failed: '%s'", model_path.c_str());
                return;
@ -1076,17 +1072,20 @@ namespace LTXV {

            auto& tensor_storage_map = model_loader.get_tensor_storage_map();
            auto ltx_audio_vae       = std::make_shared<LTXAudioVAERunner>(backend,
+                                                                     backend,
                                                                     tensor_storage_map,
-                                                                     prefix,
-                                                                     model_manager);
+                                                                     prefix);

-            if (!model_manager->register_runner_params("LTX audio VAE test",
-                                                       *ltx_audio_vae,
-                                                       ModelManager::ResidencyMode::ParamBackend,
-                                                       backend,
-                                                       backend) ||
-                !model_manager->validate_registered_tensors()) {
-                LOG_ERROR("register ltx audio vae tensors with model manager failed");
+            if (!ltx_audio_vae->alloc_params_buffer()) {
+                LOG_ERROR("ltx audio vae buffer allocation failed");
+                return;
+            }
+
+            std::map<std::string, ggml_tensor*> tensors;
+            ltx_audio_vae->get_param_tensors(tensors, prefix);  // same prefix the runner was built with
+
+            if (!model_loader.load_tensors(tensors)) {
+                LOG_ERROR("load tensors from model loader failed");
                return;
            }

--- a/src/model/vae/ltx_vae.hpp
+++ b/src/model/vae/ltx_vae.hpp
@ -957,8 +957,8 @@ namespace LTXVAE {

            ggml_tensor* scaled_timestep = timestep;
            if (timestep_conditioning) {
-                auto multiplier = ggml_ext_cast_f32(ctx->ggml_ctx, ctx->backend, params["timestep_scale_multiplier"]);
-                scaled_timestep = ggml_mul(ctx->ggml_ctx, timestep, multiplier);
+                auto multiplier = ggml_ext_backend_tensor_get_f32(params["timestep_scale_multiplier"]);
+                scaled_timestep = ggml_ext_scale(ctx->ggml_ctx, timestep, multiplier);
            }

            x = conv_in->forward(ctx, x, causal_decoder);
@ -1008,8 +1008,8 @@ namespace LTXVAE {

            ggml_tensor* scaled_timestep = timestep;
            if (timestep_conditioning && timestep != nullptr) {
-                auto multiplier = ggml_ext_cast_f32(ctx->ggml_ctx, ctx->backend, params["timestep_scale_multiplier"]);
-                scaled_timestep = ggml_mul(ctx->ggml_ctx, timestep, multiplier);
+                auto multiplier = ggml_ext_backend_tensor_get_f32(params["timestep_scale_multiplier"]);
+                scaled_timestep = ggml_ext_scale(ctx->ggml_ctx, timestep, multiplier);
            }

            // conv_in with feat_map for left temporal context
@ -1223,11 +1223,11 @@ struct LTXVideoVAE : public VAE {
    LTXVAE::VideoVAE vae;

    LTXVideoVAE(ggml_backend_t backend,
+                ggml_backend_t params_backend,
                const String2TensorStorage& tensor_storage_map,
                const std::string& prefix,
-                bool decode_only                                    = true,
-                SDVersion version                                   = VERSION_LTXAV,
-                std::shared_ptr<RunnerWeightManager> weight_manager = nullptr)
+                bool decode_only  = true,
+                SDVersion version = VERSION_LTXAV)
        : decode_only(decode_only),
          ltx_vae_version(LTXVAE::detect_ltx_vae_version(tensor_storage_map, prefix)),
          timestep_conditioning(LTXVAE::detect_ltx_vae_timestep_conditioning(tensor_storage_map, prefix)),
@ -1239,7 +1239,7 @@ struct LTXVideoVAE : public VAE {
              patch_size,
              tensor_storage_map,
              prefix),
-          VAE(version, backend, prefix, weight_manager) {
+          VAE(version, backend, params_backend) {
        vae.init(params_ctx, tensor_storage_map, prefix);
        decode_timestep_tensor.values()[0] = vae.decode_timestep;
    }
@ -1271,8 +1271,8 @@ struct LTXVideoVAE : public VAE {
        }
    }

-    void get_param_tensors(std::map<std::string, ggml_tensor*>& tensors) override {
-        vae.get_param_tensors(tensors, weight_prefix);
+    void get_param_tensors(std::map<std::string, ggml_tensor*>& tensors, const std::string prefix) override {
+        vae.get_param_tensors(tensors, prefix);
    }

    struct TemporalTilePlan {
@ -1396,7 +1396,7 @@ struct LTXVideoVAE : public VAE {
                                                 static_cast<int>(start),
                                                 chunk_overlap);
            };
-            auto chunk = restore_trailing_singleton_dims(GGMLRunner::compute<float>(get_graph, n_threads, true, true, true),
+            auto chunk = restore_trailing_singleton_dims(GGMLRunner::compute<float>(get_graph, n_threads, true),
                                                         expected_dim);
            if (chunk.empty()) {
                free_cache_ctx_and_buffer();
@ -1426,7 +1426,7 @@ struct LTXVideoVAE : public VAE {
                               const sd::Tensor<float>& z,
                               bool decode_graph) override {
        if (!decode_graph && decode_only) {
-            LOG_ERROR("LTX video VAE encode requires encoder weights");
+            LOG_ERROR("LTX video VAE encode requires encoder weights; create the context with vae_decode_only=false");
            return {};
        }
        sd::Tensor<float> input = z;
@ -1452,7 +1452,7 @@ struct LTXVideoVAE : public VAE {
        auto get_graph = [&]() -> ggml_cgraph* {
            return build_graph(input, decode_graph);
        };
-        auto result = restore_trailing_singleton_dims(GGMLRunner::compute<float>(get_graph, n_threads, false, false, false), expected_dim);
+        auto result = restore_trailing_singleton_dims(GGMLRunner::compute<float>(get_graph, n_threads, false), expected_dim);
        if (result.empty()) {
            return {};
        }
@ -1465,7 +1465,7 @@ struct LTXVideoVAE : public VAE {
        auto get_graph = [&]() -> ggml_cgraph* {
            return build_latent_statistics_graph(z, normalize);
        };
-        return restore_trailing_singleton_dims(GGMLRunner::compute<float>(get_graph, n_threads, false, false, false),
+        return restore_trailing_singleton_dims(GGMLRunner::compute<float>(get_graph, n_threads, false),
                                               static_cast<size_t>(z.dim()));
    }

@ -1521,8 +1521,7 @@ struct LTXVideoVAE : public VAE {
        ggml_backend_t backend = sd_backend_cpu_init();
        LOG_INFO("loading ltx vae from '%s'", model_path.c_str());

-        auto model_manager        = std::make_shared<ModelManager>();
-        ModelLoader& model_loader = model_manager->loader();
+        ModelLoader model_loader;
        if (!model_loader.init_from_file_and_convert_name(model_path, "vae.")) {
            LOG_ERROR("init model loader from file failed: '%s'", model_path.c_str());
            return;
@ -1530,19 +1529,22 @@ struct LTXVideoVAE : public VAE {

        auto& tensor_storage_map         = model_loader.get_tensor_storage_map();
        std::shared_ptr<LTXVideoVAE> vae = std::make_shared<LTXVideoVAE>(backend,
+                                                                         backend,
                                                                         tensor_storage_map,
                                                                         "first_stage_model",
                                                                         true,
-                                                                         VERSION_LTXAV,
-                                                                         model_manager);
+                                                                         VERSION_LTXAV);

-        if (!model_manager->register_runner_params("LTX VAE test",
-                                                   *vae,
-                                                   ModelManager::ResidencyMode::ParamBackend,
-                                                   backend,
-                                                   backend) ||
-            !model_manager->validate_registered_tensors()) {
-            LOG_ERROR("register ltx vae tensors with model manager failed");
+        if (!vae->alloc_params_buffer()) {
+            LOG_ERROR("vae buffer allocation failed");
+            return;
+        }
+
+        std::map<std::string, ggml_tensor*> tensors;
+        vae->get_param_tensors(tensors, "first_stage_model");
+
+        if (!model_loader.load_tensors(tensors)) {
+            LOG_ERROR("load tensors from model loader failed");
            return;
        }

--- a/src/model/vae/tae.hpp
+++ b/src/model/vae/tae.hpp
@ -623,14 +623,14 @@ struct TinyImageAutoEncoder : public VAE {
    bool decode_only = false;

    TinyImageAutoEncoder(ggml_backend_t backend,
+                         ggml_backend_t params_backend,
                         const String2TensorStorage& tensor_storage_map,
                         const std::string prefix,
-                         bool decoder_only                                   = true,
-                         SDVersion version                                   = VERSION_SD1,
-                         std::shared_ptr<RunnerWeightManager> weight_manager = nullptr)
-        : VAE(version, backend, "tae", weight_manager),
-          decode_only(decoder_only),
-          taesd(decoder_only, version) {
+                         bool decoder_only = true,
+                         SDVersion version = VERSION_SD1)
+        : decode_only(decoder_only),
+          taesd(decoder_only, version),
+          VAE(version, backend, params_backend) {
        scale_input = false;
        taesd.init(params_ctx, tensor_storage_map, prefix);
    }
@ -639,8 +639,8 @@ struct TinyImageAutoEncoder : public VAE {
        return "taesd";
    }

-    void get_param_tensors(std::map<std::string, ggml_tensor*>& tensors) override {
-        taesd.get_param_tensors(tensors, weight_prefix);
+    void get_param_tensors(std::map<std::string, ggml_tensor*>& tensors, const std::string prefix) override {
+        taesd.get_param_tensors(tensors, prefix);
    }

    sd::Tensor<float> vae_output_to_latents(const sd::Tensor<float>& vae_output, std::shared_ptr<RNG> rng) override {
@ -676,7 +676,7 @@ struct TinyImageAutoEncoder : public VAE {
            return build_graph(z_tensor, decode_graph);
        };

-        return restore_trailing_singleton_dims(GGMLRunner::compute<float>(get_graph, n_threads, false, false, false), z_tensor.dim());
+        return restore_trailing_singleton_dims(GGMLRunner::compute<float>(get_graph, n_threads, false), z_tensor.dim());
    }
 };

@ -686,13 +686,13 @@ struct TinyVideoAutoEncoder : public VAE {
    bool is_wide     = false;

    TinyVideoAutoEncoder(ggml_backend_t backend,
+                         ggml_backend_t params_backend,
                         const String2TensorStorage& tensor_storage_map,
                         const std::string prefix,
-                         bool decoder_only                                   = true,
-                         SDVersion version                                   = VERSION_WAN2,
-                         std::shared_ptr<RunnerWeightManager> weight_manager = nullptr)
-        : VAE(version, backend, "tae", weight_manager),
-          decode_only(decoder_only) {
+                         bool decoder_only = true,
+                         SDVersion version = VERSION_WAN2)
+        : decode_only(decoder_only),
+          VAE(version, backend, params_backend) {
        for (auto tensor_storage : tensor_storage_map) {
            if (tensor_storage.first.find(prefix + ".3.conv.6.weight") != std::string::npos) {
                is_wide = true;
@ -708,8 +708,8 @@ struct TinyVideoAutoEncoder : public VAE {
        return "taehv";
    }

-    void get_param_tensors(std::map<std::string, ggml_tensor*>& tensors) override {
-        taehv.get_param_tensors(tensors, weight_prefix);
+    void get_param_tensors(std::map<std::string, ggml_tensor*>& tensors, const std::string prefix) override {
+        taehv.get_param_tensors(tensors, prefix);
    }

    sd::Tensor<float> vae_output_to_latents(const sd::Tensor<float>& vae_output, std::shared_ptr<RNG> rng) override {
@ -746,7 +746,7 @@ struct TinyVideoAutoEncoder : public VAE {
            return build_graph(z_tensor, decode_graph);
        };

-        return restore_trailing_singleton_dims(GGMLRunner::compute<float>(get_graph, n_threads, false, false, false), z_tensor.dim());
+        return restore_trailing_singleton_dims(GGMLRunner::compute<float>(get_graph, n_threads, false), z_tensor.dim());
    }
 };

--- a/src/model/vae/vae.hpp
+++ b/src/model/vae/vae.hpp
@ -1,14 +1,12 @@
-#ifndef __SD_MODEL_VAE_VAE_HPP__
+#ifndef __SD_MODEL_VAE_VAE_HPP__
 #define __SD_MODEL_VAE_VAE_HPP__

 #include "core/tensor_ggml.hpp"
 #include "model/common/block.hpp"
-#include "model_manager.h"

 struct VAE : public GGMLRunner {
 protected:
    SDVersion version;
-    std::string weight_prefix;
    bool scale_input                                      = true;
    virtual sd::Tensor<float> _compute(const int n_threads,
                                       const sd::Tensor<float>& z,
@ -64,11 +62,8 @@ protected:
    }

 public:
-    VAE(SDVersion version,
-        ggml_backend_t backend,
-        const std::string& weight_prefix                    = "",
-        std::shared_ptr<RunnerWeightManager> weight_manager = nullptr)
-        : version(version), weight_prefix(weight_prefix), GGMLRunner(backend, weight_manager) {}
+    VAE(SDVersion version, ggml_backend_t backend, ggml_backend_t params_backend)
+        : version(version), GGMLRunner(backend, params_backend) {}

    int get_scale_factor() {
        int scale_factor = 8;
@ -219,7 +214,7 @@ public:
    virtual sd::Tensor<float> vae_output_to_latents(const sd::Tensor<float>& vae_output, std::shared_ptr<RNG> rng) = 0;
    virtual sd::Tensor<float> diffusion_to_vae_latents(const sd::Tensor<float>& latents)                           = 0;
    virtual sd::Tensor<float> vae_to_diffusion_latents(const sd::Tensor<float>& latents)                           = 0;
-    virtual void get_param_tensors(std::map<std::string, ggml_tensor*>& tensors)                                   = 0;
+    virtual void get_param_tensors(std::map<std::string, ggml_tensor*>& tensors, const std::string prefix)         = 0;
    virtual void set_conv2d_scale(float scale) { SD_UNUSED(scale); };
    virtual void set_temporal_tiling_enabled(bool enabled) { SD_UNUSED(enabled); };
    virtual void set_tiling_params(const sd_tiling_params_t& params) {
@ -228,10 +223,8 @@ public:
 };

 struct FakeVAE : public VAE {
-    FakeVAE(SDVersion version,
-            ggml_backend_t backend,
-            std::shared_ptr<RunnerWeightManager> weight_manager = nullptr)
-        : VAE(version, backend, "", weight_manager) {}
+    FakeVAE(SDVersion version, ggml_backend_t backend, ggml_backend_t params_backend)
+        : VAE(version, backend, params_backend) {}

    int get_encoder_output_channels(int input_channels) {
        return input_channels;
@ -258,7 +251,7 @@ struct FakeVAE : public VAE {
        return latents;
    }

-    void get_param_tensors(std::map<std::string, ggml_tensor*>& tensors) override {}
+    void get_param_tensors(std::map<std::string, ggml_tensor*>& tensors, const std::string prefix) override {}

    std::string get_desc() override {
        return "fake_vae";
--- a/src/model/vae/wan_vae.hpp
+++ b/src/model/vae/wan_vae.hpp
@ -1124,12 +1124,12 @@ namespace WAN {
        WanVAE ae;

        WanVAERunner(ggml_backend_t backend,
-                     const String2TensorStorage& tensor_storage_map      = {},
-                     const std::string prefix                            = "",
-                     bool decode_only                                    = false,
-                     SDVersion version                                   = VERSION_WAN2,
-                     std::shared_ptr<RunnerWeightManager> weight_manager = nullptr)
-            : VAE(version, backend, prefix, weight_manager), decode_only(decode_only), ae(decode_only, version == VERSION_WAN2_2_TI2V) {
+                     ggml_backend_t params_backend,
+                     const String2TensorStorage& tensor_storage_map = {},
+                     const std::string prefix                       = "",
+                     bool decode_only                               = false,
+                     SDVersion version                              = VERSION_WAN2)
+            : decode_only(decode_only), ae(decode_only, version == VERSION_WAN2_2_TI2V), VAE(version, backend, params_backend) {
            ae.init(params_ctx, tensor_storage_map, prefix);
        }

@ -1137,8 +1137,8 @@ namespace WAN {
            return "wan_vae";
        }

-        void get_param_tensors(std::map<std::string, ggml_tensor*>& tensors) override {
-            ae.get_param_tensors(tensors, weight_prefix);
+        void get_param_tensors(std::map<std::string, ggml_tensor*>& tensors, const std::string prefix) override {
+            ae.get_param_tensors(tensors, prefix);
        }

        sd::Tensor<float> vae_output_to_latents(const sd::Tensor<float>& vae_output, std::shared_ptr<RNG> rng) override {
@ -1255,7 +1255,7 @@ namespace WAN {
                        return build_graph(input, decode_graph);
                    }
                };
-                auto result = restore_trailing_singleton_dims(GGMLRunner::compute<float>(get_graph, n_threads, true, true, true),
+                auto result = restore_trailing_singleton_dims(GGMLRunner::compute<float>(get_graph, n_threads, true),
                                                              input.empty() ? z.dim() : input.dim());
                if (!result.empty() && z.dim() == 4) {
                    result.squeeze_(2);
@ -1268,7 +1268,7 @@ namespace WAN {
                auto get_graph = [&]() -> ggml_cgraph* {
                    return build_graph_partial(z, decode_graph, i);
                };
-                auto out_opt = GGMLRunner::compute<float>(get_graph, n_threads, true, true, true);
+                auto out_opt = GGMLRunner::compute<float>(get_graph, n_threads, true);
                if (!out_opt.has_value()) {
                    return {};
                }
@ -1281,7 +1281,7 @@ namespace WAN {
                sd::Tensor<float> output = std::move(out);

                for (i = 1; i < t; i++) {
-                    auto chunk_opt = GGMLRunner::compute<float>(get_graph, n_threads, true, true, true);
+                    auto chunk_opt = GGMLRunner::compute<float>(get_graph, n_threads, true);
                    if (!chunk_opt.has_value()) {
                        return {};
                    }
@ -1327,24 +1327,27 @@ namespace WAN {
            // ggml_backend_t backend = ggml_backend_cuda_init(0);
            ggml_backend_t backend            = sd_backend_cpu_init();
            ggml_type model_data_type         = GGML_TYPE_F16;
-            auto model_manager                = std::make_shared<ModelManager>();
-            std::shared_ptr<WanVAERunner> vae = std::make_shared<WanVAERunner>(backend, String2TensorStorage{}, "first_stage_model", false, VERSION_WAN2_2_TI2V, model_manager);
+            std::shared_ptr<WanVAERunner> vae = std::make_shared<WanVAERunner>(backend, backend, String2TensorStorage{}, "", false, VERSION_WAN2_2_TI2V);
            {
                LOG_INFO("loading from '%s'", file_path.c_str());

-                ModelLoader& model_loader = model_manager->loader();
+                if (!vae->alloc_params_buffer()) {
+                    LOG_ERROR("vae buffer allocation failed");
+                    return;
+                }
+                std::map<std::string, ggml_tensor*> tensors;
+                vae->get_param_tensors(tensors, "first_stage_model");
+
+                ModelLoader model_loader;
                if (!model_loader.init_from_file_and_convert_name(file_path, "vae.")) {
                    LOG_ERROR("init model loader from file failed: '%s'", file_path.c_str());
                    return;
                }

-                if (!model_manager->register_runner_params("Wan VAE test",
-                                                           *vae,
-                                                           ModelManager::ResidencyMode::ParamBackend,
-                                                           backend,
-                                                           backend) ||
-                    !model_manager->validate_registered_tensors()) {
-                    LOG_ERROR("register wan vae tensors with model manager failed");
+                bool success = model_loader.load_tensors(tensors);
+
+                if (!success) {
+                    LOG_ERROR("load tensors from model loader failed");
                    return;
                }

--- a/src/model_loader.cpp
+++ b/src/model_loader.cpp
@ -1,7 +1,6 @@
 #include <algorithm>
 #include <atomic>
 #include <chrono>
-#include <cinttypes>
 #include <cstdarg>
 #include <cstdlib>
 #include <fstream>
@ -205,28 +204,10 @@ void convert_tensor(void* src,

 /*================================================= ModelLoader ==================================================*/

-ModelLoader::ModelLoader()
-    : n_threads_(sd_get_num_physical_cores()) {
-}
-
-size_t ModelLoader::add_file_path(const std::string& file_path) {
-    if (model_files_processed) {
-        file_data.clear();
-        model_files_processed = false;
-    }
-    file_paths_.push_back(file_path);
-    return file_paths_.size() - 1;
-}
-
 void ModelLoader::add_tensor_storage(const TensorStorage& tensor_storage) {
    tensor_storage_map[tensor_storage.name] = tensor_storage;
 }

-void ModelLoader::set_n_threads(int n_threads) {
-    n_threads_ = n_threads > 0 ? n_threads : sd_get_num_physical_cores();
-    LOG_DEBUG("using %d threads for model loading", n_threads_);
-}
-
 bool ModelLoader::init_from_file(const std::string& file_path, const std::string& prefix) {
    if (is_directory(file_path)) {
        LOG_INFO("load %s using diffusers format", file_path.c_str());
@ -290,7 +271,8 @@ bool ModelLoader::init_from_gguf_file(const std::string& file_path, const std::s
        return false;
    }

-    size_t file_index = add_file_path(file_path);
+    file_paths_.push_back(file_path);
+    size_t file_index = file_paths_.size() - 1;

    for (auto& tensor_storage : tensor_storages) {
        // LOG_DEBUG("%s", tensor_storage.name.c_str());
@ -318,7 +300,8 @@ bool ModelLoader::init_from_safetensors_file(const std::string& file_path, const
        return false;
    }

-    size_t file_index = add_file_path(file_path);
+    file_paths_.push_back(file_path);
+    size_t file_index = file_paths_.size() - 1;

    for (auto& tensor_storage : tensor_storages) {
        if (is_unused_tensor(tensor_storage.name)) {
@ -352,7 +335,8 @@ bool ModelLoader::init_from_torch_legacy_file(const std::string& file_path, cons
        return false;
    }

-    size_t file_index = add_file_path(file_path);
+    file_paths_.push_back(file_path);
+    size_t file_index = file_paths_.size() - 1;

    for (auto& tensor_storage : tensor_storages) {
        if (is_unused_tensor(tensor_storage.name)) {
@ -382,7 +366,8 @@ bool ModelLoader::init_from_torch_zip_file(const std::string& file_path, const s
        return false;
    }

-    size_t file_index = add_file_path(file_path);
+    file_paths_.push_back(file_path);
+    size_t file_index = file_paths_.size() - 1;

    for (auto& tensor_storage : tensor_storages) {
        if (!starts_with(tensor_storage.name, prefix)) {
@ -485,9 +470,6 @@ SDVersion ModelLoader::get_sd_version() {
        if (tensor_storage.name.find("model.diffusion_model.cap_embedder.0.weight") != std::string::npos) {
            return VERSION_Z_IMAGE;
        }
-        if (tensor_storage.name.find("double_stream_layers.0.img_instruct_attn.processor.img_to_q.weight") != std::string::npos) {
-            return VERSION_BOOGU_IMAGE;
-        }
        if (tensor_storage.name.find("model.diffusion_model.layers.0.adaLN_sa_ln.weight") != std::string::npos) {
            return VERSION_ERNIE_IMAGE;
        }
@ -778,6 +760,8 @@ void ModelLoader::process_model_files(bool enable_mmap, bool writable_mmap) {
        return;
    }

+    int64_t start_time = ggml_time_ms();
+
    std::vector<TensorStorage> processed_tensor_storages;
    for (const auto& [name, tensor_storage] : tensor_storage_map) {
        if (is_unused_tensor(tensor_storage.name)) {
@ -828,12 +812,20 @@ void ModelLoader::process_model_files(bool enable_mmap, bool writable_mmap) {
            } else {
                LOG_WARN("failed to memory-map '%s' (falling back to read())", file_path.c_str());
            }
+        } else if (!is_zip) {
+            LOG_INFO("NOT using mmap for '%s' (mmap disabled by caller)",
+                     file_path.c_str());
        }

        file_data.push_back(std::move(fdata));
    }

    model_files_processed = true;
+
+    int64_t end_time        = ggml_time_ms();
+    int64_t process_time_ms = end_time - start_time;
+
+    LOG_INFO("model files processing completed in %.2fs", process_time_ms / 1000.f);
 }

 std::vector<MmapTensorStore> ModelLoader::mmap_tensors(std::map<std::string, ggml_tensor*>& tensors,
@ -927,9 +919,7 @@ std::vector<MmapTensorStore> ModelLoader::mmap_tensors(std::map<std::string, ggm
    return result;
 }

-bool ModelLoader::load_tensors(on_new_tensor_cb_t on_new_tensor_cb,
-                               bool enable_mmap,
-                               const std::set<std::string>* target_tensor_names) {
+bool ModelLoader::load_tensors(on_new_tensor_cb_t on_new_tensor_cb, int n_threads_p, bool enable_mmap) {
    process_model_files(enable_mmap, false);

    std::atomic<int64_t> read_time_ms(0);
@ -938,26 +928,14 @@ bool ModelLoader::load_tensors(on_new_tensor_cb_t on_new_tensor_cb,
    std::atomic<int64_t> convert_time_ms(0);
    std::atomic<uint64_t> bytes_processed(0);

-    int num_threads_to_use = n_threads_;
+    int num_threads_to_use = n_threads_p > 0 ? n_threads_p : sd_get_num_physical_cores();
+    LOG_DEBUG("using %d threads for model loading", num_threads_to_use);

    int64_t start_time = ggml_time_ms();

    size_t total_tensors_to_process = 0;
-    std::vector<size_t> file_tensors_to_process_counts;
-    file_tensors_to_process_counts.reserve(file_data.size());
    for (const auto& fdata : file_data) {
-        size_t file_tensors_to_process = 0;
-        if (target_tensor_names == nullptr) {
-            file_tensors_to_process = fdata.tensors.size();
-        } else {
-            for (const TensorStorage& tensor_storage : fdata.tensors) {
-                if (target_tensor_names->find(tensor_storage.name) != target_tensor_names->end()) {
-                    file_tensors_to_process++;
-                }
-            }
-        }
-        file_tensors_to_process_counts.push_back(file_tensors_to_process);
-        total_tensors_to_process += file_tensors_to_process;
+        total_tensors_to_process += fdata.tensors.size();
    }

    bool success                   = true;
@ -965,38 +943,17 @@ bool ModelLoader::load_tensors(on_new_tensor_cb_t on_new_tensor_cb,
    const int64_t t_start          = start_time;
    int last_n_threads             = 1;

-    for (size_t file_index = 0; file_index < file_data.size(); ++file_index) {
-        auto& fdata                  = file_data[file_index];
+    for (auto& fdata : file_data) {
        const std::string& file_path = fdata.path;
+        LOG_DEBUG("loading tensors from %s", file_path.c_str());

        const std::vector<TensorStorage>& file_tensors = fdata.tensors;
-        std::vector<const TensorStorage*> tensors_to_process;
-        size_t file_tensors_to_process = file_tensors_to_process_counts[file_index];
-        tensors_to_process.reserve(file_tensors_to_process);
-        if (target_tensor_names == nullptr) {
-            for (const TensorStorage& tensor_storage : file_tensors) {
-                tensors_to_process.push_back(&tensor_storage);
-            }
-        } else {
-            for (const TensorStorage& tensor_storage : file_tensors) {
-                if (target_tensor_names->find(tensor_storage.name) != target_tensor_names->end()) {
-                    tensors_to_process.push_back(&tensor_storage);
-                }
-            }
-        }
-        if (tensors_to_process.empty()) {
-            continue;
-        }
-        LOG_DEBUG("loading %zu/%zu tensors from %s",
-                  tensors_to_process.size(),
-                  file_tensors.size(),
-                  file_path.c_str());

        bool is_zip = fdata.is_zip;

        std::shared_ptr<MmapWrapper> mmapped = fdata.mmapped;

-        int n_threads = is_zip ? 1 : std::min(num_threads_to_use, (int)tensors_to_process.size());
+        int n_threads = is_zip ? 1 : std::min(num_threads_to_use, (int)file_tensors.size());
        if (n_threads < 1) {
            n_threads = 1;
        }
@ -1005,7 +962,6 @@ bool ModelLoader::load_tensors(on_new_tensor_cb_t on_new_tensor_cb,
        std::atomic<size_t> tensor_idx(0);
        std::atomic<bool> failed(false);
        std::vector<std::thread> workers;
-        std::mutex rpc_backend_mutex;

        for (int i = 0; i < n_threads; ++i) {
            workers.emplace_back([&, file_path, is_zip]() {
@ -1033,11 +989,11 @@ bool ModelLoader::load_tensors(on_new_tensor_cb_t on_new_tensor_cb,
                while (true) {
                    int64_t t0, t1;
                    size_t idx = tensor_idx.fetch_add(1);
-                    if (idx >= tensors_to_process.size() || failed) {
+                    if (idx >= file_tensors.size() || failed) {
                        break;
                    }

-                    const TensorStorage& tensor_storage = *tensors_to_process[idx];
+                    const TensorStorage& tensor_storage = file_tensors[idx];
                    ggml_tensor* dst_tensor             = nullptr;

                    t0 = ggml_time_ms();
@ -1162,19 +1118,7 @@ bool ModelLoader::load_tensors(on_new_tensor_cb_t on_new_tensor_cb,

                    if (dst_tensor->buffer != nullptr && !ggml_backend_buffer_is_host(dst_tensor->buffer)) {
                        t0 = ggml_time_ms();
-
-                        // RPC backends require serialized access to prevent concurrency issues
-                        const char* buffer_type_name = ggml_backend_buft_name(ggml_backend_buffer_get_type(dst_tensor->buffer));
-                        bool is_rpc_buffer           = buffer_type_name != nullptr &&
-                                             std::string(buffer_type_name).find("RPC") != std::string::npos;
-
-                        if (is_rpc_buffer) {
-                            std::lock_guard<std::mutex> lock(rpc_backend_mutex);
-                            ggml_backend_tensor_set(dst_tensor, convert_buf, 0, ggml_nbytes(dst_tensor));
-                        } else {
-                            ggml_backend_tensor_set(dst_tensor, convert_buf, 0, ggml_nbytes(dst_tensor));
-                        }
-
+                        ggml_backend_tensor_set(dst_tensor, convert_buf, 0, ggml_nbytes(dst_tensor));
                        t1 = ggml_time_ms();
                        copy_to_backend_time_ms.fetch_add(t1 - t0);
                    }
@ -1189,18 +1133,16 @@ bool ModelLoader::load_tensors(on_new_tensor_cb_t on_new_tensor_cb,

        while (true) {
            size_t current_idx = tensor_idx.load();
-            if (current_idx >= tensors_to_process.size() || failed) {
+            if (current_idx >= file_tensors.size() || failed) {
                break;
            }
            size_t curr_num       = total_tensors_processed + current_idx;
            float elapsed_seconds = (ggml_time_ms() - t_start) / 1000.0f;
-            if (total_tensors_to_process > 0) {
-                pretty_bytes_progress(static_cast<int>(curr_num),
-                                      static_cast<int>(total_tensors_to_process),
-                                      bytes_processed.load(),
-                                      elapsed_seconds);
-            }
-            std::this_thread::sleep_for(std::chrono::milliseconds(total_tensors_to_process <= 4 ? 10 : 200));
+            pretty_bytes_progress(static_cast<int>(curr_num),
+                                  static_cast<int>(total_tensors_to_process),
+                                  bytes_processed.load(),
+                                  elapsed_seconds);
+            std::this_thread::sleep_for(std::chrono::milliseconds(200));
        }

        for (auto& w : workers) {
@ -1211,14 +1153,12 @@ bool ModelLoader::load_tensors(on_new_tensor_cb_t on_new_tensor_cb,
            success = false;
            break;
        }
-        total_tensors_processed += tensors_to_process.size();
-        if (total_tensors_to_process > 0) {
-            pretty_bytes_progress(static_cast<int>(total_tensors_processed),
-                                  static_cast<int>(total_tensors_to_process),
-                                  bytes_processed.load(),
-                                  (ggml_time_ms() - t_start) / 1000.0f);
-        }
-        if (total_tensors_processed < total_tensors_to_process && total_tensors_to_process > 0) {
+        total_tensors_processed += file_tensors.size();
+        pretty_bytes_progress(static_cast<int>(total_tensors_processed),
+                              static_cast<int>(total_tensors_to_process),
+                              bytes_processed.load(),
+                              (ggml_time_ms() - t_start) / 1000.0f);
+        if (total_tensors_processed < total_tensors_to_process) {
            printf("\n");
        }
    }
@ -1233,77 +1173,9 @@ bool ModelLoader::load_tensors(on_new_tensor_cb_t on_new_tensor_cb,
    return success;
 }

-bool ModelLoader::load_float_tensor(const std::string& name,
-                                    std::vector<float>& data,
-                                    int n_threads,
-                                    bool use_mmap) {
-    data.clear();
-
-    auto tensor_storage_it = tensor_storage_map.find(name);
-    if (tensor_storage_it == tensor_storage_map.end()) {
-        return false;
-    }
-
-    const TensorStorage& tensor_storage = tensor_storage_it->second;
-    int64_t n_elements                  = tensor_storage.nelements();
-    if (n_elements <= 0) {
-        LOG_ERROR("tensor '%s' has invalid element count: %" PRId64, name.c_str(), n_elements);
-        return false;
-    }
-    if (tensor_storage.n_dims <= 0 || tensor_storage.n_dims > GGML_MAX_DIMS) {
-        LOG_ERROR("tensor '%s' has unsupported dims: %d", name.c_str(), tensor_storage.n_dims);
-        return false;
-    }
-
-    std::vector<float> loaded_data(static_cast<size_t>(n_elements));
-    ggml_init_params params;
-    params.mem_size   = ggml_tensor_overhead();
-    params.mem_buffer = nullptr;
-    params.no_alloc   = true;
-
-    ggml_context* ctx = ggml_init(params);
-    if (ctx == nullptr) {
-        LOG_ERROR("failed to create context for tensor '%s'", name.c_str());
-        return false;
-    }
-
-    ggml_tensor* tensor = ggml_new_tensor(ctx, GGML_TYPE_F32, tensor_storage.n_dims, tensor_storage.ne);
-    ggml_set_name(tensor, name.c_str());
-    tensor->data = loaded_data.data();
-
-    bool loaded           = false;
-    auto on_new_tensor_cb = [&](const TensorStorage& current_tensor_storage, ggml_tensor** dst_tensor) -> bool {
-        *dst_tensor = nullptr;
-        if (current_tensor_storage.name != name) {
-            return true;
-        }
-        if (current_tensor_storage.nelements() != n_elements) {
-            LOG_ERROR("tensor '%s' element count changed during load", name.c_str());
-            return false;
-        }
-        *dst_tensor = tensor;
-        loaded      = true;
-        return true;
-    };
-
-    std::set<std::string> target_tensor_names{name};
-    if (n_threads > 0) {
-        set_n_threads(n_threads);
-    }
-    bool success = load_tensors(on_new_tensor_cb, use_mmap, &target_tensor_names);
-    ggml_free(ctx);
-
-    if (!success || !loaded) {
-        data.clear();
-        return false;
-    }
-
-    data = std::move(loaded_data);
-    return true;
-}
-
 bool ModelLoader::load_tensors(std::map<std::string, ggml_tensor*>& tensors,
                               std::set<std::string> ignore_tensors,
+                               int n_threads,
                               bool enable_mmap) {
    std::set<std::string> tensor_names_in_file;
    std::mutex tensor_names_mutex;
@ -1347,7 +1219,7 @@ bool ModelLoader::load_tensors(std::map<std::string, ggml_tensor*>& tensors,
        return true;
    };

-    bool success = load_tensors(on_new_tensor_cb, enable_mmap);
+    bool success = load_tensors(on_new_tensor_cb, n_threads, enable_mmap);
    if (!success) {
        LOG_ERROR("load tensors from file failed");
        return false;
--- a/src/model_loader.h
+++ b/src/model_loader.h
@ -34,9 +34,7 @@ protected:
    std::vector<ModelFileData> file_data;
    bool model_files_processed = false;
    String2TensorStorage tensor_storage_map;
-    int n_threads_;

-    size_t add_file_path(const std::string& file_path);
    void add_tensor_storage(const TensorStorage& tensor_storage);

    bool init_from_gguf_file(const std::string& file_path, const std::string& prefix = "");
@ -46,8 +44,6 @@ protected:
    bool init_from_diffusers_file(const std::string& file_path, const std::string& prefix = "");

 public:
-    ModelLoader();
-
    bool init_from_file(const std::string& file_path, const std::string& prefix = "");
    void convert_tensors_name();
    bool init_from_file_and_convert_name(const std::string& file_path,
@ -59,23 +55,16 @@ public:
    std::map<ggml_type, uint32_t> get_diffusion_model_wtype_stat();
    std::map<ggml_type, uint32_t> get_vae_wtype_stat();
    String2TensorStorage& get_tensor_storage_map() { return tensor_storage_map; }
-    const String2TensorStorage& get_tensor_storage_map() const { return tensor_storage_map; }
-    void set_n_threads(int n_threads);
    void set_wtype_override(ggml_type wtype, std::string tensor_type_rules = "");
    void process_model_files(bool enable_mmap = false, bool writable_mmap = true);
    std::vector<MmapTensorStore> mmap_tensors(std::map<std::string, ggml_tensor*>& tensors,
                                              std::set<std::string> ignore_tensors = {},
                                              bool writable                        = true);
-    bool load_tensors(on_new_tensor_cb_t on_new_tensor_cb,
-                      bool use_mmap                                    = false,
-                      const std::set<std::string>* target_tensor_names = nullptr);
+    bool load_tensors(on_new_tensor_cb_t on_new_tensor_cb, int n_threads = 0, bool use_mmap = false);
    bool load_tensors(std::map<std::string, ggml_tensor*>& tensors,
                      std::set<std::string> ignore_tensors = {},
+                      int n_threads                        = 0,
                      bool use_mmap                        = false);
-    bool load_float_tensor(const std::string& name,
-                           std::vector<float>& data,
-                           int n_threads = 0,
-                           bool use_mmap = false);

    std::vector<std::string> get_tensor_names() const {
        std::vector<std::string> names;
--- a/src/model_manager.cpp
+++ b/src/model_manager.cpp
@ -1,950 +0,0 @@
-#include "model_manager.h"
-
-#include <algorithm>
-#include <cstdint>
-#include <iterator>
-#include <mutex>
-#include <unordered_set>
-
-#include "core/ggml_extend_backend.h"
-#include "core/util.h"
-#include "model/adapter/lora.hpp"
-
-static size_t aligned_offset(const void* buffer, size_t offset, size_t alignment) {
-    GGML_ASSERT(alignment != 0 && (alignment & (alignment - 1)) == 0);
-    size_t align = (alignment - ((reinterpret_cast<uintptr_t>(buffer) + offset) % alignment)) % alignment;
-    return offset + align;
-}
-
-static bool lora_specs_equal(const std::vector<ModelManager::LoraSpec>& lhs,
-                             const std::vector<ModelManager::LoraSpec>& rhs) {
-    if (lhs.size() != rhs.size()) {
-        return false;
-    }
-    for (size_t i = 0; i < lhs.size(); ++i) {
-        if (lhs[i].path != rhs[i].path ||
-            lhs[i].multiplier != rhs[i].multiplier ||
-            lhs[i].is_high_noise != rhs[i].is_high_noise ||
-            lhs[i].tensor_name_prefix_filter != rhs[i].tensor_name_prefix_filter ||
-            lhs[i].required != rhs[i].required) {
-            return false;
-        }
-    }
-    return true;
-}
-
-static std::string lora_id(const ModelManager::LoraSpec& lora) {
-    return lora.is_high_noise ? "|high_noise|" + lora.path : lora.path;
-}
-
-static bool backend_supports_host_buffer(ggml_backend_t backend) {
-    if (backend == nullptr) {
-        return false;
-    }
-    if (sd_backend_is_cpu(backend)) {
-        return true;
-    }
-    ggml_backend_dev_t dev = ggml_backend_get_device(backend);
-    if (dev == nullptr) {
-        return false;
-    }
-    ggml_backend_dev_props props;
-    ggml_backend_dev_get_props(dev, &props);
-    return props.caps.buffer_from_host_ptr;
-}
-
-ModelManager::~ModelManager() {
-    release_all();
-}
-
-void ModelManager::set_common_ignore_tensors(std::set<std::string> ignore_tensors) {
-    common_ignore_tensors_ = std::move(ignore_tensors);
-}
-
-void ModelManager::set_loras(std::vector<LoraSpec> loras, SDVersion version) {
-    if (loras.empty() && loras_.empty()) {
-        lora_version_ = version;
-        return;
-    }
-    if (lora_version_ == version && lora_specs_equal(loras_, loras)) {
-        return;
-    }
-
-    loras_        = std::move(loras);
-    lora_version_ = version;
-    current_lora_epoch_++;
-    reset_lora_applied_params();
-}
-
-std::set<std::string> ModelManager::tensor_names() const {
-    std::set<std::string> names;
-    for (const auto& state : tensor_states_) {
-        if (state != nullptr) {
-            names.insert(state->name);
-        }
-    }
-    return names;
-}
-
-size_t estimate_tensors_size(const std::map<std::string, ggml_tensor*>& tensors) {
-    size_t size = 0;
-    std::unordered_set<ggml_tensor*> seen;
-    for (const auto& pair : tensors) {
-        ggml_tensor* tensor = pair.second;
-        if (tensor == nullptr || seen.find(tensor) != seen.end()) {
-            continue;
-        }
-        seen.insert(tensor);
-        size += ggml_nbytes(tensor);
-    }
-    return size;
-}
-
-bool ModelManager::register_param_tensors(const std::string& desc,
-                                          std::map<std::string, ggml_tensor*> tensors,
-                                          ResidencyMode residency_mode,
-                                          ggml_backend_t compute_backend,
-                                          ggml_backend_t params_backend,
-                                          size_t* registered_tensor_size) {
-    if (desc.empty()) {
-        LOG_ERROR("model manager tensor desc is empty");
-        return false;
-    }
-    if (registered_tensor_size != nullptr) {
-        *registered_tensor_size += estimate_tensors_size(tensors);
-    }
-
-    std::vector<std::unique_ptr<TensorState>> new_states;
-    new_states.reserve(tensors.size());
-
-    for (const auto& pair : tensors) {
-        const std::string& name = pair.first;
-        ggml_tensor* tensor     = pair.second;
-        if (tensor == nullptr) {
-            continue;
-        }
-        if (tensor_states_by_name_.find(name) != tensor_states_by_name_.end()) {
-            LOG_ERROR("model manager tensor name '%s' is already registered", name.c_str());
-            return false;
-        }
-        ggml_set_name(tensor, name.c_str());
-
-        auto state             = std::make_unique<TensorState>();
-        state->name            = name;
-        state->tensor          = tensor;
-        state->desc            = desc;
-        state->residency_mode  = residency_mode;
-        state->compute_backend = compute_backend;
-        state->params_backend  = params_backend;
-        new_states.push_back(std::move(state));
-    }
-
-    for (auto& state : new_states) {
-        TensorState* registered_state                  = state.get();
-        tensor_states_by_name_[registered_state->name] = registered_state;
-        tensor_states_.push_back(std::move(state));
-    }
-    return true;
-}
-
-bool ModelManager::load_all_params_eagerly() {
-    std::vector<TensorState*> all_states;
-    all_states.reserve(tensor_states_.size());
-    for (const auto& s : tensor_states_) {
-        if (s != nullptr) {
-            all_states.push_back(s.get());
-        }
-    }
-    return load_tensors_to_params_backend(all_states);
-}
-
-bool ModelManager::validate_registered_tensors() {
-    bool ok = true;
-    for (const auto& state : tensor_states_) {
-        if (state == nullptr) {
-            ok = false;
-            continue;
-        }
-        bool state_ok = validate_tensor(*state);
-        if (state_ok) {
-            state->metadata_validated = true;
-        }
-        ok = state_ok && ok;
-    }
-    return ok;
-}
-
-bool ModelManager::load_tensors_to_params_backend(const std::vector<TensorState*>& states) {
-    std::vector<TensorState*> need_load;
-    need_load.reserve(states.size());
-    for (TensorState* state : states) {
-        if (state == nullptr || should_ignore(*state) || is_optional_missing_tensor(state->name)) {
-            continue;
-        }
-        if (!state->metadata_validated) {
-            if (!validate_tensor(*state)) {
-                return false;
-            }
-            state->metadata_validated = true;
-        }
-        if (!state->loaded_to_params_backend) {
-            need_load.push_back(state);
-        }
-    }
-    if (need_load.empty()) {
-        return true;
-    }
-
-    std::vector<ParamsStorageBlock*> created_storage_blocks;
-    if (!mmap_params(need_load, created_storage_blocks)) {
-        for (ParamsStorageBlock* block : created_storage_blocks) {
-            if (block != nullptr) {
-                free_params_storage_block(*block);
-                erase_params_storage_block(block);
-            }
-        }
-        return false;
-    }
-
-    std::vector<TensorState*> need_alloc;
-    need_alloc.reserve(need_load.size());
-    for (TensorState* state : need_load) {
-        if (state->tensor != nullptr && state->tensor->data == nullptr && state->tensor->view_src == nullptr) {
-            need_alloc.push_back(state);
-        }
-    }
-
-    if (!alloc_params_buffers(need_alloc, created_storage_blocks) ||
-        !load_tensors(need_load)) {
-        for (ParamsStorageBlock* block : created_storage_blocks) {
-            if (block != nullptr) {
-                free_params_storage_block(*block);
-                erase_params_storage_block(block);
-            }
-        }
-        return false;
-    }
-    for (ParamsStorageBlock* block : created_storage_blocks) {
-        if (block != nullptr && block->buffer != nullptr) {
-            LOG_DEBUG("model manager prepared params backend buffer (%6.2f MB, %zu tensors, %s)",
-                      ggml_backend_buffer_get_size(block->buffer) / (1024.f * 1024.f),
-                      block->states.size(),
-                      ggml_backend_buffer_is_host(block->buffer) ? "RAM" : "VRAM");
-        }
-    }
-
-    return true;
-}
-
-bool ModelManager::stage_tensors_to_compute_backend(const std::vector<TensorState*>& states) {
-    std::map<ggml_backend_t, std::vector<TensorState*>> states_by_compute_backend;
-    for (TensorState* state : states) {
-        if (state == nullptr || should_ignore(*state) || is_optional_missing_tensor(state->name)) {
-            continue;
-        }
-        if (state->compute_backend == nullptr) {
-            LOG_ERROR("model manager compute backend is null for tensor '%s'", state->name.c_str());
-            return false;
-        }
-        if (state->params_backend == nullptr) {
-            LOG_ERROR("model manager params backend is null for tensor '%s'", state->name.c_str());
-            return false;
-        }
-        if (state->compute_backend == state->params_backend || state->staged_to_compute_backend) {
-            continue;
-        }
-        if (!state->loaded_to_params_backend || state->tensor == nullptr || state->tensor->data == nullptr) {
-            LOG_ERROR("model manager tensor '%s' is not loaded to params backend", state->name.c_str());
-            return false;
-        }
-        states_by_compute_backend[state->compute_backend].push_back(state);
-    }
-
-    for (const auto& pair : states_by_compute_backend) {
-        ggml_backend_t compute_backend          = pair.first;
-        const std::vector<TensorState*>& states = pair.second;
-        if (states.empty()) {
-            continue;
-        }
-
-        int64_t t0 = ggml_time_ms();
-
-        ggml_init_params init_params;
-        init_params.mem_size   = std::max<size_t>(1, states.size()) * ggml_tensor_overhead();
-        init_params.mem_buffer = nullptr;
-        init_params.no_alloc   = true;
-
-        ggml_context* staging_ctx = ggml_init(init_params);
-        GGML_ASSERT(staging_ctx != nullptr);
-
-        std::vector<std::pair<TensorState*, ggml_tensor*>> staged_tensors;
-        staged_tensors.reserve(states.size());
-        for (TensorState* state : states) {
-            ggml_tensor* staging_tensor = ggml_dup_tensor(staging_ctx, state->tensor);
-            ggml_set_name(staging_tensor, state->tensor->name);
-            staged_tensors.push_back({state, staging_tensor});
-        }
-
-        ggml_backend_buffer_t compute_buffer = ggml_backend_alloc_ctx_tensors(staging_ctx, compute_backend);
-        if (compute_buffer == nullptr) {
-            LOG_ERROR("model manager alloc compute params backend buffer failed, num_tensors = %zu",
-                      staged_tensors.size());
-            ggml_free(staging_ctx);
-            return false;
-        }
-        ggml_backend_buffer_set_usage(compute_buffer, GGML_BACKEND_BUFFER_USAGE_WEIGHTS);
-
-        for (auto& staged_tensor : staged_tensors) {
-            TensorState* state          = staged_tensor.first;
-            ggml_tensor* managed_tensor = state->tensor;
-            ggml_tensor* staging_tensor = staged_tensor.second;
-            ggml_backend_tensor_copy(managed_tensor, staging_tensor);
-            std::swap(managed_tensor->buffer, staging_tensor->buffer);
-            std::swap(managed_tensor->data, staging_tensor->data);
-            std::swap(managed_tensor->extra, staging_tensor->extra);
-        }
-        ggml_backend_synchronize(compute_backend);
-
-        auto block             = std::make_unique<ComputeStagingBlock>();
-        block->compute_backend = compute_backend;
-        block->buffer          = compute_buffer;
-        block->staging_ctx     = staging_ctx;
-        block->staged_tensors  = std::move(staged_tensors);
-        for (auto& staged_tensor : block->staged_tensors) {
-            TensorState* state               = staged_tensor.first;
-            state->staged_to_compute_backend = true;
-        }
-        compute_staging_blocks_.push_back(std::move(block));
-
-        int64_t t1 = ggml_time_ms();
-        LOG_DEBUG("model manager staged compute params (%6.2f MB, %zu tensors) to %s, taking %.2fs",
-                  ggml_backend_buffer_get_size(compute_buffer) / (1024.f * 1024.f),
-                  states.size(),
-                  ggml_backend_name(compute_backend),
-                  (t1 - t0) * 1.0f / 1000);
-    }
-
-    return true;
-}
-
-bool ModelManager::apply_loras_to_params(const std::vector<TensorState*>& states) {
-    if (loras_.empty()) {
-        return true;
-    }
-
-    struct LoraApplyGroup {
-        std::map<std::string, ggml_tensor*> model_tensors;
-        std::vector<TensorState*> states;
-    };
-
-    std::map<ggml_backend_t, LoraApplyGroup> groups;
-    for (TensorState* state : states) {
-        if (state == nullptr || state->tensor == nullptr ||
-            should_ignore(*state) || is_optional_missing_tensor(state->name)) {
-            continue;
-        }
-        if (state->applied_lora_epoch == current_lora_epoch_) {
-            continue;
-        }
-        if (state->compute_backend == nullptr) {
-            LOG_ERROR("model manager compute backend is null for lora target tensor '%s'", state->name.c_str());
-            return false;
-        }
-        if (state->tensor->data == nullptr) {
-            LOG_ERROR("model manager lora target tensor '%s' is not prepared", state->name.c_str());
-            return false;
-        }
-        LoraApplyGroup& group            = groups[state->compute_backend];
-        group.model_tensors[state->name] = state->tensor;
-        group.states.push_back(state);
-    }
-
-    if (groups.empty()) {
-        return true;
-    }
-
-    std::set<std::string> all_tensor_names = tensor_names();
-    for (auto& group_pair : groups) {
-        ggml_backend_t compute_backend = group_pair.first;
-        LoraApplyGroup& group          = group_pair.second;
-        for (const LoraSpec& lora_spec : loras_) {
-            if (group.model_tensors.empty()) {
-                continue;
-            }
-
-            std::string id = lora_id(lora_spec);
-            auto lora      = std::make_shared<LoraModel>(id,
-                                                    compute_backend,
-                                                    compute_backend,
-                                                    lora_spec.path,
-                                                    lora_spec.is_high_noise ? "model.high_noise_" : "",
-                                                    lora_version_);
-
-            LoraModel::filter_t lora_tensor_filter = nullptr;
-            if (!lora_spec.tensor_name_prefix_filter.empty()) {
-                lora_tensor_filter = [&](const std::string& tensor_name) {
-                    return starts_with(tensor_name, lora_spec.tensor_name_prefix_filter);
-                };
-            }
-            if (!lora->load_from_file(n_threads_, lora_tensor_filter)) {
-                LOG_WARN("load lora tensors from %s failed", lora_spec.path.c_str());
-                if (lora_spec.required) {
-                    return false;
-                }
-                continue;
-            }
-            if (lora->lora_tensors.empty()) {
-                if (lora_spec.required) {
-                    LOG_ERROR("required lora has no tensors: %s", lora_spec.path.c_str());
-                    return false;
-                }
-                continue;
-            }
-            lora->multiplier = lora_spec.multiplier;
-            lora->apply(group.model_tensors, all_tensor_names, lora_version_, n_threads_, false);
-            lora->release_loaded_tensors();
-        }
-
-        for (TensorState* state : group.states) {
-            if (state != nullptr) {
-                state->applied_lora_epoch = current_lora_epoch_;
-            }
-        }
-    }
-    return true;
-}
-
-void ModelManager::reset_lora_applied_params() {
-    release_compute_staging_blocks(true);
-    release_params_storage_blocks(true);
-    for (auto& state : tensor_states_) {
-        state->applied_lora_epoch = UINT64_MAX;
-    }
-}
-
-bool ModelManager::should_ignore(const TensorState& state) const {
-    for (const auto& ignore_prefix : common_ignore_tensors_) {
-        if (starts_with(state.name, ignore_prefix)) {
-            return true;
-        }
-    }
-    return false;
-}
-
-bool ModelManager::is_optional_missing_tensor(const std::string& name) const {
-    return name.find("cond_stage_model.transformer.text_model.encoder.layers.23") != std::string::npos ||
-           name.find("alphas_cumprod") != std::string::npos;
-}
-
-bool ModelManager::validate_tensor(const TensorState& state) const {
-    if (state.tensor == nullptr || should_ignore(state) || is_optional_missing_tensor(state.name)) {
-        return true;
-    }
-
-    const auto& tensor_storage_map = model_loader_.get_tensor_storage_map();
-    auto ts_it                     = tensor_storage_map.find(state.name);
-    if (ts_it == tensor_storage_map.end()) {
-        LOG_ERROR("%s tensor '%s' not in model metadata", state.desc.c_str(), state.name.c_str());
-        return false;
-    }
-
-    const TensorStorage& tensor_storage = ts_it->second;
-    if (state.tensor->ne[0] != tensor_storage.ne[0] ||
-        state.tensor->ne[1] != tensor_storage.ne[1] ||
-        state.tensor->ne[2] != tensor_storage.ne[2] ||
-        state.tensor->ne[3] != tensor_storage.ne[3]) {
-        LOG_ERROR(
-            "%s tensor '%s' has wrong shape in model metadata: got [%d, %d, %d, %d], expected [%d, %d, %d, %d]",
-            state.desc.c_str(),
-            state.name.c_str(),
-            (int)tensor_storage.ne[0], (int)tensor_storage.ne[1], (int)tensor_storage.ne[2], (int)tensor_storage.ne[3],
-            (int)state.tensor->ne[0], (int)state.tensor->ne[1], (int)state.tensor->ne[2], (int)state.tensor->ne[3]);
-        return false;
-    }
-    return true;
-}
-
-bool ModelManager::mmap_params(const std::vector<TensorState*>& states,
-                               std::vector<ParamsStorageBlock*>& created_storage_blocks) {
-    std::map<std::string, ggml_tensor*> mmap_candidates;
-    std::map<std::string, TensorState*> mmap_states;
-    for (TensorState* state : states) {
-        if (state == nullptr || !can_mmap_storage(*state) || state->tensor == nullptr ||
-            state->tensor->data != nullptr || state->tensor->view_src != nullptr) {
-            continue;
-        }
-        mmap_candidates[state->name] = state->tensor;
-        mmap_states[state->name]     = state;
-    }
-    if (mmap_candidates.empty()) {
-        return true;
-    }
-
-    auto mmap_store = model_loader_.mmap_tensors(mmap_candidates, {}, writable_mmap_);
-    if (mmap_store.empty()) {
-        return true;
-    }
-
-    auto block                = std::make_unique<ParamsStorageBlock>();
-    block->mmap_tensor_stores = std::move(mmap_store);
-    ParamsStorageBlock* raw   = block.get();
-    for (const auto& pair : mmap_states) {
-        TensorState* state = pair.second;
-        if (state != nullptr && state->tensor != nullptr && state->tensor->data != nullptr) {
-            block->states.push_back(state);
-        }
-    }
-
-    if (!block->states.empty()) {
-        params_storage_blocks_.push_back(std::move(block));
-        created_storage_blocks.push_back(raw);
-    }
-    return true;
-}
-
-bool ModelManager::can_mmap_storage(const TensorState& state) const {
-    if (!enable_mmap_ || state.residency_mode != ResidencyMode::ParamBackend) {
-        return false;
-    }
-    if (state.compute_backend == nullptr || state.params_backend == nullptr) {
-        return false;
-    }
-    return sd_backend_is_cpu(state.compute_backend) ||
-           sd_backend_is_cpu(state.params_backend) ||
-           backend_supports_host_buffer(state.compute_backend);
-}
-
-bool ModelManager::alloc_params_buffers(const std::vector<TensorState*>& states,
-                                        std::vector<ParamsStorageBlock*>& created_storage_blocks) {
-    std::map<std::pair<ggml_backend_buffer_type_t, int>, std::vector<TensorState*>> states_by_buffer_type;
-    for (TensorState* state : states) {
-        if (state == nullptr || state->tensor == nullptr) {
-            continue;
-        }
-        ggml_backend_buffer_type_t params_buft = params_buffer_type_for(*state);
-        if (params_buft == nullptr) {
-            return false;
-        }
-        states_by_buffer_type[{params_buft, static_cast<int>(state->residency_mode)}].push_back(state);
-    }
-
-    for (const auto& pair : states_by_buffer_type) {
-        ggml_backend_buffer_type_t params_buft  = pair.first.first;
-        const std::vector<TensorState*>& states = pair.second;
-        size_t alignment                        = ggml_backend_buft_get_alignment(params_buft);
-        size_t max_size                         = ggml_backend_buft_get_max_size(params_buft);
-
-        auto alloc_chunk = [&](const std::vector<TensorState*>& chunk, size_t chunk_size) -> bool {
-            if (chunk.empty() || chunk_size == 0) {
-                return true;
-            }
-
-            ggml_backend_buffer_t buffer = ggml_backend_buft_alloc_buffer(params_buft, chunk_size);
-            if (buffer == nullptr) {
-                LOG_ERROR("model manager alloc params backend buffer failed, size = %.2fMB",
-                          chunk_size / (1024.0 * 1024.0));
-                return false;
-            }
-            ggml_backend_buffer_set_usage(buffer, GGML_BACKEND_BUFFER_USAGE_WEIGHTS);
-
-            std::vector<ggml_tensor*> initialized_tensors;
-            void* base    = ggml_backend_buffer_get_base(buffer);
-            size_t offset = aligned_offset(base, 0, ggml_backend_buffer_get_alignment(buffer));
-            for (TensorState* state : chunk) {
-                ggml_tensor* tensor     = state->tensor;
-                size_t tensor_size      = GGML_PAD(ggml_backend_buffer_get_alloc_size(buffer, tensor),
-                                                   ggml_backend_buffer_get_alignment(buffer));
-                enum ggml_status status = ggml_backend_tensor_alloc(buffer, tensor, static_cast<char*>(base) + offset);
-                if (status != GGML_STATUS_SUCCESS) {
-                    LOG_ERROR("model manager failed to initialize params tensor '%s'", ggml_get_name(tensor));
-                    for (ggml_tensor* initialized : initialized_tensors) {
-                        initialized->buffer = nullptr;
-                        initialized->data   = nullptr;
-                        initialized->extra  = nullptr;
-                    }
-                    LOG_DEBUG("model manager releasing params backend buffer (%6.2f MB, %zu tensors, %s)",
-                              ggml_backend_buffer_get_size(buffer) / (1024.f * 1024.f),
-                              initialized_tensors.size(),
-                              ggml_backend_buffer_is_host(buffer) ? "RAM" : "VRAM");
-                    ggml_backend_buffer_free(buffer);
-                    return false;
-                }
-                initialized_tensors.push_back(tensor);
-                offset += tensor_size;
-            }
-
-            auto block              = std::make_unique<ParamsStorageBlock>();
-            block->buffer           = buffer;
-            block->states           = chunk;
-            ParamsStorageBlock* raw = block.get();
-            params_storage_blocks_.push_back(std::move(block));
-            created_storage_blocks.push_back(raw);
-
-            return true;
-        };
-
-        std::vector<TensorState*> chunk;
-        size_t chunk_size = 0;
-        for (TensorState* state : states) {
-            ggml_tensor* tensor = state->tensor;
-            size_t tensor_size  = GGML_PAD(ggml_backend_buft_get_alloc_size(params_buft, tensor), alignment);
-            // Some backends, e.g. Vulkan, report a preferred chunk size here rather than a
-            // hard per-tensor allocation limit. Oversized tensors are allocated alone.
-            if (!chunk.empty() && max_size > 0 && chunk_size + tensor_size > max_size) {
-                if (!alloc_chunk(chunk, chunk_size)) {
-                    return false;
-                }
-                chunk.clear();
-                chunk_size = 0;
-            }
-            chunk.push_back(state);
-            chunk_size += tensor_size;
-        }
-
-        if (!alloc_chunk(chunk, chunk_size)) {
-            return false;
-        }
-    }
-
-    return true;
-}
-
-bool ModelManager::load_tensors(const std::vector<TensorState*>& states) {
-    std::map<std::string, TensorState*> states_by_name;
-    std::set<std::string> target_tensor_names;
-    for (TensorState* state : states) {
-        if (state == nullptr) {
-            continue;
-        }
-        states_by_name[state->name] = state;
-        target_tensor_names.insert(state->name);
-    }
-    if (states_by_name.empty()) {
-        return true;
-    }
-
-    std::set<std::string> loaded_names;
-    std::mutex loaded_names_mutex;
-    auto on_new_tensor_cb = [&](const TensorStorage& tensor_storage, ggml_tensor** dst_tensor) -> bool {
-        const std::string& name = tensor_storage.name;
-        *dst_tensor             = nullptr;
-
-        auto state_it = states_by_name.find(name);
-        if (state_it == states_by_name.end()) {
-            return true;
-        }
-
-        TensorState* state = state_it->second;
-        if (state == nullptr || state->tensor == nullptr) {
-            LOG_ERROR("model manager tensor '%s' is null", name.c_str());
-            return false;
-        }
-
-        if (state->tensor->ne[0] != tensor_storage.ne[0] ||
-            state->tensor->ne[1] != tensor_storage.ne[1] ||
-            state->tensor->ne[2] != tensor_storage.ne[2] ||
-            state->tensor->ne[3] != tensor_storage.ne[3]) {
-            LOG_ERROR(
-                "model manager tensor '%s' has wrong shape in model file: got [%d, %d, %d, %d], expected [%d, %d, %d, %d]",
-                name.c_str(),
-                (int)tensor_storage.ne[0], (int)tensor_storage.ne[1], (int)tensor_storage.ne[2], (int)tensor_storage.ne[3],
-                (int)state->tensor->ne[0], (int)state->tensor->ne[1], (int)state->tensor->ne[2], (int)state->tensor->ne[3]);
-            return false;
-        }
-
-        {
-            std::lock_guard<std::mutex> lock(loaded_names_mutex);
-            loaded_names.insert(name);
-        }
-        *dst_tensor = state->tensor;
-        return true;
-    };
-
-    if (!model_loader_.load_tensors(on_new_tensor_cb, enable_mmap_, &target_tensor_names)) {
-        LOG_ERROR("model manager load tensors failed");
-        return false;
-    }
-
-    bool missing = false;
-    for (const auto& pair : states_by_name) {
-        const std::string& name = pair.first;
-        if (loaded_names.find(name) == loaded_names.end()) {
-            LOG_ERROR("model manager tensor '%s' was not loaded", name.c_str());
-            missing = true;
-        }
-    }
-    if (missing) {
-        return false;
-    }
-
-    for (const auto& pair : states_by_name) {
-        pair.second->loaded_to_params_backend = true;
-    }
-    return true;
-}
-
-ggml_backend_buffer_type_t ModelManager::params_buffer_type_for(const TensorState& state) const {
-    if (state.params_backend == nullptr) {
-        LOG_ERROR("model manager params backend is null for tensor '%s'", state.name.c_str());
-        return nullptr;
-    }
-    ggml_backend_buffer_type_t params_buft = nullptr;
-    if (state.compute_backend != nullptr && state.params_backend != state.compute_backend) {
-        ggml_backend_dev_t compute_dev = ggml_backend_get_device(state.compute_backend);
-        if (compute_dev != nullptr) {
-            params_buft = ggml_backend_dev_host_buffer_type(compute_dev);
-        }
-    }
-    if (params_buft == nullptr) {
-        params_buft = ggml_backend_get_default_buffer_type(state.params_backend);
-    }
-    return params_buft;
-}
-
-void ModelManager::free_compute_staging_block(ComputeStagingBlock& block) {
-    for (auto& staged_tensor : block.staged_tensors) {
-        TensorState* state          = staged_tensor.first;
-        ggml_tensor* staging_tensor = staged_tensor.second;
-        if (state == nullptr || state->tensor == nullptr || staging_tensor == nullptr) {
-            continue;
-        }
-        ggml_tensor* managed_tensor = state->tensor;
-        managed_tensor->buffer      = staging_tensor->buffer;
-        managed_tensor->data        = staging_tensor->data;
-        managed_tensor->extra       = staging_tensor->extra;
-        staging_tensor->buffer      = nullptr;
-        staging_tensor->data        = nullptr;
-        staging_tensor->extra       = nullptr;
-
-        state->staged_to_compute_backend = false;
-        state->applied_lora_epoch        = UINT64_MAX;
-    }
-
-    if (block.buffer != nullptr) {
-        LOG_DEBUG("model manager releasing compute params (%6.2f MB, %zu tensors) from %s",
-                  ggml_backend_buffer_get_size(block.buffer) / (1024.f * 1024.f),
-                  block.staged_tensors.size(),
-                  block.compute_backend != nullptr ? ggml_backend_name(block.compute_backend) : "unknown");
-        ggml_backend_buffer_free(block.buffer);
-        block.buffer = nullptr;
-    }
-    if (block.staging_ctx != nullptr) {
-        ggml_free(block.staging_ctx);
-        block.staging_ctx = nullptr;
-    }
-    block.staged_tensors.clear();
-}
-
-void ModelManager::release_compute_staging_blocks(bool force,
-                                                  const std::unordered_set<TensorState*>* target_states) {
-    for (auto it = compute_staging_blocks_.begin(); it != compute_staging_blocks_.end();) {
-        ComputeStagingBlock* block = it->get();
-        bool can_release           = force;
-        if (!can_release) {
-            can_release = std::all_of(block->staged_tensors.begin(),
-                                      block->staged_tensors.end(),
-                                      [target_states](const std::pair<TensorState*, ggml_tensor*>& pair) {
-                                          TensorState* state = pair.first;
-                                          if (state == nullptr) {
-                                              return true;
-                                          }
-                                          if (target_states != nullptr &&
-                                              target_states->find(state) == target_states->end()) {
-                                              return false;
-                                          }
-                                          return state->active_prepare_count == 0;
-                                      });
-        }
-
-        if (can_release) {
-            free_compute_staging_block(*block);
-            it = compute_staging_blocks_.erase(it);
-        } else {
-            ++it;
-        }
-    }
-}
-
-void ModelManager::free_params_storage_block(ParamsStorageBlock& block) {
-    if (block.buffer != nullptr) {
-        LOG_DEBUG("model manager releasing params backend buffer (%6.2f MB, %zu tensors, %s)",
-                  ggml_backend_buffer_get_size(block.buffer) / (1024.f * 1024.f),
-                  block.states.size(),
-                  ggml_backend_buffer_is_host(block.buffer) ? "RAM" : "VRAM");
-        ggml_backend_buffer_free(block.buffer);
-        block.buffer = nullptr;
-    }
-    block.mmap_tensor_stores.clear();
-
-    for (TensorState* state : block.states) {
-        if (state == nullptr || state->tensor == nullptr) {
-            continue;
-        }
-        state->tensor->buffer = nullptr;
-        state->tensor->data   = nullptr;
-        state->tensor->extra  = nullptr;
-
-        state->loaded_to_params_backend = false;
-        state->applied_lora_epoch       = UINT64_MAX;
-    }
-    block.states.clear();
-}
-
-void ModelManager::release_params_storage_blocks(bool force,
-                                                 const std::unordered_set<TensorState*>* target_states) {
-    for (auto it = params_storage_blocks_.begin(); it != params_storage_blocks_.end();) {
-        ParamsStorageBlock* block = it->get();
-        bool can_release          = force;
-        if (!can_release) {
-            can_release = std::all_of(block->states.begin(),
-                                      block->states.end(),
-                                      [target_states](TensorState* state) {
-                                          if (state == nullptr) {
-                                              return true;
-                                          }
-                                          if (target_states != nullptr &&
-                                              target_states->find(state) == target_states->end()) {
-                                              return false;
-                                          }
-                                          return state->active_prepare_count == 0 &&
-                                                 !state->staged_to_compute_backend &&
-                                                 state->residency_mode == ResidencyMode::Disk;
-                                      });
-        }
-
-        if (can_release) {
-            free_params_storage_block(*block);
-            it = params_storage_blocks_.erase(it);
-        } else {
-            ++it;
-        }
-    }
-}
-
-void ModelManager::erase_params_storage_block(ParamsStorageBlock* block) {
-    auto it = std::find_if(params_storage_blocks_.begin(),
-                           params_storage_blocks_.end(),
-                           [block](const std::unique_ptr<ParamsStorageBlock>& item) {
-                               return item.get() == block;
-                           });
-    if (it != params_storage_blocks_.end()) {
-        params_storage_blocks_.erase(it);
-    }
-}
-
-void ModelManager::release_all() {
-    for (auto& state : tensor_states_) {
-        state->active_prepare_count = 0;
-        state->applied_lora_epoch   = UINT64_MAX;
-    }
-    release_compute_staging_blocks(true);
-    release_params_storage_blocks(true);
-}
-
-bool ModelManager::resolve_required_tensor_states(const std::vector<ggml_tensor*>& tensors,
-                                                  std::vector<TensorState*>& required_states) const {
-    required_states.clear();
-    std::unordered_set<TensorState*> seen;
-    for (ggml_tensor* tensor : tensors) {
-        if (tensor == nullptr) {
-            continue;
-        }
-        const char* raw_name = ggml_get_name(tensor);
-        if (raw_name == nullptr || raw_name[0] == '\0') {
-            LOG_ERROR("model manager unnamed tensor is not registered");
-            return false;
-        }
-        auto state_it = tensor_states_by_name_.find(raw_name);
-        if (state_it == tensor_states_by_name_.end()) {
-            LOG_ERROR("model manager tensor '%s' is not registered", raw_name);
-            return false;
-        }
-        TensorState* state = state_it->second;
-        if (state == nullptr) {
-            LOG_ERROR("model manager tensor '%s' has no tensor state", raw_name);
-            return false;
-        }
-        if (seen.insert(state).second) {
-            required_states.push_back(state);
-        }
-    }
-    return true;
-}
-
-bool ModelManager::prepare_params(const std::vector<ggml_tensor*>& tensors) {
-    if (tensors.empty()) {
-        return true;
-    }
-
-    std::vector<TensorState*> required_states;
-    if (!resolve_required_tensor_states(tensors, required_states)) {
-        return false;
-    }
-
-    if (!load_tensors_to_params_backend(required_states)) {
-        return false;
-    }
-
-    if (!stage_tensors_to_compute_backend(required_states)) {
-        release_compute_staging_blocks(false);
-        release_params_storage_blocks(false);
-        return false;
-    }
-
-    if (!apply_loras_to_params(required_states)) {
-        release_compute_staging_blocks(false);
-        release_params_storage_blocks(false);
-        return false;
-    }
-
-    for (TensorState* state : required_states) {
-        if (state == nullptr) {
-            continue;
-        }
-        state->active_prepare_count++;
-    }
-    return true;
-}
-
-void ModelManager::finish_compute_backend_usage(const std::vector<TensorState*>& states) {
-    if (states.empty()) {
-        return;
-    }
-
-    std::unordered_set<TensorState*> target_states;
-    for (TensorState* state : states) {
-        if (state == nullptr || !target_states.insert(state).second) {
-            continue;
-        }
-        if (state->active_prepare_count > 0) {
-            state->active_prepare_count--;
-        }
-    }
-    release_compute_staging_blocks(false, &target_states);
-}
-
-void ModelManager::release_compute_backend_params(const std::vector<ggml_tensor*>& tensors) {
-    if (tensors.empty()) {
-        return;
-    }
-    std::vector<TensorState*> required_states;
-    if (!resolve_required_tensor_states(tensors, required_states)) {
-        return;
-    }
-    finish_compute_backend_usage(required_states);
-}
-
-void ModelManager::release_params_backend_params(const std::vector<ggml_tensor*>& tensors) {
-    if (tensors.empty()) {
-        return;
-    }
-    std::vector<TensorState*> required_states;
-    if (!resolve_required_tensor_states(tensors, required_states)) {
-        return;
-    }
-    if (required_states.empty()) {
-        return;
-    }
-    std::unordered_set<TensorState*> target_states(required_states.begin(), required_states.end());
-    release_params_storage_blocks(false, &target_states);
-}
--- a/src/model_manager.h
+++ b/src/model_manager.h
@ -1,170 +0,0 @@
-#ifndef __MODEL_MANAGER_H__
-#define __MODEL_MANAGER_H__
-
-#include <cstdint>
-#include <map>
-#include <memory>
-#include <set>
-#include <string>
-#include <unordered_set>
-#include <vector>
-
-#include "model_loader.h"
-#include "weight_manager.h"
-
-class ModelManager : public RunnerWeightManager {
-public:
-    enum class ResidencyMode {
-        Disk,
-        ParamBackend,
-    };
-
-    struct LoraSpec {
-        std::string path;
-        float multiplier   = 1.0f;
-        bool is_high_noise = false;
-        std::string tensor_name_prefix_filter;
-        bool required = false;
-    };
-
-private:
-    struct TensorState {
-        std::string name;
-        ggml_tensor* tensor = nullptr;
-        std::string desc;
-
-        ResidencyMode residency_mode   = ResidencyMode::ParamBackend;
-        ggml_backend_t compute_backend = nullptr;
-        ggml_backend_t params_backend  = nullptr;
-        bool metadata_validated        = false;
-
-        int active_prepare_count = 0;
-
-        bool loaded_to_params_backend  = false;
-        bool staged_to_compute_backend = false;
-        uint64_t applied_lora_epoch    = UINT64_MAX;
-    };
-
-    struct ParamsStorageBlock {
-        ggml_backend_buffer_t buffer = nullptr;
-        std::vector<MmapTensorStore> mmap_tensor_stores;
-        std::vector<TensorState*> states;
-    };
-
-    struct ComputeStagingBlock {
-        ggml_backend_t compute_backend = nullptr;
-        ggml_backend_buffer_t buffer   = nullptr;
-        ggml_context* staging_ctx      = nullptr;
-        std::vector<std::pair<TensorState*, ggml_tensor*>> staged_tensors;
-    };
-
-    ModelLoader model_loader_;
-    std::vector<std::unique_ptr<TensorState>> tensor_states_;
-    std::map<std::string, TensorState*> tensor_states_by_name_;
-    std::vector<std::unique_ptr<ParamsStorageBlock>> params_storage_blocks_;
-    std::vector<std::unique_ptr<ComputeStagingBlock>> compute_staging_blocks_;
-    std::set<std::string> common_ignore_tensors_;
-    std::vector<LoraSpec> loras_;
-    SDVersion lora_version_      = VERSION_COUNT;
-    uint64_t current_lora_epoch_ = 0;
-    int n_threads_               = 0;
-    bool enable_mmap_            = false;
-    bool writable_mmap_          = false;
-
-    void finish_compute_backend_usage(const std::vector<TensorState*>& states);
-    void release_all();
-
-    bool resolve_required_tensor_states(const std::vector<ggml_tensor*>& tensors,
-                                        std::vector<TensorState*>& required_states) const;
-    bool should_ignore(const TensorState& state) const;
-    bool is_optional_missing_tensor(const std::string& name) const;
-    bool validate_tensor(const TensorState& state) const;
-
-    bool load_tensors_to_params_backend(const std::vector<TensorState*>& states);
-    bool apply_loras_to_params(const std::vector<TensorState*>& states);
-    bool mmap_params(const std::vector<TensorState*>& states,
-                     std::vector<ParamsStorageBlock*>& created_storage_blocks);
-    bool can_mmap_storage(const TensorState& state) const;
-    bool alloc_params_buffers(const std::vector<TensorState*>& states,
-                              std::vector<ParamsStorageBlock*>& created_storage_blocks);
-    bool load_tensors(const std::vector<TensorState*>& states);
-    bool stage_tensors_to_compute_backend(const std::vector<TensorState*>& states);
-
-    ggml_backend_buffer_type_t params_buffer_type_for(const TensorState& state) const;
-    void release_compute_staging_blocks(bool force                                            = false,
-                                        const std::unordered_set<TensorState*>* target_states = nullptr);
-    void release_params_storage_blocks(bool force                                            = false,
-                                       const std::unordered_set<TensorState*>* target_states = nullptr);
-    void free_compute_staging_block(ComputeStagingBlock& block);
-    void free_params_storage_block(ParamsStorageBlock& block);
-    void erase_params_storage_block(ParamsStorageBlock* block);
-    void reset_lora_applied_params();
-
-public:
-    ~ModelManager() override;
-
-    ModelLoader& loader() { return model_loader_; }
-    const ModelLoader& loader() const { return model_loader_; }
-
-    void set_n_threads(int n_threads) {
-        n_threads_ = n_threads;
-        model_loader_.set_n_threads(n_threads);
-    }
-    void set_enable_mmap(bool enable_mmap) { enable_mmap_ = enable_mmap; }
-    void set_writable_mmap(bool writable_mmap) { writable_mmap_ = writable_mmap; }
-    void set_common_ignore_tensors(std::set<std::string> ignore_tensors);
-    void set_loras(std::vector<LoraSpec> loras, SDVersion version);
-
-    std::set<std::string> tensor_names() const;
-
-    bool register_param_tensors(const std::string& desc,
-                                std::map<std::string, ggml_tensor*> tensors,
-                                ResidencyMode residency_mode,
-                                ggml_backend_t compute_backend,
-                                ggml_backend_t params_backend,
-                                size_t* registered_tensor_size = nullptr);
-
-    template <typename Runner>
-    bool register_runner_params(const std::string& desc,
-                                Runner& runner,
-                                ResidencyMode residency_mode,
-                                ggml_backend_t compute_backend,
-                                ggml_backend_t params_backend,
-                                size_t* registered_tensor_size = nullptr) {
-        std::map<std::string, ggml_tensor*> tensors;
-        runner.get_param_tensors(tensors);
-        return register_param_tensors(desc,
-                                      std::move(tensors),
-                                      residency_mode,
-                                      compute_backend,
-                                      params_backend,
-                                      registered_tensor_size);
-    }
-
-    template <typename Runner>
-    bool register_runner_params(const std::string& desc,
-                                Runner& runner,
-                                const std::string& prefix,
-                                ResidencyMode residency_mode,
-                                ggml_backend_t compute_backend,
-                                ggml_backend_t params_backend,
-                                size_t* registered_tensor_size = nullptr) {
-        std::map<std::string, ggml_tensor*> tensors;
-        runner.get_param_tensors(tensors, prefix);
-        return register_param_tensors(desc,
-                                      std::move(tensors),
-                                      residency_mode,
-                                      compute_backend,
-                                      params_backend,
-                                      registered_tensor_size);
-    }
-
-    bool validate_registered_tensors();
-    bool load_all_params_eagerly();
-
-    bool prepare_params(const std::vector<ggml_tensor*>& tensors) override;
-    void release_compute_backend_params(const std::vector<ggml_tensor*>& tensors) override;
-    void release_params_backend_params(const std::vector<ggml_tensor*>& tensors) override;
-};
-
-#endif  // __MODEL_MANAGER_H__
--- a/src/name_conversion.cpp
+++ b/src/name_conversion.cpp
@ -184,27 +184,6 @@ std::string convert_cond_stage_model_name(std::string name, std::string prefix)
    return name;
 }

-std::string convert_qwen3_vl_vision_name(std::string name) {
-    static const std::vector<std::pair<std::string, std::string>> qwen3_vl_vision_name_map{
-        {"mm.0.", "merger.linear_fc1."},
-        {"mm.2.", "merger.linear_fc2."},
-        {"v.post_ln.", "merger.norm."},
-        {"v.position_embd.weight", "pos_embed.weight"},
-        {"v.patch_embd.weight.1", "patch_embed.proj.1.weight"},
-        {"v.patch_embd.weight", "patch_embed.proj.0.weight"},
-        {"v.patch_embd.bias", "patch_embed.bias"},
-        {"v.blk.", "blocks."},
-        {"attn_qkv.", "attn.qkv."},
-        {"attn_out.", "attn.proj."},
-        {"ffn_up.", "mlp.linear_fc1."},
-        {"ffn_down.", "mlp.linear_fc2."},
-        {"ln1.", "norm1."},
-        {"ln2.", "norm2."},
-    };
-    replace_with_name_map(name, qwen3_vl_vision_name_map);
-    return name;
-}
-
 // ref: https://github.com/huggingface/diffusers/blob/main/scripts/convert_diffusers_to_original_stable_diffusion.py
 std::string convert_diffusers_unet_to_original_sd1(std::string name) {
    // (stable-diffusion, HF Diffusers)
@ -1011,46 +990,7 @@ bool is_first_stage_model_name(const std::string& name) {
    return false;
 }

-static std::string convert_esrgan_tensor_name(std::string name) {
-    static std::unordered_map<std::string, std::string> esrgan_name_map;
-
-    if (esrgan_name_map.empty()) {
-        esrgan_name_map["model.0."] = "conv_first.";
-
-        constexpr int max_num_blocks = 64;
-        for (int i = 0; i < max_num_blocks; i++) {
-            std::string block_prefix = "model.1.sub." + std::to_string(i) + ".";
-            for (int rdb = 1; rdb <= 3; rdb++) {
-                for (int conv = 1; conv <= 5; conv++) {
-                    esrgan_name_map[block_prefix + "RDB" + std::to_string(rdb) + ".conv" + std::to_string(conv) + ".0."] =
-                        "body." + std::to_string(i) + ".rdb" + std::to_string(rdb) + ".conv" + std::to_string(conv) + ".";
-                }
-            }
-            esrgan_name_map[block_prefix + "weight"] = "conv_body.weight";
-            esrgan_name_map[block_prefix + "bias"]   = "conv_body.bias";
-        }
-
-        // RealESRGAN stores only the learned layers in a Sequential. These indices
-        // cover the common x1, x2 and x4 layouts.
-        esrgan_name_map["model.2."]  = "conv_hr.";
-        esrgan_name_map["model.3."]  = "conv_up1.";
-        esrgan_name_map["model.4."]  = "conv_last.";
-        esrgan_name_map["model.5."]  = "conv_hr.";
-        esrgan_name_map["model.6."]  = "conv_up2.";
-        esrgan_name_map["model.7."]  = "conv_last.";
-        esrgan_name_map["model.8."]  = "conv_hr.";
-        esrgan_name_map["model.10."] = "conv_last.";
-    }
-
-    replace_with_prefix_map(name, esrgan_name_map);
-    return name;
-}
-
 std::string convert_tensor_name(std::string name, SDVersion version) {
-    if (version == VERSION_ESRGAN) {
-        return convert_esrgan_tensor_name(std::move(name));
-    }
-
    bool is_lora                             = false;
    bool is_lycoris_underline                = false;
    bool is_underline                        = false;
@ -1175,10 +1115,6 @@ std::string convert_tensor_name(std::string name, SDVersion version) {

    replace_with_prefix_map(name, prefix_map);

-    if (sd_version_is_boogu_image(version) && starts_with(name, "text_encoders.llm.visual.")) {
-        name = convert_qwen3_vl_vision_name(std::move(name));
-    }
-
    // diffusion model
    {
        for (const auto& prefix : diffuison_model_prefix_vec) {
--- a/src/runtime/guidance.cpp
+++ b/src/runtime/guidance.cpp
@ -3,7 +3,6 @@
 #include <algorithm>
 #include <cmath>
 #include <cstdlib>
-#include <optional>
 #include <string>
 #include <utility>

@ -64,82 +63,6 @@ namespace sd::guidance {
        return uncond;
    }

-    std::vector<float> parse_guidance_schedule_from_spec(std::string spec) {
-        std::vector<float> schedule;
-
-        while (!spec.empty()) {
-            auto sep     = spec.find('+');
-            auto segment = spec.substr(0, sep);
-
-            auto x = segment.find('x');
-            if (x == std::string::npos) {
-                LOG_ERROR("Invalid guidance schedule segment: '%s' (expected <guidance>x<count>)", segment.c_str());
-                return {};
-            }
-
-            float guidance;
-            int count;
-
-            auto guidance_str = segment.substr(0, x);
-            auto count_str    = segment.substr(x + 1);
-
-            try {
-                size_t idx = 0;
-                guidance   = std::stof(guidance_str, &idx);
-                if (idx != guidance_str.size()) {
-                    LOG_ERROR("Invalid guidance value in guidance schedule: '%s'", guidance_str.c_str());
-                    return {};
-                }
-            } catch (const std::exception&) {
-                LOG_ERROR("Invalid guidance value in guidance schedule: '%s'", guidance_str.c_str());
-                return {};
-            }
-
-            try {
-                size_t idx = 0;
-                count      = std::stoi(count_str, &idx);
-                if (idx != count_str.size()) {
-                    LOG_ERROR("Invalid count in guidance schedule: '%s'", count_str.c_str());
-                    return {};
-                }
-            } catch (const std::exception&) {
-                LOG_ERROR("Invalid count in guidance schedule: '%s'", count_str.c_str());
-                return {};
-            }
-
-            if (count <= 0) {
-                LOG_ERROR("Guidance schedule count must be positive");
-                return {};
-            }
-
-            schedule.insert(schedule.end(), count, guidance);
-
-            if (sep == std::string::npos) {
-                break;
-            }
-
-            spec = spec.substr(sep + 1);
-        }
-
-        return schedule;
-    }
-
-    std::vector<float> parse_guidance_schedule(const char* extra_sample_args) {
-        std::vector<float> guidance_schedule;
-        std::string guidance_schedule_str = "";
-        for (const auto& [key, value] : parse_key_value_args(extra_sample_args, "extra sample arg")) {
-            float parsed = 0.0f;
-            if (key == "guidance_schedule") {
-                guidance_schedule_str = value;
-            }
-        }
-
-        if (!guidance_schedule_str.empty()) {
-            guidance_schedule = parse_guidance_schedule_from_spec(guidance_schedule_str);
-        }
-        return guidance_schedule;
-    }
-
    ClassifierFreeGuidance::ClassifierFreeGuidance(float guidance_scale,
                                                   float image_guidance_scale)
        : guidance_scale_(guidance_scale),
@ -147,10 +70,8 @@ namespace sd::guidance {
    }

    GuiderOutput ClassifierFreeGuidance::forward(const GuidanceInput& input,
-                                                 GuiderOutput previous,
-                                                 std::optional<float> scale_override) const {
+                                                 GuiderOutput previous) const {
        (void)previous;
-        float guidance_scale = scale_override.value_or(guidance_scale_);

        GuiderOutput output;
        if (!has_tensor(input.pred_cond)) {
@ -165,14 +86,14 @@ namespace sd::guidance {
                const sd::Tensor<float>& pred_img_uncond = *input.pred_img_uncond;
                output.pred                              = pred_img_uncond +
                              image_guidance_scale_ * (pred_uncond - pred_img_uncond) +
-                              guidance_scale * (pred_cond - pred_uncond);
+                              guidance_scale_ * (pred_cond - pred_uncond);

            } else {
-                output.pred = pred_uncond + guidance_scale * (pred_cond - pred_uncond);
+                output.pred = pred_uncond + guidance_scale_ * (pred_cond - pred_uncond);
            }
        } else if (has_tensor(input.pred_img_uncond)) {
            const sd::Tensor<float>& pred_img_uncond = *input.pred_img_uncond;
-            output.pred                              = pred_img_uncond + guidance_scale * (pred_cond - pred_img_uncond);
+            output.pred                              = pred_img_uncond + guidance_scale_ * (pred_cond - pred_img_uncond);
        }

        return output;
@ -207,10 +128,8 @@ namespace sd::guidance {
    }

    GuiderOutput AdaptiveProjectedGuidance::forward(const GuidanceInput& input,
-                                                    GuiderOutput previous,
-                                                    std::optional<float> scale_override) const {
+                                                    GuiderOutput previous) const {
        (void)previous;
-        float guidance_scale = scale_override.value_or(guidance_scale_);

        GuiderOutput output;
        if (!has_tensor(input.pred_cond)) {
@ -225,13 +144,13 @@ namespace sd::guidance {
                const sd::Tensor<float>& pred_img_uncond = *input.pred_img_uncond;
                output.pred                              = pred_img_uncond +
                              image_guidance_scale_ * (pred_uncond - pred_img_uncond) +
-                              guidance_scale * (pred_cond - pred_uncond);
+                              guidance_scale_ * (pred_cond - pred_uncond);
            } else {
-                output.pred = pred_uncond + guidance_scale * (pred_cond - pred_uncond);
+                output.pred = pred_uncond + guidance_scale_ * (pred_cond - pred_uncond);
            }
        } else if (has_tensor(input.pred_img_uncond)) {
            const sd::Tensor<float>& pred_img_uncond = *input.pred_img_uncond;
-            output.pred                              = pred_img_uncond + guidance_scale * (pred_cond - pred_img_uncond);
+            output.pred                              = pred_img_uncond + guidance_scale_ * (pred_cond - pred_img_uncond);
        }
        if (!has_tensor(input.pred_uncond) && !has_tensor(input.pred_img_uncond)) {
            return output;
@ -243,7 +162,7 @@ namespace sd::guidance {
        sd::Tensor<float> deltas = calculate_guidance_delta(pred_cond,
                                                            pred_uncond,
                                                            pred_img_uncond,
-                                                            guidance_scale,
+                                                            guidance_scale_,
                                                            image_guidance_scale_);
        if (params_.momentum != 0.0f) {
            if (momentum_buffer_.shape() != deltas.shape()) {
@ -253,8 +172,8 @@ namespace sd::guidance {
            momentum_buffer_ = deltas;
        }

-        float diff_norm        = 0.0f;
-        const int standard_res = 2 * 1024 / 8;  // Use SDXL as the standard resolution (1024x1024, 8x8 patches, 4=2x2 channels)
+        float diff_norm = 0.0f;
+        const int standard_res = 2 * 1024 / 8; // Use SDXL as the standard resolution (1024x1024, 8x8 patches, 4=2x2 channels)
        if (params_.norm_threshold > 0.0f) {
            diff_norm = std::sqrt((deltas * deltas).sum()) * standard_res / std::sqrt(static_cast<float>(deltas.numel()));
        }
@ -320,8 +239,7 @@ namespace sd::guidance {
    }

    GuiderOutput SkipLayerGuidance::forward(const GuidanceInput& input,
-                                            GuiderOutput output,
-                                            std::optional<float> /*scale_override*/) const {
+                                            GuiderOutput output) const {
        if (scale_ == 0.0f || !is_enabled_for_step(input) || !input.predict_skip_layer) {
            return output;
        }
--- a/src/runtime/guidance.h
+++ b/src/runtime/guidance.h
@ -3,7 +3,6 @@

 #include <cstddef>
 #include <functional>
-#include <optional>
 #include <vector>

 #include "core/tensor.hpp"
@ -28,7 +27,6 @@ namespace sd::guidance {
    AdaptiveProjectedGuidanceParams parse_adaptive_projected_guidance_args(const char* extra_sample_args);
    bool is_adaptive_projected_guidance_enabled(const AdaptiveProjectedGuidanceParams& params);
    bool parse_skip_layer_guidance_uncond_arg(const char* extra_sample_args);
-    std::vector<float> parse_guidance_schedule(const char* extra_sample_args);

    struct GuidanceInput {
        int step                                 = 0;
@ -42,10 +40,9 @@ namespace sd::guidance {

    class BaseGuidance {
    public:
-        virtual ~BaseGuidance()                                                                = default;
+        virtual ~BaseGuidance()                                   = default;
        virtual GuiderOutput forward(const GuidanceInput& input,
-                                     GuiderOutput previous,
-                                     std::optional<float> scale_override = std::nullopt) const = 0;
+                                     GuiderOutput previous) const = 0;
    };

    class ClassifierFreeGuidance : public BaseGuidance {
@ -57,8 +54,7 @@ namespace sd::guidance {
                               float image_guidance_scale);

        GuiderOutput forward(const GuidanceInput& input,
-                             GuiderOutput previous,
-                             std::optional<float> scale_override = std::nullopt) const override;
+                             GuiderOutput previous) const override;
    };

    class AdaptiveProjectedGuidance : public BaseGuidance {
@ -73,8 +69,7 @@ namespace sd::guidance {
                                  AdaptiveProjectedGuidanceParams params);

        GuiderOutput forward(const GuidanceInput& input,
-                             GuiderOutput previous,
-                             std::optional<float> scale_override = std::nullopt) const override;
+                             GuiderOutput previous) const override;
    };

    class SkipLayerGuidance : public BaseGuidance {
@ -93,8 +88,7 @@ namespace sd::guidance {
        const std::vector<int>& layers() const;

        GuiderOutput forward(const GuidanceInput& input,
-                             GuiderOutput previous,
-                             std::optional<float> scale_override = std::nullopt) const override;
+                             GuiderOutput previous) const override;
    };

 }  // namespace sd::guidance
--- a/src/stable-diffusion.cpp
+++ b/src/stable-diffusion.cpp
--- a/src/tokenizers/bpe_tokenizer.cpp
+++ b/src/tokenizers/bpe_tokenizer.cpp
@ -134,8 +134,7 @@ std::vector<int> BPETokenizer::encode(const std::string& text, on_new_token_cb_t
    std::vector<int32_t> bpe_tokens;
    std::vector<std::string> token_strs;

-    std::string normalized_text = normalize_before_split ? normalize(text) : text;
-    auto splited_texts          = split_with_special_tokens(normalized_text, special_tokens);
+    auto splited_texts = split_with_special_tokens(text, special_tokens);

    for (auto& splited_text : splited_texts) {
        if (is_special_token(splited_text)) {
@ -160,7 +159,7 @@ std::vector<int> BPETokenizer::encode(const std::string& text, on_new_token_cb_t
                }
            }

-            std::string token_str = normalize_before_split ? token : normalize(token);
+            std::string token_str = normalize(token);
            std::u32string utf32_token;
            if (byte_level_bpe) {
                for (int i = 0; i < token_str.length(); i++) {
--- a/src/tokenizers/clip_tokenizer.cpp
+++ b/src/tokenizers/clip_tokenizer.cpp
@ -22,10 +22,9 @@ CLIPTokenizer::CLIPTokenizer(int pad_token_id, const std::string& merges_utf8_st
    EOS_TOKEN_ID = 49407;
    PAD_TOKEN_ID = pad_token_id;

-    end_of_word_suffix     = "</w>";
-    add_bos_token          = true;
-    add_eos_token          = true;
-    normalize_before_split = true;
+    end_of_word_suffix = "</w>";
+    add_bos_token      = true;
+    add_eos_token      = true;

    if (merges_utf8_str.size() > 0) {
        load_from_merges(merges_utf8_str);
--- a/src/tokenizers/tokenizer.h
+++ b/src/tokenizers/tokenizer.h
@ -12,10 +12,9 @@ using on_new_token_cb_t = std::function<bool(std::string&, std::vector<int32_t>&
 class Tokenizer {
 protected:
    std::vector<std::string> special_tokens;
-    bool add_bos_token          = false;
-    bool add_eos_token          = false;
-    bool pad_left               = false;
-    bool normalize_before_split = false;
+    bool add_bos_token = false;
+    bool add_eos_token = false;
+    bool pad_left      = false;
    std::string end_of_word_suffix;

    virtual std::string decode_token(int token_id) const = 0;
--- a/src/upscaler.cpp
+++ b/src/upscaler.cpp
@ -18,12 +18,6 @@ UpscalerGGML::UpscalerGGML(int n_threads,
      params_backend_spec(std::move(params_backend_spec)) {
 }

-UpscalerGGML::~UpscalerGGML() {
-    // ModelManager holds raw ggml tensor pointers owned by the runner context.
-    model_manager.reset();
-    esrgan_upscaler.reset();
-}
-
 void UpscalerGGML::set_max_graph_vram_bytes(size_t max_vram_bytes) {
    max_graph_vram_bytes = max_vram_bytes;
    if (esrgan_upscaler) {
@ -39,12 +33,17 @@ void UpscalerGGML::set_stream_layers_enabled(bool enabled) {
 }

 bool UpscalerGGML::load_from_file(const std::string& esrgan_path,
+                                  bool offload_params_to_cpu,
                                  int n_threads) {
    ggml_log_set(ggml_log_callback_default, nullptr);

    std::string error;
    if (!backend_manager.init(backend_spec.c_str(),
                              params_backend_spec.c_str(),
+                              offload_params_to_cpu,
+                              false,
+                              false,
+                              false,
                              &error)) {
        LOG_ERROR("upscaler backend config failed: %s", error.c_str());
        return false;
@ -73,39 +72,22 @@ bool UpscalerGGML::load_from_file(const std::string& esrgan_path,
        return false;
    }

-    model_manager = std::make_shared<ModelManager>();
-    model_manager->set_n_threads(n_threads);
-    model_manager->set_enable_mmap(false);
-
-    ModelLoader& model_loader = model_manager->loader();
-    if (!model_loader.init_from_file_and_convert_name(esrgan_path, "", VERSION_ESRGAN)) {
+    ModelLoader model_loader;
+    if (!model_loader.init_from_file_and_convert_name(esrgan_path)) {
        LOG_ERROR("init model loader from file failed: '%s'", esrgan_path.c_str());
-        return false;
    }
    model_loader.set_wtype_override(model_data_type);
    LOG_INFO("Upscaler weight type: %s", ggml_type_name(model_data_type));
    esrgan_upscaler = std::make_shared<ESRGAN>(backend_for(SDBackendModule::UPSCALER),
-                                               model_loader.get_tensor_storage_map(),
-                                               model_manager);
-    if (esrgan_upscaler == nullptr || esrgan_upscaler->rrdb_net == nullptr) {
-        LOG_ERROR("init esrgan model from metadata failed: '%s'", esrgan_path.c_str());
-        return false;
-    }
+                                               params_backend_for(SDBackendModule::UPSCALER),
+                                               tile_size,
+                                               model_loader.get_tensor_storage_map());
    esrgan_upscaler->set_max_graph_vram_bytes(max_graph_vram_bytes);
    esrgan_upscaler->set_stream_layers_enabled(stream_layers_enabled);
    if (direct) {
        esrgan_upscaler->set_conv2d_direct_enabled(true);
    }
-
-    std::map<std::string, ggml_tensor*> tensors;
-    esrgan_upscaler->get_param_tensors(tensors);
-    if (!model_manager->register_param_tensors("ESRGAN",
-                                               std::move(tensors),
-                                               backend_manager.params_backend_is_disk(SDBackendModule::UPSCALER) ? ModelManager::ResidencyMode::Disk : ModelManager::ResidencyMode::ParamBackend,
-                                               backend_for(SDBackendModule::UPSCALER),
-                                               params_backend_for(SDBackendModule::UPSCALER)) ||
-        !model_manager->validate_registered_tensors()) {
-        LOG_ERROR("register esrgan tensors with model manager failed");
+    if (!esrgan_upscaler->load_from_file(esrgan_path, n_threads)) {
        return false;
    }
    return true;
@ -113,7 +95,6 @@ bool UpscalerGGML::load_from_file(const std::string& esrgan_path,

 sd::Tensor<float> UpscalerGGML::upscale_tensor(const sd::Tensor<float>& input_tensor) {
    sd::Tensor<float> upscaled;
-    const int scale = esrgan_upscaler->config.scale;
    if (tile_size <= 0 || (input_tensor.shape()[0] <= tile_size && input_tensor.shape()[1] <= tile_size)) {
        upscaled = esrgan_upscaler->compute(n_threads, input_tensor);
    } else {
@ -127,9 +108,9 @@ sd::Tensor<float> UpscalerGGML::upscale_tensor(const sd::Tensor<float>& input_te
        };

        upscaled = process_tiles_2d(input_tensor,
-                                    static_cast<int>(input_tensor.shape()[0] * scale),
-                                    static_cast<int>(input_tensor.shape()[1] * scale),
-                                    scale,
+                                    static_cast<int>(input_tensor.shape()[0] * esrgan_upscaler->scale),
+                                    static_cast<int>(input_tensor.shape()[1] * esrgan_upscaler->scale),
+                                    esrgan_upscaler->scale,
                                    tile_size,
                                    tile_size,
                                    0.25f,
@ -148,9 +129,8 @@ sd::Tensor<float> UpscalerGGML::upscale_tensor(const sd::Tensor<float>& input_te
 sd_image_t UpscalerGGML::upscale(sd_image_t input_image, uint32_t upscale_factor) {
    // upscale_factor, unused for RealESRGAN_x4plus_anime_6B.pth
    sd_image_t upscaled_image = {0, 0, 0, nullptr};
-    const int scale           = esrgan_upscaler->config.scale;
-    int output_width          = (int)input_image.width * scale;
-    int output_height         = (int)input_image.height * scale;
+    int output_width          = (int)input_image.width * esrgan_upscaler->scale;
+    int output_height         = (int)input_image.height * esrgan_upscaler->scale;
    LOG_INFO("upscaling from (%i x %i) to (%i x %i)",
             input_image.width, input_image.height, output_width, output_height);

@ -173,6 +153,7 @@ struct upscaler_ctx_t {
 };

 upscaler_ctx_t* new_upscaler_ctx(const char* esrgan_path_c_str,
+                                 bool offload_params_to_cpu,
                                 bool direct,
                                 int n_threads,
                                 int tile_size,
@ -189,7 +170,7 @@ upscaler_ctx_t* new_upscaler_ctx(const char* esrgan_path_c_str,
        return nullptr;
    }

-    if (!upscaler_ctx->upscaler->load_from_file(esrgan_path, n_threads)) {
+    if (!upscaler_ctx->upscaler->load_from_file(esrgan_path, offload_params_to_cpu, n_threads)) {
        delete upscaler_ctx->upscaler;
        upscaler_ctx->upscaler = nullptr;
        free(upscaler_ctx);
@ -206,7 +187,7 @@ int get_upscale_factor(upscaler_ctx_t* upscaler_ctx) {
    if (upscaler_ctx == nullptr || upscaler_ctx->upscaler == nullptr || upscaler_ctx->upscaler->esrgan_upscaler == nullptr) {
        return 1;
    }
-    return upscaler_ctx->upscaler->esrgan_upscaler->config.scale;
+    return upscaler_ctx->upscaler->esrgan_upscaler->scale;
 }

 void free_upscaler_ctx(upscaler_ctx_t* upscaler_ctx) {
--- a/src/upscaler.h
+++ b/src/upscaler.h
@ -4,7 +4,6 @@
 #include "core/ggml_extend_backend.h"
 #include "core/tensor.hpp"
 #include "model/upscaler/esrgan.hpp"
-#include "model_manager.h"
 #include "stable-diffusion.h"

 #include <memory>
@ -12,7 +11,6 @@

 struct UpscalerGGML {
    SDBackendManager backend_manager;
-    std::shared_ptr<ModelManager> model_manager;
    ggml_type model_data_type = GGML_TYPE_F16;
    std::shared_ptr<ESRGAN> esrgan_upscaler;
    std::string esrgan_path;
@ -29,9 +27,9 @@ struct UpscalerGGML {
                 int tile_size                   = 128,
                 std::string backend_spec        = "",
                 std::string params_backend_spec = "");
-    ~UpscalerGGML();

    bool load_from_file(const std::string& esrgan_path,
+                        bool offload_params_to_cpu,
                        int n_threads);
    void set_max_graph_vram_bytes(size_t max_vram_bytes);
    void set_stream_layers_enabled(bool enabled);
--- a/src/weight_manager.h
+++ b/src/weight_manager.h
@ -1,15 +0,0 @@
-#ifndef __WEIGHT_MANAGER_H__
-#define __WEIGHT_MANAGER_H__
-
-#include <vector>
-
-struct ggml_tensor;
-
-struct RunnerWeightManager {
-    virtual ~RunnerWeightManager()                                                        = default;
-    virtual bool prepare_params(const std::vector<ggml_tensor*>& tensors)                 = 0;
-    virtual void release_compute_backend_params(const std::vector<ggml_tensor*>& tensors) = 0;
-    virtual void release_params_backend_params(const std::vector<ggml_tensor*>& tensors)  = 0;
-};
-
-#endif  // __WEIGHT_MANAGER_H__