feat: add microsoft lens support (#1560)

2026-06-09 15:56:39 +00:00 · 2026-05-27 01:04:17 +08:00 · 2026-05-27 01:04:17 +08:00 · 92dc7268fc
commit 92dc7268fc
parent 07b2b18e70
19 changed files with 1042 additions and 20 deletions
--- a/README.md
+++ b/README.md
@ -15,6 +15,7 @@ API and command-line option may change frequently.***
 ## 🔥Important News
 * **2026/05/27** 🚀 stable-diffusion.cpp now supports **Lens**
 * **2026/05/17** 🚀 stable-diffusion.cpp now supports **LTX-2.3**
 * **2026/04/11** 🚀 stable-diffusion.cpp now uses a brand-new embedded web UI.  
 * **2026/01/18** 🚀 stable-diffusion.cpp now supports **FLUX.2-klein**  
@ -37,6 +38,7 @@ API and command-line option may change frequently.***
    - [SD3/SD3.5](./docs/sd3.md)
    - [FLUX.1-dev/FLUX.1-schnell](./docs/flux.md)
    - [FLUX.2-dev/FLUX.2-klein](./docs/flux2.md)
    - [Lens](./docs/lens.md)
    - [Chroma](./docs/chroma.md)
    - [Chroma1-Radiance](./docs/chroma_radiance.md)
    - [Qwen Image](./docs/qwen_image.md)
@ -135,7 +137,6 @@ For runtime and parameter backend placement, see the [backend selection guide](.
 - [Chroma](./docs/chroma.md)
 - [🔥Qwen Image](./docs/qwen_image.md)
 - [🔥Qwen Image Edit series](./docs/qwen_image_edit.md)
 - [🔥LongCat Image / LongCat Image Edit](./docs/longcat_image.md)
 - [🔥Wan2.1/Wan2.2](./docs/wan.md)
 - [🔥LTX-2.3](./docs/ltx2.md)
 - [🔥Z-Image](./docs/z_image.md)
@ -143,6 +144,8 @@ For runtime and parameter backend placement, see the [backend selection guide](.
 - [Anima](./docs/anima.md)
 - [ERNIE-Image](./docs/ernie_image.md)
 - [HiDream-O1-Image](./docs/hidream_o1_image.md)
 - [Lens](./docs/lens.md)
 - [LongCat Image / LongCat Image Edit](./docs/longcat_image.md)
 - [LoRA](./docs/lora.md)
 - [LCM/LCM-LoRA](./docs/lcm.md)
 - [Using PhotoMaker to personalize image generation](./docs/photo_maker.md)
--- a/assets/lens/example.png
+++ b/assets/lens/example.png
--- a/assets/lens/turbo_example.png
+++ b/assets/lens/turbo_example.png
--- a/docs/lens.md
+++ b/docs/lens.md
@ -0,0 +1,32 @@
 # How to Use
 Lens uses a Lens diffusion transformer, the FLUX.2 VAE, and GPT-OSS-20B as the LLM text encoder.
 ## Download weights
 - Download Lens
    - safetensors: https://huggingface.co/Comfy-Org/Lens/tree/main/diffusion_models
 - Download Lens Turbo
    - safetensors: https://huggingface.co/Comfy-Org/Lens/tree/main/diffusion_models
 - Download the FLUX.2 VAE
    - safetensors: https://huggingface.co/black-forest-labs/FLUX.2-dev/tree/main
 - Download GPT-OSS-20B
    - gguf: https://huggingface.co/unsloth/gpt-oss-20b-GGUF/tree/main
 ## Examples
 ### Lens
 ```
 .\bin\Release\sd-cli.exe --diffusion-model ..\..\ComfyUI\models\diffusion_models\lens_bf16.safetensors --llm "..\..\llm\gpt-oss-20b-UD-Q8_K_XL.gguf" --vae ..\..\ComfyUI\models\vae\flux2_ae.safetensors --cfg-scale 5.0  -p "A crystal dragon soaring through an aurora borealis sky, its entire body made of transparent faceted crystal refracting the green and purple aurora light into rainbow spectra, ice particles trailing from its wings, high fantasy digital art" --diffusion-fa -v
 ```
 <img width="256" alt="Lens example" src="../assets/lens/example.png" />
 ### Lens Turbo
 ```
 .\bin\Release\sd-cli.exe --diffusion-model ..\..\ComfyUI\models\diffusion_models\lens_turbo_bf16.safetensors --llm "..\..\llm\gpt-oss-20b-UD-Q8_K_XL.gguf" --vae ..\..\ComfyUI\models\vae\flux2_ae.safetensors --cfg-scale 1.0  -p "A crystal dragon soaring through an aurora borealis sky, its entire body made of transparent faceted crystal refracting the green and purple aurora light into rainbow spectra, ice particles trailing from its wings, high fantasy digital art" --diffusion-fa -v --steps 4
 ```
 <img width="256" alt="Lens Turbo example" src="../assets/lens/turbo_example.png" />
--- a/src/conditioner.hpp
+++ b/src/conditioner.hpp
@ -1696,11 +1696,15 @@ struct LLMEmbedder : public Conditioner {
            arch = LLM::LLMArch::MISTRAL_SMALL_3_2;
        } else if (sd_version_is_ernie_image(version)) {
            arch = LLM::LLMArch::MINISTRAL_3_3B;
        } else if (sd_version_is_lens(version)) {
            arch = LLM::LLMArch::GPT_OSS_20B;
        } else if (sd_version_is_z_image(version) || version == VERSION_OVIS_IMAGE || version == VERSION_FLUX2_KLEIN) {
            arch = LLM::LLMArch::QWEN3;
        }
        if (arch == LLM::LLMArch::MISTRAL_SMALL_3_2 || arch == LLM::LLMArch::MINISTRAL_3_3B) {
            tokenizer = std::make_shared<MistralTokenizer>();
        } else if (arch == LLM::LLMArch::GPT_OSS_20B) {
            tokenizer = std::make_shared<GPTOSSTokenizer>();
        } else {
            tokenizer = std::make_shared<Qwen2Tokenizer>();
        }
@ -1871,6 +1875,7 @@ struct LLMEmbedder : public Conditioner {
        std::vector<std::pair<int, sd::Tensor<float>>> image_embeds;
        int prompt_template_encode_start_idx = 34;
        int min_length                       = 0;  // pad tokens
        int max_length                       = 100000000;
        int hidden_states_min_length         = 0;  // zero pad hidden_states
        bool spell_quotes                    = false;
        std::set<int> out_layers;
@ -2029,6 +2034,30 @@ struct LLMEmbedder : public Conditioner {
            prompt_attn_range.first = 0;
            prompt += conditioner_params.text;
            prompt_attn_range.second = static_cast<int>(prompt.size());
        } else if (sd_version_is_lens(version)) {
            prompt_template_encode_start_idx = 97;
            min_length                       = 0;
            max_length                       = 512;
            out_layers                       = {6, 12, 18, 24};
            prompt =
                "<|start|>system<|message|>You are ChatGPT, a large language model trained by OpenAI.\n"
                "Knowledge cutoff: 2024-06\n"
                "Current date: 2026-05-26\n"  // fix for current date
                "\n"
                "Reasoning: medium\n"
                "\n"
                "# Valid channels: analysis, commentary, final. Channel must be included for every message.<|end|><|start|>developer<|message|># Instructions\n"
                "\n"
                "Describe the image by detailing the color, shape, size, texture, quantity, text, spatial relationships of the objects and background.\n"
                "\n"
                "<|end|><|start|>user<|message|>";
            prompt_attn_range.first = static_cast<int>(prompt.size());
            prompt += conditioner_params.text;
            prompt_attn_range.second = static_cast<int>(prompt.size());
            prompt += "<|end|><|start|>assistant<|channel|>analysis<|message|>Need to generate one image according to the description.<|end|><|start|>assistant<|channel|>final<|message|>";
        } else if (sd_version_is_z_image(version)) {
            prompt_template_encode_start_idx = 0;
            out_layers                       = {35};  // -2
@ -2085,7 +2114,8 @@ struct LLMEmbedder : public Conditioner {
                                           image_embeds,
                                           out_layers,
                                           prompt_template_encode_start_idx,
-                                           spell_quotes);
+                                           spell_quotes,
                                           max_length);
        std::vector<sd::Tensor<float>> extra_hidden_states_vec;
        for (int i = 0; i < extra_prompts.size(); i++) {
            auto extra_hidden_states = encode_prompt(n_threads,
@ -2096,7 +2126,8 @@ struct LLMEmbedder : public Conditioner {
                                                     image_embeds,
                                                     out_layers,
                                                     prompt_template_encode_start_idx,
-                                                     spell_quotes);
+                                                     spell_quotes,
                                                     max_length);
            extra_hidden_states_vec.push_back(std::move(extra_hidden_states));
        }
--- a/src/diffusion_model.hpp
+++ b/src/diffusion_model.hpp
@ -6,6 +6,7 @@
 #include "ernie_image.hpp"
 #include "flux.hpp"
 #include "hidream_o1.hpp"
 #include "lens.hpp"
 #include "ltxv.hpp"
 #include "mmdit.hpp"
 #include "qwen_image.hpp"
@ -701,6 +702,72 @@ struct ErnieImageModel : public DiffusionModel {
    }
 };
 struct LensModel : public DiffusionModel {
    std::string prefix;
    Lens::LensRunner lens;
    LensModel(ggml_backend_t backend,
              ggml_backend_t params_backend,
              const String2TensorStorage& tensor_storage_map = {},
              const std::string prefix                       = "model.diffusion_model")
        : prefix(prefix), lens(backend, params_backend, tensor_storage_map, prefix) {
    }
    std::string get_desc() override {
        return lens.get_desc();
    }
    void alloc_params_buffer() override {
        lens.alloc_params_buffer();
    }
    void free_params_buffer() override {
        lens.free_params_buffer();
    }
    void free_compute_buffer() override {
        lens.free_compute_buffer();
    }
    void get_param_tensors(std::map<std::string, ggml_tensor*>& tensors) override {
        lens.get_param_tensors(tensors, prefix);
    }
    size_t get_params_buffer_size() override {
        return lens.get_params_buffer_size();
    }
    void set_weight_adapter(const std::shared_ptr<WeightAdapter>& adapter) override {
        lens.set_weight_adapter(adapter);
    }
    int64_t get_adm_in_channels() override {
        return 768;
    }
    void set_flash_attention_enabled(bool enabled) {
        lens.set_flash_attention_enabled(enabled);
    }
    void set_max_graph_vram_bytes(size_t max_vram_bytes) override {
        lens.set_max_graph_vram_bytes(max_vram_bytes);
    }
    void set_circular_axes(bool circular_x, bool circular_y) override {
        lens.set_circular_axes(circular_x, circular_y);
    }
    sd::Tensor<float> compute(int n_threads,
                              const DiffusionParams& diffusion_params) override {
        GGML_ASSERT(diffusion_params.x != nullptr);
        GGML_ASSERT(diffusion_params.timesteps != nullptr);
        return lens.compute(n_threads,
                            *diffusion_params.x,
                            *diffusion_params.timesteps,
                            tensor_or_empty(diffusion_params.context));
    }
 };
 struct LTXAVModel : public DiffusionModel {
    std::string prefix;
    LTXV::LTXAVRunner ltxav;
--- a/src/lens.hpp
+++ b/src/lens.hpp
@ -0,0 +1,408 @@
 #ifndef __SD_LENS_HPP__
 #define __SD_LENS_HPP__
 #include <memory>
 #include <vector>
 #include "common_block.hpp"
 #include "flux.hpp"
 #include "qwen_image.hpp"
 #include "rope.hpp"
 namespace Lens {
    // Size argument handed to new_graph_custom() when LensRunner::build_graph creates the compute
    // graph (presumably the node capacity of that graph - confirm against GGMLRunner).
    constexpr int LENS_GRAPH_SIZE = 40960;
    // Timestep conditioning: builds a 256-wide timestep embedding and feeds it through a
    // Qwen::TimestepEmbedding block that outputs `embedding_dim` features.
    struct LensTimestepProjEmbeddings : public GGMLBlock {
        LensTimestepProjEmbeddings(int64_t embedding_dim) {
            auto embedder               = std::make_shared<Qwen::TimestepEmbedding>(256, embedding_dim);
            blocks["timestep_embedder"] = embedder;
        }
        // Returns the embedded form of `timesteps`.
        ggml_tensor* forward(GGMLRunnerContext* ctx, ggml_tensor* timesteps) {
            // The width (256) must match the input width the embedder was constructed with.
            auto projected = ggml_ext_timestep_embedding(ctx->ggml_ctx, timesteps, 256, 10000, 1000.f);
            auto embedder  = std::dynamic_pointer_cast<Qwen::TimestepEmbedding>(blocks["timestep_embedder"]);
            return embedder->forward(ctx, projected);
        }
    };
    // Gated feed-forward block computing w2(silu(w1(x)) * w3(x)).
    // All three Linear layers are built with `false` as third argument (presumably "no bias").
    struct LensGateMLP : public GGMLBlock {
        LensGateMLP(int64_t dim, int64_t hidden_dim) {
            blocks["w1"] = std::make_shared<Linear>(dim, hidden_dim, false);  // gate branch
            blocks["w2"] = std::make_shared<Linear>(hidden_dim, dim, false);  // output projection
            blocks["w3"] = std::make_shared<Linear>(dim, hidden_dim, false);  // value branch
        }
        ggml_tensor* forward(GGMLRunnerContext* ctx, ggml_tensor* x) {
            auto gate_proj = std::dynamic_pointer_cast<Linear>(blocks["w1"]);
            auto down_proj = std::dynamic_pointer_cast<Linear>(blocks["w2"]);
            auto up_proj   = std::dynamic_pointer_cast<Linear>(blocks["w3"]);
            auto* gctx     = ctx->ggml_ctx;
            // SiLU-activated gate multiplied element-wise with the un-activated branch.
            auto activated = ggml_silu(gctx, gate_proj->forward(ctx, x));
            auto lifted    = up_proj->forward(ctx, x);
            auto gated     = ggml_mul(gctx, activated, lifted);
            return down_proj->forward(ctx, gated);
        }
    };
    struct LensJointAttention : public GGMLBlock {
        int64_t dim_head;
        int64_t num_heads;
        LensJointAttention(int64_t query_dim,
                           int64_t dim_head,
                           int64_t num_heads,
                           float eps = 1e-5f)
            : dim_head(dim_head), num_heads(num_heads) {
            int64_t inner_dim = dim_head * num_heads;
            blocks["img_qkv"] = std::make_shared<Linear>(query_dim, inner_dim * 3, true);
            blocks["txt_qkv"] = std::make_shared<Linear>(query_dim, inner_dim * 3, true);
            blocks["norm_q"]       = std::make_shared<RMSNorm>(dim_head, eps);
            blocks["norm_k"]       = std::make_shared<RMSNorm>(dim_head, eps);
            blocks["norm_added_q"] = std::make_shared<RMSNorm>(dim_head, eps);
            blocks["norm_added_k"] = std::make_shared<RMSNorm>(dim_head, eps);
            blocks["to_out.0"]   = std::make_shared<Linear>(inner_dim, query_dim, true);
            blocks["to_add_out"] = std::make_shared<Linear>(inner_dim, query_dim, true);
        }
        std::pair<ggml_tensor*, ggml_tensor*> forward(GGMLRunnerContext* ctx,
                                                      ggml_tensor* img,
                                                      ggml_tensor* txt,
                                                      ggml_tensor* pe,
                                                      ggml_tensor* mask = nullptr) {
            auto img_qkv    = std::dynamic_pointer_cast<Linear>(blocks["img_qkv"]);
            auto txt_qkv    = std::dynamic_pointer_cast<Linear>(blocks["txt_qkv"]);
            auto norm_q     = std::dynamic_pointer_cast<RMSNorm>(blocks["norm_q"]);
            auto norm_k     = std::dynamic_pointer_cast<RMSNorm>(blocks["norm_k"]);
            auto norm_add_q = std::dynamic_pointer_cast<RMSNorm>(blocks["norm_added_q"]);
            auto norm_add_k = std::dynamic_pointer_cast<RMSNorm>(blocks["norm_added_k"]);
            auto to_out_0   = std::dynamic_pointer_cast<Linear>(blocks["to_out.0"]);
            auto to_add_out = std::dynamic_pointer_cast<Linear>(blocks["to_add_out"]);
            int64_t n_img   = img->ne[1];
            int64_t n_txt   = txt->ne[1];
            int64_t N       = img->ne[2];
            int64_t inner   = dim_head * num_heads;
            auto img_qkv_vec = split_qkv(ctx->ggml_ctx, img_qkv->forward(ctx, img));
            auto txt_qkv_vec = split_qkv(ctx->ggml_ctx, txt_qkv->forward(ctx, txt));
            auto img_q = ggml_reshape_4d(ctx->ggml_ctx, img_qkv_vec[0], dim_head, num_heads, n_img, N);
            auto img_k = ggml_reshape_4d(ctx->ggml_ctx, img_qkv_vec[1], dim_head, num_heads, n_img, N);
            auto img_v = ggml_reshape_4d(ctx->ggml_ctx, img_qkv_vec[2], dim_head, num_heads, n_img, N);
            img_q = norm_q->forward(ctx, img_q);
            img_k = norm_k->forward(ctx, img_k);
            auto txt_q = ggml_reshape_4d(ctx->ggml_ctx, txt_qkv_vec[0], dim_head, num_heads, n_txt, N);
            auto txt_k = ggml_reshape_4d(ctx->ggml_ctx, txt_qkv_vec[1], dim_head, num_heads, n_txt, N);
            auto txt_v = ggml_reshape_4d(ctx->ggml_ctx, txt_qkv_vec[2], dim_head, num_heads, n_txt, N);
            txt_q = norm_add_q->forward(ctx, txt_q);
            txt_k = norm_add_k->forward(ctx, txt_k);
            auto q = ggml_concat(ctx->ggml_ctx, img_q, txt_q, 2);
            auto k = ggml_concat(ctx->ggml_ctx, img_k, txt_k, 2);
            auto v = ggml_concat(ctx->ggml_ctx, img_v, txt_v, 2);
            auto attn = Rope::attention(ctx, q, k, v, pe, mask, (1.0f / 128.f));
            auto img_attn_out = ggml_view_3d(ctx->ggml_ctx,
                                             attn,
                                             inner,
                                             n_img,
                                             N,
                                             attn->nb[1],
                                             attn->nb[2],
                                             0);
            auto txt_attn_out = ggml_view_3d(ctx->ggml_ctx,
                                             attn,
                                             inner,
                                             n_txt,
                                             N,
                                             attn->nb[1],
                                             attn->nb[2],
                                             n_img * attn->nb[1]);
            img_attn_out = to_out_0->forward(ctx, ggml_cont(ctx->ggml_ctx, img_attn_out));
            txt_attn_out = to_add_out->forward(ctx, ggml_cont(ctx->ggml_ctx, txt_attn_out));
            return {img_attn_out, txt_attn_out};
        }
    };
    struct LensTransformerBlock : public GGMLBlock {
        LensTransformerBlock(int64_t dim,
                             int64_t num_attention_heads,
                             int64_t attention_head_dim,
                             float eps = 1e-6f) {
            int64_t mlp_hidden_dim = dim / 3 * 8;
            blocks["img_mod.1"]    = std::make_shared<Linear>(dim, 6 * dim, true);
            blocks["txt_mod.1"]    = std::make_shared<Linear>(dim, 6 * dim, true);
            blocks["img_norm1"]    = std::make_shared<RMSNorm>(dim, eps);
            blocks["img_norm2"]    = std::make_shared<RMSNorm>(dim, eps);
            blocks["txt_norm1"]    = std::make_shared<RMSNorm>(dim, eps);
            blocks["txt_norm2"]    = std::make_shared<RMSNorm>(dim, eps);
            blocks["img_mlp"]      = std::make_shared<LensGateMLP>(dim, mlp_hidden_dim);
            blocks["txt_mlp"]      = std::make_shared<LensGateMLP>(dim, mlp_hidden_dim);
            blocks["attn"]         = std::make_shared<LensJointAttention>(dim, attention_head_dim, num_attention_heads);
        }
        std::pair<ggml_tensor*, ggml_tensor*> forward(GGMLRunnerContext* ctx,
                                                      ggml_tensor* img,
                                                      ggml_tensor* txt,
                                                      ggml_tensor* t_emb,
                                                      ggml_tensor* pe) {
            auto img_mod_1 = std::dynamic_pointer_cast<Linear>(blocks["img_mod.1"]);
            auto txt_mod_1 = std::dynamic_pointer_cast<Linear>(blocks["txt_mod.1"]);
            auto img_norm1 = std::dynamic_pointer_cast<RMSNorm>(blocks["img_norm1"]);
            auto img_norm2 = std::dynamic_pointer_cast<RMSNorm>(blocks["img_norm2"]);
            auto txt_norm1 = std::dynamic_pointer_cast<RMSNorm>(blocks["txt_norm1"]);
            auto txt_norm2 = std::dynamic_pointer_cast<RMSNorm>(blocks["txt_norm2"]);
            auto img_mlp   = std::dynamic_pointer_cast<LensGateMLP>(blocks["img_mlp"]);
            auto txt_mlp   = std::dynamic_pointer_cast<LensGateMLP>(blocks["txt_mlp"]);
            auto attn      = std::dynamic_pointer_cast<LensJointAttention>(blocks["attn"]);
            auto temb = ggml_silu(ctx->ggml_ctx, t_emb);
            auto img_mod_params = img_mod_1->forward(ctx, temb);
            auto img_mod_vec    = ggml_ext_chunk(ctx->ggml_ctx, img_mod_params, 6, 0);
            auto txt_mod_params = txt_mod_1->forward(ctx, temb);
            auto txt_mod_vec    = ggml_ext_chunk(ctx->ggml_ctx, txt_mod_params, 6, 0);
            auto img_normed    = img_norm1->forward(ctx, img);
            auto img_modulated = Flux::modulate(ctx->ggml_ctx, img_normed, img_mod_vec[0], img_mod_vec[1]);
            auto txt_normed    = txt_norm1->forward(ctx, txt);
            auto txt_modulated = Flux::modulate(ctx->ggml_ctx, txt_normed, txt_mod_vec[0], txt_mod_vec[1]);
            auto [img_attn_output, txt_attn_output] = attn->forward(ctx, img_modulated, txt_modulated, pe);
            img = ggml_add(ctx->ggml_ctx, img, ggml_mul(ctx->ggml_ctx, img_attn_output, img_mod_vec[2]));
            txt = ggml_add(ctx->ggml_ctx, txt, ggml_mul(ctx->ggml_ctx, txt_attn_output, txt_mod_vec[2]));
            auto img_normed2    = img_norm2->forward(ctx, img);
            auto img_modulated2 = Flux::modulate(ctx->ggml_ctx, img_normed2, img_mod_vec[3], img_mod_vec[4]);
            auto txt_normed2    = txt_norm2->forward(ctx, txt);
            auto txt_modulated2 = Flux::modulate(ctx->ggml_ctx, txt_normed2, txt_mod_vec[3], txt_mod_vec[4]);
            img = ggml_add(ctx->ggml_ctx, img, ggml_mul(ctx->ggml_ctx, img_mlp->forward(ctx, img_modulated2), img_mod_vec[5]));
            txt = ggml_add(ctx->ggml_ctx, txt, ggml_mul(ctx->ggml_ctx, txt_mlp->forward(ctx, txt_modulated2), txt_mod_vec[5]));
            return {img, txt};
        }
    };
    // Final adaptive norm: the conditioning vector is SiLU-activated and projected to
    // 2 * hidden_size values; the first half is used as scale and the second half as shift
    // for ggml_norm(x).
    struct LensAdaLayerNormContinuous : public GGMLBlock {
        int64_t hidden_size;
        float eps;
        LensAdaLayerNormContinuous(int64_t hidden_size, float eps = 1e-6f)
            : hidden_size(hidden_size), eps(eps) {
            blocks["linear"] = std::make_shared<Linear>(hidden_size, hidden_size * 2, true);
        }
        ggml_tensor* forward(GGMLRunnerContext* ctx, ggml_tensor* x, ggml_tensor* conditioning) {
            auto* gctx      = ctx->ggml_ctx;
            auto projection = std::dynamic_pointer_cast<Linear>(blocks["linear"]);
            auto activated  = ggml_silu(gctx, conditioning);
            auto halves     = ggml_ext_chunk(gctx, projection->forward(ctx, activated), 2, 0);
            auto normalized = ggml_norm(gctx, x, eps);
            // Flux::modulate takes (shift, scale) in that order: halves[1] is shift, halves[0] is scale.
            return Flux::modulate(gctx, normalized, halves[1], halves[0]);
        }
    };
    // Hyper-parameters of the Lens transformer. The values below are defaults; LensRunner's
    // constructor overwrites in_channels, out_channels, num_layers, attention_head_dim,
    // num_attention_heads, selected_layer_count and axes_dim_sum from the loaded tensors.
    struct LensParams {
        // proj_out emits patch_size * patch_size * out_channels features per token.
        int patch_size              = 2;
        // Input width of the img_in projection.
        int64_t in_channels         = 128;
        int64_t out_channels        = 32;
        // Number of "transformer_blocks.N" entries.
        int num_layers              = 48;
        // inner_dim of the model is num_attention_heads * attention_head_dim.
        int64_t attention_head_dim  = 64;
        int64_t num_attention_heads = 24;
        // Width of one text-context chunk (each chunk gets its own txt_norm.N).
        int64_t joint_attention_dim = 2880;
        // Number of chunks the context is split into along dimension 0.
        int selected_layer_count    = 4;
        // theta and axes_dim are passed to Rope::gen_lens_pe.
        int theta                   = 10000;
        std::vector<int> axes_dim   = {8, 28, 28};
        // Sum of axes_dim; recomputed by LensRunner.
        int axes_dim_sum            = 64;
    };
    // The Lens diffusion transformer: input projections for image and text, a stack of
    // dual-stream transformer blocks, and an adaptive-norm + linear output head.
    class LensModel : public GGMLBlock {
    public:
        LensParams params;
        LensModel() = default;
        LensModel(LensParams params)
            : params(params) {
            const int64_t inner_dim   = params.num_attention_heads * params.attention_head_dim;
            const int64_t txt_in_dim  = params.joint_attention_dim * params.selected_layer_count;
            const int64_t proj_out_ch = params.patch_size * params.patch_size * params.out_channels;
            blocks["time_text_embed"] = std::make_shared<LensTimestepProjEmbeddings>(inner_dim);
            blocks["img_in"]          = std::make_shared<Linear>(params.in_channels, inner_dim, true);
            blocks["txt_in"]          = std::make_shared<Linear>(txt_in_dim, inner_dim, true);
            // One RMSNorm per selected text-encoder layer.
            for (int layer = 0; layer < params.selected_layer_count; ++layer) {
                const std::string key = "txt_norm." + std::to_string(layer);
                blocks[key]           = std::make_shared<RMSNorm>(params.joint_attention_dim, 1e-5f);
            }
            for (int layer = 0; layer < params.num_layers; ++layer) {
                const std::string key = "transformer_blocks." + std::to_string(layer);
                blocks[key]           = std::make_shared<LensTransformerBlock>(inner_dim,
                                                                               params.num_attention_heads,
                                                                               params.attention_head_dim);
            }
            blocks["norm_out"] = std::make_shared<LensAdaLayerNormContinuous>(inner_dim, 1e-6f);
            blocks["proj_out"] = std::make_shared<Linear>(inner_dim, proj_out_ch, true);
        }
        // Builds the forward graph. `x` is read as a 4-D tensor whose dims are named W, H, C, N
        // below; `context` must be non-null and is split into selected_layer_count chunks along
        // dimension 0. The result is reshaped to (W, H, patch_size^2 * out_channels, N).
        ggml_tensor* forward(GGMLRunnerContext* ctx,
                             ggml_tensor* x,
                             ggml_tensor* timestep,
                             ggml_tensor* context,
                             ggml_tensor* pe) {
            GGML_ASSERT(context != nullptr);
            auto* gctx      = ctx->ggml_ctx;
            const int64_t W = x->ne[0];
            const int64_t H = x->ne[1];
            const int64_t C = x->ne[2];
            const int64_t N = x->ne[3];
            auto time_embedder = std::dynamic_pointer_cast<LensTimestepProjEmbeddings>(blocks["time_text_embed"]);
            auto img_proj      = std::dynamic_pointer_cast<Linear>(blocks["img_in"]);
            auto txt_proj      = std::dynamic_pointer_cast<Linear>(blocks["txt_in"]);
            auto final_norm    = std::dynamic_pointer_cast<LensAdaLayerNormContinuous>(blocks["norm_out"]);
            auto final_proj    = std::dynamic_pointer_cast<Linear>(blocks["proj_out"]);
            auto t_emb = time_embedder->forward(ctx, timestep);
            // Flatten the two leading dims, then swap the first two axes so channels lead.
            auto flat = ggml_reshape_3d(gctx, x, W * H, C, N);
            auto img  = ggml_cont(gctx, ggml_ext_torch_permute(gctx, flat, 1, 0, 2, 3));
            img       = img_proj->forward(ctx, img);
            // Normalise every context chunk with its own RMSNorm and re-join them in order.
            std::vector<ggml_tensor*> chunks = ggml_ext_chunk(gctx, context, params.selected_layer_count, 0);
            ggml_tensor* txt                 = nullptr;
            for (int layer = 0; layer < params.selected_layer_count; ++layer) {
                auto chunk_norm = std::dynamic_pointer_cast<RMSNorm>(blocks["txt_norm." + std::to_string(layer)]);
                auto normed     = chunk_norm->forward(ctx, chunks[layer]);
                if (txt == nullptr) {
                    txt = normed;
                } else {
                    txt = ggml_concat(gctx, txt, normed, 0);
                }
            }
            txt = txt_proj->forward(ctx, txt);
            sd::ggml_graph_cut::mark_graph_cut(img, "lens.prelude", "img");
            sd::ggml_graph_cut::mark_graph_cut(txt, "lens.prelude", "txt");
            for (int layer = 0; layer < params.num_layers; ++layer) {
                const std::string index = std::to_string(layer);
                auto block              = std::dynamic_pointer_cast<LensTransformerBlock>(blocks["transformer_blocks." + index]);
                auto streams            = block->forward(ctx, img, txt, t_emb, pe);
                img                     = streams.first;
                txt                     = streams.second;
                sd::ggml_graph_cut::mark_graph_cut(img, "lens.transformer_blocks." + index, "img");
                sd::ggml_graph_cut::mark_graph_cut(txt, "lens.transformer_blocks." + index, "txt");
            }
            // Only the image stream feeds the output head.
            img = final_norm->forward(ctx, img, t_emb);
            img = final_proj->forward(ctx, img);
            // Undo the axis swap and restore the 4-D layout.
            auto swapped = ggml_cont(gctx, ggml_ext_torch_permute(gctx, img, 1, 0, 2, 3));
            return ggml_reshape_4d(gctx, swapped, W, H, params.patch_size * params.patch_size * params.out_channels, N);
        }
    };
    // GGMLRunner that owns a LensModel: infers the model hyper-parameters from the loaded
    // tensor shapes, builds the compute graph and runs it.
    struct LensRunner : public GGMLRunner {
        LensParams lens_params;
        LensModel lens;
        // Backing storage for the positional-embedding tensor; kept as a member so the data
        // outlives build_graph().
        std::vector<float> pe_vec;
        // Scans every tensor whose name starts with `prefix` and derives:
        //   in_channels / inner_dim   from img_in.weight
        //   selected_layer_count      from txt_in.weight
        //   out_channels              from proj_out.weight
        //   attention_head_dim        from transformer_blocks.0.attn.norm_q.weight
        //   num_layers                from the largest transformer_blocks.<N> index
        // Falls back to 48 layers when no transformer block is found.
        LensRunner(ggml_backend_t backend,
                   ggml_backend_t params_backend,
                   const String2TensorStorage& tensor_storage_map = {},
                   const std::string prefix                       = "")
            : GGMLRunner(backend, params_backend) {
            lens_params.num_layers = 0;
            // Output width of img_in; stays 0 when img_in.weight is not present.
            int64_t inner_dim = 0;
            for (const auto& [name, tensor_storage] : tensor_storage_map) {
                if (!starts_with(name, prefix)) {
                    continue;
                }
                if (ends_with(name, "img_in.weight") && tensor_storage.n_dims == 2) {
                    lens_params.in_channels = tensor_storage.ne[0];
                    inner_dim               = tensor_storage.ne[1];
                } else if (ends_with(name, "txt_in.weight") && tensor_storage.n_dims == 2) {
                    lens_params.selected_layer_count = static_cast<int>(tensor_storage.ne[0] / lens_params.joint_attention_dim);
                } else if (ends_with(name, "proj_out.weight") && tensor_storage.n_dims == 2) {
                    lens_params.out_channels = tensor_storage.ne[1] / lens_params.patch_size / lens_params.patch_size;
                } else if (ends_with(name, "transformer_blocks.0.attn.norm_q.weight") && tensor_storage.n_dims == 1) {
                    lens_params.attention_head_dim = tensor_storage.ne[0];
                }
                size_t pos = name.find("transformer_blocks.");
                if (pos != std::string::npos) {
                    std::string layer_name = name.substr(pos);
                    auto items             = split_string(layer_name, '.');
                    if (items.size() > 1) {
                        int block_index = atoi(items[1].c_str());
                        if (block_index + 1 > lens_params.num_layers) {
                            lens_params.num_layers = block_index + 1;
                        }
                    }
                }
            }
            // The head count depends on both img_in.weight and norm_q.weight. It is computed
            // only after the scan so the result does not depend on which tensor is visited
            // first (the head dim may be read after img_in.weight).
            if (inner_dim > 0 && lens_params.attention_head_dim > 0) {
                lens_params.num_attention_heads = inner_dim / lens_params.attention_head_dim;
            }
            if (lens_params.num_layers == 0) {
                lens_params.num_layers = 48;
            }
            lens_params.axes_dim_sum = 0;
            for (int axis_dim : lens_params.axes_dim) {
                lens_params.axes_dim_sum += axis_dim;
            }
            LOG_INFO("lens: layers = %d, in_channels = %" PRId64 ", out_channels = %" PRId64
                     ", heads = %" PRId64 ", head_dim = %" PRId64,
                     lens_params.num_layers,
                     lens_params.in_channels,
                     lens_params.out_channels,
                     lens_params.num_attention_heads,
                     lens_params.attention_head_dim);
            lens = LensModel(lens_params);
            lens.init(params_ctx, tensor_storage_map, prefix);
        }
        std::string get_desc() override {
            return "lens";
        }
        // Collects the model's parameter tensors into `tensors` under `prefix`.
        void get_param_tensors(std::map<std::string, ggml_tensor*>& tensors, const std::string prefix) {
            lens.get_param_tensors(tensors, prefix);
        }
        // Builds the compute graph for one forward pass. Requires x->ne[3] == 1 and a non-empty
        // context (both asserted).
        ggml_cgraph* build_graph(const sd::Tensor<float>& x_tensor,
                                 const sd::Tensor<float>& timesteps_tensor,
                                 const sd::Tensor<float>& context_tensor) {
            ggml_cgraph* gf        = new_graph_custom(LENS_GRAPH_SIZE);
            ggml_tensor* x         = make_input(x_tensor);
            ggml_tensor* timesteps = make_input(timesteps_tensor);
            GGML_ASSERT(x->ne[3] == 1);
            GGML_ASSERT(!context_tensor.empty());
            ggml_tensor* context = make_input(context_tensor);
            // Positional embedding generated from x->ne[1], x->ne[0], x->ne[3] and context->ne[1]
            // (LensModel::forward names x's dims W = ne[0], H = ne[1]).
            pe_vec      = Rope::gen_lens_pe(static_cast<int>(x->ne[1]),
                                            static_cast<int>(x->ne[0]),
                                            static_cast<int>(x->ne[3]),
                                            static_cast<int>(context->ne[1]),
                                            lens_params.theta,
                                            circular_y_enabled,
                                            circular_x_enabled,
                                            lens_params.axes_dim);
            // pe_vec holds pos_len * (axes_dim_sum / 2) * 2 * 2 floats, matching the tensor below.
            int pos_len = static_cast<int>(pe_vec.size() / lens_params.axes_dim_sum / 2);
            auto pe     = ggml_new_tensor_4d(compute_ctx, GGML_TYPE_F32, 2, 2, lens_params.axes_dim_sum / 2, pos_len);
            set_backend_tensor_data(pe, pe_vec.data());
            auto runner_ctx  = get_context();
            ggml_tensor* out = lens.forward(&runner_ctx, x, timesteps, context, pe);
            ggml_build_forward_expand(gf, out);
            return gf;
        }
        // Runs the graph and returns the result with trailing singleton dims restored to x.dim().
        sd::Tensor<float> compute(int n_threads,
                                  const sd::Tensor<float>& x,
                                  const sd::Tensor<float>& timesteps,
                                  const sd::Tensor<float>& context) {
            auto get_graph = [&]() -> ggml_cgraph* {
                return build_graph(x, timesteps, context);
            };
            return restore_trailing_singleton_dims(GGMLRunner::compute<float>(get_graph, n_threads, false), x.dim());
        }
    };
 }  // namespace Lens
 #endif  // __SD_LENS_HPP__
--- a/src/llm.hpp
+++ b/src/llm.hpp
@ -23,11 +23,12 @@
 #include "rope.hpp"
 #include "tokenizers/bpe_tokenizer.h"
 #include "tokenizers/gemma_tokenizer.h"
 #include "tokenizers/gpt_oss_tokenizer.h"
 #include "tokenizers/mistral_tokenizer.h"
 #include "tokenizers/qwen2_tokenizer.h"
 namespace LLM {
-    constexpr int LLM_GRAPH_SIZE = 10240;
+    constexpr int LLM_GRAPH_SIZE = 65536;
    enum class LLMArch {
        QWEN2_5_VL,
@ -36,6 +37,7 @@ namespace LLM {
        MISTRAL_SMALL_3_2,
        MINISTRAL_3_3B,
        GEMMA3_12B,
        GPT_OSS_20B,
        ARCH_COUNT,
    };
@ -46,6 +48,7 @@ namespace LLM {
        "mistral_small3.2",
        "ministral3.3b",
        "gemma3_12b",
        "gpt_oss_20b",
    };
    enum class MLPActivation {
@ -83,6 +86,7 @@ namespace LLM {
        int num_kv_heads                = 4;
        int head_dim                    = 128;
        bool qkv_bias                   = true;
        bool attention_out_bias         = false;
        bool qk_norm                    = false;
        bool rms_norm_add               = false;
        bool normalize_input            = false;
@ -93,6 +97,8 @@ namespace LLM {
        std::vector<float> rope_thetas  = {1000000.f};
        std::vector<float> rope_scales  = {1.f};
        std::vector<int> sliding_attention;
        int64_t num_experts         = 0;
        int64_t num_experts_per_tok = 0;
        LLMVisionParams vision;
    };
@ -163,6 +169,170 @@ namespace LLM {
        }
    };
    // Mixture-of-experts feed-forward block. A router scores every token against
    // `num_experts` experts, the `num_experts_per_tok` best-scoring experts are
    // evaluated per token, and their outputs are blended with softmax weights
    // computed over the selected scores only.
    //
    // Two checkpoint layouts are accepted for the first projection: a single
    // combined "experts.gate_up_proj" tensor, or separate "experts.gate_proj" and
    // "experts.up_proj" tensors. Which one is used is decided in init_params() from
    // the tensor names present in the storage map.
    struct GPTOSSMLP : public GGMLBlock {
    protected:
        int64_t hidden_size;
        int64_t intermediate_size;
        int64_t num_experts;
        int64_t num_experts_per_tok;
        // Set in init_params(); true when the checkpoint stores gate and up in one tensor.
        bool has_combined_gate_up = false;
        // Allocates the router and expert parameter tensors.
        //
        // ctx:                ggml context the tensors are created in.
        // tensor_storage_map: used to look up each weight's stored type and to detect
        //                     the combined gate/up layout.
        // prefix:             prepended to the parameter names for those lookups.
        void init_params(ggml_context* ctx,
                         const String2TensorStorage& tensor_storage_map = {},
                         std::string prefix                             = "") override {
            // Keep the stored weight type only when the row length is a whole number
            // of that type's blocks; otherwise fall back to F32.
            auto supported_type = [](ggml_type wtype, int64_t in_features) {
                if (in_features % ggml_blck_size(wtype) != 0) {
                    return GGML_TYPE_F32;
                }
                return wtype;
            };
            params["router.weight"] = ggml_new_tensor_2d(ctx,
                                                         supported_type(get_type(prefix + "router.weight", tensor_storage_map, GGML_TYPE_F32), hidden_size),
                                                         hidden_size,
                                                         num_experts);
            params["router.bias"]   = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, num_experts);
            // The layout is chosen by the presence of the combined weight in the checkpoint.
            has_combined_gate_up = tensor_storage_map.find(prefix + "experts.gate_up_proj.weight") != tensor_storage_map.end();
            if (has_combined_gate_up) {
                ggml_type gate_up_type                = supported_type(get_type(prefix + "experts.gate_up_proj.weight", tensor_storage_map, GGML_TYPE_F32), hidden_size);
                params["experts.gate_up_proj.weight"] = ggml_new_tensor_3d(ctx,
                                                                           gate_up_type,
                                                                           hidden_size,
                                                                           intermediate_size * 2,
                                                                           num_experts);
                params["experts.gate_up_proj.bias"]   = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, intermediate_size * 2, num_experts);
            } else {
                ggml_type gate_type                = supported_type(get_type(prefix + "experts.gate_proj.weight", tensor_storage_map, GGML_TYPE_F32), hidden_size);
                ggml_type up_type                  = supported_type(get_type(prefix + "experts.up_proj.weight", tensor_storage_map, GGML_TYPE_F32), hidden_size);
                params["experts.gate_proj.weight"] = ggml_new_tensor_3d(ctx, gate_type, hidden_size, intermediate_size, num_experts);
                params["experts.up_proj.weight"]   = ggml_new_tensor_3d(ctx, up_type, hidden_size, intermediate_size, num_experts);
                params["experts.gate_proj.bias"]   = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, intermediate_size, num_experts);
                params["experts.up_proj.bias"]     = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, intermediate_size, num_experts);
            }
            // The down projection consumes intermediate_size features, so that is the
            // row length checked against the block size here.
            ggml_type down_type                = supported_type(get_type(prefix + "experts.down_proj.weight", tensor_storage_map, GGML_TYPE_F32), intermediate_size);
            params["experts.down_proj.weight"] = ggml_new_tensor_3d(ctx, down_type, intermediate_size, hidden_size, num_experts);
            params["experts.down_proj.bias"]   = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, hidden_size, num_experts);
        }
        // Applies a per-expert linear layer: an id-indexed matmul with the weight
        // named `weight_name`, followed by an id-indexed bias add when a parameter
        // named `bias_name` exists.
        //
        // selected_experts: expert indices produced by the router in forward().
        ggml_tensor* expert_linear(GGMLRunnerContext* ctx,
                                   const std::string& weight_name,
                                   const std::string& bias_name,
                                   ggml_tensor* x,
                                   ggml_tensor* selected_experts) {
            auto out = ggml_mul_mat_id(ctx->ggml_ctx, params[weight_name], x, selected_experts);
            auto it  = params.find(bias_name);
            if (it != params.end()) {
                out = ggml_add_id(ctx->ggml_ctx, out, it->second, selected_experts);
            }
            return out;
        }
    public:
        // Copies the sizes and expert counts this block needs out of `params`.
        GPTOSSMLP(const LLMParams& params)
            : hidden_size(params.hidden_size),
              intermediate_size(params.intermediate_size),
              num_experts(params.num_experts),
              num_experts_per_tok(params.num_experts_per_tok) {}
        // Builds the MoE feed-forward sub-graph for `x` and returns a tensor
        // reshaped to (hidden_size, n_token, N) in ggml dimension order.
        ggml_tensor* forward(GGMLRunnerContext* ctx, ggml_tensor* x) {
            // x: [N, n_token, hidden_size]
            GGML_ASSERT(num_experts > 0 && num_experts_per_tok > 0);
            const int64_t n_token       = x->ne[1];
            const int64_t N             = x->ne[2];
            const int64_t n_token_total = n_token * N;
            ggml_tensor* router_weight  = params["router.weight"];
            ggml_tensor* router_bias    = params["router.bias"];
            // Router: one logit per expert for every token, with batch and token
            // dimensions flattened together.
            ggml_tensor* router_logits  = ggml_mul_mat(ctx->ggml_ctx, router_weight, x);
            router_logits               = ggml_add(ctx->ggml_ctx, router_logits, router_bias);
            router_logits               = ggml_reshape_2d(ctx->ggml_ctx, router_logits, num_experts, n_token_total);
            ggml_tensor* selected_experts = ggml_argsort_top_k(ctx->ggml_ctx, router_logits, (int)num_experts_per_tok);  // [top_k, tokens]
            // Gather the logits of the selected experts, then softmax over just those
            // top_k values (not over all experts).
            ggml_tensor* probs            = ggml_reshape_3d(ctx->ggml_ctx, router_logits, 1, num_experts, n_token_total);
            ggml_tensor* weights          = ggml_get_rows(ctx->ggml_ctx, probs, selected_experts);  // [1, top_k, tokens]
            weights                       = ggml_reshape_2d(ctx->ggml_ctx, weights, num_experts_per_tok, n_token_total);
            weights                       = ggml_soft_max(ctx->ggml_ctx, weights);
            weights                       = ggml_reshape_3d(ctx->ggml_ctx, weights, 1, num_experts_per_tok, n_token_total);
            // Insert a singleton dimension between features and tokens before the
            // id-indexed matmuls.
            x = ggml_reshape_3d(ctx->ggml_ctx, x, hidden_size, 1, n_token_total);
            ggml_tensor* gate = nullptr;
            ggml_tensor* up   = nullptr;
            if (has_combined_gate_up) {
                auto gate_up = expert_linear(ctx,
                                             "experts.gate_up_proj.weight",
                                             "experts.gate_up_proj.bias",
                                             x,
                                             selected_experts);  // [2 * intermediate, top_k, tokens]
                // Split the combined output as interleaved pairs: after reshaping the
                // leading axis to size 2, `gate` reads element 0 of each pair (offset 0)
                // and `up` reads element 1 (offset nb[0]).
                gate_up      = ggml_reshape_4d(ctx->ggml_ctx,
                                               gate_up,
                                               2,
                                               intermediate_size,
                                               num_experts_per_tok,
                                               n_token_total);
                gate         = ggml_view_4d(ctx->ggml_ctx,
                                            gate_up,
                                            1,
                                            intermediate_size,
                                            num_experts_per_tok,
                                            n_token_total,
                                            gate_up->nb[1],
                                            gate_up->nb[2],
                                            gate_up->nb[3],
                                            0);
                up           = ggml_view_4d(ctx->ggml_ctx,
                                            gate_up,
                                            1,
                                            intermediate_size,
                                            num_experts_per_tok,
                                            n_token_total,
                                            gate_up->nb[1],
                                            gate_up->nb[2],
                                            gate_up->nb[3],
                                            gate_up->nb[0]);
                // The views are strided, so they are copied to contiguous memory before
                // the size-1 leading axis is dropped.
                gate         = ggml_reshape_3d(ctx->ggml_ctx, ggml_cont(ctx->ggml_ctx, gate), intermediate_size, num_experts_per_tok, n_token_total);
                up           = ggml_reshape_3d(ctx->ggml_ctx, ggml_cont(ctx->ggml_ctx, up), intermediate_size, num_experts_per_tok, n_token_total);
            } else {
                gate = expert_linear(ctx,
                                     "experts.gate_proj.weight",
                                     "experts.gate_proj.bias",
                                     x,
                                     selected_experts);
                up   = expert_linear(ctx,
                                     "experts.up_proj.weight",
                                     "experts.up_proj.bias",
                                     x,
                                     selected_experts);
            }
            // NOTE(review): 1.702f and 7.0f are presumably the alpha and clamp limit of
            // the SwiGLU variant implemented by ggml_swiglu_oai - confirm against ggml.h.
            auto activated = ggml_swiglu_oai(ctx->ggml_ctx, gate, up, 1.702f, 7.0f);
            auto experts   = expert_linear(ctx,
                                           "experts.down_proj.weight",
                                           "experts.down_proj.bias",
                                           activated,
                                           selected_experts);
            // Scale each selected expert's output by its routing weight.
            experts        = ggml_mul(ctx->ggml_ctx, experts, weights);
            // Sum the top_k expert outputs per token: slot i is exposed as a 2D view
            // over all tokens (row stride nb[2], starting offset i * nb[1]).
            ggml_tensor* out = nullptr;
            for (int64_t i = 0; i < num_experts_per_tok; ++i) {
                auto expert_out = ggml_view_2d(ctx->ggml_ctx,
                                               experts,
                                               hidden_size,
                                               n_token_total,
                                               experts->nb[2],
                                               i * experts->nb[1]);
                out             = out == nullptr ? expert_out : ggml_add(ctx->ggml_ctx, out, expert_out);
            }
            // With a single expert no add was performed, so `out` is still the raw view;
            // presumably it is made contiguous so the reshape below is valid - confirm.
            if (num_experts_per_tok == 1) {
                out = ggml_cont(ctx->ggml_ctx, out);
            }
            return ggml_reshape_3d(ctx->ggml_ctx, out, hidden_size, n_token, N);
        }
    };
    static ggml_tensor* splice_image_embeds(GGMLRunnerContext* ctx,
                                            ggml_tensor* x,
                                            const std::vector<std::pair<int, ggml_tensor*>>& image_embeds) {
@ -601,6 +771,15 @@ namespace LLM {
        int64_t max_position_embeddings;
        std::vector<float> rope_thetas;
        std::vector<float> rope_scales;
        bool has_attention_sinks;
        // Registers the attention block's own parameters. The only one is the
        // "sinks" vector (one F32 value per attention head), and it is created
        // only when has_attention_sinks is set.
        //
        // tensor_storage_map and prefix are part of the overridden interface and
        // are not consulted here.
        void init_params(ggml_context* ctx,
                         const String2TensorStorage& tensor_storage_map = {},
                         std::string prefix                             = "") override {
            if (!has_attention_sinks) {
                return;
            }
            params["sinks"] = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, num_heads);
        }
    public:
        Attention(const LLMParams& params)
@ -611,11 +790,12 @@ namespace LLM {
              qk_norm(params.qk_norm),
              max_position_embeddings(params.max_position_embeddings),
              rope_thetas(params.rope_thetas),
-              rope_scales(params.rope_scales) {
+              rope_scales(params.rope_scales),
              has_attention_sinks(params.arch == LLMArch::GPT_OSS_20B) {
            blocks["q_proj"] = std::make_shared<Linear>(params.hidden_size, num_heads * head_dim, params.qkv_bias);
            blocks["k_proj"] = std::make_shared<Linear>(params.hidden_size, num_kv_heads * head_dim, params.qkv_bias);
            blocks["v_proj"] = std::make_shared<Linear>(params.hidden_size, num_kv_heads * head_dim, params.qkv_bias);
-            blocks["o_proj"] = std::make_shared<Linear>(num_heads * head_dim, params.hidden_size, false);
+            blocks["o_proj"] = std::make_shared<Linear>(num_heads * head_dim, params.hidden_size, params.attention_out_bias);
            if (params.qk_norm) {
                blocks["q_norm"] = std::make_shared<LLMRMSNorm>(head_dim, params.rms_norm_eps, params.rms_norm_add);
                blocks["k_norm"] = std::make_shared<LLMRMSNorm>(head_dim, params.rms_norm_eps, params.rms_norm_add);
@ -660,6 +840,36 @@ namespace LLM {
            } else if (arch == LLMArch::QWEN3) {
                q = ggml_rope_ext(ctx->ggml_ctx, q, input_pos, nullptr, 128, GGML_ROPE_TYPE_NEOX, 40960, 1000000.f, 1.f, 0.f, 1.f, 32.f, 1.f);
                k = ggml_rope_ext(ctx->ggml_ctx, k, input_pos, nullptr, 128, GGML_ROPE_TYPE_NEOX, 40960, 1000000.f, 1.f, 0.f, 1.f, 32.f, 1.f);
            } else if (arch == LLMArch::GPT_OSS_20B) {
                float rope_theta = rope_thetas.empty() ? 150000.f : rope_thetas[0];
                float rope_scale = rope_scales.empty() ? 32.f : rope_scales[0];
                float freq_scale = 1.f / rope_scale;
                q                = ggml_rope_ext(ctx->ggml_ctx,
                                                 q,
                                                 input_pos,
                                                 nullptr,
                                                 head_dim,
                                                 GGML_ROPE_TYPE_NEOX,
                                                 4096,
                                                 rope_theta,
                                                 freq_scale,
                                                 1.f,
                                                 1.f,
                                                 32.f,
                                                 1.f);
                k                = ggml_rope_ext(ctx->ggml_ctx,
                                                 k,
                                                 input_pos,
                                                 nullptr,
                                                 head_dim,
                                                 GGML_ROPE_TYPE_NEOX,
                                                 4096,
                                                 rope_theta,
                                                 freq_scale,
                                                 1.f,
                                                 1.f,
                                                 32.f,
                                                 1.f);
            } else if (arch == LLMArch::GEMMA3_12B) {
                float rope_theta = (rope_index == 1 ? 10000.0f : 1000000.0f);
                float rope_scale = (rope_index == 1 ? 1.f : 8.f);
@ -706,7 +916,28 @@ namespace LLM {
            k = ggml_cont(ctx->ggml_ctx, ggml_ext_torch_permute(ctx->ggml_ctx, k, 0, 2, 1, 3));  // [N, num_kv_heads, n_token, head_dim]
            k = ggml_reshape_3d(ctx->ggml_ctx, k, k->ne[0], k->ne[1], k->ne[2] * k->ne[3]);      // [N*num_kv_heads, n_token, head_dim]
            if (arch == LLMArch::GPT_OSS_20B) {
                GGML_ASSERT(N == 1);
                auto v_attn = ggml_ext_cont(ctx->ggml_ctx, ggml_permute(ctx->ggml_ctx, v, 1, 2, 0, 3));  // [N, kv_heads, head_dim, tokens]
                v_attn      = ggml_reshape_3d(ctx->ggml_ctx, v_attn, n_token, head_dim, num_kv_heads * N);
                auto kq = ggml_mul_mat(ctx->ggml_ctx, k, q);
                ggml_mul_mat_set_prec(kq, GGML_PREC_F32);
                kq = ggml_scale_inplace(ctx->ggml_ctx, kq, 1.0f / std::sqrt(static_cast<float>(head_dim)));
                if (attention_mask != nullptr) {
                    kq = ggml_add_inplace(ctx->ggml_ctx, kq, attention_mask);
                }
                kq = ggml_soft_max_inplace(ctx->ggml_ctx, kq);
                ggml_soft_max_add_sinks(kq, params["sinks"]);
                auto kqv = ggml_mul_mat(ctx->ggml_ctx, v_attn, kq);
                kqv      = ggml_reshape_4d(ctx->ggml_ctx, kqv, head_dim, n_token, num_heads, N);
                kqv      = ggml_permute(ctx->ggml_ctx, kqv, 0, 2, 1, 3);
                x        = ggml_ext_cont(ctx->ggml_ctx, kqv);
                x        = ggml_reshape_3d(ctx->ggml_ctx, x, head_dim * num_heads, n_token, N);
            } else {
                x = ggml_ext_attention_ext(ctx->ggml_ctx, ctx->backend, q, k, v, num_heads, attention_mask, true, false);  // [N, n_token, hidden_size]
            }
            x = out_proj->forward(ctx, x);  // [N, n_token, hidden_size]
            return x;
@ -727,10 +958,14 @@ namespace LLM {
              has_post_attention_norm(params.arch == LLMArch::GEMMA3_12B),
              has_post_ffw_norm(params.arch == LLMArch::GEMMA3_12B) {
            blocks["self_attn"] = std::make_shared<Attention>(params);
            if (params.arch == LLMArch::GPT_OSS_20B) {
                blocks["mlp"] = std::make_shared<GPTOSSMLP>(params);
            } else {
                blocks["mlp"] = std::make_shared<MLP>(params.hidden_size,
                                                      params.intermediate_size,
                                                      false,
                                                      params.mlp_activation);
            }
            blocks["input_layernorm"]          = std::make_shared<LLMRMSNorm>(params.hidden_size, params.rms_norm_eps, params.rms_norm_add);
            blocks["post_attention_layernorm"] = std::make_shared<LLMRMSNorm>(params.hidden_size, params.rms_norm_eps, params.rms_norm_add);
            if (has_post_attention_norm) {
@ -751,7 +986,6 @@ namespace LLM {
                             ggml_tensor* sliding_attention_mask = nullptr) {
            // x: [N, n_token, hidden_size]
            auto self_attn                                  = std::dynamic_pointer_cast<Attention>(blocks["self_attn"]);
            auto mlp                                        = std::dynamic_pointer_cast<MLP>(blocks["mlp"]);
            auto input_layernorm                            = std::dynamic_pointer_cast<LLMRMSNorm>(blocks["input_layernorm"]);
            auto post_attention_layernorm                   = std::dynamic_pointer_cast<LLMRMSNorm>(blocks["post_attention_layernorm"]);
            std::shared_ptr<LLMRMSNorm> post_attention_norm = nullptr;
@ -764,7 +998,7 @@ namespace LLM {
            }
            ggml_tensor* block_attention_mask = attention_mask;
            int rope_index                    = 0;
-            if (arch == LLMArch::GEMMA3_12B && sliding_attention > 0) {
+            if ((arch == LLMArch::GEMMA3_12B || arch == LLMArch::GPT_OSS_20B) && sliding_attention > 0) {
                block_attention_mask = sliding_attention_mask;
                rope_index           = 1;
            }
@ -779,7 +1013,13 @@ namespace LLM {
            residual = x;
            x        = post_attention_layernorm->forward(ctx, x);
            if (arch == LLMArch::GPT_OSS_20B) {
                auto mlp = std::dynamic_pointer_cast<GPTOSSMLP>(blocks["mlp"]);
                x        = mlp->forward(ctx, x);
            } else {
                auto mlp = std::dynamic_pointer_cast<MLP>(blocks["mlp"]);
                x        = mlp->forward(ctx, x);
            }
            if (post_ffw_norm != nullptr) {
                x = post_ffw_norm->forward(ctx, x);
            }
@ -1202,6 +1442,24 @@ namespace LLM {
                params.rope_thetas             = {1000000.f, 10000.f};
                params.rope_scales             = {8.f, 1.f};
                params.sliding_attention       = {1024, 1024, 1024, 1024, 1024, 0};
            } else if (arch == LLMArch::GPT_OSS_20B) {
                params.head_dim                = 64;
                params.num_heads               = 64;
                params.num_kv_heads            = 8;
                params.qkv_bias                = true;
                params.attention_out_bias      = true;
                params.qk_norm                 = false;
                params.rms_norm_eps            = 1e-5f;
                params.hidden_size             = 2880;
                params.intermediate_size       = 2880;
                params.num_layers              = 24;
                params.vocab_size              = 201088;
                params.max_position_embeddings = 131072;
                params.rope_thetas             = {150000.f};
                params.rope_scales             = {32.f};
                params.sliding_attention       = {128, 0};
                params.num_experts             = 32;
                params.num_experts_per_tok     = 4;
            }
            bool have_vision_weight = false;
            bool llama_cpp_style    = false;
@ -1236,6 +1494,12 @@ namespace LLM {
                if (contains(tensor_name, "layers.0.mlp.gate_proj.weight")) {
                    params.intermediate_size = pair.second.ne[1];
                }
                if (contains(tensor_name, "layers.0.mlp.experts.gate_up_proj.weight")) {
                    params.intermediate_size = pair.second.ne[1] / 2;
                }
                if (contains(tensor_name, "layers.0.mlp.experts.gate_proj.weight")) {
                    params.intermediate_size = pair.second.ne[1];
                }
            }
            if (arch == LLMArch::QWEN3 && params.num_layers == 28) {  // Qwen3 2B
                params.num_heads = 16;
@ -1315,7 +1579,8 @@ namespace LLM {
            if (params.arch == LLMArch::MISTRAL_SMALL_3_2 ||
                params.arch == LLMArch::MINISTRAL_3_3B ||
                params.arch == LLMArch::QWEN3 ||
-                params.arch == LLMArch::GEMMA3_12B) {
+                params.arch == LLMArch::GEMMA3_12B ||
                params.arch == LLMArch::GPT_OSS_20B) {
                input_pos_vec.resize(n_tokens);
                for (int i = 0; i < n_tokens; ++i) {
                    input_pos_vec[i] = i;
@ -1354,7 +1619,11 @@ namespace LLM {
                set_backend_tensor_data(attention_mask, attention_mask_vec.data());
            }
-            if (params.arch == LLMArch::GEMMA3_12B) {
+            if (params.arch == LLMArch::GEMMA3_12B || params.arch == LLMArch::GPT_OSS_20B) {
                int sliding_window = 0;
                for (int window : params.sliding_attention) {
                    sliding_window = std::max(sliding_window, window);
                }
                sliding_attention_mask_vec.resize(n_tokens * n_tokens);
                if (!attention_mask_tensor.empty()) {
                    GGML_ASSERT(attention_mask_tensor.numel() == n_tokens * n_tokens);
@ -1364,8 +1633,7 @@ namespace LLM {
                }
                for (int i0 = 0; i0 < n_tokens; i0++) {
                    for (int i1 = 0; i1 < n_tokens; i1++) {
-                        if (i0 + 1024 <= i1) {
+                        if (sliding_window > 0 && i0 + sliding_window <= i1) {
                            LOG_DEBUG("xxxxxxxxxxxxxx");
                            sliding_attention_mask_vec[i1 * n_tokens + i0] = -INFINITY;
                        }
                    }
@ -1485,6 +1753,8 @@ namespace LLM {
            : model(arch, backend, params_backend, tensor_storage_map, prefix, enable_vision) {
            if (arch == LLMArch::MISTRAL_SMALL_3_2 || arch == LLMArch::MINISTRAL_3_3B) {
                tokenizer = std::make_shared<MistralTokenizer>();
            } else if (arch == LLMArch::GPT_OSS_20B) {
                tokenizer = std::make_shared<GPTOSSTokenizer>();
            } else {
                tokenizer = std::make_shared<Qwen2Tokenizer>();
            }
--- a/src/model.cpp
+++ b/src/model.cpp
@ -442,6 +442,10 @@ SDVersion ModelLoader::get_sd_version() {
            tensor_storage_map.find("model.language_model.layers.0.self_attn.q_proj.weight") != tensor_storage_map.end()) {
            return VERSION_HIDREAM_O1;
        }
        if (tensor_storage.name.find("model.diffusion_model.transformer_blocks.0.attn.norm_added_q.weight") != std::string::npos &&
            tensor_storage_map.find("model.diffusion_model.transformer_blocks.0.img_mlp.w1.weight") != tensor_storage_map.end()) {
            return VERSION_LENS;
        }
        if (tensor_storage.name.find("model.diffusion_model.transformer_blocks.0.img_mod.1.weight") != std::string::npos) {
            return VERSION_QWEN_IMAGE;
        }
--- a/src/model.h
+++ b/src/model.h
@ -47,6 +47,7 @@ enum SDVersion {
    VERSION_Z_IMAGE,
    VERSION_OVIS_IMAGE,
    VERSION_ERNIE_IMAGE,
    VERSION_LENS,
    VERSION_LONGCAT,
    VERSION_COUNT,
 };
@ -156,8 +157,15 @@ static inline bool sd_version_is_ernie_image(SDVersion version) {
    return false;
 }
 // Returns true exactly when `version` is VERSION_LENS.
 static inline bool sd_version_is_lens(SDVersion version) {
    return version == VERSION_LENS;
 }
 static inline bool sd_version_uses_flux2_vae(SDVersion version) {
-    if (sd_version_is_flux2(version) || sd_version_is_ernie_image(version)) {
+    if (sd_version_is_flux2(version) || sd_version_is_ernie_image(version) || sd_version_is_lens(version)) {
        return true;
    }
    return false;
@ -185,6 +193,7 @@ static inline bool sd_version_is_dit(SDVersion version) {
        sd_version_is_anima(version) ||
        sd_version_is_z_image(version) ||
        sd_version_is_ernie_image(version) ||
        sd_version_is_lens(version) ||
        sd_version_is_longcat(version)) {
        return true;
    }
--- a/src/name_conversion.cpp
+++ b/src/name_conversion.cpp
@ -128,6 +128,7 @@ std::string convert_cond_stage_model_name(std::string name, std::string prefix)
    };
    static const std::vector<std::pair<std::string, std::string>> llm_name_map{
        {"attn_sinks.weight", "self_attn.sinks"},
        {"token_embd.", "model.embed_tokens."},
        {"blk.", "model.layers."},
        {"attn_q.", "self_attn.q_proj."},
@ -137,6 +138,12 @@ std::string convert_cond_stage_model_name(std::string name, std::string prefix)
        {"attn_k_norm.", "self_attn.k_norm."},
        {"attn_output.", "self_attn.o_proj."},
        {"attn_norm.", "input_layernorm."},
        {"attn_post_norm.", "post_attention_layernorm."},
        {"post_attention_norm.", "post_attention_layernorm."},
        {"ffn_gate_inp.", "mlp.router."},
        {"ffn_gate_exps.", "mlp.experts.gate_proj."},
        {"ffn_up_exps.", "mlp.experts.up_proj."},
        {"ffn_down_exps.", "mlp.experts.down_proj."},
        {"ffn_down.", "mlp.down_proj."},
        {"ffn_gate.", "mlp.gate_proj."},
        {"ffn_up.", "mlp.up_proj."},
--- a/src/rope.hpp
+++ b/src/rope.hpp
@ -478,6 +478,52 @@ namespace Rope {
        return embed_nd(ids, bs, static_cast<float>(theta), axes_dim, wrap_dims);
    }
    // Builds the position ids for a Lens sequence by concatenating image-token ids
    // with text-token ids.
    //
    // h, w:        image grid size, forwarded to gen_flux_img_ids.
    // bs:          batch size; the text ids are repeated once per batch entry.
    // context_len: number of text positions requested from linspace.
    // scale_rope:  forwarded to gen_flux_img_ids; when true the text positions
    //              start at max(h / 2, w / 2) instead of 0.
    //
    // Every text id holds the same position value in all three components.
    __STATIC_INLINE__ std::vector<std::vector<float>> gen_lens_ids(int h,
                                                                   int w,
                                                                   int bs,
                                                                   int context_len,
                                                                   bool scale_rope = true) {
        auto image_ids = gen_flux_img_ids(h, w, 1, bs, 3, 0, 0, 0, scale_rope);

        const int text_offset    = scale_rope ? std::max(h / 2, w / 2) : 0;
        auto text_positions      = linspace<float>(1.f * text_offset, 1.f * context_len + text_offset, context_len);
        const size_t text_count  = text_positions.size();

        std::vector<std::vector<float>> text_ids(bs * context_len, std::vector<float>(3));
        for (int batch = 0; batch < bs; ++batch) {
            const size_t row_base = batch * text_count;
            for (size_t t = 0; t < text_count; ++t) {
                // Same scalar position replicated across the three id components.
                text_ids[row_base + t].assign(3, text_positions[t]);
            }
        }
        return concat_ids(image_ids, text_ids, bs);
    }
    // Generates the flattened positional-embedding data for a Lens sequence.
    //
    // h, w, bs, context_len: forwarded to gen_lens_ids (always with scale_rope = true).
    // theta:                 passed to embed_nd after conversion to float.
    // circular_h/circular_w: when set, the image tokens get a wrap period of h on
    //                        axis 1 and/or w on axis 2; text tokens keep 0.
    // axes_dim:              per-axis dimensions passed to embed_nd. Wrapping is
    //                        only configured when at least three axes are present.
    //
    // The wrap table stays empty when neither circular flag is set, when bs <= 0,
    // or when axes_dim has fewer than three entries.
    __STATIC_INLINE__ std::vector<float> gen_lens_pe(int h,
                                                     int w,
                                                     int bs,
                                                     int context_len,
                                                     int theta,
                                                     bool circular_h,
                                                     bool circular_w,
                                                     const std::vector<int>& axes_dim) {
        std::vector<std::vector<float>> ids = gen_lens_ids(h, w, bs, context_len, true);

        std::vector<std::vector<int>> wrap_dims;
        const bool any_circular = circular_h || circular_w;
        if (any_circular && bs > 0 && axes_dim.size() >= 3) {
            const size_t positions_per_batch = ids.size() / bs;
            wrap_dims.assign(axes_dim.size(), std::vector<int>(positions_per_batch, 0));

            // Only the first h * w entries are written; later positions keep 0.
            const size_t image_token_count = static_cast<size_t>(h) * static_cast<size_t>(w);
            if (circular_h) {
                for (size_t idx = 0; idx < image_token_count; ++idx) {
                    wrap_dims[1][idx] = h;
                }
            }
            if (circular_w) {
                for (size_t idx = 0; idx < image_token_count; ++idx) {
                    wrap_dims[2][idx] = w;
                }
            }
        }
        return embed_nd(ids, bs, static_cast<float>(theta), axes_dim, wrap_dims);
    }
    __STATIC_INLINE__ std::vector<std::vector<float>> gen_ernie_image_ids(int h,
                                                                          int w,
                                                                          int patch_size,
--- a/src/stable-diffusion.cpp
+++ b/src/stable-diffusion.cpp
@ -62,6 +62,7 @@ const char* model_version_to_str[] = {
    "Z-Image",
    "Ovis Image",
    "Ernie Image",
    "Lens",
    "Longcat-Image",
 };
@ -646,6 +647,15 @@ public:
                                                                    params_backend_for(SDBackendModule::DIFFUSION),
                                                                    tensor_storage_map,
                                                                    "model.diffusion_model");
            } else if (sd_version_is_lens(version)) {
                cond_stage_model = std::make_shared<LLMEmbedder>(backend_for(SDBackendModule::TE),
                                                                 params_backend_for(SDBackendModule::TE),
                                                                 tensor_storage_map,
                                                                 version);
                diffusion_model  = std::make_shared<LensModel>(backend_for(SDBackendModule::DIFFUSION),
                                                              params_backend_for(SDBackendModule::DIFFUSION),
                                                              tensor_storage_map,
                                                              "model.diffusion_model");
            } else {  // SD1.x SD2.x SDXL
                std::map<std::string, std::string> embbeding_map;
                for (uint32_t i = 0; i < sd_ctx_params->embedding_count; i++) {
@ -935,6 +945,11 @@ public:
            ignore_tensors.insert("text_encoders.llm.vision_tower.");
            ignore_tensors.insert("text_encoders.llm.multi_modal_projector.");
        }
        if (sd_version_is_lens(version)) {
            ignore_tensors.insert("text_encoders.llm.tokenizer_json");
            ignore_tensors.insert("text_encoders.llm.model.layers.0.mlp.experts.gate_up_proj.weight_scale_2");
            ignore_tensors.insert("text_encoders.llm.model.layers.0.mlp.experts.down_proj.weight_scale_2");
        }
        if (version == VERSION_HIDREAM_O1) {
            ignore_tensors.insert("lm_head.");
            ignore_tensors.insert("model.visual.deepstack_merger_list.");
@ -1114,6 +1129,7 @@ public:
                    }
                } else if (sd_version_is_flux(version) ||
                           sd_version_is_longcat(version) ||
                           sd_version_is_lens(version) ||
                           sd_version_is_ltxav(version)) {
                    pred_type = FLUX_FLOW_PRED;
@ -1126,6 +1142,8 @@ public:
                    }
                    if (sd_version_is_longcat(version)) {
                        default_flow_shift = 3.0f;
                    } else if (sd_version_is_lens(version)) {
                        default_flow_shift = 1.83f;
                    } else if (sd_version_is_ltxav(version)) {
                        default_flow_shift = 2.37f;
                    }
--- a/src/tokenizers/gpt_oss_tokenizer.cpp
+++ b/src/tokenizers/gpt_oss_tokenizer.cpp
@ -0,0 +1,91 @@
 #include "gpt_oss_tokenizer.h"
 #include "json.hpp"
 #include "util.h"
 #include "vocab/vocab.h"
 void GPTOSSTokenizer::load_from_merges(const std::string& merges_utf8_str, const std::string& vocab_utf8_str) {
    auto byte_unicode_pairs = bytes_to_unicode();
    byte_encoder            = std::map<int, std::u32string>(byte_unicode_pairs.begin(), byte_unicode_pairs.end());
    for (auto& pair : byte_unicode_pairs) {
        byte_decoder[pair.second] = pair.first;
    }
    nlohmann::json vocab;
    try {
        vocab = nlohmann::json::parse(vocab_utf8_str);
    } catch (const nlohmann::json::parse_error&) {
        GGML_ABORT("invalid vocab json str");
    }
    for (const auto& [key, value] : vocab.items()) {
        std::u32string token = utf8_to_utf32(key);
        int i                = value;
        encoder[token]       = i;
        decoder[i]           = token;
    }
    encoder_len = static_cast<int>(encoder.size());
    for (auto& special_token : special_tokens) {
        auto token           = utf8_to_utf32(special_token);
        encoder[token]       = encoder_len;
        decoder[encoder_len] = token;
        encoder_len++;
    }
    encoder_len = static_cast<int>(encoder.size());
    LOG_DEBUG("vocab size: %d", encoder_len);
    std::vector<std::u32string> merges = split_utf32(merges_utf8_str);
    std::vector<std::pair<std::u32string, std::u32string>> merge_pairs;
    for (const auto& merge : merges) {
        size_t space_pos = merge.find(' ');
        merge_pairs.emplace_back(merge.substr(0, space_pos), merge.substr(space_pos + 1));
    }
    LOG_DEBUG("merges size %zu", merge_pairs.size());
    int rank = 0;
    for (const auto& merge : merge_pairs) {
        bpe_ranks[merge] = rank++;
    }
    bpe_len = rank;
 }
 // Constructs the tokenizer and loads its tables.
 //
 // merges_utf8_str: merges listing; when empty, the embedded data returned by
 //                  load_gpt_oss_merges() is used instead.
 // vocab_utf8_str:  vocabulary JSON; when empty, the embedded data returned by
 //                  load_gpt_oss_vocab_json() is used instead.
 //
 // Each argument falls back on its own. Previously only merges_utf8_str was
 // inspected, so a caller-supplied vocab was silently ignored when the merges
 // argument was empty, and custom merges combined with an empty vocab aborted
 // while parsing the empty JSON string.
 GPTOSSTokenizer::GPTOSSTokenizer(const std::string& merges_utf8_str, const std::string& vocab_utf8_str) {
    BOS_TOKEN = "<|startoftext|>";
    UNK_TOKEN = "<|endoftext|>";
    EOS_TOKEN = "<|endoftext|>";
    PAD_TOKEN = "<|endoftext|>";
    BOS_TOKEN_ID = 199998;
    EOS_TOKEN_ID = 199999;
    UNK_TOKEN_ID = 199999;
    PAD_TOKEN_ID = 199999;
    // Order matters: load_from_merges() hands out consecutive ids to these
    // entries, starting right after the regular vocabulary.
    // NOTE(review): the fixed *_TOKEN_ID values above presumably match those
    // positions for the embedded vocab; confirm when supplying a custom one.
    special_tokens = {
        "<|startoftext|>",
        "<|endoftext|>",
        "<|reserved_200000|>",
        "<|reserved_200001|>",
        "<|return|>",
        "<|constrain|>",
        "<|reserved_200004|>",
        "<|channel|>",
        "<|start|>",
        "<|end|>",
        "<|message|>",
        "<|reserved_200009|>",
        "<|reserved_200010|>",
        "<|reserved_200011|>",
        "<|call|>",
        "<|reserved_200013|>",
        "<|reserved_200014|>",
        "<|reserved_200015|>",
        "<|reserved_200016|>",
        "<|reserved_200017|>",
        "<|endofprompt|>",
    };
    // Materialize embedded data only for the arguments that were left empty;
    // the conditional expressions below select between lvalues, so no copy of
    // caller-provided strings is made.
    const std::string fallback_merges = merges_utf8_str.empty() ? load_gpt_oss_merges() : std::string();
    const std::string fallback_vocab  = vocab_utf8_str.empty() ? load_gpt_oss_vocab_json() : std::string();
    load_from_merges(merges_utf8_str.empty() ? fallback_merges : merges_utf8_str,
                     vocab_utf8_str.empty() ? fallback_vocab : vocab_utf8_str);
 }
--- a/src/tokenizers/gpt_oss_tokenizer.h
+++ b/src/tokenizers/gpt_oss_tokenizer.h
@ -0,0 +1,16 @@
 #ifndef __SD_TOKENIZERS_GPT_OSS_TOKENIZER_H__
 #define __SD_TOKENIZERS_GPT_OSS_TOKENIZER_H__
 #include <string>
 #include "bpe_tokenizer.h"
 // BPETokenizer specialization that loads the GPT-OSS merges and vocabulary.
 class GPTOSSTokenizer : public BPETokenizer {
 public:
    // Both arguments default to empty strings.
    explicit GPTOSSTokenizer(const std::string& merges_utf8_str = "", const std::string& vocab_utf8_str = "");
 protected:
    // Populates the tokenizer tables from a merges listing and a vocab JSON string.
    void load_from_merges(const std::string& merges_utf8_str, const std::string& vocab_utf8_str);
 };
 #endif  // __SD_TOKENIZERS_GPT_OSS_TOKENIZER_H__
--- a/src/tokenizers/vocab/gpt_oss_merges.hpp
+++ b/src/tokenizers/vocab/gpt_oss_merges.hpp
--- a/src/tokenizers/vocab/gpt_oss_vocab.hpp
+++ b/src/tokenizers/vocab/gpt_oss_vocab.hpp
--- a/src/tokenizers/vocab/vocab.cpp
+++ b/src/tokenizers/vocab/vocab.cpp
@ -2,6 +2,8 @@
 #include "clip_merges.hpp"
 #include "gemma_merges.hpp"
 #include "gemma_vocab.hpp"
 #include "gpt_oss_merges.hpp"
 #include "gpt_oss_vocab.hpp"
 #include "mistral_merges.hpp"
 #include "mistral_vocab.hpp"
 #include "qwen_merges.hpp"
@ -47,3 +49,13 @@ std::string load_gemma_vocab_json() {
    std::string json_str(reinterpret_cast<const char*>(gemma_vocab_json_utf8_c_str), sizeof(gemma_vocab_json_utf8_c_str));
    return json_str;
 }
 // Returns the embedded GPT-OSS merges data as a std::string whose length is
 // the full size of the gpt_oss_merges_utf8_c_str array.
 std::string load_gpt_oss_merges() {
    const char* merges_bytes = reinterpret_cast<const char*>(gpt_oss_merges_utf8_c_str);
    return std::string(merges_bytes, sizeof(gpt_oss_merges_utf8_c_str));
 }
 // Returns the embedded GPT-OSS vocabulary JSON as a std::string whose length
 // is the full size of the gpt_oss_vocab_json_utf8_c_str array.
 std::string load_gpt_oss_vocab_json() {
    const char* vocab_bytes = reinterpret_cast<const char*>(gpt_oss_vocab_json_utf8_c_str);
    return std::string(vocab_bytes, sizeof(gpt_oss_vocab_json_utf8_c_str));
 }
--- a/src/tokenizers/vocab/vocab.h
+++ b/src/tokenizers/vocab/vocab.h
@ -11,5 +11,7 @@ std::string load_t5_tokenizer_json();
 std::string load_umt5_tokenizer_json();
 std::string load_gemma_merges();
 std::string load_gemma_vocab_json();
 // Embedded GPT-OSS tokenizer data: the merges listing and the vocabulary JSON,
 // each returned as a std::string.
 std::string load_gpt_oss_merges();
 std::string load_gpt_oss_vocab_json();
 #endif  // __SD_TOKENIZERS_VOCAB_VOCAB_H__