add lens support

2026-06-24 23:26:43 +00:00 · 2026-05-27 00:25:14 +08:00 · 2026-05-27 00:25:14 +08:00 · 1f8ced13f6
commit 1f8ced13f6
parent 1fa06bac5c
9 changed files with 886 additions and 20 deletions
--- a/src/conditioner.hpp
+++ b/src/conditioner.hpp
@ -1696,11 +1696,15 @@ struct LLMEmbedder : public Conditioner {
            arch = LLM::LLMArch::MISTRAL_SMALL_3_2;
        } else if (sd_version_is_ernie_image(version)) {
            arch = LLM::LLMArch::MINISTRAL_3_3B;
+        } else if (sd_version_is_lens(version)) {
+            arch = LLM::LLMArch::GPT_OSS_20B;
        } else if (sd_version_is_z_image(version) || version == VERSION_OVIS_IMAGE || version == VERSION_FLUX2_KLEIN) {
            arch = LLM::LLMArch::QWEN3;
        }
        if (arch == LLM::LLMArch::MISTRAL_SMALL_3_2 || arch == LLM::LLMArch::MINISTRAL_3_3B) {
            tokenizer = std::make_shared<MistralTokenizer>();
+        } else if (arch == LLM::LLMArch::GPT_OSS_20B) {
+            tokenizer = std::make_shared<GPTOSSTokenizer>();
        } else {
            tokenizer = std::make_shared<Qwen2Tokenizer>();
        }
@ -1871,6 +1875,7 @@ struct LLMEmbedder : public Conditioner {
        std::vector<std::pair<int, sd::Tensor<float>>> image_embeds;
        int prompt_template_encode_start_idx = 34;
        int min_length                       = 0;  // pad tokens
+        int max_length                       = 100000000;
        int hidden_states_min_length         = 0;  // zero pad hidden_states
        bool spell_quotes                    = false;
        std::set<int> out_layers;
@ -2029,6 +2034,30 @@ struct LLMEmbedder : public Conditioner {
            prompt_attn_range.first = 0;
            prompt += conditioner_params.text;
            prompt_attn_range.second = static_cast<int>(prompt.size());
+        } else if (sd_version_is_lens(version)) {
+            prompt_template_encode_start_idx = 97;
+            min_length                       = 0;
+            max_length                       = 512;
+            out_layers                       = {6, 12, 18, 24};
+
+            prompt =
+                "<|start|>system<|message|>You are ChatGPT, a large language model trained by OpenAI.\n"
+                "Knowledge cutoff: 2024-06\n"
+                "Current date: 2026-05-26\n"  // fix for current date
+                "\n"
+                "Reasoning: medium\n"
+                "\n"
+                "# Valid channels: analysis, commentary, final. Channel must be included for every message.<|end|><|start|>developer<|message|># Instructions\n"
+                "\n"
+                "Describe the image by detailing the color, shape, size, texture, quantity, text, spatial relationships of the objects and background.\n"
+                "\n"
+                "<|end|><|start|>user<|message|>";
+
+            prompt_attn_range.first = static_cast<int>(prompt.size());
+            prompt += conditioner_params.text;
+            prompt_attn_range.second = static_cast<int>(prompt.size());
+
+            prompt += "<|end|><|start|>assistant<|channel|>analysis<|message|>Need to generate one image according to the description.<|end|><|start|>assistant<|channel|>final<|message|>";
        } else if (sd_version_is_z_image(version)) {
            prompt_template_encode_start_idx = 0;
            out_layers                       = {35};  // -2
@ -2085,7 +2114,8 @@ struct LLMEmbedder : public Conditioner {
                                           image_embeds,
                                           out_layers,
                                           prompt_template_encode_start_idx,
-                                           spell_quotes);
+                                           spell_quotes,
+                                           max_length);
        std::vector<sd::Tensor<float>> extra_hidden_states_vec;
        for (int i = 0; i < extra_prompts.size(); i++) {
            auto extra_hidden_states = encode_prompt(n_threads,
@ -2096,7 +2126,8 @@ struct LLMEmbedder : public Conditioner {
                                                     image_embeds,
                                                     out_layers,
                                                     prompt_template_encode_start_idx,
-                                                     spell_quotes);
+                                                     spell_quotes,
+                                                     max_length);
            extra_hidden_states_vec.push_back(std::move(extra_hidden_states));
        }

--- a/src/diffusion_model.hpp
+++ b/src/diffusion_model.hpp
@ -6,6 +6,7 @@
 #include "ernie_image.hpp"
 #include "flux.hpp"
 #include "hidream_o1.hpp"
+#include "lens.hpp"
 #include "ltxv.hpp"
 #include "mmdit.hpp"
 #include "qwen_image.hpp"
@ -701,6 +702,72 @@ struct ErnieImageModel : public DiffusionModel {
    }
 };

+struct LensModel : public DiffusionModel {  // DiffusionModel adapter: every method forwards to the owned Lens::LensRunner
+    std::string prefix;  // tensor-name prefix; stored so get_param_tensors() can pass it on
+    Lens::LensRunner lens;  // the runner that holds the weights and builds the graph
+
+    LensModel(ggml_backend_t backend,
+              ggml_backend_t params_backend,
+              const String2TensorStorage& tensor_storage_map = {},
+              const std::string prefix                       = "model.diffusion_model")
+        : prefix(prefix), lens(backend, params_backend, tensor_storage_map, prefix) {  // same prefix is given to the runner
+    }
+
+    std::string get_desc() override {
+        return lens.get_desc();
+    }
+
+    void alloc_params_buffer() override {
+        lens.alloc_params_buffer();
+    }
+
+    void free_params_buffer() override {
+        lens.free_params_buffer();
+    }
+
+    void free_compute_buffer() override {
+        lens.free_compute_buffer();
+    }
+
+    void get_param_tensors(std::map<std::string, ggml_tensor*>& tensors) override {
+        lens.get_param_tensors(tensors, prefix);  // tensors are reported under the stored prefix
+    }
+
+    size_t get_params_buffer_size() override {
+        return lens.get_params_buffer_size();
+    }
+
+    void set_weight_adapter(const std::shared_ptr<WeightAdapter>& adapter) override {
+        lens.set_weight_adapter(adapter);
+    }
+
+    int64_t get_adm_in_channels() override {
+        return 768;  // NOTE(review): fixed constant, not derived from the checkpoint; compute() below never passes an ADM/y input — confirm callers need this value
+    }
+
+    void set_flash_attention_enabled(bool enabled) {  // NOTE(review): the only method here without `override` — confirm the base declares it virtual
+        lens.set_flash_attention_enabled(enabled);
+    }
+
+    void set_max_graph_vram_bytes(size_t max_vram_bytes) override {
+        lens.set_max_graph_vram_bytes(max_vram_bytes);
+    }
+
+    void set_circular_axes(bool circular_x, bool circular_y) override {
+        lens.set_circular_axes(circular_x, circular_y);
+    }
+
+    sd::Tensor<float> compute(int n_threads,
+                              const DiffusionParams& diffusion_params) override {  // runs one forward pass using x, timesteps and context only
+        GGML_ASSERT(diffusion_params.x != nullptr);  // x is mandatory
+        GGML_ASSERT(diffusion_params.timesteps != nullptr);  // timesteps are mandatory
+        return lens.compute(n_threads,
+                            *diffusion_params.x,
+                            *diffusion_params.timesteps,
+                            tensor_or_empty(diffusion_params.context));  // context goes through tensor_or_empty(); the runner itself asserts it is non-empty
+    }
+};
+
 struct LTXAVModel : public DiffusionModel {
    std::string prefix;
    LTXV::LTXAVRunner ltxav;
--- a/src/lens.hpp
+++ b/src/lens.hpp
@ -0,0 +1,408 @@
+#ifndef __SD_LENS_HPP__
+#define __SD_LENS_HPP__
+
+#include <memory>
+#include <vector>
+
+#include "common_block.hpp"
+#include "flux.hpp"
+#include "qwen_image.hpp"
+#include "rope.hpp"
+
+namespace Lens {
+    constexpr int LENS_GRAPH_SIZE = 40960;
+
+    // Timestep conditioning block: builds a 256-channel timestep embedding with
+    // ggml_ext_timestep_embedding and feeds it through the Qwen::TimestepEmbedding
+    // registered under the "timestep_embedder" key.
+    struct LensTimestepProjEmbeddings : public GGMLBlock {
+        // embedding_dim: output width handed to Qwen::TimestepEmbedding.
+        LensTimestepProjEmbeddings(int64_t embedding_dim) {
+            auto embedder               = std::make_shared<Qwen::TimestepEmbedding>(256, embedding_dim);
+            blocks["timestep_embedder"] = embedder;
+        }
+
+        // Returns the embedder's output for the given timesteps tensor.
+        ggml_tensor* forward(GGMLRunnerContext* ctx, ggml_tensor* timesteps) {
+            auto projected = ggml_ext_timestep_embedding(ctx->ggml_ctx, timesteps, 256, 10000, 1000.f);
+            auto embedder  = std::dynamic_pointer_cast<Qwen::TimestepEmbedding>(blocks["timestep_embedder"]);
+            return embedder->forward(ctx, projected);
+        }
+    };
+
+    // Gated feed-forward block built from three bias-free linears:
+    //   out = w2(silu(w1(x)) * w3(x))
+    struct LensGateMLP : public GGMLBlock {
+        // dim: input/output width; hidden_dim: width of the gated hidden layer.
+        LensGateMLP(int64_t dim, int64_t hidden_dim) {
+            const bool with_bias = false;
+            blocks["w1"]         = std::make_shared<Linear>(dim, hidden_dim, with_bias);
+            blocks["w2"]         = std::make_shared<Linear>(hidden_dim, dim, with_bias);
+            blocks["w3"]         = std::make_shared<Linear>(dim, hidden_dim, with_bias);
+        }
+
+        ggml_tensor* forward(GGMLRunnerContext* ctx, ggml_tensor* x) {
+            auto gate_proj = std::dynamic_pointer_cast<Linear>(blocks["w1"]);
+            auto down_proj = std::dynamic_pointer_cast<Linear>(blocks["w2"]);
+            auto up_proj   = std::dynamic_pointer_cast<Linear>(blocks["w3"]);
+
+            // Keep the two branches as separate statements so graph nodes are
+            // created in a fixed order (gate first, then the linear branch).
+            auto activated     = ggml_silu(ctx->ggml_ctx, gate_proj->forward(ctx, x));
+            auto linear_branch = up_proj->forward(ctx, x);
+            auto gated         = ggml_mul(ctx->ggml_ctx, activated, linear_branch);
+            return down_proj->forward(ctx, gated);
+        }
+    };
+
+    struct LensJointAttention : public GGMLBlock {  // joint attention over image + text tokens: separate QKV/out projections per stream, one shared attention call
+        int64_t dim_head;  // channels per attention head
+        int64_t num_heads;  // number of attention heads
+
+        LensJointAttention(int64_t query_dim,
+                           int64_t dim_head,
+                           int64_t num_heads,
+                           float eps = 1e-5f)
+            : dim_head(dim_head), num_heads(num_heads) {
+            int64_t inner_dim = dim_head * num_heads;
+            blocks["img_qkv"] = std::make_shared<Linear>(query_dim, inner_dim * 3, true);  // fused q/k/v projection for image tokens
+            blocks["txt_qkv"] = std::make_shared<Linear>(query_dim, inner_dim * 3, true);  // fused q/k/v projection for text tokens
+
+            blocks["norm_q"]       = std::make_shared<RMSNorm>(dim_head, eps);  // per-head RMSNorm, image q
+            blocks["norm_k"]       = std::make_shared<RMSNorm>(dim_head, eps);  // per-head RMSNorm, image k
+            blocks["norm_added_q"] = std::make_shared<RMSNorm>(dim_head, eps);  // per-head RMSNorm, text q
+            blocks["norm_added_k"] = std::make_shared<RMSNorm>(dim_head, eps);  // per-head RMSNorm, text k
+
+            blocks["to_out.0"]   = std::make_shared<Linear>(inner_dim, query_dim, true);  // output projection, image stream
+            blocks["to_add_out"] = std::make_shared<Linear>(inner_dim, query_dim, true);  // output projection, text stream
+        }
+
+        std::pair<ggml_tensor*, ggml_tensor*> forward(GGMLRunnerContext* ctx,
+                                                      ggml_tensor* img,
+                                                      ggml_tensor* txt,
+                                                      ggml_tensor* pe,
+                                                      ggml_tensor* mask = nullptr) {  // returns {image output, text output}
+            auto img_qkv    = std::dynamic_pointer_cast<Linear>(blocks["img_qkv"]);
+            auto txt_qkv    = std::dynamic_pointer_cast<Linear>(blocks["txt_qkv"]);
+            auto norm_q     = std::dynamic_pointer_cast<RMSNorm>(blocks["norm_q"]);
+            auto norm_k     = std::dynamic_pointer_cast<RMSNorm>(blocks["norm_k"]);
+            auto norm_add_q = std::dynamic_pointer_cast<RMSNorm>(blocks["norm_added_q"]);
+            auto norm_add_k = std::dynamic_pointer_cast<RMSNorm>(blocks["norm_added_k"]);
+            auto to_out_0   = std::dynamic_pointer_cast<Linear>(blocks["to_out.0"]);
+            auto to_add_out = std::dynamic_pointer_cast<Linear>(blocks["to_add_out"]);
+            int64_t n_img   = img->ne[1];  // image token count (ne[1] is used as the sequence axis in the reshapes below)
+            int64_t n_txt   = txt->ne[1];  // text token count
+            int64_t N       = img->ne[2];  // batch size, taken from img and reused for txt
+            int64_t inner   = dim_head * num_heads;
+
+            auto img_qkv_vec = split_qkv(ctx->ggml_ctx, img_qkv->forward(ctx, img));  // indices 0/1/2 are used as q/k/v below
+            auto txt_qkv_vec = split_qkv(ctx->ggml_ctx, txt_qkv->forward(ctx, txt));
+
+            auto img_q = ggml_reshape_4d(ctx->ggml_ctx, img_qkv_vec[0], dim_head, num_heads, n_img, N);  // split channels into heads
+            auto img_k = ggml_reshape_4d(ctx->ggml_ctx, img_qkv_vec[1], dim_head, num_heads, n_img, N);
+            auto img_v = ggml_reshape_4d(ctx->ggml_ctx, img_qkv_vec[2], dim_head, num_heads, n_img, N);
+
+            img_q = norm_q->forward(ctx, img_q);  // q and k are normalized, v is not
+            img_k = norm_k->forward(ctx, img_k);
+
+            auto txt_q = ggml_reshape_4d(ctx->ggml_ctx, txt_qkv_vec[0], dim_head, num_heads, n_txt, N);
+            auto txt_k = ggml_reshape_4d(ctx->ggml_ctx, txt_qkv_vec[1], dim_head, num_heads, n_txt, N);
+            auto txt_v = ggml_reshape_4d(ctx->ggml_ctx, txt_qkv_vec[2], dim_head, num_heads, n_txt, N);
+
+            txt_q = norm_add_q->forward(ctx, txt_q);
+            txt_k = norm_add_k->forward(ctx, txt_k);
+
+            auto q = ggml_concat(ctx->ggml_ctx, img_q, txt_q, 2);  // concat on the token axis: image tokens first, then text
+            auto k = ggml_concat(ctx->ggml_ctx, img_k, txt_k, 2);
+            auto v = ggml_concat(ctx->ggml_ctx, img_v, txt_v, 2);
+
+            auto attn = Rope::attention(ctx, q, k, v, pe, mask, (1.0f / 128.f));  // NOTE(review): trailing 1/128 is a hard-coded constant independent of dim_head — confirm its meaning in Rope::attention
+
+            auto img_attn_out = ggml_view_3d(ctx->ggml_ctx,
+                                             attn,
+                                             inner,
+                                             n_img,
+                                             N,
+                                             attn->nb[1],
+                                             attn->nb[2],
+                                             0);  // first n_img rows = image tokens (matches the concat order above)
+            auto txt_attn_out = ggml_view_3d(ctx->ggml_ctx,
+                                             attn,
+                                             inner,
+                                             n_txt,
+                                             N,
+                                             attn->nb[1],
+                                             attn->nb[2],
+                                             n_img * attn->nb[1]);  // text rows start right after the image rows
+
+            img_attn_out = to_out_0->forward(ctx, ggml_cont(ctx->ggml_ctx, img_attn_out));  // views are made contiguous before the projection
+            txt_attn_out = to_add_out->forward(ctx, ggml_cont(ctx->ggml_ctx, txt_attn_out));
+            return {img_attn_out, txt_attn_out};
+        }
+    };
+
+    struct LensTransformerBlock : public GGMLBlock {  // dual-stream block: timestep-modulated joint attention, then a per-stream gated MLP, both with gated residuals
+        LensTransformerBlock(int64_t dim,
+                             int64_t num_attention_heads,
+                             int64_t attention_head_dim,
+                             float eps = 1e-6f) {
+            int64_t mlp_hidden_dim = dim / 3 * 8;  // integer division happens before the multiply
+            blocks["img_mod.1"]    = std::make_shared<Linear>(dim, 6 * dim, true);  // produces 6 modulation chunks for the image stream
+            blocks["txt_mod.1"]    = std::make_shared<Linear>(dim, 6 * dim, true);  // produces 6 modulation chunks for the text stream
+            blocks["img_norm1"]    = std::make_shared<RMSNorm>(dim, eps);
+            blocks["img_norm2"]    = std::make_shared<RMSNorm>(dim, eps);
+            blocks["txt_norm1"]    = std::make_shared<RMSNorm>(dim, eps);
+            blocks["txt_norm2"]    = std::make_shared<RMSNorm>(dim, eps);
+            blocks["img_mlp"]      = std::make_shared<LensGateMLP>(dim, mlp_hidden_dim);
+            blocks["txt_mlp"]      = std::make_shared<LensGateMLP>(dim, mlp_hidden_dim);
+            blocks["attn"]         = std::make_shared<LensJointAttention>(dim, attention_head_dim, num_attention_heads);  // attention keeps its own default eps
+        }
+
+        std::pair<ggml_tensor*, ggml_tensor*> forward(GGMLRunnerContext* ctx,
+                                                      ggml_tensor* img,
+                                                      ggml_tensor* txt,
+                                                      ggml_tensor* t_emb,
+                                                      ggml_tensor* pe) {  // returns the updated {img, txt} streams
+            auto img_mod_1 = std::dynamic_pointer_cast<Linear>(blocks["img_mod.1"]);
+            auto txt_mod_1 = std::dynamic_pointer_cast<Linear>(blocks["txt_mod.1"]);
+            auto img_norm1 = std::dynamic_pointer_cast<RMSNorm>(blocks["img_norm1"]);
+            auto img_norm2 = std::dynamic_pointer_cast<RMSNorm>(blocks["img_norm2"]);
+            auto txt_norm1 = std::dynamic_pointer_cast<RMSNorm>(blocks["txt_norm1"]);
+            auto txt_norm2 = std::dynamic_pointer_cast<RMSNorm>(blocks["txt_norm2"]);
+            auto img_mlp   = std::dynamic_pointer_cast<LensGateMLP>(blocks["img_mlp"]);
+            auto txt_mlp   = std::dynamic_pointer_cast<LensGateMLP>(blocks["txt_mlp"]);
+            auto attn      = std::dynamic_pointer_cast<LensJointAttention>(blocks["attn"]);
+
+            auto temb = ggml_silu(ctx->ggml_ctx, t_emb);  // SiLU is applied to the timestep embedding before both modulation linears
+
+            auto img_mod_params = img_mod_1->forward(ctx, temb);
+            auto img_mod_vec    = ggml_ext_chunk(ctx->ggml_ctx, img_mod_params, 6, 0);  // 6 chunks along dim 0: [0,1] pre-attn, [2] attn gate, [3,4] pre-MLP, [5] MLP gate
+            auto txt_mod_params = txt_mod_1->forward(ctx, temb);
+            auto txt_mod_vec    = ggml_ext_chunk(ctx->ggml_ctx, txt_mod_params, 6, 0);  // same layout for the text stream
+
+            auto img_normed    = img_norm1->forward(ctx, img);
+            auto img_modulated = Flux::modulate(ctx->ggml_ctx, img_normed, img_mod_vec[0], img_mod_vec[1]);
+            auto txt_normed    = txt_norm1->forward(ctx, txt);
+            auto txt_modulated = Flux::modulate(ctx->ggml_ctx, txt_normed, txt_mod_vec[0], txt_mod_vec[1]);
+
+            auto [img_attn_output, txt_attn_output] = attn->forward(ctx, img_modulated, txt_modulated, pe);  // no mask is passed (defaults to nullptr)
+
+            img = ggml_add(ctx->ggml_ctx, img, ggml_mul(ctx->ggml_ctx, img_attn_output, img_mod_vec[2]));  // gated residual around attention
+            txt = ggml_add(ctx->ggml_ctx, txt, ggml_mul(ctx->ggml_ctx, txt_attn_output, txt_mod_vec[2]));
+
+            auto img_normed2    = img_norm2->forward(ctx, img);
+            auto img_modulated2 = Flux::modulate(ctx->ggml_ctx, img_normed2, img_mod_vec[3], img_mod_vec[4]);
+            auto txt_normed2    = txt_norm2->forward(ctx, txt);
+            auto txt_modulated2 = Flux::modulate(ctx->ggml_ctx, txt_normed2, txt_mod_vec[3], txt_mod_vec[4]);
+
+            img = ggml_add(ctx->ggml_ctx, img, ggml_mul(ctx->ggml_ctx, img_mlp->forward(ctx, img_modulated2), img_mod_vec[5]));  // gated residual around the MLP
+            txt = ggml_add(ctx->ggml_ctx, txt, ggml_mul(ctx->ggml_ctx, txt_mlp->forward(ctx, txt_modulated2), txt_mod_vec[5]));
+            return {img, txt};
+        }
+    };
+
+    // Final adaptive norm: the conditioning vector is passed through SiLU and a
+    // linear layer, split into two halves (scale first, shift second), and the
+    // ggml_norm-normalized input is modulated with them via Flux::modulate.
+    struct LensAdaLayerNormContinuous : public GGMLBlock {
+        int64_t hidden_size;
+        float eps;
+
+        LensAdaLayerNormContinuous(int64_t hidden_size, float eps = 1e-6f)
+            : hidden_size(hidden_size), eps(eps) {
+            // One projection emits both halves, hence the doubled output width.
+            blocks["linear"] = std::make_shared<Linear>(hidden_size, hidden_size * 2, true);
+        }
+
+        ggml_tensor* forward(GGMLRunnerContext* ctx, ggml_tensor* x, ggml_tensor* conditioning) {
+            auto projection = std::dynamic_pointer_cast<Linear>(blocks["linear"]);
+
+            auto activated = ggml_silu(ctx->ggml_ctx, conditioning);
+            auto projected = projection->forward(ctx, activated);
+            auto halves    = ggml_ext_chunk(ctx->ggml_ctx, projected, 2, 0);
+
+            auto scale_part = halves[0];
+            auto shift_part = halves[1];
+
+            auto normalized = ggml_norm(ctx->ggml_ctx, x, eps);
+            return Flux::modulate(ctx->ggml_ctx, normalized, shift_part, scale_part);
+        }
+    };
+
+    struct LensParams {  // hyper-parameters of the Lens transformer; several are overwritten by LensRunner from checkpoint tensor shapes
+        int patch_size              = 2;  // proj_out emits patch_size^2 * out_channels values per token
+        int64_t in_channels         = 128;  // input width of img_in
+        int64_t out_channels        = 32;  // see patch_size; used to size proj_out
+        int num_layers              = 48;  // number of transformer_blocks.N entries
+        int64_t attention_head_dim  = 64;  // channels per head
+        int64_t num_attention_heads = 24;  // inner width = num_attention_heads * attention_head_dim
+        int64_t joint_attention_dim = 2880;  // width of one text chunk (size of each txt_norm.N)
+        int selected_layer_count    = 4;  // number of chunks the context is split into; txt_in takes joint_attention_dim * this
+        int theta                   = 10000;  // forwarded to Rope::gen_lens_pe
+        std::vector<int> axes_dim   = {8, 28, 28};  // forwarded to Rope::gen_lens_pe
+        int axes_dim_sum            = 64;  // sum of axes_dim; LensRunner recomputes it
+    };
+
+    class LensModel : public GGMLBlock {  // full Lens transformer: input projections, num_layers dual-stream blocks, final adaptive norm + projection
+    public:
+        LensParams params;
+
+        LensModel() = default;  // registers no blocks; LensRunner default-constructs its member, then assigns a configured model
+        LensModel(LensParams params)
+            : params(params) {
+            int64_t inner_dim         = params.num_attention_heads * params.attention_head_dim;  // shared width of both token streams
+            blocks["time_text_embed"] = std::make_shared<LensTimestepProjEmbeddings>(inner_dim);
+            blocks["img_in"]          = std::make_shared<Linear>(params.in_channels, inner_dim, true);
+            blocks["txt_in"]          = std::make_shared<Linear>(params.joint_attention_dim * params.selected_layer_count, inner_dim, true);  // consumes all text chunks concatenated
+            for (int i = 0; i < params.selected_layer_count; ++i) {
+                blocks["txt_norm." + std::to_string(i)] = std::make_shared<RMSNorm>(params.joint_attention_dim, 1e-5f);  // one RMSNorm per text chunk
+            }
+            for (int i = 0; i < params.num_layers; ++i) {
+                blocks["transformer_blocks." + std::to_string(i)] = std::make_shared<LensTransformerBlock>(inner_dim,
+                                                                                                           params.num_attention_heads,
+                                                                                                           params.attention_head_dim);
+            }
+            blocks["norm_out"] = std::make_shared<LensAdaLayerNormContinuous>(inner_dim, 1e-6f);
+            blocks["proj_out"] = std::make_shared<Linear>(inner_dim, params.patch_size * params.patch_size * params.out_channels, true);
+        }
+
+        ggml_tensor* forward(GGMLRunnerContext* ctx,
+                             ggml_tensor* x,
+                             ggml_tensor* timestep,
+                             ggml_tensor* context,
+                             ggml_tensor* pe) {  // returns a tensor reshaped to [W, H, patch_size^2 * out_channels, N]
+            GGML_ASSERT(context != nullptr);  // text conditioning is mandatory
+            int64_t W = x->ne[0];
+            int64_t H = x->ne[1];
+            int64_t C = x->ne[2];
+            int64_t N = x->ne[3];
+
+            auto time_text_embed = std::dynamic_pointer_cast<LensTimestepProjEmbeddings>(blocks["time_text_embed"]);
+            auto img_in          = std::dynamic_pointer_cast<Linear>(blocks["img_in"]);
+            auto txt_in          = std::dynamic_pointer_cast<Linear>(blocks["txt_in"]);
+            auto norm_out        = std::dynamic_pointer_cast<LensAdaLayerNormContinuous>(blocks["norm_out"]);
+            auto proj_out        = std::dynamic_pointer_cast<Linear>(blocks["proj_out"]);
+
+            auto t_emb = time_text_embed->forward(ctx, timestep);
+
+            auto img = ggml_reshape_3d(ctx->ggml_ctx, x, W * H, C, N);  // flatten the spatial grid: one token per position
+            img      = ggml_cont(ctx->ggml_ctx, ggml_ext_torch_permute(ctx->ggml_ctx, img, 1, 0, 2, 3));  // presumably swaps the two leading axes so channels come first — confirm in ggml_ext_torch_permute
+            img      = img_in->forward(ctx, img);
+
+            std::vector<ggml_tensor*> txt_chunks = ggml_ext_chunk(ctx->ggml_ctx, context, params.selected_layer_count, 0);  // split context along dim 0
+            ggml_tensor* txt                     = nullptr;
+            for (int i = 0; i < params.selected_layer_count; ++i) {
+                auto txt_norm = std::dynamic_pointer_cast<RMSNorm>(blocks["txt_norm." + std::to_string(i)]);
+                auto chunk    = txt_norm->forward(ctx, txt_chunks[i]);  // each chunk is normalized by its own RMSNorm
+                txt           = txt == nullptr ? chunk : ggml_concat(ctx->ggml_ctx, txt, chunk, 0);  // re-join along dim 0 in the original order
+            }
+            txt = txt_in->forward(ctx, txt);
+
+            sd::ggml_graph_cut::mark_graph_cut(img, "lens.prelude", "img");
+            sd::ggml_graph_cut::mark_graph_cut(txt, "lens.prelude", "txt");
+
+            for (int i = 0; i < params.num_layers; ++i) {
+                auto block = std::dynamic_pointer_cast<LensTransformerBlock>(blocks["transformer_blocks." + std::to_string(i)]);
+                auto out   = block->forward(ctx, img, txt, t_emb, pe);
+                img        = out.first;
+                txt        = out.second;
+                sd::ggml_graph_cut::mark_graph_cut(img, "lens.transformer_blocks." + std::to_string(i), "img");  // a cut point is marked after every block
+                sd::ggml_graph_cut::mark_graph_cut(txt, "lens.transformer_blocks." + std::to_string(i), "txt");
+            }
+
+            img = norm_out->forward(ctx, img, t_emb);  // only the image stream is used from here on
+            img = proj_out->forward(ctx, img);
+
+            auto out = ggml_cont(ctx->ggml_ctx, ggml_ext_torch_permute(ctx->ggml_ctx, img, 1, 0, 2, 3));  // same permute arguments as on the input side
+            out      = ggml_reshape_4d(ctx->ggml_ctx, out, W, H, params.patch_size * params.patch_size * params.out_channels, N);
+            return out;
+        }
+    };
+
+    // Runner for the Lens diffusion transformer. It infers the model
+    // hyper-parameters from the checkpoint tensor shapes, initializes the
+    // LensModel weights, and builds/executes the ggml graph for a forward pass.
+    struct LensRunner : public GGMLRunner {
+        LensParams lens_params;
+        LensModel lens;
+        // Positional-embedding table filled by build_graph(). Kept as a member,
+        // presumably because set_backend_tensor_data() only records the pointer
+        // and the data must stay alive until the graph runs — TODO confirm.
+        std::vector<float> pe_vec;
+
+        // Scans tensor_storage_map for tensors whose name starts with `prefix`
+        // and derives in/out channels, head dim, head count, text layer count
+        // and block count from their shapes; anything not found keeps the
+        // LensParams default.
+        LensRunner(ggml_backend_t backend,
+                   ggml_backend_t params_backend,
+                   const String2TensorStorage& tensor_storage_map = {},
+                   const std::string prefix                       = "")
+            : GGMLRunner(backend, params_backend) {
+            lens_params.num_layers = 0;
+            // Output width of img_in (the transformer's inner width); 0 means
+            // "img_in.weight not seen".
+            int64_t inner_dim = 0;
+            for (const auto& [name, tensor_storage] : tensor_storage_map) {
+                if (!starts_with(name, prefix)) {
+                    continue;
+                }
+                if (ends_with(name, "img_in.weight") && tensor_storage.n_dims == 2) {
+                    lens_params.in_channels = tensor_storage.ne[0];
+                    inner_dim               = tensor_storage.ne[1];
+                } else if (ends_with(name, "txt_in.weight") && tensor_storage.n_dims == 2) {
+                    lens_params.selected_layer_count = static_cast<int>(tensor_storage.ne[0] / lens_params.joint_attention_dim);
+                } else if (ends_with(name, "proj_out.weight") && tensor_storage.n_dims == 2) {
+                    lens_params.out_channels = tensor_storage.ne[1] / lens_params.patch_size / lens_params.patch_size;
+                } else if (ends_with(name, "transformer_blocks.0.attn.norm_q.weight") && tensor_storage.n_dims == 1) {
+                    lens_params.attention_head_dim = tensor_storage.ne[0];
+                }
+
+                // Track the highest transformer block index to get the layer count.
+                size_t pos = name.find("transformer_blocks.");
+                if (pos != std::string::npos) {
+                    std::string layer_name = name.substr(pos);
+                    auto items             = split_string(layer_name, '.');
+                    if (items.size() > 1) {
+                        int block_index = atoi(items[1].c_str());
+                        if (block_index + 1 > lens_params.num_layers) {
+                            lens_params.num_layers = block_index + 1;
+                        }
+                    }
+                }
+            }
+            // The head count depends on the head dim, which is detected from a
+            // different tensor than inner_dim. Derive it only after the whole
+            // map has been scanned so the result does not depend on the order
+            // in which the two tensors are visited.
+            if (inner_dim > 0 && lens_params.attention_head_dim > 0) {
+                lens_params.num_attention_heads = inner_dim / lens_params.attention_head_dim;
+            }
+            if (lens_params.num_layers == 0) {
+                lens_params.num_layers = 48;
+            }
+            lens_params.axes_dim_sum = 0;
+            for (int axis_dim : lens_params.axes_dim) {
+                lens_params.axes_dim_sum += axis_dim;
+            }
+
+            LOG_INFO("lens: layers = %d, in_channels = %" PRId64 ", out_channels = %" PRId64
+                     ", heads = %" PRId64 ", head_dim = %" PRId64,
+                     lens_params.num_layers,
+                     lens_params.in_channels,
+                     lens_params.out_channels,
+                     lens_params.num_attention_heads,
+                     lens_params.attention_head_dim);
+
+            lens = LensModel(lens_params);
+            lens.init(params_ctx, tensor_storage_map, prefix);
+        }
+
+        std::string get_desc() override {
+            return "lens";
+        }
+
+        // Collects the model's parameter tensors into `tensors` under `prefix`.
+        void get_param_tensors(std::map<std::string, ggml_tensor*>& tensors, const std::string prefix) {
+            lens.get_param_tensors(tensors, prefix);
+        }
+
+        // Builds the forward graph for one latent (batch size must be 1) with a
+        // non-empty text context, including the positional-embedding input.
+        ggml_cgraph* build_graph(const sd::Tensor<float>& x_tensor,
+                                 const sd::Tensor<float>& timesteps_tensor,
+                                 const sd::Tensor<float>& context_tensor) {
+            ggml_cgraph* gf        = new_graph_custom(LENS_GRAPH_SIZE);
+            ggml_tensor* x         = make_input(x_tensor);
+            ggml_tensor* timesteps = make_input(timesteps_tensor);
+            GGML_ASSERT(x->ne[3] == 1);
+            GGML_ASSERT(!context_tensor.empty());
+            ggml_tensor* context = make_input(context_tensor);
+
+            // Arguments: x->ne[1], x->ne[0], batch (x->ne[3]), text length
+            // (context->ne[1]), theta, circular flags (y before x), axes dims.
+            pe_vec      = Rope::gen_lens_pe(static_cast<int>(x->ne[1]),
+                                            static_cast<int>(x->ne[0]),
+                                            static_cast<int>(x->ne[3]),
+                                            static_cast<int>(context->ne[1]),
+                                            lens_params.theta,
+                                            circular_y_enabled,
+                                            circular_x_enabled,
+                                            lens_params.axes_dim);
+            // The table is laid out as [2, 2, axes_dim_sum / 2, pos_len] floats,
+            // i.e. axes_dim_sum * 2 floats per position.
+            int pos_len = static_cast<int>(pe_vec.size() / lens_params.axes_dim_sum / 2);
+            auto pe     = ggml_new_tensor_4d(compute_ctx, GGML_TYPE_F32, 2, 2, lens_params.axes_dim_sum / 2, pos_len);
+            set_backend_tensor_data(pe, pe_vec.data());
+
+            auto runner_ctx  = get_context();
+            ggml_tensor* out = lens.forward(&runner_ctx, x, timesteps, context, pe);
+            ggml_build_forward_expand(gf, out);
+            return gf;
+        }
+
+        // Runs one forward pass and returns the result with trailing singleton
+        // dimensions restored to match x.dim().
+        sd::Tensor<float> compute(int n_threads,
+                                  const sd::Tensor<float>& x,
+                                  const sd::Tensor<float>& timesteps,
+                                  const sd::Tensor<float>& context) {
+            // The graph is built lazily by GGMLRunner::compute via this callback.
+            auto get_graph = [&]() -> ggml_cgraph* {
+                return build_graph(x, timesteps, context);
+            };
+            return restore_trailing_singleton_dims(GGMLRunner::compute<float>(get_graph, n_threads, false), x.dim());
+        }
+    };
+}  // namespace Lens
+
+#endif  // __SD_LENS_HPP__
--- a/src/llm.hpp
+++ b/src/llm.hpp
@ -23,11 +23,12 @@
 #include "rope.hpp"
 #include "tokenizers/bpe_tokenizer.h"
 #include "tokenizers/gemma_tokenizer.h"
+#include "tokenizers/gpt_oss_tokenizer.h"
 #include "tokenizers/mistral_tokenizer.h"
 #include "tokenizers/qwen2_tokenizer.h"

 namespace LLM {
-    constexpr int LLM_GRAPH_SIZE = 10240;
+    constexpr int LLM_GRAPH_SIZE = 65536;

    enum class LLMArch {
        QWEN2_5_VL,
@ -36,6 +37,7 @@ namespace LLM {
        MISTRAL_SMALL_3_2,
        MINISTRAL_3_3B,
        GEMMA3_12B,
+        GPT_OSS_20B,
        ARCH_COUNT,
    };

@ -46,6 +48,7 @@ namespace LLM {
        "mistral_small3.2",
        "ministral3.3b",
        "gemma3_12b",
+        "gpt_oss_20b",
    };

    enum class MLPActivation {
@ -83,6 +86,7 @@ namespace LLM {
        int num_kv_heads                = 4;
        int head_dim                    = 128;
        bool qkv_bias                   = true;
+        bool attention_out_bias         = false;
        bool qk_norm                    = false;
        bool rms_norm_add               = false;
        bool normalize_input            = false;
@ -93,6 +97,8 @@ namespace LLM {
        std::vector<float> rope_thetas  = {1000000.f};
        std::vector<float> rope_scales  = {1.f};
        std::vector<int> sliding_attention;
+        int64_t num_experts         = 0;
+        int64_t num_experts_per_tok = 0;
        LLMVisionParams vision;
    };

@ -163,6 +169,170 @@ namespace LLM {
        }
    };

+    // Mixture-of-experts feed-forward block.
+    // A linear router scores all `num_experts` experts per token; the top
+    // `num_experts_per_tok` experts are evaluated with ggml_mul_mat_id / ggml_add_id and
+    // their outputs are summed, weighted by a softmax over the selected router logits.
+    struct GPTOSSMLP : public GGMLBlock {
+    protected:
+        int64_t hidden_size;
+        int64_t intermediate_size;
+        int64_t num_experts;
+        int64_t num_experts_per_tok;
+        // Set by init_params: true when the tensor storage map holds a fused
+        // "experts.gate_up_proj.weight" under this block's prefix.
+        bool has_combined_gate_up = false;
+
+        // Creates the router and expert parameters. Weights take the type recorded in
+        // `tensor_storage_map` (F32 by default); biases are always F32.
+        void init_params(ggml_context* ctx,
+                         const String2TensorStorage& tensor_storage_map = {},
+                         std::string prefix                             = "") override {
+            // Type recorded for `name`, or F32 when `in_features` is not a multiple of
+            // that type's block size.
+            auto weight_type = [&](const char* name, int64_t in_features) -> ggml_type {
+                ggml_type stored = get_type(prefix + name, tensor_storage_map, GGML_TYPE_F32);
+                return in_features % ggml_blck_size(stored) == 0 ? stored : GGML_TYPE_F32;
+            };
+
+            params["router.weight"] = ggml_new_tensor_2d(ctx,
+                                                         weight_type("router.weight", hidden_size),
+                                                         hidden_size,
+                                                         num_experts);
+            params["router.bias"]   = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, num_experts);
+
+            has_combined_gate_up = tensor_storage_map.count(prefix + "experts.gate_up_proj.weight") != 0;
+            if (has_combined_gate_up) {
+                // Fused layout: gate and up share one tensor with 2 * intermediate_size rows.
+                const int64_t fused_rows              = intermediate_size * 2;
+                params["experts.gate_up_proj.weight"] = ggml_new_tensor_3d(ctx,
+                                                                           weight_type("experts.gate_up_proj.weight", hidden_size),
+                                                                           hidden_size,
+                                                                           fused_rows,
+                                                                           num_experts);
+                params["experts.gate_up_proj.bias"]   = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, fused_rows, num_experts);
+            } else {
+                // Split layout: separate gate and up tensors.
+                const ggml_type gate_type          = weight_type("experts.gate_proj.weight", hidden_size);
+                const ggml_type up_type            = weight_type("experts.up_proj.weight", hidden_size);
+                params["experts.gate_proj.weight"] = ggml_new_tensor_3d(ctx, gate_type, hidden_size, intermediate_size, num_experts);
+                params["experts.up_proj.weight"]   = ggml_new_tensor_3d(ctx, up_type, hidden_size, intermediate_size, num_experts);
+                params["experts.gate_proj.bias"]   = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, intermediate_size, num_experts);
+                params["experts.up_proj.bias"]     = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, intermediate_size, num_experts);
+            }
+
+            params["experts.down_proj.weight"] = ggml_new_tensor_3d(ctx,
+                                                                    weight_type("experts.down_proj.weight", intermediate_size),
+                                                                    intermediate_size,
+                                                                    hidden_size,
+                                                                    num_experts);
+            params["experts.down_proj.bias"]   = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, hidden_size, num_experts);
+        }
+
+        // Applies the per-expert linear layer `weight_name` to `x` for the experts listed
+        // in `selected_experts`, adding the per-expert bias `bias_name` when it is registered.
+        ggml_tensor* expert_linear(GGMLRunnerContext* ctx,
+                                   const std::string& weight_name,
+                                   const std::string& bias_name,
+                                   ggml_tensor* x,
+                                   ggml_tensor* selected_experts) {
+            ggml_tensor* projected = ggml_mul_mat_id(ctx->ggml_ctx, params[weight_name], x, selected_experts);
+            auto bias_entry        = params.find(bias_name);
+            if (bias_entry == params.end()) {
+                return projected;
+            }
+            return ggml_add_id(ctx->ggml_ctx, projected, bias_entry->second, selected_experts);
+        }
+
+    public:
+        GPTOSSMLP(const LLMParams& params)
+            : hidden_size(params.hidden_size),
+              intermediate_size(params.intermediate_size),
+              num_experts(params.num_experts),
+              num_experts_per_tok(params.num_experts_per_tok) {}
+
+        // Builds the MoE sub-graph for `x` and returns a tensor reshaped to
+        // [hidden_size, n_token, N] in ggml dimension order.
+        ggml_tensor* forward(GGMLRunnerContext* ctx, ggml_tensor* x) {
+            // x: [N, n_token, hidden_size]
+            GGML_ASSERT(num_experts > 0 && num_experts_per_tok > 0);
+
+            auto gctx                  = ctx->ggml_ctx;
+            const int64_t seq_len      = x->ne[1];
+            const int64_t batch        = x->ne[2];
+            const int64_t total_tokens = seq_len * batch;
+            const int64_t top_k        = num_experts_per_tok;
+
+            // Router: one logit per expert for every token, flattened over the batch.
+            ggml_tensor* router_w = params["router.weight"];
+            ggml_tensor* router_b = params["router.bias"];
+            ggml_tensor* logits   = ggml_mul_mat(gctx, router_w, x);
+            logits                = ggml_add(gctx, logits, router_b);
+            logits                = ggml_reshape_2d(gctx, logits, num_experts, total_tokens);
+
+            // Pick the top-k experts, then softmax over the selected logits only.
+            ggml_tensor* selected_experts = ggml_argsort_top_k(gctx, logits, static_cast<int>(top_k));  // [top_k, tokens]
+            ggml_tensor* logits_3d        = ggml_reshape_3d(gctx, logits, 1, num_experts, total_tokens);
+            ggml_tensor* weights          = ggml_get_rows(gctx, logits_3d, selected_experts);  // [1, top_k, tokens]
+            weights                       = ggml_reshape_2d(gctx, weights, top_k, total_tokens);
+            weights                       = ggml_soft_max(gctx, weights);
+            weights                       = ggml_reshape_3d(gctx, weights, 1, top_k, total_tokens);
+
+            x = ggml_reshape_3d(gctx, x, hidden_size, 1, total_tokens);
+
+            ggml_tensor* gate = nullptr;
+            ggml_tensor* up   = nullptr;
+            if (!has_combined_gate_up) {
+                gate = expert_linear(ctx, "experts.gate_proj.weight", "experts.gate_proj.bias", x, selected_experts);
+                up   = expert_linear(ctx, "experts.up_proj.weight", "experts.up_proj.bias", x, selected_experts);
+            } else {
+                ggml_tensor* fused = expert_linear(ctx,
+                                                   "experts.gate_up_proj.weight",
+                                                   "experts.gate_up_proj.bias",
+                                                   x,
+                                                   selected_experts);  // [2 * intermediate, top_k, tokens]
+                // View the fused output as pairs along the innermost axis: element 0 of
+                // each pair is taken as gate, element 1 as up.
+                fused = ggml_reshape_4d(gctx, fused, 2, intermediate_size, top_k, total_tokens);
+                auto pair_member = [&](size_t byte_offset) {
+                    return ggml_view_4d(gctx,
+                                        fused,
+                                        1,
+                                        intermediate_size,
+                                        top_k,
+                                        total_tokens,
+                                        fused->nb[1],
+                                        fused->nb[2],
+                                        fused->nb[3],
+                                        byte_offset);
+                };
+                ggml_tensor* gate_view = pair_member(0);
+                ggml_tensor* up_view   = pair_member(fused->nb[0]);
+                gate                   = ggml_reshape_3d(gctx, ggml_cont(gctx, gate_view), intermediate_size, top_k, total_tokens);
+                up                     = ggml_reshape_3d(gctx, ggml_cont(gctx, up_view), intermediate_size, top_k, total_tokens);
+            }
+
+            ggml_tensor* activated  = ggml_swiglu_oai(gctx, gate, up, 1.702f, 7.0f);
+            ggml_tensor* expert_out = expert_linear(ctx,
+                                                    "experts.down_proj.weight",
+                                                    "experts.down_proj.bias",
+                                                    activated,
+                                                    selected_experts);
+            // Scale each selected expert's output by its routing weight.
+            expert_out = ggml_mul(gctx, expert_out, weights);
+
+            // Sum over the top_k axis by adding one strided [hidden_size, tokens] slice per expert.
+            ggml_tensor* out = nullptr;
+            for (int64_t k = 0; k < top_k; ++k) {
+                ggml_tensor* slice = ggml_view_2d(gctx,
+                                                  expert_out,
+                                                  hidden_size,
+                                                  total_tokens,
+                                                  expert_out->nb[2],
+                                                  k * expert_out->nb[1]);
+                if (out == nullptr) {
+                    out = slice;
+                } else {
+                    out = ggml_add(gctx, out, slice);
+                }
+            }
+            // With a single expert `out` is still the strided view itself; it is made
+            // contiguous here, presumably because the reshape below needs contiguous input.
+            if (top_k == 1) {
+                out = ggml_cont(gctx, out);
+            }
+
+            return ggml_reshape_3d(gctx, out, hidden_size, seq_len, batch);
+        }
+    };
+
    static ggml_tensor* splice_image_embeds(GGMLRunnerContext* ctx,
                                            ggml_tensor* x,
                                            const std::vector<std::pair<int, ggml_tensor*>>& image_embeds) {
@ -601,6 +771,15 @@ namespace LLM {
        int64_t max_position_embeddings;
        std::vector<float> rope_thetas;
        std::vector<float> rope_scales;
+        bool has_attention_sinks;
+
+        // Registers the "sinks" parameter (F32, one value per attention head) when
+        // has_attention_sinks is set; otherwise no parameter is created here.
+        void init_params(ggml_context* ctx,
+                         const String2TensorStorage& tensor_storage_map = {},
+                         std::string prefix                             = "") override {
+            if (!has_attention_sinks) {
+                return;
+            }
+            params["sinks"] = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, num_heads);
+        }

    public:
        Attention(const LLMParams& params)
@ -611,11 +790,12 @@ namespace LLM {
              qk_norm(params.qk_norm),
              max_position_embeddings(params.max_position_embeddings),
              rope_thetas(params.rope_thetas),
-              rope_scales(params.rope_scales) {
+              rope_scales(params.rope_scales),
+              has_attention_sinks(params.arch == LLMArch::GPT_OSS_20B) {
            blocks["q_proj"] = std::make_shared<Linear>(params.hidden_size, num_heads * head_dim, params.qkv_bias);
            blocks["k_proj"] = std::make_shared<Linear>(params.hidden_size, num_kv_heads * head_dim, params.qkv_bias);
            blocks["v_proj"] = std::make_shared<Linear>(params.hidden_size, num_kv_heads * head_dim, params.qkv_bias);
-            blocks["o_proj"] = std::make_shared<Linear>(num_heads * head_dim, params.hidden_size, false);
+            blocks["o_proj"] = std::make_shared<Linear>(num_heads * head_dim, params.hidden_size, params.attention_out_bias);
            if (params.qk_norm) {
                blocks["q_norm"] = std::make_shared<LLMRMSNorm>(head_dim, params.rms_norm_eps, params.rms_norm_add);
                blocks["k_norm"] = std::make_shared<LLMRMSNorm>(head_dim, params.rms_norm_eps, params.rms_norm_add);
@ -660,6 +840,36 @@ namespace LLM {
            } else if (arch == LLMArch::QWEN3) {
                q = ggml_rope_ext(ctx->ggml_ctx, q, input_pos, nullptr, 128, GGML_ROPE_TYPE_NEOX, 40960, 1000000.f, 1.f, 0.f, 1.f, 32.f, 1.f);
                k = ggml_rope_ext(ctx->ggml_ctx, k, input_pos, nullptr, 128, GGML_ROPE_TYPE_NEOX, 40960, 1000000.f, 1.f, 0.f, 1.f, 32.f, 1.f);
+            } else if (arch == LLMArch::GPT_OSS_20B) {
+                float rope_theta = rope_thetas.empty() ? 150000.f : rope_thetas[0];
+                float rope_scale = rope_scales.empty() ? 32.f : rope_scales[0];
+                float freq_scale = 1.f / rope_scale;
+                q                = ggml_rope_ext(ctx->ggml_ctx,
+                                                 q,
+                                                 input_pos,
+                                                 nullptr,
+                                                 head_dim,
+                                                 GGML_ROPE_TYPE_NEOX,
+                                                 4096,
+                                                 rope_theta,
+                                                 freq_scale,
+                                                 1.f,
+                                                 1.f,
+                                                 32.f,
+                                                 1.f);
+                k                = ggml_rope_ext(ctx->ggml_ctx,
+                                                 k,
+                                                 input_pos,
+                                                 nullptr,
+                                                 head_dim,
+                                                 GGML_ROPE_TYPE_NEOX,
+                                                 4096,
+                                                 rope_theta,
+                                                 freq_scale,
+                                                 1.f,
+                                                 1.f,
+                                                 32.f,
+                                                 1.f);
            } else if (arch == LLMArch::GEMMA3_12B) {
                float rope_theta = (rope_index == 1 ? 10000.0f : 1000000.0f);
                float rope_scale = (rope_index == 1 ? 1.f : 8.f);
@ -706,7 +916,28 @@ namespace LLM {
            k = ggml_cont(ctx->ggml_ctx, ggml_ext_torch_permute(ctx->ggml_ctx, k, 0, 2, 1, 3));  // [N, num_kv_heads, n_token, head_dim]
            k = ggml_reshape_3d(ctx->ggml_ctx, k, k->ne[0], k->ne[1], k->ne[2] * k->ne[3]);      // [N*num_kv_heads, n_token, head_dim]

-            x = ggml_ext_attention_ext(ctx->ggml_ctx, ctx->backend, q, k, v, num_heads, attention_mask, true, false);  // [N, n_token, hidden_size]
+            if (arch == LLMArch::GPT_OSS_20B) {
+                GGML_ASSERT(N == 1);
+                auto v_attn = ggml_ext_cont(ctx->ggml_ctx, ggml_permute(ctx->ggml_ctx, v, 1, 2, 0, 3));  // [N, kv_heads, head_dim, tokens]
+                v_attn      = ggml_reshape_3d(ctx->ggml_ctx, v_attn, n_token, head_dim, num_kv_heads * N);
+
+                auto kq = ggml_mul_mat(ctx->ggml_ctx, k, q);
+                ggml_mul_mat_set_prec(kq, GGML_PREC_F32);
+                kq = ggml_scale_inplace(ctx->ggml_ctx, kq, 1.0f / std::sqrt(static_cast<float>(head_dim)));
+                if (attention_mask != nullptr) {
+                    kq = ggml_add_inplace(ctx->ggml_ctx, kq, attention_mask);
+                }
+                kq = ggml_soft_max_inplace(ctx->ggml_ctx, kq);
+                ggml_soft_max_add_sinks(kq, params["sinks"]);
+
+                auto kqv = ggml_mul_mat(ctx->ggml_ctx, v_attn, kq);
+                kqv      = ggml_reshape_4d(ctx->ggml_ctx, kqv, head_dim, n_token, num_heads, N);
+                kqv      = ggml_permute(ctx->ggml_ctx, kqv, 0, 2, 1, 3);
+                x        = ggml_ext_cont(ctx->ggml_ctx, kqv);
+                x        = ggml_reshape_3d(ctx->ggml_ctx, x, head_dim * num_heads, n_token, N);
+            } else {
+                x = ggml_ext_attention_ext(ctx->ggml_ctx, ctx->backend, q, k, v, num_heads, attention_mask, true, false);  // [N, n_token, hidden_size]
+            }

            x = out_proj->forward(ctx, x);  // [N, n_token, hidden_size]
            return x;
@ -726,11 +957,15 @@ namespace LLM {
              sliding_attention(0),
              has_post_attention_norm(params.arch == LLMArch::GEMMA3_12B),
              has_post_ffw_norm(params.arch == LLMArch::GEMMA3_12B) {
-            blocks["self_attn"]                = std::make_shared<Attention>(params);
-            blocks["mlp"]                      = std::make_shared<MLP>(params.hidden_size,
-                                                  params.intermediate_size,
-                                                  false,
-                                                  params.mlp_activation);
+            blocks["self_attn"] = std::make_shared<Attention>(params);
+            if (params.arch == LLMArch::GPT_OSS_20B) {
+                blocks["mlp"] = std::make_shared<GPTOSSMLP>(params);
+            } else {
+                blocks["mlp"] = std::make_shared<MLP>(params.hidden_size,
+                                                      params.intermediate_size,
+                                                      false,
+                                                      params.mlp_activation);
+            }
            blocks["input_layernorm"]          = std::make_shared<LLMRMSNorm>(params.hidden_size, params.rms_norm_eps, params.rms_norm_add);
            blocks["post_attention_layernorm"] = std::make_shared<LLMRMSNorm>(params.hidden_size, params.rms_norm_eps, params.rms_norm_add);
            if (has_post_attention_norm) {
@ -751,7 +986,6 @@ namespace LLM {
                             ggml_tensor* sliding_attention_mask = nullptr) {
            // x: [N, n_token, hidden_size]
            auto self_attn                                  = std::dynamic_pointer_cast<Attention>(blocks["self_attn"]);
-            auto mlp                                        = std::dynamic_pointer_cast<MLP>(blocks["mlp"]);
            auto input_layernorm                            = std::dynamic_pointer_cast<LLMRMSNorm>(blocks["input_layernorm"]);
            auto post_attention_layernorm                   = std::dynamic_pointer_cast<LLMRMSNorm>(blocks["post_attention_layernorm"]);
            std::shared_ptr<LLMRMSNorm> post_attention_norm = nullptr;
@ -764,7 +998,7 @@ namespace LLM {
            }
            ggml_tensor* block_attention_mask = attention_mask;
            int rope_index                    = 0;
-            if (arch == LLMArch::GEMMA3_12B && sliding_attention > 0) {
+            if ((arch == LLMArch::GEMMA3_12B || arch == LLMArch::GPT_OSS_20B) && sliding_attention > 0) {
                block_attention_mask = sliding_attention_mask;
                rope_index           = 1;
            }
@ -779,7 +1013,13 @@ namespace LLM {

            residual = x;
            x        = post_attention_layernorm->forward(ctx, x);
-            x        = mlp->forward(ctx, x);
+            if (arch == LLMArch::GPT_OSS_20B) {
+                auto mlp = std::dynamic_pointer_cast<GPTOSSMLP>(blocks["mlp"]);
+                x        = mlp->forward(ctx, x);
+            } else {
+                auto mlp = std::dynamic_pointer_cast<MLP>(blocks["mlp"]);
+                x        = mlp->forward(ctx, x);
+            }
            if (post_ffw_norm != nullptr) {
                x = post_ffw_norm->forward(ctx, x);
            }
@ -1202,6 +1442,24 @@ namespace LLM {
                params.rope_thetas             = {1000000.f, 10000.f};
                params.rope_scales             = {8.f, 1.f};
                params.sliding_attention       = {1024, 1024, 1024, 1024, 1024, 0};
+            } else if (arch == LLMArch::GPT_OSS_20B) {
+                params.head_dim                = 64;
+                params.num_heads               = 64;
+                params.num_kv_heads            = 8;
+                params.qkv_bias                = true;
+                params.attention_out_bias      = true;
+                params.qk_norm                 = false;
+                params.rms_norm_eps            = 1e-5f;
+                params.hidden_size             = 2880;
+                params.intermediate_size       = 2880;
+                params.num_layers              = 24;
+                params.vocab_size              = 201088;
+                params.max_position_embeddings = 131072;
+                params.rope_thetas             = {150000.f};
+                params.rope_scales             = {32.f};
+                params.sliding_attention       = {128, 0};
+                params.num_experts             = 32;
+                params.num_experts_per_tok     = 4;
            }
            bool have_vision_weight = false;
            bool llama_cpp_style    = false;
@ -1236,6 +1494,12 @@ namespace LLM {
                if (contains(tensor_name, "layers.0.mlp.gate_proj.weight")) {
                    params.intermediate_size = pair.second.ne[1];
                }
+                if (contains(tensor_name, "layers.0.mlp.experts.gate_up_proj.weight")) {
+                    params.intermediate_size = pair.second.ne[1] / 2;
+                }
+                if (contains(tensor_name, "layers.0.mlp.experts.gate_proj.weight")) {
+                    params.intermediate_size = pair.second.ne[1];
+                }
            }
            if (arch == LLMArch::QWEN3 && params.num_layers == 28) {  // Qwen3 2B
                params.num_heads = 16;
@ -1315,7 +1579,8 @@ namespace LLM {
            if (params.arch == LLMArch::MISTRAL_SMALL_3_2 ||
                params.arch == LLMArch::MINISTRAL_3_3B ||
                params.arch == LLMArch::QWEN3 ||
-                params.arch == LLMArch::GEMMA3_12B) {
+                params.arch == LLMArch::GEMMA3_12B ||
+                params.arch == LLMArch::GPT_OSS_20B) {
                input_pos_vec.resize(n_tokens);
                for (int i = 0; i < n_tokens; ++i) {
                    input_pos_vec[i] = i;
@ -1354,7 +1619,11 @@ namespace LLM {
                set_backend_tensor_data(attention_mask, attention_mask_vec.data());
            }

-            if (params.arch == LLMArch::GEMMA3_12B) {
+            if (params.arch == LLMArch::GEMMA3_12B || params.arch == LLMArch::GPT_OSS_20B) {
+                int sliding_window = 0;
+                for (int window : params.sliding_attention) {
+                    sliding_window = std::max(sliding_window, window);
+                }
                sliding_attention_mask_vec.resize(n_tokens * n_tokens);
                if (!attention_mask_tensor.empty()) {
                    GGML_ASSERT(attention_mask_tensor.numel() == n_tokens * n_tokens);
@ -1364,8 +1633,7 @@ namespace LLM {
                }
                for (int i0 = 0; i0 < n_tokens; i0++) {
                    for (int i1 = 0; i1 < n_tokens; i1++) {
-                        if (i0 + 1024 <= i1) {
-                            LOG_DEBUG("xxxxxxxxxxxxxx");
+                        if (sliding_window > 0 && i0 + sliding_window <= i1) {
                            sliding_attention_mask_vec[i1 * n_tokens + i0] = -INFINITY;
                        }
                    }
@ -1485,6 +1753,8 @@ namespace LLM {
            : model(arch, backend, params_backend, tensor_storage_map, prefix, enable_vision) {
            if (arch == LLMArch::MISTRAL_SMALL_3_2 || arch == LLMArch::MINISTRAL_3_3B) {
                tokenizer = std::make_shared<MistralTokenizer>();
+            } else if (arch == LLMArch::GPT_OSS_20B) {
+                tokenizer = std::make_shared<GPTOSSTokenizer>();
            } else {
                tokenizer = std::make_shared<Qwen2Tokenizer>();
            }
--- a/src/model.cpp
+++ b/src/model.cpp
@ -442,6 +442,10 @@ SDVersion ModelLoader::get_sd_version() {
            tensor_storage_map.find("model.language_model.layers.0.self_attn.q_proj.weight") != tensor_storage_map.end()) {
            return VERSION_HIDREAM_O1;
        }
+        if (tensor_storage.name.find("model.diffusion_model.transformer_blocks.0.attn.norm_added_q.weight") != std::string::npos &&
+            tensor_storage_map.find("model.diffusion_model.transformer_blocks.0.img_mlp.w1.weight") != tensor_storage_map.end()) {
+            return VERSION_LENS;
+        }
        if (tensor_storage.name.find("model.diffusion_model.transformer_blocks.0.img_mod.1.weight") != std::string::npos) {
            return VERSION_QWEN_IMAGE;
        }
--- a/src/model.h
+++ b/src/model.h
@ -47,6 +47,7 @@ enum SDVersion {
    VERSION_Z_IMAGE,
    VERSION_OVIS_IMAGE,
    VERSION_ERNIE_IMAGE,
+    VERSION_LENS,
    VERSION_LONGCAT,
    VERSION_COUNT,
 };
@ -156,8 +157,15 @@ static inline bool sd_version_is_ernie_image(SDVersion version) {
    return false;
 }

+// Returns true only when `version` is VERSION_LENS.
+static inline bool sd_version_is_lens(SDVersion version) {
+    return version == VERSION_LENS;
+}
+
 static inline bool sd_version_uses_flux2_vae(SDVersion version) {
-    if (sd_version_is_flux2(version) || sd_version_is_ernie_image(version)) {
+    if (sd_version_is_flux2(version) || sd_version_is_ernie_image(version) || sd_version_is_lens(version)) {
        return true;
    }
    return false;
@ -185,6 +193,7 @@ static inline bool sd_version_is_dit(SDVersion version) {
        sd_version_is_anima(version) ||
        sd_version_is_z_image(version) ||
        sd_version_is_ernie_image(version) ||
+        sd_version_is_lens(version) ||
        sd_version_is_longcat(version)) {
        return true;
    }
--- a/src/name_conversion.cpp
+++ b/src/name_conversion.cpp
@ -128,6 +128,7 @@ std::string convert_cond_stage_model_name(std::string name, std::string prefix)
    };

    static const std::vector<std::pair<std::string, std::string>> llm_name_map{
+        {"attn_sinks.weight", "self_attn.sinks"},
        {"token_embd.", "model.embed_tokens."},
        {"blk.", "model.layers."},
        {"attn_q.", "self_attn.q_proj."},
@ -137,6 +138,12 @@ std::string convert_cond_stage_model_name(std::string name, std::string prefix)
        {"attn_k_norm.", "self_attn.k_norm."},
        {"attn_output.", "self_attn.o_proj."},
        {"attn_norm.", "input_layernorm."},
+        {"attn_post_norm.", "post_attention_layernorm."},
+        {"post_attention_norm.", "post_attention_layernorm."},
+        {"ffn_gate_inp.", "mlp.router."},
+        {"ffn_gate_exps.", "mlp.experts.gate_proj."},
+        {"ffn_up_exps.", "mlp.experts.up_proj."},
+        {"ffn_down_exps.", "mlp.experts.down_proj."},
        {"ffn_down.", "mlp.down_proj."},
        {"ffn_gate.", "mlp.gate_proj."},
        {"ffn_up.", "mlp.up_proj."},
@ -144,6 +151,12 @@ std::string convert_cond_stage_model_name(std::string name, std::string prefix)
        {"output_norm.", "model.norm."},
    };

+    static const std::vector<std::pair<std::string, std::string>> llm_safetensors_prefix_map{
+        {"text_encoders.llm.embed_tokens.", "text_encoders.llm.model.embed_tokens."},
+        {"text_encoders.llm.layers.", "text_encoders.llm.model.layers."},
+        {"text_encoders.llm.norm.", "text_encoders.llm.model.norm."},
+    };
+
    static const std::vector<std::pair<std::string, std::string>> llm_vision_name_map{
        {"mm.", "merger.mlp."},
        {"v.post_ln.", "merger.ln_q."},
@ -168,6 +181,7 @@ std::string convert_cond_stage_model_name(std::string name, std::string prefix)
            replace_with_name_map(name, llm_vision_name_map);
        } else {
            replace_with_name_map(name, llm_name_map);
+            replace_with_prefix_map(name, llm_safetensors_prefix_map);
        }
    } else {
        name = convert_open_clip_to_hf_clip_name(name);
--- a/src/rope.hpp
+++ b/src/rope.hpp
@ -478,6 +478,52 @@ namespace Rope {
        return embed_nd(ids, bs, static_cast<float>(theta), axes_dim, wrap_dims);
    }

+    // Builds the position ids for a Lens sequence: image ids from gen_flux_img_ids
+    // (patch size 1, 3 axes) concatenated with text ids via concat_ids.
+    // Every text id row repeats one scalar position on all three axes. Text positions
+    // come from linspace and start at max(h / 2, w / 2) when scale_rope is true, else at 0.
+    __STATIC_INLINE__ std::vector<std::vector<float>> gen_lens_ids(int h,
+                                                                   int w,
+                                                                   int bs,
+                                                                   int context_len,
+                                                                   bool scale_rope = true) {
+        auto img_rows = gen_flux_img_ids(h, w, 1, bs, 3, 0, 0, 0, scale_rope);
+
+        int txt_offset = 0;
+        if (scale_rope) {
+            txt_offset = std::max(h / 2, w / 2);
+        }
+        auto txt_positions = linspace<float>(1.f * txt_offset, 1.f * context_len + txt_offset, context_len);
+
+        // One 3-component row per (batch, text token) pair, laid out batch-major.
+        std::vector<std::vector<float>> txt_rows(bs * context_len, std::vector<float>(3));
+        const size_t per_batch = txt_positions.size();
+        for (int b = 0; b < bs; ++b) {
+            for (size_t t = 0; t < per_batch; ++t) {
+                const float pos             = txt_positions[t];
+                txt_rows[b * per_batch + t] = {pos, pos, pos};
+            }
+        }
+
+        return concat_ids(img_rows, txt_rows, bs);
+    }
+
+    // Computes the rotary position embedding for a Lens sequence by feeding the ids
+    // from gen_lens_ids (scale_rope = true) to embed_nd.
+    // When circular_h or circular_w is requested (and bs > 0, axes_dim has >= 3 entries),
+    // wrap_dims gets one row per axis and one entry per position; the first h * w
+    // positions get wrap value h on axis 1 and/or w on axis 2, all other entries stay 0.
+    // NOTE(review): this assumes the first h * w positions are the image tokens - confirm
+    // against concat_ids / gen_flux_img_ids.
+    __STATIC_INLINE__ std::vector<float> gen_lens_pe(int h,
+                                                     int w,
+                                                     int bs,
+                                                     int context_len,
+                                                     int theta,
+                                                     bool circular_h,
+                                                     bool circular_w,
+                                                     const std::vector<int>& axes_dim) {
+        std::vector<std::vector<float>> ids = gen_lens_ids(h, w, bs, context_len, true);
+
+        std::vector<std::vector<int>> wrap_dims;
+        const bool any_circular = circular_h || circular_w;
+        if (any_circular && bs > 0 && axes_dim.size() >= 3) {
+            const size_t positions_per_batch = ids.size() / bs;
+            const size_t image_token_count   = static_cast<size_t>(h) * static_cast<size_t>(w);
+            wrap_dims.assign(axes_dim.size(), std::vector<int>(positions_per_batch, 0));
+            if (circular_h) {
+                for (size_t idx = 0; idx < image_token_count; ++idx) {
+                    wrap_dims[1][idx] = h;
+                }
+            }
+            if (circular_w) {
+                for (size_t idx = 0; idx < image_token_count; ++idx) {
+                    wrap_dims[2][idx] = w;
+                }
+            }
+        }
+
+        return embed_nd(ids, bs, static_cast<float>(theta), axes_dim, wrap_dims);
+    }
+
    __STATIC_INLINE__ std::vector<std::vector<float>> gen_ernie_image_ids(int h,
                                                                          int w,
                                                                          int patch_size,
--- a/src/stable-diffusion.cpp
+++ b/src/stable-diffusion.cpp
@ -62,6 +62,7 @@ const char* model_version_to_str[] = {
    "Z-Image",
    "Ovis Image",
    "Ernie Image",
+    "Lens",
    "Longcat-Image",
 };

@ -646,6 +647,15 @@ public:
                                                                    params_backend_for(SDBackendModule::DIFFUSION),
                                                                    tensor_storage_map,
                                                                    "model.diffusion_model");
+            } else if (sd_version_is_lens(version)) {
+                cond_stage_model = std::make_shared<LLMEmbedder>(backend_for(SDBackendModule::TE),
+                                                                 params_backend_for(SDBackendModule::TE),
+                                                                 tensor_storage_map,
+                                                                 version);
+                diffusion_model  = std::make_shared<LensModel>(backend_for(SDBackendModule::DIFFUSION),
+                                                              params_backend_for(SDBackendModule::DIFFUSION),
+                                                              tensor_storage_map,
+                                                              "model.diffusion_model");
            } else {  // SD1.x SD2.x SDXL
                std::map<std::string, std::string> embbeding_map;
                for (uint32_t i = 0; i < sd_ctx_params->embedding_count; i++) {
@ -935,6 +945,11 @@ public:
            ignore_tensors.insert("text_encoders.llm.vision_tower.");
            ignore_tensors.insert("text_encoders.llm.multi_modal_projector.");
        }
+        if (sd_version_is_lens(version)) {
+            ignore_tensors.insert("text_encoders.llm.tokenizer_json");
+            ignore_tensors.insert("text_encoders.llm.model.layers.0.mlp.experts.gate_up_proj.weight_scale_2");
+            ignore_tensors.insert("text_encoders.llm.model.layers.0.mlp.experts.down_proj.weight_scale_2");
+        }
        if (version == VERSION_HIDREAM_O1) {
            ignore_tensors.insert("lm_head.");
            ignore_tensors.insert("model.visual.deepstack_merger_list.");
@ -1115,7 +1130,7 @@ public:
                    } else {
                        default_flow_shift = 3.f;
                    }
-                } else if (sd_version_is_flux(version) || sd_version_is_longcat(version)) {
+                } else if (sd_version_is_flux(version) || sd_version_is_longcat(version) || sd_version_is_lens(version)) {
                    pred_type = FLUX_FLOW_PRED;

                    default_flow_shift = 1.0f;  // TODO: validate
@ -1127,6 +1142,8 @@ public:
                    }
                    if (sd_version_is_longcat(version)) {
                        default_flow_shift = 3.0f;
+                    } else if (sd_version_is_lens(version)) {
+                        default_flow_shift = 1.83f;
                    }
                } else if (sd_version_is_flux2(version)) {
                    pred_type = FLUX2_FLOW_PRED;