diff --git a/src/conditioner.hpp b/src/conditioner.hpp index 3963f3ab..9cee68a9 100644 --- a/src/conditioner.hpp +++ b/src/conditioner.hpp @@ -1696,11 +1696,15 @@ struct LLMEmbedder : public Conditioner { arch = LLM::LLMArch::MISTRAL_SMALL_3_2; } else if (sd_version_is_ernie_image(version)) { arch = LLM::LLMArch::MINISTRAL_3_3B; + } else if (sd_version_is_lens(version)) { + arch = LLM::LLMArch::GPT_OSS_20B; } else if (sd_version_is_z_image(version) || version == VERSION_OVIS_IMAGE || version == VERSION_FLUX2_KLEIN) { arch = LLM::LLMArch::QWEN3; } if (arch == LLM::LLMArch::MISTRAL_SMALL_3_2 || arch == LLM::LLMArch::MINISTRAL_3_3B) { tokenizer = std::make_shared(); + } else if (arch == LLM::LLMArch::GPT_OSS_20B) { + tokenizer = std::make_shared(); } else { tokenizer = std::make_shared(); } @@ -1871,6 +1875,7 @@ struct LLMEmbedder : public Conditioner { std::vector>> image_embeds; int prompt_template_encode_start_idx = 34; int min_length = 0; // pad tokens + int max_length = 100000000; int hidden_states_min_length = 0; // zero pad hidden_states bool spell_quotes = false; std::set out_layers; @@ -2029,6 +2034,30 @@ struct LLMEmbedder : public Conditioner { prompt_attn_range.first = 0; prompt += conditioner_params.text; prompt_attn_range.second = static_cast(prompt.size()); + } else if (sd_version_is_lens(version)) { + prompt_template_encode_start_idx = 97; + min_length = 0; + max_length = 512; + out_layers = {6, 12, 18, 24}; + + prompt = + "<|start|>system<|message|>You are ChatGPT, a large language model trained by OpenAI.\n" + "Knowledge cutoff: 2024-06\n" + "Current date: 2026-05-26\n" // fix for current date + "\n" + "Reasoning: medium\n" + "\n" + "# Valid channels: analysis, commentary, final. Channel must be included for every message.<|end|><|start|>developer<|message|># Instructions\n" + "\n" + "Describe the image by detailing the color, shape, size, texture, quantity, text, spatial relationships of the objects and background.\n" + "\n" + "<|end|><|start|>user<|message|>"; + + prompt_attn_range.first = static_cast(prompt.size()); + prompt += conditioner_params.text; + prompt_attn_range.second = static_cast(prompt.size()); + + prompt += "<|end|><|start|>assistant<|channel|>analysis<|message|>Need to generate one image according to the description.<|end|><|start|>assistant<|channel|>final<|message|>"; } else if (sd_version_is_z_image(version)) { prompt_template_encode_start_idx = 0; out_layers = {35}; // -2 @@ -2085,7 +2114,8 @@ struct LLMEmbedder : public Conditioner { image_embeds, out_layers, prompt_template_encode_start_idx, - spell_quotes); + spell_quotes, + max_length); std::vector> extra_hidden_states_vec; for (int i = 0; i < extra_prompts.size(); i++) { auto extra_hidden_states = encode_prompt(n_threads, @@ -2096,7 +2126,8 @@ struct LLMEmbedder : public Conditioner { image_embeds, out_layers, prompt_template_encode_start_idx, - spell_quotes); + spell_quotes, + max_length); extra_hidden_states_vec.push_back(std::move(extra_hidden_states)); } diff --git a/src/diffusion_model.hpp b/src/diffusion_model.hpp index 9e4e444e..d774c6a0 100644 --- a/src/diffusion_model.hpp +++ b/src/diffusion_model.hpp @@ -6,6 +6,7 @@ #include "ernie_image.hpp" #include "flux.hpp" #include "hidream_o1.hpp" +#include "lens.hpp" #include "ltxv.hpp" #include "mmdit.hpp" #include "qwen_image.hpp" @@ -701,6 +702,72 @@ struct ErnieImageModel : public DiffusionModel { } }; +struct LensModel : public DiffusionModel { + std::string prefix; + Lens::LensRunner lens; + + LensModel(ggml_backend_t backend, + ggml_backend_t params_backend, + const String2TensorStorage& tensor_storage_map = {}, + const std::string prefix = "model.diffusion_model") + : prefix(prefix), lens(backend, params_backend, tensor_storage_map, prefix) { + } + + std::string get_desc() override { + return lens.get_desc(); + } + + void alloc_params_buffer() override { + lens.alloc_params_buffer(); + } + + void free_params_buffer() override { + lens.free_params_buffer(); + } + + void free_compute_buffer() override { + lens.free_compute_buffer(); + } + + void get_param_tensors(std::map& tensors) override { + lens.get_param_tensors(tensors, prefix); + } + + size_t get_params_buffer_size() override { + return lens.get_params_buffer_size(); + } + + void set_weight_adapter(const std::shared_ptr& adapter) override { + lens.set_weight_adapter(adapter); + } + + int64_t get_adm_in_channels() override { + return 768; + } + + void set_flash_attention_enabled(bool enabled) { + lens.set_flash_attention_enabled(enabled); + } + + void set_max_graph_vram_bytes(size_t max_vram_bytes) override { + lens.set_max_graph_vram_bytes(max_vram_bytes); + } + + void set_circular_axes(bool circular_x, bool circular_y) override { + lens.set_circular_axes(circular_x, circular_y); + } + + sd::Tensor compute(int n_threads, + const DiffusionParams& diffusion_params) override { + GGML_ASSERT(diffusion_params.x != nullptr); + GGML_ASSERT(diffusion_params.timesteps != nullptr); + return lens.compute(n_threads, + *diffusion_params.x, + *diffusion_params.timesteps, + tensor_or_empty(diffusion_params.context)); + } +}; + struct LTXAVModel : public DiffusionModel { std::string prefix; LTXV::LTXAVRunner ltxav; diff --git a/src/lens.hpp b/src/lens.hpp new file mode 100644 index 00000000..c7a4e227 --- /dev/null +++ b/src/lens.hpp @@ -0,0 +1,408 @@ +#ifndef __SD_LENS_HPP__ +#define __SD_LENS_HPP__ + +#include +#include + +#include "common_block.hpp" +#include "flux.hpp" +#include "qwen_image.hpp" +#include "rope.hpp" + +namespace Lens { + constexpr int LENS_GRAPH_SIZE = 40960; + + struct LensTimestepProjEmbeddings : public GGMLBlock { + LensTimestepProjEmbeddings(int64_t embedding_dim) { + blocks["timestep_embedder"] = std::make_shared(256, embedding_dim); + } + + ggml_tensor* forward(GGMLRunnerContext* ctx, ggml_tensor* timesteps) { + auto timestep_embedder = std::dynamic_pointer_cast(blocks["timestep_embedder"]); + auto timesteps_proj = ggml_ext_timestep_embedding(ctx->ggml_ctx, timesteps, 256, 10000, 1000.f); + return timestep_embedder->forward(ctx, timesteps_proj); + } + }; + + struct LensGateMLP : public GGMLBlock { + LensGateMLP(int64_t dim, int64_t hidden_dim) { + blocks["w1"] = std::make_shared(dim, hidden_dim, false); + blocks["w2"] = std::make_shared(hidden_dim, dim, false); + blocks["w3"] = std::make_shared(dim, hidden_dim, false); + } + + ggml_tensor* forward(GGMLRunnerContext* ctx, ggml_tensor* x) { + auto w1 = std::dynamic_pointer_cast(blocks["w1"]); + auto w2 = std::dynamic_pointer_cast(blocks["w2"]); + auto w3 = std::dynamic_pointer_cast(blocks["w3"]); + + auto gate = ggml_silu(ctx->ggml_ctx, w1->forward(ctx, x)); + auto up = w3->forward(ctx, x); + x = ggml_mul(ctx->ggml_ctx, gate, up); + return w2->forward(ctx, x); + } + }; + + struct LensJointAttention : public GGMLBlock { + int64_t dim_head; + int64_t num_heads; + + LensJointAttention(int64_t query_dim, + int64_t dim_head, + int64_t num_heads, + float eps = 1e-5f) + : dim_head(dim_head), num_heads(num_heads) { + int64_t inner_dim = dim_head * num_heads; + blocks["img_qkv"] = std::make_shared(query_dim, inner_dim * 3, true); + blocks["txt_qkv"] = std::make_shared(query_dim, inner_dim * 3, true); + + blocks["norm_q"] = std::make_shared(dim_head, eps); + blocks["norm_k"] = std::make_shared(dim_head, eps); + blocks["norm_added_q"] = std::make_shared(dim_head, eps); + blocks["norm_added_k"] = std::make_shared(dim_head, eps); + + blocks["to_out.0"] = std::make_shared(inner_dim, query_dim, true); + blocks["to_add_out"] = std::make_shared(inner_dim, query_dim, true); + } + + std::pair forward(GGMLRunnerContext* ctx, + ggml_tensor* img, + ggml_tensor* txt, + ggml_tensor* pe, + ggml_tensor* mask = nullptr) { + auto img_qkv = std::dynamic_pointer_cast(blocks["img_qkv"]); + auto txt_qkv = std::dynamic_pointer_cast(blocks["txt_qkv"]); + auto norm_q = std::dynamic_pointer_cast(blocks["norm_q"]); + auto norm_k = std::dynamic_pointer_cast(blocks["norm_k"]); + auto norm_add_q = std::dynamic_pointer_cast(blocks["norm_added_q"]); + auto norm_add_k = std::dynamic_pointer_cast(blocks["norm_added_k"]); + auto to_out_0 = std::dynamic_pointer_cast(blocks["to_out.0"]); + auto to_add_out = std::dynamic_pointer_cast(blocks["to_add_out"]); + int64_t n_img = img->ne[1]; + int64_t n_txt = txt->ne[1]; + int64_t N = img->ne[2]; + int64_t inner = dim_head * num_heads; + + auto img_qkv_vec = split_qkv(ctx->ggml_ctx, img_qkv->forward(ctx, img)); + auto txt_qkv_vec = split_qkv(ctx->ggml_ctx, txt_qkv->forward(ctx, txt)); + + auto img_q = ggml_reshape_4d(ctx->ggml_ctx, img_qkv_vec[0], dim_head, num_heads, n_img, N); + auto img_k = ggml_reshape_4d(ctx->ggml_ctx, img_qkv_vec[1], dim_head, num_heads, n_img, N); + auto img_v = ggml_reshape_4d(ctx->ggml_ctx, img_qkv_vec[2], dim_head, num_heads, n_img, N); + + img_q = norm_q->forward(ctx, img_q); + img_k = norm_k->forward(ctx, img_k); + + auto txt_q = ggml_reshape_4d(ctx->ggml_ctx, txt_qkv_vec[0], dim_head, num_heads, n_txt, N); + auto txt_k = ggml_reshape_4d(ctx->ggml_ctx, txt_qkv_vec[1], dim_head, num_heads, n_txt, N); + auto txt_v = ggml_reshape_4d(ctx->ggml_ctx, txt_qkv_vec[2], dim_head, num_heads, n_txt, N); + + txt_q = norm_add_q->forward(ctx, txt_q); + txt_k = norm_add_k->forward(ctx, txt_k); + + auto q = ggml_concat(ctx->ggml_ctx, img_q, txt_q, 2); + auto k = ggml_concat(ctx->ggml_ctx, img_k, txt_k, 2); + auto v = ggml_concat(ctx->ggml_ctx, img_v, txt_v, 2); + + auto attn = Rope::attention(ctx, q, k, v, pe, mask, (1.0f / 128.f)); + + auto img_attn_out = ggml_view_3d(ctx->ggml_ctx, + attn, + inner, + n_img, + N, + attn->nb[1], + attn->nb[2], + 0); + auto txt_attn_out = ggml_view_3d(ctx->ggml_ctx, + attn, + inner, + n_txt, + N, + attn->nb[1], + attn->nb[2], + n_img * attn->nb[1]); + + img_attn_out = to_out_0->forward(ctx, ggml_cont(ctx->ggml_ctx, img_attn_out)); + txt_attn_out = to_add_out->forward(ctx, ggml_cont(ctx->ggml_ctx, txt_attn_out)); + return {img_attn_out, txt_attn_out}; + } + }; + + struct LensTransformerBlock : public GGMLBlock { + LensTransformerBlock(int64_t dim, + int64_t num_attention_heads, + int64_t attention_head_dim, + float eps = 1e-6f) { + int64_t mlp_hidden_dim = dim / 3 * 8; + blocks["img_mod.1"] = std::make_shared(dim, 6 * dim, true); + blocks["txt_mod.1"] = std::make_shared(dim, 6 * dim, true); + blocks["img_norm1"] = std::make_shared(dim, eps); + blocks["img_norm2"] = std::make_shared(dim, eps); + blocks["txt_norm1"] = std::make_shared(dim, eps); + blocks["txt_norm2"] = std::make_shared(dim, eps); + blocks["img_mlp"] = std::make_shared(dim, mlp_hidden_dim); + blocks["txt_mlp"] = std::make_shared(dim, mlp_hidden_dim); + blocks["attn"] = std::make_shared(dim, attention_head_dim, num_attention_heads); + } + + std::pair forward(GGMLRunnerContext* ctx, + ggml_tensor* img, + ggml_tensor* txt, + ggml_tensor* t_emb, + ggml_tensor* pe) { + auto img_mod_1 = std::dynamic_pointer_cast(blocks["img_mod.1"]); + auto txt_mod_1 = std::dynamic_pointer_cast(blocks["txt_mod.1"]); + auto img_norm1 = std::dynamic_pointer_cast(blocks["img_norm1"]); + auto img_norm2 = std::dynamic_pointer_cast(blocks["img_norm2"]); + auto txt_norm1 = std::dynamic_pointer_cast(blocks["txt_norm1"]); + auto txt_norm2 = std::dynamic_pointer_cast(blocks["txt_norm2"]); + auto img_mlp = std::dynamic_pointer_cast(blocks["img_mlp"]); + auto txt_mlp = std::dynamic_pointer_cast(blocks["txt_mlp"]); + auto attn = std::dynamic_pointer_cast(blocks["attn"]); + + auto temb = ggml_silu(ctx->ggml_ctx, t_emb); + + auto img_mod_params = img_mod_1->forward(ctx, temb); + auto img_mod_vec = ggml_ext_chunk(ctx->ggml_ctx, img_mod_params, 6, 0); + auto txt_mod_params = txt_mod_1->forward(ctx, temb); + auto txt_mod_vec = ggml_ext_chunk(ctx->ggml_ctx, txt_mod_params, 6, 0); + + auto img_normed = img_norm1->forward(ctx, img); + auto img_modulated = Flux::modulate(ctx->ggml_ctx, img_normed, img_mod_vec[0], img_mod_vec[1]); + auto txt_normed = txt_norm1->forward(ctx, txt); + auto txt_modulated = Flux::modulate(ctx->ggml_ctx, txt_normed, txt_mod_vec[0], txt_mod_vec[1]); + + auto [img_attn_output, txt_attn_output] = attn->forward(ctx, img_modulated, txt_modulated, pe); + + img = ggml_add(ctx->ggml_ctx, img, ggml_mul(ctx->ggml_ctx, img_attn_output, img_mod_vec[2])); + txt = ggml_add(ctx->ggml_ctx, txt, ggml_mul(ctx->ggml_ctx, txt_attn_output, txt_mod_vec[2])); + + auto img_normed2 = img_norm2->forward(ctx, img); + auto img_modulated2 = Flux::modulate(ctx->ggml_ctx, img_normed2, img_mod_vec[3], img_mod_vec[4]); + auto txt_normed2 = txt_norm2->forward(ctx, txt); + auto txt_modulated2 = Flux::modulate(ctx->ggml_ctx, txt_normed2, txt_mod_vec[3], txt_mod_vec[4]); + + img = ggml_add(ctx->ggml_ctx, img, ggml_mul(ctx->ggml_ctx, img_mlp->forward(ctx, img_modulated2), img_mod_vec[5])); + txt = ggml_add(ctx->ggml_ctx, txt, ggml_mul(ctx->ggml_ctx, txt_mlp->forward(ctx, txt_modulated2), txt_mod_vec[5])); + return {img, txt}; + } + }; + + struct LensAdaLayerNormContinuous : public GGMLBlock { + int64_t hidden_size; + float eps; + + LensAdaLayerNormContinuous(int64_t hidden_size, float eps = 1e-6f) + : hidden_size(hidden_size), eps(eps) { + blocks["linear"] = std::make_shared(hidden_size, hidden_size * 2, true); + } + + ggml_tensor* forward(GGMLRunnerContext* ctx, ggml_tensor* x, ggml_tensor* conditioning) { + auto linear = std::dynamic_pointer_cast(blocks["linear"]); + auto mods = ggml_ext_chunk(ctx->ggml_ctx, linear->forward(ctx, ggml_silu(ctx->ggml_ctx, conditioning)), 2, 0); + auto scale = mods[0]; + auto shift = mods[1]; + x = ggml_norm(ctx->ggml_ctx, x, eps); + return Flux::modulate(ctx->ggml_ctx, x, shift, scale); + } + }; + + struct LensParams { + int patch_size = 2; + int64_t in_channels = 128; + int64_t out_channels = 32; + int num_layers = 48; + int64_t attention_head_dim = 64; + int64_t num_attention_heads = 24; + int64_t joint_attention_dim = 2880; + int selected_layer_count = 4; + int theta = 10000; + std::vector axes_dim = {8, 28, 28}; + int axes_dim_sum = 64; + }; + + class LensModel : public GGMLBlock { + public: + LensParams params; + + LensModel() = default; + LensModel(LensParams params) + : params(params) { + int64_t inner_dim = params.num_attention_heads * params.attention_head_dim; + blocks["time_text_embed"] = std::make_shared(inner_dim); + blocks["img_in"] = std::make_shared(params.in_channels, inner_dim, true); + blocks["txt_in"] = std::make_shared(params.joint_attention_dim * params.selected_layer_count, inner_dim, true); + for (int i = 0; i < params.selected_layer_count; ++i) { + blocks["txt_norm." + std::to_string(i)] = std::make_shared(params.joint_attention_dim, 1e-5f); + } + for (int i = 0; i < params.num_layers; ++i) { + blocks["transformer_blocks." + std::to_string(i)] = std::make_shared(inner_dim, + params.num_attention_heads, + params.attention_head_dim); + } + blocks["norm_out"] = std::make_shared(inner_dim, 1e-6f); + blocks["proj_out"] = std::make_shared(inner_dim, params.patch_size * params.patch_size * params.out_channels, true); + } + + ggml_tensor* forward(GGMLRunnerContext* ctx, + ggml_tensor* x, + ggml_tensor* timestep, + ggml_tensor* context, + ggml_tensor* pe) { + GGML_ASSERT(context != nullptr); + int64_t W = x->ne[0]; + int64_t H = x->ne[1]; + int64_t C = x->ne[2]; + int64_t N = x->ne[3]; + + auto time_text_embed = std::dynamic_pointer_cast(blocks["time_text_embed"]); + auto img_in = std::dynamic_pointer_cast(blocks["img_in"]); + auto txt_in = std::dynamic_pointer_cast(blocks["txt_in"]); + auto norm_out = std::dynamic_pointer_cast(blocks["norm_out"]); + auto proj_out = std::dynamic_pointer_cast(blocks["proj_out"]); + + auto t_emb = time_text_embed->forward(ctx, timestep); + + auto img = ggml_reshape_3d(ctx->ggml_ctx, x, W * H, C, N); + img = ggml_cont(ctx->ggml_ctx, ggml_ext_torch_permute(ctx->ggml_ctx, img, 1, 0, 2, 3)); + img = img_in->forward(ctx, img); + + std::vector txt_chunks = ggml_ext_chunk(ctx->ggml_ctx, context, params.selected_layer_count, 0); + ggml_tensor* txt = nullptr; + for (int i = 0; i < params.selected_layer_count; ++i) { + auto txt_norm = std::dynamic_pointer_cast(blocks["txt_norm." + std::to_string(i)]); + auto chunk = txt_norm->forward(ctx, txt_chunks[i]); + txt = txt == nullptr ? chunk : ggml_concat(ctx->ggml_ctx, txt, chunk, 0); + } + txt = txt_in->forward(ctx, txt); + + sd::ggml_graph_cut::mark_graph_cut(img, "lens.prelude", "img"); + sd::ggml_graph_cut::mark_graph_cut(txt, "lens.prelude", "txt"); + + for (int i = 0; i < params.num_layers; ++i) { + auto block = std::dynamic_pointer_cast(blocks["transformer_blocks." + std::to_string(i)]); + auto out = block->forward(ctx, img, txt, t_emb, pe); + img = out.first; + txt = out.second; + sd::ggml_graph_cut::mark_graph_cut(img, "lens.transformer_blocks." + std::to_string(i), "img"); + sd::ggml_graph_cut::mark_graph_cut(txt, "lens.transformer_blocks." + std::to_string(i), "txt"); + } + + img = norm_out->forward(ctx, img, t_emb); + img = proj_out->forward(ctx, img); + + auto out = ggml_cont(ctx->ggml_ctx, ggml_ext_torch_permute(ctx->ggml_ctx, img, 1, 0, 2, 3)); + out = ggml_reshape_4d(ctx->ggml_ctx, out, W, H, params.patch_size * params.patch_size * params.out_channels, N); + return out; + } + }; + + struct LensRunner : public GGMLRunner { + LensParams lens_params; + LensModel lens; + std::vector pe_vec; + + LensRunner(ggml_backend_t backend, + ggml_backend_t params_backend, + const String2TensorStorage& tensor_storage_map = {}, + const std::string prefix = "") + : GGMLRunner(backend, params_backend) { + lens_params.num_layers = 0; + for (const auto& [name, tensor_storage] : tensor_storage_map) { + if (!starts_with(name, prefix)) { + continue; + } + if (ends_with(name, "img_in.weight") && tensor_storage.n_dims == 2) { + lens_params.in_channels = tensor_storage.ne[0]; + int64_t inner_dim = tensor_storage.ne[1]; + lens_params.num_attention_heads = inner_dim / lens_params.attention_head_dim; + } else if (ends_with(name, "txt_in.weight") && tensor_storage.n_dims == 2) { + lens_params.selected_layer_count = static_cast(tensor_storage.ne[0] / lens_params.joint_attention_dim); + } else if (ends_with(name, "proj_out.weight") && tensor_storage.n_dims == 2) { + lens_params.out_channels = tensor_storage.ne[1] / lens_params.patch_size / lens_params.patch_size; + } else if (ends_with(name, "transformer_blocks.0.attn.norm_q.weight") && tensor_storage.n_dims == 1) { + lens_params.attention_head_dim = tensor_storage.ne[0]; + } + + size_t pos = name.find("transformer_blocks."); + if (pos != std::string::npos) { + std::string layer_name = name.substr(pos); + auto items = split_string(layer_name, '.'); + if (items.size() > 1) { + int block_index = atoi(items[1].c_str()); + if (block_index + 1 > lens_params.num_layers) { + lens_params.num_layers = block_index + 1; + } + } + } + } + if (lens_params.num_layers == 0) { + lens_params.num_layers = 48; + } + lens_params.axes_dim_sum = 0; + for (int axis_dim : lens_params.axes_dim) { + lens_params.axes_dim_sum += axis_dim; + } + + LOG_INFO("lens: layers = %d, in_channels = %" PRId64 ", out_channels = %" PRId64 + ", heads = %" PRId64 ", head_dim = %" PRId64, + lens_params.num_layers, + lens_params.in_channels, + lens_params.out_channels, + lens_params.num_attention_heads, + lens_params.attention_head_dim); + + lens = LensModel(lens_params); + lens.init(params_ctx, tensor_storage_map, prefix); + } + + std::string get_desc() override { + return "lens"; + } + + void get_param_tensors(std::map& tensors, const std::string prefix) { + lens.get_param_tensors(tensors, prefix); + } + + ggml_cgraph* build_graph(const sd::Tensor& x_tensor, + const sd::Tensor& timesteps_tensor, + const sd::Tensor& context_tensor) { + ggml_cgraph* gf = new_graph_custom(LENS_GRAPH_SIZE); + ggml_tensor* x = make_input(x_tensor); + ggml_tensor* timesteps = make_input(timesteps_tensor); + GGML_ASSERT(x->ne[3] == 1); + GGML_ASSERT(!context_tensor.empty()); + ggml_tensor* context = make_input(context_tensor); + + pe_vec = Rope::gen_lens_pe(static_cast(x->ne[1]), + static_cast(x->ne[0]), + static_cast(x->ne[3]), + static_cast(context->ne[1]), + lens_params.theta, + circular_y_enabled, + circular_x_enabled, + lens_params.axes_dim); + int pos_len = static_cast(pe_vec.size() / lens_params.axes_dim_sum / 2); + auto pe = ggml_new_tensor_4d(compute_ctx, GGML_TYPE_F32, 2, 2, lens_params.axes_dim_sum / 2, pos_len); + set_backend_tensor_data(pe, pe_vec.data()); + + auto runner_ctx = get_context(); + ggml_tensor* out = lens.forward(&runner_ctx, x, timesteps, context, pe); + ggml_build_forward_expand(gf, out); + return gf; + } + + sd::Tensor compute(int n_threads, + const sd::Tensor& x, + const sd::Tensor& timesteps, + const sd::Tensor& context) { + auto get_graph = [&]() -> ggml_cgraph* { + return build_graph(x, timesteps, context); + }; + return restore_trailing_singleton_dims(GGMLRunner::compute(get_graph, n_threads, false), x.dim()); + } + }; +} // namespace Lens + +#endif // __SD_LENS_HPP__ diff --git a/src/llm.hpp b/src/llm.hpp index cec8b1dc..d5d97d7f 100644 --- a/src/llm.hpp +++ b/src/llm.hpp @@ -23,11 +23,12 @@ #include "rope.hpp" #include "tokenizers/bpe_tokenizer.h" #include "tokenizers/gemma_tokenizer.h" +#include "tokenizers/gpt_oss_tokenizer.h" #include "tokenizers/mistral_tokenizer.h" #include "tokenizers/qwen2_tokenizer.h" namespace LLM { - constexpr int LLM_GRAPH_SIZE = 10240; + constexpr int LLM_GRAPH_SIZE = 65536; enum class LLMArch { QWEN2_5_VL, @@ -36,6 +37,7 @@ namespace LLM { MISTRAL_SMALL_3_2, MINISTRAL_3_3B, GEMMA3_12B, + GPT_OSS_20B, ARCH_COUNT, }; @@ -46,6 +48,7 @@ namespace LLM { "mistral_small3.2", "ministral3.3b", "gemma3_12b", + "gpt_oss_20b", }; enum class MLPActivation { @@ -83,6 +86,7 @@ namespace LLM { int num_kv_heads = 4; int head_dim = 128; bool qkv_bias = true; + bool attention_out_bias = false; bool qk_norm = false; bool rms_norm_add = false; bool normalize_input = false; @@ -93,6 +97,8 @@ namespace LLM { std::vector rope_thetas = {1000000.f}; std::vector rope_scales = {1.f}; std::vector sliding_attention; + int64_t num_experts = 0; + int64_t num_experts_per_tok = 0; LLMVisionParams vision; }; @@ -163,6 +169,170 @@ namespace LLM { } }; + struct GPTOSSMLP : public GGMLBlock { + protected: + int64_t hidden_size; + int64_t intermediate_size; + int64_t num_experts; + int64_t num_experts_per_tok; + bool has_combined_gate_up = false; + + void init_params(ggml_context* ctx, + const String2TensorStorage& tensor_storage_map = {}, + std::string prefix = "") override { + auto supported_type = [](ggml_type wtype, int64_t in_features) { + if (in_features % ggml_blck_size(wtype) != 0) { + return GGML_TYPE_F32; + } + return wtype; + }; + + params["router.weight"] = ggml_new_tensor_2d(ctx, + supported_type(get_type(prefix + "router.weight", tensor_storage_map, GGML_TYPE_F32), hidden_size), + hidden_size, + num_experts); + params["router.bias"] = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, num_experts); + + has_combined_gate_up = tensor_storage_map.find(prefix + "experts.gate_up_proj.weight") != tensor_storage_map.end(); + if (has_combined_gate_up) { + ggml_type gate_up_type = supported_type(get_type(prefix + "experts.gate_up_proj.weight", tensor_storage_map, GGML_TYPE_F32), hidden_size); + params["experts.gate_up_proj.weight"] = ggml_new_tensor_3d(ctx, + gate_up_type, + hidden_size, + intermediate_size * 2, + num_experts); + params["experts.gate_up_proj.bias"] = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, intermediate_size * 2, num_experts); + } else { + ggml_type gate_type = supported_type(get_type(prefix + "experts.gate_proj.weight", tensor_storage_map, GGML_TYPE_F32), hidden_size); + ggml_type up_type = supported_type(get_type(prefix + "experts.up_proj.weight", tensor_storage_map, GGML_TYPE_F32), hidden_size); + params["experts.gate_proj.weight"] = ggml_new_tensor_3d(ctx, gate_type, hidden_size, intermediate_size, num_experts); + params["experts.up_proj.weight"] = ggml_new_tensor_3d(ctx, up_type, hidden_size, intermediate_size, num_experts); + params["experts.gate_proj.bias"] = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, intermediate_size, num_experts); + params["experts.up_proj.bias"] = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, intermediate_size, num_experts); + } + + ggml_type down_type = supported_type(get_type(prefix + "experts.down_proj.weight", tensor_storage_map, GGML_TYPE_F32), intermediate_size); + params["experts.down_proj.weight"] = ggml_new_tensor_3d(ctx, down_type, intermediate_size, hidden_size, num_experts); + params["experts.down_proj.bias"] = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, hidden_size, num_experts); + } + + ggml_tensor* expert_linear(GGMLRunnerContext* ctx, + const std::string& weight_name, + const std::string& bias_name, + ggml_tensor* x, + ggml_tensor* selected_experts) { + auto out = ggml_mul_mat_id(ctx->ggml_ctx, params[weight_name], x, selected_experts); + auto it = params.find(bias_name); + if (it != params.end()) { + out = ggml_add_id(ctx->ggml_ctx, out, it->second, selected_experts); + } + return out; + } + + public: + GPTOSSMLP(const LLMParams& params) + : hidden_size(params.hidden_size), + intermediate_size(params.intermediate_size), + num_experts(params.num_experts), + num_experts_per_tok(params.num_experts_per_tok) {} + + ggml_tensor* forward(GGMLRunnerContext* ctx, ggml_tensor* x) { + // x: [N, n_token, hidden_size] + GGML_ASSERT(num_experts > 0 && num_experts_per_tok > 0); + + const int64_t n_token = x->ne[1]; + const int64_t N = x->ne[2]; + const int64_t n_token_total = n_token * N; + ggml_tensor* router_weight = params["router.weight"]; + ggml_tensor* router_bias = params["router.bias"]; + ggml_tensor* router_logits = ggml_mul_mat(ctx->ggml_ctx, router_weight, x); + router_logits = ggml_add(ctx->ggml_ctx, router_logits, router_bias); + router_logits = ggml_reshape_2d(ctx->ggml_ctx, router_logits, num_experts, n_token_total); + + ggml_tensor* selected_experts = ggml_argsort_top_k(ctx->ggml_ctx, router_logits, (int)num_experts_per_tok); // [top_k, tokens] + ggml_tensor* probs = ggml_reshape_3d(ctx->ggml_ctx, router_logits, 1, num_experts, n_token_total); + ggml_tensor* weights = ggml_get_rows(ctx->ggml_ctx, probs, selected_experts); // [1, top_k, tokens] + weights = ggml_reshape_2d(ctx->ggml_ctx, weights, num_experts_per_tok, n_token_total); + weights = ggml_soft_max(ctx->ggml_ctx, weights); + weights = ggml_reshape_3d(ctx->ggml_ctx, weights, 1, num_experts_per_tok, n_token_total); + + x = ggml_reshape_3d(ctx->ggml_ctx, x, hidden_size, 1, n_token_total); + + ggml_tensor* gate = nullptr; + ggml_tensor* up = nullptr; + if (has_combined_gate_up) { + auto gate_up = expert_linear(ctx, + "experts.gate_up_proj.weight", + "experts.gate_up_proj.bias", + x, + selected_experts); // [2 * intermediate, top_k, tokens] + gate_up = ggml_reshape_4d(ctx->ggml_ctx, + gate_up, + 2, + intermediate_size, + num_experts_per_tok, + n_token_total); + gate = ggml_view_4d(ctx->ggml_ctx, + gate_up, + 1, + intermediate_size, + num_experts_per_tok, + n_token_total, + gate_up->nb[1], + gate_up->nb[2], + gate_up->nb[3], + 0); + up = ggml_view_4d(ctx->ggml_ctx, + gate_up, + 1, + intermediate_size, + num_experts_per_tok, + n_token_total, + gate_up->nb[1], + gate_up->nb[2], + gate_up->nb[3], + gate_up->nb[0]); + gate = ggml_reshape_3d(ctx->ggml_ctx, ggml_cont(ctx->ggml_ctx, gate), intermediate_size, num_experts_per_tok, n_token_total); + up = ggml_reshape_3d(ctx->ggml_ctx, ggml_cont(ctx->ggml_ctx, up), intermediate_size, num_experts_per_tok, n_token_total); + } else { + gate = expert_linear(ctx, + "experts.gate_proj.weight", + "experts.gate_proj.bias", + x, + selected_experts); + up = expert_linear(ctx, + "experts.up_proj.weight", + "experts.up_proj.bias", + x, + selected_experts); + } + + auto activated = ggml_swiglu_oai(ctx->ggml_ctx, gate, up, 1.702f, 7.0f); + auto experts = expert_linear(ctx, + "experts.down_proj.weight", + "experts.down_proj.bias", + activated, + selected_experts); + experts = ggml_mul(ctx->ggml_ctx, experts, weights); + + ggml_tensor* out = nullptr; + for (int64_t i = 0; i < num_experts_per_tok; ++i) { + auto expert_out = ggml_view_2d(ctx->ggml_ctx, + experts, + hidden_size, + n_token_total, + experts->nb[2], + i * experts->nb[1]); + out = out == nullptr ? expert_out : ggml_add(ctx->ggml_ctx, out, expert_out); + } + if (num_experts_per_tok == 1) { + out = ggml_cont(ctx->ggml_ctx, out); + } + + return ggml_reshape_3d(ctx->ggml_ctx, out, hidden_size, n_token, N); + } + }; + static ggml_tensor* splice_image_embeds(GGMLRunnerContext* ctx, ggml_tensor* x, const std::vector>& image_embeds) { @@ -601,6 +771,15 @@ namespace LLM { int64_t max_position_embeddings; std::vector rope_thetas; std::vector rope_scales; + bool has_attention_sinks; + + void init_params(ggml_context* ctx, + const String2TensorStorage& tensor_storage_map = {}, + std::string prefix = "") override { + if (has_attention_sinks) { + params["sinks"] = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, num_heads); + } + } public: Attention(const LLMParams& params) @@ -611,11 +790,12 @@ namespace LLM { qk_norm(params.qk_norm), max_position_embeddings(params.max_position_embeddings), rope_thetas(params.rope_thetas), - rope_scales(params.rope_scales) { + rope_scales(params.rope_scales), + has_attention_sinks(params.arch == LLMArch::GPT_OSS_20B) { blocks["q_proj"] = std::make_shared(params.hidden_size, num_heads * head_dim, params.qkv_bias); blocks["k_proj"] = std::make_shared(params.hidden_size, num_kv_heads * head_dim, params.qkv_bias); blocks["v_proj"] = std::make_shared(params.hidden_size, num_kv_heads * head_dim, params.qkv_bias); - blocks["o_proj"] = std::make_shared(num_heads * head_dim, params.hidden_size, false); + blocks["o_proj"] = std::make_shared(num_heads * head_dim, params.hidden_size, params.attention_out_bias); if (params.qk_norm) { blocks["q_norm"] = std::make_shared(head_dim, params.rms_norm_eps, params.rms_norm_add); blocks["k_norm"] = std::make_shared(head_dim, params.rms_norm_eps, params.rms_norm_add); @@ -660,6 +840,36 @@ namespace LLM { } else if (arch == LLMArch::QWEN3) { q = ggml_rope_ext(ctx->ggml_ctx, q, input_pos, nullptr, 128, GGML_ROPE_TYPE_NEOX, 40960, 1000000.f, 1.f, 0.f, 1.f, 32.f, 1.f); k = ggml_rope_ext(ctx->ggml_ctx, k, input_pos, nullptr, 128, GGML_ROPE_TYPE_NEOX, 40960, 1000000.f, 1.f, 0.f, 1.f, 32.f, 1.f); + } else if (arch == LLMArch::GPT_OSS_20B) { + float rope_theta = rope_thetas.empty() ? 150000.f : rope_thetas[0]; + float rope_scale = rope_scales.empty() ? 32.f : rope_scales[0]; + float freq_scale = 1.f / rope_scale; + q = ggml_rope_ext(ctx->ggml_ctx, + q, + input_pos, + nullptr, + head_dim, + GGML_ROPE_TYPE_NEOX, + 4096, + rope_theta, + freq_scale, + 1.f, + 1.f, + 32.f, + 1.f); + k = ggml_rope_ext(ctx->ggml_ctx, + k, + input_pos, + nullptr, + head_dim, + GGML_ROPE_TYPE_NEOX, + 4096, + rope_theta, + freq_scale, + 1.f, + 1.f, + 32.f, + 1.f); } else if (arch == LLMArch::GEMMA3_12B) { float rope_theta = (rope_index == 1 ? 10000.0f : 1000000.0f); float rope_scale = (rope_index == 1 ? 1.f : 8.f); @@ -706,7 +916,28 @@ namespace LLM { k = ggml_cont(ctx->ggml_ctx, ggml_ext_torch_permute(ctx->ggml_ctx, k, 0, 2, 1, 3)); // [N, num_kv_heads, n_token, head_dim] k = ggml_reshape_3d(ctx->ggml_ctx, k, k->ne[0], k->ne[1], k->ne[2] * k->ne[3]); // [N*num_kv_heads, n_token, head_dim] - x = ggml_ext_attention_ext(ctx->ggml_ctx, ctx->backend, q, k, v, num_heads, attention_mask, true, false); // [N, n_token, hidden_size] + if (arch == LLMArch::GPT_OSS_20B) { + GGML_ASSERT(N == 1); + auto v_attn = ggml_ext_cont(ctx->ggml_ctx, ggml_permute(ctx->ggml_ctx, v, 1, 2, 0, 3)); // [N, kv_heads, head_dim, tokens] + v_attn = ggml_reshape_3d(ctx->ggml_ctx, v_attn, n_token, head_dim, num_kv_heads * N); + + auto kq = ggml_mul_mat(ctx->ggml_ctx, k, q); + ggml_mul_mat_set_prec(kq, GGML_PREC_F32); + kq = ggml_scale_inplace(ctx->ggml_ctx, kq, 1.0f / std::sqrt(static_cast(head_dim))); + if (attention_mask != nullptr) { + kq = ggml_add_inplace(ctx->ggml_ctx, kq, attention_mask); + } + kq = ggml_soft_max_inplace(ctx->ggml_ctx, kq); + ggml_soft_max_add_sinks(kq, params["sinks"]); + + auto kqv = ggml_mul_mat(ctx->ggml_ctx, v_attn, kq); + kqv = ggml_reshape_4d(ctx->ggml_ctx, kqv, head_dim, n_token, num_heads, N); + kqv = ggml_permute(ctx->ggml_ctx, kqv, 0, 2, 1, 3); + x = ggml_ext_cont(ctx->ggml_ctx, kqv); + x = ggml_reshape_3d(ctx->ggml_ctx, x, head_dim * num_heads, n_token, N); + } else { + x = ggml_ext_attention_ext(ctx->ggml_ctx, ctx->backend, q, k, v, num_heads, attention_mask, true, false); // [N, n_token, hidden_size] + } x = out_proj->forward(ctx, x); // [N, n_token, hidden_size] return x; @@ -726,11 +957,15 @@ namespace LLM { sliding_attention(0), has_post_attention_norm(params.arch == LLMArch::GEMMA3_12B), has_post_ffw_norm(params.arch == LLMArch::GEMMA3_12B) { - blocks["self_attn"] = std::make_shared(params); - blocks["mlp"] = std::make_shared(params.hidden_size, - params.intermediate_size, - false, - params.mlp_activation); + blocks["self_attn"] = std::make_shared(params); + if (params.arch == LLMArch::GPT_OSS_20B) { + blocks["mlp"] = std::make_shared(params); + } else { + blocks["mlp"] = std::make_shared(params.hidden_size, + params.intermediate_size, + false, + params.mlp_activation); + } blocks["input_layernorm"] = std::make_shared(params.hidden_size, params.rms_norm_eps, params.rms_norm_add); blocks["post_attention_layernorm"] = std::make_shared(params.hidden_size, params.rms_norm_eps, params.rms_norm_add); if (has_post_attention_norm) { @@ -751,7 +986,6 @@ namespace LLM { ggml_tensor* sliding_attention_mask = nullptr) { // x: [N, n_token, hidden_size] auto self_attn = std::dynamic_pointer_cast(blocks["self_attn"]); - auto mlp = std::dynamic_pointer_cast(blocks["mlp"]); auto input_layernorm = std::dynamic_pointer_cast(blocks["input_layernorm"]); auto post_attention_layernorm = std::dynamic_pointer_cast(blocks["post_attention_layernorm"]); std::shared_ptr post_attention_norm = nullptr; @@ -764,7 +998,7 @@ namespace LLM { } ggml_tensor* block_attention_mask = attention_mask; int rope_index = 0; - if (arch == LLMArch::GEMMA3_12B && sliding_attention > 0) { + if ((arch == LLMArch::GEMMA3_12B || arch == LLMArch::GPT_OSS_20B) && sliding_attention > 0) { block_attention_mask = sliding_attention_mask; rope_index = 1; } @@ -779,7 +1013,13 @@ namespace LLM { residual = x; x = post_attention_layernorm->forward(ctx, x); - x = mlp->forward(ctx, x); + if (arch == LLMArch::GPT_OSS_20B) { + auto mlp = std::dynamic_pointer_cast(blocks["mlp"]); + x = mlp->forward(ctx, x); + } else { + auto mlp = std::dynamic_pointer_cast(blocks["mlp"]); + x = mlp->forward(ctx, x); + } if (post_ffw_norm != nullptr) { x = post_ffw_norm->forward(ctx, x); } @@ -1202,6 +1442,24 @@ namespace LLM { params.rope_thetas = {1000000.f, 10000.f}; params.rope_scales = {8.f, 1.f}; params.sliding_attention = {1024, 1024, 1024, 1024, 1024, 0}; + } else if (arch == LLMArch::GPT_OSS_20B) { + params.head_dim = 64; + params.num_heads = 64; + params.num_kv_heads = 8; + params.qkv_bias = true; + params.attention_out_bias = true; + params.qk_norm = false; + params.rms_norm_eps = 1e-5f; + params.hidden_size = 2880; + params.intermediate_size = 2880; + params.num_layers = 24; + params.vocab_size = 201088; + params.max_position_embeddings = 131072; + params.rope_thetas = {150000.f}; + params.rope_scales = {32.f}; + params.sliding_attention = {128, 0}; + params.num_experts = 32; + params.num_experts_per_tok = 4; } bool have_vision_weight = false; bool llama_cpp_style = false; @@ -1236,6 +1494,12 @@ namespace LLM { if (contains(tensor_name, "layers.0.mlp.gate_proj.weight")) { params.intermediate_size = pair.second.ne[1]; } + if (contains(tensor_name, "layers.0.mlp.experts.gate_up_proj.weight")) { + params.intermediate_size = pair.second.ne[1] / 2; + } + if (contains(tensor_name, "layers.0.mlp.experts.gate_proj.weight")) { + params.intermediate_size = pair.second.ne[1]; + } } if (arch == LLMArch::QWEN3 && params.num_layers == 28) { // Qwen3 2B params.num_heads = 16; @@ -1315,7 +1579,8 @@ namespace LLM { if (params.arch == LLMArch::MISTRAL_SMALL_3_2 || params.arch == LLMArch::MINISTRAL_3_3B || params.arch == LLMArch::QWEN3 || - params.arch == LLMArch::GEMMA3_12B) { + params.arch == LLMArch::GEMMA3_12B || + params.arch == LLMArch::GPT_OSS_20B) { input_pos_vec.resize(n_tokens); for (int i = 0; i < n_tokens; ++i) { input_pos_vec[i] = i; @@ -1354,7 +1619,11 @@ namespace LLM { set_backend_tensor_data(attention_mask, attention_mask_vec.data()); } - if (params.arch == LLMArch::GEMMA3_12B) { + if (params.arch == LLMArch::GEMMA3_12B || params.arch == LLMArch::GPT_OSS_20B) { + int sliding_window = 0; + for (int window : params.sliding_attention) { + sliding_window = std::max(sliding_window, window); + } sliding_attention_mask_vec.resize(n_tokens * n_tokens); if (!attention_mask_tensor.empty()) { GGML_ASSERT(attention_mask_tensor.numel() == n_tokens * n_tokens); @@ -1364,8 +1633,7 @@ namespace LLM { } for (int i0 = 0; i0 < n_tokens; i0++) { for (int i1 = 0; i1 < n_tokens; i1++) { - if (i0 + 1024 <= i1) { - LOG_DEBUG("xxxxxxxxxxxxxx"); + if (sliding_window > 0 && i0 + sliding_window <= i1) { sliding_attention_mask_vec[i1 * n_tokens + i0] = -INFINITY; } } @@ -1485,6 +1753,8 @@ namespace LLM { : model(arch, backend, params_backend, tensor_storage_map, prefix, enable_vision) { if (arch == LLMArch::MISTRAL_SMALL_3_2 || arch == LLMArch::MINISTRAL_3_3B) { tokenizer = std::make_shared(); + } else if (arch == LLMArch::GPT_OSS_20B) { + tokenizer = std::make_shared(); } else { tokenizer = std::make_shared(); } diff --git a/src/model.cpp b/src/model.cpp index 8351a2be..25d78b94 100644 --- a/src/model.cpp +++ b/src/model.cpp @@ -442,6 +442,10 @@ SDVersion ModelLoader::get_sd_version() { tensor_storage_map.find("model.language_model.layers.0.self_attn.q_proj.weight") != tensor_storage_map.end()) { return VERSION_HIDREAM_O1; } + if (tensor_storage.name.find("model.diffusion_model.transformer_blocks.0.attn.norm_added_q.weight") != std::string::npos && + tensor_storage_map.find("model.diffusion_model.transformer_blocks.0.img_mlp.w1.weight") != tensor_storage_map.end()) { + return VERSION_LENS; + } if (tensor_storage.name.find("model.diffusion_model.transformer_blocks.0.img_mod.1.weight") != std::string::npos) { return VERSION_QWEN_IMAGE; } diff --git a/src/model.h b/src/model.h index fadeeefb..8ecea16b 100644 --- a/src/model.h +++ b/src/model.h @@ -47,6 +47,7 @@ enum SDVersion { VERSION_Z_IMAGE, VERSION_OVIS_IMAGE, VERSION_ERNIE_IMAGE, + VERSION_LENS, VERSION_LONGCAT, VERSION_COUNT, }; @@ -156,8 +157,15 @@ static inline bool sd_version_is_ernie_image(SDVersion version) { return false; } +static inline bool sd_version_is_lens(SDVersion version) { + if (version == VERSION_LENS) { + return true; + } + return false; +} + static inline bool sd_version_uses_flux2_vae(SDVersion version) { - if (sd_version_is_flux2(version) || sd_version_is_ernie_image(version)) { + if (sd_version_is_flux2(version) || sd_version_is_ernie_image(version) || sd_version_is_lens(version)) { return true; } return false; @@ -185,6 +193,7 @@ static inline bool sd_version_is_dit(SDVersion version) { sd_version_is_anima(version) || sd_version_is_z_image(version) || sd_version_is_ernie_image(version) || + sd_version_is_lens(version) || sd_version_is_longcat(version)) { return true; } diff --git a/src/name_conversion.cpp b/src/name_conversion.cpp index 819066d0..135a1f2c 100644 --- a/src/name_conversion.cpp +++ b/src/name_conversion.cpp @@ -128,6 +128,7 @@ std::string convert_cond_stage_model_name(std::string name, std::string prefix) }; static const std::vector> llm_name_map{ + {"attn_sinks.weight", "self_attn.sinks"}, {"token_embd.", "model.embed_tokens."}, {"blk.", "model.layers."}, {"attn_q.", "self_attn.q_proj."}, @@ -137,6 +138,12 @@ std::string convert_cond_stage_model_name(std::string name, std::string prefix) {"attn_k_norm.", "self_attn.k_norm."}, {"attn_output.", "self_attn.o_proj."}, {"attn_norm.", "input_layernorm."}, + {"attn_post_norm.", "post_attention_layernorm."}, + {"post_attention_norm.", "post_attention_layernorm."}, + {"ffn_gate_inp.", "mlp.router."}, + {"ffn_gate_exps.", "mlp.experts.gate_proj."}, + {"ffn_up_exps.", "mlp.experts.up_proj."}, + {"ffn_down_exps.", "mlp.experts.down_proj."}, {"ffn_down.", "mlp.down_proj."}, {"ffn_gate.", "mlp.gate_proj."}, {"ffn_up.", "mlp.up_proj."}, @@ -144,6 +151,12 @@ std::string convert_cond_stage_model_name(std::string name, std::string prefix) {"output_norm.", "model.norm."}, }; + static const std::vector> llm_safetensors_prefix_map{ + {"text_encoders.llm.embed_tokens.", "text_encoders.llm.model.embed_tokens."}, + {"text_encoders.llm.layers.", "text_encoders.llm.model.layers."}, + {"text_encoders.llm.norm.", "text_encoders.llm.model.norm."}, + }; + static const std::vector> llm_vision_name_map{ {"mm.", "merger.mlp."}, {"v.post_ln.", "merger.ln_q."}, @@ -168,6 +181,7 @@ std::string convert_cond_stage_model_name(std::string name, std::string prefix) replace_with_name_map(name, llm_vision_name_map); } else { replace_with_name_map(name, llm_name_map); + replace_with_prefix_map(name, llm_safetensors_prefix_map); } } else { name = convert_open_clip_to_hf_clip_name(name); diff --git a/src/rope.hpp b/src/rope.hpp index 4c959760..875ed854 100644 --- a/src/rope.hpp +++ b/src/rope.hpp @@ -478,6 +478,52 @@ namespace Rope { return embed_nd(ids, bs, static_cast(theta), axes_dim, wrap_dims); } + __STATIC_INLINE__ std::vector> gen_lens_ids(int h, + int w, + int bs, + int context_len, + bool scale_rope = true) { + auto img_ids_repeated = gen_flux_img_ids(h, w, 1, bs, 3, 0, 0, 0, scale_rope); + + int txt_id_start = scale_rope ? std::max(h / 2, w / 2) : 0; + auto txt_ids = linspace(1.f * txt_id_start, 1.f * context_len + txt_id_start, context_len); + std::vector> txt_ids_repeated(bs * context_len, std::vector(3)); + for (int i = 0; i < bs; ++i) { + for (int j = 0; j < txt_ids.size(); ++j) { + txt_ids_repeated[i * txt_ids.size() + j] = {txt_ids[j], txt_ids[j], txt_ids[j]}; + } + } + + return concat_ids(img_ids_repeated, txt_ids_repeated, bs); + } + + __STATIC_INLINE__ std::vector gen_lens_pe(int h, + int w, + int bs, + int context_len, + int theta, + bool circular_h, + bool circular_w, + const std::vector& axes_dim) { + std::vector> ids = gen_lens_ids(h, w, bs, context_len, true); + std::vector> wrap_dims; + if ((circular_h || circular_w) && bs > 0 && axes_dim.size() >= 3) { + size_t pos_len = ids.size() / bs; + wrap_dims.assign(axes_dim.size(), std::vector(pos_len, 0)); + const size_t img_tokens = static_cast(h) * static_cast(w); + for (size_t token_i = 0; token_i < img_tokens; ++token_i) { + if (circular_h) { + wrap_dims[1][token_i] = h; + } + if (circular_w) { + wrap_dims[2][token_i] = w; + } + } + } + + return embed_nd(ids, bs, static_cast(theta), axes_dim, wrap_dims); + } + __STATIC_INLINE__ std::vector> gen_ernie_image_ids(int h, int w, int patch_size, diff --git a/src/stable-diffusion.cpp b/src/stable-diffusion.cpp index 9e8e4744..8f4f159a 100644 --- a/src/stable-diffusion.cpp +++ b/src/stable-diffusion.cpp @@ -62,6 +62,7 @@ const char* model_version_to_str[] = { "Z-Image", "Ovis Image", "Ernie Image", + "Lens", "Longcat-Image", }; @@ -646,6 +647,15 @@ public: params_backend_for(SDBackendModule::DIFFUSION), tensor_storage_map, "model.diffusion_model"); + } else if (sd_version_is_lens(version)) { + cond_stage_model = std::make_shared(backend_for(SDBackendModule::TE), + params_backend_for(SDBackendModule::TE), + tensor_storage_map, + version); + diffusion_model = std::make_shared(backend_for(SDBackendModule::DIFFUSION), + params_backend_for(SDBackendModule::DIFFUSION), + tensor_storage_map, + "model.diffusion_model"); } else { // SD1.x SD2.x SDXL std::map embbeding_map; for (uint32_t i = 0; i < sd_ctx_params->embedding_count; i++) { @@ -935,6 +945,11 @@ public: ignore_tensors.insert("text_encoders.llm.vision_tower."); ignore_tensors.insert("text_encoders.llm.multi_modal_projector."); } + if (sd_version_is_lens(version)) { + ignore_tensors.insert("text_encoders.llm.tokenizer_json"); + ignore_tensors.insert("text_encoders.llm.model.layers.0.mlp.experts.gate_up_proj.weight_scale_2"); + ignore_tensors.insert("text_encoders.llm.model.layers.0.mlp.experts.down_proj.weight_scale_2"); + } if (version == VERSION_HIDREAM_O1) { ignore_tensors.insert("lm_head."); ignore_tensors.insert("model.visual.deepstack_merger_list."); @@ -1115,7 +1130,7 @@ public: } else { default_flow_shift = 3.f; } - } else if (sd_version_is_flux(version) || sd_version_is_longcat(version)) { + } else if (sd_version_is_flux(version) || sd_version_is_longcat(version) || sd_version_is_lens(version)) { pred_type = FLUX_FLOW_PRED; default_flow_shift = 1.0f; // TODO: validate @@ -1127,6 +1142,8 @@ public: } if (sd_version_is_longcat(version)) { default_flow_shift = 3.0f; + } else if (sd_version_is_lens(version)) { + default_flow_shift = 1.83f; } } else if (sd_version_is_flux2(version)) { pred_type = FLUX2_FLOW_PRED;