mirror of
https://github.com/leejet/stable-diffusion.cpp.git
synced 2026-06-24 23:26:43 +00:00
add lens support
This commit is contained in:
parent
1fa06bac5c
commit
1f8ced13f6
@ -1696,11 +1696,15 @@ struct LLMEmbedder : public Conditioner {
|
||||
arch = LLM::LLMArch::MISTRAL_SMALL_3_2;
|
||||
} else if (sd_version_is_ernie_image(version)) {
|
||||
arch = LLM::LLMArch::MINISTRAL_3_3B;
|
||||
} else if (sd_version_is_lens(version)) {
|
||||
arch = LLM::LLMArch::GPT_OSS_20B;
|
||||
} else if (sd_version_is_z_image(version) || version == VERSION_OVIS_IMAGE || version == VERSION_FLUX2_KLEIN) {
|
||||
arch = LLM::LLMArch::QWEN3;
|
||||
}
|
||||
if (arch == LLM::LLMArch::MISTRAL_SMALL_3_2 || arch == LLM::LLMArch::MINISTRAL_3_3B) {
|
||||
tokenizer = std::make_shared<MistralTokenizer>();
|
||||
} else if (arch == LLM::LLMArch::GPT_OSS_20B) {
|
||||
tokenizer = std::make_shared<GPTOSSTokenizer>();
|
||||
} else {
|
||||
tokenizer = std::make_shared<Qwen2Tokenizer>();
|
||||
}
|
||||
@ -1871,6 +1875,7 @@ struct LLMEmbedder : public Conditioner {
|
||||
std::vector<std::pair<int, sd::Tensor<float>>> image_embeds;
|
||||
int prompt_template_encode_start_idx = 34;
|
||||
int min_length = 0; // pad tokens
|
||||
int max_length = 100000000;
|
||||
int hidden_states_min_length = 0; // zero pad hidden_states
|
||||
bool spell_quotes = false;
|
||||
std::set<int> out_layers;
|
||||
@ -2029,6 +2034,30 @@ struct LLMEmbedder : public Conditioner {
|
||||
prompt_attn_range.first = 0;
|
||||
prompt += conditioner_params.text;
|
||||
prompt_attn_range.second = static_cast<int>(prompt.size());
|
||||
} else if (sd_version_is_lens(version)) {
|
||||
prompt_template_encode_start_idx = 97;
|
||||
min_length = 0;
|
||||
max_length = 512;
|
||||
out_layers = {6, 12, 18, 24};
|
||||
|
||||
prompt =
|
||||
"<|start|>system<|message|>You are ChatGPT, a large language model trained by OpenAI.\n"
|
||||
"Knowledge cutoff: 2024-06\n"
|
||||
"Current date: 2026-05-26\n" // fix for current date
|
||||
"\n"
|
||||
"Reasoning: medium\n"
|
||||
"\n"
|
||||
"# Valid channels: analysis, commentary, final. Channel must be included for every message.<|end|><|start|>developer<|message|># Instructions\n"
|
||||
"\n"
|
||||
"Describe the image by detailing the color, shape, size, texture, quantity, text, spatial relationships of the objects and background.\n"
|
||||
"\n"
|
||||
"<|end|><|start|>user<|message|>";
|
||||
|
||||
prompt_attn_range.first = static_cast<int>(prompt.size());
|
||||
prompt += conditioner_params.text;
|
||||
prompt_attn_range.second = static_cast<int>(prompt.size());
|
||||
|
||||
prompt += "<|end|><|start|>assistant<|channel|>analysis<|message|>Need to generate one image according to the description.<|end|><|start|>assistant<|channel|>final<|message|>";
|
||||
} else if (sd_version_is_z_image(version)) {
|
||||
prompt_template_encode_start_idx = 0;
|
||||
out_layers = {35}; // -2
|
||||
@ -2085,7 +2114,8 @@ struct LLMEmbedder : public Conditioner {
|
||||
image_embeds,
|
||||
out_layers,
|
||||
prompt_template_encode_start_idx,
|
||||
spell_quotes);
|
||||
spell_quotes,
|
||||
max_length);
|
||||
std::vector<sd::Tensor<float>> extra_hidden_states_vec;
|
||||
for (int i = 0; i < extra_prompts.size(); i++) {
|
||||
auto extra_hidden_states = encode_prompt(n_threads,
|
||||
@ -2096,7 +2126,8 @@ struct LLMEmbedder : public Conditioner {
|
||||
image_embeds,
|
||||
out_layers,
|
||||
prompt_template_encode_start_idx,
|
||||
spell_quotes);
|
||||
spell_quotes,
|
||||
max_length);
|
||||
extra_hidden_states_vec.push_back(std::move(extra_hidden_states));
|
||||
}
|
||||
|
||||
|
||||
@ -6,6 +6,7 @@
|
||||
#include "ernie_image.hpp"
|
||||
#include "flux.hpp"
|
||||
#include "hidream_o1.hpp"
|
||||
#include "lens.hpp"
|
||||
#include "ltxv.hpp"
|
||||
#include "mmdit.hpp"
|
||||
#include "qwen_image.hpp"
|
||||
@ -701,6 +702,72 @@ struct ErnieImageModel : public DiffusionModel {
|
||||
}
|
||||
};
|
||||
|
||||
struct LensModel : public DiffusionModel {
|
||||
std::string prefix;
|
||||
Lens::LensRunner lens;
|
||||
|
||||
LensModel(ggml_backend_t backend,
|
||||
ggml_backend_t params_backend,
|
||||
const String2TensorStorage& tensor_storage_map = {},
|
||||
const std::string prefix = "model.diffusion_model")
|
||||
: prefix(prefix), lens(backend, params_backend, tensor_storage_map, prefix) {
|
||||
}
|
||||
|
||||
std::string get_desc() override {
|
||||
return lens.get_desc();
|
||||
}
|
||||
|
||||
void alloc_params_buffer() override {
|
||||
lens.alloc_params_buffer();
|
||||
}
|
||||
|
||||
void free_params_buffer() override {
|
||||
lens.free_params_buffer();
|
||||
}
|
||||
|
||||
void free_compute_buffer() override {
|
||||
lens.free_compute_buffer();
|
||||
}
|
||||
|
||||
void get_param_tensors(std::map<std::string, ggml_tensor*>& tensors) override {
|
||||
lens.get_param_tensors(tensors, prefix);
|
||||
}
|
||||
|
||||
size_t get_params_buffer_size() override {
|
||||
return lens.get_params_buffer_size();
|
||||
}
|
||||
|
||||
void set_weight_adapter(const std::shared_ptr<WeightAdapter>& adapter) override {
|
||||
lens.set_weight_adapter(adapter);
|
||||
}
|
||||
|
||||
int64_t get_adm_in_channels() override {
|
||||
return 768;
|
||||
}
|
||||
|
||||
void set_flash_attention_enabled(bool enabled) {
|
||||
lens.set_flash_attention_enabled(enabled);
|
||||
}
|
||||
|
||||
void set_max_graph_vram_bytes(size_t max_vram_bytes) override {
|
||||
lens.set_max_graph_vram_bytes(max_vram_bytes);
|
||||
}
|
||||
|
||||
void set_circular_axes(bool circular_x, bool circular_y) override {
|
||||
lens.set_circular_axes(circular_x, circular_y);
|
||||
}
|
||||
|
||||
sd::Tensor<float> compute(int n_threads,
|
||||
const DiffusionParams& diffusion_params) override {
|
||||
GGML_ASSERT(diffusion_params.x != nullptr);
|
||||
GGML_ASSERT(diffusion_params.timesteps != nullptr);
|
||||
return lens.compute(n_threads,
|
||||
*diffusion_params.x,
|
||||
*diffusion_params.timesteps,
|
||||
tensor_or_empty(diffusion_params.context));
|
||||
}
|
||||
};
|
||||
|
||||
struct LTXAVModel : public DiffusionModel {
|
||||
std::string prefix;
|
||||
LTXV::LTXAVRunner ltxav;
|
||||
|
||||
408
src/lens.hpp
Normal file
408
src/lens.hpp
Normal file
@ -0,0 +1,408 @@
|
||||
#ifndef __SD_LENS_HPP__
|
||||
#define __SD_LENS_HPP__
|
||||
|
||||
#include <memory>
|
||||
#include <vector>
|
||||
|
||||
#include "common_block.hpp"
|
||||
#include "flux.hpp"
|
||||
#include "qwen_image.hpp"
|
||||
#include "rope.hpp"
|
||||
|
||||
namespace Lens {
|
||||
constexpr int LENS_GRAPH_SIZE = 40960;
|
||||
|
||||
struct LensTimestepProjEmbeddings : public GGMLBlock {
|
||||
LensTimestepProjEmbeddings(int64_t embedding_dim) {
|
||||
blocks["timestep_embedder"] = std::make_shared<Qwen::TimestepEmbedding>(256, embedding_dim);
|
||||
}
|
||||
|
||||
ggml_tensor* forward(GGMLRunnerContext* ctx, ggml_tensor* timesteps) {
|
||||
auto timestep_embedder = std::dynamic_pointer_cast<Qwen::TimestepEmbedding>(blocks["timestep_embedder"]);
|
||||
auto timesteps_proj = ggml_ext_timestep_embedding(ctx->ggml_ctx, timesteps, 256, 10000, 1000.f);
|
||||
return timestep_embedder->forward(ctx, timesteps_proj);
|
||||
}
|
||||
};
|
||||
|
||||
struct LensGateMLP : public GGMLBlock {
|
||||
LensGateMLP(int64_t dim, int64_t hidden_dim) {
|
||||
blocks["w1"] = std::make_shared<Linear>(dim, hidden_dim, false);
|
||||
blocks["w2"] = std::make_shared<Linear>(hidden_dim, dim, false);
|
||||
blocks["w3"] = std::make_shared<Linear>(dim, hidden_dim, false);
|
||||
}
|
||||
|
||||
ggml_tensor* forward(GGMLRunnerContext* ctx, ggml_tensor* x) {
|
||||
auto w1 = std::dynamic_pointer_cast<Linear>(blocks["w1"]);
|
||||
auto w2 = std::dynamic_pointer_cast<Linear>(blocks["w2"]);
|
||||
auto w3 = std::dynamic_pointer_cast<Linear>(blocks["w3"]);
|
||||
|
||||
auto gate = ggml_silu(ctx->ggml_ctx, w1->forward(ctx, x));
|
||||
auto up = w3->forward(ctx, x);
|
||||
x = ggml_mul(ctx->ggml_ctx, gate, up);
|
||||
return w2->forward(ctx, x);
|
||||
}
|
||||
};
|
||||
|
||||
struct LensJointAttention : public GGMLBlock {
|
||||
int64_t dim_head;
|
||||
int64_t num_heads;
|
||||
|
||||
LensJointAttention(int64_t query_dim,
|
||||
int64_t dim_head,
|
||||
int64_t num_heads,
|
||||
float eps = 1e-5f)
|
||||
: dim_head(dim_head), num_heads(num_heads) {
|
||||
int64_t inner_dim = dim_head * num_heads;
|
||||
blocks["img_qkv"] = std::make_shared<Linear>(query_dim, inner_dim * 3, true);
|
||||
blocks["txt_qkv"] = std::make_shared<Linear>(query_dim, inner_dim * 3, true);
|
||||
|
||||
blocks["norm_q"] = std::make_shared<RMSNorm>(dim_head, eps);
|
||||
blocks["norm_k"] = std::make_shared<RMSNorm>(dim_head, eps);
|
||||
blocks["norm_added_q"] = std::make_shared<RMSNorm>(dim_head, eps);
|
||||
blocks["norm_added_k"] = std::make_shared<RMSNorm>(dim_head, eps);
|
||||
|
||||
blocks["to_out.0"] = std::make_shared<Linear>(inner_dim, query_dim, true);
|
||||
blocks["to_add_out"] = std::make_shared<Linear>(inner_dim, query_dim, true);
|
||||
}
|
||||
|
||||
std::pair<ggml_tensor*, ggml_tensor*> forward(GGMLRunnerContext* ctx,
|
||||
ggml_tensor* img,
|
||||
ggml_tensor* txt,
|
||||
ggml_tensor* pe,
|
||||
ggml_tensor* mask = nullptr) {
|
||||
auto img_qkv = std::dynamic_pointer_cast<Linear>(blocks["img_qkv"]);
|
||||
auto txt_qkv = std::dynamic_pointer_cast<Linear>(blocks["txt_qkv"]);
|
||||
auto norm_q = std::dynamic_pointer_cast<RMSNorm>(blocks["norm_q"]);
|
||||
auto norm_k = std::dynamic_pointer_cast<RMSNorm>(blocks["norm_k"]);
|
||||
auto norm_add_q = std::dynamic_pointer_cast<RMSNorm>(blocks["norm_added_q"]);
|
||||
auto norm_add_k = std::dynamic_pointer_cast<RMSNorm>(blocks["norm_added_k"]);
|
||||
auto to_out_0 = std::dynamic_pointer_cast<Linear>(blocks["to_out.0"]);
|
||||
auto to_add_out = std::dynamic_pointer_cast<Linear>(blocks["to_add_out"]);
|
||||
int64_t n_img = img->ne[1];
|
||||
int64_t n_txt = txt->ne[1];
|
||||
int64_t N = img->ne[2];
|
||||
int64_t inner = dim_head * num_heads;
|
||||
|
||||
auto img_qkv_vec = split_qkv(ctx->ggml_ctx, img_qkv->forward(ctx, img));
|
||||
auto txt_qkv_vec = split_qkv(ctx->ggml_ctx, txt_qkv->forward(ctx, txt));
|
||||
|
||||
auto img_q = ggml_reshape_4d(ctx->ggml_ctx, img_qkv_vec[0], dim_head, num_heads, n_img, N);
|
||||
auto img_k = ggml_reshape_4d(ctx->ggml_ctx, img_qkv_vec[1], dim_head, num_heads, n_img, N);
|
||||
auto img_v = ggml_reshape_4d(ctx->ggml_ctx, img_qkv_vec[2], dim_head, num_heads, n_img, N);
|
||||
|
||||
img_q = norm_q->forward(ctx, img_q);
|
||||
img_k = norm_k->forward(ctx, img_k);
|
||||
|
||||
auto txt_q = ggml_reshape_4d(ctx->ggml_ctx, txt_qkv_vec[0], dim_head, num_heads, n_txt, N);
|
||||
auto txt_k = ggml_reshape_4d(ctx->ggml_ctx, txt_qkv_vec[1], dim_head, num_heads, n_txt, N);
|
||||
auto txt_v = ggml_reshape_4d(ctx->ggml_ctx, txt_qkv_vec[2], dim_head, num_heads, n_txt, N);
|
||||
|
||||
txt_q = norm_add_q->forward(ctx, txt_q);
|
||||
txt_k = norm_add_k->forward(ctx, txt_k);
|
||||
|
||||
auto q = ggml_concat(ctx->ggml_ctx, img_q, txt_q, 2);
|
||||
auto k = ggml_concat(ctx->ggml_ctx, img_k, txt_k, 2);
|
||||
auto v = ggml_concat(ctx->ggml_ctx, img_v, txt_v, 2);
|
||||
|
||||
auto attn = Rope::attention(ctx, q, k, v, pe, mask, (1.0f / 128.f));
|
||||
|
||||
auto img_attn_out = ggml_view_3d(ctx->ggml_ctx,
|
||||
attn,
|
||||
inner,
|
||||
n_img,
|
||||
N,
|
||||
attn->nb[1],
|
||||
attn->nb[2],
|
||||
0);
|
||||
auto txt_attn_out = ggml_view_3d(ctx->ggml_ctx,
|
||||
attn,
|
||||
inner,
|
||||
n_txt,
|
||||
N,
|
||||
attn->nb[1],
|
||||
attn->nb[2],
|
||||
n_img * attn->nb[1]);
|
||||
|
||||
img_attn_out = to_out_0->forward(ctx, ggml_cont(ctx->ggml_ctx, img_attn_out));
|
||||
txt_attn_out = to_add_out->forward(ctx, ggml_cont(ctx->ggml_ctx, txt_attn_out));
|
||||
return {img_attn_out, txt_attn_out};
|
||||
}
|
||||
};
|
||||
|
||||
struct LensTransformerBlock : public GGMLBlock {
|
||||
LensTransformerBlock(int64_t dim,
|
||||
int64_t num_attention_heads,
|
||||
int64_t attention_head_dim,
|
||||
float eps = 1e-6f) {
|
||||
int64_t mlp_hidden_dim = dim / 3 * 8;
|
||||
blocks["img_mod.1"] = std::make_shared<Linear>(dim, 6 * dim, true);
|
||||
blocks["txt_mod.1"] = std::make_shared<Linear>(dim, 6 * dim, true);
|
||||
blocks["img_norm1"] = std::make_shared<RMSNorm>(dim, eps);
|
||||
blocks["img_norm2"] = std::make_shared<RMSNorm>(dim, eps);
|
||||
blocks["txt_norm1"] = std::make_shared<RMSNorm>(dim, eps);
|
||||
blocks["txt_norm2"] = std::make_shared<RMSNorm>(dim, eps);
|
||||
blocks["img_mlp"] = std::make_shared<LensGateMLP>(dim, mlp_hidden_dim);
|
||||
blocks["txt_mlp"] = std::make_shared<LensGateMLP>(dim, mlp_hidden_dim);
|
||||
blocks["attn"] = std::make_shared<LensJointAttention>(dim, attention_head_dim, num_attention_heads);
|
||||
}
|
||||
|
||||
std::pair<ggml_tensor*, ggml_tensor*> forward(GGMLRunnerContext* ctx,
|
||||
ggml_tensor* img,
|
||||
ggml_tensor* txt,
|
||||
ggml_tensor* t_emb,
|
||||
ggml_tensor* pe) {
|
||||
auto img_mod_1 = std::dynamic_pointer_cast<Linear>(blocks["img_mod.1"]);
|
||||
auto txt_mod_1 = std::dynamic_pointer_cast<Linear>(blocks["txt_mod.1"]);
|
||||
auto img_norm1 = std::dynamic_pointer_cast<RMSNorm>(blocks["img_norm1"]);
|
||||
auto img_norm2 = std::dynamic_pointer_cast<RMSNorm>(blocks["img_norm2"]);
|
||||
auto txt_norm1 = std::dynamic_pointer_cast<RMSNorm>(blocks["txt_norm1"]);
|
||||
auto txt_norm2 = std::dynamic_pointer_cast<RMSNorm>(blocks["txt_norm2"]);
|
||||
auto img_mlp = std::dynamic_pointer_cast<LensGateMLP>(blocks["img_mlp"]);
|
||||
auto txt_mlp = std::dynamic_pointer_cast<LensGateMLP>(blocks["txt_mlp"]);
|
||||
auto attn = std::dynamic_pointer_cast<LensJointAttention>(blocks["attn"]);
|
||||
|
||||
auto temb = ggml_silu(ctx->ggml_ctx, t_emb);
|
||||
|
||||
auto img_mod_params = img_mod_1->forward(ctx, temb);
|
||||
auto img_mod_vec = ggml_ext_chunk(ctx->ggml_ctx, img_mod_params, 6, 0);
|
||||
auto txt_mod_params = txt_mod_1->forward(ctx, temb);
|
||||
auto txt_mod_vec = ggml_ext_chunk(ctx->ggml_ctx, txt_mod_params, 6, 0);
|
||||
|
||||
auto img_normed = img_norm1->forward(ctx, img);
|
||||
auto img_modulated = Flux::modulate(ctx->ggml_ctx, img_normed, img_mod_vec[0], img_mod_vec[1]);
|
||||
auto txt_normed = txt_norm1->forward(ctx, txt);
|
||||
auto txt_modulated = Flux::modulate(ctx->ggml_ctx, txt_normed, txt_mod_vec[0], txt_mod_vec[1]);
|
||||
|
||||
auto [img_attn_output, txt_attn_output] = attn->forward(ctx, img_modulated, txt_modulated, pe);
|
||||
|
||||
img = ggml_add(ctx->ggml_ctx, img, ggml_mul(ctx->ggml_ctx, img_attn_output, img_mod_vec[2]));
|
||||
txt = ggml_add(ctx->ggml_ctx, txt, ggml_mul(ctx->ggml_ctx, txt_attn_output, txt_mod_vec[2]));
|
||||
|
||||
auto img_normed2 = img_norm2->forward(ctx, img);
|
||||
auto img_modulated2 = Flux::modulate(ctx->ggml_ctx, img_normed2, img_mod_vec[3], img_mod_vec[4]);
|
||||
auto txt_normed2 = txt_norm2->forward(ctx, txt);
|
||||
auto txt_modulated2 = Flux::modulate(ctx->ggml_ctx, txt_normed2, txt_mod_vec[3], txt_mod_vec[4]);
|
||||
|
||||
img = ggml_add(ctx->ggml_ctx, img, ggml_mul(ctx->ggml_ctx, img_mlp->forward(ctx, img_modulated2), img_mod_vec[5]));
|
||||
txt = ggml_add(ctx->ggml_ctx, txt, ggml_mul(ctx->ggml_ctx, txt_mlp->forward(ctx, txt_modulated2), txt_mod_vec[5]));
|
||||
return {img, txt};
|
||||
}
|
||||
};
|
||||
|
||||
struct LensAdaLayerNormContinuous : public GGMLBlock {
|
||||
int64_t hidden_size;
|
||||
float eps;
|
||||
|
||||
LensAdaLayerNormContinuous(int64_t hidden_size, float eps = 1e-6f)
|
||||
: hidden_size(hidden_size), eps(eps) {
|
||||
blocks["linear"] = std::make_shared<Linear>(hidden_size, hidden_size * 2, true);
|
||||
}
|
||||
|
||||
ggml_tensor* forward(GGMLRunnerContext* ctx, ggml_tensor* x, ggml_tensor* conditioning) {
|
||||
auto linear = std::dynamic_pointer_cast<Linear>(blocks["linear"]);
|
||||
auto mods = ggml_ext_chunk(ctx->ggml_ctx, linear->forward(ctx, ggml_silu(ctx->ggml_ctx, conditioning)), 2, 0);
|
||||
auto scale = mods[0];
|
||||
auto shift = mods[1];
|
||||
x = ggml_norm(ctx->ggml_ctx, x, eps);
|
||||
return Flux::modulate(ctx->ggml_ctx, x, shift, scale);
|
||||
}
|
||||
};
|
||||
|
||||
/// Hyper-parameters of the Lens network. The values below are defaults;
/// LensRunner overwrites several of them from the checkpoint tensors.
struct LensParams {
    int patch_size{2};
    // Width of the img_in input.
    int64_t in_channels{128};
    // proj_out emits patch_size * patch_size * out_channels values.
    int64_t out_channels{32};
    // Number of transformer blocks.
    int num_layers{48};
    int64_t attention_head_dim{64};
    int64_t num_attention_heads{24};
    // Width of one text hidden-state slice fed to each txt_norm.
    int64_t joint_attention_dim{2880};
    // Number of text hidden-state slices joined before txt_in.
    int selected_layer_count{4};
    int theta{10000};
    // Per-axis sizes passed to Rope::gen_lens_pe.
    std::vector<int> axes_dim{8, 28, 28};
    // Sum of axes_dim; LensRunner recomputes it at construction.
    int axes_dim_sum{64};
};
|
||||
|
||||
class LensModel : public GGMLBlock {
|
||||
public:
|
||||
LensParams params;
|
||||
|
||||
LensModel() = default;
|
||||
LensModel(LensParams params)
|
||||
: params(params) {
|
||||
int64_t inner_dim = params.num_attention_heads * params.attention_head_dim;
|
||||
blocks["time_text_embed"] = std::make_shared<LensTimestepProjEmbeddings>(inner_dim);
|
||||
blocks["img_in"] = std::make_shared<Linear>(params.in_channels, inner_dim, true);
|
||||
blocks["txt_in"] = std::make_shared<Linear>(params.joint_attention_dim * params.selected_layer_count, inner_dim, true);
|
||||
for (int i = 0; i < params.selected_layer_count; ++i) {
|
||||
blocks["txt_norm." + std::to_string(i)] = std::make_shared<RMSNorm>(params.joint_attention_dim, 1e-5f);
|
||||
}
|
||||
for (int i = 0; i < params.num_layers; ++i) {
|
||||
blocks["transformer_blocks." + std::to_string(i)] = std::make_shared<LensTransformerBlock>(inner_dim,
|
||||
params.num_attention_heads,
|
||||
params.attention_head_dim);
|
||||
}
|
||||
blocks["norm_out"] = std::make_shared<LensAdaLayerNormContinuous>(inner_dim, 1e-6f);
|
||||
blocks["proj_out"] = std::make_shared<Linear>(inner_dim, params.patch_size * params.patch_size * params.out_channels, true);
|
||||
}
|
||||
|
||||
ggml_tensor* forward(GGMLRunnerContext* ctx,
|
||||
ggml_tensor* x,
|
||||
ggml_tensor* timestep,
|
||||
ggml_tensor* context,
|
||||
ggml_tensor* pe) {
|
||||
GGML_ASSERT(context != nullptr);
|
||||
int64_t W = x->ne[0];
|
||||
int64_t H = x->ne[1];
|
||||
int64_t C = x->ne[2];
|
||||
int64_t N = x->ne[3];
|
||||
|
||||
auto time_text_embed = std::dynamic_pointer_cast<LensTimestepProjEmbeddings>(blocks["time_text_embed"]);
|
||||
auto img_in = std::dynamic_pointer_cast<Linear>(blocks["img_in"]);
|
||||
auto txt_in = std::dynamic_pointer_cast<Linear>(blocks["txt_in"]);
|
||||
auto norm_out = std::dynamic_pointer_cast<LensAdaLayerNormContinuous>(blocks["norm_out"]);
|
||||
auto proj_out = std::dynamic_pointer_cast<Linear>(blocks["proj_out"]);
|
||||
|
||||
auto t_emb = time_text_embed->forward(ctx, timestep);
|
||||
|
||||
auto img = ggml_reshape_3d(ctx->ggml_ctx, x, W * H, C, N);
|
||||
img = ggml_cont(ctx->ggml_ctx, ggml_ext_torch_permute(ctx->ggml_ctx, img, 1, 0, 2, 3));
|
||||
img = img_in->forward(ctx, img);
|
||||
|
||||
std::vector<ggml_tensor*> txt_chunks = ggml_ext_chunk(ctx->ggml_ctx, context, params.selected_layer_count, 0);
|
||||
ggml_tensor* txt = nullptr;
|
||||
for (int i = 0; i < params.selected_layer_count; ++i) {
|
||||
auto txt_norm = std::dynamic_pointer_cast<RMSNorm>(blocks["txt_norm." + std::to_string(i)]);
|
||||
auto chunk = txt_norm->forward(ctx, txt_chunks[i]);
|
||||
txt = txt == nullptr ? chunk : ggml_concat(ctx->ggml_ctx, txt, chunk, 0);
|
||||
}
|
||||
txt = txt_in->forward(ctx, txt);
|
||||
|
||||
sd::ggml_graph_cut::mark_graph_cut(img, "lens.prelude", "img");
|
||||
sd::ggml_graph_cut::mark_graph_cut(txt, "lens.prelude", "txt");
|
||||
|
||||
for (int i = 0; i < params.num_layers; ++i) {
|
||||
auto block = std::dynamic_pointer_cast<LensTransformerBlock>(blocks["transformer_blocks." + std::to_string(i)]);
|
||||
auto out = block->forward(ctx, img, txt, t_emb, pe);
|
||||
img = out.first;
|
||||
txt = out.second;
|
||||
sd::ggml_graph_cut::mark_graph_cut(img, "lens.transformer_blocks." + std::to_string(i), "img");
|
||||
sd::ggml_graph_cut::mark_graph_cut(txt, "lens.transformer_blocks." + std::to_string(i), "txt");
|
||||
}
|
||||
|
||||
img = norm_out->forward(ctx, img, t_emb);
|
||||
img = proj_out->forward(ctx, img);
|
||||
|
||||
auto out = ggml_cont(ctx->ggml_ctx, ggml_ext_torch_permute(ctx->ggml_ctx, img, 1, 0, 2, 3));
|
||||
out = ggml_reshape_4d(ctx->ggml_ctx, out, W, H, params.patch_size * params.patch_size * params.out_channels, N);
|
||||
return out;
|
||||
}
|
||||
};
|
||||
|
||||
struct LensRunner : public GGMLRunner {
|
||||
LensParams lens_params;
|
||||
LensModel lens;
|
||||
std::vector<float> pe_vec;
|
||||
|
||||
LensRunner(ggml_backend_t backend,
|
||||
ggml_backend_t params_backend,
|
||||
const String2TensorStorage& tensor_storage_map = {},
|
||||
const std::string prefix = "")
|
||||
: GGMLRunner(backend, params_backend) {
|
||||
lens_params.num_layers = 0;
|
||||
for (const auto& [name, tensor_storage] : tensor_storage_map) {
|
||||
if (!starts_with(name, prefix)) {
|
||||
continue;
|
||||
}
|
||||
if (ends_with(name, "img_in.weight") && tensor_storage.n_dims == 2) {
|
||||
lens_params.in_channels = tensor_storage.ne[0];
|
||||
int64_t inner_dim = tensor_storage.ne[1];
|
||||
lens_params.num_attention_heads = inner_dim / lens_params.attention_head_dim;
|
||||
} else if (ends_with(name, "txt_in.weight") && tensor_storage.n_dims == 2) {
|
||||
lens_params.selected_layer_count = static_cast<int>(tensor_storage.ne[0] / lens_params.joint_attention_dim);
|
||||
} else if (ends_with(name, "proj_out.weight") && tensor_storage.n_dims == 2) {
|
||||
lens_params.out_channels = tensor_storage.ne[1] / lens_params.patch_size / lens_params.patch_size;
|
||||
} else if (ends_with(name, "transformer_blocks.0.attn.norm_q.weight") && tensor_storage.n_dims == 1) {
|
||||
lens_params.attention_head_dim = tensor_storage.ne[0];
|
||||
}
|
||||
|
||||
size_t pos = name.find("transformer_blocks.");
|
||||
if (pos != std::string::npos) {
|
||||
std::string layer_name = name.substr(pos);
|
||||
auto items = split_string(layer_name, '.');
|
||||
if (items.size() > 1) {
|
||||
int block_index = atoi(items[1].c_str());
|
||||
if (block_index + 1 > lens_params.num_layers) {
|
||||
lens_params.num_layers = block_index + 1;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
if (lens_params.num_layers == 0) {
|
||||
lens_params.num_layers = 48;
|
||||
}
|
||||
lens_params.axes_dim_sum = 0;
|
||||
for (int axis_dim : lens_params.axes_dim) {
|
||||
lens_params.axes_dim_sum += axis_dim;
|
||||
}
|
||||
|
||||
LOG_INFO("lens: layers = %d, in_channels = %" PRId64 ", out_channels = %" PRId64
|
||||
", heads = %" PRId64 ", head_dim = %" PRId64,
|
||||
lens_params.num_layers,
|
||||
lens_params.in_channels,
|
||||
lens_params.out_channels,
|
||||
lens_params.num_attention_heads,
|
||||
lens_params.attention_head_dim);
|
||||
|
||||
lens = LensModel(lens_params);
|
||||
lens.init(params_ctx, tensor_storage_map, prefix);
|
||||
}
|
||||
|
||||
std::string get_desc() override {
|
||||
return "lens";
|
||||
}
|
||||
|
||||
void get_param_tensors(std::map<std::string, ggml_tensor*>& tensors, const std::string prefix) {
|
||||
lens.get_param_tensors(tensors, prefix);
|
||||
}
|
||||
|
||||
ggml_cgraph* build_graph(const sd::Tensor<float>& x_tensor,
|
||||
const sd::Tensor<float>& timesteps_tensor,
|
||||
const sd::Tensor<float>& context_tensor) {
|
||||
ggml_cgraph* gf = new_graph_custom(LENS_GRAPH_SIZE);
|
||||
ggml_tensor* x = make_input(x_tensor);
|
||||
ggml_tensor* timesteps = make_input(timesteps_tensor);
|
||||
GGML_ASSERT(x->ne[3] == 1);
|
||||
GGML_ASSERT(!context_tensor.empty());
|
||||
ggml_tensor* context = make_input(context_tensor);
|
||||
|
||||
pe_vec = Rope::gen_lens_pe(static_cast<int>(x->ne[1]),
|
||||
static_cast<int>(x->ne[0]),
|
||||
static_cast<int>(x->ne[3]),
|
||||
static_cast<int>(context->ne[1]),
|
||||
lens_params.theta,
|
||||
circular_y_enabled,
|
||||
circular_x_enabled,
|
||||
lens_params.axes_dim);
|
||||
int pos_len = static_cast<int>(pe_vec.size() / lens_params.axes_dim_sum / 2);
|
||||
auto pe = ggml_new_tensor_4d(compute_ctx, GGML_TYPE_F32, 2, 2, lens_params.axes_dim_sum / 2, pos_len);
|
||||
set_backend_tensor_data(pe, pe_vec.data());
|
||||
|
||||
auto runner_ctx = get_context();
|
||||
ggml_tensor* out = lens.forward(&runner_ctx, x, timesteps, context, pe);
|
||||
ggml_build_forward_expand(gf, out);
|
||||
return gf;
|
||||
}
|
||||
|
||||
sd::Tensor<float> compute(int n_threads,
|
||||
const sd::Tensor<float>& x,
|
||||
const sd::Tensor<float>& timesteps,
|
||||
const sd::Tensor<float>& context) {
|
||||
auto get_graph = [&]() -> ggml_cgraph* {
|
||||
return build_graph(x, timesteps, context);
|
||||
};
|
||||
return restore_trailing_singleton_dims(GGMLRunner::compute<float>(get_graph, n_threads, false), x.dim());
|
||||
}
|
||||
};
|
||||
} // namespace Lens
|
||||
|
||||
#endif // __SD_LENS_HPP__
|
||||
302
src/llm.hpp
302
src/llm.hpp
@ -23,11 +23,12 @@
|
||||
#include "rope.hpp"
|
||||
#include "tokenizers/bpe_tokenizer.h"
|
||||
#include "tokenizers/gemma_tokenizer.h"
|
||||
#include "tokenizers/gpt_oss_tokenizer.h"
|
||||
#include "tokenizers/mistral_tokenizer.h"
|
||||
#include "tokenizers/qwen2_tokenizer.h"
|
||||
|
||||
namespace LLM {
|
||||
constexpr int LLM_GRAPH_SIZE = 10240;
|
||||
constexpr int LLM_GRAPH_SIZE = 65536;
|
||||
|
||||
enum class LLMArch {
|
||||
QWEN2_5_VL,
|
||||
@ -36,6 +37,7 @@ namespace LLM {
|
||||
MISTRAL_SMALL_3_2,
|
||||
MINISTRAL_3_3B,
|
||||
GEMMA3_12B,
|
||||
GPT_OSS_20B,
|
||||
ARCH_COUNT,
|
||||
};
|
||||
|
||||
@ -46,6 +48,7 @@ namespace LLM {
|
||||
"mistral_small3.2",
|
||||
"ministral3.3b",
|
||||
"gemma3_12b",
|
||||
"gpt_oss_20b",
|
||||
};
|
||||
|
||||
enum class MLPActivation {
|
||||
@ -83,6 +86,7 @@ namespace LLM {
|
||||
int num_kv_heads = 4;
|
||||
int head_dim = 128;
|
||||
bool qkv_bias = true;
|
||||
bool attention_out_bias = false;
|
||||
bool qk_norm = false;
|
||||
bool rms_norm_add = false;
|
||||
bool normalize_input = false;
|
||||
@ -93,6 +97,8 @@ namespace LLM {
|
||||
std::vector<float> rope_thetas = {1000000.f};
|
||||
std::vector<float> rope_scales = {1.f};
|
||||
std::vector<int> sliding_attention;
|
||||
int64_t num_experts = 0;
|
||||
int64_t num_experts_per_tok = 0;
|
||||
LLMVisionParams vision;
|
||||
};
|
||||
|
||||
@ -163,6 +169,170 @@ namespace LLM {
|
||||
}
|
||||
};
|
||||
|
||||
struct GPTOSSMLP : public GGMLBlock {
|
||||
protected:
|
||||
int64_t hidden_size;
|
||||
int64_t intermediate_size;
|
||||
int64_t num_experts;
|
||||
int64_t num_experts_per_tok;
|
||||
bool has_combined_gate_up = false;
|
||||
|
||||
void init_params(ggml_context* ctx,
|
||||
const String2TensorStorage& tensor_storage_map = {},
|
||||
std::string prefix = "") override {
|
||||
auto supported_type = [](ggml_type wtype, int64_t in_features) {
|
||||
if (in_features % ggml_blck_size(wtype) != 0) {
|
||||
return GGML_TYPE_F32;
|
||||
}
|
||||
return wtype;
|
||||
};
|
||||
|
||||
params["router.weight"] = ggml_new_tensor_2d(ctx,
|
||||
supported_type(get_type(prefix + "router.weight", tensor_storage_map, GGML_TYPE_F32), hidden_size),
|
||||
hidden_size,
|
||||
num_experts);
|
||||
params["router.bias"] = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, num_experts);
|
||||
|
||||
has_combined_gate_up = tensor_storage_map.find(prefix + "experts.gate_up_proj.weight") != tensor_storage_map.end();
|
||||
if (has_combined_gate_up) {
|
||||
ggml_type gate_up_type = supported_type(get_type(prefix + "experts.gate_up_proj.weight", tensor_storage_map, GGML_TYPE_F32), hidden_size);
|
||||
params["experts.gate_up_proj.weight"] = ggml_new_tensor_3d(ctx,
|
||||
gate_up_type,
|
||||
hidden_size,
|
||||
intermediate_size * 2,
|
||||
num_experts);
|
||||
params["experts.gate_up_proj.bias"] = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, intermediate_size * 2, num_experts);
|
||||
} else {
|
||||
ggml_type gate_type = supported_type(get_type(prefix + "experts.gate_proj.weight", tensor_storage_map, GGML_TYPE_F32), hidden_size);
|
||||
ggml_type up_type = supported_type(get_type(prefix + "experts.up_proj.weight", tensor_storage_map, GGML_TYPE_F32), hidden_size);
|
||||
params["experts.gate_proj.weight"] = ggml_new_tensor_3d(ctx, gate_type, hidden_size, intermediate_size, num_experts);
|
||||
params["experts.up_proj.weight"] = ggml_new_tensor_3d(ctx, up_type, hidden_size, intermediate_size, num_experts);
|
||||
params["experts.gate_proj.bias"] = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, intermediate_size, num_experts);
|
||||
params["experts.up_proj.bias"] = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, intermediate_size, num_experts);
|
||||
}
|
||||
|
||||
ggml_type down_type = supported_type(get_type(prefix + "experts.down_proj.weight", tensor_storage_map, GGML_TYPE_F32), intermediate_size);
|
||||
params["experts.down_proj.weight"] = ggml_new_tensor_3d(ctx, down_type, intermediate_size, hidden_size, num_experts);
|
||||
params["experts.down_proj.bias"] = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, hidden_size, num_experts);
|
||||
}
|
||||
|
||||
ggml_tensor* expert_linear(GGMLRunnerContext* ctx,
|
||||
const std::string& weight_name,
|
||||
const std::string& bias_name,
|
||||
ggml_tensor* x,
|
||||
ggml_tensor* selected_experts) {
|
||||
auto out = ggml_mul_mat_id(ctx->ggml_ctx, params[weight_name], x, selected_experts);
|
||||
auto it = params.find(bias_name);
|
||||
if (it != params.end()) {
|
||||
out = ggml_add_id(ctx->ggml_ctx, out, it->second, selected_experts);
|
||||
}
|
||||
return out;
|
||||
}
|
||||
|
||||
public:
|
||||
GPTOSSMLP(const LLMParams& params)
|
||||
: hidden_size(params.hidden_size),
|
||||
intermediate_size(params.intermediate_size),
|
||||
num_experts(params.num_experts),
|
||||
num_experts_per_tok(params.num_experts_per_tok) {}
|
||||
|
||||
ggml_tensor* forward(GGMLRunnerContext* ctx, ggml_tensor* x) {
|
||||
// x: [N, n_token, hidden_size]
|
||||
GGML_ASSERT(num_experts > 0 && num_experts_per_tok > 0);
|
||||
|
||||
const int64_t n_token = x->ne[1];
|
||||
const int64_t N = x->ne[2];
|
||||
const int64_t n_token_total = n_token * N;
|
||||
ggml_tensor* router_weight = params["router.weight"];
|
||||
ggml_tensor* router_bias = params["router.bias"];
|
||||
ggml_tensor* router_logits = ggml_mul_mat(ctx->ggml_ctx, router_weight, x);
|
||||
router_logits = ggml_add(ctx->ggml_ctx, router_logits, router_bias);
|
||||
router_logits = ggml_reshape_2d(ctx->ggml_ctx, router_logits, num_experts, n_token_total);
|
||||
|
||||
ggml_tensor* selected_experts = ggml_argsort_top_k(ctx->ggml_ctx, router_logits, (int)num_experts_per_tok); // [top_k, tokens]
|
||||
ggml_tensor* probs = ggml_reshape_3d(ctx->ggml_ctx, router_logits, 1, num_experts, n_token_total);
|
||||
ggml_tensor* weights = ggml_get_rows(ctx->ggml_ctx, probs, selected_experts); // [1, top_k, tokens]
|
||||
weights = ggml_reshape_2d(ctx->ggml_ctx, weights, num_experts_per_tok, n_token_total);
|
||||
weights = ggml_soft_max(ctx->ggml_ctx, weights);
|
||||
weights = ggml_reshape_3d(ctx->ggml_ctx, weights, 1, num_experts_per_tok, n_token_total);
|
||||
|
||||
x = ggml_reshape_3d(ctx->ggml_ctx, x, hidden_size, 1, n_token_total);
|
||||
|
||||
ggml_tensor* gate = nullptr;
|
||||
ggml_tensor* up = nullptr;
|
||||
if (has_combined_gate_up) {
|
||||
auto gate_up = expert_linear(ctx,
|
||||
"experts.gate_up_proj.weight",
|
||||
"experts.gate_up_proj.bias",
|
||||
x,
|
||||
selected_experts); // [2 * intermediate, top_k, tokens]
|
||||
gate_up = ggml_reshape_4d(ctx->ggml_ctx,
|
||||
gate_up,
|
||||
2,
|
||||
intermediate_size,
|
||||
num_experts_per_tok,
|
||||
n_token_total);
|
||||
gate = ggml_view_4d(ctx->ggml_ctx,
|
||||
gate_up,
|
||||
1,
|
||||
intermediate_size,
|
||||
num_experts_per_tok,
|
||||
n_token_total,
|
||||
gate_up->nb[1],
|
||||
gate_up->nb[2],
|
||||
gate_up->nb[3],
|
||||
0);
|
||||
up = ggml_view_4d(ctx->ggml_ctx,
|
||||
gate_up,
|
||||
1,
|
||||
intermediate_size,
|
||||
num_experts_per_tok,
|
||||
n_token_total,
|
||||
gate_up->nb[1],
|
||||
gate_up->nb[2],
|
||||
gate_up->nb[3],
|
||||
gate_up->nb[0]);
|
||||
gate = ggml_reshape_3d(ctx->ggml_ctx, ggml_cont(ctx->ggml_ctx, gate), intermediate_size, num_experts_per_tok, n_token_total);
|
||||
up = ggml_reshape_3d(ctx->ggml_ctx, ggml_cont(ctx->ggml_ctx, up), intermediate_size, num_experts_per_tok, n_token_total);
|
||||
} else {
|
||||
gate = expert_linear(ctx,
|
||||
"experts.gate_proj.weight",
|
||||
"experts.gate_proj.bias",
|
||||
x,
|
||||
selected_experts);
|
||||
up = expert_linear(ctx,
|
||||
"experts.up_proj.weight",
|
||||
"experts.up_proj.bias",
|
||||
x,
|
||||
selected_experts);
|
||||
}
|
||||
|
||||
auto activated = ggml_swiglu_oai(ctx->ggml_ctx, gate, up, 1.702f, 7.0f);
|
||||
auto experts = expert_linear(ctx,
|
||||
"experts.down_proj.weight",
|
||||
"experts.down_proj.bias",
|
||||
activated,
|
||||
selected_experts);
|
||||
experts = ggml_mul(ctx->ggml_ctx, experts, weights);
|
||||
|
||||
ggml_tensor* out = nullptr;
|
||||
for (int64_t i = 0; i < num_experts_per_tok; ++i) {
|
||||
auto expert_out = ggml_view_2d(ctx->ggml_ctx,
|
||||
experts,
|
||||
hidden_size,
|
||||
n_token_total,
|
||||
experts->nb[2],
|
||||
i * experts->nb[1]);
|
||||
out = out == nullptr ? expert_out : ggml_add(ctx->ggml_ctx, out, expert_out);
|
||||
}
|
||||
if (num_experts_per_tok == 1) {
|
||||
out = ggml_cont(ctx->ggml_ctx, out);
|
||||
}
|
||||
|
||||
return ggml_reshape_3d(ctx->ggml_ctx, out, hidden_size, n_token, N);
|
||||
}
|
||||
};
|
||||
|
||||
static ggml_tensor* splice_image_embeds(GGMLRunnerContext* ctx,
|
||||
ggml_tensor* x,
|
||||
const std::vector<std::pair<int, ggml_tensor*>>& image_embeds) {
|
||||
@ -601,6 +771,15 @@ namespace LLM {
|
||||
int64_t max_position_embeddings;
|
||||
std::vector<float> rope_thetas;
|
||||
std::vector<float> rope_scales;
|
||||
bool has_attention_sinks;
|
||||
|
||||
// Creates this block's own parameter tensors.
// Only the attention-sink tensor is registered here, and only when
// has_attention_sinks is set; otherwise the function does nothing.
// tensor_storage_map and prefix are accepted but not used by this body.
void init_params(ggml_context* ctx,
                 const String2TensorStorage& tensor_storage_map = {},
                 std::string prefix = "") override {
    if (!has_attention_sinks) {
        return;
    }
    // 1-D F32 tensor with num_heads elements, stored under the key "sinks".
    params["sinks"] = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, num_heads);
}
|
||||
|
||||
public:
|
||||
Attention(const LLMParams& params)
|
||||
@ -611,11 +790,12 @@ namespace LLM {
|
||||
qk_norm(params.qk_norm),
|
||||
max_position_embeddings(params.max_position_embeddings),
|
||||
rope_thetas(params.rope_thetas),
|
||||
rope_scales(params.rope_scales) {
|
||||
rope_scales(params.rope_scales),
|
||||
has_attention_sinks(params.arch == LLMArch::GPT_OSS_20B) {
|
||||
blocks["q_proj"] = std::make_shared<Linear>(params.hidden_size, num_heads * head_dim, params.qkv_bias);
|
||||
blocks["k_proj"] = std::make_shared<Linear>(params.hidden_size, num_kv_heads * head_dim, params.qkv_bias);
|
||||
blocks["v_proj"] = std::make_shared<Linear>(params.hidden_size, num_kv_heads * head_dim, params.qkv_bias);
|
||||
blocks["o_proj"] = std::make_shared<Linear>(num_heads * head_dim, params.hidden_size, false);
|
||||
blocks["o_proj"] = std::make_shared<Linear>(num_heads * head_dim, params.hidden_size, params.attention_out_bias);
|
||||
if (params.qk_norm) {
|
||||
blocks["q_norm"] = std::make_shared<LLMRMSNorm>(head_dim, params.rms_norm_eps, params.rms_norm_add);
|
||||
blocks["k_norm"] = std::make_shared<LLMRMSNorm>(head_dim, params.rms_norm_eps, params.rms_norm_add);
|
||||
@ -660,6 +840,36 @@ namespace LLM {
|
||||
} else if (arch == LLMArch::QWEN3) {
|
||||
q = ggml_rope_ext(ctx->ggml_ctx, q, input_pos, nullptr, 128, GGML_ROPE_TYPE_NEOX, 40960, 1000000.f, 1.f, 0.f, 1.f, 32.f, 1.f);
|
||||
k = ggml_rope_ext(ctx->ggml_ctx, k, input_pos, nullptr, 128, GGML_ROPE_TYPE_NEOX, 40960, 1000000.f, 1.f, 0.f, 1.f, 32.f, 1.f);
|
||||
} else if (arch == LLMArch::GPT_OSS_20B) {
|
||||
float rope_theta = rope_thetas.empty() ? 150000.f : rope_thetas[0];
|
||||
float rope_scale = rope_scales.empty() ? 32.f : rope_scales[0];
|
||||
float freq_scale = 1.f / rope_scale;
|
||||
q = ggml_rope_ext(ctx->ggml_ctx,
|
||||
q,
|
||||
input_pos,
|
||||
nullptr,
|
||||
head_dim,
|
||||
GGML_ROPE_TYPE_NEOX,
|
||||
4096,
|
||||
rope_theta,
|
||||
freq_scale,
|
||||
1.f,
|
||||
1.f,
|
||||
32.f,
|
||||
1.f);
|
||||
k = ggml_rope_ext(ctx->ggml_ctx,
|
||||
k,
|
||||
input_pos,
|
||||
nullptr,
|
||||
head_dim,
|
||||
GGML_ROPE_TYPE_NEOX,
|
||||
4096,
|
||||
rope_theta,
|
||||
freq_scale,
|
||||
1.f,
|
||||
1.f,
|
||||
32.f,
|
||||
1.f);
|
||||
} else if (arch == LLMArch::GEMMA3_12B) {
|
||||
float rope_theta = (rope_index == 1 ? 10000.0f : 1000000.0f);
|
||||
float rope_scale = (rope_index == 1 ? 1.f : 8.f);
|
||||
@ -706,7 +916,28 @@ namespace LLM {
|
||||
k = ggml_cont(ctx->ggml_ctx, ggml_ext_torch_permute(ctx->ggml_ctx, k, 0, 2, 1, 3)); // [N, num_kv_heads, n_token, head_dim]
|
||||
k = ggml_reshape_3d(ctx->ggml_ctx, k, k->ne[0], k->ne[1], k->ne[2] * k->ne[3]); // [N*num_kv_heads, n_token, head_dim]
|
||||
|
||||
x = ggml_ext_attention_ext(ctx->ggml_ctx, ctx->backend, q, k, v, num_heads, attention_mask, true, false); // [N, n_token, hidden_size]
|
||||
if (arch == LLMArch::GPT_OSS_20B) {
|
||||
GGML_ASSERT(N == 1);
|
||||
auto v_attn = ggml_ext_cont(ctx->ggml_ctx, ggml_permute(ctx->ggml_ctx, v, 1, 2, 0, 3)); // [N, kv_heads, head_dim, tokens]
|
||||
v_attn = ggml_reshape_3d(ctx->ggml_ctx, v_attn, n_token, head_dim, num_kv_heads * N);
|
||||
|
||||
auto kq = ggml_mul_mat(ctx->ggml_ctx, k, q);
|
||||
ggml_mul_mat_set_prec(kq, GGML_PREC_F32);
|
||||
kq = ggml_scale_inplace(ctx->ggml_ctx, kq, 1.0f / std::sqrt(static_cast<float>(head_dim)));
|
||||
if (attention_mask != nullptr) {
|
||||
kq = ggml_add_inplace(ctx->ggml_ctx, kq, attention_mask);
|
||||
}
|
||||
kq = ggml_soft_max_inplace(ctx->ggml_ctx, kq);
|
||||
ggml_soft_max_add_sinks(kq, params["sinks"]);
|
||||
|
||||
auto kqv = ggml_mul_mat(ctx->ggml_ctx, v_attn, kq);
|
||||
kqv = ggml_reshape_4d(ctx->ggml_ctx, kqv, head_dim, n_token, num_heads, N);
|
||||
kqv = ggml_permute(ctx->ggml_ctx, kqv, 0, 2, 1, 3);
|
||||
x = ggml_ext_cont(ctx->ggml_ctx, kqv);
|
||||
x = ggml_reshape_3d(ctx->ggml_ctx, x, head_dim * num_heads, n_token, N);
|
||||
} else {
|
||||
x = ggml_ext_attention_ext(ctx->ggml_ctx, ctx->backend, q, k, v, num_heads, attention_mask, true, false); // [N, n_token, hidden_size]
|
||||
}
|
||||
|
||||
x = out_proj->forward(ctx, x); // [N, n_token, hidden_size]
|
||||
return x;
|
||||
@ -726,11 +957,15 @@ namespace LLM {
|
||||
sliding_attention(0),
|
||||
has_post_attention_norm(params.arch == LLMArch::GEMMA3_12B),
|
||||
has_post_ffw_norm(params.arch == LLMArch::GEMMA3_12B) {
|
||||
blocks["self_attn"] = std::make_shared<Attention>(params);
|
||||
blocks["mlp"] = std::make_shared<MLP>(params.hidden_size,
|
||||
params.intermediate_size,
|
||||
false,
|
||||
params.mlp_activation);
|
||||
blocks["self_attn"] = std::make_shared<Attention>(params);
|
||||
if (params.arch == LLMArch::GPT_OSS_20B) {
|
||||
blocks["mlp"] = std::make_shared<GPTOSSMLP>(params);
|
||||
} else {
|
||||
blocks["mlp"] = std::make_shared<MLP>(params.hidden_size,
|
||||
params.intermediate_size,
|
||||
false,
|
||||
params.mlp_activation);
|
||||
}
|
||||
blocks["input_layernorm"] = std::make_shared<LLMRMSNorm>(params.hidden_size, params.rms_norm_eps, params.rms_norm_add);
|
||||
blocks["post_attention_layernorm"] = std::make_shared<LLMRMSNorm>(params.hidden_size, params.rms_norm_eps, params.rms_norm_add);
|
||||
if (has_post_attention_norm) {
|
||||
@ -751,7 +986,6 @@ namespace LLM {
|
||||
ggml_tensor* sliding_attention_mask = nullptr) {
|
||||
// x: [N, n_token, hidden_size]
|
||||
auto self_attn = std::dynamic_pointer_cast<Attention>(blocks["self_attn"]);
|
||||
auto mlp = std::dynamic_pointer_cast<MLP>(blocks["mlp"]);
|
||||
auto input_layernorm = std::dynamic_pointer_cast<LLMRMSNorm>(blocks["input_layernorm"]);
|
||||
auto post_attention_layernorm = std::dynamic_pointer_cast<LLMRMSNorm>(blocks["post_attention_layernorm"]);
|
||||
std::shared_ptr<LLMRMSNorm> post_attention_norm = nullptr;
|
||||
@ -764,7 +998,7 @@ namespace LLM {
|
||||
}
|
||||
ggml_tensor* block_attention_mask = attention_mask;
|
||||
int rope_index = 0;
|
||||
if (arch == LLMArch::GEMMA3_12B && sliding_attention > 0) {
|
||||
if ((arch == LLMArch::GEMMA3_12B || arch == LLMArch::GPT_OSS_20B) && sliding_attention > 0) {
|
||||
block_attention_mask = sliding_attention_mask;
|
||||
rope_index = 1;
|
||||
}
|
||||
@ -779,7 +1013,13 @@ namespace LLM {
|
||||
|
||||
residual = x;
|
||||
x = post_attention_layernorm->forward(ctx, x);
|
||||
x = mlp->forward(ctx, x);
|
||||
if (arch == LLMArch::GPT_OSS_20B) {
|
||||
auto mlp = std::dynamic_pointer_cast<GPTOSSMLP>(blocks["mlp"]);
|
||||
x = mlp->forward(ctx, x);
|
||||
} else {
|
||||
auto mlp = std::dynamic_pointer_cast<MLP>(blocks["mlp"]);
|
||||
x = mlp->forward(ctx, x);
|
||||
}
|
||||
if (post_ffw_norm != nullptr) {
|
||||
x = post_ffw_norm->forward(ctx, x);
|
||||
}
|
||||
@ -1202,6 +1442,24 @@ namespace LLM {
|
||||
params.rope_thetas = {1000000.f, 10000.f};
|
||||
params.rope_scales = {8.f, 1.f};
|
||||
params.sliding_attention = {1024, 1024, 1024, 1024, 1024, 0};
|
||||
} else if (arch == LLMArch::GPT_OSS_20B) {
|
||||
params.head_dim = 64;
|
||||
params.num_heads = 64;
|
||||
params.num_kv_heads = 8;
|
||||
params.qkv_bias = true;
|
||||
params.attention_out_bias = true;
|
||||
params.qk_norm = false;
|
||||
params.rms_norm_eps = 1e-5f;
|
||||
params.hidden_size = 2880;
|
||||
params.intermediate_size = 2880;
|
||||
params.num_layers = 24;
|
||||
params.vocab_size = 201088;
|
||||
params.max_position_embeddings = 131072;
|
||||
params.rope_thetas = {150000.f};
|
||||
params.rope_scales = {32.f};
|
||||
params.sliding_attention = {128, 0};
|
||||
params.num_experts = 32;
|
||||
params.num_experts_per_tok = 4;
|
||||
}
|
||||
bool have_vision_weight = false;
|
||||
bool llama_cpp_style = false;
|
||||
@ -1236,6 +1494,12 @@ namespace LLM {
|
||||
if (contains(tensor_name, "layers.0.mlp.gate_proj.weight")) {
|
||||
params.intermediate_size = pair.second.ne[1];
|
||||
}
|
||||
if (contains(tensor_name, "layers.0.mlp.experts.gate_up_proj.weight")) {
|
||||
params.intermediate_size = pair.second.ne[1] / 2;
|
||||
}
|
||||
if (contains(tensor_name, "layers.0.mlp.experts.gate_proj.weight")) {
|
||||
params.intermediate_size = pair.second.ne[1];
|
||||
}
|
||||
}
|
||||
if (arch == LLMArch::QWEN3 && params.num_layers == 28) { // Qwen3 2B
|
||||
params.num_heads = 16;
|
||||
@ -1315,7 +1579,8 @@ namespace LLM {
|
||||
if (params.arch == LLMArch::MISTRAL_SMALL_3_2 ||
|
||||
params.arch == LLMArch::MINISTRAL_3_3B ||
|
||||
params.arch == LLMArch::QWEN3 ||
|
||||
params.arch == LLMArch::GEMMA3_12B) {
|
||||
params.arch == LLMArch::GEMMA3_12B ||
|
||||
params.arch == LLMArch::GPT_OSS_20B) {
|
||||
input_pos_vec.resize(n_tokens);
|
||||
for (int i = 0; i < n_tokens; ++i) {
|
||||
input_pos_vec[i] = i;
|
||||
@ -1354,7 +1619,11 @@ namespace LLM {
|
||||
set_backend_tensor_data(attention_mask, attention_mask_vec.data());
|
||||
}
|
||||
|
||||
if (params.arch == LLMArch::GEMMA3_12B) {
|
||||
if (params.arch == LLMArch::GEMMA3_12B || params.arch == LLMArch::GPT_OSS_20B) {
|
||||
int sliding_window = 0;
|
||||
for (int window : params.sliding_attention) {
|
||||
sliding_window = std::max(sliding_window, window);
|
||||
}
|
||||
sliding_attention_mask_vec.resize(n_tokens * n_tokens);
|
||||
if (!attention_mask_tensor.empty()) {
|
||||
GGML_ASSERT(attention_mask_tensor.numel() == n_tokens * n_tokens);
|
||||
@ -1364,8 +1633,7 @@ namespace LLM {
|
||||
}
|
||||
for (int i0 = 0; i0 < n_tokens; i0++) {
|
||||
for (int i1 = 0; i1 < n_tokens; i1++) {
|
||||
if (i0 + 1024 <= i1) {
|
||||
LOG_DEBUG("xxxxxxxxxxxxxx");
|
||||
if (sliding_window > 0 && i0 + sliding_window <= i1) {
|
||||
sliding_attention_mask_vec[i1 * n_tokens + i0] = -INFINITY;
|
||||
}
|
||||
}
|
||||
@ -1485,6 +1753,8 @@ namespace LLM {
|
||||
: model(arch, backend, params_backend, tensor_storage_map, prefix, enable_vision) {
|
||||
if (arch == LLMArch::MISTRAL_SMALL_3_2 || arch == LLMArch::MINISTRAL_3_3B) {
|
||||
tokenizer = std::make_shared<MistralTokenizer>();
|
||||
} else if (arch == LLMArch::GPT_OSS_20B) {
|
||||
tokenizer = std::make_shared<GPTOSSTokenizer>();
|
||||
} else {
|
||||
tokenizer = std::make_shared<Qwen2Tokenizer>();
|
||||
}
|
||||
|
||||
@ -442,6 +442,10 @@ SDVersion ModelLoader::get_sd_version() {
|
||||
tensor_storage_map.find("model.language_model.layers.0.self_attn.q_proj.weight") != tensor_storage_map.end()) {
|
||||
return VERSION_HIDREAM_O1;
|
||||
}
|
||||
if (tensor_storage.name.find("model.diffusion_model.transformer_blocks.0.attn.norm_added_q.weight") != std::string::npos &&
|
||||
tensor_storage_map.find("model.diffusion_model.transformer_blocks.0.img_mlp.w1.weight") != tensor_storage_map.end()) {
|
||||
return VERSION_LENS;
|
||||
}
|
||||
if (tensor_storage.name.find("model.diffusion_model.transformer_blocks.0.img_mod.1.weight") != std::string::npos) {
|
||||
return VERSION_QWEN_IMAGE;
|
||||
}
|
||||
|
||||
11
src/model.h
11
src/model.h
@ -47,6 +47,7 @@ enum SDVersion {
|
||||
VERSION_Z_IMAGE,
|
||||
VERSION_OVIS_IMAGE,
|
||||
VERSION_ERNIE_IMAGE,
|
||||
VERSION_LENS,
|
||||
VERSION_LONGCAT,
|
||||
VERSION_COUNT,
|
||||
};
|
||||
@ -156,8 +157,15 @@ static inline bool sd_version_is_ernie_image(SDVersion version) {
|
||||
return false;
|
||||
}
|
||||
|
||||
// Returns true exactly when `version` is VERSION_LENS.
static inline bool sd_version_is_lens(SDVersion version) {
    return version == VERSION_LENS;
}
|
||||
|
||||
static inline bool sd_version_uses_flux2_vae(SDVersion version) {
|
||||
if (sd_version_is_flux2(version) || sd_version_is_ernie_image(version)) {
|
||||
if (sd_version_is_flux2(version) || sd_version_is_ernie_image(version) || sd_version_is_lens(version)) {
|
||||
return true;
|
||||
}
|
||||
return false;
|
||||
@ -185,6 +193,7 @@ static inline bool sd_version_is_dit(SDVersion version) {
|
||||
sd_version_is_anima(version) ||
|
||||
sd_version_is_z_image(version) ||
|
||||
sd_version_is_ernie_image(version) ||
|
||||
sd_version_is_lens(version) ||
|
||||
sd_version_is_longcat(version)) {
|
||||
return true;
|
||||
}
|
||||
|
||||
@ -128,6 +128,7 @@ std::string convert_cond_stage_model_name(std::string name, std::string prefix)
|
||||
};
|
||||
|
||||
static const std::vector<std::pair<std::string, std::string>> llm_name_map{
|
||||
{"attn_sinks.weight", "self_attn.sinks"},
|
||||
{"token_embd.", "model.embed_tokens."},
|
||||
{"blk.", "model.layers."},
|
||||
{"attn_q.", "self_attn.q_proj."},
|
||||
@ -137,6 +138,12 @@ std::string convert_cond_stage_model_name(std::string name, std::string prefix)
|
||||
{"attn_k_norm.", "self_attn.k_norm."},
|
||||
{"attn_output.", "self_attn.o_proj."},
|
||||
{"attn_norm.", "input_layernorm."},
|
||||
{"attn_post_norm.", "post_attention_layernorm."},
|
||||
{"post_attention_norm.", "post_attention_layernorm."},
|
||||
{"ffn_gate_inp.", "mlp.router."},
|
||||
{"ffn_gate_exps.", "mlp.experts.gate_proj."},
|
||||
{"ffn_up_exps.", "mlp.experts.up_proj."},
|
||||
{"ffn_down_exps.", "mlp.experts.down_proj."},
|
||||
{"ffn_down.", "mlp.down_proj."},
|
||||
{"ffn_gate.", "mlp.gate_proj."},
|
||||
{"ffn_up.", "mlp.up_proj."},
|
||||
@ -144,6 +151,12 @@ std::string convert_cond_stage_model_name(std::string name, std::string prefix)
|
||||
{"output_norm.", "model.norm."},
|
||||
};
|
||||
|
||||
static const std::vector<std::pair<std::string, std::string>> llm_safetensors_prefix_map{
|
||||
{"text_encoders.llm.embed_tokens.", "text_encoders.llm.model.embed_tokens."},
|
||||
{"text_encoders.llm.layers.", "text_encoders.llm.model.layers."},
|
||||
{"text_encoders.llm.norm.", "text_encoders.llm.model.norm."},
|
||||
};
|
||||
|
||||
static const std::vector<std::pair<std::string, std::string>> llm_vision_name_map{
|
||||
{"mm.", "merger.mlp."},
|
||||
{"v.post_ln.", "merger.ln_q."},
|
||||
@ -168,6 +181,7 @@ std::string convert_cond_stage_model_name(std::string name, std::string prefix)
|
||||
replace_with_name_map(name, llm_vision_name_map);
|
||||
} else {
|
||||
replace_with_name_map(name, llm_name_map);
|
||||
replace_with_prefix_map(name, llm_safetensors_prefix_map);
|
||||
}
|
||||
} else {
|
||||
name = convert_open_clip_to_hf_clip_name(name);
|
||||
|
||||
46
src/rope.hpp
46
src/rope.hpp
@ -478,6 +478,52 @@ namespace Rope {
|
||||
return embed_nd(ids, bs, static_cast<float>(theta), axes_dim, wrap_dims);
|
||||
}
|
||||
|
||||
// Builds the position ids for a Lens sequence.
//   h, w         image grid size passed to gen_flux_img_ids (patch size 1)
//   bs           batch size
//   context_len  number of text tokens per batch entry
//   scale_rope   forwarded to gen_flux_img_ids; also selects the text start offset
// Returns the result of concat_ids(image ids, text ids, bs).
__STATIC_INLINE__ std::vector<std::vector<float>> gen_lens_ids(int h,
                                                               int w,
                                                               int bs,
                                                               int context_len,
                                                               bool scale_rope = true) {
    auto img_rows = gen_flux_img_ids(h, w, 1, bs, 3, 0, 0, 0, scale_rope);

    // Text positions start at max(h / 2, w / 2) when scale_rope is set, else at 0.
    const int txt_offset = scale_rope ? std::max(h / 2, w / 2) : 0;
    auto txt_pos         = linspace<float>(1.f * txt_offset, 1.f * context_len + txt_offset, context_len);

    // One 3-component id per text token per batch entry; all three axes share the same value.
    std::vector<std::vector<float>> txt_rows(bs * context_len, std::vector<float>(3));
    const size_t per_batch = txt_pos.size();
    for (int b = 0; b < bs; ++b) {
        const size_t base = b * per_batch;
        for (size_t t = 0; t < per_batch; ++t) {
            const float p      = txt_pos[t];
            txt_rows[base + t] = {p, p, p};
        }
    }

    return concat_ids(img_rows, txt_rows, bs);
}
|
||||
|
||||
// Generates the positional embedding for a Lens sequence.
// Position ids come from gen_lens_ids (with scale_rope = true) and are passed to
// embed_nd together with theta, axes_dim and an optional wrap table.
// When circular_h / circular_w is set (and bs > 0, axes_dim has >= 3 entries),
// the first h * w tokens get wrap length h on axis 1 and/or w on axis 2;
// every other entry of the wrap table stays 0.
__STATIC_INLINE__ std::vector<float> gen_lens_pe(int h,
                                                 int w,
                                                 int bs,
                                                 int context_len,
                                                 int theta,
                                                 bool circular_h,
                                                 bool circular_w,
                                                 const std::vector<int>& axes_dim) {
    std::vector<std::vector<float>> pos_ids = gen_lens_ids(h, w, bs, context_len, true);

    std::vector<std::vector<int>> wrap_dims;
    const bool wants_wrap = circular_h || circular_w;
    if (wants_wrap && bs > 0 && axes_dim.size() >= 3) {
        const size_t tokens_per_batch = pos_ids.size() / bs;
        wrap_dims.assign(axes_dim.size(), std::vector<int>(tokens_per_batch, 0));

        // Only the leading h * w tokens of each sequence receive wrap lengths.
        const size_t n_img = static_cast<size_t>(h) * static_cast<size_t>(w);
        if (circular_h) {
            for (size_t t = 0; t < n_img; ++t) {
                wrap_dims[1][t] = h;
            }
        }
        if (circular_w) {
            for (size_t t = 0; t < n_img; ++t) {
                wrap_dims[2][t] = w;
            }
        }
    }

    return embed_nd(pos_ids, bs, static_cast<float>(theta), axes_dim, wrap_dims);
}
|
||||
|
||||
__STATIC_INLINE__ std::vector<std::vector<float>> gen_ernie_image_ids(int h,
|
||||
int w,
|
||||
int patch_size,
|
||||
|
||||
@ -62,6 +62,7 @@ const char* model_version_to_str[] = {
|
||||
"Z-Image",
|
||||
"Ovis Image",
|
||||
"Ernie Image",
|
||||
"Lens",
|
||||
"Longcat-Image",
|
||||
};
|
||||
|
||||
@ -646,6 +647,15 @@ public:
|
||||
params_backend_for(SDBackendModule::DIFFUSION),
|
||||
tensor_storage_map,
|
||||
"model.diffusion_model");
|
||||
} else if (sd_version_is_lens(version)) {
|
||||
cond_stage_model = std::make_shared<LLMEmbedder>(backend_for(SDBackendModule::TE),
|
||||
params_backend_for(SDBackendModule::TE),
|
||||
tensor_storage_map,
|
||||
version);
|
||||
diffusion_model = std::make_shared<LensModel>(backend_for(SDBackendModule::DIFFUSION),
|
||||
params_backend_for(SDBackendModule::DIFFUSION),
|
||||
tensor_storage_map,
|
||||
"model.diffusion_model");
|
||||
} else { // SD1.x SD2.x SDXL
|
||||
std::map<std::string, std::string> embbeding_map;
|
||||
for (uint32_t i = 0; i < sd_ctx_params->embedding_count; i++) {
|
||||
@ -935,6 +945,11 @@ public:
|
||||
ignore_tensors.insert("text_encoders.llm.vision_tower.");
|
||||
ignore_tensors.insert("text_encoders.llm.multi_modal_projector.");
|
||||
}
|
||||
if (sd_version_is_lens(version)) {
|
||||
ignore_tensors.insert("text_encoders.llm.tokenizer_json");
|
||||
ignore_tensors.insert("text_encoders.llm.model.layers.0.mlp.experts.gate_up_proj.weight_scale_2");
|
||||
ignore_tensors.insert("text_encoders.llm.model.layers.0.mlp.experts.down_proj.weight_scale_2");
|
||||
}
|
||||
if (version == VERSION_HIDREAM_O1) {
|
||||
ignore_tensors.insert("lm_head.");
|
||||
ignore_tensors.insert("model.visual.deepstack_merger_list.");
|
||||
@ -1115,7 +1130,7 @@ public:
|
||||
} else {
|
||||
default_flow_shift = 3.f;
|
||||
}
|
||||
} else if (sd_version_is_flux(version) || sd_version_is_longcat(version)) {
|
||||
} else if (sd_version_is_flux(version) || sd_version_is_longcat(version) || sd_version_is_lens(version)) {
|
||||
pred_type = FLUX_FLOW_PRED;
|
||||
|
||||
default_flow_shift = 1.0f; // TODO: validate
|
||||
@ -1127,6 +1142,8 @@ public:
|
||||
}
|
||||
if (sd_version_is_longcat(version)) {
|
||||
default_flow_shift = 3.0f;
|
||||
} else if (sd_version_is_lens(version)) {
|
||||
default_flow_shift = 1.83f;
|
||||
}
|
||||
} else if (sd_version_is_flux2(version)) {
|
||||
pred_type = FLUX2_FLOW_PRED;
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user