diff --git a/conditioner.hpp b/conditioner.hpp
index a376b1e..005d2c1 100644
--- a/conditioner.hpp
+++ b/conditioner.hpp
@@ -1755,9 +1755,13 @@ struct LLMEmbedder : public Conditioner {
         std::vector<std::pair<int, ggml_tensor*>> image_embeds;
         std::pair<int, int> prompt_attn_range;
         int prompt_template_encode_start_idx = 34;
+        int prompt_template_encode_end_idx = 0;
         int max_length = 0;
         bool spell_quotes = false;
         std::set<int> out_layers;
+        std::vector<int> tokens;
+        std::vector<float> weights;
+        std::vector<float> mask;
         if (llm->enable_vision && conditioner_params.ref_images.size() > 0) {
             if (sd_version_is_longcat(version)) {
                 LOG_INFO("LongCatEditPipeline");
@@ -1937,8 +1941,8 @@
             prompt += "<|im_end|>\n<|im_start|>assistant\n\n\n\n\n";
         } else if (sd_version_is_longcat(version)) {
             prompt_template_encode_start_idx = 36;
-            // prompt_template_encode_end_idx = 5;
-            max_length = 512;
+            max_length = 512 + prompt_template_encode_start_idx;
+            prompt_template_encode_end_idx = 5;
             spell_quotes = true;
 
             prompt = "<|im_start|>system\nAs an image captioning expert, generate a descriptive text prompt based on an image content, suitable for input to a text-to-image model.<|im_end|>\n<|im_start|>user\n";
@@ -1947,7 +1951,24 @@
 
             prompt += conditioner_params.text;
             prompt_attn_range.second = static_cast<int>(prompt.size());
 
-            prompt += "<|im_end|>\n<|im_start|>assistant\n";
+            auto tokens_and_weights = tokenize(prompt, prompt_attn_range, 0, false, spell_quotes);
+            tokens = std::get<0>(tokens_and_weights);
+            weights = std::get<1>(tokens_and_weights);
+
+            mask.insert(mask.end(), tokens.size(), 1.f);
+            if (tokens.size() < (size_t)max_length) {
+                mask.insert(mask.end(), max_length - tokens.size(), 0.f);
+                tokenizer->pad_tokens(tokens, weights, max_length, true);
+            }
+
+            std::string prompt_template_suffix = "<|im_end|>\n<|im_start|>assistant\n";
+            auto suffix_tokens = tokenizer->tokenize(prompt_template_suffix, nullptr);
+
+            LOG_DEBUG("token count: %zu", tokens.size());
+
+            tokens.insert(tokens.end(), suffix_tokens.begin(), suffix_tokens.end());
+            weights.insert(weights.end(), suffix_tokens.size(), 1.f);
+            mask.insert(mask.end(), suffix_tokens.size(), 1.f);
         } else {
             prompt_template_encode_start_idx = 34;
@@ -1960,17 +1981,33 @@
             prompt += "<|im_end|>\n<|im_start|>assistant\n";
         }
 
-        auto tokens_and_weights = tokenize(prompt, prompt_attn_range, max_length, max_length > 0, spell_quotes);
-        auto& tokens = std::get<0>(tokens_and_weights);
-        auto& weights = std::get<1>(tokens_and_weights);
+        if (tokens.empty()) {
+            auto tokens_and_weights = tokenize(prompt, prompt_attn_range, max_length, max_length > 0, spell_quotes);
+            tokens = std::get<0>(tokens_and_weights);
+            weights = std::get<1>(tokens_and_weights);
+        }
 
         int64_t t0 = ggml_time_ms();
         struct ggml_tensor* hidden_states = nullptr;  // [N, n_token, 3584]
 
         auto input_ids = vector_to_ggml_tensor_i32(work_ctx, tokens);
+
+        ggml_tensor* attention_mask = nullptr;
+        if (!mask.empty()) {
+            attention_mask = ggml_new_tensor_2d(work_ctx, GGML_TYPE_F32, mask.size(), mask.size());
+            ggml_ext_tensor_iter(attention_mask, [&](ggml_tensor* attention_mask, int64_t i0, int64_t i1, int64_t i2, int64_t i3) {
+                float value = 0.f;
+                if (mask[i0] == 0.f || mask[i1] == 0.f) {
+                    value = -INFINITY;
+                }
+                ggml_ext_tensor_set_f32(attention_mask, value, i0, i1, i2, i3);
+            });
+            print_ggml_tensor(attention_mask);
+        }
 
         llm->compute(n_threads,
                      input_ids,
+                     attention_mask,
                      image_embeds,
                      out_layers,
                      &hidden_states,
@@ -2008,18 +2045,18 @@ struct LLMEmbedder : public Conditioner {
 
         ggml_tensor* new_hidden_states = ggml_new_tensor_3d(work_ctx,
                                                             GGML_TYPE_F32,
                                                             hidden_states->ne[0],
-                                                            hidden_states->ne[1] - prompt_template_encode_start_idx + zero_pad_len,
+                                                            hidden_states->ne[1] - prompt_template_encode_start_idx + zero_pad_len - prompt_template_encode_end_idx,
                                                             hidden_states->ne[2]);
         ggml_ext_tensor_iter(new_hidden_states, [&](ggml_tensor* new_hidden_states, int64_t i0, int64_t i1, int64_t i2, int64_t i3) {
             float value = 0.f;
-            if (i1 + prompt_template_encode_start_idx < hidden_states->ne[1]) {
+            if (i1 + prompt_template_encode_start_idx < hidden_states->ne[1] - prompt_template_encode_end_idx) {
                 value = ggml_ext_tensor_get_f32(hidden_states, i0, i1 + prompt_template_encode_start_idx, i2, i3);
             }
             ggml_ext_tensor_set_f32(new_hidden_states, value, i0, i1, i2, i3);
         });
 
-        // print_ggml_tensor(new_hidden_states);
+        print_ggml_tensor(new_hidden_states, true);
 
         int64_t t1 = ggml_time_ms();
         LOG_DEBUG("computing condition graph completed, taking %" PRId64 " ms", t1 - t0);
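Note on the @@ -2008,18 +2045,18 @@ hunk just above: the crop that strips the prompt-template scaffolding from the LLM output now also drops prompt_template_encode_end_idx trailing positions (the assistant suffix, which the longcat branch tokenizes separately), in addition to skipping prompt_template_encode_start_idx leading ones. A minimal sketch of that slicing rule on a plain [n_token][d] buffer; trim_template is a hypothetical stand-in for the ggml_ext_tensor_iter loop and assumes start_idx + end_idx <= h.size():

#include <cstddef>
#include <vector>

std::vector<std::vector<float>> trim_template(const std::vector<std::vector<float>>& h,
                                              size_t start_idx,
                                              size_t end_idx,
                                              size_t zero_pad_len) {
    size_t d    = h.empty() ? 0 : h[0].size();
    size_t kept = h.size() - start_idx - end_idx;  // rows copied from the source
    // Rows past `kept` stay zero, matching `value = 0.f` for out-of-range i1.
    std::vector<std::vector<float>> out(kept + zero_pad_len, std::vector<float>(d, 0.f));
    for (size_t i = 0; i < kept; i++) {
        out[i] = h[start_idx + i];
    }
    return out;
}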
diff --git a/ggml_extend.hpp b/ggml_extend.hpp
index 9c4975a..da834c3 100644
--- a/ggml_extend.hpp
+++ b/ggml_extend.hpp
@@ -2152,7 +2152,7 @@ public:
            bool bias = true,
            bool force_f32 = false,
            bool force_prec_f32 = false,
-           float scale = 1.f)
+           float scale = 1.f / 128.f)
        : in_features(in_features),
          out_features(out_features),
          bias(bias),
diff --git a/llm.hpp b/llm.hpp
index dc04c84..49ffbc5 100644
--- a/llm.hpp
+++ b/llm.hpp
@@ -837,7 +837,8 @@ namespace LLM {
 
         struct ggml_tensor* forward(GGMLRunnerContext* ctx,
                                     struct ggml_tensor* x,
-                                    struct ggml_tensor* input_pos) {
+                                    struct ggml_tensor* input_pos,
+                                    struct ggml_tensor* attention_mask = nullptr) {
             // x: [N, n_token, hidden_size]
             int64_t n_token = x->ne[1];
             int64_t N = x->ne[2];
@@ -880,7 +881,7 @@
             k = ggml_cont(ctx->ggml_ctx, ggml_ext_torch_permute(ctx->ggml_ctx, k, 0, 2, 1, 3));  // [N, num_kv_heads, n_token, head_dim]
             k = ggml_reshape_3d(ctx->ggml_ctx, k, k->ne[0], k->ne[1], k->ne[2] * k->ne[3]);      // [N*num_kv_heads, n_token, head_dim]
 
-            x = ggml_ext_attention_ext(ctx->ggml_ctx, ctx->backend, q, k, v, num_heads, nullptr, true, true, false);  // [N, n_token, hidden_size]
+            x = ggml_ext_attention_ext(ctx->ggml_ctx, ctx->backend, q, k, v, num_heads, attention_mask, true, true, false);  // [N, n_token, hidden_size]
 
             x = out_proj->forward(ctx, x);  // [N, n_token, hidden_size]
             return x;
@@ -898,7 +899,8 @@
 
         struct ggml_tensor* forward(GGMLRunnerContext* ctx,
                                     struct ggml_tensor* x,
-                                    struct ggml_tensor* input_pos) {
+                                    struct ggml_tensor* input_pos,
+                                    struct ggml_tensor* attention_mask = nullptr) {
             // x: [N, n_token, hidden_size]
             auto self_attn = std::dynamic_pointer_cast(blocks["self_attn"]);
             auto mlp = std::dynamic_pointer_cast(blocks["mlp"]);
@@ -907,7 +909,7 @@
 
             auto residual = x;
             x = input_layernorm->forward(ctx, x);
-            x = self_attn->forward(ctx, x, input_pos);
+            x = self_attn->forward(ctx, x, input_pos, attention_mask);
             x = ggml_add_inplace(ctx->ggml_ctx, x, residual);
 
             residual = x;
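For reference, the attention_mask that now reaches ggml_ext_attention_ext is an additive bias on the pre-softmax scores, i.e. softmax(q.k / sqrt(d) + bias): a 0.f entry leaves a score unchanged, while a -INFINITY entry becomes a zero attention weight, so padded positions neither attend nor get attended to. A single-query, single-head sketch of that arithmetic; the names are illustrative, not the ggml API, and it assumes at least one unmasked key so the softmax stays finite:

#include <algorithm>
#include <cmath>
#include <cstddef>
#include <vector>

std::vector<float> attend_row(const std::vector<float>& q,               // [d]
                              const std::vector<std::vector<float>>& k,  // [n][d]
                              const std::vector<std::vector<float>>& v,  // [n][d]
                              const std::vector<float>& bias) {          // [n], 0.f or -INFINITY
    const size_t n = k.size(), d = q.size();
    std::vector<float> w(n);
    float w_max = -INFINITY;
    for (size_t j = 0; j < n; j++) {
        float s = 0.f;
        for (size_t c = 0; c < d; c++) {
            s += q[c] * k[j][c];
        }
        w[j]  = s / std::sqrt((float)d) + bias[j];  // additive mask, as in the diff
        w_max = std::max(w_max, w[j]);
    }
    float sum = 0.f;
    for (size_t j = 0; j < n; j++) {
        w[j] = std::exp(w[j] - w_max);  // exp(-INFINITY) == 0: masked keys drop out
        sum += w[j];
    }
    std::vector<float> out(d, 0.f);
    for (size_t j = 0; j < n; j++) {
        for (size_t c = 0; c < d; c++) {
            out[c] += (w[j] / sum) * v[j][c];
        }
    }
    return out;
}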
@@ -936,6 +938,7 @@
         struct ggml_tensor* forward(GGMLRunnerContext* ctx,
                                     struct ggml_tensor* input_ids,
                                     struct ggml_tensor* input_pos,
+                                    struct ggml_tensor* attention_mask,
                                     std::vector<std::pair<int, ggml_tensor*>> image_embeds,
                                     std::set<int> out_layers) {
             // input_ids: [N, n_token]
@@ -990,7 +993,7 @@
 
             for (int i = 0; i < num_layers; i++) {
                 auto block = std::dynamic_pointer_cast(blocks["layers." + std::to_string(i)]);
-                x = block->forward(ctx, x, input_pos);
+                x = block->forward(ctx, x, input_pos, attention_mask);
                 if (out_layers.find(i + 1) != out_layers.end()) {
                     intermediate_outputs.push_back(x);
                 }
@@ -1036,12 +1039,13 @@ namespace LLM {
 
         struct ggml_tensor* forward(GGMLRunnerContext* ctx,
                                     struct ggml_tensor* input_ids,
                                     struct ggml_tensor* input_pos,
+                                    struct ggml_tensor* attention_mask,
                                     std::vector<std::pair<int, ggml_tensor*>> image_embeds,
                                     std::set<int> out_layers) {
             // input_ids: [N, n_token]
             auto model = std::dynamic_pointer_cast(blocks["model"]);
 
-            auto x = model->forward(ctx, input_ids, input_pos, image_embeds, out_layers);
+            auto x = model->forward(ctx, input_ids, input_pos, attention_mask, image_embeds, out_layers);
             return x;
         }
@@ -1157,9 +1161,10 @@ namespace LLM {
 
         struct ggml_tensor* forward(GGMLRunnerContext* ctx,
                                     struct ggml_tensor* input_ids,
                                     struct ggml_tensor* input_pos,
+                                    struct ggml_tensor* attention_mask,
                                     std::vector<std::pair<int, ggml_tensor*>> image_embeds,
                                     std::set<int> out_layers) {
-            auto hidden_states = model.forward(ctx, input_ids, input_pos, image_embeds, out_layers);  // [N, n_token, hidden_size]
+            auto hidden_states = model.forward(ctx, input_ids, input_pos, attention_mask, image_embeds, out_layers);  // [N, n_token, hidden_size]
             return hidden_states;
         }
@@ -1174,11 +1179,13 @@ namespace LLM {
         }
 
         struct ggml_cgraph* build_graph(struct ggml_tensor* input_ids,
+                                        struct ggml_tensor* attention_mask,
                                         std::vector<std::pair<int, ggml_tensor*>> image_embeds,
                                         std::set<int> out_layers) {
             struct ggml_cgraph* gf = ggml_new_graph(compute_ctx);
 
             input_ids = to_backend(input_ids);
+            attention_mask = to_backend(attention_mask);
 
             for (auto& image_embed : image_embeds) {
                 image_embed.second = to_backend(image_embed.second);
@@ -1207,7 +1214,7 @@
 
             auto runner_ctx = get_context();
 
-            struct ggml_tensor* hidden_states = forward(&runner_ctx, input_ids, input_pos, image_embeds, out_layers);
+            struct ggml_tensor* hidden_states = forward(&runner_ctx, input_ids, input_pos, attention_mask, image_embeds, out_layers);
 
             ggml_build_forward_expand(gf, hidden_states);
 
@@ -1216,12 +1223,13 @@
 
         bool compute(const int n_threads,
                      struct ggml_tensor* input_ids,
+                     struct ggml_tensor* attention_mask,
                      std::vector<std::pair<int, ggml_tensor*>> image_embeds,
                      std::set<int> out_layers,
                      ggml_tensor** output,
                      ggml_context* output_ctx = nullptr) {
             auto get_graph = [&]() -> struct ggml_cgraph* {
-                return build_graph(input_ids, image_embeds, out_layers);
+                return build_graph(input_ids, attention_mask, image_embeds, out_layers);
             };
             return GGMLRunner::compute(get_graph, n_threads, true, output, output_ctx);
         }
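The mask compute() now accepts is the square additive bias that conditioner.hpp builds from its flat per-token vector (1.f for real tokens, 0.f for padding): entry (i0, i1) stays 0.f only when both tokens are real. A self-contained sketch of that layout; fill_mask is a hypothetical helper mirroring the ggml_ext_tensor_iter lambda in the conditioner hunk:

#include <cmath>
#include <cstddef>
#include <vector>

// Row-major [n, n] additive bias from a 0/1 keep vector.
std::vector<float> fill_mask(const std::vector<float>& keep) {
    const size_t n = keep.size();
    std::vector<float> bias(n * n, 0.f);
    for (size_t i1 = 0; i1 < n; i1++) {
        for (size_t i0 = 0; i0 < n; i0++) {
            if (keep[i0] == 0.f || keep[i1] == 0.f) {
                bias[i1 * n + i0] = -INFINITY;  // either side is padding
            }
        }
    }
    return bias;
}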
@@ -1525,7 +1533,7 @@
         struct ggml_tensor* out = nullptr;
 
         int t0 = ggml_time_ms();
-        model.compute(8, input_ids, image_embeds, {}, &out, work_ctx);
+        model.compute(8, input_ids, nullptr, image_embeds, {}, &out, work_ctx);
         int t1 = ggml_time_ms();
 
         print_ggml_tensor(out);
@@ -1565,7 +1573,7 @@
         struct ggml_tensor* out = nullptr;
 
         int t0 = ggml_time_ms();
-        model.compute(8, input_ids, {}, {10, 20, 30}, &out, work_ctx);
+        model.compute(8, input_ids, nullptr, {}, {10, 20, 30}, &out, work_ctx);
         int t1 = ggml_time_ms();
 
         print_ggml_tensor(out);
@@ -1588,7 +1596,7 @@
         struct ggml_tensor* out = nullptr;
 
         int t0 = ggml_time_ms();
-        model.compute(8, input_ids, {}, {35}, &out, work_ctx);
+        model.compute(8, input_ids, nullptr, {}, {35}, &out, work_ctx);
         int t1 = ggml_time_ms();
 
         print_ggml_tensor(out);
@@ -1611,7 +1619,7 @@
         struct ggml_tensor* out = nullptr;
 
         int t0 = ggml_time_ms();
-        model.compute(8, input_ids, {}, {}, &out, work_ctx);
+        model.compute(8, input_ids, nullptr, {}, {}, &out, work_ctx);
         int t1 = ggml_time_ms();
 
         print_ggml_tensor(out);
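A possible companion test in the style of the blocks above, driving compute() with an actual mask instead of nullptr. model, work_ctx and the helper functions are the names the surrounding test code already uses; the token ids and the one-token padding are made up for illustration:

{
    std::vector<int> tokens = {100, 101, 102, 0};  // hypothetical ids, last one padding
    std::vector<float> keep = {1.f, 1.f, 1.f, 0.f};

    auto input_ids      = vector_to_ggml_tensor_i32(work_ctx, tokens);
    auto attention_mask = ggml_new_tensor_2d(work_ctx, GGML_TYPE_F32, keep.size(), keep.size());
    for (int64_t i1 = 0; i1 < (int64_t)keep.size(); i1++) {
        for (int64_t i0 = 0; i0 < (int64_t)keep.size(); i0++) {
            float value = (keep[i0] == 0.f || keep[i1] == 0.f) ? -INFINITY : 0.f;
            ggml_ext_tensor_set_f32(attention_mask, value, i0, i1, 0, 0);
        }
    }

    struct ggml_tensor* out = nullptr;
    model.compute(8, input_ids, attention_mask, {}, {}, &out, work_ctx);
    print_ggml_tensor(out);
}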