Mirror of https://github.com/leejet/stable-diffusion.cpp.git

commit f3b1223a4f (parent 299070fe00): wip fix
@@ -1755,9 +1755,13 @@ struct LLMEmbedder : public Conditioner {
         std::vector<std::pair<int, ggml_tensor*>> image_embeds;
         std::pair<int, int> prompt_attn_range;
         int prompt_template_encode_start_idx = 34;
+        int prompt_template_encode_end_idx   = 0;
         int max_length    = 0;
         bool spell_quotes = false;
         std::set<int> out_layers;
+        std::vector<int> tokens;
+        std::vector<float> weights;
+        std::vector<float> mask;
         if (llm->enable_vision && conditioner_params.ref_images.size() > 0) {
             if (sd_version_is_longcat(version)) {
                 LOG_INFO("LongCatEditPipeline");
@@ -1937,8 +1941,8 @@ struct LLMEmbedder : public Conditioner {
             prompt += "<|im_end|>\n<|im_start|>assistant\n<think>\n\n</think>\n\n";
         } else if (sd_version_is_longcat(version)) {
             prompt_template_encode_start_idx = 36;
-            // prompt_template_encode_end_idx = 5;
-            max_length = 512;
+            max_length                     = 512 + prompt_template_encode_start_idx;
+            prompt_template_encode_end_idx = 5;
             spell_quotes = true;

             prompt = "<|im_start|>system\nAs an image captioning expert, generate a descriptive text prompt based on an image content, suitable for input to a text-to-image model.<|im_end|>\n<|im_start|>user\n";
@@ -1947,7 +1951,24 @@ struct LLMEmbedder : public Conditioner {
             prompt += conditioner_params.text;
             prompt_attn_range.second = static_cast<int>(prompt.size());

-            prompt += "<|im_end|>\n<|im_start|>assistant\n";
+            auto tokens_and_weights = tokenize(prompt, prompt_attn_range, 0, false, spell_quotes);
+            tokens                  = std::get<0>(tokens_and_weights);
+            weights                 = std::get<1>(tokens_and_weights);
+
+            mask.insert(mask.end(), tokens.size(), 1.f);
+            if (tokens.size() < max_length) {
+                mask.insert(mask.end(), max_length - tokens.size(), 0.f);
+                tokenizer->pad_tokens(tokens, weights, max_length, true);
+            }
+
+            std::string prompt_template_suffix = "<|im_end|>\n<|im_start|>assistant\n";
+            auto suffix_tokens                 = tokenizer->tokenize(prompt_template_suffix, nullptr);
+
+            LOG_DEBUG("%zd", tokens.size());
+
+            tokens.insert(tokens.end(), suffix_tokens.begin(), suffix_tokens.end());
+            weights.insert(weights.end(), suffix_tokens.size(), 1.f);
+            mask.insert(mask.end(), suffix_tokens.size(), 1.f);
         } else {
             prompt_template_encode_start_idx = 34;

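Note: the new LongCat branch tokenizes the prompt without padding, pads tokens/weights up to `max_length` while recording a 0/1 keep-mask, and only then appends the chat-template suffix so the suffix tokens stay attended to. A minimal standalone sketch of that bookkeeping, assuming right-padding behavior for `pad_tokens` and a made-up pad id (not the tokenizer's actual contract):

```cpp
#include <cstdio>
#include <vector>

int main() {
    const int PAD_ID        = 0;  // assumed pad token id
    const size_t max_length = 8;  // stands in for 512 + start_idx

    std::vector<int> tokens = {101, 7592, 2088};  // made-up prompt tokens
    std::vector<float> mask(tokens.size(), 1.f);  // 1 = real token

    if (tokens.size() < max_length) {  // pad to the fixed length
        mask.insert(mask.end(), max_length - tokens.size(), 0.f);
        tokens.insert(tokens.end(), max_length - tokens.size(), PAD_ID);
    }

    std::vector<int> suffix = {102, 103};  // template suffix tokens
    tokens.insert(tokens.end(), suffix.begin(), suffix.end());
    mask.insert(mask.end(), suffix.size(), 1.f);  // suffix stays visible

    for (size_t i = 0; i < tokens.size(); i++)
        printf("%zu: id=%d keep=%.0f\n", i, tokens[i], mask[i]);
    return 0;
}
```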
@@ -1960,17 +1981,33 @@ struct LLMEmbedder : public Conditioner {
             prompt += "<|im_end|>\n<|im_start|>assistant\n";
         }

+        if (tokens.empty()) {
             auto tokens_and_weights = tokenize(prompt, prompt_attn_range, max_length, max_length > 0, spell_quotes);
-            auto& tokens  = std::get<0>(tokens_and_weights);
-            auto& weights = std::get<1>(tokens_and_weights);
+            tokens  = std::get<0>(tokens_and_weights);
+            weights = std::get<1>(tokens_and_weights);
+        }
+

         int64_t t0                        = ggml_time_ms();
         struct ggml_tensor* hidden_states = nullptr;  // [N, n_token, 3584]

         auto input_ids = vector_to_ggml_tensor_i32(work_ctx, tokens);
+        ggml_tensor* attention_mask = nullptr;
+        if (!mask.empty()) {
+            attention_mask = ggml_new_tensor_2d(work_ctx, GGML_TYPE_F32, mask.size(), mask.size());
+            ggml_ext_tensor_iter(attention_mask, [&](ggml_tensor* attention_mask, int64_t i0, int64_t i1, int64_t i2, int64_t i3) {
+                float value = 0.f;
+                if (mask[i0] == 0.f || mask[i1] == 0.f) {
+                    value = -INFINITY;
+                }
+                ggml_ext_tensor_set_f32(attention_mask, value, i0, i1, i2, i3);
+            });
+            print_ggml_tensor(attention_mask);
+        }

         llm->compute(n_threads,
                      input_ids,
+                     attention_mask,
                      image_embeds,
                      out_layers,
                      &hidden_states,
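Note: the mask built here is additive and two-dimensional: entry (i0, i1) is 0 when both positions are real tokens and -INFINITY when either one is padding, so after softmax the padded keys get exactly zero weight. A toy check of that convention outside ggml:

```cpp
#include <cmath>
#include <cstdio>
#include <vector>

int main() {
    std::vector<float> keep   = {1.f, 1.f, 0.f, 0.f};        // last two = padding
    std::vector<float> scores = {0.2f, 1.3f, 0.7f, -0.5f};   // one query row

    // Apply the additive mask the same way the hunk above builds it.
    for (size_t j = 0; j < scores.size(); j++)
        if (keep[j] == 0.f)
            scores[j] = -INFINITY;

    // Softmax: exp(-inf) == 0, so padded keys contribute nothing.
    float max_s = -INFINITY, sum = 0.f;
    for (float s : scores)
        max_s = std::fmax(max_s, s);
    std::vector<float> w(scores.size());
    for (size_t j = 0; j < w.size(); j++)
        sum += w[j] = std::exp(scores[j] - max_s);
    for (size_t j = 0; j < w.size(); j++)
        printf("w[%zu] = %f\n", j, w[j] / sum);
    return 0;
}
```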
@@ -2008,18 +2045,18 @@ struct LLMEmbedder : public Conditioner {
         ggml_tensor* new_hidden_states = ggml_new_tensor_3d(work_ctx,
                                                             GGML_TYPE_F32,
                                                             hidden_states->ne[0],
-                                                            hidden_states->ne[1] - prompt_template_encode_start_idx + zero_pad_len,
+                                                            hidden_states->ne[1] - prompt_template_encode_start_idx + zero_pad_len - prompt_template_encode_end_idx,
                                                             hidden_states->ne[2]);

         ggml_ext_tensor_iter(new_hidden_states, [&](ggml_tensor* new_hidden_states, int64_t i0, int64_t i1, int64_t i2, int64_t i3) {
             float value = 0.f;
-            if (i1 + prompt_template_encode_start_idx < hidden_states->ne[1]) {
+            if (i1 + prompt_template_encode_start_idx < hidden_states->ne[1] - prompt_template_encode_end_idx) {
                 value = ggml_ext_tensor_get_f32(hidden_states, i0, i1 + prompt_template_encode_start_idx, i2, i3);
             }
             ggml_ext_tensor_set_f32(new_hidden_states, value, i0, i1, i2, i3);
         });

-        // print_ggml_tensor(new_hidden_states);
+        print_ggml_tensor(new_hidden_states, true);

         int64_t t1 = ggml_time_ms();
         LOG_DEBUG("computing condition graph completed, taking %" PRId64 " ms", t1 - t0);
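Note: with `prompt_template_encode_end_idx` in play, the copy above keeps only rows [start_idx, ne[1] - end_idx) of the hidden states and zero-pads the rest of the output. A toy index-arithmetic check of that guard, with made-up sizes:

```cpp
#include <cstdio>
#include <vector>

int main() {
    const int n_rows = 10, start_idx = 3, end_idx = 2, zero_pad_len = 4;
    const int out_rows = n_rows - start_idx + zero_pad_len - end_idx;  // 9

    std::vector<float> hidden(n_rows);
    for (int i = 0; i < n_rows; i++)
        hidden[i] = float(i);  // row i just holds the value i

    std::vector<float> out(out_rows, 0.f);
    for (int i1 = 0; i1 < out_rows; i1++)
        if (i1 + start_idx < n_rows - end_idx)  // same guard as the hunk
            out[i1] = hidden[i1 + start_idx];

    // Rows 3..7 are copied; the tail stays zero-padded.
    for (int i1 = 0; i1 < out_rows; i1++)
        printf("out[%d] = %.0f\n", i1, out[i1]);
    return 0;
}
```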
@@ -2152,7 +2152,7 @@ public:
               bool bias           = true,
               bool force_f32      = false,
               bool force_prec_f32 = false,
-              float scale         = 1.f)
+              float scale         = 1.f / 128.f)
         : in_features(in_features),
           out_features(out_features),
           bias(bias),
llm.hpp (34 changes)
@@ -837,7 +837,8 @@ namespace LLM {

        struct ggml_tensor* forward(GGMLRunnerContext* ctx,
                                    struct ggml_tensor* x,
-                                   struct ggml_tensor* input_pos) {
+                                   struct ggml_tensor* input_pos,
+                                   struct ggml_tensor* attention_mask = nullptr) {
            // x: [N, n_token, hidden_size]
            int64_t n_token = x->ne[1];
            int64_t N       = x->ne[2];
@@ -880,7 +881,7 @@ namespace LLM {
            k = ggml_cont(ctx->ggml_ctx, ggml_ext_torch_permute(ctx->ggml_ctx, k, 0, 2, 1, 3));  // [N, num_kv_heads, n_token, head_dim]
            k = ggml_reshape_3d(ctx->ggml_ctx, k, k->ne[0], k->ne[1], k->ne[2] * k->ne[3]);      // [N*num_kv_heads, n_token, head_dim]

-           x = ggml_ext_attention_ext(ctx->ggml_ctx, ctx->backend, q, k, v, num_heads, nullptr, true, true, false);  // [N, n_token, hidden_size]
+           x = ggml_ext_attention_ext(ctx->ggml_ctx, ctx->backend, q, k, v, num_heads, attention_mask, true, true, false);  // [N, n_token, hidden_size]

            x = out_proj->forward(ctx, x);  // [N, n_token, hidden_size]
            return x;
@@ -898,7 +899,8 @@ namespace LLM {

        struct ggml_tensor* forward(GGMLRunnerContext* ctx,
                                    struct ggml_tensor* x,
-                                   struct ggml_tensor* input_pos) {
+                                   struct ggml_tensor* input_pos,
+                                   struct ggml_tensor* attention_mask = nullptr) {
            // x: [N, n_token, hidden_size]
            auto self_attn = std::dynamic_pointer_cast<Attention>(blocks["self_attn"]);
            auto mlp       = std::dynamic_pointer_cast<MLP>(blocks["mlp"]);
@@ -907,7 +909,7 @@ namespace LLM {

            auto residual = x;
            x             = input_layernorm->forward(ctx, x);
-           x             = self_attn->forward(ctx, x, input_pos);
+           x             = self_attn->forward(ctx, x, input_pos, attention_mask);
            x             = ggml_add_inplace(ctx->ggml_ctx, x, residual);

            residual = x;
@@ -936,6 +938,7 @@ namespace LLM {
        struct ggml_tensor* forward(GGMLRunnerContext* ctx,
                                    struct ggml_tensor* input_ids,
                                    struct ggml_tensor* input_pos,
+                                   struct ggml_tensor* attention_mask,
                                    std::vector<std::pair<int, ggml_tensor*>> image_embeds,
                                    std::set<int> out_layers) {
            // input_ids: [N, n_token]
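Note: `attention_mask` is threaded through the whole call chain as an optional pointer defaulting to nullptr, so a null mask means "no masking" and existing call sites keep compiling. A minimal sketch of that pattern with illustrative names (not the repo's API):

```cpp
#include <cstdio>

struct Tensor { float bias; };  // stand-in for ggml_tensor

// Additive mask is applied only when a mask is actually supplied.
float attend(float score, const Tensor* mask) {
    return mask ? score + mask->bias : score;
}

// Each layer just forwards the same pointer it received.
float block_forward(float x, const Tensor* mask = nullptr) {
    return attend(x, mask);
}

int main() {
    Tensor m{-1e9f};
    printf("masked:   %f\n", block_forward(3.f, &m));
    printf("unmasked: %f\n", block_forward(3.f));  // old call shape still works
    return 0;
}
```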
@@ -990,7 +993,7 @@ namespace LLM {
            for (int i = 0; i < num_layers; i++) {
                auto block = std::dynamic_pointer_cast<TransformerBlock>(blocks["layers." + std::to_string(i)]);

-               x = block->forward(ctx, x, input_pos);
+               x = block->forward(ctx, x, input_pos, attention_mask);
                if (out_layers.find(i + 1) != out_layers.end()) {
                    intermediate_outputs.push_back(x);
                }
@@ -1036,12 +1039,13 @@ namespace LLM {
        struct ggml_tensor* forward(GGMLRunnerContext* ctx,
                                    struct ggml_tensor* input_ids,
                                    struct ggml_tensor* input_pos,
+                                   struct ggml_tensor* attention_mask,
                                    std::vector<std::pair<int, ggml_tensor*>> image_embeds,
                                    std::set<int> out_layers) {
            // input_ids: [N, n_token]
            auto model = std::dynamic_pointer_cast<TextModel>(blocks["model"]);

-           auto x = model->forward(ctx, input_ids, input_pos, image_embeds, out_layers);
+           auto x = model->forward(ctx, input_ids, input_pos, attention_mask, image_embeds, out_layers);
            return x;
        }

@@ -1157,9 +1161,10 @@ namespace LLM {
        struct ggml_tensor* forward(GGMLRunnerContext* ctx,
                                    struct ggml_tensor* input_ids,
                                    struct ggml_tensor* input_pos,
+                                   struct ggml_tensor* attention_mask,
                                    std::vector<std::pair<int, ggml_tensor*>> image_embeds,
                                    std::set<int> out_layers) {
-           auto hidden_states = model.forward(ctx, input_ids, input_pos, image_embeds, out_layers);  // [N, n_token, hidden_size]
+           auto hidden_states = model.forward(ctx, input_ids, input_pos, attention_mask, image_embeds, out_layers);  // [N, n_token, hidden_size]
            return hidden_states;
        }

@@ -1174,11 +1179,13 @@ namespace LLM {
        }

        struct ggml_cgraph* build_graph(struct ggml_tensor* input_ids,
+                                       struct ggml_tensor* attention_mask,
                                        std::vector<std::pair<int, ggml_tensor*>> image_embeds,
                                        std::set<int> out_layers) {
            struct ggml_cgraph* gf = ggml_new_graph(compute_ctx);

            input_ids = to_backend(input_ids);
+           attention_mask = to_backend(attention_mask);

            for (auto& image_embed : image_embeds) {
                image_embed.second = to_backend(image_embed.second);
@@ -1207,7 +1214,7 @@ namespace LLM {

            auto runner_ctx = get_context();

-           struct ggml_tensor* hidden_states = forward(&runner_ctx, input_ids, input_pos, image_embeds, out_layers);
+           struct ggml_tensor* hidden_states = forward(&runner_ctx, input_ids, input_pos, attention_mask, image_embeds, out_layers);

            ggml_build_forward_expand(gf, hidden_states);

@@ -1216,12 +1223,13 @@ namespace LLM {

        bool compute(const int n_threads,
                     struct ggml_tensor* input_ids,
+                    struct ggml_tensor* attention_mask,
                     std::vector<std::pair<int, ggml_tensor*>> image_embeds,
                     std::set<int> out_layers,
                     ggml_tensor** output,
                     ggml_context* output_ctx = nullptr) {
            auto get_graph = [&]() -> struct ggml_cgraph* {
-               return build_graph(input_ids, image_embeds, out_layers);
+               return build_graph(input_ids, attention_mask, image_embeds, out_layers);
            };
            return GGMLRunner::compute(get_graph, n_threads, true, output, output_ctx);
        }
@@ -1525,7 +1533,7 @@ namespace LLM {
        struct ggml_tensor* out = nullptr;

        int t0 = ggml_time_ms();
-       model.compute(8, input_ids, image_embeds, {}, &out, work_ctx);
+       model.compute(8, input_ids, nullptr, image_embeds, {}, &out, work_ctx);
        int t1 = ggml_time_ms();

        print_ggml_tensor(out);
@@ -1565,7 +1573,7 @@ namespace LLM {
        struct ggml_tensor* out = nullptr;

        int t0 = ggml_time_ms();
-       model.compute(8, input_ids, {}, {10, 20, 30}, &out, work_ctx);
+       model.compute(8, input_ids, nullptr, {}, {10, 20, 30}, &out, work_ctx);
        int t1 = ggml_time_ms();

        print_ggml_tensor(out);
@@ -1588,7 +1596,7 @@ namespace LLM {
        struct ggml_tensor* out = nullptr;

        int t0 = ggml_time_ms();
-       model.compute(8, input_ids, {}, {35}, &out, work_ctx);
+       model.compute(8, input_ids, nullptr, {}, {35}, &out, work_ctx);
        int t1 = ggml_time_ms();

        print_ggml_tensor(out);
@@ -1611,7 +1619,7 @@ namespace LLM {
        struct ggml_tensor* out = nullptr;

        int t0 = ggml_time_ms();
-       model.compute(8, input_ids, {}, {}, &out, work_ctx);
+       model.compute(8, input_ids, nullptr, {}, {}, &out, work_ctx);
        int t1 = ggml_time_ms();

        print_ggml_tensor(out);