Add diff LoRA support

This commit is contained in:
leejet 2025-08-23 19:06:30 +08:00
parent e69195d22f
commit 9fcc85688d
3 changed files with 98 additions and 78 deletions

172
lora.hpp
View File

@ -92,7 +92,7 @@ struct LoraModel : public GGMLRunner {
float multiplier = 1.0f;
std::map<std::string, struct ggml_tensor*> lora_tensors;
std::map<ggml_tensor*, ggml_tensor*> original_weight_to_final_weight;
std::map<ggml_tensor*, ggml_tensor*> original_tensor_to_final_tensor;
std::string file_path;
ModelLoader model_loader;
bool load_failed = false;
@ -248,12 +248,18 @@ struct LoraModel : public GGMLRunner {
std::set<std::string> applied_lora_tensors;
for (auto it : model_tensors) {
std::string k_tensor = it.first;
struct ggml_tensor* weight = model_tensors[it.first];
std::string model_tensor_name = it.first;
struct ggml_tensor* model_tensor = model_tensors[it.first];
std::vector<std::string> keys = to_lora_keys(k_tensor, version);
if (keys.size() == 0)
continue;
std::vector<std::string> keys = to_lora_keys(model_tensor_name, version);
bool is_bias = ends_with(model_tensor_name, ".bias");
if (keys.size() == 0) {
if (is_bias) {
keys.push_back(model_tensor_name.substr(0, model_tensor_name.size() - 5)); // remove .bias
} else {
continue;
}
}
for (auto& key : keys) {
bool is_qkv_split = starts_with(key, "SPLIT|");
@ -266,8 +272,22 @@ struct LoraModel : public GGMLRunner {
}
struct ggml_tensor* updown = NULL;
float scale_value = 1.0f;
std::string fk = lora_pre[type] + key;
if (lora_tensors.find(fk + ".hada_w1_a") != lora_tensors.end()) {
std::string full_key = lora_pre[type] + key;
if (is_bias) {
if (lora_tensors.find(full_key + ".diff_b") != lora_tensors.end()) {
std::string diff_name = full_key + ".diff_b";
ggml_tensor* diff = lora_tensors[diff_name];
updown = to_f32(compute_ctx, diff);
applied_lora_tensors.insert(diff_name);
} else {
continue;
}
} else if (lora_tensors.find(full_key + ".diff") != lora_tensors.end()) {
std::string diff_name = full_key + ".diff";
ggml_tensor* diff = lora_tensors[diff_name];
updown = to_f32(compute_ctx, diff);
applied_lora_tensors.insert(diff_name);
} else if (lora_tensors.find(full_key + ".hada_w1_a") != lora_tensors.end()) {
// LoHa mode
// TODO: split qkv convention for LoHas (is it ever used?)
@ -293,9 +313,9 @@ struct LoraModel : public GGMLRunner {
std::string hada_2_down_name = "";
std::string hada_2_up_name = "";
hada_1_down_name = fk + ".hada_w1_b";
hada_1_up_name = fk + ".hada_w1_a";
hada_1_mid_name = fk + ".hada_t1";
hada_1_down_name = full_key + ".hada_w1_b";
hada_1_up_name = full_key + ".hada_w1_a";
hada_1_mid_name = full_key + ".hada_t1";
if (lora_tensors.find(hada_1_down_name) != lora_tensors.end()) {
hada_1_down = to_f32(compute_ctx, lora_tensors[hada_1_down_name]);
}
@ -308,9 +328,9 @@ struct LoraModel : public GGMLRunner {
hada_1_up = ggml_cont(compute_ctx, ggml_transpose(compute_ctx, hada_1_up));
}
hada_2_down_name = fk + ".hada_w2_b";
hada_2_up_name = fk + ".hada_w2_a";
hada_2_mid_name = fk + ".hada_t2";
hada_2_down_name = full_key + ".hada_w2_b";
hada_2_up_name = full_key + ".hada_w2_a";
hada_2_mid_name = full_key + ".hada_t2";
if (lora_tensors.find(hada_2_down_name) != lora_tensors.end()) {
hada_2_down = to_f32(compute_ctx, lora_tensors[hada_2_down_name]);
}
@ -323,7 +343,7 @@ struct LoraModel : public GGMLRunner {
hada_2_up = ggml_cont(compute_ctx, ggml_transpose(compute_ctx, hada_2_up));
}
alpha_name = fk + ".alpha";
alpha_name = full_key + ".alpha";
applied_lora_tensors.insert(hada_1_down_name);
applied_lora_tensors.insert(hada_1_up_name);
@ -346,7 +366,7 @@ struct LoraModel : public GGMLRunner {
float alpha = ggml_backend_tensor_get_f32(lora_tensors[alpha_name]);
scale_value = alpha / rank;
}
} else if (lora_tensors.find(fk + ".lokr_w1") != lora_tensors.end() || lora_tensors.find(fk + ".lokr_w1_a") != lora_tensors.end()) {
} else if (lora_tensors.find(full_key + ".lokr_w1") != lora_tensors.end() || lora_tensors.find(full_key + ".lokr_w1_a") != lora_tensors.end()) {
// LoKr mode
// TODO: split qkv convention for LoKrs (is it ever used?)
@ -355,7 +375,7 @@ struct LoraModel : public GGMLRunner {
break;
}
std::string alpha_name = fk + ".alpha";
std::string alpha_name = full_key + ".alpha";
ggml_tensor* lokr_w1 = NULL;
ggml_tensor* lokr_w2 = NULL;
@ -363,8 +383,8 @@ struct LoraModel : public GGMLRunner {
std::string lokr_w1_name = "";
std::string lokr_w2_name = "";
lokr_w1_name = fk + ".lokr_w1";
lokr_w2_name = fk + ".lokr_w2";
lokr_w1_name = full_key + ".lokr_w1";
lokr_w2_name = full_key + ".lokr_w2";
if (lora_tensors.find(lokr_w1_name) != lora_tensors.end()) {
lokr_w1 = to_f32(compute_ctx, lora_tensors[lokr_w1_name]);
@ -436,29 +456,29 @@ struct LoraModel : public GGMLRunner {
if (is_qkv_split) {
std::string suffix = "";
auto split_q_d_name = fk + "q" + suffix + lora_downs[type] + ".weight";
auto split_q_d_name = full_key + "q" + suffix + lora_downs[type] + ".weight";
if (lora_tensors.find(split_q_d_name) == lora_tensors.end()) {
suffix = "_proj";
split_q_d_name = fk + "q" + suffix + lora_downs[type] + ".weight";
split_q_d_name = full_key + "q" + suffix + lora_downs[type] + ".weight";
}
if (lora_tensors.find(split_q_d_name) != lora_tensors.end()) {
// print_ggml_tensor(it.second, true); //[3072, 21504, 1, 1]
// find qkv and mlp up parts in LoRA model
auto split_k_d_name = fk + "k" + suffix + lora_downs[type] + ".weight";
auto split_v_d_name = fk + "v" + suffix + lora_downs[type] + ".weight";
auto split_k_d_name = full_key + "k" + suffix + lora_downs[type] + ".weight";
auto split_v_d_name = full_key + "v" + suffix + lora_downs[type] + ".weight";
auto split_q_u_name = fk + "q" + suffix + lora_ups[type] + ".weight";
auto split_k_u_name = fk + "k" + suffix + lora_ups[type] + ".weight";
auto split_v_u_name = fk + "v" + suffix + lora_ups[type] + ".weight";
auto split_q_u_name = full_key + "q" + suffix + lora_ups[type] + ".weight";
auto split_k_u_name = full_key + "k" + suffix + lora_ups[type] + ".weight";
auto split_v_u_name = full_key + "v" + suffix + lora_ups[type] + ".weight";
auto split_q_scale_name = fk + "q" + suffix + ".scale";
auto split_k_scale_name = fk + "k" + suffix + ".scale";
auto split_v_scale_name = fk + "v" + suffix + ".scale";
auto split_q_scale_name = full_key + "q" + suffix + ".scale";
auto split_k_scale_name = full_key + "k" + suffix + ".scale";
auto split_v_scale_name = full_key + "v" + suffix + ".scale";
auto split_q_alpha_name = fk + "q" + suffix + ".alpha";
auto split_k_alpha_name = fk + "k" + suffix + ".alpha";
auto split_v_alpha_name = fk + "v" + suffix + ".alpha";
auto split_q_alpha_name = full_key + "q" + suffix + ".alpha";
auto split_k_alpha_name = full_key + "k" + suffix + ".alpha";
auto split_v_alpha_name = full_key + "v" + suffix + ".alpha";
ggml_tensor* lora_q_down = NULL;
ggml_tensor* lora_q_up = NULL;
@ -572,29 +592,29 @@ struct LoraModel : public GGMLRunner {
applied_lora_tensors.insert(split_v_d_name);
}
} else if (is_qkvm_split) {
auto split_q_d_name = fk + "attn.to_q" + lora_downs[type] + ".weight";
auto split_q_d_name = full_key + "attn.to_q" + lora_downs[type] + ".weight";
if (lora_tensors.find(split_q_d_name) != lora_tensors.end()) {
// print_ggml_tensor(it.second, true); //[3072, 21504, 1, 1]
// find qkv and mlp up parts in LoRA model
auto split_k_d_name = fk + "attn.to_k" + lora_downs[type] + ".weight";
auto split_v_d_name = fk + "attn.to_v" + lora_downs[type] + ".weight";
auto split_k_d_name = full_key + "attn.to_k" + lora_downs[type] + ".weight";
auto split_v_d_name = full_key + "attn.to_v" + lora_downs[type] + ".weight";
auto split_q_u_name = fk + "attn.to_q" + lora_ups[type] + ".weight";
auto split_k_u_name = fk + "attn.to_k" + lora_ups[type] + ".weight";
auto split_v_u_name = fk + "attn.to_v" + lora_ups[type] + ".weight";
auto split_q_u_name = full_key + "attn.to_q" + lora_ups[type] + ".weight";
auto split_k_u_name = full_key + "attn.to_k" + lora_ups[type] + ".weight";
auto split_v_u_name = full_key + "attn.to_v" + lora_ups[type] + ".weight";
auto split_m_d_name = fk + "proj_mlp" + lora_downs[type] + ".weight";
auto split_m_u_name = fk + "proj_mlp" + lora_ups[type] + ".weight";
auto split_m_d_name = full_key + "proj_mlp" + lora_downs[type] + ".weight";
auto split_m_u_name = full_key + "proj_mlp" + lora_ups[type] + ".weight";
auto split_q_scale_name = fk + "attn.to_q" + ".scale";
auto split_k_scale_name = fk + "attn.to_k" + ".scale";
auto split_v_scale_name = fk + "attn.to_v" + ".scale";
auto split_m_scale_name = fk + "proj_mlp" + ".scale";
auto split_q_scale_name = full_key + "attn.to_q" + ".scale";
auto split_k_scale_name = full_key + "attn.to_k" + ".scale";
auto split_v_scale_name = full_key + "attn.to_v" + ".scale";
auto split_m_scale_name = full_key + "proj_mlp" + ".scale";
auto split_q_alpha_name = fk + "attn.to_q" + ".alpha";
auto split_k_alpha_name = fk + "attn.to_k" + ".alpha";
auto split_v_alpha_name = fk + "attn.to_v" + ".alpha";
auto split_m_alpha_name = fk + "proj_mlp" + ".alpha";
auto split_q_alpha_name = full_key + "attn.to_q" + ".alpha";
auto split_k_alpha_name = full_key + "attn.to_k" + ".alpha";
auto split_v_alpha_name = full_key + "attn.to_v" + ".alpha";
auto split_m_alpha_name = full_key + "proj_mlp" + ".alpha";
ggml_tensor* lora_q_down = NULL;
ggml_tensor* lora_q_up = NULL;
@ -749,12 +769,12 @@ struct LoraModel : public GGMLRunner {
applied_lora_tensors.insert(split_m_d_name);
}
} else {
lora_up_name = fk + lora_ups[type] + ".weight";
lora_down_name = fk + lora_downs[type] + ".weight";
lora_mid_name = fk + ".lora_mid.weight";
lora_up_name = full_key + lora_ups[type] + ".weight";
lora_down_name = full_key + lora_downs[type] + ".weight";
lora_mid_name = full_key + ".lora_mid.weight";
alpha_name = fk + ".alpha";
scale_name = fk + ".scale";
alpha_name = full_key + ".alpha";
scale_name = full_key + ".scale";
if (lora_tensors.find(lora_up_name) != lora_tensors.end()) {
lora_up = to_f32(compute_ctx, lora_tensors[lora_up_name]);
@ -791,28 +811,25 @@ struct LoraModel : public GGMLRunner {
updown = ggml_merge_lora(compute_ctx, lora_down, lora_up, lora_mid);
}
scale_value *= multiplier;
ggml_tensor* original_weight = weight;
if (!ggml_backend_is_cpu(runtime_backend) && ggml_backend_buffer_is_host(weight->buffer)) {
weight = ggml_dup_tensor(compute_ctx, weight);
set_backend_tensor_data(weight, original_weight->data);
ggml_tensor* original_tensor = model_tensor;
if (!ggml_backend_is_cpu(runtime_backend) && ggml_backend_buffer_is_host(model_tensor->buffer)) {
model_tensor = ggml_dup_tensor(compute_ctx, model_tensor);
set_backend_tensor_data(model_tensor, original_tensor->data);
}
updown = ggml_reshape(compute_ctx, updown, weight);
GGML_ASSERT(ggml_nelements(updown) == ggml_nelements(weight));
updown = ggml_reshape(compute_ctx, updown, model_tensor);
GGML_ASSERT(ggml_nelements(updown) == ggml_nelements(model_tensor));
updown = ggml_scale_inplace(compute_ctx, updown, scale_value);
ggml_tensor* final_weight;
if (weight->type != GGML_TYPE_F32 && weight->type != GGML_TYPE_F16) {
// final_weight = ggml_new_tensor(compute_ctx, GGML_TYPE_F32, ggml_n_dims(weight), weight->ne);
// final_weight = ggml_cpy(compute_ctx, weight, final_weight);
final_weight = to_f32(compute_ctx, weight);
final_weight = ggml_add_inplace(compute_ctx, final_weight, updown);
final_weight = ggml_cpy(compute_ctx, final_weight, weight);
ggml_tensor* final_tensor;
if (model_tensor->type != GGML_TYPE_F32 && model_tensor->type != GGML_TYPE_F16) {
final_tensor = to_f32(compute_ctx, model_tensor);
final_tensor = ggml_add_inplace(compute_ctx, final_tensor, updown);
final_tensor = ggml_cpy(compute_ctx, final_tensor, model_tensor);
} else {
final_weight = ggml_add_inplace(compute_ctx, weight, updown);
final_tensor = ggml_add_inplace(compute_ctx, model_tensor, updown);
}
// final_weight = ggml_add_inplace(compute_ctx, weight, updown); // apply directly
ggml_build_forward_expand(gf, final_weight);
if (!ggml_backend_is_cpu(runtime_backend) && ggml_backend_buffer_is_host(original_weight->buffer)) {
original_weight_to_final_weight[original_weight] = final_weight;
ggml_build_forward_expand(gf, final_tensor);
if (!ggml_backend_is_cpu(runtime_backend) && ggml_backend_buffer_is_host(original_tensor->buffer)) {
original_tensor_to_final_tensor[original_tensor] = final_tensor;
}
break;
}
@ -834,10 +851,10 @@ struct LoraModel : public GGMLRunner {
* this function is called once to calculate the required buffer size
* and then again to actually generate a graph to be used */
if (applied_lora_tensors_count != total_lora_tensors_count) {
LOG_WARN("Only (%lu / %lu) LoRA tensors have been applied",
LOG_WARN("Only (%lu / %lu) LoRA tensors will be applied",
applied_lora_tensors_count, total_lora_tensors_count);
} else {
LOG_DEBUG("(%lu / %lu) LoRA tensors applied successfully",
LOG_DEBUG("(%lu / %lu) LoRA tensors will be applied",
applied_lora_tensors_count, total_lora_tensors_count);
}
@ -849,12 +866,13 @@ struct LoraModel : public GGMLRunner {
return build_lora_graph(model_tensors, version);
};
GGMLRunner::compute(get_graph, n_threads, false);
for (auto item : original_weight_to_final_weight) {
ggml_tensor* original_weight = item.first;
ggml_tensor* final_weight = item.second;
for (auto item : original_tensor_to_final_tensor) {
ggml_tensor* original_tensor = item.first;
ggml_tensor* final_tensor = item.second;
ggml_backend_tensor_copy(final_weight, original_weight);
ggml_backend_tensor_copy(final_tensor, original_tensor);
}
original_tensor_to_final_tensor.clear();
GGMLRunner::free_compute_buffer();
}
};

View File

@ -603,6 +603,8 @@ std::string convert_tensor_name(std::string name) {
} else {
new_name = name;
}
} else if (ends_with(name, ".diff") || ends_with(name, ".diff_b")) {
new_name = "lora." + name;
} else if (contains(name, "lora_up") || contains(name, "lora_down") ||
contains(name, "lora.up") || contains(name, "lora.down") ||
contains(name, "lora_linear")) {

View File

@ -2409,7 +2409,7 @@ SD_API sd_image_t* generate_video(sd_ctx_t* sd_ctx, const sd_vid_gen_params_t* s
}
ggml_free(work_ctx);
LOG_INFO("img2vid completed in %.2fs", (t5 - t0) * 1.0f / 1000);
LOG_INFO("generate_video completed in %.2fs", (t5 - t0) * 1.0f / 1000);
return result_images;
}