diff --git a/lora.hpp b/lora.hpp
index b1a4971..2e1e490 100644
--- a/lora.hpp
+++ b/lora.hpp
@@ -92,7 +92,7 @@ struct LoraModel : public GGMLRunner {
     float multiplier = 1.0f;
     std::map<std::string, struct ggml_tensor*> lora_tensors;
-    std::map<ggml_tensor*, ggml_tensor*> original_weight_to_final_weight;
+    std::map<ggml_tensor*, ggml_tensor*> original_tensor_to_final_tensor;
     std::string file_path;
     ModelLoader model_loader;
     bool load_failed = false;
@@ -248,12 +248,18 @@ struct LoraModel : public GGMLRunner {
         std::set<std::string> applied_lora_tensors;
         for (auto it : model_tensors) {
-            std::string k_tensor = it.first;
-            struct ggml_tensor* weight = model_tensors[it.first];
+            std::string model_tensor_name = it.first;
+            struct ggml_tensor* model_tensor = model_tensors[it.first];

-            std::vector<std::string> keys = to_lora_keys(k_tensor, version);
-            if (keys.size() == 0)
-                continue;
+            std::vector<std::string> keys = to_lora_keys(model_tensor_name, version);
+            bool is_bias = ends_with(model_tensor_name, ".bias");
+            if (keys.size() == 0) {
+                if (is_bias) {
+                    keys.push_back(model_tensor_name.substr(0, model_tensor_name.size() - 5));  // remove .bias
+                } else {
+                    continue;
+                }
+            }

             for (auto& key : keys) {
                 bool is_qkv_split = starts_with(key, "SPLIT|");
@@ -266,8 +272,22 @@ struct LoraModel : public GGMLRunner {
                 }
                 struct ggml_tensor* updown = NULL;
                 float scale_value = 1.0f;
-                std::string fk = lora_pre[type] + key;
-                if (lora_tensors.find(fk + ".hada_w1_a") != lora_tensors.end()) {
+                std::string full_key = lora_pre[type] + key;
+                if (is_bias) {
+                    if (lora_tensors.find(full_key + ".diff_b") != lora_tensors.end()) {
+                        std::string diff_name = full_key + ".diff_b";
+                        ggml_tensor* diff = lora_tensors[diff_name];
+                        updown = to_f32(compute_ctx, diff);
+                        applied_lora_tensors.insert(diff_name);
+                    } else {
+                        continue;
+                    }
+                } else if (lora_tensors.find(full_key + ".diff") != lora_tensors.end()) {
+                    std::string diff_name = full_key + ".diff";
+                    ggml_tensor* diff = lora_tensors[diff_name];
+                    updown = to_f32(compute_ctx, diff);
+                    applied_lora_tensors.insert(diff_name);
+                } else if (lora_tensors.find(full_key + ".hada_w1_a") != lora_tensors.end()) {
                     // LoHa mode
                     // TODO: split qkv convention for LoHas (is it ever used?)
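Note on the hunks above: they add support for full-diff adapters. Instead of low-rank up/down factors, a ".diff" tensor stores the complete weight offset and a ".diff_b" tensor the bias offset, so applying one reduces to a scaled elementwise add. A minimal standalone sketch of that convention follows; Tensor and apply_full_diff are hypothetical stand-ins for the ggml graph machinery, not code from this patch.

#include <map>
#include <string>
#include <vector>

using Tensor = std::vector<float>;  // stand-in for a dequantized ggml_tensor

// Apply one full-diff entry: W' = W + multiplier * diff (diff_b for biases).
void apply_full_diff(Tensor& model_tensor,
                     const std::map<std::string, Tensor>& lora_tensors,
                     const std::string& full_key,
                     bool is_bias,
                     float multiplier) {
    // bias tensors look up "<key>.diff_b", weight tensors "<key>.diff"
    auto it = lora_tensors.find(full_key + (is_bias ? ".diff_b" : ".diff"));
    if (it == lora_tensors.end() || it->second.size() != model_tensor.size())
        return;  // no diff stored for this tensor, or a shape mismatch
    for (size_t i = 0; i < model_tensor.size(); ++i)
        model_tensor[i] += multiplier * it->second[i];
}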
@@ -293,9 +313,9 @@ struct LoraModel : public GGMLRunner {
                     std::string hada_2_down_name = "";
                     std::string hada_2_up_name = "";

-                    hada_1_down_name = fk + ".hada_w1_b";
-                    hada_1_up_name = fk + ".hada_w1_a";
-                    hada_1_mid_name = fk + ".hada_t1";
+                    hada_1_down_name = full_key + ".hada_w1_b";
+                    hada_1_up_name = full_key + ".hada_w1_a";
+                    hada_1_mid_name = full_key + ".hada_t1";
                     if (lora_tensors.find(hada_1_down_name) != lora_tensors.end()) {
                         hada_1_down = to_f32(compute_ctx, lora_tensors[hada_1_down_name]);
                     }
@@ -308,9 +328,9 @@ struct LoraModel : public GGMLRunner {
                         hada_1_up = ggml_cont(compute_ctx, ggml_transpose(compute_ctx, hada_1_up));
                     }

-                    hada_2_down_name = fk + ".hada_w2_b";
-                    hada_2_up_name = fk + ".hada_w2_a";
-                    hada_2_mid_name = fk + ".hada_t2";
+                    hada_2_down_name = full_key + ".hada_w2_b";
+                    hada_2_up_name = full_key + ".hada_w2_a";
+                    hada_2_mid_name = full_key + ".hada_t2";
                     if (lora_tensors.find(hada_2_down_name) != lora_tensors.end()) {
                         hada_2_down = to_f32(compute_ctx, lora_tensors[hada_2_down_name]);
                     }
@@ -323,7 +343,7 @@ struct LoraModel : public GGMLRunner {
                         hada_2_up = ggml_cont(compute_ctx, ggml_transpose(compute_ctx, hada_2_up));
                     }

-                    alpha_name = fk + ".alpha";
+                    alpha_name = full_key + ".alpha";

                     applied_lora_tensors.insert(hada_1_down_name);
                     applied_lora_tensors.insert(hada_1_up_name);
@@ -346,7 +366,7 @@ struct LoraModel : public GGMLRunner {
                         float alpha = ggml_backend_tensor_get_f32(lora_tensors[alpha_name]);
                         scale_value = alpha / rank;
                     }
-                } else if (lora_tensors.find(fk + ".lokr_w1") != lora_tensors.end() || lora_tensors.find(fk + ".lokr_w1_a") != lora_tensors.end()) {
+                } else if (lora_tensors.find(full_key + ".lokr_w1") != lora_tensors.end() || lora_tensors.find(full_key + ".lokr_w1_a") != lora_tensors.end()) {
                     // LoKr mode
                     // TODO: split qkv convention for LoKrs (is it ever used?)
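Note on the LoHa branch these renames pass through: a LoHa adapter rebuilds the weight delta from two low-rank products combined with a Hadamard (elementwise) product, scaled by alpha / rank. A minimal sketch of that recombination, using naive dense loops in place of the ggml graph ops and assuming compatible matrix shapes (background only, not code from the patch):

#include <vector>

using Mat = std::vector<std::vector<float>>;

// naive dense matmul: out[i][j] = sum_k a[i][k] * b[k][j]
static Mat matmul(const Mat& a, const Mat& b) {
    Mat out(a.size(), std::vector<float>(b[0].size(), 0.0f));
    for (size_t i = 0; i < a.size(); ++i)
        for (size_t k = 0; k < b.size(); ++k)
            for (size_t j = 0; j < b[0].size(); ++j)
                out[i][j] += a[i][k] * b[k][j];
    return out;
}

// delta_W = (w1_a @ w1_b) ⊙ (w2_a @ w2_b) * scale, where scale = alpha / rank
Mat loha_delta(const Mat& w1_a, const Mat& w1_b,
               const Mat& w2_a, const Mat& w2_b, float scale) {
    Mat m1 = matmul(w1_a, w1_b);
    Mat m2 = matmul(w2_a, w2_b);
    for (size_t i = 0; i < m1.size(); ++i)
        for (size_t j = 0; j < m1[i].size(); ++j)
            m1[i][j] *= m2[i][j] * scale;  // elementwise product, then scale
    return m1;
}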
@@ -355,7 +375,7 @@ struct LoraModel : public GGMLRunner {
                         break;
                     }

-                    std::string alpha_name = fk + ".alpha";
+                    std::string alpha_name = full_key + ".alpha";

                     ggml_tensor* lokr_w1 = NULL;
                     ggml_tensor* lokr_w2 = NULL;
@@ -363,8 +383,8 @@ struct LoraModel : public GGMLRunner {
                     std::string lokr_w1_name = "";
                     std::string lokr_w2_name = "";

-                    lokr_w1_name = fk + ".lokr_w1";
-                    lokr_w2_name = fk + ".lokr_w2";
+                    lokr_w1_name = full_key + ".lokr_w1";
+                    lokr_w2_name = full_key + ".lokr_w2";

                     if (lora_tensors.find(lokr_w1_name) != lora_tensors.end()) {
                         lokr_w1 = to_f32(compute_ctx, lora_tensors[lokr_w1_name]);
@@ -436,29 +456,29 @@ struct LoraModel : public GGMLRunner {
                     if (is_qkv_split) {
                         std::string suffix = "";
-                        auto split_q_d_name = fk + "q" + suffix + lora_downs[type] + ".weight";
+                        auto split_q_d_name = full_key + "q" + suffix + lora_downs[type] + ".weight";

                         if (lora_tensors.find(split_q_d_name) == lora_tensors.end()) {
                             suffix = "_proj";
-                            split_q_d_name = fk + "q" + suffix + lora_downs[type] + ".weight";
+                            split_q_d_name = full_key + "q" + suffix + lora_downs[type] + ".weight";
                         }
                         if (lora_tensors.find(split_q_d_name) != lora_tensors.end()) {
                             // print_ggml_tensor(it.second, true); //[3072, 21504, 1, 1]
                             // find qkv and mlp up parts in LoRA model
-                            auto split_k_d_name = fk + "k" + suffix + lora_downs[type] + ".weight";
-                            auto split_v_d_name = fk + "v" + suffix + lora_downs[type] + ".weight";
+                            auto split_k_d_name = full_key + "k" + suffix + lora_downs[type] + ".weight";
+                            auto split_v_d_name = full_key + "v" + suffix + lora_downs[type] + ".weight";

-                            auto split_q_u_name = fk + "q" + suffix + lora_ups[type] + ".weight";
-                            auto split_k_u_name = fk + "k" + suffix + lora_ups[type] + ".weight";
-                            auto split_v_u_name = fk + "v" + suffix + lora_ups[type] + ".weight";
+                            auto split_q_u_name = full_key + "q" + suffix + lora_ups[type] + ".weight";
+                            auto split_k_u_name = full_key + "k" + suffix + lora_ups[type] + ".weight";
+                            auto split_v_u_name = full_key + "v" + suffix + lora_ups[type] + ".weight";

-                            auto split_q_scale_name = fk + "q" + suffix + ".scale";
-                            auto split_k_scale_name = fk + "k" + suffix + ".scale";
-                            auto split_v_scale_name = fk + "v" + suffix + ".scale";
+                            auto split_q_scale_name = full_key + "q" + suffix + ".scale";
+                            auto split_k_scale_name = full_key + "k" + suffix + ".scale";
+                            auto split_v_scale_name = full_key + "v" + suffix + ".scale";

-                            auto split_q_alpha_name = fk + "q" + suffix + ".alpha";
-                            auto split_k_alpha_name = fk + "k" + suffix + ".alpha";
-                            auto split_v_alpha_name = fk + "v" + suffix + ".alpha";
+                            auto split_q_alpha_name = full_key + "q" + suffix + ".alpha";
+                            auto split_k_alpha_name = full_key + "k" + suffix + ".alpha";
+                            auto split_v_alpha_name = full_key + "v" + suffix + ".alpha";

                             ggml_tensor* lora_q_down = NULL;
                             ggml_tensor* lora_q_up = NULL;
@@ -572,29 +592,29 @@ struct LoraModel : public GGMLRunner {
                             applied_lora_tensors.insert(split_v_d_name);
                         }
                     } else if (is_qkvm_split) {
-                        auto split_q_d_name = fk + "attn.to_q" + lora_downs[type] + ".weight";
+                        auto split_q_d_name = full_key + "attn.to_q" + lora_downs[type] + ".weight";
                         if (lora_tensors.find(split_q_d_name) != lora_tensors.end()) {
                             // print_ggml_tensor(it.second, true); //[3072, 21504, 1, 1]
                             // find qkv and mlp up parts in LoRA model
-                            auto split_k_d_name = fk + "attn.to_k" + lora_downs[type] + ".weight";
-                            auto split_v_d_name = fk + "attn.to_v" + lora_downs[type] + ".weight";
+                            auto split_k_d_name = full_key + "attn.to_k" + lora_downs[type] + ".weight";
+                            auto split_v_d_name = full_key + "attn.to_v" + lora_downs[type] + ".weight";

-                            auto split_q_u_name = fk + "attn.to_q" + lora_ups[type] + ".weight";
-                            auto split_k_u_name = fk + "attn.to_k" + lora_ups[type] + ".weight";
-                            auto split_v_u_name = fk + "attn.to_v" + lora_ups[type] + ".weight";
+                            auto split_q_u_name = full_key + "attn.to_q" + lora_ups[type] + ".weight";
+                            auto split_k_u_name = full_key + "attn.to_k" + lora_ups[type] + ".weight";
+                            auto split_v_u_name = full_key + "attn.to_v" + lora_ups[type] + ".weight";

-                            auto split_m_d_name = fk + "proj_mlp" + lora_downs[type] + ".weight";
-                            auto split_m_u_name = fk + "proj_mlp" + lora_ups[type] + ".weight";
+                            auto split_m_d_name = full_key + "proj_mlp" + lora_downs[type] + ".weight";
+                            auto split_m_u_name = full_key + "proj_mlp" + lora_ups[type] + ".weight";

-                            auto split_q_scale_name = fk + "attn.to_q" + ".scale";
-                            auto split_k_scale_name = fk + "attn.to_k" + ".scale";
-                            auto split_v_scale_name = fk + "attn.to_v" + ".scale";
-                            auto split_m_scale_name = fk + "proj_mlp" + ".scale";
+                            auto split_q_scale_name = full_key + "attn.to_q" + ".scale";
+                            auto split_k_scale_name = full_key + "attn.to_k" + ".scale";
+                            auto split_v_scale_name = full_key + "attn.to_v" + ".scale";
+                            auto split_m_scale_name = full_key + "proj_mlp" + ".scale";

-                            auto split_q_alpha_name = fk + "attn.to_q" + ".alpha";
-                            auto split_k_alpha_name = fk + "attn.to_k" + ".alpha";
-                            auto split_v_alpha_name = fk + "attn.to_v" + ".alpha";
-                            auto split_m_alpha_name = fk + "proj_mlp" + ".alpha";
+                            auto split_q_alpha_name = full_key + "attn.to_q" + ".alpha";
+                            auto split_k_alpha_name = full_key + "attn.to_k" + ".alpha";
+                            auto split_v_alpha_name = full_key + "attn.to_v" + ".alpha";
+                            auto split_m_alpha_name = full_key + "proj_mlp" + ".alpha";

                             ggml_tensor* lora_q_down = NULL;
                             ggml_tensor* lora_q_up = NULL;
@@ -749,12 +769,12 @@ struct LoraModel : public GGMLRunner {
                             applied_lora_tensors.insert(split_m_d_name);
                         }
                     } else {
-                        lora_up_name = fk + lora_ups[type] + ".weight";
-                        lora_down_name = fk + lora_downs[type] + ".weight";
-                        lora_mid_name = fk + ".lora_mid.weight";
+                        lora_up_name = full_key + lora_ups[type] + ".weight";
+                        lora_down_name = full_key + lora_downs[type] + ".weight";
+                        lora_mid_name = full_key + ".lora_mid.weight";

-                        alpha_name = fk + ".alpha";
-                        scale_name = fk + ".scale";
+                        alpha_name = full_key + ".alpha";
+                        scale_name = full_key + ".scale";

                         if (lora_tensors.find(lora_up_name) != lora_tensors.end()) {
                             lora_up = to_f32(compute_ctx, lora_tensors[lora_up_name]);
@@ -791,28 +811,25 @@ struct LoraModel : public GGMLRunner {
                     updown = ggml_merge_lora(compute_ctx, lora_down, lora_up, lora_mid);
                 }
                 scale_value *= multiplier;

-                ggml_tensor* original_weight = weight;
-                if (!ggml_backend_is_cpu(runtime_backend) && ggml_backend_buffer_is_host(weight->buffer)) {
-                    weight = ggml_dup_tensor(compute_ctx, weight);
-                    set_backend_tensor_data(weight, original_weight->data);
+                ggml_tensor* original_tensor = model_tensor;
+                if (!ggml_backend_is_cpu(runtime_backend) && ggml_backend_buffer_is_host(model_tensor->buffer)) {
+                    model_tensor = ggml_dup_tensor(compute_ctx, model_tensor);
+                    set_backend_tensor_data(model_tensor, original_tensor->data);
                 }

-                updown = ggml_reshape(compute_ctx, updown, weight);
-                GGML_ASSERT(ggml_nelements(updown) == ggml_nelements(weight));
+                updown = ggml_reshape(compute_ctx, updown, model_tensor);
+                GGML_ASSERT(ggml_nelements(updown) == ggml_nelements(model_tensor));
                 updown = ggml_scale_inplace(compute_ctx, updown, scale_value);
-                ggml_tensor* final_weight;
-                if (weight->type != GGML_TYPE_F32 && weight->type != GGML_TYPE_F16) {
-                    // final_weight = ggml_new_tensor(compute_ctx, GGML_TYPE_F32, ggml_n_dims(weight), weight->ne);
-                    // final_weight = ggml_cpy(compute_ctx, weight, final_weight);
-                    final_weight = to_f32(compute_ctx, weight);
-                    final_weight = ggml_add_inplace(compute_ctx, final_weight, updown);
-                    final_weight = ggml_cpy(compute_ctx, final_weight, weight);
+                ggml_tensor* final_tensor;
+                if (model_tensor->type != GGML_TYPE_F32 && model_tensor->type != GGML_TYPE_F16) {
+                    final_tensor = to_f32(compute_ctx, model_tensor);
+                    final_tensor = ggml_add_inplace(compute_ctx, final_tensor, updown);
+                    final_tensor = ggml_cpy(compute_ctx, final_tensor, model_tensor);
                 } else {
-                    final_weight = ggml_add_inplace(compute_ctx, weight, updown);
+                    final_tensor = ggml_add_inplace(compute_ctx, model_tensor, updown);
                 }
-                // final_weight = ggml_add_inplace(compute_ctx, weight, updown); // apply directly
-                ggml_build_forward_expand(gf, final_weight);
+                ggml_build_forward_expand(gf, final_tensor);

-                if (!ggml_backend_is_cpu(runtime_backend) && ggml_backend_buffer_is_host(original_weight->buffer)) {
-                    original_weight_to_final_weight[original_weight] = final_weight;
+                if (!ggml_backend_is_cpu(runtime_backend) && ggml_backend_buffer_is_host(original_tensor->buffer)) {
+                    original_tensor_to_final_tensor[original_tensor] = final_tensor;
                 }
                 break;
             }
@@ -834,10 +851,10 @@ struct LoraModel : public GGMLRunner {
          * this function is called once to calculate the required buffer size
          * and then again to actually generate a graph to be used */
         if (applied_lora_tensors_count != total_lora_tensors_count) {
-            LOG_WARN("Only (%lu / %lu) LoRA tensors have been applied",
+            LOG_WARN("Only (%lu / %lu) LoRA tensors will be applied",
                      applied_lora_tensors_count, total_lora_tensors_count);
         } else {
-            LOG_DEBUG("(%lu / %lu) LoRA tensors applied successfully",
+            LOG_DEBUG("(%lu / %lu) LoRA tensors will be applied",
                      applied_lora_tensors_count, total_lora_tensors_count);
         }
@@ -849,12 +866,13 @@ struct LoraModel : public GGMLRunner {
             return build_lora_graph(model_tensors, version);
         };
         GGMLRunner::compute(get_graph, n_threads, false);

-        for (auto item : original_weight_to_final_weight) {
-            ggml_tensor* original_weight = item.first;
-            ggml_tensor* final_weight = item.second;
+        for (auto item : original_tensor_to_final_tensor) {
+            ggml_tensor* original_tensor = item.first;
+            ggml_tensor* final_tensor = item.second;

-            ggml_backend_tensor_copy(final_weight, original_weight);
+            ggml_backend_tensor_copy(final_tensor, original_tensor);
         }
+        original_tensor_to_final_tensor.clear();
         GGMLRunner::free_compute_buffer();
     }
 };
diff --git a/model.cpp b/model.cpp
index 6b775be..53305f2 100644
--- a/model.cpp
+++ b/model.cpp
@@ -603,6 +603,8 @@ std::string convert_tensor_name(std::string name) {
         } else {
             new_name = name;
         }
+    } else if (ends_with(name, ".diff") || ends_with(name, ".diff_b")) {
+        new_name = "lora." + name;
     } else if (contains(name, "lora_up") || contains(name, "lora_down") ||
                contains(name, "lora.up") || contains(name, "lora.down") ||
                contains(name, "lora_linear")) {
diff --git a/stable-diffusion.cpp b/stable-diffusion.cpp
index 69fc7b1..94c7fab 100644
--- a/stable-diffusion.cpp
+++ b/stable-diffusion.cpp
@@ -2409,7 +2409,7 @@ SD_API sd_image_t* generate_video(sd_ctx_t* sd_ctx, const sd_vid_gen_params_t* s
     }
     ggml_free(work_ctx);

-    LOG_INFO("img2vid completed in %.2fs", (t5 - t0) * 1.0f / 1000);
+    LOG_INFO("generate_video completed in %.2fs", (t5 - t0) * 1.0f / 1000);

     return result_images;
 }
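Closing note on the lora.hpp apply() hunk: besides the renames, the one behavioral change there is the added original_tensor_to_final_tensor.clear(). Without it, a second LoRA application on the same runner would replay stale tensor pairs recorded by the previous run. A minimal sketch of the copy-back-and-reset pattern; FakeTensor is a hypothetical stand-in for ggml_tensor, and the assignment mirrors ggml_backend_tensor_copy:

#include <map>

struct FakeTensor {
    float data[4];  // stand-in for a ggml_tensor's payload
};

static std::map<FakeTensor*, FakeTensor*> original_to_final;

// After the compute graph has run, write each result back into the original
// host tensor, then empty the map so the next apply() starts clean.
void copy_back_and_reset() {
    for (auto& item : original_to_final) {
        *item.first = *item.second;  // ggml_backend_tensor_copy(final, original)
    }
    original_to_final.clear();  // avoid replaying stale pairs on a later call
}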