#include #include #include "name_conversion.h" #include "util.h" void replace_with_name_map(std::string& name, const std::vector>& name_map) { for (auto kv : name_map) { size_t pos = name.find(kv.first); if (pos != std::string::npos) { name.replace(pos, kv.first.size(), kv.second); } } } void replace_with_prefix_map(std::string& name, const std::vector>& prefix_map) { for (const auto& [old_prefix, new_prefix] : prefix_map) { if (starts_with(name, old_prefix)) { name = new_prefix + name.substr(old_prefix.size()); break; } } } void replace_with_prefix_map(std::string& name, const std::unordered_map& prefix_map) { for (const auto& [old_prefix, new_prefix] : prefix_map) { if (starts_with(name, old_prefix)) { name = new_prefix + name.substr(old_prefix.size()); break; } } } std::string convert_open_clip_to_hf_clip_name(std::string name) { static std::unordered_map open_clip_to_hf_clip_model = { {"model.ln_final.bias", "transformer.text_model.final_layer_norm.bias"}, {"model.ln_final.weight", "transformer.text_model.final_layer_norm.weight"}, {"model.positional_embedding", "transformer.text_model.embeddings.position_embedding.weight"}, {"model.token_embedding.weight", "transformer.text_model.embeddings.token_embedding.weight"}, {"model.text_projection", "transformer.text_model.text_projection"}, {"model.visual.class_embedding", "transformer.vision_model.embeddings.class_embedding"}, {"model.visual.conv1.weight", "transformer.vision_model.embeddings.patch_embedding.weight"}, {"model.visual.ln_post.bias", "transformer.vision_model.post_layernorm.bias"}, {"model.visual.ln_post.weight", "transformer.vision_model.post_layernorm.weight"}, {"model.visual.ln_pre.bias", "transformer.vision_model.pre_layernorm.bias"}, {"model.visual.ln_pre.weight", "transformer.vision_model.pre_layernorm.weight"}, {"model.visual.positional_embedding", "transformer.vision_model.embeddings.position_embedding.weight"}, {"model.visual.proj", "transformer.visual_projection.weight"}, }; static std::unordered_map open_clip_to_hf_clip_resblock = { {"attn.in_proj_bias", "self_attn.in_proj.bias"}, {"attn.in_proj_weight", "self_attn.in_proj.weight"}, {"attn.out_proj.bias", "self_attn.out_proj.bias"}, {"attn.out_proj.weight", "self_attn.out_proj.weight"}, {"ln_1.bias", "layer_norm1.bias"}, {"ln_1.weight", "layer_norm1.weight"}, {"ln_2.bias", "layer_norm2.bias"}, {"ln_2.weight", "layer_norm2.weight"}, {"mlp.c_fc.bias", "mlp.fc1.bias"}, {"mlp.c_fc.weight", "mlp.fc1.weight"}, {"mlp.c_proj.bias", "mlp.fc2.bias"}, {"mlp.c_proj.weight", "mlp.fc2.weight"}, }; static std::unordered_map cond_model_name_map = { {"transformer.vision_model.pre_layrnorm.weight", "transformer.vision_model.pre_layernorm.weight"}, {"transformer.vision_model.pre_layrnorm.bias", "transformer.vision_model.pre_layernorm.bias"}, }; if (open_clip_to_hf_clip_model.find(name) != open_clip_to_hf_clip_model.end()) { name = open_clip_to_hf_clip_model[name]; } if (cond_model_name_map.find(name) != cond_model_name_map.end()) { name = cond_model_name_map[name]; } std::string open_clip_resblock_prefix = "model.transformer.resblocks."; std::string hf_clip_resblock_prefix = "transformer.text_model.encoder.layers."; auto replace_suffix = [&]() { if (name.find(open_clip_resblock_prefix) == 0) { std::string remain = name.substr(open_clip_resblock_prefix.length()); std::string idx = remain.substr(0, remain.find(".")); std::string suffix = remain.substr(idx.length() + 1); if (open_clip_to_hf_clip_resblock.find(suffix) != open_clip_to_hf_clip_resblock.end()) { std::string new_suffix = open_clip_to_hf_clip_resblock[suffix]; name = hf_clip_resblock_prefix + idx + "." + new_suffix; } } }; replace_suffix(); open_clip_resblock_prefix = "model.visual.transformer.resblocks."; hf_clip_resblock_prefix = "transformer.vision_model.encoder.layers."; replace_suffix(); return name; } std::string convert_cond_stage_model_name(std::string name, std::string prefix) { static const std::vector> clip_name_map{ {"transformer.text_projection.weight", "transformer.text_model.text_projection"}, {"model.text_projection.weight", "transformer.text_model.text_projection"}, {"vision_model.visual_projection.weight", "visual_projection.weight"}, }; // llama.cpp to original static const std::vector> t5_name_map{ {"enc.", "encoder."}, {"blk.", "block."}, {"output_norm.", "final_layer_norm."}, {"attn_q.", "layer.0.SelfAttention.q."}, {"attn_k.", "layer.0.SelfAttention.k."}, {"attn_v.", "layer.0.SelfAttention.v."}, {"attn_o.", "layer.0.SelfAttention.o."}, {"attn_norm.", "layer.0.layer_norm."}, {"ffn_norm.", "layer.1.layer_norm."}, {"ffn_up.", "layer.1.DenseReluDense.wi_1."}, {"ffn_down.", "layer.1.DenseReluDense.wo."}, {"ffn_gate.", "layer.1.DenseReluDense.wi_0."}, {"attn_rel_b.", "layer.0.SelfAttention.relative_attention_bias."}, {"token_embd.", "shared."}, }; static const std::vector> qwenvl_name_map{ {"token_embd.", "model.embed_tokens."}, {"blk.", "model.layers."}, {"attn_q.", "self_attn.q_proj."}, {"attn_k.", "self_attn.k_proj."}, {"attn_v.", "self_attn.v_proj."}, {"attn_output.", "self_attn.o_proj."}, {"attn_norm.", "input_layernorm."}, {"ffn_down.", "mlp.down_proj."}, {"ffn_gate.", "mlp.gate_proj."}, {"ffn_up.", "mlp.up_proj."}, {"ffn_norm.", "post_attention_layernorm."}, {"output_norm.", "model.norm."}, }; static const std::vector> qwenvl_vision_name_map{ {"mm.", "merger.mlp."}, {"v.post_ln.", "merger.ln_q."}, {"v.patch_embd.weight", "patch_embed.proj.0.weight"}, {"patch_embed.proj.0.weight.1", "patch_embed.proj.1.weight"}, {"v.patch_embd.weight.1", "patch_embed.proj.1.weight"}, {"v.blk.", "blocks."}, {"attn_q.", "attn.q_proj."}, {"attn_k.", "attn.k_proj."}, {"attn_v.", "attn.v_proj."}, {"attn_out.", "attn.proj."}, {"ffn_down.", "mlp.down_proj."}, {"ffn_gate.", "mlp.gate_proj."}, {"ffn_up.", "mlp.up_proj."}, {"ln1.", "norm1."}, {"ln2.", "norm2."}, }; if (contains(name, "t5xxl")) { replace_with_name_map(name, t5_name_map); } else if (contains(name, "qwen2vl")) { if (contains(name, "qwen2vl.visual")) { replace_with_name_map(name, qwenvl_vision_name_map); } else { replace_with_name_map(name, qwenvl_name_map); } } else { name = convert_open_clip_to_hf_clip_name(name); replace_with_name_map(name, clip_name_map); } return name; } // ref: https://github.com/huggingface/diffusers/blob/main/scripts/convert_diffusers_to_original_stable_diffusion.py std::string convert_diffusers_unet_to_original_sd1(std::string name) { // (stable-diffusion, HF Diffusers) static const std::vector> unet_conversion_map = { {"time_embed.0.weight", "time_embedding.linear_1.weight"}, {"time_embed.0.bias", "time_embedding.linear_1.bias"}, {"time_embed.2.weight", "time_embedding.linear_2.weight"}, {"time_embed.2.bias", "time_embedding.linear_2.bias"}, {"input_blocks.0.0.weight", "conv_in.weight"}, {"input_blocks.0.0.bias", "conv_in.bias"}, {"out.0.weight", "conv_norm_out.weight"}, {"out.0.bias", "conv_norm_out.bias"}, {"out.2.weight", "conv_out.weight"}, {"out.2.bias", "conv_out.bias"}, }; static const std::vector> unet_conversion_map_resnet = { {"in_layers.0", "norm1"}, {"in_layers.2", "conv1"}, {"out_layers.0", "norm2"}, {"out_layers.3", "conv2"}, {"emb_layers.1", "time_emb_proj"}, {"skip_connection", "conv_shortcut"}, }; static std::vector> unet_conversion_map_layer; if (unet_conversion_map_layer.empty()) { for (int i = 0; i < 4; ++i) { // down_blocks for (int j = 0; j < 2; ++j) { std::string hf_down_res_prefix = "down_blocks." + std::to_string(i) + ".resnets." + std::to_string(j) + "."; std::string sd_down_res_prefix = "input_blocks." + std::to_string(3 * i + j + 1) + ".0."; unet_conversion_map_layer.emplace_back(sd_down_res_prefix, hf_down_res_prefix); if (i < 3) { std::string hf_down_atn_prefix = "down_blocks." + std::to_string(i) + ".attentions." + std::to_string(j) + "."; std::string sd_down_atn_prefix = "input_blocks." + std::to_string(3 * i + j + 1) + ".1."; unet_conversion_map_layer.emplace_back(sd_down_atn_prefix, hf_down_atn_prefix); } } // up_blocks for (int j = 0; j < 3; ++j) { std::string hf_up_res_prefix = "up_blocks." + std::to_string(i) + ".resnets." + std::to_string(j) + "."; std::string sd_up_res_prefix = "output_blocks." + std::to_string(3 * i + j) + ".0."; unet_conversion_map_layer.emplace_back(sd_up_res_prefix, hf_up_res_prefix); if (/*i > 0*/ true) { // for tiny unet std::string hf_up_atn_prefix = "up_blocks." + std::to_string(i) + ".attentions." + std::to_string(j) + "."; std::string sd_up_atn_prefix = "output_blocks." + std::to_string(3 * i + j) + ".1."; unet_conversion_map_layer.emplace_back(sd_up_atn_prefix, hf_up_atn_prefix); } } if (i < 3) { std::string hf_downsample_prefix = "down_blocks." + std::to_string(i) + ".downsamplers.0.conv."; std::string sd_downsample_prefix = "input_blocks." + std::to_string(3 * (i + 1)) + ".0.op."; unet_conversion_map_layer.emplace_back(sd_downsample_prefix, hf_downsample_prefix); std::string hf_upsample_prefix = "up_blocks." + std::to_string(i) + ".upsamplers.0."; std::string sd_upsample_prefix = "output_blocks." + std::to_string(3 * i + 2) + "." + std::to_string(i == 0 ? 1 : 2) + "."; unet_conversion_map_layer.emplace_back(sd_upsample_prefix, hf_upsample_prefix); } } // mid block unet_conversion_map_layer.emplace_back("middle_block.1.", "mid_block.attentions.0."); for (int j = 0; j < 2; ++j) { std::string hf_mid_res_prefix = "mid_block.resnets." + std::to_string(j) + "."; std::string sd_mid_res_prefix = "middle_block." + std::to_string(2 * j) + "."; unet_conversion_map_layer.emplace_back(sd_mid_res_prefix, hf_mid_res_prefix); } } std::string result = name; for (const auto& p : unet_conversion_map) { if (result == p.second) { result = p.first; return result; } } if (contains(result, "resnets")) { for (const auto& p : unet_conversion_map_resnet) { size_t pos = result.find(p.second); if (pos != std::string::npos) { result.replace(pos, p.second.size(), p.first); } } } for (const auto& p : unet_conversion_map_layer) { size_t pos = result.find(p.second); if (pos != std::string::npos) { result.replace(pos, p.second.size(), p.first); } } return result; } // ref: https://github.com/huggingface/diffusers/blob/main/scripts/convert_diffusers_to_original_sdxl.py std::string convert_diffusers_unet_to_original_sdxl(std::string name) { // (stable-diffusion, HF Diffusers) static const std::vector> unet_conversion_map = { {"time_embed.0.weight", "time_embedding.linear_1.weight"}, {"time_embed.0.bias", "time_embedding.linear_1.bias"}, {"time_embed.2.weight", "time_embedding.linear_2.weight"}, {"time_embed.2.bias", "time_embedding.linear_2.bias"}, {"input_blocks.0.0.weight", "conv_in.weight"}, {"input_blocks.0.0.bias", "conv_in.bias"}, {"out.0.weight", "conv_norm_out.weight"}, {"out.0.bias", "conv_norm_out.bias"}, {"out.2.weight", "conv_out.weight"}, {"out.2.bias", "conv_out.bias"}, // --- SDXL add_embedding mappings --- {"label_emb.0.0.weight", "add_embedding.linear_1.weight"}, {"label_emb.0.0.bias", "add_embedding.linear_1.bias"}, {"label_emb.0.2.weight", "add_embedding.linear_2.weight"}, {"label_emb.0.2.bias", "add_embedding.linear_2.bias"}, }; static const std::vector> unet_conversion_map_resnet = { {"in_layers.0", "norm1"}, {"in_layers.2", "conv1"}, {"out_layers.0", "norm2"}, {"out_layers.3", "conv2"}, {"emb_layers.1", "time_emb_proj"}, {"skip_connection", "conv_shortcut"}, }; static std::vector> unet_conversion_map_layer; if (unet_conversion_map_layer.empty()) { for (int i = 0; i < 3; ++i) { // --- down_blocks --- for (int j = 0; j < 2; ++j) { std::string hf_down_res_prefix = "down_blocks." + std::to_string(i) + ".resnets." + std::to_string(j) + "."; std::string sd_down_res_prefix = "input_blocks." + std::to_string(3 * i + j + 1) + ".0."; unet_conversion_map_layer.emplace_back(sd_down_res_prefix, hf_down_res_prefix); if (i > 0) { std::string hf_down_atn_prefix = "down_blocks." + std::to_string(i) + ".attentions." + std::to_string(j) + "."; std::string sd_down_atn_prefix = "input_blocks." + std::to_string(3 * i + j + 1) + ".1."; unet_conversion_map_layer.emplace_back(sd_down_atn_prefix, hf_down_atn_prefix); } } // --- up_blocks --- for (int j = 0; j < 4; ++j) { std::string hf_up_res_prefix = "up_blocks." + std::to_string(i) + ".resnets." + std::to_string(j) + "."; std::string sd_up_res_prefix = "output_blocks." + std::to_string(3 * i + j) + ".0."; unet_conversion_map_layer.emplace_back(sd_up_res_prefix, hf_up_res_prefix); if (i < 2) { std::string hf_up_atn_prefix = "up_blocks." + std::to_string(i) + ".attentions." + std::to_string(j) + "."; std::string sd_up_atn_prefix = "output_blocks." + std::to_string(3 * i + j) + ".1."; unet_conversion_map_layer.emplace_back(sd_up_atn_prefix, hf_up_atn_prefix); } } if (i < 3) { std::string hf_downsample_prefix = "down_blocks." + std::to_string(i) + ".downsamplers.0.conv."; std::string sd_downsample_prefix = "input_blocks." + std::to_string(3 * (i + 1)) + ".0.op."; unet_conversion_map_layer.emplace_back(sd_downsample_prefix, hf_downsample_prefix); std::string hf_upsample_prefix = "up_blocks." + std::to_string(i) + ".upsamplers.0."; std::string sd_upsample_prefix = "output_blocks." + std::to_string(3 * i + 2) + "." + std::to_string(i == 0 ? 1 : 2) + "."; unet_conversion_map_layer.emplace_back(sd_upsample_prefix, hf_upsample_prefix); } } unet_conversion_map_layer.emplace_back("output_blocks.2.2.conv.", "output_blocks.2.1.conv."); // mid block unet_conversion_map_layer.emplace_back("middle_block.1.", "mid_block.attentions.0."); for (int j = 0; j < 2; ++j) { std::string hf_mid_res_prefix = "mid_block.resnets." + std::to_string(j) + "."; std::string sd_mid_res_prefix = "middle_block." + std::to_string(2 * j) + "."; unet_conversion_map_layer.emplace_back(sd_mid_res_prefix, hf_mid_res_prefix); } } std::string result = name; for (const auto& p : unet_conversion_map) { if (result == p.second) { result = p.first; return result; } } if (contains(result, "resnets")) { for (const auto& p : unet_conversion_map_resnet) { size_t pos = result.find(p.second); if (pos != std::string::npos) { result.replace(pos, p.second.size(), p.first); } } } for (const auto& p : unet_conversion_map_layer) { size_t pos = result.find(p.second); if (pos != std::string::npos) { result.replace(pos, p.second.size(), p.first); } } static const std::vector> name_map{ {"to_out.weight", "to_out.0.weight"}, {"to_out.bias", "to_out.0.bias"}, }; replace_with_name_map(result, name_map); return result; } std::string convert_diffusers_dit_to_original_sd3(std::string name) { int num_layers = 38; static std::unordered_map sd3_name_map; if (sd3_name_map.empty()) { // --- time_text_embed --- sd3_name_map["time_text_embed.timestep_embedder.linear_1.weight"] = "t_embedder.mlp.0.weight"; sd3_name_map["time_text_embed.timestep_embedder.linear_1.bias"] = "t_embedder.mlp.0.bias"; sd3_name_map["time_text_embed.timestep_embedder.linear_2.weight"] = "t_embedder.mlp.2.weight"; sd3_name_map["time_text_embed.timestep_embedder.linear_2.bias"] = "t_embedder.mlp.2.bias"; sd3_name_map["time_text_embed.text_embedder.linear_1.weight"] = "y_embedder.mlp.0.weight"; sd3_name_map["time_text_embed.text_embedder.linear_1.bias"] = "y_embedder.mlp.0.bias"; sd3_name_map["time_text_embed.text_embedder.linear_2.weight"] = "y_embedder.mlp.2.weight"; sd3_name_map["time_text_embed.text_embedder.linear_2.bias"] = "y_embedder.mlp.2.bias"; sd3_name_map["pos_embed.pos_embed"] = "pos_embed"; sd3_name_map["pos_embed.proj.weight"] = "x_embedder.proj.weight"; sd3_name_map["pos_embed.proj.bias"] = "x_embedder.proj.bias"; // --- transformer blocks --- for (int i = 0; i < num_layers; ++i) { std::string block_prefix = "transformer_blocks." + std::to_string(i) + "."; std::string dst_prefix = "joint_blocks." + std::to_string(i) + "."; sd3_name_map[block_prefix + "norm1.linear.weight"] = dst_prefix + "x_block.adaLN_modulation.1.weight"; sd3_name_map[block_prefix + "norm1.linear.bias"] = dst_prefix + "x_block.adaLN_modulation.1.bias"; sd3_name_map[block_prefix + "norm1_context.linear.weight"] = dst_prefix + "context_block.adaLN_modulation.1.weight"; sd3_name_map[block_prefix + "norm1_context.linear.bias"] = dst_prefix + "context_block.adaLN_modulation.1.bias"; // attn sd3_name_map[block_prefix + "attn.to_q.weight"] = dst_prefix + "x_block.attn.qkv.weight"; sd3_name_map[block_prefix + "attn.to_q.bias"] = dst_prefix + "x_block.attn.qkv.bias"; sd3_name_map[block_prefix + "attn.to_k.weight"] = dst_prefix + "x_block.attn.qkv.weight.1"; sd3_name_map[block_prefix + "attn.to_k.bias"] = dst_prefix + "x_block.attn.qkv.bias.1"; sd3_name_map[block_prefix + "attn.to_v.weight"] = dst_prefix + "x_block.attn.qkv.weight.2"; sd3_name_map[block_prefix + "attn.to_v.bias"] = dst_prefix + "x_block.attn.qkv.bias.2"; sd3_name_map[block_prefix + "attn.add_q_proj.weight"] = dst_prefix + "context_block.attn.qkv.weight"; sd3_name_map[block_prefix + "attn.add_q_proj.bias"] = dst_prefix + "context_block.attn.qkv.bias"; sd3_name_map[block_prefix + "attn.add_k_proj.weight"] = dst_prefix + "context_block.attn.qkv.weight.1"; sd3_name_map[block_prefix + "attn.add_k_proj.bias"] = dst_prefix + "context_block.attn.qkv.bias.1"; sd3_name_map[block_prefix + "attn.add_v_proj.weight"] = dst_prefix + "context_block.attn.qkv.weight.2"; sd3_name_map[block_prefix + "attn.add_v_proj.bias"] = dst_prefix + "context_block.attn.qkv.bias.2"; // attn2 sd3_name_map[block_prefix + "attn2.to_q.weight"] = dst_prefix + "x_block.attn2.qkv.weight"; sd3_name_map[block_prefix + "attn2.to_q.bias"] = dst_prefix + "x_block.attn2.qkv.bias"; sd3_name_map[block_prefix + "attn2.to_k.weight"] = dst_prefix + "x_block.attn2.qkv.weight.1"; sd3_name_map[block_prefix + "attn2.to_k.bias"] = dst_prefix + "x_block.attn2.qkv.bias.1"; sd3_name_map[block_prefix + "attn2.to_v.weight"] = dst_prefix + "x_block.attn2.qkv.weight.2"; sd3_name_map[block_prefix + "attn2.to_v.bias"] = dst_prefix + "x_block.attn2.qkv.bias.2"; sd3_name_map[block_prefix + "attn2.add_q_proj.weight"] = dst_prefix + "context_block.attn2.qkv.weight"; sd3_name_map[block_prefix + "attn2.add_q_proj.bias"] = dst_prefix + "context_block.attn2.qkv.bias"; sd3_name_map[block_prefix + "attn2.add_k_proj.weight"] = dst_prefix + "context_block.attn2.qkv.weight.1"; sd3_name_map[block_prefix + "attn2.add_k_proj.bias"] = dst_prefix + "context_block.attn2.qkv.bias.1"; sd3_name_map[block_prefix + "attn2.add_v_proj.weight"] = dst_prefix + "context_block.attn2.qkv.weight.2"; sd3_name_map[block_prefix + "attn2.add_v_proj.bias"] = dst_prefix + "context_block.attn2.qkv.bias.2"; // norm sd3_name_map[block_prefix + "attn.norm_q.weight"] = dst_prefix + "x_block.attn.ln_q.weight"; sd3_name_map[block_prefix + "attn.norm_k.weight"] = dst_prefix + "x_block.attn.ln_k.weight"; sd3_name_map[block_prefix + "attn.norm_added_q.weight"] = dst_prefix + "context_block.attn.ln_q.weight"; sd3_name_map[block_prefix + "attn.norm_added_k.weight"] = dst_prefix + "context_block.attn.ln_k.weight"; // norm2 sd3_name_map[block_prefix + "attn2.norm_q.weight"] = dst_prefix + "x_block.attn2.ln_q.weight"; sd3_name_map[block_prefix + "attn2.norm_k.weight"] = dst_prefix + "x_block.attn2.ln_k.weight"; // ff sd3_name_map[block_prefix + "ff.net.0.proj.weight"] = dst_prefix + "x_block.mlp.fc1.weight"; sd3_name_map[block_prefix + "ff.net.0.proj.bias"] = dst_prefix + "x_block.mlp.fc1.bias"; sd3_name_map[block_prefix + "ff.net.2.weight"] = dst_prefix + "x_block.mlp.fc2.weight"; sd3_name_map[block_prefix + "ff.net.2.bias"] = dst_prefix + "x_block.mlp.fc2.bias"; sd3_name_map[block_prefix + "ff_context.net.0.proj.weight"] = dst_prefix + "context_block.mlp.fc1.weight"; sd3_name_map[block_prefix + "ff_context.net.0.proj.bias"] = dst_prefix + "context_block.mlp.fc1.bias"; sd3_name_map[block_prefix + "ff_context.net.2.weight"] = dst_prefix + "context_block.mlp.fc2.weight"; sd3_name_map[block_prefix + "ff_context.net.2.bias"] = dst_prefix + "context_block.mlp.fc2.bias"; // output projections sd3_name_map[block_prefix + "attn.to_out.0.weight"] = dst_prefix + "x_block.attn.proj.weight"; sd3_name_map[block_prefix + "attn.to_out.0.bias"] = dst_prefix + "x_block.attn.proj.bias"; sd3_name_map[block_prefix + "attn.to_add_out.weight"] = dst_prefix + "context_block.attn.proj.weight"; sd3_name_map[block_prefix + "attn.to_add_out.bias"] = dst_prefix + "context_block.attn.proj.bias"; // output projections 2 sd3_name_map[block_prefix + "attn2.to_out.0.weight"] = dst_prefix + "x_block.attn2.proj.weight"; sd3_name_map[block_prefix + "attn2.to_out.0.bias"] = dst_prefix + "x_block.attn2.proj.bias"; sd3_name_map[block_prefix + "attn2.to_add_out.weight"] = dst_prefix + "context_block.attn2.proj.weight"; sd3_name_map[block_prefix + "attn2.to_add_out.bias"] = dst_prefix + "context_block.attn2.proj.bias"; } // --- final layers --- sd3_name_map["proj_out.weight"] = "final_layer.linear.weight"; sd3_name_map["proj_out.bias"] = "final_layer.linear.bias"; sd3_name_map["norm_out.linear.weight"] = "final_layer.adaLN_modulation.1.weight"; sd3_name_map["norm_out.linear.bias"] = "final_layer.adaLN_modulation.1.bias"; } replace_with_prefix_map(name, sd3_name_map); return name; } std::string convert_diffusers_dit_to_original_flux(std::string name) { int num_layers = 19; int num_single_layers = 38; static std::unordered_map flux_name_map; if (flux_name_map.empty()) { // --- time_text_embed --- flux_name_map["time_text_embed.timestep_embedder.linear_1.weight"] = "time_in.in_layer.weight"; flux_name_map["time_text_embed.timestep_embedder.linear_1.bias"] = "time_in.in_layer.bias"; flux_name_map["time_text_embed.timestep_embedder.linear_2.weight"] = "time_in.out_layer.weight"; flux_name_map["time_text_embed.timestep_embedder.linear_2.bias"] = "time_in.out_layer.bias"; flux_name_map["time_text_embed.text_embedder.linear_1.weight"] = "vector_in.in_layer.weight"; flux_name_map["time_text_embed.text_embedder.linear_1.bias"] = "vector_in.in_layer.bias"; flux_name_map["time_text_embed.text_embedder.linear_2.weight"] = "vector_in.out_layer.weight"; flux_name_map["time_text_embed.text_embedder.linear_2.bias"] = "vector_in.out_layer.bias"; // guidance flux_name_map["time_text_embed.guidance_embedder.linear_1.weight"] = "guidance_in.in_layer.weight"; flux_name_map["time_text_embed.guidance_embedder.linear_1.bias"] = "guidance_in.in_layer.bias"; flux_name_map["time_text_embed.guidance_embedder.linear_2.weight"] = "guidance_in.out_layer.weight"; flux_name_map["time_text_embed.guidance_embedder.linear_2.bias"] = "guidance_in.out_layer.bias"; // --- context_embedder / x_embedder --- flux_name_map["context_embedder.weight"] = "txt_in.weight"; flux_name_map["context_embedder.bias"] = "txt_in.bias"; flux_name_map["x_embedder.weight"] = "img_in.weight"; flux_name_map["x_embedder.bias"] = "img_in.bias"; // --- double transformer blocks --- for (int i = 0; i < num_layers; ++i) { std::string block_prefix = "transformer_blocks." + std::to_string(i) + "."; std::string dst_prefix = "double_blocks." + std::to_string(i) + "."; flux_name_map[block_prefix + "norm1.linear.weight"] = dst_prefix + "img_mod.lin.weight"; flux_name_map[block_prefix + "norm1.linear.bias"] = dst_prefix + "img_mod.lin.bias"; flux_name_map[block_prefix + "norm1_context.linear.weight"] = dst_prefix + "txt_mod.lin.weight"; flux_name_map[block_prefix + "norm1_context.linear.bias"] = dst_prefix + "txt_mod.lin.bias"; // attn flux_name_map[block_prefix + "attn.to_q.weight"] = dst_prefix + "img_attn.qkv.weight"; flux_name_map[block_prefix + "attn.to_q.bias"] = dst_prefix + "img_attn.qkv.bias"; flux_name_map[block_prefix + "attn.to_k.weight"] = dst_prefix + "img_attn.qkv.weight.1"; flux_name_map[block_prefix + "attn.to_k.bias"] = dst_prefix + "img_attn.qkv.bias.1"; flux_name_map[block_prefix + "attn.to_v.weight"] = dst_prefix + "img_attn.qkv.weight.2"; flux_name_map[block_prefix + "attn.to_v.bias"] = dst_prefix + "img_attn.qkv.bias.2"; flux_name_map[block_prefix + "attn.add_q_proj.weight"] = dst_prefix + "txt_attn.qkv.weight"; flux_name_map[block_prefix + "attn.add_q_proj.bias"] = dst_prefix + "txt_attn.qkv.bias"; flux_name_map[block_prefix + "attn.add_k_proj.weight"] = dst_prefix + "txt_attn.qkv.weight.1"; flux_name_map[block_prefix + "attn.add_k_proj.bias"] = dst_prefix + "txt_attn.qkv.bias.1"; flux_name_map[block_prefix + "attn.add_v_proj.weight"] = dst_prefix + "txt_attn.qkv.weight.2"; flux_name_map[block_prefix + "attn.add_v_proj.bias"] = dst_prefix + "txt_attn.qkv.bias.2"; // norm flux_name_map[block_prefix + "attn.norm_q.weight"] = dst_prefix + "img_attn.norm.query_norm.scale"; flux_name_map[block_prefix + "attn.norm_k.weight"] = dst_prefix + "img_attn.norm.key_norm.scale"; flux_name_map[block_prefix + "attn.norm_added_q.weight"] = dst_prefix + "txt_attn.norm.query_norm.scale"; flux_name_map[block_prefix + "attn.norm_added_k.weight"] = dst_prefix + "txt_attn.norm.key_norm.scale"; // ff flux_name_map[block_prefix + "ff.net.0.proj.weight"] = dst_prefix + "img_mlp.0.weight"; flux_name_map[block_prefix + "ff.net.0.proj.bias"] = dst_prefix + "img_mlp.0.bias"; flux_name_map[block_prefix + "ff.net.2.weight"] = dst_prefix + "img_mlp.2.weight"; flux_name_map[block_prefix + "ff.net.2.bias"] = dst_prefix + "img_mlp.2.bias"; flux_name_map[block_prefix + "ff_context.net.0.proj.weight"] = dst_prefix + "txt_mlp.0.weight"; flux_name_map[block_prefix + "ff_context.net.0.proj.bias"] = dst_prefix + "txt_mlp.0.bias"; flux_name_map[block_prefix + "ff_context.net.2.weight"] = dst_prefix + "txt_mlp.2.weight"; flux_name_map[block_prefix + "ff_context.net.2.bias"] = dst_prefix + "txt_mlp.2.bias"; // output projections flux_name_map[block_prefix + "attn.to_out.0.weight"] = dst_prefix + "img_attn.proj.weight"; flux_name_map[block_prefix + "attn.to_out.0.bias"] = dst_prefix + "img_attn.proj.bias"; flux_name_map[block_prefix + "attn.to_add_out.weight"] = dst_prefix + "txt_attn.proj.weight"; flux_name_map[block_prefix + "attn.to_add_out.bias"] = dst_prefix + "txt_attn.proj.bias"; } // --- single transformer blocks --- for (int i = 0; i < num_single_layers; ++i) { std::string block_prefix = "single_transformer_blocks." + std::to_string(i) + "."; std::string dst_prefix = "single_blocks." + std::to_string(i) + "."; flux_name_map[block_prefix + "norm.linear.weight"] = dst_prefix + "modulation.lin.weight"; flux_name_map[block_prefix + "norm.linear.bias"] = dst_prefix + "modulation.lin.bias"; flux_name_map[block_prefix + "attn.to_q.weight"] = dst_prefix + "linear1.weight"; flux_name_map[block_prefix + "attn.to_q.bias"] = dst_prefix + "linear1.bias"; flux_name_map[block_prefix + "attn.to_k.weight"] = dst_prefix + "linear1.weight.1"; flux_name_map[block_prefix + "attn.to_k.bias"] = dst_prefix + "linear1.bias.1"; flux_name_map[block_prefix + "attn.to_v.weight"] = dst_prefix + "linear1.weight.2"; flux_name_map[block_prefix + "attn.to_v.bias"] = dst_prefix + "linear1.bias.2"; flux_name_map[block_prefix + "proj_mlp.weight"] = dst_prefix + "linear1.weight.3"; flux_name_map[block_prefix + "proj_mlp.bias"] = dst_prefix + "linear1.bias.3"; flux_name_map[block_prefix + "attn.norm_q.weight"] = dst_prefix + "norm.query_norm.scale"; flux_name_map[block_prefix + "attn.norm_k.weight"] = dst_prefix + "norm.key_norm.scale"; flux_name_map[block_prefix + "proj_out.weight"] = dst_prefix + "linear2.weight"; flux_name_map[block_prefix + "proj_out.bias"] = dst_prefix + "linear2.bias"; } // --- final layers --- flux_name_map["proj_out.weight"] = "final_layer.linear.weight"; flux_name_map["proj_out.bias"] = "final_layer.linear.bias"; flux_name_map["norm_out.linear.weight"] = "final_layer.adaLN_modulation.1.weight"; flux_name_map["norm_out.linear.bias"] = "final_layer.adaLN_modulation.1.bias"; } replace_with_prefix_map(name, flux_name_map); return name; } std::string convert_diffusion_model_name(std::string name, std::string prefix, SDVersion version) { if (sd_version_is_sd1(version) || sd_version_is_sd2(version)) { name = convert_diffusers_unet_to_original_sd1(name); } else if (sd_version_is_sdxl(version)) { name = convert_diffusers_unet_to_original_sdxl(name); } else if (sd_version_is_sd3(version)) { name = convert_diffusers_dit_to_original_sd3(name); } else if (sd_version_is_flux(version)) { name = convert_diffusers_dit_to_original_flux(name); } return name; } std::string convert_diffusers_vae_to_original_sd1(std::string name) { static const std::vector> vae_conversion_map_base = { {"nin_shortcut", "conv_shortcut"}, {"norm_out", "conv_norm_out"}, {"mid.attn_1.", "mid_block.attentions.0."}, }; static std::vector> vae_conversion_map_layer; if (vae_conversion_map_layer.empty()) { for (int i = 0; i < 4; ++i) { // --- encoder down blocks --- for (int j = 0; j < 2; ++j) { std::string hf_down_prefix = "encoder.down_blocks." + std::to_string(i) + ".resnets." + std::to_string(j) + "."; std::string sd_down_prefix = "encoder.down." + std::to_string(i) + ".block." + std::to_string(j) + "."; vae_conversion_map_layer.emplace_back(sd_down_prefix, hf_down_prefix); } if (i < 3) { std::string hf_downsample_prefix = "down_blocks." + std::to_string(i) + ".downsamplers.0."; std::string sd_downsample_prefix = "down." + std::to_string(i) + ".downsample."; vae_conversion_map_layer.emplace_back(sd_downsample_prefix, hf_downsample_prefix); std::string hf_upsample_prefix = "up_blocks." + std::to_string(i) + ".upsamplers.0."; std::string sd_upsample_prefix = "up." + std::to_string(3 - i) + ".upsample."; vae_conversion_map_layer.emplace_back(sd_upsample_prefix, hf_upsample_prefix); } // --- decoder up blocks (reverse) --- for (int j = 0; j < 3; ++j) { std::string hf_up_prefix = "decoder.up_blocks." + std::to_string(i) + ".resnets." + std::to_string(j) + "."; std::string sd_up_prefix = "decoder.up." + std::to_string(3 - i) + ".block." + std::to_string(j) + "."; vae_conversion_map_layer.emplace_back(sd_up_prefix, hf_up_prefix); } } // --- mid block (encoder + decoder) --- for (int i = 0; i < 2; ++i) { std::string hf_mid_res_prefix = "mid_block.resnets." + std::to_string(i) + "."; std::string sd_mid_res_prefix = "mid.block_" + std::to_string(i + 1) + "."; vae_conversion_map_layer.emplace_back(sd_mid_res_prefix, hf_mid_res_prefix); } } static const std::vector> vae_conversion_map_attn = { {"norm.", "group_norm."}, {"q.", "query."}, {"k.", "key."}, {"v.", "value."}, {"proj_out.", "proj_attn."}, }; static const std::vector> vae_extra_conversion_map = { {"to_q", "q"}, {"to_k", "k"}, {"to_v", "v"}, {"to_out.0", "proj_out"}, }; std::string result = name; for (const auto& p : vae_conversion_map_base) { size_t pos = result.find(p.second); if (pos != std::string::npos) { result.replace(pos, p.second.size(), p.first); } } for (const auto& p : vae_conversion_map_layer) { size_t pos = result.find(p.second); if (pos != std::string::npos) { result.replace(pos, p.second.size(), p.first); } } if (name.find("attentions") != std::string::npos) { for (const auto& p : vae_conversion_map_attn) { size_t pos = result.find(p.second); if (pos != std::string::npos) { result.replace(pos, p.second.size(), p.first); } } } if (result.find("mid.attn_1.") != std::string::npos) { for (const auto& p : vae_extra_conversion_map) { size_t pos = result.find(p.first); if (pos != std::string::npos) { result.replace(pos, p.first.size(), p.second); } } } return result; } std::string convert_first_stage_model_name(std::string name, std::string prefix) { name = convert_diffusers_vae_to_original_sd1(name); return name; } std::string convert_pmid_name(const std::string& name) { static std::unordered_map pmid_name_map = { {"pmid.vision_model.visual_projection.weight", "pmid.visual_projection.weight"}, }; if (pmid_name_map.find(name) != pmid_name_map.end()) { return pmid_name_map[name]; } return name; } std::string convert_pmid_v2_name(const std::string& name) { static std::unordered_map pmid_v2_name_map = { {"pmid.qformer_perceiver.perceiver_resampler.layers.0.1.1.weight", "pmid.qformer_perceiver.perceiver_resampler.layers.0.1.1.fc1.weight"}, {"pmid.qformer_perceiver.perceiver_resampler.layers.0.1.3.weight", "pmid.qformer_perceiver.perceiver_resampler.layers.0.1.1.fc2.weight"}, {"pmid.qformer_perceiver.perceiver_resampler.layers.1.1.1.weight", "pmid.qformer_perceiver.perceiver_resampler.layers.1.1.1.fc1.weight"}, {"pmid.qformer_perceiver.perceiver_resampler.layers.1.1.3.weight", "pmid.qformer_perceiver.perceiver_resampler.layers.1.1.1.fc2.weight"}, {"pmid.qformer_perceiver.perceiver_resampler.layers.2.1.1.weight", "pmid.qformer_perceiver.perceiver_resampler.layers.2.1.1.fc1.weight"}, {"pmid.qformer_perceiver.perceiver_resampler.layers.2.1.3.weight", "pmid.qformer_perceiver.perceiver_resampler.layers.2.1.1.fc2.weight"}, {"pmid.qformer_perceiver.perceiver_resampler.layers.3.1.1.weight", "pmid.qformer_perceiver.perceiver_resampler.layers.3.1.1.fc1.weight"}, {"pmid.qformer_perceiver.perceiver_resampler.layers.3.1.3.weight", "pmid.qformer_perceiver.perceiver_resampler.layers.3.1.1.fc2.weight"}, {"pmid.qformer_perceiver.token_proj.0.bias", "pmid.qformer_perceiver.token_proj.fc1.bias"}, {"pmid.qformer_perceiver.token_proj.2.bias", "pmid.qformer_perceiver.token_proj.fc2.bias"}, {"pmid.qformer_perceiver.token_proj.0.weight", "pmid.qformer_perceiver.token_proj.fc1.weight"}, {"pmid.qformer_perceiver.token_proj.2.weight", "pmid.qformer_perceiver.token_proj.fc2.weight"}, }; if (pmid_v2_name_map.find(name) != pmid_v2_name_map.end()) { return pmid_v2_name_map[name]; } return name; } std::string convert_sep_to_dot(std::string name) { const std::vector protected_tokens = { "self_attn", "out_proj", "q_proj", "k_proj", "v_proj", "to_k", "to_q", "to_v", "to_out", "text_model", "down_blocks", "mid_block", "up_block", "proj_in", "proj_out", "transformer_blocks", "single_transformer_blocks", "diffusion_model", "cond_stage_model", "first_stage_model", "conv_in", "conv_out", "lora_down", "lora_up", "diff_b", "hada_w1_a", "hada_w1_b", "hada_w2_a", "hada_w2_b", "hada_t1", "hada_t2", ".lokr_w1", ".lokr_w1_a", ".lokr_w1_b", ".lokr_w2", ".lokr_w2_a", ".lokr_w2_b", "time_emb_proj", "conv_shortcut", "time_embedding", "conv_norm_out", "double_blocks", "txt_attn", "img_attn", "input_blocks", "output_blocks", "middle_block", "skip_connection", "emb_layers", "in_layers", "out_layers", "add_q_proj", "add_k_proj", "add_v_proj", "add_out_proj", "ff_context", "norm_added_q", "norm_added_v", "to_add_out"}; // record the positions of underscores that should NOT be replaced std::unordered_set protected_positions; for (const auto& token : protected_tokens) { size_t start = 0; while ((start = name.find(token, start)) != std::string::npos) { size_t local_pos = token.find('_'); while (local_pos != std::string::npos) { protected_positions.insert(start + local_pos); local_pos = token.find('_', local_pos + 1); } start += token.size(); } } for (size_t i = 0; i < name.size(); ++i) { if (name[i] == '_' && !protected_positions.count(i)) { name[i] = '.'; } } return name; } std::string convert_tensor_name(std::string name, SDVersion version) { bool is_lora = false; bool is_lycoris_underline = false; std::vector lora_prefix_vec = { "lora.lora.", "lora.lora_", "lora.lycoris_", "lora.lycoris.", "lora.", }; for (const auto& prefix : lora_prefix_vec) { if (starts_with(name, prefix)) { is_lora = true; name = name.substr(prefix.size()); if (contains(prefix, "lycoris_")) { is_lycoris_underline = true; } break; } } // preprocess lora tensor name if (is_lora) { std::map lora_suffix_map = { {".lora_down.weight", ".weight.lora_down"}, {".lora_up.weight", ".weight.lora_up"}, {".lora.down.weight", ".weight.lora_down"}, {".lora.up.weight", ".weight.lora_up"}, {"_lora.down.weight", ".weight.lora_down"}, {"_lora.up.weight", ".weight.lora_up"}, {".lora_A.weight", ".weight.lora_down"}, {".lora_B.weight", ".weight.lora_up"}, {".lora_A.default.weight", ".weight.lora_down"}, {".lora_B.default.weight", ".weight.lora_up"}, {".lora_linear", ".weight.alpha"}, {".alpha", ".weight.alpha"}, {".scale", ".weight.scale"}, {".diff", ".weight.diff"}, {".diff_b", ".bias.diff"}, {".hada_w1_a", ".weight.hada_w1_a"}, {".hada_w1_b", ".weight.hada_w1_b"}, {".hada_w2_a", ".weight.hada_w2_a"}, {".hada_w2_b", ".weight.hada_w2_b"}, {".hada_t1", ".weight.hada_t1"}, {".hada_t2", ".weight.hada_t2"}, {".lokr_w1", ".weight.lokr_w1"}, {".lokr_w1_a", ".weight.lokr_w1_a"}, {".lokr_w1_b", ".weight.lokr_w1_b"}, {".lokr_w2", ".weight.lokr_w2"}, {".lokr_w2_a", ".weight.lokr_w2_a"}, {".lokr_w2_b", ".weight.lokr_w2_b"}, }; for (const auto& [old_suffix, new_suffix] : lora_suffix_map) { if (ends_with(name, old_suffix)) { name.replace(name.size() - old_suffix.size(), old_suffix.size(), new_suffix); break; } } size_t pos = name.find(".processor"); if (pos != std::string::npos) { name.replace(pos, strlen(".processor"), ""); } std::vector dit_prefix_vec = { "transformer_blocks", "single_transformer_blocks", }; for (const auto& prefix : dit_prefix_vec) { if (starts_with(name, prefix)) { name = "transformer." + name; break; } } if (sd_version_is_unet(version) || is_lycoris_underline) { name = convert_sep_to_dot(name); } } std::vector> prefix_map = { {"diffusion_model.", "model.diffusion_model."}, {"unet.", "model.diffusion_model."}, {"transformer.", "model.diffusion_model."}, // dit {"vae.", "first_stage_model."}, {"text_encoder.", "cond_stage_model.transformer."}, {"te.", "cond_stage_model.transformer."}, {"text_encoder.2.", "cond_stage_model.1.transformer."}, {"conditioner.embedders.0.open_clip.", "cond_stage_model."}, // https://huggingface.co/stabilityai/stable-diffusion-xl-base-1.0 {"conditioner.embedders.0.", "cond_stage_model."}, {"conditioner.embedders.1.", "cond_stage_model.1."}, // {"te2.text_model.encoder.layers.", "cond_stage_model.1.model.transformer.resblocks."}, {"te2.", "cond_stage_model.1.transformer."}, {"te1.", "cond_stage_model.transformer."}, }; replace_with_prefix_map(name, prefix_map); // diffusion model { std::vector diffuison_model_prefix_vec = { "model.diffusion_model.", }; for (const auto& prefix : diffuison_model_prefix_vec) { if (starts_with(name, prefix)) { name = convert_diffusion_model_name(name.substr(prefix.size()), prefix, version); name = prefix + name; break; } } } // cond_stage_model { std::vector cond_stage_model_prefix_vec = { "cond_stage_model.1.", "cond_stage_model.", "conditioner.embedders.", "text_encoders.", }; for (const auto& prefix : cond_stage_model_prefix_vec) { if (starts_with(name, prefix)) { name = convert_cond_stage_model_name(name.substr(prefix.size()), prefix); name = prefix + name; break; } } } // first_stage_model { std::vector first_stage_model_prefix_vec = { "first_stage_model.", "vae.", }; for (const auto& prefix : first_stage_model_prefix_vec) { if (starts_with(name, prefix)) { name = convert_first_stage_model_name(name.substr(prefix.size()), prefix); name = prefix + name; break; } } } // pmid { if (starts_with(name, "pmid.")) { name = convert_pmid_name(name); } if (starts_with(name, "pmid.qformer_perceiver")) { name = convert_pmid_v2_name(name); } } // controlnet { if (starts_with(name, "control_model.")) { // for controlnet pth models size_t pos = name.find('.'); if (pos != std::string::npos) { name = name.substr(pos + 1); } } } if (is_lora) { name = "lora." + name; } return name; }