stable-diffusion.cpp/name_conversion.cpp

1029 lines
47 KiB
C++

#include <unordered_map>
#include <unordered_set>
#include "name_conversion.h"
#include "util.h"
// Applies every (from, to) entry of `name_map` to `name`, in table order.
//
// For each entry only the FIRST occurrence of `from` in the current value of
// `name` is replaced by `to`. Replacements accumulate, so a later entry can
// match text that an earlier entry produced; table order therefore matters.
//
// name:     string rewritten in place.
// name_map: ordered list of (substring to find, replacement) pairs.
void replace_with_name_map(std::string& name, const std::vector<std::pair<std::string, std::string>>& name_map) {
    // Bind by const reference: iterating by value copied both strings of every entry.
    for (const auto& [from, to] : name_map) {
        const size_t pos = name.find(from);
        if (pos != std::string::npos) {
            name.replace(pos, from.size(), to);
        }
    }
}
// Rewrites the leading part of `name` using an ordered prefix table.
//
// The table is scanned front to back; the first entry whose old prefix
// `name` starts with is applied (old prefix swapped for the new one) and
// the scan stops. `name` is left untouched when no entry matches.
void replace_with_prefix_map(std::string& name, const std::vector<std::pair<std::string, std::string>>& prefix_map) {
    for (const auto& entry : prefix_map) {
        const std::string& from = entry.first;
        if (!starts_with(name, from)) {
            continue;
        }
        // First match wins: keep the remainder of the name, swap the prefix.
        name = entry.second + name.substr(from.size());
        return;
    }
}
// Rewrites the leading part of `name` using an unordered prefix table.
//
// An unordered_map has no defined iteration order, so "apply the first key
// that matches" would give an unspecified result whenever more than one key
// is a prefix of `name`. To make the outcome deterministic, the LONGEST
// matching key is applied. When at most one key matches, the result is the
// same as a first-match scan.
//
// name:       string rewritten in place; untouched when no key matches.
// prefix_map: old prefix -> new prefix.
void replace_with_prefix_map(std::string& name, const std::unordered_map<std::string, std::string>& prefix_map) {
    // Fast path: the whole name is a key. That is necessarily the longest
    // possible prefix, and nothing remains after it.
    // NOTE(review): assumes starts_with(s, s) is true for the project helper.
    const auto exact = prefix_map.find(name);
    if (exact != prefix_map.end()) {
        name = exact->second;
        return;
    }

    // Slow path: scan every key and remember the longest one that matches.
    // Two distinct matching keys cannot have the same length, so there are no ties.
    auto best = prefix_map.end();
    for (auto it = prefix_map.begin(); it != prefix_map.end(); ++it) {
        if (!starts_with(name, it->first)) {
            continue;
        }
        if (best == prefix_map.end() || it->first.size() > best->first.size()) {
            best = it;
        }
    }
    if (best != prefix_map.end()) {
        name = best->second + name.substr(best->first.size());
    }
}
// Converts an OpenCLIP-style tensor name to the HF-CLIP-style name used by
// the tables below.
//
// Steps, applied in this order:
//   1. exact-name lookup in the top-level model table;
//   2. exact-name lookup in the "pre_layrnorm" -> "pre_layernorm" spelling fix table;
//   3. "model.transformer.resblocks.<idx>.<suffix>" ->
//      "transformer.text_model.encoder.layers.<idx>.<new suffix>";
//   4. "model.visual.transformer.resblocks.<idx>.<suffix>" ->
//      "transformer.vision_model.encoder.layers.<idx>.<new suffix>".
//
// Names that match nothing are returned unchanged. This includes resblock
// names that have no '.' after the prefix or whose suffix is not in the
// table; the former used to throw std::out_of_range from substr().
std::string convert_open_clip_to_hf_clip_name(std::string name) {
    static const std::unordered_map<std::string, std::string> open_clip_to_hf_clip_model = {
        {"model.ln_final.bias", "transformer.text_model.final_layer_norm.bias"},
        {"model.ln_final.weight", "transformer.text_model.final_layer_norm.weight"},
        {"model.positional_embedding", "transformer.text_model.embeddings.position_embedding.weight"},
        {"model.token_embedding.weight", "transformer.text_model.embeddings.token_embedding.weight"},
        {"model.text_projection", "transformer.text_model.text_projection"},
        {"model.visual.class_embedding", "transformer.vision_model.embeddings.class_embedding"},
        {"model.visual.conv1.weight", "transformer.vision_model.embeddings.patch_embedding.weight"},
        {"model.visual.ln_post.bias", "transformer.vision_model.post_layernorm.bias"},
        {"model.visual.ln_post.weight", "transformer.vision_model.post_layernorm.weight"},
        {"model.visual.ln_pre.bias", "transformer.vision_model.pre_layernorm.bias"},
        {"model.visual.ln_pre.weight", "transformer.vision_model.pre_layernorm.weight"},
        {"model.visual.positional_embedding", "transformer.vision_model.embeddings.position_embedding.weight"},
        {"model.visual.proj", "transformer.visual_projection.weight"},
    };
    static const std::unordered_map<std::string, std::string> open_clip_to_hf_clip_resblock = {
        {"attn.in_proj_bias", "self_attn.in_proj.bias"},
        {"attn.in_proj_weight", "self_attn.in_proj.weight"},
        {"attn.out_proj.bias", "self_attn.out_proj.bias"},
        {"attn.out_proj.weight", "self_attn.out_proj.weight"},
        {"ln_1.bias", "layer_norm1.bias"},
        {"ln_1.weight", "layer_norm1.weight"},
        {"ln_2.bias", "layer_norm2.bias"},
        {"ln_2.weight", "layer_norm2.weight"},
        {"mlp.c_fc.bias", "mlp.fc1.bias"},
        {"mlp.c_fc.weight", "mlp.fc1.weight"},
        {"mlp.c_proj.bias", "mlp.fc2.bias"},
        {"mlp.c_proj.weight", "mlp.fc2.weight"},
    };
    static const std::unordered_map<std::string, std::string> cond_model_name_map = {
        {"transformer.vision_model.pre_layrnorm.weight", "transformer.vision_model.pre_layernorm.weight"},
        {"transformer.vision_model.pre_layrnorm.bias", "transformer.vision_model.pre_layernorm.bias"},
    };

    // Exact-name tables; the second lookup sees the result of the first.
    if (const auto it = open_clip_to_hf_clip_model.find(name); it != open_clip_to_hf_clip_model.end()) {
        name = it->second;
    }
    if (const auto it = cond_model_name_map.find(name); it != cond_model_name_map.end()) {
        name = it->second;
    }

    // Rewrites "<open_clip_prefix><idx>.<suffix>" in place when the suffix is known.
    const auto convert_resblock = [&name](const std::string& open_clip_prefix, const std::string& hf_prefix) {
        if (name.rfind(open_clip_prefix, 0) != 0) {
            return;  // name does not start with this prefix
        }
        const std::string remain = name.substr(open_clip_prefix.size());
        const size_t dot         = remain.find('.');
        if (dot == std::string::npos) {
            return;  // no "<idx>." part: nothing to convert (previously threw)
        }
        const auto it = open_clip_to_hf_clip_resblock.find(remain.substr(dot + 1));
        if (it == open_clip_to_hf_clip_resblock.end()) {
            return;  // unknown suffix: leave the name alone
        }
        name = hf_prefix + remain.substr(0, dot) + "." + it->second;
    };

    convert_resblock("model.transformer.resblocks.", "transformer.text_model.encoder.layers.");
    convert_resblock("model.visual.transformer.resblocks.", "transformer.vision_model.encoder.layers.");
    return name;
}
// Converts a conditioner (text / vision encoder) tensor name.
//
// The rewrite table is chosen from substrings of `name`:
//   - contains "t5xxl"          -> T5 table;
//   - contains "qwen2vl.visual" -> Qwen-VL vision table;
//   - contains "qwen2vl"        -> Qwen-VL text table;
//   - otherwise                 -> OpenCLIP-to-HF conversion, then the CLIP table.
// Each table is applied with replace_with_name_map, so entry order matters.
// `prefix` is not used by this function.
std::string convert_cond_stage_model_name(std::string name, std::string prefix) {
    using RenameTable = std::vector<std::pair<std::string, std::string>>;

    static const RenameTable clip_table = {
        {"transformer.text_projection.weight", "transformer.text_model.text_projection"},
        {"model.text_projection.weight", "transformer.text_model.text_projection"},
        {"vision_model.visual_projection.weight", "visual_projection.weight"},
    };

    // llama.cpp to original
    static const RenameTable t5_table = {
        {"enc.", "encoder."},
        {"blk.", "block."},
        {"output_norm.", "final_layer_norm."},
        {"attn_q.", "layer.0.SelfAttention.q."},
        {"attn_k.", "layer.0.SelfAttention.k."},
        {"attn_v.", "layer.0.SelfAttention.v."},
        {"attn_o.", "layer.0.SelfAttention.o."},
        {"attn_norm.", "layer.0.layer_norm."},
        {"ffn_norm.", "layer.1.layer_norm."},
        {"ffn_up.", "layer.1.DenseReluDense.wi_1."},
        {"ffn_down.", "layer.1.DenseReluDense.wo."},
        {"ffn_gate.", "layer.1.DenseReluDense.wi_0."},
        {"attn_rel_b.", "layer.0.SelfAttention.relative_attention_bias."},
        {"token_embd.", "shared."},
    };

    static const RenameTable qwenvl_text_table = {
        {"token_embd.", "model.embed_tokens."},
        {"blk.", "model.layers."},
        {"attn_q.", "self_attn.q_proj."},
        {"attn_k.", "self_attn.k_proj."},
        {"attn_v.", "self_attn.v_proj."},
        {"attn_output.", "self_attn.o_proj."},
        {"attn_norm.", "input_layernorm."},
        {"ffn_down.", "mlp.down_proj."},
        {"ffn_gate.", "mlp.gate_proj."},
        {"ffn_up.", "mlp.up_proj."},
        {"ffn_norm.", "post_attention_layernorm."},
        {"output_norm.", "model.norm."},
    };

    static const RenameTable qwenvl_vision_table = {
        {"mm.", "merger.mlp."},
        {"v.post_ln.", "merger.ln_q."},
        {"v.patch_embd.weight", "patch_embed.proj.0.weight"},
        {"patch_embed.proj.0.weight.1", "patch_embed.proj.1.weight"},
        {"v.patch_embd.weight.1", "patch_embed.proj.1.weight"},
        {"v.blk.", "blocks."},
        {"attn_q.", "attn.q_proj."},
        {"attn_k.", "attn.k_proj."},
        {"attn_v.", "attn.v_proj."},
        {"attn_out.", "attn.proj."},
        {"ffn_down.", "mlp.down_proj."},
        {"ffn_gate.", "mlp.gate_proj."},
        {"ffn_up.", "mlp.up_proj."},
        {"ln1.", "norm1."},
        {"ln2.", "norm2."},
    };

    // T5 encoder tensors.
    if (contains(name, "t5xxl")) {
        replace_with_name_map(name, t5_table);
        return name;
    }

    // Qwen-VL tensors: the vision tower has its own table.
    if (contains(name, "qwen2vl")) {
        if (contains(name, "qwen2vl.visual")) {
            replace_with_name_map(name, qwenvl_vision_table);
        } else {
            replace_with_name_map(name, qwenvl_text_table);
        }
        return name;
    }

    // Everything else is treated as CLIP.
    name = convert_open_clip_to_hf_clip_name(name);
    replace_with_name_map(name, clip_table);
    return name;
}
// ref: https://github.com/huggingface/diffusers/blob/main/scripts/convert_diffusers_to_original_stable_diffusion.py
//
// Converts an HF Diffusers UNet tensor name to the original stable-diffusion
// name (SD1-style block layout).
//
// Every table stores (stable-diffusion, HF Diffusers) pairs and is applied in
// reverse: the HF text is searched for and replaced by the stable-diffusion text.
//   1. If the whole name equals an HF entry of the top-level table, the
//      stable-diffusion name is returned immediately.
//   2. If the name contains "resnets", the resnet sub-layer table is applied.
//   3. The block-prefix table is applied.
// In steps 2 and 3 each entry replaces only its first occurrence.
//
// The tables are `static const` and built exactly once by their initializers,
// instead of being filled lazily behind an `.empty()` check on a mutable static.
std::string convert_diffusers_unet_to_original_sd1(std::string name) {
    using NamePairs = std::vector<std::pair<std::string, std::string>>;

    // (stable-diffusion, HF Diffusers)
    static const NamePairs unet_conversion_map = {
        {"time_embed.0.weight", "time_embedding.linear_1.weight"},
        {"time_embed.0.bias", "time_embedding.linear_1.bias"},
        {"time_embed.2.weight", "time_embedding.linear_2.weight"},
        {"time_embed.2.bias", "time_embedding.linear_2.bias"},
        {"input_blocks.0.0.weight", "conv_in.weight"},
        {"input_blocks.0.0.bias", "conv_in.bias"},
        {"out.0.weight", "conv_norm_out.weight"},
        {"out.0.bias", "conv_norm_out.bias"},
        {"out.2.weight", "conv_out.weight"},
        {"out.2.bias", "conv_out.bias"},
    };

    static const NamePairs unet_conversion_map_resnet = {
        {"in_layers.0", "norm1"},
        {"in_layers.2", "conv1"},
        {"out_layers.0", "norm2"},
        {"out_layers.3", "conv2"},
        {"emb_layers.1", "time_emb_proj"},
        {"skip_connection", "conv_shortcut"},
    };

    static const NamePairs unet_conversion_map_layer = [] {
        NamePairs layers;
        for (int i = 0; i < 4; ++i) {
            const std::string hf_block = std::to_string(i);

            // down_blocks
            for (int j = 0; j < 2; ++j) {
                const std::string hf_sub   = std::to_string(j);
                const std::string sd_index = std::to_string(3 * i + j + 1);
                layers.emplace_back("input_blocks." + sd_index + ".0.",
                                    "down_blocks." + hf_block + ".resnets." + hf_sub + ".");
                if (i < 3) {
                    layers.emplace_back("input_blocks." + sd_index + ".1.",
                                        "down_blocks." + hf_block + ".attentions." + hf_sub + ".");
                }
            }

            // up_blocks
            for (int j = 0; j < 3; ++j) {
                const std::string hf_sub   = std::to_string(j);
                const std::string sd_index = std::to_string(3 * i + j);
                layers.emplace_back("output_blocks." + sd_index + ".0.",
                                    "up_blocks." + hf_block + ".resnets." + hf_sub + ".");
                // Attention entries are added for every up block; the `i > 0`
                // guard is intentionally disabled ("for tiny unet").
                layers.emplace_back("output_blocks." + sd_index + ".1.",
                                    "up_blocks." + hf_block + ".attentions." + hf_sub + ".");
            }

            if (i < 3) {
                layers.emplace_back("input_blocks." + std::to_string(3 * (i + 1)) + ".0.op.",
                                    "down_blocks." + hf_block + ".downsamplers.0.conv.");
                layers.emplace_back("output_blocks." + std::to_string(3 * i + 2) + "." + std::to_string(i == 0 ? 1 : 2) + ".",
                                    "up_blocks." + hf_block + ".upsamplers.0.");
            }
        }

        // mid block
        layers.emplace_back("middle_block.1.", "mid_block.attentions.0.");
        for (int j = 0; j < 2; ++j) {
            layers.emplace_back("middle_block." + std::to_string(2 * j) + ".",
                                "mid_block.resnets." + std::to_string(j) + ".");
        }
        return layers;
    }();

    // Whole-name matches short-circuit everything else.
    for (const auto& [sd_name, hf_name] : unet_conversion_map) {
        if (name == hf_name) {
            return sd_name;
        }
    }

    // Replaces the first occurrence of each entry's HF text with its SD text.
    const auto apply_reverse = [&name](const NamePairs& table) {
        for (const auto& [sd_part, hf_part] : table) {
            const size_t pos = name.find(hf_part);
            if (pos != std::string::npos) {
                name.replace(pos, hf_part.size(), sd_part);
            }
        }
    };

    if (contains(name, "resnets")) {
        apply_reverse(unet_conversion_map_resnet);
    }
    apply_reverse(unet_conversion_map_layer);
    return name;
}
// ref: https://github.com/huggingface/diffusers/blob/main/scripts/convert_diffusers_to_original_sdxl.py
//
// Converts an HF Diffusers UNet tensor name to the original stable-diffusion
// name (SDXL block layout).
//
// Every table except the last stores (stable-diffusion, HF Diffusers) pairs
// and is applied in reverse: the HF text is searched for and replaced by the
// stable-diffusion text.
//   1. If the whole name equals an HF entry of the top-level table, the
//      stable-diffusion name is returned immediately.
//   2. If the name contains "resnets", the resnet sub-layer table is applied.
//   3. The block-prefix table is applied.
//   4. "to_out.weight" / "to_out.bias" become "to_out.0.weight" / "to_out.0.bias".
// In steps 2-4 each entry replaces only its first occurrence.
//
// The tables are `static const` and built exactly once by their initializers,
// instead of being filled lazily behind an `.empty()` check on a mutable static.
std::string convert_diffusers_unet_to_original_sdxl(std::string name) {
    using NamePairs = std::vector<std::pair<std::string, std::string>>;

    // (stable-diffusion, HF Diffusers)
    static const NamePairs unet_conversion_map = {
        {"time_embed.0.weight", "time_embedding.linear_1.weight"},
        {"time_embed.0.bias", "time_embedding.linear_1.bias"},
        {"time_embed.2.weight", "time_embedding.linear_2.weight"},
        {"time_embed.2.bias", "time_embedding.linear_2.bias"},
        {"input_blocks.0.0.weight", "conv_in.weight"},
        {"input_blocks.0.0.bias", "conv_in.bias"},
        {"out.0.weight", "conv_norm_out.weight"},
        {"out.0.bias", "conv_norm_out.bias"},
        {"out.2.weight", "conv_out.weight"},
        {"out.2.bias", "conv_out.bias"},
        // --- SDXL add_embedding mappings ---
        {"label_emb.0.0.weight", "add_embedding.linear_1.weight"},
        {"label_emb.0.0.bias", "add_embedding.linear_1.bias"},
        {"label_emb.0.2.weight", "add_embedding.linear_2.weight"},
        {"label_emb.0.2.bias", "add_embedding.linear_2.bias"},
    };

    static const NamePairs unet_conversion_map_resnet = {
        {"in_layers.0", "norm1"},
        {"in_layers.2", "conv1"},
        {"out_layers.0", "norm2"},
        {"out_layers.3", "conv2"},
        {"emb_layers.1", "time_emb_proj"},
        {"skip_connection", "conv_shortcut"},
    };

    static const NamePairs unet_conversion_map_layer = [] {
        NamePairs layers;
        for (int i = 0; i < 3; ++i) {
            const std::string hf_block = std::to_string(i);

            // --- down_blocks ---
            for (int j = 0; j < 2; ++j) {
                const std::string hf_sub   = std::to_string(j);
                const std::string sd_index = std::to_string(3 * i + j + 1);
                layers.emplace_back("input_blocks." + sd_index + ".0.",
                                    "down_blocks." + hf_block + ".resnets." + hf_sub + ".");
                if (i > 0) {
                    layers.emplace_back("input_blocks." + sd_index + ".1.",
                                        "down_blocks." + hf_block + ".attentions." + hf_sub + ".");
                }
            }

            // --- up_blocks ---
            for (int j = 0; j < 4; ++j) {
                const std::string hf_sub   = std::to_string(j);
                const std::string sd_index = std::to_string(3 * i + j);
                layers.emplace_back("output_blocks." + sd_index + ".0.",
                                    "up_blocks." + hf_block + ".resnets." + hf_sub + ".");
                if (i < 2) {
                    layers.emplace_back("output_blocks." + sd_index + ".1.",
                                        "up_blocks." + hf_block + ".attentions." + hf_sub + ".");
                }
            }

            // Down/up sampler entries are added for every i: the loop bound
            // already guarantees i < 3, so no extra guard is needed.
            layers.emplace_back("input_blocks." + std::to_string(3 * (i + 1)) + ".0.op.",
                                "down_blocks." + hf_block + ".downsamplers.0.conv.");
            layers.emplace_back("output_blocks." + std::to_string(3 * i + 2) + "." + std::to_string(i == 0 ? 1 : 2) + ".",
                                "up_blocks." + hf_block + ".upsamplers.0.");
        }

        // Applied after the generated entries above, so it sees their output.
        layers.emplace_back("output_blocks.2.2.conv.", "output_blocks.2.1.conv.");

        // mid block
        layers.emplace_back("middle_block.1.", "mid_block.attentions.0.");
        for (int j = 0; j < 2; ++j) {
            layers.emplace_back("middle_block." + std::to_string(2 * j) + ".",
                                "mid_block.resnets." + std::to_string(j) + ".");
        }
        return layers;
    }();

    // Applied forward (first -> second) by replace_with_name_map.
    static const NamePairs to_out_name_map = {
        {"to_out.weight", "to_out.0.weight"},
        {"to_out.bias", "to_out.0.bias"},
    };

    // Whole-name matches short-circuit everything else.
    for (const auto& [sd_name, hf_name] : unet_conversion_map) {
        if (name == hf_name) {
            return sd_name;
        }
    }

    // Replaces the first occurrence of each entry's HF text with its SD text.
    const auto apply_reverse = [&name](const NamePairs& table) {
        for (const auto& [sd_part, hf_part] : table) {
            const size_t pos = name.find(hf_part);
            if (pos != std::string::npos) {
                name.replace(pos, hf_part.size(), sd_part);
            }
        }
    };

    if (contains(name, "resnets")) {
        apply_reverse(unet_conversion_map_resnet);
    }
    apply_reverse(unet_conversion_map_layer);

    replace_with_name_map(name, to_out_name_map);
    return name;
}
// Converts an HF Diffusers SD3 transformer tensor name to the original name.
//
// The table maps Diffusers names to original names for the embedders, 38
// "transformer_blocks.<i>." -> "joint_blocks.<i>." blocks, and the final
// layers. It is applied through replace_with_prefix_map, so a name is
// rewritten when it starts with one of the keys.
//
// The table is `static const` and built exactly once by its initializer,
// instead of being filled lazily behind an `.empty()` check on a mutable static.
std::string convert_diffusers_dit_to_original_sd3(std::string name) {
    static const std::unordered_map<std::string, std::string> sd3_name_map = [] {
        constexpr int num_layers = 38;
        std::unordered_map<std::string, std::string> table;

        // --- time_text_embed ---
        table["time_text_embed.timestep_embedder.linear_1.weight"] = "t_embedder.mlp.0.weight";
        table["time_text_embed.timestep_embedder.linear_1.bias"]   = "t_embedder.mlp.0.bias";
        table["time_text_embed.timestep_embedder.linear_2.weight"] = "t_embedder.mlp.2.weight";
        table["time_text_embed.timestep_embedder.linear_2.bias"]   = "t_embedder.mlp.2.bias";
        table["time_text_embed.text_embedder.linear_1.weight"]     = "y_embedder.mlp.0.weight";
        table["time_text_embed.text_embedder.linear_1.bias"]       = "y_embedder.mlp.0.bias";
        table["time_text_embed.text_embedder.linear_2.weight"]     = "y_embedder.mlp.2.weight";
        table["time_text_embed.text_embedder.linear_2.bias"]       = "y_embedder.mlp.2.bias";
        table["pos_embed.pos_embed"]                               = "pos_embed";
        table["pos_embed.proj.weight"]                             = "x_embedder.proj.weight";
        table["pos_embed.proj.bias"]                               = "x_embedder.proj.bias";

        // --- transformer blocks ---
        for (int i = 0; i < num_layers; ++i) {
            const std::string src_prefix = "transformer_blocks." + std::to_string(i) + ".";
            const std::string dst_prefix = "joint_blocks." + std::to_string(i) + ".";
            // Registers one per-block entry: both sides are relative to the block prefix.
            const auto add = [&](const char* src, const char* dst) {
                table[src_prefix + src] = dst_prefix + dst;
            };

            add("norm1.linear.weight", "x_block.adaLN_modulation.1.weight");
            add("norm1.linear.bias", "x_block.adaLN_modulation.1.bias");
            add("norm1_context.linear.weight", "context_block.adaLN_modulation.1.weight");
            add("norm1_context.linear.bias", "context_block.adaLN_modulation.1.bias");

            // attn
            add("attn.to_q.weight", "x_block.attn.qkv.weight");
            add("attn.to_q.bias", "x_block.attn.qkv.bias");
            add("attn.to_k.weight", "x_block.attn.qkv.weight.1");
            add("attn.to_k.bias", "x_block.attn.qkv.bias.1");
            add("attn.to_v.weight", "x_block.attn.qkv.weight.2");
            add("attn.to_v.bias", "x_block.attn.qkv.bias.2");
            add("attn.add_q_proj.weight", "context_block.attn.qkv.weight");
            add("attn.add_q_proj.bias", "context_block.attn.qkv.bias");
            add("attn.add_k_proj.weight", "context_block.attn.qkv.weight.1");
            add("attn.add_k_proj.bias", "context_block.attn.qkv.bias.1");
            add("attn.add_v_proj.weight", "context_block.attn.qkv.weight.2");
            add("attn.add_v_proj.bias", "context_block.attn.qkv.bias.2");

            // attn2
            add("attn2.to_q.weight", "x_block.attn2.qkv.weight");
            add("attn2.to_q.bias", "x_block.attn2.qkv.bias");
            add("attn2.to_k.weight", "x_block.attn2.qkv.weight.1");
            add("attn2.to_k.bias", "x_block.attn2.qkv.bias.1");
            add("attn2.to_v.weight", "x_block.attn2.qkv.weight.2");
            add("attn2.to_v.bias", "x_block.attn2.qkv.bias.2");
            add("attn2.add_q_proj.weight", "context_block.attn2.qkv.weight");
            add("attn2.add_q_proj.bias", "context_block.attn2.qkv.bias");
            add("attn2.add_k_proj.weight", "context_block.attn2.qkv.weight.1");
            add("attn2.add_k_proj.bias", "context_block.attn2.qkv.bias.1");
            add("attn2.add_v_proj.weight", "context_block.attn2.qkv.weight.2");
            add("attn2.add_v_proj.bias", "context_block.attn2.qkv.bias.2");

            // norm
            add("attn.norm_q.weight", "x_block.attn.ln_q.weight");
            add("attn.norm_k.weight", "x_block.attn.ln_k.weight");
            add("attn.norm_added_q.weight", "context_block.attn.ln_q.weight");
            add("attn.norm_added_k.weight", "context_block.attn.ln_k.weight");

            // norm2
            add("attn2.norm_q.weight", "x_block.attn2.ln_q.weight");
            add("attn2.norm_k.weight", "x_block.attn2.ln_k.weight");

            // ff
            add("ff.net.0.proj.weight", "x_block.mlp.fc1.weight");
            add("ff.net.0.proj.bias", "x_block.mlp.fc1.bias");
            add("ff.net.2.weight", "x_block.mlp.fc2.weight");
            add("ff.net.2.bias", "x_block.mlp.fc2.bias");
            add("ff_context.net.0.proj.weight", "context_block.mlp.fc1.weight");
            add("ff_context.net.0.proj.bias", "context_block.mlp.fc1.bias");
            add("ff_context.net.2.weight", "context_block.mlp.fc2.weight");
            add("ff_context.net.2.bias", "context_block.mlp.fc2.bias");

            // output projections
            add("attn.to_out.0.weight", "x_block.attn.proj.weight");
            add("attn.to_out.0.bias", "x_block.attn.proj.bias");
            add("attn.to_add_out.weight", "context_block.attn.proj.weight");
            add("attn.to_add_out.bias", "context_block.attn.proj.bias");

            // output projections 2
            add("attn2.to_out.0.weight", "x_block.attn2.proj.weight");
            add("attn2.to_out.0.bias", "x_block.attn2.proj.bias");
            add("attn2.to_add_out.weight", "context_block.attn2.proj.weight");
            add("attn2.to_add_out.bias", "context_block.attn2.proj.bias");
        }

        // --- final layers ---
        table["proj_out.weight"]        = "final_layer.linear.weight";
        table["proj_out.bias"]          = "final_layer.linear.bias";
        table["norm_out.linear.weight"] = "final_layer.adaLN_modulation.1.weight";
        table["norm_out.linear.bias"]   = "final_layer.adaLN_modulation.1.bias";
        return table;
    }();

    replace_with_prefix_map(name, sd3_name_map);
    return name;
}
// Converts an HF Diffusers Flux transformer tensor name to the original name.
//
// The table maps Diffusers names to original names for the embedders, 19
// "transformer_blocks.<i>." -> "double_blocks.<i>." blocks, 38
// "single_transformer_blocks.<i>." -> "single_blocks.<i>." blocks, and the
// final layers. It is applied through replace_with_prefix_map, so a name is
// rewritten when it starts with one of the keys.
//
// The table is `static const` and built exactly once by its initializer,
// instead of being filled lazily behind an `.empty()` check on a mutable static.
std::string convert_diffusers_dit_to_original_flux(std::string name) {
    static const std::unordered_map<std::string, std::string> flux_name_map = [] {
        constexpr int num_layers        = 19;
        constexpr int num_single_layers = 38;
        std::unordered_map<std::string, std::string> table;

        // --- time_text_embed ---
        table["time_text_embed.timestep_embedder.linear_1.weight"] = "time_in.in_layer.weight";
        table["time_text_embed.timestep_embedder.linear_1.bias"]   = "time_in.in_layer.bias";
        table["time_text_embed.timestep_embedder.linear_2.weight"] = "time_in.out_layer.weight";
        table["time_text_embed.timestep_embedder.linear_2.bias"]   = "time_in.out_layer.bias";
        table["time_text_embed.text_embedder.linear_1.weight"]     = "vector_in.in_layer.weight";
        table["time_text_embed.text_embedder.linear_1.bias"]       = "vector_in.in_layer.bias";
        table["time_text_embed.text_embedder.linear_2.weight"]     = "vector_in.out_layer.weight";
        table["time_text_embed.text_embedder.linear_2.bias"]       = "vector_in.out_layer.bias";

        // guidance
        table["time_text_embed.guidance_embedder.linear_1.weight"] = "guidance_in.in_layer.weight";
        table["time_text_embed.guidance_embedder.linear_1.bias"]   = "guidance_in.in_layer.bias";
        table["time_text_embed.guidance_embedder.linear_2.weight"] = "guidance_in.out_layer.weight";
        table["time_text_embed.guidance_embedder.linear_2.bias"]   = "guidance_in.out_layer.bias";

        // --- context_embedder / x_embedder ---
        table["context_embedder.weight"] = "txt_in.weight";
        table["context_embedder.bias"]   = "txt_in.bias";
        table["x_embedder.weight"]       = "img_in.weight";
        table["x_embedder.bias"]         = "img_in.bias";

        // --- double transformer blocks ---
        for (int i = 0; i < num_layers; ++i) {
            const std::string src_prefix = "transformer_blocks." + std::to_string(i) + ".";
            const std::string dst_prefix = "double_blocks." + std::to_string(i) + ".";
            // Registers one per-block entry: both sides are relative to the block prefix.
            const auto add = [&](const char* src, const char* dst) {
                table[src_prefix + src] = dst_prefix + dst;
            };

            add("norm1.linear.weight", "img_mod.lin.weight");
            add("norm1.linear.bias", "img_mod.lin.bias");
            add("norm1_context.linear.weight", "txt_mod.lin.weight");
            add("norm1_context.linear.bias", "txt_mod.lin.bias");

            // attn
            add("attn.to_q.weight", "img_attn.qkv.weight");
            add("attn.to_q.bias", "img_attn.qkv.bias");
            add("attn.to_k.weight", "img_attn.qkv.weight.1");
            add("attn.to_k.bias", "img_attn.qkv.bias.1");
            add("attn.to_v.weight", "img_attn.qkv.weight.2");
            add("attn.to_v.bias", "img_attn.qkv.bias.2");
            add("attn.add_q_proj.weight", "txt_attn.qkv.weight");
            add("attn.add_q_proj.bias", "txt_attn.qkv.bias");
            add("attn.add_k_proj.weight", "txt_attn.qkv.weight.1");
            add("attn.add_k_proj.bias", "txt_attn.qkv.bias.1");
            add("attn.add_v_proj.weight", "txt_attn.qkv.weight.2");
            add("attn.add_v_proj.bias", "txt_attn.qkv.bias.2");

            // norm
            add("attn.norm_q.weight", "img_attn.norm.query_norm.scale");
            add("attn.norm_k.weight", "img_attn.norm.key_norm.scale");
            add("attn.norm_added_q.weight", "txt_attn.norm.query_norm.scale");
            add("attn.norm_added_k.weight", "txt_attn.norm.key_norm.scale");

            // ff
            add("ff.net.0.proj.weight", "img_mlp.0.weight");
            add("ff.net.0.proj.bias", "img_mlp.0.bias");
            add("ff.net.2.weight", "img_mlp.2.weight");
            add("ff.net.2.bias", "img_mlp.2.bias");
            add("ff_context.net.0.proj.weight", "txt_mlp.0.weight");
            add("ff_context.net.0.proj.bias", "txt_mlp.0.bias");
            add("ff_context.net.2.weight", "txt_mlp.2.weight");
            add("ff_context.net.2.bias", "txt_mlp.2.bias");

            // output projections
            add("attn.to_out.0.weight", "img_attn.proj.weight");
            add("attn.to_out.0.bias", "img_attn.proj.bias");
            add("attn.to_add_out.weight", "txt_attn.proj.weight");
            add("attn.to_add_out.bias", "txt_attn.proj.bias");
        }

        // --- single transformer blocks ---
        for (int i = 0; i < num_single_layers; ++i) {
            const std::string src_prefix = "single_transformer_blocks." + std::to_string(i) + ".";
            const std::string dst_prefix = "single_blocks." + std::to_string(i) + ".";
            const auto add = [&](const char* src, const char* dst) {
                table[src_prefix + src] = dst_prefix + dst;
            };

            add("norm.linear.weight", "modulation.lin.weight");
            add("norm.linear.bias", "modulation.lin.bias");
            add("attn.to_q.weight", "linear1.weight");
            add("attn.to_q.bias", "linear1.bias");
            add("attn.to_k.weight", "linear1.weight.1");
            add("attn.to_k.bias", "linear1.bias.1");
            add("attn.to_v.weight", "linear1.weight.2");
            add("attn.to_v.bias", "linear1.bias.2");
            add("proj_mlp.weight", "linear1.weight.3");
            add("proj_mlp.bias", "linear1.bias.3");
            add("attn.norm_q.weight", "norm.query_norm.scale");
            add("attn.norm_k.weight", "norm.key_norm.scale");
            add("proj_out.weight", "linear2.weight");
            add("proj_out.bias", "linear2.bias");
        }

        // --- final layers ---
        table["proj_out.weight"]        = "final_layer.linear.weight";
        table["proj_out.bias"]          = "final_layer.linear.bias";
        table["norm_out.linear.weight"] = "final_layer.adaLN_modulation.1.weight";
        table["norm_out.linear.bias"]   = "final_layer.adaLN_modulation.1.bias";
        return table;
    }();

    replace_with_prefix_map(name, flux_name_map);
    return name;
}
// Routes a diffusion-model tensor name to the converter for `version`.
//
// SD1 and SD2 share the SD1 UNet converter; SDXL, SD3 and Flux each have
// their own. For any other version the name is returned unchanged.
// `prefix` is not used by this function.
std::string convert_diffusion_model_name(std::string name, std::string prefix, SDVersion version) {
    if (sd_version_is_sd1(version) || sd_version_is_sd2(version)) {
        return convert_diffusers_unet_to_original_sd1(name);
    }
    if (sd_version_is_sdxl(version)) {
        return convert_diffusers_unet_to_original_sdxl(name);
    }
    if (sd_version_is_sd3(version)) {
        return convert_diffusers_dit_to_original_sd3(name);
    }
    if (sd_version_is_flux(version)) {
        return convert_diffusers_dit_to_original_flux(name);
    }
    // No converter for this version.
    return name;
}
// Converts an HF Diffusers VAE tensor name to the original stable-diffusion name.
//
// The first three tables store (stable-diffusion, HF Diffusers) pairs and are
// applied in reverse: the HF text is searched for and replaced by the
// stable-diffusion text.
//   1. base table;
//   2. generated encoder/decoder block-prefix table;
//   3. attention table, only when the INPUT name contains "attentions";
//   4. "to_q"/"to_k"/"to_v"/"to_out.0" -> "q"/"k"/"v"/"proj_out", only when
//      the result so far contains "mid.attn_1." (this table is applied forward).
// Each entry replaces only its first occurrence.
//
// The generated table is `static const` and built exactly once by its
// initializer, instead of being filled lazily behind an `.empty()` check on a
// mutable static.
std::string convert_diffusers_vae_to_original_sd1(std::string name) {
    using NamePairs = std::vector<std::pair<std::string, std::string>>;

    static const NamePairs vae_conversion_map_base = {
        {"nin_shortcut", "conv_shortcut"},
        {"norm_out", "conv_norm_out"},
        {"mid.attn_1.", "mid_block.attentions.0."},
    };

    static const NamePairs vae_conversion_map_layer = [] {
        NamePairs layers;
        for (int i = 0; i < 4; ++i) {
            const std::string hf_block = std::to_string(i);
            // Decoder blocks are numbered in the opposite direction on the SD side.
            const std::string sd_up_block = std::to_string(3 - i);

            // --- encoder down blocks ---
            for (int j = 0; j < 2; ++j) {
                const std::string sub = std::to_string(j);
                layers.emplace_back("encoder.down." + hf_block + ".block." + sub + ".",
                                    "encoder.down_blocks." + hf_block + ".resnets." + sub + ".");
            }

            if (i < 3) {
                layers.emplace_back("down." + hf_block + ".downsample.",
                                    "down_blocks." + hf_block + ".downsamplers.0.");
                layers.emplace_back("up." + sd_up_block + ".upsample.",
                                    "up_blocks." + hf_block + ".upsamplers.0.");
            }

            // --- decoder up blocks (reverse) ---
            for (int j = 0; j < 3; ++j) {
                const std::string sub = std::to_string(j);
                layers.emplace_back("decoder.up." + sd_up_block + ".block." + sub + ".",
                                    "decoder.up_blocks." + hf_block + ".resnets." + sub + ".");
            }
        }

        // --- mid block (encoder + decoder) ---
        for (int i = 0; i < 2; ++i) {
            layers.emplace_back("mid.block_" + std::to_string(i + 1) + ".",
                                "mid_block.resnets." + std::to_string(i) + ".");
        }
        return layers;
    }();

    static const NamePairs vae_conversion_map_attn = {
        {"norm.", "group_norm."},
        {"q.", "query."},
        {"k.", "key."},
        {"v.", "value."},
        {"proj_out.", "proj_attn."},
    };

    static const NamePairs vae_extra_conversion_map = {
        {"to_q", "q"},
        {"to_k", "k"},
        {"to_v", "v"},
        {"to_out.0", "proj_out"},
    };

    // Must be decided on the input name: step 1 rewrites "attentions" away.
    const bool input_has_attentions = name.find("attentions") != std::string::npos;

    // Replaces the first occurrence of `from` in `name` with `to`, if present.
    const auto replace_first = [&name](const std::string& from, const std::string& to) {
        const size_t pos = name.find(from);
        if (pos != std::string::npos) {
            name.replace(pos, from.size(), to);
        }
    };
    // Applies a (stable-diffusion, HF) table in the HF -> stable-diffusion direction.
    const auto apply_reverse = [&replace_first](const NamePairs& table) {
        for (const auto& [sd_part, hf_part] : table) {
            replace_first(hf_part, sd_part);
        }
    };

    apply_reverse(vae_conversion_map_base);
    apply_reverse(vae_conversion_map_layer);
    if (input_has_attentions) {
        apply_reverse(vae_conversion_map_attn);
    }
    if (name.find("mid.attn_1.") != std::string::npos) {
        for (const auto& [from, to] : vae_extra_conversion_map) {
            replace_first(from, to);
        }
    }
    return name;
}
// Converts a first-stage (VAE) tensor name by delegating to the Diffusers-VAE
// converter. `prefix` is not used by this function.
std::string convert_first_stage_model_name(std::string name, std::string prefix) {
    return convert_diffusers_vae_to_original_sd1(name);
}
// Renames a PhotoMaker ("pmid.") tensor whose full name appears in the table
// below; every other name is returned unchanged.
//
// The table is const and consulted with a single find(); the previous
// find() + operator[] pair looked the key up twice through a mutable map.
std::string convert_pmid_name(const std::string& name) {
    static const std::unordered_map<std::string, std::string> pmid_name_map = {
        {"pmid.vision_model.visual_projection.weight", "pmid.visual_projection.weight"},
    };
    const auto it = pmid_name_map.find(name);
    return it != pmid_name_map.end() ? it->second : name;
}
// Renames the pmid v2 tensors listed in the table below (exact, whole-name match):
// numbered sub-module indices are replaced by fc1 / fc2 names.
// Any name that is not a key of the table is returned unchanged.
std::string convert_pmid_v2_name(const std::string& name) {
    static const std::unordered_map<std::string, std::string> renamed = {
        // perceiver resampler, layers 0..3
        {"pmid.qformer_perceiver.perceiver_resampler.layers.0.1.1.weight",
         "pmid.qformer_perceiver.perceiver_resampler.layers.0.1.1.fc1.weight"},
        {"pmid.qformer_perceiver.perceiver_resampler.layers.0.1.3.weight",
         "pmid.qformer_perceiver.perceiver_resampler.layers.0.1.1.fc2.weight"},
        {"pmid.qformer_perceiver.perceiver_resampler.layers.1.1.1.weight",
         "pmid.qformer_perceiver.perceiver_resampler.layers.1.1.1.fc1.weight"},
        {"pmid.qformer_perceiver.perceiver_resampler.layers.1.1.3.weight",
         "pmid.qformer_perceiver.perceiver_resampler.layers.1.1.1.fc2.weight"},
        {"pmid.qformer_perceiver.perceiver_resampler.layers.2.1.1.weight",
         "pmid.qformer_perceiver.perceiver_resampler.layers.2.1.1.fc1.weight"},
        {"pmid.qformer_perceiver.perceiver_resampler.layers.2.1.3.weight",
         "pmid.qformer_perceiver.perceiver_resampler.layers.2.1.1.fc2.weight"},
        {"pmid.qformer_perceiver.perceiver_resampler.layers.3.1.1.weight",
         "pmid.qformer_perceiver.perceiver_resampler.layers.3.1.1.fc1.weight"},
        {"pmid.qformer_perceiver.perceiver_resampler.layers.3.1.3.weight",
         "pmid.qformer_perceiver.perceiver_resampler.layers.3.1.1.fc2.weight"},
        // token projection
        {"pmid.qformer_perceiver.token_proj.0.bias",
         "pmid.qformer_perceiver.token_proj.fc1.bias"},
        {"pmid.qformer_perceiver.token_proj.2.bias",
         "pmid.qformer_perceiver.token_proj.fc2.bias"},
        {"pmid.qformer_perceiver.token_proj.0.weight",
         "pmid.qformer_perceiver.token_proj.fc1.weight"},
        {"pmid.qformer_perceiver.token_proj.2.weight",
         "pmid.qformer_perceiver.token_proj.fc2.weight"},
    };

    const auto hit = renamed.find(name);
    if (hit == renamed.end()) {
        return name;
    }
    return hit->second;
}
// Replaces every '_' in `name` with '.', except underscores that lie inside an
// occurrence of one of the multi-word tokens listed below (e.g. "self_attn",
// "down_blocks"), which are kept as they are.
//
// Each token is searched left to right without overlapping itself; different
// tokens may overlap each other, and an underscore covered by any of them is kept.
std::string convert_sep_to_dot(std::string name) {
    // NOTE(review): "norm_added_q" and "norm_added_v" are listed but "norm_added_k"
    // is not — confirm that this is intentional.
    static const std::vector<std::string> keep_tokens = {
        "self_attn",
        "out_proj",
        "q_proj",
        "k_proj",
        "v_proj",
        "to_k",
        "to_q",
        "to_v",
        "to_out",
        "text_model",
        "down_blocks",
        "mid_block",
        "up_block",
        "proj_in",
        "proj_out",
        "transformer_blocks",
        "single_transformer_blocks",
        "diffusion_model",
        "cond_stage_model",
        "first_stage_model",
        "conv_in",
        "conv_out",
        "lora_down",
        "lora_up",
        "diff_b",
        "hada_w1_a",
        "hada_w1_b",
        "hada_w2_a",
        "hada_w2_b",
        "hada_t1",
        "hada_t2",
        ".lokr_w1",
        ".lokr_w1_a",
        ".lokr_w1_b",
        ".lokr_w2",
        ".lokr_w2_a",
        ".lokr_w2_b",
        "time_emb_proj",
        "conv_shortcut",
        "time_embedding",
        "conv_norm_out",
        "double_blocks",
        "txt_attn",
        "img_attn",
        "input_blocks",
        "output_blocks",
        "middle_block",
        "skip_connection",
        "emb_layers",
        "in_layers",
        "out_layers",
        "add_q_proj",
        "add_k_proj",
        "add_v_proj",
        "add_out_proj",
        "ff_context",
        "norm_added_q",
        "norm_added_v",
        "to_add_out",
    };

    // is_kept[i] != 0  <=>  name[i] is an underscore inside some token occurrence
    std::vector<char> is_kept(name.size(), 0);
    for (const std::string& token : keep_tokens) {
        for (size_t hit = name.find(token); hit != std::string::npos;
             hit        = name.find(token, hit + token.size())) {
            for (size_t k = 0; k < token.size(); ++k) {
                if (token[k] == '_') {
                    is_kept[hit + k] = 1;
                }
            }
        }
    }

    for (size_t i = 0; i < name.size(); ++i) {
        if (name[i] == '_' && !is_kept[i]) {
            name[i] = '.';
        }
    }
    return name;
}
/**
 * Rewrites a tensor name into the naming scheme produced by the converters in this file.
 *
 * Steps, in order:
 *   1. Strip a leading "lora." style prefix and remember that this is a LoRA tensor.
 *   2. LoRA tensors only: rewrite the trailing LoRA suffix (e.g. ".lora_down.weight"
 *      -> ".weight.lora_down"), remove the first ".processor", prepend "transformer."
 *      to names that start with a transformer-block token and, for UNet versions or
 *      "lycoris_" prefixed names, convert separator underscores to dots.
 *   3. Map the leading component prefix ("unet.", "vae.", "te1.", ...) to its
 *      canonical form; only the first matching entry is applied.
 *   4. Run the diffusion-model, cond-stage-model, first-stage-model and pmid converters
 *      on names that carry the corresponding prefix.
 *   5. Strip "control_model." and, for LoRA tensors, put "lora." back in front.
 *
 * @param name     tensor name to convert
 * @param version  model version; forwarded to sd_version_is_unet() and
 *                 convert_diffusion_model_name()
 * @return the converted tensor name
 */
std::string convert_tensor_name(std::string name, SDVersion version) {
    bool is_lora              = false;
    bool is_lycoris_underline = false;

    // First match wins, so the bare "lora." fallback must stay last.
    static const std::vector<std::string> lora_prefix_vec = {
        "lora.lora.",
        "lora.lora_",
        "lora.lycoris_",
        "lora.lycoris.",
        "lora.",
    };
    for (const auto& prefix : lora_prefix_vec) {
        if (starts_with(name, prefix)) {
            is_lora = true;
            name    = name.substr(prefix.size());
            if (contains(prefix, "lycoris_")) {
                is_lycoris_underline = true;
            }
            break;
        }
    }

    // preprocess lora tensor name
    if (is_lora) {
        // No key is a suffix of another key, so at most one entry can match.
        static const std::map<std::string, std::string> lora_suffix_map = {
            {".lora_down.weight", ".weight.lora_down"},
            {".lora_up.weight", ".weight.lora_up"},
            {".lora.down.weight", ".weight.lora_down"},
            {".lora.up.weight", ".weight.lora_up"},
            {"_lora.down.weight", ".weight.lora_down"},
            {"_lora.up.weight", ".weight.lora_up"},
            {".lora_A.weight", ".weight.lora_down"},
            {".lora_B.weight", ".weight.lora_up"},
            {".lora_A.default.weight", ".weight.lora_down"},
            {".lora_B.default.weight", ".weight.lora_up"},
            {".lora_linear", ".weight.alpha"},
            {".alpha", ".weight.alpha"},
            {".scale", ".weight.scale"},
            {".diff", ".weight.diff"},
            {".diff_b", ".bias.diff"},
            {".hada_w1_a", ".weight.hada_w1_a"},
            {".hada_w1_b", ".weight.hada_w1_b"},
            {".hada_w2_a", ".weight.hada_w2_a"},
            {".hada_w2_b", ".weight.hada_w2_b"},
            {".hada_t1", ".weight.hada_t1"},
            {".hada_t2", ".weight.hada_t2"},
            {".lokr_w1", ".weight.lokr_w1"},
            {".lokr_w1_a", ".weight.lokr_w1_a"},
            {".lokr_w1_b", ".weight.lokr_w1_b"},
            {".lokr_w2", ".weight.lokr_w2"},
            {".lokr_w2_a", ".weight.lokr_w2_a"},
            {".lokr_w2_b", ".weight.lokr_w2_b"},
        };
        for (const auto& [old_suffix, new_suffix] : lora_suffix_map) {
            if (ends_with(name, old_suffix)) {
                name.replace(name.size() - old_suffix.size(), old_suffix.size(), new_suffix);
                break;
            }
        }

        // Remove the first ".processor" occurrence, if any.
        static const std::string processor_tag = ".processor";
        const size_t processor_pos             = name.find(processor_tag);
        if (processor_pos != std::string::npos) {
            name.erase(processor_pos, processor_tag.size());
        }

        // Names that start directly with a transformer-block token get an explicit
        // "transformer." component so that the prefix map below can pick them up.
        static const std::vector<std::string> dit_prefix_vec = {
            "transformer_blocks",
            "single_transformer_blocks",
        };
        for (const auto& prefix : dit_prefix_vec) {
            if (starts_with(name, prefix)) {
                name = "transformer." + name;
                break;
            }
        }

        if (sd_version_is_unet(version) || is_lycoris_underline) {
            name = convert_sep_to_dot(name);
        }
    }

    // replace_with_prefix_map() applies only the FIRST entry whose key is a prefix of
    // the name, so a more specific key must come before any shorter key it starts
    // with. "text_encoder.2." therefore has to precede "text_encoder."; in the
    // opposite order it could never match.
    static const std::vector<std::pair<std::string, std::string>> prefix_map = {
        {"diffusion_model.", "model.diffusion_model."},
        {"unet.", "model.diffusion_model."},
        {"transformer.", "model.diffusion_model."},  // dit
        {"vae.", "first_stage_model."},
        {"text_encoder.2.", "cond_stage_model.1.transformer."},
        {"text_encoder.", "cond_stage_model.transformer."},
        {"te.", "cond_stage_model.transformer."},
        {"conditioner.embedders.0.open_clip.", "cond_stage_model."},
        // https://huggingface.co/stabilityai/stable-diffusion-xl-base-1.0
        {"conditioner.embedders.0.", "cond_stage_model."},
        {"conditioner.embedders.1.", "cond_stage_model.1."},
        // {"te2.text_model.encoder.layers.", "cond_stage_model.1.model.transformer.resblocks."},
        {"te2.", "cond_stage_model.1.transformer."},
        {"te1.", "cond_stage_model.transformer."},
    };
    replace_with_prefix_map(name, prefix_map);

    // diffusion model
    {
        static const std::vector<std::string> diffusion_model_prefix_vec = {
            "model.diffusion_model.",
        };
        for (const auto& prefix : diffusion_model_prefix_vec) {
            if (starts_with(name, prefix)) {
                name = convert_diffusion_model_name(name.substr(prefix.size()), prefix, version);
                name = prefix + name;
                break;
            }
        }
    }

    // cond_stage_model ("cond_stage_model.1." must be tested before "cond_stage_model.")
    {
        static const std::vector<std::string> cond_stage_model_prefix_vec = {
            "cond_stage_model.1.",
            "cond_stage_model.",
            "conditioner.embedders.",
            "text_encoders.",
        };
        for (const auto& prefix : cond_stage_model_prefix_vec) {
            if (starts_with(name, prefix)) {
                name = convert_cond_stage_model_name(name.substr(prefix.size()), prefix);
                name = prefix + name;
                break;
            }
        }
    }

    // first_stage_model
    {
        static const std::vector<std::string> first_stage_model_prefix_vec = {
            "first_stage_model.",
            "vae.",
        };
        for (const auto& prefix : first_stage_model_prefix_vec) {
            if (starts_with(name, prefix)) {
                name = convert_first_stage_model_name(name.substr(prefix.size()), prefix);
                name = prefix + name;
                break;
            }
        }
    }

    // pmid
    {
        if (starts_with(name, "pmid.")) {
            name = convert_pmid_name(name);
        }
        if (starts_with(name, "pmid.qformer_perceiver")) {
            name = convert_pmid_v2_name(name);
        }
    }

    // controlnet
    {
        if (starts_with(name, "control_model.")) {  // for controlnet pth models
            // drop everything up to and including the first '.'
            const size_t dot_pos = name.find('.');
            if (dot_pos != std::string::npos) {
                name = name.substr(dot_pos + 1);
            }
        }
    }

    if (is_lora) {
        name = "lora." + name;
    }
    return name;
}