92 lines
4.1 KiB
C++

#ifndef __SD_MODEL_DIFFUSION_SEFI_IMAGE_HPP__
#define __SD_MODEL_DIFFUSION_SEFI_IMAGE_HPP__
#include <memory>
#include "model/common/block.hpp"
namespace SefiImage {
struct SefiImageConfig {
int64_t semantic_channels = 16;
int64_t texture_latent_channels = 32;
int64_t timestep_guidance_in_dim = 256;
int64_t hidden_size = 3072;
float timestep_shift_alpha = 0.3f;
float delta_t = 0.1f;
int64_t packed_texture_channels(int patch_size) const {
return texture_latent_channels * patch_size * patch_size;
}
int64_t packed_input_channels(int patch_size) const {
return semantic_channels + packed_texture_channels(patch_size);
}
static SefiImageConfig detect_from_weights(const String2TensorStorage& tensor_storage_map,
const std::string& prefix) {
SefiImageConfig config;
for (const auto& [name, tensor_storage] : tensor_storage_map) {
if (!starts_with(name, prefix)) {
continue;
}
if (ends_with(name, "dual_time_embed.semantic_embedder.linear_1.weight") && tensor_storage.n_dims == 2) {
config.timestep_guidance_in_dim = tensor_storage.ne[0];
config.hidden_size = tensor_storage.ne[1] * 2;
}
}
LOG_DEBUG("sefi_image: semantic_channels = %" PRId64 ", texture_latent_channels = %" PRId64 ", hidden_size = %" PRId64,
config.semantic_channels,
config.texture_latent_channels,
config.hidden_size);
return config;
}
};
struct SefiTimestepEmbedding : public GGMLBlock {
public:
SefiTimestepEmbedding(int64_t in_channels, int64_t time_embed_dim) {
blocks["linear_1"] = std::shared_ptr<GGMLBlock>(new Linear(in_channels, time_embed_dim, false));
blocks["linear_2"] = std::shared_ptr<GGMLBlock>(new Linear(time_embed_dim, time_embed_dim, false));
}
ggml_tensor* forward(GGMLRunnerContext* ctx, ggml_tensor* sample) {
auto linear_1 = std::dynamic_pointer_cast<Linear>(blocks["linear_1"]);
auto linear_2 = std::dynamic_pointer_cast<Linear>(blocks["linear_2"]);
sample = linear_1->forward(ctx, sample);
sample = ggml_silu_inplace(ctx->ggml_ctx, sample);
sample = linear_2->forward(ctx, sample);
return sample;
}
};
struct SefiDualTimestepEmbeddings : public GGMLBlock {
public:
SefiDualTimestepEmbeddings(int64_t in_channels, int64_t embedding_dim) {
GGML_ASSERT(embedding_dim % 2 == 0);
int64_t half_dim = embedding_dim / 2;
blocks["semantic_embedder"] = std::make_shared<SefiTimestepEmbedding>(in_channels, half_dim);
blocks["texture_embedder"] = std::make_shared<SefiTimestepEmbedding>(in_channels, half_dim);
timestep_guidance_in_dim = in_channels;
}
ggml_tensor* forward(GGMLRunnerContext* ctx,
ggml_tensor* timestep_sem,
ggml_tensor* timestep_tex) {
auto semantic_embedder = std::dynamic_pointer_cast<SefiTimestepEmbedding>(blocks["semantic_embedder"]);
auto texture_embedder = std::dynamic_pointer_cast<SefiTimestepEmbedding>(blocks["texture_embedder"]);
auto sem_proj = ggml_ext_timestep_embedding(ctx->ggml_ctx, timestep_sem, (int)timestep_guidance_in_dim, 10000, 1.f);
auto tex_proj = ggml_ext_timestep_embedding(ctx->ggml_ctx, timestep_tex, (int)timestep_guidance_in_dim, 10000, 1.f);
auto sem_emb = semantic_embedder->forward(ctx, sem_proj);
auto tex_emb = texture_embedder->forward(ctx, tex_proj);
return ggml_concat(ctx->ggml_ctx, sem_emb, tex_emb, 0);
}
private:
int64_t timestep_guidance_in_dim = 256;
};
} // namespace SefiImage
#endif // __SD_MODEL_DIFFUSION_SEFI_IMAGE_HPP__