refactor: move photomaker into generation extension (#1618)

This commit is contained in:
leejet 2026-06-07 22:40:02 +08:00 committed by GitHub
parent 81abfb2548
commit 2a07540c2a
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
6 changed files with 495 additions and 357 deletions

View File

@ -216,6 +216,9 @@ file(GLOB SD_LIB_SOURCES CONFIGURE_DEPENDS
"src/core/*.h"
"src/core/*.cpp"
"src/core/*.hpp"
"src/extensions/*.h"
"src/extensions/*.cpp"
"src/extensions/*.hpp"
"src/model/*/*.h"
"src/model/*/*.cpp"
"src/model/*/*.hpp"

View File

@ -1,6 +1,7 @@
for f in src/*.cpp src/*.h src/*.hpp \
src/conditioning/*.cpp src/conditioning/*.h src/conditioning/*.hpp \
src/core/*.cpp src/core/*.h src/core/*.hpp \
src/extensions/*.cpp src/extensions/*.h src/extensions/*.hpp \
src/runtime/*.cpp src/runtime/*.h src/runtime/*.hpp \
src/model/*/*.cpp src/model/*/*.h src/model/*/*.hpp \
src/tokenizers/*.h src/tokenizers/*.cpp src/tokenizers/vocab/*.h src/tokenizers/vocab/*.cpp \

View File

@ -103,7 +103,6 @@ struct ConditionerParams {
int width = -1;
int height = -1;
bool zero_out_masked = false;
int num_input_imgs = 0; // for photomaker
const std::vector<sd::Tensor<float>>* ref_images = nullptr; // for qwen image edit
};
@ -121,25 +120,16 @@ public:
virtual void set_stream_layers_enabled(bool enabled) {}
virtual void set_flash_attention_enabled(bool enabled) = 0;
virtual void set_weight_adapter(const std::shared_ptr<WeightAdapter>& adapter) {}
virtual std::tuple<SDCondition, std::vector<bool>> get_learned_condition_with_trigger(int n_threads,
const ConditionerParams& conditioner_params) {
GGML_ABORT("Not implemented yet!");
}
virtual std::string remove_trigger_from_prompt(const std::string& prompt) {
GGML_ABORT("Not implemented yet!");
}
};
// ldm.modules.encoders.modules.FrozenCLIPEmbedder
// Ref: https://github.com/AUTOMATIC1111/stable-diffusion-webui/blob/cad87bf4e3e0b0a759afa94e933527c3123d59bc/modules/sd_hijack_clip.py#L283
struct FrozenCLIPEmbedderWithCustomWords : public Conditioner {
SDVersion version = VERSION_SD1;
PMVersion pm_version = PM_VERSION_1;
SDVersion version = VERSION_SD1;
CLIPTokenizer tokenizer;
std::shared_ptr<CLIPTextModelRunner> text_model;
std::shared_ptr<CLIPTextModelRunner> text_model2;
std::string trigger_word = "img"; // should be user settable
std::map<std::string, std::string> embedding_map;
int32_t num_custom_embeddings = 0;
int32_t num_custom_embeddings_2 = 0;
@ -150,9 +140,8 @@ struct FrozenCLIPEmbedderWithCustomWords : public Conditioner {
ggml_backend_t params_backend,
const String2TensorStorage& tensor_storage_map,
const std::map<std::string, std::string>& orig_embedding_map,
SDVersion version = VERSION_SD1,
PMVersion pv = PM_VERSION_1)
: version(version), pm_version(pv), tokenizer(sd_version_is_sd2(version) ? 0 : 49407) {
SDVersion version = VERSION_SD1)
: version(version), tokenizer(sd_version_is_sd2(version) ? 0 : 49407) {
for (const auto& kv : orig_embedding_map) {
std::string name = kv.first;
std::transform(name.begin(), name.end(), name.begin(), [](unsigned char c) { return std::tolower(c); });
@ -329,121 +318,6 @@ struct FrozenCLIPEmbedderWithCustomWords : public Conditioner {
return tokenizer.decode(tokens);
}
std::tuple<std::vector<int>, std::vector<float>, std::vector<bool>>
tokenize_with_trigger_token(std::string text,
int num_input_imgs,
int32_t image_token) {
auto parsed_attention = parse_prompt_attention(text);
{
std::stringstream ss;
ss << "[";
for (const auto& item : parsed_attention) {
ss << "['" << item.first << "', " << item.second << "], ";
}
ss << "]";
LOG_DEBUG("parse '%s' to %s", text.c_str(), ss.str().c_str());
}
auto on_new_token_cb = [&](std::string& str, std::vector<int32_t>& bpe_tokens) -> bool {
auto iter = embedding_map.find(str);
if (iter == embedding_map.end()) {
return false;
}
std::string embedding_path = iter->second;
if (load_embedding(str, embedding_path, bpe_tokens)) {
return true;
}
return false;
};
std::vector<int> tokens;
std::vector<float> weights;
std::vector<bool> class_token_mask;
int32_t class_idx = -1, tokens_acc = 0;
for (const auto& item : parsed_attention) {
std::vector<int> class_token_index;
std::vector<int> clean_input_ids;
const std::string& curr_text = item.first;
float curr_weight = item.second;
// printf(" %s: %f \n", curr_text.c_str(), curr_weight);
int32_t clean_index = 0;
if (curr_text == "BREAK" && curr_weight == -1.0f) {
// Pad token array up to chunk size at this point.
// TODO: This is a hardcoded chunk_len, like in stable-diffusion.cpp, make it a parameter for the future?
// Also, this is 75 instead of 77 to leave room for BOS and EOS tokens.
int padding_size = 75 - (tokens_acc % 75);
for (int j = 0; j < padding_size; j++) {
clean_input_ids.push_back(tokenizer.EOS_TOKEN_ID);
clean_index++;
}
// After padding, continue to the next iteration to process the following text as a new segment
tokens.insert(tokens.end(), clean_input_ids.begin(), clean_input_ids.end());
weights.insert(weights.end(), padding_size, curr_weight);
continue;
}
// Regular token, process normally
std::vector<int> curr_tokens = tokenizer.encode(curr_text, on_new_token_cb);
for (uint32_t i = 0; i < curr_tokens.size(); i++) {
int token_id = curr_tokens[i];
if (token_id == image_token) {
class_token_index.push_back(clean_index - 1);
} else {
clean_input_ids.push_back(token_id);
clean_index++;
}
}
// GGML_ASSERT(class_token_index.size() == 1); // PhotoMaker currently does not support multiple
// trigger words in a single prompt.
if (class_token_index.size() == 1) {
// Expand the class word token and corresponding mask
int class_token = clean_input_ids[class_token_index[0]];
class_idx = tokens_acc + class_token_index[0];
std::vector<int> clean_input_ids_tmp;
for (int i = 0; i < class_token_index[0]; i++)
clean_input_ids_tmp.push_back(clean_input_ids[i]);
for (int i = 0; i < (pm_version == PM_VERSION_2 ? 2 * num_input_imgs : num_input_imgs); i++)
clean_input_ids_tmp.push_back(class_token);
for (int i = class_token_index[0] + 1; i < clean_input_ids.size(); i++)
clean_input_ids_tmp.push_back(clean_input_ids[i]);
clean_input_ids.clear();
clean_input_ids = clean_input_ids_tmp;
}
tokens_acc += clean_index;
tokens.insert(tokens.end(), clean_input_ids.begin(), clean_input_ids.end());
weights.insert(weights.end(), clean_input_ids.size(), curr_weight);
}
// BUG!! double couting, pad_tokens will add BOS at the beginning
// tokens.insert(tokens.begin(), tokenizer.BOS_TOKEN_ID);
// weights.insert(weights.begin(), 1.0);
tokenizer.pad_tokens(tokens, &weights, nullptr, text_model->model.n_token, text_model->model.n_token, true);
int offset = pm_version == PM_VERSION_2 ? 2 * num_input_imgs : num_input_imgs;
for (int i = 0; i < tokens.size(); i++) {
// if (class_idx + 1 <= i && i < class_idx + 1 + 2*num_input_imgs) // photomaker V2 has num_tokens(=2)*num_input_imgs
if (class_idx + 1 <= i && i < class_idx + 1 + offset) // photomaker V2 has num_tokens(=2)*num_input_imgs
// hardcode for now
class_token_mask.push_back(true);
else
class_token_mask.push_back(false);
}
// printf("[");
// for (int i = 0; i < tokens.size(); i++) {
// printf("%d, ", class_token_mask[i] ? 1 : 0);
// }
// printf("]\n");
// for (int i = 0; i < tokens.size(); i++) {
// std::cout << tokens[i] << ":" << weights[i] << ", ";
// }
// std::cout << std::endl;
return std::make_tuple(tokens, weights, class_token_mask);
}
std::pair<std::vector<int>, std::vector<float>> tokenize(std::string text,
size_t min_length = 0,
size_t max_length = 0,
@ -631,49 +505,6 @@ struct FrozenCLIPEmbedderWithCustomWords : public Conditioner {
return result;
}
std::tuple<SDCondition, std::vector<bool>>
get_learned_condition_with_trigger(int n_threads,
const ConditionerParams& conditioner_params) override {
auto image_tokens = convert_token_to_id(trigger_word);
// if(image_tokens.size() == 1){
// printf(" image token id is: %d \n", image_tokens[0]);
// }
GGML_ASSERT(image_tokens.size() == 1);
auto tokens_and_weights = tokenize_with_trigger_token(conditioner_params.text,
conditioner_params.num_input_imgs,
image_tokens[0]);
std::vector<int>& tokens = std::get<0>(tokens_and_weights);
std::vector<float>& weights = std::get<1>(tokens_and_weights);
std::vector<bool>& clsm = std::get<2>(tokens_and_weights);
// printf("tokens: \n");
// for(int i = 0; i < tokens.size(); ++i)
// printf("%d ", tokens[i]);
// printf("\n");
// printf("clsm: \n");
// for(int i = 0; i < clsm.size(); ++i)
// printf("%d ", clsm[i]?1:0);
// printf("\n");
auto cond = get_learned_condition_common(n_threads,
tokens,
weights,
conditioner_params.clip_skip,
conditioner_params.width,
conditioner_params.height,
conditioner_params.zero_out_masked);
return std::make_tuple(cond, clsm);
}
std::string remove_trigger_from_prompt(const std::string& prompt) override {
auto image_tokens = convert_token_to_id(trigger_word);
GGML_ASSERT(image_tokens.size() == 1);
auto tokens_and_weights = tokenize(prompt);
std::vector<int>& tokens = tokens_and_weights.first;
auto it = std::find(tokens.begin(), tokens.end(), image_tokens[0]);
GGML_ASSERT(it != tokens.end()); // prompt must have trigger word
tokens.erase(it);
return decode(tokens);
}
SDCondition get_learned_condition(int n_threads,
const ConditionerParams& conditioner_params) override {
auto tokens_and_weights = tokenize(conditioner_params.text, text_model->model.n_token, text_model->model.n_token, true);

View File

@ -0,0 +1,73 @@
#ifndef __SD_EXTENSIONS_GENERATION_EXTENSION_H__
#define __SD_EXTENSIONS_GENERATION_EXTENSION_H__
#include <functional>
#include <map>
#include <memory>
#include <set>
#include <string>
#include "conditioning/conditioner.hpp"
#include "core/ggml_extend_backend.h"
#include "model.h"
#include "stable-diffusion.h"
// Inputs handed to GenerationExtension::init() while the owning context is being set up.
// The struct is aggregate-initialised positionally by the caller, so the member order is part
// of its contract. It stores references and std::function callbacks, so it must not outlive
// the objects it refers to.
struct GenerationExtensionInitContext {
// User-supplied context parameters (the PhotoMaker extension reads photo_maker_path from it).
const sd_ctx_params_t* params;
// Model version of the pipeline being constructed.
SDVersion version;
// Tensor storage map forwarded to model constructors created by the extension.
const String2TensorStorage& tensor_storage_map;
// Loader the extension can register additional model files with.
ModelLoader& model_loader;
// Thread count forwarded to loading routines.
int n_threads;
// Callback for the given backend module; the PhotoMaker extension fails init() when it returns false.
std::function<bool(SDBackendModule)> ensure_backend_pair;
// Returns the backend handle passed as the first backend argument of model constructors.
std::function<ggml_backend_t(SDBackendModule)> backend_for;
// Returns the backend handle passed as the params backend argument of model constructors.
std::function<ggml_backend_t(SDBackendModule)> params_backend_for;
};
// Output sinks handed to GenerationExtension::collect_param_tensors().
// Aggregate-initialised positionally by the caller; keep the member order stable.
struct GenerationExtensionTensorContext {
// Name -> tensor map the extension adds its parameter tensors to.
std::map<std::string, ggml_tensor*>& tensors;
// Second name -> tensor map; the PhotoMaker extension fills it only when module_can_mmap()
// returns true for its backend module.
std::map<std::string, ggml_tensor*>& mmap_able_tensors;
// Query deciding whether tensors of the given backend module also go into mmap_able_tensors.
std::function<bool(SDBackendModule)> module_can_mmap;
};
// Per-request state handed to GenerationExtension::prepare_condition().
// Aggregate-initialised positionally by the caller; keep the member order stable.
struct GenerationExtensionConditionContext {
// Non-owning pointer to the active text conditioner (the caller passes cond_stage_model.get()).
Conditioner* conditioner;
// Mutable conditioning parameters; an extension may rewrite them (PhotoMaker replaces `text`
// with the prompt stripped of its trigger word).
ConditionerParams& condition_params;
// PhotoMaker request parameters (ID images, optional ID embeds path, style strength).
const sd_pm_params_t& pm_params;
// Name -> tensor map of the loaded model, forwarded to LoRA application.
std::map<std::string, ggml_tensor*>& tensors;
// Model version of the running pipeline.
SDVersion version;
// Thread count forwarded to compute routines.
int n_threads;
// Number of sampling steps of the request; PhotoMaker derives its switch-over step from it.
int total_steps;
// When true, extensions release parameter buffers as soon as they are done with them.
bool free_params_immediately;
};
// Interface for optional features that hook into model loading and conditioning.
// Every hook except name() has a no-op default, so an implementation overrides only
// the stages it takes part in.
struct GenerationExtension {
virtual ~GenerationExtension() = default;
// Short identifier of the extension; the caller uses it in log messages.
virtual const char* name() const = 0;
// Whether the extension is active. The caller registers an extension only when this
// returns true after init().
virtual bool is_enabled() const {
return false;
}
// One-time setup. Returning false makes the caller abort its own initialisation.
virtual bool init(const GenerationExtensionInitContext&) {
return true;
}
// Adds the extension's parameter tensors to the maps referenced by the context.
virtual void collect_param_tensors(GenerationExtensionTensorContext&) {}
// Adds tensor-name entries the model loader should ignore
// (NOTE(review): entries look like name prefixes, e.g. "pmid.unet." - confirm against the loader).
virtual void add_ignore_tensors(std::set<std::string>&) const {}
// Allocates the extension's parameter buffer; false is treated as a fatal load error by the caller.
virtual bool alloc_params_buffer() {
return true;
}
// Size in bytes of the extension's parameter buffer; the caller sums it over all extensions.
virtual size_t get_params_buffer_size() const {
return 0;
}
// Drops any per-request state left over from a previous generation.
virtual void reset_runtime_condition() {}
// Computes per-request state. The PhotoMaker implementation returns true only when it
// prepared a replacement condition; the default does nothing and returns false.
virtual bool prepare_condition(GenerationExtensionConditionContext&) {
return false;
}
// Chooses the positive condition for sampling step `step`. Implementations return the
// very same `condition` reference when they do not want to replace it: the caller
// detects a replacement by comparing addresses.
virtual const SDCondition& before_condition(int step,
const SDCondition& condition) const {
return condition;
}
};
// Factory for the PhotoMaker extension. The returned object reports is_enabled() == false
// until its init() has loaded a PhotoMaker model.
std::shared_ptr<GenerationExtension> create_photomaker_extension();
#endif

View File

@ -0,0 +1,325 @@
#include "extensions/generation_extension.h"
#include <algorithm>
#include <cstring>
#include <tuple>
#include <utility>
#include "core/tensor_ggml.hpp"
#include "core/util.h"
#include "model/adapter/lora.hpp"
#include "model/adapter/pmid.hpp"
// Tokenizes `text` for PhotoMaker.
//
// The trigger token (`image_token`) is dropped from the sequence and the token right before
// it (the "class" token) is repeated so that it occupies `trigger_token_count` slots. The
// sequence is then padded by the CLIP tokenizer.
//
// Returns (tokens, weights, class_token_mask). The mask has one entry per padded token and is
// true exactly on the slots occupied by the repeated class token. If the prompt contains no
// usable trigger (no trigger token, or the trigger is the very first token so there is no
// class token to repeat), the mask is all false so callers can detect that case.
// When several triggers are present, each class token is expanded but only the last one is
// reflected in the mask.
static std::tuple<std::vector<int>, std::vector<float>, std::vector<bool>>
tokenize_photomaker_trigger(FrozenCLIPEmbedderWithCustomWords& clip_conditioner,
                            const std::string& text,
                            int trigger_token_count,
                            int32_t image_token) {
    auto tokens_and_weights           = clip_conditioner.tokenize(text);
    std::vector<int> source_tokens    = std::move(tokens_and_weights.first);
    std::vector<float> source_weights = std::move(tokens_and_weights.second);

    // Work on the bare prompt tokens: BOS/EOS are stripped here, padding happens below.
    if (!source_tokens.empty() && source_tokens.front() == clip_conditioner.tokenizer.BOS_TOKEN_ID) {
        source_tokens.erase(source_tokens.begin());
        source_weights.erase(source_weights.begin());
    }
    if (!source_tokens.empty() && source_tokens.back() == clip_conditioner.tokenizer.EOS_TOKEN_ID) {
        source_tokens.pop_back();
        source_weights.pop_back();
    }

    std::vector<int> tokens;
    std::vector<float> weights;
    tokens.reserve(source_tokens.size());
    weights.reserve(source_tokens.size());

    // Index (in the unpadded sequence) of the class token; -1 while no trigger was expanded.
    int32_t class_idx = -1;
    for (size_t i = 0; i < source_tokens.size(); i++) {
        const int token = source_tokens[i];
        if (token == image_token) {
            if (!tokens.empty()) {
                class_idx                = static_cast<int32_t>(tokens.size()) - 1;
                const int class_token    = tokens.back();
                const float class_weight = weights.back();
                // The class token is already present once; add the remaining copies.
                for (int j = 1; j < trigger_token_count; j++) {
                    tokens.push_back(class_token);
                    weights.push_back(class_weight);
                }
            }
            continue;  // the trigger token itself is never emitted
        }
        tokens.push_back(token);
        weights.push_back(source_weights[i]);
    }

    clip_conditioner.tokenizer.pad_tokens(tokens,
                                          &weights,
                                          nullptr,
                                          clip_conditioner.text_model->model.n_token,
                                          clip_conditioner.text_model->model.n_token,
                                          true);

    // Mark the class-token slots. The +1 shifts from the unpadded index to the padded
    // sequence (presumably pad_tokens prepends BOS - confirm against the tokenizer).
    // Without a trigger nothing is marked; the unguarded range test would otherwise
    // flag the first `trigger_token_count` tokens because class_idx + 1 == 0.
    std::vector<bool> class_token_mask(tokens.size(), false);
    if (class_idx >= 0 && trigger_token_count > 0) {
        const size_t mask_begin = static_cast<size_t>(class_idx) + 1;
        const size_t mask_end   = std::min(tokens.size(), mask_begin + static_cast<size_t>(trigger_token_count));
        for (size_t i = mask_begin; i < mask_end; i++) {
            class_token_mask[i] = true;
        }
    }
    return std::make_tuple(std::move(tokens), std::move(weights), std::move(class_token_mask));
}
// Encodes the prompt in `conditioner_params` with the PhotoMaker trigger expanded.
// Returns the text condition together with the per-token mask of class-token slots.
// The trigger word has to map to exactly one token id.
static std::tuple<SDCondition, std::vector<bool>>
get_photomaker_condition_with_trigger(FrozenCLIPEmbedderWithCustomWords& clip_conditioner,
                                      int n_threads,
                                      const ConditionerParams& conditioner_params,
                                      const std::string& trigger_word,
                                      int trigger_token_count) {
    auto image_tokens = clip_conditioner.convert_token_to_id(trigger_word);
    GGML_ASSERT(image_tokens.size() == 1);

    // Tokens, weights and class-token mask for the expanded prompt.
    auto [prompt_tokens, prompt_weights, class_mask] =
        tokenize_photomaker_trigger(clip_conditioner,
                                    conditioner_params.text,
                                    trigger_token_count,
                                    image_tokens[0]);

    auto text_condition = clip_conditioner.get_learned_condition_common(n_threads,
                                                                        prompt_tokens,
                                                                        prompt_weights,
                                                                        conditioner_params.clip_skip,
                                                                        conditioner_params.width,
                                                                        conditioner_params.height,
                                                                        conditioner_params.zero_out_masked);
    return std::make_tuple(std::move(text_condition), class_mask);
}
// Returns `prompt` with the first occurrence of the trigger token removed.
// The prompt is tokenized, the trigger id is erased once, and the remaining tokens are
// decoded back to text, so the result is the tokenizer's rendering of the prompt rather
// than a character-level edit of the input string.
// Preconditions (enforced with GGML_ASSERT): the trigger word maps to exactly one token id
// and that id occurs in the tokenized prompt.
static std::string remove_photomaker_trigger_from_prompt(FrozenCLIPEmbedderWithCustomWords& clip_conditioner,
const std::string& prompt,
const std::string& trigger_word) {
auto image_tokens = clip_conditioner.convert_token_to_id(trigger_word);
GGML_ASSERT(image_tokens.size() == 1);
auto tokens_and_weights = clip_conditioner.tokenize(prompt);
std::vector<int>& tokens = tokens_and_weights.first;
// Only the first match is removed; later occurrences of the trigger stay in the prompt.
auto it = std::find(tokens.begin(), tokens.end(), image_tokens[0]);
GGML_ASSERT(it != tokens.end());
tokens.erase(it);
return clip_conditioner.decode(tokens);
}
struct PhotoMakerExtension : public GenerationExtension {
std::shared_ptr<PhotoMakerIDEncoder> pmid_model;
std::shared_ptr<LoraModel> pmid_lora;
bool enabled = false;
std::string model_path;
std::string trigger_word = "img";
SDCondition id_condition;
int start_merge_step = -1;
const char* name() const override {
return "photomaker";
}
bool is_enabled() const override {
return enabled;
}
bool init(const GenerationExtensionInitContext& ctx) override {
model_path = SAFE_STR(ctx.params->photo_maker_path);
if (model_path.empty()) {
return true;
}
if (!ctx.ensure_backend_pair(SDBackendModule::PHOTOMAKER)) {
return false;
}
PMVersion pm_version = std::strstr(model_path.c_str(), "v2") != nullptr ? PM_VERSION_2 : PM_VERSION_1;
pmid_model = std::make_shared<PhotoMakerIDEncoder>(ctx.backend_for(SDBackendModule::PHOTOMAKER),
ctx.params_backend_for(SDBackendModule::PHOTOMAKER),
ctx.tensor_storage_map,
"pmid",
ctx.version,
pm_version);
if (pm_version == PM_VERSION_2) {
LOG_INFO("using PhotoMaker Version 2");
}
pmid_lora = std::make_shared<LoraModel>("pmid",
ctx.backend_for(SDBackendModule::PHOTOMAKER),
ctx.params_backend_for(SDBackendModule::PHOTOMAKER),
model_path,
"",
ctx.version);
auto lora_tensor_filter = [&](const std::string& tensor_name) {
return starts_with(tensor_name, "lora.model");
};
if (!pmid_lora->load_from_file(ctx.n_threads, lora_tensor_filter)) {
LOG_WARN("load photomaker lora tensors from %s failed", model_path.c_str());
return false;
}
LOG_INFO("loading stacked ID embedding (PHOTOMAKER) model file from '%s'", model_path.c_str());
if (!ctx.model_loader.init_from_file_and_convert_name(model_path, "pmid.")) {
LOG_WARN("loading stacked ID embedding from '%s' failed", model_path.c_str());
return true;
}
enabled = true;
return true;
}
void collect_param_tensors(GenerationExtensionTensorContext& ctx) override {
if (!enabled || pmid_model == nullptr) {
return;
}
std::map<std::string, ggml_tensor*> temp;
pmid_model->get_param_tensors(temp, "pmid");
bool do_mmap = ctx.module_can_mmap(SDBackendModule::PHOTOMAKER);
for (const auto& [key, tensor] : temp) {
ctx.tensors[key] = tensor;
if (do_mmap) {
ctx.mmap_able_tensors[key] = tensor;
}
}
}
void add_ignore_tensors(std::set<std::string>& ignore_tensors) const override {
if (!enabled) {
return;
}
ignore_tensors.insert("pmid.unet.");
}
bool alloc_params_buffer() override {
if (!enabled || pmid_model == nullptr) {
return true;
}
return pmid_model->alloc_params_buffer();
}
size_t get_params_buffer_size() const override {
if (!enabled || pmid_model == nullptr) {
return 0;
}
return pmid_model->get_params_buffer_size();
}
void reset_runtime_condition() override {
id_condition = {};
start_merge_step = -1;
}
bool prepare_condition(GenerationExtensionConditionContext& ctx) override {
reset_runtime_condition();
if (!enabled || pmid_model == nullptr || pmid_lora == nullptr) {
return false;
}
if (!pmid_lora->applied) {
int64_t t0 = ggml_time_ms();
pmid_lora->apply(ctx.tensors, ctx.version, ctx.n_threads);
int64_t t1 = ggml_time_ms();
pmid_lora->applied = true;
LOG_INFO("pmid_lora apply completed, taking %.2fs", (t1 - t0) * 1.0f / 1000);
if (ctx.free_params_immediately) {
pmid_lora->free_params_buffer();
}
}
bool pmv2 = pmid_model->get_version() == PM_VERSION_2;
if (ctx.pm_params.id_images_count <= 0 || ctx.pm_params.id_images == nullptr) {
LOG_WARN("Provided PhotoMaker model file, but NO input ID images");
LOG_WARN("Turn off PhotoMaker for this request");
return false;
}
auto* clip_conditioner = dynamic_cast<FrozenCLIPEmbedderWithCustomWords*>(ctx.conditioner);
if (clip_conditioner == nullptr) {
LOG_WARN("PhotoMaker requires FrozenCLIPEmbedderWithCustomWords conditioner");
LOG_WARN("Turn off PhotoMaker for this request");
return false;
}
int clip_image_size = 224;
pmid_model->style_strength = ctx.pm_params.style_strength;
sd::Tensor<float> id_image_tensor;
for (int i = 0; i < ctx.pm_params.id_images_count; i++) {
auto id_image = sd_image_to_tensor(ctx.pm_params.id_images[i]);
auto processed_id_image = clip_preprocess(id_image, clip_image_size, clip_image_size);
if (id_image_tensor.empty()) {
id_image_tensor = processed_id_image;
} else {
id_image_tensor = sd::ops::concat(id_image_tensor, processed_id_image, 3);
}
}
int64_t t0 = ggml_time_ms();
int trigger_token_count = pmv2 ? 2 * ctx.pm_params.id_images_count : ctx.pm_params.id_images_count;
auto cond_tup = get_photomaker_condition_with_trigger(*clip_conditioner,
ctx.n_threads,
ctx.condition_params,
trigger_word,
trigger_token_count);
SDCondition prepared_id_condition = std::get<0>(cond_tup);
auto class_tokens_mask = std::get<1>(cond_tup);
if (std::find(class_tokens_mask.begin(), class_tokens_mask.end(), true) == class_tokens_mask.end()) {
LOG_WARN("PhotoMaker trigger word '%s' was not found in prompt", trigger_word.c_str());
LOG_WARN("Turn off PhotoMaker for this request");
return false;
}
sd::Tensor<float> id_embeds;
if (pmv2 && ctx.pm_params.id_embed_path != nullptr) {
try {
id_embeds = sd::load_tensor_from_file_as_tensor<float>(ctx.pm_params.id_embed_path);
} catch (const std::exception&) {
id_embeds = {};
}
}
if (pmv2 && id_embeds.empty()) {
LOG_WARN("Provided PhotoMaker images, but NO valid ID embeds file for PM v2");
LOG_WARN("Turn off PhotoMaker for this request");
return false;
}
if (pmv2 && ctx.pm_params.id_images_count != id_embeds.shape()[1]) {
LOG_WARN("PhotoMaker image count (%d) does NOT match ID embeds (%d). You should run face_detect.py again.",
ctx.pm_params.id_images_count,
static_cast<int>(id_embeds.shape()[1]));
LOG_WARN("Turn off PhotoMaker for this request");
return false;
}
auto res = pmid_model->compute(ctx.n_threads,
id_image_tensor,
prepared_id_condition.c_crossattn,
id_embeds,
class_tokens_mask);
if (res.empty()) {
LOG_ERROR("Photomaker ID Stacking failed");
LOG_WARN("Turn off PhotoMaker for this request");
return false;
}
prepared_id_condition.c_crossattn = std::move(res);
int64_t t1 = ggml_time_ms();
id_condition = std::move(prepared_id_condition);
start_merge_step = int(ctx.pm_params.style_strength / 100.f * ctx.total_steps);
ctx.condition_params.text = remove_photomaker_trigger_from_prompt(*clip_conditioner,
ctx.condition_params.text,
trigger_word);
LOG_INFO("Photomaker ID Stacking, taking %" PRId64 " ms", t1 - t0);
LOG_INFO("PHOTOMAKER: start_merge_step: %d", start_merge_step);
if (ctx.free_params_immediately) {
pmid_model->free_params_buffer();
}
return true;
}
const SDCondition& before_condition(int step,
const SDCondition& condition) const override {
if (!id_condition.empty() && start_merge_step != -1 && step > start_merge_step) {
return id_condition;
}
return condition;
}
};
// Factory: hands out a fresh PhotoMaker extension behind the generic extension interface.
// The object still has to be initialised through init() before it reports itself as enabled.
std::shared_ptr<GenerationExtension> create_photomaker_extension() {
    std::shared_ptr<GenerationExtension> extension = std::make_shared<PhotoMakerExtension>();
    return extension;
}

View File

@ -1,6 +1,7 @@
#include <algorithm>
#include <cmath>
#include <cstdlib>
#include <set>
#include "core/ggml_extend.hpp"
#include "core/ggml_graph_cut.h"
@ -13,8 +14,8 @@
#include "stable-diffusion.h"
#include "conditioning/conditioner.hpp"
#include "extensions/generation_extension.h"
#include "model/adapter/lora.hpp"
#include "model/adapter/pmid.hpp"
#include "model/diffusion/anima.hpp"
#include "model/diffusion/control.hpp"
#include "model/diffusion/ernie_image.hpp"
@ -180,9 +181,7 @@ public:
std::shared_ptr<VAE> preview_vae;
std::shared_ptr<LTXV::LTXAudioVAERunner> audio_vae_model;
std::shared_ptr<ControlNet> control_net;
std::shared_ptr<PhotoMakerIDEncoder> pmid_model;
std::shared_ptr<LoraModel> pmid_lora;
std::shared_ptr<PhotoMakerIDEmbed> pmid_id_embeds;
std::vector<std::shared_ptr<GenerationExtension>> generation_extensions;
std::vector<std::shared_ptr<LoraModel>> cond_stage_lora_models;
std::vector<std::shared_ptr<LoraModel>> diffusion_lora_models;
std::vector<std::shared_ptr<LoraModel>> first_stage_lora_models;
@ -193,7 +192,6 @@ public:
bool offload_params_to_cpu = false;
float max_vram = 0.f;
bool stream_layers = false;
bool use_pmid = false;
std::string backend_spec;
std::string params_backend_spec;
@ -743,21 +741,12 @@ public:
for (uint32_t i = 0; i < sd_ctx_params->embedding_count; i++) {
embbeding_map.emplace(SAFE_STR(sd_ctx_params->embeddings[i].name), SAFE_STR(sd_ctx_params->embeddings[i].path));
}
if (strstr(SAFE_STR(sd_ctx_params->photo_maker_path), "v2")) {
cond_stage_model = std::make_shared<FrozenCLIPEmbedderWithCustomWords>(backend_for(SDBackendModule::TE),
params_backend_for(SDBackendModule::TE),
tensor_storage_map,
embbeding_map,
version,
PM_VERSION_2);
} else {
cond_stage_model = std::make_shared<FrozenCLIPEmbedderWithCustomWords>(backend_for(SDBackendModule::TE),
params_backend_for(SDBackendModule::TE),
tensor_storage_map,
embbeding_map,
version);
}
diffusion_model = std::make_shared<UNetModelRunner>(backend_for(SDBackendModule::DIFFUSION),
cond_stage_model = std::make_shared<FrozenCLIPEmbedderWithCustomWords>(backend_for(SDBackendModule::TE),
params_backend_for(SDBackendModule::TE),
tensor_storage_map,
embbeding_map,
version);
diffusion_model = std::make_shared<UNetModelRunner>(backend_for(SDBackendModule::DIFFUSION),
params_backend_for(SDBackendModule::DIFFUSION),
tensor_storage_map,
"model.diffusion_model",
@ -914,50 +903,35 @@ public:
}
}
if (strlen(SAFE_STR(sd_ctx_params->photo_maker_path)) > 0) {
if (!ensure_backend_pair(SDBackendModule::PHOTOMAKER)) {
return false;
}
if (strstr(SAFE_STR(sd_ctx_params->photo_maker_path), "v2")) {
pmid_model = std::make_shared<PhotoMakerIDEncoder>(backend_for(SDBackendModule::PHOTOMAKER),
params_backend_for(SDBackendModule::PHOTOMAKER),
tensor_storage_map,
"pmid",
version,
PM_VERSION_2);
LOG_INFO("using PhotoMaker Version 2");
} else {
pmid_model = std::make_shared<PhotoMakerIDEncoder>(backend_for(SDBackendModule::PHOTOMAKER),
params_backend_for(SDBackendModule::PHOTOMAKER),
tensor_storage_map,
"pmid",
version);
}
pmid_lora = std::make_shared<LoraModel>("pmid",
backend_for(SDBackendModule::PHOTOMAKER),
params_backend_for(SDBackendModule::PHOTOMAKER),
sd_ctx_params->photo_maker_path,
"",
version);
auto lora_tensor_filter = [&](const std::string& tensor_name) {
if (starts_with(tensor_name, "lora.model")) {
return true;
}
return false;
{
generation_extensions.clear();
auto photomaker_extension = create_photomaker_extension();
GenerationExtensionInitContext extension_ctx{
sd_ctx_params,
version,
tensor_storage_map,
model_loader,
n_threads,
[this](SDBackendModule module) { return ensure_backend_pair(module); },
[this](SDBackendModule module) { return backend_for(module); },
[this](SDBackendModule module) { return params_backend_for(module); },
};
if (!pmid_lora->load_from_file(n_threads, lora_tensor_filter)) {
LOG_WARN("load photomaker lora tensors from %s failed", sd_ctx_params->photo_maker_path);
if (!photomaker_extension->init(extension_ctx)) {
return false;
}
LOG_INFO("loading stacked ID embedding (PHOTOMAKER) model file from '%s'", sd_ctx_params->photo_maker_path);
if (!model_loader.init_from_file_and_convert_name(sd_ctx_params->photo_maker_path, "pmid.")) {
LOG_WARN("loading stacked ID embedding from '%s' failed", sd_ctx_params->photo_maker_path);
} else {
use_pmid = true;
if (photomaker_extension->is_enabled()) {
generation_extensions.push_back(photomaker_extension);
}
}
if (use_pmid) {
get_param_tensors_p(pmid_model, module_can_mmap(SDBackendModule::PHOTOMAKER), "pmid");
{
GenerationExtensionTensorContext extension_tensor_ctx{
tensors,
mmap_able_tensors,
module_can_mmap,
};
for (auto& extension : generation_extensions) {
extension->collect_param_tensors(extension_tensor_ctx);
}
}
if (sd_ctx_params->flash_attn) {
@ -1011,8 +985,8 @@ public:
if (use_tae && !tae_preview_only) {
ignore_tensors.insert("first_stage_model.");
}
if (use_pmid) {
ignore_tensors.insert("pmid.unet.");
for (auto& extension : generation_extensions) {
extension->add_ignore_tensors(ignore_tensors);
}
ignore_tensors.insert("model.diffusion_model.__x0__");
ignore_tensors.insert("model.diffusion_model.__32x32__");
@ -1099,10 +1073,12 @@ public:
ggml_free(ctx);
return false;
}
if (use_pmid && pmid_model && !pmid_model->alloc_params_buffer()) {
LOG_ERROR("PhotoMaker params buffer allocation failed");
ggml_free(ctx);
return false;
for (auto& extension : generation_extensions) {
if (!extension->alloc_params_buffer()) {
LOG_ERROR("%s params buffer allocation failed", extension->name());
ggml_free(ctx);
return false;
}
}
bool success = model_loader.load_tensors(tensors, ignore_tensors, n_threads, sd_ctx_params->enable_mmap);
@ -1136,9 +1112,9 @@ public:
}
control_net_params_mem_size = control_net->get_params_buffer_size();
}
size_t pmid_params_mem_size = 0;
if (use_pmid) {
pmid_params_mem_size = pmid_model->get_params_buffer_size();
size_t extension_params_mem_size = 0;
for (auto& extension : generation_extensions) {
extension_params_mem_size += extension->get_params_buffer_size();
}
size_t total_params_ram_size = 0;
@ -1170,7 +1146,7 @@ public:
};
if (!add_params_memory(clip_params_mem_size, SDBackendModule::TE) ||
!add_params_memory(pmid_params_mem_size, SDBackendModule::PHOTOMAKER) ||
!add_params_memory(extension_params_mem_size, SDBackendModule::PHOTOMAKER) ||
!add_params_memory(unet_params_mem_size, SDBackendModule::DIFFUSION) ||
!add_params_memory(vae_params_mem_size, SDBackendModule::VAE) ||
!add_params_memory(control_net_params_mem_size, SDBackendModule::CONTROL_NET)) {
@ -1181,7 +1157,7 @@ public:
size_t total_params_size = total_params_ram_size + total_params_vram_size;
LOG_INFO(
"total params memory size = %.2fMB (VRAM %.2fMB, RAM %.2fMB): "
"text_encoders %.2fMB(%s), diffusion_model %.2fMB(%s), vae %.2fMB(%s), controlnet %.2fMB(%s), pmid %.2fMB(%s)",
"text_encoders %.2fMB(%s), diffusion_model %.2fMB(%s), vae %.2fMB(%s), controlnet %.2fMB(%s), extensions %.2fMB(%s)",
total_params_size / 1024.0 / 1024.0,
total_params_vram_size / 1024.0 / 1024.0,
total_params_ram_size / 1024.0 / 1024.0,
@ -1193,8 +1169,8 @@ public:
params_memory_location(vae_params_mem_size, SDBackendModule::VAE),
control_net_params_mem_size / 1024.0 / 1024.0,
params_memory_location(control_net_params_mem_size, SDBackendModule::CONTROL_NET),
pmid_params_mem_size / 1024.0 / 1024.0,
params_memory_location(pmid_params_mem_size, SDBackendModule::PHOTOMAKER));
extension_params_mem_size / 1024.0 / 1024.0,
params_memory_location(extension_params_mem_size, SDBackendModule::PHOTOMAKER));
}
// init denoiser
@ -1599,88 +1575,30 @@ public:
}
}
SDCondition get_pmid_conditon(sd_pm_params_t pm_params,
ConditionerParams& condition_params) {
SDCondition id_cond;
if (use_pmid) {
if (!pmid_lora->applied) {
int64_t t0 = ggml_time_ms();
pmid_lora->apply(tensors, version, n_threads);
int64_t t1 = ggml_time_ms();
pmid_lora->applied = true;
LOG_INFO("pmid_lora apply completed, taking %.2fs", (t1 - t0) * 1.0f / 1000);
if (free_params_immediately) {
pmid_lora->free_params_buffer();
}
}
// preprocess input id images
bool pmv2 = pmid_model->get_version() == PM_VERSION_2;
if (pm_params.id_images_count > 0) {
int clip_image_size = 224;
pmid_model->style_strength = pm_params.style_strength;
sd::Tensor<float> id_image_tensor;
for (int i = 0; i < pm_params.id_images_count; i++) {
auto id_image = sd_image_to_tensor(pm_params.id_images[i]);
auto processed_id_image = clip_preprocess(id_image, clip_image_size, clip_image_size);
if (id_image_tensor.empty()) {
id_image_tensor = processed_id_image;
} else {
id_image_tensor = sd::ops::concat(id_image_tensor, processed_id_image, 3);
}
}
int64_t t0 = ggml_time_ms();
condition_params.num_input_imgs = pm_params.id_images_count;
auto cond_tup = cond_stage_model->get_learned_condition_with_trigger(n_threads,
condition_params);
id_cond = std::get<0>(cond_tup);
auto class_tokens_mask = std::get<1>(cond_tup);
sd::Tensor<float> id_embeds;
if (pmv2 && pm_params.id_embed_path != nullptr) {
try {
id_embeds = sd::load_tensor_from_file_as_tensor<float>(pm_params.id_embed_path);
} catch (const std::exception&) {
id_embeds = {};
}
}
if (pmv2 && id_embeds.empty()) {
LOG_WARN("Provided PhotoMaker images, but NO valid ID embeds file for PM v2");
LOG_WARN("Turn off PhotoMaker");
use_pmid = false;
} else {
if (pmv2 && pm_params.id_images_count != id_embeds.shape()[1]) {
LOG_WARN("PhotoMaker image count (%d) does NOT match ID embeds (%d). You should run face_detect.py again.", pm_params.id_images_count, static_cast<int>(id_embeds.shape()[1]));
LOG_WARN("Turn off PhotoMaker");
use_pmid = false;
} else {
auto res = pmid_model->compute(n_threads,
id_image_tensor,
id_cond.c_crossattn,
id_embeds,
class_tokens_mask);
if (res.empty()) {
LOG_ERROR("Photomaker ID Stacking failed");
LOG_WARN("Turn off PhotoMaker");
use_pmid = false;
} else {
id_cond.c_crossattn = std::move(res);
int64_t t1 = ggml_time_ms();
LOG_INFO("Photomaker ID Stacking, taking %" PRId64 " ms", t1 - t0);
// Encode input prompt without the trigger word for delayed conditioning
condition_params.text = cond_stage_model->remove_trigger_from_prompt(condition_params.text);
}
if (free_params_immediately) {
pmid_model->free_params_buffer();
}
}
}
} else {
LOG_WARN("Provided PhotoMaker model file, but NO input ID images");
LOG_WARN("Turn off PhotoMaker");
use_pmid = false;
}
void reset_generation_extensions() {
for (auto& extension : generation_extensions) {
extension->reset_runtime_condition();
}
}
// Prepares every registered generation extension for a new generation run.
//
// reset_generation_extensions() is called first, then a single
// GenerationExtensionConditionContext is built from the arguments and the
// owning object's members, and prepare_condition() is invoked with it on each
// entry of generation_extensions, in container order.
//
// pm_params        - PhotoMaker parameters of the request, forwarded as-is.
// condition_params - conditioner parameters, taken by non-const reference.
//                    NOTE(review): extensions can presumably modify them
//                    (e.g. the prompt text) before the caller computes the
//                    main conditioning - confirm against the extensions.
// total_steps      - total number of sampling steps planned for this run.
//
// Returns nothing: the function is declared void.
void prepare_generation_extensions(const sd_pm_params_t& pm_params,
                                   ConditionerParams& condition_params,
                                   int total_steps) {
    reset_generation_extensions();
    GenerationExtensionConditionContext ctx{
        cond_stage_model.get(),
        condition_params,
        pm_params,
        tensors,
        version,
        n_threads,
        total_steps,
        free_params_immediately,
    };
    for (auto& extension : generation_extensions) {
        extension->prepare_condition(ctx);
    }
}
sd::Tensor<float> get_clip_vision_output(const sd::Tensor<float>& image,
@ -1979,7 +1897,6 @@ public:
const SDCondition& cond,
const SDCondition& uncond,
const SDCondition& img_uncond,
const SDCondition& id_cond,
const sd::Tensor<float>& control_image,
float control_strength,
const sd_guidance_params_t& guidance,
@ -1989,7 +1906,6 @@ public:
bool is_flow_denoiser,
const char* extra_sample_args,
const std::vector<float>& sigmas,
int start_merge_step,
const std::vector<sd::Tensor<float>>& ref_latents,
bool increase_ref_index,
const sd::Tensor<float>& denoise_mask,
@ -2181,20 +2097,24 @@ public:
return output_opt;
};
if (start_merge_step == -1 || step <= start_merge_step) {
cond_out = run_condition(cond);
if (cond_out.empty()) {
return {};
}
} else {
GGML_ASSERT(!id_cond.empty());
cond_out = run_condition(id_cond,
cond.c_concat.empty() ? nullptr : &cond.c_concat);
if (cond_out.empty()) {
return {};
const SDCondition* positive_condition = &cond;
const sd::Tensor<float>* c_concat_override = nullptr;
for (const auto& extension : generation_extensions) {
const SDCondition& next_condition = extension->before_condition(step, *positive_condition);
if (&next_condition != positive_condition) {
positive_condition = &next_condition;
if (positive_condition != &cond) {
c_concat_override = cond.c_concat.empty() ? nullptr : &cond.c_concat;
}
break;
}
}
cond_out = run_condition(*positive_condition, c_concat_override);
if (cond_out.empty()) {
return {};
}
if (!uncond.empty()) {
if (!step_cache.is_step_skipped()) {
compute_sample_controls(control_image,
@ -3470,7 +3390,6 @@ struct SamplePlan {
int high_noise_sample_steps = 0;
int total_steps = 0;
float moe_boundary = 0.f;
int start_merge_step = -1;
std::vector<float> sigmas;
SamplePlan(sd_ctx_t* sd_ctx,
@ -3555,11 +3474,6 @@ struct SamplePlan {
high_noise_eta = resolve_eta(sd_ctx, high_noise_eta, high_noise_sample_method);
LOG_INFO("sampling(high noise) using %s method", sampling_methods_str[high_noise_sample_method]);
}
if (sd_ctx->sd->use_pmid) {
start_merge_step = int(sd_ctx->sd->pmid_model->style_strength / 100.f * total_steps);
LOG_INFO("PHOTOMAKER: start_merge_step: %d", start_merge_step);
}
}
};
@ -3890,7 +3804,6 @@ struct ImageGenerationEmbeds {
SDCondition cond;
SDCondition uncond;
SDCondition img_uncond;
SDCondition id_cond;
};
struct CircularAxesState {
@ -4195,7 +4108,9 @@ static std::optional<ImageGenerationEmbeds> prepare_image_generation_embeds(sd_c
condition_params.height = request->height;
condition_params.ref_images = &latents->ref_images;
auto id_cond = sd_ctx->sd->get_pmid_conditon(request->pm_params, condition_params);
sd_ctx->sd->prepare_generation_extensions(request->pm_params,
condition_params,
plan->total_steps);
int64_t prepare_start_ms = ggml_time_ms();
condition_params.zero_out_masked = false;
auto cond = sd_ctx->sd->cond_stage_model->get_learned_condition(sd_ctx->sd->n_threads,
@ -4265,7 +4180,6 @@ static std::optional<ImageGenerationEmbeds> prepare_image_generation_embeds(sd_c
embeds.img_uncond = std::move(img_uncond);
embeds.cond = std::move(cond);
embeds.uncond = std::move(uncond);
embeds.id_cond = std::move(id_cond);
return embeds;
}
@ -4546,7 +4460,6 @@ SD_API sd_image_t* generate_image(sd_ctx_t* sd_ctx, const sd_img_gen_params_t* s
embeds.cond,
embeds.uncond,
embeds.img_uncond,
embeds.id_cond,
latents.control_image,
request.control_strength,
request.guidance,
@ -4556,7 +4469,6 @@ SD_API sd_image_t* generate_image(sd_ctx_t* sd_ctx, const sd_img_gen_params_t* s
sd_ctx->sd->is_flow_denoiser(),
plan.extra_sample_args,
plan.sigmas,
plan.start_merge_step,
latents.ref_latents,
request.increase_ref_index,
latents.denoise_mask,
@ -4666,7 +4578,6 @@ SD_API sd_image_t* generate_image(sd_ctx_t* sd_ctx, const sd_img_gen_params_t* s
embeds.cond,
embeds.uncond,
embeds.img_uncond,
embeds.id_cond,
latents.control_image,
request.control_strength,
request.guidance,
@ -4676,7 +4587,6 @@ SD_API sd_image_t* generate_image(sd_ctx_t* sd_ctx, const sd_img_gen_params_t* s
sd_ctx->sd->is_flow_denoiser(),
plan.extra_sample_args,
hires_sigma_sched,
plan.start_merge_step,
latents.ref_latents,
request.increase_ref_index,
hires_denoise_mask,
@ -5335,6 +5245,7 @@ SD_API bool generate_video(sd_ctx_t* sd_ctx,
sd_ctx->sd->sampler_rng->manual_seed(request.seed);
sd_ctx->sd->set_flow_shift(sd_vid_gen_params->sample_params.flow_shift);
sd_ctx->sd->apply_loras(sd_vid_gen_params->loras, sd_vid_gen_params->lora_count);
sd_ctx->sd->reset_generation_extensions();
SamplePlan plan(sd_ctx, sd_vid_gen_params, request);
auto latent_inputs_opt = prepare_video_generation_latents(sd_ctx, sd_vid_gen_params, &request);
@ -5381,7 +5292,6 @@ SD_API bool generate_video(sd_ctx_t* sd_ctx,
embeds.cond,
request.use_high_noise_uncond ? embeds.uncond : SDCondition(),
embeds.img_uncond,
embeds.id_cond,
sd::Tensor<float>(),
0.f,
request.high_noise_guidance,
@ -5391,7 +5301,6 @@ SD_API bool generate_video(sd_ctx_t* sd_ctx,
sd_ctx->sd->is_flow_denoiser(),
plan.high_noise_extra_sample_args,
high_noise_sigmas,
-1,
std::vector<sd::Tensor<float>>{},
false,
latents.denoise_mask,
@ -5427,7 +5336,6 @@ SD_API bool generate_video(sd_ctx_t* sd_ctx,
embeds.cond,
request.use_uncond ? embeds.uncond : SDCondition(),
embeds.img_uncond,
embeds.id_cond,
sd::Tensor<float>(),
0.f,
sd_vid_gen_params->sample_params.guidance,
@ -5437,7 +5345,6 @@ SD_API bool generate_video(sd_ctx_t* sd_ctx,
sd_ctx->sd->is_flow_denoiser(),
plan.extra_sample_args,
plan.sigmas,
-1,
std::vector<sd::Tensor<float>>{},
false,
latents.denoise_mask,
@ -5571,7 +5478,6 @@ SD_API bool generate_video(sd_ctx_t* sd_ctx,
embeds.cond,
hires_request.use_uncond ? embeds.uncond : SDCondition(),
embeds.img_uncond,
embeds.id_cond,
sd::Tensor<float>(),
0.f,
sd_vid_gen_params->sample_params.guidance,
@ -5581,7 +5487,6 @@ SD_API bool generate_video(sd_ctx_t* sd_ctx,
sd_ctx->sd->is_flow_denoiser(),
plan.extra_sample_args,
hires_sigma_sched,
-1,
std::vector<sd::Tensor<float>>{},
false,
hires_denoise_mask,