mirror of
https://github.com/leejet/stable-diffusion.cpp.git
synced 2026-06-09 15:56:39 +00:00
refactor: move photomaker into generation extension (#1618)
This commit is contained in:
parent
81abfb2548
commit
2a07540c2a
@ -216,6 +216,9 @@ file(GLOB SD_LIB_SOURCES CONFIGURE_DEPENDS
|
||||
"src/core/*.h"
|
||||
"src/core/*.cpp"
|
||||
"src/core/*.hpp"
|
||||
"src/extensions/*.h"
|
||||
"src/extensions/*.cpp"
|
||||
"src/extensions/*.hpp"
|
||||
"src/model/*/*.h"
|
||||
"src/model/*/*.cpp"
|
||||
"src/model/*/*.hpp"
|
||||
|
||||
@ -1,6 +1,7 @@
|
||||
for f in src/*.cpp src/*.h src/*.hpp \
|
||||
src/conditioning/*.cpp src/conditioning/*.h src/conditioning/*.hpp \
|
||||
src/core/*.cpp src/core/*.h src/core/*.hpp \
|
||||
src/extensions/*.cpp src/extensions/*.h src/extensions/*.hpp \
|
||||
src/runtime/*.cpp src/runtime/*.h src/runtime/*.hpp \
|
||||
src/model/*/*.cpp src/model/*/*.h src/model/*/*.hpp \
|
||||
src/tokenizers/*.h src/tokenizers/*.cpp src/tokenizers/vocab/*.h src/tokenizers/vocab/*.cpp \
|
||||
|
||||
@ -103,7 +103,6 @@ struct ConditionerParams {
|
||||
int width = -1;
|
||||
int height = -1;
|
||||
bool zero_out_masked = false;
|
||||
int num_input_imgs = 0; // for photomaker
|
||||
const std::vector<sd::Tensor<float>>* ref_images = nullptr; // for qwen image edit
|
||||
};
|
||||
|
||||
@ -121,25 +120,16 @@ public:
|
||||
// Optional hook; the base implementation ignores `enabled` and does nothing.
virtual void set_stream_layers_enabled(bool enabled) {}
|
||||
// Pure virtual: every concrete conditioner must provide this.
virtual void set_flash_attention_enabled(bool enabled) = 0;
|
||||
// Optional hook; the base implementation ignores `adapter` and does nothing.
virtual void set_weight_adapter(const std::shared_ptr<WeightAdapter>& adapter) {}
|
||||
// Base-class fallback for trigger-word conditioning: unconditionally aborts through
// GGML_ABORT, so it never returns a value. Conditioners that support a trigger word
// (FrozenCLIPEmbedderWithCustomWords in this file) override it.
virtual std::tuple<SDCondition, std::vector<bool>> get_learned_condition_with_trigger(int n_threads,
|
||||
const ConditionerParams& conditioner_params) {
|
||||
GGML_ABORT("Not implemented yet!");
|
||||
}
|
||||
// Base-class fallback: unconditionally aborts through GGML_ABORT, so it never returns.
// Conditioners that support a trigger word override it.
virtual std::string remove_trigger_from_prompt(const std::string& prompt) {
|
||||
GGML_ABORT("Not implemented yet!");
|
||||
}
|
||||
};
|
||||
|
||||
// ldm.modules.encoders.modules.FrozenCLIPEmbedder
|
||||
// Ref: https://github.com/AUTOMATIC1111/stable-diffusion-webui/blob/cad87bf4e3e0b0a759afa94e933527c3123d59bc/modules/sd_hijack_clip.py#L283
|
||||
struct FrozenCLIPEmbedderWithCustomWords : public Conditioner {
|
||||
SDVersion version = VERSION_SD1;
|
||||
PMVersion pm_version = PM_VERSION_1;
|
||||
SDVersion version = VERSION_SD1;
|
||||
CLIPTokenizer tokenizer;
|
||||
std::shared_ptr<CLIPTextModelRunner> text_model;
|
||||
std::shared_ptr<CLIPTextModelRunner> text_model2;
|
||||
|
||||
std::string trigger_word = "img"; // should be user settable
|
||||
std::map<std::string, std::string> embedding_map;
|
||||
int32_t num_custom_embeddings = 0;
|
||||
int32_t num_custom_embeddings_2 = 0;
|
||||
@ -150,9 +140,8 @@ struct FrozenCLIPEmbedderWithCustomWords : public Conditioner {
|
||||
ggml_backend_t params_backend,
|
||||
const String2TensorStorage& tensor_storage_map,
|
||||
const std::map<std::string, std::string>& orig_embedding_map,
|
||||
SDVersion version = VERSION_SD1,
|
||||
PMVersion pv = PM_VERSION_1)
|
||||
: version(version), pm_version(pv), tokenizer(sd_version_is_sd2(version) ? 0 : 49407) {
|
||||
SDVersion version = VERSION_SD1)
|
||||
: version(version), tokenizer(sd_version_is_sd2(version) ? 0 : 49407) {
|
||||
for (const auto& kv : orig_embedding_map) {
|
||||
std::string name = kv.first;
|
||||
std::transform(name.begin(), name.end(), name.begin(), [](unsigned char c) { return std::tolower(c); });
|
||||
@ -329,121 +318,6 @@ struct FrozenCLIPEmbedderWithCustomWords : public Conditioner {
|
||||
return tokenizer.decode(tokens);
|
||||
}
|
||||
|
||||
std::tuple<std::vector<int>, std::vector<float>, std::vector<bool>>
|
||||
tokenize_with_trigger_token(std::string text,
|
||||
int num_input_imgs,
|
||||
int32_t image_token) {
|
||||
auto parsed_attention = parse_prompt_attention(text);
|
||||
|
||||
{
|
||||
std::stringstream ss;
|
||||
ss << "[";
|
||||
for (const auto& item : parsed_attention) {
|
||||
ss << "['" << item.first << "', " << item.second << "], ";
|
||||
}
|
||||
ss << "]";
|
||||
LOG_DEBUG("parse '%s' to %s", text.c_str(), ss.str().c_str());
|
||||
}
|
||||
|
||||
auto on_new_token_cb = [&](std::string& str, std::vector<int32_t>& bpe_tokens) -> bool {
|
||||
auto iter = embedding_map.find(str);
|
||||
if (iter == embedding_map.end()) {
|
||||
return false;
|
||||
}
|
||||
std::string embedding_path = iter->second;
|
||||
if (load_embedding(str, embedding_path, bpe_tokens)) {
|
||||
return true;
|
||||
}
|
||||
return false;
|
||||
};
|
||||
|
||||
std::vector<int> tokens;
|
||||
std::vector<float> weights;
|
||||
std::vector<bool> class_token_mask;
|
||||
int32_t class_idx = -1, tokens_acc = 0;
|
||||
for (const auto& item : parsed_attention) {
|
||||
std::vector<int> class_token_index;
|
||||
std::vector<int> clean_input_ids;
|
||||
const std::string& curr_text = item.first;
|
||||
float curr_weight = item.second;
|
||||
// printf(" %s: %f \n", curr_text.c_str(), curr_weight);
|
||||
int32_t clean_index = 0;
|
||||
if (curr_text == "BREAK" && curr_weight == -1.0f) {
|
||||
// Pad token array up to chunk size at this point.
|
||||
// TODO: This is a hardcoded chunk_len, like in stable-diffusion.cpp, make it a parameter for the future?
|
||||
// Also, this is 75 instead of 77 to leave room for BOS and EOS tokens.
|
||||
int padding_size = 75 - (tokens_acc % 75);
|
||||
for (int j = 0; j < padding_size; j++) {
|
||||
clean_input_ids.push_back(tokenizer.EOS_TOKEN_ID);
|
||||
clean_index++;
|
||||
}
|
||||
|
||||
// After padding, continue to the next iteration to process the following text as a new segment
|
||||
tokens.insert(tokens.end(), clean_input_ids.begin(), clean_input_ids.end());
|
||||
weights.insert(weights.end(), padding_size, curr_weight);
|
||||
continue;
|
||||
}
|
||||
|
||||
// Regular token, process normally
|
||||
std::vector<int> curr_tokens = tokenizer.encode(curr_text, on_new_token_cb);
|
||||
for (uint32_t i = 0; i < curr_tokens.size(); i++) {
|
||||
int token_id = curr_tokens[i];
|
||||
if (token_id == image_token) {
|
||||
class_token_index.push_back(clean_index - 1);
|
||||
} else {
|
||||
clean_input_ids.push_back(token_id);
|
||||
clean_index++;
|
||||
}
|
||||
}
|
||||
// GGML_ASSERT(class_token_index.size() == 1); // PhotoMaker currently does not support multiple
|
||||
// trigger words in a single prompt.
|
||||
if (class_token_index.size() == 1) {
|
||||
// Expand the class word token and corresponding mask
|
||||
int class_token = clean_input_ids[class_token_index[0]];
|
||||
class_idx = tokens_acc + class_token_index[0];
|
||||
std::vector<int> clean_input_ids_tmp;
|
||||
for (int i = 0; i < class_token_index[0]; i++)
|
||||
clean_input_ids_tmp.push_back(clean_input_ids[i]);
|
||||
for (int i = 0; i < (pm_version == PM_VERSION_2 ? 2 * num_input_imgs : num_input_imgs); i++)
|
||||
clean_input_ids_tmp.push_back(class_token);
|
||||
for (int i = class_token_index[0] + 1; i < clean_input_ids.size(); i++)
|
||||
clean_input_ids_tmp.push_back(clean_input_ids[i]);
|
||||
clean_input_ids.clear();
|
||||
clean_input_ids = clean_input_ids_tmp;
|
||||
}
|
||||
tokens_acc += clean_index;
|
||||
tokens.insert(tokens.end(), clean_input_ids.begin(), clean_input_ids.end());
|
||||
weights.insert(weights.end(), clean_input_ids.size(), curr_weight);
|
||||
}
|
||||
// BUG!! double couting, pad_tokens will add BOS at the beginning
|
||||
// tokens.insert(tokens.begin(), tokenizer.BOS_TOKEN_ID);
|
||||
// weights.insert(weights.begin(), 1.0);
|
||||
|
||||
tokenizer.pad_tokens(tokens, &weights, nullptr, text_model->model.n_token, text_model->model.n_token, true);
|
||||
int offset = pm_version == PM_VERSION_2 ? 2 * num_input_imgs : num_input_imgs;
|
||||
for (int i = 0; i < tokens.size(); i++) {
|
||||
// if (class_idx + 1 <= i && i < class_idx + 1 + 2*num_input_imgs) // photomaker V2 has num_tokens(=2)*num_input_imgs
|
||||
if (class_idx + 1 <= i && i < class_idx + 1 + offset) // photomaker V2 has num_tokens(=2)*num_input_imgs
|
||||
// hardcode for now
|
||||
class_token_mask.push_back(true);
|
||||
else
|
||||
class_token_mask.push_back(false);
|
||||
}
|
||||
|
||||
// printf("[");
|
||||
// for (int i = 0; i < tokens.size(); i++) {
|
||||
// printf("%d, ", class_token_mask[i] ? 1 : 0);
|
||||
// }
|
||||
// printf("]\n");
|
||||
|
||||
// for (int i = 0; i < tokens.size(); i++) {
|
||||
// std::cout << tokens[i] << ":" << weights[i] << ", ";
|
||||
// }
|
||||
// std::cout << std::endl;
|
||||
|
||||
return std::make_tuple(tokens, weights, class_token_mask);
|
||||
}
|
||||
|
||||
std::pair<std::vector<int>, std::vector<float>> tokenize(std::string text,
|
||||
size_t min_length = 0,
|
||||
size_t max_length = 0,
|
||||
@ -631,49 +505,6 @@ struct FrozenCLIPEmbedderWithCustomWords : public Conditioner {
|
||||
return result;
|
||||
}
|
||||
|
||||
std::tuple<SDCondition, std::vector<bool>>
|
||||
get_learned_condition_with_trigger(int n_threads,
|
||||
const ConditionerParams& conditioner_params) override {
|
||||
auto image_tokens = convert_token_to_id(trigger_word);
|
||||
// if(image_tokens.size() == 1){
|
||||
// printf(" image token id is: %d \n", image_tokens[0]);
|
||||
// }
|
||||
GGML_ASSERT(image_tokens.size() == 1);
|
||||
auto tokens_and_weights = tokenize_with_trigger_token(conditioner_params.text,
|
||||
conditioner_params.num_input_imgs,
|
||||
image_tokens[0]);
|
||||
std::vector<int>& tokens = std::get<0>(tokens_and_weights);
|
||||
std::vector<float>& weights = std::get<1>(tokens_and_weights);
|
||||
std::vector<bool>& clsm = std::get<2>(tokens_and_weights);
|
||||
// printf("tokens: \n");
|
||||
// for(int i = 0; i < tokens.size(); ++i)
|
||||
// printf("%d ", tokens[i]);
|
||||
// printf("\n");
|
||||
// printf("clsm: \n");
|
||||
// for(int i = 0; i < clsm.size(); ++i)
|
||||
// printf("%d ", clsm[i]?1:0);
|
||||
// printf("\n");
|
||||
auto cond = get_learned_condition_common(n_threads,
|
||||
tokens,
|
||||
weights,
|
||||
conditioner_params.clip_skip,
|
||||
conditioner_params.width,
|
||||
conditioner_params.height,
|
||||
conditioner_params.zero_out_masked);
|
||||
return std::make_tuple(cond, clsm);
|
||||
}
|
||||
|
||||
std::string remove_trigger_from_prompt(const std::string& prompt) override {
|
||||
auto image_tokens = convert_token_to_id(trigger_word);
|
||||
GGML_ASSERT(image_tokens.size() == 1);
|
||||
auto tokens_and_weights = tokenize(prompt);
|
||||
std::vector<int>& tokens = tokens_and_weights.first;
|
||||
auto it = std::find(tokens.begin(), tokens.end(), image_tokens[0]);
|
||||
GGML_ASSERT(it != tokens.end()); // prompt must have trigger word
|
||||
tokens.erase(it);
|
||||
return decode(tokens);
|
||||
}
|
||||
|
||||
SDCondition get_learned_condition(int n_threads,
|
||||
const ConditionerParams& conditioner_params) override {
|
||||
auto tokens_and_weights = tokenize(conditioner_params.text, text_model->model.n_token, text_model->model.n_token, true);
|
||||
|
||||
73
src/extensions/generation_extension.h
Normal file
73
src/extensions/generation_extension.h
Normal file
@ -0,0 +1,73 @@
|
||||
#ifndef __SD_EXTENSIONS_GENERATION_EXTENSION_H__
|
||||
#define __SD_EXTENSIONS_GENERATION_EXTENSION_H__
|
||||
|
||||
#include <functional>
|
||||
#include <map>
|
||||
#include <memory>
|
||||
#include <set>
|
||||
#include <string>
|
||||
|
||||
#include "conditioning/conditioner.hpp"
|
||||
#include "core/ggml_extend_backend.h"
|
||||
#include "model.h"
|
||||
#include "stable-diffusion.h"
|
||||
|
||||
// Inputs passed to GenerationExtension::init().
// The struct is brace-initialized positionally by its creator, so member order matters.
struct GenerationExtensionInitContext {
|
||||
// Context creation parameters (the PhotoMaker extension reads photo_maker_path from it).
const sd_ctx_params_t* params;
|
||||
// Model version forwarded to the models an extension creates.
SDVersion version;
|
||||
// Tensor storage map forwarded to model constructors.
const String2TensorStorage& tensor_storage_map;
|
||||
// Loader through which an extension registers additional model files.
ModelLoader& model_loader;
|
||||
// Thread count used when loading extension weights.
int n_threads;
|
||||
// Called before creating models for a module; init() fails when it returns false.
std::function<bool(SDBackendModule)> ensure_backend_pair;
|
||||
// Returns the backend handle passed as the first backend argument of extension models.
std::function<ggml_backend_t(SDBackendModule)> backend_for;
|
||||
// Returns the backend handle passed as the params-backend argument of extension models.
std::function<ggml_backend_t(SDBackendModule)> params_backend_for;
|
||||
};
|
||||
|
||||
// Output maps passed to GenerationExtension::collect_param_tensors().
// The struct is brace-initialized positionally by its creator, so member order matters.
struct GenerationExtensionTensorContext {
|
||||
// Receives every parameter tensor of the extension, keyed by tensor name.
std::map<std::string, ggml_tensor*>& tensors;
|
||||
// Additionally receives the tensors of modules for which module_can_mmap returns true.
std::map<std::string, ggml_tensor*>& mmap_able_tensors;
|
||||
// Tells whether the tensors of the given module may be mmap-ed.
std::function<bool(SDBackendModule)> module_can_mmap;
|
||||
};
|
||||
|
||||
// Per-request state passed to GenerationExtension::prepare_condition().
struct GenerationExtensionConditionContext {
|
||||
// Active conditioner; the PhotoMaker extension requires it to be a
// FrozenCLIPEmbedderWithCustomWords and checks this with dynamic_cast.
Conditioner* conditioner;
|
||||
// Conditioning parameters; mutable because an extension may rewrite the prompt text
// (PhotoMaker strips its trigger word from it).
ConditionerParams& condition_params;
|
||||
// PhotoMaker request parameters (ID images, ID embeds path, style strength).
const sd_pm_params_t& pm_params;
|
||||
// Model tensors by name; the PhotoMaker extension passes this map to its LoRA apply().
std::map<std::string, ggml_tensor*>& tensors;
|
||||
SDVersion version;
|
||||
int n_threads;
|
||||
// Total step count; PhotoMaker derives its start_merge_step from it.
int total_steps;
|
||||
// When true, extensions free their parameter buffers as soon as they are done with them.
bool free_params_immediately;
|
||||
};
|
||||
|
||||
// Interface for optional generation features that plug into model loading and conditioning.
// Every hook except name() has a default that makes the extension inert.
struct GenerationExtension {
|
||||
virtual ~GenerationExtension() = default;
|
||||
|
||||
// Short identifier of the extension; used in error messages.
virtual const char* name() const = 0;
|
||||
// Whether the extension is active. Default: false.
virtual bool is_enabled() const {
|
||||
return false;
|
||||
}
|
||||
// One-time setup. Returning false makes the caller abort context initialization.
// Default: succeeds without doing anything.
virtual bool init(const GenerationExtensionInitContext&) {
|
||||
return true;
|
||||
}
|
||||
// Adds the extension's parameter tensors to the caller's tensor maps. Default: no-op.
virtual void collect_param_tensors(GenerationExtensionTensorContext&) {}
|
||||
// Adds entries to the set of ignored tensor names used when loading tensors. Default: no-op.
virtual void add_ignore_tensors(std::set<std::string>&) const {}
|
||||
// Allocates the extension's parameter buffer; false is treated as a load failure.
// Default: succeeds without allocating.
virtual bool alloc_params_buffer() {
|
||||
return true;
|
||||
}
|
||||
// Size in bytes of the extension's parameter buffer. Default: 0.
virtual size_t get_params_buffer_size() const {
|
||||
return 0;
|
||||
}
|
||||
// Drops any per-request state. Default: no-op.
virtual void reset_runtime_condition() {}
|
||||
// Prepares per-request conditioning. Default: returns false.
// NOTE(review): false appears to mean "extension inactive for this request" — confirm against caller.
virtual bool prepare_condition(GenerationExtensionConditionContext&) {
|
||||
return false;
|
||||
}
|
||||
// Returns the condition to use for the given step. Default: `condition` unchanged.
virtual const SDCondition& before_condition(int step,
|
||||
const SDCondition& condition) const {
|
||||
return condition;
|
||||
}
|
||||
};
|
||||
|
||||
// Factory for the PhotoMaker extension (defined in photomaker_extension.cpp).
std::shared_ptr<GenerationExtension> create_photomaker_extension();
|
||||
|
||||
#endif
|
||||
325
src/extensions/photomaker_extension.cpp
Normal file
325
src/extensions/photomaker_extension.cpp
Normal file
@ -0,0 +1,325 @@
|
||||
#include "extensions/generation_extension.h"
|
||||
|
||||
#include <algorithm>
|
||||
#include <cstring>
|
||||
#include <tuple>
|
||||
#include <utility>
|
||||
|
||||
#include "core/tensor_ggml.hpp"
|
||||
#include "core/util.h"
|
||||
#include "model/adapter/lora.hpp"
|
||||
#include "model/adapter/pmid.hpp"
|
||||
|
||||
static std::tuple<std::vector<int>, std::vector<float>, std::vector<bool>>
|
||||
tokenize_photomaker_trigger(FrozenCLIPEmbedderWithCustomWords& clip_conditioner,
|
||||
const std::string& text,
|
||||
int trigger_token_count,
|
||||
int32_t image_token) {
|
||||
auto tokens_and_weights = clip_conditioner.tokenize(text);
|
||||
std::vector<int> source_tokens = std::move(tokens_and_weights.first);
|
||||
std::vector<float> source_weights = std::move(tokens_and_weights.second);
|
||||
|
||||
if (!source_tokens.empty() && source_tokens.front() == clip_conditioner.tokenizer.BOS_TOKEN_ID) {
|
||||
source_tokens.erase(source_tokens.begin());
|
||||
source_weights.erase(source_weights.begin());
|
||||
}
|
||||
if (!source_tokens.empty() && source_tokens.back() == clip_conditioner.tokenizer.EOS_TOKEN_ID) {
|
||||
source_tokens.pop_back();
|
||||
source_weights.pop_back();
|
||||
}
|
||||
|
||||
std::vector<int> tokens;
|
||||
std::vector<float> weights;
|
||||
int32_t class_idx = -1;
|
||||
for (size_t i = 0; i < source_tokens.size(); i++) {
|
||||
int token = source_tokens[i];
|
||||
if (token == image_token) {
|
||||
if (!tokens.empty()) {
|
||||
class_idx = static_cast<int32_t>(tokens.size()) - 1;
|
||||
int class_token = tokens.back();
|
||||
float class_weight = weights.back();
|
||||
for (int j = 1; j < trigger_token_count; j++) {
|
||||
tokens.push_back(class_token);
|
||||
weights.push_back(class_weight);
|
||||
}
|
||||
}
|
||||
continue;
|
||||
}
|
||||
tokens.push_back(token);
|
||||
weights.push_back(source_weights[i]);
|
||||
}
|
||||
|
||||
clip_conditioner.tokenizer.pad_tokens(tokens,
|
||||
&weights,
|
||||
nullptr,
|
||||
clip_conditioner.text_model->model.n_token,
|
||||
clip_conditioner.text_model->model.n_token,
|
||||
true);
|
||||
std::vector<bool> class_token_mask;
|
||||
for (int i = 0; i < tokens.size(); i++) {
|
||||
class_token_mask.push_back(class_idx + 1 <= i && i < class_idx + 1 + trigger_token_count);
|
||||
}
|
||||
|
||||
return std::make_tuple(tokens, weights, class_token_mask);
|
||||
}
|
||||
|
||||
static std::tuple<SDCondition, std::vector<bool>>
|
||||
get_photomaker_condition_with_trigger(FrozenCLIPEmbedderWithCustomWords& clip_conditioner,
|
||||
int n_threads,
|
||||
const ConditionerParams& conditioner_params,
|
||||
const std::string& trigger_word,
|
||||
int trigger_token_count) {
|
||||
auto image_tokens = clip_conditioner.convert_token_to_id(trigger_word);
|
||||
GGML_ASSERT(image_tokens.size() == 1);
|
||||
auto tokens_and_weights = tokenize_photomaker_trigger(clip_conditioner,
|
||||
conditioner_params.text,
|
||||
trigger_token_count,
|
||||
image_tokens[0]);
|
||||
std::vector<int>& tokens = std::get<0>(tokens_and_weights);
|
||||
std::vector<float>& weights = std::get<1>(tokens_and_weights);
|
||||
std::vector<bool>& trigger_mask = std::get<2>(tokens_and_weights);
|
||||
auto cond = clip_conditioner.get_learned_condition_common(n_threads,
|
||||
tokens,
|
||||
weights,
|
||||
conditioner_params.clip_skip,
|
||||
conditioner_params.width,
|
||||
conditioner_params.height,
|
||||
conditioner_params.zero_out_masked);
|
||||
return std::make_tuple(std::move(cond), trigger_mask);
|
||||
}
|
||||
|
||||
// Returns `prompt` re-decoded from its tokens with the first occurrence of the trigger
// token removed. Asserts that `trigger_word` is a single token and that the prompt contains it.
static std::string remove_photomaker_trigger_from_prompt(FrozenCLIPEmbedderWithCustomWords& clip_conditioner,
                                                         const std::string& prompt,
                                                         const std::string& trigger_word) {
    auto trigger_ids = clip_conditioner.convert_token_to_id(trigger_word);
    GGML_ASSERT(trigger_ids.size() == 1);

    auto tokenized                  = clip_conditioner.tokenize(prompt);
    std::vector<int>& prompt_tokens = tokenized.first;

    auto trigger_pos = std::find(prompt_tokens.begin(), prompt_tokens.end(), trigger_ids[0]);
    GGML_ASSERT(trigger_pos != prompt_tokens.end());
    prompt_tokens.erase(trigger_pos);

    return clip_conditioner.decode(prompt_tokens);
}
|
||||
|
||||
struct PhotoMakerExtension : public GenerationExtension {
|
||||
std::shared_ptr<PhotoMakerIDEncoder> pmid_model;
|
||||
std::shared_ptr<LoraModel> pmid_lora;
|
||||
bool enabled = false;
|
||||
std::string model_path;
|
||||
std::string trigger_word = "img";
|
||||
SDCondition id_condition;
|
||||
int start_merge_step = -1;
|
||||
|
||||
const char* name() const override {
|
||||
return "photomaker";
|
||||
}
|
||||
|
||||
bool is_enabled() const override {
|
||||
return enabled;
|
||||
}
|
||||
|
||||
bool init(const GenerationExtensionInitContext& ctx) override {
|
||||
model_path = SAFE_STR(ctx.params->photo_maker_path);
|
||||
if (model_path.empty()) {
|
||||
return true;
|
||||
}
|
||||
|
||||
if (!ctx.ensure_backend_pair(SDBackendModule::PHOTOMAKER)) {
|
||||
return false;
|
||||
}
|
||||
|
||||
PMVersion pm_version = std::strstr(model_path.c_str(), "v2") != nullptr ? PM_VERSION_2 : PM_VERSION_1;
|
||||
pmid_model = std::make_shared<PhotoMakerIDEncoder>(ctx.backend_for(SDBackendModule::PHOTOMAKER),
|
||||
ctx.params_backend_for(SDBackendModule::PHOTOMAKER),
|
||||
ctx.tensor_storage_map,
|
||||
"pmid",
|
||||
ctx.version,
|
||||
pm_version);
|
||||
if (pm_version == PM_VERSION_2) {
|
||||
LOG_INFO("using PhotoMaker Version 2");
|
||||
}
|
||||
|
||||
pmid_lora = std::make_shared<LoraModel>("pmid",
|
||||
ctx.backend_for(SDBackendModule::PHOTOMAKER),
|
||||
ctx.params_backend_for(SDBackendModule::PHOTOMAKER),
|
||||
model_path,
|
||||
"",
|
||||
ctx.version);
|
||||
auto lora_tensor_filter = [&](const std::string& tensor_name) {
|
||||
return starts_with(tensor_name, "lora.model");
|
||||
};
|
||||
if (!pmid_lora->load_from_file(ctx.n_threads, lora_tensor_filter)) {
|
||||
LOG_WARN("load photomaker lora tensors from %s failed", model_path.c_str());
|
||||
return false;
|
||||
}
|
||||
|
||||
LOG_INFO("loading stacked ID embedding (PHOTOMAKER) model file from '%s'", model_path.c_str());
|
||||
if (!ctx.model_loader.init_from_file_and_convert_name(model_path, "pmid.")) {
|
||||
LOG_WARN("loading stacked ID embedding from '%s' failed", model_path.c_str());
|
||||
return true;
|
||||
}
|
||||
|
||||
enabled = true;
|
||||
return true;
|
||||
}
|
||||
|
||||
void collect_param_tensors(GenerationExtensionTensorContext& ctx) override {
|
||||
if (!enabled || pmid_model == nullptr) {
|
||||
return;
|
||||
}
|
||||
|
||||
std::map<std::string, ggml_tensor*> temp;
|
||||
pmid_model->get_param_tensors(temp, "pmid");
|
||||
bool do_mmap = ctx.module_can_mmap(SDBackendModule::PHOTOMAKER);
|
||||
for (const auto& [key, tensor] : temp) {
|
||||
ctx.tensors[key] = tensor;
|
||||
if (do_mmap) {
|
||||
ctx.mmap_able_tensors[key] = tensor;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void add_ignore_tensors(std::set<std::string>& ignore_tensors) const override {
|
||||
if (!enabled) {
|
||||
return;
|
||||
}
|
||||
ignore_tensors.insert("pmid.unet.");
|
||||
}
|
||||
|
||||
bool alloc_params_buffer() override {
|
||||
if (!enabled || pmid_model == nullptr) {
|
||||
return true;
|
||||
}
|
||||
return pmid_model->alloc_params_buffer();
|
||||
}
|
||||
|
||||
size_t get_params_buffer_size() const override {
|
||||
if (!enabled || pmid_model == nullptr) {
|
||||
return 0;
|
||||
}
|
||||
return pmid_model->get_params_buffer_size();
|
||||
}
|
||||
|
||||
void reset_runtime_condition() override {
|
||||
id_condition = {};
|
||||
start_merge_step = -1;
|
||||
}
|
||||
|
||||
bool prepare_condition(GenerationExtensionConditionContext& ctx) override {
|
||||
reset_runtime_condition();
|
||||
if (!enabled || pmid_model == nullptr || pmid_lora == nullptr) {
|
||||
return false;
|
||||
}
|
||||
|
||||
if (!pmid_lora->applied) {
|
||||
int64_t t0 = ggml_time_ms();
|
||||
pmid_lora->apply(ctx.tensors, ctx.version, ctx.n_threads);
|
||||
int64_t t1 = ggml_time_ms();
|
||||
pmid_lora->applied = true;
|
||||
LOG_INFO("pmid_lora apply completed, taking %.2fs", (t1 - t0) * 1.0f / 1000);
|
||||
if (ctx.free_params_immediately) {
|
||||
pmid_lora->free_params_buffer();
|
||||
}
|
||||
}
|
||||
|
||||
bool pmv2 = pmid_model->get_version() == PM_VERSION_2;
|
||||
if (ctx.pm_params.id_images_count <= 0 || ctx.pm_params.id_images == nullptr) {
|
||||
LOG_WARN("Provided PhotoMaker model file, but NO input ID images");
|
||||
LOG_WARN("Turn off PhotoMaker for this request");
|
||||
return false;
|
||||
}
|
||||
auto* clip_conditioner = dynamic_cast<FrozenCLIPEmbedderWithCustomWords*>(ctx.conditioner);
|
||||
if (clip_conditioner == nullptr) {
|
||||
LOG_WARN("PhotoMaker requires FrozenCLIPEmbedderWithCustomWords conditioner");
|
||||
LOG_WARN("Turn off PhotoMaker for this request");
|
||||
return false;
|
||||
}
|
||||
|
||||
int clip_image_size = 224;
|
||||
pmid_model->style_strength = ctx.pm_params.style_strength;
|
||||
sd::Tensor<float> id_image_tensor;
|
||||
for (int i = 0; i < ctx.pm_params.id_images_count; i++) {
|
||||
auto id_image = sd_image_to_tensor(ctx.pm_params.id_images[i]);
|
||||
auto processed_id_image = clip_preprocess(id_image, clip_image_size, clip_image_size);
|
||||
if (id_image_tensor.empty()) {
|
||||
id_image_tensor = processed_id_image;
|
||||
} else {
|
||||
id_image_tensor = sd::ops::concat(id_image_tensor, processed_id_image, 3);
|
||||
}
|
||||
}
|
||||
|
||||
int64_t t0 = ggml_time_ms();
|
||||
int trigger_token_count = pmv2 ? 2 * ctx.pm_params.id_images_count : ctx.pm_params.id_images_count;
|
||||
auto cond_tup = get_photomaker_condition_with_trigger(*clip_conditioner,
|
||||
ctx.n_threads,
|
||||
ctx.condition_params,
|
||||
trigger_word,
|
||||
trigger_token_count);
|
||||
SDCondition prepared_id_condition = std::get<0>(cond_tup);
|
||||
auto class_tokens_mask = std::get<1>(cond_tup);
|
||||
if (std::find(class_tokens_mask.begin(), class_tokens_mask.end(), true) == class_tokens_mask.end()) {
|
||||
LOG_WARN("PhotoMaker trigger word '%s' was not found in prompt", trigger_word.c_str());
|
||||
LOG_WARN("Turn off PhotoMaker for this request");
|
||||
return false;
|
||||
}
|
||||
|
||||
sd::Tensor<float> id_embeds;
|
||||
if (pmv2 && ctx.pm_params.id_embed_path != nullptr) {
|
||||
try {
|
||||
id_embeds = sd::load_tensor_from_file_as_tensor<float>(ctx.pm_params.id_embed_path);
|
||||
} catch (const std::exception&) {
|
||||
id_embeds = {};
|
||||
}
|
||||
}
|
||||
if (pmv2 && id_embeds.empty()) {
|
||||
LOG_WARN("Provided PhotoMaker images, but NO valid ID embeds file for PM v2");
|
||||
LOG_WARN("Turn off PhotoMaker for this request");
|
||||
return false;
|
||||
}
|
||||
if (pmv2 && ctx.pm_params.id_images_count != id_embeds.shape()[1]) {
|
||||
LOG_WARN("PhotoMaker image count (%d) does NOT match ID embeds (%d). You should run face_detect.py again.",
|
||||
ctx.pm_params.id_images_count,
|
||||
static_cast<int>(id_embeds.shape()[1]));
|
||||
LOG_WARN("Turn off PhotoMaker for this request");
|
||||
return false;
|
||||
}
|
||||
|
||||
auto res = pmid_model->compute(ctx.n_threads,
|
||||
id_image_tensor,
|
||||
prepared_id_condition.c_crossattn,
|
||||
id_embeds,
|
||||
class_tokens_mask);
|
||||
if (res.empty()) {
|
||||
LOG_ERROR("Photomaker ID Stacking failed");
|
||||
LOG_WARN("Turn off PhotoMaker for this request");
|
||||
return false;
|
||||
}
|
||||
|
||||
prepared_id_condition.c_crossattn = std::move(res);
|
||||
int64_t t1 = ggml_time_ms();
|
||||
id_condition = std::move(prepared_id_condition);
|
||||
start_merge_step = int(ctx.pm_params.style_strength / 100.f * ctx.total_steps);
|
||||
ctx.condition_params.text = remove_photomaker_trigger_from_prompt(*clip_conditioner,
|
||||
ctx.condition_params.text,
|
||||
trigger_word);
|
||||
LOG_INFO("Photomaker ID Stacking, taking %" PRId64 " ms", t1 - t0);
|
||||
LOG_INFO("PHOTOMAKER: start_merge_step: %d", start_merge_step);
|
||||
|
||||
if (ctx.free_params_immediately) {
|
||||
pmid_model->free_params_buffer();
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
const SDCondition& before_condition(int step,
|
||||
const SDCondition& condition) const override {
|
||||
if (!id_condition.empty() && start_merge_step != -1 && step > start_merge_step) {
|
||||
return id_condition;
|
||||
}
|
||||
return condition;
|
||||
}
|
||||
};
|
||||
|
||||
// Factory: returns a new PhotoMakerExtension behind the GenerationExtension interface.
// The concrete type is only visible inside this translation unit.
std::shared_ptr<GenerationExtension> create_photomaker_extension() {
|
||||
return std::make_shared<PhotoMakerExtension>();
|
||||
}
|
||||
@ -1,6 +1,7 @@
|
||||
#include <algorithm>
|
||||
#include <cmath>
|
||||
#include <cstdlib>
|
||||
#include <set>
|
||||
|
||||
#include "core/ggml_extend.hpp"
|
||||
#include "core/ggml_graph_cut.h"
|
||||
@ -13,8 +14,8 @@
|
||||
#include "stable-diffusion.h"
|
||||
|
||||
#include "conditioning/conditioner.hpp"
|
||||
#include "extensions/generation_extension.h"
|
||||
#include "model/adapter/lora.hpp"
|
||||
#include "model/adapter/pmid.hpp"
|
||||
#include "model/diffusion/anima.hpp"
|
||||
#include "model/diffusion/control.hpp"
|
||||
#include "model/diffusion/ernie_image.hpp"
|
||||
@ -180,9 +181,7 @@ public:
|
||||
std::shared_ptr<VAE> preview_vae;
|
||||
std::shared_ptr<LTXV::LTXAudioVAERunner> audio_vae_model;
|
||||
std::shared_ptr<ControlNet> control_net;
|
||||
std::shared_ptr<PhotoMakerIDEncoder> pmid_model;
|
||||
std::shared_ptr<LoraModel> pmid_lora;
|
||||
std::shared_ptr<PhotoMakerIDEmbed> pmid_id_embeds;
|
||||
std::vector<std::shared_ptr<GenerationExtension>> generation_extensions;
|
||||
std::vector<std::shared_ptr<LoraModel>> cond_stage_lora_models;
|
||||
std::vector<std::shared_ptr<LoraModel>> diffusion_lora_models;
|
||||
std::vector<std::shared_ptr<LoraModel>> first_stage_lora_models;
|
||||
@ -193,7 +192,6 @@ public:
|
||||
bool offload_params_to_cpu = false;
|
||||
float max_vram = 0.f;
|
||||
bool stream_layers = false;
|
||||
bool use_pmid = false;
|
||||
std::string backend_spec;
|
||||
std::string params_backend_spec;
|
||||
|
||||
@ -743,21 +741,12 @@ public:
|
||||
for (uint32_t i = 0; i < sd_ctx_params->embedding_count; i++) {
|
||||
embbeding_map.emplace(SAFE_STR(sd_ctx_params->embeddings[i].name), SAFE_STR(sd_ctx_params->embeddings[i].path));
|
||||
}
|
||||
if (strstr(SAFE_STR(sd_ctx_params->photo_maker_path), "v2")) {
|
||||
cond_stage_model = std::make_shared<FrozenCLIPEmbedderWithCustomWords>(backend_for(SDBackendModule::TE),
|
||||
params_backend_for(SDBackendModule::TE),
|
||||
tensor_storage_map,
|
||||
embbeding_map,
|
||||
version,
|
||||
PM_VERSION_2);
|
||||
} else {
|
||||
cond_stage_model = std::make_shared<FrozenCLIPEmbedderWithCustomWords>(backend_for(SDBackendModule::TE),
|
||||
params_backend_for(SDBackendModule::TE),
|
||||
tensor_storage_map,
|
||||
embbeding_map,
|
||||
version);
|
||||
}
|
||||
diffusion_model = std::make_shared<UNetModelRunner>(backend_for(SDBackendModule::DIFFUSION),
|
||||
cond_stage_model = std::make_shared<FrozenCLIPEmbedderWithCustomWords>(backend_for(SDBackendModule::TE),
|
||||
params_backend_for(SDBackendModule::TE),
|
||||
tensor_storage_map,
|
||||
embbeding_map,
|
||||
version);
|
||||
diffusion_model = std::make_shared<UNetModelRunner>(backend_for(SDBackendModule::DIFFUSION),
|
||||
params_backend_for(SDBackendModule::DIFFUSION),
|
||||
tensor_storage_map,
|
||||
"model.diffusion_model",
|
||||
@ -914,50 +903,35 @@ public:
|
||||
}
|
||||
}
|
||||
|
||||
if (strlen(SAFE_STR(sd_ctx_params->photo_maker_path)) > 0) {
|
||||
if (!ensure_backend_pair(SDBackendModule::PHOTOMAKER)) {
|
||||
return false;
|
||||
}
|
||||
if (strstr(SAFE_STR(sd_ctx_params->photo_maker_path), "v2")) {
|
||||
pmid_model = std::make_shared<PhotoMakerIDEncoder>(backend_for(SDBackendModule::PHOTOMAKER),
|
||||
params_backend_for(SDBackendModule::PHOTOMAKER),
|
||||
tensor_storage_map,
|
||||
"pmid",
|
||||
version,
|
||||
PM_VERSION_2);
|
||||
LOG_INFO("using PhotoMaker Version 2");
|
||||
} else {
|
||||
pmid_model = std::make_shared<PhotoMakerIDEncoder>(backend_for(SDBackendModule::PHOTOMAKER),
|
||||
params_backend_for(SDBackendModule::PHOTOMAKER),
|
||||
tensor_storage_map,
|
||||
"pmid",
|
||||
version);
|
||||
}
|
||||
pmid_lora = std::make_shared<LoraModel>("pmid",
|
||||
backend_for(SDBackendModule::PHOTOMAKER),
|
||||
params_backend_for(SDBackendModule::PHOTOMAKER),
|
||||
sd_ctx_params->photo_maker_path,
|
||||
"",
|
||||
version);
|
||||
auto lora_tensor_filter = [&](const std::string& tensor_name) {
|
||||
if (starts_with(tensor_name, "lora.model")) {
|
||||
return true;
|
||||
}
|
||||
return false;
|
||||
{
|
||||
generation_extensions.clear();
|
||||
auto photomaker_extension = create_photomaker_extension();
|
||||
GenerationExtensionInitContext extension_ctx{
|
||||
sd_ctx_params,
|
||||
version,
|
||||
tensor_storage_map,
|
||||
model_loader,
|
||||
n_threads,
|
||||
[this](SDBackendModule module) { return ensure_backend_pair(module); },
|
||||
[this](SDBackendModule module) { return backend_for(module); },
|
||||
[this](SDBackendModule module) { return params_backend_for(module); },
|
||||
};
|
||||
if (!pmid_lora->load_from_file(n_threads, lora_tensor_filter)) {
|
||||
LOG_WARN("load photomaker lora tensors from %s failed", sd_ctx_params->photo_maker_path);
|
||||
if (!photomaker_extension->init(extension_ctx)) {
|
||||
return false;
|
||||
}
|
||||
LOG_INFO("loading stacked ID embedding (PHOTOMAKER) model file from '%s'", sd_ctx_params->photo_maker_path);
|
||||
if (!model_loader.init_from_file_and_convert_name(sd_ctx_params->photo_maker_path, "pmid.")) {
|
||||
LOG_WARN("loading stacked ID embedding from '%s' failed", sd_ctx_params->photo_maker_path);
|
||||
} else {
|
||||
use_pmid = true;
|
||||
if (photomaker_extension->is_enabled()) {
|
||||
generation_extensions.push_back(photomaker_extension);
|
||||
}
|
||||
}
|
||||
if (use_pmid) {
|
||||
get_param_tensors_p(pmid_model, module_can_mmap(SDBackendModule::PHOTOMAKER), "pmid");
|
||||
{
|
||||
GenerationExtensionTensorContext extension_tensor_ctx{
|
||||
tensors,
|
||||
mmap_able_tensors,
|
||||
module_can_mmap,
|
||||
};
|
||||
for (auto& extension : generation_extensions) {
|
||||
extension->collect_param_tensors(extension_tensor_ctx);
|
||||
}
|
||||
}
|
||||
|
||||
if (sd_ctx_params->flash_attn) {
|
||||
@ -1011,8 +985,8 @@ public:
|
||||
if (use_tae && !tae_preview_only) {
|
||||
ignore_tensors.insert("first_stage_model.");
|
||||
}
|
||||
if (use_pmid) {
|
||||
ignore_tensors.insert("pmid.unet.");
|
||||
for (auto& extension : generation_extensions) {
|
||||
extension->add_ignore_tensors(ignore_tensors);
|
||||
}
|
||||
ignore_tensors.insert("model.diffusion_model.__x0__");
|
||||
ignore_tensors.insert("model.diffusion_model.__32x32__");
|
||||
@ -1099,10 +1073,12 @@ public:
|
||||
ggml_free(ctx);
|
||||
return false;
|
||||
}
|
||||
if (use_pmid && pmid_model && !pmid_model->alloc_params_buffer()) {
|
||||
LOG_ERROR("PhotoMaker params buffer allocation failed");
|
||||
ggml_free(ctx);
|
||||
return false;
|
||||
for (auto& extension : generation_extensions) {
|
||||
if (!extension->alloc_params_buffer()) {
|
||||
LOG_ERROR("%s params buffer allocation failed", extension->name());
|
||||
ggml_free(ctx);
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
bool success = model_loader.load_tensors(tensors, ignore_tensors, n_threads, sd_ctx_params->enable_mmap);
|
||||
@ -1136,9 +1112,9 @@ public:
|
||||
}
|
||||
control_net_params_mem_size = control_net->get_params_buffer_size();
|
||||
}
|
||||
size_t pmid_params_mem_size = 0;
|
||||
if (use_pmid) {
|
||||
pmid_params_mem_size = pmid_model->get_params_buffer_size();
|
||||
size_t extension_params_mem_size = 0;
|
||||
for (auto& extension : generation_extensions) {
|
||||
extension_params_mem_size += extension->get_params_buffer_size();
|
||||
}
|
||||
|
||||
size_t total_params_ram_size = 0;
|
||||
@ -1170,7 +1146,7 @@ public:
|
||||
};
|
||||
|
||||
if (!add_params_memory(clip_params_mem_size, SDBackendModule::TE) ||
|
||||
!add_params_memory(pmid_params_mem_size, SDBackendModule::PHOTOMAKER) ||
|
||||
!add_params_memory(extension_params_mem_size, SDBackendModule::PHOTOMAKER) ||
|
||||
!add_params_memory(unet_params_mem_size, SDBackendModule::DIFFUSION) ||
|
||||
!add_params_memory(vae_params_mem_size, SDBackendModule::VAE) ||
|
||||
!add_params_memory(control_net_params_mem_size, SDBackendModule::CONTROL_NET)) {
|
||||
@ -1181,7 +1157,7 @@ public:
|
||||
size_t total_params_size = total_params_ram_size + total_params_vram_size;
|
||||
LOG_INFO(
|
||||
"total params memory size = %.2fMB (VRAM %.2fMB, RAM %.2fMB): "
|
||||
"text_encoders %.2fMB(%s), diffusion_model %.2fMB(%s), vae %.2fMB(%s), controlnet %.2fMB(%s), pmid %.2fMB(%s)",
|
||||
"text_encoders %.2fMB(%s), diffusion_model %.2fMB(%s), vae %.2fMB(%s), controlnet %.2fMB(%s), extensions %.2fMB(%s)",
|
||||
total_params_size / 1024.0 / 1024.0,
|
||||
total_params_vram_size / 1024.0 / 1024.0,
|
||||
total_params_ram_size / 1024.0 / 1024.0,
|
||||
@ -1193,8 +1169,8 @@ public:
|
||||
params_memory_location(vae_params_mem_size, SDBackendModule::VAE),
|
||||
control_net_params_mem_size / 1024.0 / 1024.0,
|
||||
params_memory_location(control_net_params_mem_size, SDBackendModule::CONTROL_NET),
|
||||
pmid_params_mem_size / 1024.0 / 1024.0,
|
||||
params_memory_location(pmid_params_mem_size, SDBackendModule::PHOTOMAKER));
|
||||
extension_params_mem_size / 1024.0 / 1024.0,
|
||||
params_memory_location(extension_params_mem_size, SDBackendModule::PHOTOMAKER));
|
||||
}
|
||||
|
||||
// init denoiser
|
||||
@ -1599,88 +1575,30 @@ public:
|
||||
}
|
||||
}
|
||||
|
||||
SDCondition get_pmid_conditon(sd_pm_params_t pm_params,
|
||||
ConditionerParams& condition_params) {
|
||||
SDCondition id_cond;
|
||||
if (use_pmid) {
|
||||
if (!pmid_lora->applied) {
|
||||
int64_t t0 = ggml_time_ms();
|
||||
pmid_lora->apply(tensors, version, n_threads);
|
||||
int64_t t1 = ggml_time_ms();
|
||||
pmid_lora->applied = true;
|
||||
LOG_INFO("pmid_lora apply completed, taking %.2fs", (t1 - t0) * 1.0f / 1000);
|
||||
if (free_params_immediately) {
|
||||
pmid_lora->free_params_buffer();
|
||||
}
|
||||
}
|
||||
// preprocess input id images
|
||||
bool pmv2 = pmid_model->get_version() == PM_VERSION_2;
|
||||
if (pm_params.id_images_count > 0) {
|
||||
int clip_image_size = 224;
|
||||
pmid_model->style_strength = pm_params.style_strength;
|
||||
sd::Tensor<float> id_image_tensor;
|
||||
for (int i = 0; i < pm_params.id_images_count; i++) {
|
||||
auto id_image = sd_image_to_tensor(pm_params.id_images[i]);
|
||||
auto processed_id_image = clip_preprocess(id_image, clip_image_size, clip_image_size);
|
||||
if (id_image_tensor.empty()) {
|
||||
id_image_tensor = processed_id_image;
|
||||
} else {
|
||||
id_image_tensor = sd::ops::concat(id_image_tensor, processed_id_image, 3);
|
||||
}
|
||||
}
|
||||
|
||||
int64_t t0 = ggml_time_ms();
|
||||
condition_params.num_input_imgs = pm_params.id_images_count;
|
||||
auto cond_tup = cond_stage_model->get_learned_condition_with_trigger(n_threads,
|
||||
condition_params);
|
||||
id_cond = std::get<0>(cond_tup);
|
||||
auto class_tokens_mask = std::get<1>(cond_tup);
|
||||
sd::Tensor<float> id_embeds;
|
||||
if (pmv2 && pm_params.id_embed_path != nullptr) {
|
||||
try {
|
||||
id_embeds = sd::load_tensor_from_file_as_tensor<float>(pm_params.id_embed_path);
|
||||
} catch (const std::exception&) {
|
||||
id_embeds = {};
|
||||
}
|
||||
}
|
||||
if (pmv2 && id_embeds.empty()) {
|
||||
LOG_WARN("Provided PhotoMaker images, but NO valid ID embeds file for PM v2");
|
||||
LOG_WARN("Turn off PhotoMaker");
|
||||
use_pmid = false;
|
||||
} else {
|
||||
if (pmv2 && pm_params.id_images_count != id_embeds.shape()[1]) {
|
||||
LOG_WARN("PhotoMaker image count (%d) does NOT match ID embeds (%d). You should run face_detect.py again.", pm_params.id_images_count, static_cast<int>(id_embeds.shape()[1]));
|
||||
LOG_WARN("Turn off PhotoMaker");
|
||||
use_pmid = false;
|
||||
} else {
|
||||
auto res = pmid_model->compute(n_threads,
|
||||
id_image_tensor,
|
||||
id_cond.c_crossattn,
|
||||
id_embeds,
|
||||
class_tokens_mask);
|
||||
if (res.empty()) {
|
||||
LOG_ERROR("Photomaker ID Stacking failed");
|
||||
LOG_WARN("Turn off PhotoMaker");
|
||||
use_pmid = false;
|
||||
} else {
|
||||
id_cond.c_crossattn = std::move(res);
|
||||
int64_t t1 = ggml_time_ms();
|
||||
LOG_INFO("Photomaker ID Stacking, taking %" PRId64 " ms", t1 - t0);
|
||||
// Encode input prompt without the trigger word for delayed conditioning
|
||||
condition_params.text = cond_stage_model->remove_trigger_from_prompt(condition_params.text);
|
||||
}
|
||||
if (free_params_immediately) {
|
||||
pmid_model->free_params_buffer();
|
||||
}
|
||||
}
|
||||
}
|
||||
} else {
|
||||
LOG_WARN("Provided PhotoMaker model file, but NO input ID images");
|
||||
LOG_WARN("Turn off PhotoMaker");
|
||||
use_pmid = false;
|
||||
}
|
||||
void reset_generation_extensions() {
|
||||
for (auto& extension : generation_extensions) {
|
||||
extension->reset_runtime_condition();
|
||||
}
|
||||
}
|
||||
|
||||
void prepare_generation_extensions(const sd_pm_params_t& pm_params,
|
||||
ConditionerParams& condition_params,
|
||||
int total_steps) {
|
||||
reset_generation_extensions();
|
||||
GenerationExtensionConditionContext ctx{
|
||||
cond_stage_model.get(),
|
||||
condition_params,
|
||||
pm_params,
|
||||
tensors,
|
||||
version,
|
||||
n_threads,
|
||||
total_steps,
|
||||
free_params_immediately,
|
||||
};
|
||||
|
||||
for (auto& extension : generation_extensions) {
|
||||
extension->prepare_condition(ctx);
|
||||
}
|
||||
return id_cond;
|
||||
}
|
||||
|
||||
sd::Tensor<float> get_clip_vision_output(const sd::Tensor<float>& image,
|
||||
@ -1979,7 +1897,6 @@ public:
|
||||
const SDCondition& cond,
|
||||
const SDCondition& uncond,
|
||||
const SDCondition& img_uncond,
|
||||
const SDCondition& id_cond,
|
||||
const sd::Tensor<float>& control_image,
|
||||
float control_strength,
|
||||
const sd_guidance_params_t& guidance,
|
||||
@ -1989,7 +1906,6 @@ public:
|
||||
bool is_flow_denoiser,
|
||||
const char* extra_sample_args,
|
||||
const std::vector<float>& sigmas,
|
||||
int start_merge_step,
|
||||
const std::vector<sd::Tensor<float>>& ref_latents,
|
||||
bool increase_ref_index,
|
||||
const sd::Tensor<float>& denoise_mask,
|
||||
@ -2181,20 +2097,24 @@ public:
|
||||
return output_opt;
|
||||
};
|
||||
|
||||
if (start_merge_step == -1 || step <= start_merge_step) {
|
||||
cond_out = run_condition(cond);
|
||||
if (cond_out.empty()) {
|
||||
return {};
|
||||
}
|
||||
} else {
|
||||
GGML_ASSERT(!id_cond.empty());
|
||||
cond_out = run_condition(id_cond,
|
||||
cond.c_concat.empty() ? nullptr : &cond.c_concat);
|
||||
if (cond_out.empty()) {
|
||||
return {};
|
||||
const SDCondition* positive_condition = &cond;
|
||||
const sd::Tensor<float>* c_concat_override = nullptr;
|
||||
for (const auto& extension : generation_extensions) {
|
||||
const SDCondition& next_condition = extension->before_condition(step, *positive_condition);
|
||||
if (&next_condition != positive_condition) {
|
||||
positive_condition = &next_condition;
|
||||
if (positive_condition != &cond) {
|
||||
c_concat_override = cond.c_concat.empty() ? nullptr : &cond.c_concat;
|
||||
}
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
cond_out = run_condition(*positive_condition, c_concat_override);
|
||||
if (cond_out.empty()) {
|
||||
return {};
|
||||
}
|
||||
|
||||
if (!uncond.empty()) {
|
||||
if (!step_cache.is_step_skipped()) {
|
||||
compute_sample_controls(control_image,
|
||||
@ -3470,7 +3390,6 @@ struct SamplePlan {
|
||||
int high_noise_sample_steps = 0;
|
||||
int total_steps = 0;
|
||||
float moe_boundary = 0.f;
|
||||
int start_merge_step = -1;
|
||||
std::vector<float> sigmas;
|
||||
|
||||
SamplePlan(sd_ctx_t* sd_ctx,
|
||||
@ -3555,11 +3474,6 @@ struct SamplePlan {
|
||||
high_noise_eta = resolve_eta(sd_ctx, high_noise_eta, high_noise_sample_method);
|
||||
LOG_INFO("sampling(high noise) using %s method", sampling_methods_str[high_noise_sample_method]);
|
||||
}
|
||||
|
||||
if (sd_ctx->sd->use_pmid) {
|
||||
start_merge_step = int(sd_ctx->sd->pmid_model->style_strength / 100.f * total_steps);
|
||||
LOG_INFO("PHOTOMAKER: start_merge_step: %d", start_merge_step);
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
@ -3890,7 +3804,6 @@ struct ImageGenerationEmbeds {
|
||||
SDCondition cond;
|
||||
SDCondition uncond;
|
||||
SDCondition img_uncond;
|
||||
SDCondition id_cond;
|
||||
};
|
||||
|
||||
struct CircularAxesState {
|
||||
@ -4195,7 +4108,9 @@ static std::optional<ImageGenerationEmbeds> prepare_image_generation_embeds(sd_c
|
||||
condition_params.height = request->height;
|
||||
condition_params.ref_images = &latents->ref_images;
|
||||
|
||||
auto id_cond = sd_ctx->sd->get_pmid_conditon(request->pm_params, condition_params);
|
||||
sd_ctx->sd->prepare_generation_extensions(request->pm_params,
|
||||
condition_params,
|
||||
plan->total_steps);
|
||||
int64_t prepare_start_ms = ggml_time_ms();
|
||||
condition_params.zero_out_masked = false;
|
||||
auto cond = sd_ctx->sd->cond_stage_model->get_learned_condition(sd_ctx->sd->n_threads,
|
||||
@ -4265,7 +4180,6 @@ static std::optional<ImageGenerationEmbeds> prepare_image_generation_embeds(sd_c
|
||||
embeds.img_uncond = std::move(img_uncond);
|
||||
embeds.cond = std::move(cond);
|
||||
embeds.uncond = std::move(uncond);
|
||||
embeds.id_cond = std::move(id_cond);
|
||||
|
||||
return embeds;
|
||||
}
|
||||
@ -4546,7 +4460,6 @@ SD_API sd_image_t* generate_image(sd_ctx_t* sd_ctx, const sd_img_gen_params_t* s
|
||||
embeds.cond,
|
||||
embeds.uncond,
|
||||
embeds.img_uncond,
|
||||
embeds.id_cond,
|
||||
latents.control_image,
|
||||
request.control_strength,
|
||||
request.guidance,
|
||||
@ -4556,7 +4469,6 @@ SD_API sd_image_t* generate_image(sd_ctx_t* sd_ctx, const sd_img_gen_params_t* s
|
||||
sd_ctx->sd->is_flow_denoiser(),
|
||||
plan.extra_sample_args,
|
||||
plan.sigmas,
|
||||
plan.start_merge_step,
|
||||
latents.ref_latents,
|
||||
request.increase_ref_index,
|
||||
latents.denoise_mask,
|
||||
@ -4666,7 +4578,6 @@ SD_API sd_image_t* generate_image(sd_ctx_t* sd_ctx, const sd_img_gen_params_t* s
|
||||
embeds.cond,
|
||||
embeds.uncond,
|
||||
embeds.img_uncond,
|
||||
embeds.id_cond,
|
||||
latents.control_image,
|
||||
request.control_strength,
|
||||
request.guidance,
|
||||
@ -4676,7 +4587,6 @@ SD_API sd_image_t* generate_image(sd_ctx_t* sd_ctx, const sd_img_gen_params_t* s
|
||||
sd_ctx->sd->is_flow_denoiser(),
|
||||
plan.extra_sample_args,
|
||||
hires_sigma_sched,
|
||||
plan.start_merge_step,
|
||||
latents.ref_latents,
|
||||
request.increase_ref_index,
|
||||
hires_denoise_mask,
|
||||
@ -5335,6 +5245,7 @@ SD_API bool generate_video(sd_ctx_t* sd_ctx,
|
||||
sd_ctx->sd->sampler_rng->manual_seed(request.seed);
|
||||
sd_ctx->sd->set_flow_shift(sd_vid_gen_params->sample_params.flow_shift);
|
||||
sd_ctx->sd->apply_loras(sd_vid_gen_params->loras, sd_vid_gen_params->lora_count);
|
||||
sd_ctx->sd->reset_generation_extensions();
|
||||
|
||||
SamplePlan plan(sd_ctx, sd_vid_gen_params, request);
|
||||
auto latent_inputs_opt = prepare_video_generation_latents(sd_ctx, sd_vid_gen_params, &request);
|
||||
@ -5381,7 +5292,6 @@ SD_API bool generate_video(sd_ctx_t* sd_ctx,
|
||||
embeds.cond,
|
||||
request.use_high_noise_uncond ? embeds.uncond : SDCondition(),
|
||||
embeds.img_uncond,
|
||||
embeds.id_cond,
|
||||
sd::Tensor<float>(),
|
||||
0.f,
|
||||
request.high_noise_guidance,
|
||||
@ -5391,7 +5301,6 @@ SD_API bool generate_video(sd_ctx_t* sd_ctx,
|
||||
sd_ctx->sd->is_flow_denoiser(),
|
||||
plan.high_noise_extra_sample_args,
|
||||
high_noise_sigmas,
|
||||
-1,
|
||||
std::vector<sd::Tensor<float>>{},
|
||||
false,
|
||||
latents.denoise_mask,
|
||||
@ -5427,7 +5336,6 @@ SD_API bool generate_video(sd_ctx_t* sd_ctx,
|
||||
embeds.cond,
|
||||
request.use_uncond ? embeds.uncond : SDCondition(),
|
||||
embeds.img_uncond,
|
||||
embeds.id_cond,
|
||||
sd::Tensor<float>(),
|
||||
0.f,
|
||||
sd_vid_gen_params->sample_params.guidance,
|
||||
@ -5437,7 +5345,6 @@ SD_API bool generate_video(sd_ctx_t* sd_ctx,
|
||||
sd_ctx->sd->is_flow_denoiser(),
|
||||
plan.extra_sample_args,
|
||||
plan.sigmas,
|
||||
-1,
|
||||
std::vector<sd::Tensor<float>>{},
|
||||
false,
|
||||
latents.denoise_mask,
|
||||
@ -5571,7 +5478,6 @@ SD_API bool generate_video(sd_ctx_t* sd_ctx,
|
||||
embeds.cond,
|
||||
hires_request.use_uncond ? embeds.uncond : SDCondition(),
|
||||
embeds.img_uncond,
|
||||
embeds.id_cond,
|
||||
sd::Tensor<float>(),
|
||||
0.f,
|
||||
sd_vid_gen_params->sample_params.guidance,
|
||||
@ -5581,7 +5487,6 @@ SD_API bool generate_video(sd_ctx_t* sd_ctx,
|
||||
sd_ctx->sd->is_flow_denoiser(),
|
||||
plan.extra_sample_args,
|
||||
hires_sigma_sched,
|
||||
-1,
|
||||
std::vector<sd::Tensor<float>>{},
|
||||
false,
|
||||
hires_denoise_mask,
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user