Mirror of https://github.com/leejet/stable-diffusion.cpp.git (synced 2025-12-13 05:48:56 +00:00)

Compare commits

No commits in common. "master" and "master-401-0392273" have entirely different histories: master ... master-401
@@ -87,38 +87,6 @@ file(GLOB SD_LIB_SOURCES
     "*.hpp"
 )
 
-find_program(GIT_EXE NAMES git git.exe NO_CMAKE_FIND_ROOT_PATH)
-if(GIT_EXE)
-    execute_process(COMMAND ${GIT_EXE} describe --tags --abbrev=7 --dirty=+
-                    WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}
-                    OUTPUT_VARIABLE SDCPP_BUILD_VERSION
-                    OUTPUT_STRIP_TRAILING_WHITESPACE
-                    ERROR_QUIET
-    )
-    execute_process(COMMAND ${GIT_EXE} rev-parse --short HEAD
-                    WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}
-                    OUTPUT_VARIABLE SDCPP_BUILD_COMMIT
-                    OUTPUT_STRIP_TRAILING_WHITESPACE
-                    ERROR_QUIET
-    )
-endif()
-
-if(NOT SDCPP_BUILD_VERSION)
-    set(SDCPP_BUILD_VERSION unknown)
-endif()
-message(STATUS "stable-diffusion.cpp version ${SDCPP_BUILD_VERSION}")
-
-if(NOT SDCPP_BUILD_COMMIT)
-    set(SDCPP_BUILD_COMMIT unknown)
-endif()
-message(STATUS "stable-diffusion.cpp commit ${SDCPP_BUILD_COMMIT}")
-
-set_property(
-    SOURCE ${CMAKE_CURRENT_SOURCE_DIR}/version.cpp
-    APPEND PROPERTY COMPILE_DEFINITIONS
-    SDCPP_BUILD_COMMIT=${SDCPP_BUILD_COMMIT} SDCPP_BUILD_VERSION=${SDCPP_BUILD_VERSION}
-)
-
 if(SD_BUILD_SHARED_LIBS)
     message("-- Build shared library")
     message(${SD_LIB_SOURCES})
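For orientation (an illustrative sketch only; version.cpp itself is not shown in this compare): a translation unit receiving the SDCPP_BUILD_VERSION / SDCPP_BUILD_COMMIT compile definitions set above would normally have to stringify them before use, because the values are passed unquoted on the compiler command line, e.g.

    // hypothetical version.cpp consuming the COMPILE_DEFINITIONS above
    #define SD_STR_IMPL(x) #x
    #define SD_STR(x) SD_STR_IMPL(x)
    const char* sd_version() { return SD_STR(SDCPP_BUILD_VERSION); }
    const char* sd_commit()  { return SD_STR(SDCPP_BUILD_COMMIT); }

The sd_version()/sd_commit() helpers referenced later in this compare (see version_string() in the CLI diff) would then report whatever git describe and git rev-parse detected at configure time.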
@@ -105,7 +105,7 @@ API and command-line option may change frequently.***
 ### Download model weights
 
 - download weights(.ckpt or .safetensors or .gguf). For example
-    - Stable Diffusion v1.5 from https://huggingface.co/stable-diffusion-v1-5/stable-diffusion-v1-5
+    - Stable Diffusion v1.5 from https://huggingface.co/runwayml/stable-diffusion-v1-5
 
     ```sh
     curl -L -O https://huggingface.co/runwayml/stable-diffusion-v1-5/resolve/main/v1-5-pruned-emaonly.safetensors
clip.hpp
@@ -3,10 +3,34 @@
 
 #include "ggml_extend.hpp"
 #include "model.h"
-#include "tokenize_util.h"
 
 /*================================================== CLIPTokenizer ===================================================*/
 
+__STATIC_INLINE__ std::pair<std::unordered_map<std::string, float>, std::string> extract_and_remove_lora(std::string text) {
+    std::regex re("<lora:([^:]+):([^>]+)>");
+    std::smatch matches;
+    std::unordered_map<std::string, float> filename2multiplier;
+
+    while (std::regex_search(text, matches, re)) {
+        std::string filename = matches[1].str();
+        float multiplier = std::stof(matches[2].str());
+
+        text = std::regex_replace(text, re, "", std::regex_constants::format_first_only);
+
+        if (multiplier == 0.f) {
+            continue;
+        }
+
+        if (filename2multiplier.find(filename) == filename2multiplier.end()) {
+            filename2multiplier[filename] = multiplier;
+        } else {
+            filename2multiplier[filename] += multiplier;
+        }
+    }
+
+    return std::make_pair(filename2multiplier, text);
+}
+
 __STATIC_INLINE__ std::vector<std::pair<int, std::u32string>> bytes_to_unicode() {
     std::vector<std::pair<int, std::u32string>> byte_unicode_pairs;
     std::set<int> byte_set;
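Illustrative usage of the <lora:...> helper above (a hypothetical call, not part of the diff):

    auto res = extract_and_remove_lora("a photo of a cat <lora:foo:0.8>");
    // res.first  == { {"foo", 0.8f} }    (filename -> accumulated multiplier)
    // res.second == "a photo of a cat "  (prompt with the tag stripped)

Repeated tags for the same file accumulate their multipliers, and a tag with multiplier 0 is stripped from the prompt without being recorded.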
@@ -48,8 +72,6 @@ private:
     int encoder_len;
     int bpe_len;
 
-    std::vector<std::string> special_tokens;
-
 public:
     const std::string UNK_TOKEN = "<|endoftext|>";
     const std::string BOS_TOKEN = "<|startoftext|>";
@@ -95,15 +117,6 @@ private:
         return pairs;
     }
 
-    bool is_special_token(const std::string& token) {
-        for (auto& special_token : special_tokens) {
-            if (special_token == token) {
-                return true;
-            }
-        }
-        return false;
-    }
-
 public:
     CLIPTokenizer(int pad_token_id = 49407, const std::string& merges_utf8_str = "")
         : PAD_TOKEN_ID(pad_token_id) {
@@ -112,8 +125,6 @@ public:
         } else {
             load_from_merges(ModelLoader::load_merges());
         }
-        add_special_token("<|startoftext|>");
-        add_special_token("<|endoftext|>");
     }
 
     void load_from_merges(const std::string& merges_utf8_str) {
@@ -190,10 +201,6 @@ public:
         }
     }
 
-    void add_special_token(const std::string& token) {
-        special_tokens.push_back(token);
-    }
-
     std::u32string bpe(const std::u32string& token) {
         std::vector<std::u32string> word;
 
@@ -372,54 +379,25 @@ public:
         return trim(text);
     }
 
-    std::vector<std::string> token_split(const std::string& text) {
-        std::regex pat(R"('s|'t|'re|'ve|'m|'ll|'d|[[:alpha:]]+|[[:digit:]]|[^[:space:][:alpha:][:digit:]]+)",
-                       std::regex::icase);
-        std::sregex_iterator iter(text.begin(), text.end(), pat);
-        std::sregex_iterator end;
-
-        std::vector<std::string> result;
-        for (; iter != end; ++iter) {
-            result.emplace_back(iter->str());
-        }
-
-        return result;
-    }
-
     std::vector<int> encode(std::string text, on_new_token_cb_t on_new_token_cb) {
         std::string original_text = text;
         std::vector<int32_t> bpe_tokens;
         text = whitespace_clean(text);
         std::transform(text.begin(), text.end(), text.begin(), [](unsigned char c) { return std::tolower(c); });
 
+        std::regex pat(R"(<\|startoftext\|>|<\|endoftext\|>|'s|'t|'re|'ve|'m|'ll|'d|[[:alpha:]]+|[[:digit:]]|[^[:space:][:alpha:][:digit:]]+)",
+                       std::regex::icase);
+
+        std::smatch matches;
         std::string str = text;
         std::vector<std::string> token_strs;
-        auto splited_texts = split_with_special_tokens(text, special_tokens);
-        for (auto& splited_text : splited_texts) {
-            LOG_DEBUG("token %s", splited_text.c_str());
-            if (is_special_token(splited_text)) {
-                LOG_DEBUG("special %s", splited_text.c_str());
-                bool skip = on_new_token_cb(splited_text, bpe_tokens);
-                if (skip) {
-                    token_strs.push_back(splited_text);
-                    continue;
-                }
+        while (std::regex_search(str, matches, pat)) {
+            bool skip = on_new_token_cb(str, bpe_tokens);
+            if (skip) {
                 continue;
             }
 
-            auto tokens = token_split(splited_text);
-            for (auto& token : tokens) {
-                if (on_new_token_cb != nullptr) {
-                    bool skip = on_new_token_cb(token, bpe_tokens);
-                    if (skip) {
-                        token_strs.push_back(token);
-                        continue;
-                    }
-                }
-
-                std::string token_str = token;
+            for (auto& token : matches) {
+                std::string token_str = token.str();
                 std::u32string utf32_token;
                 for (int i = 0; i < token_str.length(); i++) {
                     unsigned char b = token_str[i];
@@ -439,13 +417,14 @@ public:
                 bpe_tokens.push_back(encoder[bpe_str]);
                 token_strs.push_back(utf32_to_utf8(bpe_str));
             }
+            str = matches.suffix();
         }
-        // std::stringstream ss;
-        // ss << "[";
-        // for (auto token : token_strs) {
-        //     ss << "\"" << token << "\", ";
-        // }
-        // ss << "]";
+        std::stringstream ss;
+        ss << "[";
+        for (auto token : token_strs) {
+            ss << "\"" << token << "\", ";
+        }
+        ss << "]";
         // LOG_DEBUG("split prompt \"%s\" to tokens %s", original_text.c_str(), ss.str().c_str());
         // printf("split prompt \"%s\" to tokens %s \n", original_text.c_str(), ss.str().c_str());
         return bpe_tokens;
conditioner.hpp
@@ -56,26 +56,20 @@ struct FrozenCLIPEmbedderWithCustomWords : public Conditioner {
     std::shared_ptr<CLIPTextModelRunner> text_model2;
 
     std::string trigger_word = "img"; // should be user settable
-    std::map<std::string, std::string> embedding_map;
+    std::string embd_dir;
     int32_t num_custom_embeddings = 0;
     int32_t num_custom_embeddings_2 = 0;
     std::vector<uint8_t> token_embed_custom;
-    std::map<std::string, std::pair<int, int>> embedding_pos_map;
+    std::vector<std::string> readed_embeddings;
 
     FrozenCLIPEmbedderWithCustomWords(ggml_backend_t backend,
                                       bool offload_params_to_cpu,
                                       const String2TensorStorage& tensor_storage_map,
-                                      const std::map<std::string, std::string>& orig_embedding_map,
+                                      const std::string& embd_dir,
                                       SDVersion version = VERSION_SD1,
                                       PMVersion pv = PM_VERSION_1)
-        : version(version), pm_version(pv), tokenizer(sd_version_is_sd2(version) ? 0 : 49407) {
-        for (const auto& kv : orig_embedding_map) {
-            std::string name = kv.first;
-            std::transform(name.begin(), name.end(), name.begin(), [](unsigned char c) { return std::tolower(c); });
-            embedding_map[name] = kv.second;
-            tokenizer.add_special_token(name);
-        }
-        bool force_clip_f32 = !embedding_map.empty();
+        : version(version), pm_version(pv), tokenizer(sd_version_is_sd2(version) ? 0 : 49407), embd_dir(embd_dir) {
+        bool force_clip_f32 = embd_dir.size() > 0;
         if (sd_version_is_sd1(version)) {
             text_model = std::make_shared<CLIPTextModelRunner>(backend, offload_params_to_cpu, tensor_storage_map, "cond_stage_model.transformer.text_model", OPENAI_CLIP_VIT_L_14, true, force_clip_f32);
         } else if (sd_version_is_sd2(version)) {
@@ -123,17 +117,14 @@ struct FrozenCLIPEmbedderWithCustomWords : public Conditioner {
     }
 
     bool load_embedding(std::string embd_name, std::string embd_path, std::vector<int32_t>& bpe_tokens) {
+        // the order matters
         ModelLoader model_loader;
         if (!model_loader.init_from_file_and_convert_name(embd_path)) {
             LOG_ERROR("embedding '%s' failed", embd_name.c_str());
             return false;
         }
-        auto iter = embedding_pos_map.find(embd_name);
-        if (iter != embedding_pos_map.end()) {
+        if (std::find(readed_embeddings.begin(), readed_embeddings.end(), embd_name) != readed_embeddings.end()) {
             LOG_DEBUG("embedding already read in: %s", embd_name.c_str());
-            for (int i = iter->second.first; i < iter->second.second; i++) {
-                bpe_tokens.push_back(text_model->model.vocab_size + i);
-            }
             return true;
         }
         struct ggml_init_params params;
@@ -164,7 +155,7 @@ struct FrozenCLIPEmbedderWithCustomWords : public Conditioner {
             return true;
         };
         model_loader.load_tensors(on_load, 1);
-        int pos_start = num_custom_embeddings;
+        readed_embeddings.push_back(embd_name);
         if (embd) {
             int64_t hidden_size = text_model->model.hidden_size;
             token_embed_custom.resize(token_embed_custom.size() + ggml_nbytes(embd));
@@ -191,11 +182,6 @@ struct FrozenCLIPEmbedderWithCustomWords : public Conditioner {
             }
             LOG_DEBUG("embedding '%s' applied, custom embeddings: %i (text model 2)", embd_name.c_str(), num_custom_embeddings_2);
         }
-        int pos_end = num_custom_embeddings;
-        if (pos_end == pos_start) {
-            return false;
-        }
-        embedding_pos_map[embd_name] = std::pair{pos_start, pos_end};
         return true;
     }
 
@@ -210,13 +196,25 @@ struct FrozenCLIPEmbedderWithCustomWords : public Conditioner {
 
     std::vector<int> convert_token_to_id(std::string text) {
         auto on_new_token_cb = [&](std::string& str, std::vector<int32_t>& bpe_tokens) -> bool {
-            auto iter = embedding_map.find(str);
-            if (iter == embedding_map.end()) {
-                return false;
+            size_t word_end = str.find(",");
+            std::string embd_name = word_end == std::string::npos ? str : str.substr(0, word_end);
+            embd_name = trim(embd_name);
+            std::string embd_path = get_full_path(embd_dir, embd_name + ".pt");
+            if (embd_path.size() == 0) {
+                embd_path = get_full_path(embd_dir, embd_name + ".ckpt");
             }
-            std::string embedding_path = iter->second;
-            if (load_embedding(str, embedding_path, bpe_tokens)) {
-                return true;
+            if (embd_path.size() == 0) {
+                embd_path = get_full_path(embd_dir, embd_name + ".safetensors");
+            }
+            if (embd_path.size() > 0) {
+                if (load_embedding(embd_name, embd_path, bpe_tokens)) {
+                    if (word_end != std::string::npos) {
+                        str = str.substr(word_end);
+                    } else {
+                        str = "";
+                    }
+                    return true;
+                }
             }
             return false;
         };
@@ -247,13 +245,25 @@ struct FrozenCLIPEmbedderWithCustomWords : public Conditioner {
         }
 
         auto on_new_token_cb = [&](std::string& str, std::vector<int32_t>& bpe_tokens) -> bool {
-            auto iter = embedding_map.find(str);
-            if (iter == embedding_map.end()) {
-                return false;
+            size_t word_end = str.find(",");
+            std::string embd_name = word_end == std::string::npos ? str : str.substr(0, word_end);
+            embd_name = trim(embd_name);
+            std::string embd_path = get_full_path(embd_dir, embd_name + ".pt");
+            if (embd_path.size() == 0) {
+                embd_path = get_full_path(embd_dir, embd_name + ".ckpt");
             }
-            std::string embedding_path = iter->second;
-            if (load_embedding(str, embedding_path, bpe_tokens)) {
-                return true;
+            if (embd_path.size() == 0) {
+                embd_path = get_full_path(embd_dir, embd_name + ".safetensors");
+            }
+            if (embd_path.size() > 0) {
+                if (load_embedding(embd_name, embd_path, bpe_tokens)) {
+                    if (word_end != std::string::npos) {
+                        str = str.substr(word_end);
+                    } else {
+                        str = "";
+                    }
+                    return true;
+                }
             }
             return false;
         };
@@ -366,13 +376,25 @@ struct FrozenCLIPEmbedderWithCustomWords : public Conditioner {
         }
 
         auto on_new_token_cb = [&](std::string& str, std::vector<int32_t>& bpe_tokens) -> bool {
-            auto iter = embedding_map.find(str);
-            if (iter == embedding_map.end()) {
-                return false;
+            size_t word_end = str.find(",");
+            std::string embd_name = word_end == std::string::npos ? str : str.substr(0, word_end);
+            embd_name = trim(embd_name);
+            std::string embd_path = get_full_path(embd_dir, embd_name + ".pt");
+            if (embd_path.size() == 0) {
+                embd_path = get_full_path(embd_dir, embd_name + ".ckpt");
            }
-            std::string embedding_path = iter->second;
-            if (load_embedding(str, embedding_path, bpe_tokens)) {
-                return true;
+            if (embd_path.size() == 0) {
+                embd_path = get_full_path(embd_dir, embd_name + ".safetensors");
+            }
+            if (embd_path.size() > 0) {
+                if (load_embedding(embd_name, embd_path, bpe_tokens)) {
+                    if (word_end != std::string::npos) {
+                        str = str.substr(word_end);
+                    } else {
+                        str = "";
+                    }
+                    return true;
+                }
             }
             return false;
         };
@@ -1706,7 +1728,7 @@ struct LLMEmbedder : public Conditioner {
         std::vector<std::pair<int, ggml_tensor*>> image_embeds;
         std::pair<int, int> prompt_attn_range;
         int prompt_template_encode_start_idx = 34;
         int max_length = 0;
         std::set<int> out_layers;
         if (llm->enable_vision && conditioner_params.ref_images.size() > 0) {
             LOG_INFO("QwenImageEditPlusPipeline");
@@ -1806,7 +1828,7 @@ struct LLMEmbedder : public Conditioner {
             prompt += "[/INST]";
         } else if (version == VERSION_OVIS_IMAGE) {
             prompt_template_encode_start_idx = 28;
             max_length = prompt_template_encode_start_idx + 256;
 
             prompt = "<|im_start|>user\nDescribe the image by detailing the color, quantity, text, shape, size, texture, spatial relationships of the objects and background:";
 
@@ -156,10 +156,9 @@ struct ESRGAN : public GGMLRunner {
 
     ESRGAN(ggml_backend_t backend,
            bool offload_params_to_cpu,
-           int tile_size = 128,
            const String2TensorStorage& tensor_storage_map = {})
         : GGMLRunner(backend, offload_params_to_cpu) {
-        this->tile_size = tile_size;
+        // rrdb_net will be created in load_from_file
     }
 
     std::string get_desc() override {
@@ -324,7 +324,6 @@ struct SDCliParams {
     std::string output_path = "output.png";
 
     bool verbose = false;
-    bool version = false;
     bool canny_preprocess = false;
 
     preview_t preview_method = PREVIEW_NONE;
@@ -367,10 +366,6 @@ struct SDCliParams {
             "--verbose",
             "print extra info",
             true, &verbose},
-            {"",
-            "--version",
-            "print stable-diffusion.cpp version",
-            true, &version},
             {"",
             "--color",
             "colors the logging tags according to level",
@@ -506,9 +501,6 @@ struct SDContextParams {
     std::string tensor_type_rules;
     std::string lora_model_dir;
 
-    std::map<std::string, std::string> embedding_map;
-    std::vector<sd_embedding_t> embedding_vec;
-
     rng_type_t rng_type = CUDA_RNG;
     rng_type_t sampler_rng_type = RNG_TYPE_COUNT;
     bool offload_params_to_cpu = false;
@@ -836,37 +828,6 @@ struct SDContextParams {
         return options;
     }
 
-    void build_embedding_map() {
-        static const std::vector<std::string> valid_ext = {".pt", ".safetensors", ".gguf"};
-
-        if (!fs::exists(embedding_dir) || !fs::is_directory(embedding_dir)) {
-            return;
-        }
-
-        for (auto& p : fs::directory_iterator(embedding_dir)) {
-            if (!p.is_regular_file())
-                continue;
-
-            auto path = p.path();
-            std::string ext = path.extension().string();
-
-            bool valid = false;
-            for (auto& e : valid_ext) {
-                if (ext == e) {
-                    valid = true;
-                    break;
-                }
-            }
-            if (!valid)
-                continue;
-
-            std::string key = path.stem().string();
-            std::string value = path.string();
-
-            embedding_map[key] = value;
-        }
-    }
-
     bool process_and_check(SDMode mode) {
         if (mode != UPSCALE && model_path.length() == 0 && diffusion_model_path.length() == 0) {
             fprintf(stderr, "error: the following arguments are required: model_path/diffusion_model\n");
@@ -884,24 +845,10 @@ struct SDContextParams {
             n_threads = sd_get_num_physical_cores();
         }
 
-        build_embedding_map();
-
         return true;
     }
 
     std::string to_string() const {
-        std::ostringstream emb_ss;
-        emb_ss << "{\n";
-        for (auto it = embedding_map.begin(); it != embedding_map.end(); ++it) {
-            emb_ss << " \"" << it->first << "\": \"" << it->second << "\"";
-            if (std::next(it) != embedding_map.end()) {
-                emb_ss << ",";
-            }
-            emb_ss << "\n";
-        }
-        emb_ss << " }";
-
-        std::string embeddings_str = emb_ss.str();
         std::ostringstream oss;
         oss << "SDContextParams {\n"
             << " n_threads: " << n_threads << ",\n"
@@ -919,7 +866,6 @@ struct SDContextParams {
             << " esrgan_path: \"" << esrgan_path << "\",\n"
             << " control_net_path: \"" << control_net_path << "\",\n"
             << " embedding_dir: \"" << embedding_dir << "\",\n"
-            << " embeddings: " << embeddings_str << "\n"
             << " wtype: " << sd_type_name(wtype) << ",\n"
             << " tensor_type_rules: \"" << tensor_type_rules << "\",\n"
             << " lora_model_dir: \"" << lora_model_dir << "\",\n"
@@ -952,15 +898,6 @@ struct SDContextParams {
     }
 
     sd_ctx_params_t to_sd_ctx_params_t(bool vae_decode_only, bool free_params_immediately, bool taesd_preview) {
-        embedding_vec.clear();
-        embedding_vec.reserve(embedding_map.size());
-        for (const auto& kv : embedding_map) {
-            sd_embedding_t item;
-            item.name = kv.first.c_str();
-            item.path = kv.second.c_str();
-            embedding_vec.emplace_back(item);
-        }
-
         sd_ctx_params_t sd_ctx_params = {
             model_path.c_str(),
             clip_l_path.c_str(),
@@ -975,8 +912,7 @@ struct SDContextParams {
             taesd_path.c_str(),
             control_net_path.c_str(),
             lora_model_dir.c_str(),
-            embedding_vec.data(),
-            static_cast<uint32_t>(embedding_vec.size()),
+            embedding_dir.c_str(),
             photo_maker_path.c_str(),
             tensor_type_rules.c_str(),
             vae_decode_only,
@@ -1030,15 +966,6 @@ static std::string vec_str_to_string(const std::vector<std::string>& v) {
     return oss.str();
 }
 
-static bool is_absolute_path(const std::string& p) {
-#ifdef _WIN32
-    // Windows: C:/path or C:\path
-    return p.size() > 1 && std::isalpha(static_cast<unsigned char>(p[0])) && p[1] == ':';
-#else
-    return !p.empty() && p[0] == '/';
-#endif
-}
-
 struct SDGenerationParams {
     std::string prompt;
     std::string negative_prompt;
@@ -1079,12 +1006,7 @@ struct SDGenerationParams {
     std::string pm_id_embed_path;
     float pm_style_strength = 20.f;
 
     int upscale_repeats = 1;
-    int upscale_tile_size = 128;
 
-    std::map<std::string, float> lora_map;
-    std::map<std::string, float> high_noise_lora_map;
-    std::vector<sd_lora_t> lora_vec;
-
     SDGenerationParams() {
         sd_sample_params_init(&sample_params);
@@ -1177,10 +1099,6 @@ struct SDGenerationParams {
             "--upscale-repeats",
             "Run the ESRGAN upscaler this many times (default: 1)",
             &upscale_repeats},
-            {"",
-            "--upscale-tile-size",
-            "tile size for ESRGAN upscaling (default: 128)",
-            &upscale_tile_size},
         };
 
         options.float_options = {
@@ -1460,88 +1378,7 @@ struct SDGenerationParams {
         return options;
     }
 
-    void extract_and_remove_lora(const std::string& lora_model_dir) {
-        static const std::regex re(R"(<lora:([^:>]+):([^>]+)>)");
-        static const std::vector<std::string> valid_ext = {".pt", ".safetensors", ".gguf"};
-        std::smatch m;
-
-        std::string tmp = prompt;
-
-        while (std::regex_search(tmp, m, re)) {
-            std::string raw_path = m[1].str();
-            const std::string raw_mul = m[2].str();
-
-            float mul = 0.f;
-            try {
-                mul = std::stof(raw_mul);
-            } catch (...) {
-                tmp = m.suffix().str();
-                prompt = std::regex_replace(prompt, re, "", std::regex_constants::format_first_only);
-                continue;
-            }
-
-            bool is_high_noise = false;
-            static const std::string prefix = "|high_noise|";
-            if (raw_path.rfind(prefix, 0) == 0) {
-                raw_path.erase(0, prefix.size());
-                is_high_noise = true;
-            }
-
-            fs::path final_path;
-            if (is_absolute_path(raw_path)) {
-                final_path = raw_path;
-            } else {
-                final_path = fs::path(lora_model_dir) / raw_path;
-            }
-            if (!fs::exists(final_path)) {
-                bool found = false;
-                for (const auto& ext : valid_ext) {
-                    fs::path try_path = final_path;
-                    try_path += ext;
-                    if (fs::exists(try_path)) {
-                        final_path = try_path;
-                        found = true;
-                        break;
-                    }
-                }
-                if (!found) {
-                    printf("can not found lora %s\n", final_path.lexically_normal().string().c_str());
-                    tmp = m.suffix().str();
-                    prompt = std::regex_replace(prompt, re, "", std::regex_constants::format_first_only);
-                    continue;
-                }
-            }
-
-            const std::string key = final_path.lexically_normal().string();
-
-            if (is_high_noise)
-                high_noise_lora_map[key] += mul;
-            else
-                lora_map[key] += mul;
-
-            prompt = std::regex_replace(prompt, re, "", std::regex_constants::format_first_only);
-
-            tmp = m.suffix().str();
-        }
-
-        for (const auto& kv : lora_map) {
-            sd_lora_t item;
-            item.is_high_noise = false;
-            item.path = kv.first.c_str();
-            item.multiplier = kv.second;
-            lora_vec.emplace_back(item);
-        }
-
-        for (const auto& kv : high_noise_lora_map) {
-            sd_lora_t item;
-            item.is_high_noise = true;
-            item.path = kv.first.c_str();
-            item.multiplier = kv.second;
-            lora_vec.emplace_back(item);
-        }
-    }
-
-    bool process_and_check(SDMode mode, const std::string& lora_model_dir) {
+    bool process_and_check(SDMode mode) {
         if (width <= 0) {
             fprintf(stderr, "error: the width must be greater than 0\n");
             return false;
@@ -1640,10 +1477,6 @@ struct SDGenerationParams {
             return false;
         }
 
-        if (upscale_tile_size < 1) {
-            return false;
-        }
-
         if (mode == UPSCALE) {
             if (init_image_path.length() == 0) {
                 fprintf(stderr, "error: upscale mode needs an init image (--init-img)\n");
@@ -1656,44 +1489,14 @@ struct SDGenerationParams {
             seed = rand();
         }
 
-        extract_and_remove_lora(lora_model_dir);
-
         return true;
     }
 
     std::string to_string() const {
         char* sample_params_str = sd_sample_params_to_str(&sample_params);
         char* high_noise_sample_params_str = sd_sample_params_to_str(&high_noise_sample_params);
-
-        std::ostringstream lora_ss;
-        lora_ss << "{\n";
-        for (auto it = lora_map.begin(); it != lora_map.end(); ++it) {
-            lora_ss << " \"" << it->first << "\": \"" << it->second << "\"";
-            if (std::next(it) != lora_map.end()) {
-                lora_ss << ",";
-            }
-            lora_ss << "\n";
-        }
-        lora_ss << " }";
-        std::string loras_str = lora_ss.str();
-
-        lora_ss = std::ostringstream();
-        ;
-        lora_ss << "{\n";
-        for (auto it = high_noise_lora_map.begin(); it != high_noise_lora_map.end(); ++it) {
-            lora_ss << " \"" << it->first << "\": \"" << it->second << "\"";
-            if (std::next(it) != high_noise_lora_map.end()) {
-                lora_ss << ",";
-            }
-            lora_ss << "\n";
-        }
-        lora_ss << " }";
-        std::string high_noise_loras_str = lora_ss.str();
-
         std::ostringstream oss;
         oss << "SDGenerationParams {\n"
-            << " loras: \"" << loras_str << "\",\n"
-            << " high_noise_loras: \"" << high_noise_loras_str << "\",\n"
             << " prompt: \"" << prompt << "\",\n"
             << " negative_prompt: \"" << negative_prompt << "\",\n"
             << " clip_skip: " << clip_skip << ",\n"
@@ -1729,7 +1532,6 @@ struct SDGenerationParams {
             << " control_strength: " << control_strength << ",\n"
             << " seed: " << seed << ",\n"
             << " upscale_repeats: " << upscale_repeats << ",\n"
-            << " upscale_tile_size: " << upscale_tile_size << ",\n"
             << "}";
         free(sample_params_str);
         free(high_noise_sample_params_str);
@@ -1737,12 +1539,7 @@ struct SDGenerationParams {
     }
 };
 
-static std::string version_string() {
-    return std::string("stable-diffusion.cpp version ") + sd_version() + ", commit " + sd_commit();
-}
-
 void print_usage(int argc, const char* argv[], const std::vector<ArgOptions>& options_list) {
-    std::cout << version_string() << "\n";
     std::cout << "Usage: " << argv[0] << " [options]\n\n";
     std::cout << "CLI Options:\n";
     options_list[0].print();
@@ -1760,9 +1557,7 @@ void parse_args(int argc, const char** argv, SDCliParams& cli_params, SDContextP
         exit(cli_params.normal_exit ? 0 : 1);
     }
 
-    if (!cli_params.process_and_check() ||
-        !ctx_params.process_and_check(cli_params.mode) ||
-        !gen_params.process_and_check(cli_params.mode, ctx_params.lora_model_dir)) {
+    if (!cli_params.process_and_check() || !ctx_params.process_and_check(cli_params.mode) || !gen_params.process_and_check(cli_params.mode)) {
         print_usage(argc, argv, options_vec);
         exit(1);
     }
@@ -2027,19 +1822,11 @@ void step_callback(int step, int frame_count, sd_image_t* image, bool is_noisy,
 }
 
 int main(int argc, const char* argv[]) {
-    if (argc > 1 && std::string(argv[1]) == "--version") {
-        std::cout << version_string() << "\n";
-        return EXIT_SUCCESS;
-    }
-
     SDCliParams cli_params;
     SDContextParams ctx_params;
     SDGenerationParams gen_params;
 
     parse_args(argc, argv, cli_params, ctx_params, gen_params);
-    if (cli_params.verbose || cli_params.version) {
-        std::cout << version_string() << "\n";
-    }
     if (gen_params.video_frames > 4) {
         size_t last_dot_pos = cli_params.preview_path.find_last_of(".");
         std::string base_path = cli_params.preview_path;
@@ -2275,8 +2062,6 @@ int main(int argc, const char* argv[]) {
 
     if (cli_params.mode == IMG_GEN) {
         sd_img_gen_params_t img_gen_params = {
-            gen_params.lora_vec.data(),
-            static_cast<uint32_t>(gen_params.lora_vec.size()),
             gen_params.prompt.c_str(),
             gen_params.negative_prompt.c_str(),
             gen_params.clip_skip,
@@ -2308,8 +2093,6 @@ int main(int argc, const char* argv[]) {
         num_results = gen_params.batch_count;
     } else if (cli_params.mode == VID_GEN) {
         sd_vid_gen_params_t vid_gen_params = {
-            gen_params.lora_vec.data(),
-            static_cast<uint32_t>(gen_params.lora_vec.size()),
             gen_params.prompt.c_str(),
             gen_params.negative_prompt.c_str(),
             gen_params.clip_skip,
@@ -2346,8 +2129,7 @@ int main(int argc, const char* argv[]) {
         upscaler_ctx_t* upscaler_ctx = new_upscaler_ctx(ctx_params.esrgan_path.c_str(),
                                                         ctx_params.offload_params_to_cpu,
                                                         ctx_params.diffusion_conv_direct,
-                                                        ctx_params.n_threads,
-                                                        gen_params.upscale_tile_size);
+                                                        ctx_params.n_threads);
 
         if (upscaler_ctx == nullptr) {
            printf("new_upscaler_ctx failed\n");
@@ -60,14 +60,6 @@
 #define SD_UNUSED(x) (void)(x)
 #endif
 
-__STATIC_INLINE__ int align_up_offset(int n, int multiple) {
-    return (multiple - n % multiple) % multiple;
-}
-
-__STATIC_INLINE__ int align_up(int n, int multiple) {
-    return n + align_up_offset(n, multiple);
-}
-
 __STATIC_INLINE__ void ggml_log_callback_default(ggml_log_level level, const char* text, void*) {
     switch (level) {
         case GGML_LOG_LEVEL_DEBUG:
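As a quick worked example of the helpers on the '-' side of the hunk above: align_up_offset(30, 16) is (16 - 30 % 16) % 16 = 2, so align_up(30, 16) = 32, while a value already at a multiple (e.g. align_up(32, 16)) gets an offset of 0 and is left unchanged. These are the same helpers the '-' side of generate_image uses further below to round requested image dimensions up to the model's spatial multiple.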
@@ -1400,14 +1392,10 @@ __STATIC_INLINE__ void ggml_ext_backend_tensor_get_and_sync(ggml_backend_t backe
 }
 
 __STATIC_INLINE__ float ggml_ext_backend_tensor_get_f32(ggml_tensor* tensor) {
-    GGML_ASSERT(tensor->type == GGML_TYPE_F32 || tensor->type == GGML_TYPE_F16 || tensor->type == GGML_TYPE_I32 || tensor->type == GGML_TYPE_BF16);
+    GGML_ASSERT(tensor->type == GGML_TYPE_F32 || tensor->type == GGML_TYPE_F16 || tensor->type == GGML_TYPE_I32);
     float value;
     if (tensor->type == GGML_TYPE_F32) {
         ggml_backend_tensor_get(tensor, &value, 0, sizeof(value));
-    } else if (tensor->type == GGML_TYPE_BF16) {
-        ggml_bf16_t bf16_value;
-        ggml_backend_tensor_get(tensor, &bf16_value, 0, sizeof(bf16_value));
-        value = ggml_bf16_to_fp32(bf16_value);
     } else if (tensor->type == GGML_TYPE_F16) {
         ggml_fp16_t f16_value;
         ggml_backend_tensor_get(tensor, &f16_value, 0, sizeof(f16_value));
@@ -91,41 +91,6 @@ const float flux_latent_rgb_proj[16][3] = {
     {-0.111849f, -0.055589f, -0.032361f}};
 float flux_latent_rgb_bias[3] = {0.024600f, -0.006937f, -0.008089f};
 
-const float flux2_latent_rgb_proj[32][3] = {
-    {0.000736f, -0.008385f, -0.019710f},
-    {-0.001352f, -0.016392f, 0.020693f},
-    {-0.006376f, 0.002428f, 0.036736f},
-    {0.039384f, 0.074167f, 0.119789f},
-    {0.007464f, -0.005705f, -0.004734f},
-    {-0.004086f, 0.005287f, -0.000409f},
-    {-0.032835f, 0.050802f, -0.028120f},
-    {-0.003158f, -0.000835f, 0.000406f},
-    {-0.112840f, -0.084337f, -0.023083f},
-    {0.001462f, -0.006656f, 0.000549f},
-    {-0.009980f, -0.007480f, 0.009702f},
-    {0.032540f, 0.000214f, -0.061388f},
-    {0.011023f, 0.000694f, 0.007143f},
-    {-0.001468f, -0.006723f, -0.001678f},
-    {-0.005921f, -0.010320f, -0.003907f},
-    {-0.028434f, 0.027584f, 0.018457f},
-    {0.014349f, 0.011523f, 0.000441f},
-    {0.009874f, 0.003081f, 0.001507f},
-    {0.002218f, 0.005712f, 0.001563f},
-    {0.053010f, -0.019844f, 0.008683f},
-    {-0.002507f, 0.005384f, 0.000938f},
-    {-0.002177f, -0.011366f, 0.003559f},
-    {-0.000261f, 0.015121f, -0.003240f},
-    {-0.003944f, -0.002083f, 0.005043f},
-    {-0.009138f, 0.011336f, 0.003781f},
-    {0.011429f, 0.003985f, -0.003855f},
-    {0.010518f, -0.005586f, 0.010131f},
-    {0.007883f, 0.002912f, -0.001473f},
-    {-0.003318f, -0.003160f, 0.003684f},
-    {-0.034560f, -0.008740f, 0.012996f},
-    {0.000166f, 0.001079f, -0.012153f},
-    {0.017772f, 0.000937f, -0.011953f}};
-float flux2_latent_rgb_bias[3] = {-0.028738f, -0.098463f, -0.107619f};
-
 // This one was taken straight from
 // https://github.com/Stability-AI/sd3.5/blob/8565799a3b41eb0c7ba976d18375f0f753f56402/sd3_impls.py#L288-L303
 // (MiT Licence)
@@ -163,42 +128,16 @@ const float sd_latent_rgb_proj[4][3] = {
     {-0.178022f, -0.200862f, -0.678514f}};
 float sd_latent_rgb_bias[3] = {-0.017478f, -0.055834f, -0.105825f};
 
-void preview_latent_video(uint8_t* buffer, struct ggml_tensor* latents, const float (*latent_rgb_proj)[3], const float latent_rgb_bias[3], int patch_size) {
+void preview_latent_video(uint8_t* buffer, struct ggml_tensor* latents, const float (*latent_rgb_proj)[3], const float latent_rgb_bias[3], int width, int height, int frames, int dim) {
     size_t buffer_head = 0;
-
-    uint32_t latent_width = latents->ne[0];
-    uint32_t latent_height = latents->ne[1];
-    uint32_t dim = latents->ne[ggml_n_dims(latents) - 1];
-    uint32_t frames = 1;
-    if (ggml_n_dims(latents) == 4) {
-        frames = latents->ne[2];
-    }
-
-    uint32_t rgb_width = latent_width * patch_size;
-    uint32_t rgb_height = latent_height * patch_size;
-
-    uint32_t unpatched_dim = dim / (patch_size * patch_size);
-
     for (int k = 0; k < frames; k++) {
-        for (int rgb_x = 0; rgb_x < rgb_width; rgb_x++) {
-            for (int rgb_y = 0; rgb_y < rgb_height; rgb_y++) {
-                int latent_x = rgb_x / patch_size;
-                int latent_y = rgb_y / patch_size;
-
-                int channel_offset = 0;
-                if (patch_size > 1) {
-                    channel_offset = ((rgb_y % patch_size) * patch_size + (rgb_x % patch_size));
-                }
-
-                size_t latent_id = (latent_x * latents->nb[0] + latent_y * latents->nb[1] + k * latents->nb[2]);
-
-                // should be incremented by 1 for each pixel
-                size_t pixel_id = k * rgb_width * rgb_height + rgb_y * rgb_width + rgb_x;
-
+        for (int j = 0; j < height; j++) {
+            for (int i = 0; i < width; i++) {
+                size_t latent_id = (i * latents->nb[0] + j * latents->nb[1] + k * latents->nb[2]);
                 float r = 0, g = 0, b = 0;
                 if (latent_rgb_proj != nullptr) {
-                    for (int d = 0; d < unpatched_dim; d++) {
-                        float value = *(float*)((char*)latents->data + latent_id + (d * patch_size * patch_size + channel_offset) * latents->nb[ggml_n_dims(latents) - 1]);
+                    for (int d = 0; d < dim; d++) {
+                        float value = *(float*)((char*)latents->data + latent_id + d * latents->nb[ggml_n_dims(latents) - 1]);
                         r += value * latent_rgb_proj[d][0];
                         g += value * latent_rgb_proj[d][1];
                         b += value * latent_rgb_proj[d][2];
@@ -225,9 +164,9 @@ void preview_latent_video(uint8_t* buffer, struct ggml_tensor* latents, const fl
                 g = g >= 0 ? g <= 1 ? g : 1 : 0;
                 b = b >= 0 ? b <= 1 ? b : 1 : 0;
 
-                buffer[pixel_id * 3 + 0] = (uint8_t)(r * 255);
-                buffer[pixel_id * 3 + 1] = (uint8_t)(g * 255);
-                buffer[pixel_id * 3 + 2] = (uint8_t)(b * 255);
+                buffer[buffer_head++] = (uint8_t)(r * 255);
+                buffer[buffer_head++] = (uint8_t)(g * 255);
+                buffer[buffer_head++] = (uint8_t)(b * 255);
             }
         }
     }
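A note on the channel arithmetic on the '-' side above: unpatched_dim = dim / (patch_size * patch_size), so a Flux.2-style latent with dim = 128 and a 2x2 patch yields 128 / 4 = 32 projected channels, which matches the shape of the flux2_latent_rgb_proj[32][3] table removed in the same file; with patch_size = 1 the indexing reduces to the plain per-channel projection that the '+' side keeps.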
@@ -508,22 +508,18 @@ public:
                                  "model.diffusion_model",
                                  version);
         } else {  // SD1.x SD2.x SDXL
-            std::map<std::string, std::string> embbeding_map;
-            for (int i = 0; i < sd_ctx_params->embedding_count; i++) {
-                embbeding_map.emplace(SAFE_STR(sd_ctx_params->embeddings[i].name), SAFE_STR(sd_ctx_params->embeddings[i].path));
-            }
             if (strstr(SAFE_STR(sd_ctx_params->photo_maker_path), "v2")) {
                 cond_stage_model = std::make_shared<FrozenCLIPEmbedderWithCustomWords>(clip_backend,
                                                                                        offload_params_to_cpu,
                                                                                        tensor_storage_map,
-                                                                                       embbeding_map,
+                                                                                       SAFE_STR(sd_ctx_params->embedding_dir),
                                                                                        version,
                                                                                        PM_VERSION_2);
             } else {
                 cond_stage_model = std::make_shared<FrozenCLIPEmbedderWithCustomWords>(clip_backend,
                                                                                        offload_params_to_cpu,
                                                                                        tensor_storage_map,
-                                                                                       embbeding_map,
+                                                                                       SAFE_STR(sd_ctx_params->embedding_dir),
                                                                                        version);
             }
             diffusion_model = std::make_shared<UNetModel>(backend,
@@ -937,17 +933,28 @@ public:
                            float multiplier,
                            ggml_backend_t backend,
                            LoraModel::filter_t lora_tensor_filter = nullptr) {
-        std::string lora_path = lora_id;
-        static std::string high_noise_tag = "|high_noise|";
+        std::string lora_name = lora_id;
+        std::string high_noise_tag = "|high_noise|";
         bool is_high_noise = false;
-        if (starts_with(lora_path, high_noise_tag)) {
-            lora_path = lora_path.substr(high_noise_tag.size());
+        if (starts_with(lora_name, high_noise_tag)) {
+            lora_name = lora_name.substr(high_noise_tag.size());
             is_high_noise = true;
-            LOG_DEBUG("high noise lora: %s", lora_path.c_str());
+            LOG_DEBUG("high noise lora: %s", lora_name.c_str());
         }
-        auto lora = std::make_shared<LoraModel>(lora_id, backend, lora_path, is_high_noise ? "model.high_noise_" : "", version);
+        std::string st_file_path = path_join(lora_model_dir, lora_name + ".safetensors");
+        std::string ckpt_file_path = path_join(lora_model_dir, lora_name + ".ckpt");
+        std::string file_path;
+        if (file_exists(st_file_path)) {
+            file_path = st_file_path;
+        } else if (file_exists(ckpt_file_path)) {
+            file_path = ckpt_file_path;
+        } else {
+            LOG_WARN("can not find %s or %s for lora %s", st_file_path.c_str(), ckpt_file_path.c_str(), lora_name.c_str());
+            return nullptr;
+        }
+        auto lora = std::make_shared<LoraModel>(lora_id, backend, file_path, is_high_noise ? "model.high_noise_" : "", version);
         if (!lora->load_from_file(n_threads, lora_tensor_filter)) {
-            LOG_WARN("load lora tensors from %s failed", lora_path.c_str());
+            LOG_WARN("load lora tensors from %s failed", file_path.c_str());
             return nullptr;
         }
 
@@ -1132,15 +1139,12 @@ public:
         }
     }
 
-    void apply_loras(const sd_lora_t* loras, uint32_t lora_count) {
-        std::unordered_map<std::string, float> lora_f2m;
-        for (int i = 0; i < lora_count; i++) {
-            std::string lora_id = SAFE_STR(loras[i].path);
-            if (loras[i].is_high_noise) {
-                lora_id = "|high_noise|" + lora_id;
-            }
-            lora_f2m[lora_id] = loras[i].multiplier;
-            LOG_DEBUG("lora %s:%.2f", lora_id.c_str(), loras[i].multiplier);
+    std::string apply_loras_from_prompt(const std::string& prompt) {
+        auto result_pair = extract_and_remove_lora(prompt);
+        std::unordered_map<std::string, float> lora_f2m = result_pair.first;  // lora_name -> multiplier
+        for (auto& kv : lora_f2m) {
+            LOG_DEBUG("lora %s:%.2f", kv.first.c_str(), kv.second);
         }
         int64_t t0 = ggml_time_ms();
         if (apply_lora_immediately) {
@@ -1151,7 +1155,9 @@ public:
         int64_t t1 = ggml_time_ms();
         if (!lora_f2m.empty()) {
             LOG_INFO("apply_loras completed, taking %.2fs", (t1 - t0) * 1.0f / 1000);
+            LOG_DEBUG("prompt after extract and remove lora: \"%s\"", result_pair.second.c_str());
         }
+        return result_pair.second;
     }
 
     ggml_tensor* id_encoder(ggml_context* work_ctx,
@@ -1316,17 +1322,10 @@ public:
         uint32_t dim = latents->ne[ggml_n_dims(latents) - 1];
 
         if (preview_mode == PREVIEW_PROJ) {
-            int64_t patch_sz = 1;
             const float(*latent_rgb_proj)[channel] = nullptr;
             float* latent_rgb_bias = nullptr;
 
-            if (dim == 128) {
-                if (sd_version_is_flux2(version)) {
-                    latent_rgb_proj = flux2_latent_rgb_proj;
-                    latent_rgb_bias = flux2_latent_rgb_bias;
-                    patch_sz = 2;
-                }
-            } else if (dim == 48) {
+            if (dim == 48) {
                 if (sd_version_is_wan(version)) {
                     latent_rgb_proj = wan_22_latent_rgb_proj;
                     latent_rgb_bias = wan_22_latent_rgb_bias;
@@ -1379,15 +1378,12 @@ public:
                 frames = latents->ne[2];
             }
 
-            uint32_t img_width = width * patch_sz;
-            uint32_t img_height = height * patch_sz;
-
-            uint8_t* data = (uint8_t*)malloc(frames * img_width * img_height * channel * sizeof(uint8_t));
-
-            preview_latent_video(data, latents, latent_rgb_proj, latent_rgb_bias, patch_sz);
+            uint8_t* data = (uint8_t*)malloc(frames * width * height * channel * sizeof(uint8_t));
+            preview_latent_video(data, latents, latent_rgb_proj, latent_rgb_bias, width, height, frames, dim);
 
             sd_image_t* images = (sd_image_t*)malloc(frames * sizeof(sd_image_t));
             for (int i = 0; i < frames; i++) {
-                images[i] = {img_width, img_height, channel, data + i * img_width * img_height * channel};
+                images[i] = {width, height, channel, data + i * width * height * channel};
             }
             step_callback(step, frames, images, is_noisy, step_callback_data);
             free(data);
@@ -1898,18 +1894,6 @@ public:
         return vae_scale_factor;
     }
 
-    int get_diffusion_model_down_factor() {
-        int down_factor = 8; // unet
-        if (sd_version_is_dit(version)) {
-            if (sd_version_is_wan(version)) {
-                down_factor = 2;
-            } else {
-                down_factor = 1;
-            }
-        }
-        return down_factor;
-    }
-
     int get_latent_channel() {
         int latent_channel = 4;
         if (sd_version_is_dit(version)) {
@@ -2537,6 +2521,7 @@ char* sd_ctx_params_to_str(const sd_ctx_params_t* sd_ctx_params) {
         "taesd_path: %s\n"
         "control_net_path: %s\n"
         "lora_model_dir: %s\n"
+        "embedding_dir: %s\n"
         "photo_maker_path: %s\n"
         "tensor_type_rules: %s\n"
         "vae_decode_only: %s\n"
@@ -2567,6 +2552,7 @@ char* sd_ctx_params_to_str(const sd_ctx_params_t* sd_ctx_params) {
         SAFE_STR(sd_ctx_params->taesd_path),
         SAFE_STR(sd_ctx_params->control_net_path),
         SAFE_STR(sd_ctx_params->lora_model_dir),
+        SAFE_STR(sd_ctx_params->embedding_dir),
         SAFE_STR(sd_ctx_params->photo_maker_path),
         SAFE_STR(sd_ctx_params->tensor_type_rules),
         BOOL_STR(sd_ctx_params->vae_decode_only),
@@ -2817,6 +2803,8 @@ sd_image_t* generate_image_internal(sd_ctx_t* sd_ctx,
     int sample_steps = sigmas.size() - 1;
 
     int64_t t0 = ggml_time_ms();
+    // Apply lora
+    prompt = sd_ctx->sd->apply_loras_from_prompt(prompt);
 
     // Photo Maker
     std::string prompt_text_only;
@@ -3145,19 +3133,22 @@ sd_image_t* generate_image(sd_ctx_t* sd_ctx, const sd_img_gen_params_t* sd_img_g
     sd_ctx->sd->vae_tiling_params = sd_img_gen_params->vae_tiling_params;
     int width = sd_img_gen_params->width;
     int height = sd_img_gen_params->height;
-    int vae_scale_factor = sd_ctx->sd->get_vae_scale_factor();
-    int diffusion_model_down_factor = sd_ctx->sd->get_diffusion_model_down_factor();
-    int spatial_multiple = vae_scale_factor * diffusion_model_down_factor;
-
-    int width_offset = align_up_offset(width, spatial_multiple);
-    int height_offset = align_up_offset(height, spatial_multiple);
-    if (width_offset > 0 || height_offset > 0) {
-        width += width_offset;
-        height += height_offset;
-        LOG_WARN("align up %dx%d to %dx%d (multiple=%d)", sd_img_gen_params->width, sd_img_gen_params->height, width, height, spatial_multiple);
+    int vae_scale_factor = sd_ctx->sd->get_vae_scale_factor();
+    if (sd_version_is_dit(sd_ctx->sd->version)) {
+        if (width % 16 || height % 16) {
+            LOG_ERROR("Image dimensions must be must be a multiple of 16 on each axis for %s models. (Got %dx%d)",
+                      model_version_to_str[sd_ctx->sd->version],
+                      width,
+                      height);
+            return nullptr;
+        }
+    } else if (width % 64 || height % 64) {
|
LOG_ERROR("Image dimensions must be must be a multiple of 64 on each axis for %s models. (Got %dx%d)",
|
||||||
|
model_version_to_str[sd_ctx->sd->version],
|
||||||
|
width,
|
||||||
|
height);
|
||||||
|
return nullptr;
|
||||||
}
|
}
|
||||||
|
|
||||||
LOG_DEBUG("generate_image %dx%d", width, height);
|
LOG_DEBUG("generate_image %dx%d", width, height);
|
||||||
if (sd_ctx == nullptr || sd_img_gen_params == nullptr) {
|
if (sd_ctx == nullptr || sd_img_gen_params == nullptr) {
|
||||||
return nullptr;
|
return nullptr;
|
||||||
@ -3185,9 +3176,6 @@ sd_image_t* generate_image(sd_ctx_t* sd_ctx, const sd_img_gen_params_t* sd_img_g
|
|||||||
|
|
||||||
size_t t0 = ggml_time_ms();
|
size_t t0 = ggml_time_ms();
|
||||||
|
|
||||||
// Apply lora
|
|
||||||
sd_ctx->sd->apply_loras(sd_img_gen_params->loras, sd_img_gen_params->lora_count);
|
|
||||||
|
|
||||||
enum sample_method_t sample_method = sd_img_gen_params->sample_params.sample_method;
|
enum sample_method_t sample_method = sd_img_gen_params->sample_params.sample_method;
|
||||||
if (sample_method == SAMPLE_METHOD_COUNT) {
|
if (sample_method == SAMPLE_METHOD_COUNT) {
|
||||||
sample_method = sd_get_default_sample_method(sd_ctx);
|
sample_method = sd_get_default_sample_method(sd_ctx);
|
||||||
@ -3431,20 +3419,10 @@ SD_API sd_image_t* generate_video(sd_ctx_t* sd_ctx, const sd_vid_gen_params_t* s
|
|||||||
int frames = sd_vid_gen_params->video_frames;
|
int frames = sd_vid_gen_params->video_frames;
|
||||||
frames = (frames - 1) / 4 * 4 + 1;
|
frames = (frames - 1) / 4 * 4 + 1;
|
||||||
int sample_steps = sd_vid_gen_params->sample_params.sample_steps;
|
int sample_steps = sd_vid_gen_params->sample_params.sample_steps;
|
||||||
|
|
||||||
int vae_scale_factor = sd_ctx->sd->get_vae_scale_factor();
|
|
||||||
int diffusion_model_down_factor = sd_ctx->sd->get_diffusion_model_down_factor();
|
|
||||||
int spatial_multiple = vae_scale_factor * diffusion_model_down_factor;
|
|
||||||
|
|
||||||
int width_offset = align_up_offset(width, spatial_multiple);
|
|
||||||
int height_offset = align_up_offset(height, spatial_multiple);
|
|
||||||
if (width_offset > 0 || height_offset > 0) {
|
|
||||||
width += width_offset;
|
|
||||||
height += height_offset;
|
|
||||||
LOG_WARN("align up %dx%d to %dx%d (multiple=%d)", sd_vid_gen_params->width, sd_vid_gen_params->height, width, height, spatial_multiple);
|
|
||||||
}
|
|
||||||
LOG_INFO("generate_video %dx%dx%d", width, height, frames);
|
LOG_INFO("generate_video %dx%dx%d", width, height, frames);
|
||||||
|
|
||||||
|
int vae_scale_factor = sd_ctx->sd->get_vae_scale_factor();
|
||||||
|
|
||||||
enum sample_method_t sample_method = sd_vid_gen_params->sample_params.sample_method;
|
enum sample_method_t sample_method = sd_vid_gen_params->sample_params.sample_method;
|
||||||
if (sample_method == SAMPLE_METHOD_COUNT) {
|
if (sample_method == SAMPLE_METHOD_COUNT) {
|
||||||
sample_method = sd_get_default_sample_method(sd_ctx);
|
sample_method = sd_get_default_sample_method(sd_ctx);
|
||||||
@ -3497,7 +3475,7 @@ SD_API sd_image_t* generate_video(sd_ctx_t* sd_ctx, const sd_vid_gen_params_t* s
|
|||||||
int64_t t0 = ggml_time_ms();
|
int64_t t0 = ggml_time_ms();
|
||||||
|
|
||||||
// Apply lora
|
// Apply lora
|
||||||
sd_ctx->sd->apply_loras(sd_vid_gen_params->loras, sd_vid_gen_params->lora_count);
|
prompt = sd_ctx->sd->apply_loras_from_prompt(prompt);
|
||||||
|
|
||||||
ggml_tensor* init_latent = nullptr;
|
ggml_tensor* init_latent = nullptr;
|
||||||
ggml_tensor* clip_vision_output = nullptr;
|
ggml_tensor* clip_vision_output = nullptr;
|
||||||
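The generate_image/generate_video hunks above swap master's automatic align-up of the requested resolution for master-401's hard multiple-of-16/64 error. For reference, a minimal sketch of the align-up arithmetic: `align_up_offset` is re-implemented here from its apparent behaviour in the diff (the real helper lives elsewhere in the repository), and the factor values mirror the removed `get_diffusion_model_down_factor()`.

```cpp
// Sketch only: the master-side align-up arithmetic. align_up_offset() is an
// assumed re-implementation; the real helper is defined elsewhere in the repo.
#include <cstdio>

static int align_up_offset(int value, int multiple) {
    int rem = value % multiple;
    return rem == 0 ? 0 : multiple - rem;
}

int main() {
    int width = 1000, height = 600;
    int vae_scale_factor            = 8;  // typical VAE downscale factor
    int diffusion_model_down_factor = 2;  // Wan-style DiT, per the removed helper
    int spatial_multiple = vae_scale_factor * diffusion_model_down_factor;  // 16

    width += align_up_offset(width, spatial_multiple);    // 1000 -> 1008
    height += align_up_offset(height, spatial_multiple);  // 600  -> 608
    printf("aligned to %dx%d (multiple=%d)\n", width, height, spatial_multiple);
    return 0;
}
```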

stable-diffusion.h
@@ -150,11 +150,6 @@ typedef struct {
     float rel_size_y;
 } sd_tiling_params_t;

-typedef struct {
-    const char* name;
-    const char* path;
-} sd_embedding_t;
-
 typedef struct {
     const char* model_path;
     const char* clip_l_path;
@@ -169,8 +164,7 @@ typedef struct {
     const char* taesd_path;
     const char* control_net_path;
     const char* lora_model_dir;
-    const sd_embedding_t* embeddings;
-    uint32_t embedding_count;
+    const char* embedding_dir;
     const char* photo_maker_path;
     const char* tensor_type_rules;
     bool vae_decode_only;
@@ -242,14 +236,6 @@ typedef struct {
 } sd_easycache_params_t;

 typedef struct {
-    bool is_high_noise;
-    float multiplier;
-    const char* path;
-} sd_lora_t;
-
-typedef struct {
-    const sd_lora_t* loras;
-    uint32_t lora_count;
     const char* prompt;
     const char* negative_prompt;
     int clip_skip;
@@ -273,8 +259,6 @@ typedef struct {
 } sd_img_gen_params_t;

 typedef struct {
-    const sd_lora_t* loras;
-    uint32_t lora_count;
     const char* prompt;
     const char* negative_prompt;
     int clip_skip;
@@ -347,8 +331,7 @@ typedef struct upscaler_ctx_t upscaler_ctx_t;
 SD_API upscaler_ctx_t* new_upscaler_ctx(const char* esrgan_path,
                                         bool offload_params_to_cpu,
                                         bool direct,
-                                        int n_threads,
-                                        int tile_size);
+                                        int n_threads);
 SD_API void free_upscaler_ctx(upscaler_ctx_t* upscaler_ctx);

 SD_API sd_image_t upscale(upscaler_ctx_t* upscaler_ctx,
@@ -370,9 +353,6 @@ SD_API bool preprocess_canny(sd_image_t image,
                              float strong,
                              bool inverse);

-SD_API const char* sd_commit(void);
-SD_API const char* sd_version(void);
-
 #ifdef __cplusplus
 }
 #endif
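For reference, a hedged sketch of filling the per-request LoRA list that master adds to `sd_img_gen_params_t` (absent on the master-401 side). Only fields visible in the header diff are used; the LoRA paths are placeholders, and initialization of the remaining request fields is omitted.

```cpp
// Sketch only: wiring an sd_lora_t array (from the header above) into a
// request. Paths are placeholders; the rest of sd_img_gen_params_t must
// still be initialized elsewhere.
#include "stable-diffusion.h"

void fill_lora_request(sd_img_gen_params_t* params) {
    // Field order follows the struct in the diff: is_high_noise, multiplier, path.
    static sd_lora_t loras[2] = {
        {false, 0.8f, "loras/detail_tweaker.safetensors"},
        {true, 0.6f, "loras/wan22_high_noise.safetensors"},  // routed via the "|high_noise|" id prefix seen above
    };
    params->loras      = loras;
    params->lora_count = 2;
}
```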

16 upscaler.cpp
@@ -9,15 +9,12 @@ struct UpscalerGGML {
     std::shared_ptr<ESRGAN> esrgan_upscaler;
     std::string esrgan_path;
     int n_threads;
     bool direct = false;
-    int tile_size = 128;

     UpscalerGGML(int n_threads,
-                 bool direct = false,
-                 int tile_size = 128)
+                 bool direct = false)
         : n_threads(n_threads),
-          direct(direct),
-          tile_size(tile_size) {
+          direct(direct) {
     }

     bool load_from_file(const std::string& esrgan_path,
@@ -54,7 +51,7 @@ struct UpscalerGGML {
             backend = ggml_backend_cpu_init();
         }
         LOG_INFO("Upscaler weight type: %s", ggml_type_name(model_data_type));
-        esrgan_upscaler = std::make_shared<ESRGAN>(backend, offload_params_to_cpu, tile_size, model_loader.get_tensor_storage_map());
+        esrgan_upscaler = std::make_shared<ESRGAN>(backend, offload_params_to_cpu, model_loader.get_tensor_storage_map());
         if (direct) {
             esrgan_upscaler->set_conv2d_direct_enabled(true);
         }
@@ -116,15 +113,14 @@ struct upscaler_ctx_t {
 upscaler_ctx_t* new_upscaler_ctx(const char* esrgan_path_c_str,
                                  bool offload_params_to_cpu,
                                  bool direct,
-                                 int n_threads,
-                                 int tile_size) {
+                                 int n_threads) {
     upscaler_ctx_t* upscaler_ctx = (upscaler_ctx_t*)malloc(sizeof(upscaler_ctx_t));
     if (upscaler_ctx == nullptr) {
         return nullptr;
     }
     std::string esrgan_path(esrgan_path_c_str);

-    upscaler_ctx->upscaler = new UpscalerGGML(n_threads, direct, tile_size);
+    upscaler_ctx->upscaler = new UpscalerGGML(n_threads, direct);
     if (upscaler_ctx->upscaler == nullptr) {
         return nullptr;
     }
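For reference, a hedged sketch of the master-side upscaler entry point after `tile_size` was added to the signature; the model path is a placeholder and the actual `upscale()` call is omitted.

```cpp
// Sketch only: create and free an upscaler context with the master signature.
#include "stable-diffusion.h"
#include <cstdio>

int main() {
    upscaler_ctx_t* ctx = new_upscaler_ctx("models/realesrgan_x4plus.safetensors",  // placeholder path
                                           /*offload_params_to_cpu=*/false,
                                           /*direct=*/false,
                                           /*n_threads=*/4,
                                           /*tile_size=*/128);  // 128 matches the UpscalerGGML default above
    if (ctx == nullptr) {
        fprintf(stderr, "failed to create upscaler context\n");
        return 1;
    }
    // ... upscale(ctx, ...) would go here ...
    free_upscaler_ctx(ctx);
    return 0;
}
```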

34 util.cpp
@@ -95,6 +95,20 @@ bool is_directory(const std::string& path) {
     return (attributes != INVALID_FILE_ATTRIBUTES && (attributes & FILE_ATTRIBUTE_DIRECTORY));
 }

+std::string get_full_path(const std::string& dir, const std::string& filename) {
+    std::string full_path = dir + "\\" + filename;
+
+    WIN32_FIND_DATA find_file_data;
+    HANDLE hFind = FindFirstFile(full_path.c_str(), &find_file_data);
+
+    if (hFind != INVALID_HANDLE_VALUE) {
+        FindClose(hFind);
+        return full_path;
+    } else {
+        return "";
+    }
+}
+
 #else // Unix
 #include <dirent.h>
 #include <sys/stat.h>
@@ -109,6 +123,26 @@ bool is_directory(const std::string& path) {
     return (stat(path.c_str(), &buffer) == 0 && S_ISDIR(buffer.st_mode));
 }

+// TODO: add windows version
+std::string get_full_path(const std::string& dir, const std::string& filename) {
+    DIR* dp = opendir(dir.c_str());
+
+    if (dp != nullptr) {
+        struct dirent* entry;
+
+        while ((entry = readdir(dp)) != nullptr) {
+            if (strcasecmp(entry->d_name, filename.c_str()) == 0) {
+                closedir(dp);
+                return dir + "/" + entry->d_name;
+            }
+        }
+
+        closedir(dp);
+    }
+
+    return "";
+}
+
 #endif

 // get_num_physical_cores is copy from

1 util.h
@@ -22,6 +22,7 @@ int round_up_to(int value, int base);

 bool file_exists(const std::string& filename);
 bool is_directory(const std::string& path);
+std::string get_full_path(const std::string& dir, const std::string& filename);

 std::u32string utf8_to_utf32(const std::string& utf8_str);
 std::string utf32_to_utf8(const std::u32string& utf32_str);
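For reference, a small usage sketch of the `get_full_path()` helper that master-401 adds in util.cpp/util.h: the Unix implementation matches the filename case-insensitively and the function returns an empty string on a miss. Directory and file names below are placeholders.

```cpp
// Sketch only: resolve a model file inside a directory with get_full_path().
#include "util.h"
#include <cstdio>

int main() {
    // On the Unix path the comparison uses strcasecmp, so "SD15.SafeTensors"
    // can match "sd15.safetensors" on disk.
    std::string path = get_full_path("models", "SD15.SafeTensors");  // placeholder names
    if (!path.empty()) {
        printf("resolved: %s\n", path.c_str());
    } else {
        printf("not found\n");
    }
    return 0;
}
```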

20 version.cpp
@@ -1,20 +0,0 @@
-#include "stable-diffusion.h"
-
-#ifndef SDCPP_BUILD_COMMIT
-#define SDCPP_BUILD_COMMIT unknown
-#endif
-
-#ifndef SDCPP_BUILD_VERSION
-#define SDCPP_BUILD_VERSION unknown
-#endif
-
-#define STRINGIZE2(x) #x
-#define STRINGIZE(x) STRINGIZE2(x)
-
-const char* sd_commit(void) {
-    return STRINGIZE(SDCPP_BUILD_COMMIT);
-}
-
-const char* sd_version(void) {
-    return STRINGIZE(SDCPP_BUILD_VERSION);
-}
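Since the compare also drops version.cpp together with the `sd_commit()`/`sd_version()` declarations, here is a minimal sketch of querying the master-side build metadata.

```cpp
// Sketch only: print the build metadata exposed on master (removed on master-401).
#include "stable-diffusion.h"
#include <cstdio>

int main() {
    printf("stable-diffusion.cpp version: %s\n", sd_version());
    printf("stable-diffusion.cpp commit:  %s\n", sd_commit());
    return 0;
}
```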