Compare commits


3 Commits

9 changed files with 240 additions and 125 deletions

View File

@ -7,31 +7,6 @@
/*================================================== CLIPTokenizer ===================================================*/
__STATIC_INLINE__ std::pair<std::unordered_map<std::string, float>, std::string> extract_and_remove_lora(std::string text) {
std::regex re("<lora:([^:]+):([^>]+)>");
std::smatch matches;
std::unordered_map<std::string, float> filename2multiplier;
while (std::regex_search(text, matches, re)) {
std::string filename = matches[1].str();
float multiplier = std::stof(matches[2].str());
text = std::regex_replace(text, re, "", std::regex_constants::format_first_only);
if (multiplier == 0.f) {
continue;
}
if (filename2multiplier.find(filename) == filename2multiplier.end()) {
filename2multiplier[filename] = multiplier;
} else {
filename2multiplier[filename] += multiplier;
}
}
return std::make_pair(filename2multiplier, text);
}
__STATIC_INLINE__ std::vector<std::pair<int, std::u32string>> bytes_to_unicode() {
std::vector<std::pair<int, std::u32string>> byte_unicode_pairs;
std::set<int> byte_set;

View File

@ -156,9 +156,10 @@ struct ESRGAN : public GGMLRunner {
ESRGAN(ggml_backend_t backend,
bool offload_params_to_cpu,
int tile_size = 128,
const String2TensorStorage& tensor_storage_map = {})
: GGMLRunner(backend, offload_params_to_cpu) {
// rrdb_net will be created in load_from_file
this->tile_size = tile_size;
}
std::string get_desc() override {

View File

@ -507,7 +507,7 @@ struct SDContextParams {
std::string lora_model_dir;
std::map<std::string, std::string> embedding_map;
std::vector<sd_embedding_t> embedding_array;
std::vector<sd_embedding_t> embedding_vec;
rng_type_t rng_type = CUDA_RNG;
rng_type_t sampler_rng_type = RNG_TYPE_COUNT;
@ -952,13 +952,13 @@ struct SDContextParams {
}
sd_ctx_params_t to_sd_ctx_params_t(bool vae_decode_only, bool free_params_immediately, bool taesd_preview) {
embedding_array.clear();
embedding_array.reserve(embedding_map.size());
embedding_vec.clear();
embedding_vec.reserve(embedding_map.size());
for (const auto& kv : embedding_map) {
sd_embedding_t item;
item.name = kv.first.c_str();
item.path = kv.second.c_str();
embedding_array.emplace_back(item);
embedding_vec.emplace_back(item);
}
sd_ctx_params_t sd_ctx_params = {
@ -975,8 +975,8 @@ struct SDContextParams {
taesd_path.c_str(),
control_net_path.c_str(),
lora_model_dir.c_str(),
embedding_array.data(),
static_cast<uint32_t>(embedding_array.size()),
embedding_vec.data(),
static_cast<uint32_t>(embedding_vec.size()),
photo_maker_path.c_str(),
tensor_type_rules.c_str(),
vae_decode_only,
@ -1030,6 +1030,15 @@ static std::string vec_str_to_string(const std::vector<std::string>& v) {
return oss.str();
}
static bool is_absolute_path(const std::string& p) {
#ifdef _WIN32
// Windows: C:/path or C:\path
return p.size() > 1 && std::isalpha(static_cast<unsigned char>(p[0])) && p[1] == ':';
#else
return !p.empty() && p[0] == '/';
#endif
}
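Editorial note, not part of the diff: a quick sketch of how the new helper classifies a few hypothetical paths. The Windows branch only recognizes a drive-letter prefix, so UNC paths such as \\server\share would be treated as relative.

// Illustrative sketch only; the paths below are hypothetical.
#include <cassert>
static void is_absolute_path_examples() {
#ifdef _WIN32
    assert(is_absolute_path("C:/loras/foo.safetensors"));
    assert(is_absolute_path("C:\\loras\\foo.safetensors"));
#else
    assert(is_absolute_path("/home/user/loras/foo.safetensors"));
#endif
    assert(!is_absolute_path("loras/foo.safetensors"));
}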
struct SDGenerationParams {
std::string prompt;
std::string negative_prompt;
@ -1070,7 +1079,12 @@ struct SDGenerationParams {
std::string pm_id_embed_path;
float pm_style_strength = 20.f;
int upscale_repeats = 1;
int upscale_repeats = 1;
int upscale_tile_size = 128;
std::map<std::string, float> lora_map;
std::map<std::string, float> high_noise_lora_map;
std::vector<sd_lora_t> lora_vec;
SDGenerationParams() {
sd_sample_params_init(&sample_params);
@ -1163,6 +1177,10 @@ struct SDGenerationParams {
"--upscale-repeats",
"Run the ESRGAN upscaler this many times (default: 1)",
&upscale_repeats},
{"",
"--upscale-tile-size",
"tile size for ESRGAN upscaling (default: 128)",
&upscale_tile_size},
};
options.float_options = {
@ -1442,7 +1460,88 @@ struct SDGenerationParams {
return options;
}
bool process_and_check(SDMode mode) {
void extract_and_remove_lora(const std::string& lora_model_dir) {
static const std::regex re(R"(<lora:([^:>]+):([^>]+)>)");
static const std::vector<std::string> valid_ext = {".pt", ".safetensors", ".gguf"};
std::smatch m;
std::string tmp = prompt;
while (std::regex_search(tmp, m, re)) {
std::string raw_path = m[1].str();
const std::string raw_mul = m[2].str();
float mul = 0.f;
try {
mul = std::stof(raw_mul);
} catch (...) {
tmp = m.suffix().str();
prompt = std::regex_replace(prompt, re, "", std::regex_constants::format_first_only);
continue;
}
bool is_high_noise = false;
static const std::string prefix = "|high_noise|";
if (raw_path.rfind(prefix, 0) == 0) {
raw_path.erase(0, prefix.size());
is_high_noise = true;
}
fs::path final_path;
if (is_absolute_path(raw_path)) {
final_path = raw_path;
} else {
final_path = fs::path(lora_model_dir) / raw_path;
}
if (!fs::exists(final_path)) {
bool found = false;
for (const auto& ext : valid_ext) {
fs::path try_path = final_path;
try_path += ext;
if (fs::exists(try_path)) {
final_path = try_path;
found = true;
break;
}
}
if (!found) {
printf("can not found lora %s\n", final_path.lexically_normal().string().c_str());
tmp = m.suffix().str();
prompt = std::regex_replace(prompt, re, "", std::regex_constants::format_first_only);
continue;
}
}
const std::string key = final_path.lexically_normal().string();
if (is_high_noise)
high_noise_lora_map[key] += mul;
else
lora_map[key] += mul;
prompt = std::regex_replace(prompt, re, "", std::regex_constants::format_first_only);
tmp = m.suffix().str();
}
for (const auto& kv : lora_map) {
sd_lora_t item;
item.is_high_noise = false;
item.path = kv.first.c_str();
item.multiplier = kv.second;
lora_vec.emplace_back(item);
}
for (const auto& kv : high_noise_lora_map) {
sd_lora_t item;
item.is_high_noise = true;
item.path = kv.first.c_str();
item.multiplier = kv.second;
lora_vec.emplace_back(item);
}
}
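Editorial note, not part of the diff: a minimal sketch of the intended flow, assuming a lora_model_dir of /models/loras that contains foo.safetensors and bar.safetensors (all names hypothetical).

// Illustrative sketch only; directory and file names are assumptions.
static void lora_extraction_example() {
    SDGenerationParams params;
    params.prompt = "a photo of a cat <lora:foo:0.8> <lora:|high_noise|bar:0.5>";
    params.extract_and_remove_lora("/models/loras");
    // params.prompt              -> "a photo of a cat  " (tags stripped)
    // params.lora_map            -> { "/models/loras/foo.safetensors": 0.8 }
    // params.high_noise_lora_map -> { "/models/loras/bar.safetensors": 0.5 }
    // params.lora_vec            -> two sd_lora_t entries whose path pointers reference
    //                               the map keys, so the maps must outlive lora_vec.
}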
bool process_and_check(SDMode mode, const std::string& lora_model_dir) {
if (width <= 0) {
fprintf(stderr, "error: the width must be greater than 0\n");
return false;
@ -1541,6 +1640,10 @@ struct SDGenerationParams {
return false;
}
if (upscale_tile_size < 1) {
fprintf(stderr, "error: the upscale tile size must be at least 1\n");
return false;
}
if (mode == UPSCALE) {
if (init_image_path.length() == 0) {
fprintf(stderr, "error: upscale mode needs an init image (--init-img)\n");
@ -1553,14 +1656,44 @@ struct SDGenerationParams {
seed = rand();
}
extract_and_remove_lora(lora_model_dir);
return true;
}
std::string to_string() const {
char* sample_params_str = sd_sample_params_to_str(&sample_params);
char* high_noise_sample_params_str = sd_sample_params_to_str(&high_noise_sample_params);
std::ostringstream lora_ss;
lora_ss << "{\n";
for (auto it = lora_map.begin(); it != lora_map.end(); ++it) {
lora_ss << " \"" << it->first << "\": \"" << it->second << "\"";
if (std::next(it) != lora_map.end()) {
lora_ss << ",";
}
lora_ss << "\n";
}
lora_ss << " }";
std::string loras_str = lora_ss.str();
lora_ss = std::ostringstream();
lora_ss << "{\n";
for (auto it = high_noise_lora_map.begin(); it != high_noise_lora_map.end(); ++it) {
lora_ss << " \"" << it->first << "\": \"" << it->second << "\"";
if (std::next(it) != high_noise_lora_map.end()) {
lora_ss << ",";
}
lora_ss << "\n";
}
lora_ss << " }";
std::string high_noise_loras_str = lora_ss.str();
std::ostringstream oss;
oss << "SDGenerationParams {\n"
<< " loras: \"" << loras_str << "\",\n"
<< " high_noise_loras: \"" << high_noise_loras_str << "\",\n"
<< " prompt: \"" << prompt << "\",\n"
<< " negative_prompt: \"" << negative_prompt << "\",\n"
<< " clip_skip: " << clip_skip << ",\n"
@ -1596,6 +1729,7 @@ struct SDGenerationParams {
<< " control_strength: " << control_strength << ",\n"
<< " seed: " << seed << ",\n"
<< " upscale_repeats: " << upscale_repeats << ",\n"
<< " upscale_tile_size: " << upscale_tile_size << ",\n"
<< "}";
free(sample_params_str);
free(high_noise_sample_params_str);
@ -1626,7 +1760,9 @@ void parse_args(int argc, const char** argv, SDCliParams& cli_params, SDContextP
exit(cli_params.normal_exit ? 0 : 1);
}
if (!cli_params.process_and_check() || !ctx_params.process_and_check(cli_params.mode) || !gen_params.process_and_check(cli_params.mode)) {
if (!cli_params.process_and_check() ||
!ctx_params.process_and_check(cli_params.mode) ||
!gen_params.process_and_check(cli_params.mode, ctx_params.lora_model_dir)) {
print_usage(argc, argv, options_vec);
exit(1);
}
@ -2139,6 +2275,8 @@ int main(int argc, const char* argv[]) {
if (cli_params.mode == IMG_GEN) {
sd_img_gen_params_t img_gen_params = {
gen_params.lora_vec.data(),
static_cast<uint32_t>(gen_params.lora_vec.size()),
gen_params.prompt.c_str(),
gen_params.negative_prompt.c_str(),
gen_params.clip_skip,
@ -2170,6 +2308,8 @@ int main(int argc, const char* argv[]) {
num_results = gen_params.batch_count;
} else if (cli_params.mode == VID_GEN) {
sd_vid_gen_params_t vid_gen_params = {
gen_params.lora_vec.data(),
static_cast<uint32_t>(gen_params.lora_vec.size()),
gen_params.prompt.c_str(),
gen_params.negative_prompt.c_str(),
gen_params.clip_skip,
@ -2206,7 +2346,8 @@ int main(int argc, const char* argv[]) {
upscaler_ctx_t* upscaler_ctx = new_upscaler_ctx(ctx_params.esrgan_path.c_str(),
ctx_params.offload_params_to_cpu,
ctx_params.diffusion_conv_direct,
ctx_params.n_threads);
ctx_params.n_threads,
gen_params.upscale_tile_size);
if (upscaler_ctx == nullptr) {
printf("new_upscaler_ctx failed\n");

View File

@ -60,6 +60,14 @@
#define SD_UNUSED(x) (void)(x)
#endif
__STATIC_INLINE__ int align_up_offset(int n, int multiple) {
return (multiple - n % multiple) % multiple;
}
__STATIC_INLINE__ int align_up(int n, int multiple) {
return n + align_up_offset(n, multiple);
}
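Editorial note, not part of the diff: the rounding behavior of the new helpers, with values chosen purely for illustration.

// Illustrative sketch only (assert requires <cassert>).
static void align_up_examples() {
    assert(align_up_offset(1000, 16) == 8);  // 1000 % 16 == 8, so 8 more reaches 1008
    assert(align_up(1000, 16) == 1008);
    assert(align_up_offset(1008, 16) == 0);  // already a multiple of 16
    assert(align_up(1008, 16) == 1008);
}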
__STATIC_INLINE__ void ggml_log_callback_default(ggml_log_level level, const char* text, void*) {
switch (level) {
case GGML_LOG_LEVEL_DEBUG:

View File

@ -937,28 +937,17 @@ public:
float multiplier,
ggml_backend_t backend,
LoraModel::filter_t lora_tensor_filter = nullptr) {
std::string lora_name = lora_id;
std::string high_noise_tag = "|high_noise|";
bool is_high_noise = false;
if (starts_with(lora_name, high_noise_tag)) {
lora_name = lora_name.substr(high_noise_tag.size());
std::string lora_path = lora_id;
static std::string high_noise_tag = "|high_noise|";
bool is_high_noise = false;
if (starts_with(lora_path, high_noise_tag)) {
lora_path = lora_path.substr(high_noise_tag.size());
is_high_noise = true;
LOG_DEBUG("high noise lora: %s", lora_name.c_str());
LOG_DEBUG("high noise lora: %s", lora_path.c_str());
}
std::string st_file_path = path_join(lora_model_dir, lora_name + ".safetensors");
std::string ckpt_file_path = path_join(lora_model_dir, lora_name + ".ckpt");
std::string file_path;
if (file_exists(st_file_path)) {
file_path = st_file_path;
} else if (file_exists(ckpt_file_path)) {
file_path = ckpt_file_path;
} else {
LOG_WARN("can not find %s or %s for lora %s", st_file_path.c_str(), ckpt_file_path.c_str(), lora_name.c_str());
return nullptr;
}
auto lora = std::make_shared<LoraModel>(lora_id, backend, file_path, is_high_noise ? "model.high_noise_" : "", version);
auto lora = std::make_shared<LoraModel>(lora_id, backend, lora_path, is_high_noise ? "model.high_noise_" : "", version);
if (!lora->load_from_file(n_threads, lora_tensor_filter)) {
LOG_WARN("load lora tensors from %s failed", file_path.c_str());
LOG_WARN("load lora tensors from %s failed", lora_path.c_str());
return nullptr;
}
@ -1143,12 +1132,15 @@ public:
}
}
std::string apply_loras_from_prompt(const std::string& prompt) {
auto result_pair = extract_and_remove_lora(prompt);
std::unordered_map<std::string, float> lora_f2m = result_pair.first; // lora_name -> multiplier
for (auto& kv : lora_f2m) {
LOG_DEBUG("lora %s:%.2f", kv.first.c_str(), kv.second);
void apply_loras(const sd_lora_t* loras, uint32_t lora_count) {
std::unordered_map<std::string, float> lora_f2m;
for (int i = 0; i < lora_count; i++) {
std::string lora_id = SAFE_STR(loras[i].path);
if (loras[i].is_high_noise) {
lora_id = "|high_noise|" + lora_id;
}
lora_f2m[lora_id] = loras[i].multiplier;
LOG_DEBUG("lora %s:%.2f", lora_id.c_str(), loras[i].multiplier);
}
int64_t t0 = ggml_time_ms();
if (apply_lora_immediately) {
@ -1159,9 +1151,7 @@ public:
int64_t t1 = ggml_time_ms();
if (!lora_f2m.empty()) {
LOG_INFO("apply_loras completed, taking %.2fs", (t1 - t0) * 1.0f / 1000);
LOG_DEBUG("prompt after extract and remove lora: \"%s\"", result_pair.second.c_str());
}
return result_pair.second;
}
ggml_tensor* id_encoder(ggml_context* work_ctx,
@ -1908,6 +1898,18 @@ public:
return vae_scale_factor;
}
int get_diffusion_model_down_factor() {
int down_factor = 8; // unet
if (sd_version_is_dit(version)) {
if (sd_version_is_wan(version)) {
down_factor = 2;
} else {
down_factor = 1;
}
}
return down_factor;
}
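Editorial note, not part of the diff: how this factor combines with the VAE scale factor in generate_image()/generate_video() below. A VAE scale factor of 8 is assumed here for concreteness; the diff itself does not fix that value.

// Illustrative arithmetic only; vae_scale_factor = 8 is an assumption.
static void spatial_alignment_example() {
    int vae_scale_factor = 8;
    int unet_multiple = vae_scale_factor * 8;  // UNet down factor -> 64
    int wan_multiple  = vae_scale_factor * 2;  // Wan down factor  -> 16
    assert(align_up(1000, unet_multiple) == 1024 && align_up(600, unet_multiple) == 640);
    assert(align_up(1000, wan_multiple) == 1008 && align_up(600, wan_multiple) == 608);
}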
int get_latent_channel() {
int latent_channel = 4;
if (sd_version_is_dit(version)) {
@ -2815,8 +2817,6 @@ sd_image_t* generate_image_internal(sd_ctx_t* sd_ctx,
int sample_steps = sigmas.size() - 1;
int64_t t0 = ggml_time_ms();
// Apply lora
prompt = sd_ctx->sd->apply_loras_from_prompt(prompt);
// Photo Maker
std::string prompt_text_only;
@ -3145,22 +3145,19 @@ sd_image_t* generate_image(sd_ctx_t* sd_ctx, const sd_img_gen_params_t* sd_img_g
sd_ctx->sd->vae_tiling_params = sd_img_gen_params->vae_tiling_params;
int width = sd_img_gen_params->width;
int height = sd_img_gen_params->height;
int vae_scale_factor = sd_ctx->sd->get_vae_scale_factor();
if (sd_version_is_dit(sd_ctx->sd->version)) {
if (width % 16 || height % 16) {
LOG_ERROR("Image dimensions must be must be a multiple of 16 on each axis for %s models. (Got %dx%d)",
model_version_to_str[sd_ctx->sd->version],
width,
height);
return nullptr;
}
} else if (width % 64 || height % 64) {
LOG_ERROR("Image dimensions must be must be a multiple of 64 on each axis for %s models. (Got %dx%d)",
model_version_to_str[sd_ctx->sd->version],
width,
height);
return nullptr;
int vae_scale_factor = sd_ctx->sd->get_vae_scale_factor();
int diffusion_model_down_factor = sd_ctx->sd->get_diffusion_model_down_factor();
int spatial_multiple = vae_scale_factor * diffusion_model_down_factor;
int width_offset = align_up_offset(width, spatial_multiple);
int height_offset = align_up_offset(height, spatial_multiple);
if (width_offset > 0 || height_offset > 0) {
width += width_offset;
height += height_offset;
LOG_WARN("align up %dx%d to %dx%d (multiple=%d)", sd_img_gen_params->width, sd_img_gen_params->height, width, height, spatial_multiple);
}
LOG_DEBUG("generate_image %dx%d", width, height);
if (sd_ctx == nullptr || sd_img_gen_params == nullptr) {
return nullptr;
@ -3188,6 +3185,9 @@ sd_image_t* generate_image(sd_ctx_t* sd_ctx, const sd_img_gen_params_t* sd_img_g
size_t t0 = ggml_time_ms();
// Apply lora
sd_ctx->sd->apply_loras(sd_img_gen_params->loras, sd_img_gen_params->lora_count);
enum sample_method_t sample_method = sd_img_gen_params->sample_params.sample_method;
if (sample_method == SAMPLE_METHOD_COUNT) {
sample_method = sd_get_default_sample_method(sd_ctx);
@ -3431,9 +3431,19 @@ SD_API sd_image_t* generate_video(sd_ctx_t* sd_ctx, const sd_vid_gen_params_t* s
int frames = sd_vid_gen_params->video_frames;
frames = (frames - 1) / 4 * 4 + 1;
int sample_steps = sd_vid_gen_params->sample_params.sample_steps;
LOG_INFO("generate_video %dx%dx%d", width, height, frames);
int vae_scale_factor = sd_ctx->sd->get_vae_scale_factor();
int vae_scale_factor = sd_ctx->sd->get_vae_scale_factor();
int diffusion_model_down_factor = sd_ctx->sd->get_diffusion_model_down_factor();
int spatial_multiple = vae_scale_factor * diffusion_model_down_factor;
int width_offset = align_up_offset(width, spatial_multiple);
int height_offset = align_up_offset(height, spatial_multiple);
if (width_offset > 0 || height_offset > 0) {
width += width_offset;
height += height_offset;
LOG_WARN("align up %dx%d to %dx%d (multiple=%d)", sd_vid_gen_params->width, sd_vid_gen_params->height, width, height, spatial_multiple);
}
LOG_INFO("generate_video %dx%dx%d", width, height, frames);
enum sample_method_t sample_method = sd_vid_gen_params->sample_params.sample_method;
if (sample_method == SAMPLE_METHOD_COUNT) {
@ -3487,7 +3497,7 @@ SD_API sd_image_t* generate_video(sd_ctx_t* sd_ctx, const sd_vid_gen_params_t* s
int64_t t0 = ggml_time_ms();
// Apply lora
prompt = sd_ctx->sd->apply_loras_from_prompt(prompt);
sd_ctx->sd->apply_loras(sd_vid_gen_params->loras, sd_vid_gen_params->lora_count);
ggml_tensor* init_latent = nullptr;
ggml_tensor* clip_vision_output = nullptr;

View File

@ -242,6 +242,14 @@ typedef struct {
} sd_easycache_params_t;
typedef struct {
bool is_high_noise;
float multiplier;
const char* path;
} sd_lora_t;
typedef struct {
const sd_lora_t* loras;
uint32_t lora_count;
const char* prompt;
const char* negative_prompt;
int clip_skip;
@ -265,6 +273,8 @@ typedef struct {
} sd_img_gen_params_t;
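Editorial note, not part of the diff: a caller-side sketch of filling the new lora fields of sd_img_gen_params_t (the LoRA paths are placeholders; all other fields are omitted here).

// Illustrative sketch only; the LoRA paths are placeholders.
static void set_lora_list_example(sd_img_gen_params_t* p) {
    static sd_lora_t loras[2];
    loras[0].is_high_noise = false;
    loras[0].multiplier    = 0.8f;
    loras[0].path          = "/models/loras/foo.safetensors";
    loras[1].is_high_noise = true;
    loras[1].multiplier    = 0.5f;
    loras[1].path          = "/models/loras/bar.safetensors";
    p->loras      = loras;
    p->lora_count = 2;
}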
typedef struct {
const sd_lora_t* loras;
uint32_t lora_count;
const char* prompt;
const char* negative_prompt;
int clip_skip;
@ -337,7 +347,8 @@ typedef struct upscaler_ctx_t upscaler_ctx_t;
SD_API upscaler_ctx_t* new_upscaler_ctx(const char* esrgan_path,
bool offload_params_to_cpu,
bool direct,
int n_threads);
int n_threads,
int tile_size);
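Editorial note, not part of the diff: the new parameter is fed from the --upscale-tile-size CLI flag; a minimal caller-side sketch (model path, thread count and tile size are placeholders).

// Illustrative sketch only; path, thread count and tile size are placeholders.
static upscaler_ctx_t* make_upscaler_example() {
    return new_upscaler_ctx("/models/esrgan/RealESRGAN_x4plus.safetensors",
                            /*offload_params_to_cpu=*/false,
                            /*direct=*/false,
                            /*n_threads=*/8,
                            /*tile_size=*/192);
}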
SD_API void free_upscaler_ctx(upscaler_ctx_t* upscaler_ctx);
SD_API sd_image_t upscale(upscaler_ctx_t* upscaler_ctx,

View File

@ -9,12 +9,15 @@ struct UpscalerGGML {
std::shared_ptr<ESRGAN> esrgan_upscaler;
std::string esrgan_path;
int n_threads;
bool direct = false;
bool direct = false;
int tile_size = 128;
UpscalerGGML(int n_threads,
bool direct = false)
bool direct = false,
int tile_size = 128)
: n_threads(n_threads),
direct(direct) {
direct(direct),
tile_size(tile_size) {
}
bool load_from_file(const std::string& esrgan_path,
@ -51,7 +54,7 @@ struct UpscalerGGML {
backend = ggml_backend_cpu_init();
}
LOG_INFO("Upscaler weight type: %s", ggml_type_name(model_data_type));
esrgan_upscaler = std::make_shared<ESRGAN>(backend, offload_params_to_cpu, model_loader.get_tensor_storage_map());
esrgan_upscaler = std::make_shared<ESRGAN>(backend, offload_params_to_cpu, tile_size, model_loader.get_tensor_storage_map());
if (direct) {
esrgan_upscaler->set_conv2d_direct_enabled(true);
}
@ -113,14 +116,15 @@ struct upscaler_ctx_t {
upscaler_ctx_t* new_upscaler_ctx(const char* esrgan_path_c_str,
bool offload_params_to_cpu,
bool direct,
int n_threads) {
int n_threads,
int tile_size) {
upscaler_ctx_t* upscaler_ctx = (upscaler_ctx_t*)malloc(sizeof(upscaler_ctx_t));
if (upscaler_ctx == nullptr) {
return nullptr;
}
std::string esrgan_path(esrgan_path_c_str);
upscaler_ctx->upscaler = new UpscalerGGML(n_threads, direct);
upscaler_ctx->upscaler = new UpscalerGGML(n_threads, direct, tile_size);
if (upscaler_ctx->upscaler == nullptr) {
return nullptr;
}

View File

@ -95,20 +95,6 @@ bool is_directory(const std::string& path) {
return (attributes != INVALID_FILE_ATTRIBUTES && (attributes & FILE_ATTRIBUTE_DIRECTORY));
}
std::string get_full_path(const std::string& dir, const std::string& filename) {
std::string full_path = dir + "\\" + filename;
WIN32_FIND_DATA find_file_data;
HANDLE hFind = FindFirstFile(full_path.c_str(), &find_file_data);
if (hFind != INVALID_HANDLE_VALUE) {
FindClose(hFind);
return full_path;
} else {
return "";
}
}
#else // Unix
#include <dirent.h>
#include <sys/stat.h>
@ -123,26 +109,6 @@ bool is_directory(const std::string& path) {
return (stat(path.c_str(), &buffer) == 0 && S_ISDIR(buffer.st_mode));
}
// TODO: add windows version
std::string get_full_path(const std::string& dir, const std::string& filename) {
DIR* dp = opendir(dir.c_str());
if (dp != nullptr) {
struct dirent* entry;
while ((entry = readdir(dp)) != nullptr) {
if (strcasecmp(entry->d_name, filename.c_str()) == 0) {
closedir(dp);
return dir + "/" + entry->d_name;
}
}
closedir(dp);
}
return "";
}
#endif
// get_num_physical_cores is copy from

util.h
View File

@ -22,7 +22,6 @@ int round_up_to(int value, int base);
bool file_exists(const std::string& filename);
bool is_directory(const std::string& path);
std::string get_full_path(const std::string& dir, const std::string& filename);
std::u32string utf8_to_utf32(const std::string& utf8_str);
std::string utf32_to_utf8(const std::u32string& utf32_str);