Mirror of https://github.com/leejet/stable-diffusion.cpp.git (synced 2025-12-12 13:28:37 +00:00)
feat: add Instruct-Pix2pix/CosXL-Edit support (#679)
* Instruct-p2p support
* support 2 conditionings cfg
* Do not re-encode the exact same image twice
* fixes for 2-cfg
* Fix pix2pix latent inputs + improve inpainting a bit + fix naming
* prepare for other pix2pix-like models
* Support sdxl ip2p
* fix reference image embeddings
* Support 2-cond cfg properly in cli
* fix typo in help
* Support masks for ip2p models
* unify code style
* delete unused code
* use edit mode
* add img_cond
* format code

---------

Co-authored-by: leejet <leejet714@gmail.com>
This commit is contained in:
parent 6d84a30c66
commit a772dca27a
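This commit replaces the separate cfg_scale/guidance floats in the public API with a single sd_guidance_params_t (defined in the stable-diffusion.h hunks below). As a minimal sketch, assuming only what the diff shows, this is how a caller might fill the struct the way the CLI now does; the helper name and the concrete values are illustrative, not part of the commit:

#include "stable-diffusion.h"
#include <vector>

// Hedged sketch: build the new guidance struct in the same field order as the
// CLI initializer in this commit, then pass it to txt2img()/img2img()/edit()
// where the old cfg_scale/guidance floats used to go.
sd_guidance_params_t make_guidance(std::vector<int>& slg_layers) {
    sd_guidance_params_t g = {
        7.0f,  // txt_cfg            (--cfg-scale)
        1.5f,  // img_cfg            (--img-cfg-scale, inpaint / instruct-pix2pix)
        1.0f,  // min_cfg            (only used by img2vid)
        3.5f,  // distilled_guidance (--guidance, for models with a guidance input)
        {
            slg_layers.data(),  // slg.layers
            slg_layers.size(),  // slg.layer_count
            0.01f,              // slg.layer_start
            0.2f,               // slg.layer_end
            0.0f,               // slg.scale (0 disables skip-layer guidance)
        },
    };
    return g;
}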
@@ -97,15 +97,16 @@ struct SDParams {

std::string prompt;
std::string negative_prompt;
float min_cfg = 1.0f;
float cfg_scale = 7.0f;
float guidance = 3.5f;
float eta = 0.f;
float style_ratio = 20.f;
int clip_skip = -1; // <= 0 represents unspecified
int width = 512;
int height = 512;
int batch_count = 1;
float min_cfg = 1.0f;
float cfg_scale = 7.0f;
float img_cfg_scale = INFINITY;
float guidance = 3.5f;
float eta = 0.f;
float style_ratio = 20.f;
int clip_skip = -1; // <= 0 represents unspecified
int width = 512;
int height = 512;
int batch_count = 1;

int video_frames = 6;
int motion_bucket_id = 127;
@@ -176,6 +177,7 @@ void print_params(SDParams params) {
printf(" negative_prompt: %s\n", params.negative_prompt.c_str());
printf(" min_cfg: %.2f\n", params.min_cfg);
printf(" cfg_scale: %.2f\n", params.cfg_scale);
printf(" img_cfg_scale: %.2f\n", params.img_cfg_scale);
printf(" slg_scale: %.2f\n", params.slg_scale);
printf(" guidance: %.2f\n", params.guidance);
printf(" eta: %.2f\n", params.eta);
@@ -234,7 +236,8 @@ void print_usage(int argc, const char* argv[]) {
printf(" -p, --prompt [PROMPT] the prompt to render\n");
printf(" -n, --negative-prompt PROMPT the negative prompt (default: \"\")\n");
printf(" --cfg-scale SCALE unconditional guidance scale: (default: 7.0)\n");
printf(" --guidance SCALE guidance scale for img2img (default: 3.5)\n");
printf(" --img-cfg-scale SCALE image guidance scale for inpaint or instruct-pix2pix models: (default: same as --cfg-scale)\n");
printf(" --guidance SCALE distilled guidance scale for models with guidance input (default: 3.5)\n");
printf(" --slg-scale SCALE skip layer guidance (SLG) scale, only for DiT models: (default: 0)\n");
printf(" 0 means disabled, a value of 2.5 is nice for sd3.5 medium\n");
printf(" --eta SCALE eta in DDIM, only for DDIM and TCD: (default: 0)\n");
@@ -470,6 +473,12 @@ void parse_args(int argc, const char** argv, SDParams& params) {
break;
}
params.cfg_scale = std::stof(argv[i]);
} else if (arg == "--img-cfg-scale") {
if (++i >= argc) {
invalid_arg = true;
break;
}
params.img_cfg_scale = std::stof(argv[i]);
} else if (arg == "--guidance") {
if (++i >= argc) {
invalid_arg = true;
@@ -755,6 +764,10 @@ void parse_args(int argc, const char** argv, SDParams& params) {
params.output_path = "output.gguf";
}
}

if (!isfinite(params.img_cfg_scale)) {
params.img_cfg_scale = params.cfg_scale;
}
}

static std::string sd_basename(const std::string& path) {
@@ -849,6 +862,18 @@ int main(int argc, const char* argv[]) {

parse_args(argc, argv, params);

sd_guidance_params_t guidance_params = {params.cfg_scale,
params.img_cfg_scale,
params.min_cfg,
params.guidance,
{
params.skip_layers.data(),
params.skip_layers.size(),
params.skip_layer_start,
params.skip_layer_end,
params.slg_scale,
}};

sd_set_log_callback(sd_log_cb, (void*)&params);

if (params.verbose) {
@@ -1041,8 +1066,7 @@ int main(int argc, const char* argv[]) {
params.prompt.c_str(),
params.negative_prompt.c_str(),
params.clip_skip,
params.cfg_scale,
params.guidance,
guidance_params,
params.eta,
params.width,
params.height,
@@ -1054,12 +1078,7 @@ int main(int argc, const char* argv[]) {
params.control_strength,
params.style_ratio,
params.normalize_input,
params.input_id_images_path.c_str(),
params.skip_layers.data(),
params.skip_layers.size(),
params.slg_scale,
params.skip_layer_start,
params.skip_layer_end);
params.input_id_images_path.c_str());
} else if (params.mode == IMG2IMG || params.mode == IMG2VID) {
sd_image_t input_image = {(uint32_t)params.width,
(uint32_t)params.height,
@@ -1075,8 +1094,7 @@ int main(int argc, const char* argv[]) {
params.motion_bucket_id,
params.fps,
params.augmentation_level,
params.min_cfg,
params.cfg_scale,
guidance_params,
params.sample_method,
params.sample_steps,
params.strength,
@@ -1109,8 +1127,7 @@ int main(int argc, const char* argv[]) {
params.prompt.c_str(),
params.negative_prompt.c_str(),
params.clip_skip,
params.cfg_scale,
params.guidance,
guidance_params,
params.eta,
params.width,
params.height,
@@ -1123,12 +1140,7 @@ int main(int argc, const char* argv[]) {
params.control_strength,
params.style_ratio,
params.normalize_input,
params.input_id_images_path.c_str(),
params.skip_layers.data(),
params.skip_layers.size(),
params.slg_scale,
params.skip_layer_start,
params.skip_layer_end);
params.input_id_images_path.c_str());
}
} else { // EDIT
results = edit(sd_ctx,
@@ -1137,25 +1149,19 @@ int main(int argc, const char* argv[]) {
params.prompt.c_str(),
params.negative_prompt.c_str(),
params.clip_skip,
params.cfg_scale,
params.guidance,
guidance_params,
params.eta,
params.width,
params.height,
params.sample_method,
params.sample_steps,
params.strength,
params.seed,
params.batch_count,
control_image,
params.control_strength,
params.style_ratio,
params.normalize_input,
params.skip_layers.data(),
params.skip_layers.size(),
params.slg_scale,
params.skip_layer_start,
params.skip_layer_end);
params.input_id_images_path.c_str());
}

if (results == NULL) {
@@ -1673,10 +1673,14 @@ SDVersion ModelLoader::get_sd_version() {
}
}
bool is_inpaint = input_block_weight.ne[2] == 9;
bool is_ip2p = input_block_weight.ne[2] == 8;
if (is_xl) {
if (is_inpaint) {
return VERSION_SDXL_INPAINT;
}
if (is_ip2p) {
return VERSION_SDXL_PIX2PIX;
}
return VERSION_SDXL;
}
@@ -1692,6 +1696,9 @@ SDVersion ModelLoader::get_sd_version() {
if (is_inpaint) {
return VERSION_SD1_INPAINT;
}
if (is_ip2p) {
return VERSION_SD1_PIX2PIX;
}
return VERSION_SD1;
} else if (token_embedding_weight.ne[0] == 1024) {
if (is_inpaint) {
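The detection above keys off the channel count of the UNet's first convolution weight (input_block_weight.ne[2]). A hedged restatement of that mapping as a standalone helper; the enum and function names are illustrative, not part of the commit:

#include <cstdint>

// 4 channels -> plain noise latent (regular SD / SDXL)
// 8 channels -> noise latent + reference-image latent (Instruct-Pix2Pix / CosXL-Edit)
// 9 channels -> noise latent + masked-image latent + 1 mask channel (inpaint)
enum class UNetInputKind { Standard, Pix2Pix, Inpaint, Unknown };

inline UNetInputKind classify_unet_input(int64_t in_channels) {
    switch (in_channels) {
        case 4:  return UNetInputKind::Standard;
        case 8:  return UNetInputKind::Pix2Pix;
        case 9:  return UNetInputKind::Inpaint;
        default: return UNetInputKind::Unknown;
    }
}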
model.h (14 changed lines)
@@ -21,10 +21,12 @@
enum SDVersion {
VERSION_SD1,
VERSION_SD1_INPAINT,
VERSION_SD1_PIX2PIX,
VERSION_SD2,
VERSION_SD2_INPAINT,
VERSION_SDXL,
VERSION_SDXL_INPAINT,
VERSION_SDXL_PIX2PIX,
VERSION_SVD,
VERSION_SD3,
VERSION_FLUX,
@@ -47,7 +49,7 @@ static inline bool sd_version_is_sd3(SDVersion version) {
}

static inline bool sd_version_is_sd1(SDVersion version) {
if (version == VERSION_SD1 || version == VERSION_SD1_INPAINT) {
if (version == VERSION_SD1 || version == VERSION_SD1_INPAINT || version == VERSION_SD1_PIX2PIX) {
return true;
}
return false;
@@ -61,7 +63,7 @@ static inline bool sd_version_is_sd2(SDVersion version) {
}

static inline bool sd_version_is_sdxl(SDVersion version) {
if (version == VERSION_SDXL || version == VERSION_SDXL_INPAINT) {
if (version == VERSION_SDXL || version == VERSION_SDXL_INPAINT || version == VERSION_SDXL_PIX2PIX) {
return true;
}
return false;
@@ -81,6 +83,14 @@ static inline bool sd_version_is_dit(SDVersion version) {
return false;
}

static inline bool sd_version_is_unet_edit(SDVersion version) {
return version == VERSION_SD1_PIX2PIX || version == VERSION_SDXL_PIX2PIX;
}

static bool sd_version_is_inpaint_or_unet_edit(SDVersion version) {
return sd_version_is_unet_edit(version) || sd_version_is_inpaint(version);
}

enum PMVersion {
PM_VERSION_1,
PM_VERSION_2,
@@ -27,10 +27,12 @@
const char* model_version_to_str[] = {
"SD 1.x",
"SD 1.x Inpaint",
"Instruct-Pix2Pix",
"SD 2.x",
"SD 2.x Inpaint",
"SDXL",
"SDXL Inpaint",
"SDXL Instruct-Pix2Pix",
"SVD",
"SD3.x",
"Flux",
@@ -824,22 +826,30 @@ public:
ggml_tensor* noise,
SDCondition cond,
SDCondition uncond,
SDCondition img_cond,
ggml_tensor* control_hint,
float control_strength,
float min_cfg,
float cfg_scale,
float guidance,
sd_guidance_params_t guidance,
float eta,
sample_method_t method,
const std::vector<float>& sigmas,
int start_merge_step,
SDCondition id_cond,
std::vector<ggml_tensor*> ref_latents = {},
std::vector<int> skip_layers = {},
float slg_scale = 0,
float skip_layer_start = 0.01,
float skip_layer_end = 0.2,
ggml_tensor* noise_mask = nullptr) {
ggml_tensor* denoise_mask = nullptr) {
std::vector<int> skip_layers(guidance.slg.layers, guidance.slg.layers + guidance.slg.layer_count);

float cfg_scale = guidance.txt_cfg;
float img_cfg_scale = guidance.img_cfg;
float slg_scale = guidance.slg.scale;

float min_cfg = guidance.min_cfg;

if (img_cfg_scale != cfg_scale && !sd_version_is_inpaint_or_unet_edit(version)) {
LOG_WARN("2-conditioning CFG is not supported with this model, disabling it for better performance...");
img_cfg_scale = cfg_scale;
}

LOG_DEBUG("Sample");
struct ggml_init_params params;
size_t data_size = ggml_row_size(init_latent->type, init_latent->ne[0]);
@@ -861,13 +871,15 @@ public:

struct ggml_tensor* noised_input = ggml_dup_tensor(work_ctx, noise);

bool has_unconditioned = cfg_scale != 1.0 && uncond.c_crossattn != NULL;
bool has_unconditioned = img_cfg_scale != 1.0 && uncond.c_crossattn != NULL;
bool has_img_cond = cfg_scale != img_cfg_scale && img_cond.c_crossattn != NULL;
bool has_skiplayer = slg_scale != 0.0 && skip_layers.size() > 0;

// denoise wrapper
struct ggml_tensor* out_cond = ggml_dup_tensor(work_ctx, x);
struct ggml_tensor* out_uncond = NULL;
struct ggml_tensor* out_skip = NULL;
struct ggml_tensor* out_cond = ggml_dup_tensor(work_ctx, x);
struct ggml_tensor* out_uncond = NULL;
struct ggml_tensor* out_skip = NULL;
struct ggml_tensor* out_img_cond = NULL;

if (has_unconditioned) {
out_uncond = ggml_dup_tensor(work_ctx, x);
@@ -880,6 +892,9 @@ public:
LOG_WARN("SLG is incompatible with %s models", model_version_to_str[version]);
}
}
if (has_img_cond) {
out_img_cond = ggml_dup_tensor(work_ctx, x);
}
struct ggml_tensor* denoised = ggml_dup_tensor(work_ctx, x);

auto denoise = [&](ggml_tensor* input, float sigma, int step) -> ggml_tensor* {
@@ -897,7 +912,7 @@ public:
float t = denoiser->sigma_to_t(sigma);
std::vector<float> timesteps_vec(x->ne[3], t); // [N, ]
auto timesteps = vector_to_ggml_tensor(work_ctx, timesteps_vec);
std::vector<float> guidance_vec(x->ne[3], guidance);
std::vector<float> guidance_vec(x->ne[3], guidance.distilled_guidance);
auto guidance_tensor = vector_to_ggml_tensor(work_ctx, guidance_vec);

copy_ggml_tensor(noised_input, input);
@@ -964,8 +979,25 @@ public:
negative_data = (float*)out_uncond->data;
}

float* img_cond_data = NULL;
if (has_img_cond) {
diffusion_model->compute(n_threads,
noised_input,
timesteps,
img_cond.c_crossattn,
img_cond.c_concat,
img_cond.c_vector,
guidance_tensor,
ref_latents,
-1,
controls,
control_strength,
&out_img_cond);
img_cond_data = (float*)out_img_cond->data;
}

int step_count = sigmas.size();
bool is_skiplayer_step = has_skiplayer && step > (int)(skip_layer_start * step_count) && step < (int)(skip_layer_end * step_count);
bool is_skiplayer_step = has_skiplayer && step > (int)(guidance.slg.layer_start * step_count) && step < (int)(guidance.slg.layer_end * step_count);
float* skip_layer_data = NULL;
if (is_skiplayer_step) {
LOG_DEBUG("Skipping layers at step %d\n", step);
@@ -999,8 +1031,17 @@ public:
int64_t i3 = i / out_cond->ne[0] * out_cond->ne[1] * out_cond->ne[2];
float scale = min_cfg + (cfg_scale - min_cfg) * (i3 * 1.0f / ne3);
} else {
latent_result = negative_data[i] + cfg_scale * (positive_data[i] - negative_data[i]);
if (has_img_cond) {
// out_uncond + text_cfg_scale * (out_cond - out_img_cond) + image_cfg_scale * (out_img_cond - out_uncond)
latent_result = negative_data[i] + img_cfg_scale * (img_cond_data[i] - negative_data[i]) + cfg_scale * (positive_data[i] - img_cond_data[i]);
} else {
// img_cfg_scale == cfg_scale
latent_result = negative_data[i] + cfg_scale * (positive_data[i] - negative_data[i]);
}
}
} else if (has_img_cond) {
// img_cfg_scale == 1
latent_result = img_cond_data[i] + cfg_scale * (positive_data[i] - img_cond_data[i]);
}
if (is_skiplayer_step) {
latent_result = latent_result + (positive_data[i] - skip_layer_data[i]) * slg_scale;
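For reference, the two blends used above written out as standalone helpers; this is a hedged restatement of the formula in the diff's comment (out_uncond + text_cfg_scale * (out_cond - out_img_cond) + image_cfg_scale * (out_img_cond - out_uncond)), not code from the commit:

// cond     : denoiser output with text + image conditioning
// img_cond : image conditioning only (text taken from the unconditional branch)
// uncond   : unconditional / negative output
inline float cfg_blend_ip2p(float cond, float img_cond, float uncond,
                            float txt_cfg, float img_cfg) {
    // Instruct-Pix2Pix style double guidance.
    return uncond + img_cfg * (img_cond - uncond) + txt_cfg * (cond - img_cond);
}

inline float cfg_blend(float cond, float uncond, float cfg) {
    // Ordinary CFG; used when txt_cfg == img_cfg, with img_cond standing in for
    // uncond when img_cfg == 1 (the unconditional pass is skipped in that case).
    return uncond + cfg * (cond - uncond);
}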
@@ -1014,10 +1055,10 @@ public:
pretty_progress(step, (int)steps, (t1 - t0) / 1000000.f);
// LOG_INFO("step %d sampling completed taking %.2fs", step, (t1 - t0) * 1.0f / 1000000);
}
if (noise_mask != nullptr) {
if (denoise_mask != nullptr) {
for (int64_t x = 0; x < denoised->ne[0]; x++) {
for (int64_t y = 0; y < denoised->ne[1]; y++) {
float mask = ggml_tensor_get_f32(noise_mask, x, y);
float mask = ggml_tensor_get_f32(denoise_mask, x, y);
for (int64_t k = 0; k < denoised->ne[2]; k++) {
float init = ggml_tensor_get_f32(init_latent, x, y, k);
float den = ggml_tensor_get_f32(denoised, x, y, k);
@@ -1240,8 +1281,7 @@ sd_image_t* generate_image(sd_ctx_t* sd_ctx,
std::string prompt,
std::string negative_prompt,
int clip_skip,
float cfg_scale,
float guidance,
sd_guidance_params_t guidance,
float eta,
int width,
int height,
@@ -1255,11 +1295,8 @@ sd_image_t* generate_image(sd_ctx_t* sd_ctx,
bool normalize_input,
std::string input_id_images_path,
std::vector<ggml_tensor*> ref_latents,
std::vector<int> skip_layers = {},
float slg_scale = 0,
float skip_layer_start = 0.01,
float skip_layer_end = 0.2,
ggml_tensor* masked_image = NULL) {
ggml_tensor* concat_latent = NULL,
ggml_tensor* denoise_mask = NULL) {
if (seed < 0) {
// Generally, when using the provided command line, the seed is always >0.
// However, to prevent potential issues if 'stable-diffusion.cpp' is invoked as a library
@@ -1407,7 +1444,8 @@ sd_image_t* generate_image(sd_ctx_t* sd_ctx,
sd_ctx->sd->diffusion_model->get_adm_in_channels());

SDCondition uncond;
if (cfg_scale != 1.0) {
if (guidance.txt_cfg != 1.0 ||
(sd_version_is_inpaint_or_unet_edit(sd_ctx->sd->version) && guidance.txt_cfg != guidance.img_cfg)) {
bool force_zero_embeddings = false;
if (sd_version_is_sdxl(sd_ctx->sd->version) && negative_prompt.size() == 0 && !sd_ctx->sd->is_using_edm_v_parameterization) {
force_zero_embeddings = true;
@@ -1446,38 +1484,50 @@ sd_image_t* generate_image(sd_ctx_t* sd_ctx,
int W = width / 8;
int H = height / 8;
LOG_INFO("sampling using %s method", sampling_methods_str[sample_method]);
ggml_tensor* noise_mask = nullptr;
if (sd_version_is_inpaint(sd_ctx->sd->version)) {
if (masked_image == NULL) {
int64_t mask_channels = 1;
if (sd_ctx->sd->version == VERSION_FLUX_FILL) {
mask_channels = 8 * 8; // flatten the whole mask
}
// no mask, set the whole image as masked
masked_image = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, init_latent->ne[0], init_latent->ne[1], mask_channels + init_latent->ne[2], 1);
for (int64_t x = 0; x < masked_image->ne[0]; x++) {
for (int64_t y = 0; y < masked_image->ne[1]; y++) {
if (sd_ctx->sd->version == VERSION_FLUX_FILL) {
// TODO: this might be wrong
for (int64_t c = 0; c < init_latent->ne[2]; c++) {
ggml_tensor_set_f32(masked_image, 0, x, y, c);
}
for (int64_t c = init_latent->ne[2]; c < masked_image->ne[2]; c++) {
ggml_tensor_set_f32(masked_image, 1, x, y, c);
}
} else {
ggml_tensor_set_f32(masked_image, 1, x, y, 0);
for (int64_t c = 1; c < masked_image->ne[2]; c++) {
ggml_tensor_set_f32(masked_image, 0, x, y, c);
}
int64_t mask_channels = 1;
if (sd_ctx->sd->version == VERSION_FLUX_FILL) {
mask_channels = 8 * 8; // flatten the whole mask
}
auto empty_latent = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, init_latent->ne[0], init_latent->ne[1], mask_channels + init_latent->ne[2], 1);
// no mask, set the whole image as masked
for (int64_t x = 0; x < empty_latent->ne[0]; x++) {
for (int64_t y = 0; y < empty_latent->ne[1]; y++) {
if (sd_ctx->sd->version == VERSION_FLUX_FILL) {
// TODO: this might be wrong
for (int64_t c = 0; c < init_latent->ne[2]; c++) {
ggml_tensor_set_f32(empty_latent, 0, x, y, c);
}
for (int64_t c = init_latent->ne[2]; c < empty_latent->ne[2]; c++) {
ggml_tensor_set_f32(empty_latent, 1, x, y, c);
}
} else {
ggml_tensor_set_f32(empty_latent, 1, x, y, 0);
for (int64_t c = 1; c < empty_latent->ne[2]; c++) {
ggml_tensor_set_f32(empty_latent, 0, x, y, c);
}
}
}
}
cond.c_concat = masked_image;
uncond.c_concat = masked_image;
} else {
noise_mask = masked_image;
if (concat_latent == NULL) {
concat_latent = empty_latent;
}
cond.c_concat = concat_latent;
uncond.c_concat = empty_latent;
denoise_mask = NULL;
} else if (sd_version_is_unet_edit(sd_ctx->sd->version)) {
auto empty_latent = ggml_dup_tensor(work_ctx, init_latent);
ggml_set_f32(empty_latent, 0);
uncond.c_concat = empty_latent;
if (concat_latent == NULL) {
concat_latent = empty_latent;
}
cond.c_concat = ref_latents[0];
}
SDCondition img_cond;
if (uncond.c_crossattn != NULL &&
(sd_version_is_inpaint_or_unet_edit(sd_ctx->sd->version) && guidance.txt_cfg != guidance.img_cfg)) {
img_cond = SDCondition(uncond.c_crossattn, uncond.c_vector, cond.c_concat);
}
for (int b = 0; b < batch_count; b++) {
int64_t sampling_start = ggml_time_ms();
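A hedged summary of how the three denoiser passes are conditioned on the UNet edit (Instruct-Pix2Pix) path above; the placeholder types and helper are illustrative, only the wiring mirrors the diff:

struct LatentRef;  // stand-in for the ggml_tensor* used in the real code

struct EditConcat {
    const LatentRef* cond;      // reference-image latent (ref_latents[0])
    const LatentRef* uncond;    // all-zero latent of the same shape
    const LatentRef* img_cond;  // reference-image latent again, paired with the
                                // unconditional text embedding
};

// img_cond is only built when txt_cfg != img_cfg, i.e. when the 2-conditioning
// blend is actually requested; otherwise the extra pass is skipped.
inline EditConcat wire_edit_concat(const LatentRef* ref_latent,
                                   const LatentRef* zero_latent) {
    return {ref_latent, zero_latent, ref_latent};
}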
@@ -1497,15 +1547,17 @@ sd_image_t* generate_image(sd_ctx_t* sd_ctx,
LOG_INFO("PHOTOMAKER: start_merge_step: %d", start_merge_step);
}

// Disable min_cfg
guidance.min_cfg = guidance.txt_cfg;

struct ggml_tensor* x_0 = sd_ctx->sd->sample(work_ctx,
x_t,
noise,
cond,
uncond,
img_cond,
image_hint,
control_strength,
cfg_scale,
cfg_scale,
guidance,
eta,
sample_method,
@@ -1513,11 +1565,7 @@ sd_image_t* generate_image(sd_ctx_t* sd_ctx,
start_merge_step,
id_cond,
ref_latents,
skip_layers,
slg_scale,
skip_layer_start,
skip_layer_end,
noise_mask);
denoise_mask);

// struct ggml_tensor* x_0 = load_tensor_from_file(ctx, "samples_ddim.bin");
// print_ggml_tensor(x_0);
@@ -1595,8 +1643,7 @@ sd_image_t* txt2img(sd_ctx_t* sd_ctx,
const char* prompt_c_str,
const char* negative_prompt_c_str,
int clip_skip,
float cfg_scale,
float guidance,
sd_guidance_params_t guidance,
float eta,
int width,
int height,
@@ -1608,13 +1655,7 @@ sd_image_t* txt2img(sd_ctx_t* sd_ctx,
float control_strength,
float style_ratio,
bool normalize_input,
const char* input_id_images_path_c_str,
int* skip_layers = NULL,
size_t skip_layers_count = 0,
float slg_scale = 0,
float skip_layer_start = 0.01,
float skip_layer_end = 0.2) {
std::vector<int> skip_layers_vec(skip_layers, skip_layers + skip_layers_count);
const char* input_id_images_path_c_str) {
LOG_DEBUG("txt2img %dx%d", width, height);
if (sd_ctx == NULL) {
return NULL;
@@ -1659,7 +1700,6 @@ sd_image_t* txt2img(sd_ctx_t* sd_ctx,
prompt_c_str,
negative_prompt_c_str,
clip_skip,
cfg_scale,
guidance,
eta,
width,
@@ -1673,11 +1713,7 @@ sd_image_t* txt2img(sd_ctx_t* sd_ctx,
style_ratio,
normalize_input,
input_id_images_path_c_str,
{},
skip_layers_vec,
slg_scale,
skip_layer_start,
skip_layer_end);
{});

size_t t1 = ggml_time_ms();
@@ -1692,8 +1728,7 @@ sd_image_t* img2img(sd_ctx_t* sd_ctx,
const char* prompt_c_str,
const char* negative_prompt_c_str,
int clip_skip,
float cfg_scale,
float guidance,
sd_guidance_params_t guidance,
float eta,
int width,
int height,
@@ -1706,13 +1741,7 @@ sd_image_t* img2img(sd_ctx_t* sd_ctx,
float control_strength,
float style_ratio,
bool normalize_input,
const char* input_id_images_path_c_str,
int* skip_layers = NULL,
size_t skip_layers_count = 0,
float slg_scale = 0,
float skip_layer_start = 0.01,
float skip_layer_end = 0.2) {
std::vector<int> skip_layers_vec(skip_layers, skip_layers + skip_layers_count);
const char* input_id_images_path_c_str) {
LOG_DEBUG("img2img %dx%d", width, height);
if (sd_ctx == NULL) {
return NULL;
@@ -1756,7 +1785,8 @@ sd_image_t* img2img(sd_ctx_t* sd_ctx,

sd_image_to_tensor(init_image.data, init_img);

ggml_tensor* masked_image;
ggml_tensor* concat_latent;
ggml_tensor* denoise_mask = NULL;

if (sd_version_is_inpaint(sd_ctx->sd->version)) {
int64_t mask_channels = 1;
@@ -1765,22 +1795,22 @@ sd_image_t* img2img(sd_ctx_t* sd_ctx,
}
ggml_tensor* masked_img = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, width, height, 3, 1);
sd_apply_mask(init_img, mask_img, masked_img);
ggml_tensor* masked_image_0 = NULL;
ggml_tensor* masked_latent = NULL;
if (!sd_ctx->sd->use_tiny_autoencoder) {
ggml_tensor* moments = sd_ctx->sd->encode_first_stage(work_ctx, masked_img);
masked_image_0 = sd_ctx->sd->get_first_stage_encoding(work_ctx, moments);
masked_latent = sd_ctx->sd->get_first_stage_encoding(work_ctx, moments);
} else {
masked_image_0 = sd_ctx->sd->encode_first_stage(work_ctx, masked_img);
masked_latent = sd_ctx->sd->encode_first_stage(work_ctx, masked_img);
}
masked_image = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, masked_image_0->ne[0], masked_image_0->ne[1], mask_channels + masked_image_0->ne[2], 1);
for (int ix = 0; ix < masked_image_0->ne[0]; ix++) {
for (int iy = 0; iy < masked_image_0->ne[1]; iy++) {
concat_latent = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, masked_latent->ne[0], masked_latent->ne[1], mask_channels + masked_latent->ne[2], 1);
for (int ix = 0; ix < masked_latent->ne[0]; ix++) {
for (int iy = 0; iy < masked_latent->ne[1]; iy++) {
int mx = ix * 8;
int my = iy * 8;
if (sd_ctx->sd->version == VERSION_FLUX_FILL) {
for (int k = 0; k < masked_image_0->ne[2]; k++) {
float v = ggml_tensor_get_f32(masked_image_0, ix, iy, k);
ggml_tensor_set_f32(masked_image, v, ix, iy, k);
for (int k = 0; k < masked_latent->ne[2]; k++) {
float v = ggml_tensor_get_f32(masked_latent, ix, iy, k);
ggml_tensor_set_f32(concat_latent, v, ix, iy, k);
}
// "Encode" 8x8 mask chunks into a flattened 1x64 vector, and concatenate to masked image
for (int x = 0; x < 8; x++) {
@@ -1788,28 +1818,30 @@ sd_image_t* img2img(sd_ctx_t* sd_ctx,
float m = ggml_tensor_get_f32(mask_img, mx + x, my + y);
// TODO: check if the way the mask is flattened is correct (is it supposed to be x*8+y or x+8*y?)
// python code was using "b (h 8) (w 8) -> b (8 8) h w"
ggml_tensor_set_f32(masked_image, m, ix, iy, masked_image_0->ne[2] + x * 8 + y);
ggml_tensor_set_f32(concat_latent, m, ix, iy, masked_latent->ne[2] + x * 8 + y);
}
}
} else {
float m = ggml_tensor_get_f32(mask_img, mx, my);
ggml_tensor_set_f32(masked_image, m, ix, iy, 0);
for (int k = 0; k < masked_image_0->ne[2]; k++) {
float v = ggml_tensor_get_f32(masked_image_0, ix, iy, k);
ggml_tensor_set_f32(masked_image, v, ix, iy, k + mask_channels);
ggml_tensor_set_f32(concat_latent, m, ix, iy, 0);
for (int k = 0; k < masked_latent->ne[2]; k++) {
float v = ggml_tensor_get_f32(masked_latent, ix, iy, k);
ggml_tensor_set_f32(concat_latent, v, ix, iy, k + mask_channels);
}
}
}
}
} else {
}

{
// LOG_WARN("Inpainting with a base model is not great");
masked_image = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, width / 8, height / 8, 1, 1);
for (int ix = 0; ix < masked_image->ne[0]; ix++) {
for (int iy = 0; iy < masked_image->ne[1]; iy++) {
denoise_mask = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, width / 8, height / 8, 1, 1);
for (int ix = 0; ix < denoise_mask->ne[0]; ix++) {
for (int iy = 0; iy < denoise_mask->ne[1]; iy++) {
int mx = ix * 8;
int my = iy * 8;
float m = ggml_tensor_get_f32(mask_img, mx, my);
ggml_tensor_set_f32(masked_image, m, ix, iy);
ggml_tensor_set_f32(denoise_mask, m, ix, iy);
}
}
}
@@ -1822,7 +1854,6 @@ sd_image_t* img2img(sd_ctx_t* sd_ctx,
init_latent = sd_ctx->sd->encode_first_stage(work_ctx, init_img);
}

print_ggml_tensor(init_latent, true);
size_t t1 = ggml_time_ms();
LOG_INFO("encode_first_stage completed, taking %.2fs", (t1 - t0) * 1.0f / 1000);
@@ -1840,7 +1871,6 @@ sd_image_t* img2img(sd_ctx_t* sd_ctx,
prompt_c_str,
negative_prompt_c_str,
clip_skip,
cfg_scale,
guidance,
eta,
width,
@@ -1855,11 +1885,8 @@ sd_image_t* img2img(sd_ctx_t* sd_ctx,
normalize_input,
input_id_images_path_c_str,
{},
skip_layers_vec,
slg_scale,
skip_layer_start,
skip_layer_end,
masked_image);
concat_latent,
denoise_mask);

size_t t2 = ggml_time_ms();
@@ -1876,8 +1903,7 @@ SD_API sd_image_t* img2vid(sd_ctx_t* sd_ctx,
int motion_bucket_id,
int fps,
float augmentation_level,
float min_cfg,
float cfg_scale,
sd_guidance_params_t guidance,
enum sample_method_t sample_method,
int sample_steps,
float strength,
@@ -1953,10 +1979,9 @@ SD_API sd_image_t* img2vid(sd_ctx_t* sd_ctx,
cond,
uncond,
{},
{},
0.f,
min_cfg,
cfg_scale,
0.f,
guidance,
0.f,
sample_method,
sigmas,
@@ -2007,26 +2032,19 @@ sd_image_t* edit(sd_ctx_t* sd_ctx,
const char* prompt_c_str,
const char* negative_prompt_c_str,
int clip_skip,
float cfg_scale,
float guidance,
sd_guidance_params_t guidance,
float eta,
int width,
int height,
sample_method_t sample_method,
enum sample_method_t sample_method,
int sample_steps,
float strength,
int64_t seed,
int batch_count,
const sd_image_t* control_cond,
float control_strength,
float style_ratio,
bool normalize_input,
int* skip_layers = NULL,
size_t skip_layers_count = 0,
float slg_scale = 0,
float skip_layer_start = 0.01,
float skip_layer_end = 0.2) {
std::vector<int> skip_layers_vec(skip_layers, skip_layers + skip_layers_count);
const char* input_id_images_path_c_str) {
LOG_DEBUG("edit %dx%d", width, height);
if (sd_ctx == NULL) {
return NULL;
@@ -2064,11 +2082,21 @@ sd_image_t* edit(sd_ctx_t* sd_ctx,
sd_image_to_tensor(ref_images[i].data, img);

ggml_tensor* latent = NULL;
if (!sd_ctx->sd->use_tiny_autoencoder) {
if (sd_ctx->sd->use_tiny_autoencoder) {
latent = sd_ctx->sd->encode_first_stage(work_ctx, img);
} else if (sd_ctx->sd->version == VERSION_SD1_PIX2PIX) {
latent = sd_ctx->sd->encode_first_stage(work_ctx, img);
latent = ggml_view_3d(work_ctx,
latent,
latent->ne[0],
latent->ne[1],
latent->ne[2] / 2,
latent->nb[1],
latent->nb[2],
0);
} else {
ggml_tensor* moments = sd_ctx->sd->encode_first_stage(work_ctx, img);
latent = sd_ctx->sd->get_first_stage_encoding(work_ctx, moments);
} else {
latent = sd_ctx->sd->encode_first_stage(work_ctx, img);
}
ref_latents.push_back(latent);
}
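In the VERSION_SD1_PIX2PIX branch above, encode_first_stage returns the VAE distribution moments (mean and log-variance stacked along the channel axis), and the ggml_view_3d keeps only the first half, i.e. the mean, which is then used directly as the reference latent. A hedged sketch of that channel split on a plain float buffer, assuming the usual channel-major layout; the helper is illustrative, not part of the commit:

#include <cstddef>
#include <vector>

// moments holds [mean(0..C-1), logvar(C..2C-1)]; keep only the mean half.
std::vector<float> moments_mean(const std::vector<float>& moments,
                                size_t w, size_t h, size_t channels /* = 2*C */) {
    size_t c_half = channels / 2;
    std::vector<float> mean(w * h * c_half);
    // The mean channels form a contiguous prefix, which is why a zero-offset
    // view of ne[2]/2 channels works in the diff above.
    for (size_t i = 0; i < mean.size(); i++) {
        mean[i] = moments[i];
    }
    return mean;
}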
@@ -2086,7 +2114,6 @@ sd_image_t* edit(sd_ctx_t* sd_ctx,
prompt_c_str,
negative_prompt_c_str,
clip_skip,
cfg_scale,
guidance,
eta,
width,
@@ -2101,10 +2128,6 @@ sd_image_t* edit(sd_ctx_t* sd_ctx,
normalize_input,
"",
ref_latents,
skip_layers_vec,
slg_scale,
skip_layer_start,
skip_layer_end,
NULL);

size_t t2 = ggml_time_ms();
@@ -129,6 +129,22 @@ typedef struct {

typedef struct sd_ctx_t sd_ctx_t;

typedef struct {
int* layers;
size_t layer_count;
float layer_start;
float layer_end;
float scale;
} sd_slg_params_t;

typedef struct {
float txt_cfg;
float img_cfg;
float min_cfg;
float distilled_guidance;
sd_slg_params_t slg;
} sd_guidance_params_t;

SD_API sd_ctx_t* new_sd_ctx(const char* model_path,
const char* clip_l_path,
const char* clip_g_path,
@@ -161,8 +177,7 @@ SD_API sd_image_t* txt2img(sd_ctx_t* sd_ctx,
const char* prompt,
const char* negative_prompt,
int clip_skip,
float cfg_scale,
float guidance,
sd_guidance_params_t guidance,
float eta,
int width,
int height,
@@ -174,12 +189,7 @@ SD_API sd_image_t* txt2img(sd_ctx_t* sd_ctx,
float control_strength,
float style_strength,
bool normalize_input,
const char* input_id_images_path,
int* skip_layers,
size_t skip_layers_count,
float slg_scale,
float skip_layer_start,
float skip_layer_end);
const char* input_id_images_path);

SD_API sd_image_t* img2img(sd_ctx_t* sd_ctx,
sd_image_t init_image,
@@ -187,8 +197,7 @@ SD_API sd_image_t* img2img(sd_ctx_t* sd_ctx,
const char* prompt,
const char* negative_prompt,
int clip_skip,
float cfg_scale,
float guidance,
sd_guidance_params_t guidance,
float eta,
int width,
int height,
@@ -201,12 +210,7 @@ SD_API sd_image_t* img2img(sd_ctx_t* sd_ctx,
float control_strength,
float style_strength,
bool normalize_input,
const char* input_id_images_path,
int* skip_layers,
size_t skip_layers_count,
float slg_scale,
float skip_layer_start,
float skip_layer_end);
const char* input_id_images_path);

SD_API sd_image_t* img2vid(sd_ctx_t* sd_ctx,
sd_image_t init_image,
@@ -216,8 +220,7 @@ SD_API sd_image_t* img2vid(sd_ctx_t* sd_ctx,
int motion_bucket_id,
int fps,
float augmentation_level,
float min_cfg,
float cfg_scale,
sd_guidance_params_t guidance,
enum sample_method_t sample_method,
int sample_steps,
float strength,
@@ -229,25 +232,19 @@ SD_API sd_image_t* edit(sd_ctx_t* sd_ctx,
const char* prompt,
const char* negative_prompt,
int clip_skip,
float cfg_scale,
float guidance,
sd_guidance_params_t guidance,
float eta,
int width,
int height,
enum sample_method_t sample_method,
int sample_steps,
float strength,
int64_t seed,
int batch_count,
const sd_image_t* control_cond,
float control_strength,
float style_strength,
bool normalize_input,
int* skip_layers,
size_t skip_layers_count,
float slg_scale,
float skip_layer_start,
float skip_layer_end);
const char* input_id_images_path);

typedef struct upscaler_ctx_t upscaler_ctx_t;