// Patch overview (reviewer commentary placed ahead of the first "diff --git" header; it is not part of any hunk).
//
// Files touched:
//   common.hpp           - one-line hunk at `x = conv->forward(ctx, x);`. The removed and added lines read the
//                          same in this copy, so the change appears to be whitespace/alignment only.
//   denoiser.hpp         - EDMVDenoiser: the constructor's member-initializer list is moved onto its own line and
//                          `4/(float)TIMESTEPS` is respaced to `4 / (float)TIMESTEPS`. No token changes.
//   ggml_extend.hpp      - ggml_kronecker: the `GGML_SCALE_MODE_NEAREST),` argument line is replaced by text that
//                          reads the same here (presumably re-indented).
//   stable-diffusion.cpp - adds generate_init_latent(sd_ctx, work_ctx, width, height) and replaces the two
//                          duplicated inline copies of that logic in txt2img() and edit() with calls to it.
//
// generate_init_latent, as added by this patch:
//   - C is 4 by default and 16 when sd_version_is_sd3(...) or sd_version_is_flux(...) is true.
//   - W = width / 8 and H = height / 8 use int division, so a remainder is truncated.
//     NOTE(review): presumably callers pass dimensions that are multiples of 8 - confirm.
//   - Allocates a GGML_TYPE_F32 4-D tensor with dims (W, H, C, 1) from work_ctx.
//   - Fills it with 0.0609f for SD3, 0.1159f for Flux, and 0.f otherwise, then returns it.
//
// Ordering changes visible in the hunks:
//   - txt2img(): the latent is now created after the inpaint-model LOG_WARN instead of before it.
//   - edit(): the latent is now created after get_sigmas(sample_steps) instead of right after
//     rng->manual_seed(seed).
//
// NOTE(review): this copy of the patch has its line breaks collapsed, and angle-bracketed template arguments
// look to have been stripped (`std::dynamic_pointer_cast(`, `std::make_shared()`, `std::vector sigmas`,
// `std::vector ref_latents`). It likely will not apply or compile as-is - confirm against the original commit.
diff --git a/common.hpp b/common.hpp index b20c60f..9b5cc53 100644 --- a/common.hpp +++ b/common.hpp @@ -57,7 +57,7 @@ public: auto conv = std::dynamic_pointer_cast(blocks["conv"]); x = ggml_upscale(ctx, x, 2, GGML_SCALE_MODE_NEAREST); // [N, channels, h*2, w*2] - x = conv->forward(ctx, x); // [N, out_channels, h*2, w*2] + x = conv->forward(ctx, x); // [N, out_channels, h*2, w*2] return x; } }; diff --git a/denoiser.hpp b/denoiser.hpp index ee4ae51..2bd0b93 100644 --- a/denoiser.hpp +++ b/denoiser.hpp @@ -347,12 +347,13 @@ struct EDMVDenoiser : public CompVisVDenoiser { float min_sigma = 0.002; float max_sigma = 120.0; - EDMVDenoiser(float min_sigma = 0.002, float max_sigma = 120.0) : min_sigma(min_sigma), max_sigma(max_sigma) { + EDMVDenoiser(float min_sigma = 0.002, float max_sigma = 120.0) + : min_sigma(min_sigma), max_sigma(max_sigma) { schedule = std::make_shared(); } float t_to_sigma(float t) { - return std::exp(t * 4/(float)TIMESTEPS); + return std::exp(t * 4 / (float)TIMESTEPS); } float sigma_to_t(float s) { diff --git a/ggml_extend.hpp b/ggml_extend.hpp index 635b780..9f6a4fe 100644 --- a/ggml_extend.hpp +++ b/ggml_extend.hpp @@ -118,7 +118,7 @@ __STATIC_INLINE__ struct ggml_tensor* ggml_kronecker(ggml_context* ctx, struct g a->ne[1] * b->ne[1], a->ne[2] * b->ne[2], a->ne[3] * b->ne[3], - GGML_SCALE_MODE_NEAREST), + GGML_SCALE_MODE_NEAREST), b); } diff --git a/stable-diffusion.cpp b/stable-diffusion.cpp index 57c2d59..b5860cf 100644 --- a/stable-diffusion.cpp +++ b/stable-diffusion.cpp @@ -1566,6 +1566,29 @@ sd_image_t* generate_image(sd_ctx_t* sd_ctx, return result_images; } +ggml_tensor* generate_init_latent(sd_ctx_t* sd_ctx, + ggml_context* work_ctx, + int width, + int height) { + int C = 4; + if (sd_version_is_sd3(sd_ctx->sd->version)) { + C = 16; + } else if (sd_version_is_flux(sd_ctx->sd->version)) { + C = 16; + } + int W = width / 8; + int H = height / 8; + ggml_tensor* init_latent = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, W, H, C, 1); + if 
(sd_version_is_sd3(sd_ctx->sd->version)) { + ggml_set_f32(init_latent, 0.0609f); + } else if (sd_version_is_flux(sd_ctx->sd->version)) { + ggml_set_f32(init_latent, 0.1159f); + } else { + ggml_set_f32(init_latent, 0.f); + } + return init_latent; +} + sd_image_t* txt2img(sd_ctx_t* sd_ctx, const char* prompt_c_str, const char* negative_prompt_c_str, @@ -1622,27 +1645,12 @@ sd_image_t* txt2img(sd_ctx_t* sd_ctx, std::vector sigmas = sd_ctx->sd->denoiser->get_sigmas(sample_steps); - int C = 4; - if (sd_version_is_sd3(sd_ctx->sd->version)) { - C = 16; - } else if (sd_version_is_flux(sd_ctx->sd->version)) { - C = 16; - } - int W = width / 8; - int H = height / 8; - ggml_tensor* init_latent = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, W, H, C, 1); - if (sd_version_is_sd3(sd_ctx->sd->version)) { - ggml_set_f32(init_latent, 0.0609f); - } else if (sd_version_is_flux(sd_ctx->sd->version)) { - ggml_set_f32(init_latent, 0.1159f); - } else { - ggml_set_f32(init_latent, 0.f); - } - if (sd_version_is_inpaint(sd_ctx->sd->version)) { LOG_WARN("This is an inpainting model, this should only be used in img2img mode with a mask"); } + ggml_tensor* init_latent = generate_init_latent(sd_ctx, work_ctx, width, height); + sd_image_t* result_images = generate_image(sd_ctx, work_ctx, init_latent, @@ -2046,23 +2054,6 @@ sd_image_t* edit(sd_ctx_t* sd_ctx, } sd_ctx->sd->rng->manual_seed(seed); - int C = 4; - if (sd_version_is_sd3(sd_ctx->sd->version)) { - C = 16; - } else if (sd_version_is_flux(sd_ctx->sd->version)) { - C = 16; - } - int W = width / 8; - int H = height / 8; - ggml_tensor* init_latent = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, W, H, C, 1); - if (sd_version_is_sd3(sd_ctx->sd->version)) { - ggml_set_f32(init_latent, 0.0609f); - } else if (sd_version_is_flux(sd_ctx->sd->version)) { - ggml_set_f32(init_latent, 0.1159f); - } else { - ggml_set_f32(init_latent, 0.f); - } - size_t t0 = ggml_time_ms(); std::vector ref_latents; @@ -2085,6 +2076,8 @@ sd_image_t* edit(sd_ctx_t* sd_ctx, 
std::vector sigmas = sd_ctx->sd->denoiser->get_sigmas(sample_steps); + ggml_tensor* init_latent = generate_init_latent(sd_ctx, work_ctx, width, height); + sd_image_t* result_images = generate_image(sd_ctx, work_ctx, init_latent,