Compare commits


No commits in common. "master" and "master-406-d939f6e" have entirely different histories.

7 changed files with 32 additions and 88 deletions


@@ -60,7 +60,7 @@ struct FrozenCLIPEmbedderWithCustomWords : public Conditioner {
     int32_t num_custom_embeddings = 0;
     int32_t num_custom_embeddings_2 = 0;
     std::vector<uint8_t> token_embed_custom;
-    std::map<std::string, std::pair<int, int>> embedding_pos_map;
+    std::vector<std::string> readed_embeddings;

     FrozenCLIPEmbedderWithCustomWords(ggml_backend_t backend,
                                       bool offload_params_to_cpu,
@@ -123,17 +123,14 @@ struct FrozenCLIPEmbedderWithCustomWords : public Conditioner {
     }

     bool load_embedding(std::string embd_name, std::string embd_path, std::vector<int32_t>& bpe_tokens) {
+        // the order matters
         ModelLoader model_loader;
         if (!model_loader.init_from_file_and_convert_name(embd_path)) {
             LOG_ERROR("embedding '%s' failed", embd_name.c_str());
             return false;
         }
-        auto iter = embedding_pos_map.find(embd_name);
-        if (iter != embedding_pos_map.end()) {
+        if (std::find(readed_embeddings.begin(), readed_embeddings.end(), embd_name) != readed_embeddings.end()) {
             LOG_DEBUG("embedding already read in: %s", embd_name.c_str());
-            for (int i = iter->second.first; i < iter->second.second; i++) {
-                bpe_tokens.push_back(text_model->model.vocab_size + i);
-            }
             return true;
         }
         struct ggml_init_params params;
@@ -164,7 +161,7 @@ struct FrozenCLIPEmbedderWithCustomWords : public Conditioner {
             return true;
         };
         model_loader.load_tensors(on_load, 1);
-        int pos_start = num_custom_embeddings;
+        readed_embeddings.push_back(embd_name);
         if (embd) {
             int64_t hidden_size = text_model->model.hidden_size;
             token_embed_custom.resize(token_embed_custom.size() + ggml_nbytes(embd));
@@ -191,11 +188,6 @@ struct FrozenCLIPEmbedderWithCustomWords : public Conditioner {
             }
             LOG_DEBUG("embedding '%s' applied, custom embeddings: %i (text model 2)", embd_name.c_str(), num_custom_embeddings_2);
         }

-        int pos_end = num_custom_embeddings;
-        if (pos_end == pos_start) {
-            return false;
-        }
-        embedding_pos_map[embd_name] = std::pair{pos_start, pos_end};
         return true;
     }
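Note: embedding_pos_map on the '-' side of this diff maps each embedding name to the half-open range of custom-token slots it occupies, so a prompt that references the same embedding again re-emits the same token ids, whereas the '+' side's readed_embeddings list only returns early. A minimal standalone sketch of that bookkeeping (EmbeddingRegistry, emit_tokens and n_vectors are illustrative names, not the project's interfaces; the vocab size is a placeholder):

    #include <cstdint>
    #include <map>
    #include <string>
    #include <utility>
    #include <vector>

    // Simplified stand-in for the conditioner's custom-embedding bookkeeping.
    struct EmbeddingRegistry {
        int32_t vocab_size            = 49408;  // placeholder vocab size
        int32_t num_custom_embeddings = 0;
        std::map<std::string, std::pair<int, int>> embedding_pos_map;  // name -> [pos_start, pos_end)

        // Appends the custom token ids for `name`, registering `n_vectors` new slots on first use.
        void emit_tokens(const std::string& name, int n_vectors, std::vector<int32_t>& bpe_tokens) {
            if (embedding_pos_map.find(name) == embedding_pos_map.end()) {
                int pos_start = num_custom_embeddings;
                num_custom_embeddings += n_vectors;  // the real code also copies tensor data here
                embedding_pos_map[name] = std::pair{pos_start, num_custom_embeddings};
            }
            // A repeated reference re-emits the same ids instead of silently dropping them.
            const auto& range = embedding_pos_map[name];
            for (int i = range.first; i < range.second; i++) {
                bpe_tokens.push_back(vocab_size + i);
            }
        }
    };

    int main() {
        EmbeddingRegistry reg;
        std::vector<int32_t> tokens;
        reg.emit_tokens("my-style", 2, tokens);  // first use: slots 0,1 -> tokens 49408, 49409
        reg.emit_tokens("my-style", 2, tokens);  // reuse: the same two ids again
        return tokens.size() == 4 ? 0 : 1;
    }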


@@ -156,10 +156,9 @@ struct ESRGAN : public GGMLRunner {

     ESRGAN(ggml_backend_t backend,
            bool offload_params_to_cpu,
-           int tile_size = 128,
            const String2TensorStorage& tensor_storage_map = {})
         : GGMLRunner(backend, offload_params_to_cpu) {
-        this->tile_size = tile_size;
+        // rrdb_net will be created in load_from_file
     }

     std::string get_desc() override {


@@ -1080,7 +1080,6 @@ struct SDGenerationParams {

     float pm_style_strength = 20.f;
     int upscale_repeats = 1;
-    int upscale_tile_size = 128;

     std::map<std::string, float> lora_map;
     std::map<std::string, float> high_noise_lora_map;
@@ -1177,10 +1176,6 @@ struct SDGenerationParams {
              "--upscale-repeats",
              "Run the ESRGAN upscaler this many times (default: 1)",
              &upscale_repeats},
-            {"",
-             "--upscale-tile-size",
-             "tile size for ESRGAN upscaling (default: 128)",
-             &upscale_tile_size},
         };

         options.float_options = {
@@ -1640,10 +1635,6 @@ struct SDGenerationParams {
             return false;
         }

-        if (upscale_tile_size < 1) {
-            return false;
-        }
-
         if (mode == UPSCALE) {
             if (init_image_path.length() == 0) {
                 fprintf(stderr, "error: upscale mode needs an init image (--init-img)\n");
@@ -1729,7 +1720,6 @@ struct SDGenerationParams {
            << " control_strength: " << control_strength << ",\n"
            << " seed: " << seed << ",\n"
            << " upscale_repeats: " << upscale_repeats << ",\n"
-           << " upscale_tile_size: " << upscale_tile_size << ",\n"
            << "}";
        free(sample_params_str);
        free(high_noise_sample_params_str);
@@ -2346,8 +2336,7 @@ int main(int argc, const char* argv[]) {

     upscaler_ctx_t* upscaler_ctx = new_upscaler_ctx(ctx_params.esrgan_path.c_str(),
                                                     ctx_params.offload_params_to_cpu,
                                                     ctx_params.diffusion_conv_direct,
-                                                    ctx_params.n_threads,
-                                                    gen_params.upscale_tile_size);
+                                                    ctx_params.n_threads);
     if (upscaler_ctx == nullptr) {
         printf("new_upscaler_ctx failed\n");


@@ -60,14 +60,6 @@
 #define SD_UNUSED(x) (void)(x)
 #endif

-__STATIC_INLINE__ int align_up_offset(int n, int multiple) {
-    return (multiple - n % multiple) % multiple;
-}
-
-__STATIC_INLINE__ int align_up(int n, int multiple) {
-    return n + align_up_offset(n, multiple);
-}
-
 __STATIC_INLINE__ void ggml_log_callback_default(ggml_log_level level, const char* text, void*) {
     switch (level) {
         case GGML_LOG_LEVEL_DEBUG:
@@ -1400,14 +1392,10 @@ __STATIC_INLINE__ void ggml_ext_backend_tensor_get_and_sync(ggml_backend_t backe
 }

 __STATIC_INLINE__ float ggml_ext_backend_tensor_get_f32(ggml_tensor* tensor) {
-    GGML_ASSERT(tensor->type == GGML_TYPE_F32 || tensor->type == GGML_TYPE_F16 || tensor->type == GGML_TYPE_I32 || tensor->type == GGML_TYPE_BF16);
+    GGML_ASSERT(tensor->type == GGML_TYPE_F32 || tensor->type == GGML_TYPE_F16 || tensor->type == GGML_TYPE_I32);
     float value;
     if (tensor->type == GGML_TYPE_F32) {
         ggml_backend_tensor_get(tensor, &value, 0, sizeof(value));
-    } else if (tensor->type == GGML_TYPE_BF16) {
-        ggml_bf16_t bf16_value;
-        ggml_backend_tensor_get(tensor, &bf16_value, 0, sizeof(bf16_value));
-        value = ggml_bf16_to_fp32(bf16_value);
     } else if (tensor->type == GGML_TYPE_F16) {
         ggml_fp16_t f16_value;
         ggml_backend_tensor_get(tensor, &f16_value, 0, sizeof(f16_value));
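For reference, the two helpers removed above round an integer up to the next multiple: align_up_offset() gives the distance to the next multiple (0 if already aligned) and align_up() applies it. A small self-contained sketch with worked values (the main() and the sample numbers are illustrative only):

    #include <cassert>

    static int align_up_offset(int n, int multiple) {
        return (multiple - n % multiple) % multiple;  // 0 when n is already a multiple
    }

    static int align_up(int n, int multiple) {
        return n + align_up_offset(n, multiple);
    }

    int main() {
        assert(align_up_offset(1000, 64) == 24);  // 1000 % 64 == 40, so 24 more reaches 1024
        assert(align_up(1000, 64) == 1024);
        assert(align_up(1024, 64) == 1024);       // already aligned, offset is 0
        return 0;
    }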


@@ -1898,18 +1898,6 @@ public:
         return vae_scale_factor;
     }

-    int get_diffusion_model_down_factor() {
-        int down_factor = 8; // unet
-        if (sd_version_is_dit(version)) {
-            if (sd_version_is_wan(version)) {
-                down_factor = 2;
-            } else {
-                down_factor = 1;
-            }
-        }
-        return down_factor;
-    }
-
     int get_latent_channel() {
         int latent_channel = 4;
         if (sd_version_is_dit(version)) {
@@ -3145,19 +3133,22 @@ sd_image_t* generate_image(sd_ctx_t* sd_ctx, const sd_img_gen_params_t* sd_img_g
     sd_ctx->sd->vae_tiling_params = sd_img_gen_params->vae_tiling_params;
     int width = sd_img_gen_params->width;
     int height = sd_img_gen_params->height;
     int vae_scale_factor = sd_ctx->sd->get_vae_scale_factor();
-    int diffusion_model_down_factor = sd_ctx->sd->get_diffusion_model_down_factor();
-    int spatial_multiple = vae_scale_factor * diffusion_model_down_factor;
-
-    int width_offset = align_up_offset(width, spatial_multiple);
-    int height_offset = align_up_offset(height, spatial_multiple);
-    if (width_offset > 0 || height_offset > 0) {
-        width += width_offset;
-        height += height_offset;
-        LOG_WARN("align up %dx%d to %dx%d (multiple=%d)", sd_img_gen_params->width, sd_img_gen_params->height, width, height, spatial_multiple);
+    if (sd_version_is_dit(sd_ctx->sd->version)) {
+        if (width % 16 || height % 16) {
+            LOG_ERROR("Image dimensions must be must be a multiple of 16 on each axis for %s models. (Got %dx%d)",
+                      model_version_to_str[sd_ctx->sd->version],
+                      width,
+                      height);
+            return nullptr;
+        }
+    } else if (width % 64 || height % 64) {
+        LOG_ERROR("Image dimensions must be must be a multiple of 64 on each axis for %s models. (Got %dx%d)",
+                  model_version_to_str[sd_ctx->sd->version],
+                  width,
+                  height);
+        return nullptr;
     }
-
     LOG_DEBUG("generate_image %dx%d", width, height);
     if (sd_ctx == nullptr || sd_img_gen_params == nullptr) {
         return nullptr;
@@ -3431,19 +3422,9 @@ SD_API sd_image_t* generate_video(sd_ctx_t* sd_ctx, const sd_vid_gen_params_t* s
     int frames = sd_vid_gen_params->video_frames;
     frames = (frames - 1) / 4 * 4 + 1;
     int sample_steps = sd_vid_gen_params->sample_params.sample_steps;
+    LOG_INFO("generate_video %dx%dx%d", width, height, frames);
     int vae_scale_factor = sd_ctx->sd->get_vae_scale_factor();
-    int diffusion_model_down_factor = sd_ctx->sd->get_diffusion_model_down_factor();
-    int spatial_multiple = vae_scale_factor * diffusion_model_down_factor;
-
-    int width_offset = align_up_offset(width, spatial_multiple);
-    int height_offset = align_up_offset(height, spatial_multiple);
-    if (width_offset > 0 || height_offset > 0) {
-        width += width_offset;
-        height += height_offset;
-        LOG_WARN("align up %dx%d to %dx%d (multiple=%d)", sd_vid_gen_params->width, sd_vid_gen_params->height, width, height, spatial_multiple);
-    }
-
-    LOG_INFO("generate_video %dx%dx%d", width, height, frames);

     enum sample_method_t sample_method = sd_vid_gen_params->sample_params.sample_method;
     if (sample_method == SAMPLE_METHOD_COUNT) {


@@ -347,8 +347,7 @@ typedef struct upscaler_ctx_t upscaler_ctx_t;
 SD_API upscaler_ctx_t* new_upscaler_ctx(const char* esrgan_path,
                                         bool offload_params_to_cpu,
                                         bool direct,
-                                        int n_threads,
-                                        int tile_size);
+                                        int n_threads);

 SD_API void free_upscaler_ctx(upscaler_ctx_t* upscaler_ctx);
 SD_API sd_image_t upscale(upscaler_ctx_t* upscaler_ctx,


@@ -10,14 +10,11 @@ struct UpscalerGGML {
     std::string esrgan_path;
     int n_threads;
     bool direct = false;
-    int tile_size = 128;

     UpscalerGGML(int n_threads,
-                 bool direct = false,
-                 int tile_size = 128)
+                 bool direct = false)
         : n_threads(n_threads),
-          direct(direct),
-          tile_size(tile_size) {
+          direct(direct) {
     }

     bool load_from_file(const std::string& esrgan_path,
@@ -54,7 +51,7 @@ struct UpscalerGGML {
             backend = ggml_backend_cpu_init();
         }
         LOG_INFO("Upscaler weight type: %s", ggml_type_name(model_data_type));
-        esrgan_upscaler = std::make_shared<ESRGAN>(backend, offload_params_to_cpu, tile_size, model_loader.get_tensor_storage_map());
+        esrgan_upscaler = std::make_shared<ESRGAN>(backend, offload_params_to_cpu, model_loader.get_tensor_storage_map());
         if (direct) {
             esrgan_upscaler->set_conv2d_direct_enabled(true);
         }
@@ -116,15 +113,14 @@ struct upscaler_ctx_t {
 upscaler_ctx_t* new_upscaler_ctx(const char* esrgan_path_c_str,
                                  bool offload_params_to_cpu,
                                  bool direct,
-                                 int n_threads,
-                                 int tile_size) {
+                                 int n_threads) {
     upscaler_ctx_t* upscaler_ctx = (upscaler_ctx_t*)malloc(sizeof(upscaler_ctx_t));
     if (upscaler_ctx == nullptr) {
         return nullptr;
     }

     std::string esrgan_path(esrgan_path_c_str);
-    upscaler_ctx->upscaler = new UpscalerGGML(n_threads, direct, tile_size);
+    upscaler_ctx->upscaler = new UpscalerGGML(n_threads, direct);
     if (upscaler_ctx->upscaler == nullptr) {
         return nullptr;
     }
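For context, the two C API shapes above differ only in the trailing tile_size parameter. A hedged usage sketch of the '-' side signature (the model path, thread count, and tile size are illustrative; the upscale() call is omitted because its full signature is not shown in this diff):

    #include "stable-diffusion.h"

    int main() {
        // Illustrative arguments; only new_upscaler_ctx/free_upscaler_ctx are taken from the header above.
        upscaler_ctx_t* ctx = new_upscaler_ctx("models/RealESRGAN_x4plus.pth",
                                               false,  // offload_params_to_cpu
                                               false,  // direct
                                               8,      // n_threads
                                               256);   // tile_size ('-' side only)
        if (ctx == nullptr) {
            return 1;
        }
        // ... run upscale(...) here; the '+' side takes the same call without tile_size ...
        free_upscaler_ctx(ctx);
        return 0;
    }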