feat: reduce CLIP memory usage with no embeddings (#768)

Wagner Bruna authored 2025-09-14 01:08:00 -03:00, committed by GitHub
parent ddc4a18b92
commit 48956ffb87
3 changed files with 53 additions and 76 deletions
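
The substance of the change: the CLIP token-embedding table was previously always allocated as GGML_TYPE_F32 so that custom embeddings loaded from an embeddings directory could be concatenated onto it at runtime. After this patch, F32 is forced only when such a directory is actually configured (force_clip_f32 = embd_dir.size() > 0 in the conditioner changes below); otherwise token_embedding.weight keeps the type recorded in the model file. As a companion cleanup, clip_skip stops being model state (set_clip_skip) and becomes a per-call argument.

A back-of-the-envelope sketch, not part of the patch, of what the dtype change saves for the ViT-L/14 table; vocab_size 49408 and embed_dim 768 come from the diff, and the 34-bytes-per-32-weights Q8_0 block layout is ggml's:

    #include <cstdint>
    #include <cstdio>

    int main() {
        const int64_t elems = 49408LL * 768;  // vocab_size * embed_dim
        std::printf("F32  (always forced before):  %6.1f MiB\n", elems * 4.0 / (1 << 20));
        std::printf("F16  (common on-disk type):   %6.1f MiB\n", elems * 2.0 / (1 << 20));
        std::printf("Q8_0 (34 bytes / 32 weights): %6.1f MiB\n", elems * (34.0 / 32.0) / (1 << 20));
        return 0;
    }

Roughly 145 MiB shrinks to about 72 MiB (F16) or 38 MiB (Q8_0), and SDXL checkpoints benefit twice, since they instantiate a second text model (OPEN_CLIP_VIT_BIGG_14) with its own table.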

[File 1 of 3]

@@ -548,9 +548,15 @@ protected:
     int64_t embed_dim;
     int64_t vocab_size;
     int64_t num_positions;
+    bool force_clip_f32;

     void init_params(struct ggml_context* ctx, const String2GGMLType& tensor_types = {}, const std::string prefix = "") {
         enum ggml_type token_wtype = GGML_TYPE_F32;
+        if (!force_clip_f32) {
+            auto tensor_type = tensor_types.find(prefix + "token_embedding.weight");
+            if (tensor_type != tensor_types.end())
+                token_wtype = tensor_type->second;
+        }
         enum ggml_type position_wtype = GGML_TYPE_F32;

         params["token_embedding.weight"] = ggml_new_tensor_2d(ctx, token_wtype, embed_dim, vocab_size);
@@ -560,10 +566,12 @@ public:
 public:
     CLIPEmbeddings(int64_t embed_dim,
                    int64_t vocab_size = 49408,
-                   int64_t num_positions = 77)
+                   int64_t num_positions = 77,
+                   bool force_clip_f32 = false)
         : embed_dim(embed_dim),
           vocab_size(vocab_size),
-          num_positions(num_positions) {
+          num_positions(num_positions),
+          force_clip_f32(force_clip_f32) {
     }

     struct ggml_tensor* get_token_embed_weight() {
@@ -678,12 +686,11 @@ public:
     int32_t n_head = 12;
     int32_t n_layer = 12;          // num_hidden_layers
     int32_t projection_dim = 1280; // only for OPEN_CLIP_VIT_BIGG_14
-    int32_t clip_skip = -1;
     bool with_final_ln = true;

     CLIPTextModel(CLIPVersion version = OPENAI_CLIP_VIT_L_14,
                   bool with_final_ln = true,
-                  int clip_skip_value = -1)
+                  bool force_clip_f32 = false)
         : version(version), with_final_ln(with_final_ln) {
         if (version == OPEN_CLIP_VIT_H_14) {
             hidden_size = 1024;
@@ -696,20 +703,12 @@ public:
             n_head = 20;
             n_layer = 32;
         }
-        set_clip_skip(clip_skip_value);

-        blocks["embeddings"] = std::shared_ptr<GGMLBlock>(new CLIPEmbeddings(hidden_size, vocab_size, n_token));
+        blocks["embeddings"] = std::shared_ptr<GGMLBlock>(new CLIPEmbeddings(hidden_size, vocab_size, n_token, force_clip_f32));
         blocks["encoder"] = std::shared_ptr<GGMLBlock>(new CLIPEncoder(n_layer, hidden_size, n_head, intermediate_size));
         blocks["final_layer_norm"] = std::shared_ptr<GGMLBlock>(new LayerNorm(hidden_size));
     }

-    void set_clip_skip(int skip) {
-        if (skip <= 0) {
-            skip = -1;
-        }
-        clip_skip = skip;
-    }
-
     struct ggml_tensor* get_token_embed_weight() {
         auto embeddings = std::dynamic_pointer_cast<CLIPEmbeddings>(blocks["embeddings"]);
         return embeddings->get_token_embed_weight();
@@ -720,7 +719,8 @@ public:
                                 struct ggml_tensor* input_ids,
                                 struct ggml_tensor* tkn_embeddings,
                                 size_t max_token_idx = 0,
-                                bool return_pooled = false) {
+                                bool return_pooled = false,
+                                int clip_skip = -1) {
         // input_ids: [N, n_token]
         auto embeddings = std::dynamic_pointer_cast<CLIPEmbeddings>(blocks["embeddings"]);
         auto encoder = std::dynamic_pointer_cast<CLIPEncoder>(blocks["encoder"]);
@@ -889,8 +889,8 @@ struct CLIPTextModelRunner : public GGMLRunner {
                        const std::string prefix,
                        CLIPVersion version = OPENAI_CLIP_VIT_L_14,
                        bool with_final_ln = true,
-                       int clip_skip_value = -1)
-        : GGMLRunner(backend, offload_params_to_cpu), model(version, with_final_ln, clip_skip_value) {
+                       bool force_clip_f32 = false)
+        : GGMLRunner(backend, offload_params_to_cpu), model(version, with_final_ln, force_clip_f32) {
         model.init(params_ctx, tensor_types, prefix);
     }
@@ -898,10 +898,6 @@ struct CLIPTextModelRunner : public GGMLRunner {
         return "clip";
     }

-    void set_clip_skip(int clip_skip) {
-        model.set_clip_skip(clip_skip);
-    }
-
     void get_param_tensors(std::map<std::string, struct ggml_tensor*>& tensors, const std::string prefix) {
         model.get_param_tensors(tensors, prefix);
     }
@@ -911,7 +907,8 @@ struct CLIPTextModelRunner : public GGMLRunner {
                                 struct ggml_tensor* input_ids,
                                 struct ggml_tensor* embeddings,
                                 size_t max_token_idx = 0,
-                                bool return_pooled = false) {
+                                bool return_pooled = false,
+                                int clip_skip = -1) {
         size_t N = input_ids->ne[1];
         size_t n_token = input_ids->ne[0];
         if (input_ids->ne[0] > model.n_token) {
@@ -919,14 +916,15 @@ struct CLIPTextModelRunner : public GGMLRunner {
             input_ids = ggml_reshape_2d(ctx, input_ids, model.n_token, input_ids->ne[0] / model.n_token);
         }

-        return model.forward(ctx, backend, input_ids, embeddings, max_token_idx, return_pooled);
+        return model.forward(ctx, backend, input_ids, embeddings, max_token_idx, return_pooled, clip_skip);
     }

     struct ggml_cgraph* build_graph(struct ggml_tensor* input_ids,
                                     int num_custom_embeddings = 0,
                                     void* custom_embeddings_data = NULL,
                                     size_t max_token_idx = 0,
-                                    bool return_pooled = false) {
+                                    bool return_pooled = false,
+                                    int clip_skip = -1) {
         struct ggml_cgraph* gf = ggml_new_graph(compute_ctx);

         input_ids = to_backend(input_ids);
@@ -945,7 +943,7 @@ struct CLIPTextModelRunner : public GGMLRunner {
             embeddings = ggml_concat(compute_ctx, token_embed_weight, custom_embeddings, 1);
         }

-        struct ggml_tensor* hidden_states = forward(compute_ctx, runtime_backend, input_ids, embeddings, max_token_idx, return_pooled);
+        struct ggml_tensor* hidden_states = forward(compute_ctx, runtime_backend, input_ids, embeddings, max_token_idx, return_pooled, clip_skip);

         ggml_build_forward_expand(gf, hidden_states);
@@ -958,10 +956,11 @@ struct CLIPTextModelRunner : public GGMLRunner {
                  void* custom_embeddings_data,
                  size_t max_token_idx,
                  bool return_pooled,
+                 int clip_skip,
                  ggml_tensor** output,
                  ggml_context* output_ctx = NULL) {
         auto get_graph = [&]() -> struct ggml_cgraph* {
-            return build_graph(input_ids, num_custom_embeddings, custom_embeddings_data, max_token_idx, return_pooled);
+            return build_graph(input_ids, num_custom_embeddings, custom_embeddings_data, max_token_idx, return_pooled, clip_skip);
         };
         GGMLRunner::compute(get_graph, n_threads, true, output, output_ctx);
     }

[File 2 of 3]

@@ -61,30 +61,16 @@ struct FrozenCLIPEmbedderWithCustomWords : public Conditioner {
                                       const String2GGMLType& tensor_types,
                                       const std::string& embd_dir,
                                       SDVersion version = VERSION_SD1,
-                                      PMVersion pv = PM_VERSION_1,
-                                      int clip_skip = -1)
+                                      PMVersion pv = PM_VERSION_1)
         : version(version), pm_version(pv), tokenizer(sd_version_is_sd2(version) ? 0 : 49407), embd_dir(embd_dir) {
+        bool force_clip_f32 = embd_dir.size() > 0;
         if (sd_version_is_sd1(version)) {
-            text_model = std::make_shared<CLIPTextModelRunner>(backend, offload_params_to_cpu, tensor_types, "cond_stage_model.transformer.text_model", OPENAI_CLIP_VIT_L_14);
+            text_model = std::make_shared<CLIPTextModelRunner>(backend, offload_params_to_cpu, tensor_types, "cond_stage_model.transformer.text_model", OPENAI_CLIP_VIT_L_14, true, force_clip_f32);
         } else if (sd_version_is_sd2(version)) {
-            text_model = std::make_shared<CLIPTextModelRunner>(backend, offload_params_to_cpu, tensor_types, "cond_stage_model.transformer.text_model", OPEN_CLIP_VIT_H_14);
+            text_model = std::make_shared<CLIPTextModelRunner>(backend, offload_params_to_cpu, tensor_types, "cond_stage_model.transformer.text_model", OPEN_CLIP_VIT_H_14, true, force_clip_f32);
         } else if (sd_version_is_sdxl(version)) {
-            text_model = std::make_shared<CLIPTextModelRunner>(backend, offload_params_to_cpu, tensor_types, "cond_stage_model.transformer.text_model", OPENAI_CLIP_VIT_L_14, false);
-            text_model2 = std::make_shared<CLIPTextModelRunner>(backend, offload_params_to_cpu, tensor_types, "cond_stage_model.1.transformer.text_model", OPEN_CLIP_VIT_BIGG_14, false);
-        }
-        set_clip_skip(clip_skip);
-    }
-
-    void set_clip_skip(int clip_skip) {
-        if (clip_skip <= 0) {
-            clip_skip = 1;
-            if (sd_version_is_sd2(version) || sd_version_is_sdxl(version)) {
-                clip_skip = 2;
-            }
-        }
-        text_model->set_clip_skip(clip_skip);
-        if (sd_version_is_sdxl(version)) {
-            text_model2->set_clip_skip(clip_skip);
+            text_model = std::make_shared<CLIPTextModelRunner>(backend, offload_params_to_cpu, tensor_types, "cond_stage_model.transformer.text_model", OPENAI_CLIP_VIT_L_14, false, force_clip_f32);
+            text_model2 = std::make_shared<CLIPTextModelRunner>(backend, offload_params_to_cpu, tensor_types, "cond_stage_model.1.transformer.text_model", OPEN_CLIP_VIT_BIGG_14, false, force_clip_f32);
         }
     }
@@ -412,7 +398,6 @@ struct FrozenCLIPEmbedderWithCustomWords : public Conditioner {
                                       int height,
                                       int adm_in_channels = -1,
                                       bool zero_out_masked = false) {
-        set_clip_skip(clip_skip);
         int64_t t0 = ggml_time_ms();
         struct ggml_tensor* hidden_states = NULL;       // [N, n_token, hidden_size]
         struct ggml_tensor* chunk_hidden_states = NULL; // [n_token, hidden_size] or [n_token, hidden_size + hidden_size2]
@@ -421,6 +406,10 @@ struct FrozenCLIPEmbedderWithCustomWords : public Conditioner {
         struct ggml_tensor* pooled = NULL;
         std::vector<float> hidden_states_vec;

+        if (clip_skip <= 0) {
+            clip_skip = (sd_version_is_sd2(version) || sd_version_is_sdxl(version)) ? 2 : 1;
+        }
+
         size_t chunk_len = 77;
         size_t chunk_count = tokens.size() / chunk_len;
         for (int chunk_idx = 0; chunk_idx < chunk_count; chunk_idx++) {
@@ -455,6 +444,7 @@ struct FrozenCLIPEmbedderWithCustomWords : public Conditioner {
                                 token_embed_custom.data(),
                                 max_token_idx,
                                 false,
+                                clip_skip,
                                 &chunk_hidden_states1,
                                 work_ctx);
             if (sd_version_is_sdxl(version)) {
@@ -464,6 +454,7 @@ struct FrozenCLIPEmbedderWithCustomWords : public Conditioner {
                                  token_embed_custom.data(),
                                  max_token_idx,
                                  false,
+                                 clip_skip,
                                  &chunk_hidden_states2, work_ctx);
                 // concat
                 chunk_hidden_states = ggml_tensor_concat(work_ctx, chunk_hidden_states1, chunk_hidden_states2, 0);
@@ -475,6 +466,7 @@ struct FrozenCLIPEmbedderWithCustomWords : public Conditioner {
                                      token_embed_custom.data(),
                                      max_token_idx,
                                      true,
+                                     clip_skip,
                                      &pooled,
                                      work_ctx);
             }
@@ -669,21 +661,11 @@ struct SD3CLIPEmbedder : public Conditioner {
     SD3CLIPEmbedder(ggml_backend_t backend,
                     bool offload_params_to_cpu,
-                    const String2GGMLType& tensor_types = {},
-                    int clip_skip = -1)
+                    const String2GGMLType& tensor_types = {})
         : clip_g_tokenizer(0) {
         clip_l = std::make_shared<CLIPTextModelRunner>(backend, offload_params_to_cpu, tensor_types, "text_encoders.clip_l.transformer.text_model", OPENAI_CLIP_VIT_L_14, false);
         clip_g = std::make_shared<CLIPTextModelRunner>(backend, offload_params_to_cpu, tensor_types, "text_encoders.clip_g.transformer.text_model", OPEN_CLIP_VIT_BIGG_14, false);
         t5 = std::make_shared<T5Runner>(backend, offload_params_to_cpu, tensor_types, "text_encoders.t5xxl.transformer");
-        set_clip_skip(clip_skip);
-    }
-
-    void set_clip_skip(int clip_skip) {
-        if (clip_skip <= 0) {
-            clip_skip = 2;
-        }
-        clip_l->set_clip_skip(clip_skip);
-        clip_g->set_clip_skip(clip_skip);
     }

     void get_param_tensors(std::map<std::string, struct ggml_tensor*>& tensors) {
@@ -780,7 +762,6 @@ struct SD3CLIPEmbedder : public Conditioner {
                              std::vector<std::pair<std::vector<int>, std::vector<float>>> token_and_weights,
                              int clip_skip,
                              bool zero_out_masked = false) {
-        set_clip_skip(clip_skip);
         auto& clip_l_tokens = token_and_weights[0].first;
         auto& clip_l_weights = token_and_weights[0].second;
         auto& clip_g_tokens = token_and_weights[1].first;
@@ -788,6 +769,10 @@ struct SD3CLIPEmbedder : public Conditioner {
         auto& t5_tokens = token_and_weights[2].first;
         auto& t5_weights = token_and_weights[2].second;

+        if (clip_skip <= 0) {
+            clip_skip = 2;
+        }
+
         int64_t t0 = ggml_time_ms();
         struct ggml_tensor* hidden_states = NULL;       // [N, n_token*2, 4096]
         struct ggml_tensor* chunk_hidden_states = NULL; // [n_token*2, 4096]
@@ -818,6 +803,7 @@ struct SD3CLIPEmbedder : public Conditioner {
                             NULL,
                             max_token_idx,
                             false,
+                            clip_skip,
                             &chunk_hidden_states_l,
                             work_ctx);
             {
@@ -845,6 +831,7 @@ struct SD3CLIPEmbedder : public Conditioner {
                             NULL,
                             max_token_idx,
                             true,
+                            clip_skip,
                             &pooled_l,
                             work_ctx);
             }
@@ -866,6 +853,7 @@ struct SD3CLIPEmbedder : public Conditioner {
                             NULL,
                             max_token_idx,
                             false,
+                            clip_skip,
                             &chunk_hidden_states_g,
                             work_ctx);
@@ -894,6 +882,7 @@ struct SD3CLIPEmbedder : public Conditioner {
                             NULL,
                             max_token_idx,
                             true,
+                            clip_skip,
                             &pooled_g,
                             work_ctx);
             }
@@ -1017,18 +1006,9 @@ struct FluxCLIPEmbedder : public Conditioner {
     FluxCLIPEmbedder(ggml_backend_t backend,
                      bool offload_params_to_cpu,
-                     const String2GGMLType& tensor_types = {},
-                     int clip_skip = -1) {
+                     const String2GGMLType& tensor_types = {}) {
         clip_l = std::make_shared<CLIPTextModelRunner>(backend, offload_params_to_cpu, tensor_types, "text_encoders.clip_l.transformer.text_model", OPENAI_CLIP_VIT_L_14, true);
         t5 = std::make_shared<T5Runner>(backend, offload_params_to_cpu, tensor_types, "text_encoders.t5xxl.transformer");
-        set_clip_skip(clip_skip);
-    }
-
-    void set_clip_skip(int clip_skip) {
-        if (clip_skip <= 0) {
-            clip_skip = 2;
-        }
-        clip_l->set_clip_skip(clip_skip);
     }

     void get_param_tensors(std::map<std::string, struct ggml_tensor*>& tensors) {
@@ -1109,12 +1089,15 @@ struct FluxCLIPEmbedder : public Conditioner {
                              std::vector<std::pair<std::vector<int>, std::vector<float>>> token_and_weights,
                              int clip_skip,
                              bool zero_out_masked = false) {
-        set_clip_skip(clip_skip);
         auto& clip_l_tokens = token_and_weights[0].first;
         auto& clip_l_weights = token_and_weights[0].second;
         auto& t5_tokens = token_and_weights[1].first;
         auto& t5_weights = token_and_weights[1].second;

+        if (clip_skip <= 0) {
+            clip_skip = 2;
+        }
+
         int64_t t0 = ggml_time_ms();
         struct ggml_tensor* hidden_states = NULL;       // [N, n_token, 4096]
         struct ggml_tensor* chunk_hidden_states = NULL; // [n_token, 4096]
@@ -1143,6 +1126,7 @@ struct FluxCLIPEmbedder : public Conditioner {
                             NULL,
                             max_token_idx,
                             true,
+                            clip_skip,
                             &pooled,
                             work_ctx);
             }
@@ -1241,7 +1225,6 @@ struct T5CLIPEmbedder : public Conditioner {
     T5CLIPEmbedder(ggml_backend_t backend,
                    bool offload_params_to_cpu,
                    const String2GGMLType& tensor_types = {},
-                   int clip_skip = -1,
                    bool use_mask = false,
                    int mask_pad = 1,
                    bool is_umt5 = false)
@@ -1249,9 +1232,6 @@ struct T5CLIPEmbedder : public Conditioner {
         t5 = std::make_shared<T5Runner>(backend, offload_params_to_cpu, tensor_types, "text_encoders.t5xxl.transformer", is_umt5);
     }

-    void set_clip_skip(int clip_skip) {
-    }
-
     void get_param_tensors(std::map<std::string, struct ggml_tensor*>& tensors) {
         t5->get_param_tensors(tensors, "text_encoders.t5xxl.transformer");
     }
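
The defaulting that the removed set_clip_skip() methods used to perform at construction now happens inside each encode path, as the hunks above show. A hypothetical condensation of the rule (the helper name is mine, not from the patch):

    // Condenses FrozenCLIPEmbedderWithCustomWords' inline default;
    // the SD3 and Flux embedders simply hardcode 2.
    static int resolve_clip_skip(int requested, SDVersion version) {
        if (requested > 0)
            return requested;  // an explicit user setting wins
        return (sd_version_is_sd2(version) || sd_version_is_sdxl(version)) ? 2 : 1;
    }

Since the embedders no longer carry this mutable state, a single conditioner instance can serve consecutive requests with different clip_skip values without reconfiguring its models.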

[File 3 of 3]

@@ -373,7 +373,6 @@ public:
             cond_stage_model = std::make_shared<T5CLIPEmbedder>(clip_backend,
                                                                 offload_params_to_cpu,
                                                                 model_loader.tensor_storages_types,
-                                                                -1,
                                                                 sd_ctx_params->chroma_use_t5_mask,
                                                                 sd_ctx_params->chroma_t5_mask_pad);
         } else {
@@ -391,7 +390,6 @@ public:
             cond_stage_model = std::make_shared<T5CLIPEmbedder>(clip_backend,
                                                                 offload_params_to_cpu,
                                                                 model_loader.tensor_storages_types,
-                                                                -1,
                                                                 true,
                                                                 1,
                                                                 true);