feat: reduce CLIP memory usage with no embeddings (#768)

Wagner Bruna authored 2025-09-14 01:08:00 -03:00, committed by GitHub
parent ddc4a18b92
commit 48956ffb87
3 changed files with 53 additions and 76 deletions
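
The substance of the change: the CLIP token-embedding table was previously always allocated as GGML_TYPE_F32 so that custom embeddings loaded from an embeddings directory could be concatenated onto it at runtime. After this patch, F32 is forced only when such a directory is actually configured (force_clip_f32 = embd_dir.size() > 0 in the conditioner changes below); otherwise token_embedding.weight keeps the type recorded in the model file. As a companion cleanup, clip_skip stops being model state (set_clip_skip) and becomes a per-call argument.

A back-of-the-envelope sketch, not part of the patch, of what the dtype change saves for the ViT-L/14 table; vocab_size 49408 and embed_dim 768 come from the diff, and the 34-bytes-per-32-weights Q8_0 block layout is ggml's:

    #include <cstdint>
    #include <cstdio>

    int main() {
        const int64_t elems = 49408LL * 768;  // vocab_size * embed_dim
        std::printf("F32  (always forced before):  %6.1f MiB\n", elems * 4.0 / (1 << 20));
        std::printf("F16  (common on-disk type):   %6.1f MiB\n", elems * 2.0 / (1 << 20));
        std::printf("Q8_0 (34 bytes / 32 weights): %6.1f MiB\n", elems * (34.0 / 32.0) / (1 << 20));
        return 0;
    }

Roughly 145 MiB shrinks to about 72 MiB (F16) or 38 MiB (Q8_0), and SDXL checkpoints benefit twice, since they instantiate a second text model (OPEN_CLIP_VIT_BIGG_14) with its own table.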

[File 1 of 3]

@@ -548,9 +548,15 @@ protected:
     int64_t embed_dim;
     int64_t vocab_size;
     int64_t num_positions;
+    bool force_clip_f32;

     void init_params(struct ggml_context* ctx, const String2GGMLType& tensor_types = {}, const std::string prefix = "") {
         enum ggml_type token_wtype = GGML_TYPE_F32;
+        if (!force_clip_f32) {
+            auto tensor_type = tensor_types.find(prefix + "token_embedding.weight");
+            if (tensor_type != tensor_types.end())
+                token_wtype = tensor_type->second;
+        }
         enum ggml_type position_wtype = GGML_TYPE_F32;

         params["token_embedding.weight"] = ggml_new_tensor_2d(ctx, token_wtype, embed_dim, vocab_size);
@@ -560,10 +566,12 @@ public:
 public:
     CLIPEmbeddings(int64_t embed_dim,
                    int64_t vocab_size = 49408,
-                   int64_t num_positions = 77)
+                   int64_t num_positions = 77,
+                   bool force_clip_f32 = false)
         : embed_dim(embed_dim),
           vocab_size(vocab_size),
-          num_positions(num_positions) {
+          num_positions(num_positions),
+          force_clip_f32(force_clip_f32) {
     }

     struct ggml_tensor* get_token_embed_weight() {
@@ -678,12 +686,11 @@ public:
     int32_t n_head = 12;
     int32_t n_layer = 12;          // num_hidden_layers
     int32_t projection_dim = 1280; // only for OPEN_CLIP_VIT_BIGG_14
-    int32_t clip_skip = -1;
     bool with_final_ln = true;

     CLIPTextModel(CLIPVersion version = OPENAI_CLIP_VIT_L_14,
                   bool with_final_ln = true,
-                  int clip_skip_value = -1)
+                  bool force_clip_f32 = false)
         : version(version), with_final_ln(with_final_ln) {
         if (version == OPEN_CLIP_VIT_H_14) {
             hidden_size = 1024;
@@ -696,20 +703,12 @@ public:
             n_head = 20;
             n_layer = 32;
         }
-        set_clip_skip(clip_skip_value);

-        blocks["embeddings"] = std::shared_ptr<GGMLBlock>(new CLIPEmbeddings(hidden_size, vocab_size, n_token));
+        blocks["embeddings"] = std::shared_ptr<GGMLBlock>(new CLIPEmbeddings(hidden_size, vocab_size, n_token, force_clip_f32));
         blocks["encoder"] = std::shared_ptr<GGMLBlock>(new CLIPEncoder(n_layer, hidden_size, n_head, intermediate_size));
         blocks["final_layer_norm"] = std::shared_ptr<GGMLBlock>(new LayerNorm(hidden_size));
     }

-    void set_clip_skip(int skip) {
-        if (skip <= 0) {
-            skip = -1;
-        }
-        clip_skip = skip;
-    }
-
     struct ggml_tensor* get_token_embed_weight() {
         auto embeddings = std::dynamic_pointer_cast<CLIPEmbeddings>(blocks["embeddings"]);
         return embeddings->get_token_embed_weight();
@@ -720,7 +719,8 @@ public:
                                 struct ggml_tensor* input_ids,
                                 struct ggml_tensor* tkn_embeddings,
                                 size_t max_token_idx = 0,
-                                bool return_pooled = false) {
+                                bool return_pooled = false,
+                                int clip_skip = -1) {
         // input_ids: [N, n_token]
         auto embeddings = std::dynamic_pointer_cast<CLIPEmbeddings>(blocks["embeddings"]);
         auto encoder = std::dynamic_pointer_cast<CLIPEncoder>(blocks["encoder"]);
@@ -889,8 +889,8 @@ struct CLIPTextModelRunner : public GGMLRunner {
                        const std::string prefix,
                        CLIPVersion version = OPENAI_CLIP_VIT_L_14,
                        bool with_final_ln = true,
-                       int clip_skip_value = -1)
-        : GGMLRunner(backend, offload_params_to_cpu), model(version, with_final_ln, clip_skip_value) {
+                       bool force_clip_f32 = false)
+        : GGMLRunner(backend, offload_params_to_cpu), model(version, with_final_ln, force_clip_f32) {
         model.init(params_ctx, tensor_types, prefix);
     }
@@ -898,10 +898,6 @@ struct CLIPTextModelRunner : public GGMLRunner {
         return "clip";
     }

-    void set_clip_skip(int clip_skip) {
-        model.set_clip_skip(clip_skip);
-    }
-
     void get_param_tensors(std::map<std::string, struct ggml_tensor*>& tensors, const std::string prefix) {
         model.get_param_tensors(tensors, prefix);
     }
@@ -911,7 +907,8 @@ struct CLIPTextModelRunner : public GGMLRunner {
                                 struct ggml_tensor* input_ids,
                                 struct ggml_tensor* embeddings,
                                 size_t max_token_idx = 0,
-                                bool return_pooled = false) {
+                                bool return_pooled = false,
+                                int clip_skip = -1) {
         size_t N = input_ids->ne[1];
         size_t n_token = input_ids->ne[0];
         if (input_ids->ne[0] > model.n_token) {
@@ -919,14 +916,15 @@ struct CLIPTextModelRunner : public GGMLRunner {
             input_ids = ggml_reshape_2d(ctx, input_ids, model.n_token, input_ids->ne[0] / model.n_token);
         }

-        return model.forward(ctx, backend, input_ids, embeddings, max_token_idx, return_pooled);
+        return model.forward(ctx, backend, input_ids, embeddings, max_token_idx, return_pooled, clip_skip);
     }

     struct ggml_cgraph* build_graph(struct ggml_tensor* input_ids,
                                     int num_custom_embeddings = 0,
                                     void* custom_embeddings_data = NULL,
                                     size_t max_token_idx = 0,
-                                    bool return_pooled = false) {
+                                    bool return_pooled = false,
+                                    int clip_skip = -1) {
         struct ggml_cgraph* gf = ggml_new_graph(compute_ctx);

         input_ids = to_backend(input_ids);
@@ -945,7 +943,7 @@ struct CLIPTextModelRunner : public GGMLRunner {
             embeddings = ggml_concat(compute_ctx, token_embed_weight, custom_embeddings, 1);
         }

-        struct ggml_tensor* hidden_states = forward(compute_ctx, runtime_backend, input_ids, embeddings, max_token_idx, return_pooled);
+        struct ggml_tensor* hidden_states = forward(compute_ctx, runtime_backend, input_ids, embeddings, max_token_idx, return_pooled, clip_skip);

         ggml_build_forward_expand(gf, hidden_states);
@@ -958,10 +956,11 @@ struct CLIPTextModelRunner : public GGMLRunner {
                  void* custom_embeddings_data,
                  size_t max_token_idx,
                  bool return_pooled,
+                 int clip_skip,
                  ggml_tensor** output,
                  ggml_context* output_ctx = NULL) {
         auto get_graph = [&]() -> struct ggml_cgraph* {
-            return build_graph(input_ids, num_custom_embeddings, custom_embeddings_data, max_token_idx, return_pooled);
+            return build_graph(input_ids, num_custom_embeddings, custom_embeddings_data, max_token_idx, return_pooled, clip_skip);
         };
         GGMLRunner::compute(get_graph, n_threads, true, output, output_ctx);
     }

[File 2 of 3]

@@ -61,30 +61,16 @@ struct FrozenCLIPEmbedderWithCustomWords : public Conditioner {
                                       const String2GGMLType& tensor_types,
                                       const std::string& embd_dir,
                                       SDVersion version = VERSION_SD1,
-                                      PMVersion pv = PM_VERSION_1,
-                                      int clip_skip = -1)
+                                      PMVersion pv = PM_VERSION_1)
         : version(version), pm_version(pv), tokenizer(sd_version_is_sd2(version) ? 0 : 49407), embd_dir(embd_dir) {
+        bool force_clip_f32 = embd_dir.size() > 0;
         if (sd_version_is_sd1(version)) {
-            text_model = std::make_shared<CLIPTextModelRunner>(backend, offload_params_to_cpu, tensor_types, "cond_stage_model.transformer.text_model", OPENAI_CLIP_VIT_L_14);
+            text_model = std::make_shared<CLIPTextModelRunner>(backend, offload_params_to_cpu, tensor_types, "cond_stage_model.transformer.text_model", OPENAI_CLIP_VIT_L_14, true, force_clip_f32);
         } else if (sd_version_is_sd2(version)) {
-            text_model = std::make_shared<CLIPTextModelRunner>(backend, offload_params_to_cpu, tensor_types, "cond_stage_model.transformer.text_model", OPEN_CLIP_VIT_H_14);
+            text_model = std::make_shared<CLIPTextModelRunner>(backend, offload_params_to_cpu, tensor_types, "cond_stage_model.transformer.text_model", OPEN_CLIP_VIT_H_14, true, force_clip_f32);
         } else if (sd_version_is_sdxl(version)) {
-            text_model = std::make_shared<CLIPTextModelRunner>(backend, offload_params_to_cpu, tensor_types, "cond_stage_model.transformer.text_model", OPENAI_CLIP_VIT_L_14, false);
-            text_model2 = std::make_shared<CLIPTextModelRunner>(backend, offload_params_to_cpu, tensor_types, "cond_stage_model.1.transformer.text_model", OPEN_CLIP_VIT_BIGG_14, false);
-        }
-        set_clip_skip(clip_skip);
-    }
-
-    void set_clip_skip(int clip_skip) {
-        if (clip_skip <= 0) {
-            clip_skip = 1;
-            if (sd_version_is_sd2(version) || sd_version_is_sdxl(version)) {
-                clip_skip = 2;
-            }
-        }
-        text_model->set_clip_skip(clip_skip);
-        if (sd_version_is_sdxl(version)) {
-            text_model2->set_clip_skip(clip_skip);
+            text_model = std::make_shared<CLIPTextModelRunner>(backend, offload_params_to_cpu, tensor_types, "cond_stage_model.transformer.text_model", OPENAI_CLIP_VIT_L_14, false, force_clip_f32);
+            text_model2 = std::make_shared<CLIPTextModelRunner>(backend, offload_params_to_cpu, tensor_types, "cond_stage_model.1.transformer.text_model", OPEN_CLIP_VIT_BIGG_14, false, force_clip_f32);
         }
     }
@@ -412,7 +398,6 @@ struct FrozenCLIPEmbedderWithCustomWords : public Conditioner {
                                       int height,
                                       int adm_in_channels = -1,
                                       bool zero_out_masked = false) {
-        set_clip_skip(clip_skip);
         int64_t t0 = ggml_time_ms();
         struct ggml_tensor* hidden_states = NULL;       // [N, n_token, hidden_size]
         struct ggml_tensor* chunk_hidden_states = NULL; // [n_token, hidden_size] or [n_token, hidden_size + hidden_size2]
@@ -421,6 +406,10 @@ struct FrozenCLIPEmbedderWithCustomWords : public Conditioner {
         struct ggml_tensor* pooled = NULL;
         std::vector<float> hidden_states_vec;

+        if (clip_skip <= 0) {
+            clip_skip = (sd_version_is_sd2(version) || sd_version_is_sdxl(version)) ? 2 : 1;
+        }
+
         size_t chunk_len = 77;
         size_t chunk_count = tokens.size() / chunk_len;
         for (int chunk_idx = 0; chunk_idx < chunk_count; chunk_idx++) {
@@ -455,6 +444,7 @@ struct FrozenCLIPEmbedderWithCustomWords : public Conditioner {
                                 token_embed_custom.data(),
                                 max_token_idx,
                                 false,
+                                clip_skip,
                                 &chunk_hidden_states1,
                                 work_ctx);
             if (sd_version_is_sdxl(version)) {
@@ -464,6 +454,7 @@ struct FrozenCLIPEmbedderWithCustomWords : public Conditioner {
                                  token_embed_custom.data(),
                                  max_token_idx,
                                  false,
+                                 clip_skip,
                                  &chunk_hidden_states2, work_ctx);
                 // concat
                 chunk_hidden_states = ggml_tensor_concat(work_ctx, chunk_hidden_states1, chunk_hidden_states2, 0);
@@ -475,6 +466,7 @@ struct FrozenCLIPEmbedderWithCustomWords : public Conditioner {
                                      token_embed_custom.data(),
                                      max_token_idx,
                                      true,
+                                     clip_skip,
                                      &pooled,
                                      work_ctx);
             }
@@ -669,21 +661,11 @@ struct SD3CLIPEmbedder : public Conditioner {
     SD3CLIPEmbedder(ggml_backend_t backend,
                     bool offload_params_to_cpu,
-                    const String2GGMLType& tensor_types = {},
-                    int clip_skip = -1)
+                    const String2GGMLType& tensor_types = {})
         : clip_g_tokenizer(0) {
         clip_l = std::make_shared<CLIPTextModelRunner>(backend, offload_params_to_cpu, tensor_types, "text_encoders.clip_l.transformer.text_model", OPENAI_CLIP_VIT_L_14, false);
         clip_g = std::make_shared<CLIPTextModelRunner>(backend, offload_params_to_cpu, tensor_types, "text_encoders.clip_g.transformer.text_model", OPEN_CLIP_VIT_BIGG_14, false);
         t5 = std::make_shared<T5Runner>(backend, offload_params_to_cpu, tensor_types, "text_encoders.t5xxl.transformer");
-        set_clip_skip(clip_skip);
-    }
-
-    void set_clip_skip(int clip_skip) {
-        if (clip_skip <= 0) {
-            clip_skip = 2;
-        }
-        clip_l->set_clip_skip(clip_skip);
-        clip_g->set_clip_skip(clip_skip);
     }

     void get_param_tensors(std::map<std::string, struct ggml_tensor*>& tensors) {
@@ -780,7 +762,6 @@ struct SD3CLIPEmbedder : public Conditioner {
                              std::vector<std::pair<std::vector<int>, std::vector<float>>> token_and_weights,
                              int clip_skip,
                              bool zero_out_masked = false) {
-        set_clip_skip(clip_skip);
         auto& clip_l_tokens = token_and_weights[0].first;
         auto& clip_l_weights = token_and_weights[0].second;
         auto& clip_g_tokens = token_and_weights[1].first;
@@ -788,6 +769,10 @@ struct SD3CLIPEmbedder : public Conditioner {
         auto& t5_tokens = token_and_weights[2].first;
         auto& t5_weights = token_and_weights[2].second;

+        if (clip_skip <= 0) {
+            clip_skip = 2;
+        }
+
         int64_t t0 = ggml_time_ms();
         struct ggml_tensor* hidden_states = NULL;       // [N, n_token*2, 4096]
         struct ggml_tensor* chunk_hidden_states = NULL; // [n_token*2, 4096]
@@ -818,6 +803,7 @@ struct SD3CLIPEmbedder : public Conditioner {
                             NULL,
                             max_token_idx,
                             false,
+                            clip_skip,
                             &chunk_hidden_states_l,
                             work_ctx);
             {
@@ -845,6 +831,7 @@ struct SD3CLIPEmbedder : public Conditioner {
                             NULL,
                             max_token_idx,
                             true,
+                            clip_skip,
                             &pooled_l,
                             work_ctx);
             }
@@ -866,6 +853,7 @@ struct SD3CLIPEmbedder : public Conditioner {
                             NULL,
                             max_token_idx,
                             false,
+                            clip_skip,
                             &chunk_hidden_states_g,
                             work_ctx);
@@ -894,6 +882,7 @@ struct SD3CLIPEmbedder : public Conditioner {
                             NULL,
                             max_token_idx,
                             true,
+                            clip_skip,
                             &pooled_g,
                             work_ctx);
             }
@@ -1017,18 +1006,9 @@ struct FluxCLIPEmbedder : public Conditioner {
     FluxCLIPEmbedder(ggml_backend_t backend,
                      bool offload_params_to_cpu,
-                     const String2GGMLType& tensor_types = {},
-                     int clip_skip = -1) {
+                     const String2GGMLType& tensor_types = {}) {
         clip_l = std::make_shared<CLIPTextModelRunner>(backend, offload_params_to_cpu, tensor_types, "text_encoders.clip_l.transformer.text_model", OPENAI_CLIP_VIT_L_14, true);
         t5 = std::make_shared<T5Runner>(backend, offload_params_to_cpu, tensor_types, "text_encoders.t5xxl.transformer");
-        set_clip_skip(clip_skip);
-    }
-
-    void set_clip_skip(int clip_skip) {
-        if (clip_skip <= 0) {
-            clip_skip = 2;
-        }
-        clip_l->set_clip_skip(clip_skip);
     }

     void get_param_tensors(std::map<std::string, struct ggml_tensor*>& tensors) {
@@ -1109,12 +1089,15 @@ struct FluxCLIPEmbedder : public Conditioner {
                              std::vector<std::pair<std::vector<int>, std::vector<float>>> token_and_weights,
                              int clip_skip,
                              bool zero_out_masked = false) {
-        set_clip_skip(clip_skip);
         auto& clip_l_tokens = token_and_weights[0].first;
         auto& clip_l_weights = token_and_weights[0].second;
         auto& t5_tokens = token_and_weights[1].first;
         auto& t5_weights = token_and_weights[1].second;

+        if (clip_skip <= 0) {
+            clip_skip = 2;
+        }
+
         int64_t t0 = ggml_time_ms();
         struct ggml_tensor* hidden_states = NULL;       // [N, n_token, 4096]
         struct ggml_tensor* chunk_hidden_states = NULL; // [n_token, 4096]
@@ -1143,6 +1126,7 @@ struct FluxCLIPEmbedder : public Conditioner {
                             NULL,
                             max_token_idx,
                             true,
+                            clip_skip,
                             &pooled,
                             work_ctx);
             }
@@ -1241,7 +1225,6 @@ struct T5CLIPEmbedder : public Conditioner {
     T5CLIPEmbedder(ggml_backend_t backend,
                    bool offload_params_to_cpu,
                    const String2GGMLType& tensor_types = {},
-                   int clip_skip = -1,
                    bool use_mask = false,
                    int mask_pad = 1,
                    bool is_umt5 = false)
@@ -1249,9 +1232,6 @@ struct T5CLIPEmbedder : public Conditioner {
         t5 = std::make_shared<T5Runner>(backend, offload_params_to_cpu, tensor_types, "text_encoders.t5xxl.transformer", is_umt5);
     }

-    void set_clip_skip(int clip_skip) {
-    }
-
     void get_param_tensors(std::map<std::string, struct ggml_tensor*>& tensors) {
         t5->get_param_tensors(tensors, "text_encoders.t5xxl.transformer");
     }
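
The defaulting that the removed set_clip_skip() methods used to perform at construction now happens inside each encode path, as the hunks above show. A hypothetical condensation of the rule (the helper name is mine, not from the patch):

    // Condenses FrozenCLIPEmbedderWithCustomWords' inline default;
    // the SD3 and Flux embedders simply hardcode 2.
    static int resolve_clip_skip(int requested, SDVersion version) {
        if (requested > 0)
            return requested;  // an explicit user setting wins
        return (sd_version_is_sd2(version) || sd_version_is_sdxl(version)) ? 2 : 1;
    }

Since the embedders no longer carry this mutable state, a single conditioner instance can serve consecutive requests with different clip_skip values without reconfiguring its models.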

[File 3 of 3]

@@ -373,7 +373,6 @@ public:
             cond_stage_model = std::make_shared<T5CLIPEmbedder>(clip_backend,
                                                                 offload_params_to_cpu,
                                                                 model_loader.tensor_storages_types,
-                                                                -1,
                                                                 sd_ctx_params->chroma_use_t5_mask,
                                                                 sd_ctx_params->chroma_t5_mask_pad);
         } else {
@@ -391,7 +390,6 @@ public:
             cond_stage_model = std::make_shared<T5CLIPEmbedder>(clip_backend,
                                                                 offload_params_to_cpu,
                                                                 model_loader.tensor_storages_types,
-                                                                -1,
                                                                 true,
                                                                 1,
                                                                 true);