Mirror of https://github.com/leejet/stable-diffusion.cpp.git (synced 2025-12-12 13:28:37 +00:00)
feat: reduce CLIP memory usage with no embeddings (#768)
This commit is contained in:
parent ddc4a18b92
commit 48956ffb87

clip.hpp | 51
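
Note (summary, not part of the upstream commit message): token_embedding.weight was previously always allocated as F32 so that textual-inversion embeddings loaded from embd_dir could be appended to it. This commit threads a force_clip_f32 flag from the conditioners down through CLIPTextModelRunner, CLIPTextModel and CLIPEmbeddings; when no embedding directory is supplied, the weight now keeps whatever (possibly quantized) type the checkpoint records in tensor_types. Independently, clip_skip stops being stored model state and becomes a per-call forward()/build_graph()/compute() argument, so the various set_clip_skip() helpers go away. A back-of-the-envelope estimate of the saving, assuming vocab_size = 49408 (the default in clip.hpp) and the usual CLIP-L embedding width of 768:

    // Rough footprint of token_embedding.weight for CLIP ViT-L/14.
    // ggml's Q8_0 packs 32 weights into 34 bytes (32 x int8 + one f16 scale).
    #include <cstdint>
    #include <cstdio>

    int main() {
        const int64_t n = 768LL * 49408;  // 37,945,344 weights
        std::printf("F32 : %6.1f MiB\n", n * 4.0 / (1 << 20));        // old behavior: always F32
        std::printf("Q8_0: %6.1f MiB\n", n * 34.0 / 32 / (1 << 20));  // now possible when embd_dir is empty
        return 0;
    }

For a Q8_0 checkpoint that is roughly 145 MiB down to about 38 MiB for this one tensor, and proportionally more for the wider OPEN_CLIP variants.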
clip.hpp

@@ -548,9 +548,15 @@ protected:
     int64_t embed_dim;
     int64_t vocab_size;
     int64_t num_positions;
+    bool force_clip_f32;

     void init_params(struct ggml_context* ctx, const String2GGMLType& tensor_types = {}, const std::string prefix = "") {
         enum ggml_type token_wtype = GGML_TYPE_F32;
+        if (!force_clip_f32) {
+            auto tensor_type = tensor_types.find(prefix + "token_embedding.weight");
+            if (tensor_type != tensor_types.end())
+                token_wtype = tensor_type->second;
+        }
         enum ggml_type position_wtype = GGML_TYPE_F32;

         params["token_embedding.weight"] = ggml_new_tensor_2d(ctx, token_wtype, embed_dim, vocab_size);
@@ -560,10 +566,12 @@ protected:
 public:
     CLIPEmbeddings(int64_t embed_dim,
                    int64_t vocab_size    = 49408,
-                   int64_t num_positions = 77)
+                   int64_t num_positions = 77,
+                   bool force_clip_f32   = false)
         : embed_dim(embed_dim),
           vocab_size(vocab_size),
-          num_positions(num_positions) {
+          num_positions(num_positions),
+          force_clip_f32(force_clip_f32) {
     }

     struct ggml_tensor* get_token_embed_weight() {
@@ -678,12 +686,11 @@ public:
     int32_t n_head = 12;
     int32_t n_layer = 12;           // num_hidden_layers
     int32_t projection_dim = 1280;  // only for OPEN_CLIP_VIT_BIGG_14
-    int32_t clip_skip = -1;
     bool with_final_ln = true;

     CLIPTextModel(CLIPVersion version = OPENAI_CLIP_VIT_L_14,
                   bool with_final_ln = true,
-                  int clip_skip_value = -1)
+                  bool force_clip_f32 = false)
         : version(version), with_final_ln(with_final_ln) {
         if (version == OPEN_CLIP_VIT_H_14) {
             hidden_size = 1024;
@@ -696,20 +703,12 @@ public:
             n_head = 20;
             n_layer = 32;
         }
-        set_clip_skip(clip_skip_value);

-        blocks["embeddings"] = std::shared_ptr<GGMLBlock>(new CLIPEmbeddings(hidden_size, vocab_size, n_token));
+        blocks["embeddings"] = std::shared_ptr<GGMLBlock>(new CLIPEmbeddings(hidden_size, vocab_size, n_token, force_clip_f32));
         blocks["encoder"] = std::shared_ptr<GGMLBlock>(new CLIPEncoder(n_layer, hidden_size, n_head, intermediate_size));
         blocks["final_layer_norm"] = std::shared_ptr<GGMLBlock>(new LayerNorm(hidden_size));
     }

-    void set_clip_skip(int skip) {
-        if (skip <= 0) {
-            skip = -1;
-        }
-        clip_skip = skip;
-    }
-
     struct ggml_tensor* get_token_embed_weight() {
         auto embeddings = std::dynamic_pointer_cast<CLIPEmbeddings>(blocks["embeddings"]);
         return embeddings->get_token_embed_weight();
@@ -720,7 +719,8 @@ public:
                                 struct ggml_tensor* input_ids,
                                 struct ggml_tensor* tkn_embeddings,
                                 size_t max_token_idx = 0,
-                                bool return_pooled = false) {
+                                bool return_pooled = false,
+                                int clip_skip = -1) {
         // input_ids: [N, n_token]
         auto embeddings = std::dynamic_pointer_cast<CLIPEmbeddings>(blocks["embeddings"]);
         auto encoder = std::dynamic_pointer_cast<CLIPEncoder>(blocks["encoder"]);
@@ -889,8 +889,8 @@ struct CLIPTextModelRunner : public GGMLRunner {
                        const std::string prefix,
                        CLIPVersion version = OPENAI_CLIP_VIT_L_14,
                        bool with_final_ln = true,
-                       int clip_skip_value = -1)
-        : GGMLRunner(backend, offload_params_to_cpu), model(version, with_final_ln, clip_skip_value) {
+                       bool force_clip_f32 = false)
+        : GGMLRunner(backend, offload_params_to_cpu), model(version, with_final_ln, force_clip_f32) {
         model.init(params_ctx, tensor_types, prefix);
     }

@@ -898,10 +898,6 @@ struct CLIPTextModelRunner : public GGMLRunner {
         return "clip";
     }

-    void set_clip_skip(int clip_skip) {
-        model.set_clip_skip(clip_skip);
-    }
-
     void get_param_tensors(std::map<std::string, struct ggml_tensor*>& tensors, const std::string prefix) {
         model.get_param_tensors(tensors, prefix);
     }
@@ -911,7 +907,8 @@ struct CLIPTextModelRunner : public GGMLRunner {
                                 struct ggml_tensor* input_ids,
                                 struct ggml_tensor* embeddings,
                                 size_t max_token_idx = 0,
-                                bool return_pooled = false) {
+                                bool return_pooled = false,
+                                int clip_skip = -1) {
         size_t N = input_ids->ne[1];
         size_t n_token = input_ids->ne[0];
         if (input_ids->ne[0] > model.n_token) {
@@ -919,14 +916,15 @@ struct CLIPTextModelRunner : public GGMLRunner {
             input_ids = ggml_reshape_2d(ctx, input_ids, model.n_token, input_ids->ne[0] / model.n_token);
         }

-        return model.forward(ctx, backend, input_ids, embeddings, max_token_idx, return_pooled);
+        return model.forward(ctx, backend, input_ids, embeddings, max_token_idx, return_pooled, clip_skip);
     }

     struct ggml_cgraph* build_graph(struct ggml_tensor* input_ids,
                                     int num_custom_embeddings = 0,
                                     void* custom_embeddings_data = NULL,
                                     size_t max_token_idx = 0,
-                                    bool return_pooled = false) {
+                                    bool return_pooled = false,
+                                    int clip_skip = -1) {
         struct ggml_cgraph* gf = ggml_new_graph(compute_ctx);

         input_ids = to_backend(input_ids);
@@ -945,7 +943,7 @@ struct CLIPTextModelRunner : public GGMLRunner {
             embeddings = ggml_concat(compute_ctx, token_embed_weight, custom_embeddings, 1);
         }

-        struct ggml_tensor* hidden_states = forward(compute_ctx, runtime_backend, input_ids, embeddings, max_token_idx, return_pooled);
+        struct ggml_tensor* hidden_states = forward(compute_ctx, runtime_backend, input_ids, embeddings, max_token_idx, return_pooled, clip_skip);

         ggml_build_forward_expand(gf, hidden_states);

@@ -958,10 +956,11 @@ struct CLIPTextModelRunner : public GGMLRunner {
                  void* custom_embeddings_data,
                  size_t max_token_idx,
                  bool return_pooled,
+                 int clip_skip,
                  ggml_tensor** output,
                  ggml_context* output_ctx = NULL) {
         auto get_graph = [&]() -> struct ggml_cgraph* {
-            return build_graph(input_ids, num_custom_embeddings, custom_embeddings_data, max_token_idx, return_pooled);
+            return build_graph(input_ids, num_custom_embeddings, custom_embeddings_data, max_token_idx, return_pooled, clip_skip);
         };
         GGMLRunner::compute(get_graph, n_threads, true, output, output_ctx);
     }
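
Note: the net effect on the runner's public API is that clip_skip now rides along from compute() through build_graph() into model.forward(). A hypothetical invocation (the leading n_threads / input_ids / num_custom_embeddings parameters are inferred from the conditioner call sites below, since this hunk starts mid-signature):

    // Sketch of a caller, mirroring the conditioner.hpp call sites in this diff.
    ggml_tensor* chunk_hidden_states = nullptr;
    text_model->compute(n_threads,
                        input_ids,
                        num_custom_embeddings,
                        token_embed_custom.data(),
                        max_token_idx,
                        false,      // return_pooled
                        clip_skip,  // forwarded into build_graph() and model.forward()
                        &chunk_hidden_states,
                        work_ctx);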
conditioner.hpp

@@ -61,30 +61,16 @@ struct FrozenCLIPEmbedderWithCustomWords : public Conditioner {
                                       const String2GGMLType& tensor_types,
                                       const std::string& embd_dir,
                                       SDVersion version = VERSION_SD1,
-                                      PMVersion pv = PM_VERSION_1,
-                                      int clip_skip = -1)
+                                      PMVersion pv = PM_VERSION_1)
         : version(version), pm_version(pv), tokenizer(sd_version_is_sd2(version) ? 0 : 49407), embd_dir(embd_dir) {
+        bool force_clip_f32 = embd_dir.size() > 0;
         if (sd_version_is_sd1(version)) {
-            text_model = std::make_shared<CLIPTextModelRunner>(backend, offload_params_to_cpu, tensor_types, "cond_stage_model.transformer.text_model", OPENAI_CLIP_VIT_L_14);
+            text_model = std::make_shared<CLIPTextModelRunner>(backend, offload_params_to_cpu, tensor_types, "cond_stage_model.transformer.text_model", OPENAI_CLIP_VIT_L_14, true, force_clip_f32);
         } else if (sd_version_is_sd2(version)) {
-            text_model = std::make_shared<CLIPTextModelRunner>(backend, offload_params_to_cpu, tensor_types, "cond_stage_model.transformer.text_model", OPEN_CLIP_VIT_H_14);
+            text_model = std::make_shared<CLIPTextModelRunner>(backend, offload_params_to_cpu, tensor_types, "cond_stage_model.transformer.text_model", OPEN_CLIP_VIT_H_14, true, force_clip_f32);
         } else if (sd_version_is_sdxl(version)) {
-            text_model  = std::make_shared<CLIPTextModelRunner>(backend, offload_params_to_cpu, tensor_types, "cond_stage_model.transformer.text_model", OPENAI_CLIP_VIT_L_14, false);
-            text_model2 = std::make_shared<CLIPTextModelRunner>(backend, offload_params_to_cpu, tensor_types, "cond_stage_model.1.transformer.text_model", OPEN_CLIP_VIT_BIGG_14, false);
-        }
-        set_clip_skip(clip_skip);
-    }
-
-    void set_clip_skip(int clip_skip) {
-        if (clip_skip <= 0) {
-            clip_skip = 1;
-            if (sd_version_is_sd2(version) || sd_version_is_sdxl(version)) {
-                clip_skip = 2;
-            }
-        }
-        text_model->set_clip_skip(clip_skip);
-        if (sd_version_is_sdxl(version)) {
-            text_model2->set_clip_skip(clip_skip);
+            text_model  = std::make_shared<CLIPTextModelRunner>(backend, offload_params_to_cpu, tensor_types, "cond_stage_model.transformer.text_model", OPENAI_CLIP_VIT_L_14, false, force_clip_f32);
+            text_model2 = std::make_shared<CLIPTextModelRunner>(backend, offload_params_to_cpu, tensor_types, "cond_stage_model.1.transformer.text_model", OPEN_CLIP_VIT_BIGG_14, false, force_clip_f32);
         }
     }
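
Note: force_clip_f32 is keyed off embd_dir because build_graph() (see the @@ -945 hunk in clip.hpp above) appends the user-supplied embedding rows to the vocabulary matrix with ggml_concat, and the raw float rows read from embedding files can only be appended to an F32 matrix. A minimal sketch of that constraint, assuming current ggml's ggml_concat(ctx, a, b, dim) with same-type operands:

    #include "ggml.h"

    // Appending n_custom learned-embedding rows onto the vocabulary matrix;
    // dim 1 is the vocab dimension, matching build_graph() in clip.hpp.
    // Both tensors must be GGML_TYPE_F32 for this to work, which is what
    // force_clip_f32 guarantees whenever an embedding directory is in use.
    static struct ggml_tensor* append_custom_rows(struct ggml_context* ctx,
                                                  struct ggml_tensor* token_embed_weight,  // [embed_dim, vocab_size]
                                                  struct ggml_tensor* custom_rows) {       // [embed_dim, n_custom]
        return ggml_concat(ctx, token_embed_weight, custom_rows, 1);
    }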
@@ -412,7 +398,6 @@ struct FrozenCLIPEmbedderWithCustomWords : public Conditioner {
                                 int height,
                                 int adm_in_channels = -1,
                                 bool zero_out_masked = false) {
-        set_clip_skip(clip_skip);
         int64_t t0 = ggml_time_ms();
         struct ggml_tensor* hidden_states = NULL;        // [N, n_token, hidden_size]
         struct ggml_tensor* chunk_hidden_states = NULL;  // [n_token, hidden_size] or [n_token, hidden_size + hidden_size2]
@@ -421,6 +406,10 @@ struct FrozenCLIPEmbedderWithCustomWords : public Conditioner {
         struct ggml_tensor* pooled = NULL;
         std::vector<float> hidden_states_vec;

+        if (clip_skip <= 0) {
+            clip_skip = (sd_version_is_sd2(version) || sd_version_is_sdxl(version)) ? 2 : 1;
+        }
+
         size_t chunk_len   = 77;
         size_t chunk_count = tokens.size() / chunk_len;
         for (int chunk_idx = 0; chunk_idx < chunk_count; chunk_idx++) {
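
Note: the default is now resolved lazily at encode time, 2 for SD2/SDXL (penultimate layer) and 1 otherwise, instead of being baked into the model at construction. Under the usual CLIP-skip convention (not spelled out in this hunk) the value counts transformer blocks back from the output:

    // Hypothetical helper showing the conventional clip_skip -> layer mapping;
    // the encoder is assumed to return the hidden state of block
    // n_layer - clip_skip (0-based), so clip_skip == 1 means the last block
    // and clip_skip == 2 the second-to-last.
    static int last_block_index(int n_layer, int clip_skip) {
        if (clip_skip <= 0)
            clip_skip = 1;  // same fallback the conditioners apply
        return n_layer - clip_skip;
    }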
@@ -455,6 +444,7 @@ struct FrozenCLIPEmbedderWithCustomWords : public Conditioner {
                                 token_embed_custom.data(),
                                 max_token_idx,
                                 false,
+                                clip_skip,
                                 &chunk_hidden_states1,
                                 work_ctx);
             if (sd_version_is_sdxl(version)) {
@@ -464,6 +454,7 @@ struct FrozenCLIPEmbedderWithCustomWords : public Conditioner {
                                     token_embed_custom.data(),
                                     max_token_idx,
                                     false,
+                                    clip_skip,
                                     &chunk_hidden_states2, work_ctx);
                 // concat
                 chunk_hidden_states = ggml_tensor_concat(work_ctx, chunk_hidden_states1, chunk_hidden_states2, 0);
@@ -475,6 +466,7 @@ struct FrozenCLIPEmbedderWithCustomWords : public Conditioner {
                                     token_embed_custom.data(),
                                     max_token_idx,
                                     true,
+                                    clip_skip,
                                     &pooled,
                                     work_ctx);
             }
@@ -669,21 +661,11 @@ struct SD3CLIPEmbedder : public Conditioner {

     SD3CLIPEmbedder(ggml_backend_t backend,
                     bool offload_params_to_cpu,
-                    const String2GGMLType& tensor_types = {},
-                    int clip_skip = -1)
+                    const String2GGMLType& tensor_types = {})
         : clip_g_tokenizer(0) {
         clip_l = std::make_shared<CLIPTextModelRunner>(backend, offload_params_to_cpu, tensor_types, "text_encoders.clip_l.transformer.text_model", OPENAI_CLIP_VIT_L_14, false);
         clip_g = std::make_shared<CLIPTextModelRunner>(backend, offload_params_to_cpu, tensor_types, "text_encoders.clip_g.transformer.text_model", OPEN_CLIP_VIT_BIGG_14, false);
         t5     = std::make_shared<T5Runner>(backend, offload_params_to_cpu, tensor_types, "text_encoders.t5xxl.transformer");
-        set_clip_skip(clip_skip);
     }
-
-    void set_clip_skip(int clip_skip) {
-        if (clip_skip <= 0) {
-            clip_skip = 2;
-        }
-        clip_l->set_clip_skip(clip_skip);
-        clip_g->set_clip_skip(clip_skip);
-    }

     void get_param_tensors(std::map<std::string, struct ggml_tensor*>& tensors) {
@@ -780,7 +762,6 @@ struct SD3CLIPEmbedder : public Conditioner {
                      std::vector<std::pair<std::vector<int>, std::vector<float>>> token_and_weights,
                      int clip_skip,
                      bool zero_out_masked = false) {
-        set_clip_skip(clip_skip);
         auto& clip_l_tokens  = token_and_weights[0].first;
         auto& clip_l_weights = token_and_weights[0].second;
         auto& clip_g_tokens  = token_and_weights[1].first;
@@ -788,6 +769,10 @@ struct SD3CLIPEmbedder : public Conditioner {
         auto& t5_tokens  = token_and_weights[2].first;
         auto& t5_weights = token_and_weights[2].second;

+        if (clip_skip <= 0) {
+            clip_skip = 2;
+        }
+
         int64_t t0 = ggml_time_ms();
         struct ggml_tensor* hidden_states       = NULL;  // [N, n_token*2, 4096]
         struct ggml_tensor* chunk_hidden_states = NULL;  // [n_token*2, 4096]
@@ -818,6 +803,7 @@ struct SD3CLIPEmbedder : public Conditioner {
                                 NULL,
                                 max_token_idx,
                                 false,
+                                clip_skip,
                                 &chunk_hidden_states_l,
                                 work_ctx);
             {
@@ -845,6 +831,7 @@ struct SD3CLIPEmbedder : public Conditioner {
                                 NULL,
                                 max_token_idx,
                                 true,
+                                clip_skip,
                                 &pooled_l,
                                 work_ctx);
             }
@@ -866,6 +853,7 @@ struct SD3CLIPEmbedder : public Conditioner {
                                 NULL,
                                 max_token_idx,
                                 false,
+                                clip_skip,
                                 &chunk_hidden_states_g,
                                 work_ctx);

@@ -894,6 +882,7 @@ struct SD3CLIPEmbedder : public Conditioner {
                                 NULL,
                                 max_token_idx,
                                 true,
+                                clip_skip,
                                 &pooled_g,
                                 work_ctx);
             }
@@ -1017,18 +1006,9 @@ struct FluxCLIPEmbedder : public Conditioner {

     FluxCLIPEmbedder(ggml_backend_t backend,
                      bool offload_params_to_cpu,
-                     const String2GGMLType& tensor_types = {},
-                     int clip_skip = -1) {
+                     const String2GGMLType& tensor_types = {}) {
         clip_l = std::make_shared<CLIPTextModelRunner>(backend, offload_params_to_cpu, tensor_types, "text_encoders.clip_l.transformer.text_model", OPENAI_CLIP_VIT_L_14, true);
         t5     = std::make_shared<T5Runner>(backend, offload_params_to_cpu, tensor_types, "text_encoders.t5xxl.transformer");
-        set_clip_skip(clip_skip);
-    }
-
-    void set_clip_skip(int clip_skip) {
-        if (clip_skip <= 0) {
-            clip_skip = 2;
-        }
-        clip_l->set_clip_skip(clip_skip);
     }

     void get_param_tensors(std::map<std::string, struct ggml_tensor*>& tensors) {
@@ -1109,12 +1089,15 @@ struct FluxCLIPEmbedder : public Conditioner {
                      std::vector<std::pair<std::vector<int>, std::vector<float>>> token_and_weights,
                      int clip_skip,
                      bool zero_out_masked = false) {
-        set_clip_skip(clip_skip);
         auto& clip_l_tokens  = token_and_weights[0].first;
         auto& clip_l_weights = token_and_weights[0].second;
         auto& t5_tokens      = token_and_weights[1].first;
         auto& t5_weights     = token_and_weights[1].second;

+        if (clip_skip <= 0) {
+            clip_skip = 2;
+        }
+
         int64_t t0 = ggml_time_ms();
         struct ggml_tensor* hidden_states       = NULL;  // [N, n_token, 4096]
         struct ggml_tensor* chunk_hidden_states = NULL;  // [n_token, 4096]
@@ -1143,6 +1126,7 @@ struct FluxCLIPEmbedder : public Conditioner {
                                 NULL,
                                 max_token_idx,
                                 true,
+                                clip_skip,
                                 &pooled,
                                 work_ctx);
             }
@@ -1241,7 +1225,6 @@ struct T5CLIPEmbedder : public Conditioner {
     T5CLIPEmbedder(ggml_backend_t backend,
                    bool offload_params_to_cpu,
                    const String2GGMLType& tensor_types = {},
-                   int clip_skip = -1,
                    bool use_mask = false,
                    int mask_pad = 1,
                    bool is_umt5 = false)
@@ -1249,9 +1232,6 @@ struct T5CLIPEmbedder : public Conditioner {
         t5 = std::make_shared<T5Runner>(backend, offload_params_to_cpu, tensor_types, "text_encoders.t5xxl.transformer", is_umt5);
     }

-    void set_clip_skip(int clip_skip) {
-    }
-
     void get_param_tensors(std::map<std::string, struct ggml_tensor*>& tensors) {
         t5->get_param_tensors(tensors, "text_encoders.t5xxl.transformer");
     }
stable-diffusion.cpp

@@ -373,7 +373,6 @@ public:
             cond_stage_model = std::make_shared<T5CLIPEmbedder>(clip_backend,
                                                                 offload_params_to_cpu,
                                                                 model_loader.tensor_storages_types,
-                                                                -1,
                                                                 sd_ctx_params->chroma_use_t5_mask,
                                                                 sd_ctx_params->chroma_t5_mask_pad);
         } else {
@@ -391,7 +390,6 @@ public:
             cond_stage_model = std::make_shared<T5CLIPEmbedder>(clip_backend,
                                                                 offload_params_to_cpu,
                                                                 model_loader.tensor_storages_types,
-                                                                -1,
                                                                 true,
                                                                 1,
                                                                 true);
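
Note: with clip_skip gone from the T5CLIPEmbedder signature, the callers in stable-diffusion.cpp simply drop the -1 placeholder. Assembled from the @@ -373 hunk above, the Chroma-style construction now reads:

    cond_stage_model = std::make_shared<T5CLIPEmbedder>(clip_backend,
                                                        offload_params_to_cpu,
                                                        model_loader.tensor_storages_types,
                                                        sd_ctx_params->chroma_use_t5_mask,
                                                        sd_ctx_params->chroma_t5_mask_pad);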