diff --git a/clip.hpp b/clip.hpp
index f92c9c2..bde8a78 100644
--- a/clip.hpp
+++ b/clip.hpp
@@ -548,9 +548,15 @@ protected:
     int64_t embed_dim;
     int64_t vocab_size;
    int64_t num_positions;
+    bool force_clip_f32;
 
     void init_params(struct ggml_context* ctx, const String2GGMLType& tensor_types = {}, const std::string prefix = "") {
-        enum ggml_type token_wtype = GGML_TYPE_F32;
+        enum ggml_type token_wtype = GGML_TYPE_F32;
+        if (!force_clip_f32) {
+            auto tensor_type = tensor_types.find(prefix + "token_embedding.weight");
+            if (tensor_type != tensor_types.end())
+                token_wtype = tensor_type->second;
+        }
         enum ggml_type position_wtype = GGML_TYPE_F32;
 
         params["token_embedding.weight"] = ggml_new_tensor_2d(ctx, token_wtype, embed_dim, vocab_size);
@@ -560,10 +566,12 @@ protected:
 public:
     CLIPEmbeddings(int64_t embed_dim,
                    int64_t vocab_size    = 49408,
-                   int64_t num_positions = 77)
+                   int64_t num_positions = 77,
+                   bool force_clip_f32   = false)
         : embed_dim(embed_dim),
           vocab_size(vocab_size),
-          num_positions(num_positions) {
+          num_positions(num_positions),
+          force_clip_f32(force_clip_f32) {
     }
 
     struct ggml_tensor* get_token_embed_weight() {
@@ -678,12 +686,11 @@ public:
     int32_t n_head         = 12;
     int32_t n_layer        = 12;    // num_hidden_layers
     int32_t projection_dim = 1280;  // only for OPEN_CLIP_VIT_BIGG_14
-    int32_t clip_skip      = -1;
     bool with_final_ln     = true;
 
     CLIPTextModel(CLIPVersion version = OPENAI_CLIP_VIT_L_14,
                   bool with_final_ln = true,
-                  int clip_skip_value = -1)
+                  bool force_clip_f32 = false)
         : version(version), with_final_ln(with_final_ln) {
         if (version == OPEN_CLIP_VIT_H_14) {
             hidden_size = 1024;
@@ -696,20 +703,12 @@ public:
             n_head            = 20;
             n_layer           = 32;
         }
-        set_clip_skip(clip_skip_value);
-
-        blocks["embeddings"] = std::shared_ptr<GGMLBlock>(new CLIPEmbeddings(hidden_size, vocab_size, n_token));
+        blocks["embeddings"] = std::shared_ptr<GGMLBlock>(new CLIPEmbeddings(hidden_size, vocab_size, n_token, force_clip_f32));
         blocks["encoder"] = std::shared_ptr<GGMLBlock>(new CLIPEncoder(n_layer, hidden_size, n_head, intermediate_size));
         blocks["final_layer_norm"] = std::shared_ptr<GGMLBlock>(new LayerNorm(hidden_size));
     }
 
-    void set_clip_skip(int skip) {
-        if (skip <= 0) {
-            skip = -1;
-        }
-        clip_skip = skip;
-    }
-
     struct ggml_tensor* get_token_embed_weight() {
         auto embeddings = std::dynamic_pointer_cast<CLIPEmbeddings>(blocks["embeddings"]);
         return embeddings->get_token_embed_weight();
@@ -720,7 +719,8 @@ public:
                                 struct ggml_tensor* input_ids,
                                 struct ggml_tensor* tkn_embeddings,
                                 size_t max_token_idx = 0,
-                                bool return_pooled = false) {
+                                bool return_pooled = false,
+                                int clip_skip = -1) {
         // input_ids: [N, n_token]
         auto embeddings = std::dynamic_pointer_cast<CLIPEmbeddings>(blocks["embeddings"]);
         auto encoder = std::dynamic_pointer_cast<CLIPEncoder>(blocks["encoder"]);
@@ -889,8 +889,8 @@ struct CLIPTextModelRunner : public GGMLRunner {
                        const std::string prefix,
                        CLIPVersion version = OPENAI_CLIP_VIT_L_14,
                        bool with_final_ln = true,
-                       int clip_skip_value = -1)
-        : GGMLRunner(backend, offload_params_to_cpu), model(version, with_final_ln, clip_skip_value) {
+                       bool force_clip_f32 = false)
+        : GGMLRunner(backend, offload_params_to_cpu), model(version, with_final_ln, force_clip_f32) {
         model.init(params_ctx, tensor_types, prefix);
     }
 
@@ -898,10 +898,6 @@ struct CLIPTextModelRunner : public GGMLRunner {
         return "clip";
     }
 
-    void set_clip_skip(int clip_skip) {
-        model.set_clip_skip(clip_skip);
-    }
-
     void get_param_tensors(std::map<std::string, struct ggml_tensor*>& tensors, const std::string prefix) {
         model.get_param_tensors(tensors, prefix);
     }
@@ -911,7 +907,8 @@ struct CLIPTextModelRunner : public GGMLRunner {
     struct ggml_tensor* forward(struct ggml_context* ctx,
                                 ggml_backend_t backend,
                                 struct ggml_tensor* input_ids,
                                 struct ggml_tensor* embeddings,
                                 size_t max_token_idx = 0,
-                                bool return_pooled = false) {
+                                bool return_pooled = false,
+                                int clip_skip = -1) {
         size_t N       = input_ids->ne[1];
         size_t n_token = input_ids->ne[0];
         if (input_ids->ne[0] > model.n_token) {
@@ -919,14 +916,15 @@ struct CLIPTextModelRunner : public GGMLRunner {
             input_ids = ggml_reshape_2d(ctx, input_ids, model.n_token, input_ids->ne[0] / model.n_token);
         }
 
-        return model.forward(ctx, backend, input_ids, embeddings, max_token_idx, return_pooled);
+        return model.forward(ctx, backend, input_ids, embeddings, max_token_idx, return_pooled, clip_skip);
     }
 
     struct ggml_cgraph* build_graph(struct ggml_tensor* input_ids,
                                     int num_custom_embeddings = 0,
                                     void* custom_embeddings_data = NULL,
                                     size_t max_token_idx = 0,
-                                    bool return_pooled = false) {
+                                    bool return_pooled = false,
+                                    int clip_skip = -1) {
         struct ggml_cgraph* gf = ggml_new_graph(compute_ctx);
 
         input_ids = to_backend(input_ids);
@@ -945,7 +943,7 @@ struct CLIPTextModelRunner : public GGMLRunner {
             embeddings = ggml_concat(compute_ctx, token_embed_weight, custom_embeddings, 1);
         }
 
-        struct ggml_tensor* hidden_states = forward(compute_ctx, runtime_backend, input_ids, embeddings, max_token_idx, return_pooled);
+        struct ggml_tensor* hidden_states = forward(compute_ctx, runtime_backend, input_ids, embeddings, max_token_idx, return_pooled, clip_skip);
 
         ggml_build_forward_expand(gf, hidden_states);
 
@@ -958,10 +956,11 @@ struct CLIPTextModelRunner : public GGMLRunner {
                  void* custom_embeddings_data,
                  size_t max_token_idx,
                  bool return_pooled,
+                 int clip_skip,
                  ggml_tensor** output,
                  ggml_context* output_ctx = NULL) {
         auto get_graph = [&]() -> struct ggml_cgraph* {
-            return build_graph(input_ids, num_custom_embeddings, custom_embeddings_data, max_token_idx, return_pooled);
+            return build_graph(input_ids, num_custom_embeddings, custom_embeddings_data, max_token_idx, return_pooled, clip_skip);
         };
         GGMLRunner::compute(get_graph, n_threads, true, output, output_ctx);
     }
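
Note (clip.hpp): clip_skip is no longer mutable state on CLIPTextModel (the set_clip_skip() mutators are gone); it now travels as an ordinary argument from CLIPTextModelRunner::compute() through build_graph() into forward(), so two calls with different skip values can no longer interfere through shared state. A minimal caller-side sketch against the new compute() signature (the helper name and surrounding setup are illustrative, not part of this patch):

    // Encode one token chunk, reading hidden states from the layer selected
    // by clip_skip; clip_skip <= 0 falls through to the model's default.
    ggml_tensor* encode_chunk(CLIPTextModelRunner& runner,
                              ggml_context* work_ctx,
                              ggml_tensor* input_ids,  // [n_token] token ids
                              int n_threads,
                              int clip_skip) {
        ggml_tensor* hidden_states = NULL;
        runner.compute(n_threads,
                       input_ids,
                       0,          // num_custom_embeddings: none here
                       NULL,       // custom_embeddings_data
                       0,          // max_token_idx (used only with return_pooled)
                       false,      // return_pooled: want per-token hidden states
                       clip_skip,  // passed per call, not stored on the model
                       &hidden_states,
                       work_ctx);
        return hidden_states;
    }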
diff --git a/conditioner.hpp b/conditioner.hpp
index cfd2b4c..bda99df 100644
--- a/conditioner.hpp
+++ b/conditioner.hpp
@@ -61,30 +61,16 @@ struct FrozenCLIPEmbedderWithCustomWords : public Conditioner {
                                       const String2GGMLType& tensor_types,
                                       const std::string& embd_dir,
                                       SDVersion version = VERSION_SD1,
-                                      PMVersion pv = PM_VERSION_1,
-                                      int clip_skip = -1)
+                                      PMVersion pv = PM_VERSION_1)
         : version(version), pm_version(pv), tokenizer(sd_version_is_sd2(version) ? 0 : 49407), embd_dir(embd_dir) {
+        bool force_clip_f32 = embd_dir.size() > 0;
         if (sd_version_is_sd1(version)) {
-            text_model = std::make_shared<CLIPTextModelRunner>(backend, offload_params_to_cpu, tensor_types, "cond_stage_model.transformer.text_model", OPENAI_CLIP_VIT_L_14);
+            text_model = std::make_shared<CLIPTextModelRunner>(backend, offload_params_to_cpu, tensor_types, "cond_stage_model.transformer.text_model", OPENAI_CLIP_VIT_L_14, true, force_clip_f32);
         } else if (sd_version_is_sd2(version)) {
-            text_model = std::make_shared<CLIPTextModelRunner>(backend, offload_params_to_cpu, tensor_types, "cond_stage_model.transformer.text_model", OPEN_CLIP_VIT_H_14);
+            text_model = std::make_shared<CLIPTextModelRunner>(backend, offload_params_to_cpu, tensor_types, "cond_stage_model.transformer.text_model", OPEN_CLIP_VIT_H_14, true, force_clip_f32);
         } else if (sd_version_is_sdxl(version)) {
-            text_model  = std::make_shared<CLIPTextModelRunner>(backend, offload_params_to_cpu, tensor_types, "cond_stage_model.transformer.text_model", OPENAI_CLIP_VIT_L_14, false);
-            text_model2 = std::make_shared<CLIPTextModelRunner>(backend, offload_params_to_cpu, tensor_types, "cond_stage_model.1.transformer.text_model", OPEN_CLIP_VIT_BIGG_14, false);
-        }
-        set_clip_skip(clip_skip);
-    }
-
-    void set_clip_skip(int clip_skip) {
-        if (clip_skip <= 0) {
-            clip_skip = 1;
-            if (sd_version_is_sd2(version) || sd_version_is_sdxl(version)) {
-                clip_skip = 2;
-            }
-        }
-        text_model->set_clip_skip(clip_skip);
-        if (sd_version_is_sdxl(version)) {
-            text_model2->set_clip_skip(clip_skip);
+            text_model  = std::make_shared<CLIPTextModelRunner>(backend, offload_params_to_cpu, tensor_types, "cond_stage_model.transformer.text_model", OPENAI_CLIP_VIT_L_14, false, force_clip_f32);
+            text_model2 = std::make_shared<CLIPTextModelRunner>(backend, offload_params_to_cpu, tensor_types, "cond_stage_model.1.transformer.text_model", OPEN_CLIP_VIT_BIGG_14, false, force_clip_f32);
         }
     }
 
@@ -412,7 +398,6 @@ struct FrozenCLIPEmbedderWithCustomWords : public Conditioner {
                                         int height,
                                         int adm_in_channels = -1,
                                         bool zero_out_masked = false) {
-        set_clip_skip(clip_skip);
         int64_t t0 = ggml_time_ms();
         struct ggml_tensor* hidden_states = NULL;       // [N, n_token, hidden_size]
         struct ggml_tensor* chunk_hidden_states = NULL; // [n_token, hidden_size] or [n_token, hidden_size + hidden_size2]
@@ -421,6 +406,10 @@ struct FrozenCLIPEmbedderWithCustomWords : public Conditioner {
         struct ggml_tensor* pooled = NULL;
         std::vector<float> hidden_states_vec;
 
+        if (clip_skip <= 0) {
+            clip_skip = (sd_version_is_sd2(version) || sd_version_is_sdxl(version)) ? 2 : 1;
+        }
+
         size_t chunk_len   = 77;
         size_t chunk_count = tokens.size() / chunk_len;
         for (int chunk_idx = 0; chunk_idx < chunk_count; chunk_idx++) {
@@ -455,6 +444,7 @@ struct FrozenCLIPEmbedderWithCustomWords : public Conditioner {
                                 token_embed_custom.data(),
                                 max_token_idx,
                                 false,
+                                clip_skip,
                                 &chunk_hidden_states1,
                                 work_ctx);
             if (sd_version_is_sdxl(version)) {
@@ -464,6 +454,7 @@ struct FrozenCLIPEmbedderWithCustomWords : public Conditioner {
                                     token_embed_custom.data(),
                                     max_token_idx,
                                     false,
+                                    clip_skip,
                                     &chunk_hidden_states2,
                                     work_ctx);
                 // concat
                 chunk_hidden_states = ggml_tensor_concat(work_ctx, chunk_hidden_states1, chunk_hidden_states2, 0);
@@ -475,6 +466,7 @@ struct FrozenCLIPEmbedderWithCustomWords : public Conditioner {
                                 token_embed_custom.data(),
                                 max_token_idx,
                                 true,
+                                clip_skip,
                                 &pooled,
                                 work_ctx);
             }
@@ -669,21 +661,11 @@ struct SD3CLIPEmbedder : public Conditioner {
     SD3CLIPEmbedder(ggml_backend_t backend,
                     bool offload_params_to_cpu,
-                    const String2GGMLType& tensor_types = {},
-                    int clip_skip = -1)
+                    const String2GGMLType& tensor_types = {})
         : clip_g_tokenizer(0) {
         clip_l = std::make_shared<CLIPTextModelRunner>(backend, offload_params_to_cpu, tensor_types, "text_encoders.clip_l.transformer.text_model", OPENAI_CLIP_VIT_L_14, false);
         clip_g = std::make_shared<CLIPTextModelRunner>(backend, offload_params_to_cpu, tensor_types, "text_encoders.clip_g.transformer.text_model", OPEN_CLIP_VIT_BIGG_14, false);
         t5     = std::make_shared<T5Runner>(backend, offload_params_to_cpu, tensor_types, "text_encoders.t5xxl.transformer");
-        set_clip_skip(clip_skip);
-    }
-
-    void set_clip_skip(int clip_skip) {
-        if (clip_skip <= 0) {
-            clip_skip = 2;
-        }
-        clip_l->set_clip_skip(clip_skip);
-        clip_g->set_clip_skip(clip_skip);
     }
 
     void get_param_tensors(std::map<std::string, struct ggml_tensor*>& tensors) {
@@ -780,7 +762,6 @@ struct SD3CLIPEmbedder : public Conditioner {
                                          std::vector<std::pair<std::vector<int>, std::vector<float>>> token_and_weights,
                                          int clip_skip,
                                          bool zero_out_masked = false) {
-        set_clip_skip(clip_skip);
         auto& clip_l_tokens  = token_and_weights[0].first;
         auto& clip_l_weights = token_and_weights[0].second;
         auto& clip_g_tokens  = token_and_weights[1].first;
@@ -788,6 +769,10 @@ struct SD3CLIPEmbedder : public Conditioner {
         auto& clip_g_weights = token_and_weights[1].second;
         auto& t5_tokens      = token_and_weights[2].first;
         auto& t5_weights     = token_and_weights[2].second;
 
+        if (clip_skip <= 0) {
+            clip_skip = 2;
+        }
+
         int64_t t0 = ggml_time_ms();
         struct ggml_tensor* hidden_states = NULL;       // [N, n_token*2, 4096]
         struct ggml_tensor* chunk_hidden_states = NULL; // [n_token*2, 4096]
@@ -818,6 +803,7 @@ struct SD3CLIPEmbedder : public Conditioner {
                                NULL,
                                max_token_idx,
                                false,
+                               clip_skip,
                                &chunk_hidden_states_l,
                                work_ctx);
             {
@@ -845,6 +831,7 @@ struct SD3CLIPEmbedder : public Conditioner {
                                    NULL,
                                    max_token_idx,
                                    true,
+                                   clip_skip,
                                    &pooled_l,
                                    work_ctx);
             }
@@ -866,6 +853,7 @@ struct SD3CLIPEmbedder : public Conditioner {
                                NULL,
                                max_token_idx,
                                false,
+                               clip_skip,
                                &chunk_hidden_states_g,
                                work_ctx);
@@ -894,6 +882,7 @@ struct SD3CLIPEmbedder : public Conditioner {
                                    NULL,
                                    max_token_idx,
                                    true,
+                                   clip_skip,
                                    &pooled_g,
                                    work_ctx);
             }
@@ -1017,18 +1006,9 @@ struct FluxCLIPEmbedder : public Conditioner {
     FluxCLIPEmbedder(ggml_backend_t backend,
                      bool offload_params_to_cpu,
-                     const String2GGMLType& tensor_types = {},
-                     int clip_skip = -1) {
+                     const String2GGMLType& tensor_types = {}) {
         clip_l = std::make_shared<CLIPTextModelRunner>(backend, offload_params_to_cpu, tensor_types, "text_encoders.clip_l.transformer.text_model", OPENAI_CLIP_VIT_L_14, true);
         t5     = std::make_shared<T5Runner>(backend, offload_params_to_cpu, tensor_types, "text_encoders.t5xxl.transformer");
-        set_clip_skip(clip_skip);
-    }
-
-    void set_clip_skip(int clip_skip) {
-        if (clip_skip <= 0) {
-            clip_skip = 2;
-        }
-        clip_l->set_clip_skip(clip_skip);
     }
 
     void get_param_tensors(std::map<std::string, struct ggml_tensor*>& tensors) {
@@ -1109,12 +1089,15 @@ struct FluxCLIPEmbedder : public Conditioner {
                                          std::vector<std::pair<std::vector<int>, std::vector<float>>> token_and_weights,
                                          int clip_skip,
                                          bool zero_out_masked = false) {
-        set_clip_skip(clip_skip);
         auto& clip_l_tokens  = token_and_weights[0].first;
         auto& clip_l_weights = token_and_weights[0].second;
         auto& t5_tokens      = token_and_weights[1].first;
         auto& t5_weights     = token_and_weights[1].second;
 
+        if (clip_skip <= 0) {
+            clip_skip = 2;
+        }
+
         int64_t t0 = ggml_time_ms();
         struct ggml_tensor* hidden_states = NULL;       // [N, n_token, 4096]
         struct ggml_tensor* chunk_hidden_states = NULL; // [n_token, 4096]
@@ -1143,6 +1126,7 @@ struct FluxCLIPEmbedder : public Conditioner {
                                NULL,
                                max_token_idx,
                                true,
+                               clip_skip,
                                &pooled,
                                work_ctx);
             }
@@ -1241,7 +1225,6 @@ struct T5CLIPEmbedder : public Conditioner {
     T5CLIPEmbedder(ggml_backend_t backend,
                    bool offload_params_to_cpu,
                    const String2GGMLType& tensor_types = {},
-                   int clip_skip = -1,
                    bool use_mask = false,
                    int mask_pad = 1,
                    bool is_umt5 = false)
@@ -1249,9 +1232,6 @@ struct T5CLIPEmbedder : public Conditioner {
         t5 = std::make_shared<T5Runner>(backend, offload_params_to_cpu, tensor_types, "text_encoders.t5xxl.transformer", is_umt5);
     }
 
-    void set_clip_skip(int clip_skip) {
-    }
-
     void get_param_tensors(std::map<std::string, struct ggml_tensor*>& tensors) {
         t5->get_param_tensors(tensors, "text_encoders.t5xxl.transformer");
     }
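
Note (conditioner.hpp): defaulting now happens per call inside each get_learned_condition_common() instead of being baked into the text models at construction. The rule is unchanged: an explicit clip_skip > 0 wins; otherwise SD2/SDXL fall back to 2 (penultimate layer) and SD1 to 1, while SD3 and Flux always default to 2. The same logic, factored into a free function purely for illustration (not part of the patch):

    // Hypothetical helper mirroring the per-call defaulting above.
    static int resolve_clip_skip(int requested, SDVersion version) {
        if (requested > 0) {
            return requested;  // explicit user setting wins
        }
        // SD2/SDXL checkpoints are conventionally read one layer early.
        return (sd_version_is_sd2(version) || sd_version_is_sdxl(version)) ? 2 : 1;
    }

FrozenCLIPEmbedderWithCustomWords also starts forcing the token embedding table to F32 whenever an embeddings directory is configured (force_clip_f32 = embd_dir.size() > 0), presumably so custom embeddings loaded from that directory can be concatenated onto the table without a type mismatch.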
diff --git a/stable-diffusion.cpp b/stable-diffusion.cpp
index db4e07c..5f9dec0 100644
--- a/stable-diffusion.cpp
+++ b/stable-diffusion.cpp
@@ -373,7 +373,6 @@ public:
                 cond_stage_model = std::make_shared<T5CLIPEmbedder>(clip_backend,
                                                                     offload_params_to_cpu,
                                                                     model_loader.tensor_storages_types,
-                                                                    -1,
                                                                     sd_ctx_params->chroma_use_t5_mask,
                                                                     sd_ctx_params->chroma_t5_mask_pad);
             } else {
@@ -391,7 +390,6 @@ public:
                 cond_stage_model = std::make_shared<T5CLIPEmbedder>(clip_backend,
                                                                     offload_params_to_cpu,
                                                                     model_loader.tensor_storages_types,
-                                                                    -1,
                                                                     true,
                                                                     1,
                                                                     true);
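
Note (stable-diffusion.cpp): the T5CLIPEmbedder call sites simply drop the old clip_skip placeholder argument. For completeness, what force_clip_f32 changes back in CLIPEmbeddings::init_params is only the weight type picked for the token embedding table; isolated below as a hypothetical helper carrying the same logic as the patch:

    // With force_clip_f32 set, the token embedding table stays F32 regardless
    // of what the checkpoint stores; otherwise the type recorded for it in the
    // model file (tensor_types) wins.
    static enum ggml_type pick_token_wtype(bool force_clip_f32,
                                           const String2GGMLType& tensor_types,
                                           const std::string& prefix) {
        enum ggml_type token_wtype = GGML_TYPE_F32;
        if (!force_clip_f32) {
            auto it = tensor_types.find(prefix + "token_embedding.weight");
            if (it != tensor_types.end())
                token_wtype = it->second;  // e.g. a quantized type from the file
        }
        return token_wtype;
    }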