2026-06-25 15:46:40 +00:00
5 changed files with 33 additions and 97 deletions
--- a/conditioner.hpp
+++ b/conditioner.hpp
@ -1708,9 +1708,6 @@ struct LLMEmbedder : public Conditioner {
        int prompt_template_encode_start_idx = 34;
        int max_length                       = 0;
        std::set<int> out_layers;
-        std::vector<int> tokens;
-        std::vector<float> weights;
-        std::vector<float> mask;
        if (llm->enable_vision && conditioner_params.ref_images.size() > 0) {
            LOG_INFO("QwenImageEditPlusPipeline");
            prompt_template_encode_start_idx = 64;
@ -1798,7 +1795,6 @@ struct LLMEmbedder : public Conditioner {
            prompt += "<|im_end|>\n<|im_start|>assistant\n";
        } else if (version == VERSION_FLUX2_KLEIN) {
            prompt_template_encode_start_idx = 0;
-            max_length                       = 512;
            out_layers                       = {9, 18, 27};

            prompt = "<|im_start|>user\n";
@ -1808,16 +1804,6 @@ struct LLMEmbedder : public Conditioner {
            prompt_attn_range.second = static_cast<int>(prompt.size());

            prompt += "<|im_end|>\n<|im_start|>assistant\n<think>\n\n</think>\n\n";
-
-            auto tokens_and_weights = tokenize(prompt, prompt_attn_range, 0, false);
-            tokens                  = std::get<0>(tokens_and_weights);
-            weights                 = std::get<1>(tokens_and_weights);
-
-            mask.insert(mask.end(), tokens.size(), 1.f);
-            if (tokens.size() < max_length) {
-                mask.insert(mask.end(), max_length - tokens.size(), 0.f);
-                tokenizer->pad_tokens(tokens, weights, max_length, true);
-            }
        } else if (version == VERSION_OVIS_IMAGE) {
            prompt_template_encode_start_idx = 28;
            max_length                       = prompt_template_encode_start_idx + 256;
@ -1841,34 +1827,17 @@ struct LLMEmbedder : public Conditioner {
            prompt += "<|im_end|>\n<|im_start|>assistant\n";
        }

-        if (tokens.empty()) {
        auto tokens_and_weights = tokenize(prompt, prompt_attn_range, max_length, max_length > 0);
-            tokens                  = std::get<0>(tokens_and_weights);
-            weights                 = std::get<1>(tokens_and_weights);
-        }
+        auto& tokens            = std::get<0>(tokens_and_weights);
+        auto& weights           = std::get<1>(tokens_and_weights);

        int64_t t0                        = ggml_time_ms();
        struct ggml_tensor* hidden_states = nullptr;  // [N, n_token, 3584]

        auto input_ids = vector_to_ggml_tensor_i32(work_ctx, tokens);

-        ggml_tensor* attention_mask = nullptr;
-        if (!mask.empty()) {
-            attention_mask = ggml_new_tensor_2d(work_ctx, GGML_TYPE_F32, mask.size(), mask.size());
-            ggml_ext_tensor_iter(attention_mask, [&](ggml_tensor* attention_mask, int64_t i0, int64_t i1, int64_t i2, int64_t i3) {
-                float value = 0.f;
-                if (mask[i0] == 0.f) {
-                    value = -INFINITY;
-                } else if (i0 > i1) {
-                    value = -INFINITY;
-                }
-                ggml_ext_tensor_set_f32(attention_mask, value, i0, i1, i2, i3);
-            });
-        }
-
        llm->compute(n_threads,
                     input_ids,
-                     attention_mask,
                     image_embeds,
                     out_layers,
                     &hidden_states,
@ -1892,7 +1861,7 @@ struct LLMEmbedder : public Conditioner {
        GGML_ASSERT(hidden_states->ne[1] > prompt_template_encode_start_idx);

        int64_t min_length = 0;
-        if (version == VERSION_FLUX2) {
+        if (sd_version_is_flux2(version)) {
            min_length = 512;
        }

--- a/flux.hpp
+++ b/flux.hpp
@ -1288,8 +1288,18 @@ namespace Flux {
            } else if (version == VERSION_OVIS_IMAGE) {
                flux_params.semantic_txt_norm = true;
                flux_params.use_yak_mlp       = true;
+                flux_params.context_in_dim    = 2048;
                flux_params.vec_in_dim        = 0;
            } else if (sd_version_is_flux2(version)) {
+                if (version == VERSION_FLUX2_KLEIN) {
+                    flux_params.context_in_dim   = 7680;
+                    flux_params.hidden_size      = 3072;
+                    flux_params.num_heads        = 24;
+                } else {
+                    flux_params.context_in_dim   = 15360;
+                    flux_params.hidden_size      = 6144;
+                    flux_params.num_heads        = 48;
+                }
                flux_params.in_channels      = 128;
                flux_params.patch_size       = 1;
                flux_params.out_channels     = 128;
@ -1303,12 +1313,12 @@ namespace Flux {
                flux_params.ref_index_scale  = 10.f;
                flux_params.use_mlp_silu_act = true;
            }
-            int64_t head_dim = 0;
            for (auto pair : tensor_storage_map) {
                std::string tensor_name = pair.first;
                if (!starts_with(tensor_name, prefix))
                    continue;
                if (tensor_name.find("guidance_in.in_layer.weight") != std::string::npos) {
+                    // not schnell
                    flux_params.guidance_embed = true;
                }
                if (tensor_name.find("__x0__") != std::string::npos) {
@ -1340,30 +1350,13 @@ namespace Flux {
                        flux_params.depth_single_blocks = block_depth + 1;
                    }
                }
-                if (ends_with(tensor_name, "txt_in.weight")) {
-                    flux_params.context_in_dim = pair.second.ne[0];
-                    flux_params.hidden_size    = pair.second.ne[1];
-                }
-                if (ends_with(tensor_name, "single_blocks.0.norm.key_norm.scale")) {
-                    head_dim = pair.second.ne[0];
-                }
-                if (ends_with(tensor_name, "double_blocks.0.txt_attn.norm.key_norm.scale")) {
-                    head_dim = pair.second.ne[0];
-                }
            }

-            flux_params.num_heads = static_cast<int>(flux_params.hidden_size / head_dim);
-
-            LOG_INFO("flux: depth = %d, depth_single_blocks = %d, guidance_embed = %s, context_in_dim = %" PRId64
-                     ", hidden_size = %" PRId64 ", num_heads = %d",
-                     flux_params.depth,
-                     flux_params.depth_single_blocks,
-                     flux_params.guidance_embed ? "true" : "false",
-                     flux_params.context_in_dim,
-                     flux_params.hidden_size,
-                     flux_params.num_heads);
+            LOG_INFO("Flux blocks: %d double, %d single", flux_params.depth, flux_params.depth_single_blocks);
            if (flux_params.is_chroma) {
                LOG_INFO("Using pruned modulation (Chroma)");
+            } else if (!flux_params.guidance_embed) {
+                LOG_INFO("Flux guidance is disabled (Schnell mode)");
            }

            flux = Flux(flux_params);
--- a/ggml_extend.hpp
+++ b/ggml_extend.hpp
@ -1348,7 +1348,6 @@ __STATIC_INLINE__ struct ggml_tensor* ggml_ext_attention_ext(struct ggml_context
        v = ggml_reshape_3d(ctx, v, L_k, d_head, n_kv_head * N);   // [N * n_kv_head, d_head, L_k]

        auto kq = ggml_mul_mat(ctx, k, q);  // [N * n_head, L_q, L_k]
-        ggml_mul_mat_set_prec(kq, GGML_PREC_F32);
        kq      = ggml_scale_inplace(ctx, kq, scale);
        if (mask) {
            kq = ggml_add_inplace(ctx, kq, mask);
--- a/llm.hpp
+++ b/llm.hpp
@ -837,8 +837,7 @@ namespace LLM {

        struct ggml_tensor* forward(GGMLRunnerContext* ctx,
                                    struct ggml_tensor* x,
-                                    struct ggml_tensor* input_pos,
-                                    struct ggml_tensor* attention_mask = nullptr) {
+                                    struct ggml_tensor* input_pos) {
            // x: [N, n_token, hidden_size]
            int64_t n_token = x->ne[1];
            int64_t N       = x->ne[2];
@ -881,7 +880,7 @@ namespace LLM {
            k = ggml_cont(ctx->ggml_ctx, ggml_ext_torch_permute(ctx->ggml_ctx, k, 0, 2, 1, 3));  // [N, num_kv_heads, n_token, head_dim]
            k = ggml_reshape_3d(ctx->ggml_ctx, k, k->ne[0], k->ne[1], k->ne[2] * k->ne[3]);      // [N*num_kv_heads, n_token, head_dim]

-            x = ggml_ext_attention_ext(ctx->ggml_ctx, ctx->backend, q, k, v, num_heads, attention_mask, false, true, false);  // [N, n_token, hidden_size]
+            x = ggml_ext_attention_ext(ctx->ggml_ctx, ctx->backend, q, k, v, num_heads, nullptr, true, true, false);  // [N, n_token, hidden_size]

            x = out_proj->forward(ctx, x);  // [N, n_token, hidden_size]
            return x;
@ -899,8 +898,7 @@ namespace LLM {

        struct ggml_tensor* forward(GGMLRunnerContext* ctx,
                                    struct ggml_tensor* x,
-                                    struct ggml_tensor* input_pos,
-                                    struct ggml_tensor* attention_mask = nullptr) {
+                                    struct ggml_tensor* input_pos) {
            // x: [N, n_token, hidden_size]
            auto self_attn                = std::dynamic_pointer_cast<Attention>(blocks["self_attn"]);
            auto mlp                      = std::dynamic_pointer_cast<MLP>(blocks["mlp"]);
@ -909,7 +907,7 @@ namespace LLM {

            auto residual = x;
            x             = input_layernorm->forward(ctx, x);
-            x             = self_attn->forward(ctx, x, input_pos, attention_mask);
+            x             = self_attn->forward(ctx, x, input_pos);
            x             = ggml_add_inplace(ctx->ggml_ctx, x, residual);

            residual = x;
@ -938,7 +936,6 @@ namespace LLM {
        struct ggml_tensor* forward(GGMLRunnerContext* ctx,
                                    struct ggml_tensor* input_ids,
                                    struct ggml_tensor* input_pos,
-                                    struct ggml_tensor* attention_mask,
                                    std::vector<std::pair<int, ggml_tensor*>> image_embeds,
                                    std::set<int> out_layers) {
            // input_ids: [N, n_token]
@ -993,7 +990,7 @@ namespace LLM {
            for (int i = 0; i < num_layers; i++) {
                auto block = std::dynamic_pointer_cast<TransformerBlock>(blocks["layers." + std::to_string(i)]);

-                x = block->forward(ctx, x, input_pos, attention_mask);
+                x = block->forward(ctx, x, input_pos);
                if (out_layers.find(i + 1) != out_layers.end()) {
                    intermediate_outputs.push_back(x);
                }
@ -1039,13 +1036,12 @@ namespace LLM {
        struct ggml_tensor* forward(GGMLRunnerContext* ctx,
                                    struct ggml_tensor* input_ids,
                                    struct ggml_tensor* input_pos,
-                                    struct ggml_tensor* attention_mask,
                                    std::vector<std::pair<int, ggml_tensor*>> image_embeds,
                                    std::set<int> out_layers) {
            // input_ids: [N, n_token]
            auto model = std::dynamic_pointer_cast<TextModel>(blocks["model"]);

-            auto x = model->forward(ctx, input_ids, input_pos, attention_mask, image_embeds, out_layers);
+            auto x = model->forward(ctx, input_ids, input_pos, image_embeds, out_layers);
            return x;
        }

@ -1067,7 +1063,6 @@ namespace LLM {
        LLM model;

        std::vector<int> input_pos_vec;
-        std::vector<float> attention_mask_vec;
        std::vector<float> window_mask_vec;
        std::vector<int> window_index_vec;
        std::vector<int> window_inverse_index_vec;
@ -1162,10 +1157,9 @@ namespace LLM {
        struct ggml_tensor* forward(GGMLRunnerContext* ctx,
                                    struct ggml_tensor* input_ids,
                                    struct ggml_tensor* input_pos,
-                                    struct ggml_tensor* attention_mask,
                                    std::vector<std::pair<int, ggml_tensor*>> image_embeds,
                                    std::set<int> out_layers) {
-            auto hidden_states = model.forward(ctx, input_ids, input_pos, attention_mask, image_embeds, out_layers);  // [N, n_token, hidden_size]
+            auto hidden_states = model.forward(ctx, input_ids, input_pos, image_embeds, out_layers);  // [N, n_token, hidden_size]
            return hidden_states;
        }

@ -1180,7 +1174,6 @@ namespace LLM {
        }

        struct ggml_cgraph* build_graph(struct ggml_tensor* input_ids,
-                                        struct ggml_tensor* attention_mask,
                                        std::vector<std::pair<int, ggml_tensor*>> image_embeds,
                                        std::set<int> out_layers) {
            struct ggml_cgraph* gf = ggml_new_graph(compute_ctx);
@ -1212,26 +1205,9 @@ namespace LLM {
                                                input_pos_vec.size());
            set_backend_tensor_data(input_pos, input_pos_vec.data());

-            if (attention_mask != nullptr) {
-                attention_mask = to_backend(attention_mask);
-            } else {
-                attention_mask_vec.resize(n_tokens * n_tokens);
-                for (int i0 = 0; i0 < n_tokens; i0++) {
-                    for (int i1 = 0; i1 < n_tokens; i1++) {
-                        float value = 0.f;
-                        if (i0 > i1) {
-                            value = -INFINITY;
-                        }
-                        attention_mask_vec[i1 * n_tokens + i0] = value;
-                    }
-                }
-                attention_mask = ggml_new_tensor_2d(compute_ctx, GGML_TYPE_F32, n_tokens, n_tokens);
-                set_backend_tensor_data(attention_mask, attention_mask_vec.data());
-            }
-
            auto runner_ctx = get_context();

-            struct ggml_tensor* hidden_states = forward(&runner_ctx, input_ids, input_pos, attention_mask, image_embeds, out_layers);
+            struct ggml_tensor* hidden_states = forward(&runner_ctx, input_ids, input_pos, image_embeds, out_layers);

            ggml_build_forward_expand(gf, hidden_states);

@ -1240,13 +1216,12 @@ namespace LLM {

        bool compute(const int n_threads,
                     struct ggml_tensor* input_ids,
-                     struct ggml_tensor* attention_mask,
                     std::vector<std::pair<int, ggml_tensor*>> image_embeds,
                     std::set<int> out_layers,
                     ggml_tensor** output,
                     ggml_context* output_ctx = nullptr) {
            auto get_graph = [&]() -> struct ggml_cgraph* {
-                return build_graph(input_ids, attention_mask, image_embeds, out_layers);
+                return build_graph(input_ids, image_embeds, out_layers);
            };
            return GGMLRunner::compute(get_graph, n_threads, true, output, output_ctx);
        }
@ -1550,7 +1525,7 @@ namespace LLM {
                struct ggml_tensor* out = nullptr;

                int64_t t0 = ggml_time_ms();
-                model.compute(8, input_ids, nullptr, image_embeds, {}, &out, work_ctx);
+                model.compute(8, input_ids, image_embeds, {}, &out, work_ctx);
                int64_t t1 = ggml_time_ms();

                print_ggml_tensor(out);
@ -1590,7 +1565,7 @@ namespace LLM {
                struct ggml_tensor* out = nullptr;

                int64_t t0 = ggml_time_ms();
-                model.compute(8, input_ids, nullptr, {}, {10, 20, 30}, &out, work_ctx);
+                model.compute(8, input_ids, {}, {10, 20, 30}, &out, work_ctx);
                int64_t t1 = ggml_time_ms();

                print_ggml_tensor(out);
@ -1613,7 +1588,7 @@ namespace LLM {
                struct ggml_tensor* out = nullptr;

                int64_t t0 = ggml_time_ms();
-                model.compute(8, input_ids, nullptr, {}, {35}, &out, work_ctx);
+                model.compute(8, input_ids, {}, {35}, &out, work_ctx);
                int64_t t1 = ggml_time_ms();

                print_ggml_tensor(out);
@ -1636,7 +1611,7 @@ namespace LLM {
                struct ggml_tensor* out = nullptr;

                int64_t t0 = ggml_time_ms();
-                model.compute(8, input_ids, nullptr, {}, {}, &out, work_ctx);
+                model.compute(8, input_ids, {}, {}, &out, work_ctx);
                int64_t t1 = ggml_time_ms();

                print_ggml_tensor(out);
--- a/stable-diffusion.cpp
+++ b/stable-diffusion.cpp
@ -48,7 +48,7 @@ const char* model_version_to_str[] = {
    "Wan 2.2 TI2V",
    "Qwen Image",
    "Flux.2",
-    "Flux.2 klein",
+    "Flux.2 klein",
    "Z-Image",
    "Ovis Image",
 };