feat: add support for Flux Controls and Flex.2 (#692)

2026-06-24 07:06:44 +00:00 · 2025-10-10 18:06:57 +02:00 · 2025-10-10 18:06:57 +02:00 · 11f436c483
commit 11f436c483
parent 35843c77ea
7 changed files with 156 additions and 34 deletions
--- a/examples/cli/main.cpp
+++ b/examples/cli/main.cpp
@ -1246,7 +1246,7 @@ int main(int argc, const char* argv[]) {
        }
    }

-    if (params.control_net_path.size() > 0 && params.control_image_path.size() > 0) {
+    if (params.control_image_path.size() > 0) {
        int width          = 0;
        int height         = 0;
        control_image.data = load_image(params.control_image_path.c_str(), width, height, params.width, params.height);
--- a/flux.hpp
+++ b/flux.hpp
@ -615,6 +615,7 @@ namespace Flux {
        bool guidance_embed         = true;
        bool flash_attn             = true;
        bool is_chroma              = false;
+        SDVersion version           = VERSION_FLUX;
    };

    struct Flux : public GGMLBlock {
@ -720,6 +721,7 @@ namespace Flux {
            auto final_layer = std::dynamic_pointer_cast<LastLayer>(blocks["final_layer"]);

            img = img_in->forward(ctx, img);
+
            struct ggml_tensor* vec;
            struct ggml_tensor* txt_img_mask = NULL;
            if (params.is_chroma) {
@ -849,7 +851,8 @@ namespace Flux {
            auto img            = process_img(ctx, x);
            uint64_t img_tokens = img->ne[1];

-            if (c_concat != NULL) {
+            if (params.version == VERSION_FLUX_FILL) {
+                GGML_ASSERT(c_concat != NULL);
                ggml_tensor* masked = ggml_view_4d(ctx, c_concat, c_concat->ne[0], c_concat->ne[1], C, 1, c_concat->nb[1], c_concat->nb[2], c_concat->nb[3], 0);
                ggml_tensor* mask   = ggml_view_4d(ctx, c_concat, c_concat->ne[0], c_concat->ne[1], 8 * 8, 1, c_concat->nb[1], c_concat->nb[2], c_concat->nb[3], c_concat->nb[2] * C);

@ -857,6 +860,27 @@ namespace Flux {
                mask   = process_img(ctx, mask);

                img = ggml_concat(ctx, img, ggml_concat(ctx, masked, mask, 0), 0);
+            } else if (params.version == VERSION_FLEX_2) {
+                GGML_ASSERT(c_concat != NULL);
+                ggml_tensor* masked  = ggml_view_4d(ctx, c_concat, c_concat->ne[0], c_concat->ne[1], C, 1, c_concat->nb[1], c_concat->nb[2], c_concat->nb[3], 0);
+                ggml_tensor* mask    = ggml_view_4d(ctx, c_concat, c_concat->ne[0], c_concat->ne[1], 1, 1, c_concat->nb[1], c_concat->nb[2], c_concat->nb[3], c_concat->nb[2] * C);
+                ggml_tensor* control = ggml_view_4d(ctx, c_concat, c_concat->ne[0], c_concat->ne[1], C, 1, c_concat->nb[1], c_concat->nb[2], c_concat->nb[3], c_concat->nb[2] * (C + 1));
+
+                masked  = ggml_pad(ctx, masked, pad_w, pad_h, 0, 0);
+                mask    = ggml_pad(ctx, mask, pad_w, pad_h, 0, 0);
+                control = ggml_pad(ctx, control, pad_w, pad_h, 0, 0);
+
+                masked  = patchify(ctx, masked, patch_size);
+                mask    = patchify(ctx, mask, patch_size);
+                control = patchify(ctx, control, patch_size);
+
+                img = ggml_concat(ctx, img, ggml_concat(ctx, ggml_concat(ctx, masked, mask, 0), control, 0), 0);
+            } else if (params.version == VERSION_FLUX_CONTROLS) {
+                GGML_ASSERT(c_concat != NULL);
+
+                ggml_tensor* control = ggml_pad(ctx, c_concat, pad_w, pad_h, 0, 0);
+                control              = patchify(ctx, control, patch_size);
+                img                  = ggml_concat(ctx, img, control, 0);
            }

            if (ref_latents.size() > 0) {
@ -867,6 +891,7 @@ namespace Flux {
            }

            auto out = forward_orig(ctx, backend, img, context, timestep, y, guidance, pe, mod_index_arange, skip_layers);  // [N, num_tokens, C * patch_size * patch_size]
+
            if (out->ne[1] > img_tokens) {
                out = ggml_cont(ctx, ggml_permute(ctx, out, 0, 2, 1, 3));  // [num_tokens, N, C * patch_size * patch_size]
                out = ggml_view_3d(ctx, out, out->ne[0], out->ne[1], img_tokens, out->nb[1], out->nb[2], 0);
@ -896,13 +921,18 @@ namespace Flux {
                   SDVersion version                   = VERSION_FLUX,
                   bool flash_attn                     = false,
                   bool use_mask                       = false)
-            : GGMLRunner(backend, offload_params_to_cpu), use_mask(use_mask) {
+            : GGMLRunner(backend, offload_params_to_cpu), version(version), use_mask(use_mask) {
+            flux_params.version             = version;
            flux_params.flash_attn          = flash_attn;
            flux_params.guidance_embed      = false;
            flux_params.depth               = 0;
            flux_params.depth_single_blocks = 0;
            if (version == VERSION_FLUX_FILL) {
                flux_params.in_channels = 384;
+            } else if (version == VERSION_FLUX_CONTROLS) {
+                flux_params.in_channels = 128;
+            } else if (version == VERSION_FLEX_2) {
+                flux_params.in_channels = 196;
            }
            for (auto pair : tensor_types) {
                std::string tensor_name = pair.first;
--- a/ggml_extend.hpp
+++ b/ggml_extend.hpp
@ -428,18 +428,24 @@ __STATIC_INLINE__ void sd_image_to_tensor(sd_image_t image,

 __STATIC_INLINE__ void sd_apply_mask(struct ggml_tensor* image_data,
                                     struct ggml_tensor* mask,
-                                     struct ggml_tensor* output) {
+                                     struct ggml_tensor* output,
+                                     float masked_value = 0.5f) {
    int64_t width    = output->ne[0];
    int64_t height   = output->ne[1];
    int64_t channels = output->ne[2];
+    float rescale_mx = mask->ne[0] / output->ne[0];
+    float rescale_my = mask->ne[1] / output->ne[1];
    GGML_ASSERT(output->type == GGML_TYPE_F32);
    for (int ix = 0; ix < width; ix++) {
        for (int iy = 0; iy < height; iy++) {
-            float m = ggml_tensor_get_f32(mask, ix, iy);
+            int mx  = (int)(ix * rescale_mx);
+            int my  = (int)(iy * rescale_my);
+            float m = ggml_tensor_get_f32(mask, mx, my);
            m       = round(m);  // inpaint models need binary masks
-            ggml_tensor_set_f32(mask, m, ix, iy);
+            ggml_tensor_set_f32(mask, m, mx, my);
            for (int k = 0; k < channels; k++) {
-                float value = (1 - m) * (ggml_tensor_get_f32(image_data, ix, iy, k) - .5) + .5;
+                float value = ggml_tensor_get_f32(image_data, ix, iy, k);
+                value       = (1 - m) * (value - masked_value) + masked_value;
                ggml_tensor_set_f32(output, value, ix, iy, k);
            }
        }
--- a/model.cpp
+++ b/model.cpp
@ -1803,10 +1803,15 @@ SDVersion ModelLoader::get_sd_version() {
    }

    if (is_flux) {
-        is_inpaint = input_block_weight.ne[0] == 384;
-        if (is_inpaint) {
+        if (input_block_weight.ne[0] == 384) {
            return VERSION_FLUX_FILL;
        }
+        if (input_block_weight.ne[0] == 128) {
+            return VERSION_FLUX_CONTROLS;
+        }
+        if (input_block_weight.ne[0] == 196) {
+            return VERSION_FLEX_2;
+        }
        return VERSION_FLUX;
    }

--- a/model.h
+++ b/model.h
@ -31,6 +31,8 @@ enum SDVersion {
    VERSION_SD3,
    VERSION_FLUX,
    VERSION_FLUX_FILL,
+    VERSION_FLUX_CONTROLS,
+    VERSION_FLEX_2,
    VERSION_WAN2,
    VERSION_WAN2_2_I2V,
    VERSION_WAN2_2_TI2V,
@ -66,7 +68,7 @@ static inline bool sd_version_is_sd3(SDVersion version) {
 }

 static inline bool sd_version_is_flux(SDVersion version) {
-    if (version == VERSION_FLUX || version == VERSION_FLUX_FILL) {
+    if (version == VERSION_FLUX || version == VERSION_FLUX_FILL || version == VERSION_FLUX_CONTROLS || version == VERSION_FLEX_2) {
        return true;
    }
    return false;
@ -80,7 +82,7 @@ static inline bool sd_version_is_wan(SDVersion version) {
 }

 static inline bool sd_version_is_inpaint(SDVersion version) {
-    if (version == VERSION_SD1_INPAINT || version == VERSION_SD2_INPAINT || version == VERSION_SDXL_INPAINT || version == VERSION_FLUX_FILL) {
+    if (version == VERSION_SD1_INPAINT || version == VERSION_SD2_INPAINT || version == VERSION_SDXL_INPAINT || version == VERSION_FLUX_FILL || version == VERSION_FLEX_2) {
        return true;
    }
    return false;
@ -97,8 +99,12 @@ static inline bool sd_version_is_unet_edit(SDVersion version) {
    return version == VERSION_SD1_PIX2PIX || version == VERSION_SDXL_PIX2PIX;
 }

+static inline bool sd_version_is_control(SDVersion version) {  // predicate: is `version` one of the control-image Flux-family variants?
+    return version == VERSION_FLUX_CONTROLS || version == VERSION_FLEX_2;  // only these two enum values qualify; every other version yields false
+}
+
 static bool sd_version_is_inpaint_or_unet_edit(SDVersion version) {  // NOTE: despite the name, the updated body is also true for control versions
-    return sd_version_is_unet_edit(version) || sd_version_is_inpaint(version);  // previous body: unet-edit or inpaint versions only
+    return sd_version_is_unet_edit(version) || sd_version_is_inpaint(version) || sd_version_is_control(version);  // new body: additionally accepts versions matched by sd_version_is_control
 }

 enum PMVersion {
--- a/stable-diffusion.cpp
+++ b/stable-diffusion.cpp
@ -37,6 +37,8 @@ const char* model_version_to_str[] = {
    "SD3.x",
    "Flux",
    "Flux Fill",
+    "Flux Control",
+    "Flex.2",
    "Wan 2.x",
    "Wan 2.2 I2V",
    "Wan 2.2 TI2V",
@ -102,7 +104,7 @@ public:
    std::shared_ptr<DiffusionModel> high_noise_diffusion_model;
    std::shared_ptr<VAE> first_stage_model;
    std::shared_ptr<TinyAutoEncoder> tae_first_stage;
-    std::shared_ptr<ControlNet> control_net;
+    std::shared_ptr<ControlNet> control_net = NULL;
    std::shared_ptr<PhotoMakerIDEncoder> pmid_model;
    std::shared_ptr<LoraModel> pmid_lora;
    std::shared_ptr<PhotoMakerIDEmbed> pmid_id_embeds;
@ -320,6 +322,11 @@ public:
            scale_factor = 1.0f;
        }

+        if (sd_version_is_control(version)) {
+            // Might need vae encode for control cond
+            vae_decode_only = false;
+        }
+
        bool clip_on_cpu = sd_ctx_params->keep_clip_on_cpu;

        {
@ -1147,7 +1154,7 @@ public:

            std::vector<struct ggml_tensor*> controls;

-            if (control_hint != NULL) {
+            if (control_hint != NULL && control_net != NULL) {
                control_net->compute(n_threads, noised_input, control_hint, timesteps, cond.c_crossattn, cond.c_vector);
                controls = control_net->controls;
                // print_ggml_tensor(controls[12]);
@ -1185,7 +1192,7 @@ public:
            float* negative_data = NULL;
            if (has_unconditioned) {
                // uncond
-                if (control_hint != NULL) {
+                if (control_hint != NULL && control_net != NULL) {
                    control_net->compute(n_threads, noised_input, control_hint, timesteps, uncond.c_crossattn, uncond.c_vector);
                    controls = control_net->controls;
                }
@ -2070,10 +2077,24 @@ sd_image_t* generate_image_internal(sd_ctx_t* sd_ctx,
    int W = width / 8;
    int H = height / 8;
    LOG_INFO("sampling using %s method", sampling_methods_str[sample_method]);
+
+    struct ggml_tensor* control_latent = NULL;
+    if (sd_version_is_control(sd_ctx->sd->version) && image_hint != NULL) {
+        if (!sd_ctx->sd->use_tiny_autoencoder) {
+            struct ggml_tensor* control_moments = sd_ctx->sd->encode_first_stage(work_ctx, image_hint);
+            control_latent                      = sd_ctx->sd->get_first_stage_encoding(work_ctx, control_moments);
+        } else {
+            control_latent = sd_ctx->sd->encode_first_stage(work_ctx, image_hint);
+        }
+        ggml_tensor_scale(control_latent, control_strength);
+    }
+
    if (sd_version_is_inpaint(sd_ctx->sd->version)) {
        int64_t mask_channels = 1;
        if (sd_ctx->sd->version == VERSION_FLUX_FILL) {
            mask_channels = 8 * 8;  // flatten the whole mask
+        } else if (sd_ctx->sd->version == VERSION_FLEX_2) {
+            mask_channels = 1 + init_latent->ne[2];
        }
        auto empty_latent = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, init_latent->ne[0], init_latent->ne[1], mask_channels + init_latent->ne[2], 1);
        // no mask, set the whole image as masked
@ -2087,6 +2108,11 @@ sd_image_t* generate_image_internal(sd_ctx_t* sd_ctx,
                    for (int64_t c = init_latent->ne[2]; c < empty_latent->ne[2]; c++) {
                        ggml_tensor_set_f32(empty_latent, 1, x, y, c);
                    }
+                } else if (sd_ctx->sd->version == VERSION_FLEX_2) {
+                    for (int64_t c = 0; c < empty_latent->ne[2]; c++) {
+                        // 0x16,1x1,0x16
+                        ggml_tensor_set_f32(empty_latent, c == init_latent->ne[2], x, y, c);
+                    }
                } else {
                    ggml_tensor_set_f32(empty_latent, 1, x, y, 0);
                    for (int64_t c = 1; c < empty_latent->ne[2]; c++) {
@ -2095,7 +2121,28 @@ sd_image_t* generate_image_internal(sd_ctx_t* sd_ctx,
                }
            }
        }
-        if (concat_latent == NULL) {
+
+        if (sd_ctx->sd->version == VERSION_FLEX_2 && control_latent != NULL && sd_ctx->sd->control_net == NULL) {
+            bool no_inpaint = concat_latent == NULL;
+            if (no_inpaint) {
+                concat_latent = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, init_latent->ne[0], init_latent->ne[1], mask_channels + init_latent->ne[2], 1);
+            }
+            // fill in the control image here
+            for (int64_t x = 0; x < control_latent->ne[0]; x++) {
+                for (int64_t y = 0; y < control_latent->ne[1]; y++) {
+                    if (no_inpaint) {
+                        for (int64_t c = 0; c < concat_latent->ne[2] - control_latent->ne[2]; c++) {
+                            // 0x16,1x1,0x16
+                            ggml_tensor_set_f32(concat_latent, c == init_latent->ne[2], x, y, c);
+                        }
+                    }
+                    for (int64_t c = 0; c < control_latent->ne[2]; c++) {
+                        float v = ggml_tensor_get_f32(control_latent, x, y, c);
+                        ggml_tensor_set_f32(concat_latent, v, x, y, concat_latent->ne[2] - control_latent->ne[2] + c);
+                    }
+                }
+            }
+        } else if (concat_latent == NULL) {
            concat_latent = empty_latent;
        }
        cond.c_concat   = concat_latent;
@ -2105,10 +2152,20 @@ sd_image_t* generate_image_internal(sd_ctx_t* sd_ctx,
        auto empty_latent = ggml_dup_tensor(work_ctx, init_latent);
        ggml_set_f32(empty_latent, 0);
        uncond.c_concat = empty_latent;
-        if (concat_latent == NULL) {
-            concat_latent = empty_latent;
-        }
        cond.c_concat   = ref_latents[0];
+        if (cond.c_concat == NULL) {
+            cond.c_concat = empty_latent;
+        }
+    } else if (sd_version_is_control(sd_ctx->sd->version)) {
+        auto empty_latent = ggml_dup_tensor(work_ctx, init_latent);
+        ggml_set_f32(empty_latent, 0);
+        uncond.c_concat = empty_latent;
+        if (sd_ctx->sd->control_net == NULL) {
+            cond.c_concat = control_latent;
+        }
+        if (cond.c_concat == NULL) {
+            cond.c_concat = empty_latent;
+        }
    }
    SDCondition img_cond;
    if (uncond.c_crossattn != NULL &&
@ -2291,6 +2348,7 @@ sd_image_t* generate_image(sd_ctx_t* sd_ctx, const sd_img_gen_params_t* sd_img_g
    std::vector<float> sigmas = sd_ctx->sd->denoiser->get_sigmas(sample_steps);

    ggml_tensor* init_latent   = NULL;
+    ggml_tensor* init_moments  = NULL;
    ggml_tensor* concat_latent = NULL;
    ggml_tensor* denoise_mask  = NULL;
    if (sd_img_gen_params->init_image.data) {
@ -2310,20 +2368,36 @@ sd_image_t* generate_image(sd_ctx_t* sd_ctx, const sd_img_gen_params_t* sd_img_g
        sd_image_to_tensor(sd_img_gen_params->mask_image, mask_img);
        sd_image_to_tensor(sd_img_gen_params->init_image, init_img);

+        if (!sd_ctx->sd->use_tiny_autoencoder) {
+            init_moments = sd_ctx->sd->encode_first_stage(work_ctx, init_img);
+            init_latent  = sd_ctx->sd->get_first_stage_encoding(work_ctx, init_moments);
+        } else {
+            init_latent = sd_ctx->sd->encode_first_stage(work_ctx, init_img);
+        }
+
        if (sd_version_is_inpaint(sd_ctx->sd->version)) {
            int64_t mask_channels = 1;
            if (sd_ctx->sd->version == VERSION_FLUX_FILL) {
                mask_channels = 8 * 8;  // flatten the whole mask
+            } else if (sd_ctx->sd->version == VERSION_FLEX_2) {
+                mask_channels = 1 + init_latent->ne[2];
            }
+            ggml_tensor* masked_latent = NULL;
+            if (sd_ctx->sd->version != VERSION_FLEX_2) {
+                // most inpaint models mask before vae
                ggml_tensor* masked_img = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, width, height, 3, 1);
                sd_apply_mask(init_img, mask_img, masked_img);
-            ggml_tensor* masked_latent = NULL;
                if (!sd_ctx->sd->use_tiny_autoencoder) {
                    ggml_tensor* moments = sd_ctx->sd->encode_first_stage(work_ctx, masked_img);
                    masked_latent        = sd_ctx->sd->get_first_stage_encoding(work_ctx, moments);
                } else {
                    masked_latent = sd_ctx->sd->encode_first_stage(work_ctx, masked_img);
                }
+            } else {
+                // mask after vae
+                masked_latent = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, init_latent->ne[0], init_latent->ne[1], init_latent->ne[2], 1);
+                sd_apply_mask(init_latent, mask_img, masked_latent, 0.);
+            }
            concat_latent = ggml_new_tensor_4d(work_ctx,
                                               GGML_TYPE_F32,
                                               masked_latent->ne[0],
@ -2348,12 +2422,18 @@ sd_image_t* generate_image(sd_ctx_t* sd_ctx, const sd_img_gen_params_t* sd_img_g
                                ggml_tensor_set_f32(concat_latent, m, ix, iy, masked_latent->ne[2] + x * 8 + y);
                            }
                        }
-                    } else {
+                    } else if (sd_ctx->sd->version == VERSION_FLEX_2) {
                        float m = ggml_tensor_get_f32(mask_img, mx, my);
-                        ggml_tensor_set_f32(concat_latent, m, ix, iy, 0);
+                        // masked image
                        for (int k = 0; k < masked_latent->ne[2]; k++) {
                            float v = ggml_tensor_get_f32(masked_latent, ix, iy, k);
-                            ggml_tensor_set_f32(concat_latent, v, ix, iy, k + mask_channels);
+                            ggml_tensor_set_f32(concat_latent, v, ix, iy, k);
+                        }
+                        // downsampled mask
+                        ggml_tensor_set_f32(concat_latent, m, ix, iy, masked_latent->ne[2]);
+                        // control (todo: support this)
+                        for (int k = 0; k < masked_latent->ne[2]; k++) {
+                            ggml_tensor_set_f32(concat_latent, 0, ix, iy, masked_latent->ne[2] + 1 + k);
                        }
                    }
                }
@ -2373,12 +2453,6 @@ sd_image_t* generate_image(sd_ctx_t* sd_ctx, const sd_img_gen_params_t* sd_img_g
            }
        }

-        if (!sd_ctx->sd->use_tiny_autoencoder) {
-            ggml_tensor* moments = sd_ctx->sd->encode_first_stage(work_ctx, init_img);
-            init_latent          = sd_ctx->sd->get_first_stage_encoding(work_ctx, moments);
-        } else {
-            init_latent = sd_ctx->sd->encode_first_stage(work_ctx, init_img);
-        }
    } else {
        LOG_INFO("TXT2IMG");
        if (sd_version_is_inpaint(sd_ctx->sd->version)) {
--- a/vae.hpp
+++ b/vae.hpp
@ -583,6 +583,7 @@ struct AutoEncoderKL : public VAE {
                 bool decode_graph,
                 struct ggml_tensor** output,
                 struct ggml_context* output_ctx = NULL) {
+        GGML_ASSERT(!decode_only || decode_graph);
        auto get_graph = [&]() -> struct ggml_cgraph* {
            return build_graph(z, decode_graph);
        };