Compare commits

...

5 Commits

| Author | SHA1 | Message | Date |
|--------|------------|---------|---------------------------|
| leejet | 5c561eab31 | feat: do not convert more flux tensors | 2024-08-25 16:01:36 +08:00 |
| leejet | f5997a1951 | fix: do not force using f32 for some flux layers (this sometimes leads to worse results) | 2024-08-25 14:07:22 +08:00 |
| leejet | 1bdc767aaf | feat: force using f32 for some layers | 2024-08-25 13:53:16 +08:00 |
| leejet | 79c9fe9556 | feat: do not convert some tensors | 2024-08-25 13:37:37 +08:00 |
| leejet | 28a614769a | docs: update docs/flux.md | 2024-08-25 13:11:34 +08:00 |
7 changed files with 79 additions and 41 deletions

Binary image file not shown (468 KiB).

View File

@@ -4,14 +4,17 @@ You can run Flux using stable-diffusion.cpp with a GPU that has 6GB or even 4GB
## Download weights
- Download flux-dev from https://huggingface.co/black-forest-labs/FLUX.1-dev/blob/main/flux1-dev.safetensors
- Download flux-schnell from https://huggingface.co/black-forest-labs/FLUX.1-schnell/blob/main/flux1-schnell.safetensors
- Download flux
    - If you don't want to do the conversion yourself, download the preconverted gguf model from [FLUX.1-dev-gguf](https://huggingface.co/leejet/FLUX.1-dev-gguf) or [FLUX.1-schnell](https://huggingface.co/leejet/FLUX.1-schnell-gguf)
    - Otherwise, download flux-dev from https://huggingface.co/black-forest-labs/FLUX.1-dev/blob/main/flux1-dev.safetensors or flux-schnell from https://huggingface.co/black-forest-labs/FLUX.1-schnell/blob/main/flux1-schnell.safetensors
- Download vae from https://huggingface.co/black-forest-labs/FLUX.1-dev/blob/main/ae.safetensors
- Download clip_l from https://huggingface.co/comfyanonymous/flux_text_encoders/blob/main/clip_l.safetensors
- Download t5xxl from https://huggingface.co/comfyanonymous/flux_text_encoders/blob/main/t5xxl_fp16.safetensors
## Convert flux weights
You can download the preconverted gguf weights from [FLUX.1-dev-gguf](https://huggingface.co/leejet/FLUX.1-dev-gguf) or [FLUX.1-schnell](https://huggingface.co/leejet/FLUX.1-schnell-gguf), so you don't have to do the conversion yourself.
Using fp16 will lead to overflow, but ggml's support for bf16 is not yet fully developed. Therefore, we need to convert flux to gguf format here, which also saves VRAM. For example:
```
.\bin\Release\sd.exe -M convert -m ..\..\ComfyUI\models\unet\flux1-dev.sft -o ..\models\flux1-dev-q8_0.gguf -v --type q8_0
@@ -30,10 +33,10 @@ For example:
Using formats of different precisions will yield results of varying quality.
| Type | q8_0 | q4_0 | q3_k | q2_k |
|---- | ---- |---- |---- |---- |
| **Memory** | 12068.09 MB | 6394.53 MB | 4888.16 MB | 3735.73 MB |
| **Result** | ![](../assets/flux/flux1-dev-q8_0.png) |![](../assets/flux/flux1-dev-q4_0.png) |![](../assets/flux/flux1-dev-q3_k.png) |![](../assets/flux/flux1-dev-q2_k.png)|
| Type | q8_0 | q4_0 | q4_k | q3_k | q2_k |
|---- | ---- |---- |---- |---- |---- |
| **Memory** | 12068.09 MB | 6394.53 MB | 6395.17 MB | 4888.16 MB | 3735.73 MB |
| **Result** | ![](../assets/flux/flux1-dev-q8_0.png) |![](../assets/flux/flux1-dev-q4_0.png) |![](../assets/flux/flux1-dev-q4_k.png) |![](../assets/flux/flux1-dev-q3_k.png) |![](../assets/flux/flux1-dev-q2_k.png)|
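For context, the memory column above roughly tracks ggml's nominal bits-per-weight for each quantization type. Below is a minimal C++ sketch of that arithmetic, assuming roughly 11.9B parameters for the flux1-dev transformer (an assumed figure, not one taken from this page):

```cpp
#include <cstdio>

int main() {
    // Nominal ggml bits-per-weight: block bytes * 8 / weights per block.
    // The parameter count is an assumption; real totals differ slightly
    // because biases, norms and the unconverted tensors stay at higher precision.
    const double n_params = 11.9e9;
    const struct { const char* name; double bpw; } kTypes[] = {
        {"q8_0", 8.5},    // 34-byte blocks of 32 weights
        {"q4_0", 4.5},    // 18-byte blocks of 32 weights
        {"q4_k", 4.5},    // 144-byte super-blocks of 256 weights
        {"q3_k", 3.4375}, // 110-byte super-blocks of 256 weights
        {"q2_k", 2.625},  // 84-byte super-blocks of 256 weights
    };
    for (const auto& t : kTypes) {
        const double mb = n_params * t.bpw / 8.0 / (1024.0 * 1024.0);
        std::printf("%-4s ~%.0f MB\n", t.name, mb);  // e.g. q8_0 prints ~12058 MB
    }
    return 0;
}
```

With these assumptions the q8_0 estimate comes out near 12058 MB, close to the 12068.09 MB in the table; the remaining difference is presumably the tensors that are skipped by the conversion and kept at higher precision.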

View File

@@ -634,13 +634,13 @@ namespace Flux {
int64_t out_channels = params.in_channels;
int64_t pe_dim = params.hidden_size / params.num_heads;
blocks["img_in"] = std::shared_ptr<GGMLBlock>(new Linear(params.in_channels, params.hidden_size));
blocks["img_in"] = std::shared_ptr<GGMLBlock>(new Linear(params.in_channels, params.hidden_size, true));
blocks["time_in"] = std::shared_ptr<GGMLBlock>(new MLPEmbedder(256, params.hidden_size));
blocks["vector_in"] = std::shared_ptr<GGMLBlock>(new MLPEmbedder(params.vec_in_dim, params.hidden_size));
if (params.guidance_embed) {
blocks["guidance_in"] = std::shared_ptr<GGMLBlock>(new MLPEmbedder(256, params.hidden_size));
}
blocks["txt_in"] = std::shared_ptr<GGMLBlock>(new Linear(params.context_in_dim, params.hidden_size));
blocks["txt_in"] = std::shared_ptr<GGMLBlock>(new Linear(params.context_in_dim, params.hidden_size, true));
for (int i = 0; i < params.depth; i++) {
blocks["double_blocks." + std::to_string(i)] = std::shared_ptr<GGMLBlock>(new DoubleStreamBlock(params.hidden_size,

View File

@@ -1187,9 +1187,10 @@ protected:
int64_t in_features;
int64_t out_features;
bool bias;
bool force_f32;
void init_params(struct ggml_context* ctx, ggml_type wtype) {
if (in_features % ggml_blck_size(wtype) != 0) {
if (in_features % ggml_blck_size(wtype) != 0 || force_f32) {
wtype = GGML_TYPE_F32;
}
params["weight"] = ggml_new_tensor_2d(ctx, wtype, in_features, out_features);
@@ -1201,10 +1202,12 @@ protected:
public:
Linear(int64_t in_features,
int64_t out_features,
bool bias = true)
bool bias = true,
bool force_f32 = false)
: in_features(in_features),
out_features(out_features),
bias(bias) {}
bias(bias),
force_f32(force_f32) {}
struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* x) {
struct ggml_tensor* w = params["weight"];
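For illustration, here is a minimal sketch of how a block opts a layer out of quantization with the new `force_f32` parameter added above. `GGMLBlock` and `Linear` are the classes from this file; `ExampleEmbedder` is a hypothetical block, not part of the patch:

```cpp
#include <cstdint>
#include <memory>
#include "ggml_extend.hpp"  // GGMLBlock, Linear (repository header)

// Hypothetical block showing the new Linear(in, out, bias, force_f32) ctor.
// With force_f32 = true, init_params() allocates the weight as GGML_TYPE_F32
// even if the requested wtype is a quantized type such as GGML_TYPE_Q8_0.
struct ExampleEmbedder : public GGMLBlock {
    ExampleEmbedder(int64_t in_dim, int64_t hidden_size) {
        // numerically sensitive projection: pin its weight to f32
        blocks["proj_in"]  = std::shared_ptr<GGMLBlock>(new Linear(in_dim, hidden_size, true, true));
        // regular projection: weight follows the model's wtype
        blocks["proj_out"] = std::shared_ptr<GGMLBlock>(new Linear(hidden_size, hidden_size));
    }
};
```

The mmdit.hpp hunks below apply exactly this pattern to the MMDiT embedder layers, while the flux img_in/txt_in layers ended up keeping only the explicit bias argument after commit f5997a1951.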

View File

@@ -101,8 +101,8 @@ public:
TimestepEmbedder(int64_t hidden_size,
int64_t frequency_embedding_size = 256)
: frequency_embedding_size(frequency_embedding_size) {
blocks["mlp.0"] = std::shared_ptr<GGMLBlock>(new Linear(frequency_embedding_size, hidden_size));
blocks["mlp.2"] = std::shared_ptr<GGMLBlock>(new Linear(hidden_size, hidden_size));
blocks["mlp.0"] = std::shared_ptr<GGMLBlock>(new Linear(frequency_embedding_size, hidden_size, true, true));
blocks["mlp.2"] = std::shared_ptr<GGMLBlock>(new Linear(hidden_size, hidden_size, true, true));
}
struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* t) {
@@ -125,8 +125,8 @@ struct VectorEmbedder : public GGMLBlock {
public:
VectorEmbedder(int64_t input_dim,
int64_t hidden_size) {
blocks["mlp.0"] = std::shared_ptr<GGMLBlock>(new Linear(input_dim, hidden_size));
blocks["mlp.2"] = std::shared_ptr<GGMLBlock>(new Linear(hidden_size, hidden_size));
blocks["mlp.0"] = std::shared_ptr<GGMLBlock>(new Linear(input_dim, hidden_size, true, true));
blocks["mlp.2"] = std::shared_ptr<GGMLBlock>(new Linear(hidden_size, hidden_size, true, true));
}
struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* x) {
@@ -423,7 +423,7 @@ public:
int64_t out_channels) {
// total_out_channels is always None
blocks["norm_final"] = std::shared_ptr<GGMLBlock>(new LayerNorm(hidden_size, 1e-06f, false));
blocks["linear"] = std::shared_ptr<GGMLBlock>(new Linear(hidden_size, patch_size * patch_size * out_channels));
blocks["linear"] = std::shared_ptr<GGMLBlock>(new Linear(hidden_size, patch_size * patch_size * out_channels, true, true));
blocks["adaLN_modulation.1"] = std::shared_ptr<GGMLBlock>(new Linear(hidden_size, 2 * hidden_size));
}
@@ -510,7 +510,7 @@ public:
blocks["y_embedder"] = std::shared_ptr<GGMLBlock>(new VectorEmbedder(adm_in_channels, hidden_size));
}
blocks["context_embedder"] = std::shared_ptr<GGMLBlock>(new Linear(4096, 1536));
blocks["context_embedder"] = std::shared_ptr<GGMLBlock>(new Linear(4096, 1536, true, true));
for (int i = 0; i < depth; i++) {
blocks["joint_blocks." + std::to_string(i)] = std::shared_ptr<GGMLBlock>(new JointBlock(hidden_size,

View File

@@ -1397,10 +1397,11 @@ ggml_type ModelLoader::get_sd_wtype() {
continue;
}
if (tensor_storage.name.find(".weight") != std::string::npos &&
(tensor_storage.name.find("time_embed") != std::string::npos ||
tensor_storage.name.find("context_embedder") != std::string::npos ||
tensor_storage.name.find("time_in") != std::string::npos)) {
if (ggml_is_quantized(tensor_storage.type)) {
return tensor_storage.type;
}
if (tensor_should_be_converted(tensor_storage, GGML_TYPE_Q4_K)) {
return tensor_storage.type;
}
}
@@ -1420,7 +1421,11 @@ ggml_type ModelLoader::get_conditioner_wtype() {
continue;
}
if (tensor_storage.name.find(".weight") != std::string::npos) {
if (ggml_is_quantized(tensor_storage.type)) {
return tensor_storage.type;
}
if (tensor_should_be_converted(tensor_storage, GGML_TYPE_Q4_K)) {
return tensor_storage.type;
}
}
@@ -1437,10 +1442,11 @@ ggml_type ModelLoader::get_diffusion_model_wtype() {
continue;
}
if (tensor_storage.name.find(".weight") != std::string::npos &&
(tensor_storage.name.find("time_embed") != std::string::npos ||
tensor_storage.name.find("context_embedder") != std::string::npos ||
tensor_storage.name.find("time_in") != std::string::npos)) {
if (ggml_is_quantized(tensor_storage.type)) {
return tensor_storage.type;
}
if (tensor_should_be_converted(tensor_storage, GGML_TYPE_Q4_K)) {
return tensor_storage.type;
}
}
@@ -1458,7 +1464,11 @@ ggml_type ModelLoader::get_vae_wtype() {
continue;
}
if (tensor_storage.name.find(".weight")) {
if (ggml_is_quantized(tensor_storage.type)) {
return tensor_storage.type;
}
if (tensor_should_be_converted(tensor_storage, GGML_TYPE_Q4_K)) {
return tensor_storage.type;
}
}
@@ -1723,6 +1733,37 @@ bool ModelLoader::load_tensors(std::map<std::string, struct ggml_tensor*>& tenso
return true;
}
bool ModelLoader::tensor_should_be_converted(const TensorStorage& tensor_storage, ggml_type type) {
const std::string& name = tensor_storage.name;
if (type != GGML_TYPE_COUNT) {
if (ggml_is_quantized(type) && tensor_storage.ne[0] % ggml_blck_size(type) != 0) {
// Pass, do not convert
} else if (ends_with(name, ".bias")) {
// Pass, do not convert
} else if (ends_with(name, ".scale")) {
// Pass, do not convert
} else if (contains(name, "img_in.") ||
contains(name, "txt_in.") ||
contains(name, "time_in.") ||
contains(name, "vector_in.") ||
contains(name, "guidance_in.") ||
contains(name, "final_layer.")) {
// Pass, do not convert. For FLUX
} else if (contains(name, "x_embedder.") ||
contains(name, "t_embedder.") ||
contains(name, "y_embedder.") ||
contains(name, "pos_embed") ||
contains(name, "context_embedder.")) {
// Pass, do not convert. For MMDiT
} else if (contains(name, "time_embed.") || contains(name, "label_emb.")) {
// Pass, do not convert. For Unet
} else {
return true;
}
}
return false;
}
bool ModelLoader::save_to_gguf_file(const std::string& file_path, ggml_type type) {
auto backend = ggml_backend_cpu_init();
size_t mem_size = 1 * 1024 * 1024; // for padding
@@ -1737,12 +1778,8 @@ bool ModelLoader::save_to_gguf_file(const std::string& file_path, ggml_type type
const std::string& name = tensor_storage.name;
ggml_type tensor_type = tensor_storage.type;
if (type != GGML_TYPE_COUNT) {
if (ggml_is_quantized(type) && tensor_storage.ne[0] % ggml_blck_size(type) != 0) {
tensor_type = GGML_TYPE_F16;
} else {
tensor_type = type;
}
if (tensor_should_be_converted(tensor_storage, type)) {
tensor_type = type;
}
ggml_tensor* tensor = ggml_new_tensor(ggml_ctx, tensor_type, tensor_storage.n_dims, tensor_storage.ne);
@@ -1792,15 +1829,9 @@ int64_t ModelLoader::get_params_mem_size(ggml_backend_t backend, ggml_type type)
}
for (auto& tensor_storage : processed_tensor_storages) {
ggml_type tensor_type = tensor_storage.type;
if (type != GGML_TYPE_COUNT) {
if (ggml_is_quantized(type) && tensor_storage.ne[0] % 32 != 0) {
tensor_type = GGML_TYPE_F16;
} else {
tensor_type = type;
}
if (tensor_should_be_converted(tensor_storage, type)) {
tensor_storage.type = type;
}
tensor_storage.type = tensor_type;
mem_size += tensor_storage.nbytes() + alignment;
}
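In short, the scattered per-tensor type checks in this file are replaced by the shared `tensor_should_be_converted()` helper. A minimal sketch of the resulting caller pattern follows; `ModelLoader` and `TensorStorage` come from this file, while `pick_storage_type()` is a hypothetical wrapper added only for illustration:

```cpp
#include "ggml.h"
#include "model.h"  // ModelLoader, TensorStorage (repository headers)

// Hypothetical wrapper: decide the effective type for one tensor given a
// requested target type. Biases, scales, tensors whose first dimension is not
// a multiple of the quant block size, and the FLUX / MMDiT / UNet embedding
// layers are rejected by tensor_should_be_converted() and keep their type.
static ggml_type pick_storage_type(ModelLoader& loader,
                                   const TensorStorage& ts,
                                   ggml_type target) {  // e.g. GGML_TYPE_Q8_0
    if (loader.tensor_should_be_converted(ts, target)) {
        return target;  // convert to the requested (possibly quantized) type
    }
    return ts.type;     // keep the tensor's original precision
}
```

The wtype getters, save_to_gguf_file() and get_params_mem_size() now all make this decision through the same helper, so the skip list only has to be maintained in one place.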

View File

@@ -157,6 +157,7 @@ public:
ggml_backend_t backend,
std::set<std::string> ignore_tensors = {});
bool save_to_gguf_file(const std::string& file_path, ggml_type type);
bool tensor_should_be_converted(const TensorStorage& tensor_storage, ggml_type type);
int64_t get_params_mem_size(ggml_backend_t backend, ggml_type type = GGML_TYPE_COUNT);
~ModelLoader() = default;
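Putting it together, here is a minimal conversion sketch against the `ModelLoader` interface above. `save_to_gguf_file()` is the method shown in this header; `init_from_file()` is assumed from the existing class (it is not part of this diff), and both file paths are placeholders:

```cpp
#include "model.h"

int main() {
    ModelLoader loader;
    // Read tensor metadata from a safetensors checkpoint (path is an example).
    if (!loader.init_from_file("flux1-dev.sft")) {
        return 1;
    }
    // Write a q8_0 gguf; tensor_should_be_converted() decides, per tensor,
    // whether it is quantized to q8_0 or kept at its original precision.
    return loader.save_to_gguf_file("flux1-dev-q8_0.gguf", GGML_TYPE_Q8_0) ? 0 : 1;
}
```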