Compare commits

...

5 Commits

| Author | SHA1 | Message | Date |
|--------|------------|---------|---------------------------|
| leejet | 5c561eab31 | feat: do not convert more flux tensors | 2024-08-25 16:01:36 +08:00 |
| leejet | f5997a1951 | fix: do not force using f32 for some flux layers (this sometimes leads to worse results) | 2024-08-25 14:07:22 +08:00 |
| leejet | 1bdc767aaf | feat: force using f32 for some layers | 2024-08-25 13:53:16 +08:00 |
| leejet | 79c9fe9556 | feat: do not convert some tensors | 2024-08-25 13:37:37 +08:00 |
| leejet | 28a614769a | docs: update docs/flux.md | 2024-08-25 13:11:34 +08:00 |
7 changed files with 79 additions and 41 deletions

Binary image file not shown (468 KiB).

View File

@@ -4,14 +4,17 @@ You can run Flux using stable-diffusion.cpp with a GPU that has 6GB or even 4GB
## Download weights
- Download flux-dev from https://huggingface.co/black-forest-labs/FLUX.1-dev/blob/main/flux1-dev.safetensors
- Download flux-schnell from https://huggingface.co/black-forest-labs/FLUX.1-schnell/blob/main/flux1-schnell.safetensors
- Download flux
    - If you don't want to do the conversion yourself, download the preconverted gguf model from [FLUX.1-dev-gguf](https://huggingface.co/leejet/FLUX.1-dev-gguf) or [FLUX.1-schnell](https://huggingface.co/leejet/FLUX.1-schnell-gguf)
    - Otherwise, download flux-dev from https://huggingface.co/black-forest-labs/FLUX.1-dev/blob/main/flux1-dev.safetensors or flux-schnell from https://huggingface.co/black-forest-labs/FLUX.1-schnell/blob/main/flux1-schnell.safetensors
- Download vae from https://huggingface.co/black-forest-labs/FLUX.1-dev/blob/main/ae.safetensors
- Download clip_l from https://huggingface.co/comfyanonymous/flux_text_encoders/blob/main/clip_l.safetensors
- Download t5xxl from https://huggingface.co/comfyanonymous/flux_text_encoders/blob/main/t5xxl_fp16.safetensors
## Convert flux weights
You can download the preconverted gguf weights from [FLUX.1-dev-gguf](https://huggingface.co/leejet/FLUX.1-dev-gguf) or [FLUX.1-schnell](https://huggingface.co/leejet/FLUX.1-schnell-gguf), so you don't have to do the conversion yourself.
Using fp16 will lead to overflow, but ggml's support for bf16 is not yet fully developed. Therefore, we need to convert flux to gguf format here, which also saves VRAM. For example:
```
.\bin\Release\sd.exe -M convert -m ..\..\ComfyUI\models\unet\flux1-dev.sft -o ..\models\flux1-dev-q8_0.gguf -v --type q8_0
@@ -30,10 +33,10 @@ For example:
Using formats of different precisions will yield results of varying quality.
| Type | q8_0 | q4_0 | q3_k | q2_k |
|---- | ---- |---- |---- |---- |
| **Memory** | 12068.09 MB | 6394.53 MB | 4888.16 MB | 3735.73 MB |
| **Result** | ![](../assets/flux/flux1-dev-q8_0.png) |![](../assets/flux/flux1-dev-q4_0.png) |![](../assets/flux/flux1-dev-q3_k.png) |![](../assets/flux/flux1-dev-q2_k.png)|
| Type | q8_0 | q4_0 | q4_k | q3_k | q2_k |
|---- | ---- |---- |---- |---- |---- |
| **Memory** | 12068.09 MB | 6394.53 MB | 6395.17 MB | 4888.16 MB | 3735.73 MB |
| **Result** | ![](../assets/flux/flux1-dev-q8_0.png) |![](../assets/flux/flux1-dev-q4_0.png) |![](../assets/flux/flux1-dev-q4_k.png) |![](../assets/flux/flux1-dev-q3_k.png) |![](../assets/flux/flux1-dev-q2_k.png)|
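For context, the memory column above roughly tracks ggml's nominal bits-per-weight for each quantization type. Below is a minimal C++ sketch of that arithmetic, assuming roughly 11.9B parameters for the flux1-dev transformer (an assumed figure, not one taken from this page):

```cpp
#include <cstdio>

int main() {
    // Nominal ggml bits-per-weight: block bytes * 8 / weights per block.
    // The parameter count is an assumption; real totals differ slightly
    // because biases, norms and the unconverted tensors stay at higher precision.
    const double n_params = 11.9e9;
    const struct { const char* name; double bpw; } kTypes[] = {
        {"q8_0", 8.5},    // 34-byte blocks of 32 weights
        {"q4_0", 4.5},    // 18-byte blocks of 32 weights
        {"q4_k", 4.5},    // 144-byte super-blocks of 256 weights
        {"q3_k", 3.4375}, // 110-byte super-blocks of 256 weights
        {"q2_k", 2.625},  // 84-byte super-blocks of 256 weights
    };
    for (const auto& t : kTypes) {
        const double mb = n_params * t.bpw / 8.0 / (1024.0 * 1024.0);
        std::printf("%-4s ~%.0f MB\n", t.name, mb);  // e.g. q8_0 prints ~12058 MB
    }
    return 0;
}
```

With these assumptions the q8_0 estimate comes out near 12058 MB, close to the 12068.09 MB in the table; the remaining difference is presumably the tensors that are skipped by the conversion and kept at higher precision.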

View File

@@ -634,13 +634,13 @@ namespace Flux {
int64_t out_channels = params.in_channels;
int64_t pe_dim = params.hidden_size / params.num_heads;
blocks["img_in"] = std::shared_ptr<GGMLBlock>(new Linear(params.in_channels, params.hidden_size));
blocks["img_in"] = std::shared_ptr<GGMLBlock>(new Linear(params.in_channels, params.hidden_size, true));
blocks["time_in"] = std::shared_ptr<GGMLBlock>(new MLPEmbedder(256, params.hidden_size));
blocks["vector_in"] = std::shared_ptr<GGMLBlock>(new MLPEmbedder(params.vec_in_dim, params.hidden_size));
if (params.guidance_embed) {
blocks["guidance_in"] = std::shared_ptr<GGMLBlock>(new MLPEmbedder(256, params.hidden_size));
}
blocks["txt_in"] = std::shared_ptr<GGMLBlock>(new Linear(params.context_in_dim, params.hidden_size));
blocks["txt_in"] = std::shared_ptr<GGMLBlock>(new Linear(params.context_in_dim, params.hidden_size, true));
for (int i = 0; i < params.depth; i++) {
blocks["double_blocks." + std::to_string(i)] = std::shared_ptr<GGMLBlock>(new DoubleStreamBlock(params.hidden_size,

View File

@@ -1187,9 +1187,10 @@ protected:
int64_t in_features;
int64_t out_features;
bool bias;
bool force_f32;
void init_params(struct ggml_context* ctx, ggml_type wtype) {
if (in_features % ggml_blck_size(wtype) != 0) {
if (in_features % ggml_blck_size(wtype) != 0 || force_f32) {
wtype = GGML_TYPE_F32;
}
params["weight"] = ggml_new_tensor_2d(ctx, wtype, in_features, out_features);
@@ -1201,10 +1202,12 @@ protected:
public:
Linear(int64_t in_features,
int64_t out_features,
bool bias = true)
bool bias = true,
bool force_f32 = false)
: in_features(in_features),
out_features(out_features),
bias(bias) {}
bias(bias),
force_f32(force_f32) {}
struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* x) {
struct ggml_tensor* w = params["weight"];
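For illustration, here is a minimal sketch of how a block opts a layer out of quantization with the new `force_f32` parameter added above. `GGMLBlock` and `Linear` are the classes from this file; `ExampleEmbedder` is a hypothetical block, not part of the patch:

```cpp
#include <cstdint>
#include <memory>
#include "ggml_extend.hpp"  // GGMLBlock, Linear (repository header)

// Hypothetical block showing the new Linear(in, out, bias, force_f32) ctor.
// With force_f32 = true, init_params() allocates the weight as GGML_TYPE_F32
// even if the requested wtype is a quantized type such as GGML_TYPE_Q8_0.
struct ExampleEmbedder : public GGMLBlock {
    ExampleEmbedder(int64_t in_dim, int64_t hidden_size) {
        // numerically sensitive projection: pin its weight to f32
        blocks["proj_in"]  = std::shared_ptr<GGMLBlock>(new Linear(in_dim, hidden_size, true, true));
        // regular projection: weight follows the model's wtype
        blocks["proj_out"] = std::shared_ptr<GGMLBlock>(new Linear(hidden_size, hidden_size));
    }
};
```

The mmdit.hpp hunks below apply exactly this pattern to the MMDiT embedder layers, while the flux img_in/txt_in layers ended up keeping only the explicit bias argument after commit f5997a1951.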

View File

@@ -101,8 +101,8 @@ public:
TimestepEmbedder(int64_t hidden_size,
int64_t frequency_embedding_size = 256)
: frequency_embedding_size(frequency_embedding_size) {
blocks["mlp.0"] = std::shared_ptr<GGMLBlock>(new Linear(frequency_embedding_size, hidden_size));
blocks["mlp.2"] = std::shared_ptr<GGMLBlock>(new Linear(hidden_size, hidden_size));
blocks["mlp.0"] = std::shared_ptr<GGMLBlock>(new Linear(frequency_embedding_size, hidden_size, true, true));
blocks["mlp.2"] = std::shared_ptr<GGMLBlock>(new Linear(hidden_size, hidden_size, true, true));
}
struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* t) {
@@ -125,8 +125,8 @@ struct VectorEmbedder : public GGMLBlock {
public:
VectorEmbedder(int64_t input_dim,
int64_t hidden_size) {
blocks["mlp.0"] = std::shared_ptr<GGMLBlock>(new Linear(input_dim, hidden_size));
blocks["mlp.2"] = std::shared_ptr<GGMLBlock>(new Linear(hidden_size, hidden_size));
blocks["mlp.0"] = std::shared_ptr<GGMLBlock>(new Linear(input_dim, hidden_size, true, true));
blocks["mlp.2"] = std::shared_ptr<GGMLBlock>(new Linear(hidden_size, hidden_size, true, true));
}
struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* x) {
@@ -423,7 +423,7 @@ public:
int64_t out_channels) {
// total_out_channels is always None
blocks["norm_final"] = std::shared_ptr<GGMLBlock>(new LayerNorm(hidden_size, 1e-06f, false));
blocks["linear"] = std::shared_ptr<GGMLBlock>(new Linear(hidden_size, patch_size * patch_size * out_channels));
blocks["linear"] = std::shared_ptr<GGMLBlock>(new Linear(hidden_size, patch_size * patch_size * out_channels, true, true));
blocks["adaLN_modulation.1"] = std::shared_ptr<GGMLBlock>(new Linear(hidden_size, 2 * hidden_size));
}
@@ -510,7 +510,7 @@ public:
blocks["y_embedder"] = std::shared_ptr<GGMLBlock>(new VectorEmbedder(adm_in_channels, hidden_size));
}
blocks["context_embedder"] = std::shared_ptr<GGMLBlock>(new Linear(4096, 1536));
blocks["context_embedder"] = std::shared_ptr<GGMLBlock>(new Linear(4096, 1536, true, true));
for (int i = 0; i < depth; i++) {
blocks["joint_blocks." + std::to_string(i)] = std::shared_ptr<GGMLBlock>(new JointBlock(hidden_size,

View File

@@ -1397,10 +1397,11 @@ ggml_type ModelLoader::get_sd_wtype() {
continue;
}
if (tensor_storage.name.find(".weight") != std::string::npos &&
(tensor_storage.name.find("time_embed") != std::string::npos ||
tensor_storage.name.find("context_embedder") != std::string::npos ||
tensor_storage.name.find("time_in") != std::string::npos)) {
if (ggml_is_quantized(tensor_storage.type)) {
return tensor_storage.type;
}
if (tensor_should_be_converted(tensor_storage, GGML_TYPE_Q4_K)) {
return tensor_storage.type;
}
}
@@ -1420,7 +1421,11 @@ ggml_type ModelLoader::get_conditioner_wtype() {
continue;
}
if (tensor_storage.name.find(".weight") != std::string::npos) {
if (ggml_is_quantized(tensor_storage.type)) {
return tensor_storage.type;
}
if (tensor_should_be_converted(tensor_storage, GGML_TYPE_Q4_K)) {
return tensor_storage.type;
}
}
@@ -1437,10 +1442,11 @@ ggml_type ModelLoader::get_diffusion_model_wtype() {
continue;
}
if (tensor_storage.name.find(".weight") != std::string::npos &&
(tensor_storage.name.find("time_embed") != std::string::npos ||
tensor_storage.name.find("context_embedder") != std::string::npos ||
tensor_storage.name.find("time_in") != std::string::npos)) {
if (ggml_is_quantized(tensor_storage.type)) {
return tensor_storage.type;
}
if (tensor_should_be_converted(tensor_storage, GGML_TYPE_Q4_K)) {
return tensor_storage.type;
}
}
@@ -1458,7 +1464,11 @@ ggml_type ModelLoader::get_vae_wtype() {
continue;
}
if (tensor_storage.name.find(".weight")) {
if (ggml_is_quantized(tensor_storage.type)) {
return tensor_storage.type;
}
if (tensor_should_be_converted(tensor_storage, GGML_TYPE_Q4_K)) {
return tensor_storage.type;
}
}
@@ -1723,6 +1733,37 @@ bool ModelLoader::load_tensors(std::map<std::string, struct ggml_tensor*>& tenso
return true;
}
bool ModelLoader::tensor_should_be_converted(const TensorStorage& tensor_storage, ggml_type type) {
const std::string& name = tensor_storage.name;
if (type != GGML_TYPE_COUNT) {
if (ggml_is_quantized(type) && tensor_storage.ne[0] % ggml_blck_size(type) != 0) {
// Pass, do not convert
} else if (ends_with(name, ".bias")) {
// Pass, do not convert
} else if (ends_with(name, ".scale")) {
// Pass, do not convert
} else if (contains(name, "img_in.") ||
contains(name, "txt_in.") ||
contains(name, "time_in.") ||
contains(name, "vector_in.") ||
contains(name, "guidance_in.") ||
contains(name, "final_layer.")) {
// Pass, do not convert. For FLUX
} else if (contains(name, "x_embedder.") ||
contains(name, "t_embedder.") ||
contains(name, "y_embedder.") ||
contains(name, "pos_embed") ||
contains(name, "context_embedder.")) {
// Pass, do not convert. For MMDiT
} else if (contains(name, "time_embed.") || contains(name, "label_emb.")) {
// Pass, do not convert. For Unet
} else {
return true;
}
}
return false;
}
bool ModelLoader::save_to_gguf_file(const std::string& file_path, ggml_type type) {
auto backend = ggml_backend_cpu_init();
size_t mem_size = 1 * 1024 * 1024; // for padding
@@ -1737,12 +1778,8 @@ bool ModelLoader::save_to_gguf_file(const std::string& file_path, ggml_type type
const std::string& name = tensor_storage.name;
ggml_type tensor_type = tensor_storage.type;
if (type != GGML_TYPE_COUNT) {
if (ggml_is_quantized(type) && tensor_storage.ne[0] % ggml_blck_size(type) != 0) {
tensor_type = GGML_TYPE_F16;
} else {
tensor_type = type;
}
if (tensor_should_be_converted(tensor_storage, type)) {
tensor_type = type;
}
ggml_tensor* tensor = ggml_new_tensor(ggml_ctx, tensor_type, tensor_storage.n_dims, tensor_storage.ne);
@@ -1792,15 +1829,9 @@ int64_t ModelLoader::get_params_mem_size(ggml_backend_t backend, ggml_type type)
}
for (auto& tensor_storage : processed_tensor_storages) {
ggml_type tensor_type = tensor_storage.type;
if (type != GGML_TYPE_COUNT) {
if (ggml_is_quantized(type) && tensor_storage.ne[0] % 32 != 0) {
tensor_type = GGML_TYPE_F16;
} else {
tensor_type = type;
}
if (tensor_should_be_converted(tensor_storage, type)) {
tensor_storage.type = type;
}
tensor_storage.type = tensor_type;
mem_size += tensor_storage.nbytes() + alignment;
}
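In short, the scattered per-tensor type checks in this file are replaced by the shared `tensor_should_be_converted()` helper. A minimal sketch of the resulting caller pattern follows; `ModelLoader` and `TensorStorage` come from this file, while `pick_storage_type()` is a hypothetical wrapper added only for illustration:

```cpp
#include "ggml.h"
#include "model.h"  // ModelLoader, TensorStorage (repository headers)

// Hypothetical wrapper: decide the effective type for one tensor given a
// requested target type. Biases, scales, tensors whose first dimension is not
// a multiple of the quant block size, and the FLUX / MMDiT / UNet embedding
// layers are rejected by tensor_should_be_converted() and keep their type.
static ggml_type pick_storage_type(ModelLoader& loader,
                                   const TensorStorage& ts,
                                   ggml_type target) {  // e.g. GGML_TYPE_Q8_0
    if (loader.tensor_should_be_converted(ts, target)) {
        return target;  // convert to the requested (possibly quantized) type
    }
    return ts.type;     // keep the tensor's original precision
}
```

The wtype getters, save_to_gguf_file() and get_params_mem_size() now all make this decision through the same helper, so the skip list only has to be maintained in one place.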

View File

@@ -157,6 +157,7 @@ public:
ggml_backend_t backend,
std::set<std::string> ignore_tensors = {});
bool save_to_gguf_file(const std::string& file_path, ggml_type type);
bool tensor_should_be_converted(const TensorStorage& tensor_storage, ggml_type type);
int64_t get_params_mem_size(ggml_backend_t backend, ggml_type type = GGML_TYPE_COUNT);
~ModelLoader() = default;
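Putting it together, here is a minimal conversion sketch against the `ModelLoader` interface above. `save_to_gguf_file()` is the method shown in this header; `init_from_file()` is assumed from the existing class (it is not part of this diff), and both file paths are placeholders:

```cpp
#include "model.h"

int main() {
    ModelLoader loader;
    // Read tensor metadata from a safetensors checkpoint (path is an example).
    if (!loader.init_from_file("flux1-dev.sft")) {
        return 1;
    }
    // Write a q8_0 gguf; tensor_should_be_converted() decides, per tensor,
    // whether it is quantized to q8_0 or kept at its original precision.
    return loader.save_to_gguf_file("flux1-dev-q8_0.gguf", GGML_TYPE_Q8_0) ? 0 : 1;
}
```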