Mirror of https://github.com/leejet/stable-diffusion.cpp.git (synced 2025-12-13 05:48:56 +00:00)

Compare commits: c837c5d9cc ... 5c561eab31 (5 commits)
| Author | SHA1 | Date |
|---|---|---|
|  | 5c561eab31 |  |
|  | f5997a1951 |  |
|  | 1bdc767aaf |  |
|  | 79c9fe9556 |  |
|  | 28a614769a |  |
assets/flux/flux1-dev-q4_k.png (BIN, new file, 468 KiB; binary file not shown)
docs/flux.md (15 changed lines)
@@ -4,14 +4,17 @@ You can run Flux using stable-diffusion.cpp with a GPU that has 6GB or even 4GB

 ## Download weights

-- Download flux-dev from https://huggingface.co/black-forest-labs/FLUX.1-dev/blob/main/flux1-dev.safetensors
-- Download flux-schnell from https://huggingface.co/black-forest-labs/FLUX.1-schnell/blob/main/flux1-schnell.safetensors
+- Download flux
+    - If you don't want to do the conversion yourself, download the preconverted gguf model from [FLUX.1-dev-gguf](https://huggingface.co/leejet/FLUX.1-dev-gguf) or [FLUX.1-schnell](https://huggingface.co/leejet/FLUX.1-schnell-gguf)
+    - Otherwise, download flux-dev from https://huggingface.co/black-forest-labs/FLUX.1-dev/blob/main/flux1-dev.safetensors or flux-schnell from https://huggingface.co/black-forest-labs/FLUX.1-schnell/blob/main/flux1-schnell.safetensors
 - Download vae from https://huggingface.co/black-forest-labs/FLUX.1-dev/blob/main/ae.safetensors
 - Download clip_l from https://huggingface.co/comfyanonymous/flux_text_encoders/blob/main/clip_l.safetensors
 - Download t5xxl from https://huggingface.co/comfyanonymous/flux_text_encoders/blob/main/t5xxl_fp16.safetensors

 ## Convert flux weights

+You can download the preconverted gguf weights from [FLUX.1-dev-gguf](https://huggingface.co/leejet/FLUX.1-dev-gguf) or [FLUX.1-schnell](https://huggingface.co/leejet/FLUX.1-schnell-gguf), this way you don't have to do the conversion yourself.
+
 Using fp16 will lead to overflow, but ggml's support for bf16 is not yet fully developed. Therefore, we need to convert flux to gguf format here, which also saves VRAM. For example:
 ```
 .\bin\Release\sd.exe -M convert -m ..\..\ComfyUI\models\unet\flux1-dev.sft -o ..\models\flux1-dev-q8_0.gguf -v --type q8_0
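For comparison, a similar conversion on Linux (here to the q4_k type introduced by this change set) might look like the line below; the `./build/bin/sd` path and the model file locations are illustrative assumptions, only the flags come from the documented example:

```
./build/bin/sd -M convert -m models/flux1-dev.safetensors -o models/flux1-dev-q4_k.gguf -v --type q4_k
```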
@@ -30,10 +33,10 @@ For example:

 Using formats of different precisions will yield results of varying quality.

-| Type | q8_0 | q4_0 | q3_k | q2_k |
-|---- | ---- |---- |---- |---- |
-| **Memory** | 12068.09 MB | 6394.53 MB | 4888.16 MB | 3735.73 MB |
-| **Result** |  |  |  |  |
+| Type | q8_0 | q4_0 | q4_k | q3_k | q2_k |
+|---- | ---- |---- |---- |---- |---- |
+| **Memory** | 12068.09 MB | 6394.53 MB | 6395.17 MB | 4888.16 MB | 3735.73 MB |
+| **Result** |  |  |  |  |  |
flux.hpp (4 changed lines)
@@ -634,13 +634,13 @@ namespace Flux {
         int64_t out_channels = params.in_channels;
         int64_t pe_dim = params.hidden_size / params.num_heads;

-        blocks["img_in"] = std::shared_ptr<GGMLBlock>(new Linear(params.in_channels, params.hidden_size));
+        blocks["img_in"] = std::shared_ptr<GGMLBlock>(new Linear(params.in_channels, params.hidden_size, true));
         blocks["time_in"] = std::shared_ptr<GGMLBlock>(new MLPEmbedder(256, params.hidden_size));
         blocks["vector_in"] = std::shared_ptr<GGMLBlock>(new MLPEmbedder(params.vec_in_dim, params.hidden_size));
         if (params.guidance_embed) {
             blocks["guidance_in"] = std::shared_ptr<GGMLBlock>(new MLPEmbedder(256, params.hidden_size));
         }
-        blocks["txt_in"] = std::shared_ptr<GGMLBlock>(new Linear(params.context_in_dim, params.hidden_size));
+        blocks["txt_in"] = std::shared_ptr<GGMLBlock>(new Linear(params.context_in_dim, params.hidden_size, true));

         for (int i = 0; i < params.depth; i++) {
             blocks["double_blocks." + std::to_string(i)] = std::shared_ptr<GGMLBlock>(new DoubleStreamBlock(params.hidden_size,

@@ -1187,9 +1187,10 @@ protected:
     int64_t in_features;
     int64_t out_features;
     bool bias;
+    bool force_f32;

     void init_params(struct ggml_context* ctx, ggml_type wtype) {
-        if (in_features % ggml_blck_size(wtype) != 0) {
+        if (in_features % ggml_blck_size(wtype) != 0 || force_f32) {
             wtype = GGML_TYPE_F32;
         }
         params["weight"] = ggml_new_tensor_2d(ctx, wtype, in_features, out_features);

@@ -1201,10 +1202,12 @@ protected:
 public:
     Linear(int64_t in_features,
            int64_t out_features,
-           bool bias = true)
+           bool bias = true,
+           bool force_f32 = false)
         : in_features(in_features),
           out_features(out_features),
-          bias(bias) {}
+          bias(bias),
+          force_f32(force_f32) {}

     struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* x) {
         struct ggml_tensor* w = params["weight"];
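Taken together, the two hunks above mean a Linear layer's weight stays in f32 either when its row length does not divide evenly into quantization blocks or when the layer explicitly opts out via `force_f32`. A minimal standalone sketch of that rule, assuming only `ggml.h` (the helper name is ours, not part of the patch):

```cpp
#include "ggml.h"

// Sketch of the weight-type decision in Linear::init_params after this change:
// fall back to f32 when the row size is not a multiple of the quantization block
// size, or when the layer was constructed with force_f32 = true.
static ggml_type effective_linear_wtype(ggml_type wtype, int64_t in_features, bool force_f32) {
    if (in_features % ggml_blck_size(wtype) != 0 || force_f32) {
        return GGML_TYPE_F32;
    }
    return wtype;
}
```

The mmdit.hpp changes below use exactly this opt-out: the timestep and vector embedders, the final layer's projection, and the context embedder pass `force_f32 = true` so they keep full precision even when the rest of the model is quantized.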
mmdit.hpp (12 changed lines)
@@ -101,8 +101,8 @@ public:
     TimestepEmbedder(int64_t hidden_size,
                      int64_t frequency_embedding_size = 256)
         : frequency_embedding_size(frequency_embedding_size) {
-        blocks["mlp.0"] = std::shared_ptr<GGMLBlock>(new Linear(frequency_embedding_size, hidden_size));
-        blocks["mlp.2"] = std::shared_ptr<GGMLBlock>(new Linear(hidden_size, hidden_size));
+        blocks["mlp.0"] = std::shared_ptr<GGMLBlock>(new Linear(frequency_embedding_size, hidden_size, true, true));
+        blocks["mlp.2"] = std::shared_ptr<GGMLBlock>(new Linear(hidden_size, hidden_size, true, true));
     }

     struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* t) {

@@ -125,8 +125,8 @@ struct VectorEmbedder : public GGMLBlock {
 public:
     VectorEmbedder(int64_t input_dim,
                    int64_t hidden_size) {
-        blocks["mlp.0"] = std::shared_ptr<GGMLBlock>(new Linear(input_dim, hidden_size));
-        blocks["mlp.2"] = std::shared_ptr<GGMLBlock>(new Linear(hidden_size, hidden_size));
+        blocks["mlp.0"] = std::shared_ptr<GGMLBlock>(new Linear(input_dim, hidden_size, true, true));
+        blocks["mlp.2"] = std::shared_ptr<GGMLBlock>(new Linear(hidden_size, hidden_size, true, true));
     }

     struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* x) {

@@ -423,7 +423,7 @@ public:
                int64_t out_channels) {
         // total_out_channels is always None
         blocks["norm_final"] = std::shared_ptr<GGMLBlock>(new LayerNorm(hidden_size, 1e-06f, false));
-        blocks["linear"] = std::shared_ptr<GGMLBlock>(new Linear(hidden_size, patch_size * patch_size * out_channels));
+        blocks["linear"] = std::shared_ptr<GGMLBlock>(new Linear(hidden_size, patch_size * patch_size * out_channels, true, true));
         blocks["adaLN_modulation.1"] = std::shared_ptr<GGMLBlock>(new Linear(hidden_size, 2 * hidden_size));
     }

@@ -510,7 +510,7 @@ public:
             blocks["y_embedder"] = std::shared_ptr<GGMLBlock>(new VectorEmbedder(adm_in_channels, hidden_size));
         }

-        blocks["context_embedder"] = std::shared_ptr<GGMLBlock>(new Linear(4096, 1536));
+        blocks["context_embedder"] = std::shared_ptr<GGMLBlock>(new Linear(4096, 1536, true, true));

         for (int i = 0; i < depth; i++) {
             blocks["joint_blocks." + std::to_string(i)] = std::shared_ptr<GGMLBlock>(new JointBlock(hidden_size,
model.cpp (79 changed lines)
@@ -1397,10 +1397,11 @@ ggml_type ModelLoader::get_sd_wtype() {
             continue;
         }

-        if (tensor_storage.name.find(".weight") != std::string::npos &&
-            (tensor_storage.name.find("time_embed") != std::string::npos ||
-             tensor_storage.name.find("context_embedder") != std::string::npos ||
-             tensor_storage.name.find("time_in") != std::string::npos)) {
+        if (ggml_is_quantized(tensor_storage.type)) {
             return tensor_storage.type;
         }
+
+        if (tensor_should_be_converted(tensor_storage, GGML_TYPE_Q4_K)) {
+            return tensor_storage.type;
+        }
     }

@@ -1420,7 +1421,11 @@ ggml_type ModelLoader::get_conditioner_wtype() {
             continue;
         }

-        if (tensor_storage.name.find(".weight") != std::string::npos) {
+        if (ggml_is_quantized(tensor_storage.type)) {
             return tensor_storage.type;
         }
+
+        if (tensor_should_be_converted(tensor_storage, GGML_TYPE_Q4_K)) {
+            return tensor_storage.type;
+        }
     }

@@ -1437,10 +1442,11 @@ ggml_type ModelLoader::get_diffusion_model_wtype() {
             continue;
         }

-        if (tensor_storage.name.find(".weight") != std::string::npos &&
-            (tensor_storage.name.find("time_embed") != std::string::npos ||
-             tensor_storage.name.find("context_embedder") != std::string::npos ||
-             tensor_storage.name.find("time_in") != std::string::npos)) {
+        if (ggml_is_quantized(tensor_storage.type)) {
             return tensor_storage.type;
         }
+
+        if (tensor_should_be_converted(tensor_storage, GGML_TYPE_Q4_K)) {
+            return tensor_storage.type;
+        }
     }

@@ -1458,7 +1464,11 @@ ggml_type ModelLoader::get_vae_wtype() {
             continue;
         }

-        if (tensor_storage.name.find(".weight")) {
+        if (ggml_is_quantized(tensor_storage.type)) {
             return tensor_storage.type;
         }
+
+        if (tensor_should_be_converted(tensor_storage, GGML_TYPE_Q4_K)) {
+            return tensor_storage.type;
+        }
     }

@@ -1723,6 +1733,37 @@ bool ModelLoader::load_tensors(std::map<std::string, struct ggml_tensor*>& tenso
     return true;
 }

+bool ModelLoader::tensor_should_be_converted(const TensorStorage& tensor_storage, ggml_type type) {
+    const std::string& name = tensor_storage.name;
+    if (type != GGML_TYPE_COUNT) {
+        if (ggml_is_quantized(type) && tensor_storage.ne[0] % ggml_blck_size(type) != 0) {
+            // Pass, do not convert
+        } else if (ends_with(name, ".bias")) {
+            // Pass, do not convert
+        } else if (ends_with(name, ".scale")) {
+            // Pass, do not convert
+        } else if (contains(name, "img_in.") ||
+                   contains(name, "txt_in.") ||
+                   contains(name, "time_in.") ||
+                   contains(name, "vector_in.") ||
+                   contains(name, "guidance_in.") ||
+                   contains(name, "final_layer.")) {
+            // Pass, do not convert. For FLUX
+        } else if (contains(name, "x_embedder.") ||
+                   contains(name, "t_embedder.") ||
+                   contains(name, "y_embedder.") ||
+                   contains(name, "pos_embed") ||
+                   contains(name, "context_embedder.")) {
+            // Pass, do not convert. For MMDiT
+        } else if (contains(name, "time_embed.") || contains(name, "label_emb.")) {
+            // Pass, do not convert. For Unet
+        } else {
+            return true;
+        }
+    }
+    return false;
+}
+
 bool ModelLoader::save_to_gguf_file(const std::string& file_path, ggml_type type) {
     auto backend = ggml_backend_cpu_init();
     size_t mem_size = 1 * 1024 * 1024;  // for padding
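The new helper relies on two small string utilities, `contains` and `ends_with`, which are not part of this diff; assuming they are the usual substring and suffix checks, they behave roughly like this:

```cpp
#include <string>

// Assumed behaviour of the helpers used by tensor_should_be_converted (not from this diff):
// contains(s, sub)      -> true when sub occurs anywhere in s
// ends_with(s, suffix)  -> true when s ends with suffix
static bool contains(const std::string& s, const std::string& sub) {
    return s.find(sub) != std::string::npos;
}

static bool ends_with(const std::string& s, const std::string& suffix) {
    return s.size() >= suffix.size() &&
           s.compare(s.size() - suffix.size(), suffix.size(), suffix) == 0;
}
```

In short, a tensor is converted to the requested type only when its first dimension is a multiple of the quantization block size, it is not a bias or scale, and its name is not on the keep-in-high-precision lists for FLUX, MMDiT, or UNet.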
@@ -1737,12 +1778,8 @@ bool ModelLoader::save_to_gguf_file(const std::string& file_path, ggml_type type
         const std::string& name = tensor_storage.name;

         ggml_type tensor_type = tensor_storage.type;
-        if (type != GGML_TYPE_COUNT) {
-            if (ggml_is_quantized(type) && tensor_storage.ne[0] % ggml_blck_size(type) != 0) {
-                tensor_type = GGML_TYPE_F16;
-            } else {
-                tensor_type = type;
-            }
+        if (tensor_should_be_converted(tensor_storage, type)) {
+            tensor_type = type;
         }

         ggml_tensor* tensor = ggml_new_tensor(ggml_ctx, tensor_type, tensor_storage.n_dims, tensor_storage.ne);

@@ -1792,15 +1829,9 @@ int64_t ModelLoader::get_params_mem_size(ggml_backend_t backend, ggml_type type)
     }

     for (auto& tensor_storage : processed_tensor_storages) {
-        ggml_type tensor_type = tensor_storage.type;
-        if (type != GGML_TYPE_COUNT) {
-            if (ggml_is_quantized(type) && tensor_storage.ne[0] % 32 != 0) {
-                tensor_type = GGML_TYPE_F16;
-            } else {
-                tensor_type = type;
-            }
+        if (tensor_should_be_converted(tensor_storage, type)) {
+            tensor_storage.type = type;
         }
-        tensor_storage.type = tensor_type;
         mem_size += tensor_storage.nbytes() + alignment;
     }
model.h (1 changed line)
@@ -157,6 +157,7 @@ public:
                       ggml_backend_t backend,
                       std::set<std::string> ignore_tensors = {});
     bool save_to_gguf_file(const std::string& file_path, ggml_type type);
+    bool tensor_should_be_converted(const TensorStorage& tensor_storage, ggml_type type);
     int64_t get_params_mem_size(ggml_backend_t backend, ggml_type type = GGML_TYPE_COUNT);
     ~ModelLoader() = default;