mirror of https://github.com/leejet/stable-diffusion.cpp.git
synced 2025-12-12 13:28:37 +00:00

feat: do not convert bf16 to f32 (#1055)

parent 689e44c9a8
commit bfbb929790
@@ -15,7 +15,7 @@ You can run Flux using stable-diffusion.cpp with a GPU that has 6GB or even 4GB
 You can download the preconverted gguf weights from [FLUX.1-dev-gguf](https://huggingface.co/leejet/FLUX.1-dev-gguf) or [FLUX.1-schnell](https://huggingface.co/leejet/FLUX.1-schnell-gguf), this way you don't have to do the conversion yourself.
 
-Using fp16 will lead to overflow, but ggml's support for bf16 is not yet fully developed. Therefore, we need to convert flux to gguf format here, which also saves VRAM. For example:
+For example:
 ```
 .\bin\Release\sd.exe -M convert -m ..\..\ComfyUI\models\unet\flux1-dev.sft -o ..\models\flux1-dev-q8_0.gguf -v --type q8_0
 ```
model.cpp (23 changed lines)
@@ -123,11 +123,6 @@ bool is_unused_tensor(std::string name) {
     return false;
 }
 
-float bf16_to_f32(uint16_t bfloat16) {
-    uint32_t val_bits = (static_cast<uint32_t>(bfloat16) << 16);
-    return *reinterpret_cast<float*>(&val_bits);
-}
-
 uint16_t f8_e4m3_to_f16(uint8_t f8) {
     // do we need to support uz?
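For reference, bf16 is simply the top 16 bits of an IEEE-754 float32, which is why the deleted helper only needed a 16-bit left shift. A minimal standalone sketch of the same conversion, not part of this commit (memcpy replaces the original reinterpret_cast to stay within strict-aliasing rules):

```cpp
#include <cstdint>
#include <cstring>
#include <cstdio>

// Widen a bf16 value to f32 by placing its 16 bits in the upper half
// of a 32-bit pattern; the low 16 mantissa bits become zero.
static float bf16_to_f32_ref(uint16_t bf16) {
    uint32_t bits = static_cast<uint32_t>(bf16) << 16;
    float f;
    std::memcpy(&f, &bits, sizeof(f));
    return f;
}

int main() {
    // 0x3F80 is 1.0 in bf16 (sign 0, exponent 127, mantissa 0).
    std::printf("%f\n", bf16_to_f32_ref(0x3F80));  // prints 1.000000
    return 0;
}
```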
@@ -210,13 +205,6 @@ uint16_t f8_e5m2_to_f16(uint8_t fp8) {
     return fp16_sign | (fp16_exponent << 10) | fp16_mantissa;
 }
 
-void bf16_to_f32_vec(uint16_t* src, float* dst, int64_t n) {
-    // support inplace op
-    for (int64_t i = n - 1; i >= 0; i--) {
-        dst[i] = bf16_to_f32(src[i]);
-    }
-}
-
 void f8_e4m3_to_f16_vec(uint8_t* src, uint16_t* dst, int64_t n) {
     // support inplace op
     for (int64_t i = n - 1; i >= 0; i--) {
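The surviving *_vec converters walk the buffer from the last element to the first so the conversion can run in place: the widened value for index i lands at or beyond the bytes it was read from, so no not-yet-read input is overwritten. A minimal sketch of that pattern with an assumed 8-bit-to-16-bit widening (illustrative only, not the repository's converter):

```cpp
#include <cstdint>
#include <cstring>
#include <cstdio>
#include <vector>

// In-place widening: the first n bytes of the buffer hold uint8_t input,
// and the same buffer receives n uint16_t outputs. Iterating backwards
// keeps every read at or ahead of the corresponding write.
static void widen_u8_to_u16_vec(uint8_t* src, uint16_t* dst, int64_t n) {
    for (int64_t i = n - 1; i >= 0; i--) {
        dst[i] = static_cast<uint16_t>(src[i]);
    }
}

int main() {
    std::vector<uint16_t> buf(3);                   // 6 bytes of storage
    const uint8_t input[3] = {1, 2, 3};
    std::memcpy(buf.data(), input, sizeof(input));  // narrow input packed at the front
    widen_u8_to_u16_vec(reinterpret_cast<uint8_t*>(buf.data()), buf.data(), 3);
    std::printf("%u %u %u\n", (unsigned)buf[0], (unsigned)buf[1], (unsigned)buf[2]);  // 1 2 3
    return 0;
}
```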
@@ -495,7 +483,7 @@ ggml_type str_to_ggml_type(const std::string& dtype) {
     if (dtype == "F16") {
         ttype = GGML_TYPE_F16;
     } else if (dtype == "BF16") {
-        ttype = GGML_TYPE_F32;
+        ttype = GGML_TYPE_BF16;
    } else if (dtype == "F32") {
        ttype = GGML_TYPE_F32;
    } else if (dtype == "F64") {
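With this change, a safetensors dtype of "BF16" now maps to ggml's native bf16 tensor type, so elements stay 2 bytes wide instead of being widened to 4-byte f32 at load time. A quick hedged check of that size relationship against the ggml API (assuming ggml.h is available on the include path):

```cpp
#include <cstdio>
#include "ggml.h"

int main() {
    // bf16 keeps its on-disk element size; mapping it to f32 would double it.
    std::printf("bf16: %zu bytes, f16: %zu bytes, f32: %zu bytes\n",
                ggml_type_size(GGML_TYPE_BF16),
                ggml_type_size(GGML_TYPE_F16),
                ggml_type_size(GGML_TYPE_F32));
    // expected: bf16: 2 bytes, f16: 2 bytes, f32: 4 bytes
    return 0;
}
```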
@@ -623,10 +611,7 @@ bool ModelLoader::init_from_safetensors_file(const std::string& file_path, const
 
         size_t tensor_data_size = end - begin;
 
-        if (dtype == "BF16") {
-            tensor_storage.is_bf16 = true;
-            GGML_ASSERT(tensor_storage.nbytes() == tensor_data_size * 2);
-        } else if (dtype == "F8_E4M3") {
+        if (dtype == "F8_E4M3") {
             tensor_storage.is_f8_e4m3 = true;
             // f8 -> f16
             GGML_ASSERT(tensor_storage.nbytes() == tensor_data_size * 2);
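A note on why the removed BF16 branch is no longer needed: the old code tracked bf16 tensors as GGML_TYPE_F32, so nbytes() was twice the on-disk payload and the special assert compared against tensor_data_size * 2; with the type recorded as GGML_TYPE_BF16, nbytes() matches the payload directly. A small worked check for a hypothetical 4096x4096 bf16 tensor:

```cpp
#include <cstdint>
#include <cstdio>

int main() {
    const int64_t nelements        = 4096LL * 4096LL;
    const int64_t tensor_data_size = nelements * 2;  // bf16 payload stored in the file
    const int64_t nbytes_as_f32    = nelements * 4;  // old bookkeeping (GGML_TYPE_F32)
    const int64_t nbytes_as_bf16   = nelements * 2;  // new bookkeeping (GGML_TYPE_BF16)
    std::printf("old special-case assert holds: %d\n", nbytes_as_f32 == tensor_data_size * 2);  // 1
    std::printf("new sizes match directly:      %d\n", nbytes_as_bf16 == tensor_data_size);     // 1
    return 0;
}
```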
@@ -1522,9 +1507,7 @@ bool ModelLoader::load_tensors(on_new_tensor_cb_t on_new_tensor_cb, int n_thread
             read_time_ms.fetch_add(t1 - t0);
 
             t0 = ggml_time_ms();
-            if (tensor_storage.is_bf16) {
-                bf16_to_f32_vec((uint16_t*)read_buf, (float*)target_buf, tensor_storage.nelements());
-            } else if (tensor_storage.is_f8_e4m3) {
+            if (tensor_storage.is_f8_e4m3) {
                 f8_e4m3_to_f16_vec((uint8_t*)read_buf, (uint16_t*)target_buf, tensor_storage.nelements());
             } else if (tensor_storage.is_f8_e5m2) {
                 f8_e5m2_to_f16_vec((uint8_t*)read_buf, (uint16_t*)target_buf, tensor_storage.nelements());
model.h (7 changed lines)
@@ -168,7 +168,6 @@ struct TensorStorage {
     std::string name;
     ggml_type type = GGML_TYPE_F32;
     ggml_type expected_type = GGML_TYPE_COUNT;
-    bool is_bf16 = false;
     bool is_f8_e4m3 = false;
     bool is_f8_e5m2 = false;
     bool is_f64 = false;
@@ -202,7 +201,7 @@ struct TensorStorage {
     }
 
     int64_t nbytes_to_read() const {
-        if (is_bf16 || is_f8_e4m3 || is_f8_e5m2) {
+        if (is_f8_e4m3 || is_f8_e5m2) {
             return nbytes() / 2;
         } else if (is_f64 || is_i64) {
             return nbytes() * 2;
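For context on nbytes_to_read(): nbytes() describes the in-memory size after any conversion, while the file stores the original dtype, so f8 tensors (converted up to f16) need half of nbytes() read from disk, f64/i64 tensors (converted down to f32/i32) need twice nbytes(), and bf16 now drops out of the special cases because it is read at its native size. A standalone sketch of that rule (the struct mirrors the fields above but is illustrative, not the repository code):

```cpp
#include <cstdint>
#include <cstdio>

// Mirrors the shape of the rule above for illustration only.
struct TensorSizeInfo {
    int64_t nbytes  = 0;      // in-memory size after any conversion
    bool is_f8_e4m3 = false;
    bool is_f8_e5m2 = false;
    bool is_f64     = false;
    bool is_i64     = false;
};

// On-disk bytes to read for one tensor, given its in-memory representation.
static int64_t nbytes_to_read(const TensorSizeInfo& t) {
    if (t.is_f8_e4m3 || t.is_f8_e5m2) {
        return t.nbytes / 2;  // 1 byte per element on disk -> 2 bytes (f16) in memory
    } else if (t.is_f64 || t.is_i64) {
        return t.nbytes * 2;  // 8 bytes per element on disk -> 4 bytes (f32/i32) in memory
    }
    return t.nbytes;          // f32, f16, bf16, quantized: read at the stored size
}

int main() {
    TensorSizeInfo f8;
    f8.nbytes     = 2048;     // 1024 f8 elements held as f16 in memory
    f8.is_f8_e4m3 = true;
    std::printf("bytes to read: %lld\n", (long long)nbytes_to_read(f8));  // 1024
    return 0;
}
```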
@@ -250,9 +249,7 @@ struct TensorStorage {
     std::string to_string() const {
         std::stringstream ss;
         const char* type_name = ggml_type_name(type);
-        if (is_bf16) {
-            type_name = "bf16";
-        } else if (is_f8_e4m3) {
+        if (is_f8_e4m3) {
             type_name = "f8_e4m3";
         } else if (is_f8_e5m2) {
             type_name = "f8_e5m2";
@@ -307,13 +307,6 @@ public:
         }
 
-        auto& tensor_storage_map = model_loader.get_tensor_storage_map();
-        for (auto& [name, tensor_storage] : tensor_storage_map) {
-            if (contains(name, "llm") && ends_with(name, "weight") && (tensor_storage.type == GGML_TYPE_F32 || tensor_storage.type == GGML_TYPE_BF16)) {
-                tensor_storage.expected_type = GGML_TYPE_F16;
-            }
-        }
-
         LOG_INFO("Version: %s ", model_version_to_str[version]);
         ggml_type wtype = (int)sd_ctx_params->wtype < std::min<int>(SD_TYPE_COUNT, GGML_TYPE_COUNT)