mirror of https://github.com/leejet/stable-diffusion.cpp.git
synced 2025-12-13 05:48:56 +00:00

Compare commits: 26f3f61d37 ... bd1eaef93e (2 commits)

Commits in range:
  bd1eaef93e
  ab835f7d39
ggml_extend.hpp

@@ -840,6 +840,7 @@ __STATIC_INLINE__ struct ggml_tensor* ggml_nn_attention_ext(struct ggml_context*
     float scale = (1.0f / sqrt((float)d_head));
 
+    int kv_pad = 0;
     //if (flash_attn) {
     //    LOG_DEBUG("attention_ext L_q:%d L_k:%d n_head:%d C:%d d_head:%d N:%d", L_q, L_k, n_head, C, d_head, N);
     //}
@@ -847,11 +848,26 @@ __STATIC_INLINE__ struct ggml_tensor* ggml_nn_attention_ext(struct ggml_context*
     GGML_ASSERT(((L_k % 256 == 0) && L_q == L_k) || !(L_k % 256 == 0));
 
     bool can_use_flash_attn = true;
+    can_use_flash_attn = can_use_flash_attn && (
+        d_head == 64 ||
+        d_head == 80 ||
+        d_head == 96 ||
+        d_head == 112 ||
+        d_head == 128 ||
+        d_head == 256
+    );
+#if 0
     can_use_flash_attn = can_use_flash_attn && L_k % 256 == 0;
-    can_use_flash_attn = can_use_flash_attn && d_head % 64 == 0; // double check
-
-    // cuda max d_head seems to be 256, cpu does seem to work with 512
-    can_use_flash_attn = can_use_flash_attn && d_head <= 256; // double check
+#else
+    if (can_use_flash_attn && L_k % 256 != 0) {
+        // TODO(Green-Sky): might be worth just padding by default
+        if (L_k == 77 || L_k == 4208 || L_k == 3952) {
+            kv_pad = GGML_PAD(L_k, 256) - L_k;
+        } else {
+            can_use_flash_attn = false;
+        }
+    }
+#endif
 
     if (mask != nullptr) {
         // TODO(Green-Sky): figure out if we can bend t5 to work too
@@ -864,11 +880,18 @@ __STATIC_INLINE__ struct ggml_tensor* ggml_nn_attention_ext(struct ggml_context*
     ggml_tensor* kqv = nullptr;
     // GGML_ASSERT((flash_attn && can_use_flash_attn) || !flash_attn);
     if (can_use_flash_attn && flash_attn) {
-        // LOG_DEBUG("using flash attention");
+        //LOG_DEBUG(" uses flash attention");
+        if (kv_pad != 0) {
+            //LOG_DEBUG(" padding k and v dim1 by %d", kv_pad);
+            k = ggml_pad(ctx, k, 0, kv_pad, 0, 0);
+        }
         k = ggml_cast(ctx, k, GGML_TYPE_F16);
 
         v = ggml_cont(ctx, ggml_permute(ctx, v, 0, 2, 1, 3)); // [N, n_head, L_k, d_head]
         v = ggml_reshape_3d(ctx, v, d_head, L_k, n_head * N); // [N * n_head, L_k, d_head]
+        if (kv_pad != 0) {
+            v = ggml_pad(ctx, v, 0, kv_pad, 0, 0);
+        }
         v = ggml_cast(ctx, v, GGML_TYPE_F16);
 
         if (mask != nullptr) {
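Note: the padding only kicks in for a whitelist of k/v lengths and rounds them up to the next multiple of 256. A standalone sketch of the arithmetic (GGML_PAD is redefined locally so the example compiles without ggml.h, with the same round-up semantics for power-of-two n; the interpretation of the three lengths is an assumption, not stated in the diff):

#include <cstdio>
#include <initializer_list>

// Round x up to the next multiple of n (n must be a power of two).
#define GGML_PAD(x, n) (((x) + (n) - 1) & ~((n) - 1))

int main() {
    // The three whitelisted lengths from the patch; 77 is presumably the
    // CLIP text context length, the other two observed k/v lengths.
    for (int L_k : {77, 4208, 3952}) {
        int kv_pad = GGML_PAD(L_k, 256) - L_k;
        printf("L_k=%4d -> padded L_k=%4d, kv_pad=%3d\n", L_k, L_k + kv_pad, kv_pad);
    }
    // L_k=  77 -> padded L_k= 256, kv_pad=179
    // L_k=4208 -> padded L_k=4352, kv_pad=144
    // L_k=3952 -> padded L_k=4096, kv_pad=144
    return 0;
}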
model.cpp (52 lines changed)

@@ -815,6 +815,7 @@ void f8_e4m3_to_f16_vec(uint8_t* src, uint16_t* dst, int64_t n) {
         dst[i] = f8_e4m3_to_f16(src[i]);
     }
 }
 
 void f8_e5m2_to_f16_vec(uint8_t* src, uint16_t* dst, int64_t n) {
     // support inplace op
     for (int64_t i = n - 1; i >= 0; i--) {
@@ -822,6 +823,20 @@ void f8_e5m2_to_f16_vec(uint8_t* src, uint16_t* dst, int64_t n) {
     }
 }
 
+void f64_to_f32_vec(double* src, float* dst, int64_t n) {
+    // support inplace op
+    for (int64_t i = 0; i < n; i++) {
+        dst[i] = (float)src[i];
+    }
+}
+
+void i64_to_i32_vec(int64_t* src, int32_t* dst, int64_t n) {
+    // support inplace op
+    for (int64_t i = 0; i < n; i++) {
+        dst[i] = (int32_t)src[i];
+    }
+}
+
 void convert_tensor(void* src,
                     ggml_type src_type,
                     void* dst,
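Note: both new helpers tolerate src and dst aliasing the same buffer, which is why they walk forward: in a narrowing conversion, dst[i] (bytes 4*i..4*i+3) never lands past src[i] (bytes 8*i..8*i+7), so every slot is read before it can be overwritten. The widening f8 helpers above walk backward for the mirror-image reason. A standalone sketch with hypothetical values (helper body copied from the patch):

#include <cstdint>
#include <cstdio>
#include <cstring>

void f64_to_f32_vec(double* src, float* dst, int64_t n) {
    // support inplace op
    for (int64_t i = 0; i < n; i++) {
        dst[i] = (float)src[i];
    }
}

int main() {
    // One buffer reused for input and output, as load_tensors() does.
    unsigned char buf[4 * sizeof(double)];
    const double in[4] = {1.5, -2.25, 3.0, 4e6};
    std::memcpy(buf, in, sizeof(in));

    f64_to_f32_vec((double*)buf, (float*)buf, 4); // in place

    float out[4];
    std::memcpy(out, buf, sizeof(out)); // first 16 bytes now hold the f32s
    printf("%g %g %g %g\n", out[0], out[1], out[2], out[3]); // 1.5 -2.25 3 4e+06
    return 0;
}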
@@ -1057,13 +1072,13 @@ ggml_type str_to_ggml_type(const std::string& dtype) {
     } else if (dtype == "F32") {
         ttype = GGML_TYPE_F32;
     } else if (dtype == "F64") {
-        ttype = GGML_TYPE_F64;
+        ttype = GGML_TYPE_F32;
     } else if (dtype == "F8_E4M3") {
         ttype = GGML_TYPE_F16;
     } else if (dtype == "F8_E5M2") {
         ttype = GGML_TYPE_F16;
     } else if (dtype == "I64") {
-        ttype = GGML_TYPE_I64;
+        ttype = GGML_TYPE_I32;
     }
     return ttype;
 }
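Note: with these two changed returns, every on-disk dtype the loader prefers not to hold directly now maps to the type it will be converted to at load time (summary, not code from the patch):

// "F64"     -> GGML_TYPE_F32  (converted by f64_to_f32_vec)
// "I64"     -> GGML_TYPE_I32  (converted by i64_to_i32_vec)
// "F8_E4M3" -> GGML_TYPE_F16  (unchanged; f8_e4m3_to_f16_vec)
// "F8_E5M2" -> GGML_TYPE_F16  (unchanged; f8_e5m2_to_f16_vec)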
@@ -1185,6 +1200,14 @@ bool ModelLoader::init_from_safetensors_file(const std::string& file_path, const
             tensor_storage.is_f8_e5m2 = true;
             // f8 -> f16
             GGML_ASSERT(tensor_storage.nbytes() == tensor_data_size * 2);
+        } else if (dtype == "F64") {
+            tensor_storage.is_f64 = true;
+            // f64 -> f32
+            GGML_ASSERT(tensor_storage.nbytes() * 2 == tensor_data_size);
+        } else if (dtype == "I64") {
+            tensor_storage.is_i64 = true;
+            // i64 -> i32
+            GGML_ASSERT(tensor_storage.nbytes() * 2 == tensor_data_size);
         } else {
             GGML_ASSERT(tensor_storage.nbytes() == tensor_data_size);
         }
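Note: nbytes() is computed from the converted storage type, while tensor_data_size is the raw safetensors payload, so the factor of two sits on opposite sides for f8 versus f64/i64. Worked numbers for a hypothetical 1000-element tensor:

// F8_E5M2: stored as F16 -> nbytes() = 2000, payload = 1000
//          => nbytes() == tensor_data_size * 2
// F64:     stored as F32 -> nbytes() = 4000, payload = 8000
//          => nbytes() * 2 == tensor_data_size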
@@ -1945,7 +1968,12 @@ bool ModelLoader::load_tensors(on_new_tensor_cb_t on_new_tensor_cb, ggml_backend
             // for the CPU and Metal backend, we can copy directly into the tensor
             if (tensor_storage.type == dst_tensor->type) {
                 GGML_ASSERT(ggml_nbytes(dst_tensor) == tensor_storage.nbytes());
+                if (tensor_storage.is_f64 || tensor_storage.is_i64) {
+                    read_buffer.resize(tensor_storage.nbytes_to_read());
+                    read_data(tensor_storage, (char*)read_buffer.data(), nbytes_to_read);
+                } else {
                     read_data(tensor_storage, (char*)dst_tensor->data, nbytes_to_read);
+                }
 
                 if (tensor_storage.is_bf16) {
                     // inplace op
@@ -1956,9 +1984,13 @@ bool ModelLoader::load_tensors(on_new_tensor_cb_t on_new_tensor_cb, ggml_backend
                 } else if (tensor_storage.is_f8_e5m2) {
                     // inplace op
                     f8_e5m2_to_f16_vec((uint8_t*)dst_tensor->data, (uint16_t*)dst_tensor->data, tensor_storage.nelements());
+                } else if (tensor_storage.is_f64) {
+                    f64_to_f32_vec((double*)read_buffer.data(), (float*)dst_tensor->data, tensor_storage.nelements());
+                } else if (tensor_storage.is_i64) {
+                    i64_to_i32_vec((int64_t*)read_buffer.data(), (int32_t*)dst_tensor->data, tensor_storage.nelements());
                 }
             } else {
-                read_buffer.resize(tensor_storage.nbytes());
+                read_buffer.resize(std::max(tensor_storage.nbytes(), tensor_storage.nbytes_to_read()));
                 read_data(tensor_storage, (char*)read_buffer.data(), nbytes_to_read);
 
                 if (tensor_storage.is_bf16) {
@@ -1970,13 +2002,19 @@ bool ModelLoader::load_tensors(on_new_tensor_cb_t on_new_tensor_cb, ggml_backend
                 } else if (tensor_storage.is_f8_e5m2) {
                     // inplace op
                     f8_e5m2_to_f16_vec((uint8_t*)read_buffer.data(), (uint16_t*)read_buffer.data(), tensor_storage.nelements());
+                } else if (tensor_storage.is_f64) {
+                    // inplace op
+                    f64_to_f32_vec((double*)read_buffer.data(), (float*)read_buffer.data(), tensor_storage.nelements());
+                } else if (tensor_storage.is_i64) {
+                    // inplace op
+                    i64_to_i32_vec((int64_t*)read_buffer.data(), (int32_t*)read_buffer.data(), tensor_storage.nelements());
                 }
 
                 convert_tensor((void*)read_buffer.data(), tensor_storage.type, dst_tensor->data,
                                dst_tensor->type, (int)tensor_storage.nelements() / (int)tensor_storage.ne[0], (int)tensor_storage.ne[0]);
             }
         } else {
-            read_buffer.resize(tensor_storage.nbytes());
+            read_buffer.resize(std::max(tensor_storage.nbytes(), tensor_storage.nbytes_to_read()));
             read_data(tensor_storage, (char*)read_buffer.data(), nbytes_to_read);
 
             if (tensor_storage.is_bf16) {
@@ -1988,6 +2026,12 @@ bool ModelLoader::load_tensors(on_new_tensor_cb_t on_new_tensor_cb, ggml_backend
             } else if (tensor_storage.is_f8_e5m2) {
                 // inplace op
                 f8_e5m2_to_f16_vec((uint8_t*)read_buffer.data(), (uint16_t*)read_buffer.data(), tensor_storage.nelements());
+            } else if (tensor_storage.is_f64) {
+                // inplace op
+                f64_to_f32_vec((double*)read_buffer.data(), (float*)read_buffer.data(), tensor_storage.nelements());
+            } else if (tensor_storage.is_i64) {
+                // inplace op
+                i64_to_i32_vec((int64_t*)read_buffer.data(), (int32_t*)read_buffer.data(), tensor_storage.nelements());
             }
 
             if (tensor_storage.type == dst_tensor->type) {
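Note: in all branches read_buffer must now hold whichever is larger, the raw file payload (nbytes_to_read()) or the converted in-memory data (nbytes()). For bf16/f8 the converted data is larger; for f64/i64 the payload is larger, so the old resize(nbytes()) would have let read_data() overrun the buffer. Illustrative sizes for 1000 elements:

// bf16: nbytes() = 4000 > nbytes_to_read() = 2000 -> max = 4000
// f64:  nbytes() = 4000 < nbytes_to_read() = 8000 -> max = 8000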
model.h (8 lines changed)

@@ -102,6 +102,8 @@ struct TensorStorage {
     bool is_bf16 = false;
     bool is_f8_e4m3 = false;
     bool is_f8_e5m2 = false;
+    bool is_f64 = false;
+    bool is_i64 = false;
     int64_t ne[SD_MAX_DIMS] = {1, 1, 1, 1, 1};
     int n_dims = 0;
 
@@ -133,6 +135,8 @@ struct TensorStorage {
     int64_t nbytes_to_read() const {
         if (is_bf16 || is_f8_e4m3 || is_f8_e5m2) {
             return nbytes() / 2;
+        } else if (is_f64 || is_i64) {
+            return nbytes() * 2;
         } else {
             return nbytes();
         }
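Note: a standalone mirror of the method's three branches (hypothetical free function, illustrative sizes):

#include <cstdint>
#include <cstdio>

// nb is the converted in-memory size; the flags describe the on-disk format.
int64_t nbytes_to_read_sketch(int64_t nb, bool half_width_src, bool double_width_src) {
    if (half_width_src) {
        return nb / 2; // bf16 -> f32, f8 -> f16: file holds half the bytes
    } else if (double_width_src) {
        return nb * 2; // f64 -> f32, i64 -> i32: file holds twice the bytes
    } else {
        return nb;     // stored as-is
    }
}

int main() {
    printf("%lld\n", (long long)nbytes_to_read_sketch(4000, true, false));  // 2000
    printf("%lld\n", (long long)nbytes_to_read_sketch(4000, false, true));  // 8000
    printf("%lld\n", (long long)nbytes_to_read_sketch(4000, false, false)); // 4000
    return 0;
}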
@@ -183,6 +187,10 @@ struct TensorStorage {
             type_name = "f8_e4m3";
         } else if (is_f8_e5m2) {
             type_name = "f8_e5m2";
+        } else if (is_f64) {
+            type_name = "f64";
+        } else if (is_i64) {
+            type_name = "i64";
         }
         ss << name << " | " << type_name << " | ";
         ss << n_dims << " [";