mirror of https://github.com/leejet/stable-diffusion.cpp.git
synced 2025-12-12 13:28:37 +00:00

* Feat: handle compute failures more gracefully
* fix unreachable code after return
* adjust z_image.hpp

Co-authored-by: idostyle <idostyl3@googlemail.com>
Co-authored-by: leejet <leejet714@gmail.com>

2599 lines | 98 KiB | C++
#ifndef __GGML_EXTEND_HPP__
#define __GGML_EXTEND_HPP__

#include <assert.h>
#include <inttypes.h>
#include <stdarg.h>
#include <algorithm>
#include <cstring>
#include <fstream>
#include <functional>
#include <iostream>
#include <iterator>
#include <map>
#include <memory>
#include <random>
#include <regex>
#include <set>
#include <sstream>
#include <string>
#include <unordered_map>
#include <vector>

#include "ggml-alloc.h"
#include "ggml-backend.h"
#include "ggml-cpu.h"
#include "ggml.h"

#include "model.h"

#ifdef SD_USE_CUDA
#include "ggml-cuda.h"
#endif

#ifdef SD_USE_METAL
#include "ggml-metal.h"
#endif

#ifdef SD_USE_VULKAN
#include "ggml-vulkan.h"
#endif

#ifdef SD_USE_OPENCL
#include "ggml-opencl.h"
#endif

#ifdef SD_USE_SYCL
#include "ggml-sycl.h"
#endif

#include "rng.hpp"
#include "util.h"

#define EPS 1e-05f

#ifndef __STATIC_INLINE__
#define __STATIC_INLINE__ static inline
#endif

#ifndef SD_UNUSED
#define SD_UNUSED(x) (void)(x)
#endif

__STATIC_INLINE__ void ggml_log_callback_default(ggml_log_level level, const char* text, void*) {
    switch (level) {
        case GGML_LOG_LEVEL_DEBUG:
            LOG_DEBUG(text);
            break;
        case GGML_LOG_LEVEL_INFO:
            LOG_INFO(text);
            break;
        case GGML_LOG_LEVEL_WARN:
            LOG_WARN(text);
            break;
        case GGML_LOG_LEVEL_ERROR:
            LOG_ERROR(text);
            break;
        default:
            LOG_DEBUG(text);
    }
}

static_assert(GGML_MAX_NAME >= 128, "GGML_MAX_NAME must be at least 128");

// n-mode tensor-matrix product
// example: 2-mode product
// A: [ne03, k, ne01, ne00]
// B: k rows, m columns => [k, m]
// result is [ne03, m, ne01, ne00]
__STATIC_INLINE__ struct ggml_tensor* ggml_ext_mul_n_mode(struct ggml_context* ctx, struct ggml_tensor* a, struct ggml_tensor* b, int mode = 0) {
    // reshape A: swap the 0th and the mode-th axis
    a = ggml_cont(ctx, ggml_permute(ctx, a, mode, mode != 1 ? 1 : 0, mode != 2 ? 2 : 0, mode != 3 ? 3 : 0));
    int64_t ne1 = a->ne[1];
    int64_t ne2 = a->ne[2];
    int64_t ne3 = a->ne[3];
    // make 2D
    a = ggml_cont(ctx, ggml_reshape_2d(ctx, a, a->ne[0], (ne3 * ne2 * ne1)));

    struct ggml_tensor* result = ggml_cont(ctx, ggml_transpose(ctx, ggml_mul_mat(ctx, a, b)));

    // reshape output (same shape as a after permutation except first dim)
    result = ggml_reshape_4d(ctx, result, result->ne[0], ne1, ne2, ne3);
    // swap back the 0th and the mode-th axis
    result = ggml_permute(ctx, result, mode, mode != 1 ? 1 : 0, mode != 2 ? 2 : 0, mode != 3 ? 3 : 0);
    return result;
}
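
// Usage sketch (illustrative), shapes written [ne3, ne2, ne1, ne0] as in the
// comment above:
//   ggml_tensor* A = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, 8, 16, 4, 2); // [2, k=4, 16, 8]
//   ggml_tensor* B = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 4, 32);       // [k=4, m=32]
//   ggml_tensor* C = ggml_ext_mul_n_mode(ctx, A, B, 2);                   // [2, m=32, 16, 8]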

__STATIC_INLINE__ struct ggml_tensor* ggml_ext_merge_lora(ggml_context* ctx,
                                                          ggml_tensor* lora_down,
                                                          ggml_tensor* lora_up,
                                                          ggml_tensor* lora_mid = nullptr) {
    struct ggml_tensor* updown;
    // flatten the lora tensors so they can be matrix-multiplied
    int64_t lora_up_rows = lora_up->ne[ggml_n_dims(lora_up) - 1];
    lora_up = ggml_reshape_2d(ctx, lora_up, ggml_nelements(lora_up) / lora_up_rows, lora_up_rows);
    auto lora_down_n_dims = ggml_n_dims(lora_down);
    // round n_dims up to an even value, so a rank-1 lora_down (whose trailing
    // dimension is 1) still exposes its rank dimension
    lora_down_n_dims = (lora_down_n_dims + lora_down_n_dims % 2);
    int64_t lora_down_rows = lora_down->ne[lora_down_n_dims - 1];
    lora_down = ggml_reshape_2d(ctx, lora_down, ggml_nelements(lora_down) / lora_down_rows, lora_down_rows);

    // ggml_mul_mat requires tensor b transposed
    lora_down = ggml_cont(ctx, ggml_transpose(ctx, lora_down));
    if (lora_mid == nullptr) {
        updown = ggml_mul_mat(ctx, lora_up, lora_down);
        updown = ggml_cont(ctx, ggml_transpose(ctx, updown));
    } else {
        // undoing tucker decomposition for conv layers.
        // lora_mid has shape (3, 3, Rank, Rank)
        // lora_down has shape (Rank, In, 1, 1)
        // lora_up has shape (Rank, Out, 1, 1)
        // conv layer shape is (3, 3, Out, In)
        updown = ggml_ext_mul_n_mode(ctx, ggml_ext_mul_n_mode(ctx, lora_mid, lora_down, 3), lora_up, 2);
        updown = ggml_cont(ctx, updown);
    }
    return updown;
}
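
// Typical use (sketch): the merged delta produced here is applied to the base
// weight as W' = W + (alpha / rank) * updown, with alpha and rank taken from
// the LoRA file; this helper only computes up * down (optionally through the
// Tucker core lora_mid), not the final addition.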

// Kronecker product
// [ne03,ne02,ne01,ne00] x [ne13,ne12,ne11,ne10] => [ne03*ne13,ne02*ne12,ne01*ne11,ne00*ne10]
__STATIC_INLINE__ struct ggml_tensor* ggml_ext_kronecker(ggml_context* ctx, struct ggml_tensor* a, struct ggml_tensor* b) {
    return ggml_mul(ctx,
                    ggml_interpolate(ctx,
                                     a,
                                     a->ne[0] * b->ne[0],
                                     a->ne[1] * b->ne[1],
                                     a->ne[2] * b->ne[2],
                                     a->ne[3] * b->ne[3],
                                     GGML_SCALE_MODE_NEAREST),
                    b);
}
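
// Worked 2-D example: a = [[1, 2]] (one row), b = I2. Nearest-neighbour
// upsampling repeats each a[i][j] over a 2x2 block, and the broadcast multiply
// keeps one tiled copy of b per block:
//   kron(a, b) = [[1, 0, 2, 0],
//                 [0, 1, 0, 2]]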

__STATIC_INLINE__ void ggml_ext_im_set_randn_f32(struct ggml_tensor* tensor, std::shared_ptr<RNG> rng) {
    uint32_t n = (uint32_t)ggml_nelements(tensor);
    std::vector<float> random_numbers = rng->randn(n);
    for (uint32_t i = 0; i < n; i++) {
        ggml_set_f32_1d(tensor, i, random_numbers[i]);
    }
}

__STATIC_INLINE__ void ggml_ext_tensor_set_f32(struct ggml_tensor* tensor, float value, int i0, int i1 = 0, int i2 = 0, int i3 = 0) {
    GGML_ASSERT(tensor->nb[0] == sizeof(float));
    *(float*)((char*)(tensor->data) + i3 * tensor->nb[3] + i2 * tensor->nb[2] + i1 * tensor->nb[1] + i0 * tensor->nb[0]) = value;
}

__STATIC_INLINE__ float ggml_ext_tensor_get_f32(const ggml_tensor* tensor, int i0, int i1 = 0, int i2 = 0, int i3 = 0) {
    if (tensor->buffer != nullptr) {
        float value;
        ggml_backend_tensor_get(tensor, &value, i3 * tensor->nb[3] + i2 * tensor->nb[2] + i1 * tensor->nb[1] + i0 * tensor->nb[0], sizeof(float));
        return value;
    }
    GGML_ASSERT(tensor->nb[0] == sizeof(float));
    return *(float*)((char*)(tensor->data) + i3 * tensor->nb[3] + i2 * tensor->nb[2] + i1 * tensor->nb[1] + i0 * tensor->nb[0]);
}

__STATIC_INLINE__ int ggml_ext_tensor_get_i32(const ggml_tensor* tensor, int i0, int i1 = 0, int i2 = 0, int i3 = 0) {
    if (tensor->buffer != nullptr) {
        int value;
        ggml_backend_tensor_get(tensor, &value, i3 * tensor->nb[3] + i2 * tensor->nb[2] + i1 * tensor->nb[1] + i0 * tensor->nb[0], sizeof(int));
        return value;
    }
    GGML_ASSERT(tensor->nb[0] == sizeof(int));
    return *(int*)((char*)(tensor->data) + i3 * tensor->nb[3] + i2 * tensor->nb[2] + i1 * tensor->nb[1] + i0 * tensor->nb[0]);
}

__STATIC_INLINE__ ggml_fp16_t ggml_ext_tensor_get_f16(const ggml_tensor* tensor, int i0, int i1 = 0, int i2 = 0, int i3 = 0) {
    GGML_ASSERT(tensor->nb[0] == sizeof(ggml_fp16_t));
    return *(ggml_fp16_t*)((char*)(tensor->data) + i3 * tensor->nb[3] + i2 * tensor->nb[2] + i1 * tensor->nb[1] + i0 * tensor->nb[0]);
}

__STATIC_INLINE__ float sd_image_get_f32(sd_image_t image, int iw, int ih, int ic, bool scale = true) {
    float value = *(image.data + ih * image.width * image.channel + iw * image.channel + ic);
    if (scale) {
        value /= 255.f;
    }
    return value;
}

__STATIC_INLINE__ float sd_image_get_f32(sd_image_f32_t image, int iw, int ih, int ic, bool scale = true) {
    float value = *(image.data + ih * image.width * image.channel + iw * image.channel + ic);
    if (scale) {
        value /= 255.f;
    }
    return value;
}

__STATIC_INLINE__ void print_ggml_tensor(struct ggml_tensor* tensor, bool shape_only = false, const char* mark = "") {
    printf("%s (%s): shape(%" PRId64 ", %" PRId64 ", %" PRId64 ", %" PRId64 ")\n", mark, ggml_type_name(tensor->type), tensor->ne[0], tensor->ne[1], tensor->ne[2], tensor->ne[3]);
    fflush(stdout);
    if (shape_only) {
        return;
    }
    int range = 3;
    for (int i3 = 0; i3 < tensor->ne[3]; i3++) {
        if (i3 >= range && i3 + range < tensor->ne[3]) {
            continue;
        }
        for (int i2 = 0; i2 < tensor->ne[2]; i2++) {
            if (i2 >= range && i2 + range < tensor->ne[2]) {
                continue;
            }
            for (int i1 = 0; i1 < tensor->ne[1]; i1++) {
                if (i1 >= range && i1 + range < tensor->ne[1]) {
                    continue;
                }
                for (int i0 = 0; i0 < tensor->ne[0]; i0++) {
                    if (i0 >= range && i0 + range < tensor->ne[0]) {
                        continue;
                    }
                    if (tensor->type == GGML_TYPE_F32) {
                        printf(" [%d, %d, %d, %d] = %f\n", i3, i2, i1, i0, ggml_ext_tensor_get_f32(tensor, i0, i1, i2, i3));
                    } else if (tensor->type == GGML_TYPE_F16) {
                        printf(" [%d, %d, %d, %d] = %f\n", i3, i2, i1, i0, ggml_fp16_to_fp32(ggml_ext_tensor_get_f16(tensor, i0, i1, i2, i3)));
                    } else if (tensor->type == GGML_TYPE_I32) {
                        printf(" [%d, %d, %d, %d] = %d\n", i3, i2, i1, i0, ggml_ext_tensor_get_i32(tensor, i0, i1, i2, i3));
                    }
                    fflush(stdout);
                }
            }
        }
    }
}

__STATIC_INLINE__ void ggml_ext_tensor_iter(
    ggml_tensor* tensor,
    const std::function<void(ggml_tensor*, int64_t, int64_t, int64_t, int64_t)>& fn) {
    int64_t n0 = tensor->ne[0];
    int64_t n1 = tensor->ne[1];
    int64_t n2 = tensor->ne[2];
    int64_t n3 = tensor->ne[3];

    for (int64_t i3 = 0; i3 < n3; i3++) {
        for (int64_t i2 = 0; i2 < n2; i2++) {
            for (int64_t i1 = 0; i1 < n1; i1++) {
                for (int64_t i0 = 0; i0 < n0; i0++) {
                    fn(tensor, i0, i1, i2, i3);
                }
            }
        }
    }
}

__STATIC_INLINE__ void ggml_ext_tensor_iter(
    ggml_tensor* tensor,
    const std::function<void(ggml_tensor*, int64_t)>& fn) {
    for (int64_t i = 0; i < ggml_nelements(tensor); i++) {
        fn(tensor, i);
    }
}
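
// Usage sketch (illustrative): double every element of an F32 tensor whose
// data lives in host memory:
//   ggml_ext_tensor_iter(t, [](ggml_tensor* t, int64_t i0, int64_t i1, int64_t i2, int64_t i3) {
//       float v = ggml_ext_tensor_get_f32(t, i0, i1, i2, i3);
//       ggml_ext_tensor_set_f32(t, 2.f * v, i0, i1, i2, i3);
//   });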

__STATIC_INLINE__ void ggml_ext_tensor_diff(
    ggml_tensor* a,
    ggml_tensor* b,
    float gap = 0.1f) {
    GGML_ASSERT(ggml_nelements(a) == ggml_nelements(b));
    ggml_ext_tensor_iter(a, [&](ggml_tensor* a, int64_t i0, int64_t i1, int64_t i2, int64_t i3) {
        float a_value = ggml_ext_tensor_get_f32(a, i0, i1, i2, i3);
        float b_value = ggml_ext_tensor_get_f32(b, i0, i1, i2, i3);
        if (fabsf(a_value - b_value) > gap) {
            LOG_WARN("[%" PRId64 ", %" PRId64 ", %" PRId64 ", %" PRId64 "] %f %f", i3, i2, i1, i0, a_value, b_value);
        }
    });
}

__STATIC_INLINE__ ggml_tensor* load_tensor_from_file(ggml_context* ctx, const std::string& file_path) {
    std::ifstream file(file_path, std::ios::binary);
    if (!file.is_open()) {
        LOG_ERROR("failed to open '%s'", file_path.c_str());
        return nullptr;
    }
    int32_t n_dims;
    int32_t length;
    int32_t ttype;

    file.read(reinterpret_cast<char*>(&n_dims), sizeof(n_dims));
    file.read(reinterpret_cast<char*>(&length), sizeof(length));
    file.read(reinterpret_cast<char*>(&ttype), sizeof(ttype));

    LOG_DEBUG("load_tensor_from_file %d %d %d", n_dims, length, ttype);

    if (file.eof()) {
        LOG_ERROR("incomplete file '%s'", file_path.c_str());
        return nullptr;
    }

    int32_t nelements = 1;
    int32_t ne[4] = {1, 1, 1, 1};
    for (int i = 0; i < n_dims; ++i) {
        file.read(reinterpret_cast<char*>(&ne[i]), sizeof(ne[i]));
        nelements *= ne[i];
    }
    SD_UNUSED(nelements);
    std::string name(length, 0);
    file.read(&name[0], length);
    ggml_tensor* tensor = ggml_new_tensor_4d(ctx, (ggml_type)ttype, ne[0], ne[1], ne[2], ne[3]);
    file.read(reinterpret_cast<char*>(tensor->data), ggml_nbytes(tensor));
    return tensor;
}

// __STATIC_INLINE__ void save_tensor_to_file(const std::string& file_name, ggml_tensor* tensor, const std::string& name) {
//     std::string file_name_ = file_name + ".tensor";
//     std::string name_ = name;
//     std::ofstream file("./" + file_name_, std::ios::binary);
//     file.write(reinterpret_cast<char*>(&tensor->n_dims), sizeof(tensor->n_dims));
//     int len = (int)name_.size();
//     file.write(reinterpret_cast<char*>(&len), sizeof(len));
//     int ttype = (int)tensor->type;
//     file.write(reinterpret_cast<char*>(&ttype), sizeof(ttype));
//     for (int i = 0; i < tensor->n_dims; ++i) {
//         int ne_ = (int)tensor->ne[i];
//         file.write(reinterpret_cast<char*>(&ne_), sizeof(ne_));
//     }
//     file.write(&name_[0], len);
//     char* data = nullptr;
//     file.write((char*)tensor->data, ggml_nbytes(tensor));
//     file.close();
// }

__STATIC_INLINE__ void copy_ggml_tensor(struct ggml_tensor* dst, struct ggml_tensor* src) {
    if (dst->type == src->type) {
        dst->nb[0] = src->nb[0];
        dst->nb[1] = src->nb[1];
        dst->nb[2] = src->nb[2];
        dst->nb[3] = src->nb[3];

        memcpy(((char*)dst->data), ((char*)src->data), ggml_nbytes(dst));
        return;
    }
    struct ggml_init_params params;
    params.mem_size = 10 * 1024 * 1024;  // for padding
    params.mem_buffer = nullptr;
    params.no_alloc = false;
    struct ggml_context* ctx = ggml_init(params);
    if (!ctx) {
        LOG_ERROR("ggml_init() failed");
        return;
    }
    ggml_tensor* final = ggml_cpy(ctx, src, dst);

    struct ggml_cgraph* graph = ggml_new_graph(ctx);
    ggml_build_forward_expand(graph, final);
    ggml_graph_compute_with_ctx(ctx, graph, 1);
    ggml_free(ctx);
}

__STATIC_INLINE__ float sigmoid(float x) {
    return 1 / (1.0f + expf(-x));
}

// SPECIAL OPERATIONS WITH TENSORS

__STATIC_INLINE__ uint8_t* ggml_tensor_to_sd_image(struct ggml_tensor* input, uint8_t* image_data = nullptr) {
    int64_t width = input->ne[0];
    int64_t height = input->ne[1];
    int64_t channels = input->ne[2];
    GGML_ASSERT(channels == 3 && input->type == GGML_TYPE_F32);
    if (image_data == nullptr) {
        image_data = (uint8_t*)malloc(width * height * channels);
    }
    for (int iy = 0; iy < height; iy++) {
        for (int ix = 0; ix < width; ix++) {
            for (int k = 0; k < channels; k++) {
                float value = ggml_ext_tensor_get_f32(input, ix, iy, k);
                *(image_data + iy * width * channels + ix * channels + k) = (uint8_t)(value * 255.0f);
            }
        }
    }
    return image_data;
}

__STATIC_INLINE__ uint8_t* ggml_tensor_to_sd_image(struct ggml_tensor* input, int idx, bool video = false) {
    int64_t width = input->ne[0];
    int64_t height = input->ne[1];
    int64_t channels;
    if (video) {
        channels = input->ne[3];
    } else {
        channels = input->ne[2];
    }
    GGML_ASSERT(channels == 3 && input->type == GGML_TYPE_F32);
    uint8_t* image_data = (uint8_t*)malloc(width * height * channels);
    for (int ih = 0; ih < height; ih++) {
        for (int iw = 0; iw < width; iw++) {
            for (int ic = 0; ic < channels; ic++) {
                float value;
                if (video) {
                    value = ggml_ext_tensor_get_f32(input, iw, ih, idx, ic);
                } else {
                    value = ggml_ext_tensor_get_f32(input, iw, ih, ic, idx);
                }
                *(image_data + ih * width * channels + iw * channels + ic) = (uint8_t)(value * 255.0f);
            }
        }
    }
    return image_data;
}

__STATIC_INLINE__ void sd_image_to_ggml_tensor(sd_image_t image,
                                               ggml_tensor* tensor,
                                               bool scale = true) {
    GGML_ASSERT(image.width == tensor->ne[0]);
    GGML_ASSERT(image.height == tensor->ne[1]);
    GGML_ASSERT(image.channel == tensor->ne[2]);
    GGML_ASSERT(1 == tensor->ne[3]);
    GGML_ASSERT(tensor->type == GGML_TYPE_F32);
    ggml_ext_tensor_iter(tensor, [&](ggml_tensor* tensor, int64_t i0, int64_t i1, int64_t i2, int64_t i3) {
        float value = sd_image_get_f32(image, i0, i1, i2, scale);
        ggml_ext_tensor_set_f32(tensor, value, i0, i1, i2, i3);
    });
}

__STATIC_INLINE__ void ggml_ext_tensor_apply_mask(struct ggml_tensor* image_data,
                                                  struct ggml_tensor* mask,
                                                  struct ggml_tensor* output,
                                                  float masked_value = 0.5f) {
    int64_t width = output->ne[0];
    int64_t height = output->ne[1];
    int64_t channels = output->ne[2];
    float rescale_mx = (float)mask->ne[0] / output->ne[0];
    float rescale_my = (float)mask->ne[1] / output->ne[1];
    GGML_ASSERT(output->type == GGML_TYPE_F32);
    for (int ix = 0; ix < width; ix++) {
        for (int iy = 0; iy < height; iy++) {
            int mx = (int)(ix * rescale_mx);
            int my = (int)(iy * rescale_my);
            float m = ggml_ext_tensor_get_f32(mask, mx, my);
            m = round(m);  // inpaint models need binary masks
            ggml_ext_tensor_set_f32(mask, m, mx, my);
            for (int k = 0; k < channels; k++) {
                float value = ggml_ext_tensor_get_f32(image_data, ix, iy, k);
                value = (1 - m) * (value - masked_value) + masked_value;
                ggml_ext_tensor_set_f32(output, value, ix, iy, k);
            }
        }
    }
}

__STATIC_INLINE__ void sd_image_f32_to_ggml_tensor(sd_image_f32_t image,
                                                   ggml_tensor* tensor,
                                                   bool scale = true) {
    GGML_ASSERT(image.width == tensor->ne[0]);
    GGML_ASSERT(image.height == tensor->ne[1]);
    GGML_ASSERT(image.channel == tensor->ne[2]);
    GGML_ASSERT(1 == tensor->ne[3]);
    GGML_ASSERT(tensor->type == GGML_TYPE_F32);
    ggml_ext_tensor_iter(tensor, [&](ggml_tensor* tensor, int64_t i0, int64_t i1, int64_t i2, int64_t i3) {
        float value = sd_image_get_f32(image, i0, i1, i2, scale);
        ggml_ext_tensor_set_f32(tensor, value, i0, i1, i2, i3);
    });
}

__STATIC_INLINE__ void ggml_ext_tensor_split_2d(struct ggml_tensor* input,
                                                struct ggml_tensor* output,
                                                int x,
                                                int y) {
    int64_t width = output->ne[0];
    int64_t height = output->ne[1];
    int64_t channels = output->ne[2];
    int64_t ne3 = output->ne[3];
    GGML_ASSERT(input->type == GGML_TYPE_F32 && output->type == GGML_TYPE_F32);
    for (int iy = 0; iy < height; iy++) {
        for (int ix = 0; ix < width; ix++) {
            for (int k = 0; k < channels; k++) {
                for (int l = 0; l < ne3; l++) {
                    float value = ggml_ext_tensor_get_f32(input, ix + x, iy + y, k, l);
                    ggml_ext_tensor_set_f32(output, value, ix, iy, k, l);
                }
            }
        }
    }
}

// smootherstep (Perlin): 6x^5 - 15x^4 + 10x^3. Its first and second derivatives
// vanish at x = 0 and x = 1, which avoids visible seams when blending
// overlapping tiles. Not clamped: expects x in [0, 1].
__STATIC_INLINE__ float smootherstep_f32(const float x) {
    GGML_ASSERT(x >= 0.f && x <= 1.f);
    return x * x * x * (x * (6.0f * x - 15.0f) + 10.0f);
}

__STATIC_INLINE__ void ggml_ext_tensor_merge_2d(struct ggml_tensor* input,
                                                struct ggml_tensor* output,
                                                int x,
                                                int y,
                                                int overlap_x,
                                                int overlap_y,
                                                int x_skip = 0,
                                                int y_skip = 0) {
    int64_t width = input->ne[0];
    int64_t height = input->ne[1];
    int64_t channels = input->ne[2];
    int64_t ne3 = input->ne[3];

    int64_t img_width = output->ne[0];
    int64_t img_height = output->ne[1];

    GGML_ASSERT(input->type == GGML_TYPE_F32 && output->type == GGML_TYPE_F32);
    for (int iy = y_skip; iy < height; iy++) {
        for (int ix = x_skip; ix < width; ix++) {
            for (int k = 0; k < channels; k++) {
                for (int l = 0; l < ne3; l++) {
                    float new_value = ggml_ext_tensor_get_f32(input, ix, iy, k, l);
                    if (overlap_x > 0 || overlap_y > 0) {  // blend colors in overlapped area
                        float old_value = ggml_ext_tensor_get_f32(output, x + ix, y + iy, k, l);

                        const float x_f_0 = (overlap_x > 0 && x > 0) ? (ix - x_skip) / float(overlap_x) : 1;
                        const float x_f_1 = (overlap_x > 0 && x < (img_width - width)) ? (width - ix) / float(overlap_x) : 1;
                        const float y_f_0 = (overlap_y > 0 && y > 0) ? (iy - y_skip) / float(overlap_y) : 1;
                        const float y_f_1 = (overlap_y > 0 && y < (img_height - height)) ? (height - iy) / float(overlap_y) : 1;

                        const float x_f = std::min(std::min(x_f_0, x_f_1), 1.f);
                        const float y_f = std::min(std::min(y_f_0, y_f_1), 1.f);

                        ggml_ext_tensor_set_f32(
                            output,
                            old_value + new_value * smootherstep_f32(y_f) * smootherstep_f32(x_f),
                            x + ix, y + iy, k, l);
                    } else {
                        ggml_ext_tensor_set_f32(output, new_value, x + ix, y + iy, k, l);
                    }
                }
            }
        }
    }
}

__STATIC_INLINE__ float ggml_ext_tensor_mean(struct ggml_tensor* src) {
    float mean = 0.0f;
    int64_t nelements = ggml_nelements(src);
    float* data = (float*)src->data;
    for (int i = 0; i < nelements; i++) {
        mean += data[i] / (float)nelements;
    }
    return mean;
}

// a = a + b
__STATIC_INLINE__ void ggml_ext_tensor_add_inplace(struct ggml_tensor* a, struct ggml_tensor* b) {
    GGML_ASSERT(ggml_nelements(a) == ggml_nelements(b));
    int64_t nelements = ggml_nelements(a);
    float* vec_a = (float*)a->data;
    float* vec_b = (float*)b->data;
    for (int i = 0; i < nelements; i++) {
        vec_a[i] = vec_a[i] + vec_b[i];
    }
}

__STATIC_INLINE__ void ggml_ext_tensor_scale_inplace(struct ggml_tensor* src, float scale) {
    int64_t nelements = ggml_nelements(src);
    float* data = (float*)src->data;
    for (int i = 0; i < nelements; i++) {
        data[i] = data[i] * scale;
    }
}

__STATIC_INLINE__ void ggml_ext_tensor_clamp_inplace(struct ggml_tensor* src, float min, float max) {
    int64_t nelements = ggml_nelements(src);
    float* data = (float*)src->data;
    for (int i = 0; i < nelements; i++) {
        float val = data[i];
        data[i] = val < min ? min : (val > max ? max : val);
    }
}

__STATIC_INLINE__ struct ggml_tensor* ggml_ext_tensor_concat(struct ggml_context* ctx,
                                                             struct ggml_tensor* a,
                                                             struct ggml_tensor* b,
                                                             int dim) {
    int64_t ne[GGML_MAX_DIMS];
    for (int d = 0; d < GGML_MAX_DIMS; ++d) {
        if (d == dim) {
            ne[d] = a->ne[d] + b->ne[d];
            continue;
        }
        GGML_ASSERT(a->ne[d] == b->ne[d]);
        ne[d] = a->ne[d];
    }
    struct ggml_tensor* result = ggml_new_tensor(ctx, a->type, GGML_MAX_DIMS, ne);
    int64_t o[4] = {0, 0, 0, 0};
    o[dim] = a->ne[dim];

    float v;
    for (int i3 = 0; i3 < result->ne[3]; i3++) {
        for (int i2 = 0; i2 < result->ne[2]; i2++) {
            for (int i1 = 0; i1 < result->ne[1]; i1++) {
                for (int i0 = 0; i0 < result->ne[0]; i0++) {
                    if (i0 < a->ne[0] && i1 < a->ne[1] && i2 < a->ne[2] && i3 < a->ne[3]) {
                        v = ggml_ext_tensor_get_f32(a, i0, i1, i2, i3);
                    } else {
                        v = ggml_ext_tensor_get_f32(b, i0 - o[0], i1 - o[1], i2 - o[2], i3 - o[3]);
                    }

                    ggml_ext_tensor_set_f32(result, v, i0, i1, i2, i3);
                }
            }
        }
    }
    return result;
}

// convert values from [0, 1] to [-1, 1]
__STATIC_INLINE__ void process_vae_input_tensor(struct ggml_tensor* src) {
    int64_t nelements = ggml_nelements(src);
    float* data = (float*)src->data;
    for (int i = 0; i < nelements; i++) {
        float val = data[i];
        data[i] = val * 2.0f - 1.0f;
    }
}

// convert values from [-1, 1] to [0, 1]
__STATIC_INLINE__ void process_vae_output_tensor(struct ggml_tensor* src) {
    int64_t nelements = ggml_nelements(src);
    float* data = (float*)src->data;
    for (int i = 0; i < nelements; i++) {
        float val = data[i];
        data[i] = (val + 1.0f) * 0.5f;
    }
}

__STATIC_INLINE__ struct ggml_tensor* ggml_ext_cont(struct ggml_context* ctx,
                                                    struct ggml_tensor* x) {
    if (ggml_is_contiguous(x)) {
        return x;
    }
    return ggml_cont(ctx, x);
}

// torch-like permute: the result satisfies y->ne[i] = x->ne[axis_i]
// (ggml_permute's own arguments instead give the destination position of each
// source axis, i.e. the inverse mapping, which is computed below)
__STATIC_INLINE__ struct ggml_tensor* ggml_ext_torch_permute(struct ggml_context* ctx,
                                                             struct ggml_tensor* x,
                                                             int axis0,
                                                             int axis1,
                                                             int axis2,
                                                             int axis3) {
    int torch_axes[4] = {axis0, axis1, axis2, axis3};

    int ggml_axes[4] = {0};
    for (int i = 0; i < 4; ++i) {
        int found = 0;
        for (int j = 0; j < 4; ++j) {
            if (torch_axes[j] == i) {
                ggml_axes[i] = j;
                found = 1;
                break;
            }
        }
        GGML_ASSERT(found && "Invalid permute input: must be a permutation of 0-3");
    }

    return ggml_permute(ctx, x, ggml_axes[0], ggml_axes[1], ggml_axes[2], ggml_axes[3]);
}

__STATIC_INLINE__ struct ggml_tensor* ggml_ext_slice(struct ggml_context* ctx,
                                                     struct ggml_tensor* x,
                                                     int64_t dim,
                                                     int64_t start,
                                                     int64_t end) {
    GGML_ASSERT(dim >= 0 && dim < 4);
    if (x->ne[dim] == 1) {
        return x;
    }
    while (start < 0) {
        start = x->ne[dim] + start;
    }
    while (end < 0) {
        end = x->ne[dim] + end;
    }
    GGML_ASSERT(start >= 0 && start < x->ne[dim]);
    GGML_ASSERT(end > start && end <= x->ne[dim]);

    int perm[4] = {0, 1, 2, 3};
    for (int i = dim; i < 3; ++i)
        perm[i] = perm[i + 1];
    perm[3] = dim;

    int inv_perm[4];
    for (int i = 0; i < 4; ++i)
        inv_perm[perm[i]] = i;

    if (dim != 3) {
        x = ggml_ext_torch_permute(ctx, x, perm[0], perm[1], perm[2], perm[3]);
        x = ggml_cont(ctx, x);
    }

    x = ggml_view_4d(
        ctx, x,
        x->ne[0], x->ne[1], x->ne[2], end - start,
        x->nb[1], x->nb[2], x->nb[3], x->nb[3] * start);

    if (dim != 3) {
        x = ggml_ext_torch_permute(ctx, x, inv_perm[0], inv_perm[1], inv_perm[2], inv_perm[3]);
        x = ggml_cont(ctx, x);
    }

    return x;
}
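
// Usage sketch (illustrative): keep the first C channels of x, i.e. torch's
// x[:, :C, :, :] for a [N, C_total, H, W] tensor (channel dim is ne[2]):
//   ggml_tensor* head = ggml_ext_slice(ctx, x, 2, 0, C);
// Negative indices count from the end as in Python, e.g. end = -1 means ne[dim] - 1.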

// example: [N, 3*C, H, W] => ([N, C, H, W], [N, C, H, W], [N, C, H, W])
__STATIC_INLINE__ std::vector<struct ggml_tensor*> ggml_ext_chunk(struct ggml_context* ctx,
                                                                  struct ggml_tensor* x,
                                                                  int num,
                                                                  int64_t dim) {
    GGML_ASSERT(dim >= 0 && dim < 4);
    GGML_ASSERT(x->ne[dim] % num == 0);

    int perm[4] = {0, 1, 2, 3};
    for (int i = dim; i < 3; ++i)
        perm[i] = perm[i + 1];
    perm[3] = dim;

    int inv_perm[4];
    for (int i = 0; i < 4; ++i)
        inv_perm[perm[i]] = i;

    if (dim != 3) {
        x = ggml_ext_torch_permute(ctx, x, perm[0], perm[1], perm[2], perm[3]);
        x = ggml_cont(ctx, x);
    }

    std::vector<struct ggml_tensor*> chunks;
    int64_t chunk_size = x->ne[3] / num;
    for (int i = 0; i < num; i++) {
        auto chunk = ggml_view_4d(
            ctx, x,
            x->ne[0], x->ne[1], x->ne[2], chunk_size,
            x->nb[1], x->nb[2], x->nb[3], x->nb[3] * i * chunk_size);

        if (dim != 3) {
            chunk = ggml_ext_torch_permute(ctx, chunk, inv_perm[0], inv_perm[1], inv_perm[2], inv_perm[3]);
            chunk = ggml_cont(ctx, chunk);
        }
        chunks.push_back(chunk);
    }

    return chunks;
}
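
// Usage sketch (illustrative): split a fused qkv projection [N, L, 3*C] along
// dim 0 into three [N, L, C] tensors:
//   auto qkv_vec = ggml_ext_chunk(ctx, qkv, 3, 0);
//   ggml_tensor* q = qkv_vec[0], *k = qkv_vec[1], *v = qkv_vec[2];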

__STATIC_INLINE__ ggml_tensor* ggml_ext_silu_act(ggml_context* ctx, ggml_tensor* x) {
    // x: [ne3, ne2, ne1, ne0]
    // return: [ne3, ne2, ne1, ne0/2]
    // note: despite the name, this is a GEGLU-style gated activation:
    // x is split in half along dim 0 and gelu(x1) * x2 is returned

    auto x_vec = ggml_ext_chunk(ctx, x, 2, 0);
    auto x1 = x_vec[0];  // [ne3, ne2, ne1, ne0/2]
    auto x2 = x_vec[1];  // [ne3, ne2, ne1, ne0/2]

    x1 = ggml_gelu_inplace(ctx, x1);

    x = ggml_mul(ctx, x1, x2);  // [ne3, ne2, ne1, ne0/2]

    return x;
}

typedef std::function<void(ggml_tensor*, ggml_tensor*, bool)> on_tile_process;

__STATIC_INLINE__ void sd_tiling_calc_tiles(int& num_tiles_dim,
                                            float& tile_overlap_factor_dim,
                                            int small_dim,
                                            int tile_size,
                                            const float tile_overlap_factor) {
    int tile_overlap = (tile_size * tile_overlap_factor);
    int non_tile_overlap = tile_size - tile_overlap;

    num_tiles_dim = (small_dim - tile_overlap) / non_tile_overlap;
    int overshoot_dim = ((num_tiles_dim + 1) * non_tile_overlap + tile_overlap) % small_dim;

    if ((overshoot_dim != non_tile_overlap) && (overshoot_dim <= num_tiles_dim * (tile_size / 2 - tile_overlap))) {
        // if tiles don't fit perfectly using the desired overlap
        // and there is enough room to squeeze an extra tile without overlap becoming >0.5
        num_tiles_dim++;
    }

    tile_overlap_factor_dim = (float)(tile_size * num_tiles_dim - small_dim) / (float)(tile_size * (num_tiles_dim - 1));
    if (num_tiles_dim <= 2) {
        if (small_dim <= tile_size) {
            num_tiles_dim = 1;
            tile_overlap_factor_dim = 0;
        } else {
            num_tiles_dim = 2;
            tile_overlap_factor_dim = (2 * tile_size - small_dim) / (float)tile_size;
        }
    }
}
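
// Worked example: small_dim = 64, tile_size = 32, tile_overlap_factor = 0.5:
//   tile_overlap = 16, non_tile_overlap = 16
//   num_tiles_dim = (64 - 16) / 16 = 3
//   tile_overlap_factor_dim = (32*3 - 64) / (32 * (3 - 1)) = 0.5
// i.e. three 32-px tiles at stride 16 cover the 64 px exactly at the requested overlap.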

// Tiling
__STATIC_INLINE__ void sd_tiling_non_square(ggml_tensor* input,
                                            ggml_tensor* output,
                                            const int scale,
                                            const int p_tile_size_x,
                                            const int p_tile_size_y,
                                            const float tile_overlap_factor,
                                            on_tile_process on_processing) {
    output = ggml_set_f32(output, 0);

    int input_width = (int)input->ne[0];
    int input_height = (int)input->ne[1];
    int output_width = (int)output->ne[0];
    int output_height = (int)output->ne[1];

    GGML_ASSERT(((input_width / output_width) == (input_height / output_height)) &&
                ((output_width / input_width) == (output_height / input_height)));
    GGML_ASSERT(((input_width / output_width) == scale) ||
                ((output_width / input_width) == scale));

    int small_width = output_width;
    int small_height = output_height;

    bool decode = output_width > input_width;
    if (decode) {
        small_width = input_width;
        small_height = input_height;
    }

    int num_tiles_x;
    float tile_overlap_factor_x;
    sd_tiling_calc_tiles(num_tiles_x, tile_overlap_factor_x, small_width, p_tile_size_x, tile_overlap_factor);

    int num_tiles_y;
    float tile_overlap_factor_y;
    sd_tiling_calc_tiles(num_tiles_y, tile_overlap_factor_y, small_height, p_tile_size_y, tile_overlap_factor);

    LOG_DEBUG("num tiles: %d, %d", num_tiles_x, num_tiles_y);
    LOG_DEBUG("optimal overlap: %f, %f (targeting %f)", tile_overlap_factor_x, tile_overlap_factor_y, tile_overlap_factor);

    GGML_ASSERT(input_width % 2 == 0 && input_height % 2 == 0 && output_width % 2 == 0 && output_height % 2 == 0);  // should be multiple of 2

    int tile_overlap_x = (int32_t)(p_tile_size_x * tile_overlap_factor_x);
    int non_tile_overlap_x = p_tile_size_x - tile_overlap_x;

    int tile_overlap_y = (int32_t)(p_tile_size_y * tile_overlap_factor_y);
    int non_tile_overlap_y = p_tile_size_y - tile_overlap_y;

    int tile_size_x = p_tile_size_x < small_width ? p_tile_size_x : small_width;
    int tile_size_y = p_tile_size_y < small_height ? p_tile_size_y : small_height;

    int input_tile_size_x = tile_size_x;
    int input_tile_size_y = tile_size_y;
    int output_tile_size_x = tile_size_x;
    int output_tile_size_y = tile_size_y;

    if (decode) {
        output_tile_size_x *= scale;
        output_tile_size_y *= scale;
    } else {
        input_tile_size_x *= scale;
        input_tile_size_y *= scale;
    }

    struct ggml_init_params params = {};
    params.mem_size += input_tile_size_x * input_tile_size_y * input->ne[2] * input->ne[3] * sizeof(float);      // input chunk
    params.mem_size += output_tile_size_x * output_tile_size_y * output->ne[2] * output->ne[3] * sizeof(float);  // output chunk
    params.mem_size += 3 * ggml_tensor_overhead();
    params.mem_buffer = nullptr;
    params.no_alloc = false;

    LOG_DEBUG("tile work buffer size: %.2f MB", params.mem_size / 1024.f / 1024.f);

    // draft context
    struct ggml_context* tiles_ctx = ggml_init(params);
    if (!tiles_ctx) {
        LOG_ERROR("ggml_init() failed");
        return;
    }

    // tiling
    ggml_tensor* input_tile = ggml_new_tensor_4d(tiles_ctx, GGML_TYPE_F32, input_tile_size_x, input_tile_size_y, input->ne[2], input->ne[3]);
    ggml_tensor* output_tile = ggml_new_tensor_4d(tiles_ctx, GGML_TYPE_F32, output_tile_size_x, output_tile_size_y, output->ne[2], output->ne[3]);
    int num_tiles = num_tiles_x * num_tiles_y;
    LOG_DEBUG("processing %i tiles", num_tiles);
    pretty_progress(0, num_tiles, 0.0f);
    int tile_count = 1;
    bool last_y = false, last_x = false;
    float last_time = 0.0f;
    for (int y = 0; y < small_height && !last_y; y += non_tile_overlap_y) {
        int dy = 0;
        if (y + tile_size_y >= small_height) {
            int _y = y;
            y = small_height - tile_size_y;
            dy = _y - y;
            if (decode) {
                dy *= scale;
            }
            last_y = true;
        }
        for (int x = 0; x < small_width && !last_x; x += non_tile_overlap_x) {
            int dx = 0;
            if (x + tile_size_x >= small_width) {
                int _x = x;
                x = small_width - tile_size_x;
                dx = _x - x;
                if (decode) {
                    dx *= scale;
                }
                last_x = true;
            }

            int x_in = decode ? x : scale * x;
            int y_in = decode ? y : scale * y;
            int x_out = decode ? x * scale : x;
            int y_out = decode ? y * scale : y;

            int overlap_x_out = decode ? tile_overlap_x * scale : tile_overlap_x;
            int overlap_y_out = decode ? tile_overlap_y * scale : tile_overlap_y;

            int64_t t1 = ggml_time_ms();
            ggml_ext_tensor_split_2d(input, input_tile, x_in, y_in);
            on_processing(input_tile, output_tile, false);
            ggml_ext_tensor_merge_2d(output_tile, output, x_out, y_out, overlap_x_out, overlap_y_out, dx, dy);

            int64_t t2 = ggml_time_ms();
            last_time = (t2 - t1) / 1000.0f;
            pretty_progress(tile_count, num_tiles, last_time);
            tile_count++;
        }
        last_x = false;
    }
    if (tile_count < num_tiles) {
        pretty_progress(num_tiles, num_tiles, last_time);
    }
    ggml_free(tiles_ctx);
}

__STATIC_INLINE__ void sd_tiling(ggml_tensor* input,
                                 ggml_tensor* output,
                                 const int scale,
                                 const int tile_size,
                                 const float tile_overlap_factor,
                                 on_tile_process on_processing) {
    sd_tiling_non_square(input, output, scale, tile_size, tile_size, tile_overlap_factor, on_processing);
}
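
// Usage sketch (illustrative): run a callback over 32x32 latent tiles, e.g. to
// VAE-decode a large latent with bounded memory (decode_cb is hypothetical):
//   sd_tiling(latent, image, 8 /*scale*/, 32 /*tile size*/, 0.5f,
//             [&](ggml_tensor* in_tile, ggml_tensor* out_tile, bool init) {
//                 decode_cb(in_tile, out_tile, init);
//             });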

__STATIC_INLINE__ struct ggml_tensor* ggml_ext_group_norm_32(struct ggml_context* ctx,
                                                             struct ggml_tensor* a) {
    const float eps = 1e-6f;  // default eps parameter
    return ggml_group_norm(ctx, a, 32, eps);
}

__STATIC_INLINE__ struct ggml_tensor* ggml_ext_linear(struct ggml_context* ctx,
                                                      struct ggml_tensor* x,
                                                      struct ggml_tensor* w,
                                                      struct ggml_tensor* b,
                                                      bool force_prec_f32 = false,
                                                      float scale = 1.f) {
    if (scale != 1.f) {
        x = ggml_scale(ctx, x, scale);
    }
    if (x->ne[2] * x->ne[3] > 1024) {
        // workaround: avoid ggml cuda error
        int64_t ne2 = x->ne[2];
        int64_t ne3 = x->ne[3];
        x = ggml_reshape_2d(ctx, x, x->ne[0], x->ne[1] * x->ne[2] * x->ne[3]);
        x = ggml_mul_mat(ctx, w, x);
        if (force_prec_f32) {
            ggml_mul_mat_set_prec(x, GGML_PREC_F32);
        }
        x = ggml_reshape_4d(ctx, x, x->ne[0], x->ne[1] / ne2 / ne3, ne2, ne3);
    } else {
        x = ggml_mul_mat(ctx, w, x);
        if (force_prec_f32) {
            ggml_mul_mat_set_prec(x, GGML_PREC_F32);
        }
    }
    if (scale != 1.f) {
        x = ggml_scale(ctx, x, 1.f / scale);
    }
    if (b != nullptr) {
        x = ggml_add_inplace(ctx, x, b);
    }
    return x;
}

// w: [OC, IC, KH, KW]
// x: [N, IC, IH, IW]
// b: [OC,]
// result: [N, OC, OH, OW]
__STATIC_INLINE__ struct ggml_tensor* ggml_ext_conv_2d(struct ggml_context* ctx,
                                                       struct ggml_tensor* x,
                                                       struct ggml_tensor* w,
                                                       struct ggml_tensor* b,
                                                       int s0 = 1,
                                                       int s1 = 1,
                                                       int p0 = 0,
                                                       int p1 = 0,
                                                       int d0 = 1,
                                                       int d1 = 1,
                                                       bool direct = false,
                                                       float scale = 1.f) {
    if (scale != 1.f) {
        x = ggml_scale(ctx, x, scale);
    }
    if (w->ne[2] != x->ne[2] && ggml_n_dims(w) == 2) {
        w = ggml_reshape_4d(ctx, w, 1, 1, w->ne[0], w->ne[1]);
    }
    if (direct) {
        x = ggml_conv_2d_direct(ctx, w, x, s0, s1, p0, p1, d0, d1);
    } else {
        x = ggml_conv_2d(ctx, w, x, s0, s1, p0, p1, d0, d1);
    }
    if (scale != 1.f) {
        x = ggml_scale(ctx, x, 1.f / scale);
    }
    if (b != nullptr) {
        b = ggml_reshape_4d(ctx, b, 1, 1, b->ne[0], 1);
        x = ggml_add_inplace(ctx, x, b);
    }
    return x;
}

// w: [OC*IC, KD, KH, KW]
// x: [N*IC, ID, IH, IW]
// b: [OC,]
// result: [N*OC, OD, OH, OW]
__STATIC_INLINE__ struct ggml_tensor* ggml_ext_conv_3d(struct ggml_context* ctx,
                                                       struct ggml_tensor* x,
                                                       struct ggml_tensor* w,
                                                       struct ggml_tensor* b,
                                                       int64_t IC,
                                                       int s0 = 1,
                                                       int s1 = 1,
                                                       int s2 = 1,
                                                       int p0 = 0,
                                                       int p1 = 0,
                                                       int p2 = 0,
                                                       int d0 = 1,
                                                       int d1 = 1,
                                                       int d2 = 1) {
    x = ggml_conv_3d(ctx, w, x, IC, s0, s1, s2, p0, p1, p2, d0, d1, d2);

    if (b != nullptr) {
        b = ggml_reshape_4d(ctx, b, 1, 1, 1, b->ne[0]);  // [OC, 1, 1, 1]
        x = ggml_add_inplace(ctx, x, b);
    }
    return x;
}

// w: [OC, IC, KD, 1*1]
// x: [N, IC, ID, IH*IW]
// b: [OC,]
// result: [N, OC, OD, OH*OW]
__STATIC_INLINE__ struct ggml_tensor* ggml_ext_conv_3d_nx1x1(struct ggml_context* ctx,
                                                             struct ggml_tensor* x,
                                                             struct ggml_tensor* w,
                                                             struct ggml_tensor* b,
                                                             int s2 = 1,
                                                             int p2 = 1,
                                                             int d2 = 1) {
    x = ggml_conv_2d(ctx, w, x, 1, s2, 0, p2, 1, d2);  // [N, OC, T, OH * OW]
    if (b != nullptr) {
        b = ggml_reshape_4d(ctx, b, 1, 1, b->ne[0], 1);
        x = ggml_add(ctx, x, b);
    }
    return x;  // [N, OC, T, OH * OW]
}

// qkv: [N, L, 3*C]
// return: ([N, L, C], [N, L, C], [N, L, C])
__STATIC_INLINE__ std::vector<struct ggml_tensor*> split_qkv(struct ggml_context* ctx,
                                                             struct ggml_tensor* qkv) {
    qkv = ggml_reshape_4d(ctx, qkv, qkv->ne[0] / 3, 3, qkv->ne[1], qkv->ne[2]);  // [N, L, 3, C]
    qkv = ggml_cont(ctx, ggml_permute(ctx, qkv, 0, 3, 1, 2));                    // [3, N, L, C]

    int64_t offset = qkv->nb[2] * qkv->ne[2];
    auto q = ggml_view_3d(ctx, qkv, qkv->ne[0], qkv->ne[1], qkv->ne[2], qkv->nb[1], qkv->nb[2], offset * 0);  // [N, L, C]
    auto k = ggml_view_3d(ctx, qkv, qkv->ne[0], qkv->ne[1], qkv->ne[2], qkv->nb[1], qkv->nb[2], offset * 1);  // [N, L, C]
    auto v = ggml_view_3d(ctx, qkv, qkv->ne[0], qkv->ne[1], qkv->ne[2], qkv->nb[1], qkv->nb[2], offset * 2);  // [N, L, C]
    return {q, k, v};
}
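
// Usage sketch (illustrative), e.g. after a fused attention projection:
//   ggml_tensor* qkv = ggml_ext_linear(ctx, x, w_qkv, b_qkv); // [N, L, 3*C]
//   auto qkv_vec     = split_qkv(ctx, qkv);
//   ggml_tensor* q = qkv_vec[0], *k = qkv_vec[1], *v = qkv_vec[2]; // each [N, L, C]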

// qkv: [N, 3*C, H, W]
// return: ([N, C, H, W], [N, C, H, W], [N, C, H, W])
__STATIC_INLINE__ std::vector<struct ggml_tensor*> split_image_qkv(struct ggml_context* ctx,
                                                                   struct ggml_tensor* qkv) {
    int64_t W = qkv->ne[0];
    int64_t H = qkv->ne[1];
    int64_t C = qkv->ne[2] / 3;
    int64_t N = qkv->ne[3];
    int64_t nb1 = qkv->nb[1];
    int64_t nb2 = qkv->nb[2];
    qkv = ggml_reshape_4d(ctx, qkv, W * H, C, 3, N);                     // [N, 3, C, H*W]
    qkv = ggml_cont(ctx, ggml_ext_torch_permute(ctx, qkv, 0, 1, 3, 2));  // [3, N, C, H*W]

    int64_t offset = qkv->nb[2] * qkv->ne[2];
    auto q = ggml_view_4d(ctx, qkv, W, H, C, N, nb1, nb2, qkv->nb[3], offset * 0);  // [N, C, H, W]
    auto k = ggml_view_4d(ctx, qkv, W, H, C, N, nb1, nb2, qkv->nb[3], offset * 1);  // [N, C, H, W]
    auto v = ggml_view_4d(ctx, qkv, W, H, C, N, nb1, nb2, qkv->nb[3], offset * 2);  // [N, C, H, W]
    return {q, k, v};
}

__STATIC_INLINE__ struct ggml_tensor* ggml_ext_full(struct ggml_context* ctx,
                                                    float value,
                                                    int64_t ne0,
                                                    int64_t ne1,
                                                    int64_t ne2,
                                                    int64_t ne3) {
    auto one = ggml_get_tensor(ctx, "ggml_runner_build_in_tensor:one");
    auto t = ggml_scale(ctx, one, value);            // [1,]
    t = ggml_repeat_4d(ctx, t, ne0, ne1, ne2, ne3);  // [ne0, ne1, ne2, ne3]
    return t;
}
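
// Why go through the prebuilt "one" tensor instead of creating a constant here?
// Presumably because this runs while building a compute graph whose tensors are
// not yet allocated (no_alloc), so values cannot be written directly; scaling a
// preallocated 1-element tensor and repeating it keeps everything inside the graph.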

__STATIC_INLINE__ struct ggml_tensor* ggml_ext_zeros(struct ggml_context* ctx,
                                                     int64_t ne0,
                                                     int64_t ne1,
                                                     int64_t ne2,
                                                     int64_t ne3) {
    return ggml_ext_full(ctx, 0.f, ne0, ne1, ne2, ne3);
}

__STATIC_INLINE__ struct ggml_tensor* ggml_ext_ones(struct ggml_context* ctx,
                                                    int64_t ne0,
                                                    int64_t ne1,
                                                    int64_t ne2,
                                                    int64_t ne3) {
    return ggml_ext_full(ctx, 1.f, ne0, ne1, ne2, ne3);
}

__STATIC_INLINE__ ggml_tensor* ggml_ext_cast_f32(ggml_context* ctx, ggml_tensor* a) {
#ifdef SD_USE_VULKAN
    auto zero_index = ggml_get_tensor(ctx, "ggml_runner_build_in_tensor:zero_int");
    auto out = ggml_reshape_1d(ctx, a, ggml_nelements(a));
    out = ggml_get_rows(ctx, out, zero_index);
    out = ggml_reshape(ctx, out, a);
    // auto out = ggml_cast(ctx, a, GGML_TYPE_F32);
#else
    auto out = ggml_reshape_2d(ctx, a, 1, ggml_nelements(a));
    ggml_tensor* one = ggml_ext_ones(ctx, 1, 1, 1, 1);  // [1,]
    if (ggml_is_transposed(out)) {
        out = ggml_mul_mat(ctx, one, out);
    } else {
        out = ggml_mul_mat(ctx, out, one);
    }
    out = ggml_reshape(ctx, out, a);
#endif
    return out;
}
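
// Note: both branches above appear to sidestep ggml_cast (see the commented-out
// call): multiplying by a one-element F32 tensor, or gathering rows with an int
// index, forces an F32 result via ops that are widely supported across backends.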

// q: [N * n_head, n_token, d_head]
// k: [N * n_head, n_k, d_head]
// v: [N * n_head, d_head, n_k]
// return: [N * n_head, n_token, d_head]
__STATIC_INLINE__ struct ggml_tensor* ggml_ext_attention(struct ggml_context* ctx,
                                                         struct ggml_tensor* q,
                                                         struct ggml_tensor* k,
                                                         struct ggml_tensor* v,
                                                         bool mask = false) {
#if defined(SD_USE_FLASH_ATTENTION) && !defined(SD_USE_CUDA) && !defined(SD_USE_METAL) && !defined(SD_USE_VULKAN) && !defined(SD_USE_SYCL)
    struct ggml_tensor* kqv = ggml_flash_attn(ctx, q, k, v, false);  // [N * n_head, n_token, d_head]
#else
    float d_head = (float)q->ne[0];
    struct ggml_tensor* kq = ggml_mul_mat(ctx, k, q);  // [N * n_head, n_token, n_k]
    kq = ggml_scale_inplace(ctx, kq, 1.0f / sqrt(d_head));
    if (mask) {
        kq = ggml_diag_mask_inf_inplace(ctx, kq, 0);
    }
    kq = ggml_soft_max_inplace(ctx, kq);
    struct ggml_tensor* kqv = ggml_mul_mat(ctx, v, kq);  // [N * n_head, n_token, d_head]
#endif
    return kqv;
}
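
// This is the standard scaled dot-product attention,
//   Attention(Q, K, V) = softmax(Q K^T / sqrt(d_head)) V,
// with ggml's mul_mat conventions handling the transposes.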

// q: [N, L_q, C(n_head*d_head)] or [N*n_head, L_q, d_head]
// k: [N, L_k, n_kv_head*d_head] or [N*n_kv_head, L_k, d_head]
// v: [N, L_k, n_kv_head*d_head] or [N, L_k, n_kv_head, d_head]
// mask: [N, L_q, L_k]
// return: [N, L_q, C]
__STATIC_INLINE__ struct ggml_tensor* ggml_ext_attention_ext(struct ggml_context* ctx,
                                                             ggml_backend_t backend,
                                                             struct ggml_tensor* q,
                                                             struct ggml_tensor* k,
                                                             struct ggml_tensor* v,
                                                             int64_t n_head,
                                                             struct ggml_tensor* mask = nullptr,
                                                             bool diag_mask_inf = false,
                                                             bool skip_reshape = false,
                                                             bool flash_attn = false,
                                                             float kv_scale = 1.0f) {  // avoid overflow
    int64_t L_q;
    int64_t L_k;
    int64_t C;
    int64_t N;
    int64_t d_head;
    int64_t n_kv_head;
    if (!skip_reshape) {
        L_q = q->ne[1];
        L_k = k->ne[1];
        C = q->ne[0];
        N = q->ne[2];
        d_head = C / n_head;
        n_kv_head = k->ne[0] / d_head;

        q = ggml_reshape_4d(ctx, q, d_head, n_head, L_q, N);       // [N, L_q, n_head, d_head]
        q = ggml_ext_cont(ctx, ggml_permute(ctx, q, 0, 2, 1, 3));  // [N, n_head, L_q, d_head]
        q = ggml_reshape_3d(ctx, q, d_head, L_q, n_head * N);      // [N * n_head, L_q, d_head]

        k = ggml_reshape_4d(ctx, k, d_head, n_kv_head, L_k, N);    // [N, L_k, n_kv_head, d_head]
        k = ggml_ext_cont(ctx, ggml_permute(ctx, k, 0, 2, 1, 3));  // [N, n_kv_head, L_k, d_head]
        k = ggml_reshape_3d(ctx, k, d_head, L_k, n_kv_head * N);   // [N * n_kv_head, L_k, d_head]

        v = ggml_reshape_4d(ctx, v, d_head, n_kv_head, L_k, N);    // [N, L_k, n_kv_head, d_head]
    } else {
        L_q = q->ne[1];
        L_k = k->ne[1];
        d_head = v->ne[0];
        N = v->ne[3];
        n_kv_head = k->ne[2] / N;
        C = d_head * n_head;
    }

    float scale = (1.0f / sqrt((float)d_head));

    int kv_pad = 0;
    ggml_tensor* kqv = nullptr;

    auto build_kqv = [&](ggml_tensor* q_in, ggml_tensor* k_in, ggml_tensor* v_in, ggml_tensor* mask_in) -> ggml_tensor* {
        if (kv_pad != 0) {
            k_in = ggml_pad(ctx, k_in, 0, kv_pad, 0, 0);
        }
        if (kv_scale != 1.0f) {
            k_in = ggml_scale(ctx, k_in, kv_scale);
        }
        k_in = ggml_cast(ctx, k_in, GGML_TYPE_F16);

        v_in = ggml_ext_cont(ctx, ggml_permute(ctx, v_in, 0, 2, 1, 3));
        v_in = ggml_reshape_3d(ctx, v_in, d_head, L_k, n_kv_head * N);
        if (kv_pad != 0) {
            v_in = ggml_pad(ctx, v_in, 0, kv_pad, 0, 0);
        }
        if (kv_scale != 1.0f) {
            v_in = ggml_scale(ctx, v_in, kv_scale);
        }
        v_in = ggml_cast(ctx, v_in, GGML_TYPE_F16);

        if (mask_in != nullptr) {
            mask_in = ggml_transpose(ctx, mask_in);
        } else {
            if (kv_pad > 0) {
                mask_in = ggml_ext_zeros(ctx, L_k, L_q, 1, 1);
                auto pad_tensor = ggml_ext_full(ctx, -INFINITY, kv_pad, L_q, 1, 1);
                mask_in = ggml_concat(ctx, mask_in, pad_tensor, 0);
            }
        }

        if (mask_in != nullptr) {
            int mask_pad = 0;
            if (mask_in->ne[1] % GGML_KQ_MASK_PAD != 0) {
                mask_pad = GGML_PAD(L_q, GGML_KQ_MASK_PAD) - mask_in->ne[1];
            }
            if (mask_pad > 0) {
                mask_in = ggml_pad(ctx, mask_in, 0, mask_pad, 0, 0);
            }
            mask_in = ggml_cast(ctx, mask_in, GGML_TYPE_F16);
        }

        auto out = ggml_flash_attn_ext(ctx, q_in, k_in, v_in, mask_in, scale / kv_scale, 0, 0);
        ggml_flash_attn_ext_set_prec(out, GGML_PREC_F32);
        if (kv_scale != 1.0f) {
            out = ggml_scale(ctx, out, 1.0f / kv_scale);
        }
        return out;
    };

    if (flash_attn) {
        // LOG_DEBUG("attention_ext L_q:%d L_k:%d n_head:%d C:%d d_head:%d N:%d", L_q, L_k, n_head, C, d_head, N);
        bool can_use_flash_attn = true;
        if (can_use_flash_attn && L_k % 256 != 0) {
            kv_pad = GGML_PAD(L_k, 256) - L_k;
        }

        if (mask != nullptr) {
            // TODO: figure out if we can bend t5 to work too
            can_use_flash_attn = can_use_flash_attn && mask->ne[3] == 1;
        }

        if (can_use_flash_attn) {
            kqv = build_kqv(q, k, v, mask);
            if (!ggml_backend_supports_op(backend, kqv)) {
                kqv = nullptr;
            } else {
                kqv = ggml_view_3d(ctx, kqv, d_head, n_head, L_q, kqv->nb[1], kqv->nb[2], 0);
            }
        }
    }

    if (kqv == nullptr) {
        // if (flash_attn) {
        //     LOG_DEBUG("fallback to default attention, L_q:%d L_k:%d n_head:%d C:%d d_head:%d N:%d", L_q, L_k, n_head, C, d_head, N);
        // }
        v = ggml_ext_cont(ctx, ggml_permute(ctx, v, 1, 2, 0, 3));  // [N, n_kv_head, d_head, L_k]
        v = ggml_reshape_3d(ctx, v, L_k, d_head, n_kv_head * N);   // [N * n_kv_head, d_head, L_k]

        auto kq = ggml_mul_mat(ctx, k, q);  // [N * n_head, L_q, L_k]
        kq = ggml_scale_inplace(ctx, kq, scale);
        if (mask) {
            kq = ggml_add_inplace(ctx, kq, mask);
        }
        if (diag_mask_inf) {
            kq = ggml_diag_mask_inf_inplace(ctx, kq, 0);
        }
        kq = ggml_soft_max_inplace(ctx, kq);

        kqv = ggml_mul_mat(ctx, v, kq);  // [N * n_head, L_q, d_head]

        kqv = ggml_reshape_4d(ctx, kqv, d_head, L_q, n_head, N);  // [N, n_head, L_q, d_head]
        kqv = ggml_permute(ctx, kqv, 0, 2, 1, 3);                 // [N, L_q, n_head, d_head]
    }

    kqv = ggml_ext_cont(ctx, kqv);
    kqv = ggml_reshape_3d(ctx, kqv, d_head * n_head, L_q, N);  // [N, L_q, C]

    return kqv;
}
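
// Usage sketch (illustrative): multi-head self-attention over fused
// projections q, k, v of shape [N, L, C]:
//   ggml_tensor* out = ggml_ext_attention_ext(ctx, backend, q, k, v, n_head,
//                                             nullptr /*mask*/, false, false,
//                                             flash_attn); // -> [N, L, C]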

__STATIC_INLINE__ struct ggml_tensor* ggml_ext_layer_norm(struct ggml_context* ctx,
                                                          struct ggml_tensor* x,
                                                          struct ggml_tensor* w,
                                                          struct ggml_tensor* b,
                                                          float eps = EPS) {
    x = ggml_norm(ctx, x, eps);
    if (w != nullptr) {
        x = ggml_mul_inplace(ctx, x, w);
        if (b != nullptr) {
            x = ggml_add_inplace(ctx, x, b);
        }
    }
    return x;
}

__STATIC_INLINE__ struct ggml_tensor* ggml_ext_group_norm(struct ggml_context* ctx,
                                                          struct ggml_tensor* x,
                                                          struct ggml_tensor* w,
                                                          struct ggml_tensor* b,
                                                          int num_groups = 32) {
    if (ggml_n_dims(x) >= 3 && w != nullptr && b != nullptr) {
        w = ggml_reshape_4d(ctx, w, 1, 1, w->ne[0], 1);
        b = ggml_reshape_4d(ctx, b, 1, 1, b->ne[0], 1);
    }

    const float eps = 1e-6f;  // default eps parameter
    x = ggml_group_norm(ctx, x, num_groups, eps);
    if (w != nullptr && b != nullptr) {
        x = ggml_mul_inplace(ctx, x, w);
        // b = ggml_repeat(ctx, b, x);
        x = ggml_add_inplace(ctx, x, b);
    }
    return x;
}

__STATIC_INLINE__ void ggml_ext_backend_tensor_get_and_sync(ggml_backend_t backend, const struct ggml_tensor* tensor, void* data, size_t offset, size_t size) {
#if defined(SD_USE_CUDA) || defined(SD_USE_SYCL)
    if (!ggml_backend_is_cpu(backend)) {
        ggml_backend_tensor_get_async(backend, tensor, data, offset, size);
        ggml_backend_synchronize(backend);
    } else {
        ggml_backend_tensor_get(tensor, data, offset, size);
    }
#else
    ggml_backend_tensor_get(tensor, data, offset, size);
#endif
}

__STATIC_INLINE__ float ggml_ext_backend_tensor_get_f32(ggml_tensor* tensor) {
    GGML_ASSERT(tensor->type == GGML_TYPE_F32 || tensor->type == GGML_TYPE_F16 || tensor->type == GGML_TYPE_I32);
    float value;
    if (tensor->type == GGML_TYPE_F32) {
        ggml_backend_tensor_get(tensor, &value, 0, sizeof(value));
    } else if (tensor->type == GGML_TYPE_F16) {
        ggml_fp16_t f16_value;
        ggml_backend_tensor_get(tensor, &f16_value, 0, sizeof(f16_value));
        value = ggml_fp16_to_fp32(f16_value);
    } else {  // GGML_TYPE_I32
        int int32_value;
        ggml_backend_tensor_get(tensor, &int32_value, 0, sizeof(int32_value));
        value = (float)int32_value;
    }
    return value;
}

__STATIC_INLINE__ struct ggml_tensor* vector_to_ggml_tensor(struct ggml_context* ctx,
                                                            const std::vector<float>& vec) {
    struct ggml_tensor* t = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, vec.size());
    memcpy(t->data, (const void*)vec.data(), ggml_nbytes(t));
    return t;
}

__STATIC_INLINE__ struct ggml_tensor* vector_to_ggml_tensor_i32(struct ggml_context* ctx,
                                                                const std::vector<int>& vec) {
    struct ggml_tensor* t = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, vec.size());
    memcpy(t->data, (const void*)vec.data(), ggml_nbytes(t));
    return t;
}

__STATIC_INLINE__ std::vector<float> arange(float start, float end, float step = 1.f) {
    std::vector<float> result;

    for (float value = start; value < end; value += step) {
        result.push_back(value);
    }

    return result;
}

// Ref: https://github.com/CompVis/stable-diffusion/blob/main/ldm/modules/diffusionmodules/util.py#L151
__STATIC_INLINE__ std::vector<float> timestep_embedding(std::vector<float> timesteps,
                                                        int dim,
                                                        int max_period = 10000,
                                                        bool flip_sin_to_cos = true,
                                                        float scale = 1.f) {
    // timesteps: [N,]
    // embedding: [N, dim]
    size_t N = timesteps.size();
    std::vector<float> embedding(N * dim, 0.f);
    int half = dim / 2;
    std::vector<float> freqs(half);
    for (int i = 0; i < half; ++i) {
        freqs[i] = (float)std::exp(-std::log(max_period) * i / half);
    }
    for (size_t i = 0; i < N; ++i) {
        for (int j = 0; j < half; ++j) {
            float arg = timesteps[i] * freqs[j] * scale;
            if (flip_sin_to_cos) {
                embedding[i * dim + j] = std::cos(arg);
                embedding[i * dim + j + half] = std::sin(arg);
            } else {
                embedding[i * dim + j] = std::sin(arg);
                embedding[i * dim + j + half] = std::cos(arg);
            }
        }
    }
    return embedding;
}
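
// The classic sinusoidal embedding, for reference:
//   freqs[i]         = exp(-ln(max_period) * i / half),  i in [0, half)
//   emb[n][i]        = cos(t_n * scale * freqs[i])
//   emb[n][i + half] = sin(t_n * scale * freqs[i])
// (sin and cos swap when flip_sin_to_cos is false; for odd dim the trailing
// column stays zero)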
|
||
|
||
__STATIC_INLINE__ void set_timestep_embedding(std::vector<float> timesteps,
|
||
struct ggml_tensor* embedding,
|
||
int dim,
|
||
int max_period = 10000) {
|
||
std::vector<float> embedding_vec = timestep_embedding(timesteps, dim, max_period);
|
||
memcpy(((char*)embedding->data), ((char*)embedding_vec.data()), ggml_nbytes(embedding));
|
||
}
|
||
|
||
__STATIC_INLINE__ struct ggml_tensor* new_timestep_embedding(struct ggml_context* ctx,
|
||
std::vector<float> timesteps,
|
||
int dim,
|
||
int max_period = 10000) {
|
||
// timesteps: [N,]
|
||
// embedding: [N, dim]
|
||
std::vector<float> embedding_vec = timestep_embedding(timesteps, dim, max_period);
|
||
struct ggml_tensor* embedding = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, dim, timesteps.size());
|
||
if (embedding->data != nullptr) {
|
||
memcpy(((char*)embedding->data), ((char*)embedding_vec.data()), ggml_nbytes(embedding));
|
||
} else {
|
||
ggml_backend_tensor_set(embedding, embedding_vec.data(), 0, ggml_nbytes(embedding));
|
||
}
|
||
return embedding;
|
||
}
|
||
|
||
__STATIC_INLINE__ struct ggml_tensor* ggml_ext_timestep_embedding(
|
||
struct ggml_context* ctx,
|
||
struct ggml_tensor* timesteps,
|
||
int dim,
|
||
int max_period = 10000,
|
||
float time_factor = 1.0f) {
|
||
timesteps = ggml_scale(ctx, timesteps, time_factor);
|
||
return ggml_timestep_embedding(ctx, timesteps, dim, max_period);
|
||
}
|
||
|
||
__STATIC_INLINE__ size_t ggml_tensor_num(ggml_context* ctx) {
|
||
size_t num = 0;
|
||
for (ggml_tensor* t = ggml_get_first_tensor(ctx); t != nullptr; t = ggml_get_next_tensor(ctx, t)) {
|
||
num++;
|
||
}
|
||
return num;
|
||
}
|
||
|
||
/* SDXL with LoRA requires more space */
|
||
#define MAX_PARAMS_TENSOR_NUM 32768
|
||
#define MAX_GRAPH_SIZE 327680
|
||
|
||
struct WeightAdapter {
|
||
struct ForwardParams {
|
||
enum class op_type_t {
|
||
OP_LINEAR,
|
||
OP_CONV2D,
|
||
} op_type;
|
||
struct {
|
||
bool force_prec_f32 = false;
|
||
float scale = 1.f;
|
||
} linear;
|
||
struct {
|
||
int s0 = 1;
|
||
int s1 = 1;
|
||
int p0 = 0;
|
||
int p1 = 0;
|
||
int d0 = 1;
|
||
int d1 = 1;
|
||
bool direct = false;
|
||
float scale = 1.f;
|
||
} conv2d;
|
||
};
|
||
virtual ggml_tensor* patch_weight(ggml_context* ctx, ggml_tensor* weight, const std::string& weight_name) = 0;
|
||
virtual ggml_tensor* forward_with_lora(ggml_context* ctx,
|
||
ggml_tensor* x,
|
||
ggml_tensor* w,
|
||
ggml_tensor* b,
|
||
const std::string& prefix,
|
||
ForwardParams forward_params) = 0;
|
||
virtual size_t get_extra_graph_size() = 0;
|
||
};
|
||
|
||
struct GGMLRunnerContext {
|
||
ggml_backend_t backend = nullptr;
|
||
ggml_context* ggml_ctx = nullptr;
|
||
bool flash_attn_enabled = false;
|
||
bool conv2d_direct_enabled = false;
|
||
std::shared_ptr<WeightAdapter> weight_adapter = nullptr;
|
||
};
|
||
|
||
struct GGMLRunner {
|
||
protected:
|
||
typedef std::function<struct ggml_cgraph*()> get_graph_cb_t;
|
||
|
||
ggml_backend_t params_backend = nullptr;
|
||
ggml_backend_t runtime_backend = nullptr;
|
||
|
||
struct ggml_context* params_ctx = nullptr;
|
||
ggml_backend_buffer_t params_buffer = nullptr;
|
||
struct ggml_context* offload_ctx = nullptr;
|
||
ggml_backend_buffer_t runtime_params_buffer = nullptr;
|
||
bool params_on_runtime_backend = false;
|
||
|
||
struct ggml_context* cache_ctx = nullptr;
|
||
ggml_backend_buffer_t cache_buffer = nullptr;
|
||
|
||
struct ggml_context* compute_ctx = nullptr;
|
||
struct ggml_gallocr* compute_allocr = nullptr;
|
||
|
||
std::shared_ptr<WeightAdapter> weight_adapter = nullptr;
|
||
|
||
std::vector<float> one_vec = {1.f};
|
||
ggml_tensor* one_tensor = nullptr;
|
||
|
||
std::vector<int> zero_int_vec = {0};
|
||
ggml_tensor* zero_int_tensor = nullptr;
|
||
|
||
std::map<struct ggml_tensor*, const void*> backend_tensor_data_map;
|
||
std::map<std::string, struct ggml_tensor*> cache_tensor_map; // name -> tensor
|
||
const std::string final_result_name = "ggml_runner_final_result_tensor";
|
||
|
||
bool flash_attn_enabled = false;
|
||
bool conv2d_direct_enabled = false;
|
||
|
||
void alloc_params_ctx() {
|
||
struct ggml_init_params params;
|
||
params.mem_size = static_cast<size_t>(MAX_PARAMS_TENSOR_NUM * ggml_tensor_overhead());
|
||
params.mem_buffer = nullptr;
|
||
params.no_alloc = true;
|
||
|
||
params_ctx = ggml_init(params);
|
||
GGML_ASSERT(params_ctx != nullptr);
|
||
if (params_backend != runtime_backend) {
|
||
offload_ctx = ggml_init(params);
|
||
GGML_ASSERT(offload_ctx != nullptr);
|
||
}
|
||
}
|
||
|
||
void free_params_ctx() {
|
||
if (params_ctx != nullptr) {
|
||
ggml_free(params_ctx);
|
||
params_ctx = nullptr;
|
||
}
|
||
if (offload_ctx != nullptr) {
|
||
ggml_free(offload_ctx);
|
||
offload_ctx = nullptr;
|
||
}
|
||
}
|
||
|
||
    void alloc_cache_ctx() {
        struct ggml_init_params params;
        params.mem_size   = static_cast<size_t>(MAX_PARAMS_TENSOR_NUM * ggml_tensor_overhead());
        params.mem_buffer = nullptr;
        params.no_alloc   = true;

        cache_ctx = ggml_init(params);
        GGML_ASSERT(cache_ctx != nullptr);
    }

    void free_cache_ctx() {
        if (cache_ctx != nullptr) {
            ggml_free(cache_ctx);
            cache_ctx = nullptr;
        }
    }

    void alloc_compute_ctx() {
        struct ggml_init_params params;
        params.mem_size   = static_cast<size_t>(ggml_tensor_overhead() * MAX_GRAPH_SIZE + ggml_graph_overhead());
        params.mem_buffer = nullptr;
        params.no_alloc   = true;

        compute_ctx = ggml_init(params);
        GGML_ASSERT(compute_ctx != nullptr);
    }

    void free_compute_ctx() {
        if (compute_ctx != nullptr) {
            ggml_free(compute_ctx);
            compute_ctx = nullptr;
        }
    }

    void prepare_build_in_tensor_before() {
        one_tensor = ggml_new_tensor_1d(compute_ctx, GGML_TYPE_F32, 1);
        ggml_set_name(one_tensor, "ggml_runner_build_in_tensor:one");
        set_backend_tensor_data(one_tensor, one_vec.data());

        zero_int_tensor = ggml_new_tensor_1d(compute_ctx, GGML_TYPE_I32, 1);
        ggml_set_name(zero_int_tensor, "ggml_runner_build_in_tensor:zero_int");
        set_backend_tensor_data(zero_int_tensor, zero_int_vec.data());
    }

    void prepare_build_in_tensor_after(struct ggml_cgraph* gf) {
        ggml_build_forward_expand(gf, one_tensor);
        ggml_build_forward_expand(gf, zero_int_tensor);
    }

    struct ggml_cgraph* new_graph_custom(size_t graph_size) {
        if (weight_adapter) {
            graph_size += weight_adapter->get_extra_graph_size();
        }
        return ggml_new_graph_custom(compute_ctx, graph_size, false);
    }

    struct ggml_cgraph* get_compute_graph(get_graph_cb_t get_graph) {
        prepare_build_in_tensor_before();
        struct ggml_cgraph* gf = get_graph();
        if (ggml_graph_n_nodes(gf) > 0) {
            auto result = ggml_graph_node(gf, -1);
            ggml_set_name(result, final_result_name.c_str());
        }
        prepare_build_in_tensor_after(gf);
        return gf;
    }

    bool alloc_compute_buffer(get_graph_cb_t get_graph) {
        if (compute_allocr != nullptr) {
            return true;
        }
        reset_compute_ctx();
        struct ggml_cgraph* gf = get_compute_graph(get_graph);
        backend_tensor_data_map.clear();
        compute_allocr = ggml_gallocr_new(ggml_backend_get_default_buffer_type(runtime_backend));

        if (!ggml_gallocr_reserve(compute_allocr, gf)) {
            LOG_ERROR("%s: failed to allocate the compute buffer\n", get_desc().c_str());
            free_compute_buffer();
            return false;
        }

        // log the required memory
        size_t compute_buffer_size = ggml_gallocr_get_buffer_size(compute_allocr, 0);
        LOG_DEBUG("%s compute buffer size: %.2f MB(%s)",
                  get_desc().c_str(),
                  compute_buffer_size / 1024.0 / 1024.0,
                  ggml_backend_is_cpu(runtime_backend) ? "RAM" : "VRAM");
        return true;
    }

    void free_cache_buffer() {
        if (cache_buffer != nullptr) {
            ggml_backend_buffer_free(cache_buffer);
            cache_buffer = nullptr;
        }
    }

    void copy_cache_tensors_to_cache_buffer() {
        if (cache_tensor_map.size() == 0) {
            return;
        }
        free_cache_ctx_and_buffer();
        alloc_cache_ctx();
        GGML_ASSERT(cache_buffer == nullptr);
        std::map<ggml_tensor*, ggml_tensor*> runtime_tensor_to_cache_tensor;
        for (auto kv : cache_tensor_map) {
            auto cache_tensor = ggml_dup_tensor(cache_ctx, kv.second);
            ggml_set_name(cache_tensor, kv.first.c_str());
            runtime_tensor_to_cache_tensor[kv.second] = cache_tensor;
        }
        size_t num_tensors = ggml_tensor_num(cache_ctx);
        cache_buffer       = ggml_backend_alloc_ctx_tensors(cache_ctx, runtime_backend);
        GGML_ASSERT(cache_buffer != nullptr);
        for (auto kv : runtime_tensor_to_cache_tensor) {
            ggml_backend_tensor_copy(kv.first, kv.second);
        }
        ggml_backend_synchronize(runtime_backend);
        cache_tensor_map.clear();
        size_t cache_buffer_size = ggml_backend_buffer_get_size(cache_buffer);
        LOG_DEBUG("%s cache backend buffer size = % 6.2f MB(%s) (%i tensors)",
                  get_desc().c_str(),
                  cache_buffer_size / (1024.f * 1024.f),
                  ggml_backend_is_cpu(runtime_backend) ? "RAM" : "VRAM",
                  (int)num_tensors);
    }

    void copy_data_to_backend_tensor() {
        for (auto& kv : backend_tensor_data_map) {
            auto tensor = kv.first;
            auto data   = kv.second;

            ggml_backend_tensor_set(tensor, data, 0, ggml_nbytes(tensor));
        }

        backend_tensor_data_map.clear();
    }

    bool offload_params_to_runtime_backend() {
        if (params_backend == runtime_backend) {
            return true;
        }
        if (params_on_runtime_backend) {
            return true;
        }
        GGML_ASSERT(runtime_params_buffer == nullptr);
        int64_t t0         = ggml_time_ms();
        size_t num_tensors = ggml_tensor_num(offload_ctx);
        if (num_tensors == 0) {
            for (ggml_tensor* t = ggml_get_first_tensor(params_ctx); t != nullptr; t = ggml_get_next_tensor(params_ctx, t)) {
                GGML_ASSERT(t->view_src == nullptr);
                ggml_dup_tensor(offload_ctx, t);
            }
        }
        num_tensors = ggml_tensor_num(offload_ctx);
        GGML_ASSERT(num_tensors == ggml_tensor_num(params_ctx));

        runtime_params_buffer = ggml_backend_alloc_ctx_tensors(offload_ctx, runtime_backend);

        if (runtime_params_buffer == nullptr) {
            LOG_ERROR("%s alloc runtime params backend buffer failed, num_tensors = %i",
                      get_desc().c_str(),
                      (int)num_tensors);
            return false;
        }

        ggml_tensor* t         = ggml_get_first_tensor(params_ctx);
        ggml_tensor* offload_t = ggml_get_first_tensor(offload_ctx);

        while (t != nullptr && offload_t != nullptr) {
            ggml_backend_tensor_copy(t, offload_t);
            std::swap(t->buffer, offload_t->buffer);
            std::swap(t->data, offload_t->data);
            std::swap(t->extra, offload_t->extra);

            t         = ggml_get_next_tensor(params_ctx, t);
            offload_t = ggml_get_next_tensor(offload_ctx, offload_t);
        }

        int64_t t1 = ggml_time_ms();

        size_t params_buffer_size = ggml_backend_buffer_get_size(runtime_params_buffer);
        LOG_INFO("%s offload params (%6.2f MB, %i tensors) to runtime backend (%s), taking %.2fs",
                 get_desc().c_str(),
                 params_buffer_size / (1024.f * 1024.f),
                 (int)num_tensors,
                 ggml_backend_name(runtime_backend),
                 (t1 - t0) * 1.0f / 1000);

        params_on_runtime_backend = true;

        return true;
    }

    void offload_params_to_params_backend() {
        if (!params_on_runtime_backend) {
            return;
        }
        ggml_tensor* t         = ggml_get_first_tensor(params_ctx);
        ggml_tensor* offload_t = ggml_get_first_tensor(offload_ctx);

        while (t != nullptr && offload_t != nullptr) {
            t->buffer         = offload_t->buffer;
            t->data           = offload_t->data;
            t->extra          = offload_t->extra;
            offload_t->buffer = nullptr;
            offload_t->data   = nullptr;
            offload_t->extra  = nullptr;

            t         = ggml_get_next_tensor(params_ctx, t);
            offload_t = ggml_get_next_tensor(offload_ctx, offload_t);
        }

        if (runtime_params_buffer != nullptr) {
            ggml_backend_buffer_free(runtime_params_buffer);
            runtime_params_buffer = nullptr;
        }
        params_on_runtime_backend = false;
    }

public:
    virtual std::string get_desc() = 0;

    GGMLRunner(ggml_backend_t backend, bool offload_params_to_cpu = false)
        : runtime_backend(backend) {
        if (!ggml_backend_is_cpu(runtime_backend) && offload_params_to_cpu) {
            params_backend = ggml_backend_cpu_init();
        } else {
            params_backend = runtime_backend;
        }
        // params_backend must be set before alloc_params_ctx(), which only
        // creates offload_ctx when the two backends differ
        alloc_params_ctx();
    }

    virtual ~GGMLRunner() {
        free_params_buffer();
        free_compute_buffer();
        free_params_ctx();
        free_compute_ctx();
        if (params_backend != runtime_backend) {
            ggml_backend_free(params_backend);
        }
        free_cache_ctx_and_buffer();
    }

    virtual GGMLRunnerContext get_context() {
        GGMLRunnerContext runner_ctx;
        runner_ctx.ggml_ctx              = compute_ctx;
        runner_ctx.backend               = runtime_backend;
        runner_ctx.flash_attn_enabled    = flash_attn_enabled;
        runner_ctx.conv2d_direct_enabled = conv2d_direct_enabled;
        runner_ctx.weight_adapter        = weight_adapter;
        return runner_ctx;
    }

    void reset_compute_ctx() {
        free_compute_ctx();
        alloc_compute_ctx();
    }

    bool alloc_params_buffer() {
        size_t num_tensors = ggml_tensor_num(params_ctx);
        params_buffer      = ggml_backend_alloc_ctx_tensors(params_ctx, params_backend);
        if (params_buffer == nullptr) {
            LOG_ERROR("%s alloc params backend buffer failed, num_tensors = %i",
                      get_desc().c_str(),
                      (int)num_tensors);
            return false;
        }
        size_t params_buffer_size = ggml_backend_buffer_get_size(params_buffer);
        LOG_DEBUG("%s params backend buffer size = % 6.2f MB(%s) (%i tensors)",
                  get_desc().c_str(),
                  params_buffer_size / (1024.f * 1024.f),
                  ggml_backend_is_cpu(params_backend) ? "RAM" : "VRAM",
                  (int)num_tensors);
        return true;
    }

    void free_params_buffer() {
        if (params_buffer != nullptr) {
            ggml_backend_buffer_free(params_buffer);
            params_buffer = nullptr;
        }
    }

    size_t get_params_buffer_size() {
        if (params_buffer != nullptr) {
            return ggml_backend_buffer_get_size(params_buffer);
        }
        return 0;
    }

    void free_cache_ctx_and_buffer() {
        free_cache_buffer();
        free_cache_ctx();
    }

    void free_compute_buffer() {
        if (compute_allocr != nullptr) {
            ggml_gallocr_free(compute_allocr);
            compute_allocr = nullptr;
        }
        offload_params_to_params_backend();
    }

    // data is copied to the backend tensor after the graph is allocated
    void set_backend_tensor_data(struct ggml_tensor* tensor, const void* data) {
        backend_tensor_data_map[tensor] = data;
    }

    struct ggml_tensor* to_backend(struct ggml_tensor* tensor) {
        GGML_ASSERT(compute_ctx != nullptr);
        if (tensor == nullptr) {
            return nullptr;
        }
        // if the runtime backend is not the cpu and the tensor still lives in host memory,
        // stage a copy of it on the backend
        if (!ggml_backend_is_cpu(runtime_backend) && (tensor->buffer == nullptr || ggml_backend_buffer_is_host(tensor->buffer))) {
            // pass input tensors to gpu memory
            auto backend_tensor = ggml_dup_tensor(compute_ctx, tensor);

            set_backend_tensor_data(backend_tensor, tensor->data);
            return backend_tensor;
        } else {
            return tensor;
        }
    }

    void cache(const std::string& name, struct ggml_tensor* tensor) {
        cache_tensor_map[name] = tensor;
    }

    struct ggml_tensor* get_cache_tensor_by_name(const std::string& name) {
        if (cache_ctx == nullptr) {
            return nullptr;
        }
        return ggml_get_tensor(cache_ctx, name.c_str());
    }

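    // compute() drives one full forward pass:
    //   1. offload_params_to_runtime_backend(): if params live on a separate
    //      params backend (e.g. CPU), copy them to the runtime backend first.
    //   2. alloc_compute_buffer(): on the first call, build the graph once and
    //      let ggml_gallocr reserve a worst-case compute buffer for it.
    //   3. reset_compute_ctx() + get_compute_graph(): rebuild the graph and
    //      allocate its tensors out of the reserved buffer.
    //   4. copy_data_to_backend_tensor(): upload all inputs registered via
    //      set_backend_tensor_data().
    //   5. ggml_backend_graph_compute(): run the graph; on failure the error
    //      is logged and false is returned instead of asserting.
    //   6. optionally copy the final result tensor into output/output_ctx,
    //      then free the compute buffer unless the caller wants to reuse it.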
    bool compute(get_graph_cb_t get_graph,
                 int n_threads,
                 bool free_compute_buffer_immediately = true,
                 struct ggml_tensor** output          = nullptr,
                 struct ggml_context* output_ctx      = nullptr) {
        if (!offload_params_to_runtime_backend()) {
            LOG_ERROR("%s offload params to runtime backend failed", get_desc().c_str());
            return false;
        }
        if (!alloc_compute_buffer(get_graph)) {
            LOG_ERROR("%s alloc compute buffer failed", get_desc().c_str());
            return false;
        }
        reset_compute_ctx();
        struct ggml_cgraph* gf = get_compute_graph(get_graph);
        if (!ggml_gallocr_alloc_graph(compute_allocr, gf)) {
            LOG_ERROR("%s alloc compute graph failed", get_desc().c_str());
            return false;
        }
        copy_data_to_backend_tensor();
        if (ggml_backend_is_cpu(runtime_backend)) {
            ggml_backend_cpu_set_n_threads(runtime_backend, n_threads);
        }

        ggml_status status = ggml_backend_graph_compute(runtime_backend, gf);
        if (status != GGML_STATUS_SUCCESS) {
            LOG_ERROR("%s compute failed: %s", get_desc().c_str(), ggml_status_to_string(status));
            return false;
        }
#ifdef GGML_PERF
        ggml_graph_print(gf);
#endif
        copy_cache_tensors_to_cache_buffer();
        if (output != nullptr) {
            auto result = ggml_get_tensor(compute_ctx, final_result_name.c_str());
            if (*output == nullptr && output_ctx != nullptr) {
                *output = ggml_dup_tensor(output_ctx, result);
            }
            if (*output != nullptr) {
                ggml_ext_backend_tensor_get_and_sync(runtime_backend, result, (*output)->data, 0, ggml_nbytes(*output));
            }
        }

        if (free_compute_buffer_immediately) {
            free_compute_buffer();
        }
        return true;
    }

    void set_flash_attention_enabled(bool enabled) {
        flash_attn_enabled = enabled;
    }

    void set_conv2d_direct_enabled(bool enabled) {
        conv2d_direct_enabled = enabled;
    }

    void set_weight_adapter(const std::shared_ptr<WeightAdapter>& adapter) {
        weight_adapter = adapter;
    }
};

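// Usage sketch (illustrative; MyRunner/MyModelBlock/run are hypothetical
// names, not part of this header): a concrete runner subclasses GGMLRunner,
// builds its graph inside a callback, and calls compute():
//
//   struct MyRunner : public GGMLRunner {
//       MyModelBlock model;  // some GGMLBlock-derived module
//       MyRunner(ggml_backend_t backend) : GGMLRunner(backend) {
//           model.init(params_ctx);  // create param tensors in params_ctx
//           alloc_params_buffer();   // then back them with a backend buffer
//       }
//       std::string get_desc() override { return "my_runner"; }
//       bool run(ggml_tensor* x, int n_threads, ggml_tensor** out, ggml_context* out_ctx) {
//           auto get_graph = [&]() -> ggml_cgraph* {
//               struct ggml_cgraph* gf = new_graph_custom(MAX_GRAPH_SIZE);
//               GGMLRunnerContext rctx = get_context();
//               ggml_tensor* x_t       = to_backend(x);  // stage the input upload
//               ggml_tensor* result    = model.forward(&rctx, x_t);
//               ggml_build_forward_expand(gf, result);
//               return gf;
//           };
//           return compute(get_graph, n_threads, true, out, out_ctx);
//       }
//   };
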
class GGMLBlock {
protected:
    typedef std::unordered_map<std::string, struct ggml_tensor*> ParameterMap;
    typedef std::unordered_map<std::string, std::shared_ptr<GGMLBlock>> GGMLBlockMap;
    GGMLBlockMap blocks;
    ParameterMap params;

    ggml_type get_type(const std::string& name, const String2TensorStorage& tensor_storage_map, ggml_type default_type) {
        ggml_type wtype = default_type;
        auto iter       = tensor_storage_map.find(name);
        if (iter != tensor_storage_map.end()) {
            const TensorStorage& tensor_storage = iter->second;
            if (tensor_storage.expected_type != GGML_TYPE_COUNT) {
                wtype = tensor_storage.expected_type;
            } else {
                wtype = tensor_storage.type;
            }
        }
        return wtype;
    }

    void init_blocks(struct ggml_context* ctx, const String2TensorStorage& tensor_storage_map = {}, const std::string prefix = "") {
        for (auto& pair : blocks) {
            auto& block = pair.second;
            block->init(ctx, tensor_storage_map, prefix + pair.first);
        }
    }

    virtual void init_params(struct ggml_context* ctx, const String2TensorStorage& tensor_storage_map = {}, const std::string prefix = "") {}

public:
    void init(struct ggml_context* ctx, const String2TensorStorage& tensor_storage_map = {}, std::string prefix = "") {
        if (prefix.size() > 0) {
            prefix = prefix + ".";
        }
        init_params(ctx, tensor_storage_map, prefix);
        init_blocks(ctx, tensor_storage_map, prefix);
    }

    size_t get_params_num() {
        size_t num_tensors = params.size();
        for (auto& pair : blocks) {
            auto& block = pair.second;

            num_tensors += block->get_params_num();
        }
        return num_tensors;
    }

    size_t get_params_mem_size() {
        size_t mem_size = 0;
        for (auto& pair : blocks) {
            auto& block = pair.second;

            mem_size += block->get_params_mem_size();
        }

        for (auto& pair : params) {
            mem_size += ggml_nbytes(pair.second);
        }

        return mem_size;
    }

    void get_param_tensors(std::map<std::string, struct ggml_tensor*>& tensors, std::string prefix = "") {
        if (prefix.size() > 0) {
            prefix = prefix + ".";
        }
        for (auto& pair : blocks) {
            auto& block = pair.second;
            block->get_param_tensors(tensors, prefix + pair.first);
        }

        for (auto& pair : params) {
            tensors[prefix + pair.first] = pair.second;
        }
    }

    virtual std::string get_desc() {
        return "GGMLBlock";
    }

    void get_all_blocks(std::vector<GGMLBlock*>& result) {
        result.push_back(this);
        for (auto& block_iter : blocks) {
            if (block_iter.second) {
                block_iter.second->get_all_blocks(result);
            }
        }
    }
};

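// Lifecycle of a GGMLBlock: constructors only record hyperparameters and
// register sub-blocks; init(params_ctx, ...) is called once to create the
// parameter tensors; get_param_tensors() then yields the name -> tensor map
// used to load weights from a checkpoint.
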
class UnaryBlock : public GGMLBlock {
public:
    virtual struct ggml_tensor* forward(GGMLRunnerContext* ctx, struct ggml_tensor* x) = 0;
};

class Identity : public UnaryBlock {
public:
    struct ggml_tensor* forward(GGMLRunnerContext* ctx, struct ggml_tensor* x) {
        return x;
    }
};

class Linear : public UnaryBlock {
protected:
    int64_t in_features;
    int64_t out_features;
    bool bias;
    bool force_f32;
    bool force_prec_f32;
    float scale;
    std::string prefix;

    void init_params(struct ggml_context* ctx, const String2TensorStorage& tensor_storage_map = {}, const std::string prefix = "") override {
        this->prefix         = prefix;
        enum ggml_type wtype = get_type(prefix + "weight", tensor_storage_map, GGML_TYPE_F32);
        if (in_features % ggml_blck_size(wtype) != 0 || force_f32) {
            wtype = GGML_TYPE_F32;
        }
        params["weight"] = ggml_new_tensor_2d(ctx, wtype, in_features, out_features);
        if (bias) {
            enum ggml_type wtype = GGML_TYPE_F32;
            params["bias"]       = ggml_new_tensor_1d(ctx, wtype, out_features);
        }
    }

public:
    Linear(int64_t in_features,
           int64_t out_features,
           bool bias           = true,
           bool force_f32      = false,
           bool force_prec_f32 = false,
           float scale         = 1.f)
        : in_features(in_features),
          out_features(out_features),
          bias(bias),
          force_f32(force_f32),
          force_prec_f32(force_prec_f32),
          scale(scale) {}

    struct ggml_tensor* forward(GGMLRunnerContext* ctx, struct ggml_tensor* x) {
        struct ggml_tensor* w = params["weight"];
        struct ggml_tensor* b = nullptr;
        if (bias) {
            b = params["bias"];
        }
        if (ctx->weight_adapter) {
            WeightAdapter::ForwardParams forward_params;
            forward_params.op_type               = WeightAdapter::ForwardParams::op_type_t::OP_LINEAR;
            forward_params.linear.force_prec_f32 = force_prec_f32;
            forward_params.linear.scale          = scale;
            return ctx->weight_adapter->forward_with_lora(ctx->ggml_ctx, x, w, b, prefix, forward_params);
        }
        return ggml_ext_linear(ctx->ggml_ctx, x, w, b, force_prec_f32, scale);
    }
};

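// Composition sketch (illustrative only; "Mlp" is a hypothetical block, not
// part of this header): a two-layer MLP built out of Linear blocks, showing
// the conventions the concrete blocks in this file follow (register
// sub-blocks in the constructor, fetch and forward them by name):
//
//   class Mlp : public UnaryBlock {
//   public:
//       Mlp(int64_t dim, int64_t hidden_dim) {
//           blocks["fc1"] = std::shared_ptr<GGMLBlock>(new Linear(dim, hidden_dim));
//           blocks["fc2"] = std::shared_ptr<GGMLBlock>(new Linear(hidden_dim, dim));
//       }
//       struct ggml_tensor* forward(GGMLRunnerContext* ctx, struct ggml_tensor* x) {
//           auto fc1 = std::dynamic_pointer_cast<Linear>(blocks["fc1"]);
//           auto fc2 = std::dynamic_pointer_cast<Linear>(blocks["fc2"]);
//           x = fc1->forward(ctx, x);
//           x = ggml_gelu_inplace(ctx->ggml_ctx, x);
//           x = fc2->forward(ctx, x);
//           return x;
//       }
//   };
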
__STATIC_INLINE__ bool support_get_rows(ggml_type wtype) {
    std::set<ggml_type> allow_types = {GGML_TYPE_F16, GGML_TYPE_Q8_0, GGML_TYPE_Q5_1, GGML_TYPE_Q5_0, GGML_TYPE_Q4_1, GGML_TYPE_Q4_0};
    return allow_types.find(wtype) != allow_types.end();
}

class Embedding : public UnaryBlock {
protected:
    int64_t embedding_dim;
    int64_t num_embeddings;

    void init_params(struct ggml_context* ctx, const String2TensorStorage& tensor_storage_map, const std::string prefix = "") override {
        enum ggml_type wtype = get_type(prefix + "weight", tensor_storage_map, GGML_TYPE_F32);
        if (!support_get_rows(wtype)) {
            wtype = GGML_TYPE_F32;
        }
        params["weight"] = ggml_new_tensor_2d(ctx, wtype, embedding_dim, num_embeddings);
    }

public:
    Embedding(int64_t num_embeddings, int64_t embedding_dim)
        : embedding_dim(embedding_dim),
          num_embeddings(num_embeddings) {
    }

    struct ggml_tensor* forward(GGMLRunnerContext* ctx,
                                struct ggml_tensor* input_ids) {
        // input_ids: [N, n_token]
        auto weight = params["weight"];

        // There are issues with ggml batch inference, so we expand the batch here first.
        // TODO: fix ggml batch inference
        int64_t n = input_ids->ne[1];
        input_ids = ggml_reshape_1d(ctx->ggml_ctx, input_ids, input_ids->ne[0] * input_ids->ne[1]);

        input_ids      = ggml_reshape_3d(ctx->ggml_ctx, input_ids, input_ids->ne[0], 1, input_ids->ne[1]);
        auto embedding = ggml_get_rows(ctx->ggml_ctx, weight, input_ids);
        embedding      = ggml_reshape_3d(ctx->ggml_ctx, embedding, embedding->ne[0], embedding->ne[1] / n, n);

        // [N, n_token, embedding_dim]
        return embedding;
    }
};

class Conv2d : public UnaryBlock {
protected:
    int64_t in_channels;
    int64_t out_channels;
    std::pair<int, int> kernel_size;
    std::pair<int, int> stride;
    std::pair<int, int> padding;
    std::pair<int, int> dilation;
    bool bias;
    float scale = 1.f;
    std::string prefix;

    void init_params(struct ggml_context* ctx, const String2TensorStorage& tensor_storage_map, const std::string prefix = "") override {
        this->prefix         = prefix;
        enum ggml_type wtype = GGML_TYPE_F16;
        params["weight"]     = ggml_new_tensor_4d(ctx, wtype, kernel_size.second, kernel_size.first, in_channels, out_channels);
        if (bias) {
            enum ggml_type wtype = GGML_TYPE_F32;
            params["bias"]       = ggml_new_tensor_1d(ctx, wtype, out_channels);
        }
    }

public:
    Conv2d(int64_t in_channels,
           int64_t out_channels,
           std::pair<int, int> kernel_size,
           std::pair<int, int> stride   = {1, 1},
           std::pair<int, int> padding  = {0, 0},
           std::pair<int, int> dilation = {1, 1},
           bool bias                    = true)
        : in_channels(in_channels),
          out_channels(out_channels),
          kernel_size(kernel_size),
          stride(stride),
          padding(padding),
          dilation(dilation),
          bias(bias) {}

    void set_scale(float scale_value) {
        scale = scale_value;
    }

    std::string get_desc() {
        return "Conv2d";
    }

    struct ggml_tensor* forward(GGMLRunnerContext* ctx, struct ggml_tensor* x) {
        struct ggml_tensor* w = params["weight"];
        struct ggml_tensor* b = nullptr;
        if (bias) {
            b = params["bias"];
        }
        if (ctx->weight_adapter) {
            WeightAdapter::ForwardParams forward_params;
            forward_params.op_type       = WeightAdapter::ForwardParams::op_type_t::OP_CONV2D;
            forward_params.conv2d.s0     = stride.second;
            forward_params.conv2d.s1     = stride.first;
            forward_params.conv2d.p0     = padding.second;
            forward_params.conv2d.p1     = padding.first;
            forward_params.conv2d.d0     = dilation.second;
            forward_params.conv2d.d1     = dilation.first;
            forward_params.conv2d.direct = ctx->conv2d_direct_enabled;
            forward_params.conv2d.scale  = scale;
            return ctx->weight_adapter->forward_with_lora(ctx->ggml_ctx, x, w, b, prefix, forward_params);
        }
        return ggml_ext_conv_2d(ctx->ggml_ctx,
                                x,
                                w,
                                b,
                                stride.second,
                                stride.first,
                                padding.second,
                                padding.first,
                                dilation.second,
                                dilation.first,
                                ctx->conv2d_direct_enabled,
                                scale);
    }
};

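// Note on Conv2d argument order: the kernel_size/stride/padding/dilation
// pairs follow the PyTorch (height, width) convention, while ggml orders
// tensor dims fastest-first (ne0 = width). That is why the weight tensor is
// created as [KW, KH, IC, OC] and forward() passes pair.second (width)
// before pair.first (height).
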
class Conv3dnx1x1 : public UnaryBlock {
protected:
    int64_t in_channels;
    int64_t out_channels;
    int64_t kernel_size;
    int64_t stride;
    int64_t padding;
    int64_t dilation;
    bool bias;

    void init_params(struct ggml_context* ctx, const String2TensorStorage& tensor_storage_map, const std::string prefix = "") override {
        enum ggml_type wtype = GGML_TYPE_F16;
        params["weight"]     = ggml_new_tensor_4d(ctx, wtype, 1, kernel_size, in_channels, out_channels);  // 5d => 4d
        if (bias) {
            enum ggml_type wtype = GGML_TYPE_F32;
            params["bias"]       = ggml_new_tensor_1d(ctx, wtype, out_channels);
        }
    }

public:
    Conv3dnx1x1(int64_t in_channels,
                int64_t out_channels,
                int64_t kernel_size,
                int64_t stride   = 1,
                int64_t padding  = 0,
                int64_t dilation = 1,
                bool bias        = true)
        : in_channels(in_channels),
          out_channels(out_channels),
          kernel_size(kernel_size),
          stride(stride),
          padding(padding),
          dilation(dilation),
          bias(bias) {}

    // x: [N, IC, ID, IH*IW]
    // result: [N, OC, OD, OH*OW]
    struct ggml_tensor* forward(GGMLRunnerContext* ctx, struct ggml_tensor* x) {
        struct ggml_tensor* w = params["weight"];
        struct ggml_tensor* b = nullptr;
        if (bias) {
            b = params["bias"];
        }
        return ggml_ext_conv_3d_nx1x1(ctx->ggml_ctx, x, w, b, stride, padding, dilation);
    }
};

class Conv3d : public UnaryBlock {
protected:
    int64_t in_channels;
    int64_t out_channels;
    std::tuple<int, int, int> kernel_size;
    std::tuple<int, int, int> stride;
    std::tuple<int, int, int> padding;
    std::tuple<int, int, int> dilation;
    bool bias;
    std::string prefix;

    void init_params(struct ggml_context* ctx, const String2TensorStorage& tensor_storage_map, const std::string prefix = "") override {
        this->prefix         = prefix;
        enum ggml_type wtype = GGML_TYPE_F16;
        params["weight"]     = ggml_new_tensor_4d(ctx,
                                                  wtype,
                                                  std::get<2>(kernel_size),
                                                  std::get<1>(kernel_size),
                                                  std::get<0>(kernel_size),
                                                  in_channels * out_channels);
        if (bias) {
            params["bias"] = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, out_channels);
        }
    }

public:
    Conv3d(int64_t in_channels,
           int64_t out_channels,
           std::tuple<int, int, int> kernel_size,
           std::tuple<int, int, int> stride   = {1, 1, 1},
           std::tuple<int, int, int> padding  = {0, 0, 0},
           std::tuple<int, int, int> dilation = {1, 1, 1},
           bool bias                          = true)
        : in_channels(in_channels),
          out_channels(out_channels),
          kernel_size(kernel_size),
          stride(stride),
          padding(padding),
          dilation(dilation),
          bias(bias) {}

    struct ggml_tensor* forward(GGMLRunnerContext* ctx, struct ggml_tensor* x) {
        struct ggml_tensor* w = params["weight"];
        struct ggml_tensor* b = nullptr;
        if (ctx->weight_adapter) {
            w = ctx->weight_adapter->patch_weight(ctx->ggml_ctx, w, prefix + "weight");
            if (w->type != GGML_TYPE_F16) {
                w = ggml_cast(ctx->ggml_ctx, w, GGML_TYPE_F16);
            }
        }
        if (bias) {
            b = params["bias"];
            if (ctx->weight_adapter) {
                b = ctx->weight_adapter->patch_weight(ctx->ggml_ctx, b, prefix + "bias");
            }
        }
        return ggml_ext_conv_3d(ctx->ggml_ctx, x, w, b, in_channels,
                                std::get<2>(stride), std::get<1>(stride), std::get<0>(stride),
                                std::get<2>(padding), std::get<1>(padding), std::get<0>(padding),
                                std::get<2>(dilation), std::get<1>(dilation), std::get<0>(dilation));
    }
};

class LayerNorm : public UnaryBlock {
protected:
    int64_t normalized_shape;
    float eps;
    bool elementwise_affine;
    bool bias;
    std::string prefix;

    void init_params(struct ggml_context* ctx, const String2TensorStorage& tensor_storage_map = {}, const std::string prefix = "") override {
        this->prefix = prefix;
        if (elementwise_affine) {
            enum ggml_type wtype = GGML_TYPE_F32;
            params["weight"]     = ggml_new_tensor_1d(ctx, wtype, normalized_shape);
            if (bias) {
                enum ggml_type wtype = GGML_TYPE_F32;
                params["bias"]       = ggml_new_tensor_1d(ctx, wtype, normalized_shape);
            }
        }
    }

public:
    LayerNorm(int64_t normalized_shape,
              float eps               = 1e-05f,
              bool elementwise_affine = true,
              bool bias               = true)
        : normalized_shape(normalized_shape),
          eps(eps),
          elementwise_affine(elementwise_affine),
          bias(bias) {}

    struct ggml_tensor* forward(GGMLRunnerContext* ctx, struct ggml_tensor* x) {
        struct ggml_tensor* w = nullptr;
        struct ggml_tensor* b = nullptr;

        if (elementwise_affine) {
            w = params["weight"];
            if (ctx->weight_adapter) {
                w = ctx->weight_adapter->patch_weight(ctx->ggml_ctx, w, prefix + "weight");
            }
            if (bias) {
                b = params["bias"];
                if (ctx->weight_adapter) {
                    b = ctx->weight_adapter->patch_weight(ctx->ggml_ctx, b, prefix + "bias");
                }
            }
        }
        return ggml_ext_layer_norm(ctx->ggml_ctx, x, w, b, eps);
    }
};

class GroupNorm : public GGMLBlock {
protected:
    int64_t num_groups;
    int64_t num_channels;
    float eps;
    bool affine;
    std::string prefix;

    void init_params(struct ggml_context* ctx, const String2TensorStorage& tensor_storage_map = {}, const std::string prefix = "") override {
        this->prefix = prefix;
        if (affine) {
            enum ggml_type wtype      = GGML_TYPE_F32;
            enum ggml_type bias_wtype = GGML_TYPE_F32;
            params["weight"]          = ggml_new_tensor_1d(ctx, wtype, num_channels);
            params["bias"]            = ggml_new_tensor_1d(ctx, bias_wtype, num_channels);
        }
    }

public:
    GroupNorm(int64_t num_groups,
              int64_t num_channels,
              float eps   = 1e-05f,
              bool affine = true)
        : num_groups(num_groups),
          num_channels(num_channels),
          eps(eps),
          affine(affine) {}

    struct ggml_tensor* forward(GGMLRunnerContext* ctx, struct ggml_tensor* x) {
        struct ggml_tensor* w = nullptr;
        struct ggml_tensor* b = nullptr;
        if (affine) {
            w = params["weight"];
            b = params["bias"];
            if (ctx->weight_adapter) {
                w = ctx->weight_adapter->patch_weight(ctx->ggml_ctx, w, prefix + "weight");
                b = ctx->weight_adapter->patch_weight(ctx->ggml_ctx, b, prefix + "bias");
            }
        }
        return ggml_ext_group_norm(ctx->ggml_ctx, x, w, b, num_groups);
    }
};

class GroupNorm32 : public GroupNorm {
public:
    GroupNorm32(int64_t num_channels)
        : GroupNorm(32, num_channels, 1e-06f) {}
};

class RMSNorm : public UnaryBlock {
protected:
    int64_t hidden_size;
    float eps;
    std::string prefix;

    void init_params(struct ggml_context* ctx, const String2TensorStorage& tensor_storage_map = {}, const std::string prefix = "") override {
        this->prefix         = prefix;
        enum ggml_type wtype = GGML_TYPE_F32;
        params["weight"]     = ggml_new_tensor_1d(ctx, wtype, hidden_size);
    }

public:
    RMSNorm(int64_t hidden_size,
            float eps = 1e-06f)
        : hidden_size(hidden_size),
          eps(eps) {}

    struct ggml_tensor* forward(GGMLRunnerContext* ctx, struct ggml_tensor* x) {
        struct ggml_tensor* w = params["weight"];
        if (ctx->weight_adapter) {
            w = ctx->weight_adapter->patch_weight(ctx->ggml_ctx, w, prefix + "weight");
        }
        x = ggml_rms_norm(ctx->ggml_ctx, x, eps);
        x = ggml_mul_inplace(ctx->ggml_ctx, x, w);
        return x;
    }
};

class MultiheadAttention : public GGMLBlock {
protected:
    int64_t embed_dim;
    int64_t n_head;
    bool proj_in;
    std::string q_proj_name;
    std::string k_proj_name;
    std::string v_proj_name;
    std::string in_proj_name;
    std::string out_proj_name;

public:
    MultiheadAttention(int64_t embed_dim,
                       int64_t n_head,
                       bool qkv_proj_bias        = true,
                       bool out_proj_bias        = true,
                       bool proj_in              = false,
                       std::string q_proj_name   = "q_proj",
                       std::string k_proj_name   = "k_proj",
                       std::string v_proj_name   = "v_proj",
                       std::string in_proj_name  = "in_proj",
                       std::string out_proj_name = "out_proj")
        : embed_dim(embed_dim),
          n_head(n_head),
          proj_in(proj_in),
          q_proj_name(q_proj_name),
          k_proj_name(k_proj_name),
          v_proj_name(v_proj_name),
          in_proj_name(in_proj_name),
          out_proj_name(out_proj_name) {
        if (proj_in) {
            blocks[in_proj_name] = std::shared_ptr<GGMLBlock>(new Linear(embed_dim, embed_dim * 3, qkv_proj_bias));
        } else {
            blocks[q_proj_name] = std::shared_ptr<GGMLBlock>(new Linear(embed_dim, embed_dim, qkv_proj_bias));
            blocks[k_proj_name] = std::shared_ptr<GGMLBlock>(new Linear(embed_dim, embed_dim, qkv_proj_bias));
            blocks[v_proj_name] = std::shared_ptr<GGMLBlock>(new Linear(embed_dim, embed_dim, qkv_proj_bias));
        }
        blocks[out_proj_name] = std::shared_ptr<GGMLBlock>(new Linear(embed_dim, embed_dim, out_proj_bias));
    }

    // x: [N, n_token, embed_dim]
    struct ggml_tensor* forward(GGMLRunnerContext* ctx,
                                struct ggml_tensor* x,
                                bool mask = false) {
        auto out_proj = std::dynamic_pointer_cast<Linear>(blocks[out_proj_name]);

        ggml_tensor* q;
        ggml_tensor* k;
        ggml_tensor* v;
        if (proj_in) {
            auto in_proj = std::dynamic_pointer_cast<Linear>(blocks[in_proj_name]);
            auto qkv     = in_proj->forward(ctx, x);
            auto qkv_vec = split_qkv(ctx->ggml_ctx, qkv);
            q            = qkv_vec[0];
            k            = qkv_vec[1];
            v            = qkv_vec[2];
        } else {
            auto q_proj = std::dynamic_pointer_cast<Linear>(blocks[q_proj_name]);
            auto k_proj = std::dynamic_pointer_cast<Linear>(blocks[k_proj_name]);
            auto v_proj = std::dynamic_pointer_cast<Linear>(blocks[v_proj_name]);

            q = q_proj->forward(ctx, x);
            k = k_proj->forward(ctx, x);
            v = v_proj->forward(ctx, x);
        }

        x = ggml_ext_attention_ext(ctx->ggml_ctx, ctx->backend, q, k, v, n_head, nullptr, mask);  // [N, n_token, embed_dim]

        x = out_proj->forward(ctx, x);  // [N, n_token, embed_dim]
        return x;
    }
};

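// Usage sketch (illustrative): masked self-attention over a
// [N, n_token, embed_dim] activation, assuming `rctx` is a GGMLRunnerContext
// obtained from a GGMLRunner and the block was init()-ed and its weights
// loaded beforehand:
//
//   MultiheadAttention mha(768, 12);
//   // ... mha.init(params_ctx); load weights; then, inside graph building:
//   // ggml_tensor* y = mha.forward(&rctx, x, /* mask = */ true);
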
#endif  // __GGML_EXTEND_HPP__