mirror of
https://github.com/leejet/stable-diffusion.cpp.git
synced 2025-12-12 13:28:37 +00:00
* add qwen tokenizer * add qwen2.5 vl support * mv qwen.hpp -> qwenvl.hpp * add qwen image model * add qwen image t2i pipeline * fix qwen image flash attn * add qwen image i2i pipline * change encoding of vocab_qwen.hpp to utf8 * fix get_first_stage_encoding * apply jeffbolz f32 patch https://github.com/leejet/stable-diffusion.cpp/pull/851#issuecomment-3335515302 * fix the issue that occurs when using CUDA with k-quants weights * optimize the handling of the FeedForward precision fix * to_add_out precision fix * update docs
732 lines
30 KiB
C++
732 lines
30 KiB
C++
#ifndef __QWENVL_HPP__
|
|
#define __QWENVL_HPP__
|
|
|
|
#include <algorithm>
|
|
#include <fstream>
|
|
#include <iostream>
|
|
#include <map>
|
|
#include <optional>
|
|
#include <regex>
|
|
#include <set>
|
|
#include <sstream>
|
|
#include <string>
|
|
#include <vector>
|
|
|
|
#include "clip.hpp"
|
|
#include "ggml_extend.hpp"
|
|
#include "json.hpp"
|
|
#include "tokenize_util.h"
|
|
|
|
namespace Qwen {
|
|
|
|
class Qwen2Tokenizer {
|
|
private:
|
|
std::map<int, std::u32string> byte_encoder;
|
|
std::map<std::u32string, int> byte_decoder;
|
|
std::map<std::u32string, int> encoder;
|
|
std::map<int, std::u32string> decoder;
|
|
std::map<std::pair<std::u32string, std::u32string>, int> bpe_ranks;
|
|
std::regex pat;
|
|
int encoder_len;
|
|
int bpe_len;
|
|
|
|
public:
|
|
const std::string UNK_TOKEN = "<|endoftext|>";
|
|
const std::string EOS_TOKEN = "<|endoftext|>";
|
|
const std::string PAD_TOKEN = "<|endoftext|>";
|
|
|
|
const int UNK_TOKEN_ID = 151643;
|
|
const int EOS_TOKEN_ID = 151643;
|
|
const int PAD_TOKEN_ID = 151643;
|
|
|
|
std::vector<std::string> special_tokens = {
|
|
"<|endoftext|>",
|
|
"<|im_start|>",
|
|
"<|im_end|>",
|
|
"<|object_ref_start|>",
|
|
"<|object_ref_end|>",
|
|
"<|box_start|>",
|
|
"<|box_end|>",
|
|
"<|quad_start|>",
|
|
"<|quad_end|>",
|
|
"<|vision_start|>",
|
|
"<|vision_end|>",
|
|
"<|vision_pad|>",
|
|
"<|image_pad|>",
|
|
"<|video_pad|>",
|
|
"<tool_call>",
|
|
"</tool_call>",
|
|
"<|fim_prefix|>",
|
|
"<|fim_middle|>",
|
|
"<|fim_suffix|>",
|
|
"<|fim_pad|>",
|
|
"<|repo_name|>",
|
|
"<|file_sep|>",
|
|
};
|
|
|
|
private:
|
|
static std::string strip(const std::string& str) {
|
|
std::string::size_type start = str.find_first_not_of(" \t\n\r\v\f");
|
|
std::string::size_type end = str.find_last_not_of(" \t\n\r\v\f");
|
|
|
|
if (start == std::string::npos) {
|
|
// String contains only whitespace characters
|
|
return "";
|
|
}
|
|
|
|
return str.substr(start, end - start + 1);
|
|
}
|
|
|
|
static std::string whitespace_clean(std::string text) {
|
|
text = std::regex_replace(text, std::regex(R"(\s+)"), " ");
|
|
text = strip(text);
|
|
return text;
|
|
}
|
|
|
|
static std::set<std::pair<std::u32string, std::u32string>> get_pairs(const std::vector<std::u32string>& subwords) {
|
|
std::set<std::pair<std::u32string, std::u32string>> pairs;
|
|
if (subwords.size() == 0) {
|
|
return pairs;
|
|
}
|
|
std::u32string prev_subword = subwords[0];
|
|
for (int i = 1; i < subwords.size(); i++) {
|
|
std::u32string subword = subwords[i];
|
|
std::pair<std::u32string, std::u32string> pair(prev_subword, subword);
|
|
pairs.insert(pair);
|
|
prev_subword = subword;
|
|
}
|
|
return pairs;
|
|
}
|
|
|
|
bool is_special_token(const std::string& token) {
|
|
for (auto& special_token : special_tokens) {
|
|
if (special_token == token) {
|
|
return true;
|
|
}
|
|
}
|
|
return false;
|
|
}
|
|
|
|
public:
|
|
explicit Qwen2Tokenizer(const std::string& merges_utf8_str = "") {
|
|
if (merges_utf8_str.size() > 0) {
|
|
load_from_merges(merges_utf8_str);
|
|
} else {
|
|
load_from_merges(ModelLoader::load_qwen2_merges());
|
|
}
|
|
}
|
|
|
|
void load_from_merges(const std::string& merges_utf8_str) {
|
|
auto byte_unicode_pairs = bytes_to_unicode();
|
|
// printf("byte_unicode_pairs have %lu pairs \n", byte_unicode_pairs.size());
|
|
byte_encoder = std::map<int, std::u32string>(byte_unicode_pairs.begin(), byte_unicode_pairs.end());
|
|
for (auto& pair : byte_unicode_pairs) {
|
|
byte_decoder[pair.second] = pair.first;
|
|
}
|
|
// for (auto & pair: byte_unicode_pairs) {
|
|
// std::cout << pair.first << ": " << pair.second << std::endl;
|
|
// }
|
|
std::vector<std::u32string> merges;
|
|
size_t start = 0;
|
|
size_t pos;
|
|
std::u32string merges_utf32_str = utf8_to_utf32(merges_utf8_str);
|
|
while ((pos = merges_utf32_str.find('\n', start)) != std::string::npos) {
|
|
merges.push_back(merges_utf32_str.substr(start, pos - start));
|
|
start = pos + 1;
|
|
}
|
|
LOG_DEBUG("merges size %llu", merges.size());
|
|
merges = std::vector<std::u32string>(merges.begin(), merges.end());
|
|
std::vector<std::pair<std::u32string, std::u32string>> merge_pairs;
|
|
for (const auto& merge : merges) {
|
|
size_t space_pos = merge.find(' ');
|
|
merge_pairs.emplace_back(merge.substr(0, space_pos), merge.substr(space_pos + 1));
|
|
// LOG_DEBUG("%s", utf32_to_utf8(merge.substr(space_pos + 1)).c_str());
|
|
// printf("%s :: %s | %s \n", utf32_to_utf8(merge).c_str(), utf32_to_utf8(merge.substr(0, space_pos)).c_str(),
|
|
// utf32_to_utf8(merge.substr(space_pos + 1)).c_str());
|
|
}
|
|
|
|
std::vector<std::u32string> vocab;
|
|
for (const auto& pair : byte_unicode_pairs) {
|
|
vocab.push_back(pair.second);
|
|
}
|
|
for (const auto& merge : merge_pairs) {
|
|
vocab.push_back(merge.first + merge.second);
|
|
}
|
|
for (auto& special_token : special_tokens) {
|
|
vocab.push_back(utf8_to_utf32(special_token));
|
|
}
|
|
|
|
LOG_DEBUG("vocab size: %llu", vocab.size());
|
|
int i = 0;
|
|
for (const auto& token : vocab) {
|
|
encoder[token] = i;
|
|
decoder[i] = token;
|
|
i++;
|
|
}
|
|
encoder_len = i;
|
|
|
|
int rank = 0;
|
|
for (const auto& merge : merge_pairs) {
|
|
bpe_ranks[merge] = rank++;
|
|
}
|
|
bpe_len = rank;
|
|
};
|
|
|
|
std::u32string bpe(const std::u32string& token) {
|
|
std::vector<std::u32string> word;
|
|
|
|
for (int i = 0; i < token.size(); i++) {
|
|
word.emplace_back(1, token[i]);
|
|
}
|
|
|
|
std::set<std::pair<std::u32string, std::u32string>> pairs = get_pairs(word);
|
|
|
|
if (pairs.empty()) {
|
|
return token;
|
|
}
|
|
|
|
while (true) {
|
|
auto min_pair_iter = std::min_element(pairs.begin(),
|
|
pairs.end(),
|
|
[&](const std::pair<std::u32string, std::u32string>& a,
|
|
const std::pair<std::u32string, std::u32string>& b) {
|
|
if (bpe_ranks.find(a) == bpe_ranks.end()) {
|
|
return false;
|
|
} else if (bpe_ranks.find(b) == bpe_ranks.end()) {
|
|
return true;
|
|
}
|
|
return bpe_ranks.at(a) < bpe_ranks.at(b);
|
|
});
|
|
|
|
const std::pair<std::u32string, std::u32string>& bigram = *min_pair_iter;
|
|
|
|
if (bpe_ranks.find(bigram) == bpe_ranks.end()) {
|
|
break;
|
|
}
|
|
|
|
std::u32string first = bigram.first;
|
|
std::u32string second = bigram.second;
|
|
std::vector<std::u32string> new_word;
|
|
int32_t i = 0;
|
|
|
|
while (i < word.size()) {
|
|
auto it = std::find(word.begin() + i, word.end(), first);
|
|
if (it == word.end()) {
|
|
new_word.insert(new_word.end(), word.begin() + i, word.end());
|
|
break;
|
|
}
|
|
new_word.insert(new_word.end(), word.begin() + i, it);
|
|
i = static_cast<int32_t>(std::distance(word.begin(), it));
|
|
|
|
if (word[i] == first && i < static_cast<int32_t>(word.size()) - 1 && word[i + 1] == second) {
|
|
new_word.push_back(first + second);
|
|
i += 2;
|
|
} else {
|
|
new_word.push_back(word[i]);
|
|
i += 1;
|
|
}
|
|
}
|
|
|
|
word = new_word;
|
|
|
|
if (word.size() == 1) {
|
|
break;
|
|
}
|
|
pairs = get_pairs(word);
|
|
}
|
|
|
|
std::u32string result;
|
|
for (int i = 0; i < word.size(); i++) {
|
|
result += word[i];
|
|
if (i != word.size() - 1) {
|
|
result += utf8_to_utf32(" ");
|
|
}
|
|
}
|
|
|
|
return result;
|
|
}
|
|
|
|
std::vector<int> tokenize(std::string text,
|
|
on_new_token_cb_t on_new_token_cb = nullptr,
|
|
size_t max_length = 0,
|
|
bool padding = false) {
|
|
std::vector<int32_t> tokens = encode(text, on_new_token_cb);
|
|
|
|
if (max_length > 0) {
|
|
if (tokens.size() < max_length) {
|
|
tokens.resize(max_length);
|
|
} else {
|
|
if (padding) {
|
|
tokens.insert(tokens.end(), max_length - tokens.size(), PAD_TOKEN_ID);
|
|
}
|
|
}
|
|
}
|
|
|
|
return tokens;
|
|
}
|
|
|
|
void pad_tokens(std::vector<int>& tokens,
|
|
std::vector<float>& weights,
|
|
size_t max_length = 0,
|
|
bool padding = false) {
|
|
if (max_length > 0 && padding) {
|
|
size_t n = std::ceil(tokens.size() * 1.0 / max_length);
|
|
if (n == 0) {
|
|
n = 1;
|
|
}
|
|
size_t length = max_length * n;
|
|
LOG_DEBUG("token length: %llu", length);
|
|
tokens.insert(tokens.end(), length - tokens.size(), PAD_TOKEN_ID);
|
|
weights.insert(weights.end(), length - weights.size(), 1.0);
|
|
}
|
|
}
|
|
|
|
std::vector<int> encode(std::string text, on_new_token_cb_t on_new_token_cb = nullptr) {
|
|
std::string original_text = text;
|
|
std::vector<int32_t> bpe_tokens;
|
|
std::vector<std::string> token_strs;
|
|
|
|
auto splited_texts = split_with_special_tokens(text, special_tokens);
|
|
|
|
for (auto& splited_text : splited_texts) {
|
|
if (is_special_token(splited_text)) {
|
|
bpe_tokens.push_back(encoder[utf8_to_utf32(splited_text)]);
|
|
token_strs.push_back(splited_text);
|
|
continue;
|
|
}
|
|
auto tokens = token_split(splited_text);
|
|
for (auto& token : tokens) {
|
|
if (on_new_token_cb != nullptr) {
|
|
bool skip = on_new_token_cb(token, bpe_tokens);
|
|
if (skip) {
|
|
continue;
|
|
}
|
|
}
|
|
|
|
std::string token_str = token;
|
|
std::u32string utf32_token;
|
|
for (int i = 0; i < token_str.length(); i++) {
|
|
unsigned char b = token_str[i];
|
|
utf32_token += byte_encoder[b];
|
|
}
|
|
auto bpe_strs = bpe(utf32_token);
|
|
size_t start = 0;
|
|
size_t pos;
|
|
while ((pos = bpe_strs.find(' ', start)) != std::u32string::npos) {
|
|
auto bpe_str = bpe_strs.substr(start, pos - start);
|
|
bpe_tokens.push_back(encoder[bpe_str]);
|
|
token_strs.push_back(utf32_to_utf8(bpe_str));
|
|
|
|
start = pos + 1;
|
|
}
|
|
auto bpe_str = bpe_strs.substr(start, bpe_strs.size() - start);
|
|
bpe_tokens.push_back(encoder[bpe_str]);
|
|
token_strs.push_back(utf32_to_utf8(bpe_str));
|
|
}
|
|
}
|
|
|
|
std::stringstream ss;
|
|
ss << "[";
|
|
for (auto token : token_strs) {
|
|
ss << "\"" << token << "\", ";
|
|
}
|
|
ss << "]";
|
|
// LOG_DEBUG("split prompt \"%s\" to tokens %s", original_text.c_str(), ss.str().c_str());
|
|
// printf("split prompt \"%s\" to tokens %s \n", original_text.c_str(), ss.str().c_str());
|
|
return bpe_tokens;
|
|
}
|
|
};
|
|
|
|
struct Qwen2_5_VLMLP : public GGMLBlock {
|
|
public:
|
|
Qwen2_5_VLMLP(int64_t hidden_size, int64_t intermediate_size, bool bias = false) {
|
|
blocks["gate_proj"] = std::shared_ptr<GGMLBlock>(new Linear(hidden_size, intermediate_size, false));
|
|
blocks["up_proj"] = std::shared_ptr<GGMLBlock>(new Linear(hidden_size, intermediate_size, false));
|
|
blocks["down_proj"] = std::shared_ptr<GGMLBlock>(new Linear(intermediate_size, hidden_size, false));
|
|
}
|
|
|
|
struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* x) {
|
|
// x: [N, n_token, hidden_size]
|
|
auto gate_proj = std::dynamic_pointer_cast<Linear>(blocks["gate_proj"]);
|
|
auto up_proj = std::dynamic_pointer_cast<Linear>(blocks["up_proj"]);
|
|
auto down_proj = std::dynamic_pointer_cast<Linear>(blocks["down_proj"]);
|
|
|
|
auto h = gate_proj->forward(ctx, x);
|
|
h = ggml_silu_inplace(ctx, h);
|
|
h = ggml_mul_inplace(ctx, h, up_proj->forward(ctx, x));
|
|
h = down_proj->forward(ctx, h);
|
|
return h;
|
|
}
|
|
};
|
|
|
|
struct Qwen2_5_VLAttention : public GGMLBlock {
|
|
protected:
|
|
int64_t head_dim;
|
|
int64_t num_heads;
|
|
int64_t num_kv_heads;
|
|
|
|
public:
|
|
Qwen2_5_VLAttention(int64_t hidden_size,
|
|
int64_t num_heads,
|
|
int64_t num_kv_heads)
|
|
: num_heads(num_heads), num_kv_heads(num_kv_heads) {
|
|
head_dim = hidden_size / num_heads;
|
|
GGML_ASSERT(num_heads * head_dim == hidden_size);
|
|
blocks["q_proj"] = std::shared_ptr<GGMLBlock>(new Linear(hidden_size, num_heads * head_dim));
|
|
blocks["k_proj"] = std::shared_ptr<GGMLBlock>(new Linear(hidden_size, num_kv_heads * head_dim));
|
|
blocks["v_proj"] = std::shared_ptr<GGMLBlock>(new Linear(hidden_size, num_kv_heads * head_dim));
|
|
blocks["o_proj"] = std::shared_ptr<GGMLBlock>(new Linear(num_heads * head_dim, hidden_size, false));
|
|
}
|
|
|
|
struct ggml_tensor* forward(struct ggml_context* ctx,
|
|
ggml_backend_t backend,
|
|
struct ggml_tensor* x,
|
|
struct ggml_tensor* input_pos) {
|
|
// x: [N, n_token, hidden_size]
|
|
int64_t n_token = x->ne[1];
|
|
int64_t N = x->ne[2];
|
|
auto q_proj = std::dynamic_pointer_cast<Linear>(blocks["q_proj"]);
|
|
auto k_proj = std::dynamic_pointer_cast<Linear>(blocks["k_proj"]);
|
|
auto v_proj = std::dynamic_pointer_cast<Linear>(blocks["v_proj"]);
|
|
auto out_proj = std::dynamic_pointer_cast<Linear>(blocks["o_proj"]);
|
|
|
|
auto q = q_proj->forward(ctx, x); // [N, n_token, num_heads*head_dim]
|
|
auto k = k_proj->forward(ctx, x); // [N, n_token, num_kv_heads*head_dim]
|
|
auto v = v_proj->forward(ctx, x); // [N, n_token, num_kv_heads*head_dim]
|
|
|
|
q = ggml_reshape_4d(ctx, q, head_dim, num_heads, n_token, N); // [N, n_token, num_heads, head_dim]
|
|
k = ggml_reshape_4d(ctx, k, head_dim, num_kv_heads, n_token, N); // [N, n_token, num_kv_heads, head_dim]
|
|
v = ggml_reshape_4d(ctx, v, head_dim, num_kv_heads, n_token, N); // [N, n_token, num_kv_heads, head_dim]
|
|
|
|
int sections[4] = {16, 24, 24, 0};
|
|
q = ggml_rope_multi(ctx, q, input_pos, nullptr, head_dim, sections, GGML_ROPE_TYPE_MROPE, 128000, 1000000.f, 1.f, 0.f, 1.f, 32.f, 1.f);
|
|
k = ggml_rope_multi(ctx, k, input_pos, nullptr, head_dim, sections, GGML_ROPE_TYPE_MROPE, 128000, 1000000.f, 1.f, 0.f, 1.f, 32.f, 1.f);
|
|
|
|
q = ggml_cont(ctx, ggml_torch_permute(ctx, q, 0, 2, 1, 3)); // [N, num_heads, n_token, head_dim]
|
|
q = ggml_reshape_3d(ctx, q, q->ne[0], q->ne[1], q->ne[2] * q->ne[3]); // [N*num_heads, n_token, head_dim]
|
|
|
|
k = ggml_cont(ctx, ggml_torch_permute(ctx, k, 0, 2, 1, 3)); // [N, num_kv_heads, n_token, head_dim]
|
|
k = ggml_reshape_3d(ctx, k, k->ne[0], k->ne[1], k->ne[2] * k->ne[3]); // [N*num_kv_heads, n_token, head_dim]
|
|
|
|
x = ggml_nn_attention_ext(ctx, backend, q, k, v, num_heads, nullptr, true, true, false); // [N, n_token, hidden_size]
|
|
|
|
x = out_proj->forward(ctx, x); // [N, n_token, hidden_size]
|
|
return x;
|
|
}
|
|
};
|
|
|
|
struct Qwen2_5_VLBlock : public GGMLBlock {
|
|
public:
|
|
Qwen2_5_VLBlock(int64_t hidden_size,
|
|
int64_t intermediate_size,
|
|
int64_t num_heads,
|
|
int64_t num_kv_heads,
|
|
float eps = 1e-6f) {
|
|
blocks["self_attn"] = std::shared_ptr<GGMLBlock>(new Qwen2_5_VLAttention(hidden_size, num_heads, num_kv_heads));
|
|
blocks["mlp"] = std::shared_ptr<GGMLBlock>(new Qwen2_5_VLMLP(hidden_size, intermediate_size));
|
|
blocks["input_layernorm"] = std::shared_ptr<GGMLBlock>(new RMSNorm(hidden_size, eps));
|
|
blocks["post_attention_layernorm"] = std::shared_ptr<GGMLBlock>(new RMSNorm(hidden_size, eps));
|
|
}
|
|
|
|
struct ggml_tensor* forward(struct ggml_context* ctx,
|
|
ggml_backend_t backend,
|
|
struct ggml_tensor* x,
|
|
struct ggml_tensor* input_pos) {
|
|
// x: [N, n_token, hidden_size]
|
|
auto self_attn = std::dynamic_pointer_cast<Qwen2_5_VLAttention>(blocks["self_attn"]);
|
|
auto mlp = std::dynamic_pointer_cast<Qwen2_5_VLMLP>(blocks["mlp"]);
|
|
auto input_layernorm = std::dynamic_pointer_cast<RMSNorm>(blocks["input_layernorm"]);
|
|
auto post_attention_layernorm = std::dynamic_pointer_cast<RMSNorm>(blocks["post_attention_layernorm"]);
|
|
|
|
auto residual = x;
|
|
x = input_layernorm->forward(ctx, x);
|
|
x = self_attn->forward(ctx, backend, x, input_pos);
|
|
x = ggml_add_inplace(ctx, x, residual);
|
|
|
|
residual = x;
|
|
x = post_attention_layernorm->forward(ctx, x);
|
|
x = mlp->forward(ctx, x);
|
|
x = ggml_add_inplace(ctx, x, residual);
|
|
|
|
return x;
|
|
}
|
|
};
|
|
|
|
struct Qwen2_5_VLTextModel : public GGMLBlock {
|
|
protected:
|
|
int64_t num_layers;
|
|
|
|
public:
|
|
Qwen2_5_VLTextModel(int64_t num_layers,
|
|
int64_t vocab_size,
|
|
int64_t hidden_size,
|
|
int64_t intermediate_size,
|
|
int64_t num_heads,
|
|
int64_t num_kv_heads,
|
|
float eps = 1e-6f)
|
|
: num_layers(num_layers) {
|
|
blocks["embed_tokens"] = std::shared_ptr<GGMLBlock>(new Embedding(vocab_size, hidden_size));
|
|
for (int i = 0; i < num_layers; i++) {
|
|
blocks["layers." + std::to_string(i)] = std::shared_ptr<GGMLBlock>(new Qwen2_5_VLBlock(hidden_size,
|
|
intermediate_size,
|
|
num_heads,
|
|
num_kv_heads));
|
|
}
|
|
blocks["norm"] = std::shared_ptr<GGMLBlock>(new RMSNorm(hidden_size, eps));
|
|
}
|
|
|
|
struct ggml_tensor* forward(struct ggml_context* ctx,
|
|
ggml_backend_t backend,
|
|
struct ggml_tensor* input_ids,
|
|
struct ggml_tensor* input_pos) {
|
|
// input_ids: [N, n_token]
|
|
// return: [N, n_token, hidden_size]
|
|
|
|
auto embed_tokens = std::dynamic_pointer_cast<Embedding>(blocks["embed_tokens"]);
|
|
auto norm = std::dynamic_pointer_cast<RMSNorm>(blocks["norm"]);
|
|
|
|
auto x = embed_tokens->forward(ctx, input_ids);
|
|
|
|
for (int i = 0; i < num_layers; i++) {
|
|
auto block = std::dynamic_pointer_cast<Qwen2_5_VLBlock>(blocks["layers." + std::to_string(i)]);
|
|
|
|
x = block->forward(ctx, backend, x, input_pos);
|
|
}
|
|
|
|
x = norm->forward(ctx, x);
|
|
return x;
|
|
}
|
|
};
|
|
|
|
struct Qwen2_5_VLParams {
|
|
int64_t num_layers = 28;
|
|
int64_t hidden_size = 3584;
|
|
int64_t intermediate_size = 18944;
|
|
int64_t num_heads = 28;
|
|
int64_t num_kv_heads = 4;
|
|
int64_t vocab_size = 152064;
|
|
float rms_norm_eps = 1e-06f;
|
|
};
|
|
|
|
struct Qwen2_5_VL : public GGMLBlock {
|
|
Qwen2_5_VLParams params;
|
|
|
|
public:
|
|
Qwen2_5_VL() {}
|
|
Qwen2_5_VL(Qwen2_5_VLParams params)
|
|
: params(params) {
|
|
blocks["model"] = std::shared_ptr<GGMLBlock>(new Qwen2_5_VLTextModel(params.num_layers,
|
|
params.vocab_size,
|
|
params.hidden_size,
|
|
params.intermediate_size,
|
|
params.num_heads,
|
|
params.num_kv_heads,
|
|
params.rms_norm_eps));
|
|
}
|
|
|
|
struct ggml_tensor* forward(struct ggml_context* ctx,
|
|
ggml_backend_t backend,
|
|
struct ggml_tensor* input_ids,
|
|
struct ggml_tensor* input_pos) {
|
|
// input_ids: [N, n_token]
|
|
auto model = std::dynamic_pointer_cast<Qwen2_5_VLTextModel>(blocks["model"]);
|
|
|
|
auto x = model->forward(ctx, backend, input_ids, input_pos);
|
|
return x;
|
|
}
|
|
};
|
|
|
|
struct Qwen2_5_VLRunner : public GGMLRunner {
|
|
Qwen2_5_VLParams params;
|
|
Qwen2_5_VL model;
|
|
|
|
std::vector<int> input_pos_vec;
|
|
|
|
Qwen2_5_VLRunner(ggml_backend_t backend,
|
|
bool offload_params_to_cpu,
|
|
const String2GGMLType& tensor_types,
|
|
const std::string prefix)
|
|
: GGMLRunner(backend, offload_params_to_cpu) {
|
|
model = Qwen2_5_VL(params);
|
|
model.init(params_ctx, tensor_types, prefix);
|
|
}
|
|
|
|
std::string get_desc() {
|
|
return "qwenvl2.5";
|
|
}
|
|
|
|
void get_param_tensors(std::map<std::string, struct ggml_tensor*>& tensors, const std::string prefix) {
|
|
model.get_param_tensors(tensors, prefix);
|
|
}
|
|
|
|
struct ggml_tensor* forward(struct ggml_context* ctx,
|
|
ggml_backend_t backend,
|
|
struct ggml_tensor* input_ids,
|
|
struct ggml_tensor* input_pos) {
|
|
auto hidden_states = model.forward(ctx, backend, input_ids, input_pos); // [N, n_token, hidden_size]
|
|
return hidden_states;
|
|
}
|
|
|
|
struct ggml_cgraph* build_graph(struct ggml_tensor* input_ids) {
|
|
struct ggml_cgraph* gf = ggml_new_graph(compute_ctx);
|
|
|
|
input_ids = to_backend(input_ids);
|
|
|
|
int64_t n_tokens = input_ids->ne[0];
|
|
input_pos_vec.resize(n_tokens * 4);
|
|
for (int i = 0; i < n_tokens; ++i) {
|
|
input_pos_vec[i] = i;
|
|
input_pos_vec[n_tokens + i] = i;
|
|
input_pos_vec[2 * n_tokens + i] = i;
|
|
input_pos_vec[3 * n_tokens + i] = 0;
|
|
}
|
|
|
|
auto input_pos = ggml_new_tensor_1d(compute_ctx,
|
|
GGML_TYPE_I32,
|
|
n_tokens * 4);
|
|
set_backend_tensor_data(input_pos, input_pos_vec.data());
|
|
|
|
struct ggml_tensor* hidden_states = forward(compute_ctx, runtime_backend, input_ids, input_pos);
|
|
|
|
ggml_build_forward_expand(gf, hidden_states);
|
|
|
|
return gf;
|
|
}
|
|
|
|
void compute(const int n_threads,
|
|
struct ggml_tensor* input_ids,
|
|
ggml_tensor** output,
|
|
ggml_context* output_ctx = NULL) {
|
|
auto get_graph = [&]() -> struct ggml_cgraph* {
|
|
return build_graph(input_ids);
|
|
};
|
|
GGMLRunner::compute(get_graph, n_threads, true, output, output_ctx);
|
|
}
|
|
};
|
|
|
|
struct Qwen2_5_VLEmbedder {
|
|
Qwen2Tokenizer tokenizer;
|
|
Qwen2_5_VLRunner model;
|
|
|
|
Qwen2_5_VLEmbedder(ggml_backend_t backend,
|
|
bool offload_params_to_cpu,
|
|
const String2GGMLType& tensor_types = {},
|
|
const std::string prefix = "")
|
|
: model(backend, offload_params_to_cpu, tensor_types, prefix) {
|
|
}
|
|
|
|
void get_param_tensors(std::map<std::string, struct ggml_tensor*>& tensors, const std::string prefix) {
|
|
model.get_param_tensors(tensors, prefix);
|
|
}
|
|
|
|
void alloc_params_buffer() {
|
|
model.alloc_params_buffer();
|
|
}
|
|
|
|
std::tuple<std::vector<int>, std::vector<float>> tokenize(std::string text,
|
|
size_t max_length = 0,
|
|
bool padding = false) {
|
|
auto parsed_attention = parse_prompt_attention(text);
|
|
|
|
{
|
|
std::stringstream ss;
|
|
ss << "[";
|
|
for (const auto& item : parsed_attention) {
|
|
ss << "['" << item.first << "', " << item.second << "], ";
|
|
}
|
|
ss << "]";
|
|
LOG_DEBUG("parse '%s' to %s", text.c_str(), ss.str().c_str());
|
|
}
|
|
|
|
std::vector<int> tokens;
|
|
std::vector<float> weights;
|
|
for (const auto& item : parsed_attention) {
|
|
const std::string& curr_text = item.first;
|
|
float curr_weight = item.second;
|
|
std::vector<int> curr_tokens = tokenizer.tokenize(curr_text, nullptr);
|
|
tokens.insert(tokens.end(), curr_tokens.begin(), curr_tokens.end());
|
|
weights.insert(weights.end(), curr_tokens.size(), curr_weight);
|
|
}
|
|
|
|
tokenizer.pad_tokens(tokens, weights, max_length, padding);
|
|
|
|
// for (int i = 0; i < tokens.size(); i++) {
|
|
// std::cout << tokens[i] << ":" << weights[i] << ", ";
|
|
// }
|
|
// std::cout << std::endl;
|
|
|
|
return {tokens, weights};
|
|
}
|
|
|
|
void test() {
|
|
struct ggml_init_params params;
|
|
params.mem_size = static_cast<size_t>(1024 * 1024) * 1024; // 1GB
|
|
params.mem_buffer = NULL;
|
|
params.no_alloc = false;
|
|
|
|
struct ggml_context* work_ctx = ggml_init(params);
|
|
GGML_ASSERT(work_ctx != NULL);
|
|
|
|
{
|
|
std::string text("<|im_start|>system\nDescribe the image by detailing the color, shape, size, texture, quantity, text, spatial relationships of the objects and background:<|im_end|>\n<|im_start|>user\na lovely cat<|im_end|>\n<|im_start|>assistant\n");
|
|
auto tokens_and_weights = tokenize(text, 0, false);
|
|
std::vector<int>& tokens = std::get<0>(tokens_and_weights);
|
|
std::vector<float>& weights = std::get<1>(tokens_and_weights);
|
|
for (auto token : tokens) {
|
|
printf("%d ", token);
|
|
}
|
|
printf("\n");
|
|
auto input_ids = vector_to_ggml_tensor_i32(work_ctx, tokens);
|
|
struct ggml_tensor* out = NULL;
|
|
|
|
int t0 = ggml_time_ms();
|
|
model.compute(8, input_ids, &out, work_ctx);
|
|
int t1 = ggml_time_ms();
|
|
|
|
print_ggml_tensor(out);
|
|
LOG_DEBUG("qwen2vl test done in %dms", t1 - t0);
|
|
}
|
|
}
|
|
|
|
static void load_from_file_and_test(const std::string& file_path) {
|
|
// cpu f16: pass
|
|
// ggml_backend_t backend = ggml_backend_cuda_init(0);
|
|
ggml_backend_t backend = ggml_backend_cpu_init();
|
|
ggml_type model_data_type = GGML_TYPE_Q8_0;
|
|
|
|
ModelLoader model_loader;
|
|
if (!model_loader.init_from_file(file_path, "qwen2vl.")) {
|
|
LOG_ERROR("init model loader from file failed: '%s'", file_path.c_str());
|
|
return;
|
|
}
|
|
|
|
auto tensor_types = model_loader.tensor_storages_types;
|
|
for (auto& item : tensor_types) {
|
|
// LOG_DEBUG("%s %u", item.first.c_str(), item.second);
|
|
if (ends_with(item.first, "weight")) {
|
|
item.second = model_data_type;
|
|
}
|
|
}
|
|
|
|
std::shared_ptr<Qwen2_5_VLEmbedder> qwenvl = std::shared_ptr<Qwen2_5_VLEmbedder>(new Qwen2_5_VLEmbedder(backend, false, tensor_types, "qwen2vl"));
|
|
|
|
qwenvl->alloc_params_buffer();
|
|
std::map<std::string, ggml_tensor*> tensors;
|
|
qwenvl->get_param_tensors(tensors, "qwen2vl");
|
|
|
|
bool success = model_loader.load_tensors(tensors);
|
|
|
|
if (!success) {
|
|
LOG_ERROR("load tensors from model loader failed");
|
|
return;
|
|
}
|
|
|
|
LOG_INFO("qwenvl model loaded");
|
|
qwenvl->test();
|
|
}
|
|
};
|
|
|
|
}; // Qwen
|
|
|
|
#endif // __QWENVL_HPP__
|