diff --git a/clip.hpp b/clip.hpp index bde8a78..0b52956 100644 --- a/clip.hpp +++ b/clip.hpp @@ -6,7 +6,7 @@ /*================================================== CLIPTokenizer ===================================================*/ -std::pair, std::string> extract_and_remove_lora(std::string text) { +__STATIC_INLINE__ std::pair, std::string> extract_and_remove_lora(std::string text) { std::regex re("]+)>"); std::smatch matches; std::unordered_map filename2multiplier; @@ -31,7 +31,7 @@ std::pair, std::string> extract_and_remov return std::make_pair(filename2multiplier, text); } -std::vector> bytes_to_unicode() { +__STATIC_INLINE__ std::vector> bytes_to_unicode() { std::vector> byte_unicode_pairs; std::set byte_set; for (int b = static_cast('!'); b <= static_cast('~'); ++b) { @@ -398,6 +398,7 @@ public: } for (auto& token : matches) { std::string token_str = token.str(); + LOG_DEBUG("%s", token_str.c_str()); std::u32string utf32_token; for (int i = 0; i < token_str.length(); i++) { unsigned char b = token_str[i]; diff --git a/examples/cli/main.cpp b/examples/cli/main.cpp index 02f4767..5b43670 100644 --- a/examples/cli/main.cpp +++ b/examples/cli/main.cpp @@ -27,6 +27,8 @@ #include "avi_writer.h" +#include "qwen.hpp" + #if defined(_WIN32) #define NOMINMAX #include @@ -1138,6 +1140,20 @@ bool load_images_from_dir(const std::string dir, int main(int argc, const char* argv[]) { SDParams params; + params.verbose = true; + sd_set_log_callback(sd_log_cb, (void*)¶ms); + auto on_new_token_cb = [&](std::string& str, std::vector& bpe_tokens) -> bool { + return false; + }; + // auto tokenizer = CLIPTokenizer(); + auto tokenizer = Qwen::Qwen2Tokenizer(); + std::string text("a lovely cat"); + auto tokens = tokenizer.encode(text, on_new_token_cb); + for (auto token : tokens) { + std::cout << token << " "; + } + std::cout << std::endl; + exit(1); parse_args(argc, argv, params); params.sample_params.guidance.slg.layers = params.skip_layers.data(); 
params.sample_params.guidance.slg.layer_count = params.skip_layers.size();
diff --git a/model.cpp b/model.cpp
index 0585e98..330abeb 100644
--- a/model.cpp
+++ b/model.cpp
@@ -16,6 +16,7 @@
 #include "stable-diffusion.h"
 #include "util.h"
 #include "vocab.hpp"
+#include "vocab_qwen.hpp"
 #include "vocab_umt5.hpp"
 
 #include "ggml-alloc.h"
@@ -1939,6 +1940,11 @@ std::string ModelLoader::load_merges() {
     return merges_utf8_str;
 }
 
+// Returns the embedded Qwen2 merges table (qwen2_merges_utf8_c_str) as a std::string.
+std::string ModelLoader::load_qwen2_merges() {
+    std::string merges_utf8_str(reinterpret_cast(qwen2_merges_utf8_c_str), sizeof(qwen2_merges_utf8_c_str));
+    return merges_utf8_str;
+}
+
 std::string ModelLoader::load_t5_tokenizer_json() {
     std::string json_str(reinterpret_cast(t5_tokenizer_json_str), sizeof(t5_tokenizer_json_str));
     return json_str;
diff --git a/model.h b/model.h
index 0fdc99c..fb489fc 100644
--- a/model.h
+++ b/model.h
@@ -258,6 +258,7 @@ public:
     ~ModelLoader() = default;
 
     static std::string load_merges();
+    static std::string load_qwen2_merges();
     static std::string load_t5_tokenizer_json();
     static std::string load_umt5_tokenizer_json();
 };
diff --git a/qwen.hpp b/qwen.hpp
new file mode 100644
index 0000000..d73a882
--- /dev/null
+++ b/qwen.hpp
@@ -0,0 +1,316 @@
+#ifndef __QWEN_HPP__
+#define __QWEN_HPP__
+
+#include "ggml_extend.hpp"
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include "json.hpp"
+#include "clip.hpp"
+#include "tokenize_util.h"
+
+namespace Qwen {
+
+// Byte-level BPE tokenizer for Qwen2.
+//
+// The vocabulary is built in load_from_merges() as: the byte->unicode alphabet,
+// then one entry per merge rule, then the special tokens listed there. Token ids
+// are the positions in that sequence. Text is pre-split with token_split(), each
+// piece is mapped byte-by-byte through byte_encoder and merged with bpe().
+class Qwen2Tokenizer {
+private:
+    std::map byte_encoder;  // raw byte -> printable unicode stand-in
+    std::map byte_decoder;  // inverse of byte_encoder
+    std::map encoder;       // token text -> token id
+    std::map decoder;       // token id -> token text
+    std::map, int> bpe_ranks;  // merge pair -> rank (lower rank merges first)
+    std::regex pat;
+    int encoder_len;        // number of vocabulary entries
+    int bpe_len;            // number of merge rules
+
+public:
+    const std::string UNK_TOKEN = "<|endoftext|>";
+    const std::string EOS_TOKEN = "<|endoftext|>";
+    const std::string PAD_TOKEN = "<|endoftext|>";
+
+    const int UNK_TOKEN_ID = 151643;
+    const int EOS_TOKEN_ID = 151643;
+    const int PAD_TOKEN_ID = 151643;
+
+private:
+    // Returns `str` without leading/trailing " \t\n\r\v\f"; "" when nothing else remains.
+    static std::string strip(const std::string& str) {
+        std::string::size_type start = str.find_first_not_of(" \t\n\r\v\f");
+        std::string::size_type end   = str.find_last_not_of(" \t\n\r\v\f");
+
+        if (start == std::string::npos) {
+            // String contains only whitespace characters
+            return "";
+        }
+
+        return str.substr(start, end - start + 1);
+    }
+
+    // Collapses every whitespace run to a single space, then strips both ends.
+    static std::string whitespace_clean(std::string text) {
+        text = std::regex_replace(text, std::regex(R"(\s+)"), " ");
+        text = strip(text);
+        return text;
+    }
+
+    // Returns the set of adjacent (left, right) subword pairs in `subwords`.
+    static std::set> get_pairs(const std::vector& subwords) {
+        std::set> pairs;
+        if (subwords.size() == 0) {
+            return pairs;
+        }
+        std::u32string prev_subword = subwords[0];
+        for (int i = 1; i < subwords.size(); i++) {
+            std::u32string subword = subwords[i];
+            std::pair pair(prev_subword, subword);
+            pairs.insert(pair);
+            prev_subword = subword;
+        }
+        return pairs;
+    }
+
+public:
+    // Builds the tokenizer from `merges_utf8_str`; when it is empty the table
+    // returned by ModelLoader::load_qwen2_merges() is used instead.
+    explicit Qwen2Tokenizer(const std::string& merges_utf8_str = "") {
+        if (merges_utf8_str.size() > 0) {
+            load_from_merges(merges_utf8_str);
+        } else {
+            load_from_merges(ModelLoader::load_qwen2_merges());
+        }
+    }
+
+    // Populates byte_encoder/byte_decoder, encoder/decoder and bpe_ranks from a
+    // newline-separated merges text in which each line is "<left> <right>".
+    // Only lines terminated by '\n' are read; text after the last '\n' is ignored.
+    void load_from_merges(const std::string& merges_utf8_str) {
+        auto byte_unicode_pairs = bytes_to_unicode();
+        // printf("byte_unicode_pairs have %lu pairs \n", byte_unicode_pairs.size());
+        byte_encoder = std::map(byte_unicode_pairs.begin(), byte_unicode_pairs.end());
+        for (auto& pair : byte_unicode_pairs) {
+            byte_decoder[pair.second] = pair.first;
+        }
+        // for (auto & pair: byte_unicode_pairs) {
+        //     std::cout << pair.first << ": " << pair.second << std::endl;
+        // }
+        std::vector merges;
+        size_t start = 0;
+        size_t pos;
+        std::u32string merges_utf32_str = utf8_to_utf32(merges_utf8_str);
+        while ((pos = merges_utf32_str.find('\n', start)) != std::string::npos) {
+            merges.push_back(merges_utf32_str.substr(start, pos - start));
+            start = pos + 1;
+        }
+        LOG_DEBUG("merges size %llu", merges.size());
+        // GGML_ASSERT(merges.size() == 48895);
+        std::vector> merge_pairs;
+        for (const auto& merge : merges) {
+            size_t space_pos = merge.find(' ');
+            merge_pairs.emplace_back(merge.substr(0, space_pos), merge.substr(space_pos + 1));
+            // LOG_DEBUG("%s", utf32_to_utf8(merge.substr(space_pos + 1)).c_str());
+            // printf("%s :: %s | %s \n", utf32_to_utf8(merge).c_str(), utf32_to_utf8(merge.substr(0, space_pos)).c_str(),
+            //                     utf32_to_utf8(merge.substr(space_pos + 1)).c_str());
+        }
+
+        // Vocabulary order defines the token ids: alphabet, merges, special tokens.
+        std::vector vocab;
+        for (const auto& pair : byte_unicode_pairs) {
+            vocab.push_back(pair.second);
+        }
+        for (const auto& merge : merge_pairs) {
+            vocab.push_back(merge.first + merge.second);
+        }
+        vocab.push_back(utf8_to_utf32("<|endoftext|>"));
+        vocab.push_back(utf8_to_utf32("<|im_start|>"));
+        vocab.push_back(utf8_to_utf32("<|im_end|>"));
+        vocab.push_back(utf8_to_utf32("<|object_ref_start|>"));
+        vocab.push_back(utf8_to_utf32("<|object_ref_end|>"));
+        vocab.push_back(utf8_to_utf32("<|box_start|>"));
+        vocab.push_back(utf8_to_utf32("<|box_end|>"));
+        vocab.push_back(utf8_to_utf32("<|quad_start|>"));
+        vocab.push_back(utf8_to_utf32("<|quad_end|>"));
+        vocab.push_back(utf8_to_utf32("<|vision_start|>"));
+        vocab.push_back(utf8_to_utf32("<|vision_end|>"));
+        vocab.push_back(utf8_to_utf32("<|vision_pad|>"));
+        vocab.push_back(utf8_to_utf32("<|image_pad|>"));
+        vocab.push_back(utf8_to_utf32("<|video_pad|>"));
+        // NOTE(review): the next two literals are empty in this view; presumably
+        // angle-bracketed token names were lost — confirm against the vocabulary.
+        vocab.push_back(utf8_to_utf32(""));
+        vocab.push_back(utf8_to_utf32(""));
+        vocab.push_back(utf8_to_utf32("<|fim_prefix|>"));
+        vocab.push_back(utf8_to_utf32("<|fim_middle|>"));
+        vocab.push_back(utf8_to_utf32("<|fim_suffix|>"));
+        vocab.push_back(utf8_to_utf32("<|fim_pad|>"));
+        vocab.push_back(utf8_to_utf32("<|repo_name|>"));
+        vocab.push_back(utf8_to_utf32("<|file_sep|>"));
+
+        LOG_DEBUG("vocab size: %llu", vocab.size());
+        int i = 0;
+        for (const auto& token : vocab) {
+            encoder[token] = i;
+            decoder[i]     = token;
+            i++;
+        }
+        encoder_len = i;
+
+        int rank = 0;
+        for (const auto& merge : merge_pairs) {
+            bpe_ranks[merge] = rank++;
+        }
+        bpe_len = rank;
+    };
+
+    // Applies the merge rules to `token` (already mapped through byte_encoder).
+    // Returns the resulting subwords joined by single spaces; a token with fewer
+    // than two characters is returned unchanged.
+    std::u32string bpe(const std::u32string& token) {
+        std::vector word;
+
+        for (int i = 0; i < token.size(); i++) {
+            word.emplace_back(1, token[i]);
+        }
+
+        std::set> pairs = get_pairs(word);
+
+        if (pairs.empty()) {
+            return token;
+        }
+
+        while (true) {
+            // Pick the pair with the lowest rank; pairs without a rank sort last.
+            auto min_pair_iter = std::min_element(pairs.begin(),
+                                                  pairs.end(),
+                                                  [&](const std::pair& a,
+                                                      const std::pair& b) {
+                                                      if (bpe_ranks.find(a) == bpe_ranks.end()) {
+                                                          return false;
+                                                      } else if (bpe_ranks.find(b) == bpe_ranks.end()) {
+                                                          return true;
+                                                      }
+                                                      return bpe_ranks.at(a) < bpe_ranks.at(b);
+                                                  });
+
+            const std::pair& bigram = *min_pair_iter;
+
+            // No remaining pair is a known merge: done.
+            if (bpe_ranks.find(bigram) == bpe_ranks.end()) {
+                break;
+            }
+
+            std::u32string first  = bigram.first;
+            std::u32string second = bigram.second;
+            std::vector new_word;
+            int32_t i = 0;
+
+            // Rewrite `word`, fusing every adjacent (first, second) occurrence.
+            while (i < word.size()) {
+                auto it = std::find(word.begin() + i, word.end(), first);
+                if (it == word.end()) {
+                    new_word.insert(new_word.end(), word.begin() + i, word.end());
+                    break;
+                }
+                new_word.insert(new_word.end(), word.begin() + i, it);
+                i = static_cast(std::distance(word.begin(), it));
+
+                if (word[i] == first && i < static_cast(word.size()) - 1 && word[i + 1] == second) {
+                    new_word.push_back(first + second);
+                    i += 2;
+                } else {
+                    new_word.push_back(word[i]);
+                    i += 1;
+                }
+            }
+
+            word = new_word;
+
+            if (word.size() == 1) {
+                break;
+            }
+            pairs = get_pairs(word);
+        }
+
+        std::u32string result;
+        for (int i = 0; i < word.size(); i++) {
+            result += word[i];
+            if (i != word.size() - 1) {
+                result += utf8_to_utf32(" ");
+            }
+        }
+
+        return result;
+    }
+
+    // Encodes `text` and, when `max_length` > 0, fits the result to it:
+    // longer sequences are truncated to `max_length`; shorter ones are extended
+    // with PAD_TOKEN_ID up to `max_length` only when `padding` is true.
+    std::vector tokenize(std::string text,
+                              on_new_token_cb_t on_new_token_cb,
+                              size_t max_length = 0,
+                              bool padding      = false) {
+        std::vector tokens = encode(text, on_new_token_cb);
+
+        if (max_length > 0) {
+            if (tokens.size() > max_length) {
+                tokens.resize(max_length);
+            } else if (padding) {
+                // Pad explicitly: resize() would fill with token id 0, not the pad token.
+                tokens.insert(tokens.end(), max_length - tokens.size(), PAD_TOKEN_ID);
+            }
+        }
+
+        return tokens;
+    }
+
+    // When `max_length` > 0 and `padding` is set, extends `tokens` with
+    // PAD_TOKEN_ID and `weights` with 1.0 up to the next multiple of
+    // `max_length` (at least one multiple). Otherwise does nothing.
+    void pad_tokens(std::vector& tokens,
+                    std::vector& weights,
+                    size_t max_length = 0,
+                    bool padding      = false) {
+        if (max_length > 0 && padding) {
+            size_t n = std::ceil(tokens.size() * 1.0 / max_length);
+            if (n == 0) {
+                n = 1;
+            }
+            size_t length = max_length * n;
+            LOG_DEBUG("token length: %llu", length);
+            tokens.insert(tokens.end(), length - tokens.size(), PAD_TOKEN_ID);
+            weights.insert(weights.end(), length - weights.size(), 1.0);
+        }
+    }
+
+    // Splits `text` with token_split(), offers each piece to `on_new_token_cb`
+    // (a true return means the callback handled it and the piece is skipped),
+    // and appends the BPE token ids of the remaining pieces. Returns the ids.
+    std::vector encode(std::string text, on_new_token_cb_t on_new_token_cb) {
+        std::string original_text = text;
+        std::vector bpe_tokens;
+
+        auto tokens = token_split(text);
+        std::vector token_strs;
+        for (auto& token : tokens) {
+            bool skip = on_new_token_cb(token, bpe_tokens);
+            if (skip) {
+                continue;
+            }
+            std::string token_str = token;
+            std::u32string utf32_token;
+            for (int i = 0; i < token_str.length(); i++) {
+                unsigned char b = token_str[i];
+                utf32_token += byte_encoder[b];
+            }
+            auto bpe_strs = bpe(utf32_token);
+            size_t start  = 0;
+            size_t pos;
+            // bpe() joins subwords with spaces; look each one up in the vocabulary.
+            while ((pos = bpe_strs.find(' ', start)) != std::u32string::npos) {
+                auto bpe_str = bpe_strs.substr(start, pos - start);
+                bpe_tokens.push_back(encoder[bpe_str]);
+                token_strs.push_back(utf32_to_utf8(bpe_str));
+
+                start = pos + 1;
+            }
+            auto bpe_str = bpe_strs.substr(start, bpe_strs.size() - start);
+            bpe_tokens.push_back(encoder[bpe_str]);
+            token_strs.push_back(utf32_to_utf8(bpe_str));
+        }
+
+        std::stringstream ss;
+        ss << "[";
+        for (auto token : token_strs) {
+            ss << "\"" << token << "\", ";
+        }
+        ss << "]";
+        LOG_DEBUG("split prompt \"%s\" to tokens %s", original_text.c_str(), ss.str().c_str());
+        // printf("split prompt \"%s\" to tokens %s \n", original_text.c_str(), ss.str().c_str());
+        return bpe_tokens;
+    }
+};
+
+};
+
+
+
+#endif  // __QWEN_HPP__
diff --git a/tokenize_util.cpp b/tokenize_util.cpp
new file mode 100644
index 0000000..85e3821
--- /dev/null
+++ b/tokenize_util.cpp
@@ -0,0 +1,927 @@
+#include
+#include
+#include
+#include
+
+#include "tokenize_util.h"
+
+// Returns true when `ch` is one of the ASCII digits U+0030..U+0039.
+// No other code points are treated as numbers here.
+bool is_number(char32_t ch) {
+    return (ch >= U'0' && ch <= U'9');
+}
+
+// Returns true when `ch` lies inside one of the inclusive [start, end]
+// code-point ranges of the table below (searched linearly).
+// NOTE(review): the table looks like the Unicode letter category (\p{L}) — confirm its source.
+bool is_letter(char32_t ch) {
+    static const struct { char32_t start, end; } ranges[] = {
+        {0x41, 0x5A},
+        {0x61, 0x7A},
+        {0xAA, 0xAA},
+        {0xB5, 0xB5},
+        {0xBA, 0xBA},
+        {0xC0, 0xD6},
+        {0xD8, 0xF6},
+        {0xF8, 0x2C1},
+        {0x2C6, 0x2D1},
+        {0x2E0, 0x2E4},
+        {0x2EC, 0x2EC},
+        {0x2EE, 0x2EE},
+        {0x370, 0x374},
+        {0x376, 0x377},
+        {0x37A, 0x37D},
+        {0x37F, 0x37F},
+        {0x386, 0x386},
+        {0x388, 0x38A},
+        {0x38C, 0x38C},
+        {0x38E, 0x3A1},
+        {0x3A3, 0x3F5},
+        {0x3F7, 0x481},
+        {0x48A, 0x52F},
+        {0x531, 0x556},
+        {0x559, 0x559},
+        {0x560, 0x588},
+        {0x5D0, 0x5EA},
+        {0x5EF, 0x5F2},
+        {0x620, 0x64A},
+        {0x66E, 0x66F},
+        {0x671, 0x6D3},
+        {0x6D5, 0x6D5},
+        {0x6E5, 0x6E6},
+        {0x6EE, 0x6EF},
+        {0x6FA, 0x6FC},
+        {0x6FF, 0x6FF},
+        {0x710, 0x710},
+        {0x712, 0x72F},
+        {0x74D, 0x7A5},
+        {0x7B1, 0x7B1},
+        {0x7CA, 0x7EA},
+        {0x7F4, 0x7F5},
+        {0x7FA, 0x7FA},
+        {0x800, 0x815},
+        {0x81A, 0x81A},
+        {0x824, 0x824},
+        {0x828, 0x828},
+        {0x840, 0x858},
+        {0x860, 0x86A},
+        {0x870, 0x887},
+        {0x889, 0x88F},
+        {0x8A0, 0x8C9},
+        {0x904, 0x939},
+        {0x93D, 0x93D},
+        {0x950, 0x950},
+        {0x958, 0x961},
+        {0x971, 0x980},
+        {0x985, 0x98C},
+        {0x98F, 0x990},
+        {0x993, 0x9A8},
+        {0x9AA, 0x9B0},
+        {0x9B2, 0x9B2},
+        {0x9B6, 0x9B9},
+        {0x9BD, 0x9BD},
+        {0x9CE, 0x9CE},
+        {0x9DC, 0x9DD},
+        {0x9DF, 0x9E1},
+        {0x9F0, 0x9F1},
+        {0x9FC, 0x9FC},
+        {0xA05, 0xA0A},
+        {0xA0F, 0xA10},
+        {0xA13, 0xA28},
+        {0xA2A, 0xA30},
+        {0xA32, 0xA33},
+        {0xA35, 0xA36},
+        {0xA38, 0xA39},
+        {0xA59, 0xA5C},
+        {0xA5E, 0xA5E},
+        {0xA72, 0xA74},
+        {0xA85, 0xA8D},
+        {0xA8F, 0xA91},
+        {0xA93, 0xAA8},
+        {0xAAA, 0xAB0},
+        {0xAB2, 0xAB3},
+        {0xAB5, 0xAB9},
+        {0xABD, 0xABD},
+        {0xAD0, 0xAD0},
+        {0xAE0, 0xAE1},
+        {0xAF9, 0xAF9},
+        {0xB05, 0xB0C},
+        {0xB0F, 0xB10},
+        {0xB13, 0xB28},
+        {0xB2A, 0xB30},
+        {0xB32, 0xB33},
+        {0xB35, 0xB39},
+        {0xB3D, 0xB3D},
+        {0xB5C, 0xB5D},
+        {0xB5F, 0xB61},
+        {0xB71, 0xB71},
+        {0xB83, 0xB83},
+        {0xB85, 0xB8A},
+        {0xB8E,
0xB90}, + {0xB92, 0xB95}, + {0xB99, 0xB9A}, + {0xB9C, 0xB9C}, + {0xB9E, 0xB9F}, + {0xBA3, 0xBA4}, + {0xBA8, 0xBAA}, + {0xBAE, 0xBB9}, + {0xBD0, 0xBD0}, + {0xC05, 0xC0C}, + {0xC0E, 0xC10}, + {0xC12, 0xC28}, + {0xC2A, 0xC39}, + {0xC3D, 0xC3D}, + {0xC58, 0xC5A}, + {0xC5C, 0xC5D}, + {0xC60, 0xC61}, + {0xC80, 0xC80}, + {0xC85, 0xC8C}, + {0xC8E, 0xC90}, + {0xC92, 0xCA8}, + {0xCAA, 0xCB3}, + {0xCB5, 0xCB9}, + {0xCBD, 0xCBD}, + {0xCDC, 0xCDE}, + {0xCE0, 0xCE1}, + {0xCF1, 0xCF2}, + {0xD04, 0xD0C}, + {0xD0E, 0xD10}, + {0xD12, 0xD3A}, + {0xD3D, 0xD3D}, + {0xD4E, 0xD4E}, + {0xD54, 0xD56}, + {0xD5F, 0xD61}, + {0xD7A, 0xD7F}, + {0xD85, 0xD96}, + {0xD9A, 0xDB1}, + {0xDB3, 0xDBB}, + {0xDBD, 0xDBD}, + {0xDC0, 0xDC6}, + {0xE01, 0xE30}, + {0xE32, 0xE33}, + {0xE40, 0xE46}, + {0xE81, 0xE82}, + {0xE84, 0xE84}, + {0xE86, 0xE8A}, + {0xE8C, 0xEA3}, + {0xEA5, 0xEA5}, + {0xEA7, 0xEB0}, + {0xEB2, 0xEB3}, + {0xEBD, 0xEBD}, + {0xEC0, 0xEC4}, + {0xEC6, 0xEC6}, + {0xEDC, 0xEDF}, + {0xF00, 0xF00}, + {0xF40, 0xF47}, + {0xF49, 0xF6C}, + {0xF88, 0xF8C}, + {0x1000, 0x102A}, + {0x103F, 0x103F}, + {0x1050, 0x1055}, + {0x105A, 0x105D}, + {0x1061, 0x1061}, + {0x1065, 0x1066}, + {0x106E, 0x1070}, + {0x1075, 0x1081}, + {0x108E, 0x108E}, + {0x10A0, 0x10C5}, + {0x10C7, 0x10C7}, + {0x10CD, 0x10CD}, + {0x10D0, 0x10FA}, + {0x10FC, 0x1248}, + {0x124A, 0x124D}, + {0x1250, 0x1256}, + {0x1258, 0x1258}, + {0x125A, 0x125D}, + {0x1260, 0x1288}, + {0x128A, 0x128D}, + {0x1290, 0x12B0}, + {0x12B2, 0x12B5}, + {0x12B8, 0x12BE}, + {0x12C0, 0x12C0}, + {0x12C2, 0x12C5}, + {0x12C8, 0x12D6}, + {0x12D8, 0x1310}, + {0x1312, 0x1315}, + {0x1318, 0x135A}, + {0x1380, 0x138F}, + {0x13A0, 0x13F5}, + {0x13F8, 0x13FD}, + {0x1401, 0x166C}, + {0x166F, 0x167F}, + {0x1681, 0x169A}, + {0x16A0, 0x16EA}, + {0x16F1, 0x16F8}, + {0x1700, 0x1711}, + {0x171F, 0x1731}, + {0x1740, 0x1751}, + {0x1760, 0x176C}, + {0x176E, 0x1770}, + {0x1780, 0x17B3}, + {0x17D7, 0x17D7}, + {0x17DC, 0x17DC}, + {0x1820, 0x1878}, + {0x1880, 0x1884}, + {0x1887, 0x18A8}, + 
{0x18AA, 0x18AA}, + {0x18B0, 0x18F5}, + {0x1900, 0x191E}, + {0x1950, 0x196D}, + {0x1970, 0x1974}, + {0x1980, 0x19AB}, + {0x19B0, 0x19C9}, + {0x1A00, 0x1A16}, + {0x1A20, 0x1A54}, + {0x1AA7, 0x1AA7}, + {0x1B05, 0x1B33}, + {0x1B45, 0x1B4C}, + {0x1B83, 0x1BA0}, + {0x1BAE, 0x1BAF}, + {0x1BBA, 0x1BE5}, + {0x1C00, 0x1C23}, + {0x1C4D, 0x1C4F}, + {0x1C5A, 0x1C7D}, + {0x1C80, 0x1C8A}, + {0x1C90, 0x1CBA}, + {0x1CBD, 0x1CBF}, + {0x1CE9, 0x1CEC}, + {0x1CEE, 0x1CF3}, + {0x1CF5, 0x1CF6}, + {0x1CFA, 0x1CFA}, + {0x1D00, 0x1DBF}, + {0x1E00, 0x1F15}, + {0x1F18, 0x1F1D}, + {0x1F20, 0x1F45}, + {0x1F48, 0x1F4D}, + {0x1F50, 0x1F57}, + {0x1F59, 0x1F59}, + {0x1F5B, 0x1F5B}, + {0x1F5D, 0x1F5D}, + {0x1F5F, 0x1F7D}, + {0x1F80, 0x1FB4}, + {0x1FB6, 0x1FBC}, + {0x1FBE, 0x1FBE}, + {0x1FC2, 0x1FC4}, + {0x1FC6, 0x1FCC}, + {0x1FD0, 0x1FD3}, + {0x1FD6, 0x1FDB}, + {0x1FE0, 0x1FEC}, + {0x1FF2, 0x1FF4}, + {0x1FF6, 0x1FFC}, + {0x2071, 0x2071}, + {0x207F, 0x207F}, + {0x2090, 0x209C}, + {0x2102, 0x2102}, + {0x2107, 0x2107}, + {0x210A, 0x2113}, + {0x2115, 0x2115}, + {0x2119, 0x211D}, + {0x2124, 0x2124}, + {0x2126, 0x2126}, + {0x2128, 0x2128}, + {0x212A, 0x212D}, + {0x212F, 0x2139}, + {0x213C, 0x213F}, + {0x2145, 0x2149}, + {0x214E, 0x214E}, + {0x2183, 0x2184}, + {0x2C00, 0x2CE4}, + {0x2CEB, 0x2CEE}, + {0x2CF2, 0x2CF3}, + {0x2D00, 0x2D25}, + {0x2D27, 0x2D27}, + {0x2D2D, 0x2D2D}, + {0x2D30, 0x2D67}, + {0x2D6F, 0x2D6F}, + {0x2D80, 0x2D96}, + {0x2DA0, 0x2DA6}, + {0x2DA8, 0x2DAE}, + {0x2DB0, 0x2DB6}, + {0x2DB8, 0x2DBE}, + {0x2DC0, 0x2DC6}, + {0x2DC8, 0x2DCE}, + {0x2DD0, 0x2DD6}, + {0x2DD8, 0x2DDE}, + {0x2E2F, 0x2E2F}, + {0x3005, 0x3006}, + {0x3031, 0x3035}, + {0x303B, 0x303C}, + {0x3041, 0x3096}, + {0x309D, 0x309F}, + {0x30A1, 0x30FA}, + {0x30FC, 0x30FF}, + {0x3105, 0x312F}, + {0x3131, 0x318E}, + {0x31A0, 0x31BF}, + {0x31F0, 0x31FF}, + {0x3400, 0x4DBF}, + {0x4E00, 0xA48C}, + {0xA4D0, 0xA4FD}, + {0xA500, 0xA60C}, + {0xA610, 0xA61F}, + {0xA62A, 0xA62B}, + {0xA640, 0xA66E}, + {0xA67F, 0xA69D}, + {0xA6A0, 0xA6E5}, + 
{0xA717, 0xA71F}, + {0xA722, 0xA788}, + {0xA78B, 0xA7DC}, + {0xA7F1, 0xA801}, + {0xA803, 0xA805}, + {0xA807, 0xA80A}, + {0xA80C, 0xA822}, + {0xA840, 0xA873}, + {0xA882, 0xA8B3}, + {0xA8F2, 0xA8F7}, + {0xA8FB, 0xA8FB}, + {0xA8FD, 0xA8FE}, + {0xA90A, 0xA925}, + {0xA930, 0xA946}, + {0xA960, 0xA97C}, + {0xA984, 0xA9B2}, + {0xA9CF, 0xA9CF}, + {0xA9E0, 0xA9E4}, + {0xA9E6, 0xA9EF}, + {0xA9FA, 0xA9FE}, + {0xAA00, 0xAA28}, + {0xAA40, 0xAA42}, + {0xAA44, 0xAA4B}, + {0xAA60, 0xAA76}, + {0xAA7A, 0xAA7A}, + {0xAA7E, 0xAAAF}, + {0xAAB1, 0xAAB1}, + {0xAAB5, 0xAAB6}, + {0xAAB9, 0xAABD}, + {0xAAC0, 0xAAC0}, + {0xAAC2, 0xAAC2}, + {0xAADB, 0xAADD}, + {0xAAE0, 0xAAEA}, + {0xAAF2, 0xAAF4}, + {0xAB01, 0xAB06}, + {0xAB09, 0xAB0E}, + {0xAB11, 0xAB16}, + {0xAB20, 0xAB26}, + {0xAB28, 0xAB2E}, + {0xAB30, 0xAB5A}, + {0xAB5C, 0xAB69}, + {0xAB70, 0xABE2}, + {0xAC00, 0xD7A3}, + {0xD7B0, 0xD7C6}, + {0xD7CB, 0xD7FB}, + {0xF900, 0xFA6D}, + {0xFA70, 0xFAD9}, + {0xFB00, 0xFB06}, + {0xFB13, 0xFB17}, + {0xFB1D, 0xFB1D}, + {0xFB1F, 0xFB28}, + {0xFB2A, 0xFB36}, + {0xFB38, 0xFB3C}, + {0xFB3E, 0xFB3E}, + {0xFB40, 0xFB41}, + {0xFB43, 0xFB44}, + {0xFB46, 0xFBB1}, + {0xFBD3, 0xFD3D}, + {0xFD50, 0xFD8F}, + {0xFD92, 0xFDC7}, + {0xFDF0, 0xFDFB}, + {0xFE70, 0xFE74}, + {0xFE76, 0xFEFC}, + {0xFF21, 0xFF3A}, + {0xFF41, 0xFF5A}, + {0xFF66, 0xFFBE}, + {0xFFC2, 0xFFC7}, + {0xFFCA, 0xFFCF}, + {0xFFD2, 0xFFD7}, + {0xFFDA, 0xFFDC}, + {0x10000, 0x1000B}, + {0x1000D, 0x10026}, + {0x10028, 0x1003A}, + {0x1003C, 0x1003D}, + {0x1003F, 0x1004D}, + {0x10050, 0x1005D}, + {0x10080, 0x100FA}, + {0x10280, 0x1029C}, + {0x102A0, 0x102D0}, + {0x10300, 0x1031F}, + {0x1032D, 0x10340}, + {0x10342, 0x10349}, + {0x10350, 0x10375}, + {0x10380, 0x1039D}, + {0x103A0, 0x103C3}, + {0x103C8, 0x103CF}, + {0x10400, 0x1049D}, + {0x104B0, 0x104D3}, + {0x104D8, 0x104FB}, + {0x10500, 0x10527}, + {0x10530, 0x10563}, + {0x10570, 0x1057A}, + {0x1057C, 0x1058A}, + {0x1058C, 0x10592}, + {0x10594, 0x10595}, + {0x10597, 0x105A1}, + {0x105A3, 0x105B1}, + 
{0x105B3, 0x105B9}, + {0x105BB, 0x105BC}, + {0x105C0, 0x105F3}, + {0x10600, 0x10736}, + {0x10740, 0x10755}, + {0x10760, 0x10767}, + {0x10780, 0x10785}, + {0x10787, 0x107B0}, + {0x107B2, 0x107BA}, + {0x10800, 0x10805}, + {0x10808, 0x10808}, + {0x1080A, 0x10835}, + {0x10837, 0x10838}, + {0x1083C, 0x1083C}, + {0x1083F, 0x10855}, + {0x10860, 0x10876}, + {0x10880, 0x1089E}, + {0x108E0, 0x108F2}, + {0x108F4, 0x108F5}, + {0x10900, 0x10915}, + {0x10920, 0x10939}, + {0x10940, 0x10959}, + {0x10980, 0x109B7}, + {0x109BE, 0x109BF}, + {0x10A00, 0x10A00}, + {0x10A10, 0x10A13}, + {0x10A15, 0x10A17}, + {0x10A19, 0x10A35}, + {0x10A60, 0x10A7C}, + {0x10A80, 0x10A9C}, + {0x10AC0, 0x10AC7}, + {0x10AC9, 0x10AE4}, + {0x10B00, 0x10B35}, + {0x10B40, 0x10B55}, + {0x10B60, 0x10B72}, + {0x10B80, 0x10B91}, + {0x10C00, 0x10C48}, + {0x10C80, 0x10CB2}, + {0x10CC0, 0x10CF2}, + {0x10D00, 0x10D23}, + {0x10D4A, 0x10D65}, + {0x10D6F, 0x10D85}, + {0x10E80, 0x10EA9}, + {0x10EB0, 0x10EB1}, + {0x10EC2, 0x10EC7}, + {0x10F00, 0x10F1C}, + {0x10F27, 0x10F27}, + {0x10F30, 0x10F45}, + {0x10F70, 0x10F81}, + {0x10FB0, 0x10FC4}, + {0x10FE0, 0x10FF6}, + {0x11003, 0x11037}, + {0x11071, 0x11072}, + {0x11075, 0x11075}, + {0x11083, 0x110AF}, + {0x110D0, 0x110E8}, + {0x11103, 0x11126}, + {0x11144, 0x11144}, + {0x11147, 0x11147}, + {0x11150, 0x11172}, + {0x11176, 0x11176}, + {0x11183, 0x111B2}, + {0x111C1, 0x111C4}, + {0x111DA, 0x111DA}, + {0x111DC, 0x111DC}, + {0x11200, 0x11211}, + {0x11213, 0x1122B}, + {0x1123F, 0x11240}, + {0x11280, 0x11286}, + {0x11288, 0x11288}, + {0x1128A, 0x1128D}, + {0x1128F, 0x1129D}, + {0x1129F, 0x112A8}, + {0x112B0, 0x112DE}, + {0x11305, 0x1130C}, + {0x1130F, 0x11310}, + {0x11313, 0x11328}, + {0x1132A, 0x11330}, + {0x11332, 0x11333}, + {0x11335, 0x11339}, + {0x1133D, 0x1133D}, + {0x11350, 0x11350}, + {0x1135D, 0x11361}, + {0x11380, 0x11389}, + {0x1138B, 0x1138B}, + {0x1138E, 0x1138E}, + {0x11390, 0x113B5}, + {0x113B7, 0x113B7}, + {0x113D1, 0x113D1}, + {0x113D3, 0x113D3}, + {0x11400, 0x11434}, 
+ {0x11447, 0x1144A}, + {0x1145F, 0x11461}, + {0x11480, 0x114AF}, + {0x114C4, 0x114C5}, + {0x114C7, 0x114C7}, + {0x11580, 0x115AE}, + {0x115D8, 0x115DB}, + {0x11600, 0x1162F}, + {0x11644, 0x11644}, + {0x11680, 0x116AA}, + {0x116B8, 0x116B8}, + {0x11700, 0x1171A}, + {0x11740, 0x11746}, + {0x11800, 0x1182B}, + {0x118A0, 0x118DF}, + {0x118FF, 0x11906}, + {0x11909, 0x11909}, + {0x1190C, 0x11913}, + {0x11915, 0x11916}, + {0x11918, 0x1192F}, + {0x1193F, 0x1193F}, + {0x11941, 0x11941}, + {0x119A0, 0x119A7}, + {0x119AA, 0x119D0}, + {0x119E1, 0x119E1}, + {0x119E3, 0x119E3}, + {0x11A00, 0x11A00}, + {0x11A0B, 0x11A32}, + {0x11A3A, 0x11A3A}, + {0x11A50, 0x11A50}, + {0x11A5C, 0x11A89}, + {0x11A9D, 0x11A9D}, + {0x11AB0, 0x11AF8}, + {0x11BC0, 0x11BE0}, + {0x11C00, 0x11C08}, + {0x11C0A, 0x11C2E}, + {0x11C40, 0x11C40}, + {0x11C72, 0x11C8F}, + {0x11D00, 0x11D06}, + {0x11D08, 0x11D09}, + {0x11D0B, 0x11D30}, + {0x11D46, 0x11D46}, + {0x11D60, 0x11D65}, + {0x11D67, 0x11D68}, + {0x11D6A, 0x11D89}, + {0x11D98, 0x11D98}, + {0x11DB0, 0x11DDB}, + {0x11EE0, 0x11EF2}, + {0x11F02, 0x11F02}, + {0x11F04, 0x11F10}, + {0x11F12, 0x11F33}, + {0x11FB0, 0x11FB0}, + {0x12000, 0x12399}, + {0x12480, 0x12543}, + {0x12F90, 0x12FF0}, + {0x13000, 0x1342F}, + {0x13441, 0x13446}, + {0x13460, 0x143FA}, + {0x14400, 0x14646}, + {0x16100, 0x1611D}, + {0x16800, 0x16A38}, + {0x16A40, 0x16A5E}, + {0x16A70, 0x16ABE}, + {0x16AD0, 0x16AED}, + {0x16B00, 0x16B2F}, + {0x16B40, 0x16B43}, + {0x16B63, 0x16B77}, + {0x16B7D, 0x16B8F}, + {0x16D40, 0x16D6C}, + {0x16E40, 0x16E7F}, + {0x16EA0, 0x16EB8}, + {0x16EBB, 0x16ED3}, + {0x16F00, 0x16F4A}, + {0x16F50, 0x16F50}, + {0x16F93, 0x16F9F}, + {0x16FE0, 0x16FE1}, + {0x16FE3, 0x16FE3}, + {0x16FF2, 0x16FF3}, + {0x17000, 0x18CD5}, + {0x18CFF, 0x18D1E}, + {0x18D80, 0x18DF2}, + {0x1AFF0, 0x1AFF3}, + {0x1AFF5, 0x1AFFB}, + {0x1AFFD, 0x1AFFE}, + {0x1B000, 0x1B122}, + {0x1B132, 0x1B132}, + {0x1B150, 0x1B152}, + {0x1B155, 0x1B155}, + {0x1B164, 0x1B167}, + {0x1B170, 0x1B2FB}, + {0x1BC00, 
0x1BC6A}, + {0x1BC70, 0x1BC7C}, + {0x1BC80, 0x1BC88}, + {0x1BC90, 0x1BC99}, + {0x1D400, 0x1D454}, + {0x1D456, 0x1D49C}, + {0x1D49E, 0x1D49F}, + {0x1D4A2, 0x1D4A2}, + {0x1D4A5, 0x1D4A6}, + {0x1D4A9, 0x1D4AC}, + {0x1D4AE, 0x1D4B9}, + {0x1D4BB, 0x1D4BB}, + {0x1D4BD, 0x1D4C3}, + {0x1D4C5, 0x1D505}, + {0x1D507, 0x1D50A}, + {0x1D50D, 0x1D514}, + {0x1D516, 0x1D51C}, + {0x1D51E, 0x1D539}, + {0x1D53B, 0x1D53E}, + {0x1D540, 0x1D544}, + {0x1D546, 0x1D546}, + {0x1D54A, 0x1D550}, + {0x1D552, 0x1D6A5}, + {0x1D6A8, 0x1D6C0}, + {0x1D6C2, 0x1D6DA}, + {0x1D6DC, 0x1D6FA}, + {0x1D6FC, 0x1D714}, + {0x1D716, 0x1D734}, + {0x1D736, 0x1D74E}, + {0x1D750, 0x1D76E}, + {0x1D770, 0x1D788}, + {0x1D78A, 0x1D7A8}, + {0x1D7AA, 0x1D7C2}, + {0x1D7C4, 0x1D7CB}, + {0x1DF00, 0x1DF1E}, + {0x1DF25, 0x1DF2A}, + {0x1E030, 0x1E06D}, + {0x1E100, 0x1E12C}, + {0x1E137, 0x1E13D}, + {0x1E14E, 0x1E14E}, + {0x1E290, 0x1E2AD}, + {0x1E2C0, 0x1E2EB}, + {0x1E4D0, 0x1E4EB}, + {0x1E5D0, 0x1E5ED}, + {0x1E5F0, 0x1E5F0}, + {0x1E6C0, 0x1E6DE}, + {0x1E6E0, 0x1E6E2}, + {0x1E6E4, 0x1E6E5}, + {0x1E6E7, 0x1E6ED}, + {0x1E6F0, 0x1E6F4}, + {0x1E6FE, 0x1E6FF}, + {0x1E7E0, 0x1E7E6}, + {0x1E7E8, 0x1E7EB}, + {0x1E7ED, 0x1E7EE}, + {0x1E7F0, 0x1E7FE}, + {0x1E800, 0x1E8C4}, + {0x1E900, 0x1E943}, + {0x1E94B, 0x1E94B}, + {0x1EE00, 0x1EE03}, + {0x1EE05, 0x1EE1F}, + {0x1EE21, 0x1EE22}, + {0x1EE24, 0x1EE24}, + {0x1EE27, 0x1EE27}, + {0x1EE29, 0x1EE32}, + {0x1EE34, 0x1EE37}, + {0x1EE39, 0x1EE39}, + {0x1EE3B, 0x1EE3B}, + {0x1EE42, 0x1EE42}, + {0x1EE47, 0x1EE47}, + {0x1EE49, 0x1EE49}, + {0x1EE4B, 0x1EE4B}, + {0x1EE4D, 0x1EE4F}, + {0x1EE51, 0x1EE52}, + {0x1EE54, 0x1EE54}, + {0x1EE57, 0x1EE57}, + {0x1EE59, 0x1EE59}, + {0x1EE5B, 0x1EE5B}, + {0x1EE5D, 0x1EE5D}, + {0x1EE5F, 0x1EE5F}, + {0x1EE61, 0x1EE62}, + {0x1EE64, 0x1EE64}, + {0x1EE67, 0x1EE6A}, + {0x1EE6C, 0x1EE72}, + {0x1EE74, 0x1EE77}, + {0x1EE79, 0x1EE7C}, + {0x1EE7E, 0x1EE7E}, + {0x1EE80, 0x1EE89}, + {0x1EE8B, 0x1EE9B}, + {0x1EEA1, 0x1EEA3}, + {0x1EEA5, 0x1EEA9}, + {0x1EEAB, 0x1EEBB}, + 
{0x20000, 0x2A6DF},
+        {0x2A700, 0x2B81D},
+        {0x2B820, 0x2CEAD},
+        {0x2CEB0, 0x2EBE0},
+        {0x2EBF0, 0x2EE5D},
+        {0x2F800, 0x2FA1D},
+        {0x30000, 0x3134A},
+        {0x31350, 0x33479},
+    };
+
+    // Linear scan: `ch` is a letter if any inclusive [start, end] range contains it.
+    for (const auto &r : ranges) {
+        if (ch >= r.start && ch <= r.end) return true;
+    }
+    return false;
+}
+
+// Returns true when `cp` is one of the whitespace code points listed in the
+// switch below; every other code point yields false.
+bool is_space(char32_t cp) {
+    switch (cp) {
+        case 0x0009: // TAB \t
+        case 0x000A: // LF \n
+        case 0x000B: // VT
+        case 0x000C: // FF
+        case 0x000D: // CR \r
+        case 0x0020: // Space
+        case 0x00A0: // No-Break Space
+        case 0x1680: // Ogham Space Mark
+        case 0x2000: // En Quad
+        case 0x2001: // Em Quad
+        case 0x2002: // En Space
+        case 0x2003: // Em Space
+        case 0x2004: // Three-Per-Em Space
+        case 0x2005: // Four-Per-Em Space
+        case 0x2006: // Six-Per-Em Space
+        case 0x2007: // Figure Space
+        case 0x2008: // Punctuation Space
+        case 0x2009: // Thin Space
+        case 0x200A: // Hair Space
+        case 0x202F: // Narrow No-Break Space
+        case 0x205F: // Medium Mathematical Space
+        case 0x3000: // Ideographic Space
+            return true;
+        default:
+            return false;
+    }
+}
+
+// Returns a copy of `input` with std::tolower applied to every byte
+// (each byte is passed as unsigned char; no multi-byte awareness).
+std::string str_to_lower(const std::string& input) {
+    std::string result = input;
+    std::transform(result.begin(), result.end(), result.begin(),
+                   [](unsigned char c){ return std::tolower(c); });
+    return result;
+}
+
+// UTF-8 -> Unicode code points
+// Decodes `str` one sequence at a time and returns the code points in order.
+// A byte that is not a valid lead byte is skipped on its own; decoding stops
+// (dropping the rest) when a sequence would extend past the end of `str`.
+// Continuation bytes are not validated: only their low 6 bits are used.
+std::vector utf8_to_codepoints(const std::string& str) {
+    std::vector codepoints;
+    size_t i = 0;
+    while (i < str.size()) {
+        unsigned char c = str[i];
+        char32_t cp = 0;
+        size_t extra_bytes = 0;
+
+        // The lead byte's high bits give the sequence length and the payload bits.
+        if ((c & 0x80) == 0) cp = c;
+        else if ((c & 0xE0) == 0xC0) { cp = c & 0x1F; extra_bytes = 1; }
+        else if ((c & 0xF0) == 0xE0) { cp = c & 0x0F; extra_bytes = 2; }
+        else if ((c & 0xF8) == 0xF0) { cp = c & 0x07; extra_bytes = 3; }
+        else { ++i; continue; } // Invalid UTF-8
+
+        // Truncated sequence at the end of the input: stop decoding.
+        if (i + extra_bytes >= str.size()) break;
+
+        for (size_t j = 1; j <= extra_bytes; ++j)
+            cp = (cp << 6) | (str[i + j] & 0x3F);
+
+        codepoints.push_back(cp);
+        i += 1 + extra_bytes;
+    }
+    return codepoints;
+}
+
+// Unicode code point -> UTF-8
+// Encodes `cp` as 1 to 4 UTF-8 bytes chosen by its magnitude
+// (<= 0x7F, <= 0x7FF, <= 0xFFFF, otherwise 4 bytes). The value is not validated.
+std::string codepoint_to_utf8(char32_t cp) {
+    std::string out;
+    if (cp <= 0x7F) out.push_back(static_cast(cp));
+    else if (cp <= 0x7FF) {
+        out.push_back(static_cast(0xC0 | (cp >> 6)));
+        out.push_back(static_cast(0x80 | (cp & 0x3F)));
+    } else if (cp <= 0xFFFF) {
+        out.push_back(static_cast(0xE0 | (cp >> 12)));
+        out.push_back(static_cast(0x80 | ((cp >> 6) & 0x3F)));
+        out.push_back(static_cast(0x80 | (cp & 0x3F)));
+    } else {
+        out.push_back(static_cast(0xF0 | (cp >> 18)));
+        out.push_back(static_cast(0x80 | ((cp >> 12) & 0x3F)));
+        out.push_back(static_cast(0x80 | ((cp >> 6) & 0x3F)));
+        out.push_back(static_cast(0x80 | (cp & 0x3F)));
+    }
+    return out;
+}
+
+
+// Splits UTF-8 `text` into pre-tokens, hand-implementing the alternatives
+//   (?i:'s|'t|'re|'ve|'m|'ll|'d) | [^\r\n\p{L}\p{N}]?\p{L}+ | \p{N}
+//   |  ?[^\s\p{L}\p{N}]+[\r\n]* | \s*[\r\n]+ | \s+(?!\S) | \s+
+// with is_letter / is_number / is_space standing in for \p{L} / \p{N} / \s.
+// Every returned token is a verbatim slice of the input (original case kept),
+// and concatenating the tokens reproduces the decoded input.
+std::vector token_split(const std::string& text) {
+    std::vector tokens;
+    auto cps = utf8_to_codepoints(text);
+    size_t i = 0;
+
+    // `[^\s\p{L}\p{N}]`
+    auto is_symbol  = [](char32_t c) { return !is_space(c) && !is_letter(c) && !is_number(c); };
+    // `[\r\n]`
+    auto is_newline = [](char32_t c) { return c == U'\r' || c == U'\n'; };
+
+    while (i < cps.size()) {
+        char32_t cp = cps[i];
+
+        // `(?i:'s|'t|'re|'ve|'m|'ll|'d)`
+        // Compare case-insensitively but emit the characters as written.
+        if (cp == U'\'' && i + 1 < cps.size()) {
+            const std::string one    = codepoint_to_utf8(cps[i + 1]);
+            const std::string one_lc = str_to_lower(one);
+            if (one_lc == "s" || one_lc == "t" || one_lc == "m" || one_lc == "d") {
+                tokens.push_back("'" + one);
+                i += 2;
+                continue;
+            }
+            if (i + 2 < cps.size()) {
+                const std::string two    = one + codepoint_to_utf8(cps[i + 2]);
+                const std::string two_lc = str_to_lower(two);
+                if (two_lc == "re" || two_lc == "ve" || two_lc == "ll") {
+                    tokens.push_back("'" + two);
+                    i += 3;
+                    continue;
+                }
+            }
+        }
+
+        // `\p{N}`
+        if (is_number(cp)) {
+            tokens.push_back(codepoint_to_utf8(cp));
+            ++i;
+            continue;
+        }
+
+        // `[^\r\n\p{L}\p{N}]?\p{L}+`
+        // Either a letter run, or one non-letter/non-newline prefix followed by a letter run.
+        const bool prefixed_letters =
+            !is_letter(cp) && !is_newline(cp) && i + 1 < cps.size() && is_letter(cps[i + 1]);
+        if (prefixed_letters || is_letter(cp)) {
+            std::string token = codepoint_to_utf8(cp);
+            ++i;
+            while (i < cps.size() && is_letter(cps[i])) {
+                token += codepoint_to_utf8(cps[i]);
+                ++i;
+            }
+            tokens.push_back(token);
+            continue;
+        }
+
+        // ` ?[^\s\p{L}\p{N}]+[\r\n]*`
+        // A symbol run (optionally led by one ASCII space) plus trailing newlines.
+        const bool space_then_symbol = cp == U' ' && i + 1 < cps.size() && is_symbol(cps[i + 1]);
+        if (space_then_symbol || is_symbol(cp)) {
+            std::string token = codepoint_to_utf8(cp);
+            ++i;
+            while (i < cps.size() && is_symbol(cps[i])) {
+                token += codepoint_to_utf8(cps[i]);
+                ++i;
+            }
+            while (i < cps.size() && is_newline(cps[i])) {
+                token += codepoint_to_utf8(cps[i]);
+                ++i;
+            }
+            tokens.push_back(token);
+            continue;
+        }
+
+        // `\s*[\r\n]+|\s+(?!\S)|\s+`
+        if (is_space(cp)) {
+            // [i, run_end) is the maximal whitespace run starting here.
+            size_t run_end = i;
+            while (run_end < cps.size() && is_space(cps[run_end])) {
+                ++run_end;
+            }
+
+            // One past the last newline inside the run, or `i` when there is none.
+            size_t after_last_newline = run_end;
+            while (after_last_newline > i && !is_newline(cps[after_last_newline - 1])) {
+                --after_last_newline;
+            }
+
+            size_t token_end = run_end;  // `\s+`, or `\s+(?!\S)` at end of input
+            if (after_last_newline > i) {
+                token_end = after_last_newline;  // `\s*[\r\n]+`
+            } else if (run_end < cps.size() && run_end - i > 1) {
+                // `\s+(?!\S)`: leave the last whitespace character so it can
+                // attach to the following word or symbol.
+                token_end = run_end - 1;
+            }
+
+            std::string token;
+            for (; i < token_end; ++i) {
+                token += codepoint_to_utf8(cps[i]);
+            }
+            tokens.push_back(token);
+            continue;
+        }
+
+        // skip
+        ++i;
+    }
+
+    return tokens;
+}
+
+// int main() {
+//     std::string text = "I'm testing C++ token_split function. 你好,世界! 123";
+//     auto tokens = token_split(text);
+
+//     for (const auto& t : tokens) {
+//         std::cout << "[" << t << "] ";
+//     }
+//     std::cout << "\n";
+//     return 0;
+// }
diff --git a/tokenize_util.h b/tokenize_util.h
new file mode 100644
index 0000000..fca07a8
--- /dev/null
+++ b/tokenize_util.h
@@ -0,0 +1,9 @@
+#ifndef __TOKENIZE_UTIL__
+#define __TOKENIZE_UTIL__
+
+#include
+#include
+
+std::vector token_split(const std::string& text);
+
+#endif // __TOKENIZE_UTIL__
\ No newline at end of file
diff --git a/vocab_qwen.hpp b/vocab_qwen.hpp
new file mode 100644
index 0000000..cc9c783
Binary files /dev/null and b/vocab_qwen.hpp differ