stable-diffusion.cpp/tokenize_util.cpp
2025-11-29 02:20:12 +08:00

988 lines
27 KiB
C++

#include <algorithm>
#include <iostream>
#include <string>
#include <vector>
#include "tokenize_util.h"
// Reports whether `ch` is an ASCII decimal digit (U+0030 '0' .. U+0039 '9').
// No other code point is treated as a number by this helper.
bool is_number(char32_t ch) {
    if (ch < U'0') {
        return false;
    }
    return ch <= U'9';
}
// Reports whether `ch` falls inside one of the inclusive code point ranges in
// the table below. token_split() uses this as its `\p{L}` (letter) test.
//
// The table is sorted by `start` and its ranges do not overlap, which is what
// allows the binary search; keep it that way when adding entries.
bool is_letter(char32_t ch) {
    struct Range {
        char32_t start, end;
    };
    static const Range ranges[] = {
        {0x41, 0x5A}, {0x61, 0x7A}, {0xAA, 0xAA}, {0xB5, 0xB5},
        {0xBA, 0xBA}, {0xC0, 0xD6}, {0xD8, 0xF6}, {0xF8, 0x2C1},
        {0x2C6, 0x2D1}, {0x2E0, 0x2E4}, {0x2EC, 0x2EC}, {0x2EE, 0x2EE},
        {0x370, 0x374}, {0x376, 0x377}, {0x37A, 0x37D}, {0x37F, 0x37F},
        {0x386, 0x386}, {0x388, 0x38A}, {0x38C, 0x38C}, {0x38E, 0x3A1},
        {0x3A3, 0x3F5}, {0x3F7, 0x481}, {0x48A, 0x52F}, {0x531, 0x556},
        {0x559, 0x559}, {0x560, 0x588}, {0x5D0, 0x5EA}, {0x5EF, 0x5F2},
        {0x620, 0x64A}, {0x66E, 0x66F}, {0x671, 0x6D3}, {0x6D5, 0x6D5},
        {0x6E5, 0x6E6}, {0x6EE, 0x6EF}, {0x6FA, 0x6FC}, {0x6FF, 0x6FF},
        {0x710, 0x710}, {0x712, 0x72F}, {0x74D, 0x7A5}, {0x7B1, 0x7B1},
        {0x7CA, 0x7EA}, {0x7F4, 0x7F5}, {0x7FA, 0x7FA}, {0x800, 0x815},
        {0x81A, 0x81A}, {0x824, 0x824}, {0x828, 0x828}, {0x840, 0x858},
        {0x860, 0x86A}, {0x870, 0x887}, {0x889, 0x88F}, {0x8A0, 0x8C9},
        {0x904, 0x939}, {0x93D, 0x93D}, {0x950, 0x950}, {0x958, 0x961},
        {0x971, 0x980}, {0x985, 0x98C}, {0x98F, 0x990}, {0x993, 0x9A8},
        {0x9AA, 0x9B0}, {0x9B2, 0x9B2}, {0x9B6, 0x9B9}, {0x9BD, 0x9BD},
        {0x9CE, 0x9CE}, {0x9DC, 0x9DD}, {0x9DF, 0x9E1}, {0x9F0, 0x9F1},
        {0x9FC, 0x9FC}, {0xA05, 0xA0A}, {0xA0F, 0xA10}, {0xA13, 0xA28},
        {0xA2A, 0xA30}, {0xA32, 0xA33}, {0xA35, 0xA36}, {0xA38, 0xA39},
        {0xA59, 0xA5C}, {0xA5E, 0xA5E}, {0xA72, 0xA74}, {0xA85, 0xA8D},
        {0xA8F, 0xA91}, {0xA93, 0xAA8}, {0xAAA, 0xAB0}, {0xAB2, 0xAB3},
        {0xAB5, 0xAB9}, {0xABD, 0xABD}, {0xAD0, 0xAD0}, {0xAE0, 0xAE1},
        {0xAF9, 0xAF9}, {0xB05, 0xB0C}, {0xB0F, 0xB10}, {0xB13, 0xB28},
        {0xB2A, 0xB30}, {0xB32, 0xB33}, {0xB35, 0xB39}, {0xB3D, 0xB3D},
        {0xB5C, 0xB5D}, {0xB5F, 0xB61}, {0xB71, 0xB71}, {0xB83, 0xB83},
        {0xB85, 0xB8A}, {0xB8E, 0xB90}, {0xB92, 0xB95}, {0xB99, 0xB9A},
        {0xB9C, 0xB9C}, {0xB9E, 0xB9F}, {0xBA3, 0xBA4}, {0xBA8, 0xBAA},
        {0xBAE, 0xBB9}, {0xBD0, 0xBD0}, {0xC05, 0xC0C}, {0xC0E, 0xC10},
        {0xC12, 0xC28}, {0xC2A, 0xC39}, {0xC3D, 0xC3D}, {0xC58, 0xC5A},
        {0xC5C, 0xC5D}, {0xC60, 0xC61}, {0xC80, 0xC80}, {0xC85, 0xC8C},
        {0xC8E, 0xC90}, {0xC92, 0xCA8}, {0xCAA, 0xCB3}, {0xCB5, 0xCB9},
        {0xCBD, 0xCBD}, {0xCDC, 0xCDE}, {0xCE0, 0xCE1}, {0xCF1, 0xCF2},
        {0xD04, 0xD0C}, {0xD0E, 0xD10}, {0xD12, 0xD3A}, {0xD3D, 0xD3D},
        {0xD4E, 0xD4E}, {0xD54, 0xD56}, {0xD5F, 0xD61}, {0xD7A, 0xD7F},
        {0xD85, 0xD96}, {0xD9A, 0xDB1}, {0xDB3, 0xDBB}, {0xDBD, 0xDBD},
        {0xDC0, 0xDC6}, {0xE01, 0xE30}, {0xE32, 0xE33}, {0xE40, 0xE46},
        {0xE81, 0xE82}, {0xE84, 0xE84}, {0xE86, 0xE8A}, {0xE8C, 0xEA3},
        {0xEA5, 0xEA5}, {0xEA7, 0xEB0}, {0xEB2, 0xEB3}, {0xEBD, 0xEBD},
        {0xEC0, 0xEC4}, {0xEC6, 0xEC6}, {0xEDC, 0xEDF}, {0xF00, 0xF00},
        {0xF40, 0xF47}, {0xF49, 0xF6C}, {0xF88, 0xF8C}, {0x1000, 0x102A},
        {0x103F, 0x103F}, {0x1050, 0x1055}, {0x105A, 0x105D}, {0x1061, 0x1061},
        {0x1065, 0x1066}, {0x106E, 0x1070}, {0x1075, 0x1081}, {0x108E, 0x108E},
        {0x10A0, 0x10C5}, {0x10C7, 0x10C7}, {0x10CD, 0x10CD}, {0x10D0, 0x10FA},
        {0x10FC, 0x1248}, {0x124A, 0x124D}, {0x1250, 0x1256}, {0x1258, 0x1258},
        {0x125A, 0x125D}, {0x1260, 0x1288}, {0x128A, 0x128D}, {0x1290, 0x12B0},
        {0x12B2, 0x12B5}, {0x12B8, 0x12BE}, {0x12C0, 0x12C0}, {0x12C2, 0x12C5},
        {0x12C8, 0x12D6}, {0x12D8, 0x1310}, {0x1312, 0x1315}, {0x1318, 0x135A},
        {0x1380, 0x138F}, {0x13A0, 0x13F5}, {0x13F8, 0x13FD}, {0x1401, 0x166C},
        {0x166F, 0x167F}, {0x1681, 0x169A}, {0x16A0, 0x16EA}, {0x16F1, 0x16F8},
        {0x1700, 0x1711}, {0x171F, 0x1731}, {0x1740, 0x1751}, {0x1760, 0x176C},
        {0x176E, 0x1770}, {0x1780, 0x17B3}, {0x17D7, 0x17D7}, {0x17DC, 0x17DC},
        {0x1820, 0x1878}, {0x1880, 0x1884}, {0x1887, 0x18A8}, {0x18AA, 0x18AA},
        {0x18B0, 0x18F5}, {0x1900, 0x191E}, {0x1950, 0x196D}, {0x1970, 0x1974},
        {0x1980, 0x19AB}, {0x19B0, 0x19C9}, {0x1A00, 0x1A16}, {0x1A20, 0x1A54},
        {0x1AA7, 0x1AA7}, {0x1B05, 0x1B33}, {0x1B45, 0x1B4C}, {0x1B83, 0x1BA0},
        {0x1BAE, 0x1BAF}, {0x1BBA, 0x1BE5}, {0x1C00, 0x1C23}, {0x1C4D, 0x1C4F},
        {0x1C5A, 0x1C7D}, {0x1C80, 0x1C8A}, {0x1C90, 0x1CBA}, {0x1CBD, 0x1CBF},
        {0x1CE9, 0x1CEC}, {0x1CEE, 0x1CF3}, {0x1CF5, 0x1CF6}, {0x1CFA, 0x1CFA},
        {0x1D00, 0x1DBF}, {0x1E00, 0x1F15}, {0x1F18, 0x1F1D}, {0x1F20, 0x1F45},
        {0x1F48, 0x1F4D}, {0x1F50, 0x1F57}, {0x1F59, 0x1F59}, {0x1F5B, 0x1F5B},
        {0x1F5D, 0x1F5D}, {0x1F5F, 0x1F7D}, {0x1F80, 0x1FB4}, {0x1FB6, 0x1FBC},
        {0x1FBE, 0x1FBE}, {0x1FC2, 0x1FC4}, {0x1FC6, 0x1FCC}, {0x1FD0, 0x1FD3},
        {0x1FD6, 0x1FDB}, {0x1FE0, 0x1FEC}, {0x1FF2, 0x1FF4}, {0x1FF6, 0x1FFC},
        {0x2071, 0x2071}, {0x207F, 0x207F}, {0x2090, 0x209C}, {0x2102, 0x2102},
        {0x2107, 0x2107}, {0x210A, 0x2113}, {0x2115, 0x2115}, {0x2119, 0x211D},
        {0x2124, 0x2124}, {0x2126, 0x2126}, {0x2128, 0x2128}, {0x212A, 0x212D},
        {0x212F, 0x2139}, {0x213C, 0x213F}, {0x2145, 0x2149}, {0x214E, 0x214E},
        {0x2183, 0x2184}, {0x2C00, 0x2CE4}, {0x2CEB, 0x2CEE}, {0x2CF2, 0x2CF3},
        {0x2D00, 0x2D25}, {0x2D27, 0x2D27}, {0x2D2D, 0x2D2D}, {0x2D30, 0x2D67},
        {0x2D6F, 0x2D6F}, {0x2D80, 0x2D96}, {0x2DA0, 0x2DA6}, {0x2DA8, 0x2DAE},
        {0x2DB0, 0x2DB6}, {0x2DB8, 0x2DBE}, {0x2DC0, 0x2DC6}, {0x2DC8, 0x2DCE},
        {0x2DD0, 0x2DD6}, {0x2DD8, 0x2DDE}, {0x2E2F, 0x2E2F}, {0x3005, 0x3006},
        {0x3031, 0x3035}, {0x303B, 0x303C}, {0x3041, 0x3096}, {0x309D, 0x309F},
        {0x30A1, 0x30FA}, {0x30FC, 0x30FF}, {0x3105, 0x312F}, {0x3131, 0x318E},
        {0x31A0, 0x31BF}, {0x31F0, 0x31FF}, {0x3400, 0x4DBF}, {0x4E00, 0xA48C},
        {0xA4D0, 0xA4FD}, {0xA500, 0xA60C}, {0xA610, 0xA61F}, {0xA62A, 0xA62B},
        {0xA640, 0xA66E}, {0xA67F, 0xA69D}, {0xA6A0, 0xA6E5}, {0xA717, 0xA71F},
        {0xA722, 0xA788}, {0xA78B, 0xA7DC}, {0xA7F1, 0xA801}, {0xA803, 0xA805},
        {0xA807, 0xA80A}, {0xA80C, 0xA822}, {0xA840, 0xA873}, {0xA882, 0xA8B3},
        {0xA8F2, 0xA8F7}, {0xA8FB, 0xA8FB}, {0xA8FD, 0xA8FE}, {0xA90A, 0xA925},
        {0xA930, 0xA946}, {0xA960, 0xA97C}, {0xA984, 0xA9B2}, {0xA9CF, 0xA9CF},
        {0xA9E0, 0xA9E4}, {0xA9E6, 0xA9EF}, {0xA9FA, 0xA9FE}, {0xAA00, 0xAA28},
        {0xAA40, 0xAA42}, {0xAA44, 0xAA4B}, {0xAA60, 0xAA76}, {0xAA7A, 0xAA7A},
        {0xAA7E, 0xAAAF}, {0xAAB1, 0xAAB1}, {0xAAB5, 0xAAB6}, {0xAAB9, 0xAABD},
        {0xAAC0, 0xAAC0}, {0xAAC2, 0xAAC2}, {0xAADB, 0xAADD}, {0xAAE0, 0xAAEA},
        {0xAAF2, 0xAAF4}, {0xAB01, 0xAB06}, {0xAB09, 0xAB0E}, {0xAB11, 0xAB16},
        {0xAB20, 0xAB26}, {0xAB28, 0xAB2E}, {0xAB30, 0xAB5A}, {0xAB5C, 0xAB69},
        {0xAB70, 0xABE2}, {0xAC00, 0xD7A3}, {0xD7B0, 0xD7C6}, {0xD7CB, 0xD7FB},
        {0xF900, 0xFA6D}, {0xFA70, 0xFAD9}, {0xFB00, 0xFB06}, {0xFB13, 0xFB17},
        {0xFB1D, 0xFB1D}, {0xFB1F, 0xFB28}, {0xFB2A, 0xFB36}, {0xFB38, 0xFB3C},
        {0xFB3E, 0xFB3E}, {0xFB40, 0xFB41}, {0xFB43, 0xFB44}, {0xFB46, 0xFBB1},
        {0xFBD3, 0xFD3D}, {0xFD50, 0xFD8F}, {0xFD92, 0xFDC7}, {0xFDF0, 0xFDFB},
        {0xFE70, 0xFE74}, {0xFE76, 0xFEFC}, {0xFF21, 0xFF3A}, {0xFF41, 0xFF5A},
        {0xFF66, 0xFFBE}, {0xFFC2, 0xFFC7}, {0xFFCA, 0xFFCF}, {0xFFD2, 0xFFD7},
        {0xFFDA, 0xFFDC}, {0x10000, 0x1000B}, {0x1000D, 0x10026}, {0x10028, 0x1003A},
        {0x1003C, 0x1003D}, {0x1003F, 0x1004D}, {0x10050, 0x1005D}, {0x10080, 0x100FA},
        {0x10280, 0x1029C}, {0x102A0, 0x102D0}, {0x10300, 0x1031F}, {0x1032D, 0x10340},
        {0x10342, 0x10349}, {0x10350, 0x10375}, {0x10380, 0x1039D}, {0x103A0, 0x103C3},
        {0x103C8, 0x103CF}, {0x10400, 0x1049D}, {0x104B0, 0x104D3}, {0x104D8, 0x104FB},
        {0x10500, 0x10527}, {0x10530, 0x10563}, {0x10570, 0x1057A}, {0x1057C, 0x1058A},
        {0x1058C, 0x10592}, {0x10594, 0x10595}, {0x10597, 0x105A1}, {0x105A3, 0x105B1},
        {0x105B3, 0x105B9}, {0x105BB, 0x105BC}, {0x105C0, 0x105F3}, {0x10600, 0x10736},
        {0x10740, 0x10755}, {0x10760, 0x10767}, {0x10780, 0x10785}, {0x10787, 0x107B0},
        {0x107B2, 0x107BA}, {0x10800, 0x10805}, {0x10808, 0x10808}, {0x1080A, 0x10835},
        {0x10837, 0x10838}, {0x1083C, 0x1083C}, {0x1083F, 0x10855}, {0x10860, 0x10876},
        {0x10880, 0x1089E}, {0x108E0, 0x108F2}, {0x108F4, 0x108F5}, {0x10900, 0x10915},
        {0x10920, 0x10939}, {0x10940, 0x10959}, {0x10980, 0x109B7}, {0x109BE, 0x109BF},
        {0x10A00, 0x10A00}, {0x10A10, 0x10A13}, {0x10A15, 0x10A17}, {0x10A19, 0x10A35},
        {0x10A60, 0x10A7C}, {0x10A80, 0x10A9C}, {0x10AC0, 0x10AC7}, {0x10AC9, 0x10AE4},
        {0x10B00, 0x10B35}, {0x10B40, 0x10B55}, {0x10B60, 0x10B72}, {0x10B80, 0x10B91},
        {0x10C00, 0x10C48}, {0x10C80, 0x10CB2}, {0x10CC0, 0x10CF2}, {0x10D00, 0x10D23},
        {0x10D4A, 0x10D65}, {0x10D6F, 0x10D85}, {0x10E80, 0x10EA9}, {0x10EB0, 0x10EB1},
        {0x10EC2, 0x10EC7}, {0x10F00, 0x10F1C}, {0x10F27, 0x10F27}, {0x10F30, 0x10F45},
        {0x10F70, 0x10F81}, {0x10FB0, 0x10FC4}, {0x10FE0, 0x10FF6}, {0x11003, 0x11037},
        {0x11071, 0x11072}, {0x11075, 0x11075}, {0x11083, 0x110AF}, {0x110D0, 0x110E8},
        {0x11103, 0x11126}, {0x11144, 0x11144}, {0x11147, 0x11147}, {0x11150, 0x11172},
        {0x11176, 0x11176}, {0x11183, 0x111B2}, {0x111C1, 0x111C4}, {0x111DA, 0x111DA},
        {0x111DC, 0x111DC}, {0x11200, 0x11211}, {0x11213, 0x1122B}, {0x1123F, 0x11240},
        {0x11280, 0x11286}, {0x11288, 0x11288}, {0x1128A, 0x1128D}, {0x1128F, 0x1129D},
        {0x1129F, 0x112A8}, {0x112B0, 0x112DE}, {0x11305, 0x1130C}, {0x1130F, 0x11310},
        {0x11313, 0x11328}, {0x1132A, 0x11330}, {0x11332, 0x11333}, {0x11335, 0x11339},
        {0x1133D, 0x1133D}, {0x11350, 0x11350}, {0x1135D, 0x11361}, {0x11380, 0x11389},
        {0x1138B, 0x1138B}, {0x1138E, 0x1138E}, {0x11390, 0x113B5}, {0x113B7, 0x113B7},
        {0x113D1, 0x113D1}, {0x113D3, 0x113D3}, {0x11400, 0x11434}, {0x11447, 0x1144A},
        {0x1145F, 0x11461}, {0x11480, 0x114AF}, {0x114C4, 0x114C5}, {0x114C7, 0x114C7},
        {0x11580, 0x115AE}, {0x115D8, 0x115DB}, {0x11600, 0x1162F}, {0x11644, 0x11644},
        {0x11680, 0x116AA}, {0x116B8, 0x116B8}, {0x11700, 0x1171A}, {0x11740, 0x11746},
        {0x11800, 0x1182B}, {0x118A0, 0x118DF}, {0x118FF, 0x11906}, {0x11909, 0x11909},
        {0x1190C, 0x11913}, {0x11915, 0x11916}, {0x11918, 0x1192F}, {0x1193F, 0x1193F},
        {0x11941, 0x11941}, {0x119A0, 0x119A7}, {0x119AA, 0x119D0}, {0x119E1, 0x119E1},
        {0x119E3, 0x119E3}, {0x11A00, 0x11A00}, {0x11A0B, 0x11A32}, {0x11A3A, 0x11A3A},
        {0x11A50, 0x11A50}, {0x11A5C, 0x11A89}, {0x11A9D, 0x11A9D}, {0x11AB0, 0x11AF8},
        {0x11BC0, 0x11BE0}, {0x11C00, 0x11C08}, {0x11C0A, 0x11C2E}, {0x11C40, 0x11C40},
        {0x11C72, 0x11C8F}, {0x11D00, 0x11D06}, {0x11D08, 0x11D09}, {0x11D0B, 0x11D30},
        {0x11D46, 0x11D46}, {0x11D60, 0x11D65}, {0x11D67, 0x11D68}, {0x11D6A, 0x11D89},
        {0x11D98, 0x11D98}, {0x11DB0, 0x11DDB}, {0x11EE0, 0x11EF2}, {0x11F02, 0x11F02},
        {0x11F04, 0x11F10}, {0x11F12, 0x11F33}, {0x11FB0, 0x11FB0}, {0x12000, 0x12399},
        {0x12480, 0x12543}, {0x12F90, 0x12FF0}, {0x13000, 0x1342F}, {0x13441, 0x13446},
        {0x13460, 0x143FA}, {0x14400, 0x14646}, {0x16100, 0x1611D}, {0x16800, 0x16A38},
        {0x16A40, 0x16A5E}, {0x16A70, 0x16ABE}, {0x16AD0, 0x16AED}, {0x16B00, 0x16B2F},
        {0x16B40, 0x16B43}, {0x16B63, 0x16B77}, {0x16B7D, 0x16B8F}, {0x16D40, 0x16D6C},
        {0x16E40, 0x16E7F}, {0x16EA0, 0x16EB8}, {0x16EBB, 0x16ED3}, {0x16F00, 0x16F4A},
        {0x16F50, 0x16F50}, {0x16F93, 0x16F9F}, {0x16FE0, 0x16FE1}, {0x16FE3, 0x16FE3},
        {0x16FF2, 0x16FF3}, {0x17000, 0x18CD5}, {0x18CFF, 0x18D1E}, {0x18D80, 0x18DF2},
        {0x1AFF0, 0x1AFF3}, {0x1AFF5, 0x1AFFB}, {0x1AFFD, 0x1AFFE}, {0x1B000, 0x1B122},
        {0x1B132, 0x1B132}, {0x1B150, 0x1B152}, {0x1B155, 0x1B155}, {0x1B164, 0x1B167},
        {0x1B170, 0x1B2FB}, {0x1BC00, 0x1BC6A}, {0x1BC70, 0x1BC7C}, {0x1BC80, 0x1BC88},
        {0x1BC90, 0x1BC99}, {0x1D400, 0x1D454}, {0x1D456, 0x1D49C}, {0x1D49E, 0x1D49F},
        {0x1D4A2, 0x1D4A2}, {0x1D4A5, 0x1D4A6}, {0x1D4A9, 0x1D4AC}, {0x1D4AE, 0x1D4B9},
        {0x1D4BB, 0x1D4BB}, {0x1D4BD, 0x1D4C3}, {0x1D4C5, 0x1D505}, {0x1D507, 0x1D50A},
        {0x1D50D, 0x1D514}, {0x1D516, 0x1D51C}, {0x1D51E, 0x1D539}, {0x1D53B, 0x1D53E},
        {0x1D540, 0x1D544}, {0x1D546, 0x1D546}, {0x1D54A, 0x1D550}, {0x1D552, 0x1D6A5},
        {0x1D6A8, 0x1D6C0}, {0x1D6C2, 0x1D6DA}, {0x1D6DC, 0x1D6FA}, {0x1D6FC, 0x1D714},
        {0x1D716, 0x1D734}, {0x1D736, 0x1D74E}, {0x1D750, 0x1D76E}, {0x1D770, 0x1D788},
        {0x1D78A, 0x1D7A8}, {0x1D7AA, 0x1D7C2}, {0x1D7C4, 0x1D7CB}, {0x1DF00, 0x1DF1E},
        {0x1DF25, 0x1DF2A}, {0x1E030, 0x1E06D}, {0x1E100, 0x1E12C}, {0x1E137, 0x1E13D},
        {0x1E14E, 0x1E14E}, {0x1E290, 0x1E2AD}, {0x1E2C0, 0x1E2EB}, {0x1E4D0, 0x1E4EB},
        {0x1E5D0, 0x1E5ED}, {0x1E5F0, 0x1E5F0}, {0x1E6C0, 0x1E6DE}, {0x1E6E0, 0x1E6E2},
        {0x1E6E4, 0x1E6E5}, {0x1E6E7, 0x1E6ED}, {0x1E6F0, 0x1E6F4}, {0x1E6FE, 0x1E6FF},
        {0x1E7E0, 0x1E7E6}, {0x1E7E8, 0x1E7EB}, {0x1E7ED, 0x1E7EE}, {0x1E7F0, 0x1E7FE},
        {0x1E800, 0x1E8C4}, {0x1E900, 0x1E943}, {0x1E94B, 0x1E94B}, {0x1EE00, 0x1EE03},
        {0x1EE05, 0x1EE1F}, {0x1EE21, 0x1EE22}, {0x1EE24, 0x1EE24}, {0x1EE27, 0x1EE27},
        {0x1EE29, 0x1EE32}, {0x1EE34, 0x1EE37}, {0x1EE39, 0x1EE39}, {0x1EE3B, 0x1EE3B},
        {0x1EE42, 0x1EE42}, {0x1EE47, 0x1EE47}, {0x1EE49, 0x1EE49}, {0x1EE4B, 0x1EE4B},
        {0x1EE4D, 0x1EE4F}, {0x1EE51, 0x1EE52}, {0x1EE54, 0x1EE54}, {0x1EE57, 0x1EE57},
        {0x1EE59, 0x1EE59}, {0x1EE5B, 0x1EE5B}, {0x1EE5D, 0x1EE5D}, {0x1EE5F, 0x1EE5F},
        {0x1EE61, 0x1EE62}, {0x1EE64, 0x1EE64}, {0x1EE67, 0x1EE6A}, {0x1EE6C, 0x1EE72},
        {0x1EE74, 0x1EE77}, {0x1EE79, 0x1EE7C}, {0x1EE7E, 0x1EE7E}, {0x1EE80, 0x1EE89},
        {0x1EE8B, 0x1EE9B}, {0x1EEA1, 0x1EEA3}, {0x1EEA5, 0x1EEA9}, {0x1EEAB, 0x1EEBB},
        {0x20000, 0x2A6DF}, {0x2A700, 0x2B81D}, {0x2B820, 0x2CEAD}, {0x2CEB0, 0x2EBE0},
        {0x2EBF0, 0x2EE5D}, {0x2F800, 0x2FA1D}, {0x30000, 0x3134A}, {0x31350, 0x33479},
    };

    // Locate the first range that starts strictly after `ch`; the only range
    // that can contain `ch` is the one immediately before it.
    const Range* const first = std::begin(ranges);
    const Range* const next  = std::upper_bound(
        first, std::end(ranges), ch,
        [](char32_t value, const Range& r) { return value < r.start; });
    if (next == first) {
        return false;  // `ch` is below the first range
    }
    return ch <= (next - 1)->end;
}
// Reports whether `cp` is one of the whitespace code points recognised by the
// tokenizer: the ASCII controls TAB..CR, SPACE, NO-BREAK SPACE, OGHAM SPACE
// MARK, the U+2000..U+200A fixed-width spaces, NARROW NO-BREAK SPACE, MEDIUM
// MATHEMATICAL SPACE and IDEOGRAPHIC SPACE.
bool is_space(char32_t cp) {
    // TAB, LF, VT, FF and CR are contiguous.
    if (cp >= 0x0009 && cp <= 0x000D) {
        return true;
    }
    // En Quad through Hair Space are contiguous.
    if (cp >= 0x2000 && cp <= 0x200A) {
        return true;
    }
    return cp == 0x0020     // Space
           || cp == 0x00A0  // No-Break Space
           || cp == 0x1680  // Ogham Space Mark
           || cp == 0x202F  // Narrow No-Break Space
           || cp == 0x205F  // Medium Mathematical Space
           || cp == 0x3000; // Ideographic Space
}
// Returns a copy of `input` with ASCII 'A'..'Z' folded to 'a'..'z'.
//
// The folding is done by hand rather than with std::tolower so that the result
// does not depend on the active C locale: callers pass UTF-8 text, and a
// single-byte locale could otherwise rewrite bytes >= 0x80 that belong to
// multi-byte sequences. All non-ASCII bytes are copied through untouched.
std::string str_to_lower(const std::string& input) {
    std::string result = input;
    for (char& c : result) {
        if (c >= 'A' && c <= 'Z') {
            c = static_cast<char>(c - 'A' + 'a');
        }
    }
    return result;
}
// UTF-8 -> Unicode code points
//
// Decodes `str` into a sequence of code points. Malformed input never aborts
// the decode: a byte that cannot start a sequence, a lead byte whose sequence
// is cut off by the end of the string, or a lead byte followed by something
// that is not a continuation byte (10xxxxxx) is skipped on its own and decoding
// resumes at the very next byte, so well-formed text after the damage is kept.
// Overlong encodings, surrogates and values above U+10FFFF are not rejected.
std::vector<char32_t> utf8_to_codepoints(const std::string& str) {
    std::vector<char32_t> codepoints;
    const size_t len = str.size();
    codepoints.reserve(len);  // upper bound: one code point per byte

    size_t i = 0;
    while (i < len) {
        const unsigned char lead = static_cast<unsigned char>(str[i]);
        char32_t cp              = 0;
        size_t extra_bytes       = 0;
        if ((lead & 0x80) == 0) {
            cp = lead;
        } else if ((lead & 0xE0) == 0xC0) {
            cp          = lead & 0x1F;
            extra_bytes = 1;
        } else if ((lead & 0xF0) == 0xE0) {
            cp          = lead & 0x0F;
            extra_bytes = 2;
        } else if ((lead & 0xF8) == 0xF0) {
            cp          = lead & 0x07;
            extra_bytes = 3;
        } else {
            // Stray continuation byte or invalid lead byte.
            ++i;
            continue;
        }

        // `i < len` holds here, so `len - i` cannot underflow.
        bool well_formed = extra_bytes < len - i;
        for (size_t j = 1; well_formed && j <= extra_bytes; ++j) {
            const unsigned char cont = static_cast<unsigned char>(str[i + j]);
            if ((cont & 0xC0) != 0x80) {
                well_formed = false;
            } else {
                cp = (cp << 6) | (cont & 0x3F);
            }
        }
        if (!well_formed) {
            // Drop only the lead byte; the bytes after it are re-examined.
            ++i;
            continue;
        }

        codepoints.push_back(cp);
        i += 1 + extra_bytes;
    }
    return codepoints;
}
// Unicode code point -> UTF-8
//
// Encodes `cp` as 1 byte (<= 0x7F), 2 bytes (<= 0x7FF), 3 bytes (<= 0xFFFF) or
// 4 bytes (anything larger). The value is not validated: surrogates and values
// above U+10FFFF are encoded with the same bit layout.
std::string codepoint_to_utf8(char32_t cp) {
    // Builds a continuation byte (10xxxxxx) from the low six bits of `bits`.
    const auto trail = [](char32_t bits) { return static_cast<char>(0x80 | (bits & 0x3F)); };

    char bytes[4];
    size_t count = 0;
    if (cp <= 0x7F) {
        bytes[count++] = static_cast<char>(cp);
    } else if (cp <= 0x7FF) {
        bytes[count++] = static_cast<char>(0xC0 | (cp >> 6));
        bytes[count++] = trail(cp);
    } else if (cp <= 0xFFFF) {
        bytes[count++] = static_cast<char>(0xE0 | (cp >> 12));
        bytes[count++] = trail(cp >> 6);
        bytes[count++] = trail(cp);
    } else {
        bytes[count++] = static_cast<char>(0xF0 | (cp >> 18));
        bytes[count++] = trail(cp >> 12);
        bytes[count++] = trail(cp >> 6);
        bytes[count++] = trail(cp);
    }
    return std::string(bytes, count);
}
// Reports whether `prefix` occurs in `text` beginning at position `index`.
// Returns false when `index` is past the end of `text` or when fewer than
// `prefix.size()` code points remain from `index`. An empty prefix matches at
// any index up to and including `text.size()`.
bool starts_with(const std::vector<char32_t>& text,
                 const std::vector<char32_t>& prefix,
                 std::size_t index) {
    const std::size_t needed = prefix.size();
    // The first test guards the subtraction in the second against underflow.
    if (index > text.size() || text.size() - index < needed) {
        return false;
    }
    for (std::size_t k = 0; k < needed; ++k) {
        if (text[index + k] != prefix[k]) {
            return false;
        }
    }
    return true;
}
// mistral: [^\r\n\p{L}\p{N}]?[\p{Lu}\p{Lt}\p{Lm}\p{Lo}\p{M}]*[\p{Ll}\p{Lm}\p{Lo}\p{M}]+|[^\r\n\p{L}\p{N}]?[\p{Lu}\p{Lt}\p{Lm}\p{Lo}\p{M}]+[\p{Ll}\p{Lm}\p{Lo}\p{M}]*|\p{N}| ?[^\s\p{L}\p{N}]+[\r\n/]*|\s*[\r\n]+|\s+(?!\S)|\s+
// qwen2: (?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+
std::vector<std::string> token_split(const std::string& text) {
std::vector<std::string> tokens;
auto cps = utf8_to_codepoints(text);
size_t i = 0;
while (i < cps.size()) {
char32_t cp = cps[i];
// `(?i:'s|'t|'re|'ve|'m|'ll|'d)`
if (cp == U'\'' && i + 1 < cps.size()) {
std::string next = str_to_lower(codepoint_to_utf8(cps[i + 1]));
if (next == "s" || next == "t" || next == "m") {
tokens.push_back("'" + next);
i += 2;
continue;
}
if (i + 2 < cps.size()) {
next += str_to_lower(codepoint_to_utf8(cps[i + 2]));
if (next == "re" || next == "ve" || next == "ll" || next == "d") {
tokens.push_back("'" + next);
i += 3;
continue;
}
}
}
// `\p{N}`
if (is_number(cp)) {
tokens.push_back(codepoint_to_utf8(cp));
++i;
continue;
}
// `[^\r\n\p{L}\p{N}]?\p{L}+`
{
// `[^\r\n\p{L}\p{N}]\p{L}+`
if (!is_letter(cp) && cp != U'\r' && cp != U'\n' && i + 1 < cps.size() && is_letter(cps[i + 1])) {
std::string token = codepoint_to_utf8(cp);
++i;
while (i < cps.size() && is_letter(cps[i])) {
token += codepoint_to_utf8(cps[i]);
++i;
}
tokens.push_back(token);
continue;
}
// `\p{L}+`
if (is_letter(cp)) {
std::string token = codepoint_to_utf8(cp);
++i;
while (i < cps.size() && is_letter(cps[i])) {
token += codepoint_to_utf8(cps[i]);
++i;
}
tokens.push_back(token);
continue;
}
}
// ` ?[^\s\p{L}\p{N}]+[\r\n]*`
{
// ` [^\s\p{L}\p{N}]+[\r\n]*`
if (cp == U' ' && i + 1 < cps.size() && !isspace(cps[i + 1]) && !is_letter(cps[i + 1]) && !is_number(cps[i + 1])) {
std::string token = codepoint_to_utf8(cp);
token += codepoint_to_utf8(cps[i + 1]);
i += 2;
while (i < cps.size() && !is_letter(cps[i]) && !is_number(cps[i]) && !isspace(cps[i])) {
token += codepoint_to_utf8(cps[i]);
++i;
}
while (i < cps.size() && (cps[i] == U'\r' || cps[i] == U'\n')) {
token += codepoint_to_utf8(cps[i]);
++i;
}
tokens.push_back(token);
continue;
}
// `[^\s\p{L}\p{N}]+[\r\n]*`
std::string token;
if (!is_letter(cps[i]) && !is_number(cps[i]) && !isspace(cps[i])) {
std::string token = codepoint_to_utf8(cp);
++i;
while (i < cps.size() && !is_letter(cps[i]) && !is_number(cps[i]) && !isspace(cps[i])) {
token += codepoint_to_utf8(cps[i]);
++i;
}
while (i < cps.size() && (cps[i] == U'\r' || cps[i] == U'\n')) {
token += codepoint_to_utf8(cps[i]);
++i;
}
tokens.push_back(token);
continue;
}
}
// `\s*[\r\n]+|\s+(?!\S)|\s+`
if (is_space(cp)) {
std::string token = codepoint_to_utf8(cp);
++i;
while (i < cps.size() && is_space(cps[i])) {
token += codepoint_to_utf8(cps[i]);
++i;
if (cps[i] == U'\r' || cps[i] == U'\n') {
break;
}
}
tokens.push_back(token);
continue;
}
// skip
++i;
}
return tokens;
}
// Splits `text` around occurrences of the given special tokens.
//
// The result lists, in order, the plain-text stretches between special tokens
// and the special tokens themselves; concatenating all pieces reproduces
// `text`. Empty plain-text stretches are not emitted. At each step the
// special token with the earliest occurrence is taken; when several tokens
// occur at the same position, the one listed first in `special_tokens` wins.
// Empty strings in `special_tokens` are ignored (an empty token would match
// everywhere without consuming any input).
std::vector<std::string> split_with_special_tokens(
    const std::string& text,
    const std::vector<std::string>& special_tokens) {
    std::vector<std::string> result;
    const size_t text_len = text.size();
    size_t pos            = 0;
    while (pos < text_len) {
        size_t next_pos            = text_len;
        const std::string* matched = nullptr;
        for (const auto& token : special_tokens) {
            if (token.empty()) {
                continue;
            }
            const size_t token_pos = text.find(token, pos);
            if (token_pos != std::string::npos && token_pos < next_pos) {
                next_pos = token_pos;
                matched  = &token;
            }
        }
        // Plain text before the match (or the whole remainder if none).
        if (next_pos > pos) {
            result.push_back(text.substr(pos, next_pos - pos));
        }
        if (matched == nullptr) {
            break;
        }
        result.push_back(*matched);
        pos = next_pos + matched->size();
    }
    return result;
}
// int main() {
// std::string text = "I'm testing C++ token_split function. 你好,世界! 123";
// auto tokens = token_split(text);
// for (const auto& t : tokens) {
// std::cout << "[" << t << "] ";
// }
// std::cout << "\n";
// return 0;
// }