mirror of
https://github.com/leejet/stable-diffusion.cpp.git
synced 2025-12-12 21:38:58 +00:00
988 lines
27 KiB
C++
988 lines
27 KiB
C++
#include <algorithm>
|
|
#include <iostream>
|
|
#include <string>
|
|
#include <vector>
|
|
|
|
#include "tokenize_util.h"
|
|
|
|
// Reports whether `ch` is one of the ASCII decimal digits '0'..'9'.
// No other numeric code points are recognised.
bool is_number(char32_t ch) {
    if (ch < U'0') {
        return false;
    }
    return ch <= U'9';
}
|
|
|
|
// Reports whether `ch` falls inside one of the letter ranges listed below.
//
// The table holds inclusive [start, end] code point ranges, sorted by `start`
// and non-overlapping, which is what allows the binary search at the bottom.
// Keep it sorted when editing. (The ranges are presumably generated from the
// Unicode \p{L} data -- verify against the generator before regenerating.)
bool is_letter(char32_t ch) {
    struct Range {
        char32_t start;
        char32_t end;
    };
    static constexpr Range ranges[] = {
        {0x41, 0x5A}, {0x61, 0x7A}, {0xAA, 0xAA}, {0xB5, 0xB5},
        {0xBA, 0xBA}, {0xC0, 0xD6}, {0xD8, 0xF6}, {0xF8, 0x2C1},
        {0x2C6, 0x2D1}, {0x2E0, 0x2E4}, {0x2EC, 0x2EC}, {0x2EE, 0x2EE},
        {0x370, 0x374}, {0x376, 0x377}, {0x37A, 0x37D}, {0x37F, 0x37F},
        {0x386, 0x386}, {0x388, 0x38A}, {0x38C, 0x38C}, {0x38E, 0x3A1},
        {0x3A3, 0x3F5}, {0x3F7, 0x481}, {0x48A, 0x52F}, {0x531, 0x556},
        {0x559, 0x559}, {0x560, 0x588}, {0x5D0, 0x5EA}, {0x5EF, 0x5F2},
        {0x620, 0x64A}, {0x66E, 0x66F}, {0x671, 0x6D3}, {0x6D5, 0x6D5},
        {0x6E5, 0x6E6}, {0x6EE, 0x6EF}, {0x6FA, 0x6FC}, {0x6FF, 0x6FF},
        {0x710, 0x710}, {0x712, 0x72F}, {0x74D, 0x7A5}, {0x7B1, 0x7B1},
        {0x7CA, 0x7EA}, {0x7F4, 0x7F5}, {0x7FA, 0x7FA}, {0x800, 0x815},
        {0x81A, 0x81A}, {0x824, 0x824}, {0x828, 0x828}, {0x840, 0x858},
        {0x860, 0x86A}, {0x870, 0x887}, {0x889, 0x88F}, {0x8A0, 0x8C9},
        {0x904, 0x939}, {0x93D, 0x93D}, {0x950, 0x950}, {0x958, 0x961},
        {0x971, 0x980}, {0x985, 0x98C}, {0x98F, 0x990}, {0x993, 0x9A8},
        {0x9AA, 0x9B0}, {0x9B2, 0x9B2}, {0x9B6, 0x9B9}, {0x9BD, 0x9BD},
        {0x9CE, 0x9CE}, {0x9DC, 0x9DD}, {0x9DF, 0x9E1}, {0x9F0, 0x9F1},
        {0x9FC, 0x9FC}, {0xA05, 0xA0A}, {0xA0F, 0xA10}, {0xA13, 0xA28},
        {0xA2A, 0xA30}, {0xA32, 0xA33}, {0xA35, 0xA36}, {0xA38, 0xA39},
        {0xA59, 0xA5C}, {0xA5E, 0xA5E}, {0xA72, 0xA74}, {0xA85, 0xA8D},
        {0xA8F, 0xA91}, {0xA93, 0xAA8}, {0xAAA, 0xAB0}, {0xAB2, 0xAB3},
        {0xAB5, 0xAB9}, {0xABD, 0xABD}, {0xAD0, 0xAD0}, {0xAE0, 0xAE1},
        {0xAF9, 0xAF9}, {0xB05, 0xB0C}, {0xB0F, 0xB10}, {0xB13, 0xB28},
        {0xB2A, 0xB30}, {0xB32, 0xB33}, {0xB35, 0xB39}, {0xB3D, 0xB3D},
        {0xB5C, 0xB5D}, {0xB5F, 0xB61}, {0xB71, 0xB71}, {0xB83, 0xB83},
        {0xB85, 0xB8A}, {0xB8E, 0xB90}, {0xB92, 0xB95}, {0xB99, 0xB9A},
        {0xB9C, 0xB9C}, {0xB9E, 0xB9F}, {0xBA3, 0xBA4}, {0xBA8, 0xBAA},
        {0xBAE, 0xBB9}, {0xBD0, 0xBD0}, {0xC05, 0xC0C}, {0xC0E, 0xC10},
        {0xC12, 0xC28}, {0xC2A, 0xC39}, {0xC3D, 0xC3D}, {0xC58, 0xC5A},
        {0xC5C, 0xC5D}, {0xC60, 0xC61}, {0xC80, 0xC80}, {0xC85, 0xC8C},
        {0xC8E, 0xC90}, {0xC92, 0xCA8}, {0xCAA, 0xCB3}, {0xCB5, 0xCB9},
        {0xCBD, 0xCBD}, {0xCDC, 0xCDE}, {0xCE0, 0xCE1}, {0xCF1, 0xCF2},
        {0xD04, 0xD0C}, {0xD0E, 0xD10}, {0xD12, 0xD3A}, {0xD3D, 0xD3D},
        {0xD4E, 0xD4E}, {0xD54, 0xD56}, {0xD5F, 0xD61}, {0xD7A, 0xD7F},
        {0xD85, 0xD96}, {0xD9A, 0xDB1}, {0xDB3, 0xDBB}, {0xDBD, 0xDBD},
        {0xDC0, 0xDC6}, {0xE01, 0xE30}, {0xE32, 0xE33}, {0xE40, 0xE46},
        {0xE81, 0xE82}, {0xE84, 0xE84}, {0xE86, 0xE8A}, {0xE8C, 0xEA3},
        {0xEA5, 0xEA5}, {0xEA7, 0xEB0}, {0xEB2, 0xEB3}, {0xEBD, 0xEBD},
        {0xEC0, 0xEC4}, {0xEC6, 0xEC6}, {0xEDC, 0xEDF}, {0xF00, 0xF00},
        {0xF40, 0xF47}, {0xF49, 0xF6C}, {0xF88, 0xF8C}, {0x1000, 0x102A},
        {0x103F, 0x103F}, {0x1050, 0x1055}, {0x105A, 0x105D}, {0x1061, 0x1061},
        {0x1065, 0x1066}, {0x106E, 0x1070}, {0x1075, 0x1081}, {0x108E, 0x108E},
        {0x10A0, 0x10C5}, {0x10C7, 0x10C7}, {0x10CD, 0x10CD}, {0x10D0, 0x10FA},
        {0x10FC, 0x1248}, {0x124A, 0x124D}, {0x1250, 0x1256}, {0x1258, 0x1258},
        {0x125A, 0x125D}, {0x1260, 0x1288}, {0x128A, 0x128D}, {0x1290, 0x12B0},
        {0x12B2, 0x12B5}, {0x12B8, 0x12BE}, {0x12C0, 0x12C0}, {0x12C2, 0x12C5},
        {0x12C8, 0x12D6}, {0x12D8, 0x1310}, {0x1312, 0x1315}, {0x1318, 0x135A},
        {0x1380, 0x138F}, {0x13A0, 0x13F5}, {0x13F8, 0x13FD}, {0x1401, 0x166C},
        {0x166F, 0x167F}, {0x1681, 0x169A}, {0x16A0, 0x16EA}, {0x16F1, 0x16F8},
        {0x1700, 0x1711}, {0x171F, 0x1731}, {0x1740, 0x1751}, {0x1760, 0x176C},
        {0x176E, 0x1770}, {0x1780, 0x17B3}, {0x17D7, 0x17D7}, {0x17DC, 0x17DC},
        {0x1820, 0x1878}, {0x1880, 0x1884}, {0x1887, 0x18A8}, {0x18AA, 0x18AA},
        {0x18B0, 0x18F5}, {0x1900, 0x191E}, {0x1950, 0x196D}, {0x1970, 0x1974},
        {0x1980, 0x19AB}, {0x19B0, 0x19C9}, {0x1A00, 0x1A16}, {0x1A20, 0x1A54},
        {0x1AA7, 0x1AA7}, {0x1B05, 0x1B33}, {0x1B45, 0x1B4C}, {0x1B83, 0x1BA0},
        {0x1BAE, 0x1BAF}, {0x1BBA, 0x1BE5}, {0x1C00, 0x1C23}, {0x1C4D, 0x1C4F},
        {0x1C5A, 0x1C7D}, {0x1C80, 0x1C8A}, {0x1C90, 0x1CBA}, {0x1CBD, 0x1CBF},
        {0x1CE9, 0x1CEC}, {0x1CEE, 0x1CF3}, {0x1CF5, 0x1CF6}, {0x1CFA, 0x1CFA},
        {0x1D00, 0x1DBF}, {0x1E00, 0x1F15}, {0x1F18, 0x1F1D}, {0x1F20, 0x1F45},
        {0x1F48, 0x1F4D}, {0x1F50, 0x1F57}, {0x1F59, 0x1F59}, {0x1F5B, 0x1F5B},
        {0x1F5D, 0x1F5D}, {0x1F5F, 0x1F7D}, {0x1F80, 0x1FB4}, {0x1FB6, 0x1FBC},
        {0x1FBE, 0x1FBE}, {0x1FC2, 0x1FC4}, {0x1FC6, 0x1FCC}, {0x1FD0, 0x1FD3},
        {0x1FD6, 0x1FDB}, {0x1FE0, 0x1FEC}, {0x1FF2, 0x1FF4}, {0x1FF6, 0x1FFC},
        {0x2071, 0x2071}, {0x207F, 0x207F}, {0x2090, 0x209C}, {0x2102, 0x2102},
        {0x2107, 0x2107}, {0x210A, 0x2113}, {0x2115, 0x2115}, {0x2119, 0x211D},
        {0x2124, 0x2124}, {0x2126, 0x2126}, {0x2128, 0x2128}, {0x212A, 0x212D},
        {0x212F, 0x2139}, {0x213C, 0x213F}, {0x2145, 0x2149}, {0x214E, 0x214E},
        {0x2183, 0x2184}, {0x2C00, 0x2CE4}, {0x2CEB, 0x2CEE}, {0x2CF2, 0x2CF3},
        {0x2D00, 0x2D25}, {0x2D27, 0x2D27}, {0x2D2D, 0x2D2D}, {0x2D30, 0x2D67},
        {0x2D6F, 0x2D6F}, {0x2D80, 0x2D96}, {0x2DA0, 0x2DA6}, {0x2DA8, 0x2DAE},
        {0x2DB0, 0x2DB6}, {0x2DB8, 0x2DBE}, {0x2DC0, 0x2DC6}, {0x2DC8, 0x2DCE},
        {0x2DD0, 0x2DD6}, {0x2DD8, 0x2DDE}, {0x2E2F, 0x2E2F}, {0x3005, 0x3006},
        {0x3031, 0x3035}, {0x303B, 0x303C}, {0x3041, 0x3096}, {0x309D, 0x309F},
        {0x30A1, 0x30FA}, {0x30FC, 0x30FF}, {0x3105, 0x312F}, {0x3131, 0x318E},
        {0x31A0, 0x31BF}, {0x31F0, 0x31FF}, {0x3400, 0x4DBF}, {0x4E00, 0xA48C},
        {0xA4D0, 0xA4FD}, {0xA500, 0xA60C}, {0xA610, 0xA61F}, {0xA62A, 0xA62B},
        {0xA640, 0xA66E}, {0xA67F, 0xA69D}, {0xA6A0, 0xA6E5}, {0xA717, 0xA71F},
        {0xA722, 0xA788}, {0xA78B, 0xA7DC}, {0xA7F1, 0xA801}, {0xA803, 0xA805},
        {0xA807, 0xA80A}, {0xA80C, 0xA822}, {0xA840, 0xA873}, {0xA882, 0xA8B3},
        {0xA8F2, 0xA8F7}, {0xA8FB, 0xA8FB}, {0xA8FD, 0xA8FE}, {0xA90A, 0xA925},
        {0xA930, 0xA946}, {0xA960, 0xA97C}, {0xA984, 0xA9B2}, {0xA9CF, 0xA9CF},
        {0xA9E0, 0xA9E4}, {0xA9E6, 0xA9EF}, {0xA9FA, 0xA9FE}, {0xAA00, 0xAA28},
        {0xAA40, 0xAA42}, {0xAA44, 0xAA4B}, {0xAA60, 0xAA76}, {0xAA7A, 0xAA7A},
        {0xAA7E, 0xAAAF}, {0xAAB1, 0xAAB1}, {0xAAB5, 0xAAB6}, {0xAAB9, 0xAABD},
        {0xAAC0, 0xAAC0}, {0xAAC2, 0xAAC2}, {0xAADB, 0xAADD}, {0xAAE0, 0xAAEA},
        {0xAAF2, 0xAAF4}, {0xAB01, 0xAB06}, {0xAB09, 0xAB0E}, {0xAB11, 0xAB16},
        {0xAB20, 0xAB26}, {0xAB28, 0xAB2E}, {0xAB30, 0xAB5A}, {0xAB5C, 0xAB69},
        {0xAB70, 0xABE2}, {0xAC00, 0xD7A3}, {0xD7B0, 0xD7C6}, {0xD7CB, 0xD7FB},
        {0xF900, 0xFA6D}, {0xFA70, 0xFAD9}, {0xFB00, 0xFB06}, {0xFB13, 0xFB17},
        {0xFB1D, 0xFB1D}, {0xFB1F, 0xFB28}, {0xFB2A, 0xFB36}, {0xFB38, 0xFB3C},
        {0xFB3E, 0xFB3E}, {0xFB40, 0xFB41}, {0xFB43, 0xFB44}, {0xFB46, 0xFBB1},
        {0xFBD3, 0xFD3D}, {0xFD50, 0xFD8F}, {0xFD92, 0xFDC7}, {0xFDF0, 0xFDFB},
        {0xFE70, 0xFE74}, {0xFE76, 0xFEFC}, {0xFF21, 0xFF3A}, {0xFF41, 0xFF5A},
        {0xFF66, 0xFFBE}, {0xFFC2, 0xFFC7}, {0xFFCA, 0xFFCF}, {0xFFD2, 0xFFD7},
        {0xFFDA, 0xFFDC}, {0x10000, 0x1000B}, {0x1000D, 0x10026}, {0x10028, 0x1003A},
        {0x1003C, 0x1003D}, {0x1003F, 0x1004D}, {0x10050, 0x1005D}, {0x10080, 0x100FA},
        {0x10280, 0x1029C}, {0x102A0, 0x102D0}, {0x10300, 0x1031F}, {0x1032D, 0x10340},
        {0x10342, 0x10349}, {0x10350, 0x10375}, {0x10380, 0x1039D}, {0x103A0, 0x103C3},
        {0x103C8, 0x103CF}, {0x10400, 0x1049D}, {0x104B0, 0x104D3}, {0x104D8, 0x104FB},
        {0x10500, 0x10527}, {0x10530, 0x10563}, {0x10570, 0x1057A}, {0x1057C, 0x1058A},
        {0x1058C, 0x10592}, {0x10594, 0x10595}, {0x10597, 0x105A1}, {0x105A3, 0x105B1},
        {0x105B3, 0x105B9}, {0x105BB, 0x105BC}, {0x105C0, 0x105F3}, {0x10600, 0x10736},
        {0x10740, 0x10755}, {0x10760, 0x10767}, {0x10780, 0x10785}, {0x10787, 0x107B0},
        {0x107B2, 0x107BA}, {0x10800, 0x10805}, {0x10808, 0x10808}, {0x1080A, 0x10835},
        {0x10837, 0x10838}, {0x1083C, 0x1083C}, {0x1083F, 0x10855}, {0x10860, 0x10876},
        {0x10880, 0x1089E}, {0x108E0, 0x108F2}, {0x108F4, 0x108F5}, {0x10900, 0x10915},
        {0x10920, 0x10939}, {0x10940, 0x10959}, {0x10980, 0x109B7}, {0x109BE, 0x109BF},
        {0x10A00, 0x10A00}, {0x10A10, 0x10A13}, {0x10A15, 0x10A17}, {0x10A19, 0x10A35},
        {0x10A60, 0x10A7C}, {0x10A80, 0x10A9C}, {0x10AC0, 0x10AC7}, {0x10AC9, 0x10AE4},
        {0x10B00, 0x10B35}, {0x10B40, 0x10B55}, {0x10B60, 0x10B72}, {0x10B80, 0x10B91},
        {0x10C00, 0x10C48}, {0x10C80, 0x10CB2}, {0x10CC0, 0x10CF2}, {0x10D00, 0x10D23},
        {0x10D4A, 0x10D65}, {0x10D6F, 0x10D85}, {0x10E80, 0x10EA9}, {0x10EB0, 0x10EB1},
        {0x10EC2, 0x10EC7}, {0x10F00, 0x10F1C}, {0x10F27, 0x10F27}, {0x10F30, 0x10F45},
        {0x10F70, 0x10F81}, {0x10FB0, 0x10FC4}, {0x10FE0, 0x10FF6}, {0x11003, 0x11037},
        {0x11071, 0x11072}, {0x11075, 0x11075}, {0x11083, 0x110AF}, {0x110D0, 0x110E8},
        {0x11103, 0x11126}, {0x11144, 0x11144}, {0x11147, 0x11147}, {0x11150, 0x11172},
        {0x11176, 0x11176}, {0x11183, 0x111B2}, {0x111C1, 0x111C4}, {0x111DA, 0x111DA},
        {0x111DC, 0x111DC}, {0x11200, 0x11211}, {0x11213, 0x1122B}, {0x1123F, 0x11240},
        {0x11280, 0x11286}, {0x11288, 0x11288}, {0x1128A, 0x1128D}, {0x1128F, 0x1129D},
        {0x1129F, 0x112A8}, {0x112B0, 0x112DE}, {0x11305, 0x1130C}, {0x1130F, 0x11310},
        {0x11313, 0x11328}, {0x1132A, 0x11330}, {0x11332, 0x11333}, {0x11335, 0x11339},
        {0x1133D, 0x1133D}, {0x11350, 0x11350}, {0x1135D, 0x11361}, {0x11380, 0x11389},
        {0x1138B, 0x1138B}, {0x1138E, 0x1138E}, {0x11390, 0x113B5}, {0x113B7, 0x113B7},
        {0x113D1, 0x113D1}, {0x113D3, 0x113D3}, {0x11400, 0x11434}, {0x11447, 0x1144A},
        {0x1145F, 0x11461}, {0x11480, 0x114AF}, {0x114C4, 0x114C5}, {0x114C7, 0x114C7},
        {0x11580, 0x115AE}, {0x115D8, 0x115DB}, {0x11600, 0x1162F}, {0x11644, 0x11644},
        {0x11680, 0x116AA}, {0x116B8, 0x116B8}, {0x11700, 0x1171A}, {0x11740, 0x11746},
        {0x11800, 0x1182B}, {0x118A0, 0x118DF}, {0x118FF, 0x11906}, {0x11909, 0x11909},
        {0x1190C, 0x11913}, {0x11915, 0x11916}, {0x11918, 0x1192F}, {0x1193F, 0x1193F},
        {0x11941, 0x11941}, {0x119A0, 0x119A7}, {0x119AA, 0x119D0}, {0x119E1, 0x119E1},
        {0x119E3, 0x119E3}, {0x11A00, 0x11A00}, {0x11A0B, 0x11A32}, {0x11A3A, 0x11A3A},
        {0x11A50, 0x11A50}, {0x11A5C, 0x11A89}, {0x11A9D, 0x11A9D}, {0x11AB0, 0x11AF8},
        {0x11BC0, 0x11BE0}, {0x11C00, 0x11C08}, {0x11C0A, 0x11C2E}, {0x11C40, 0x11C40},
        {0x11C72, 0x11C8F}, {0x11D00, 0x11D06}, {0x11D08, 0x11D09}, {0x11D0B, 0x11D30},
        {0x11D46, 0x11D46}, {0x11D60, 0x11D65}, {0x11D67, 0x11D68}, {0x11D6A, 0x11D89},
        {0x11D98, 0x11D98}, {0x11DB0, 0x11DDB}, {0x11EE0, 0x11EF2}, {0x11F02, 0x11F02},
        {0x11F04, 0x11F10}, {0x11F12, 0x11F33}, {0x11FB0, 0x11FB0}, {0x12000, 0x12399},
        {0x12480, 0x12543}, {0x12F90, 0x12FF0}, {0x13000, 0x1342F}, {0x13441, 0x13446},
        {0x13460, 0x143FA}, {0x14400, 0x14646}, {0x16100, 0x1611D}, {0x16800, 0x16A38},
        {0x16A40, 0x16A5E}, {0x16A70, 0x16ABE}, {0x16AD0, 0x16AED}, {0x16B00, 0x16B2F},
        {0x16B40, 0x16B43}, {0x16B63, 0x16B77}, {0x16B7D, 0x16B8F}, {0x16D40, 0x16D6C},
        {0x16E40, 0x16E7F}, {0x16EA0, 0x16EB8}, {0x16EBB, 0x16ED3}, {0x16F00, 0x16F4A},
        {0x16F50, 0x16F50}, {0x16F93, 0x16F9F}, {0x16FE0, 0x16FE1}, {0x16FE3, 0x16FE3},
        {0x16FF2, 0x16FF3}, {0x17000, 0x18CD5}, {0x18CFF, 0x18D1E}, {0x18D80, 0x18DF2},
        {0x1AFF0, 0x1AFF3}, {0x1AFF5, 0x1AFFB}, {0x1AFFD, 0x1AFFE}, {0x1B000, 0x1B122},
        {0x1B132, 0x1B132}, {0x1B150, 0x1B152}, {0x1B155, 0x1B155}, {0x1B164, 0x1B167},
        {0x1B170, 0x1B2FB}, {0x1BC00, 0x1BC6A}, {0x1BC70, 0x1BC7C}, {0x1BC80, 0x1BC88},
        {0x1BC90, 0x1BC99}, {0x1D400, 0x1D454}, {0x1D456, 0x1D49C}, {0x1D49E, 0x1D49F},
        {0x1D4A2, 0x1D4A2}, {0x1D4A5, 0x1D4A6}, {0x1D4A9, 0x1D4AC}, {0x1D4AE, 0x1D4B9},
        {0x1D4BB, 0x1D4BB}, {0x1D4BD, 0x1D4C3}, {0x1D4C5, 0x1D505}, {0x1D507, 0x1D50A},
        {0x1D50D, 0x1D514}, {0x1D516, 0x1D51C}, {0x1D51E, 0x1D539}, {0x1D53B, 0x1D53E},
        {0x1D540, 0x1D544}, {0x1D546, 0x1D546}, {0x1D54A, 0x1D550}, {0x1D552, 0x1D6A5},
        {0x1D6A8, 0x1D6C0}, {0x1D6C2, 0x1D6DA}, {0x1D6DC, 0x1D6FA}, {0x1D6FC, 0x1D714},
        {0x1D716, 0x1D734}, {0x1D736, 0x1D74E}, {0x1D750, 0x1D76E}, {0x1D770, 0x1D788},
        {0x1D78A, 0x1D7A8}, {0x1D7AA, 0x1D7C2}, {0x1D7C4, 0x1D7CB}, {0x1DF00, 0x1DF1E},
        {0x1DF25, 0x1DF2A}, {0x1E030, 0x1E06D}, {0x1E100, 0x1E12C}, {0x1E137, 0x1E13D},
        {0x1E14E, 0x1E14E}, {0x1E290, 0x1E2AD}, {0x1E2C0, 0x1E2EB}, {0x1E4D0, 0x1E4EB},
        {0x1E5D0, 0x1E5ED}, {0x1E5F0, 0x1E5F0}, {0x1E6C0, 0x1E6DE}, {0x1E6E0, 0x1E6E2},
        {0x1E6E4, 0x1E6E5}, {0x1E6E7, 0x1E6ED}, {0x1E6F0, 0x1E6F4}, {0x1E6FE, 0x1E6FF},
        {0x1E7E0, 0x1E7E6}, {0x1E7E8, 0x1E7EB}, {0x1E7ED, 0x1E7EE}, {0x1E7F0, 0x1E7FE},
        {0x1E800, 0x1E8C4}, {0x1E900, 0x1E943}, {0x1E94B, 0x1E94B}, {0x1EE00, 0x1EE03},
        {0x1EE05, 0x1EE1F}, {0x1EE21, 0x1EE22}, {0x1EE24, 0x1EE24}, {0x1EE27, 0x1EE27},
        {0x1EE29, 0x1EE32}, {0x1EE34, 0x1EE37}, {0x1EE39, 0x1EE39}, {0x1EE3B, 0x1EE3B},
        {0x1EE42, 0x1EE42}, {0x1EE47, 0x1EE47}, {0x1EE49, 0x1EE49}, {0x1EE4B, 0x1EE4B},
        {0x1EE4D, 0x1EE4F}, {0x1EE51, 0x1EE52}, {0x1EE54, 0x1EE54}, {0x1EE57, 0x1EE57},
        {0x1EE59, 0x1EE59}, {0x1EE5B, 0x1EE5B}, {0x1EE5D, 0x1EE5D}, {0x1EE5F, 0x1EE5F},
        {0x1EE61, 0x1EE62}, {0x1EE64, 0x1EE64}, {0x1EE67, 0x1EE6A}, {0x1EE6C, 0x1EE72},
        {0x1EE74, 0x1EE77}, {0x1EE79, 0x1EE7C}, {0x1EE7E, 0x1EE7E}, {0x1EE80, 0x1EE89},
        {0x1EE8B, 0x1EE9B}, {0x1EEA1, 0x1EEA3}, {0x1EEA5, 0x1EEA9}, {0x1EEAB, 0x1EEBB},
        {0x20000, 0x2A6DF}, {0x2A700, 0x2B81D}, {0x2B820, 0x2CEAD}, {0x2CEB0, 0x2EBE0},
        {0x2EBF0, 0x2EE5D}, {0x2F800, 0x2FA1D}, {0x30000, 0x3134A}, {0x31350, 0x33479},
    };

    const Range* const first = ranges;
    const Range* const last  = ranges + sizeof(ranges) / sizeof(ranges[0]);

    // Find the first range that starts strictly after `ch`; the only range
    // that can contain `ch` is the one immediately before it.
    const Range* const after = std::upper_bound(
        first, last, ch,
        [](char32_t value, const Range& r) { return value < r.start; });
    if (after == first) {
        return false;  // `ch` lies below the first range
    }
    return ch <= (after - 1)->end;
}
|
|
|
|
// Reports whether `cp` is a whitespace code point, i.e. one that the `\s`
// class of the tokenizer patterns is meant to match.
//
// The set is the Unicode White_Space property: the ASCII controls TAB..CR,
// U+0020, NEL (U+0085), every Space_Separator, and the line / paragraph
// separators U+2028 / U+2029. Zero-width characters such as U+200B are not
// whitespace and are deliberately absent.
bool is_space(char32_t cp) {
    switch (cp) {
        case 0x0009:  // TAB \t
        case 0x000A:  // LF \n
        case 0x000B:  // VT
        case 0x000C:  // FF
        case 0x000D:  // CR \r
        case 0x0020:  // Space
        case 0x0085:  // Next Line (NEL)
        case 0x00A0:  // No-Break Space
        case 0x1680:  // Ogham Space Mark
        case 0x2000:  // En Quad
        case 0x2001:  // Em Quad
        case 0x2002:  // En Space
        case 0x2003:  // Em Space
        case 0x2004:  // Three-Per-Em Space
        case 0x2005:  // Four-Per-Em Space
        case 0x2006:  // Six-Per-Em Space
        case 0x2007:  // Figure Space
        case 0x2008:  // Punctuation Space
        case 0x2009:  // Thin Space
        case 0x200A:  // Hair Space
        case 0x2028:  // Line Separator
        case 0x2029:  // Paragraph Separator
        case 0x202F:  // Narrow No-Break Space
        case 0x205F:  // Medium Mathematical Space
        case 0x3000:  // Ideographic Space
            return true;
        default:
            return false;
    }
}
|
|
|
|
// Returns a copy of `input` with the ASCII letters 'A'..'Z' folded to
// 'a'..'z'. Every other byte is copied through untouched.
//
// The folding is intentionally ASCII-only and locale-independent: the strings
// handled in this file are UTF-8, and a locale-aware per-byte tolower() could
// rewrite lead or continuation bytes of a multi-byte sequence.
std::string str_to_lower(const std::string& input) {
    std::string result = input;
    for (char& c : result) {
        if (c >= 'A' && c <= 'Z') {
            c = static_cast<char>(c - 'A' + 'a');
        }
    }
    return result;
}
|
|
|
|
// UTF-8 -> Unicode code points
|
|
// Decodes the UTF-8 byte string `str` into a sequence of code points.
//
// Malformed input is dropped one byte at a time so decoding resynchronizes on
// the next valid sequence:
//   - a byte that cannot start a sequence (stray continuation byte, 0xF8..0xFF)
//     is skipped;
//   - a lead byte whose sequence is truncated, or whose following bytes are not
//     all continuation bytes (10xxxxxx), is skipped on its own, so the bytes
//     after it are decoded normally instead of being swallowed.
// Overlong encodings, surrogates and values above U+10FFFF are not rejected.
std::vector<char32_t> utf8_to_codepoints(const std::string& str) {
    std::vector<char32_t> codepoints;
    const size_t n = str.size();
    codepoints.reserve(n);  // upper bound: one code point per byte

    size_t i = 0;
    while (i < n) {
        const unsigned char lead = static_cast<unsigned char>(str[i]);
        char32_t cp              = 0;
        size_t extra_bytes       = 0;

        if ((lead & 0x80) == 0) {
            cp = lead;
        } else if ((lead & 0xE0) == 0xC0) {
            cp          = lead & 0x1F;
            extra_bytes = 1;
        } else if ((lead & 0xF0) == 0xE0) {
            cp          = lead & 0x0F;
            extra_bytes = 2;
        } else if ((lead & 0xF8) == 0xF0) {
            cp          = lead & 0x07;
            extra_bytes = 3;
        } else {
            ++i;  // not a valid lead byte
            continue;
        }

        // `i < n` holds here, so `n - i` cannot underflow.
        bool well_formed = extra_bytes < n - i;
        for (size_t j = 1; well_formed && j <= extra_bytes; ++j) {
            const unsigned char cont = static_cast<unsigned char>(str[i + j]);
            if ((cont & 0xC0) != 0x80) {
                well_formed = false;
            } else {
                cp = (cp << 6) | (cont & 0x3F);
            }
        }

        if (!well_formed) {
            ++i;  // drop only the lead byte and resynchronize
            continue;
        }

        codepoints.push_back(cp);
        i += 1 + extra_bytes;
    }
    return codepoints;
}
|
|
|
|
// Unicode code point -> UTF-8
|
|
// Encodes a single code point as a UTF-8 byte string of one to four bytes.
// The value is not validated: anything above 0xFFFF, including values beyond
// U+10FFFF, takes the four-byte form with the lead byte truncated to `char`.
std::string codepoint_to_utf8(char32_t cp) {
    // Builds one trailing byte (10xxxxxx) from the low six bits of `bits`.
    const auto trail = [](char32_t bits) {
        return static_cast<char>(0x80 | (bits & 0x3F));
    };

    if (cp <= 0x7F) {
        return std::string(1, static_cast<char>(cp));
    }
    if (cp <= 0x7FF) {
        return std::string{static_cast<char>(0xC0 | (cp >> 6)), trail(cp)};
    }
    if (cp <= 0xFFFF) {
        return std::string{static_cast<char>(0xE0 | (cp >> 12)), trail(cp >> 6), trail(cp)};
    }
    return std::string{static_cast<char>(0xF0 | (cp >> 18)), trail(cp >> 12), trail(cp >> 6), trail(cp)};
}
|
|
|
|
// Reports whether `prefix` occurs in `text` beginning at position `index`.
// An `index` past the end of `text`, or a `prefix` longer than what remains
// after `index`, yields false. An empty `prefix` matches at any index up to
// and including text.size().
bool starts_with(const std::vector<char32_t>& text,
                 const std::vector<char32_t>& prefix,
                 std::size_t index) {
    const std::size_t text_len   = text.size();
    const std::size_t prefix_len = prefix.size();

    // Bounds first; the subtraction is only evaluated once index <= text_len.
    if (index > text_len || text_len - index < prefix_len) {
        return false;
    }

    for (std::size_t k = 0; k < prefix_len; ++k) {
        if (text[index + k] != prefix[k]) {
            return false;
        }
    }
    return true;
}
|
|
|
|
// mistral: [^\r\n\p{L}\p{N}]?[\p{Lu}\p{Lt}\p{Lm}\p{Lo}\p{M}]*[\p{Ll}\p{Lm}\p{Lo}\p{M}]+|[^\r\n\p{L}\p{N}]?[\p{Lu}\p{Lt}\p{Lm}\p{Lo}\p{M}]+[\p{Ll}\p{Lm}\p{Lo}\p{M}]*|\p{N}| ?[^\s\p{L}\p{N}]+[\r\n/]*|\s*[\r\n]+|\s+(?!\S)|\s+
|
|
// qwen2: (?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+
|
|
std::vector<std::string> token_split(const std::string& text) {
|
|
std::vector<std::string> tokens;
|
|
auto cps = utf8_to_codepoints(text);
|
|
size_t i = 0;
|
|
|
|
while (i < cps.size()) {
|
|
char32_t cp = cps[i];
|
|
|
|
// `(?i:'s|'t|'re|'ve|'m|'ll|'d)`
|
|
if (cp == U'\'' && i + 1 < cps.size()) {
|
|
std::string next = str_to_lower(codepoint_to_utf8(cps[i + 1]));
|
|
if (next == "s" || next == "t" || next == "m") {
|
|
tokens.push_back("'" + next);
|
|
i += 2;
|
|
continue;
|
|
}
|
|
if (i + 2 < cps.size()) {
|
|
next += str_to_lower(codepoint_to_utf8(cps[i + 2]));
|
|
if (next == "re" || next == "ve" || next == "ll" || next == "d") {
|
|
tokens.push_back("'" + next);
|
|
i += 3;
|
|
continue;
|
|
}
|
|
}
|
|
}
|
|
|
|
// `\p{N}`
|
|
if (is_number(cp)) {
|
|
tokens.push_back(codepoint_to_utf8(cp));
|
|
++i;
|
|
continue;
|
|
}
|
|
|
|
// `[^\r\n\p{L}\p{N}]?\p{L}+`
|
|
{
|
|
// `[^\r\n\p{L}\p{N}]\p{L}+`
|
|
if (!is_letter(cp) && cp != U'\r' && cp != U'\n' && i + 1 < cps.size() && is_letter(cps[i + 1])) {
|
|
std::string token = codepoint_to_utf8(cp);
|
|
++i;
|
|
|
|
while (i < cps.size() && is_letter(cps[i])) {
|
|
token += codepoint_to_utf8(cps[i]);
|
|
++i;
|
|
}
|
|
tokens.push_back(token);
|
|
continue;
|
|
}
|
|
|
|
// `\p{L}+`
|
|
if (is_letter(cp)) {
|
|
std::string token = codepoint_to_utf8(cp);
|
|
++i;
|
|
while (i < cps.size() && is_letter(cps[i])) {
|
|
token += codepoint_to_utf8(cps[i]);
|
|
++i;
|
|
}
|
|
tokens.push_back(token);
|
|
continue;
|
|
}
|
|
}
|
|
|
|
// ` ?[^\s\p{L}\p{N}]+[\r\n]*`
|
|
{
|
|
// ` [^\s\p{L}\p{N}]+[\r\n]*`
|
|
if (cp == U' ' && i + 1 < cps.size() && !isspace(cps[i + 1]) && !is_letter(cps[i + 1]) && !is_number(cps[i + 1])) {
|
|
std::string token = codepoint_to_utf8(cp);
|
|
token += codepoint_to_utf8(cps[i + 1]);
|
|
i += 2;
|
|
|
|
while (i < cps.size() && !is_letter(cps[i]) && !is_number(cps[i]) && !isspace(cps[i])) {
|
|
token += codepoint_to_utf8(cps[i]);
|
|
++i;
|
|
}
|
|
|
|
while (i < cps.size() && (cps[i] == U'\r' || cps[i] == U'\n')) {
|
|
token += codepoint_to_utf8(cps[i]);
|
|
++i;
|
|
}
|
|
|
|
tokens.push_back(token);
|
|
continue;
|
|
}
|
|
|
|
// `[^\s\p{L}\p{N}]+[\r\n]*`
|
|
std::string token;
|
|
if (!is_letter(cps[i]) && !is_number(cps[i]) && !isspace(cps[i])) {
|
|
std::string token = codepoint_to_utf8(cp);
|
|
++i;
|
|
|
|
while (i < cps.size() && !is_letter(cps[i]) && !is_number(cps[i]) && !isspace(cps[i])) {
|
|
token += codepoint_to_utf8(cps[i]);
|
|
++i;
|
|
}
|
|
|
|
while (i < cps.size() && (cps[i] == U'\r' || cps[i] == U'\n')) {
|
|
token += codepoint_to_utf8(cps[i]);
|
|
++i;
|
|
}
|
|
|
|
tokens.push_back(token);
|
|
continue;
|
|
}
|
|
}
|
|
|
|
// `\s*[\r\n]+|\s+(?!\S)|\s+`
|
|
if (is_space(cp)) {
|
|
std::string token = codepoint_to_utf8(cp);
|
|
++i;
|
|
|
|
while (i < cps.size() && is_space(cps[i])) {
|
|
token += codepoint_to_utf8(cps[i]);
|
|
++i;
|
|
if (cps[i] == U'\r' || cps[i] == U'\n') {
|
|
break;
|
|
}
|
|
}
|
|
|
|
tokens.push_back(token);
|
|
continue;
|
|
}
|
|
|
|
// skip
|
|
++i;
|
|
}
|
|
|
|
return tokens;
|
|
}
|
|
|
|
// Splits `text` around every occurrence of any string in `special_tokens`.
//
// The result lists, in input order, the plain-text stretches between matches
// and the matched special tokens themselves; empty stretches are omitted. At
// each step the occurrence that starts earliest wins, and when several tokens
// start at the same position the one listed first in `special_tokens` wins.
// Empty strings in `special_tokens` are ignored: they would match at every
// position without consuming any input.
std::vector<std::string> split_with_special_tokens(
    const std::string& text,
    const std::vector<std::string>& special_tokens) {
    std::vector<std::string> result;
    const size_t text_len = text.size();
    size_t pos            = 0;

    while (pos < text_len) {
        size_t next_pos            = text_len;
        const std::string* matched = nullptr;

        for (const auto& token : special_tokens) {
            if (token.empty()) {
                continue;
            }
            const size_t token_pos = text.find(token, pos);
            if (token_pos != std::string::npos && token_pos < next_pos) {
                next_pos = token_pos;
                matched  = &token;
            }
        }

        // Plain text before the match (or the whole remainder if no match).
        if (next_pos > pos) {
            result.push_back(text.substr(pos, next_pos - pos));
        }

        if (matched == nullptr) {
            break;
        }
        result.push_back(*matched);
        pos = next_pos + matched->size();
    }

    return result;
}
|
|
|
|
// int main() {
|
|
// std::string text = "I'm testing C++ token_split function. 你好,世界! 123";
|
|
// auto tokens = token_split(text);
|
|
|
|
// for (const auto& t : tokens) {
|
|
// std::cout << "[" << t << "] ";
|
|
// }
|
|
// std::cout << "\n";
|
|
// return 0;
|
|
// }
|