// Declares BPETokenizer, a Tokenizer subclass (the name suggests byte-pair
// encoding) that holds several lookup maps and exposes encode().
//
// NOTE(review): in this copy of the file the text between angle brackets is
// not visible: the header names of the bare #include directives below and
// the template arguments of every std::map / std::vector are missing.
// Restore them from the authoritative file before compiling; the comments
// below describe only what is visible here.
//
// NOTE(review): the guard macro contains double underscores; identifiers of
// that form are reserved for the implementation by the C++ standard.
// Consider a non-reserved name after checking nothing else tests this macro.
#ifndef __SD_TOKENIZERS_BPE_TOKENIZER_H__
#define __SD_TOKENIZERS_BPE_TOKENIZER_H__

// Header names for the following directives are not visible in this copy.
#include
#include
#include
#include
#include
#include
#include
#include
#include

// Presumably declares the Tokenizer base class and on_new_token_cb_t, both
// of which are used below but not defined in this file -- TODO confirm.
#include "tokenizer.h"

// Tokenizer specialization; the declarations below are implemented elsewhere.
class BPETokenizer : public Tokenizer {
protected:
    // Lookup tables. Key/value types are not visible here; judging by the
    // names, byte_encoder/byte_decoder and encoder/decoder are forward and
    // reverse mappings of each other -- TODO confirm against the definition.
    std::map byte_encoder;
    std::map byte_decoder;
    std::map encoder;
    std::map decoder;

    // Map whose mapped value is int; the key type is only partly visible.
    // Presumably the rank (priority) of each merge rule -- TODO confirm.
    std::map, int> bpe_ranks;

    // Both counters start at zero. Presumably they track the number of
    // entries loaded into encoder and bpe_ranks -- verify against the
    // loading code.
    int encoder_len = 0;
    int bpe_len     = 0;

protected:
    // Static helper returning a vector (element type not visible). By its
    // name it presumably produces the data behind byte_encoder/byte_decoder
    // -- TODO confirm.
    static std::vector> bytes_to_unicode();

    // Static helper taking UTF-8-or-otherwise encoded `text` (encoding not
    // established here) and a char32_t `delimiter`, which defaults to
    // U'\n'. Presumably splits the text at the delimiter -- TODO confirm.
    static std::vector split_utf32(const std::string& text, char32_t delimiter = U'\n');

    // Virtual and const, so derived tokenizers can substitute their own
    // splitting of `text` without mutating the tokenizer.
    virtual std::vector token_split(const std::string& text) const;

    // Const member taking one UTF-32 `token` and returning a vector
    // (element type not visible). Presumably applies the merges ranked in
    // bpe_ranks -- TODO confirm.
    std::vector bpe(const std::u32string& token) const;

    // Overrides a virtual of the base class; returns a std::string for the
    // given `token_id`.
    std::string decode_token(int token_id) const override;

public:
    // Default construction leaves the maps empty and both counters at 0.
    BPETokenizer()          = default;
    virtual ~BPETokenizer() = default;

    // Overrides the base-class encode(). `on_new_token_cb` defaults to
    // nullptr, so callers may omit it; what the callback receives is defined
    // by on_new_token_cb_t, which is declared outside this file.
    // NOTE(review): default arguments on virtual functions are bound by the
    // static type at the call site -- keep this default in sync with the
    // base-class declaration.
    std::vector encode(const std::string& text, on_new_token_cb_t on_new_token_cb = nullptr) override;
};

#endif  // __SD_TOKENIZERS_BPE_TOKENIZER_H__