feat: add left padding support to tokenizers (#1424)

leejet 2026-04-15 23:17:47 +08:00 committed by GitHub
parent 9ac7b672c2
commit c41c5ded7a
2 changed files with 18 additions and 6 deletions


@@ -107,6 +107,16 @@ void Tokenizer::pad_tokens(std::vector<int>& tokens,
     if (final_length > out_tokens.size()) {
         const size_t pad_count = final_length - out_tokens.size();
+        if (pad_left) {
+            out_tokens.insert(out_tokens.begin(), pad_count, PAD_TOKEN_ID);
+            if (use_weights) {
+                out_weights.insert(out_weights.begin(), pad_count, 1.0f);
+            }
+            if (use_mask) {
+                out_mask.insert(out_mask.begin(), pad_count, 0.0f);
+            }
+        } else {
             out_tokens.insert(out_tokens.end(), pad_count, PAD_TOKEN_ID);
             if (use_weights) {
@@ -116,6 +126,7 @@ void Tokenizer::pad_tokens(std::vector<int>& tokens,
             if (use_mask) {
                 out_mask.insert(out_mask.end(), pad_count, 0.0f);
             }
+        }
     }
 };
 const size_t single_length = std::max(min_length, tokens.size() + special_token_count);
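For context, the hunk above extends the padding helper so that, when pad_left is set, the padding tokens go at the front of the sequence (with mask values of 0.0f marking them as padding) instead of being appended as before. Left padding is commonly preferred with causal, decoder-only models so that the real tokens stay adjacent to the positions being generated. Below is a minimal standalone sketch of the same idea; it is not the repository's actual pad_tokens API, and the function name, pad_token_id value, and mask convention (1.0f = real token, 0.0f = padding) are assumptions for illustration only.

#include <cstdio>
#include <vector>

// Standalone sketch of left vs. right padding (hypothetical names, not the
// repository's actual pad_tokens signature).
static void pad_sketch(std::vector<int>& tokens,
                       std::vector<float>& mask,
                       size_t final_length,
                       int pad_token_id,
                       bool pad_left) {
    if (final_length <= tokens.size()) {
        return;  // nothing to pad
    }
    const size_t pad_count = final_length - tokens.size();
    if (pad_left) {
        // left padding: pad tokens go in front, mask marks them as padding
        tokens.insert(tokens.begin(), pad_count, pad_token_id);
        mask.insert(mask.begin(), pad_count, 0.0f);
    } else {
        // right padding: pad tokens are appended, as before this commit
        tokens.insert(tokens.end(), pad_count, pad_token_id);
        mask.insert(mask.end(), pad_count, 0.0f);
    }
}

int main() {
    std::vector<int> tokens = {101, 7592, 102};
    std::vector<float> mask(tokens.size(), 1.0f);
    pad_sketch(tokens, mask, 6, /*pad_token_id=*/0, /*pad_left=*/true);
    for (size_t i = 0; i < tokens.size(); i++) {
        printf("%d ", tokens[i]);  // prints: 0 0 0 101 7592 102
    }
    printf("\n");
    return 0;
}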


@@ -14,6 +14,7 @@ protected:
     std::vector<std::string> special_tokens;
     bool add_bos_token = false;
     bool add_eos_token = false;
+    bool pad_left = false;
     std::string end_of_word_suffix;
     virtual std::string decode_token(int token_id) const = 0;
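The header change adds a protected pad_left flag that defaults to false, so existing tokenizers keep right padding unless they opt in. A hedged sketch of how a concrete tokenizer might enable it is below; the stand-in base class and the subclass name are hypothetical, and only the protected fields mirrored from the hunk above come from the diff.

#include <string>
#include <vector>

// Minimal stand-in for the real Tokenizer base class (hypothetical; only the
// protected fields mirror the header hunk above).
class Tokenizer {
protected:
    std::vector<std::string> special_tokens;
    bool add_bos_token = false;
    bool add_eos_token = false;
    bool pad_left = false;  // new flag from this commit
    std::string end_of_word_suffix;
    virtual std::string decode_token(int token_id) const = 0;

public:
    virtual ~Tokenizer() = default;
    bool pads_left() const { return pad_left; }  // helper for this demo only
};

// Hypothetical subclass that opts into left padding in its constructor.
class LeftPaddingTokenizer : public Tokenizer {
public:
    LeftPaddingTokenizer() { pad_left = true; }

protected:
    std::string decode_token(int /*token_id*/) const override {
        return "";  // placeholder; a real tokenizer would map ids back to text
    }
};

int main() {
    LeftPaddingTokenizer tok;
    return tok.pads_left() ? 0 : 1;  // exit code 0 means left padding is enabled
}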