feat: add left padding support to tokenizers (#1424)

This commit is contained in:
leejet 2026-04-15 23:17:47 +08:00 committed by GitHub
parent 9ac7b672c2
commit c41c5ded7a
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
2 changed files with 18 additions and 6 deletions

View File

@@ -107,13 +107,24 @@ void Tokenizer::pad_tokens(std::vector<int>& tokens,
     if (final_length > out_tokens.size()) {
         const size_t pad_count = final_length - out_tokens.size();
-        out_tokens.insert(out_tokens.end(), pad_count, PAD_TOKEN_ID);
+        if (pad_left) {
+            out_tokens.insert(out_tokens.begin(), pad_count, PAD_TOKEN_ID);
-        if (use_weights) {
-            out_weights.insert(out_weights.end(), pad_count, 1.0f);
-        }
-        if (use_mask) {
-            out_mask.insert(out_mask.end(), pad_count, 0.0f);
+            if (use_weights) {
+                out_weights.insert(out_weights.begin(), pad_count, 1.0f);
+            }
+            if (use_mask) {
+                out_mask.insert(out_mask.begin(), pad_count, 0.0f);
+            }
+        } else {
+            out_tokens.insert(out_tokens.end(), pad_count, PAD_TOKEN_ID);
+            if (use_weights) {
+                out_weights.insert(out_weights.end(), pad_count, 1.0f);
+            }
+            if (use_mask) {
+                out_mask.insert(out_mask.end(), pad_count, 0.0f);
+            }
+        }
     }
 };

View File

@@ -14,6 +14,7 @@ protected:
     std::vector<std::string> special_tokens;
     bool add_bos_token = false;
     bool add_eos_token = false;
+    bool pad_left = false;
     std::string end_of_word_suffix;
     virtual std::string decode_token(int token_id) const = 0;