Split quoted text into character-level tokens

remove debug logs
2026-02-04 19:03:35 +00:00 · 2025-12-08 02:28:29 +01:00 · 2025-12-08 02:28:29 +01:00 · 9f225e4e63
commit 9f225e4e63
parent fc8d85e133
1 changed files with 76 additions and 29 deletions
--- a/conditioner.hpp
+++ b/conditioner.hpp
@ -1648,46 +1648,91 @@ struct LLMEmbedder : public Conditioner {
        }
    }

-    std::tuple<std::vector<int>, std::vector<float>> tokenize(std::string text,
-                                                              std::pair<int, int> attn_range,
-                                                              size_t max_length = 0,
-                                                              bool padding      = false) {
+    std::tuple<std::vector<int>, std::vector<float>> tokenize(
+        std::string text,
+        std::pair<int, int> attn_range,
+        size_t max_length = 0,
+        bool padding      = false,
+        bool spell_quotes = false) {
        std::vector<std::pair<std::string, float>> parsed_attention;
        parsed_attention.emplace_back(text.substr(0, attn_range.first), 1.f);
+
        if (attn_range.second - attn_range.first > 0) {
-            auto new_parsed_attention = parse_prompt_attention(text.substr(attn_range.first, attn_range.second - attn_range.first));
-            parsed_attention.insert(parsed_attention.end(),
-                                    new_parsed_attention.begin(),
-                                    new_parsed_attention.end());
+            auto new_parsed_attention = parse_prompt_attention(
+                text.substr(attn_range.first, attn_range.second - attn_range.first));
+            parsed_attention.insert(
+                parsed_attention.end(),
+                new_parsed_attention.begin(),
+                new_parsed_attention.end());
        }
        parsed_attention.emplace_back(text.substr(attn_range.second), 1.f);
-        {
-            std::stringstream ss;
-            ss << "[";
-            for (const auto& item : parsed_attention) {
-                ss << "['" << item.first << "', " << item.second << "], ";
-            }
-            ss << "]";
-            LOG_DEBUG("parse '%s' to %s", text.c_str(), ss.str().c_str());
-        }
+
+        // {
+        //     std::stringstream ss;
+        //     ss << '[';
+        //     for (const auto& item : parsed_attention) {
+        //         ss << "['" << item.first << "', " << item.second << "], ";
+        //     }
+        //     ss << ']';
+        //     LOG_DEBUG("parse '%s' to %s", text.c_str(), ss.str().c_str());
+        // }

        std::vector<int> tokens;
        std::vector<float> weights;
+
        for (const auto& item : parsed_attention) {
            const std::string& curr_text = item.first;
            float curr_weight            = item.second;
-            std::vector<int> curr_tokens = tokenizer->tokenize(curr_text, nullptr);
-            tokens.insert(tokens.end(), curr_tokens.begin(), curr_tokens.end());
-            weights.insert(weights.end(), curr_tokens.size(), curr_weight);
+
+            if (spell_quotes) {
+                std::vector<std::string> parts;
+                bool in_quote = false;
+                std::string current_part;
+
+                for (char c : curr_text) {
+                    if (c == '\'') {
+                        if (!current_part.empty()) {
+                            parts.push_back(current_part);
+                            current_part.clear();
+                        }
+                        in_quote = !in_quote;
+                    } else {
+                        current_part += c;
+                        if (in_quote && current_part.size() == 1) {
+                            parts.push_back(current_part);
+                            current_part.clear();
+                        }
+                    }
+                }
+                if (!current_part.empty()) {
+                    parts.push_back(current_part);
+                }
+
+                for (const auto& part : parts) {
+                    if (part.empty())
+                        continue;
+                    if (part[0] == '\'' && part.back() == '\'') {
+                        std::string quoted_content = part.substr(1, part.size() - 2);
+                        for (char ch : quoted_content) {
+                            std::string char_str(1, ch);
+                            std::vector<int> char_tokens = tokenizer->tokenize(char_str, nullptr);
+                            tokens.insert(tokens.end(), char_tokens.begin(), char_tokens.end());
+                            weights.insert(weights.end(), char_tokens.size(), curr_weight);
+                        }
+                    } else {
+                        std::vector<int> part_tokens = tokenizer->tokenize(part, nullptr);
+                        tokens.insert(tokens.end(), part_tokens.begin(), part_tokens.end());
+                        weights.insert(weights.end(), part_tokens.size(), curr_weight);
+                    }
+                }
+            } else {
+                std::vector<int> curr_tokens = tokenizer->tokenize(curr_text, nullptr);
+                tokens.insert(tokens.end(), curr_tokens.begin(), curr_tokens.end());
+                weights.insert(weights.end(), curr_tokens.size(), curr_weight);
+            }
        }

        tokenizer->pad_tokens(tokens, weights, max_length, padding);
-
-        // for (int i = 0; i < tokens.size(); i++) {
-        //     std::cout << tokens[i] << ":" << weights[i] << ", " << i << std::endl;
-        // }
-        // std::cout << std::endl;
-
        return {tokens, weights};
    }

@ -1698,7 +1743,8 @@ struct LLMEmbedder : public Conditioner {
        std::vector<std::pair<int, ggml_tensor*>> image_embeds;
        std::pair<int, int> prompt_attn_range;
        int prompt_template_encode_start_idx = 34;
-        int max_length = 0;
+        int max_length                       = 0;
+        bool spell_quotes                    = false;
        std::set<int> out_layers;
        if (llm->enable_vision && conditioner_params.ref_images.size() > 0) {
            LOG_INFO("QwenImageEditPlusPipeline");
@ -1810,7 +1856,8 @@ struct LLMEmbedder : public Conditioner {
        } else if (sd_version_is_longcat(version)) {
            prompt_template_encode_start_idx = 36;
            // prompt_template_encode_end_idx = 5;
-            max_length = 512;
+            max_length   = 512;
+            spell_quotes = true;

            prompt = "<|im_start|>system\nAs an image captioning expert, generate a descriptive text prompt based on an image content, suitable for input to a text-to-image model.<|im_end|>\n<|im_start|>user\n";

@ -1831,7 +1878,7 @@ struct LLMEmbedder : public Conditioner {
            prompt += "<|im_end|>\n<|im_start|>assistant\n";
        }

-        auto tokens_and_weights = tokenize(prompt, prompt_attn_range, max_length, max_length > 0);
+        auto tokens_and_weights = tokenize(prompt, prompt_attn_range, max_length, max_length > 0, spell_quotes);
        auto& tokens            = std::get<0>(tokens_and_weights);
        auto& weights           = std::get<1>(tokens_and_weights);