Compare commits


No commits in common. "742a7333c3a054a62630262496edb5cb1ebcd0ae" and "347710f68f6c6c8e243496957f056a4b9f271d24" have entirely different histories.

8 changed files with 16 additions and 212 deletions

View File

@@ -81,9 +81,7 @@ API and command-line option may change frequently.***
 - [`DPM++ 2M v2`](https://github.com/AUTOMATIC1111/stable-diffusion-webui/discussions/8457)
 - `DPM++ 2S a`
 - [`LCM`](https://github.com/AUTOMATIC1111/stable-diffusion-webui/issues/13952)
-- Cross-platform reproducibility
-  - `--rng cuda`, default, consistent with the `stable-diffusion-webui GPU RNG`
-  - `--rng cpu`, consistent with the `comfyui RNG`
+- Cross-platform reproducibility (`--rng cuda`, consistent with the `stable-diffusion-webui GPU RNG`)
 - Embedds generation parameters into png output as webui-compatible text string
 ## Quick Start
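
For context (not part of the diff): the reproducibility bullet above comes down to seeding a deterministic RNG backend. A minimal sketch against the `RNG` interface visible later in this compare (`manual_seed` plus `randn`), using the MT19937 backend whose removal this compare shows:

```cpp
// Illustration only: with a fixed seed, a deterministic RNG backend produces
// the same initial noise on every platform, which is what makes results
// reproducible across machines.
#include <cstdio>
#include "rng_mt19937.hpp"  // the --rng cpu backend removed by this compare

int main() {
    MT19937RNG rng;
    rng.manual_seed(42);                      // same seed in ...
    std::vector<float> noise = rng.randn(4);  // ... same four floats out, everywhere
    for (float v : noise)
        printf("%f\n", v);
    return 0;
}
```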

View File

@@ -278,30 +278,13 @@ struct FrozenCLIPEmbedderWithCustomWords : public Conditioner {
             const std::string& curr_text = item.first;
             float curr_weight = item.second;
             // printf(" %s: %f \n", curr_text.c_str(), curr_weight);
-            int32_t clean_index = 0;
-            if (curr_text == "BREAK" && curr_weight == -1.0f) {
-                // Pad token array up to chunk size at this point.
-                // TODO: This is a hardcoded chunk_len, like in stable-diffusion.cpp, make it a parameter for the future?
-                // Also, this is 75 instead of 77 to leave room for BOS and EOS tokens.
-                int padding_size = 75 - (tokens_acc % 75);
-                for (int j = 0; j < padding_size; j++) {
-                    clean_input_ids.push_back(tokenizer.EOS_TOKEN_ID);
-                    clean_index++;
-                }
-                // After padding, continue to the next iteration to process the following text as a new segment
-                tokens.insert(tokens.end(), clean_input_ids.begin(), clean_input_ids.end());
-                weights.insert(weights.end(), padding_size, curr_weight);
-                continue;
-            }
-            // Regular token, process normally
             std::vector<int> curr_tokens = tokenizer.encode(curr_text, on_new_token_cb);
+            int32_t clean_index = 0;
             for (uint32_t i = 0; i < curr_tokens.size(); i++) {
                 int token_id = curr_tokens[i];
-                if (token_id == image_token) {
+                if (token_id == image_token)
                     class_token_index.push_back(clean_index - 1);
-                } else {
+                else {
                     clean_input_ids.push_back(token_id);
                     clean_index++;
                 }
@@ -404,22 +387,6 @@ struct FrozenCLIPEmbedderWithCustomWords : public Conditioner {
         for (const auto& item : parsed_attention) {
             const std::string& curr_text = item.first;
             float curr_weight = item.second;
-            if (curr_text == "BREAK" && curr_weight == -1.0f) {
-                // Pad token array up to chunk size at this point.
-                // TODO: This is a hardcoded chunk_len, like in stable-diffusion.cpp, make it a parameter for the future?
-                // Also, this is 75 instead of 77 to leave room for BOS and EOS tokens.
-                size_t current_size = tokens.size();
-                size_t padding_size = (75 - (current_size % 75)) % 75;  // Ensure no negative padding
-                if (padding_size > 0) {
-                    LOG_DEBUG("BREAK token encountered, padding current chunk by %zu tokens.", padding_size);
-                    tokens.insert(tokens.end(), padding_size, tokenizer.EOS_TOKEN_ID);
-                    weights.insert(weights.end(), padding_size, 1.0f);
-                }
-                continue;  // Skip to the next item after handling BREAK
-            }
             std::vector<int> curr_tokens = tokenizer.encode(curr_text, on_new_token_cb);
             tokens.insert(tokens.end(), curr_tokens.begin(), curr_tokens.end());
             weights.insert(weights.end(), curr_tokens.size(), curr_weight);
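
For context (not part of the diff): both removed `BREAK` branches pad the running token list up to the next 75-token chunk boundary (77 per chunk minus the BOS and EOS slots). A standalone sketch of that arithmetic; note the trailing modulo from the second variant, which keeps an exact multiple of 75 from being padded by a whole extra chunk:

```cpp
// Standalone sketch of the removed padding arithmetic: round a token count up
// to the next 75-token chunk boundary (77 per chunk minus BOS/EOS).
#include <cstdio>

int main() {
    const int chunk_len = 75;  // hardcoded chunk length, as in the removed code
    for (int n_tokens : {0, 25, 75, 100}) {
        // Without the trailing "% chunk_len" (as in the first removed variant),
        // an exact multiple of 75 would be padded by a full extra chunk.
        int padding = (chunk_len - n_tokens % chunk_len) % chunk_len;
        printf("%3d tokens -> pad %2d -> %3d\n", n_tokens, padding, n_tokens + padding);
    }
    return 0;  // prints pads 0, 50, 0, 50
}
```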

View File

@@ -94,7 +94,7 @@ Options:
   -M, --mode                         run mode, one of [img_gen, vid_gen, upscale, convert], default: img_gen
   --type                             weight type (examples: f32, f16, q4_0, q4_1, q5_0, q5_1, q8_0, q2_K, q3_K, q4_K). If not specified, the default is the
                                      type of the weight file
-  --rng                              RNG, one of [std_default, cuda, cpu], default: cuda(sd-webui), cpu(comfyui)
+  --rng                              RNG, one of [std_default, cuda], default: cuda
   -s, --seed                         RNG seed (default: 42, use random seed for < 0)
   --sampling-method                  sampling method, one of [euler, euler_a, heun, dpm2, dpm++2s_a, dpm++2m, dpm++2mv2, ipndm, ipndm_v, lcm, ddim_trailing,
                                      tcd] (default: euler for Flux/SD3/Wan, euler_a otherwise)
@@ -103,7 +103,7 @@ Options:
                                      contain any quantized parameters, the at_runtime mode will be used; otherwise,
                                      immediately will be used.The immediately mode may have precision and
                                      compatibility issues with quantized parameters, but it usually offers faster inference
-                                     speed and, in some cases, lower memory usage. The at_runtime mode, on the other
+                                     speed and, in some cases, lower memory usageThe at_runtime mode, on the other
                                      hand, is exactly the opposite.
   --scheduler                        denoiser sigma scheduler, one of [discrete, karras, exponential, ays, gits, smoothstep, sgm_uniform, simple], default:
                                      discrete
@@ -119,4 +119,4 @@ Options:
   --vae-relative-tile-size           relative tile size for vae tiling, format [X]x[Y], in fraction of image size if < 1, in number of tiles per dim if >=1
                                      (overrides --vae-tile-size)
   --preview                          preview method. must be one of the following [none, proj, tae, vae] (default is none)
-```
+```

View File

@@ -1124,7 +1124,7 @@ void parse_args(int argc, const char** argv, SDParams& params) {
          on_type_arg},
         {"",
          "--rng",
-         "RNG, one of [std_default, cuda, cpu], default: cuda(sd-webui), cpu(comfyui)",
+         "RNG, one of [std_default, cuda], default: cuda",
          on_rng_arg},
         {"-s",
         "--seed",
@@ -1144,7 +1144,7 @@ void parse_args(int argc, const char** argv, SDParams& params) {
          "the way to apply LoRA, one of [auto, immediately, at_runtime], default is auto. "
          "In auto mode, if the model weights contain any quantized parameters, the at_runtime mode will be used; otherwise, immediately will be used."
          "The immediately mode may have precision and compatibility issues with quantized parameters, "
-         "but it usually offers faster inference speed and, in some cases, lower memory usage. "
+         "but it usually offers faster inference speed and, in some cases, lower memory usage"
          "The at_runtime mode, on the other hand, is exactly the opposite.",
          on_lora_apply_mode_arg},
         {"",

View File

@@ -1,147 +0,0 @@
-#ifndef __RNG_MT19937_HPP__
-#define __RNG_MT19937_HPP__
-
-#include <array>
-#include <cmath>
-#include <cstdint>
-#include <limits>
-#include <vector>
-
-#include "rng.hpp"
-
-// RNG imitating torch cpu randn on CPU.
-// Port from pytorch, original license: https://github.com/pytorch/pytorch/blob/d01a7b0241ed1c4cded7e7ca097249feb343f072/LICENSE
-// Ref: https://github.com/pytorch/pytorch/blob/d01a7b0241ed1c4cded7e7ca097249feb343f072/aten/src/ATen/core/TransformationHelper.h, for uniform_real
-// Ref: https://github.com/pytorch/pytorch/blob/d01a7b0241ed1c4cded7e7ca097249feb343f072/aten/src/ATen/native/cpu/DistributionTemplates.h, for normal_kernel/normal_fill/normal_fill_16
-// Ref: https://github.com/pytorch/pytorch/blob/d01a7b0241ed1c4cded7e7ca097249feb343f072/aten/src/ATen/core/MT19937RNGEngine.h, for mt19937_engine
-// Ref: https://github.com/pytorch/pytorch/blob/d01a7b0241ed1c4cded7e7ca097249feb343f072/aten/src/ATen/core/DistributionsHelper.h, for uniform_real_distribution/normal_distribution
-class MT19937RNG : public RNG {
-    static const int N = 624;
-    static const int M = 397;
-    static const uint32_t MATRIX_A = 0x9908b0dfU;
-    static const uint32_t UMASK = 0x80000000U;
-    static const uint32_t LMASK = 0x7fffffffU;
-
-    struct State {
-        uint64_t seed_;
-        int left_;
-        bool seeded_;
-        uint32_t next_;
-        std::array<uint32_t, N> state_;
-        bool has_next_gauss = false;
-        double next_gauss = 0.0f;
-    };
-    State s;
-
-    uint32_t mix_bits(uint32_t u, uint32_t v) { return (u & UMASK) | (v & LMASK); }
-
-    uint32_t twist(uint32_t u, uint32_t v) { return (mix_bits(u, v) >> 1) ^ ((v & 1) ? MATRIX_A : 0); }
-
-    void next_state() {
-        uint32_t* p = s.state_.data();
-        s.left_ = N;
-        s.next_ = 0;
-        for (int j = N - M + 1; --j; p++)
-            p[0] = p[M] ^ twist(p[0], p[1]);
-        for (int j = M; --j; p++)
-            p[0] = p[M - N] ^ twist(p[0], p[1]);
-        p[0] = p[M - N] ^ twist(p[0], s.state_[0]);
-    }
-
-    uint32_t rand_uint32() {
-        if (--s.left_ == 0)
-            next_state();
-        uint32_t y = s.state_[s.next_++];
-        y ^= (y >> 11);
-        y ^= (y << 7) & 0x9d2c5680U;
-        y ^= (y << 15) & 0xefc60000U;
-        y ^= (y >> 18);
-        return y;
-    }
-
-    uint64_t rand_uint64() {
-        uint64_t high = (uint64_t)rand_uint32();
-        uint64_t low = (uint64_t)rand_uint32();
-        return (high << 32) | low;
-    }
-
-    template <typename T, typename V>
-    T uniform_real(V val, T from, T to) {
-        constexpr auto MASK = static_cast<V>((static_cast<uint64_t>(1) << std::numeric_limits<T>::digits) - 1);
-        constexpr auto DIVISOR = static_cast<T>(1) / (static_cast<uint64_t>(1) << std::numeric_limits<T>::digits);
-        T x = (val & MASK) * DIVISOR;
-        return (x * (to - from) + from);
-    }
-
-    double normal_double_value(double mean, double std) {
-        if (s.has_next_gauss) {
-            s.has_next_gauss = false;
-            return s.next_gauss;
-        }
-        double u1 = uniform_real(rand_uint64(), 0., 1.);  // double
-        double u2 = uniform_real(rand_uint64(), 0., 1.);  // double
-        double r = std::sqrt(-2.0 * std::log1p(-u2));
-        double theta = 2.0 * 3.14159265358979323846 * u1;
-        double value = r * std::cos(theta) * std + mean;
-        s.next_gauss = r * std::sin(theta) * std + mean;
-        s.has_next_gauss = true;
-        return value;
-    }
-
-    void normal_fill_16(float* data, float mean, float std) {
-        for (int j = 0; j < 8; ++j) {
-            float u1 = 1.0f - data[j];
-            float u2 = data[j + 8];
-            float r = std::sqrt(-2.0f * std::log(u1));
-            float theta = 2.0f * 3.14159265358979323846 * u2;
-            data[j] = r * std::cos(theta) * std + mean;
-            data[j + 8] = r * std::sin(theta) * std + mean;
-        }
-    }
-
-    void randn(float* data, int64_t size, float mean = 0.0f, float std = 1.0f) {
-        if (size >= 16) {
-            for (int64_t i = 0; i < size; i++) {
-                data[i] = uniform_real(rand_uint32(), 0.f, 1.f);
-            }
-            for (int64_t i = 0; i < size - 15; i += 16) {
-                normal_fill_16(data + i, mean, std);
-            }
-            if (size % 16 != 0) {
-                // Recompute the last 16 values.
-                data = data + size - 16;
-                for (int64_t i = 0; i < 16; i++) {
-                    data[i] = uniform_real(rand_uint32(), 0.f, 1.f);
-                }
-                normal_fill_16(data, mean, std);
-            }
-        } else {
-            // Strange handling, hard to understand, but keeping it consistent with PyTorch.
-            for (int64_t i = 0; i < size; i++) {
-                data[i] = (float)normal_double_value(mean, std);
-            }
-        }
-    }
-
-public:
-    MT19937RNG(uint64_t seed = 0) { manual_seed(seed); }
-
-    void manual_seed(uint64_t seed) override {
-        s.seed_ = seed;
-        s.seeded_ = true;
-        s.state_[0] = (uint32_t)(seed & 0xffffffffU);
-        for (int j = 1; j < N; j++) {
-            uint32_t prev = s.state_[j - 1];
-            s.state_[j] = 1812433253U * (prev ^ (prev >> 30)) + j;
-        }
-        s.left_ = 1;
-        s.next_ = 0;
-        s.has_next_gauss = false;
-    }
-
-    std::vector<float> randn(uint32_t n) override {
-        std::vector<float> out;
-        out.resize(n);
-        randn((float*)out.data(), out.size());
-        return out;
-    }
-};
-
-#endif  // __RNG_MT19937_HPP__
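
For context (not part of the diff): `normal_fill_16` above is a vectorized Box-Muller transform. A standalone sketch of the same step; the `1 - u` flip mirrors the deleted code and keeps `std::log` away from zero:

```cpp
// Standalone Box-Muller sketch: two uniform draws in (0, 1) map to two
// independent standard normal samples.
#include <cmath>
#include <cstdio>

int main() {
    double u = 0.25, v = 0.75;  // stand-in uniform draws
    double u1 = 1.0 - u;        // flip so the log argument stays in (0, 1]
    double r = std::sqrt(-2.0 * std::log(u1));
    double theta = 2.0 * 3.14159265358979323846 * v;
    printf("z0 = %f, z1 = %f\n", r * std::cos(theta), r * std::sin(theta));
    return 0;
}
```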

View File

@@ -2,7 +2,6 @@
 #include "model.h"
 #include "rng.hpp"
-#include "rng_mt19937.hpp"
 #include "rng_philox.hpp"
 #include "stable-diffusion.h"
 #include "util.h"
@@ -201,8 +200,6 @@ public:
             rng = std::make_shared<STDDefaultRNG>();
         } else if (sd_ctx_params->rng_type == CUDA_RNG) {
             rng = std::make_shared<PhiloxRNG>();
-        } else if (sd_ctx_params->rng_type == CPU_RNG) {
-            rng = std::make_shared<MT19937RNG>();
         }
 
         ggml_log_set(ggml_log_callback_default, nullptr);
@@ -339,14 +336,10 @@ public:
         if (sd_ctx_params->lora_apply_mode == LORA_APPLY_AUTO) {
             bool have_quantized_weight = false;
-            if (wtype != GGML_TYPE_COUNT && ggml_is_quantized(wtype)) {
-                have_quantized_weight = true;
-            } else {
-                for (const auto& [type, _] : wtype_stat) {
-                    if (ggml_is_quantized(type)) {
-                        have_quantized_weight = true;
-                        break;
-                    }
+            for (const auto& [type, _] : wtype_stat) {
+                if (ggml_is_quantized(type)) {
+                    have_quantized_weight = true;
+                    break;
                 }
             }
 
             if (have_quantized_weight) {
@@ -2134,7 +2127,6 @@ enum sd_type_t str_to_sd_type(const char* str) {
 const char* rng_type_to_str[] = {
     "std_default",
     "cuda",
-    "cpu",
 };
 
 const char* sd_rng_type_name(enum rng_type_t rng_type) {

View File

@@ -31,7 +31,6 @@ extern "C" {
 enum rng_type_t {
     STD_DEFAULT_RNG,
     CUDA_RNG,
-    CPU_RNG,
     RNG_TYPE_COUNT
 };

View File

@@ -5,7 +5,6 @@
 #include <cstdarg>
 #include <fstream>
 #include <locale>
-#include <regex>
 #include <sstream>
 #include <string>
 #include <thread>
@@ -548,8 +547,6 @@ sd_image_f32_t clip_preprocess(sd_image_f32_t image, int target_width, int targe
 // (abc) - increases attention to abc by a multiplier of 1.1
 // (abc:3.12) - increases attention to abc by a multiplier of 3.12
 // [abc] - decreases attention to abc by a multiplier of 1.1
-// BREAK - separates the prompt into conceptually distinct parts for sequential processing
-// B - internal helper pattern; prevents 'B' in 'BREAK' from being consumed as normal text
 // \( - literal character '('
 // \[ - literal character '['
 // \) - literal character ')'
@@ -585,7 +582,7 @@ std::vector<std::pair<std::string, float>> parse_prompt_attention(const std::str
     float round_bracket_multiplier = 1.1f;
     float square_bracket_multiplier = 1 / 1.1f;
 
-    std::regex re_attention(R"(\\\(|\\\)|\\\[|\\\]|\\\\|\\|\(|\[|:([+-]?[.\d]+)\)|\)|\]|\bBREAK\b|[^\\()\[\]:B]+|:|\bB)");
-    std::regex re_break(R"(\s*\bBREAK\b\s*)");
+    std::regex re_attention(R"(\\\(|\\\)|\\\[|\\\]|\\\\|\\|\(|\[|:([+-]?[.\d]+)\)|\)|\]|[^\\()\[\]:]+|:)");
 
     auto multiply_range = [&](int start_position, float multiplier) {
@@ -594,7 +591,7 @@ std::vector<std::pair<std::string, float>> parse_prompt_attention(const std::str
         }
     };
 
-    std::smatch m, m2;
+    std::smatch m;
     std::string remaining_text = text;
 
     while (std::regex_search(remaining_text, m, re_attention)) {
@@ -618,8 +615,6 @@ std::vector<std::pair<std::string, float>> parse_prompt_attention(const std::str
             square_brackets.pop_back();
         } else if (text == "\\(") {
             res.push_back({text.substr(1), 1.0f});
-        } else if (std::regex_search(text, m2, re_break)) {
-            res.push_back({"BREAK", -1.0f});
         } else {
             res.push_back({text, 1.0f});
         }
@@ -650,4 +645,4 @@ std::vector<std::pair<std::string, float>> parse_prompt_attention(const std::str
     }
 
     return res;
-}
+}
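
For context (not part of the diff): a small driver makes the attention grammar documented above concrete. The declaration of `parse_prompt_attention` is assumed to be reachable via `util.h` (included elsewhere in this diff); the webui algorithm it ports may additionally merge adjacent pieces of equal weight.

```cpp
// Hypothetical driver for parse_prompt_attention; the declaration is assumed
// to live in util.h.
#include <cstdio>
#include "util.h"

int main() {
    auto res = parse_prompt_attention("a (cat) and [dog], (bird:1.5)");
    for (const auto& [piece, weight] : res)
        printf("\"%s\" -> %.3f\n", piece.c_str(), weight);
    // Expected, following the multiplier rules documented above:
    //   "a "    -> 1.000
    //   "cat"   -> 1.100   (round brackets: x1.1)
    //   " and " -> 1.000
    //   "dog"   -> 0.909   (square brackets: x1/1.1)
    //   ", "    -> 1.000
    //   "bird"  -> 1.500   (explicit (text:weight))
    return 0;
}
```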