Compare commits

...

4 Commits

Author SHA1 Message Date
leejet
742a7333c3
feat: add cpu rng (#977) 2025-11-16 14:48:15 +08:00
Wagner Bruna
e8eb3791c8
fix: typo in --lora-apply-mode help (#972) 2025-11-16 14:48:00 +08:00
Wagner Bruna
aa44e06890
fix: avoid crash with LoRAs and type override (#974) 2025-11-16 14:47:36 +08:00
Daniele
6448430dbb
feat: add break pseudo token support (#422)
---------

Co-authored-by: Urs Ganse <urs.ganse@helsinki.fi>
2025-11-16 14:45:20 +08:00
8 changed files with 212 additions and 16 deletions

View File

@ -81,7 +81,9 @@ API and command-line option may change frequently.***
- [`DPM++ 2M v2`](https://github.com/AUTOMATIC1111/stable-diffusion-webui/discussions/8457) - [`DPM++ 2M v2`](https://github.com/AUTOMATIC1111/stable-diffusion-webui/discussions/8457)
- `DPM++ 2S a` - `DPM++ 2S a`
- [`LCM`](https://github.com/AUTOMATIC1111/stable-diffusion-webui/issues/13952) - [`LCM`](https://github.com/AUTOMATIC1111/stable-diffusion-webui/issues/13952)
- Cross-platform reproducibility (`--rng cuda`, consistent with the `stable-diffusion-webui GPU RNG`) - Cross-platform reproducibility
- `--rng cuda`, default, consistent with the `stable-diffusion-webui GPU RNG`
- `--rng cpu`, consistent with the `comfyui RNG`
- Embeds generation parameters into PNG output as a webui-compatible text string - Embeds generation parameters into PNG output as a webui-compatible text string
## Quick Start ## Quick Start

View File

@ -278,13 +278,30 @@ struct FrozenCLIPEmbedderWithCustomWords : public Conditioner {
const std::string& curr_text = item.first; const std::string& curr_text = item.first;
float curr_weight = item.second; float curr_weight = item.second;
// printf(" %s: %f \n", curr_text.c_str(), curr_weight); // printf(" %s: %f \n", curr_text.c_str(), curr_weight);
int32_t clean_index = 0;
if (curr_text == "BREAK" && curr_weight == -1.0f) {
// Pad token array up to chunk size at this point.
// TODO: This is a hardcoded chunk_len, like in stable-diffusion.cpp, make it a parameter for the future?
// Also, this is 75 instead of 77 to leave room for BOS and EOS tokens.
int padding_size = 75 - (tokens_acc % 75);
for (int j = 0; j < padding_size; j++) {
clean_input_ids.push_back(tokenizer.EOS_TOKEN_ID);
clean_index++;
}
// After padding, continue to the next iteration to process the following text as a new segment
tokens.insert(tokens.end(), clean_input_ids.begin(), clean_input_ids.end());
weights.insert(weights.end(), padding_size, curr_weight);
continue;
}
// Regular token, process normally
std::vector<int> curr_tokens = tokenizer.encode(curr_text, on_new_token_cb); std::vector<int> curr_tokens = tokenizer.encode(curr_text, on_new_token_cb);
int32_t clean_index = 0;
for (uint32_t i = 0; i < curr_tokens.size(); i++) { for (uint32_t i = 0; i < curr_tokens.size(); i++) {
int token_id = curr_tokens[i]; int token_id = curr_tokens[i];
if (token_id == image_token) if (token_id == image_token) {
class_token_index.push_back(clean_index - 1); class_token_index.push_back(clean_index - 1);
else { } else {
clean_input_ids.push_back(token_id); clean_input_ids.push_back(token_id);
clean_index++; clean_index++;
} }
@ -387,6 +404,22 @@ struct FrozenCLIPEmbedderWithCustomWords : public Conditioner {
for (const auto& item : parsed_attention) { for (const auto& item : parsed_attention) {
const std::string& curr_text = item.first; const std::string& curr_text = item.first;
float curr_weight = item.second; float curr_weight = item.second;
if (curr_text == "BREAK" && curr_weight == -1.0f) {
// Pad token array up to chunk size at this point.
// TODO: This is a hardcoded chunk_len, like in stable-diffusion.cpp, make it a parameter for the future?
// Also, this is 75 instead of 77 to leave room for BOS and EOS tokens.
size_t current_size = tokens.size();
size_t padding_size = (75 - (current_size % 75)) % 75; // Ensure no negative padding
if (padding_size > 0) {
LOG_DEBUG("BREAK token encountered, padding current chunk by %zu tokens.", padding_size);
tokens.insert(tokens.end(), padding_size, tokenizer.EOS_TOKEN_ID);
weights.insert(weights.end(), padding_size, 1.0f);
}
continue; // Skip to the next item after handling BREAK
}
std::vector<int> curr_tokens = tokenizer.encode(curr_text, on_new_token_cb); std::vector<int> curr_tokens = tokenizer.encode(curr_text, on_new_token_cb);
tokens.insert(tokens.end(), curr_tokens.begin(), curr_tokens.end()); tokens.insert(tokens.end(), curr_tokens.begin(), curr_tokens.end());
weights.insert(weights.end(), curr_tokens.size(), curr_weight); weights.insert(weights.end(), curr_tokens.size(), curr_weight);

View File

@ -94,7 +94,7 @@ Options:
-M, --mode run mode, one of [img_gen, vid_gen, upscale, convert], default: img_gen -M, --mode run mode, one of [img_gen, vid_gen, upscale, convert], default: img_gen
--type weight type (examples: f32, f16, q4_0, q4_1, q5_0, q5_1, q8_0, q2_K, q3_K, q4_K). If not specified, the default is the --type weight type (examples: f32, f16, q4_0, q4_1, q5_0, q5_1, q8_0, q2_K, q3_K, q4_K). If not specified, the default is the
type of the weight file type of the weight file
--rng RNG, one of [std_default, cuda], default: cuda --rng RNG, one of [std_default, cuda, cpu], default: cuda(sd-webui), cpu(comfyui)
-s, --seed RNG seed (default: 42, use random seed for < 0) -s, --seed RNG seed (default: 42, use random seed for < 0)
--sampling-method sampling method, one of [euler, euler_a, heun, dpm2, dpm++2s_a, dpm++2m, dpm++2mv2, ipndm, ipndm_v, lcm, ddim_trailing, --sampling-method sampling method, one of [euler, euler_a, heun, dpm2, dpm++2s_a, dpm++2m, dpm++2mv2, ipndm, ipndm_v, lcm, ddim_trailing,
tcd] (default: euler for Flux/SD3/Wan, euler_a otherwise) tcd] (default: euler for Flux/SD3/Wan, euler_a otherwise)
@ -103,7 +103,7 @@ Options:
contain any quantized parameters, the at_runtime mode will be used; otherwise, contain any quantized parameters, the at_runtime mode will be used; otherwise,
immediately will be used.The immediately mode may have precision and immediately will be used.The immediately mode may have precision and
compatibility issues with quantized parameters, but it usually offers faster inference compatibility issues with quantized parameters, but it usually offers faster inference
speed and, in some cases, lower memory usageThe at_runtime mode, on the other speed and, in some cases, lower memory usage. The at_runtime mode, on the other
hand, is exactly the opposite. hand, is exactly the opposite.
--scheduler denoiser sigma scheduler, one of [discrete, karras, exponential, ays, gits, smoothstep, sgm_uniform, simple], default: --scheduler denoiser sigma scheduler, one of [discrete, karras, exponential, ays, gits, smoothstep, sgm_uniform, simple], default:
discrete discrete
@ -119,4 +119,4 @@ Options:
--vae-relative-tile-size relative tile size for vae tiling, format [X]x[Y], in fraction of image size if < 1, in number of tiles per dim if >=1 --vae-relative-tile-size relative tile size for vae tiling, format [X]x[Y], in fraction of image size if < 1, in number of tiles per dim if >=1
(overrides --vae-tile-size) (overrides --vae-tile-size)
--preview preview method. must be one of the following [none, proj, tae, vae] (default is none) --preview preview method. must be one of the following [none, proj, tae, vae] (default is none)
``` ```

View File

@ -1124,7 +1124,7 @@ void parse_args(int argc, const char** argv, SDParams& params) {
on_type_arg}, on_type_arg},
{"", {"",
"--rng", "--rng",
"RNG, one of [std_default, cuda], default: cuda", "RNG, one of [std_default, cuda, cpu], default: cuda(sd-webui), cpu(comfyui)",
on_rng_arg}, on_rng_arg},
{"-s", {"-s",
"--seed", "--seed",
@ -1144,7 +1144,7 @@ void parse_args(int argc, const char** argv, SDParams& params) {
"the way to apply LoRA, one of [auto, immediately, at_runtime], default is auto. " "the way to apply LoRA, one of [auto, immediately, at_runtime], default is auto. "
"In auto mode, if the model weights contain any quantized parameters, the at_runtime mode will be used; otherwise, immediately will be used." "In auto mode, if the model weights contain any quantized parameters, the at_runtime mode will be used; otherwise, immediately will be used."
"The immediately mode may have precision and compatibility issues with quantized parameters, " "The immediately mode may have precision and compatibility issues with quantized parameters, "
"but it usually offers faster inference speed and, in some cases, lower memory usage" "but it usually offers faster inference speed and, in some cases, lower memory usage. "
"The at_runtime mode, on the other hand, is exactly the opposite.", "The at_runtime mode, on the other hand, is exactly the opposite.",
on_lora_apply_mode_arg}, on_lora_apply_mode_arg},
{"", {"",

147
rng_mt19937.hpp Normal file
View File

@ -0,0 +1,147 @@
#ifndef __RNG_MT19937_HPP__
#define __RNG_MT19937_HPP__
#include <array>
#include <cmath>
#include <cstdint>
#include <limits>
#include <vector>

#include "rng.hpp"
// RNG imitating PyTorch's CPU randn.
// Port from pytorch, original license: https://github.com/pytorch/pytorch/blob/d01a7b0241ed1c4cded7e7ca097249feb343f072/LICENSE
// Ref: https://github.com/pytorch/pytorch/blob/d01a7b0241ed1c4cded7e7ca097249feb343f072/aten/src/ATen/core/TransformationHelper.h, for uniform_real
// Ref: https://github.com/pytorch/pytorch/blob/d01a7b0241ed1c4cded7e7ca097249feb343f072/aten/src/ATen/native/cpu/DistributionTemplates.h, for normal_kernel/normal_fill/normal_fill_16
// Ref: https://github.com/pytorch/pytorch/blob/d01a7b0241ed1c4cded7e7ca097249feb343f072/aten/src/ATen/core/MT19937RNGEngine.h, for mt19937_engine
// Ref: https://github.com/pytorch/pytorch/blob/d01a7b0241ed1c4cded7e7ca097249feb343f072/aten/src/ATen/core/DistributionsHelper.h, for uniform_real_distribution/normal_distribution
/**
 * Normal-distribution sampler driven by a 32-bit Mersenne Twister (MT19937).
 *
 * The implementation is a port of the PyTorch CPU code paths referenced in the
 * header comment of this file, so the order in which raw 32/64-bit words are
 * drawn and the exact floating-point expressions are deliberately kept as-is:
 * changing either would change the generated sequence for a given seed.
 *
 * Public interface (inherited from RNG):
 *   - manual_seed(seed): re-initialise the generator state.
 *   - randn(n): return n samples from N(0, 1) as floats.
 */
class MT19937RNG : public RNG {
    // MT19937 parameters.
    static constexpr int N               = 624;          // number of 32-bit words in the state
    static constexpr int M               = 397;          // offset of the word mixed in by the recurrence
    static constexpr uint32_t MATRIX_A   = 0x9908b0dfU;  // XOR-ed in when the low bit of the source word is set
    static constexpr uint32_t UMASK      = 0x80000000U;  // most significant bit
    static constexpr uint32_t LMASK      = 0x7fffffffU;  // least significant 31 bits
    static constexpr double PI           = 3.14159265358979323846;

    // Complete generator state. Every field has a default so that a State is
    // never observed with indeterminate values; manual_seed() overwrites them.
    struct State {
        uint64_t seed_ = 0;                // seed passed to the last manual_seed()
        int left_      = 1;                // words left before the state must be regenerated
        bool seeded_   = false;            // set once manual_seed() has run
        uint32_t next_ = 0;                // index of the next word to hand out
        std::array<uint32_t, N> state_{};  // the N-word twister state
        bool has_next_gauss = false;       // true when next_gauss holds a cached sample
        double next_gauss   = 0.0;         // second Box-Muller sample kept for the next call
    };

    State s;

    // Top bit of `u` combined with the low 31 bits of `v`.
    static uint32_t mix_bits(uint32_t u, uint32_t v) {
        return (u & UMASK) | (v & LMASK);
    }

    // One step of the MT recurrence for the word pair (u, v).
    static uint32_t twist(uint32_t u, uint32_t v) {
        return (mix_bits(u, v) >> 1) ^ ((v & 1) ? MATRIX_A : 0);
    }

    // Regenerates all N state words in place and rewinds the read cursor.
    void next_state() {
        auto& st = s.state_;
        s.left_  = N;
        s.next_  = 0;

        // Words [0, N-M): the partner word lies M positions ahead.
        for (int i = 0; i < N - M; ++i) {
            st[i] = st[i + M] ^ twist(st[i], st[i + 1]);
        }
        // Words [N-M, N-1): the partner word wraps around to the (already updated) front.
        for (int i = N - M; i < N - 1; ++i) {
            st[i] = st[i + M - N] ^ twist(st[i], st[i + 1]);
        }
        // Last word: its successor wraps to st[0].
        st[N - 1] = st[M - 1] ^ twist(st[N - 1], st[0]);
    }

    // Returns the next tempered 32-bit output word.
    uint32_t rand_uint32() {
        if (--s.left_ == 0) {
            next_state();
        }
        uint32_t y = s.state_[s.next_++];
        y ^= (y >> 11);
        y ^= (y << 7) & 0x9d2c5680U;
        y ^= (y << 15) & 0xefc60000U;
        y ^= (y >> 18);
        return y;
    }

    // Returns 64 random bits; the first 32-bit draw becomes the high half.
    uint64_t rand_uint64() {
        const uint64_t high = static_cast<uint64_t>(rand_uint32());
        const uint64_t low  = static_cast<uint64_t>(rand_uint32());
        return (high << 32) | low;
    }

    // Maps raw bits `val` to a value of type T in [from, to): only the low
    // numeric_limits<T>::digits bits are kept and scaled by 2^-digits.
    template <typename T, typename V>
    T uniform_real(V val, T from, T to) {
        constexpr auto MASK    = static_cast<V>((static_cast<uint64_t>(1) << std::numeric_limits<T>::digits) - 1);
        constexpr auto DIVISOR = static_cast<T>(1) / (static_cast<uint64_t>(1) << std::numeric_limits<T>::digits);
        T x                    = (val & MASK) * DIVISOR;
        return (x * (to - from) + from);
    }

    // Double-precision Box-Muller sample. Each fresh computation consumes two
    // 64-bit draws and produces two samples; the second one is cached (already
    // scaled by stddev/mean) and returned by the following call.
    double normal_double_value(double mean, double stddev) {
        if (s.has_next_gauss) {
            s.has_next_gauss = false;
            return s.next_gauss;
        }
        double u1    = uniform_real(rand_uint64(), 0., 1.);  // drawn first
        double u2    = uniform_real(rand_uint64(), 0., 1.);  // drawn second
        double r     = std::sqrt(-2.0 * std::log1p(-u2));
        double theta = 2.0 * PI * u1;
        double value = r * std::cos(theta) * stddev + mean;

        s.next_gauss     = r * std::sin(theta) * stddev + mean;
        s.has_next_gauss = true;
        return value;
    }

    // Converts 16 uniform floats in `data` to 16 normal samples in place:
    // element j is paired with element j + 8 for one Box-Muller transform.
    void normal_fill_16(float* data, float mean, float stddev) {
        for (int j = 0; j < 8; ++j) {
            float u1    = 1.0f - data[j];  // [0, 1) -> (0, 1], keeps log() finite
            float u2    = data[j + 8];
            float r     = std::sqrt(-2.0f * std::log(u1));
            float theta = 2.0f * PI * u2;  // evaluated in double, then narrowed (kept as in the original)
            data[j]     = r * std::cos(theta) * stddev + mean;
            data[j + 8] = r * std::sin(theta) * stddev + mean;
        }
    }

    // Fills data[0, size) with normal samples.
    //  - size >= 16: fill with uniforms, then transform in groups of 16; a
    //    trailing partial group is handled by redrawing and transforming the
    //    last 16 elements.
    //  - size < 16: one double-precision sample per element.
    // The two paths intentionally differ; this mirrors the PyTorch behaviour
    // the original author chose to keep.
    void randn(float* data, int64_t size, float mean = 0.0f, float stddev = 1.0f) {
        if (size < 16) {
            for (int64_t i = 0; i < size; i++) {
                data[i] = static_cast<float>(normal_double_value(mean, stddev));
            }
            return;
        }

        for (int64_t i = 0; i < size; i++) {
            data[i] = uniform_real(rand_uint32(), 0.f, 1.f);
        }
        for (int64_t i = 0; i < size - 15; i += 16) {
            normal_fill_16(data + i, mean, stddev);
        }
        if (size % 16 != 0) {
            // Recompute the last 16 values.
            float* tail = data + size - 16;
            for (int64_t i = 0; i < 16; i++) {
                tail[i] = uniform_real(rand_uint32(), 0.f, 1.f);
            }
            normal_fill_16(tail, mean, stddev);
        }
    }

public:
    // Constructs a generator seeded with `seed`.
    MT19937RNG(uint64_t seed = 0) { manual_seed(seed); }

    // Re-seeds the generator. Only the low 32 bits of `seed` enter the state;
    // any cached Gaussian sample is discarded.
    void manual_seed(uint64_t seed) override {
        s.seed_     = seed;
        s.seeded_   = true;
        s.state_[0] = static_cast<uint32_t>(seed & 0xffffffffU);
        for (int j = 1; j < N; j++) {
            const uint32_t prev = s.state_[j - 1];
            s.state_[j]         = 1812433253U * (prev ^ (prev >> 30)) + j;
        }
        s.left_          = 1;  // forces next_state() on the first draw
        s.next_          = 0;
        s.has_next_gauss = false;
    }

    // Returns `n` samples with mean 0 and standard deviation 1.
    std::vector<float> randn(uint32_t n) override {
        std::vector<float> out(n);
        randn(out.data(), static_cast<int64_t>(out.size()));
        return out;
    }
};
#endif // __RNG_MT19937_HPP__

View File

@ -2,6 +2,7 @@
#include "model.h" #include "model.h"
#include "rng.hpp" #include "rng.hpp"
#include "rng_mt19937.hpp"
#include "rng_philox.hpp" #include "rng_philox.hpp"
#include "stable-diffusion.h" #include "stable-diffusion.h"
#include "util.h" #include "util.h"
@ -200,6 +201,8 @@ public:
rng = std::make_shared<STDDefaultRNG>(); rng = std::make_shared<STDDefaultRNG>();
} else if (sd_ctx_params->rng_type == CUDA_RNG) { } else if (sd_ctx_params->rng_type == CUDA_RNG) {
rng = std::make_shared<PhiloxRNG>(); rng = std::make_shared<PhiloxRNG>();
} else if (sd_ctx_params->rng_type == CPU_RNG) {
rng = std::make_shared<MT19937RNG>();
} }
ggml_log_set(ggml_log_callback_default, nullptr); ggml_log_set(ggml_log_callback_default, nullptr);
@ -336,10 +339,14 @@ public:
if (sd_ctx_params->lora_apply_mode == LORA_APPLY_AUTO) { if (sd_ctx_params->lora_apply_mode == LORA_APPLY_AUTO) {
bool have_quantized_weight = false; bool have_quantized_weight = false;
for (const auto& [type, _] : wtype_stat) { if (wtype != GGML_TYPE_COUNT && ggml_is_quantized(wtype)) {
if (ggml_is_quantized(type)) { have_quantized_weight = true;
have_quantized_weight = true; } else {
break; for (const auto& [type, _] : wtype_stat) {
if (ggml_is_quantized(type)) {
have_quantized_weight = true;
break;
}
} }
} }
if (have_quantized_weight) { if (have_quantized_weight) {
@ -2127,6 +2134,7 @@ enum sd_type_t str_to_sd_type(const char* str) {
const char* rng_type_to_str[] = { const char* rng_type_to_str[] = {
"std_default", "std_default",
"cuda", "cuda",
"cpu",
}; };
const char* sd_rng_type_name(enum rng_type_t rng_type) { const char* sd_rng_type_name(enum rng_type_t rng_type) {

View File

@ -31,6 +31,7 @@ extern "C" {
enum rng_type_t { enum rng_type_t {
STD_DEFAULT_RNG, STD_DEFAULT_RNG,
CUDA_RNG, CUDA_RNG,
CPU_RNG,
RNG_TYPE_COUNT RNG_TYPE_COUNT
}; };

View File

@ -5,6 +5,7 @@
#include <cstdarg> #include <cstdarg>
#include <fstream> #include <fstream>
#include <locale> #include <locale>
#include <regex>
#include <sstream> #include <sstream>
#include <string> #include <string>
#include <thread> #include <thread>
@ -547,6 +548,8 @@ sd_image_f32_t clip_preprocess(sd_image_f32_t image, int target_width, int targe
// (abc) - increases attention to abc by a multiplier of 1.1 // (abc) - increases attention to abc by a multiplier of 1.1
// (abc:3.12) - increases attention to abc by a multiplier of 3.12 // (abc:3.12) - increases attention to abc by a multiplier of 3.12
// [abc] - decreases attention to abc by a multiplier of 1.1 // [abc] - decreases attention to abc by a multiplier of 1.1
// BREAK - separates the prompt into conceptually distinct parts for sequential processing
// B - internal helper pattern; prevents 'B' in 'BREAK' from being consumed as normal text
// \( - literal character '(' // \( - literal character '('
// \[ - literal character '[' // \[ - literal character '['
// \) - literal character ')' // \) - literal character ')'
@ -582,7 +585,7 @@ std::vector<std::pair<std::string, float>> parse_prompt_attention(const std::str
float round_bracket_multiplier = 1.1f; float round_bracket_multiplier = 1.1f;
float square_bracket_multiplier = 1 / 1.1f; float square_bracket_multiplier = 1 / 1.1f;
std::regex re_attention(R"(\\\(|\\\)|\\\[|\\\]|\\\\|\\|\(|\[|:([+-]?[.\d]+)\)|\)|\]|[^\\()\[\]:]+|:)"); std::regex re_attention(R"(\\\(|\\\)|\\\[|\\\]|\\\\|\\|\(|\[|:([+-]?[.\d]+)\)|\)|\]|\bBREAK\b|[^\\()\[\]:B]+|:|\bB)");
std::regex re_break(R"(\s*\bBREAK\b\s*)"); std::regex re_break(R"(\s*\bBREAK\b\s*)");
auto multiply_range = [&](int start_position, float multiplier) { auto multiply_range = [&](int start_position, float multiplier) {
@ -591,7 +594,7 @@ std::vector<std::pair<std::string, float>> parse_prompt_attention(const std::str
} }
}; };
std::smatch m; std::smatch m, m2;
std::string remaining_text = text; std::string remaining_text = text;
while (std::regex_search(remaining_text, m, re_attention)) { while (std::regex_search(remaining_text, m, re_attention)) {
@ -615,6 +618,8 @@ std::vector<std::pair<std::string, float>> parse_prompt_attention(const std::str
square_brackets.pop_back(); square_brackets.pop_back();
} else if (text == "\\(") { } else if (text == "\\(") {
res.push_back({text.substr(1), 1.0f}); res.push_back({text.substr(1), 1.0f});
} else if (std::regex_search(text, m2, re_break)) {
res.push_back({"BREAK", -1.0f});
} else { } else {
res.push_back({text, 1.0f}); res.push_back({text, 1.0f});
} }
@ -645,4 +650,4 @@ std::vector<std::pair<std::string, float>> parse_prompt_attention(const std::str
} }
return res; return res;
} }