Compare commits


No commits in common. "742a7333c3a054a62630262496edb5cb1ebcd0ae" and "347710f68f6c6c8e243496957f056a4b9f271d24" have entirely different histories.

8 changed files with 16 additions and 212 deletions

View File

@@ -81,9 +81,7 @@ API and command-line option may change frequently.***
 - [`DPM++ 2M v2`](https://github.com/AUTOMATIC1111/stable-diffusion-webui/discussions/8457)
 - `DPM++ 2S a`
 - [`LCM`](https://github.com/AUTOMATIC1111/stable-diffusion-webui/issues/13952)
-- Cross-platform reproducibility
-  - `--rng cuda`, default, consistent with the `stable-diffusion-webui GPU RNG`
-  - `--rng cpu`, consistent with the `comfyui RNG`
+- Cross-platform reproducibility (`--rng cuda`, consistent with the `stable-diffusion-webui GPU RNG`)
 - Embedds generation parameters into png output as webui-compatible text string
 ## Quick Start
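
For context (not part of the diff): the reproducibility bullet above comes down to seeding a deterministic RNG backend. A minimal sketch against the `RNG` interface visible later in this compare (`manual_seed` plus `randn`), using the MT19937 backend whose removal this compare shows:

```cpp
// Illustration only: with a fixed seed, a deterministic RNG backend produces
// the same initial noise on every platform, which is what makes results
// reproducible across machines.
#include <cstdio>
#include "rng_mt19937.hpp"  // the --rng cpu backend removed by this compare

int main() {
    MT19937RNG rng;
    rng.manual_seed(42);                      // same seed in ...
    std::vector<float> noise = rng.randn(4);  // ... same four floats out, everywhere
    for (float v : noise)
        printf("%f\n", v);
    return 0;
}
```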

View File

@@ -278,30 +278,13 @@ struct FrozenCLIPEmbedderWithCustomWords : public Conditioner {
             const std::string& curr_text = item.first;
             float curr_weight = item.second;
             // printf(" %s: %f \n", curr_text.c_str(), curr_weight);
-            int32_t clean_index = 0;
-            if (curr_text == "BREAK" && curr_weight == -1.0f) {
-                // Pad token array up to chunk size at this point.
-                // TODO: This is a hardcoded chunk_len, like in stable-diffusion.cpp, make it a parameter for the future?
-                // Also, this is 75 instead of 77 to leave room for BOS and EOS tokens.
-                int padding_size = 75 - (tokens_acc % 75);
-                for (int j = 0; j < padding_size; j++) {
-                    clean_input_ids.push_back(tokenizer.EOS_TOKEN_ID);
-                    clean_index++;
-                }
-                // After padding, continue to the next iteration to process the following text as a new segment
-                tokens.insert(tokens.end(), clean_input_ids.begin(), clean_input_ids.end());
-                weights.insert(weights.end(), padding_size, curr_weight);
-                continue;
-            }
-            // Regular token, process normally
             std::vector<int> curr_tokens = tokenizer.encode(curr_text, on_new_token_cb);
+            int32_t clean_index = 0;
             for (uint32_t i = 0; i < curr_tokens.size(); i++) {
                 int token_id = curr_tokens[i];
-                if (token_id == image_token) {
+                if (token_id == image_token)
                     class_token_index.push_back(clean_index - 1);
-                } else {
+                else {
                     clean_input_ids.push_back(token_id);
                     clean_index++;
                 }
@@ -404,22 +387,6 @@ struct FrozenCLIPEmbedderWithCustomWords : public Conditioner {
         for (const auto& item : parsed_attention) {
             const std::string& curr_text = item.first;
             float curr_weight = item.second;
-            if (curr_text == "BREAK" && curr_weight == -1.0f) {
-                // Pad token array up to chunk size at this point.
-                // TODO: This is a hardcoded chunk_len, like in stable-diffusion.cpp, make it a parameter for the future?
-                // Also, this is 75 instead of 77 to leave room for BOS and EOS tokens.
-                size_t current_size = tokens.size();
-                size_t padding_size = (75 - (current_size % 75)) % 75;  // Ensure no negative padding
-                if (padding_size > 0) {
-                    LOG_DEBUG("BREAK token encountered, padding current chunk by %zu tokens.", padding_size);
-                    tokens.insert(tokens.end(), padding_size, tokenizer.EOS_TOKEN_ID);
-                    weights.insert(weights.end(), padding_size, 1.0f);
-                }
-                continue;  // Skip to the next item after handling BREAK
-            }
             std::vector<int> curr_tokens = tokenizer.encode(curr_text, on_new_token_cb);
             tokens.insert(tokens.end(), curr_tokens.begin(), curr_tokens.end());
             weights.insert(weights.end(), curr_tokens.size(), curr_weight);
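
For context (not part of the diff): both removed `BREAK` branches pad the running token list up to the next 75-token chunk boundary (77 per chunk minus the BOS and EOS slots). A standalone sketch of that arithmetic; note the trailing modulo from the second variant, which keeps an exact multiple of 75 from being padded by a whole extra chunk:

```cpp
// Standalone sketch of the removed padding arithmetic: round a token count up
// to the next 75-token chunk boundary (77 per chunk minus BOS/EOS).
#include <cstdio>

int main() {
    const int chunk_len = 75;  // hardcoded chunk length, as in the removed code
    for (int n_tokens : {0, 25, 75, 100}) {
        // Without the trailing "% chunk_len" (as in the first removed variant),
        // an exact multiple of 75 would be padded by a full extra chunk.
        int padding = (chunk_len - n_tokens % chunk_len) % chunk_len;
        printf("%3d tokens -> pad %2d -> %3d\n", n_tokens, padding, n_tokens + padding);
    }
    return 0;  // prints pads 0, 50, 0, 50
}
```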

View File

@@ -94,7 +94,7 @@ Options:
   -M, --mode                         run mode, one of [img_gen, vid_gen, upscale, convert], default: img_gen
   --type                             weight type (examples: f32, f16, q4_0, q4_1, q5_0, q5_1, q8_0, q2_K, q3_K, q4_K). If not specified, the default is the
                                      type of the weight file
-  --rng                              RNG, one of [std_default, cuda, cpu], default: cuda(sd-webui), cpu(comfyui)
+  --rng                              RNG, one of [std_default, cuda], default: cuda
   -s, --seed                         RNG seed (default: 42, use random seed for < 0)
   --sampling-method                  sampling method, one of [euler, euler_a, heun, dpm2, dpm++2s_a, dpm++2m, dpm++2mv2, ipndm, ipndm_v, lcm, ddim_trailing,
                                      tcd] (default: euler for Flux/SD3/Wan, euler_a otherwise)
@@ -103,7 +103,7 @@ Options:
                                      contain any quantized parameters, the at_runtime mode will be used; otherwise,
                                      immediately will be used.The immediately mode may have precision and
                                      compatibility issues with quantized parameters, but it usually offers faster inference
-                                     speed and, in some cases, lower memory usage. The at_runtime mode, on the other
+                                     speed and, in some cases, lower memory usageThe at_runtime mode, on the other
                                      hand, is exactly the opposite.
   --scheduler                        denoiser sigma scheduler, one of [discrete, karras, exponential, ays, gits, smoothstep, sgm_uniform, simple], default:
                                      discrete
@@ -119,4 +119,4 @@ Options:
   --vae-relative-tile-size           relative tile size for vae tiling, format [X]x[Y], in fraction of image size if < 1, in number of tiles per dim if >=1
                                      (overrides --vae-tile-size)
   --preview                          preview method. must be one of the following [none, proj, tae, vae] (default is none)
-```
+```

View File

@@ -1124,7 +1124,7 @@ void parse_args(int argc, const char** argv, SDParams& params) {
          on_type_arg},
         {"",
          "--rng",
-         "RNG, one of [std_default, cuda, cpu], default: cuda(sd-webui), cpu(comfyui)",
+         "RNG, one of [std_default, cuda], default: cuda",
          on_rng_arg},
         {"-s",
         "--seed",
@@ -1144,7 +1144,7 @@ void parse_args(int argc, const char** argv, SDParams& params) {
          "the way to apply LoRA, one of [auto, immediately, at_runtime], default is auto. "
          "In auto mode, if the model weights contain any quantized parameters, the at_runtime mode will be used; otherwise, immediately will be used."
          "The immediately mode may have precision and compatibility issues with quantized parameters, "
-         "but it usually offers faster inference speed and, in some cases, lower memory usage. "
+         "but it usually offers faster inference speed and, in some cases, lower memory usage"
          "The at_runtime mode, on the other hand, is exactly the opposite.",
          on_lora_apply_mode_arg},
         {"",

View File

@@ -1,147 +0,0 @@
-#ifndef __RNG_MT19937_HPP__
-#define __RNG_MT19937_HPP__
-
-#include <array>
-#include <cmath>
-#include <cstdint>
-#include <limits>
-#include <vector>
-
-#include "rng.hpp"
-
-// RNG imitating torch cpu randn on CPU.
-// Port from pytorch, original license: https://github.com/pytorch/pytorch/blob/d01a7b0241ed1c4cded7e7ca097249feb343f072/LICENSE
-// Ref: https://github.com/pytorch/pytorch/blob/d01a7b0241ed1c4cded7e7ca097249feb343f072/aten/src/ATen/core/TransformationHelper.h, for uniform_real
-// Ref: https://github.com/pytorch/pytorch/blob/d01a7b0241ed1c4cded7e7ca097249feb343f072/aten/src/ATen/native/cpu/DistributionTemplates.h, for normal_kernel/normal_fill/normal_fill_16
-// Ref: https://github.com/pytorch/pytorch/blob/d01a7b0241ed1c4cded7e7ca097249feb343f072/aten/src/ATen/core/MT19937RNGEngine.h, for mt19937_engine
-// Ref: https://github.com/pytorch/pytorch/blob/d01a7b0241ed1c4cded7e7ca097249feb343f072/aten/src/ATen/core/DistributionsHelper.h, for uniform_real_distribution/normal_distribution
-class MT19937RNG : public RNG {
-    static const int N = 624;
-    static const int M = 397;
-    static const uint32_t MATRIX_A = 0x9908b0dfU;
-    static const uint32_t UMASK = 0x80000000U;
-    static const uint32_t LMASK = 0x7fffffffU;
-
-    struct State {
-        uint64_t seed_;
-        int left_;
-        bool seeded_;
-        uint32_t next_;
-        std::array<uint32_t, N> state_;
-        bool has_next_gauss = false;
-        double next_gauss = 0.0f;
-    };
-    State s;
-
-    uint32_t mix_bits(uint32_t u, uint32_t v) { return (u & UMASK) | (v & LMASK); }
-
-    uint32_t twist(uint32_t u, uint32_t v) { return (mix_bits(u, v) >> 1) ^ ((v & 1) ? MATRIX_A : 0); }
-
-    void next_state() {
-        uint32_t* p = s.state_.data();
-        s.left_ = N;
-        s.next_ = 0;
-        for (int j = N - M + 1; --j; p++)
-            p[0] = p[M] ^ twist(p[0], p[1]);
-        for (int j = M; --j; p++)
-            p[0] = p[M - N] ^ twist(p[0], p[1]);
-        p[0] = p[M - N] ^ twist(p[0], s.state_[0]);
-    }
-
-    uint32_t rand_uint32() {
-        if (--s.left_ == 0)
-            next_state();
-        uint32_t y = s.state_[s.next_++];
-        y ^= (y >> 11);
-        y ^= (y << 7) & 0x9d2c5680U;
-        y ^= (y << 15) & 0xefc60000U;
-        y ^= (y >> 18);
-        return y;
-    }
-
-    uint64_t rand_uint64() {
-        uint64_t high = (uint64_t)rand_uint32();
-        uint64_t low = (uint64_t)rand_uint32();
-        return (high << 32) | low;
-    }
-
-    template <typename T, typename V>
-    T uniform_real(V val, T from, T to) {
-        constexpr auto MASK = static_cast<V>((static_cast<uint64_t>(1) << std::numeric_limits<T>::digits) - 1);
-        constexpr auto DIVISOR = static_cast<T>(1) / (static_cast<uint64_t>(1) << std::numeric_limits<T>::digits);
-        T x = (val & MASK) * DIVISOR;
-        return (x * (to - from) + from);
-    }
-
-    double normal_double_value(double mean, double std) {
-        if (s.has_next_gauss) {
-            s.has_next_gauss = false;
-            return s.next_gauss;
-        }
-        double u1 = uniform_real(rand_uint64(), 0., 1.);  // double
-        double u2 = uniform_real(rand_uint64(), 0., 1.);  // double
-        double r = std::sqrt(-2.0 * std::log1p(-u2));
-        double theta = 2.0 * 3.14159265358979323846 * u1;
-        double value = r * std::cos(theta) * std + mean;
-        s.next_gauss = r * std::sin(theta) * std + mean;
-        s.has_next_gauss = true;
-        return value;
-    }
-
-    void normal_fill_16(float* data, float mean, float std) {
-        for (int j = 0; j < 8; ++j) {
-            float u1 = 1.0f - data[j];
-            float u2 = data[j + 8];
-            float r = std::sqrt(-2.0f * std::log(u1));
-            float theta = 2.0f * 3.14159265358979323846 * u2;
-            data[j] = r * std::cos(theta) * std + mean;
-            data[j + 8] = r * std::sin(theta) * std + mean;
-        }
-    }
-
-    void randn(float* data, int64_t size, float mean = 0.0f, float std = 1.0f) {
-        if (size >= 16) {
-            for (int64_t i = 0; i < size; i++) {
-                data[i] = uniform_real(rand_uint32(), 0.f, 1.f);
-            }
-            for (int64_t i = 0; i < size - 15; i += 16) {
-                normal_fill_16(data + i, mean, std);
-            }
-            if (size % 16 != 0) {
-                // Recompute the last 16 values.
-                data = data + size - 16;
-                for (int64_t i = 0; i < 16; i++) {
-                    data[i] = uniform_real(rand_uint32(), 0.f, 1.f);
-                }
-                normal_fill_16(data, mean, std);
-            }
-        } else {
-            // Strange handling, hard to understand, but keeping it consistent with PyTorch.
-            for (int64_t i = 0; i < size; i++) {
-                data[i] = (float)normal_double_value(mean, std);
-            }
-        }
-    }
-
-public:
-    MT19937RNG(uint64_t seed = 0) { manual_seed(seed); }
-
-    void manual_seed(uint64_t seed) override {
-        s.seed_ = seed;
-        s.seeded_ = true;
-        s.state_[0] = (uint32_t)(seed & 0xffffffffU);
-        for (int j = 1; j < N; j++) {
-            uint32_t prev = s.state_[j - 1];
-            s.state_[j] = 1812433253U * (prev ^ (prev >> 30)) + j;
-        }
-        s.left_ = 1;
-        s.next_ = 0;
-        s.has_next_gauss = false;
-    }
-
-    std::vector<float> randn(uint32_t n) override {
-        std::vector<float> out;
-        out.resize(n);
-        randn((float*)out.data(), out.size());
-        return out;
-    }
-};
-
-#endif  // __RNG_MT19937_HPP__
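
For context (not part of the diff): `normal_fill_16` above is a vectorized Box-Muller transform. A standalone sketch of the same step; the `1 - u` flip mirrors the deleted code and keeps `std::log` away from zero:

```cpp
// Standalone Box-Muller sketch: two uniform draws in (0, 1) map to two
// independent standard normal samples.
#include <cmath>
#include <cstdio>

int main() {
    double u = 0.25, v = 0.75;  // stand-in uniform draws
    double u1 = 1.0 - u;        // flip so the log argument stays in (0, 1]
    double r = std::sqrt(-2.0 * std::log(u1));
    double theta = 2.0 * 3.14159265358979323846 * v;
    printf("z0 = %f, z1 = %f\n", r * std::cos(theta), r * std::sin(theta));
    return 0;
}
```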

View File

@@ -2,7 +2,6 @@
 #include "model.h"
 #include "rng.hpp"
-#include "rng_mt19937.hpp"
 #include "rng_philox.hpp"
 #include "stable-diffusion.h"
 #include "util.h"
@@ -201,8 +200,6 @@ public:
             rng = std::make_shared<STDDefaultRNG>();
         } else if (sd_ctx_params->rng_type == CUDA_RNG) {
             rng = std::make_shared<PhiloxRNG>();
-        } else if (sd_ctx_params->rng_type == CPU_RNG) {
-            rng = std::make_shared<MT19937RNG>();
         }
 
         ggml_log_set(ggml_log_callback_default, nullptr);
@@ -339,14 +336,10 @@ public:
         if (sd_ctx_params->lora_apply_mode == LORA_APPLY_AUTO) {
             bool have_quantized_weight = false;
-            if (wtype != GGML_TYPE_COUNT && ggml_is_quantized(wtype)) {
-                have_quantized_weight = true;
-            } else {
-                for (const auto& [type, _] : wtype_stat) {
-                    if (ggml_is_quantized(type)) {
-                        have_quantized_weight = true;
-                        break;
-                    }
+            for (const auto& [type, _] : wtype_stat) {
+                if (ggml_is_quantized(type)) {
+                    have_quantized_weight = true;
+                    break;
                 }
             }
 
             if (have_quantized_weight) {
@@ -2134,7 +2127,6 @@ enum sd_type_t str_to_sd_type(const char* str) {
 const char* rng_type_to_str[] = {
     "std_default",
     "cuda",
-    "cpu",
 };
 
 const char* sd_rng_type_name(enum rng_type_t rng_type) {

View File

@@ -31,7 +31,6 @@ extern "C" {
 enum rng_type_t {
     STD_DEFAULT_RNG,
     CUDA_RNG,
-    CPU_RNG,
     RNG_TYPE_COUNT
 };

View File

@@ -5,7 +5,6 @@
 #include <cstdarg>
 #include <fstream>
 #include <locale>
-#include <regex>
 #include <sstream>
 #include <string>
 #include <thread>
@@ -548,8 +547,6 @@ sd_image_f32_t clip_preprocess(sd_image_f32_t image, int target_width, int targe
 // (abc) - increases attention to abc by a multiplier of 1.1
 // (abc:3.12) - increases attention to abc by a multiplier of 3.12
 // [abc] - decreases attention to abc by a multiplier of 1.1
-// BREAK - separates the prompt into conceptually distinct parts for sequential processing
-// B - internal helper pattern; prevents 'B' in 'BREAK' from being consumed as normal text
 // \( - literal character '('
 // \[ - literal character '['
 // \) - literal character ')'
@@ -585,7 +582,7 @@ std::vector<std::pair<std::string, float>> parse_prompt_attention(const std::str
     float round_bracket_multiplier = 1.1f;
     float square_bracket_multiplier = 1 / 1.1f;
 
-    std::regex re_attention(R"(\\\(|\\\)|\\\[|\\\]|\\\\|\\|\(|\[|:([+-]?[.\d]+)\)|\)|\]|\bBREAK\b|[^\\()\[\]:B]+|:|\bB)");
-    std::regex re_break(R"(\s*\bBREAK\b\s*)");
+    std::regex re_attention(R"(\\\(|\\\)|\\\[|\\\]|\\\\|\\|\(|\[|:([+-]?[.\d]+)\)|\)|\]|[^\\()\[\]:]+|:)");
 
     auto multiply_range = [&](int start_position, float multiplier) {
@@ -594,7 +591,7 @@ std::vector<std::pair<std::string, float>> parse_prompt_attention(const std::str
         }
     };
 
-    std::smatch m, m2;
+    std::smatch m;
     std::string remaining_text = text;
 
     while (std::regex_search(remaining_text, m, re_attention)) {
@@ -618,8 +615,6 @@ std::vector<std::pair<std::string, float>> parse_prompt_attention(const std::str
             square_brackets.pop_back();
         } else if (text == "\\(") {
             res.push_back({text.substr(1), 1.0f});
-        } else if (std::regex_search(text, m2, re_break)) {
-            res.push_back({"BREAK", -1.0f});
         } else {
             res.push_back({text, 1.0f});
         }
@@ -650,4 +645,4 @@ std::vector<std::pair<std::string, float>> parse_prompt_attention(const std::str
     }
 
     return res;
-}
+}
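
For context (not part of the diff): a small driver makes the attention grammar documented above concrete. The declaration of `parse_prompt_attention` is assumed to be reachable via `util.h` (included elsewhere in this diff); the webui algorithm it ports may additionally merge adjacent pieces of equal weight.

```cpp
// Hypothetical driver for parse_prompt_attention; the declaration is assumed
// to live in util.h.
#include <cstdio>
#include "util.h"

int main() {
    auto res = parse_prompt_attention("a (cat) and [dog], (bird:1.5)");
    for (const auto& [piece, weight] : res)
        printf("\"%s\" -> %.3f\n", piece.c_str(), weight);
    // Expected, following the multiplier rules documented above:
    //   "a "    -> 1.000
    //   "cat"   -> 1.100   (round brackets: x1.1)
    //   " and " -> 1.000
    //   "dog"   -> 0.909   (square brackets: x1/1.1)
    //   ", "    -> 1.000
    //   "bird"  -> 1.500   (explicit (text:weight))
    return 0;
}
```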