mirror of
https://github.com/leejet/stable-diffusion.cpp.git
synced 2025-12-13 05:48:56 +00:00
Compare commits
4 Commits
347710f68f
...
742a7333c3
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
742a7333c3 | ||
|
|
e8eb3791c8 | ||
|
|
aa44e06890 | ||
|
|
6448430dbb |
@ -81,7 +81,9 @@ API and command-line option may change frequently.***
|
||||
- [`DPM++ 2M v2`](https://github.com/AUTOMATIC1111/stable-diffusion-webui/discussions/8457)
|
||||
- `DPM++ 2S a`
|
||||
- [`LCM`](https://github.com/AUTOMATIC1111/stable-diffusion-webui/issues/13952)
|
||||
- Cross-platform reproducibility (`--rng cuda`, consistent with the `stable-diffusion-webui GPU RNG`)
|
||||
- Cross-platform reproducibility
|
||||
- `--rng cuda`, default, consistent with the `stable-diffusion-webui GPU RNG`
|
||||
- `--rng cpu`, consistent with the `comfyui RNG`
|
||||
- Embeds generation parameters into PNG output as a webui-compatible text string
|
||||
|
||||
## Quick Start
|
||||
|
||||
@ -278,13 +278,30 @@ struct FrozenCLIPEmbedderWithCustomWords : public Conditioner {
|
||||
const std::string& curr_text = item.first;
|
||||
float curr_weight = item.second;
|
||||
// printf(" %s: %f \n", curr_text.c_str(), curr_weight);
|
||||
int32_t clean_index = 0;
|
||||
if (curr_text == "BREAK" && curr_weight == -1.0f) {
|
||||
// Pad token array up to chunk size at this point.
|
||||
// TODO: This is a hardcoded chunk_len, like in stable-diffusion.cpp, make it a parameter for the future?
|
||||
// Also, this is 75 instead of 77 to leave room for BOS and EOS tokens.
|
||||
int padding_size = 75 - (tokens_acc % 75);
|
||||
for (int j = 0; j < padding_size; j++) {
|
||||
clean_input_ids.push_back(tokenizer.EOS_TOKEN_ID);
|
||||
clean_index++;
|
||||
}
|
||||
|
||||
// After padding, continue to the next iteration to process the following text as a new segment
|
||||
tokens.insert(tokens.end(), clean_input_ids.begin(), clean_input_ids.end());
|
||||
weights.insert(weights.end(), padding_size, curr_weight);
|
||||
continue;
|
||||
}
|
||||
|
||||
// Regular token, process normally
|
||||
std::vector<int> curr_tokens = tokenizer.encode(curr_text, on_new_token_cb);
|
||||
int32_t clean_index = 0;
|
||||
for (uint32_t i = 0; i < curr_tokens.size(); i++) {
|
||||
int token_id = curr_tokens[i];
|
||||
if (token_id == image_token)
|
||||
if (token_id == image_token) {
|
||||
class_token_index.push_back(clean_index - 1);
|
||||
else {
|
||||
} else {
|
||||
clean_input_ids.push_back(token_id);
|
||||
clean_index++;
|
||||
}
|
||||
@ -387,6 +404,22 @@ struct FrozenCLIPEmbedderWithCustomWords : public Conditioner {
|
||||
for (const auto& item : parsed_attention) {
|
||||
const std::string& curr_text = item.first;
|
||||
float curr_weight = item.second;
|
||||
|
||||
if (curr_text == "BREAK" && curr_weight == -1.0f) {
|
||||
// Pad token array up to chunk size at this point.
|
||||
// TODO: This is a hardcoded chunk_len, like in stable-diffusion.cpp, make it a parameter for the future?
|
||||
// Also, this is 75 instead of 77 to leave room for BOS and EOS tokens.
|
||||
size_t current_size = tokens.size();
|
||||
size_t padding_size = (75 - (current_size % 75)) % 75; // Ensure no negative padding
|
||||
|
||||
if (padding_size > 0) {
|
||||
LOG_DEBUG("BREAK token encountered, padding current chunk by %zu tokens.", padding_size);
|
||||
tokens.insert(tokens.end(), padding_size, tokenizer.EOS_TOKEN_ID);
|
||||
weights.insert(weights.end(), padding_size, 1.0f);
|
||||
}
|
||||
continue; // Skip to the next item after handling BREAK
|
||||
}
|
||||
|
||||
std::vector<int> curr_tokens = tokenizer.encode(curr_text, on_new_token_cb);
|
||||
tokens.insert(tokens.end(), curr_tokens.begin(), curr_tokens.end());
|
||||
weights.insert(weights.end(), curr_tokens.size(), curr_weight);
|
||||
|
||||
@ -94,7 +94,7 @@ Options:
|
||||
-M, --mode run mode, one of [img_gen, vid_gen, upscale, convert], default: img_gen
|
||||
--type weight type (examples: f32, f16, q4_0, q4_1, q5_0, q5_1, q8_0, q2_K, q3_K, q4_K). If not specified, the default is the
|
||||
type of the weight file
|
||||
--rng RNG, one of [std_default, cuda], default: cuda
|
||||
--rng RNG, one of [std_default, cuda, cpu], default: cuda(sd-webui), cpu(comfyui)
|
||||
-s, --seed RNG seed (default: 42, use random seed for < 0)
|
||||
--sampling-method sampling method, one of [euler, euler_a, heun, dpm2, dpm++2s_a, dpm++2m, dpm++2mv2, ipndm, ipndm_v, lcm, ddim_trailing,
|
||||
tcd] (default: euler for Flux/SD3/Wan, euler_a otherwise)
|
||||
@ -103,7 +103,7 @@ Options:
|
||||
contain any quantized parameters, the at_runtime mode will be used; otherwise,
|
||||
immediately will be used.The immediately mode may have precision and
|
||||
compatibility issues with quantized parameters, but it usually offers faster inference
|
||||
speed and, in some cases, lower memory usageThe at_runtime mode, on the other
|
||||
speed and, in some cases, lower memory usage. The at_runtime mode, on the other
|
||||
hand, is exactly the opposite.
|
||||
--scheduler denoiser sigma scheduler, one of [discrete, karras, exponential, ays, gits, smoothstep, sgm_uniform, simple], default:
|
||||
discrete
|
||||
@ -119,4 +119,4 @@ Options:
|
||||
--vae-relative-tile-size relative tile size for vae tiling, format [X]x[Y], in fraction of image size if < 1, in number of tiles per dim if >=1
|
||||
(overrides --vae-tile-size)
|
||||
--preview preview method. must be one of the following [none, proj, tae, vae] (default is none)
|
||||
```
|
||||
```
|
||||
|
||||
@ -1124,7 +1124,7 @@ void parse_args(int argc, const char** argv, SDParams& params) {
|
||||
on_type_arg},
|
||||
{"",
|
||||
"--rng",
|
||||
"RNG, one of [std_default, cuda], default: cuda",
|
||||
"RNG, one of [std_default, cuda, cpu], default: cuda(sd-webui), cpu(comfyui)",
|
||||
on_rng_arg},
|
||||
{"-s",
|
||||
"--seed",
|
||||
@ -1144,7 +1144,7 @@ void parse_args(int argc, const char** argv, SDParams& params) {
|
||||
"the way to apply LoRA, one of [auto, immediately, at_runtime], default is auto. "
|
||||
"In auto mode, if the model weights contain any quantized parameters, the at_runtime mode will be used; otherwise, immediately will be used."
|
||||
"The immediately mode may have precision and compatibility issues with quantized parameters, "
|
||||
"but it usually offers faster inference speed and, in some cases, lower memory usage"
|
||||
"but it usually offers faster inference speed and, in some cases, lower memory usage. "
|
||||
"The at_runtime mode, on the other hand, is exactly the opposite.",
|
||||
on_lora_apply_mode_arg},
|
||||
{"",
|
||||
|
||||
147
rng_mt19937.hpp
Normal file
147
rng_mt19937.hpp
Normal file
@ -0,0 +1,147 @@
|
||||
#ifndef __RNG_MT19937_HPP__
|
||||
#define __RNG_MT19937_HPP__
|
||||
|
||||
#include <array>
#include <cmath>
#include <cstdint>
#include <limits>
#include <vector>

#include "rng.hpp"
|
||||
|
||||
// RNG imitating PyTorch's CPU randn.
|
||||
// Port from pytorch, original license: https://github.com/pytorch/pytorch/blob/d01a7b0241ed1c4cded7e7ca097249feb343f072/LICENSE
|
||||
// Ref: https://github.com/pytorch/pytorch/blob/d01a7b0241ed1c4cded7e7ca097249feb343f072/aten/src/ATen/core/TransformationHelper.h, for uniform_real
|
||||
// Ref: https://github.com/pytorch/pytorch/blob/d01a7b0241ed1c4cded7e7ca097249feb343f072/aten/src/ATen/native/cpu/DistributionTemplates.h, for normal_kernel/normal_fill/normal_fill_16
|
||||
// Ref: https://github.com/pytorch/pytorch/blob/d01a7b0241ed1c4cded7e7ca097249feb343f072/aten/src/ATen/core/MT19937RNGEngine.h, for mt19937_engine
|
||||
// Ref: https://github.com/pytorch/pytorch/blob/d01a7b0241ed1c4cded7e7ca097249feb343f072/aten/src/ATen/core/DistributionsHelper.h, for uniform_real_distribution/normal_distribution
|
||||
class MT19937RNG : public RNG {
|
||||
static const int N = 624;
|
||||
static const int M = 397;
|
||||
static const uint32_t MATRIX_A = 0x9908b0dfU;
|
||||
static const uint32_t UMASK = 0x80000000U;
|
||||
static const uint32_t LMASK = 0x7fffffffU;
|
||||
|
||||
struct State {
|
||||
uint64_t seed_;
|
||||
int left_;
|
||||
bool seeded_;
|
||||
uint32_t next_;
|
||||
std::array<uint32_t, N> state_;
|
||||
bool has_next_gauss = false;
|
||||
double next_gauss = 0.0f;
|
||||
};
|
||||
|
||||
State s;
|
||||
|
||||
uint32_t mix_bits(uint32_t u, uint32_t v) { return (u & UMASK) | (v & LMASK); }
|
||||
uint32_t twist(uint32_t u, uint32_t v) { return (mix_bits(u, v) >> 1) ^ ((v & 1) ? MATRIX_A : 0); }
|
||||
void next_state() {
|
||||
uint32_t* p = s.state_.data();
|
||||
s.left_ = N;
|
||||
s.next_ = 0;
|
||||
for (int j = N - M + 1; --j; p++)
|
||||
p[0] = p[M] ^ twist(p[0], p[1]);
|
||||
for (int j = M; --j; p++)
|
||||
p[0] = p[M - N] ^ twist(p[0], p[1]);
|
||||
p[0] = p[M - N] ^ twist(p[0], s.state_[0]);
|
||||
}
|
||||
|
||||
uint32_t rand_uint32() {
|
||||
if (--s.left_ == 0)
|
||||
next_state();
|
||||
uint32_t y = s.state_[s.next_++];
|
||||
y ^= (y >> 11);
|
||||
y ^= (y << 7) & 0x9d2c5680U;
|
||||
y ^= (y << 15) & 0xefc60000U;
|
||||
y ^= (y >> 18);
|
||||
return y;
|
||||
}
|
||||
|
||||
uint64_t rand_uint64() {
|
||||
uint64_t high = (uint64_t)rand_uint32();
|
||||
uint64_t low = (uint64_t)rand_uint32();
|
||||
return (high << 32) | low;
|
||||
}
|
||||
|
||||
template <typename T, typename V>
|
||||
T uniform_real(V val, T from, T to) {
|
||||
constexpr auto MASK = static_cast<V>((static_cast<uint64_t>(1) << std::numeric_limits<T>::digits) - 1);
|
||||
constexpr auto DIVISOR = static_cast<T>(1) / (static_cast<uint64_t>(1) << std::numeric_limits<T>::digits);
|
||||
T x = (val & MASK) * DIVISOR;
|
||||
return (x * (to - from) + from);
|
||||
}
|
||||
|
||||
double normal_double_value(double mean, double std) {
|
||||
if (s.has_next_gauss) {
|
||||
s.has_next_gauss = false;
|
||||
return s.next_gauss;
|
||||
}
|
||||
double u1 = uniform_real(rand_uint64(), 0., 1.); // double
|
||||
double u2 = uniform_real(rand_uint64(), 0., 1.); // double
|
||||
|
||||
double r = std::sqrt(-2.0 * std::log1p(-u2));
|
||||
double theta = 2.0 * 3.14159265358979323846 * u1;
|
||||
double value = r * std::cos(theta) * std + mean;
|
||||
s.next_gauss = r * std::sin(theta) * std + mean;
|
||||
s.has_next_gauss = true;
|
||||
return value;
|
||||
}
|
||||
|
||||
void normal_fill_16(float* data, float mean, float std) {
|
||||
for (int j = 0; j < 8; ++j) {
|
||||
float u1 = 1.0f - data[j];
|
||||
float u2 = data[j + 8];
|
||||
float r = std::sqrt(-2.0f * std::log(u1));
|
||||
float theta = 2.0f * 3.14159265358979323846 * u2;
|
||||
data[j] = r * std::cos(theta) * std + mean;
|
||||
data[j + 8] = r * std::sin(theta) * std + mean;
|
||||
}
|
||||
}
|
||||
|
||||
void randn(float* data, int64_t size, float mean = 0.0f, float std = 1.0f) {
|
||||
if (size >= 16) {
|
||||
for (int64_t i = 0; i < size; i++) {
|
||||
data[i] = uniform_real(rand_uint32(), 0.f, 1.f);
|
||||
}
|
||||
for (int64_t i = 0; i < size - 15; i += 16) {
|
||||
normal_fill_16(data + i, mean, std);
|
||||
}
|
||||
if (size % 16 != 0) {
|
||||
// Recompute the last 16 values.
|
||||
data = data + size - 16;
|
||||
for (int64_t i = 0; i < 16; i++) {
|
||||
data[i] = uniform_real(rand_uint32(), 0.f, 1.f);
|
||||
}
|
||||
normal_fill_16(data, mean, std);
|
||||
}
|
||||
} else {
|
||||
// Strange handling, hard to understand, but keeping it consistent with PyTorch.
|
||||
for (int64_t i = 0; i < size; i++) {
|
||||
data[i] = (float)normal_double_value(mean, std);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
public:
|
||||
MT19937RNG(uint64_t seed = 0) { manual_seed(seed); }
|
||||
|
||||
void manual_seed(uint64_t seed) override {
|
||||
s.seed_ = seed;
|
||||
s.seeded_ = true;
|
||||
s.state_[0] = (uint32_t)(seed & 0xffffffffU);
|
||||
for (int j = 1; j < N; j++) {
|
||||
uint32_t prev = s.state_[j - 1];
|
||||
s.state_[j] = 1812433253U * (prev ^ (prev >> 30)) + j;
|
||||
}
|
||||
s.left_ = 1;
|
||||
s.next_ = 0;
|
||||
s.has_next_gauss = false;
|
||||
}
|
||||
|
||||
std::vector<float> randn(uint32_t n) override {
|
||||
std::vector<float> out;
|
||||
out.resize(n);
|
||||
randn((float*)out.data(), out.size());
|
||||
return out;
|
||||
}
|
||||
};
|
||||
|
||||
#endif // __RNG_MT19937_HPP__
|
||||
@ -2,6 +2,7 @@
|
||||
|
||||
#include "model.h"
|
||||
#include "rng.hpp"
|
||||
#include "rng_mt19937.hpp"
|
||||
#include "rng_philox.hpp"
|
||||
#include "stable-diffusion.h"
|
||||
#include "util.h"
|
||||
@ -200,6 +201,8 @@ public:
|
||||
rng = std::make_shared<STDDefaultRNG>();
|
||||
} else if (sd_ctx_params->rng_type == CUDA_RNG) {
|
||||
rng = std::make_shared<PhiloxRNG>();
|
||||
} else if (sd_ctx_params->rng_type == CPU_RNG) {
|
||||
rng = std::make_shared<MT19937RNG>();
|
||||
}
|
||||
|
||||
ggml_log_set(ggml_log_callback_default, nullptr);
|
||||
@ -336,10 +339,14 @@ public:
|
||||
|
||||
if (sd_ctx_params->lora_apply_mode == LORA_APPLY_AUTO) {
|
||||
bool have_quantized_weight = false;
|
||||
for (const auto& [type, _] : wtype_stat) {
|
||||
if (ggml_is_quantized(type)) {
|
||||
have_quantized_weight = true;
|
||||
break;
|
||||
if (wtype != GGML_TYPE_COUNT && ggml_is_quantized(wtype)) {
|
||||
have_quantized_weight = true;
|
||||
} else {
|
||||
for (const auto& [type, _] : wtype_stat) {
|
||||
if (ggml_is_quantized(type)) {
|
||||
have_quantized_weight = true;
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
if (have_quantized_weight) {
|
||||
@ -2127,6 +2134,7 @@ enum sd_type_t str_to_sd_type(const char* str) {
|
||||
// Display names of the supported RNG types.
// NOTE(review): presumably indexed by rng_type_t, so the order here must match
// the enumerator order of that enum — confirm against sd_rng_type_name.
const char* rng_type_to_str[] = {
|
||||
"std_default",
|
||||
"cuda",
|
||||
"cpu",
|
||||
};
|
||||
|
||||
const char* sd_rng_type_name(enum rng_type_t rng_type) {
|
||||
|
||||
@ -31,6 +31,7 @@ extern "C" {
|
||||
// Selects which random number generator implementation the context creates.
enum rng_type_t {
|
||||
STD_DEFAULT_RNG,  // presumably backed by STDDefaultRNG — confirm in the context setup
|
||||
CUDA_RNG,  // backed by PhiloxRNG
|
||||
CPU_RNG,  // backed by MT19937RNG
|
||||
RNG_TYPE_COUNT  // equals the number of RNG types listed above; keep it last
|
||||
};
|
||||
|
||||
|
||||
11
util.cpp
11
util.cpp
@ -5,6 +5,7 @@
|
||||
#include <cstdarg>
|
||||
#include <fstream>
|
||||
#include <locale>
|
||||
#include <regex>
|
||||
#include <sstream>
|
||||
#include <string>
|
||||
#include <thread>
|
||||
@ -547,6 +548,8 @@ sd_image_f32_t clip_preprocess(sd_image_f32_t image, int target_width, int targe
|
||||
// (abc) - increases attention to abc by a multiplier of 1.1
|
||||
// (abc:3.12) - increases attention to abc by a multiplier of 3.12
|
||||
// [abc] - decreases attention to abc by a multiplier of 1.1
|
||||
// BREAK - separates the prompt into conceptually distinct parts for sequential processing
|
||||
// B - internal helper pattern; prevents 'B' in 'BREAK' from being consumed as normal text
|
||||
// \( - literal character '('
|
||||
// \[ - literal character '['
|
||||
// \) - literal character ')'
|
||||
@ -582,7 +585,7 @@ std::vector<std::pair<std::string, float>> parse_prompt_attention(const std::str
|
||||
float round_bracket_multiplier = 1.1f;
|
||||
float square_bracket_multiplier = 1 / 1.1f;
|
||||
|
||||
std::regex re_attention(R"(\\\(|\\\)|\\\[|\\\]|\\\\|\\|\(|\[|:([+-]?[.\d]+)\)|\)|\]|[^\\()\[\]:]+|:)");
|
||||
std::regex re_attention(R"(\\\(|\\\)|\\\[|\\\]|\\\\|\\|\(|\[|:([+-]?[.\d]+)\)|\)|\]|\bBREAK\b|[^\\()\[\]:B]+|:|\bB)");
|
||||
std::regex re_break(R"(\s*\bBREAK\b\s*)");
|
||||
|
||||
auto multiply_range = [&](int start_position, float multiplier) {
|
||||
@ -591,7 +594,7 @@ std::vector<std::pair<std::string, float>> parse_prompt_attention(const std::str
|
||||
}
|
||||
};
|
||||
|
||||
std::smatch m;
|
||||
std::smatch m, m2;
|
||||
std::string remaining_text = text;
|
||||
|
||||
while (std::regex_search(remaining_text, m, re_attention)) {
|
||||
@ -615,6 +618,8 @@ std::vector<std::pair<std::string, float>> parse_prompt_attention(const std::str
|
||||
square_brackets.pop_back();
|
||||
} else if (text == "\\(") {
|
||||
res.push_back({text.substr(1), 1.0f});
|
||||
} else if (std::regex_search(text, m2, re_break)) {
|
||||
res.push_back({"BREAK", -1.0f});
|
||||
} else {
|
||||
res.push_back({text, 1.0f});
|
||||
}
|
||||
@ -645,4 +650,4 @@ std::vector<std::pair<std::string, float>> parse_prompt_attention(const std::str
|
||||
}
|
||||
|
||||
return res;
|
||||
}
|
||||
}
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user