Compare commits

..

No commits in common. "b31fded8f69dba6bb3fe22dc4caebf5b509dceea" and "f68ce0582abac6f6d41c81fc77965ca48efef9fd" have entirely different histories.

31 changed files with 236 additions and 627 deletions

4
.gitignore vendored
View File

@ -1,10 +1,10 @@
build*/
cmake-build-*/
test/
.vscode/
.idea/
.cache/
*.swp
.vscode/
.idea/
*.bat
*.bin
*.exe

View File

@ -149,7 +149,3 @@ if (SD_BUILD_EXAMPLES)
add_subdirectory(examples)
endif()
set(SD_PUBLIC_HEADERS stable-diffusion.h)
set_target_properties(${SD_LIB} PROPERTIES PUBLIC_HEADER "${SD_PUBLIC_HEADERS}")
install(TARGETS ${SD_LIB} LIBRARY PUBLIC_HEADER)

View File

@ -1,19 +0,0 @@
ARG SYCL_VERSION=2025.1.0-0
FROM intel/oneapi-basekit:${SYCL_VERSION}-devel-ubuntu24.04 AS build
RUN apt-get update && apt-get install -y cmake
WORKDIR /sd.cpp
COPY . .
RUN mkdir build && cd build && \
cmake .. -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DSD_SYCL=ON -DCMAKE_BUILD_TYPE=Release && \
cmake --build . --config Release -j$(nproc)
FROM intel/oneapi-basekit:${SYCL_VERSION}-devel-ubuntu24.04 AS runtime
COPY --from=build /sd.cpp/build/bin/sd /sd
ENTRYPOINT [ "/sd" ]

View File

@ -60,6 +60,14 @@ API and command-line option may change frequently.***
- Windows
- Android (via Termux, [Local Diffusion](https://github.com/rmatif/Local-Diffusion))
### TODO
- [ ] More sampling methods
- [ ] Make inference faster
- The current implementation of ggml_conv_2d is slow and has high memory usage
- [ ] Continuing to reduce memory usage (quantizing the weights of ggml_conv_2d)
- [ ] Implement Inpainting support
## Usage
For most users, you can download the built executable program from the latest [release](https://github.com/leejet/stable-diffusion.cpp/releases/latest).
@ -313,9 +321,6 @@ arguments:
-i, --end-img [IMAGE] path to the end image, required by flf2v
--control-image [IMAGE] path to image condition, control net
-r, --ref-image [PATH] reference image for Flux Kontext models (can be used multiple times)
--control-video [PATH] path to control video frames, It must be a directory path.
The video frames inside should be stored as images in lexicographical (character) order
For example, if the control video path is `frames`, the directory contain images such as 00.png, 01.png, 鈥?etc.
--increase-ref-index automatically increase the indices of references images based on the order they are listed (starting with 1).
-o, --output OUTPUT path to write result image to (default: ./output.png)
-p, --prompt [PROMPT] the prompt to render
@ -329,9 +334,9 @@ arguments:
--skip-layers LAYERS Layers to skip for SLG steps: (default: [7,8,9])
--skip-layer-start START SLG enabling point: (default: 0.01)
--skip-layer-end END SLG disabling point: (default: 0.2)
--scheduler {discrete, karras, exponential, ays, gits, smoothstep} Denoiser sigma scheduler (default: discrete)
--scheduler {discrete, karras, exponential, ays, gits} Denoiser sigma scheduler (default: discrete)
--sampling-method {euler, euler_a, heun, dpm2, dpm++2s_a, dpm++2m, dpm++2mv2, ipndm, ipndm_v, lcm, ddim_trailing, tcd}
sampling method (default: "euler" for Flux/SD3/Wan, "euler_a" otherwise)
sampling method (default: "euler_a")
--steps STEPS number of sample steps (default: 20)
--high-noise-cfg-scale SCALE (high noise) unconditional guidance scale: (default: 7.0)
--high-noise-img-cfg-scale SCALE (high noise) image guidance scale for inpaint or instruct-pix2pix models: (default: same as --cfg-scale)
@ -342,7 +347,7 @@ arguments:
--high-noise-skip-layers LAYERS (high noise) Layers to skip for SLG steps: (default: [7,8,9])
--high-noise-skip-layer-start (high noise) SLG enabling point: (default: 0.01)
--high-noise-skip-layer-end END (high noise) SLG disabling point: (default: 0.2)
--high-noise-scheduler {discrete, karras, exponential, ays, gits, smoothstep} Denoiser sigma scheduler (default: discrete)
--high-noise-scheduler {discrete, karras, exponential, ays, gits} Denoiser sigma scheduler (default: discrete)
--high-noise-sampling-method {euler, euler_a, heun, dpm2, dpm++2s_a, dpm++2m, dpm++2mv2, ipndm, ipndm_v, lcm, ddim_trailing, tcd}
(high noise) sampling method (default: "euler_a")
--high-noise-steps STEPS (high noise) number of sample steps (default: -1 = auto)
@ -359,9 +364,6 @@ arguments:
--clip-skip N ignore last_dot_pos layers of CLIP network; 1 ignores none, 2 ignores one layer (default: -1)
<= 0 represents unspecified, will be 1 for SD1.x, 2 for SD2.x
--vae-tiling process vae in tiles to reduce memory usage
--vae-tile-size [X]x[Y] tile size for vae tiling (default: 32x32)
--vae-relative-tile-size [X]x[Y] relative tile size for vae tiling, in fraction of image size if < 1, in number of tiles per dim if >=1 (overrides --vae-tile-size)
--vae-tile-overlap OVERLAP tile overlap for vae tiling, in fraction of tile size (default: 0.5)
--vae-on-cpu keep vae in cpu (for low vram)
--clip-on-cpu keep clip in cpu (for low vram)
--diffusion-fa use flash attention in the diffusion model (for low vram)
@ -382,7 +384,6 @@ arguments:
--moe-boundary BOUNDARY timestep boundary for Wan2.2 MoE model. (default: 0.875)
only enabled if `--high-noise-steps` is set to -1
--flow-shift SHIFT shift value for Flow models like SD3.x or WAN (default: auto)
--vace-strength wan vace strength
-v, --verbose print extra info
```
@ -392,9 +393,9 @@ arguments:
./bin/sd -m ../models/sd-v1-4.ckpt -p "a lovely cat"
# ./bin/sd -m ../models/v1-5-pruned-emaonly.safetensors -p "a lovely cat"
# ./bin/sd -m ../models/sd_xl_base_1.0.safetensors --vae ../models/sdxl_vae-fp16-fix.safetensors -H 1024 -W 1024 -p "a lovely cat" -v
# ./bin/sd -m ../models/sd3_medium_incl_clips_t5xxlfp16.safetensors -H 1024 -W 1024 -p 'a lovely cat holding a sign says \"Stable Diffusion CPP\"' --cfg-scale 4.5 --sampling-method euler -v --clip-on-cpu
# ./bin/sd --diffusion-model ../models/flux1-dev-q3_k.gguf --vae ../models/ae.sft --clip_l ../models/clip_l.safetensors --t5xxl ../models/t5xxl_fp16.safetensors -p "a lovely cat holding a sign says 'flux.cpp'" --cfg-scale 1.0 --sampling-method euler -v --clip-on-cpu
# ./bin/sd -m ..\models\sd3.5_large.safetensors --clip_l ..\models\clip_l.safetensors --clip_g ..\models\clip_g.safetensors --t5xxl ..\models\t5xxl_fp16.safetensors -H 1024 -W 1024 -p 'a lovely cat holding a sign says \"Stable diffusion 3.5 Large\"' --cfg-scale 4.5 --sampling-method euler -v --clip-on-cpu
# ./bin/sd -m ../models/sd3_medium_incl_clips_t5xxlfp16.safetensors -H 1024 -W 1024 -p 'a lovely cat holding a sign says \"Stable Diffusion CPP\"' --cfg-scale 4.5 --sampling-method euler -v
# ./bin/sd --diffusion-model ../models/flux1-dev-q3_k.gguf --vae ../models/ae.sft --clip_l ../models/clip_l.safetensors --t5xxl ../models/t5xxl_fp16.safetensors -p "a lovely cat holding a sign says 'flux.cpp'" --cfg-scale 1.0 --sampling-method euler -v
# ./bin/sd -m ..\models\sd3.5_large.safetensors --clip_l ..\models\clip_l.safetensors --clip_g ..\models\clip_g.safetensors --t5xxl ..\models\t5xxl_fp16.safetensors -H 1024 -W 1024 -p 'a lovely cat holding a sign says \"Stable diffusion 3.5 Large\"' --cfg-scale 4.5 --sampling-method euler -v
```
Using formats of different precisions will yield results of varying quality.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

View File

@ -548,15 +548,9 @@ protected:
int64_t embed_dim;
int64_t vocab_size;
int64_t num_positions;
bool force_clip_f32;
void init_params(struct ggml_context* ctx, const String2GGMLType& tensor_types = {}, const std::string prefix = "") {
enum ggml_type token_wtype = GGML_TYPE_F32;
if (!force_clip_f32) {
auto tensor_type = tensor_types.find(prefix + "token_embedding.weight");
if (tensor_type != tensor_types.end())
token_wtype = tensor_type->second;
}
enum ggml_type token_wtype = GGML_TYPE_F32;
enum ggml_type position_wtype = GGML_TYPE_F32;
params["token_embedding.weight"] = ggml_new_tensor_2d(ctx, token_wtype, embed_dim, vocab_size);
@ -566,12 +560,10 @@ protected:
public:
CLIPEmbeddings(int64_t embed_dim,
int64_t vocab_size = 49408,
int64_t num_positions = 77,
bool force_clip_f32 = false)
int64_t num_positions = 77)
: embed_dim(embed_dim),
vocab_size(vocab_size),
num_positions(num_positions),
force_clip_f32(force_clip_f32) {
num_positions(num_positions) {
}
struct ggml_tensor* get_token_embed_weight() {
@ -686,11 +678,12 @@ public:
int32_t n_head = 12;
int32_t n_layer = 12; // num_hidden_layers
int32_t projection_dim = 1280; // only for OPEN_CLIP_VIT_BIGG_14
int32_t clip_skip = -1;
bool with_final_ln = true;
CLIPTextModel(CLIPVersion version = OPENAI_CLIP_VIT_L_14,
bool with_final_ln = true,
bool force_clip_f32 = false)
int clip_skip_value = -1)
: version(version), with_final_ln(with_final_ln) {
if (version == OPEN_CLIP_VIT_H_14) {
hidden_size = 1024;
@ -703,12 +696,20 @@ public:
n_head = 20;
n_layer = 32;
}
set_clip_skip(clip_skip_value);
blocks["embeddings"] = std::shared_ptr<GGMLBlock>(new CLIPEmbeddings(hidden_size, vocab_size, n_token, force_clip_f32));
blocks["embeddings"] = std::shared_ptr<GGMLBlock>(new CLIPEmbeddings(hidden_size, vocab_size, n_token));
blocks["encoder"] = std::shared_ptr<GGMLBlock>(new CLIPEncoder(n_layer, hidden_size, n_head, intermediate_size));
blocks["final_layer_norm"] = std::shared_ptr<GGMLBlock>(new LayerNorm(hidden_size));
}
void set_clip_skip(int skip) {
if (skip <= 0) {
skip = -1;
}
clip_skip = skip;
}
struct ggml_tensor* get_token_embed_weight() {
auto embeddings = std::dynamic_pointer_cast<CLIPEmbeddings>(blocks["embeddings"]);
return embeddings->get_token_embed_weight();
@ -719,8 +720,7 @@ public:
struct ggml_tensor* input_ids,
struct ggml_tensor* tkn_embeddings,
size_t max_token_idx = 0,
bool return_pooled = false,
int clip_skip = -1) {
bool return_pooled = false) {
// input_ids: [N, n_token]
auto embeddings = std::dynamic_pointer_cast<CLIPEmbeddings>(blocks["embeddings"]);
auto encoder = std::dynamic_pointer_cast<CLIPEncoder>(blocks["encoder"]);
@ -889,8 +889,8 @@ struct CLIPTextModelRunner : public GGMLRunner {
const std::string prefix,
CLIPVersion version = OPENAI_CLIP_VIT_L_14,
bool with_final_ln = true,
bool force_clip_f32 = false)
: GGMLRunner(backend, offload_params_to_cpu), model(version, with_final_ln, force_clip_f32) {
int clip_skip_value = -1)
: GGMLRunner(backend, offload_params_to_cpu), model(version, with_final_ln, clip_skip_value) {
model.init(params_ctx, tensor_types, prefix);
}
@ -898,6 +898,10 @@ struct CLIPTextModelRunner : public GGMLRunner {
return "clip";
}
void set_clip_skip(int clip_skip) {
model.set_clip_skip(clip_skip);
}
void get_param_tensors(std::map<std::string, struct ggml_tensor*>& tensors, const std::string prefix) {
model.get_param_tensors(tensors, prefix);
}
@ -907,8 +911,7 @@ struct CLIPTextModelRunner : public GGMLRunner {
struct ggml_tensor* input_ids,
struct ggml_tensor* embeddings,
size_t max_token_idx = 0,
bool return_pooled = false,
int clip_skip = -1) {
bool return_pooled = false) {
size_t N = input_ids->ne[1];
size_t n_token = input_ids->ne[0];
if (input_ids->ne[0] > model.n_token) {
@ -916,15 +919,14 @@ struct CLIPTextModelRunner : public GGMLRunner {
input_ids = ggml_reshape_2d(ctx, input_ids, model.n_token, input_ids->ne[0] / model.n_token);
}
return model.forward(ctx, backend, input_ids, embeddings, max_token_idx, return_pooled, clip_skip);
return model.forward(ctx, backend, input_ids, embeddings, max_token_idx, return_pooled);
}
struct ggml_cgraph* build_graph(struct ggml_tensor* input_ids,
int num_custom_embeddings = 0,
void* custom_embeddings_data = NULL,
size_t max_token_idx = 0,
bool return_pooled = false,
int clip_skip = -1) {
bool return_pooled = false) {
struct ggml_cgraph* gf = ggml_new_graph(compute_ctx);
input_ids = to_backend(input_ids);
@ -943,7 +945,7 @@ struct CLIPTextModelRunner : public GGMLRunner {
embeddings = ggml_concat(compute_ctx, token_embed_weight, custom_embeddings, 1);
}
struct ggml_tensor* hidden_states = forward(compute_ctx, runtime_backend, input_ids, embeddings, max_token_idx, return_pooled, clip_skip);
struct ggml_tensor* hidden_states = forward(compute_ctx, runtime_backend, input_ids, embeddings, max_token_idx, return_pooled);
ggml_build_forward_expand(gf, hidden_states);
@ -956,11 +958,10 @@ struct CLIPTextModelRunner : public GGMLRunner {
void* custom_embeddings_data,
size_t max_token_idx,
bool return_pooled,
int clip_skip,
ggml_tensor** output,
ggml_context* output_ctx = NULL) {
auto get_graph = [&]() -> struct ggml_cgraph* {
return build_graph(input_ids, num_custom_embeddings, custom_embeddings_data, max_token_idx, return_pooled, clip_skip);
return build_graph(input_ids, num_custom_embeddings, custom_embeddings_data, max_token_idx, return_pooled);
};
GGMLRunner::compute(get_graph, n_threads, true, output, output_ctx);
}

View File

@ -61,16 +61,30 @@ struct FrozenCLIPEmbedderWithCustomWords : public Conditioner {
const String2GGMLType& tensor_types,
const std::string& embd_dir,
SDVersion version = VERSION_SD1,
PMVersion pv = PM_VERSION_1)
PMVersion pv = PM_VERSION_1,
int clip_skip = -1)
: version(version), pm_version(pv), tokenizer(sd_version_is_sd2(version) ? 0 : 49407), embd_dir(embd_dir) {
bool force_clip_f32 = embd_dir.size() > 0;
if (sd_version_is_sd1(version)) {
text_model = std::make_shared<CLIPTextModelRunner>(backend, offload_params_to_cpu, tensor_types, "cond_stage_model.transformer.text_model", OPENAI_CLIP_VIT_L_14, true, force_clip_f32);
text_model = std::make_shared<CLIPTextModelRunner>(backend, offload_params_to_cpu, tensor_types, "cond_stage_model.transformer.text_model", OPENAI_CLIP_VIT_L_14);
} else if (sd_version_is_sd2(version)) {
text_model = std::make_shared<CLIPTextModelRunner>(backend, offload_params_to_cpu, tensor_types, "cond_stage_model.transformer.text_model", OPEN_CLIP_VIT_H_14, true, force_clip_f32);
text_model = std::make_shared<CLIPTextModelRunner>(backend, offload_params_to_cpu, tensor_types, "cond_stage_model.transformer.text_model", OPEN_CLIP_VIT_H_14);
} else if (sd_version_is_sdxl(version)) {
text_model = std::make_shared<CLIPTextModelRunner>(backend, offload_params_to_cpu, tensor_types, "cond_stage_model.transformer.text_model", OPENAI_CLIP_VIT_L_14, false, force_clip_f32);
text_model2 = std::make_shared<CLIPTextModelRunner>(backend, offload_params_to_cpu, tensor_types, "cond_stage_model.1.transformer.text_model", OPEN_CLIP_VIT_BIGG_14, false, force_clip_f32);
text_model = std::make_shared<CLIPTextModelRunner>(backend, offload_params_to_cpu, tensor_types, "cond_stage_model.transformer.text_model", OPENAI_CLIP_VIT_L_14, false);
text_model2 = std::make_shared<CLIPTextModelRunner>(backend, offload_params_to_cpu, tensor_types, "cond_stage_model.1.transformer.text_model", OPEN_CLIP_VIT_BIGG_14, false);
}
set_clip_skip(clip_skip);
}
void set_clip_skip(int clip_skip) {
if (clip_skip <= 0) {
clip_skip = 1;
if (sd_version_is_sd2(version) || sd_version_is_sdxl(version)) {
clip_skip = 2;
}
}
text_model->set_clip_skip(clip_skip);
if (sd_version_is_sdxl(version)) {
text_model2->set_clip_skip(clip_skip);
}
}
@ -115,7 +129,7 @@ struct FrozenCLIPEmbedderWithCustomWords : public Conditioner {
return true;
}
struct ggml_init_params params;
params.mem_size = 100 * 1024 * 1024; // max for custom embeddings 100 MB
params.mem_size = 10 * 1024 * 1024; // max for custom embeddings 10 MB
params.mem_buffer = NULL;
params.no_alloc = false;
struct ggml_context* embd_ctx = ggml_init(params);
@ -398,6 +412,7 @@ struct FrozenCLIPEmbedderWithCustomWords : public Conditioner {
int height,
int adm_in_channels = -1,
bool zero_out_masked = false) {
set_clip_skip(clip_skip);
int64_t t0 = ggml_time_ms();
struct ggml_tensor* hidden_states = NULL; // [N, n_token, hidden_size]
struct ggml_tensor* chunk_hidden_states = NULL; // [n_token, hidden_size] or [n_token, hidden_size + hidden_size2]
@ -406,10 +421,6 @@ struct FrozenCLIPEmbedderWithCustomWords : public Conditioner {
struct ggml_tensor* pooled = NULL;
std::vector<float> hidden_states_vec;
if (clip_skip <= 0) {
clip_skip = (sd_version_is_sd2(version) || sd_version_is_sdxl(version)) ? 2 : 1;
}
size_t chunk_len = 77;
size_t chunk_count = tokens.size() / chunk_len;
for (int chunk_idx = 0; chunk_idx < chunk_count; chunk_idx++) {
@ -444,7 +455,6 @@ struct FrozenCLIPEmbedderWithCustomWords : public Conditioner {
token_embed_custom.data(),
max_token_idx,
false,
clip_skip,
&chunk_hidden_states1,
work_ctx);
if (sd_version_is_sdxl(version)) {
@ -454,7 +464,6 @@ struct FrozenCLIPEmbedderWithCustomWords : public Conditioner {
token_embed_custom.data(),
max_token_idx,
false,
clip_skip,
&chunk_hidden_states2, work_ctx);
// concat
chunk_hidden_states = ggml_tensor_concat(work_ctx, chunk_hidden_states1, chunk_hidden_states2, 0);
@ -466,7 +475,6 @@ struct FrozenCLIPEmbedderWithCustomWords : public Conditioner {
token_embed_custom.data(),
max_token_idx,
true,
clip_skip,
&pooled,
work_ctx);
}
@ -661,11 +669,21 @@ struct SD3CLIPEmbedder : public Conditioner {
SD3CLIPEmbedder(ggml_backend_t backend,
bool offload_params_to_cpu,
const String2GGMLType& tensor_types = {})
const String2GGMLType& tensor_types = {},
int clip_skip = -1)
: clip_g_tokenizer(0) {
clip_l = std::make_shared<CLIPTextModelRunner>(backend, offload_params_to_cpu, tensor_types, "text_encoders.clip_l.transformer.text_model", OPENAI_CLIP_VIT_L_14, false);
clip_g = std::make_shared<CLIPTextModelRunner>(backend, offload_params_to_cpu, tensor_types, "text_encoders.clip_g.transformer.text_model", OPEN_CLIP_VIT_BIGG_14, false);
t5 = std::make_shared<T5Runner>(backend, offload_params_to_cpu, tensor_types, "text_encoders.t5xxl.transformer");
set_clip_skip(clip_skip);
}
void set_clip_skip(int clip_skip) {
if (clip_skip <= 0) {
clip_skip = 2;
}
clip_l->set_clip_skip(clip_skip);
clip_g->set_clip_skip(clip_skip);
}
void get_param_tensors(std::map<std::string, struct ggml_tensor*>& tensors) {
@ -762,6 +780,7 @@ struct SD3CLIPEmbedder : public Conditioner {
std::vector<std::pair<std::vector<int>, std::vector<float>>> token_and_weights,
int clip_skip,
bool zero_out_masked = false) {
set_clip_skip(clip_skip);
auto& clip_l_tokens = token_and_weights[0].first;
auto& clip_l_weights = token_and_weights[0].second;
auto& clip_g_tokens = token_and_weights[1].first;
@ -769,10 +788,6 @@ struct SD3CLIPEmbedder : public Conditioner {
auto& t5_tokens = token_and_weights[2].first;
auto& t5_weights = token_and_weights[2].second;
if (clip_skip <= 0) {
clip_skip = 2;
}
int64_t t0 = ggml_time_ms();
struct ggml_tensor* hidden_states = NULL; // [N, n_token*2, 4096]
struct ggml_tensor* chunk_hidden_states = NULL; // [n_token*2, 4096]
@ -803,7 +818,6 @@ struct SD3CLIPEmbedder : public Conditioner {
NULL,
max_token_idx,
false,
clip_skip,
&chunk_hidden_states_l,
work_ctx);
{
@ -831,7 +845,6 @@ struct SD3CLIPEmbedder : public Conditioner {
NULL,
max_token_idx,
true,
clip_skip,
&pooled_l,
work_ctx);
}
@ -853,7 +866,6 @@ struct SD3CLIPEmbedder : public Conditioner {
NULL,
max_token_idx,
false,
clip_skip,
&chunk_hidden_states_g,
work_ctx);
@ -882,7 +894,6 @@ struct SD3CLIPEmbedder : public Conditioner {
NULL,
max_token_idx,
true,
clip_skip,
&pooled_g,
work_ctx);
}
@ -1006,9 +1017,18 @@ struct FluxCLIPEmbedder : public Conditioner {
FluxCLIPEmbedder(ggml_backend_t backend,
bool offload_params_to_cpu,
const String2GGMLType& tensor_types = {}) {
const String2GGMLType& tensor_types = {},
int clip_skip = -1) {
clip_l = std::make_shared<CLIPTextModelRunner>(backend, offload_params_to_cpu, tensor_types, "text_encoders.clip_l.transformer.text_model", OPENAI_CLIP_VIT_L_14, true);
t5 = std::make_shared<T5Runner>(backend, offload_params_to_cpu, tensor_types, "text_encoders.t5xxl.transformer");
set_clip_skip(clip_skip);
}
void set_clip_skip(int clip_skip) {
if (clip_skip <= 0) {
clip_skip = 2;
}
clip_l->set_clip_skip(clip_skip);
}
void get_param_tensors(std::map<std::string, struct ggml_tensor*>& tensors) {
@ -1089,15 +1109,12 @@ struct FluxCLIPEmbedder : public Conditioner {
std::vector<std::pair<std::vector<int>, std::vector<float>>> token_and_weights,
int clip_skip,
bool zero_out_masked = false) {
set_clip_skip(clip_skip);
auto& clip_l_tokens = token_and_weights[0].first;
auto& clip_l_weights = token_and_weights[0].second;
auto& t5_tokens = token_and_weights[1].first;
auto& t5_weights = token_and_weights[1].second;
if (clip_skip <= 0) {
clip_skip = 2;
}
int64_t t0 = ggml_time_ms();
struct ggml_tensor* hidden_states = NULL; // [N, n_token, 4096]
struct ggml_tensor* chunk_hidden_states = NULL; // [n_token, 4096]
@ -1126,7 +1143,6 @@ struct FluxCLIPEmbedder : public Conditioner {
NULL,
max_token_idx,
true,
clip_skip,
&pooled,
work_ctx);
}
@ -1225,6 +1241,7 @@ struct T5CLIPEmbedder : public Conditioner {
T5CLIPEmbedder(ggml_backend_t backend,
bool offload_params_to_cpu,
const String2GGMLType& tensor_types = {},
int clip_skip = -1,
bool use_mask = false,
int mask_pad = 1,
bool is_umt5 = false)
@ -1232,6 +1249,9 @@ struct T5CLIPEmbedder : public Conditioner {
t5 = std::make_shared<T5Runner>(backend, offload_params_to_cpu, tensor_types, "text_encoders.t5xxl.transformer", is_umt5);
}
void set_clip_skip(int clip_skip) {
}
void get_param_tensors(std::map<std::string, struct ggml_tensor*>& tensors) {
t5->get_param_tensors(tensors, "text_encoders.t5xxl.transformer");
}

View File

@ -251,35 +251,6 @@ struct KarrasSchedule : SigmaSchedule {
}
};
// Close to Beta Schedule, but increadably simple in code.
struct SmoothStepSchedule : SigmaSchedule {
static constexpr float smoothstep(float x) {
return x * x * (3.0f - 2.0f * x);
}
std::vector<float> get_sigmas(uint32_t n, float /*sigma_min*/, float /*sigma_max*/, t_to_sigma_t t_to_sigma) override {
std::vector<float> result;
result.reserve(n + 1);
const int t_max = TIMESTEPS - 1;
if (n == 0) {
return result;
} else if (n == 1) {
result.push_back(t_to_sigma((float)t_max));
result.push_back(0.f);
return result;
}
for (uint32_t i = 0; i < n; i++) {
float u = 1.f - float(i) / float(n);
result.push_back(t_to_sigma(std::round(smoothstep(u) * t_max)));
}
result.push_back(0.f);
return result;
}
};
struct Denoiser {
std::shared_ptr<SigmaSchedule> scheduler = std::make_shared<DiscreteSchedule>();
virtual float sigma_min() = 0;

View File

@ -97,9 +97,8 @@ struct MMDiTModel : public DiffusionModel {
MMDiTModel(ggml_backend_t backend,
bool offload_params_to_cpu,
bool flash_attn = false,
const String2GGMLType& tensor_types = {})
: mmdit(backend, offload_params_to_cpu, flash_attn, tensor_types, "model.diffusion_model") {
: mmdit(backend, offload_params_to_cpu, tensor_types, "model.diffusion_model") {
}
std::string get_desc() {

View File

@ -24,7 +24,7 @@ You can download the preconverted gguf weights from [silveroxides/Chroma-GGUF](h
For example:
```
.\bin\Release\sd.exe --diffusion-model ..\models\chroma-unlocked-v40-q8_0.gguf --vae ..\models\ae.sft --t5xxl ..\models\t5xxl_fp16.safetensors -p "a lovely cat holding a sign says 'chroma.cpp'" --cfg-scale 4.0 --sampling-method euler -v --chroma-disable-dit-mask --clip-on-cpu
.\bin\Release\sd.exe --diffusion-model ..\models\chroma-unlocked-v40-q8_0.gguf --vae ..\models\ae.sft --t5xxl ..\models\t5xxl_fp16.safetensors -p "a lovely cat holding a sign says 'chroma.cpp'" --cfg-scale 4.0 --sampling-method euler -v --chroma-disable-dit-mask
```
![](../assets/flux/chroma_v40.png)

View File

@ -28,7 +28,7 @@ Using fp16 will lead to overflow, but ggml's support for bf16 is not yet fully d
For example:
```
.\bin\Release\sd.exe --diffusion-model ..\models\flux1-dev-q8_0.gguf --vae ..\models\ae.sft --clip_l ..\models\clip_l.safetensors --t5xxl ..\models\t5xxl_fp16.safetensors -p "a lovely cat holding a sign says 'flux.cpp'" --cfg-scale 1.0 --sampling-method euler -v --clip-on-cpu
.\bin\Release\sd.exe --diffusion-model ..\models\flux1-dev-q8_0.gguf --vae ..\models\ae.sft --clip_l ..\models\clip_l.safetensors --t5xxl ..\models\t5xxl_fp16.safetensors -p "a lovely cat holding a sign says 'flux.cpp'" --cfg-scale 1.0 --sampling-method euler -v
```
Using formats of different precisions will yield results of varying quality.
@ -44,7 +44,7 @@ Using formats of different precisions will yield results of varying quality.
```
.\bin\Release\sd.exe --diffusion-model ..\models\flux1-schnell-q8_0.gguf --vae ..\models\ae.sft --clip_l ..\models\clip_l.safetensors --t5xxl ..\models\t5xxl_fp16.safetensors -p "a lovely cat holding a sign says 'flux.cpp'" --cfg-scale 1.0 --sampling-method euler -v --steps 4 --clip-on-cpu
.\bin\Release\sd.exe --diffusion-model ..\models\flux1-schnell-q8_0.gguf --vae ..\models\ae.sft --clip_l ..\models\clip_l.safetensors --t5xxl ..\models\t5xxl_fp16.safetensors -p "a lovely cat holding a sign says 'flux.cpp'" --cfg-scale 1.0 --sampling-method euler -v --steps 4
```
| q8_0 |
@ -60,7 +60,7 @@ Since many flux LoRA training libraries have used various LoRA naming formats, i
- LoRA model from https://huggingface.co/XLabs-AI/flux-lora-collection/tree/main (using comfy converted version!!!)
```
.\bin\Release\sd.exe --diffusion-model ..\models\flux1-dev-q8_0.gguf --vae ...\models\ae.sft --clip_l ..\models\clip_l.safetensors --t5xxl ..\models\t5xxl_fp16.safetensors -p "a lovely cat holding a sign says 'flux.cpp'<lora:realism_lora_comfy_converted:1>" --cfg-scale 1.0 --sampling-method euler -v --lora-model-dir ../models --clip-on-cpu
.\bin\Release\sd.exe --diffusion-model ..\models\flux1-dev-q8_0.gguf --vae ...\models\ae.sft --clip_l ..\models\clip_l.safetensors --t5xxl ..\models\t5xxl_fp16.safetensors -p "a lovely cat holding a sign says 'flux.cpp'<lora:realism_lora_comfy_converted:1>" --cfg-scale 1.0 --sampling-method euler -v --lora-model-dir ../models
```
![output](../assets/flux/flux1-dev-q8_0%20with%20lora.png)

View File

@ -27,7 +27,7 @@ You can download the preconverted gguf weights from [FLUX.1-Kontext-dev-GGUF](ht
For example:
```
.\bin\Release\sd.exe -r .\flux1-dev-q8_0.png --diffusion-model ..\models\flux1-kontext-dev-q8_0.gguf --vae ..\models\ae.sft --clip_l ..\models\clip_l.safetensors --t5xxl ..\models\t5xxl_fp16.safetensors -p "change 'flux.cpp' to 'kontext.cpp'" --cfg-scale 1.0 --sampling-method euler -v --clip-on-cpu
.\bin\Release\sd.exe -r .\flux1-dev-q8_0.png --diffusion-model ..\models\flux1-kontext-dev-q8_0.gguf --vae ..\models\ae.sft --clip_l ..\models\clip_l.safetensors --t5xxl ..\models\t5xxl_fp16.safetensors -p "change 'flux.cpp' to 'kontext.cpp'" --cfg-scale 1.0 --sampling-method euler -v
```

View File

@ -14,7 +14,7 @@
For example:
```
.\bin\Release\sd.exe -m ..\models\sd3.5_large.safetensors --clip_l ..\models\clip_l.safetensors --clip_g ..\models\clip_g.safetensors --t5xxl ..\models\t5xxl_fp16.safetensors -H 1024 -W 1024 -p 'a lovely cat holding a sign says \"Stable diffusion 3.5 Large\"' --cfg-scale 4.5 --sampling-method euler -v --clip-on-cpu
.\bin\Release\sd.exe -m ..\models\sd3.5_large.safetensors --clip_l ..\models\clip_l.safetensors --clip_g ..\models\clip_g.safetensors --t5xxl ..\models\t5xxl_fp16.safetensors -H 1024 -W 1024 -p 'a lovely cat holding a sign says \"Stable diffusion 3.5 Large\"' --cfg-scale 4.5 --sampling-method euler -v
```
![](../assets/sd3.5_large.png)

View File

@ -18,12 +18,6 @@
- Wan2.1 FLF2V 14B 720P
- safetensors: https://huggingface.co/Comfy-Org/Wan_2.1_ComfyUI_repackaged/tree/main/split_files/diffusion_models
- gguf: https://huggingface.co/city96/Wan2.1-FLF2V-14B-720P-gguf/tree/main
- Wan2.1 VACE 1.3B
- safetensors: https://huggingface.co/Comfy-Org/Wan_2.1_ComfyUI_repackaged/tree/main/split_files/diffusion_models
- gguf: https://huggingface.co/calcuis/wan-1.3b-gguf/tree/main
- Wan2.1 VACE 14B
- safetensors: https://huggingface.co/Comfy-Org/Wan_2.1_ComfyUI_repackaged/tree/main/split_files/diffusion_models
- gguf: https://huggingface.co/QuantStack/Wan2.1_14B_VACE-GGUF/tree/main
- Wan2.2
- Wan2.2 TI2V 5B
- safetensors: https://huggingface.co/Comfy-Org/Wan_2.2_ComfyUI_Repackaged/tree/main/split_files/diffusion_models
@ -143,62 +137,3 @@
```
<video src=../assets/wan/Wan2.2_14B_flf2v.mp4 controls="controls" muted="muted" type="video/mp4"></video>
### Wan2.1 VACE 1.3B
#### T2V
```
.\bin\Release\sd.exe -M vid_gen --diffusion-model ..\..\ComfyUI\models\diffusion_models\wan2.1-vace-1.3b-q8_0.gguf --vae ..\..\ComfyUI\models\vae\wan_2.1_vae.safetensors --t5xxl ..\..\ComfyUI\models\text_encoders\umt5-xxl-encoder-Q8_0.gguf -p "a lovely cat" --cfg-scale 6.0 --sampling-method euler -v -n "色调艳丽过曝静态细节模糊不清字幕风格作品画作画面静止整体发灰最差质量低质量JPEG压缩残留丑陋的残缺的多余的手指画得不好的手部画得不好的脸部 畸形的,毁容的,形态畸形的肢体,手指融合,静止不动的画面,杂乱的背景,三条腿,背景人很多,倒着走" -W 832 -H 480 --diffusion-fa --video-frames 1 --offload-to-cpu
```
<video src=../assets/wan/Wan2.1_1.3B_vace_t2v.mp4 controls="controls" muted="muted" type="video/mp4"></video>
#### R2V
```
.\bin\Release\sd.exe -M vid_gen --diffusion-model ..\..\ComfyUI\models\diffusion_models\wan2.1-vace-1.3b-q8_0.gguf --vae ..\..\ComfyUI\models\vae\wan_2.1_vae.safetensors --t5xxl ..\..\ComfyUI\models\text_encoders\umt5-xxl-encoder-Q8_0.gguf -p "a lovely cat" --cfg-scale 6.0 --sampling-method euler -v -n "色调艳丽过曝静态细节模糊不清字幕风格作品画作画面静止整体发灰最差质量低质量JPEG压缩残留丑陋的残缺的多余的手指画得不好的手部画得不好的脸部 畸形的,毁容的,形态畸形的肢体,手指融合,静止不动的画面,杂乱的背景,三条腿,背景人很多,倒着走" -W 832 -H 480 --diffusion-fa -i ..\assets\cat_with_sd_cpp_42.png --video-frames 33 --offload-to-cpu
```
<video src=../assets/wan/Wan2.1_1.3B_vace_r2v.mp4 controls="controls" muted="muted" type="video/mp4"></video>
#### V2V
```
mkdir post+depth
ffmpeg -i ..\..\ComfyUI\input\post+depth.mp4 -qscale:v 1 -vf fps=8 post+depth\frame_%04d.jpg
.\bin\Release\sd.exe -M vid_gen --diffusion-model ..\..\ComfyUI\models\diffusion_models\wan2.1-vace-1.3b-q8_0.gguf --vae ..\..\ComfyUI\models\vae\wan_2.1_vae.safetensors --t5xxl ..\..\ComfyUI\models\text_encoders\umt5-xxl-encoder-Q8_0.gguf -p "The girl is dancing in a sea of flowers, slowly moving her hands. There is a close - up shot of her upper body. The character is surrounded by other transparent glass flowers in the style of Nicoletta Ceccoli, creating a beautiful, surreal, and emotionally expressive movie scene with a white. transparent feel and a dreamyl atmosphere." --cfg-scale 6.0 --sampling-method euler -v -n "色调艳丽过曝静态细节模糊不清字幕风格作品画作画面静止整体发灰最差质量低质量JPEG压缩残留丑陋的残缺的多余的手指画得不好的手部画得不好的脸部 畸形的,毁容的,形态畸形的肢体,手指融合,静止不动的画面,杂乱的背景,三条腿,背景人很多,倒着走" -W 480 -H 832 --diffusion-fa -i ..\..\ComfyUI\input\dance_girl.jpg --control-video ./post+depth --video-frames 33 --offload-to-cpu
```
<video src=../assets/wan/Wan2.1_1.3B_vace_v2v.mp4 controls="controls" muted="muted" type="video/mp4"></video>
### Wan2.1 VACE 14B
#### T2V
```
.\bin\Release\sd.exe -M vid_gen --diffusion-model ..\..\ComfyUI\models\diffusion_models\Wan2.1_14B_VACE-Q8_0.gguf --vae ..\..\ComfyUI\models\vae\wan_2.1_vae.safetensors --t5xxl ..\..\ComfyUI\models\text_encoders\umt5-xxl-encoder-Q8_0.gguf -p "a lovely cat" --cfg-scale 6.0 --sampling-method euler -v -n "色调艳丽过曝静态细节模糊不清字幕风格作品画作画面静止整体发灰最差质量低质量JPEG压缩残留丑陋的残缺的多余的手指画得不好的手部画得不好的脸部 畸形的,毁容的,形态畸形的肢体,手指融合,静止不动的画面,杂乱的背景,三条腿,背景人很多,倒着走" -W 832 -H 480 --diffusion-fa --video-frames 33 --offload-to-cpu
```
<video src=../assets/wan/Wan2.1_14B_vace_t2v.mp4 controls="controls" muted="muted" type="video/mp4"></video>
#### R2V
```
.\bin\Release\sd.exe -M vid_gen --diffusion-model ..\..\ComfyUI\models\diffusion_models\Wan2.1_14B_VACE-Q8_0.gguf --vae ..\..\ComfyUI\models\vae\wan_2.1_vae.safetensors --t5xxl ..\..\ComfyUI\models\text_encoders\umt5-xxl-encoder-Q8_0.gguf -p "a lovely cat" --cfg-scale 6.0 --sampling-method euler -v -n "色调艳丽过曝静态细节模糊不清字幕风格作品画作画面静止整体发灰最差质量低质量JPEG压缩残留丑陋的残缺的多余的手指画得不好的手部画得不好的脸部 畸形的,毁容的,形态畸形的肢体,手指融合,静止不动的画面,杂乱的背景,三条腿,背景人很多,倒着走" -W 832 -H 480 --diffusion-fa -i ..\assets\cat_with_sd_cpp_42.png --video-frames 33 --offload-to-cpu
```
<video src=../assets/wan/Wan2.1_14B_vace_r2v.mp4 controls="controls" muted="muted" type="video/mp4"></video>
#### V2V
```
.\bin\Release\sd.exe -M vid_gen --diffusion-model ..\..\ComfyUI\models\diffusion_models\Wan2.1_14B_VACE-Q8_0.gguf --vae ..\..\ComfyUI\models\vae\wan_2.1_vae.safetensors --t5xxl ..\..\ComfyUI\models\text_encoders\umt5-xxl-encoder-Q8_0.gguf -p "The girl is dancing in a sea of flowers, slowly moving her hands. There is a close - up shot of her upper body. The character is surrounded by other transparent glass flowers in the style of Nicoletta Ceccoli, creating a beautiful, surreal, and emotionally expressive movie scene with a white. transparent feel and a dreamyl atmosphere." --cfg-scale 6.0 --sampling-method euler -v -n "色调艳丽过曝静态细节模糊不清字幕风格作品画作画面静止整体发灰最差质量低质量JPEG压缩残留丑陋的残缺的多余的手指画得不好的手部画得不好的脸部 畸形的,毁容的,形态畸形的肢体,手指融合,静止不动的画面,杂乱的背景,三条腿,背景人很多,倒着走" -W 480 -H 832 --diffusion-fa -i ..\..\ComfyUI\input\dance_girl.jpg --control-video ./post+depth --video-frames 33 --offload-to-cpu
```
<video src=../assets/wan/Wan2.1_14B_vace_v2v.mp4 controls="controls" muted="muted" type="video/mp4"></video>

View File

@ -104,6 +104,7 @@ struct SDParams {
rng_type_t rng_type = CUDA_RNG;
int64_t seed = 42;
bool verbose = false;
bool vae_tiling = false;
bool offload_params_to_cpu = false;
bool control_net_cpu = false;
bool normalize_input = false;
@ -121,8 +122,6 @@ struct SDParams {
int chroma_t5_mask_pad = 1;
float flow_shift = INFINITY;
sd_tiling_params_t vae_tiling_params = {false, 0, 0, 0.5f, 0.0f, 0.0f};
SDParams() {
sd_sample_params_init(&sample_params);
sd_sample_params_init(&high_noise_sample_params);
@ -185,7 +184,7 @@ void print_params(SDParams params) {
printf(" rng: %s\n", sd_rng_type_name(params.rng_type));
printf(" seed: %zd\n", params.seed);
printf(" batch_count: %d\n", params.batch_count);
printf(" vae_tiling: %s\n", params.vae_tiling_params.enabled ? "true" : "false");
printf(" vae_tiling: %s\n", params.vae_tiling ? "true" : "false");
printf(" upscale_repeats: %d\n", params.upscale_repeats);
printf(" chroma_use_dit_mask: %s\n", params.chroma_use_dit_mask ? "true" : "false");
printf(" chroma_use_t5_mask: %s\n", params.chroma_use_t5_mask ? "true" : "false");
@ -231,7 +230,7 @@ void print_usage(int argc, const char* argv[]) {
printf(" -i, --end-img [IMAGE] path to the end image, required by flf2v\n");
printf(" --control-image [IMAGE] path to image condition, control net\n");
printf(" -r, --ref-image [PATH] reference image for Flux Kontext models (can be used multiple times) \n");
printf(" --control-video [PATH] path to control video frames, It must be a directory path.\n");
printf(" --control-video [PATH] path to control video frames, It must be a directory path.");
printf(" The video frames inside should be stored as images in lexicographical (character) order\n");
printf(" For example, if the control video path is `frames`, the directory contain images such as 00.png, 01.png, … etc.\n");
printf(" --increase-ref-index automatically increase the indices of references images based on the order they are listed (starting with 1).\n");
@ -247,9 +246,9 @@ void print_usage(int argc, const char* argv[]) {
printf(" --skip-layers LAYERS Layers to skip for SLG steps: (default: [7,8,9])\n");
printf(" --skip-layer-start START SLG enabling point: (default: 0.01)\n");
printf(" --skip-layer-end END SLG disabling point: (default: 0.2)\n");
printf(" --scheduler {discrete, karras, exponential, ays, gits, smoothstep} Denoiser sigma scheduler (default: discrete)\n");
printf(" --scheduler {discrete, karras, exponential, ays, gits} Denoiser sigma scheduler (default: discrete)\n");
printf(" --sampling-method {euler, euler_a, heun, dpm2, dpm++2s_a, dpm++2m, dpm++2mv2, ipndm, ipndm_v, lcm, ddim_trailing, tcd}\n");
printf(" sampling method (default: \"euler\" for Flux/SD3/Wan, \"euler_a\" otherwise)\n");
printf(" sampling method (default: \"euler_a\")\n");
printf(" --steps STEPS number of sample steps (default: 20)\n");
printf(" --high-noise-cfg-scale SCALE (high noise) unconditional guidance scale: (default: 7.0)\n");
printf(" --high-noise-img-cfg-scale SCALE (high noise) image guidance scale for inpaint or instruct-pix2pix models: (default: same as --cfg-scale)\n");
@ -260,7 +259,7 @@ void print_usage(int argc, const char* argv[]) {
printf(" --high-noise-skip-layers LAYERS (high noise) Layers to skip for SLG steps: (default: [7,8,9])\n");
printf(" --high-noise-skip-layer-start (high noise) SLG enabling point: (default: 0.01)\n");
printf(" --high-noise-skip-layer-end END (high noise) SLG disabling point: (default: 0.2)\n");
printf(" --high-noise-scheduler {discrete, karras, exponential, ays, gits, smoothstep} Denoiser sigma scheduler (default: discrete)\n");
printf(" --high-noise-scheduler {discrete, karras, exponential, ays, gits} Denoiser sigma scheduler (default: discrete)\n");
printf(" --high-noise-sampling-method {euler, euler_a, heun, dpm2, dpm++2s_a, dpm++2m, dpm++2mv2, ipndm, ipndm_v, lcm, ddim_trailing, tcd}\n");
printf(" (high noise) sampling method (default: \"euler_a\")\n");
printf(" --high-noise-steps STEPS (high noise) number of sample steps (default: -1 = auto)\n");
@ -277,9 +276,6 @@ void print_usage(int argc, const char* argv[]) {
printf(" --clip-skip N ignore last_dot_pos layers of CLIP network; 1 ignores none, 2 ignores one layer (default: -1)\n");
printf(" <= 0 represents unspecified, will be 1 for SD1.x, 2 for SD2.x\n");
printf(" --vae-tiling process vae in tiles to reduce memory usage\n");
printf(" --vae-tile-size [X]x[Y] tile size for vae tiling (default: 32x32)\n");
printf(" --vae-relative-tile-size [X]x[Y] relative tile size for vae tiling, in fraction of image size if < 1, in number of tiles per dim if >=1 (overrides --vae-tile-size)\n");
printf(" --vae-tile-overlap OVERLAP tile overlap for vae tiling, in fraction of tile size (default: 0.5)\n");
printf(" --vae-on-cpu keep vae in cpu (for low vram)\n");
printf(" --clip-on-cpu keep clip in cpu (for low vram)\n");
printf(" --diffusion-fa use flash attention in the diffusion model (for low vram)\n");
@ -499,6 +495,7 @@ void parse_args(int argc, const char** argv, SDParams& params) {
{"-o", "--output", "", &params.output_path},
{"-p", "--prompt", "", &params.prompt},
{"-n", "--negative-prompt", "", &params.negative_prompt},
{"", "--upscale-model", "", &params.esrgan_path},
};
@ -537,11 +534,10 @@ void parse_args(int argc, const char** argv, SDParams& params) {
{"", "--moe-boundary", "", &params.moe_boundary},
{"", "--flow-shift", "", &params.flow_shift},
{"", "--vace-strength", "", &params.vace_strength},
{"", "--vae-tile-overlap", "", &params.vae_tiling_params.target_overlap},
};
options.bool_options = {
{"", "--vae-tiling", "", true, &params.vae_tiling_params.enabled},
{"", "--vae-tiling", "", true, &params.vae_tiling},
{"", "--offload-to-cpu", "", true, &params.offload_params_to_cpu},
{"", "--control-net-cpu", "", true, &params.control_net_cpu},
{"", "--normalize-input", "", true, &params.normalize_input},
@ -741,52 +737,6 @@ void parse_args(int argc, const char** argv, SDParams& params) {
return 1;
};
auto on_tile_size_arg = [&](int argc, const char** argv, int index) {
if (++index >= argc) {
return -1;
}
std::string tile_size_str = argv[index];
size_t x_pos = tile_size_str.find('x');
try {
if (x_pos != std::string::npos) {
std::string tile_x_str = tile_size_str.substr(0, x_pos);
std::string tile_y_str = tile_size_str.substr(x_pos + 1);
params.vae_tiling_params.tile_size_x = std::stoi(tile_x_str);
params.vae_tiling_params.tile_size_y = std::stoi(tile_y_str);
} else {
params.vae_tiling_params.tile_size_x = params.vae_tiling_params.tile_size_y = std::stoi(tile_size_str);
}
} catch (const std::invalid_argument& e) {
return -1;
} catch (const std::out_of_range& e) {
return -1;
}
return 1;
};
auto on_relative_tile_size_arg = [&](int argc, const char** argv, int index) {
if (++index >= argc) {
return -1;
}
std::string rel_size_str = argv[index];
size_t x_pos = rel_size_str.find('x');
try {
if (x_pos != std::string::npos) {
std::string rel_x_str = rel_size_str.substr(0, x_pos);
std::string rel_y_str = rel_size_str.substr(x_pos + 1);
params.vae_tiling_params.rel_size_x = std::stof(rel_x_str);
params.vae_tiling_params.rel_size_y = std::stof(rel_y_str);
} else {
params.vae_tiling_params.rel_size_x = params.vae_tiling_params.rel_size_y = std::stof(rel_size_str);
}
} catch (const std::invalid_argument& e) {
return -1;
} catch (const std::out_of_range& e) {
return -1;
}
return 1;
};
options.manual_options = {
{"-M", "--mode", "", on_mode_arg},
{"", "--type", "", on_type_arg},
@ -800,8 +750,6 @@ void parse_args(int argc, const char** argv, SDParams& params) {
{"", "--high-noise-skip-layers", "", on_high_noise_skip_layers_arg},
{"-r", "--ref-image", "", on_ref_image_arg},
{"-h", "--help", "", on_help_arg},
{"", "--vae-tile-size", "", on_tile_size_arg},
{"", "--vae-relative-tile-size", "", on_relative_tile_size_arg},
};
if (!parse_options(argc, argv, options)) {
@ -1285,6 +1233,7 @@ int main(int argc, const char* argv[]) {
params.embedding_dir.c_str(),
params.stacked_id_embed_dir.c_str(),
vae_decode_only,
params.vae_tiling,
true,
params.n_threads,
params.wtype,
@ -1310,10 +1259,6 @@ int main(int argc, const char* argv[]) {
return 1;
}
if (params.sample_params.sample_method == SAMPLE_METHOD_DEFAULT) {
params.sample_params.sample_method = sd_get_default_sample_method(sd_ctx);
}
sd_image_t* results;
int num_results = 1;
if (params.mode == IMG_GEN) {
@ -1337,7 +1282,6 @@ int main(int argc, const char* argv[]) {
params.style_ratio,
params.normalize_input,
params.input_id_images_path.c_str(),
params.vae_tiling_params,
};
results = generate_image(sd_ctx, &img_gen_params);

View File

@ -56,25 +56,6 @@
#define __STATIC_INLINE__ static inline
#endif
__STATIC_INLINE__ void ggml_log_callback_default(ggml_log_level level, const char* text, void*) {
switch (level) {
case GGML_LOG_LEVEL_DEBUG:
LOG_DEBUG(text);
break;
case GGML_LOG_LEVEL_INFO:
LOG_INFO(text);
break;
case GGML_LOG_LEVEL_WARN:
LOG_WARN(text);
break;
case GGML_LOG_LEVEL_ERROR:
LOG_ERROR(text);
break;
default:
LOG_DEBUG(text);
}
}
static_assert(GGML_MAX_NAME >= 128, "GGML_MAX_NAME must be at least 128");
// n-mode trensor-matrix product
@ -143,6 +124,13 @@ __STATIC_INLINE__ struct ggml_tensor* ggml_kronecker(ggml_context* ctx, struct g
b);
}
__STATIC_INLINE__ void ggml_log_callback_default(ggml_log_level level, const char* text, void* user_data) {
(void)level;
(void)user_data;
fputs(text, stderr);
fflush(stderr);
}
__STATIC_INLINE__ void ggml_tensor_set_f32_randn(struct ggml_tensor* tensor, std::shared_ptr<RNG> rng) {
uint32_t n = (uint32_t)ggml_nelements(tensor);
std::vector<float> random_numbers = rng->randn(n);
@ -524,10 +512,7 @@ __STATIC_INLINE__ void ggml_merge_tensor_2d(struct ggml_tensor* input,
struct ggml_tensor* output,
int x,
int y,
int overlap_x,
int overlap_y,
int x_skip = 0,
int y_skip = 0) {
int overlap) {
int64_t width = input->ne[0];
int64_t height = input->ne[1];
int64_t channels = input->ne[2];
@ -536,17 +521,17 @@ __STATIC_INLINE__ void ggml_merge_tensor_2d(struct ggml_tensor* input,
int64_t img_height = output->ne[1];
GGML_ASSERT(input->type == GGML_TYPE_F32 && output->type == GGML_TYPE_F32);
for (int iy = y_skip; iy < height; iy++) {
for (int ix = x_skip; ix < width; ix++) {
for (int iy = 0; iy < height; iy++) {
for (int ix = 0; ix < width; ix++) {
for (int k = 0; k < channels; k++) {
float new_value = ggml_tensor_get_f32(input, ix, iy, k);
if (overlap_x > 0 || overlap_y > 0) { // blend colors in overlapped area
if (overlap > 0) { // blend colors in overlapped area
float old_value = ggml_tensor_get_f32(output, x + ix, y + iy, k);
const float x_f_0 = (overlap_x > 0 && x > 0) ? (ix - x_skip) / float(overlap_x) : 1;
const float x_f_1 = (overlap_x > 0 && x < (img_width - width)) ? (width - ix) / float(overlap_x) : 1;
const float y_f_0 = (overlap_y > 0 && y > 0) ? (iy - y_skip) / float(overlap_y) : 1;
const float y_f_1 = (overlap_y > 0 && y < (img_height - height)) ? (height - iy) / float(overlap_y) : 1;
const float x_f_0 = (x > 0) ? ix / float(overlap) : 1;
const float x_f_1 = (x < (img_width - width)) ? (width - ix) / float(overlap) : 1;
const float y_f_0 = (y > 0) ? iy / float(overlap) : 1;
const float y_f_1 = (y < (img_height - height)) ? (height - iy) / float(overlap) : 1;
const float x_f = std::min(std::min(x_f_0, x_f_1), 1.f);
const float y_f = std::min(std::min(y_f_0, y_f_1), 1.f);
@ -778,102 +763,22 @@ __STATIC_INLINE__ std::vector<struct ggml_tensor*> ggml_chunk(struct ggml_contex
typedef std::function<void(ggml_tensor*, ggml_tensor*, bool)> on_tile_process;
__STATIC_INLINE__ void sd_tiling_calc_tiles(int& num_tiles_dim,
float& tile_overlap_factor_dim,
int small_dim,
int tile_size,
const float tile_overlap_factor) {
int tile_overlap = (tile_size * tile_overlap_factor);
int non_tile_overlap = tile_size - tile_overlap;
num_tiles_dim = (small_dim - tile_overlap) / non_tile_overlap;
int overshoot_dim = ((num_tiles_dim + 1) * non_tile_overlap + tile_overlap) % small_dim;
if ((overshoot_dim != non_tile_overlap) && (overshoot_dim <= num_tiles_dim * (tile_size / 2 - tile_overlap))) {
// if tiles don't fit perfectly using the desired overlap
// and there is enough room to squeeze an extra tile without overlap becoming >0.5
num_tiles_dim++;
}
tile_overlap_factor_dim = (float)(tile_size * num_tiles_dim - small_dim) / (float)(tile_size * (num_tiles_dim - 1));
if (num_tiles_dim <= 2) {
if (small_dim <= tile_size) {
num_tiles_dim = 1;
tile_overlap_factor_dim = 0;
} else {
num_tiles_dim = 2;
tile_overlap_factor_dim = (2 * tile_size - small_dim) / (float)tile_size;
}
}
}
// Tiling
__STATIC_INLINE__ void sd_tiling_non_square(ggml_tensor* input,
ggml_tensor* output,
const int scale,
const int p_tile_size_x,
const int p_tile_size_y,
const float tile_overlap_factor,
on_tile_process on_processing) {
__STATIC_INLINE__ void sd_tiling(ggml_tensor* input, ggml_tensor* output, const int scale, const int tile_size, const float tile_overlap_factor, on_tile_process on_processing) {
output = ggml_set_f32(output, 0);
int input_width = (int)input->ne[0];
int input_height = (int)input->ne[1];
int output_width = (int)output->ne[0];
int output_height = (int)output->ne[1];
GGML_ASSERT(((input_width / output_width) == (input_height / output_height)) &&
((output_width / input_width) == (output_height / input_height)));
GGML_ASSERT(((input_width / output_width) == scale) ||
((output_width / input_width) == scale));
int small_width = output_width;
int small_height = output_height;
bool decode = output_width > input_width;
if (decode) {
small_width = input_width;
small_height = input_height;
}
int num_tiles_x;
float tile_overlap_factor_x;
sd_tiling_calc_tiles(num_tiles_x, tile_overlap_factor_x, small_width, p_tile_size_x, tile_overlap_factor);
int num_tiles_y;
float tile_overlap_factor_y;
sd_tiling_calc_tiles(num_tiles_y, tile_overlap_factor_y, small_height, p_tile_size_y, tile_overlap_factor);
LOG_DEBUG("num tiles : %d, %d ", num_tiles_x, num_tiles_y);
LOG_DEBUG("optimal overlap : %f, %f (targeting %f)", tile_overlap_factor_x, tile_overlap_factor_y, tile_overlap_factor);
GGML_ASSERT(input_width % 2 == 0 && input_height % 2 == 0 && output_width % 2 == 0 && output_height % 2 == 0); // should be multiple of 2
int tile_overlap_x = (int32_t)(p_tile_size_x * tile_overlap_factor_x);
int non_tile_overlap_x = p_tile_size_x - tile_overlap_x;
int tile_overlap_y = (int32_t)(p_tile_size_y * tile_overlap_factor_y);
int non_tile_overlap_y = p_tile_size_y - tile_overlap_y;
int tile_size_x = p_tile_size_x < small_width ? p_tile_size_x : small_width;
int tile_size_y = p_tile_size_y < small_height ? p_tile_size_y : small_height;
int input_tile_size_x = tile_size_x;
int input_tile_size_y = tile_size_y;
int output_tile_size_x = tile_size_x;
int output_tile_size_y = tile_size_y;
if (decode) {
output_tile_size_x *= scale;
output_tile_size_y *= scale;
} else {
input_tile_size_x *= scale;
input_tile_size_y *= scale;
}
int tile_overlap = (int32_t)(tile_size * tile_overlap_factor);
int non_tile_overlap = tile_size - tile_overlap;
struct ggml_init_params params = {};
params.mem_size += input_tile_size_x * input_tile_size_y * input->ne[2] * sizeof(float); // input chunk
params.mem_size += output_tile_size_x * output_tile_size_y * output->ne[2] * sizeof(float); // output chunk
params.mem_size += tile_size * tile_size * input->ne[2] * sizeof(float); // input chunk
params.mem_size += (tile_size * scale) * (tile_size * scale) * output->ne[2] * sizeof(float); // output chunk
params.mem_size += 3 * ggml_tensor_overhead();
params.mem_buffer = NULL;
params.no_alloc = false;
@ -888,50 +793,29 @@ __STATIC_INLINE__ void sd_tiling_non_square(ggml_tensor* input,
}
// tiling
ggml_tensor* input_tile = ggml_new_tensor_4d(tiles_ctx, GGML_TYPE_F32, input_tile_size_x, input_tile_size_y, input->ne[2], 1);
ggml_tensor* output_tile = ggml_new_tensor_4d(tiles_ctx, GGML_TYPE_F32, output_tile_size_x, output_tile_size_y, output->ne[2], 1);
int num_tiles = num_tiles_x * num_tiles_y;
ggml_tensor* input_tile = ggml_new_tensor_4d(tiles_ctx, GGML_TYPE_F32, tile_size, tile_size, input->ne[2], 1);
ggml_tensor* output_tile = ggml_new_tensor_4d(tiles_ctx, GGML_TYPE_F32, tile_size * scale, tile_size * scale, output->ne[2], 1);
on_processing(input_tile, NULL, true);
int num_tiles = ceil((float)input_width / non_tile_overlap) * ceil((float)input_height / non_tile_overlap);
LOG_INFO("processing %i tiles", num_tiles);
pretty_progress(0, num_tiles, 0.0f);
pretty_progress(1, num_tiles, 0.0f);
int tile_count = 1;
bool last_y = false, last_x = false;
float last_time = 0.0f;
for (int y = 0; y < small_height && !last_y; y += non_tile_overlap_y) {
int dy = 0;
if (y + tile_size_y >= small_height) {
int _y = y;
y = small_height - tile_size_y;
dy = _y - y;
if (decode) {
dy *= scale;
}
for (int y = 0; y < input_height && !last_y; y += non_tile_overlap) {
if (y + tile_size >= input_height) {
y = input_height - tile_size;
last_y = true;
}
for (int x = 0; x < small_width && !last_x; x += non_tile_overlap_x) {
int dx = 0;
if (x + tile_size_x >= small_width) {
int _x = x;
x = small_width - tile_size_x;
dx = _x - x;
if (decode) {
dx *= scale;
}
for (int x = 0; x < input_width && !last_x; x += non_tile_overlap) {
if (x + tile_size >= input_width) {
x = input_width - tile_size;
last_x = true;
}
int x_in = decode ? x : scale * x;
int y_in = decode ? y : scale * y;
int x_out = decode ? x * scale : x;
int y_out = decode ? y * scale : y;
int overlap_x_out = decode ? tile_overlap_x * scale : tile_overlap_x;
int overlap_y_out = decode ? tile_overlap_y * scale : tile_overlap_y;
int64_t t1 = ggml_time_ms();
ggml_split_tensor_2d(input, input_tile, x_in, y_in);
ggml_split_tensor_2d(input, input_tile, x, y);
on_processing(input_tile, output_tile, false);
ggml_merge_tensor_2d(output_tile, output, x_out, y_out, overlap_x_out, overlap_y_out, dx, dy);
ggml_merge_tensor_2d(output_tile, output, x * scale, y * scale, tile_overlap * scale);
int64_t t2 = ggml_time_ms();
last_time = (t2 - t1) / 1000.0f;
pretty_progress(tile_count, num_tiles, last_time);
@ -945,15 +829,6 @@ __STATIC_INLINE__ void sd_tiling_non_square(ggml_tensor* input,
ggml_free(tiles_ctx);
}
__STATIC_INLINE__ void sd_tiling(ggml_tensor* input,
ggml_tensor* output,
const int scale,
const int tile_size,
const float tile_overlap_factor,
on_tile_process on_processing) {
sd_tiling_non_square(input, output, scale, tile_size, tile_size, tile_overlap_factor, on_processing);
}
__STATIC_INLINE__ struct ggml_tensor* ggml_group_norm_32(struct ggml_context* ctx,
struct ggml_tensor* a) {
const float eps = 1e-6f; // default eps parameter

View File

@ -147,16 +147,14 @@ public:
int64_t num_heads;
bool pre_only;
std::string qk_norm;
bool flash_attn;
public:
SelfAttention(int64_t dim,
int64_t num_heads = 8,
std::string qk_norm = "",
bool qkv_bias = false,
bool pre_only = false,
bool flash_attn = false)
: num_heads(num_heads), pre_only(pre_only), qk_norm(qk_norm), flash_attn(flash_attn) {
bool pre_only = false)
: num_heads(num_heads), pre_only(pre_only), qk_norm(qk_norm) {
int64_t d_head = dim / num_heads;
blocks["qkv"] = std::shared_ptr<GGMLBlock>(new Linear(dim, dim * 3, qkv_bias));
if (!pre_only) {
@ -208,8 +206,8 @@ public:
ggml_backend_t backend,
struct ggml_tensor* x) {
auto qkv = pre_attention(ctx, x);
x = ggml_nn_attention_ext(ctx, backend, qkv[0], qkv[1], qkv[2], num_heads, NULL, false, false, true); // [N, n_token, dim]
x = post_attention(ctx, x); // [N, n_token, dim]
x = ggml_nn_attention_ext(ctx, backend, qkv[0], qkv[1], qkv[2], num_heads); // [N, n_token, dim]
x = post_attention(ctx, x); // [N, n_token, dim]
return x;
}
};
@ -234,7 +232,6 @@ public:
int64_t num_heads;
bool pre_only;
bool self_attn;
bool flash_attn;
public:
DismantledBlock(int64_t hidden_size,
@ -243,17 +240,16 @@ public:
std::string qk_norm = "",
bool qkv_bias = false,
bool pre_only = false,
bool self_attn = false,
bool flash_attn = false)
bool self_attn = false)
: num_heads(num_heads), pre_only(pre_only), self_attn(self_attn) {
// rmsnorm is always Flase
// scale_mod_only is always Flase
// swiglu is always Flase
blocks["norm1"] = std::shared_ptr<GGMLBlock>(new LayerNorm(hidden_size, 1e-06f, false));
blocks["attn"] = std::shared_ptr<GGMLBlock>(new SelfAttention(hidden_size, num_heads, qk_norm, qkv_bias, pre_only, flash_attn));
blocks["attn"] = std::shared_ptr<GGMLBlock>(new SelfAttention(hidden_size, num_heads, qk_norm, qkv_bias, pre_only));
if (self_attn) {
blocks["attn2"] = std::shared_ptr<GGMLBlock>(new SelfAttention(hidden_size, num_heads, qk_norm, qkv_bias, false, flash_attn));
blocks["attn2"] = std::shared_ptr<GGMLBlock>(new SelfAttention(hidden_size, num_heads, qk_norm, qkv_bias, false));
}
if (!pre_only) {
@ -439,8 +435,8 @@ public:
auto qkv2 = std::get<1>(qkv_intermediates);
auto intermediates = std::get<2>(qkv_intermediates);
auto attn_out = ggml_nn_attention_ext(ctx, backend, qkv[0], qkv[1], qkv[2], num_heads, NULL, false, false, flash_attn); // [N, n_token, dim]
auto attn2_out = ggml_nn_attention_ext(ctx, backend, qkv2[0], qkv2[1], qkv2[2], num_heads, NULL, false, false, flash_attn); // [N, n_token, dim]
auto attn_out = ggml_nn_attention_ext(ctx, backend, qkv[0], qkv[1], qkv[2], num_heads); // [N, n_token, dim]
auto attn2_out = ggml_nn_attention_ext(ctx, backend, qkv2[0], qkv2[1], qkv2[2], num_heads); // [N, n_token, dim]
x = post_attention_x(ctx,
attn_out,
attn2_out,
@ -456,7 +452,7 @@ public:
auto qkv = qkv_intermediates.first;
auto intermediates = qkv_intermediates.second;
auto attn_out = ggml_nn_attention_ext(ctx, backend, qkv[0], qkv[1], qkv[2], num_heads, NULL, false, false, flash_attn); // [N, n_token, dim]
auto attn_out = ggml_nn_attention_ext(ctx, backend, qkv[0], qkv[1], qkv[2], num_heads); // [N, n_token, dim]
x = post_attention(ctx,
attn_out,
intermediates[0],
@ -472,7 +468,6 @@ public:
__STATIC_INLINE__ std::pair<struct ggml_tensor*, struct ggml_tensor*>
block_mixing(struct ggml_context* ctx,
ggml_backend_t backend,
bool flash_attn,
struct ggml_tensor* context,
struct ggml_tensor* x,
struct ggml_tensor* c,
@ -502,8 +497,8 @@ block_mixing(struct ggml_context* ctx,
qkv.push_back(ggml_concat(ctx, context_qkv[i], x_qkv[i], 1));
}
auto attn = ggml_nn_attention_ext(ctx, backend, qkv[0], qkv[1], qkv[2], x_block->num_heads, NULL, false, false, flash_attn); // [N, n_context + n_token, hidden_size]
attn = ggml_cont(ctx, ggml_permute(ctx, attn, 0, 2, 1, 3)); // [n_context + n_token, N, hidden_size]
auto attn = ggml_nn_attention_ext(ctx, backend, qkv[0], qkv[1], qkv[2], x_block->num_heads); // [N, n_context + n_token, hidden_size]
attn = ggml_cont(ctx, ggml_permute(ctx, attn, 0, 2, 1, 3)); // [n_context + n_token, N, hidden_size]
auto context_attn = ggml_view_3d(ctx,
attn,
attn->ne[0],
@ -561,8 +556,6 @@ block_mixing(struct ggml_context* ctx,
}
struct JointBlock : public GGMLBlock {
bool flash_attn;
public:
JointBlock(int64_t hidden_size,
int64_t num_heads,
@ -570,11 +563,9 @@ public:
std::string qk_norm = "",
bool qkv_bias = false,
bool pre_only = false,
bool self_attn_x = false,
bool flash_attn = false)
: flash_attn(flash_attn) {
blocks["context_block"] = std::shared_ptr<GGMLBlock>(new DismantledBlock(hidden_size, num_heads, mlp_ratio, qk_norm, qkv_bias, pre_only, false, flash_attn));
blocks["x_block"] = std::shared_ptr<GGMLBlock>(new DismantledBlock(hidden_size, num_heads, mlp_ratio, qk_norm, qkv_bias, false, self_attn_x, flash_attn));
bool self_attn_x = false) {
blocks["context_block"] = std::shared_ptr<GGMLBlock>(new DismantledBlock(hidden_size, num_heads, mlp_ratio, qk_norm, qkv_bias, pre_only));
blocks["x_block"] = std::shared_ptr<GGMLBlock>(new DismantledBlock(hidden_size, num_heads, mlp_ratio, qk_norm, qkv_bias, false, self_attn_x));
}
std::pair<struct ggml_tensor*, struct ggml_tensor*> forward(struct ggml_context* ctx,
@ -585,7 +576,7 @@ public:
auto context_block = std::dynamic_pointer_cast<DismantledBlock>(blocks["context_block"]);
auto x_block = std::dynamic_pointer_cast<DismantledBlock>(blocks["x_block"]);
return block_mixing(ctx, backend, flash_attn, context, x, c, context_block, x_block);
return block_mixing(ctx, backend, context, x, c, context_block, x_block);
}
};
@ -643,7 +634,6 @@ protected:
int64_t context_embedder_out_dim = 1536;
int64_t hidden_size;
std::string qk_norm;
bool flash_attn = false;
void init_params(struct ggml_context* ctx, const String2GGMLType& tensor_types = {}, std::string prefix = "") {
enum ggml_type wtype = GGML_TYPE_F32;
@ -651,8 +641,7 @@ protected:
}
public:
MMDiT(bool flash_attn = false, const String2GGMLType& tensor_types = {})
: flash_attn(flash_attn) {
MMDiT(const String2GGMLType& tensor_types = {}) {
// input_size is always None
// learn_sigma is always False
// register_length is alwalys 0
@ -720,8 +709,7 @@ public:
qk_norm,
true,
i == depth - 1,
i <= d_self,
flash_attn));
i <= d_self));
}
blocks["final_layer"] = std::shared_ptr<GGMLBlock>(new FinalLayer(hidden_size, patch_size, out_channels));
@ -868,10 +856,9 @@ struct MMDiTRunner : public GGMLRunner {
MMDiTRunner(ggml_backend_t backend,
bool offload_params_to_cpu,
bool flash_attn,
const String2GGMLType& tensor_types = {},
const std::string prefix = "")
: GGMLRunner(backend, offload_params_to_cpu), mmdit(flash_attn, tensor_types) {
: GGMLRunner(backend, offload_params_to_cpu), mmdit(tensor_types) {
mmdit.init(params_ctx, tensor_types, prefix);
}
@ -970,7 +957,7 @@ struct MMDiTRunner : public GGMLRunner {
// ggml_backend_t backend = ggml_backend_cuda_init(0);
ggml_backend_t backend = ggml_backend_cpu_init();
ggml_type model_data_type = GGML_TYPE_F16;
std::shared_ptr<MMDiTRunner> mmdit = std::shared_ptr<MMDiTRunner>(new MMDiTRunner(backend, false, false));
std::shared_ptr<MMDiTRunner> mmdit = std::shared_ptr<MMDiTRunner>(new MMDiTRunner(backend, false));
{
LOG_INFO("loading from '%s'", file_path.c_str());

View File

@ -107,7 +107,7 @@ const char* unused_tensors[] = {
};
bool is_unused_tensor(std::string name) {
for (size_t i = 0; i < sizeof(unused_tensors) / sizeof(const char*); i++) {
for (int i = 0; i < sizeof(unused_tensors) / sizeof(const char*); i++) {
if (starts_with(name, unused_tensors[i])) {
return true;
}
@ -2310,7 +2310,7 @@ std::vector<std::pair<std::string, ggml_type>> parse_tensor_type_rules(const std
if (type_name == "f32") {
tensor_type = GGML_TYPE_F32;
} else {
for (size_t i = 0; i < GGML_TYPE_COUNT; i++) {
for (size_t i = 0; i < SD_TYPE_COUNT; i++) {
auto trait = ggml_get_type_traits((ggml_type)i);
if (trait->to_float && trait->type_size && type_name == trait->type_name) {
tensor_type = (ggml_type)i;

View File

@ -119,7 +119,7 @@ struct TensorStorage {
size_t file_index = 0;
int index_in_zip = -1; // >= means stored in a zip file
uint64_t offset = 0; // offset in file
size_t offset = 0; // offset in file
TensorStorage() = default;
@ -164,10 +164,10 @@ struct TensorStorage {
std::vector<TensorStorage> chunk(size_t n) {
std::vector<TensorStorage> chunks;
uint64_t chunk_size = nbytes_to_read() / n;
size_t chunk_size = nbytes_to_read() / n;
// printf("%d/%d\n", chunk_size, nbytes_to_read());
reverse_ne();
for (size_t i = 0; i < n; i++) {
for (int i = 0; i < n; i++) {
TensorStorage chunk_i = *this;
chunk_i.ne[0] = ne[0] / n;
chunk_i.offset = offset + i * chunk_size;

View File

@ -164,7 +164,7 @@ void threshold_hystersis(struct ggml_tensor* img, float high_threshold, float lo
bool preprocess_canny(sd_image_t img, float high_threshold, float low_threshold, float weak, float strong, bool inverse) {
struct ggml_init_params params;
params.mem_size = static_cast<size_t>(10 * 1024 * 1024); // 10MB
params.mem_size = static_cast<size_t>(10 * 1024 * 1024); // 10
params.mem_buffer = NULL;
params.no_alloc = false;
struct ggml_context* work_ctx = ggml_init(params);

View File

@ -43,7 +43,7 @@ const char* model_version_to_str[] = {
};
const char* sampling_methods_str[] = {
"default",
"Euler A",
"Euler",
"Heun",
"DPM2",
@ -55,7 +55,6 @@ const char* sampling_methods_str[] = {
"LCM",
"DDIM \"trailing\"",
"TCD",
"Euler A",
};
/*================================================== Helper Functions ================================================*/
@ -108,10 +107,10 @@ public:
std::shared_ptr<PhotoMakerIDEmbed> pmid_id_embeds;
std::string taesd_path;
bool use_tiny_autoencoder = false;
sd_tiling_params_t vae_tiling_params = {false, 0, 0, 0.5f, 0, 0};
bool offload_params_to_cpu = false;
bool stacked_id = false;
bool use_tiny_autoencoder = false;
bool vae_tiling = false;
bool offload_params_to_cpu = false;
bool stacked_id = false;
bool is_using_v_parameterization = false;
bool is_using_edm_v_parameterization = false;
@ -146,6 +145,7 @@ public:
#endif
#ifdef SD_USE_METAL
LOG_DEBUG("Using Metal backend");
ggml_log_set(ggml_log_callback_default, nullptr);
backend = ggml_backend_metal_init();
#endif
#ifdef SD_USE_VULKAN
@ -183,6 +183,7 @@ public:
lora_model_dir = SAFE_STR(sd_ctx_params->lora_model_dir);
taesd_path = SAFE_STR(sd_ctx_params->taesd_path);
use_tiny_autoencoder = taesd_path.size() > 0;
vae_tiling = sd_ctx_params->vae_tiling;
offload_params_to_cpu = sd_ctx_params->offload_params_to_cpu;
if (sd_ctx_params->rng_type == STD_DEFAULT_RNG) {
@ -191,8 +192,6 @@ public:
rng = std::make_shared<PhiloxRNG>();
}
ggml_log_set(ggml_log_callback_default, nullptr);
init_backend();
ModelLoader model_loader;
@ -265,9 +264,7 @@ public:
}
LOG_INFO("Version: %s ", model_version_to_str[version]);
ggml_type wtype = (int)sd_ctx_params->wtype < std::min<int>(SD_TYPE_COUNT, GGML_TYPE_COUNT)
? (ggml_type)sd_ctx_params->wtype
: GGML_TYPE_COUNT;
ggml_type wtype = (ggml_type)sd_ctx_params->wtype;
if (wtype == GGML_TYPE_COUNT) {
model_wtype = model_loader.get_sd_wtype();
if (model_wtype == GGML_TYPE_COUNT) {
@ -295,6 +292,11 @@ public:
model_loader.set_wtype_override(wtype);
}
if (sd_version_is_sdxl(version)) {
vae_wtype = GGML_TYPE_F32;
model_loader.set_wtype_override(GGML_TYPE_F32, "vae.");
}
LOG_INFO("Weight type: %s", ggml_type_name(model_wtype));
LOG_INFO("Conditioner weight type: %s", ggml_type_name(conditioner_wtype));
LOG_INFO("Diffusion model weight type: %s", ggml_type_name(diffusion_model_wtype));
@ -342,12 +344,14 @@ public:
LOG_INFO("Using flash attention in the diffusion model");
}
if (sd_version_is_sd3(version)) {
if (sd_ctx_params->diffusion_flash_attn) {
LOG_WARN("flash attention in this diffusion model is currently unsupported!");
}
cond_stage_model = std::make_shared<SD3CLIPEmbedder>(clip_backend,
offload_params_to_cpu,
model_loader.tensor_storages_types);
diffusion_model = std::make_shared<MMDiTModel>(backend,
offload_params_to_cpu,
sd_ctx_params->diffusion_flash_attn,
model_loader.tensor_storages_types);
} else if (sd_version_is_flux(version)) {
bool is_chroma = false;
@ -358,18 +362,10 @@ public:
}
}
if (is_chroma) {
if (sd_ctx_params->diffusion_flash_attn && sd_ctx_params->chroma_use_dit_mask) {
LOG_WARN(
"!!!It looks like you are using Chroma with flash attention. "
"This is currently unsupported. "
"If you find that the generated images are broken, "
"try either disabling flash attention or specifying "
"--chroma-disable-dit-mask as a workaround.");
}
cond_stage_model = std::make_shared<T5CLIPEmbedder>(clip_backend,
offload_params_to_cpu,
model_loader.tensor_storages_types,
-1,
sd_ctx_params->chroma_use_t5_mask,
sd_ctx_params->chroma_t5_mask_pad);
} else {
@ -387,6 +383,7 @@ public:
cond_stage_model = std::make_shared<T5CLIPEmbedder>(clip_backend,
offload_params_to_cpu,
model_loader.tensor_storages_types,
-1,
true,
1,
true);
@ -747,10 +744,6 @@ public:
denoiser->scheduler = std::make_shared<GITSSchedule>();
denoiser->scheduler->version = version;
break;
case SMOOTHSTEP:
LOG_INFO("Running with SmoothStep scheduler");
denoiser->scheduler = std::make_shared<SmoothStepSchedule>();
break;
case DEFAULT:
// Don't touch anything.
break;
@ -1279,77 +1272,15 @@ public:
return latent;
}
void get_tile_sizes(int& tile_size_x,
int& tile_size_y,
float& tile_overlap,
const sd_tiling_params_t& params,
int latent_x,
int latent_y,
float encoding_factor = 1.0f) {
tile_overlap = std::max(std::min(params.target_overlap, 0.5f), 0.0f);
auto get_tile_size = [&](int requested_size, float factor, int latent_size) {
const int default_tile_size = 32;
const int min_tile_dimension = 4;
int tile_size = default_tile_size;
// factor <= 1 means simple fraction of the latent dimension
// factor > 1 means number of tiles across that dimension
if (factor > 0.f) {
if (factor > 1.0)
factor = 1 / (factor - factor * tile_overlap + tile_overlap);
tile_size = std::round(latent_size * factor);
} else if (requested_size >= min_tile_dimension) {
tile_size = requested_size;
}
tile_size *= encoding_factor;
return std::max(std::min(tile_size, latent_size), min_tile_dimension);
};
tile_size_x = get_tile_size(params.tile_size_x, params.rel_size_x, latent_x);
tile_size_y = get_tile_size(params.tile_size_y, params.rel_size_y, latent_y);
}
ggml_tensor* encode_first_stage(ggml_context* work_ctx, ggml_tensor* x, bool encode_video = false) {
ggml_tensor* encode_first_stage(ggml_context* work_ctx, ggml_tensor* x, bool decode_video = false) {
int64_t t0 = ggml_time_ms();
ggml_tensor* result = NULL;
int W = x->ne[0] / 8;
int H = x->ne[1] / 8;
if (vae_tiling_params.enabled && !encode_video) {
// TODO wan2.2 vae support?
int C = sd_version_is_dit(version) ? 16 : 4;
if (!use_tiny_autoencoder) {
C *= 2;
}
result = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, W, H, C, x->ne[3]);
}
if (!use_tiny_autoencoder) {
float tile_overlap;
int tile_size_x, tile_size_y;
// multiply tile size for encode to keep the compute buffer size consistent
get_tile_sizes(tile_size_x, tile_size_y, tile_overlap, vae_tiling_params, W, H, 1.30539f);
LOG_DEBUG("VAE Tile size: %dx%d", tile_size_x, tile_size_y);
process_vae_input_tensor(x);
if (vae_tiling_params.enabled && !encode_video) {
auto on_tiling = [&](ggml_tensor* in, ggml_tensor* out, bool init) {
first_stage_model->compute(n_threads, in, false, &out, work_ctx);
};
sd_tiling_non_square(x, result, 8, tile_size_x, tile_size_y, tile_overlap, on_tiling);
} else {
first_stage_model->compute(n_threads, x, false, &result, work_ctx);
}
first_stage_model->compute(n_threads, x, false, &result, work_ctx);
first_stage_model->free_compute_buffer();
} else {
if (vae_tiling_params.enabled && !encode_video) {
// split latent in 32x32 tiles and compute in several steps
auto on_tiling = [&](ggml_tensor* in, ggml_tensor* out, bool init) {
tae_first_stage->compute(n_threads, in, false, &out, NULL);
};
sd_tiling(x, result, 8, 64, 0.5f, on_tiling);
} else {
tae_first_stage->compute(n_threads, x, false, &result, work_ctx);
}
tae_first_stage->compute(n_threads, x, false, &result, work_ctx);
tae_first_stage->free_compute_buffer();
}
@ -1466,29 +1397,24 @@ public:
C,
x->ne[3]);
}
int64_t t0 = ggml_time_ms();
if (!use_tiny_autoencoder) {
float tile_overlap;
int tile_size_x, tile_size_y;
get_tile_sizes(tile_size_x, tile_size_y, tile_overlap, vae_tiling_params, x->ne[0], x->ne[1]);
LOG_DEBUG("VAE Tile size: %dx%d", tile_size_x, tile_size_y);
process_latent_out(x);
// x = load_tensor_from_file(work_ctx, "wan_vae_z.bin");
if (vae_tiling_params.enabled && !decode_video) {
if (vae_tiling && !decode_video) {
// split latent in 32x32 tiles and compute in several steps
auto on_tiling = [&](ggml_tensor* in, ggml_tensor* out, bool init) {
first_stage_model->compute(n_threads, in, true, &out, NULL);
};
sd_tiling_non_square(x, result, 8, tile_size_x, tile_size_y, tile_overlap, on_tiling);
sd_tiling(x, result, 8, 32, 0.5f, on_tiling);
} else {
first_stage_model->compute(n_threads, x, true, &result, work_ctx);
}
first_stage_model->free_compute_buffer();
process_vae_output_tensor(result);
} else {
if (vae_tiling_params.enabled && !decode_video) {
if (vae_tiling && !decode_video) {
// split latent in 64x64 tiles and compute in several steps
auto on_tiling = [&](ggml_tensor* in, ggml_tensor* out, bool init) {
tae_first_stage->compute(n_threads, in, true, &out);
@ -1512,14 +1438,11 @@ public:
#define NONE_STR "NONE"
const char* sd_type_name(enum sd_type_t type) {
if ((int)type < std::min<int>(SD_TYPE_COUNT, GGML_TYPE_COUNT)) {
return ggml_type_name((ggml_type)type);
}
return NONE_STR;
return ggml_type_name((ggml_type)type);
}
enum sd_type_t str_to_sd_type(const char* str) {
for (int i = 0; i < std::min<int>(SD_TYPE_COUNT, GGML_TYPE_COUNT); i++) {
for (int i = 0; i < SD_TYPE_COUNT; i++) {
auto trait = ggml_get_type_traits((ggml_type)i);
if (!strcmp(str, trait->type_name)) {
return (enum sd_type_t)i;
@ -1550,7 +1473,7 @@ enum rng_type_t str_to_rng_type(const char* str) {
}
const char* sample_method_to_str[] = {
"default",
"euler_a",
"euler",
"heun",
"dpm2",
@ -1562,7 +1485,6 @@ const char* sample_method_to_str[] = {
"lcm",
"ddim_trailing",
"tcd",
"euler_a",
};
const char* sd_sample_method_name(enum sample_method_t sample_method) {
@ -1588,7 +1510,6 @@ const char* schedule_to_str[] = {
"exponential",
"ays",
"gits",
"smoothstep",
};
const char* sd_schedule_name(enum scheduler_t scheduler) {
@ -1608,8 +1529,9 @@ enum scheduler_t str_to_schedule(const char* str) {
}
void sd_ctx_params_init(sd_ctx_params_t* sd_ctx_params) {
*sd_ctx_params = {};
memset((void*)sd_ctx_params, 0, sizeof(sd_ctx_params_t));
sd_ctx_params->vae_decode_only = true;
sd_ctx_params->vae_tiling = false;
sd_ctx_params->free_params_immediately = true;
sd_ctx_params->n_threads = get_num_physical_cores();
sd_ctx_params->wtype = SD_TYPE_COUNT;
@ -1673,6 +1595,7 @@ char* sd_ctx_params_to_str(const sd_ctx_params_t* sd_ctx_params) {
SAFE_STR(sd_ctx_params->embedding_dir),
SAFE_STR(sd_ctx_params->stacked_id_embed_dir),
BOOL_STR(sd_ctx_params->vae_decode_only),
BOOL_STR(sd_ctx_params->vae_tiling),
BOOL_STR(sd_ctx_params->free_params_immediately),
sd_ctx_params->n_threads,
sd_type_name(sd_ctx_params->wtype),
@ -1690,7 +1613,6 @@ char* sd_ctx_params_to_str(const sd_ctx_params_t* sd_ctx_params) {
}
void sd_sample_params_init(sd_sample_params_t* sample_params) {
*sample_params = {};
sample_params->guidance.txt_cfg = 7.0f;
sample_params->guidance.img_cfg = INFINITY;
sample_params->guidance.distilled_guidance = 3.5f;
@ -1699,7 +1621,7 @@ void sd_sample_params_init(sd_sample_params_t* sample_params) {
sample_params->guidance.slg.layer_end = 0.2f;
sample_params->guidance.slg.scale = 0.f;
sample_params->scheduler = DEFAULT;
sample_params->sample_method = SAMPLE_METHOD_DEFAULT;
sample_params->sample_method = EULER_A;
sample_params->sample_steps = 20;
}
@ -1737,19 +1659,18 @@ char* sd_sample_params_to_str(const sd_sample_params_t* sample_params) {
}
void sd_img_gen_params_init(sd_img_gen_params_t* sd_img_gen_params) {
*sd_img_gen_params = {};
memset((void*)sd_img_gen_params, 0, sizeof(sd_img_gen_params_t));
sd_img_gen_params->clip_skip = -1;
sd_sample_params_init(&sd_img_gen_params->sample_params);
sd_img_gen_params->clip_skip = -1;
sd_img_gen_params->ref_images_count = 0;
sd_img_gen_params->width = 512;
sd_img_gen_params->height = 512;
sd_img_gen_params->strength = 0.75f;
sd_img_gen_params->seed = -1;
sd_img_gen_params->batch_count = 1;
sd_img_gen_params->control_strength = 0.9f;
sd_img_gen_params->style_strength = 20.f;
sd_img_gen_params->normalize_input = false;
sd_img_gen_params->vae_tiling_params = {false, 0, 0, 0.5f, 0.0f, 0.0f};
sd_img_gen_params->ref_images_count = 0;
sd_img_gen_params->width = 512;
sd_img_gen_params->height = 512;
sd_img_gen_params->strength = 0.75f;
sd_img_gen_params->seed = -1;
sd_img_gen_params->batch_count = 1;
sd_img_gen_params->control_strength = 0.9f;
sd_img_gen_params->style_strength = 20.f;
sd_img_gen_params->normalize_input = false;
}
char* sd_img_gen_params_to_str(const sd_img_gen_params_t* sd_img_gen_params) {
@ -1769,7 +1690,6 @@ char* sd_img_gen_params_to_str(const sd_img_gen_params_t* sd_img_gen_params) {
"sample_params: %s\n"
"strength: %.2f\n"
"seed: %" PRId64
"VAE tiling:"
"\n"
"batch_count: %d\n"
"ref_images_count: %d\n"
@ -1786,7 +1706,6 @@ char* sd_img_gen_params_to_str(const sd_img_gen_params_t* sd_img_gen_params) {
SAFE_STR(sample_params_str),
sd_img_gen_params->strength,
sd_img_gen_params->seed,
BOOL_STR(sd_img_gen_params->vae_tiling_params.enabled),
sd_img_gen_params->batch_count,
sd_img_gen_params->ref_images_count,
BOOL_STR(sd_img_gen_params->increase_ref_index),
@ -1799,7 +1718,7 @@ char* sd_img_gen_params_to_str(const sd_img_gen_params_t* sd_img_gen_params) {
}
void sd_vid_gen_params_init(sd_vid_gen_params_t* sd_vid_gen_params) {
*sd_vid_gen_params = {};
memset((void*)sd_vid_gen_params, 0, sizeof(sd_vid_gen_params_t));
sd_sample_params_init(&sd_vid_gen_params->sample_params);
sd_sample_params_init(&sd_vid_gen_params->high_noise_sample_params);
sd_vid_gen_params->high_noise_sample_params.sample_steps = -1;
@ -1824,7 +1743,6 @@ sd_ctx_t* new_sd_ctx(const sd_ctx_params_t* sd_ctx_params) {
sd_ctx->sd = new StableDiffusionGGML();
if (sd_ctx->sd == NULL) {
free(sd_ctx);
return NULL;
}
@ -1845,17 +1763,6 @@ void free_sd_ctx(sd_ctx_t* sd_ctx) {
free(sd_ctx);
}
enum sample_method_t sd_get_default_sample_method(const sd_ctx_t* sd_ctx) {
if (sd_ctx != NULL && sd_ctx->sd != NULL) {
SDVersion version = sd_ctx->sd->version;
if (sd_version_is_dit(version))
return EULER;
else
return EULER_A;
}
return SAMPLE_METHOD_COUNT;
}
sd_image_t* generate_image_internal(sd_ctx_t* sd_ctx,
struct ggml_context* work_ctx,
ggml_tensor* init_latent,
@ -2224,9 +2131,8 @@ ggml_tensor* generate_init_latent(sd_ctx_t* sd_ctx,
}
sd_image_t* generate_image(sd_ctx_t* sd_ctx, const sd_img_gen_params_t* sd_img_gen_params) {
sd_ctx->sd->vae_tiling_params = sd_img_gen_params->vae_tiling_params;
int width = sd_img_gen_params->width;
int height = sd_img_gen_params->height;
int width = sd_img_gen_params->width;
int height = sd_img_gen_params->height;
if (sd_version_is_dit(sd_ctx->sd->version)) {
if (width % 16 || height % 16) {
LOG_ERROR("Image dimensions must be must be a multiple of 16 on each axis for %s models. (Got %dx%d)",
@ -2248,7 +2154,19 @@ sd_image_t* generate_image(sd_ctx_t* sd_ctx, const sd_img_gen_params_t* sd_img_g
}
struct ggml_init_params params;
params.mem_size = static_cast<size_t>(1024 * 1024) * 1024; // 1G
params.mem_size = static_cast<size_t>(10 * 1024 * 1024); // 10 MB
if (sd_version_is_sd3(sd_ctx->sd->version)) {
params.mem_size *= 3;
}
if (sd_version_is_flux(sd_ctx->sd->version)) {
params.mem_size *= 4;
}
if (sd_ctx->sd->stacked_id) {
params.mem_size += static_cast<size_t>(10 * 1024 * 1024); // 10 MB
}
params.mem_size += width * height * 3 * sizeof(float) * 3;
params.mem_size += width * height * 3 * sizeof(float) * 3 * sd_img_gen_params->ref_images_count;
params.mem_size *= sd_img_gen_params->batch_count;
params.mem_buffer = NULL;
params.no_alloc = false;
// LOG_DEBUG("mem_size %u ", params.mem_size);
@ -2409,11 +2327,6 @@ sd_image_t* generate_image(sd_ctx_t* sd_ctx, const sd_img_gen_params_t* sd_img_g
LOG_INFO("encode_first_stage completed, taking %.2fs", (t1 - t0) * 1.0f / 1000);
}
enum sample_method_t sample_method = sd_img_gen_params->sample_params.sample_method;
if (sample_method == SAMPLE_METHOD_DEFAULT) {
sample_method = sd_get_default_sample_method(sd_ctx);
}
sd_image_t* result_images = generate_image_internal(sd_ctx,
work_ctx,
init_latent,
@ -2424,7 +2337,7 @@ sd_image_t* generate_image(sd_ctx_t* sd_ctx, const sd_img_gen_params_t* sd_img_g
sd_img_gen_params->sample_params.eta,
width,
height,
sample_method,
sd_img_gen_params->sample_params.sample_method,
sigmas,
seed,
sd_img_gen_params->batch_count,
@ -2432,7 +2345,7 @@ sd_image_t* generate_image(sd_ctx_t* sd_ctx, const sd_img_gen_params_t* sd_img_g
sd_img_gen_params->control_strength,
sd_img_gen_params->style_strength,
sd_img_gen_params->normalize_input,
SAFE_STR(sd_img_gen_params->input_id_images_path),
sd_img_gen_params->input_id_images_path,
ref_latents,
sd_img_gen_params->increase_ref_index,
concat_latent,

View File

@ -35,7 +35,7 @@ enum rng_type_t {
};
enum sample_method_t {
SAMPLE_METHOD_DEFAULT,
EULER_A,
EULER,
HEUN,
DPM2,
@ -47,7 +47,6 @@ enum sample_method_t {
LCM,
DDIM_TRAILING,
TCD,
EULER_A,
SAMPLE_METHOD_COUNT
};
@ -58,7 +57,6 @@ enum scheduler_t {
EXPONENTIAL,
AYS,
GITS,
SMOOTHSTEP,
SCHEDULE_COUNT
};
@ -114,15 +112,6 @@ enum sd_log_level_t {
SD_LOG_ERROR
};
typedef struct {
bool enabled;
int tile_size_x;
int tile_size_y;
float target_overlap;
float rel_size_x;
float rel_size_y;
} sd_tiling_params_t;
typedef struct {
const char* model_path;
const char* clip_l_path;
@ -138,6 +127,7 @@ typedef struct {
const char* embedding_dir;
const char* stacked_id_embed_dir;
bool vae_decode_only;
bool vae_tiling;
bool free_params_immediately;
int n_threads;
enum sd_type_t wtype;
@ -205,7 +195,6 @@ typedef struct {
float style_strength;
bool normalize_input;
const char* input_id_images_path;
sd_tiling_params_t vae_tiling_params;
} sd_img_gen_params_t;
typedef struct {
@ -251,7 +240,6 @@ SD_API char* sd_ctx_params_to_str(const sd_ctx_params_t* sd_ctx_params);
SD_API sd_ctx_t* new_sd_ctx(const sd_ctx_params_t* sd_ctx_params);
SD_API void free_sd_ctx(sd_ctx_t* sd_ctx);
SD_API enum sample_method_t sd_get_default_sample_method(const sd_ctx_t* sd_ctx);
SD_API void sd_sample_params_init(sd_sample_params_t* sample_params);
SD_API char* sd_sample_params_to_str(const sd_sample_params_t* sample_params);

View File

@ -19,13 +19,13 @@ struct UpscalerGGML {
bool load_from_file(const std::string& esrgan_path,
bool offload_params_to_cpu) {
ggml_log_set(ggml_log_callback_default, nullptr);
#ifdef SD_USE_CUDA
LOG_DEBUG("Using CUDA backend");
backend = ggml_backend_cuda_init(0);
#endif
#ifdef SD_USE_METAL
LOG_DEBUG("Using Metal backend");
ggml_log_set(ggml_log_callback_default, nullptr);
backend = ggml_backend_metal_init();
#endif
#ifdef SD_USE_VULKAN
@ -69,7 +69,8 @@ struct UpscalerGGML {
input_image.width, input_image.height, output_width, output_height);
struct ggml_init_params params;
params.mem_size = static_cast<size_t>(1024 * 1024) * 1024; // 1G
params.mem_size = output_width * output_height * 3 * sizeof(float) * 2;
params.mem_size += 2 * ggml_tensor_overhead();
params.mem_buffer = NULL;
params.no_alloc = false;
@ -79,7 +80,7 @@ struct UpscalerGGML {
LOG_ERROR("ggml_init() failed");
return upscaled_image;
}
// LOG_DEBUG("upscale work buffer size: %.2f MB", params.mem_size / 1024.f / 1024.f);
LOG_DEBUG("upscale work buffer size: %.2f MB", params.mem_size / 1024.f / 1024.f);
ggml_tensor* input_image_tensor = ggml_new_tensor_4d(upscale_ctx, GGML_TYPE_F32, input_image.width, input_image.height, 3, 1);
sd_image_to_tensor(input_image, input_image_tensor);

View File

@ -414,10 +414,7 @@ void log_printf(sd_log_level_t level, const char* file, int line, const char* fo
if (written >= 0 && written < LOG_BUFFER_SIZE) {
vsnprintf(log_buffer + written, LOG_BUFFER_SIZE - written, format, args);
}
size_t len = strlen(log_buffer);
if (log_buffer[len - 1] != '\n') {
strncat(log_buffer, "\n", LOG_BUFFER_SIZE - len);
}
strncat(log_buffer, "\n", LOG_BUFFER_SIZE - strlen(log_buffer));
if (sd_log_cb) {
sd_log_cb(level, log_buffer, sd_log_cb_data);

View File

@ -588,7 +588,7 @@ struct AutoEncoderKL : public VAE {
};
// ggml_set_f32(z, 0.5f);
// print_ggml_tensor(z);
GGMLRunner::compute(get_graph, n_threads, false, output, output_ctx);
GGMLRunner::compute(get_graph, n_threads, true, output, output_ctx);
}
void test() {

View File

@ -1219,7 +1219,7 @@ namespace WAN {
void test() {
struct ggml_init_params params;
params.mem_size = static_cast<size_t>(1024 * 1024) * 1024; // 1G
params.mem_size = static_cast<size_t>(1000 * 1024 * 1024); // 10 MB
params.mem_buffer = NULL;
params.no_alloc = false;