Compare commits


No commits in common. "master" and "master-580-7d33d4b" have entirely different histories.

66 changed files with 1626 additions and 6393 deletions

View File

@ -72,31 +72,37 @@ option(SD_USE_SYSTEM_GGML "sd: use system-installed GGML library" OFF
if(SD_CUDA)
message("-- Use CUDA as backend stable-diffusion")
set(GGML_CUDA ON)
add_definitions(-DSD_USE_CUDA)
endif()
if(SD_METAL)
message("-- Use Metal as backend stable-diffusion")
set(GGML_METAL ON)
add_definitions(-DSD_USE_METAL)
endif()
if (SD_VULKAN)
message("-- Use Vulkan as backend stable-diffusion")
set(GGML_VULKAN ON)
add_definitions(-DSD_USE_VULKAN)
endif ()
if (SD_OPENCL)
message("-- Use OpenCL as backend stable-diffusion")
set(GGML_OPENCL ON)
add_definitions(-DSD_USE_OPENCL)
endif ()
if (SD_HIPBLAS)
message("-- Use HIPBLAS as backend stable-diffusion")
set(GGML_HIP ON)
add_definitions(-DSD_USE_CUDA)
endif ()
if(SD_MUSA)
message("-- Use MUSA as backend stable-diffusion")
set(GGML_MUSA ON)
add_definitions(-DSD_USE_CUDA)
endif()
if(SD_WEBP)
@ -150,12 +156,10 @@ endif()
set(SD_LIB stable-diffusion)
file(GLOB SD_LIB_SOURCES CONFIGURE_DEPENDS
file(GLOB SD_LIB_SOURCES
"src/*.h"
"src/*.cpp"
"src/*.hpp"
"src/model_io/*.h"
"src/model_io/*.cpp"
"src/tokenizers/*.h"
"src/tokenizers/*.cpp"
"src/tokenizers/vocab/*.h"
@ -216,6 +220,7 @@ if(SD_SYCL)
message("-- Use SYCL as backend stable-diffusion")
set(GGML_SYCL ON)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-narrowing -fsycl")
add_definitions(-DSD_USE_SYCL)
# disable fast-math on host, see:
# https://www.intel.com/content/www/us/en/docs/cpp-compiler/developer-guide-reference/2021-10/fp-model-fp.html
if (WIN32)

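Each backend block above follows the same pattern: log a status line, enable the matching GGML backend, and export a compile definition; note that SD_HIPBLAS and SD_MUSA intentionally reuse `-DSD_USE_CUDA`, since they route through the CUDA code path. A minimal sketch of how such definitions are typically consumed on the C++ side (the function below is illustrative, not from this repository):

```cpp
// Illustrative only: backend selection via the compile definitions that
// the CMake options above export. Not code from this repository.
static const char* backend_name() {
#if defined(SD_USE_CUDA)
    return "CUDA";    // also set by SD_HIPBLAS and SD_MUSA
#elif defined(SD_USE_METAL)
    return "Metal";
#elif defined(SD_USE_VULKAN)
    return "Vulkan";
#elif defined(SD_USE_OPENCL)
    return "OpenCL";
#elif defined(SD_USE_SYCL)
    return "SYCL";
#else
    return "CPU";
#endif
}
```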
View File

@ -77,10 +77,9 @@ API and command-line option may change frequently.***
- OpenCL
- SYCL
- Supported weight formats
- Pytorch checkpoint (`.ckpt` or `.pth` or `.pt`)
- Pytorch checkpoint (`.ckpt` or `.pth`)
- Safetensors (`.safetensors`)
- GGUF (`.gguf`)
- Convert mode supports converting model weights to `.gguf` or `.safetensors`
- Supported platforms
- Linux
- Mac OS

View File

@ -131,6 +131,8 @@ sd-cli -m model.safetensors -p "a cat" --cache-mode spectrum
| `warmup` | Steps to always compute before caching starts | 4 |
| `stop` | Stop caching at this fraction of total steps | 0.9 |
```
### Performance Tips
- Start with default thresholds and adjust based on output quality
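For example, a `spectrum` run that tightens the forecast window and stops caching a little earlier could combine the keys from the table above (values illustrative):
```
sd-cli -m model.safetensors -p "a cat" --cache-mode spectrum --cache-option "w=0.4,window=2,warmup=4,stop=0.8"
```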

View File

@ -4,17 +4,14 @@
usage: ./bin/sd-cli [options]
CLI Options:
-o, --output <string> path to write result image to. you can use printf-style %d format specifiers for image
sequences (default: ./output.png) (e.g. output_%03d.png). Single-file video outputs
support .avi, .webm, and animated .webp
-o, --output <string> path to write result image to. you can use printf-style %d format specifiers for image sequences (default:
./output.png) (e.g. output_%03d.png). For video generation, single-file outputs support .avi, .webm, and animated .webp
--preview-path <string> path to write preview image to (default: ./preview.png). Multi-frame previews support .avi, .webm, and animated .webp
--preview-interval <int> interval in denoising steps between consecutive updates of the image preview file (default is 1, meaning updating at
every step)
--output-begin-idx <int> starting index for output image sequence, must be non-negative (default 0 if specified %d in output path, 1 otherwise)
--image <string> path to the image to inspect (for metadata mode)
--metadata-format <string> metadata output format, one of [text, json] (default: text)
--preview-path <string> path to write preview image to (default: ./preview.png). Multi-frame previews support
.avi, .webm, and animated .webp
--preview-interval <int> interval in denoising steps between consecutive updates of the image preview file
(default is 1, meaning updating at every step)
--output-begin-idx <int> starting index for output image sequence, must be non-negative (default 0 if specified
%d in output path, 1 otherwise)
--canny apply canny preprocessor (edge detection)
--convert-name convert tensor name (for convert mode)
-v, --verbose print extra info
@ -34,8 +31,7 @@ Context Options:
--clip_g <string> path to the clip-g text encoder
--clip_vision <string> path to the clip-vision encoder
--t5xxl <string> path to the t5xxl text encoder
--llm <string> path to the llm text encoder. For example: (qwenvl2.5 for qwen-image,
mistral-small3.2 for flux2, ...)
--llm <string> path to the llm text encoder. For example: (qwenvl2.5 for qwen-image, mistral-small3.2 for flux2, ...)
--llm_vision <string> path to the llm vit
--qwen2vl <string> alias of --llm. Deprecated.
--qwen2vl_vision <string> alias of --llm_vision. Deprecated.
@ -47,18 +43,16 @@ Context Options:
--control-net <string> path to control net model
--embd-dir <string> embeddings directory
--lora-model-dir <string> lora model directory
--hires-upscalers-dir <string> highres fix upscaler model directory
--tensor-type-rules <string> weight type per tensor pattern (example: "^vae\.=f16,model\.=q8_0")
--photo-maker <string> path to PHOTOMAKER model
--upscale-model <string> path to esrgan model.
-t, --threads <int> number of threads to use during computation (default: -1). If threads <= 0,
then threads will be set to the number of CPU physical cores
-t, --threads <int> number of threads to use during computation (default: -1). If threads <= 0, then threads will be set to the number of
CPU physical cores
--chroma-t5-mask-pad <int> t5 mask pad size of chroma
--max-vram <float> maximum VRAM budget in GiB for graph-cut segmented execution. 0 disables
graph splitting
--vae-tile-overlap <float> tile overlap for vae tiling, in fraction of tile size (default: 0.5)
--vae-tiling process vae in tiles to reduce memory usage
--force-sdxl-vae-conv-scale force use of conv scale on sdxl vae
--offload-to-cpu place the weights in RAM to save VRAM, and automatically load them into VRAM
when needed
--offload-to-cpu place the weights in RAM to save VRAM, and automatically load them into VRAM when needed
--mmap whether to memory-map model
--control-net-cpu keep controlnet in cpu (for low vram)
--clip-on-cpu keep clip in cpu (for low vram)
@ -73,19 +67,20 @@ Context Options:
--chroma-disable-dit-mask disable dit mask for chroma
--qwen-image-zero-cond-t enable zero_cond_t for qwen image
--chroma-enable-t5-mask enable t5 mask for chroma
--type weight type (examples: f32, f16, q4_0, q4_1, q5_0, q5_1, q8_0, q2_K, q3_K,
q4_K). If not specified, the default is the type of the weight file
--type weight type (examples: f32, f16, q4_0, q4_1, q5_0, q5_1, q8_0, q2_K, q3_K, q4_K). If not specified, the default is the
type of the weight file
--rng RNG, one of [std_default, cuda, cpu], default: cuda(sd-webui), cpu(comfyui)
--sampler-rng sampler RNG, one of [std_default, cuda, cpu]. If not specified, use --rng
--prediction prediction type override, one of [eps, v, edm_v, sd3_flow, flux_flow,
flux2_flow]
--lora-apply-mode the way to apply LoRA, one of [auto, immediately, at_runtime], default is
auto. In auto mode, if the model weights contain any quantized parameters,
the at_runtime mode will be used; otherwise, immediately will be used. The
immediately mode may have precision and compatibility issues with quantized
parameters, but it usually offers faster inference speed and, in some cases,
lower memory usage. The at_runtime mode, on the other hand, is exactly the
opposite.
--prediction prediction type override, one of [eps, v, edm_v, sd3_flow, flux_flow, flux2_flow]
--lora-apply-mode the way to apply LoRA, one of [auto, immediately, at_runtime], default is auto. In auto mode, if the model weights
contain any quantized parameters, the at_runtime mode will be used; otherwise,
immediately will be used. The immediately mode may have precision and
compatibility issues with quantized parameters, but it usually offers faster inference
speed and, in some cases, lower memory usage. The at_runtime mode, on the
other hand, is exactly the opposite.
--vae-tile-size tile size for vae tiling, format [X]x[Y] (default: 32x32)
--vae-relative-tile-size relative tile size for vae tiling, format [X]x[Y], in fraction of image size if < 1, in number of tiles per dim if >=1
(overrides --vae-tile-size)
Generation Options:
-p, --prompt <string> the prompt to render
@ -94,99 +89,69 @@ Generation Options:
--end-img <string> path to the end image, required by flf2v
--mask <string> path to the mask image
--control-image <string> path to control image, control net
--control-video <string> path to control video frames. It must be a directory path. The video frames
inside should be stored as images in lexicographical (character) order. For
example, if the control video path is `frames`, the directory contains images
such as 00.png, 01.png, ... etc.
--control-video <string> path to control video frames. It must be a directory path. The video frames inside should be stored as images in
lexicographical (character) order. For example, if the control video path is
`frames`, the directory contains images such as 00.png, 01.png, ... etc.
--pm-id-images-dir <string> path to PHOTOMAKER input id images dir
--pm-id-embed-path <string> path to PHOTOMAKER v2 id embed
--hires-upscaler <string> highres fix upscaler, Lanczos, Nearest, Latent, Latent (nearest), Latent
(nearest-exact), Latent (antialiased), Latent (bicubic), Latent (bicubic
antialiased), or a model name under --hires-upscalers-dir (default: Latent)
-H, --height <int> image height, in pixel space (default: 512)
-W, --width <int> image width, in pixel space (default: 512)
--steps <int> number of sample steps (default: 20)
--high-noise-steps <int> (high noise) number of sample steps (default: -1 = auto)
--clip-skip <int> ignore last layers of CLIP network; 1 ignores none, 2 ignores one layer
(default: -1). <= 0 represents unspecified, will be 1 for SD1.x, 2 for SD2.x
--clip-skip <int> ignore last layers of CLIP network; 1 ignores none, 2 ignores one layer (default: -1). <= 0 represents unspecified,
will be 1 for SD1.x, 2 for SD2.x
-b, --batch-count <int> batch count
--video-frames <int> video frames (default: 1)
--fps <int> fps (default: 24)
--timestep-shift <int> shift timestep for NitroFusion models (default: 0). recommended N for
NitroSD-Realism around 250 and 500 for NitroSD-Vibrant
--timestep-shift <int> shift timestep for NitroFusion models (default: 0). recommended N for NitroSD-Realism around 250 and 500 for
NitroSD-Vibrant
--upscale-repeats <int> Run the ESRGAN upscaler this many times (default: 1)
--upscale-tile-size <int> tile size for ESRGAN upscaling (default: 128)
--hires-width <int> highres fix target width, 0 to use --hires-scale (default: 0)
--hires-height <int> highres fix target height, 0 to use --hires-scale (default: 0)
--hires-steps <int> highres fix second pass sample steps, 0 to reuse --steps (default: 0)
--hires-upscale-tile-size <int> highres fix upscaler tile size, reserved for model-backed upscalers (default:
128)
--cfg-scale <float> unconditional guidance scale: (default: 7.0)
--img-cfg-scale <float> image guidance scale for inpaint or instruct-pix2pix models: (default: same
as --cfg-scale)
--img-cfg-scale <float> image guidance scale for inpaint or instruct-pix2pix models: (default: same as --cfg-scale)
--guidance <float> distilled guidance scale for models with guidance input (default: 3.5)
--slg-scale <float> skip layer guidance (SLG) scale, only for DiT models: (default: 0). 0 means
disabled, a value of 2.5 is nice for sd3.5 medium
--slg-scale <float> skip layer guidance (SLG) scale, only for DiT models: (default: 0). 0 means disabled, a value of 2.5 is nice for sd3.5
medium
--skip-layer-start <float> SLG enabling point (default: 0.01)
--skip-layer-end <float> SLG disabling point (default: 0.2)
--eta <float> noise multiplier (default: 0 for ddim_trailing, tcd, res_multistep and
res_2s; 1 for euler_a, er_sde and dpm++2s_a)
--eta <float> noise multiplier (default: 0 for ddim_trailing, tcd, res_multistep and res_2s; 1 for euler_a, er_sde and dpm++2s_a)
--flow-shift <float> shift value for Flow models like SD3.x or WAN (default: auto)
--high-noise-cfg-scale <float> (high noise) unconditional guidance scale: (default: 7.0)
--high-noise-img-cfg-scale <float> (high noise) image guidance scale for inpaint or instruct-pix2pix models
(default: same as --cfg-scale)
--high-noise-guidance <float> (high noise) distilled guidance scale for models with guidance input
(default: 3.5)
--high-noise-slg-scale <float> (high noise) skip layer guidance (SLG) scale, only for DiT models: (default:
0)
--high-noise-img-cfg-scale <float> (high noise) image guidance scale for inpaint or instruct-pix2pix models (default: same as --cfg-scale)
--high-noise-guidance <float> (high noise) distilled guidance scale for models with guidance input (default: 3.5)
--high-noise-slg-scale <float> (high noise) skip layer guidance (SLG) scale, only for DiT models: (default: 0)
--high-noise-skip-layer-start <float> (high noise) SLG enabling point (default: 0.01)
--high-noise-skip-layer-end <float> (high noise) SLG disabling point (default: 0.2)
--high-noise-eta <float> (high noise) noise multiplier (default: 0 for ddim_trailing, tcd,
res_multistep and res_2s; 1 for euler_a, er_sde and dpm++2s_a)
--high-noise-eta <float> (high noise) noise multiplier (default: 0 for ddim_trailing, tcd, res_multistep and res_2s; 1 for euler_a, er_sde and dpm++2s_a)
--strength <float> strength for noising/unnoising (default: 0.75)
--pm-style-strength <float>
--control-strength <float> strength to apply Control Net (default: 0.9). 1.0 corresponds to full
destruction of information in init image
--moe-boundary <float> timestep boundary for Wan2.2 MoE model. (default: 0.875). Only enabled if
`--high-noise-steps` is set to -1
--control-strength <float> strength to apply Control Net (default: 0.9). 1.0 corresponds to full destruction of information in init image
--moe-boundary <float> timestep boundary for Wan2.2 MoE model. (default: 0.875). Only enabled if `--high-noise-steps` is set to -1
--vace-strength <float> wan vace strength
--vae-tile-overlap <float> tile overlap for vae tiling, in fraction of tile size (default: 0.5)
--hires-scale <float> highres fix scale when target size is not set (default: 2.0)
--hires-denoising-strength <float> highres fix second pass denoising strength (default: 0.7)
--increase-ref-index automatically increase the indices of reference images based on the order
they are listed (starting with 1).
--increase-ref-index automatically increase the indices of reference images based on the order they are listed (starting with 1).
--disable-auto-resize-ref-image disable auto resize of ref images
--disable-image-metadata do not embed generation metadata on image files
--vae-tiling process vae in tiles to reduce memory usage
--hires enable highres fix
-s, --seed RNG seed (default: 42, use random seed for < 0)
--sampling-method sampling method, one of [euler, euler_a, heun, dpm2, dpm++2s_a, dpm++2m,
dpm++2mv2, ipndm, ipndm_v, lcm, ddim_trailing, tcd, res_multistep, res_2s,
er_sde] (default: euler for Flux/SD3/Wan, euler_a otherwise)
--high-noise-sampling-method (high noise) sampling method, one of [euler, euler_a, heun, dpm2, dpm++2s_a,
dpm++2m, dpm++2mv2, ipndm, ipndm_v, lcm, ddim_trailing, tcd, res_multistep,
res_2s, er_sde] default: euler for Flux/SD3/Wan, euler_a otherwise
--scheduler denoiser sigma scheduler, one of [discrete, karras, exponential, ays, gits,
smoothstep, sgm_uniform, simple, kl_optimal, lcm, bong_tangent], default:
discrete
--sigmas custom sigma values for the sampler, comma-separated (e.g.,
"14.61,7.8,3.5,0.0").
--sampling-method sampling method, one of [euler, euler_a, heun, dpm2, dpm++2s_a, dpm++2m, dpm++2mv2, ipndm, ipndm_v, lcm, ddim_trailing,
tcd, res_multistep, res_2s, er_sde] (default: euler for Flux/SD3/Wan, euler_a
otherwise)
--high-noise-sampling-method (high noise) sampling method, one of [euler, euler_a, heun, dpm2, dpm++2s_a, dpm++2m, dpm++2mv2, ipndm, ipndm_v, lcm,
ddim_trailing, tcd, res_multistep, res_2s, er_sde] default: euler for Flux/SD3/Wan,
euler_a otherwise
--scheduler denoiser sigma scheduler, one of [discrete, karras, exponential, ays, gits, smoothstep, sgm_uniform, simple,
kl_optimal, lcm, bong_tangent], default: discrete
--sigmas custom sigma values for the sampler, comma-separated (e.g., "14.61,7.8,3.5,0.0").
--skip-layers layers to skip for SLG steps (default: [7,8,9])
--high-noise-skip-layers (high noise) layers to skip for SLG steps (default: [7,8,9])
-r, --ref-image reference image for Flux Kontext models (can be used multiple times)
--cache-mode caching method: 'easycache' (DiT), 'ucache' (UNET),
'dbcache'/'taylorseer'/'cache-dit' (DiT block-level), 'spectrum' (UNET/DiT
Chebyshev+Taylor forecasting)
--cache-mode caching method: 'easycache' (DiT), 'ucache' (UNET), 'dbcache'/'taylorseer'/'cache-dit' (DiT block-level),
'spectrum' (UNET/DiT Chebyshev+Taylor forecasting)
--cache-option named cache params (key=value format, comma-separated). easycache/ucache:
threshold=,start=,end=,decay=,relative=,reset=; dbcache/taylorseer/cache-dit:
Fn=,Bn=,threshold=,warmup=; spectrum: w=,m=,lam=,window=,flex=,warmup=,stop=.
Examples: "threshold=0.25" or "threshold=1.5,reset=0"
--scm-mask SCM steps mask for cache-dit: comma-separated 0/1 (e.g.,
"1,1,1,0,0,1,0,0,1,0") - 1=compute, 0=can cache
threshold=,start=,end=,decay=,relative=,reset=; dbcache/taylorseer/cache-dit: Fn=,Bn=,threshold=,warmup=;
spectrum: w=,m=,lam=,window=,flex=,warmup=,stop=. Examples:
"threshold=0.25" or "threshold=1.5,reset=0" or "w=0.4,window=2"
--scm-mask SCM steps mask for cache-dit: comma-separated 0/1 (e.g., "1,1,1,0,0,1,0,0,1,0") - 1=compute, 0=can cache
--scm-policy SCM policy: 'dynamic' (default) or 'static'
--vae-tile-size tile size for vae tiling, format [X]x[Y] (default: 32x32)
--vae-relative-tile-size relative tile size for vae tiling, format [X]x[Y], in fraction of image size
if < 1, in number of tiles per dim if >=1 (overrides --vae-tile-size)
```
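`--cache-option` above takes comma-separated `key=value` pairs. A minimal sketch of that parsing shape, assuming a plain split-based parser (the function name is hypothetical, not this repository's implementation):

```cpp
#include <sstream>
#include <string>
#include <unordered_map>

// Hypothetical: split "threshold=1.5,reset=0" into {"threshold":"1.5","reset":"0"}.
// The real CLI presumably also validates keys per cache mode.
static std::unordered_map<std::string, std::string> parse_cache_option(const std::string& arg) {
    std::unordered_map<std::string, std::string> kv;
    std::istringstream ss(arg);
    std::string item;
    while (std::getline(ss, item, ',')) {      // split on commas
        size_t eq = item.find('=');
        if (eq != std::string::npos)
            kv[item.substr(0, eq)] = item.substr(eq + 1);
    }
    return kv;
}
```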
Metadata mode inspects PNG/JPEG container metadata without loading any model:

View File

@ -278,9 +278,7 @@ void parse_args(int argc, const char** argv, SDCliParams& cli_params, SDContextP
bool valid = cli_params.resolve_and_validate();
if (valid && cli_params.mode != METADATA) {
valid = ctx_params.resolve_and_validate(cli_params.mode) &&
gen_params.resolve_and_validate(cli_params.mode,
ctx_params.lora_model_dir,
ctx_params.hires_upscalers_dir);
gen_params.resolve_and_validate(cli_params.mode, ctx_params.lora_model_dir);
}
if (!valid) {
@ -433,9 +431,8 @@ bool save_results(const SDCliParams& cli_params,
if (!img.data)
return false;
const int64_t metadata_seed = cli_params.mode == VID_GEN ? gen_params.seed : gen_params.seed + idx;
std::string params = gen_params.embed_image_metadata
? get_image_params(ctx_params, gen_params, metadata_seed, cli_params.mode)
? get_image_params(ctx_params, gen_params, gen_params.seed + idx)
: "";
const bool ok = write_image_to_file(path.string(), img.data, img.width, img.height, img.channel, params, 90);
LOG_INFO("save result image %d to '%s' (%s)", idx, path.string().c_str(), ok ? "success" : "failure");
@ -691,13 +688,6 @@ int main(int argc, const char* argv[]) {
vae_decode_only = false;
}
if (gen_params.hires_enabled &&
(gen_params.resolved_hires_upscaler == SD_HIRES_UPSCALER_MODEL ||
gen_params.resolved_hires_upscaler == SD_HIRES_UPSCALER_LANCZOS ||
gen_params.resolved_hires_upscaler == SD_HIRES_UPSCALER_NEAREST)) {
vae_decode_only = false;
}
sd_ctx_params_t sd_ctx_params = ctx_params.to_sd_ctx_params_t(vae_decode_only, true, cli_params.taesd_preview);
SDImageVec results;

View File

@ -107,60 +107,47 @@ static bool is_absolute_path(const std::string& p) {
std::string ArgOptions::wrap_text(const std::string& text, size_t width, size_t indent) {
std::ostringstream oss;
size_t pos = 0;
size_t line_len = 0;
size_t pos = 0;
while (pos < text.size()) {
// Preserve manual newlines
if (text[pos] == '\n') {
oss << '\n'
<< std::string(indent, ' ');
line_len = 0;
line_len = indent;
++pos;
continue;
}
if (std::isspace(static_cast<unsigned char>(text[pos]))) {
++pos;
continue;
}
size_t word_start = pos;
while (pos < text.size() &&
text[pos] != '\n' &&
!std::isspace(static_cast<unsigned char>(text[pos]))) {
++pos;
}
std::string word = text.substr(word_start, pos - word_start);
while (!word.empty()) {
size_t separator_len = line_len == 0 ? 0 : 1;
if (line_len + separator_len + word.size() <= width) {
if (separator_len > 0) {
oss << ' ';
// Add the character
oss << text[pos];
++line_len;
}
oss << word;
line_len += word.size();
word.clear();
continue;
}
++pos;
if (line_len > 0) {
oss << '\n'
<< std::string(indent, ' ');
line_len = 0;
continue;
}
// If the current line exceeds width, try to break at the last space
if (line_len >= width) {
std::string current = oss.str();
size_t back = current.size();
size_t chunk_len = std::min(width, word.size());
oss << word.substr(0, chunk_len);
line_len = chunk_len;
word.erase(0, chunk_len);
if (!word.empty()) {
oss << '\n'
// Find the last space (for a clean break)
while (back > 0 && current[back - 1] != ' ' && current[back - 1] != '\n')
--back;
// If found a space to break on
if (back > 0 && current[back - 1] != '\n') {
std::string before = current.substr(0, back - 1);
std::string after = current.substr(back);
oss.str("");
oss.clear();
oss << before << "\n"
<< std::string(indent, ' ') << after;
} else {
// If no space found, just break at width
oss << "\n"
<< std::string(indent, ' ');
line_len = 0;
}
line_len = indent;
}
}
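The two interleaved variants of `ArgOptions::wrap_text` above both produce width-limited help text with indented continuation lines: one packs whole words greedily and hard-splits words longer than the width, the other emits characters and retroactively breaks at the last space. A standalone sketch of the word-by-word approach (simplified, not the exact implementation):

```cpp
#include <algorithm>
#include <cctype>
#include <sstream>
#include <string>

// Greedy word-wrap sketch: pack words up to `width` columns, indent
// continuation lines, preserve manual '\n', hard-split overlong words.
static std::string wrap_text_sketch(const std::string& text, size_t width, size_t indent) {
    std::ostringstream oss;
    size_t col = 0;        // current column
    bool at_start = true;  // nothing emitted on this line yet
    auto break_line = [&]() {
        oss << '\n' << std::string(indent, ' ');
        col = indent;
        at_start = true;
    };
    for (size_t pos = 0; pos < text.size();) {
        if (text[pos] == '\n') { break_line(); ++pos; continue; }  // preserve manual newlines
        if (std::isspace(static_cast<unsigned char>(text[pos]))) { ++pos; continue; }
        size_t start = pos;  // scan one word
        while (pos < text.size() && text[pos] != '\n' &&
               !std::isspace(static_cast<unsigned char>(text[pos])))
            ++pos;
        std::string word = text.substr(start, pos - start);
        while (!word.empty()) {
            size_t sep = at_start ? 0 : 1;               // space before word, if any
            if (col + sep + word.size() <= width) {      // word fits on this line
                if (sep) oss << ' ';
                oss << word;
                col += sep + word.size();
                at_start = false;
                word.clear();
            } else if (!at_start) {                      // doesn't fit: wrap, then retry
                break_line();
            } else {                                     // overlong word: hard split
                size_t avail = width > col ? width - col : 1;
                size_t chunk = std::min(avail, word.size());
                oss << word.substr(0, chunk);
                word.erase(0, chunk);
                if (!word.empty()) break_line();
                else { col += chunk; at_start = false; }
            }
        }
    }
    return oss.str();
}
```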
@ -364,10 +351,7 @@ ArgOptions SDContextParams::get_options() {
"--lora-model-dir",
"lora model directory",
&lora_model_dir},
{"",
"--hires-upscalers-dir",
"highres fix upscaler model directory",
&hires_upscalers_dir},
{"",
"--tensor-type-rules",
"weight type per tensor pattern (example: \"^vae\\.=f16,model\\.=q8_0\")",
@ -394,12 +378,7 @@ ArgOptions SDContextParams::get_options() {
&chroma_t5_mask_pad},
};
options.float_options = {
{"",
"--max-vram",
"maximum VRAM budget in GiB for graph-cut segmented execution. 0 disables graph splitting",
&max_vram},
};
options.float_options = {};
options.bool_options = {
{"",
@ -670,12 +649,10 @@ std::string SDContextParams::to_string() const {
<< " wtype: " << sd_type_name(wtype) << ",\n"
<< " tensor_type_rules: \"" << tensor_type_rules << "\",\n"
<< " lora_model_dir: \"" << lora_model_dir << "\",\n"
<< " hires_upscalers_dir: \"" << hires_upscalers_dir << "\",\n"
<< " photo_maker_path: \"" << photo_maker_path << "\",\n"
<< " rng_type: " << sd_rng_type_name(rng_type) << ",\n"
<< " sampler_rng_type: " << sd_rng_type_name(sampler_rng_type) << ",\n"
<< " offload_params_to_cpu: " << (offload_params_to_cpu ? "true" : "false") << ",\n"
<< " max_vram: " << max_vram << ",\n"
<< " enable_mmap: " << (enable_mmap ? "true" : "false") << ",\n"
<< " control_net_cpu: " << (control_net_cpu ? "true" : "false") << ",\n"
<< " clip_on_cpu: " << (clip_on_cpu ? "true" : "false") << ",\n"
@ -750,7 +727,6 @@ sd_ctx_params_t SDContextParams::to_sd_ctx_params_t(bool vae_decode_only, bool f
chroma_use_t5_mask,
chroma_t5_mask_pad,
qwen_image_zero_cond_t,
max_vram,
};
return sd_ctx_params;
}
@ -801,12 +777,6 @@ ArgOptions SDGenerationParams::get_options() {
"--pm-id-embed-path",
"path to PHOTOMAKER v2 id embed",
&pm_id_embed_path},
{"",
"--hires-upscaler",
"highres fix upscaler, Lanczos, Nearest, Latent, Latent (nearest), Latent (nearest-exact), "
"Latent (antialiased), Latent (bicubic), Latent (bicubic antialiased), or a model name "
"under --hires-upscalers-dir (default: Latent)",
&hires_upscaler},
};
options.int_options = {
@ -856,22 +826,6 @@ ArgOptions SDGenerationParams::get_options() {
"--upscale-tile-size",
"tile size for ESRGAN upscaling (default: 128)",
&upscale_tile_size},
{"",
"--hires-width",
"highres fix target width, 0 to use --hires-scale (default: 0)",
&hires_width},
{"",
"--hires-height",
"highres fix target height, 0 to use --hires-scale (default: 0)",
&hires_height},
{"",
"--hires-steps",
"highres fix second pass sample steps, 0 to reuse --steps (default: 0)",
&hires_steps},
{"",
"--hires-upscale-tile-size",
"highres fix upscaler tile size, reserved for model-backed upscalers (default: 128)",
&hires_upscale_tile_size},
};
options.float_options = {
@ -959,14 +913,6 @@ ArgOptions SDGenerationParams::get_options() {
"--vae-tile-overlap",
"tile overlap for vae tiling, in fraction of tile size (default: 0.5)",
&vae_tiling_params.target_overlap},
{"",
"--hires-scale",
"highres fix scale when target size is not set (default: 2.0)",
&hires_scale},
{"",
"--hires-denoising-strength",
"highres fix second pass denoising strength (default: 0.7)",
&hires_denoising_strength},
};
options.bool_options = {
@ -990,11 +936,6 @@ ArgOptions SDGenerationParams::get_options() {
"process vae in tiles to reduce memory usage",
true,
&vae_tiling_params.enabled},
{"",
"--hires",
"enable highres fix",
true,
&hires_enabled},
};
auto on_seed_arg = [&](int argc, const char** argv, int index) {
@ -1483,37 +1424,6 @@ static bool parse_lora_json_field(const json& parent,
return true;
}
static bool resolve_model_file_from_dir(const std::string& model_name,
const std::string& model_dir,
const std::vector<std::string>& valid_ext,
const char* label,
std::string& resolved_path) {
if (model_dir.empty()) {
LOG_ERROR("%s directory is empty", label);
return false;
}
if (model_name.empty() ||
model_name.find('/') != std::string::npos ||
model_name.find('\\') != std::string::npos ||
fs::path(model_name).has_root_path() ||
fs::path(model_name).has_extension()) {
LOG_ERROR("%s must be a model name without path or extension: %s", label, model_name.c_str());
return false;
}
fs::path model_dir_path = model_dir;
for (const auto& ext : valid_ext) {
fs::path try_path = model_dir_path / (model_name + ext);
if (fs::exists(try_path) && fs::is_regular_file(try_path)) {
resolved_path = try_path.lexically_normal().string();
return true;
}
}
LOG_ERROR("can not find %s %s in %s", label, model_name.c_str(), model_dir_path.lexically_normal().string().c_str());
return false;
}
bool SDGenerationParams::from_json_str(
const std::string& json_str,
const std::function<std::string(const std::string&)>& lora_path_resolver) {
@ -1577,34 +1487,6 @@ bool SDGenerationParams::from_json_str(
load_if_exists("increase_ref_index", increase_ref_index);
load_if_exists("embed_image_metadata", embed_image_metadata);
if (j.contains("hires") && j["hires"].is_object()) {
const json& hires_json = j["hires"];
if (hires_json.contains("enabled") && hires_json["enabled"].is_boolean()) {
hires_enabled = hires_json["enabled"];
}
if (hires_json.contains("upscaler") && hires_json["upscaler"].is_string()) {
hires_upscaler = hires_json["upscaler"];
}
if (hires_json.contains("scale") && hires_json["scale"].is_number()) {
hires_scale = hires_json["scale"];
}
if (hires_json.contains("target_width") && hires_json["target_width"].is_number_integer()) {
hires_width = hires_json["target_width"];
}
if (hires_json.contains("target_height") && hires_json["target_height"].is_number_integer()) {
hires_height = hires_json["target_height"];
}
if (hires_json.contains("steps") && hires_json["steps"].is_number_integer()) {
hires_steps = hires_json["steps"];
}
if (hires_json.contains("denoising_strength") && hires_json["denoising_strength"].is_number()) {
hires_denoising_strength = hires_json["denoising_strength"];
}
if (hires_json.contains("upscale_tile_size") && hires_json["upscale_tile_size"].is_number_integer()) {
hires_upscale_tile_size = hires_json["upscale_tile_size"];
}
}
auto parse_sample_params_json = [&](const json& sample_json,
sd_sample_params_t& target_params,
std::vector<int>& target_skip_layers,
@ -1918,7 +1800,7 @@ bool SDGenerationParams::initialize_cache_params() {
return true;
}
bool SDGenerationParams::resolve(const std::string& lora_model_dir, const std::string& hires_upscalers_dir, bool strict) {
bool SDGenerationParams::resolve(const std::string& lora_model_dir, bool strict) {
if (high_noise_sample_params.sample_steps <= 0) {
high_noise_sample_params.sample_steps = -1;
}
@ -1937,27 +1819,6 @@ bool SDGenerationParams::resolve(const std::string& lora_model_dir, const std::s
sample_params.sample_steps = std::clamp(sample_params.sample_steps, 1, 100);
}
hires_upscaler_model_path.clear();
if (hires_enabled) {
if (hires_upscaler.empty()) {
hires_upscaler = "Latent";
}
resolved_hires_upscaler = str_to_sd_hires_upscaler(hires_upscaler.c_str());
if (resolved_hires_upscaler == SD_HIRES_UPSCALER_NONE) {
hires_enabled = false;
} else if (resolved_hires_upscaler == SD_HIRES_UPSCALER_COUNT) {
static const std::vector<std::string> valid_ext = {".gguf", ".safetensors", ".pt", ".pth"};
if (!resolve_model_file_from_dir(hires_upscaler,
hires_upscalers_dir,
valid_ext,
"hires upscaler",
hires_upscaler_model_path)) {
return false;
}
resolved_hires_upscaler = SD_HIRES_UPSCALER_MODEL;
}
}
prompt_with_lora = prompt;
if (!lora_model_dir.empty()) {
extract_and_remove_lora(lora_model_dir);
@ -2022,29 +1883,6 @@ bool SDGenerationParams::validate(SDMode mode) {
return false;
}
if (hires_enabled) {
if (hires_width < 0 || hires_height < 0) {
LOG_ERROR("error: hires target width and height must be >= 0");
return false;
}
if (hires_scale <= 0.f && hires_width <= 0 && hires_height <= 0) {
LOG_ERROR("error: hires scale must be positive when target size is not set");
return false;
}
if (hires_steps < 0) {
LOG_ERROR("error: hires steps must be >= 0");
return false;
}
if (hires_denoising_strength <= 0.f || hires_denoising_strength > 1.f) {
LOG_ERROR("error: hires denoising strength must be in (0.0, 1.0]");
return false;
}
if (hires_upscale_tile_size < 1) {
LOG_ERROR("error: hires upscale tile size must be positive");
return false;
}
}
if (mode == UPSCALE) {
if (init_image_path.length() == 0) {
LOG_ERROR("error: upscale mode needs an init image (--init-img)\n");
@ -2055,11 +1893,8 @@ bool SDGenerationParams::validate(SDMode mode) {
return true;
}
bool SDGenerationParams::resolve_and_validate(SDMode mode,
const std::string& lora_model_dir,
const std::string& hires_upscalers_dir,
bool strict) {
if (!resolve(lora_model_dir, hires_upscalers_dir, strict)) {
bool SDGenerationParams::resolve_and_validate(SDMode mode, const std::string& lora_model_dir, bool strict) {
if (!resolve(lora_model_dir, strict)) {
return false;
}
if (!validate(mode)) {
@ -2130,16 +1965,6 @@ sd_img_gen_params_t SDGenerationParams::to_sd_img_gen_params_t() {
params.pm_params = pm_params;
params.vae_tiling_params = vae_tiling_params;
params.cache = cache_params;
params.hires.enabled = hires_enabled;
params.hires.upscaler = resolved_hires_upscaler;
params.hires.model_path = hires_upscaler_model_path.empty() ? nullptr : hires_upscaler_model_path.c_str();
params.hires.scale = hires_scale;
params.hires.target_width = hires_width;
params.hires.target_height = hires_height;
params.hires.steps = hires_steps;
params.hires.denoising_strength = hires_denoising_strength;
params.hires.upscale_tile_size = hires_upscale_tile_size;
return params;
}
@ -2264,15 +2089,6 @@ std::string SDGenerationParams::to_string() const {
<< " seed: " << seed << ",\n"
<< " upscale_repeats: " << upscale_repeats << ",\n"
<< " upscale_tile_size: " << upscale_tile_size << ",\n"
<< " hires: { enabled: " << (hires_enabled ? "true" : "false")
<< ", upscaler: \"" << hires_upscaler << "\""
<< ", model_path: \"" << hires_upscaler_model_path << "\""
<< ", scale: " << hires_scale
<< ", target_width: " << hires_width
<< ", target_height: " << hires_height
<< ", steps: " << hires_steps
<< ", denoising_strength: " << hires_denoising_strength
<< ", upscale_tile_size: " << hires_upscale_tile_size << " },\n"
<< " vae_tiling_params: { "
<< vae_tiling_params.enabled << ", "
<< vae_tiling_params.tile_size_x << ", "
@ -2288,192 +2104,7 @@ std::string version_string() {
return std::string("stable-diffusion.cpp version ") + sd_version() + ", commit " + sd_commit();
}
static std::string safe_json_string(const char* value) {
return value ? value : "";
}
static void set_json_basename_if_not_empty(json& target, const char* key, const std::string& path) {
if (!path.empty()) {
target[key] = sd_basename(path);
}
}
static json build_sampling_metadata_json(const sd_sample_params_t& sample_params,
const std::vector<int>& skip_layers,
const std::vector<float>* custom_sigmas = nullptr) {
json sampling = {
{"steps", sample_params.sample_steps},
{"eta", sample_params.eta},
{"shifted_timestep", sample_params.shifted_timestep},
{"flow_shift", sample_params.flow_shift},
{"guidance",
{
{"txt_cfg", sample_params.guidance.txt_cfg},
{"img_cfg", sample_params.guidance.img_cfg},
{"distilled_guidance", sample_params.guidance.distilled_guidance},
{"slg",
{
{"scale", sample_params.guidance.slg.scale},
{"layers", skip_layers},
{"start", sample_params.guidance.slg.layer_start},
{"end", sample_params.guidance.slg.layer_end},
}},
}},
};
if (sample_params.sample_method != SAMPLE_METHOD_COUNT) {
sampling["method"] = safe_json_string(sd_sample_method_name(sample_params.sample_method));
}
if (sample_params.scheduler != SCHEDULER_COUNT) {
sampling["scheduler"] = safe_json_string(sd_scheduler_name(sample_params.scheduler));
}
if (custom_sigmas != nullptr) {
sampling["custom_sigmas"] = *custom_sigmas;
}
return sampling;
}
std::string build_sdcpp_image_metadata_json(const SDContextParams& ctx_params,
const SDGenerationParams& gen_params,
int64_t seed,
SDMode mode) {
json root;
root["schema"] = "sdcpp.image.params/v1";
root["mode"] = mode == VID_GEN ? "vid_gen" : "img_gen";
root["generator"] = {
{"name", "stable-diffusion.cpp"},
{"version", safe_json_string(sd_version())},
{"commit", safe_json_string(sd_commit())},
};
root["seed"] = seed;
root["width"] = gen_params.get_resolved_width();
root["height"] = gen_params.get_resolved_height();
root["prompt"] = {
{"positive", gen_params.prompt},
{"negative", gen_params.negative_prompt},
};
root["sampling"] = build_sampling_metadata_json(gen_params.sample_params,
gen_params.skip_layers,
&gen_params.custom_sigmas);
json models;
set_json_basename_if_not_empty(models, "model", ctx_params.model_path);
set_json_basename_if_not_empty(models, "clip_l", ctx_params.clip_l_path);
set_json_basename_if_not_empty(models, "clip_g", ctx_params.clip_g_path);
set_json_basename_if_not_empty(models, "clip_vision", ctx_params.clip_vision_path);
set_json_basename_if_not_empty(models, "t5xxl", ctx_params.t5xxl_path);
set_json_basename_if_not_empty(models, "llm", ctx_params.llm_path);
set_json_basename_if_not_empty(models, "llm_vision", ctx_params.llm_vision_path);
set_json_basename_if_not_empty(models, "diffusion_model", ctx_params.diffusion_model_path);
set_json_basename_if_not_empty(models, "high_noise_diffusion_model", ctx_params.high_noise_diffusion_model_path);
set_json_basename_if_not_empty(models, "vae", ctx_params.vae_path);
set_json_basename_if_not_empty(models, "taesd", ctx_params.taesd_path);
set_json_basename_if_not_empty(models, "control_net", ctx_params.control_net_path);
root["models"] = std::move(models);
root["clip_skip"] = gen_params.clip_skip;
root["strength"] = gen_params.strength;
root["control_strength"] = gen_params.control_strength;
root["auto_resize_ref_image"] = gen_params.auto_resize_ref_image;
root["increase_ref_index"] = gen_params.increase_ref_index;
if (mode == VID_GEN) {
root["video"] = {
{"frame_count", gen_params.video_frames},
{"fps", gen_params.fps},
};
root["moe_boundary"] = gen_params.moe_boundary;
root["vace_strength"] = gen_params.vace_strength;
root["high_noise_sampling"] = build_sampling_metadata_json(gen_params.high_noise_sample_params,
gen_params.high_noise_skip_layers);
}
root["rng"] = safe_json_string(sd_rng_type_name(ctx_params.rng_type));
if (ctx_params.sampler_rng_type != RNG_TYPE_COUNT) {
root["sampler_rng"] = safe_json_string(sd_rng_type_name(ctx_params.sampler_rng_type));
}
json loras = json::array();
for (const auto& entry : gen_params.lora_map) {
loras.push_back({
{"name", sd_basename(entry.first)},
{"multiplier", entry.second},
{"is_high_noise", false},
});
}
for (const auto& entry : gen_params.high_noise_lora_map) {
loras.push_back({
{"name", sd_basename(entry.first)},
{"multiplier", entry.second},
{"is_high_noise", true},
});
}
if (!loras.empty()) {
root["loras"] = std::move(loras);
}
if (gen_params.hires_enabled) {
root["hires"] = {
{"enabled", gen_params.hires_enabled},
{"upscaler", gen_params.hires_upscaler},
{"model", gen_params.hires_upscaler_model_path.empty() ? "" : sd_basename(gen_params.hires_upscaler_model_path)},
{"scale", gen_params.hires_scale},
{"target_width", gen_params.hires_width},
{"target_height", gen_params.hires_height},
{"steps", gen_params.hires_steps},
{"denoising_strength", gen_params.hires_denoising_strength},
{"upscale_tile_size", gen_params.hires_upscale_tile_size},
};
}
if (gen_params.cache_params.mode != SD_CACHE_DISABLED) {
root["cache"] = {
{"requested_mode", gen_params.cache_mode},
{"requested_option", gen_params.cache_option},
{"mode", gen_params.cache_params.mode},
{"scm_mask", gen_params.scm_mask},
{"scm_policy_dynamic", gen_params.scm_policy_dynamic},
{"reuse_threshold", gen_params.cache_params.reuse_threshold},
{"start_percent", gen_params.cache_params.start_percent},
{"end_percent", gen_params.cache_params.end_percent},
{"error_decay_rate", gen_params.cache_params.error_decay_rate},
{"use_relative_threshold", gen_params.cache_params.use_relative_threshold},
{"reset_error_on_compute", gen_params.cache_params.reset_error_on_compute},
{"Fn_compute_blocks", gen_params.cache_params.Fn_compute_blocks},
{"Bn_compute_blocks", gen_params.cache_params.Bn_compute_blocks},
{"residual_diff_threshold", gen_params.cache_params.residual_diff_threshold},
{"max_warmup_steps", gen_params.cache_params.max_warmup_steps},
{"max_cached_steps", gen_params.cache_params.max_cached_steps},
{"max_continuous_cached_steps", gen_params.cache_params.max_continuous_cached_steps},
{"taylorseer_n_derivatives", gen_params.cache_params.taylorseer_n_derivatives},
{"taylorseer_skip_interval", gen_params.cache_params.taylorseer_skip_interval},
{"spectrum_w", gen_params.cache_params.spectrum_w},
{"spectrum_m", gen_params.cache_params.spectrum_m},
{"spectrum_lam", gen_params.cache_params.spectrum_lam},
{"spectrum_window_size", gen_params.cache_params.spectrum_window_size},
{"spectrum_flex_window", gen_params.cache_params.spectrum_flex_window},
{"spectrum_warmup_steps", gen_params.cache_params.spectrum_warmup_steps},
{"spectrum_stop_percent", gen_params.cache_params.spectrum_stop_percent},
};
}
if (gen_params.vae_tiling_params.enabled) {
root["vae_tiling"] = {
{"enabled", gen_params.vae_tiling_params.enabled},
{"tile_size_x", gen_params.vae_tiling_params.tile_size_x},
{"tile_size_y", gen_params.vae_tiling_params.tile_size_y},
{"target_overlap", gen_params.vae_tiling_params.target_overlap},
{"rel_size_x", gen_params.vae_tiling_params.rel_size_x},
{"rel_size_y", gen_params.vae_tiling_params.rel_size_y},
};
}
return root.dump();
}
std::string get_image_params(const SDContextParams& ctx_params,
const SDGenerationParams& gen_params,
int64_t seed,
SDMode mode) {
std::string get_image_params(const SDContextParams& ctx_params, const SDGenerationParams& gen_params, int64_t seed) {
std::string parameter_string;
if (gen_params.prompt_with_lora.size() != 0) {
parameter_string += gen_params.prompt_with_lora + "\n";
@ -2486,7 +2117,7 @@ std::string get_image_params(const SDContextParams& ctx_params,
parameter_string += "Steps: " + std::to_string(gen_params.sample_params.sample_steps) + ", ";
parameter_string += "CFG scale: " + std::to_string(gen_params.sample_params.guidance.txt_cfg) + ", ";
if (gen_params.sample_params.guidance.slg.scale != 0 && gen_params.skip_layers.size() != 0) {
parameter_string += "SLG scale: " + std::to_string(gen_params.sample_params.guidance.slg.scale) + ", ";
parameter_string += "SLG scale: " + std::to_string(gen_params.sample_params.guidance.txt_cfg) + ", ";
parameter_string += "Skip layers: [";
for (const auto& layer : gen_params.skip_layers) {
parameter_string += std::to_string(layer) + ", ";
@ -2531,14 +2162,6 @@ std::string get_image_params(const SDContextParams& ctx_params,
if (gen_params.clip_skip != -1) {
parameter_string += "Clip skip: " + std::to_string(gen_params.clip_skip) + ", ";
}
if (gen_params.hires_enabled) {
parameter_string += "Hires upscale: " + gen_params.hires_upscaler + ", ";
parameter_string += "Hires scale: " + std::to_string(gen_params.hires_scale) + ", ";
parameter_string += "Hires resize: " + std::to_string(gen_params.hires_width) + "x" + std::to_string(gen_params.hires_height) + ", ";
parameter_string += "Hires steps: " + std::to_string(gen_params.hires_steps) + ", ";
parameter_string += "Denoising strength: " + std::to_string(gen_params.hires_denoising_strength) + ", ";
}
parameter_string += "Version: stable-diffusion.cpp";
parameter_string += ", SDCPP: " + build_sdcpp_image_metadata_json(ctx_params, gen_params, seed, mode);
return parameter_string;
}

View File

@ -101,7 +101,6 @@ struct SDContextParams {
sd_type_t wtype = SD_TYPE_COUNT;
std::string tensor_type_rules;
std::string lora_model_dir = ".";
std::string hires_upscalers_dir;
std::map<std::string, std::string> embedding_map;
std::vector<sd_embedding_t> embedding_vec;
@ -109,7 +108,6 @@ struct SDContextParams {
rng_type_t rng_type = CUDA_RNG;
rng_type_t sampler_rng_type = RNG_TYPE_COUNT;
bool offload_params_to_cpu = false;
float max_vram = 0.f;
bool enable_mmap = false;
bool control_net_cpu = false;
bool clip_on_cpu = false;
@ -192,23 +190,12 @@ struct SDGenerationParams {
int upscale_repeats = 1;
int upscale_tile_size = 128;
bool hires_enabled = false;
std::string hires_upscaler = "Latent";
std::string hires_upscaler_model_path;
float hires_scale = 2.f;
int hires_width = 0;
int hires_height = 0;
int hires_steps = 0;
float hires_denoising_strength = 0.7f;
int hires_upscale_tile_size = 128;
std::map<std::string, float> lora_map;
std::map<std::string, float> high_noise_lora_map;
// Derived and normalized fields.
std::string prompt_with_lora; // for metadata record only
std::vector<sd_lora_t> lora_vec;
sd_hires_upscaler_t resolved_hires_upscaler;
// Owned execution payload.
SDImageOwner init_image;
@ -238,25 +225,15 @@ struct SDGenerationParams {
void set_width_and_height_if_unset(int w, int h);
int get_resolved_width() const;
int get_resolved_height() const;
bool resolve(const std::string& lora_model_dir, const std::string& hires_upscalers_dir, bool strict = false);
bool resolve(const std::string& lora_model_dir, bool strict = false);
bool validate(SDMode mode);
bool resolve_and_validate(SDMode mode,
const std::string& lora_model_dir,
const std::string& hires_upscalers_dir,
bool strict = false);
bool resolve_and_validate(SDMode mode, const std::string& lora_model_dir, bool strict = false);
sd_img_gen_params_t to_sd_img_gen_params_t();
sd_vid_gen_params_t to_sd_vid_gen_params_t();
std::string to_string() const;
};
std::string version_string();
std::string build_sdcpp_image_metadata_json(const SDContextParams& ctx_params,
const SDGenerationParams& gen_params,
int64_t seed,
SDMode mode = IMG_GEN);
std::string get_image_params(const SDContextParams& ctx_params,
const SDGenerationParams& gen_params,
int64_t seed,
SDMode mode = IMG_GEN);
std::string get_image_params(const SDContextParams& ctx_params, const SDGenerationParams& gen_params, int64_t seed);
#endif // __EXAMPLES_COMMON_COMMON_H__

View File

@ -136,8 +136,7 @@ Context Options:
--clip_g <string> path to the clip-g text encoder
--clip_vision <string> path to the clip-vision encoder
--t5xxl <string> path to the t5xxl text encoder
--llm <string> path to the llm text encoder. For example: (qwenvl2.5 for qwen-image,
mistral-small3.2 for flux2, ...)
--llm <string> path to the llm text encoder. For example: (qwenvl2.5 for qwen-image, mistral-small3.2 for flux2, ...)
--llm_vision <string> path to the llm vit
--qwen2vl <string> alias of --llm. Deprecated.
--qwen2vl_vision <string> alias of --llm_vision. Deprecated.
@ -149,18 +148,16 @@ Context Options:
--control-net <string> path to control net model
--embd-dir <string> embeddings directory
--lora-model-dir <string> lora model directory
--hires-upscalers-dir <string> highres fix upscaler model directory
--tensor-type-rules <string> weight type per tensor pattern (example: "^vae\.=f16,model\.=q8_0")
--photo-maker <string> path to PHOTOMAKER model
--upscale-model <string> path to esrgan model.
-t, --threads <int> number of threads to use during computation (default: -1). If threads <= 0,
then threads will be set to the number of CPU physical cores
-t, --threads <int> number of threads to use during computation (default: -1). If threads <= 0, then threads will be set to the number of
CPU physical cores
--chroma-t5-mask-pad <int> t5 mask pad size of chroma
--max-vram <float> maximum VRAM budget in GiB for graph-cut segmented execution. 0 disables
graph splitting
--vae-tile-overlap <float> tile overlap for vae tiling, in fraction of tile size (default: 0.5)
--vae-tiling process vae in tiles to reduce memory usage
--force-sdxl-vae-conv-scale force use of conv scale on sdxl vae
--offload-to-cpu place the weights in RAM to save VRAM, and automatically load them into VRAM
when needed
--offload-to-cpu place the weights in RAM to save VRAM, and automatically load them into VRAM when needed
--mmap whether to memory-map model
--control-net-cpu keep controlnet in cpu (for low vram)
--clip-on-cpu keep clip in cpu (for low vram)
@ -175,19 +172,20 @@ Context Options:
--chroma-disable-dit-mask disable dit mask for chroma
--qwen-image-zero-cond-t enable zero_cond_t for qwen image
--chroma-enable-t5-mask enable t5 mask for chroma
--type weight type (examples: f32, f16, q4_0, q4_1, q5_0, q5_1, q8_0, q2_K, q3_K,
q4_K). If not specified, the default is the type of the weight file
--type weight type (examples: f32, f16, q4_0, q4_1, q5_0, q5_1, q8_0, q2_K, q3_K, q4_K). If not specified, the default is the
type of the weight file
--rng RNG, one of [std_default, cuda, cpu], default: cuda(sd-webui), cpu(comfyui)
--sampler-rng sampler RNG, one of [std_default, cuda, cpu]. If not specified, use --rng
--prediction prediction type override, one of [eps, v, edm_v, sd3_flow, flux_flow,
flux2_flow]
--lora-apply-mode the way to apply LoRA, one of [auto, immediately, at_runtime], default is
auto. In auto mode, if the model weights contain any quantized parameters,
the at_runtime mode will be used; otherwise, immediately will be used. The
immediately mode may have precision and compatibility issues with quantized
parameters, but it usually offers faster inference speed and, in some cases,
lower memory usage. The at_runtime mode, on the other hand, is exactly the
opposite.
--prediction prediction type override, one of [eps, v, edm_v, sd3_flow, flux_flow, flux2_flow]
--lora-apply-mode the way to apply LoRA, one of [auto, immediately, at_runtime], default is auto. In auto mode, if the model weights
contain any quantized parameters, the at_runtime mode will be used; otherwise,
immediately will be used. The immediately mode may have precision and
compatibility issues with quantized parameters, but it usually offers faster inference
speed and, in some cases, lower memory usage. The at_runtime mode, on the
other hand, is exactly the opposite.
--vae-tile-size tile size for vae tiling, format [X]x[Y] (default: 32x32)
--vae-relative-tile-size relative tile size for vae tiling, format [X]x[Y], in fraction of image size if < 1, in number of tiles per dim if >=1
(overrides --vae-tile-size)
Default Generation Options:
-p, --prompt <string> the prompt to render
@ -196,97 +194,65 @@ Default Generation Options:
--end-img <string> path to the end image, required by flf2v
--mask <string> path to the mask image
--control-image <string> path to control image, control net
--control-video <string> path to control video frames. It must be a directory path. The video frames
inside should be stored as images in lexicographical (character) order. For
example, if the control video path is `frames`, the directory contains images
such as 00.png, 01.png, ... etc.
--control-video <string> path to control video frames. It must be a directory path. The video frames inside should be stored as images in
lexicographical (character) order. For example, if the control video path is
`frames`, the directory contains images such as 00.png, 01.png, ... etc.
--pm-id-images-dir <string> path to PHOTOMAKER input id images dir
--pm-id-embed-path <string> path to PHOTOMAKER v2 id embed
--hires-upscaler <string> highres fix upscaler, Lanczos, Nearest, Latent, Latent (nearest), Latent
(nearest-exact), Latent (antialiased), Latent (bicubic), Latent (bicubic
antialiased), or a model name under --hires-upscalers-dir (default: Latent)
-H, --height <int> image height, in pixel space (default: 512)
-W, --width <int> image width, in pixel space (default: 512)
--steps <int> number of sample steps (default: 20)
--high-noise-steps <int> (high noise) number of sample steps (default: -1 = auto)
--clip-skip <int> ignore last layers of CLIP network; 1 ignores none, 2 ignores one layer
(default: -1). <= 0 represents unspecified, will be 1 for SD1.x, 2 for SD2.x
--clip-skip <int> ignore last layers of CLIP network; 1 ignores none, 2 ignores one layer (default: -1). <= 0 represents unspecified,
will be 1 for SD1.x, 2 for SD2.x
-b, --batch-count <int> batch count
--video-frames <int> video frames (default: 1)
--fps <int> fps (default: 24)
--timestep-shift <int> shift timestep for NitroFusion models (default: 0). recommended N for
NitroSD-Realism around 250 and 500 for NitroSD-Vibrant
--timestep-shift <int> shift timestep for NitroFusion models (default: 0). recommended N for NitroSD-Realism around 250 and 500 for
NitroSD-Vibrant
--upscale-repeats <int> Run the ESRGAN upscaler this many times (default: 1)
--upscale-tile-size <int> tile size for ESRGAN upscaling (default: 128)
--hires-width <int> highres fix target width, 0 to use --hires-scale (default: 0)
--hires-height <int> highres fix target height, 0 to use --hires-scale (default: 0)
--hires-steps <int> highres fix second pass sample steps, 0 to reuse --steps (default: 0)
--hires-upscale-tile-size <int> highres fix upscaler tile size, reserved for model-backed upscalers (default:
128)
--cfg-scale <float> unconditional guidance scale: (default: 7.0)
--img-cfg-scale <float> image guidance scale for inpaint or instruct-pix2pix models: (default: same
as --cfg-scale)
--img-cfg-scale <float> image guidance scale for inpaint or instruct-pix2pix models: (default: same as --cfg-scale)
--guidance <float> distilled guidance scale for models with guidance input (default: 3.5)
--slg-scale <float> skip layer guidance (SLG) scale, only for DiT models: (default: 0). 0 means
disabled, a value of 2.5 is nice for sd3.5 medium
--slg-scale <float> skip layer guidance (SLG) scale, only for DiT models: (default: 0). 0 means disabled, a value of 2.5 is nice for sd3.5
medium
--skip-layer-start <float> SLG enabling point (default: 0.01)
--skip-layer-end <float> SLG disabling point (default: 0.2)
--eta <float> noise multiplier (default: 0 for ddim_trailing, tcd, res_multistep and
res_2s; 1 for euler_a, er_sde and dpm++2s_a)
--eta <float> noise multiplier (default: 0 for ddim_trailing, tcd, res_multistep and res_2s; 1 for euler_a, er_sde and dpm++2s_a)
--flow-shift <float> shift value for Flow models like SD3.x or WAN (default: auto)
--high-noise-cfg-scale <float> (high noise) unconditional guidance scale: (default: 7.0)
--high-noise-img-cfg-scale <float> (high noise) image guidance scale for inpaint or instruct-pix2pix models
(default: same as --cfg-scale)
--high-noise-guidance <float> (high noise) distilled guidance scale for models with guidance input
(default: 3.5)
--high-noise-slg-scale <float> (high noise) skip layer guidance (SLG) scale, only for DiT models: (default:
0)
--high-noise-img-cfg-scale <float> (high noise) image guidance scale for inpaint or instruct-pix2pix models (default: same as --cfg-scale)
--high-noise-guidance <float> (high noise) distilled guidance scale for models with guidance input (default: 3.5)
--high-noise-slg-scale <float> (high noise) skip layer guidance (SLG) scale, only for DiT models: (default: 0)
--high-noise-skip-layer-start <float> (high noise) SLG enabling point (default: 0.01)
--high-noise-skip-layer-end <float> (high noise) SLG disabling point (default: 0.2)
--high-noise-eta <float> (high noise) noise multiplier (default: 0 for ddim_trailing, tcd,
res_multistep and res_2s; 1 for euler_a, er_sde and dpm++2s_a)
--high-noise-eta <float> (high noise) noise multiplier (default: 0 for ddim_trailing, tcd, res_multistep and res_2s; 1 for euler_a, er_sde and dpm++2s_a)
--strength <float> strength for noising/unnoising (default: 0.75)
--pm-style-strength <float>
--control-strength <float> strength to apply Control Net (default: 0.9). 1.0 corresponds to full
destruction of information in init image
--moe-boundary <float> timestep boundary for Wan2.2 MoE model. (default: 0.875). Only enabled if
`--high-noise-steps` is set to -1
--control-strength <float> strength to apply Control Net (default: 0.9). 1.0 corresponds to full destruction of information in init image
--moe-boundary <float> timestep boundary for Wan2.2 MoE model. (default: 0.875). Only enabled if `--high-noise-steps` is set to -1
--vace-strength <float> wan vace strength
--vae-tile-overlap <float> tile overlap for vae tiling, in fraction of tile size (default: 0.5)
--hires-scale <float> highres fix scale when target size is not set (default: 2.0)
--hires-denoising-strength <float> highres fix second pass denoising strength (default: 0.7)
--increase-ref-index automatically increase the indices of reference images based on the order
they are listed (starting with 1).
--increase-ref-index automatically increase the indices of reference images based on the order they are listed (starting with 1).
--disable-auto-resize-ref-image disable auto resize of ref images
--disable-image-metadata do not embed generation metadata on image files
--vae-tiling process vae in tiles to reduce memory usage
--hires enable highres fix
-s, --seed RNG seed (default: 42, use random seed for < 0)
--sampling-method sampling method, one of [euler, euler_a, heun, dpm2, dpm++2s_a, dpm++2m,
dpm++2mv2, ipndm, ipndm_v, lcm, ddim_trailing, tcd, res_multistep, res_2s,
er_sde] (default: euler for Flux/SD3/Wan, euler_a otherwise)
--high-noise-sampling-method (high noise) sampling method, one of [euler, euler_a, heun, dpm2, dpm++2s_a,
dpm++2m, dpm++2mv2, ipndm, ipndm_v, lcm, ddim_trailing, tcd, res_multistep,
res_2s, er_sde] default: euler for Flux/SD3/Wan, euler_a otherwise
--scheduler denoiser sigma scheduler, one of [discrete, karras, exponential, ays, gits,
smoothstep, sgm_uniform, simple, kl_optimal, lcm, bong_tangent], default:
discrete
--sigmas custom sigma values for the sampler, comma-separated (e.g.,
"14.61,7.8,3.5,0.0").
--sampling-method sampling method, one of [euler, euler_a, heun, dpm2, dpm++2s_a, dpm++2m, dpm++2mv2, ipndm, ipndm_v, lcm, ddim_trailing,
tcd, res_multistep, res_2s, er_sde] (default: euler for Flux/SD3/Wan, euler_a
otherwise)
--high-noise-sampling-method (high noise) sampling method, one of [euler, euler_a, heun, dpm2, dpm++2s_a, dpm++2m, dpm++2mv2, ipndm, ipndm_v, lcm,
ddim_trailing, tcd, res_multistep, res_2s, er_sde] default: euler for Flux/SD3/Wan,
euler_a otherwise
--scheduler denoiser sigma scheduler, one of [discrete, karras, exponential, ays, gits, smoothstep, sgm_uniform, simple,
kl_optimal, lcm, bong_tangent], default: discrete
--sigmas custom sigma values for the sampler, comma-separated (e.g., "14.61,7.8,3.5,0.0").
--skip-layers layers to skip for SLG steps (default: [7,8,9])
--high-noise-skip-layers (high noise) layers to skip for SLG steps (default: [7,8,9])
-r, --ref-image reference image for Flux Kontext models (can be used multiple times)
--cache-mode caching method: 'easycache' (DiT), 'ucache' (UNET),
'dbcache'/'taylorseer'/'cache-dit' (DiT block-level), 'spectrum' (UNET/DiT
Chebyshev+Taylor forecasting)
--cache-mode caching method: 'easycache' (DiT), 'ucache' (UNET), 'dbcache'/'taylorseer'/'cache-dit' (DiT block-level), 'spectrum' (UNET/DiT Chebyshev+Taylor forecasting)
--cache-option named cache params (key=value format, comma-separated). easycache/ucache:
threshold=,start=,end=,decay=,relative=,reset=; dbcache/taylorseer/cache-dit:
Fn=,Bn=,threshold=,warmup=; spectrum: w=,m=,lam=,window=,flex=,warmup=,stop=.
Examples: "threshold=0.25" or "threshold=1.5,reset=0"
--scm-mask SCM steps mask for cache-dit: comma-separated 0/1 (e.g.,
"1,1,1,0,0,1,0,0,1,0") - 1=compute, 0=can cache
threshold=,start=,end=,decay=,relative=,reset=; dbcache/taylorseer/cache-dit: Fn=,Bn=,threshold=,warmup=. Examples:
"threshold=0.25" or "threshold=1.5,reset=0"
--scm-mask SCM steps mask for cache-dit: comma-separated 0/1 (e.g., "1,1,1,0,0,1,0,0,1,0") - 1=compute, 0=can cache
--scm-policy SCM policy: 'dynamic' (default) or 'static'
--vae-tile-size tile size for vae tiling, format [X]x[Y] (default: 32x32)
--vae-relative-tile-size relative tile size for vae tiling, format [X]x[Y], in fraction of image size
if < 1, in number of tiles per dim if >=1 (overrides --vae-tile-size)
```

View File

@ -38,8 +38,6 @@ Current generation-related endpoints include:
- `POST /sdapi/v1/txt2img`
- `POST /sdapi/v1/img2img`
- `GET /sdapi/v1/loras`
- `GET /sdapi/v1/upscalers`
- `GET /sdapi/v1/latent-upscale-modes`
- `GET /sdapi/v1/samplers`
- `GET /sdapi/v1/schedulers`
- `GET /sdapi/v1/sd-models`
@ -218,13 +216,6 @@ Currently supported request fields:
| `scheduler` | `string` | Scheduler name |
| `lora` | `array<object>` | Structured LoRA list |
| `extra_images` | `array<string>` | Base64 or data URL images |
| `enable_hr` | `boolean` | Enable highres fix for `txt2img` |
| `hr_upscaler` | `string` | `Lanczos`, `Nearest`, a latent mode such as `Latent (nearest-exact)`, or an upscaler model name from `/sdapi/v1/upscalers` |
| `hr_scale` | `number` | Highres scale when resize target is not set |
| `hr_resize_x` | `integer` | Highres target width, `0` to use scale |
| `hr_resize_y` | `integer` | Highres target height, `0` to use scale |
| `hr_steps` | `integer` | Highres second-pass sample steps, `0` to reuse `steps` |
| `denoising_strength` | `number` | Highres denoising strength for `txt2img` |
Native extension fields:
@ -250,8 +241,6 @@ Currently supported request fields:
| `inpainting_mask_invert` | `integer` or `boolean` | Treated as invert flag |
| `denoising_strength` | `number` | Clamped to `0.0..1.0` |
Highres fix fields are currently handled for `txt2img`; `img2img` uses `denoising_strength` as image-to-image strength.
Native extension fields:
- any `sdcpp API` fields embedded through `sd_cpp_extra_args` inside `prompt`
@ -269,8 +258,6 @@ Response fields:
Currently exposed:
- `GET /sdapi/v1/loras`
- `GET /sdapi/v1/upscalers`
- `GET /sdapi/v1/latent-upscale-modes`
- `GET /sdapi/v1/samplers`
- `GET /sdapi/v1/schedulers`
- `GET /sdapi/v1/sd-models`
@ -285,26 +272,6 @@ Response fields:
| `[].name` | `string` | Display name derived from file stem |
| `[].path` | `string` | Relative path under the configured LoRA directory |
`GET /sdapi/v1/upscalers`
| Field | Type | Notes |
| --- | --- | --- |
| `[].name` | `string` | Built-in name or model stem |
| `[].model_name` | `string \| null` | Model family label for model-backed upscalers |
| `[].model_path` | `string \| null` | Absolute model path for model-backed upscalers |
| `[].model_url` | `string \| null` | Currently always null |
| `[].scale` | `integer` | Currently `4` |
Built-in entries include `None`, `Lanczos`, and `Nearest`. Model-backed entries are scanned from the top level of `--hires-upscalers-dir`; subdirectories are not scanned.
`GET /sdapi/v1/latent-upscale-modes`
| Field | Type | Notes |
| --- | --- | --- |
| `[].name` | `string` | WebUI-compatible latent upscale mode name |
Built-in latent modes include `Latent`, `Latent (nearest)`, `Latent (nearest-exact)`, `Latent (antialiased)`, `Latent (bicubic)`, and `Latent (bicubic antialiased)`.
`GET /sdapi/v1/samplers`
| Field | Type | Notes |
@ -421,7 +388,6 @@ Top-level fields:
| `samplers` | `array<string>` | Available sampling methods |
| `schedulers` | `array<string>` | Available schedulers |
| `loras` | `array<object>` | Available LoRA entries |
| `upscalers` | `array<object>` | Available model-backed highres upscalers |
| `limits` | `object` | Shared queue and size limits |
`model`
@ -458,14 +424,6 @@ Shared nested fields:
| `loras[].name` | `string` |
| `loras[].path` | `string` |
`upscalers`
| Field | Type | Notes |
| --- | --- | --- |
| `upscalers[].name` | `string` | Built-in name or model stem; use this value in `hires.upscaler` |
Built-in entries include `None`, `Lanczos`, `Nearest`, `Latent`, `Latent (nearest)`, `Latent (nearest-exact)`, `Latent (antialiased)`, `Latent (bicubic)`, and `Latent (bicubic antialiased)`. Model-backed entries are scanned from the top level of `--hires-upscalers-dir`; subdirectories are not scanned.
`limits`
| Field | Type |
@ -524,15 +482,6 @@ Shared default fields used by both `img_gen` and `vid_gen`:
| `auto_resize_ref_image` | `boolean` |
| `increase_ref_index` | `boolean` |
| `control_strength` | `number` |
| `hires` | `object` |
| `hires.enabled` | `boolean` |
| `hires.upscaler` | `string` |
| `hires.scale` | `number` |
| `hires.target_width` | `integer` |
| `hires.target_height` | `integer` |
| `hires.steps` | `integer` |
| `hires.denoising_strength` | `number` |
| `hires.upscale_tile_size` | `integer` |
`vid_gen`-specific default fields:
@ -565,7 +514,6 @@ Fields returned in `features_by_mode.img_gen`:
- `ref_images`
- `lora`
- `vae_tiling`
- `hires`
- `cache`
- `cancel_queued`
- `cancel_generating`
@ -677,16 +625,6 @@ Example:
},
"lora": [],
"hires": {
"enabled": false,
"upscaler": "Latent",
"scale": 2.0,
"target_width": 0,
"target_height": 0,
"steps": 0,
"denoising_strength": 0.7,
"upscale_tile_size": 128
},
"vae_tiling_params": {
"enabled": false,
@ -791,23 +729,12 @@ Other native fields:
| Field | Type |
| --- | --- |
| `hires` | `object` |
| `hires.enabled` | `boolean` |
| `hires.upscaler` | `string` |
| `hires.scale` | `number` |
| `hires.target_width` | `integer` |
| `hires.target_height` | `integer` |
| `hires.steps` | `integer` |
| `hires.denoising_strength` | `number` |
| `hires.upscale_tile_size` | `integer` |
| `vae_tiling_params` | `object` |
| `cache_mode` | `string` |
| `cache_option` | `string` |
| `scm_mask` | `string` |
| `scm_policy_dynamic` | `boolean` |
For `hires.upscaler`, use `Lanczos`, `Nearest`, `Latent`, `Latent (nearest)`, `Latent (nearest-exact)`, `Latent (antialiased)`, `Latent (bicubic)`, `Latent (bicubic antialiased)`, or an `upscalers[].name` value from `GET /sdcpp/v1/capabilities`. Model-backed upscalers are resolved as `--hires-upscalers-dir / (name + ext)` and must live directly in that directory.
HTTP-only output fields:
| Field | Type |

View File

@ -48,9 +48,7 @@ static void parse_args(int argc,
if (!svr_params.resolve_and_validate() ||
!ctx_params.resolve_and_validate(IMG_GEN) ||
!default_gen_params.resolve_and_validate(IMG_GEN,
ctx_params.lora_model_dir,
ctx_params.hires_upscalers_dir)) {
!default_gen_params.resolve_and_validate(IMG_GEN, ctx_params.lora_model_dir)) {
print_usage(argv[0], options_vec);
exit(1);
}
@ -97,8 +95,6 @@ int main(int argc, const char** argv) {
std::vector<LoraEntry> lora_cache;
std::mutex lora_mutex;
std::vector<UpscalerEntry> upscaler_cache;
std::mutex upscaler_mutex;
AsyncJobManager async_job_manager;
ServerRuntime runtime = {
sd_ctx.get(),
@ -108,8 +104,6 @@ int main(int argc, const char** argv) {
&default_gen_params,
&lora_cache,
&lora_mutex,
&upscaler_cache,
&upscaler_mutex,
&async_job_manager,
};

View File

@ -70,7 +70,7 @@ static bool build_openai_generation_request(const httplib::Request& req,
}
// Intentionally disable prompt-embedded LoRA tag parsing for server APIs.
if (!request.gen_params.resolve_and_validate(IMG_GEN, "", runtime.ctx_params->hires_upscalers_dir, true)) {
if (!request.gen_params.resolve_and_validate(IMG_GEN, "", true)) {
error_message = "invalid params";
return false;
}
@ -212,7 +212,7 @@ static bool build_openai_edit_request(const httplib::Request& req,
}
// Intentionally disable prompt-embedded LoRA tag parsing for server APIs.
if (!request.gen_params.resolve_and_validate(IMG_GEN, "", runtime.ctx_params->hires_upscalers_dir, true)) {
if (!request.gen_params.resolve_and_validate(IMG_GEN, "", true)) {
error_message = "invalid params";
return false;
}

View File

@ -1,7 +1,6 @@
#include "routes.h"
#include <algorithm>
#include <cctype>
#include <cstring>
#include <regex>
#include <string_view>
@ -36,20 +35,14 @@ static fs::path resolve_display_model_path(const ServerRuntime& runtime) {
return {};
}
static std::string lower_ascii(std::string value) {
std::transform(value.begin(), value.end(), value.begin(), [](unsigned char c) {
return static_cast<char>(std::tolower(c));
});
return value;
}
static enum sample_method_t get_sdapi_sample_method(std::string name) {
enum sample_method_t result = str_to_sample_method(name.c_str());
if (result != SAMPLE_METHOD_COUNT) {
return result;
}
name = lower_ascii(name);
std::transform(name.begin(), name.end(), name.begin(),
[](unsigned char c) { return static_cast<char>(std::tolower(c)); });
static const std::unordered_map<std::string_view, sample_method_t> hardcoded{
{"euler a", EULER_A_SAMPLE_METHOD},
{"k_euler_a", EULER_A_SAMPLE_METHOD},
@ -121,18 +114,6 @@ static bool build_sdapi_img_gen_request(const json& j,
request.gen_params.width = j.value("width", -1);
request.gen_params.height = j.value("height", -1);
if (!img2img && j.value("enable_hr", false)) {
request.gen_params.hires_enabled = true;
request.gen_params.hires_scale = j.value("hr_scale", request.gen_params.hires_scale);
request.gen_params.hires_width = j.value("hr_resize_x", request.gen_params.hires_width);
request.gen_params.hires_height = j.value("hr_resize_y", request.gen_params.hires_height);
request.gen_params.hires_steps = j.value("hr_steps", request.gen_params.hires_steps);
request.gen_params.hires_denoising_strength =
j.value("denoising_strength", request.gen_params.hires_denoising_strength);
request.gen_params.hires_upscaler = j.value("hr_upscaler", request.gen_params.hires_upscaler);
}
std::string sd_cpp_extra_args_str = extract_and_remove_sd_cpp_extra_args(request.gen_params.prompt);
if (!sd_cpp_extra_args_str.empty() && !request.gen_params.from_json_str(sd_cpp_extra_args_str)) {
error_message = "invalid sd_cpp_extra_args";
@ -247,7 +228,7 @@ static bool build_sdapi_img_gen_request(const json& j,
}
// Intentionally disable prompt-embedded LoRA tag parsing for server APIs.
if (!request.gen_params.resolve_and_validate(IMG_GEN, "", runtime.ctx_params->hires_upscalers_dir, true)) {
if (!request.gen_params.resolve_and_validate(IMG_GEN, "", true)) {
error_message = "invalid params";
return false;
}
@ -366,52 +347,6 @@ void register_sdapi_endpoints(httplib::Server& svr, ServerRuntime& rt) {
res.set_content(result.dump(), "application/json");
});
svr.Get("/sdapi/v1/upscalers", [runtime](const httplib::Request&, httplib::Response& res) {
refresh_upscaler_cache(*runtime);
auto make_builtin = [](const char* name) {
json item;
item["name"] = name;
item["model_name"] = nullptr;
item["model_path"] = nullptr;
item["model_url"] = nullptr;
item["scale"] = 4;
return item;
};
json result = json::array();
result.push_back(make_builtin("None"));
result.push_back(make_builtin("Lanczos"));
result.push_back(make_builtin("Nearest"));
{
std::lock_guard<std::mutex> lock(*runtime->upscaler_mutex);
for (const auto& e : *runtime->upscaler_cache) {
json item;
item["name"] = e.name;
item["model_name"] = e.model_name;
item["model_path"] = e.fullpath;
item["model_url"] = nullptr;
item["scale"] = e.scale;
result.push_back(item);
}
}
res.set_content(result.dump(), "application/json");
});
svr.Get("/sdapi/v1/latent-upscale-modes", [](const httplib::Request&, httplib::Response& res) {
json result = json::array({
{{"name", "Latent"}},
{{"name", "Latent (nearest)"}},
{{"name", "Latent (nearest-exact)"}},
{{"name", "Latent (antialiased)"}},
{{"name", "Latent (bicubic)"}},
{{"name", "Latent (bicubic antialiased)"}},
});
res.set_content(result.dump(), "application/json");
});
svr.Get("/sdapi/v1/samplers", [runtime](const httplib::Request&, httplib::Response& res) {
std::vector<std::string> sampler_names;
sampler_names.push_back("default");

View File

@ -114,17 +114,6 @@ static json make_img_gen_defaults_json(const SDGenerationParams& defaults, const
{"increase_ref_index", defaults.increase_ref_index},
{"control_strength", defaults.control_strength},
{"sample_params", make_sample_params_json(defaults.sample_params, defaults.skip_layers)},
{"hires",
{
{"enabled", defaults.hires_enabled},
{"upscaler", defaults.hires_upscaler},
{"scale", defaults.hires_scale},
{"target_width", defaults.hires_width},
{"target_height", defaults.hires_height},
{"steps", defaults.hires_steps},
{"denoising_strength", defaults.hires_denoising_strength},
{"upscale_tile_size", defaults.hires_upscale_tile_size},
}},
{"vae_tiling_params", make_vae_tiling_json(defaults.vae_tiling_params)},
{"cache_mode", defaults.cache_mode},
{"cache_option", defaults.cache_option},
@ -168,7 +157,6 @@ static json make_img_gen_features_json() {
{"ref_images", true},
{"lora", true},
{"vae_tiling", true},
{"hires", true},
{"cache", true},
{"cancel_queued", true},
{"cancel_generating", false},
@ -191,7 +179,6 @@ static json make_vid_gen_features_json() {
static json make_capabilities_json(ServerRuntime& runtime) {
refresh_lora_cache(runtime);
refresh_upscaler_cache(runtime);
AsyncJobManager& manager = *runtime.async_job_manager;
const auto& defaults = *runtime.default_gen_params;
@ -203,7 +190,6 @@ static json make_capabilities_json(ServerRuntime& runtime) {
json image_output_formats = supported_img_output_formats();
json video_output_formats = supported_vid_output_formats();
json available_loras = json::array();
json available_upscalers = json::array();
json supported_modes = json::array();
for (int i = 0; i < SAMPLE_METHOD_COUNT; ++i) {
@ -224,42 +210,6 @@ static json make_capabilities_json(ServerRuntime& runtime) {
}
}
available_upscalers.push_back({
{"name", "None"},
});
available_upscalers.push_back({
{"name", "Lanczos"},
});
available_upscalers.push_back({
{"name", "Nearest"},
});
available_upscalers.push_back({
{"name", "Latent"},
});
available_upscalers.push_back({
{"name", "Latent (nearest)"},
});
available_upscalers.push_back({
{"name", "Latent (nearest-exact)"},
});
available_upscalers.push_back({
{"name", "Latent (antialiased)"},
});
available_upscalers.push_back({
{"name", "Latent (bicubic)"},
});
available_upscalers.push_back({
{"name", "Latent (bicubic antialiased)"},
});
{
std::lock_guard<std::mutex> lock(*runtime.upscaler_mutex);
for (const auto& entry : *runtime.upscaler_cache) {
available_upscalers.push_back({
{"name", entry.name},
});
}
}
if (supports_img) {
supported_modes.push_back("img_gen");
}
@ -334,7 +284,6 @@ static json make_capabilities_json(ServerRuntime& runtime) {
result["features"] = top_level_features;
result["features_by_mode"] = features_by_mode;
result["loras"] = available_loras;
result["upscalers"] = available_upscalers;
return result;
}
@ -358,7 +307,7 @@ static bool parse_img_gen_request(const json& body,
return false;
}
// Intentionally disable prompt-embedded LoRA tag parsing for server APIs.
if (!request.gen_params.resolve_and_validate(IMG_GEN, "", runtime.ctx_params->hires_upscalers_dir, true)) {
if (!request.gen_params.resolve_and_validate(IMG_GEN, "", true)) {
error_message = "invalid generation parameters";
return false;
}
@ -385,7 +334,7 @@ static bool parse_vid_gen_request(const json& body,
return false;
}
// Intentionally disable prompt-embedded LoRA tag parsing for server APIs.
if (!request.gen_params.resolve_and_validate(VID_GEN, "", runtime.ctx_params->hires_upscalers_dir, true)) {
if (!request.gen_params.resolve_and_validate(VID_GEN, "", true)) {
error_message = "invalid generation parameters";
return false;
}

View File

@ -1,7 +1,6 @@
#include "runtime.h"
#include <algorithm>
#include <cctype>
#include <chrono>
#include <cstdlib>
#include <filesystem>
@ -14,18 +13,6 @@
namespace fs = std::filesystem;
static std::string lower_ascii(std::string value) {
std::transform(value.begin(), value.end(), value.begin(), [](unsigned char c) {
return static_cast<char>(std::tolower(c));
});
return value;
}
static bool is_supported_model_ext(const fs::path& p) {
auto ext = lower_ascii(p.extension().string());
return ext == ".gguf" || ext == ".pt" || ext == ".pth" || ext == ".safetensors";
}
static const std::string k_base64_chars =
"ABCDEFGHIJKLMNOPQRSTUVWXYZ"
"abcdefghijklmnopqrstuvwxyz"
@ -254,12 +241,20 @@ void refresh_lora_cache(ServerRuntime& rt) {
fs::path lora_dir = rt.ctx_params->lora_model_dir;
if (fs::exists(lora_dir) && fs::is_directory(lora_dir)) {
auto is_lora_ext = [](const fs::path& p) {
auto ext = p.extension().string();
std::transform(ext.begin(), ext.end(), ext.begin(), [](unsigned char c) {
return static_cast<char>(std::tolower(c));
});
return ext == ".gguf" || ext == ".pt" || ext == ".pth" || ext == ".safetensors";
};
for (auto& entry : fs::recursive_directory_iterator(lora_dir)) {
if (!entry.is_regular_file()) {
continue;
}
const fs::path& p = entry.path();
if (!is_supported_model_ext(p)) {
if (!is_lora_ext(p)) {
continue;
}
@ -291,40 +286,6 @@ std::string get_lora_full_path(ServerRuntime& rt, const std::string& path) {
return it != rt.lora_cache->end() ? it->fullpath : "";
}
void refresh_upscaler_cache(ServerRuntime& rt) {
std::vector<UpscalerEntry> new_cache;
fs::path upscaler_dir = rt.ctx_params->hires_upscalers_dir;
if (fs::exists(upscaler_dir) && fs::is_directory(upscaler_dir)) {
for (auto& entry : fs::directory_iterator(upscaler_dir)) {
if (!entry.is_regular_file()) {
continue;
}
const fs::path& p = entry.path();
if (!is_supported_model_ext(p)) {
continue;
}
UpscalerEntry upscaler_entry;
upscaler_entry.name = p.stem().u8string();
upscaler_entry.fullpath = fs::absolute(p).lexically_normal().u8string();
upscaler_entry.model_name = "ESRGAN_4x";
upscaler_entry.path = p.filename().u8string();
new_cache.push_back(std::move(upscaler_entry));
}
}
std::sort(new_cache.begin(), new_cache.end(), [](const UpscalerEntry& a, const UpscalerEntry& b) {
return a.name < b.name;
});
{
std::lock_guard<std::mutex> lock(*rt.upscaler_mutex);
*rt.upscaler_cache = std::move(new_cache);
}
}
int64_t unix_timestamp_now() {
return std::chrono::duration_cast<std::chrono::seconds>(
std::chrono::system_clock::now().time_since_epoch())

View File

@ -37,14 +37,6 @@ struct LoraEntry {
std::string fullpath;
};
struct UpscalerEntry {
std::string name;
std::string path;
std::string fullpath;
std::string model_name;
int scale = 4;
};
struct ServerRuntime {
sd_ctx_t* sd_ctx;
std::mutex* sd_ctx_mutex;
@ -53,8 +45,6 @@ struct ServerRuntime {
const SDGenerationParams* default_gen_params;
std::vector<LoraEntry>* lora_cache;
std::mutex* lora_mutex;
std::vector<UpscalerEntry>* upscaler_cache;
std::mutex* upscaler_mutex;
AsyncJobManager* async_job_manager;
};
@ -96,5 +86,4 @@ bool runtime_supports_generation_mode(const ServerRuntime& runtime, SDMode mode)
std::string unsupported_generation_mode_error(SDMode mode);
void refresh_lora_cache(ServerRuntime& rt);
std::string get_lora_full_path(ServerRuntime& rt, const std::string& path);
void refresh_upscaler_cache(ServerRuntime& rt);
int64_t unix_timestamp_now();

View File

@ -1,5 +1,5 @@
for f in src/*.cpp src/*.h src/*.hpp src/tokenizers/*.h src/tokenizers/*.cpp src/tokenizers/vocab/*.h src/tokenizers/vocab/*.cpp \
src/model_io/*.h src/model_io/*.cpp examples/cli/*.cpp examples/cli/*.h examples/server/*.cpp \
examples/cli/*.cpp examples/cli/*.h examples/server/*.cpp \
examples/common/*.hpp examples/common/*.h examples/common/*.cpp; do
[[ "$f" == vocab* ]] && continue
echo "formatting '$f'"

View File

@ -203,7 +203,6 @@ typedef struct {
bool chroma_use_t5_mask;
int chroma_t5_mask_pad;
bool qwen_image_zero_cond_t;
float max_vram;
} sd_ctx_params_t;
typedef struct {
@ -290,32 +289,6 @@ typedef struct {
const char* path;
} sd_lora_t;
enum sd_hires_upscaler_t {
SD_HIRES_UPSCALER_NONE,
SD_HIRES_UPSCALER_LATENT,
SD_HIRES_UPSCALER_LATENT_NEAREST,
SD_HIRES_UPSCALER_LATENT_NEAREST_EXACT,
SD_HIRES_UPSCALER_LATENT_ANTIALIASED,
SD_HIRES_UPSCALER_LATENT_BICUBIC,
SD_HIRES_UPSCALER_LATENT_BICUBIC_ANTIALIASED,
SD_HIRES_UPSCALER_LANCZOS,
SD_HIRES_UPSCALER_NEAREST,
SD_HIRES_UPSCALER_MODEL,
SD_HIRES_UPSCALER_COUNT,
};
typedef struct {
bool enabled;
enum sd_hires_upscaler_t upscaler;
const char* model_path;
float scale;
int target_width;
int target_height;
int steps;
float denoising_strength;
int upscale_tile_size;
} sd_hires_params_t;
typedef struct {
const sd_lora_t* loras;
uint32_t lora_count;
@ -339,7 +312,6 @@ typedef struct {
sd_pm_params_t pm_params;
sd_tiling_params_t vae_tiling_params;
sd_cache_params_t cache;
sd_hires_params_t hires;
} sd_img_gen_params_t;
typedef struct {
@ -393,11 +365,8 @@ SD_API const char* sd_preview_name(enum preview_t preview);
SD_API enum preview_t str_to_preview(const char* str);
SD_API const char* sd_lora_apply_mode_name(enum lora_apply_mode_t mode);
SD_API enum lora_apply_mode_t str_to_lora_apply_mode(const char* str);
SD_API const char* sd_hires_upscaler_name(enum sd_hires_upscaler_t upscaler);
SD_API enum sd_hires_upscaler_t str_to_sd_hires_upscaler(const char* str);
SD_API void sd_cache_params_init(sd_cache_params_t* cache_params);
SD_API void sd_hires_params_init(sd_hires_params_t* hires_params);
SD_API void sd_ctx_params_init(sd_ctx_params_t* sd_ctx_params);
SD_API char* sd_ctx_params_to_str(const sd_ctx_params_t* sd_ctx_params);

View File

@ -499,15 +499,9 @@ namespace Anima {
encoder_hidden_states = adapted_context;
}
sd::ggml_graph_cut::mark_graph_cut(x, "anima.prelude", "x");
sd::ggml_graph_cut::mark_graph_cut(embedded_timestep, "anima.prelude", "embedded_timestep");
sd::ggml_graph_cut::mark_graph_cut(temb, "anima.prelude", "temb");
sd::ggml_graph_cut::mark_graph_cut(encoder_hidden_states, "anima.prelude", "context");
for (int i = 0; i < num_layers; i++) {
auto block = std::dynamic_pointer_cast<TransformerBlock>(blocks["blocks." + std::to_string(i)]);
x = block->forward(ctx, x, encoder_hidden_states, embedded_timestep, temb, image_pe);
sd::ggml_graph_cut::mark_graph_cut(x, "anima.blocks." + std::to_string(i), "x");
}
x = final_layer->forward(ctx, x, embedded_timestep, temb); // [N, h*w, ph*pw*C]

View File

@ -328,7 +328,6 @@ public:
auto conv_out = std::dynamic_pointer_cast<Conv2d>(blocks["conv_out"]);
auto h = conv_in->forward(ctx, x); // [N, ch, h, w]
// sd::ggml_graph_cut::mark_graph_cut(h, "vae.encoder.prelude", "h");
// downsampling
size_t num_resolutions = ch_mult.size();
@ -338,14 +337,12 @@ public:
auto down_block = std::dynamic_pointer_cast<ResnetBlock>(blocks[name]);
h = down_block->forward(ctx, h);
// sd::ggml_graph_cut::mark_graph_cut(h, "vae.encoder.down." + std::to_string(i) + ".block." + std::to_string(j), "h");
}
if (i != num_resolutions - 1) {
std::string name = "down." + std::to_string(i) + ".downsample";
auto down_sample = std::dynamic_pointer_cast<DownSampleBlock>(blocks[name]);
h = down_sample->forward(ctx, h);
// sd::ggml_graph_cut::mark_graph_cut(h, "vae.encoder.down." + std::to_string(i) + ".downsample", "h");
}
}
@ -353,7 +350,6 @@ public:
h = mid_block_1->forward(ctx, h);
h = mid_attn_1->forward(ctx, h);
h = mid_block_2->forward(ctx, h); // [N, block_in, h, w]
// sd::ggml_graph_cut::mark_graph_cut(h, "vae.encoder.mid", "h");
// end
h = norm_out->forward(ctx, h);
@ -454,7 +450,6 @@ public:
// conv_in
auto h = conv_in->forward(ctx, z); // [N, block_in, h, w]
// sd::ggml_graph_cut::mark_graph_cut(h, "vae.decoder.prelude", "h");
// middle
h = mid_block_1->forward(ctx, h);
@ -462,7 +457,6 @@ public:
h = mid_attn_1->forward(ctx, h);
h = mid_block_2->forward(ctx, h); // [N, block_in, h, w]
// sd::ggml_graph_cut::mark_graph_cut(h, "vae.decoder.mid", "h");
// upsampling
int num_resolutions = static_cast<int>(ch_mult.size());
@ -472,14 +466,12 @@ public:
auto up_block = std::dynamic_pointer_cast<ResnetBlock>(blocks[name]);
h = up_block->forward(ctx, h);
// sd::ggml_graph_cut::mark_graph_cut(h, "vae.decoder.up." + std::to_string(i) + ".block." + std::to_string(j), "h");
}
if (i != 0) {
std::string name = "up." + std::to_string(i) + ".upsample";
auto up_sample = std::dynamic_pointer_cast<UpSampleBlock>(blocks[name]);
h = up_sample->forward(ctx, h);
// sd::ggml_graph_cut::mark_graph_cut(h, "vae.decoder.up." + std::to_string(i) + ".upsample", "h");
}
}
@ -607,7 +599,6 @@ public:
if (use_quant) {
auto post_quant_conv = std::dynamic_pointer_cast<Conv2d>(blocks["post_quant_conv"]);
z = post_quant_conv->forward(ctx, z); // [N, z_channels, h, w]
// sd::ggml_graph_cut::mark_graph_cut(z, "vae.decode.prelude", "z");
}
auto decoder = std::dynamic_pointer_cast<Decoder>(blocks["decoder"]);
@ -625,7 +616,6 @@ public:
if (use_quant) {
auto quant_conv = std::dynamic_pointer_cast<Conv2d>(blocks["quant_conv"]);
z = quant_conv->forward(ctx, z); // [N, 2*embed_dim, h/8, w/8]
// sd::ggml_graph_cut::mark_graph_cut(z, "vae.encode.final", "z");
}
if (sd_version_uses_flux2_vae(version)) {
z = ggml_ext_chunk(ctx->ggml_ctx, z, 2, 2)[0];

View File

@ -96,8 +96,7 @@ public:
ggml_tensor* forward(GGMLRunnerContext* ctx,
ggml_tensor* x,
ggml_tensor* mask = nullptr,
int clip_skip = -1,
const std::string& graph_cut_prefix = "") {
int clip_skip = -1) {
// x: [N, n_token, d_model]
int layer_idx = n_layer - 1;
// LOG_DEBUG("clip_skip %d", clip_skip);
@ -113,9 +112,6 @@ public:
std::string name = "layers." + std::to_string(i);
auto layer = std::dynamic_pointer_cast<CLIPLayer>(blocks[name]);
x = layer->forward(ctx, x, mask); // [N, n_token, d_model]
if (!graph_cut_prefix.empty()) {
sd::ggml_graph_cut::mark_graph_cut(x, graph_cut_prefix + ".layers." + std::to_string(i), "x");
}
// LOG_DEBUG("layer %d", i);
}
return x;
@ -308,8 +304,7 @@ public:
auto final_layer_norm = std::dynamic_pointer_cast<LayerNorm>(blocks["final_layer_norm"]);
auto x = embeddings->forward(ctx, input_ids, tkn_embeddings); // [N, n_token, hidden_size]
sd::ggml_graph_cut::mark_graph_cut(x, "clip_text.prelude", "x");
x = encoder->forward(ctx, x, mask, return_pooled ? -1 : clip_skip, "clip_text");
x = encoder->forward(ctx, x, mask, return_pooled ? -1 : clip_skip);
if (return_pooled || with_final_ln) {
x = final_layer_norm->forward(ctx, x);
}
@ -373,8 +368,7 @@ public:
auto x = embeddings->forward(ctx, pixel_values); // [N, num_positions, embed_dim]
x = pre_layernorm->forward(ctx, x);
sd::ggml_graph_cut::mark_graph_cut(x, "clip_vision.prelude", "x");
x = encoder->forward(ctx, x, nullptr, clip_skip, "clip_vision");
x = encoder->forward(ctx, x, nullptr, clip_skip);
auto last_hidden_state = x;

View File

@ -1,9 +1,7 @@
#ifndef __COMMON_BLOCK_HPP__
#define __COMMON_BLOCK_HPP__
#include "ggml-backend.h"
#include "ggml_extend.hpp"
#include "util.h"
class DownSampleBlock : public GGMLBlock {
protected:
@ -250,6 +248,9 @@ public:
float scale = 1.f;
if (precision_fix) {
scale = 1.f / 128.f;
#ifdef SD_USE_VULKAN
force_prec_f32 = true;
#endif
}
// The purpose of the scale here is to prevent NaN issues in certain situations.
// For example, when using Vulkan without enabling force_prec_f32,
@ -263,9 +264,6 @@ public:
auto net_0 = std::dynamic_pointer_cast<UnaryBlock>(blocks["net.0"]);
auto net_2 = std::dynamic_pointer_cast<Linear>(blocks["net.2"]);
if (sd_backend_is(ctx->backend, "Vulkan")) {
net_2->set_force_prec_f32(true);
}
x = net_0->forward(ctx, x); // [ne3, ne2, ne1, inner_dim]
x = net_2->forward(ctx, x); // [ne3, ne2, ne1, dim_out]

View File

@ -85,7 +85,6 @@ public:
virtual void free_params_buffer() = 0;
virtual void get_param_tensors(std::map<std::string, ggml_tensor*>& tensors) = 0;
virtual size_t get_params_buffer_size() = 0;
virtual void set_max_graph_vram_bytes(size_t max_vram_bytes) {}
virtual void set_flash_attention_enabled(bool enabled) = 0;
virtual void set_weight_adapter(const std::shared_ptr<WeightAdapter>& adapter) {}
virtual std::tuple<SDCondition, std::vector<bool>> get_learned_condition_with_trigger(int n_threads,
@ -166,13 +165,6 @@ struct FrozenCLIPEmbedderWithCustomWords : public Conditioner {
return buffer_size;
}
void set_max_graph_vram_bytes(size_t max_vram_bytes) override {
text_model->set_max_graph_vram_bytes(max_vram_bytes);
if (sd_version_is_sdxl(version)) {
text_model2->set_max_graph_vram_bytes(max_vram_bytes);
}
}
void set_flash_attention_enabled(bool enabled) override {
text_model->set_flash_attention_enabled(enabled);
if (sd_version_is_sdxl(version)) {
@ -789,18 +781,6 @@ struct SD3CLIPEmbedder : public Conditioner {
return buffer_size;
}
void set_max_graph_vram_bytes(size_t max_vram_bytes) override {
if (clip_l) {
clip_l->set_max_graph_vram_bytes(max_vram_bytes);
}
if (clip_g) {
clip_g->set_max_graph_vram_bytes(max_vram_bytes);
}
if (t5) {
t5->set_max_graph_vram_bytes(max_vram_bytes);
}
}
void set_flash_attention_enabled(bool enabled) override {
if (clip_l) {
clip_l->set_flash_attention_enabled(enabled);
@ -1144,15 +1124,6 @@ struct FluxCLIPEmbedder : public Conditioner {
return buffer_size;
}
void set_max_graph_vram_bytes(size_t max_vram_bytes) override {
if (clip_l) {
clip_l->set_max_graph_vram_bytes(max_vram_bytes);
}
if (t5) {
t5->set_max_graph_vram_bytes(max_vram_bytes);
}
}
void set_flash_attention_enabled(bool enabled) override {
if (clip_l) {
clip_l->set_flash_attention_enabled(enabled);
@ -1378,12 +1349,6 @@ struct T5CLIPEmbedder : public Conditioner {
return buffer_size;
}
void set_max_graph_vram_bytes(size_t max_vram_bytes) override {
if (t5) {
t5->set_max_graph_vram_bytes(max_vram_bytes);
}
}
void set_flash_attention_enabled(bool enabled) override {
if (t5) {
t5->set_flash_attention_enabled(enabled);
@ -1560,10 +1525,6 @@ struct AnimaConditioner : public Conditioner {
return llm->get_params_buffer_size();
}
void set_max_graph_vram_bytes(size_t max_vram_bytes) override {
llm->set_max_graph_vram_bytes(max_vram_bytes);
}
void set_flash_attention_enabled(bool enabled) override {
llm->set_flash_attention_enabled(enabled);
}
@ -1696,10 +1657,6 @@ struct LLMEmbedder : public Conditioner {
return buffer_size;
}
void set_max_graph_vram_bytes(size_t max_vram_bytes) override {
llm->set_max_graph_vram_bytes(max_vram_bytes);
}
void set_flash_attention_enabled(bool enabled) override {
llm->set_flash_attention_enabled(enabled);
}

View File

@ -1,138 +0,0 @@
#include <cstring>
#include <mutex>
#include <regex>
#include <vector>
#include "model.h"
#include "model_io/gguf_io.h"
#include "model_io/safetensors_io.h"
#include "util.h"
#include "ggml-cpu.h"
static ggml_type get_export_tensor_type(ModelLoader& model_loader,
const TensorStorage& tensor_storage,
ggml_type type,
const TensorTypeRules& tensor_type_rules) {
const std::string& name = tensor_storage.name;
ggml_type tensor_type = tensor_storage.type;
ggml_type dst_type = type;
for (const auto& tensor_type_rule : tensor_type_rules) {
std::regex pattern(tensor_type_rule.first);
if (std::regex_search(name, pattern)) {
dst_type = tensor_type_rule.second;
break;
}
}
if (model_loader.tensor_should_be_converted(tensor_storage, dst_type)) {
tensor_type = dst_type;
}
return tensor_type;
}
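As a minimal sketch of how such a rule set might be expressed in code, assuming `TensorTypeRules` is a vector of regex-pattern/`ggml_type` pairs as the loop above suggests (the alias below and the string syntax accepted by `parse_tensor_type_rules` are assumptions, not verified against this tree):

```
#include <string>
#include <utility>
#include <vector>
#include "ggml.h"

// First matching ECMAScript regex wins, mirroring get_export_tensor_type.
using TensorTypeRulesSketch = std::vector<std::pair<std::string, ggml_type>>;

static TensorTypeRulesSketch make_example_rules() {
    return {
        {"attn", GGML_TYPE_F16},  // keep attention weights in f16
        {".*", GGML_TYPE_Q8_0},   // quantize everything else
    };
}
```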
static bool load_tensors_for_export(ModelLoader& model_loader,
ggml_context* ggml_ctx,
ggml_type type,
const TensorTypeRules& tensor_type_rules,
std::vector<TensorWriteInfo>& tensors) {
std::mutex tensor_mutex;
auto on_new_tensor_cb = [&](const TensorStorage& tensor_storage, ggml_tensor** dst_tensor) -> bool {
const std::string& name = tensor_storage.name;
ggml_type tensor_type = get_export_tensor_type(model_loader, tensor_storage, type, tensor_type_rules);
std::lock_guard<std::mutex> lock(tensor_mutex);
ggml_tensor* tensor = ggml_new_tensor(ggml_ctx, tensor_type, tensor_storage.n_dims, tensor_storage.ne);
if (tensor == nullptr) {
LOG_ERROR("ggml_new_tensor failed");
return false;
}
ggml_set_name(tensor, name.c_str());
if (!tensor->data) {
GGML_ASSERT(ggml_nelements(tensor) == 0);
// Avoid crashing writers by setting a dummy pointer for zero-sized tensors.
LOG_DEBUG("setting dummy pointer for zero-sized tensor %s", name.c_str());
tensor->data = ggml_get_mem_buffer(ggml_ctx);
}
TensorWriteInfo write_info;
write_info.tensor = tensor;
write_info.n_dims = tensor_storage.n_dims;
for (int i = 0; i < tensor_storage.n_dims; ++i) {
write_info.ne[i] = tensor_storage.ne[i];
}
*dst_tensor = tensor;
tensors.push_back(std::move(write_info));
return true;
};
bool success = model_loader.load_tensors(on_new_tensor_cb);
LOG_INFO("load tensors done");
return success;
}
bool convert(const char* input_path,
const char* vae_path,
const char* output_path,
sd_type_t output_type,
const char* tensor_type_rules,
bool convert_name) {
ModelLoader model_loader;
if (!model_loader.init_from_file(input_path)) {
LOG_ERROR("init model loader from file failed: '%s'", input_path);
return false;
}
if (vae_path != nullptr && strlen(vae_path) > 0) {
if (!model_loader.init_from_file(vae_path, "vae.")) {
LOG_ERROR("init model loader from file failed: '%s'", vae_path);
return false;
}
}
if (convert_name) {
model_loader.convert_tensors_name();
}
ggml_type type = (ggml_type)output_type;
bool output_is_safetensors = ends_with(output_path, ".safetensors");
TensorTypeRules type_rules = parse_tensor_type_rules(tensor_type_rules);
auto backend = ggml_backend_cpu_init();
size_t mem_size = 1 * 1024 * 1024; // for padding
mem_size += model_loader.get_tensor_storage_map().size() * ggml_tensor_overhead();
mem_size += model_loader.get_params_mem_size(backend, type);
LOG_INFO("model tensors mem size: %.2fMB", mem_size / 1024.f / 1024.f);
ggml_context* ggml_ctx = ggml_init({mem_size, nullptr, false});
if (ggml_ctx == nullptr) {
LOG_ERROR("ggml_init failed for converter");
ggml_backend_free(backend);
return false;
}
std::vector<TensorWriteInfo> tensors;
bool success = load_tensors_for_export(model_loader, ggml_ctx, type, type_rules, tensors);
ggml_backend_free(backend);
std::string error;
if (success) {
if (output_is_safetensors) {
success = write_safetensors_file(output_path, tensors, &error);
} else {
success = write_gguf_file(output_path, tensors, &error);
}
}
if (!success && !error.empty()) {
LOG_ERROR("%s", error.c_str());
}
ggml_free(ggml_ctx);
return success;
}
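A hedged usage sketch of the entry point above; the paths are placeholders and `SD_TYPE_Q8_0` is assumed to be a valid `sd_type_t` value mirroring `GGML_TYPE_Q8_0`:

```
// The output extension selects the writer: ".safetensors" goes through
// write_safetensors_file(), anything else through write_gguf_file().
bool ok = convert("sd-v1-5.safetensors",  // input checkpoint (illustrative)
                  /*vae_path=*/nullptr,   // no external VAE
                  "sd-v1-5-q8_0.gguf",    // GGUF output
                  SD_TYPE_Q8_0,           // assumed sd_type_t value
                  /*tensor_type_rules=*/"",
                  /*convert_name=*/true);
```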

View File

@ -808,18 +808,6 @@ static std::tuple<float, float, float> get_ancestral_step_flow(float sigma_from,
return {sigma_down, sigma_up, alpha_scale};
}
static std::tuple<float, float, float> get_ancestral_step(float sigma_from,
float sigma_to,
float eta,
bool is_flow_denoiser) {
if (is_flow_denoiser) {
return get_ancestral_step_flow(sigma_from, sigma_to, eta);
} else {
auto [sigma_down, sigma_up] = get_ancestral_step(sigma_from, sigma_to, eta);
return {sigma_down, sigma_up, 1.0f};
}
}
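For reference, the non-flow branch of the removed wrapper above delegates to the classic two-value k-diffusion ancestral split. A minimal sketch of that formula, assuming the conventional definition (the name `ancestral_step_sketch` is illustrative, not this tree's symbol):

```
#include <algorithm>
#include <cmath>
#include <utility>

// Split the move from sigma_from to sigma_to into a deterministic part
// (down to sigma_down) plus fresh noise with standard deviation sigma_up,
// so that sigma_down^2 + sigma_up^2 == sigma_to^2.
static std::pair<float, float> ancestral_step_sketch(float sigma_from,
                                                     float sigma_to,
                                                     float eta) {
    if (sigma_to <= 0.0f || eta <= 0.0f) {
        return {sigma_to, 0.0f};  // fully deterministic step
    }
    float sigma_up = std::min(
        sigma_to,
        eta * std::sqrt(sigma_to * sigma_to *
                        (sigma_from * sigma_from - sigma_to * sigma_to) /
                        (sigma_from * sigma_from)));
    float sigma_down = std::sqrt(sigma_to * sigma_to - sigma_up * sigma_up);
    return {sigma_down, sigma_up};
}
```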
static sd::Tensor<float> sample_euler_ancestral(denoise_cb_t model,
sd::Tensor<float> x,
const std::vector<float>& sigmas,
@ -1076,6 +1064,8 @@ static sd::Tensor<float> sample_dpmpp_2s_ancestral_flow(denoise_cb_t model,
return x;
}
static sd::Tensor<float> sample_dpmpp_2m(denoise_cb_t model,
sd::Tensor<float> x,
const std::vector<float>& sigmas) {
@ -1147,8 +1137,7 @@ static sd::Tensor<float> sample_dpmpp_2m_v2(denoise_cb_t model,
static sd::Tensor<float> sample_lcm(denoise_cb_t model,
sd::Tensor<float> x,
const std::vector<float>& sigmas,
std::shared_ptr<RNG> rng,
bool is_flow_denoiser) {
std::shared_ptr<RNG> rng) {
int steps = static_cast<int>(sigmas.size()) - 1;
for (int i = 0; i < steps; i++) {
auto denoised_opt = model(x, sigmas[i], i + 1);
@ -1157,9 +1146,6 @@ static sd::Tensor<float> sample_lcm(denoise_cb_t model,
}
x = std::move(denoised_opt);
if (sigmas[i + 1] > 0) {
if (is_flow_denoiser) {
x *= (1 - sigmas[i + 1]);
}
x += sd::Tensor<float>::randn_like(x, rng) * sigmas[i + 1];
}
}
@ -1259,7 +1245,6 @@ static sd::Tensor<float> sample_res_multistep(denoise_cb_t model,
sd::Tensor<float> x,
const std::vector<float>& sigmas,
std::shared_ptr<RNG> rng,
bool is_flow_denoiser,
float eta) {
sd::Tensor<float> old_denoised = x;
bool have_old_sigma = false;
@ -1291,8 +1276,7 @@ static sd::Tensor<float> sample_res_multistep(denoise_cb_t model,
float sigma_from = sigmas[i];
float sigma_to = sigmas[i + 1];
auto [sigma_down, sigma_up, alpha_scale] = get_ancestral_step(sigma_from, sigma_to, eta, is_flow_denoiser);
auto [sigma_down, sigma_up] = get_ancestral_step(sigma_from, sigma_to, eta);
if (sigma_down == 0.0f || !have_old_sigma) {
x += ((x - denoised) / sigma_from) * (sigma_down - sigma_from);
@ -1319,10 +1303,7 @@ static sd::Tensor<float> sample_res_multistep(denoise_cb_t model,
x = sigma_fn(h) * x + h * (b1 * denoised + b2 * old_denoised);
}
if (sigma_to > 0.0f && sigma_up > 0.0f) {
if (is_flow_denoiser) {
x *= alpha_scale;
}
if (sigmas[i + 1] > 0 && sigma_up > 0.0f) {
x += sd::Tensor<float>::randn_like(x, rng) * sigma_up;
}
@ -1337,7 +1318,6 @@ static sd::Tensor<float> sample_res_2s(denoise_cb_t model,
sd::Tensor<float> x,
const std::vector<float>& sigmas,
std::shared_ptr<RNG> rng,
bool is_flow_denoiser,
float eta) {
const float c2 = 0.5f;
auto t_fn = [](float sigma) -> float { return -logf(sigma); };
@ -1366,7 +1346,7 @@ static sd::Tensor<float> sample_res_2s(denoise_cb_t model,
}
sd::Tensor<float> denoised = std::move(denoised_opt);
auto [sigma_down, sigma_up, alpha_scale] = get_ancestral_step(sigma_from, sigma_to, eta, is_flow_denoiser);
auto [sigma_down, sigma_up] = get_ancestral_step(sigma_from, sigma_to, eta);
sd::Tensor<float> x0 = x;
if (sigma_down == 0.0f || sigma_from == 0.0f) {
@ -1395,10 +1375,7 @@ static sd::Tensor<float> sample_res_2s(denoise_cb_t model,
x = x0 + h * (b1 * eps1 + b2 * eps2);
}
if (sigma_to > 0.0f && sigma_up > 0.0f) {
if (is_flow_denoiser) {
x *= alpha_scale;
}
if (sigmas[i + 1] > 0 && sigma_up > 0.0f) {
x += sd::Tensor<float>::randn_like(x, rng) * sigma_up;
}
}
@ -1544,10 +1521,32 @@ static sd::Tensor<float> sample_ddim_trailing(denoise_cb_t model,
const std::vector<float>& sigmas,
std::shared_ptr<RNG> rng,
float eta) {
float beta_start = 0.00085f;
float beta_end = 0.0120f;
std::vector<double> alphas_cumprod(TIMESTEPS);
std::vector<double> compvis_sigmas(TIMESTEPS);
for (int i = 0; i < TIMESTEPS; i++) {
alphas_cumprod[i] =
(i == 0 ? 1.0f : alphas_cumprod[i - 1]) *
(1.0f -
std::pow(sqrtf(beta_start) +
(sqrtf(beta_end) - sqrtf(beta_start)) *
((float)i / (TIMESTEPS - 1)),
2));
compvis_sigmas[i] =
std::sqrt((1 - alphas_cumprod[i]) / alphas_cumprod[i]);
}
int steps = static_cast<int>(sigmas.size()) - 1;
for (int i = 0; i < steps; i++) {
float sigma = sigmas[i];
float sigma_to = sigmas[i + 1];
int timestep = static_cast<int>(roundf(TIMESTEPS - i * ((float)TIMESTEPS / steps))) - 1;
int prev_timestep = timestep - TIMESTEPS / steps;
float sigma = static_cast<float>(compvis_sigmas[timestep]);
if (i == 0) {
x *= std::sqrt(sigma * sigma + 1) / sigma;
} else {
x *= std::sqrt(sigma * sigma + 1);
}
auto model_output_opt = model(x, sigma, i + 1);
if (model_output_opt.empty()) {
@ -1556,8 +1555,8 @@ static sd::Tensor<float> sample_ddim_trailing(denoise_cb_t model,
sd::Tensor<float> model_output = std::move(model_output_opt);
model_output = (x - model_output) * (1.0f / sigma);
float alpha_prod_t = 1.0f / (sigma * sigma + 1.0f);
float alpha_prod_t_prev = 1.0f / (sigma_to * sigma_to + 1.0f);
float alpha_prod_t = static_cast<float>(alphas_cumprod[timestep]);
float alpha_prod_t_prev = static_cast<float>(prev_timestep >= 0 ? alphas_cumprod[prev_timestep] : alphas_cumprod[0]);
float beta_prod_t = 1.0f - alpha_prod_t;
sd::Tensor<float> pred_original_sample = ((x / std::sqrt(sigma * sigma + 1)) -
@ -1569,11 +1568,11 @@ static sd::Tensor<float> sample_ddim_trailing(denoise_cb_t model,
(1.0f - alpha_prod_t / alpha_prod_t_prev);
float std_dev_t = eta * std::sqrt(variance);
x = pred_original_sample +
std::sqrt((1.0f - alpha_prod_t_prev - std::pow(std_dev_t, 2)) / alpha_prod_t_prev) * model_output;
x = std::sqrt(alpha_prod_t_prev) * pred_original_sample +
std::sqrt(1.0f - alpha_prod_t_prev - std::pow(std_dev_t, 2)) * model_output;
if (eta > 0) {
x += std_dev_t / std::sqrt(alpha_prod_t_prev) * sd::Tensor<float>::randn_like(x, rng);
x += std_dev_t * sd::Tensor<float>::randn_like(x, rng);
}
}
return x;
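Both sides of this hunk rely on the same variance-preserving identity linking a karras-style sigma to the cumulative alpha product; a small conversion sketch (standard diffusion math, helper names are illustrative):

```
#include <cmath>

// sigma_t = sqrt((1 - alpha_bar_t) / alpha_bar_t)
// <=> alpha_bar_t = 1 / (sigma_t * sigma_t + 1)
static inline float alpha_bar_from_sigma(float sigma) {
    return 1.0f / (sigma * sigma + 1.0f);
}

static inline float sigma_from_alpha_bar(float alpha_bar) {
    return std::sqrt((1.0f - alpha_bar) / alpha_bar);
}
```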
@ -1600,26 +1599,19 @@ static sd::Tensor<float> sample_tcd(denoise_cb_t model,
std::sqrt((1 - alphas_cumprod[i]) / alphas_cumprod[i]);
}
auto get_timestep_from_sigma = [&](float s) -> int {
auto it = std::lower_bound(compvis_sigmas.begin(), compvis_sigmas.end(), s);
if (it == compvis_sigmas.begin())
return 0;
if (it == compvis_sigmas.end())
return TIMESTEPS - 1;
int idx_high = static_cast<int>(std::distance(compvis_sigmas.begin(), it));
int idx_low = idx_high - 1;
if (std::abs(compvis_sigmas[idx_high] - s) < std::abs(compvis_sigmas[idx_low] - s)) {
return idx_high;
}
return idx_low;
};
int original_steps = 50;
int steps = static_cast<int>(sigmas.size()) - 1;
for (int i = 0; i < steps; i++) {
float sigma_to = sigmas[i + 1];
int prev_timestep = get_timestep_from_sigma(sigma_to);
int timestep = TIMESTEPS - 1 - (TIMESTEPS / original_steps) * (int)floor(i * ((float)original_steps / steps));
int prev_timestep = i >= steps - 1 ? 0 : TIMESTEPS - 1 - (TIMESTEPS / original_steps) * (int)floor((i + 1) * ((float)original_steps / steps));
int timestep_s = (int)floor((1 - eta) * prev_timestep);
float sigma = sigmas[i];
float sigma = static_cast<float>(compvis_sigmas[timestep]);
if (i == 0) {
x *= std::sqrt(sigma * sigma + 1) / sigma;
} else {
x *= std::sqrt(sigma * sigma + 1);
}
auto model_output_opt = model(x, sigma, i + 1);
if (model_output_opt.empty()) {
@ -1628,9 +1620,9 @@ static sd::Tensor<float> sample_tcd(denoise_cb_t model,
sd::Tensor<float> model_output = std::move(model_output_opt);
model_output = (x - model_output) * (1.0f / sigma);
float alpha_prod_t = 1.0f / (sigma * sigma + 1.0f);
float alpha_prod_t = static_cast<float>(alphas_cumprod[timestep]);
float beta_prod_t = 1.0f - alpha_prod_t;
float alpha_prod_t_prev = 1.0f / (sigma_to * sigma_to + 1.0f);
float alpha_prod_t_prev = static_cast<float>(prev_timestep >= 0 ? alphas_cumprod[prev_timestep] : alphas_cumprod[0]);
float alpha_prod_s = static_cast<float>(alphas_cumprod[timestep_s]);
float beta_prod_s = 1.0f - alpha_prod_s;
@ -1638,12 +1630,12 @@ static sd::Tensor<float> sample_tcd(denoise_cb_t model,
std::sqrt(beta_prod_t) * model_output) *
(1.0f / std::sqrt(alpha_prod_t));
x = std::sqrt(alpha_prod_s / alpha_prod_t_prev) * pred_original_sample +
std::sqrt(beta_prod_s / alpha_prod_t_prev) * model_output;
x = std::sqrt(alpha_prod_s) * pred_original_sample +
std::sqrt(beta_prod_s) * model_output;
if (eta > 0 && sigma_to > 0.0f) {
if (eta > 0 && i != steps - 1) {
x = std::sqrt(alpha_prod_t_prev / alpha_prod_s) * x +
std::sqrt(1.0f / alpha_prod_t_prev - 1.0f / alpha_prod_s) * sd::Tensor<float>::randn_like(x, rng);
std::sqrt(1.0f - alpha_prod_t_prev / alpha_prod_s) * sd::Tensor<float>::randn_like(x, rng);
}
}
return x;
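The TCD-specific knob above is `eta` (the paper's gamma): it picks the intermediate timestep `s` that the deterministic update targets before re-noising back toward `prev_timestep`. A one-function sketch matching the expression in the loop:

```
#include <cmath>

// eta = 0 -> s == prev_timestep (deterministic, DDIM-like);
// eta = 1 -> s == 0 (denoise fully, then re-noise stochastically).
static inline int tcd_timestep_s(int prev_timestep, float eta) {
    return static_cast<int>(std::floor((1.0f - eta) * prev_timestep));
}
```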
@ -1679,15 +1671,15 @@ static sd::Tensor<float> sample_k_diffusion(sample_method_t method,
case DPMPP2Mv2_SAMPLE_METHOD:
return sample_dpmpp_2m_v2(model, std::move(x), sigmas);
case LCM_SAMPLE_METHOD:
return sample_lcm(model, std::move(x), sigmas, rng, is_flow_denoiser);
return sample_lcm(model, std::move(x), sigmas, rng);
case IPNDM_SAMPLE_METHOD:
return sample_ipndm(model, std::move(x), sigmas);
case IPNDM_V_SAMPLE_METHOD:
return sample_ipndm_v(model, std::move(x), sigmas);
case RES_MULTISTEP_SAMPLE_METHOD:
return sample_res_multistep(model, std::move(x), sigmas, rng, is_flow_denoiser, eta);
return sample_res_multistep(model, std::move(x), sigmas, rng, eta);
case RES_2S_SAMPLE_METHOD:
return sample_res_2s(model, std::move(x), sigmas, rng, is_flow_denoiser, eta);
return sample_res_2s(model, std::move(x), sigmas, rng, eta);
case ER_SDE_SAMPLE_METHOD:
return sample_er_sde(model, std::move(x), sigmas, rng, is_flow_denoiser, eta);
case DDIM_TRAILING_SAMPLE_METHOD:

View File

@ -49,7 +49,6 @@ struct DiffusionModel {
virtual void set_weight_adapter(const std::shared_ptr<WeightAdapter>& adapter){};
virtual int64_t get_adm_in_channels() = 0;
virtual void set_flash_attention_enabled(bool enabled) = 0;
virtual void set_max_graph_vram_bytes(size_t max_vram_bytes) = 0;
virtual void set_circular_axes(bool circular_x, bool circular_y) = 0;
};
@ -99,10 +98,6 @@ struct UNetModel : public DiffusionModel {
unet.set_flash_attention_enabled(enabled);
}
void set_max_graph_vram_bytes(size_t max_vram_bytes) override {
unet.set_max_graph_vram_bytes(max_vram_bytes);
}
void set_circular_axes(bool circular_x, bool circular_y) override {
unet.set_circular_axes(circular_x, circular_y);
}
@ -169,10 +164,6 @@ struct MMDiTModel : public DiffusionModel {
mmdit.set_flash_attention_enabled(enabled);
}
void set_max_graph_vram_bytes(size_t max_vram_bytes) override {
mmdit.set_max_graph_vram_bytes(max_vram_bytes);
}
void set_circular_axes(bool circular_x, bool circular_y) override {
mmdit.set_circular_axes(circular_x, circular_y);
}
@ -238,10 +229,6 @@ struct FluxModel : public DiffusionModel {
flux.set_flash_attention_enabled(enabled);
}
void set_max_graph_vram_bytes(size_t max_vram_bytes) override {
flux.set_max_graph_vram_bytes(max_vram_bytes);
}
void set_circular_axes(bool circular_x, bool circular_y) override {
flux.set_circular_axes(circular_x, circular_y);
}
@ -312,10 +299,6 @@ struct AnimaModel : public DiffusionModel {
anima.set_flash_attention_enabled(enabled);
}
void set_max_graph_vram_bytes(size_t max_vram_bytes) override {
anima.set_max_graph_vram_bytes(max_vram_bytes);
}
void set_circular_axes(bool circular_x, bool circular_y) override {
anima.set_circular_axes(circular_x, circular_y);
}
@ -381,10 +364,6 @@ struct WanModel : public DiffusionModel {
wan.set_flash_attention_enabled(enabled);
}
void set_max_graph_vram_bytes(size_t max_vram_bytes) override {
wan.set_max_graph_vram_bytes(max_vram_bytes);
}
void set_circular_axes(bool circular_x, bool circular_y) override {
wan.set_circular_axes(circular_x, circular_y);
}
@ -454,10 +433,6 @@ struct QwenImageModel : public DiffusionModel {
qwen_image.set_flash_attention_enabled(enabled);
}
void set_max_graph_vram_bytes(size_t max_vram_bytes) override {
qwen_image.set_max_graph_vram_bytes(max_vram_bytes);
}
void set_circular_axes(bool circular_x, bool circular_y) override {
qwen_image.set_circular_axes(circular_x, circular_y);
}
@ -524,10 +499,6 @@ struct ZImageModel : public DiffusionModel {
z_image.set_flash_attention_enabled(enabled);
}
void set_max_graph_vram_bytes(size_t max_vram_bytes) override {
z_image.set_max_graph_vram_bytes(max_vram_bytes);
}
void set_circular_axes(bool circular_x, bool circular_y) override {
z_image.set_circular_axes(circular_x, circular_y);
}
@ -593,10 +564,6 @@ struct ErnieImageModel : public DiffusionModel {
ernie_image.set_flash_attention_enabled(enabled);
}
void set_max_graph_vram_bytes(size_t max_vram_bytes) override {
ernie_image.set_max_graph_vram_bytes(max_vram_bytes);
}
void set_circular_axes(bool circular_x, bool circular_y) override {
ernie_image.set_circular_axes(circular_x, circular_y);
}

View File

@ -295,8 +295,6 @@ namespace ErnieImage {
auto c = time_embedding->forward(ctx, sample); // [N, hidden_size]
auto mod_params = adaLN_mod->forward(ctx, ggml_silu(ctx->ggml_ctx, c)); // [N, 6 * hidden_size]
sd::ggml_graph_cut::mark_graph_cut(hidden_states, "ernie_image.prelude", "hidden_states");
// sd::ggml_graph_cut::mark_graph_cut(mod_params, "ernie_image.prelude", "mod_params");
auto chunks = ggml_ext_chunk(ctx->ggml_ctx, mod_params, 6, 0);
std::vector<ggml_tensor*> temb;
temb.reserve(6);
@ -307,7 +305,6 @@ namespace ErnieImage {
for (int i = 0; i < params.num_layers; i++) {
auto layer = std::dynamic_pointer_cast<ErnieImageSharedAdaLNBlock>(blocks["layers." + std::to_string(i)]);
hidden_states = layer->forward(ctx, hidden_states, pe, temb);
sd::ggml_graph_cut::mark_graph_cut(hidden_states, "ernie_image.layers." + std::to_string(i), "hidden_states");
}
hidden_states = final_norm->forward(ctx, hidden_states, c);

View File

@ -125,32 +125,26 @@ public:
auto conv_last = std::dynamic_pointer_cast<Conv2d>(blocks["conv_last"]);
auto feat = conv_first->forward(ctx, x);
sd::ggml_graph_cut::mark_graph_cut(feat, "esrgan.prelude", "feat");
auto body_feat = feat;
for (int i = 0; i < num_block; i++) {
std::string name = "body." + std::to_string(i);
auto block = std::dynamic_pointer_cast<RRDB>(blocks[name]);
body_feat = block->forward(ctx, body_feat);
sd::ggml_graph_cut::mark_graph_cut(body_feat, "esrgan.body." + std::to_string(i), "feat");
}
body_feat = conv_body->forward(ctx, body_feat);
feat = ggml_add(ctx->ggml_ctx, feat, body_feat);
sd::ggml_graph_cut::mark_graph_cut(feat, "esrgan.body.out", "feat");
// upsample
if (scale >= 2) {
auto conv_up1 = std::dynamic_pointer_cast<Conv2d>(blocks["conv_up1"]);
feat = lrelu(ctx, conv_up1->forward(ctx, ggml_upscale(ctx->ggml_ctx, feat, 2, GGML_SCALE_MODE_NEAREST)));
sd::ggml_graph_cut::mark_graph_cut(feat, "esrgan.up1", "feat");
if (scale == 4) {
auto conv_up2 = std::dynamic_pointer_cast<Conv2d>(blocks["conv_up2"]);
feat = lrelu(ctx, conv_up2->forward(ctx, ggml_upscale(ctx->ggml_ctx, feat, 2, GGML_SCALE_MODE_NEAREST)));
sd::ggml_graph_cut::mark_graph_cut(feat, "esrgan.up2", "feat");
}
}
// for all scales
auto out = conv_last->forward(ctx, lrelu(ctx, conv_hr->forward(ctx, feat)));
sd::ggml_graph_cut::mark_graph_cut(out, "esrgan.final", "out");
return out;
}
};

View File

@ -928,9 +928,6 @@ namespace Flux {
}
txt = txt_in->forward(ctx, txt);
sd::ggml_graph_cut::mark_graph_cut(img, "flux.prelude", "img");
sd::ggml_graph_cut::mark_graph_cut(txt, "flux.prelude", "txt");
sd::ggml_graph_cut::mark_graph_cut(vec, "flux.prelude", "vec");
for (int i = 0; i < params.depth; i++) {
if (skip_layers.size() > 0 && std::find(skip_layers.begin(), skip_layers.end(), i) != skip_layers.end()) {
@ -942,8 +939,6 @@ namespace Flux {
auto img_txt = block->forward(ctx, img, txt, vec, pe, txt_img_mask, ds_img_mods, ds_txt_mods);
img = img_txt.first; // [N, n_img_token, hidden_size]
txt = img_txt.second; // [N, n_txt_token, hidden_size]
sd::ggml_graph_cut::mark_graph_cut(img, "flux.double_blocks." + std::to_string(i), "img");
sd::ggml_graph_cut::mark_graph_cut(txt, "flux.double_blocks." + std::to_string(i), "txt");
}
auto txt_img = ggml_concat(ctx->ggml_ctx, txt, img, 1); // [N, n_txt_token + n_img_token, hidden_size]
@ -954,7 +949,6 @@ namespace Flux {
auto block = std::dynamic_pointer_cast<SingleStreamBlock>(blocks["single_blocks." + std::to_string(i)]);
txt_img = block->forward(ctx, txt_img, vec, pe, txt_img_mask, ss_mods);
sd::ggml_graph_cut::mark_graph_cut(txt_img, "flux.single_blocks." + std::to_string(i), "txt_img");
}
img = ggml_view_3d(ctx->ggml_ctx,

File diff suppressed because it is too large

View File

@ -1,298 +0,0 @@
#ifndef __GGML_EXTEND_BACKEND_HPP__
#define __GGML_EXTEND_BACKEND_HPP__
#include <cstring>
#include <mutex>
#include "ggml-backend.h"
#include "ggml.h"
#ifndef __STATIC_INLINE__
#define __STATIC_INLINE__ static inline
#endif
inline void ggml_backend_load_all_once() {
// If the registry already has devices and the CPU backend is present,
// assume either static registration or explicit host-side preloading has
// completed and avoid rescanning the default paths.
if (ggml_backend_dev_count() > 0 && ggml_backend_reg_by_name("CPU") != nullptr) {
return;
}
// In dynamic-backend mode the backend modules are discovered at runtime,
// so we must load them before asking for the CPU backend or its proc table.
// If the host preloaded only a subset of backends, allow one default-path
// scan so missing modules can still be discovered.
static std::once_flag once;
std::call_once(once, []() {
if (ggml_backend_dev_count() > 0 && ggml_backend_reg_by_name("CPU") != nullptr) {
return;
}
ggml_backend_load_all();
});
}
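A hedged usage sketch: a host that preloads only some backend modules can still fall back to one default-path scan through the helper above. `ggml_backend_load()` is the public dynamic-loading entry point in recent ggml; the module path below is a placeholder:

```
#include "ggml-backend.h"

static ggml_backend_t init_cpu_backend_example() {
    // Optional explicit preload of a single module (path is illustrative).
    ggml_backend_load("./libggml-cpu.so");
    // Fill in anything missing, at most once per process.
    ggml_backend_load_all_once();
    return ggml_backend_init_by_type(GGML_BACKEND_DEVICE_TYPE_CPU, nullptr);
}
```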
// Do not gate this branch on GGML_CPU or GGML_CPU_ALL_VARIANTS:
// those are CMake options used to configure ggml itself, but they are not
// exported as PUBLIC compile definitions to stable-diffusion in backend-DL mode.
// In practice, this target can reliably see GGML_BACKEND_DL, but not whether
// the CPU backend was compiled as a loadable module. We therefore use runtime
// backend discovery instead of compile-time assumptions.
__STATIC_INLINE__ ggml_backend_reg_t ggml_backend_cpu_reg() {
ggml_backend_reg_t reg = ggml_backend_reg_by_name("CPU");
if (reg != nullptr) {
return reg;
}
ggml_backend_load_all_once();
return ggml_backend_reg_by_name("CPU");
}
__STATIC_INLINE__ ggml_backend_reg_t ggml_backend_reg_from_backend(ggml_backend_t backend) {
if (backend != nullptr) {
ggml_backend_dev_t device = ggml_backend_get_device(backend);
if (device != nullptr) {
return ggml_backend_dev_backend_reg(device);
}
}
return ggml_backend_cpu_reg();
}
__STATIC_INLINE__ ggml_backend_t ggml_backend_cpu_init() {
ggml_backend_t backend = ggml_backend_init_by_type(GGML_BACKEND_DEVICE_TYPE_CPU, nullptr);
if (backend != nullptr) {
return backend;
}
ggml_backend_load_all_once();
return ggml_backend_init_by_type(GGML_BACKEND_DEVICE_TYPE_CPU, nullptr);
}
__STATIC_INLINE__ bool ggml_backend_is_cpu(ggml_backend_t backend) {
if (backend == nullptr) {
return false;
}
ggml_backend_dev_t device = ggml_backend_get_device(backend);
if (device != nullptr) {
return ggml_backend_dev_type(device) == GGML_BACKEND_DEVICE_TYPE_CPU;
}
const char* backend_name = ggml_backend_name(backend);
return backend_name != nullptr && std::strcmp(backend_name, "CPU") == 0;
}
__STATIC_INLINE__ void ggml_backend_cpu_set_n_threads(ggml_backend_t backend_cpu, int n_threads) {
ggml_backend_reg_t reg = ggml_backend_reg_from_backend(backend_cpu);
if (reg == nullptr) {
return;
}
auto fn = reinterpret_cast<ggml_backend_set_n_threads_t>(ggml_backend_reg_get_proc_address(reg, "ggml_backend_set_n_threads"));
if (fn != nullptr) {
fn(backend_cpu, n_threads);
}
}
using __ggml_backend_cpu_set_threadpool_t = void (*)(ggml_backend_t backend_cpu, ggml_threadpool_t threadpool);
__STATIC_INLINE__ void ggml_backend_cpu_set_threadpool(ggml_backend_t backend_cpu, ggml_threadpool_t threadpool) {
ggml_backend_reg_t reg = ggml_backend_reg_from_backend(backend_cpu);
if (reg == nullptr) {
return;
}
auto fn = reinterpret_cast<__ggml_backend_cpu_set_threadpool_t>(ggml_backend_reg_get_proc_address(reg, "ggml_backend_cpu_set_threadpool"));
if (fn != nullptr) {
fn(backend_cpu, threadpool);
}
}
__STATIC_INLINE__ void ggml_backend_cpu_set_abort_callback(ggml_backend_t backend_cpu, ggml_abort_callback abort_callback, void* abort_callback_data) {
ggml_backend_reg_t reg = ggml_backend_reg_from_backend(backend_cpu);
if (reg == nullptr) {
return;
}
auto fn = reinterpret_cast<ggml_backend_set_abort_callback_t>(ggml_backend_reg_get_proc_address(reg, "ggml_backend_set_abort_callback"));
if (fn != nullptr) {
fn(backend_cpu, abort_callback, abort_callback_data);
}
}
__STATIC_INLINE__ ggml_backend_buffer_t ggml_backend_tensor_buffer(const struct ggml_tensor* tensor) {
if (tensor == nullptr) {
return nullptr;
}
return tensor->view_src ? tensor->view_src->buffer : tensor->buffer;
}
__STATIC_INLINE__ bool ggml_backend_tensor_is_host_accessible(const struct ggml_tensor* tensor) {
if (tensor == nullptr || tensor->data == nullptr) {
return false;
}
ggml_backend_buffer_t buffer = ggml_backend_tensor_buffer(tensor);
return buffer == nullptr || ggml_backend_buffer_is_host(buffer);
}
__STATIC_INLINE__ size_t ggml_backend_tensor_offset(const struct ggml_tensor* tensor, int64_t i0, int64_t i1, int64_t i2, int64_t i3) {
return (size_t)(i0 * tensor->nb[0] + i1 * tensor->nb[1] + i2 * tensor->nb[2] + i3 * tensor->nb[3]);
}
template <typename T>
__STATIC_INLINE__ void ggml_backend_tensor_write_scalar(const struct ggml_tensor* tensor, int64_t i0, int64_t i1, int64_t i2, int64_t i3, T value) {
const size_t offset = ggml_backend_tensor_offset(tensor, i0, i1, i2, i3);
if (ggml_backend_tensor_is_host_accessible(tensor)) {
auto* dst = reinterpret_cast<T*>(reinterpret_cast<char*>(tensor->data) + offset);
*dst = value;
return;
}
ggml_backend_tensor_set(const_cast<struct ggml_tensor*>(tensor), &value, offset, sizeof(T));
}
__STATIC_INLINE__ void ggml_set_f32_nd(const struct ggml_tensor* tensor, int64_t i0, int64_t i1, int64_t i2, int64_t i3, float value) {
switch (tensor->type) {
case GGML_TYPE_I8:
ggml_backend_tensor_write_scalar(tensor, i0, i1, i2, i3, static_cast<int8_t>(value));
break;
case GGML_TYPE_I16:
ggml_backend_tensor_write_scalar(tensor, i0, i1, i2, i3, static_cast<int16_t>(value));
break;
case GGML_TYPE_I32:
ggml_backend_tensor_write_scalar(tensor, i0, i1, i2, i3, static_cast<int32_t>(value));
break;
case GGML_TYPE_F16:
ggml_backend_tensor_write_scalar(tensor, i0, i1, i2, i3, ggml_fp32_to_fp16(value));
break;
case GGML_TYPE_BF16:
ggml_backend_tensor_write_scalar(tensor, i0, i1, i2, i3, ggml_fp32_to_bf16(value));
break;
case GGML_TYPE_F32:
ggml_backend_tensor_write_scalar(tensor, i0, i1, i2, i3, value);
break;
default:
GGML_ABORT("fatal error");
}
}
__STATIC_INLINE__ void ggml_set_f32_1d(const struct ggml_tensor* tensor, int i, float value) {
if (!ggml_is_contiguous(tensor)) {
int64_t id[4] = {0, 0, 0, 0};
ggml_unravel_index(tensor, i, &id[0], &id[1], &id[2], &id[3]);
ggml_set_f32_nd(tensor, id[0], id[1], id[2], id[3], value);
return;
}
switch (tensor->type) {
case GGML_TYPE_I8:
ggml_backend_tensor_write_scalar(tensor, i, 0, 0, 0, static_cast<int8_t>(value));
break;
case GGML_TYPE_I16:
ggml_backend_tensor_write_scalar(tensor, i, 0, 0, 0, static_cast<int16_t>(value));
break;
case GGML_TYPE_I32:
ggml_backend_tensor_write_scalar(tensor, i, 0, 0, 0, static_cast<int32_t>(value));
break;
case GGML_TYPE_F16:
ggml_backend_tensor_write_scalar(tensor, i, 0, 0, 0, ggml_fp32_to_fp16(value));
break;
case GGML_TYPE_BF16:
ggml_backend_tensor_write_scalar(tensor, i, 0, 0, 0, ggml_fp32_to_bf16(value));
break;
case GGML_TYPE_F32:
ggml_backend_tensor_write_scalar(tensor, i, 0, 0, 0, value);
break;
default:
GGML_ABORT("fatal error");
}
}
__STATIC_INLINE__ enum ggml_status ggml_graph_compute_with_ctx(struct ggml_context* ctx, struct ggml_cgraph* cgraph, int n_threads) {
(void)ctx;
// The legacy ggml_graph_compute_with_ctx() symbol lives in ggml-cpu, but
// the backend proc table does not expose it in GGML_BACKEND_DL mode.
// Recreate the old behavior by initializing the CPU backend explicitly and
// executing the graph through the generic backend API.
ggml_backend_t backend = ggml_backend_cpu_init();
if (backend == nullptr) {
return GGML_STATUS_ALLOC_FAILED;
}
ggml_backend_cpu_set_n_threads(backend, n_threads);
const enum ggml_status status = ggml_backend_graph_compute(backend, cgraph);
ggml_backend_free(backend);
return status;
}
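// Usage sketch (illustrative, not part of the original source): the shim can
// be called exactly like the legacy symbol, assuming a context whose memory
// pool is large enough for the tensors, the graph, and their data:
//
//   ggml_init_params ip = {/*mem_size=*/16 * 1024 * 1024, /*mem_buffer=*/nullptr, /*no_alloc=*/false};
//   ggml_context* ctx = ggml_init(ip);
//   ggml_tensor* a = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 4);
//   ggml_tensor* b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 4);
//   ggml_set_f32(a, 1.0f);
//   ggml_set_f32(b, 2.0f);
//   ggml_cgraph* gf = ggml_new_graph(ctx);
//   ggml_build_forward_expand(gf, ggml_add(ctx, a, b));
//   ggml_graph_compute_with_ctx(ctx, gf, /*n_threads=*/4);
//   ggml_free(ctx);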
__STATIC_INLINE__ ggml_tensor* ggml_set_f32(struct ggml_tensor* tensor, float value) {
GGML_ASSERT(tensor != nullptr);
if (ggml_backend_tensor_is_host_accessible(tensor) && ggml_is_contiguous(tensor)) {
const int64_t nelements = ggml_nelements(tensor);
switch (tensor->type) {
case GGML_TYPE_I8: {
auto* data = reinterpret_cast<int8_t*>(tensor->data);
const int8_t v = static_cast<int8_t>(value);
for (int64_t i = 0; i < nelements; ++i) {
data[i] = v;
}
} break;
case GGML_TYPE_I16: {
auto* data = reinterpret_cast<int16_t*>(tensor->data);
const int16_t v = static_cast<int16_t>(value);
for (int64_t i = 0; i < nelements; ++i) {
data[i] = v;
}
} break;
case GGML_TYPE_I32: {
auto* data = reinterpret_cast<int32_t*>(tensor->data);
const int32_t v = static_cast<int32_t>(value);
for (int64_t i = 0; i < nelements; ++i) {
data[i] = v;
}
} break;
case GGML_TYPE_F16: {
auto* data = reinterpret_cast<ggml_fp16_t*>(tensor->data);
const ggml_fp16_t v = ggml_fp32_to_fp16(value);
for (int64_t i = 0; i < nelements; ++i) {
data[i] = v;
}
} break;
case GGML_TYPE_BF16: {
auto* data = reinterpret_cast<ggml_bf16_t*>(tensor->data);
const ggml_bf16_t v = ggml_fp32_to_bf16(value);
for (int64_t i = 0; i < nelements; ++i) {
data[i] = v;
}
} break;
case GGML_TYPE_F32: {
auto* data = reinterpret_cast<float*>(tensor->data);
for (int64_t i = 0; i < nelements; ++i) {
data[i] = value;
}
} break;
default:
GGML_ABORT("fatal error");
}
return tensor;
}
const int64_t nelements = ggml_nelements(tensor);
for (int64_t i = 0; i < nelements; ++i) {
ggml_set_f32_1d(tensor, static_cast<int>(i), value);
}
return tensor;
}
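// Note (illustrative): for a host-resident contiguous tensor the fast path
// above is a plain typed fill, e.g. ggml_set_f32(t, 0.5f) on a GGML_TYPE_F16
// tensor stores ggml_fp32_to_fp16(0.5f) into every element; for
// device-resident or non-contiguous tensors each element instead goes through
// ggml_set_f32_1d(), which bottoms out in ggml_backend_tensor_set().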
#endif

View File

@ -1,676 +0,0 @@
#include "ggml_graph_cut.h"
#include <algorithm>
#include <cstring>
#include <map>
#include <set>
#include <sstream>
#include <stack>
#include <unordered_map>
#include "ggml-alloc.h"
#include "ggml-backend.h"
#include "util.h"
#include "../ggml/src/ggml-impl.h"
namespace sd::ggml_graph_cut {
static std::string graph_cut_tensor_display_name(const ggml_tensor* tensor) {
if (tensor == nullptr) {
return "<null>";
}
if (tensor->name[0] != '\0') {
return tensor->name;
}
return sd_format("<tensor@%p>", (const void*)tensor);
}
static int graph_leaf_index(ggml_cgraph* gf, const ggml_tensor* tensor) {
GGML_ASSERT(gf != nullptr);
GGML_ASSERT(tensor != nullptr);
for (int i = 0; i < gf->n_leafs; ++i) {
if (gf->leafs[i] == tensor) {
return i;
}
}
return -1;
}
static bool is_params_tensor(const std::unordered_set<const ggml_tensor*>& params_tensor_set,
const ggml_tensor* tensor) {
if (tensor == nullptr) {
return false;
}
return params_tensor_set.find(tensor) != params_tensor_set.end();
}
static Plan::InputShape input_shape(const ggml_tensor* tensor) {
Plan::InputShape shape;
if (tensor == nullptr) {
return shape;
}
shape.type = tensor->type;
for (int i = 0; i < GGML_MAX_DIMS; ++i) {
shape.ne[static_cast<size_t>(i)] = tensor->ne[i];
}
return shape;
}
static size_t graph_cut_segment_vram_bytes(const Segment& segment) {
return segment.compute_buffer_size +
segment.input_param_bytes +
segment.input_previous_cut_bytes +
segment.output_bytes;
}
static Segment make_segment_seed(const Plan& plan,
size_t start_segment_index,
size_t end_segment_index) {
GGML_ASSERT(start_segment_index < plan.segments.size());
GGML_ASSERT(end_segment_index < plan.segments.size());
GGML_ASSERT(start_segment_index <= end_segment_index);
Segment seed;
const auto& start_segment = plan.segments[start_segment_index];
const auto& target_segment = plan.segments[end_segment_index];
std::unordered_set<int> seen_output_node_indices;
for (size_t seg_idx = start_segment_index; seg_idx <= end_segment_index; ++seg_idx) {
for (int output_node_index : plan.segments[seg_idx].output_node_indices) {
if (seen_output_node_indices.insert(output_node_index).second) {
seed.output_node_indices.push_back(output_node_index);
}
}
}
if (start_segment_index == end_segment_index) {
seed.group_name = target_segment.group_name;
} else {
seed.group_name = sd_format("%s..%s",
start_segment.group_name.c_str(),
target_segment.group_name.c_str());
}
return seed;
}
static void build_segment(ggml_cgraph* gf,
Plan& plan,
Segment& segment,
const std::unordered_map<const ggml_tensor*, int>& producer_index,
std::unordered_set<int>& available_cut_output_node_indices,
ggml_backend_t backend,
const std::unordered_set<const ggml_tensor*>& params_tensor_set,
const char* log_desc) {
std::set<int> internal_nodes;
std::unordered_set<const ggml_tensor*> input_seen;
std::vector<Segment::InputRef> input_refs;
std::stack<ggml_tensor*> work_stack;
for (int output_node_index : segment.output_node_indices) {
ggml_tensor* output = ggml_graph_node(gf, output_node_index);
if (output != nullptr) {
work_stack.push(output);
}
}
while (!work_stack.empty()) {
ggml_tensor* tensor = work_stack.top();
work_stack.pop();
if (tensor == nullptr) {
continue;
}
auto producer_it = producer_index.find(tensor);
if (producer_it == producer_index.end()) {
if (input_seen.insert(tensor).second) {
Segment::InputRef input_ref;
input_ref.type = is_params_tensor(params_tensor_set, tensor) ? Segment::INPUT_PARAM : Segment::INPUT_EXTERNAL;
input_ref.display_name = graph_cut_tensor_display_name(tensor);
input_ref.leaf_index = graph_leaf_index(gf, tensor);
input_refs.push_back(std::move(input_ref));
}
continue;
}
int node_idx = producer_it->second;
if (available_cut_output_node_indices.find(node_idx) != available_cut_output_node_indices.end()) {
if (input_seen.insert(tensor).second) {
Segment::InputRef input_ref;
input_ref.type = Segment::INPUT_PREVIOUS_CUT;
input_ref.display_name = graph_cut_tensor_display_name(tensor);
input_ref.node_index = node_idx;
input_refs.push_back(std::move(input_ref));
}
continue;
}
if (!internal_nodes.insert(node_idx).second) {
continue;
}
ggml_tensor* node = ggml_graph_node(gf, node_idx);
for (int src_idx = 0; src_idx < GGML_MAX_SRC; ++src_idx) {
if (node->src[src_idx] != nullptr) {
work_stack.push(node->src[src_idx]);
}
}
}
if (!internal_nodes.empty()) {
segment.internal_node_indices.assign(internal_nodes.begin(), internal_nodes.end());
}
std::sort(input_refs.begin(),
input_refs.end(),
[](const Segment::InputRef& a, const Segment::InputRef& b) {
if (a.type != b.type) {
return a.type < b.type;
}
return a.display_name < b.display_name;
});
segment.input_refs = input_refs;
for (const auto& input : input_refs) {
ggml_tensor* current_input = input_tensor(gf, input);
size_t tensor_bytes = current_input == nullptr
? 0
: (input.type == Segment::INPUT_PREVIOUS_CUT
? cache_tensor_bytes(current_input)
: ggml_nbytes(current_input));
switch (input.type) {
case Segment::INPUT_PREVIOUS_CUT:
segment.input_previous_cut_bytes += tensor_bytes;
break;
case Segment::INPUT_PARAM:
segment.input_param_bytes += tensor_bytes;
break;
case Segment::INPUT_EXTERNAL:
default:
segment.input_external_bytes += tensor_bytes;
break;
}
}
for (int output_node_index : segment.output_node_indices) {
ggml_tensor* output = ggml_graph_node(gf, output_node_index);
segment.output_bytes += cache_tensor_bytes(output);
}
segment.compute_buffer_size = measure_segment_compute_buffer(backend, gf, segment, log_desc);
for (int output_node_index : segment.output_node_indices) {
available_cut_output_node_indices.insert(output_node_index);
}
plan.segments.push_back(std::move(segment));
}
bool is_graph_cut_tensor(const ggml_tensor* tensor) {
if (tensor == nullptr || tensor->name[0] == '\0') {
return false;
}
return std::strncmp(tensor->name, GGML_RUNNER_CUT_PREFIX, std::strlen(GGML_RUNNER_CUT_PREFIX)) == 0;
}
std::string make_graph_cut_name(const std::string& group, const std::string& output) {
return std::string(GGML_RUNNER_CUT_PREFIX) + group + "|" + output;
}
void mark_graph_cut(ggml_tensor* tensor, const std::string& group, const std::string& output) {
if (tensor == nullptr) {
return;
}
auto name = make_graph_cut_name(group, output);
ggml_set_name(tensor, name.c_str());
}
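// Example (a sketch; the group/output names are hypothetical): calling
//   mark_graph_cut(x, "unet.blocks.3", "x");
// renames the tensor to "ggml_runner_cut:unet.blocks.3|x", which
// is_graph_cut_tensor() later detects via the GGML_RUNNER_CUT_PREFIX prefix.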
int leaf_count(ggml_cgraph* gf) {
GGML_ASSERT(gf != nullptr);
return gf->n_leafs;
}
ggml_tensor* leaf_tensor(ggml_cgraph* gf, int leaf_index) {
GGML_ASSERT(gf != nullptr);
if (leaf_index < 0 || leaf_index >= gf->n_leafs) {
return nullptr;
}
return gf->leafs[leaf_index];
}
ggml_backend_buffer_t tensor_buffer(const ggml_tensor* tensor) {
if (tensor == nullptr) {
return nullptr;
}
return tensor->view_src ? tensor->view_src->buffer : tensor->buffer;
}
ggml_tensor* cache_source_tensor(ggml_tensor* tensor) {
if (tensor == nullptr) {
return nullptr;
}
return tensor->view_src ? tensor->view_src : tensor;
}
size_t cache_tensor_bytes(const ggml_tensor* tensor) {
if (tensor == nullptr) {
return 0;
}
const ggml_tensor* cache_src = tensor->view_src ? tensor->view_src : tensor;
return ggml_nbytes(cache_src);
}
bool plan_matches_graph(ggml_cgraph* gf, const Plan& plan) {
GGML_ASSERT(gf != nullptr);
if (ggml_graph_n_nodes(gf) != plan.n_nodes || gf->n_leafs != plan.n_leafs) {
return false;
}
for (const auto& input_shape_ref : plan.input_shapes) {
if (input_shape_ref.leaf_index < 0 || input_shape_ref.leaf_index >= gf->n_leafs) {
return false;
}
ggml_tensor* leaf = gf->leafs[input_shape_ref.leaf_index];
if (leaf == nullptr || input_shape_ref.type != leaf->type) {
return false;
}
for (int d = 0; d < GGML_MAX_DIMS; ++d) {
if (input_shape_ref.ne[static_cast<size_t>(d)] != leaf->ne[d]) {
return false;
}
}
}
return true;
}
ggml_tensor* output_tensor(ggml_cgraph* gf, const Segment& segment, size_t output_index) {
GGML_ASSERT(gf != nullptr);
if (output_index >= segment.output_node_indices.size()) {
return nullptr;
}
int node_index = segment.output_node_indices[output_index];
if (node_index < 0 || node_index >= ggml_graph_n_nodes(gf)) {
return nullptr;
}
return ggml_graph_node(gf, node_index);
}
ggml_tensor* input_tensor(ggml_cgraph* gf, const Segment::InputRef& input_ref) {
GGML_ASSERT(gf != nullptr);
if (input_ref.type == Segment::INPUT_PREVIOUS_CUT) {
if (input_ref.node_index < 0 || input_ref.node_index >= ggml_graph_n_nodes(gf)) {
return nullptr;
}
return ggml_graph_node(gf, input_ref.node_index);
}
if (input_ref.leaf_index < 0 || input_ref.leaf_index >= gf->n_leafs) {
return nullptr;
}
return leaf_tensor(gf, input_ref.leaf_index);
}
std::vector<ggml_tensor*> param_tensors(ggml_cgraph* gf, const Segment& segment) {
GGML_ASSERT(gf != nullptr);
std::vector<ggml_tensor*> tensors;
std::unordered_set<ggml_tensor*> seen_tensors;
tensors.reserve(segment.input_refs.size());
seen_tensors.reserve(segment.input_refs.size());
for (const auto& input_ref : segment.input_refs) {
if (input_ref.type != Segment::INPUT_PARAM) {
continue;
}
ggml_tensor* tensor = input_tensor(gf, input_ref);
if (tensor == nullptr) {
continue;
}
if (seen_tensors.insert(tensor).second) {
tensors.push_back(tensor);
}
}
return tensors;
}
std::vector<ggml_tensor*> runtime_param_tensors(ggml_cgraph* gf, const Segment& segment, const char* log_desc) {
std::vector<ggml_tensor*> tensors = param_tensors(gf, segment);
std::vector<ggml_tensor*> filtered_tensors;
filtered_tensors.reserve(tensors.size());
for (ggml_tensor* tensor : tensors) {
if (tensor_buffer(tensor) == nullptr) {
LOG_WARN("%s graph cut skipping param input without buffer: segment=%s tensor=%s",
log_desc == nullptr ? "unknown" : log_desc,
segment.group_name.c_str(),
tensor->name);
continue;
}
filtered_tensors.push_back(tensor);
}
return filtered_tensors;
}
std::unordered_set<std::string> collect_future_input_names(ggml_cgraph* gf,
const Plan& plan,
size_t current_segment_index) {
GGML_ASSERT(gf != nullptr);
std::unordered_set<std::string> future_input_names;
for (size_t seg_idx = current_segment_index + 1; seg_idx < plan.segments.size(); ++seg_idx) {
const auto& segment = plan.segments[seg_idx];
for (const auto& input_ref : segment.input_refs) {
if (input_ref.type != Segment::INPUT_PREVIOUS_CUT) {
continue;
}
ggml_tensor* current_input = input_tensor(gf, input_ref);
if (current_input != nullptr && current_input->name[0] != '\0') {
future_input_names.insert(current_input->name);
}
}
}
return future_input_names;
}
ggml_cgraph* build_segment_graph(ggml_cgraph* gf,
const Segment& segment,
ggml_context** graph_ctx_out) {
GGML_ASSERT(gf != nullptr);
GGML_ASSERT(graph_ctx_out != nullptr);
const size_t graph_size = segment.internal_node_indices.size() + segment.input_refs.size() + 8;
ggml_init_params params = {
/*.mem_size =*/ggml_graph_overhead_custom(graph_size, false) + 1024,
/*.mem_buffer =*/nullptr,
/*.no_alloc =*/true,
};
ggml_context* graph_ctx = ggml_init(params);
GGML_ASSERT(graph_ctx != nullptr);
ggml_cgraph* segment_graph = ggml_new_graph_custom(graph_ctx, graph_size, false);
GGML_ASSERT(segment_graph != nullptr);
for (const auto& input : segment.input_refs) {
ggml_tensor* current_input = input_tensor(gf, input);
if (current_input == nullptr) {
continue;
}
GGML_ASSERT(segment_graph->n_leafs < segment_graph->size);
segment_graph->leafs[segment_graph->n_leafs++] = current_input;
}
for (int output_node_index : segment.output_node_indices) {
ggml_tensor* output = ggml_graph_node(gf, output_node_index);
if (output == nullptr) {
continue;
}
ggml_set_output(output);
}
for (int node_idx : segment.internal_node_indices) {
ggml_graph_add_node(segment_graph, ggml_graph_node(gf, node_idx));
}
*graph_ctx_out = graph_ctx;
return segment_graph;
}
size_t measure_segment_compute_buffer(ggml_backend_t backend,
ggml_cgraph* gf,
const Segment& segment,
const char* log_desc) {
GGML_ASSERT(backend != nullptr);
GGML_ASSERT(gf != nullptr);
if (segment.internal_node_indices.empty()) {
return 0;
}
ggml_context* graph_ctx = nullptr;
ggml_cgraph* segment_graph = build_segment_graph(gf, segment, &graph_ctx);
ggml_gallocr_t allocr = ggml_gallocr_new(ggml_backend_get_default_buffer_type(backend));
size_t sizes[1] = {0};
ggml_gallocr_reserve_n_size(
allocr,
segment_graph,
nullptr,
nullptr,
sizes);
size_t buffer_size = sizes[0];
ggml_gallocr_free(allocr);
ggml_free(graph_ctx);
return buffer_size;
}
Plan build_plan(ggml_backend_t backend,
ggml_cgraph* gf,
const std::unordered_set<const ggml_tensor*>& params_tensor_set,
const char* log_desc) {
GGML_ASSERT(backend != nullptr);
GGML_ASSERT(gf != nullptr);
Plan plan;
plan.available = true;
const int n_nodes = ggml_graph_n_nodes(gf);
if (n_nodes <= 0) {
return plan;
}
plan.n_nodes = n_nodes;
plan.n_leafs = gf->n_leafs;
for (int i = 0; i < gf->n_leafs; ++i) {
ggml_tensor* leaf = gf->leafs[i];
if (is_params_tensor(params_tensor_set, leaf)) {
continue;
}
auto shape = input_shape(leaf);
shape.leaf_index = i;
plan.input_shapes.push_back(shape);
}
std::unordered_map<const ggml_tensor*, int> producer_index;
producer_index.reserve(static_cast<size_t>(n_nodes));
for (int i = 0; i < n_nodes; ++i) {
producer_index[ggml_graph_node(gf, i)] = i;
}
std::vector<Segment> grouped_segments;
std::unordered_map<std::string, size_t> group_to_segment;
for (int i = 0; i < n_nodes; ++i) {
ggml_tensor* node = ggml_graph_node(gf, i);
if (!is_graph_cut_tensor(node)) {
continue;
}
plan.has_cuts = true;
std::string full_name(node->name);
std::string payload = full_name.substr(std::strlen(GGML_RUNNER_CUT_PREFIX));
size_t sep = payload.find('|');
std::string group = sep == std::string::npos ? payload : payload.substr(0, sep);
auto it = group_to_segment.find(group);
if (it == group_to_segment.end()) {
Segment segment;
segment.group_name = group;
segment.output_node_indices.push_back(i);
group_to_segment[group] = grouped_segments.size();
grouped_segments.push_back(std::move(segment));
} else {
auto& segment = grouped_segments[it->second];
segment.output_node_indices.push_back(i);
}
}
if (!plan.has_cuts) {
return plan;
}
std::unordered_set<int> available_cut_output_node_indices;
available_cut_output_node_indices.reserve(static_cast<size_t>(n_nodes));
for (auto& segment : grouped_segments) {
build_segment(gf,
plan,
segment,
producer_index,
available_cut_output_node_indices,
backend,
params_tensor_set,
log_desc);
}
ggml_tensor* final_output = ggml_graph_node(gf, -1);
if (final_output != nullptr && available_cut_output_node_indices.find(n_nodes - 1) == available_cut_output_node_indices.end()) {
Segment final_segment;
final_segment.group_name = "ggml_runner.final";
final_segment.output_node_indices.push_back(n_nodes - 1);
build_segment(gf,
plan,
final_segment,
producer_index,
available_cut_output_node_indices,
backend,
params_tensor_set,
log_desc);
}
return plan;
}
Plan apply_max_vram_budget(ggml_cgraph* gf,
const Plan& base_plan,
size_t max_graph_vram_bytes,
ggml_backend_t backend,
const std::unordered_set<const ggml_tensor*>& params_tensor_set,
const char* log_desc) {
GGML_ASSERT(backend != nullptr);
GGML_ASSERT(gf != nullptr);
int64_t t_budget_begin = ggml_time_ms();
if (max_graph_vram_bytes == 0 || !base_plan.has_cuts || base_plan.segments.size() <= 1) {
return base_plan;
}
const int n_nodes = ggml_graph_n_nodes(gf);
std::unordered_map<const ggml_tensor*, int> producer_index;
producer_index.reserve(static_cast<size_t>(n_nodes));
for (int i = 0; i < n_nodes; ++i) {
producer_index[ggml_graph_node(gf, i)] = i;
}
Plan merged_plan;
merged_plan.available = true;
merged_plan.has_cuts = base_plan.has_cuts;
merged_plan.valid = base_plan.valid;
merged_plan.n_nodes = base_plan.n_nodes;
merged_plan.n_leafs = base_plan.n_leafs;
std::unordered_set<int> available_cut_output_node_indices;
available_cut_output_node_indices.reserve(static_cast<size_t>(n_nodes));
size_t start_segment_index = 0;
while (start_segment_index < base_plan.segments.size()) {
Plan single_plan;
auto single_available_cut_output_node_indices = available_cut_output_node_indices;
auto single_seed = make_segment_seed(base_plan,
start_segment_index,
start_segment_index);
build_segment(gf,
single_plan,
single_seed,
producer_index,
single_available_cut_output_node_indices,
backend,
params_tensor_set,
log_desc);
GGML_ASSERT(!single_plan.segments.empty());
size_t best_end_segment_index = start_segment_index;
bool can_merge_next_segment = graph_cut_segment_vram_bytes(single_plan.segments.back()) <= max_graph_vram_bytes;
while (can_merge_next_segment && best_end_segment_index + 1 < base_plan.segments.size()) {
const size_t next_end_segment_index = best_end_segment_index + 1;
Plan candidate_plan;
auto candidate_available_cut_output_node_indices = available_cut_output_node_indices;
auto candidate_seed = make_segment_seed(base_plan,
start_segment_index,
next_end_segment_index);
build_segment(gf,
candidate_plan,
candidate_seed,
producer_index,
candidate_available_cut_output_node_indices,
backend,
params_tensor_set,
log_desc);
GGML_ASSERT(!candidate_plan.segments.empty());
const auto& candidate_segment = candidate_plan.segments.back();
if (graph_cut_segment_vram_bytes(candidate_segment) > max_graph_vram_bytes) {
break;
}
best_end_segment_index = next_end_segment_index;
}
auto best_seed = make_segment_seed(base_plan,
start_segment_index,
best_end_segment_index);
build_segment(gf,
merged_plan,
best_seed,
producer_index,
available_cut_output_node_indices,
backend,
params_tensor_set,
log_desc);
start_segment_index = best_end_segment_index + 1;
}
if (log_desc != nullptr && merged_plan.segments.size() != base_plan.segments.size()) {
LOG_INFO("%s graph cut max_vram=%.2f MB merged %zu segments -> %zu segments",
log_desc,
max_graph_vram_bytes / 1024.0 / 1024.0,
base_plan.segments.size(),
merged_plan.segments.size());
}
if (log_desc != nullptr) {
LOG_INFO("%s graph cut max_vram budget merge took %lld ms",
log_desc,
ggml_time_ms() - t_budget_begin);
}
return merged_plan;
}
Plan resolve_plan(ggml_backend_t backend,
ggml_cgraph* gf,
PlanCache* cache,
size_t max_graph_vram_bytes,
const std::unordered_set<const ggml_tensor*>& params_tensor_set,
const char* log_desc) {
GGML_ASSERT(backend != nullptr);
GGML_ASSERT(gf != nullptr);
GGML_ASSERT(cache != nullptr);
int64_t t_prepare_begin = ggml_time_ms();
Plan base_plan;
int64_t t_plan_begin = ggml_time_ms();
if (cache->graph_cut_plan.available && plan_matches_graph(gf, cache->graph_cut_plan)) {
base_plan = cache->graph_cut_plan;
} else {
base_plan = build_plan(backend, gf, params_tensor_set, log_desc);
cache->graph_cut_plan = base_plan;
cache->graph_cut_plan.available = true;
cache->budgeted_graph_cut_plan.available = false;
if (log_desc != nullptr) {
LOG_INFO("%s build cached graph cut plan done (taking %lld ms)", log_desc, ggml_time_ms() - t_plan_begin);
}
}
Plan resolved_plan = base_plan;
if (max_graph_vram_bytes > 0 && base_plan.has_cuts) {
if (cache->budgeted_graph_cut_plan.available &&
cache->budgeted_graph_cut_plan_max_vram_bytes == max_graph_vram_bytes &&
plan_matches_graph(gf, cache->budgeted_graph_cut_plan)) {
resolved_plan = cache->budgeted_graph_cut_plan;
} else {
resolved_plan = apply_max_vram_budget(gf,
base_plan,
max_graph_vram_bytes,
backend,
params_tensor_set,
log_desc);
cache->budgeted_graph_cut_plan = resolved_plan;
cache->budgeted_graph_cut_plan.available = true;
cache->budgeted_graph_cut_plan_max_vram_bytes = max_graph_vram_bytes;
}
}
return resolved_plan;
}
} // namespace sd::ggml_graph_cut
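// Hedged usage sketch (assumes the caller owns a built graph `gf`, a backend,
// a PlanCache that outlives repeated computes, and the set of weight tensors;
// collect_param_tensors() is a hypothetical helper):
//
//   sd::ggml_graph_cut::PlanCache cache;
//   std::unordered_set<const ggml_tensor*> params = collect_param_tensors();
//   sd::ggml_graph_cut::Plan plan = sd::ggml_graph_cut::resolve_plan(
//       backend, gf, &cache, max_graph_vram_bytes, params, "unet");
//   if (plan.has_cuts) {
//       // execute plan.segments one at a time, each within the VRAM budget
//   }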

View File

@ -1,104 +0,0 @@
#ifndef __SD_GGML_GRAPH_CUT_H__
#define __SD_GGML_GRAPH_CUT_H__
#include <array>
#include <string>
#include <unordered_set>
#include <vector>
#include "ggml-backend.h"
#include "ggml.h"
namespace sd::ggml_graph_cut {
struct Segment {
enum InputType {
INPUT_EXTERNAL = 0,
INPUT_PREVIOUS_CUT,
INPUT_PARAM,
};
struct InputRef {
InputType type = INPUT_EXTERNAL;
std::string display_name;
int leaf_index = -1;
int node_index = -1;
};
size_t compute_buffer_size = 0;
size_t output_bytes = 0;
size_t input_external_bytes = 0;
size_t input_previous_cut_bytes = 0;
size_t input_param_bytes = 0;
std::string group_name;
std::vector<int> internal_node_indices;
std::vector<int> output_node_indices;
std::vector<InputRef> input_refs;
};
struct Plan {
struct InputShape {
int leaf_index = -1;
ggml_type type = GGML_TYPE_COUNT;
std::array<int64_t, GGML_MAX_DIMS> ne = {0, 0, 0, 0};
};
bool available = false;
bool has_cuts = false;
bool valid = true;
int n_nodes = 0;
int n_leafs = 0;
std::vector<InputShape> input_shapes;
std::vector<Segment> segments;
};
struct PlanCache {
Plan graph_cut_plan;
Plan budgeted_graph_cut_plan;
size_t budgeted_graph_cut_plan_max_vram_bytes = 0;
};
static constexpr const char* GGML_RUNNER_CUT_PREFIX = "ggml_runner_cut:";
bool is_graph_cut_tensor(const ggml_tensor* tensor);
std::string make_graph_cut_name(const std::string& group, const std::string& output);
void mark_graph_cut(ggml_tensor* tensor, const std::string& group, const std::string& output);
int leaf_count(ggml_cgraph* gf);
ggml_tensor* leaf_tensor(ggml_cgraph* gf, int leaf_index);
ggml_backend_buffer_t tensor_buffer(const ggml_tensor* tensor);
ggml_tensor* cache_source_tensor(ggml_tensor* tensor);
size_t cache_tensor_bytes(const ggml_tensor* tensor);
bool plan_matches_graph(ggml_cgraph* gf, const Plan& plan);
ggml_tensor* output_tensor(ggml_cgraph* gf, const Segment& segment, size_t output_index);
ggml_tensor* input_tensor(ggml_cgraph* gf, const Segment::InputRef& input_ref);
std::vector<ggml_tensor*> param_tensors(ggml_cgraph* gf, const Segment& segment);
std::vector<ggml_tensor*> runtime_param_tensors(ggml_cgraph* gf, const Segment& segment, const char* log_desc);
std::unordered_set<std::string> collect_future_input_names(ggml_cgraph* gf,
const Plan& plan,
size_t current_segment_index);
ggml_cgraph* build_segment_graph(ggml_cgraph* gf,
const Segment& segment,
ggml_context** graph_ctx_out);
size_t measure_segment_compute_buffer(ggml_backend_t backend,
ggml_cgraph* gf,
const Segment& segment,
const char* log_desc);
Plan build_plan(ggml_backend_t backend,
ggml_cgraph* gf,
const std::unordered_set<const ggml_tensor*>& params_tensor_set,
const char* log_desc);
Plan apply_max_vram_budget(ggml_cgraph* gf,
const Plan& base_plan,
size_t max_graph_vram_bytes,
ggml_backend_t backend,
const std::unordered_set<const ggml_tensor*>& params_tensor_set,
const char* log_desc);
Plan resolve_plan(ggml_backend_t backend,
ggml_cgraph* gf,
PlanCache* cache,
size_t max_graph_vram_bytes,
const std::unordered_set<const ggml_tensor*>& params_tensor_set,
const char* log_desc);
} // namespace sd::ggml_graph_cut
#endif

View File

@ -1,5 +1,5 @@
#ifndef __SD_MODEL_IO_GGUF_READER_EXT_H__
#define __SD_MODEL_IO_GGUF_READER_EXT_H__
#ifndef __GGUF_READER_HPP__
#define __GGUF_READER_HPP__
#include <cstdint>
#include <fstream>
@ -231,4 +231,4 @@ public:
size_t data_offset() const { return data_offset_; }
};
#endif // __SD_MODEL_IO_GGUF_READER_EXT_H__
#endif // __GGUF_READER_HPP__

View File

@ -346,7 +346,6 @@ namespace LLM {
auto merger = std::dynamic_pointer_cast<PatchMerger>(blocks["merger"]);
auto x = patch_embed->forward(ctx, pixel_values);
sd::ggml_graph_cut::mark_graph_cut(x, "llm.vision.prelude", "x");
x = ggml_reshape_4d(ctx->ggml_ctx, x, x->ne[0] * spatial_merge_size * spatial_merge_size, x->ne[1] / spatial_merge_size / spatial_merge_size, x->ne[2], x->ne[3]);
x = ggml_get_rows(ctx->ggml_ctx, x, window_index);
@ -360,11 +359,9 @@ namespace LLM {
mask = nullptr;
}
x = block->forward(ctx, x, pe, mask);
sd::ggml_graph_cut::mark_graph_cut(x, "llm.vision.blocks." + std::to_string(i), "x");
}
x = merger->forward(ctx, x);
sd::ggml_graph_cut::mark_graph_cut(x, "llm.vision.final", "x");
x = ggml_get_rows(ctx->ggml_ctx, x, window_inverse_index);
@ -509,7 +506,6 @@ namespace LLM {
auto norm = std::dynamic_pointer_cast<RMSNorm>(blocks["norm"]);
auto x = embed_tokens->forward(ctx, input_ids);
sd::ggml_graph_cut::mark_graph_cut(x, "llm.text.prelude", "x");
std::vector<ggml_tensor*> intermediate_outputs;
@ -556,10 +552,6 @@ namespace LLM {
auto block = std::dynamic_pointer_cast<TransformerBlock>(blocks["layers." + std::to_string(i)]);
x = block->forward(ctx, x, input_pos, attention_mask);
if (out_layers.size() > 1) {
x = ggml_cont(ctx->ggml_ctx, x);
}
sd::ggml_graph_cut::mark_graph_cut(x, "llm.text.layers." + std::to_string(i), "x");
if (out_layers.find(i + 1) != out_layers.end()) {
intermediate_outputs.push_back(x);
}

View File

@ -129,7 +129,7 @@ struct LoraModel : public GGMLRunner {
}
}
ggml_tensor* get_lora_weight_diff(const std::string& model_tensor_name, ggml_context* ctx, ggml_backend_t backend) {
ggml_tensor* get_lora_weight_diff(const std::string& model_tensor_name, ggml_context* ctx) {
ggml_tensor* updown = nullptr;
int index = 0;
while (true) {
@ -152,17 +152,17 @@ struct LoraModel : public GGMLRunner {
auto iter = lora_tensors.find(lora_up_name);
if (iter != lora_tensors.end()) {
lora_up = ggml_ext_cast_f32(ctx, backend, iter->second);
lora_up = ggml_ext_cast_f32(ctx, iter->second);
}
iter = lora_tensors.find(lora_mid_name);
if (iter != lora_tensors.end()) {
lora_mid = ggml_ext_cast_f32(ctx, backend, iter->second);
lora_mid = ggml_ext_cast_f32(ctx, iter->second);
}
iter = lora_tensors.find(lora_down_name);
if (iter != lora_tensors.end()) {
lora_down = ggml_ext_cast_f32(ctx, backend, iter->second);
lora_down = ggml_ext_cast_f32(ctx, iter->second);
}
if (lora_up == nullptr || lora_down == nullptr) {
@ -208,7 +208,7 @@ struct LoraModel : public GGMLRunner {
return updown;
}
ggml_tensor* get_raw_weight_diff(const std::string& model_tensor_name, ggml_context* ctx, ggml_backend_t backend) {
ggml_tensor* get_raw_weight_diff(const std::string& model_tensor_name, ggml_context* ctx) {
ggml_tensor* updown = nullptr;
int index = 0;
while (true) {
@ -225,7 +225,7 @@ struct LoraModel : public GGMLRunner {
auto iter = lora_tensors.find(diff_name);
if (iter != lora_tensors.end()) {
curr_updown = ggml_ext_cast_f32(ctx, backend, iter->second);
curr_updown = ggml_ext_cast_f32(ctx, iter->second);
} else {
break;
}
@ -248,7 +248,7 @@ struct LoraModel : public GGMLRunner {
return updown;
}
ggml_tensor* get_loha_weight_diff(const std::string& model_tensor_name, ggml_context* ctx, ggml_backend_t backend) {
ggml_tensor* get_loha_weight_diff(const std::string& model_tensor_name, ggml_context* ctx) {
ggml_tensor* updown = nullptr;
int index = 0;
while (true) {
@ -276,33 +276,33 @@ struct LoraModel : public GGMLRunner {
auto iter = lora_tensors.find(hada_1_down_name);
if (iter != lora_tensors.end()) {
hada_1_down = ggml_ext_cast_f32(ctx, backend, iter->second);
hada_1_down = ggml_ext_cast_f32(ctx, iter->second);
}
iter = lora_tensors.find(hada_1_up_name);
if (iter != lora_tensors.end()) {
hada_1_up = ggml_ext_cast_f32(ctx, backend, iter->second);
hada_1_up = ggml_ext_cast_f32(ctx, iter->second);
}
iter = lora_tensors.find(hada_1_mid_name);
if (iter != lora_tensors.end()) {
hada_1_mid = ggml_ext_cast_f32(ctx, backend, iter->second);
hada_1_mid = ggml_ext_cast_f32(ctx, iter->second);
hada_1_up = ggml_cont(ctx, ggml_transpose(ctx, hada_1_up));
}
iter = lora_tensors.find(hada_2_down_name);
if (iter != lora_tensors.end()) {
hada_2_down = ggml_ext_cast_f32(ctx, backend, iter->second);
hada_2_down = ggml_ext_cast_f32(ctx, iter->second);
}
iter = lora_tensors.find(hada_2_up_name);
if (iter != lora_tensors.end()) {
hada_2_up = ggml_ext_cast_f32(ctx, backend, iter->second);
hada_2_up = ggml_ext_cast_f32(ctx, iter->second);
}
iter = lora_tensors.find(hada_2_mid_name);
if (iter != lora_tensors.end()) {
hada_2_mid = ggml_ext_cast_f32(ctx, backend, iter->second);
hada_2_mid = ggml_ext_cast_f32(ctx, iter->second);
hada_2_up = ggml_cont(ctx, ggml_transpose(ctx, hada_2_up));
}
@ -351,7 +351,7 @@ struct LoraModel : public GGMLRunner {
return updown;
}
ggml_tensor* get_lokr_weight_diff(const std::string& model_tensor_name, ggml_context* ctx, ggml_backend_t backend) {
ggml_tensor* get_lokr_weight_diff(const std::string& model_tensor_name, ggml_context* ctx) {
ggml_tensor* updown = nullptr;
int index = 0;
while (true) {
@ -378,24 +378,24 @@ struct LoraModel : public GGMLRunner {
auto iter = lora_tensors.find(lokr_w1_name);
if (iter != lora_tensors.end()) {
lokr_w1 = ggml_ext_cast_f32(ctx, backend, iter->second);
lokr_w1 = ggml_ext_cast_f32(ctx, iter->second);
}
iter = lora_tensors.find(lokr_w2_name);
if (iter != lora_tensors.end()) {
lokr_w2 = ggml_ext_cast_f32(ctx, backend, iter->second);
lokr_w2 = ggml_ext_cast_f32(ctx, iter->second);
}
int64_t rank = 1;
if (lokr_w1 == nullptr) {
iter = lora_tensors.find(lokr_w1_a_name);
if (iter != lora_tensors.end()) {
lokr_w1_a = ggml_ext_cast_f32(ctx, backend, iter->second);
lokr_w1_a = ggml_ext_cast_f32(ctx, iter->second);
}
iter = lora_tensors.find(lokr_w1_b_name);
if (iter != lora_tensors.end()) {
lokr_w1_b = ggml_ext_cast_f32(ctx, backend, iter->second);
lokr_w1_b = ggml_ext_cast_f32(ctx, iter->second);
}
if (lokr_w1_a == nullptr || lokr_w1_b == nullptr) {
@ -410,12 +410,12 @@ struct LoraModel : public GGMLRunner {
if (lokr_w2 == nullptr) {
iter = lora_tensors.find(lokr_w2_a_name);
if (iter != lora_tensors.end()) {
lokr_w2_a = ggml_ext_cast_f32(ctx, backend, iter->second);
lokr_w2_a = ggml_ext_cast_f32(ctx, iter->second);
}
iter = lora_tensors.find(lokr_w2_b_name);
if (iter != lora_tensors.end()) {
lokr_w2_b = ggml_ext_cast_f32(ctx, backend, iter->second);
lokr_w2_b = ggml_ext_cast_f32(ctx, iter->second);
}
if (lokr_w2_a == nullptr || lokr_w2_b == nullptr) {
@ -468,23 +468,23 @@ struct LoraModel : public GGMLRunner {
return updown;
}
ggml_tensor* get_weight_diff(const std::string& model_tensor_name, ggml_backend_t backend, ggml_context* ctx, ggml_tensor* model_tensor, bool with_lora_and_lokr = true) {
ggml_tensor* get_weight_diff(const std::string& model_tensor_name, ggml_context* ctx, ggml_tensor* model_tensor, bool with_lora_and_lokr = true) {
// lora
ggml_tensor* diff = nullptr;
if (with_lora_and_lokr) {
diff = get_lora_weight_diff(model_tensor_name, ctx, backend);
diff = get_lora_weight_diff(model_tensor_name, ctx);
}
// diff
if (diff == nullptr) {
diff = get_raw_weight_diff(model_tensor_name, ctx, backend);
diff = get_raw_weight_diff(model_tensor_name, ctx);
}
// loha
if (diff == nullptr) {
diff = get_loha_weight_diff(model_tensor_name, ctx, backend);
diff = get_loha_weight_diff(model_tensor_name, ctx);
}
// lokr
if (diff == nullptr && with_lora_and_lokr) {
diff = get_lokr_weight_diff(model_tensor_name, ctx, backend);
diff = get_lokr_weight_diff(model_tensor_name, ctx);
}
if (diff != nullptr) {
if (ggml_nelements(diff) < ggml_nelements(model_tensor)) {
@ -502,7 +502,6 @@ struct LoraModel : public GGMLRunner {
}
ggml_tensor* get_out_diff(ggml_context* ctx,
ggml_backend_t backend,
ggml_tensor* x,
WeightAdapter::ForwardParams forward_params,
const std::string& model_tensor_name) {
@ -591,7 +590,7 @@ struct LoraModel : public GGMLRunner {
}
scale_value *= multiplier;
auto curr_out_diff = ggml_ext_lokr_forward(ctx, backend, x, lokr_w1, lokr_w1_a, lokr_w1_b, lokr_w2, lokr_w2_a, lokr_w2_b, is_conv2d, forward_params.conv2d, scale_value);
auto curr_out_diff = ggml_ext_lokr_forward(ctx, x, lokr_w1, lokr_w1_a, lokr_w1_b, lokr_w2, lokr_w2_a, lokr_w2_b, is_conv2d, forward_params.conv2d, scale_value);
if (out_diff == nullptr) {
out_diff = curr_out_diff;
} else {
@ -762,7 +761,7 @@ struct LoraModel : public GGMLRunner {
ggml_tensor* model_tensor = it.second;
// lora
ggml_tensor* diff = get_weight_diff(model_tensor_name, runtime_backend, compute_ctx, model_tensor);
ggml_tensor* diff = get_weight_diff(model_tensor_name, compute_ctx, model_tensor);
if (diff == nullptr) {
continue;
}
@ -775,7 +774,7 @@ struct LoraModel : public GGMLRunner {
ggml_tensor* final_tensor;
if (model_tensor->type != GGML_TYPE_F32 && model_tensor->type != GGML_TYPE_F16) {
final_tensor = ggml_ext_cast_f32(compute_ctx, runtime_backend, model_tensor);
final_tensor = ggml_ext_cast_f32(compute_ctx, model_tensor);
final_tensor = ggml_add_inplace(compute_ctx, final_tensor, diff);
final_tensor = ggml_cpy(compute_ctx, final_tensor, model_tensor);
} else {
@ -842,35 +841,34 @@ public:
: lora_models(lora_models) {
}
ggml_tensor* patch_weight(ggml_context* ctx, ggml_backend_t backend, ggml_tensor* weight, const std::string& weight_name, bool with_lora_and_lokr) {
ggml_tensor* patch_weight(ggml_context* ctx, ggml_tensor* weight, const std::string& weight_name, bool with_lora_and_lokr) {
for (auto& lora_model : lora_models) {
ggml_tensor* diff = lora_model->get_weight_diff(weight_name, backend, ctx, weight, with_lora_and_lokr);
ggml_tensor* diff = lora_model->get_weight_diff(weight_name, ctx, weight, with_lora_and_lokr);
if (diff == nullptr) {
continue;
}
if (weight->type != GGML_TYPE_F32 && weight->type != GGML_TYPE_F16) {
weight = ggml_ext_cast_f32(ctx, backend, weight);
weight = ggml_ext_cast_f32(ctx, weight);
}
weight = ggml_add(ctx, weight, diff);
}
return weight;
}
ggml_tensor* patch_weight(ggml_context* ctx, ggml_backend_t backend, ggml_tensor* weight, const std::string& weight_name) override {
return patch_weight(ctx, backend, weight, weight_name, true);
ggml_tensor* patch_weight(ggml_context* ctx, ggml_tensor* weight, const std::string& weight_name) override {
return patch_weight(ctx, weight, weight_name, true);
}
ggml_tensor* forward_with_lora(ggml_context* ctx,
ggml_backend_t backend,
ggml_tensor* x,
ggml_tensor* w,
ggml_tensor* b,
const std::string& prefix,
WeightAdapter::ForwardParams forward_params) override {
w = patch_weight(ctx, backend, w, prefix + "weight", false);
w = patch_weight(ctx, w, prefix + "weight", false);
if (b) {
b = patch_weight(ctx, backend, b, prefix + "bias", false);
b = patch_weight(ctx, b, prefix + "bias", false);
}
ggml_tensor* out;
if (forward_params.op_type == ForwardParams::op_type_t::OP_LINEAR) {
@ -892,7 +890,7 @@ public:
forward_params.conv2d.scale);
}
for (auto& lora_model : lora_models) {
ggml_tensor* out_diff = lora_model->get_out_diff(ctx, backend, x, forward_params, prefix + "weight");
ggml_tensor* out_diff = lora_model->get_out_diff(ctx, x, forward_params, prefix + "weight");
if (out_diff == nullptr) {
continue;
}

View File

@ -767,8 +767,6 @@ public:
auto context_x = block->forward(ctx, context, x, c_mod);
context = context_x.first;
x = context_x.second;
sd::ggml_graph_cut::mark_graph_cut(context, "mmdit.joint_blocks." + std::to_string(i), "context");
sd::ggml_graph_cut::mark_graph_cut(x, "mmdit.joint_blocks." + std::to_string(i), "x");
}
x = final_layer->forward(ctx, x, c_mod); // (N, T, patch_size ** 2 * out_channels)
@ -811,11 +809,6 @@ public:
context = context_embedder->forward(ctx, context); // [N, L, D] aka [N, L, 1536]
}
sd::ggml_graph_cut::mark_graph_cut(x, "mmdit.prelude", "x");
sd::ggml_graph_cut::mark_graph_cut(c, "mmdit.prelude", "c");
if (context != nullptr) {
sd::ggml_graph_cut::mark_graph_cut(context, "mmdit.prelude", "context");
}
x = forward_core_with_concat(ctx, x, c, context, skip_layers); // (N, H*W, patch_size ** 2 * out_channels)

View File

@ -2,7 +2,6 @@
#include <atomic>
#include <chrono>
#include <cstdarg>
#include <cstdlib>
#include <fstream>
#include <functional>
#include <mutex>
@ -13,21 +12,64 @@
#include <unordered_map>
#include <vector>
#include "gguf_reader.hpp"
#include "model.h"
#include "model_io/gguf_io.h"
#include "model_io/safetensors_io.h"
#include "model_io/torch_legacy_io.h"
#include "model_io/torch_zip_io.h"
#include "stable-diffusion.h"
#include "util.h"
#include "ggml-alloc.h"
#include "ggml-backend.h"
#include "ggml-cpu.h"
#include "ggml.h"
#include "ggml_extend_backend.hpp"
#include "zip.h"
#include "name_conversion.h"
#include "stable-diffusion.h"
#ifdef SD_USE_METAL
#include "ggml-metal.h"
#endif
#ifdef SD_USE_VULKAN
#include "ggml-vulkan.h"
#endif
#ifdef SD_USE_OPENCL
#include "ggml-opencl.h"
#endif
#define ST_HEADER_SIZE_LEN 8
uint64_t read_u64(uint8_t* buffer) {
// little endian
uint64_t value = 0;
value |= static_cast<uint64_t>(buffer[7]) << 56;
value |= static_cast<uint64_t>(buffer[6]) << 48;
value |= static_cast<uint64_t>(buffer[5]) << 40;
value |= static_cast<uint64_t>(buffer[4]) << 32;
value |= static_cast<uint64_t>(buffer[3]) << 24;
value |= static_cast<uint64_t>(buffer[2]) << 16;
value |= static_cast<uint64_t>(buffer[1]) << 8;
value |= static_cast<uint64_t>(buffer[0]);
return value;
}
int32_t read_int(uint8_t* buffer) {
// little endian
int32_t value = 0;
value |= buffer[3] << 24;
value |= buffer[2] << 16;
value |= buffer[1] << 8;
value |= buffer[0];
return value;
}
uint16_t read_short(uint8_t* buffer) {
// little endian
uint16_t value = 0;
value |= buffer[1] << 8;
value |= buffer[0];
return value;
}
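// These helpers decode little-endian integers byte by byte, so they stay
// correct on big-endian hosts where reinterpreting the raw buffer would not.
// Illustrative self-check (not in the original source):
//
//   uint8_t buf[8] = {0x08, 0, 0, 0, 0, 0, 0, 0};
//   assert(read_u64(buf) == 8);    // e.g. a safetensors header length of 8
//   assert(read_int(buf) == 8);
//   assert(read_short(buf) == 8);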
/*================================================= Preprocess ==================================================*/
@ -68,7 +110,7 @@ const char* unused_tensors[] = {
"first_stage_model.bn.",
};
bool is_unused_tensor(const std::string& name) {
bool is_unused_tensor(std::string name) {
for (size_t i = 0; i < sizeof(unused_tensors) / sizeof(const char*); i++) {
if (starts_with(name, unused_tensors[i])) {
return true;
@ -208,6 +250,79 @@ void ModelLoader::add_tensor_storage(const TensorStorage& tensor_storage) {
tensor_storage_map[tensor_storage.name] = tensor_storage;
}
bool is_zip_file(const std::string& file_path) {
zip_t* zip = zip_open(file_path.c_str(), 0, 'r');
if (zip == nullptr) {
return false;
}
zip_close(zip);
return true;
}
bool is_gguf_file(const std::string& file_path) {
std::ifstream file(file_path, std::ios::binary);
if (!file.is_open()) {
return false;
}
char magic[4];
file.read(magic, sizeof(magic));
if (!file) {
return false;
}
for (uint32_t i = 0; i < sizeof(magic); i++) {
if (magic[i] != GGUF_MAGIC[i]) {
return false;
}
}
return true;
}
bool is_safetensors_file(const std::string& file_path) {
std::ifstream file(file_path, std::ios::binary);
if (!file.is_open()) {
return false;
}
// get file size
file.seekg(0, file.end);
size_t file_size_ = file.tellg();
file.seekg(0, file.beg);
// read header size
if (file_size_ <= ST_HEADER_SIZE_LEN) {
return false;
}
uint8_t header_size_buf[ST_HEADER_SIZE_LEN];
file.read((char*)header_size_buf, ST_HEADER_SIZE_LEN);
if (!file) {
return false;
}
size_t header_size_ = read_u64(header_size_buf);
if (header_size_ >= file_size_ || header_size_ <= 2) {
return false;
}
// read header
std::vector<char> header_buf;
header_buf.resize(header_size_ + 1);
header_buf[header_size_] = '\0';
file.read(header_buf.data(), header_size_);
if (!file) {
return false;
}
try {
nlohmann::json header_ = nlohmann::json::parse(header_buf.data());
} catch (const std::exception&) {
return false;
}
return true;
}
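// For reference, the on-disk layout probed above (see
// https://huggingface.co/docs/safetensors/index) is:
//
//   [u64 little-endian header_size][header_size bytes of JSON][raw tensor data]
//
// so a file qualifies when the first 8 bytes decode to a plausible header
// length and the bytes that follow parse as JSON.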
bool ModelLoader::init_from_file(const std::string& file_path, const std::string& prefix) {
if (is_directory(file_path)) {
LOG_INFO("load %s using diffusers format", file_path.c_str());
@ -218,12 +333,9 @@ bool ModelLoader::init_from_file(const std::string& file_path, const std::string
} else if (is_safetensors_file(file_path)) {
LOG_INFO("load %s using safetensors format", file_path.c_str());
return init_from_safetensors_file(file_path, prefix);
} else if (is_torch_zip_file(file_path)) {
LOG_INFO("load %s using torch zip format", file_path.c_str());
return init_from_torch_zip_file(file_path, prefix);
} else if (init_from_torch_legacy_file(file_path, prefix)) {
LOG_INFO("load %s using torch legacy format", file_path.c_str());
return true;
} else if (is_zip_file(file_path)) {
LOG_INFO("load %s using checkpoint format", file_path.c_str());
return init_from_ckpt_file(file_path, prefix);
} else {
if (file_exists(file_path)) {
LOG_WARN("unknown format %s", file_path.c_str());
@ -263,24 +375,37 @@ bool ModelLoader::init_from_file_and_convert_name(const std::string& file_path,
bool ModelLoader::init_from_gguf_file(const std::string& file_path, const std::string& prefix) {
LOG_DEBUG("init from '%s'", file_path.c_str());
std::vector<TensorStorage> tensor_storages;
std::string error;
if (!read_gguf_file(file_path, tensor_storages, &error)) {
LOG_ERROR("%s", error.c_str());
return false;
}
file_paths_.push_back(file_path);
size_t file_index = file_paths_.size() - 1;
for (auto& tensor_storage : tensor_storages) {
// LOG_DEBUG("%s", tensor_storage.name.c_str());
gguf_context* ctx_gguf_ = nullptr;
ggml_context* ctx_meta_ = nullptr;
if (!starts_with(tensor_storage.name, prefix)) {
tensor_storage.name = prefix + tensor_storage.name;
ctx_gguf_ = gguf_init_from_file(file_path.c_str(), {true, &ctx_meta_});
if (!ctx_gguf_) {
LOG_ERROR("failed to open '%s' with gguf_init_from_file. Try to open it with GGUFReader.", file_path.c_str());
GGUFReader gguf_reader;
if (!gguf_reader.load(file_path)) {
LOG_ERROR("failed to open '%s' with GGUFReader.", file_path.c_str());
return false;
}
tensor_storage.file_index = file_index;
size_t data_offset = gguf_reader.data_offset();
for (const auto& gguf_tensor_info : gguf_reader.tensors()) {
std::string name = gguf_tensor_info.name;
if (!starts_with(name, prefix)) {
name = prefix + name;
}
TensorStorage tensor_storage(
name,
gguf_tensor_info.type,
gguf_tensor_info.shape.data(),
static_cast<int>(gguf_tensor_info.shape.size()),
file_index,
data_offset + gguf_tensor_info.offset);
// LOG_DEBUG("%s %s", name.c_str(), tensor_storage.to_string().c_str());
add_tensor_storage(tensor_storage);
}
@ -288,96 +413,204 @@ bool ModelLoader::init_from_gguf_file(const std::string& file_path, const std::s
return true;
}
int n_tensors = static_cast<int>(gguf_get_n_tensors(ctx_gguf_));
size_t total_size = 0;
size_t data_offset = gguf_get_data_offset(ctx_gguf_);
for (int i = 0; i < n_tensors; i++) {
std::string name = gguf_get_tensor_name(ctx_gguf_, i);
ggml_tensor* dummy = ggml_get_tensor(ctx_meta_, name.c_str());
size_t offset = data_offset + gguf_get_tensor_offset(ctx_gguf_, i);
// LOG_DEBUG("%s", name.c_str());
if (!starts_with(name, prefix)) {
name = prefix + name;
}
TensorStorage tensor_storage(name, dummy->type, dummy->ne, ggml_n_dims(dummy), file_index, offset);
GGML_ASSERT(ggml_nbytes(dummy) == tensor_storage.nbytes());
add_tensor_storage(tensor_storage);
}
gguf_free(ctx_gguf_);
ggml_free(ctx_meta_);
return true;
}
/*================================================= SafeTensorsModelLoader ==================================================*/
ggml_type str_to_ggml_type(const std::string& dtype) {
ggml_type ttype = GGML_TYPE_COUNT;
if (dtype == "F16") {
ttype = GGML_TYPE_F16;
} else if (dtype == "BF16") {
ttype = GGML_TYPE_BF16;
} else if (dtype == "F32") {
ttype = GGML_TYPE_F32;
} else if (dtype == "F64") {
ttype = GGML_TYPE_F32;
} else if (dtype == "F8_E4M3") {
ttype = GGML_TYPE_F16;
} else if (dtype == "F8_E5M2") {
ttype = GGML_TYPE_F16;
} else if (dtype == "I64") {
ttype = GGML_TYPE_I32;
}
return ttype;
}
// https://huggingface.co/docs/safetensors/index
bool ModelLoader::init_from_safetensors_file(const std::string& file_path, const std::string& prefix) {
LOG_DEBUG("init from '%s', prefix = '%s'", file_path.c_str(), prefix.c_str());
std::vector<TensorStorage> tensor_storages;
std::string error;
if (!read_safetensors_file(file_path, tensor_storages, &error)) {
LOG_ERROR("%s", error.c_str());
file_paths_.push_back(file_path);
size_t file_index = file_paths_.size() - 1;
std::ifstream file(file_path, std::ios::binary);
if (!file.is_open()) {
LOG_ERROR("failed to open '%s'", file_path.c_str());
file_paths_.pop_back();
return false;
}
file_paths_.push_back(file_path);
size_t file_index = file_paths_.size() - 1;
// get file size
file.seekg(0, file.end);
size_t file_size_ = file.tellg();
file.seekg(0, file.beg);
for (auto& tensor_storage : tensor_storages) {
if (is_unused_tensor(tensor_storage.name)) {
// read header size
if (file_size_ <= ST_HEADER_SIZE_LEN) {
LOG_ERROR("invalid safetensor file '%s'", file_path.c_str());
file_paths_.pop_back();
return false;
}
uint8_t header_size_buf[ST_HEADER_SIZE_LEN];
file.read((char*)header_size_buf, ST_HEADER_SIZE_LEN);
if (!file) {
LOG_ERROR("read safetensors header size failed: '%s'", file_path.c_str());
file_paths_.pop_back();
return false;
}
size_t header_size_ = read_u64(header_size_buf);
if (header_size_ >= file_size_) {
LOG_ERROR("invalid safetensor file '%s'", file_path.c_str());
file_paths_.pop_back();
return false;
}
// read header
std::vector<char> header_buf;
header_buf.resize(header_size_ + 1);
header_buf[header_size_] = '\0';
file.read(header_buf.data(), header_size_);
if (!file) {
LOG_ERROR("read safetensors header failed: '%s'", file_path.c_str());
file_paths_.pop_back();
return false;
}
nlohmann::json header_;
try {
header_ = nlohmann::json::parse(header_buf.data());
} catch (const std::exception&) {
LOG_ERROR("parsing safetensors header failed", file_path.c_str());
file_paths_.pop_back();
return false;
}
for (auto& item : header_.items()) {
std::string name = item.key();
nlohmann::json tensor_info = item.value();
// LOG_DEBUG("%s %s\n", name.c_str(), tensor_info.dump().c_str());
if (name == "__metadata__") {
continue;
}
if (!starts_with(tensor_storage.name, prefix)) {
tensor_storage.name = prefix + tensor_storage.name;
}
tensor_storage.file_index = file_index;
add_tensor_storage(tensor_storage);
// LOG_DEBUG("%s", tensor_storage.to_string().c_str());
}
return true;
}
/*================================================= TorchLegacyModelLoader ==================================================*/
bool ModelLoader::init_from_torch_legacy_file(const std::string& file_path, const std::string& prefix) {
LOG_DEBUG("init from torch legacy '%s'", file_path.c_str());
std::vector<TensorStorage> tensor_storages;
std::string error;
if (!read_torch_legacy_file(file_path, tensor_storages, &error)) {
if ((!error.empty()) && (ends_with(file_path, ".pt") || ends_with(file_path, ".pth"))) {
LOG_WARN("%s", error.c_str());
}
return false;
}
file_paths_.push_back(file_path);
size_t file_index = file_paths_.size() - 1;
for (auto& tensor_storage : tensor_storages) {
if (is_unused_tensor(tensor_storage.name)) {
if (is_unused_tensor(name)) {
continue;
}
if (!starts_with(tensor_storage.name, prefix)) {
tensor_storage.name = prefix + tensor_storage.name;
}
tensor_storage.file_index = file_index;
std::string dtype = tensor_info["dtype"];
nlohmann::json shape = tensor_info["shape"];
add_tensor_storage(tensor_storage);
if (dtype == "U8") {
continue;
}
return true;
}
size_t begin = tensor_info["data_offsets"][0].get<size_t>();
size_t end = tensor_info["data_offsets"][1].get<size_t>();
/*================================================= TorchZipModelLoader ==================================================*/
bool ModelLoader::init_from_torch_zip_file(const std::string& file_path, const std::string& prefix) {
LOG_DEBUG("init from '%s'", file_path.c_str());
std::vector<TensorStorage> tensor_storages;
std::string error;
if (!read_torch_zip_file(file_path, tensor_storages, &error)) {
LOG_ERROR("%s", error.c_str());
ggml_type type = str_to_ggml_type(dtype);
if (type == GGML_TYPE_COUNT) {
LOG_ERROR("unsupported dtype '%s' (tensor '%s')", dtype.c_str(), name.c_str());
return false;
}
file_paths_.push_back(file_path);
size_t file_index = file_paths_.size() - 1;
for (auto& tensor_storage : tensor_storages) {
if (!starts_with(tensor_storage.name, prefix)) {
tensor_storage.name = prefix + tensor_storage.name;
if (shape.size() > SD_MAX_DIMS) {
LOG_ERROR("invalid tensor '%s'", name.c_str());
return false;
}
int n_dims = (int)shape.size();
int64_t ne[SD_MAX_DIMS] = {1, 1, 1, 1, 1};
for (int i = 0; i < n_dims; i++) {
ne[i] = shape[i].get<int64_t>();
}
if (n_dims == 5) {
n_dims = 4;
ne[0] = ne[0] * ne[1];
ne[1] = ne[2];
ne[2] = ne[3];
ne[3] = ne[4];
}
// ggml_n_dims returns 1 for scalars
if (n_dims == 0) {
n_dims = 1;
}
if (!starts_with(name, prefix)) {
name = prefix + name;
}
TensorStorage tensor_storage(name, type, ne, n_dims, file_index, ST_HEADER_SIZE_LEN + header_size_ + begin);
tensor_storage.reverse_ne();
size_t tensor_data_size = end - begin;
bool tensor_size_ok;
if (dtype == "F8_E4M3") {
tensor_storage.is_f8_e4m3 = true;
// f8 -> f16
tensor_size_ok = (tensor_storage.nbytes() == tensor_data_size * 2);
} else if (dtype == "F8_E5M2") {
tensor_storage.is_f8_e5m2 = true;
// f8 -> f16
tensor_size_ok = (tensor_storage.nbytes() == tensor_data_size * 2);
} else if (dtype == "F64") {
tensor_storage.is_f64 = true;
// f64 -> f32
tensor_size_ok = (tensor_storage.nbytes() * 2 == tensor_data_size);
} else if (dtype == "I64") {
tensor_storage.is_i64 = true;
// i64 -> i32
tensor_size_ok = (tensor_storage.nbytes() * 2 == tensor_data_size);
} else {
tensor_size_ok = (tensor_storage.nbytes() == tensor_data_size);
}
if (!tensor_size_ok) {
LOG_ERROR("size mismatch for tensor '%s' (%s)\n", name.c_str(), dtype.c_str());
return false;
}
tensor_storage.file_index = file_index;
add_tensor_storage(tensor_storage);
// LOG_DEBUG("%s", tensor_storage.to_string().c_str());
// LOG_DEBUG("%s %s", tensor_storage.to_string().c_str(), dtype.c_str());
}
return true;
@ -409,6 +642,367 @@ bool ModelLoader::init_from_diffusers_file(const std::string& file_path, const s
return true;
}
/*================================================= CkptModelLoader ==================================================*/
// $ python -m pickletools sd-v1-4/archive/data.pkl | head -n 100
// 0: \x80 PROTO 2
// 2: } EMPTY_DICT
// 3: q BINPUT 0
// 5: ( MARK
// 6: X BINUNICODE 'epoch'
// 16: q BINPUT 1
// 18: K BININT1 6
// 20: X BINUNICODE 'global_step'
// 36: q BINPUT 2
// 38: J BININT 470000
// 43: X BINUNICODE 'pytorch-lightning_version'
// 73: q BINPUT 3
// 75: X BINUNICODE '1.4.2'
// 85: q BINPUT 4
// 87: X BINUNICODE 'state_dict'
// 102: q BINPUT 5
// 104: } EMPTY_DICT
// 105: q BINPUT 6
// 107: ( MARK
// 108: X BINUNICODE 'betas'
// 118: q BINPUT 7
// 120: c GLOBAL 'torch._utils _rebuild_tensor_v2'
// 153: q BINPUT 8
// 155: ( MARK
// 156: ( MARK
// 157: X BINUNICODE 'storage'
// 169: q BINPUT 9
// 171: c GLOBAL 'torch FloatStorage'
// 191: q BINPUT 10
// 193: X BINUNICODE '0'
// 199: q BINPUT 11
// 201: X BINUNICODE 'cpu'
// 209: q BINPUT 12
// 211: M BININT2 1000
// 214: t TUPLE (MARK at 156)
// 215: q BINPUT 13
// 217: Q BINPERSID
// 218: K BININT1 0
// 220: M BININT2 1000
// ...............................
// 3201: q BINPUT 250
// 3203: R REDUCE
// 3204: q BINPUT 251
// 3206: X BINUNICODE 'model.diffusion_model.input_blocks.1.1.proj_in.weight'
// 3264: q BINPUT 252
// 3266: h BINGET 8
// 3268: ( MARK
// 3269: ( MARK
// 3270: h BINGET 9
// 3272: h BINGET 10
// 3274: X BINUNICODE '30'
// 3281: q BINPUT 253
// 3283: h BINGET 12
// 3285: J BININT 102400
// 3290: t TUPLE (MARK at 3269)
// 3291: q BINPUT 254
// 3293: Q BINPERSID
// 3294: K BININT1 0
// 3296: ( MARK
// 3297: M BININT2 320
// 3300: M BININT2 320
// 3303: K BININT1 1
// 3305: K BININT1 1
// 3307: t TUPLE (MARK at 3296)
// 3308: q BINPUT 255
// 3310: ( MARK
// 3311: M BININT2 320
// 3314: K BININT1 1
// 3316: K BININT1 1
// 3318: K BININT1 1
// 3320: t TUPLE (MARK at 3310)
// 3321: r LONG_BINPUT 256
// 3326: \x89 NEWFALSE
// 3327: h BINGET 16
// 3329: ) EMPTY_TUPLE
// 3330: R REDUCE
// 3331: r LONG_BINPUT 257
// 3336: t TUPLE (MARK at 3268)
// 3337: r LONG_BINPUT 258
// 3342: R REDUCE
// 3343: r LONG_BINPUT 259
// 3348: X BINUNICODE 'model.diffusion_model.input_blocks.1.1.proj_in.bias'
// 3404: r LONG_BINPUT 260
// 3409: h BINGET 8
// 3411: ( MARK
// 3412: ( MARK
// 3413: h BINGET 9
// 3415: h BINGET 10
// 3417: X BINUNICODE '31'
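// The reader below walks this opcode stream as a small state machine:
// BINUNICODE strings drive READ_NAME -> READ_DATA (resolving the "data/N" zip
// entry), the storage element count moves CHECK_SIZE -> READ_DIMENS, and the
// closing TUPLE opcode finalizes one TensorStorage entry.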
struct PickleTensorReader {
enum ReadPhase {
READ_NAME,
READ_DATA,
CHECK_SIZE,
READ_DIMENS
};
ReadPhase phase = READ_NAME;
size_t entry_size = 0;
int32_t nelements = 0;
TensorStorage tensor_storage;
static ggml_type global_type; // data type shared by all pickle tensors
static bool read_global_type;
bool read_int_value(uint32_t value) {
if (phase == CHECK_SIZE) {
if (entry_size == value * ggml_type_size(tensor_storage.type)) {
nelements = value;
phase = READ_DIMENS;
return true;
} else {
phase = READ_NAME;
}
} else if (phase == READ_DIMENS) {
if (tensor_storage.n_dims + 1 > SD_MAX_DIMS) { // too many dimensions
phase = READ_NAME;
tensor_storage.n_dims = 0;
}
if (nelements % value == 0) {
tensor_storage.ne[tensor_storage.n_dims] = value;
tensor_storage.n_dims++;
}
}
return false;
}
void read_global(const std::string& str) {
if (str == "FloatStorage") {
if (read_global_type) {
global_type = GGML_TYPE_F32;
read_global_type = false;
}
tensor_storage.type = GGML_TYPE_F32;
} else if (str == "HalfStorage") {
if (read_global_type) {
global_type = GGML_TYPE_F16;
read_global_type = false;
}
tensor_storage.type = GGML_TYPE_F16;
}
}
void read_string(const std::string& str, zip_t* zip, std::string dir) {
if (str == "storage") {
read_global_type = true;
} else if (str != "state_dict") {
if (phase == READ_DATA) {
std::string entry_name = dir + "data/" + std::string(str);
size_t i, n = zip_entries_total(zip);
for (i = 0; i < n; ++i) {
zip_entry_openbyindex(zip, i);
{
std::string name = zip_entry_name(zip);
if (name == entry_name) {
tensor_storage.index_in_zip = (int)i;
entry_size = zip_entry_size(zip);
zip_entry_close(zip);
break;
}
}
zip_entry_close(zip);
}
phase = entry_size > 0 ? CHECK_SIZE : READ_NAME;
}
if (!read_global_type && phase == READ_NAME) {
tensor_storage.name = str;
phase = READ_DATA;
tensor_storage.type = global_type;
}
}
}
};
ggml_type PickleTensorReader::global_type = GGML_TYPE_F32; // data type shared by all pickle tensors
bool PickleTensorReader::read_global_type = false;
int find_char(uint8_t* buffer, int len, char c) {
for (int pos = 0; pos < len; pos++) {
if (buffer[pos] == c) {
return pos;
}
}
return -1;
}
#define MAX_STRING_BUFFER 512
bool ModelLoader::parse_data_pkl(uint8_t* buffer,
size_t buffer_size,
zip_t* zip,
std::string dir,
size_t file_index,
const std::string prefix) {
uint8_t* buffer_end = buffer + buffer_size;
if (buffer[0] == 0x80) { // proto
if (buffer[1] != 2) {
LOG_ERROR("Unsupported protocol\n");
return false;
}
buffer += 2; // 0x80 and version
char string_buffer[MAX_STRING_BUFFER];
bool finish = false;
PickleTensorReader reader;
// read pickle binary file
while (!finish && buffer < buffer_end) {
uint8_t opcode = *buffer;
buffer++;
// https://github.com/python/cpython/blob/3.7/Lib/pickletools.py#L1048
// https://github.com/python/cpython/blob/main/Lib/pickle.py#L105
switch (opcode) {
case '}': // EMPTY_DICT = b'}' # push empty dict
break;
case ']': // EMPTY_LIST = b']' # push empty list
break;
// skip unused sections
case 'h': // BINGET = b'h' # " " " " " " ; " " 1-byte arg
case 'q': // BINPUT = b'q' # " " " " " ; " " 1-byte arg
case 'Q': // BINPERSID = b'Q' # " " " ; " " " " stack
buffer++;
break;
case 'r': // LONG_BINPUT = b'r' # " " " " " ; " " 4-byte arg
buffer += 4;
break;
case 0x95: // FRAME = b'\x95' # indicate the beginning of a new frame
buffer += 8;
break;
case 0x94: // MEMOIZE = b'\x94' # store top of the stack in memo
break;
case '(': // MARK = b'(' # push special markobject on stack
break;
case 'K': // BININT1 = b'K' # push 1-byte unsigned int
{
uint8_t value = *buffer;
if (reader.read_int_value(value)) {
buffer++;
}
buffer++;
} break;
case 'M': // BININT2 = b'M' # push 2-byte unsigned int
{
uint16_t value = read_short(buffer);
if (reader.read_int_value(value)) {
buffer++;
}
buffer += 2;
} break;
case 'J': // BININT = b'J' # push four-byte signed int
{
const int32_t value = read_int(buffer);
if (reader.read_int_value(value)) {
buffer++; // skip tuple after reading num_elements
}
buffer += 4;
} break;
case 'X': // BINUNICODE = b'X' # " " " ; counted UTF-8 string argument
{
const int32_t len = read_int(buffer);
buffer += 4;
memset(string_buffer, 0, MAX_STRING_BUFFER);
if (len >= MAX_STRING_BUFFER) {
LOG_WARN("tensor name too long, truncating to %d bytes", MAX_STRING_BUFFER - 1);
}
memcpy(string_buffer, buffer, len < MAX_STRING_BUFFER ? len : (MAX_STRING_BUFFER - 1));
buffer += len;
reader.read_string(string_buffer, zip, dir);
} break;
case 0x8C: // SHORT_BINUNICODE = b'\x8c' # push short string; UTF-8 length < 256 bytes
{
const uint8_t len = *buffer; // length is an unsigned byte (< 256); int8_t would go negative for len > 127
buffer++;
memset(string_buffer, 0, MAX_STRING_BUFFER);
memcpy(string_buffer, buffer, len);
buffer += len;
// printf("String: '%s'\n", string_buffer);
} break;
case 'c': // GLOBAL = b'c' # push self.find_class(modname, name); 2 string args
{
int len = find_char(buffer, MAX_STRING_BUFFER, '\n');
buffer += len + 1;
len = find_char(buffer, MAX_STRING_BUFFER, '\n');
memset(string_buffer, 0, MAX_STRING_BUFFER);
memcpy(string_buffer, buffer, len);
buffer += len + 1;
reader.read_global(string_buffer);
} break;
case 0x86: // TUPLE2 = b'\x86' # build 2-tuple from two topmost stack items
case 0x85: // TUPLE1 = b'\x85' # build 1-tuple from stack top
case 't': // TUPLE = b't' # build tuple from topmost stack items
if (reader.phase == PickleTensorReader::READ_DIMENS) {
reader.tensor_storage.reverse_ne();
reader.tensor_storage.file_index = file_index;
// if(strcmp(prefix.c_str(), "scarlett") == 0)
// printf(" ZIP got tensor %s \n ", reader.tensor_storage.name.c_str());
std::string name = reader.tensor_storage.name;
if (!starts_with(name, prefix)) {
name = prefix + name;
}
reader.tensor_storage.name = name;
add_tensor_storage(reader.tensor_storage);
// LOG_DEBUG("%s", reader.tensor_storage.name.c_str());
// reset
reader = PickleTensorReader();
}
break;
case '.': // STOP = b'.' # every pickle ends with STOP
finish = true;
break;
default:
break;
}
}
}
return true;
}
bool ModelLoader::init_from_ckpt_file(const std::string& file_path, const std::string& prefix) {
LOG_DEBUG("init from '%s'", file_path.c_str());
file_paths_.push_back(file_path);
size_t file_index = file_paths_.size() - 1;
zip_t* zip = zip_open(file_path.c_str(), 0, 'r');
if (zip == nullptr) {
LOG_ERROR("failed to open '%s'", file_path.c_str());
return false;
}
int n = (int)zip_entries_total(zip);
for (int i = 0; i < n; ++i) {
zip_entry_openbyindex(zip, i);
{
std::string name = zip_entry_name(zip);
size_t pos = name.find("data.pkl");
if (pos != std::string::npos) {
std::string dir = name.substr(0, pos);
printf("ZIP %d, name = %s, dir = %s \n", i, name.c_str(), dir.c_str());
void* pkl_data = nullptr;
size_t pkl_size = 0;
zip_entry_read(zip, &pkl_data, &pkl_size);
// LOG_DEBUG("%lld", pkl_size);
parse_data_pkl((uint8_t*)pkl_data, pkl_size, zip, dir, file_index, prefix);
free(pkl_data);
}
}
zip_entry_close(zip);
}
zip_close(zip);
return true;
}
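// Layout of a torch.save zip checkpoint as scanned above, e.g.:
//   archive/data.pkl   -> pickled state_dict metadata (parsed by parse_data_pkl)
//   archive/data/0     -> raw bytes for storage key "0"
//   archive/data/1     -> raw bytes for storage key "1", ...
// The directory prefix ("archive/") is recovered from the data.pkl entry name.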
SDVersion ModelLoader::get_sd_version() {
TensorStorage token_embedding_weight, input_block_weight;
@ -674,8 +1268,8 @@ std::map<ggml_type, uint32_t> ModelLoader::get_vae_wtype_stat() {
return wtype_stat;
}
TensorTypeRules parse_tensor_type_rules(const std::string& tensor_type_rules) {
TensorTypeRules result;
static std::vector<std::pair<std::string, ggml_type>> parse_tensor_type_rules(const std::string& tensor_type_rules) {
std::vector<std::pair<std::string, ggml_type>> result;
for (const auto& item : split_string(tensor_type_rules, ',')) {
if (item.size() == 0)
continue;
@ -1108,6 +1702,76 @@ bool ModelLoader::tensor_should_be_converted(const TensorStorage& tensor_storage
return false;
}
bool ModelLoader::save_to_gguf_file(const std::string& file_path, ggml_type type, const std::string& tensor_type_rules_str) {
auto backend = ggml_backend_cpu_init();
size_t mem_size = 1 * 1024 * 1024; // for padding
mem_size += tensor_storage_map.size() * ggml_tensor_overhead();
mem_size += get_params_mem_size(backend, type);
LOG_INFO("model tensors mem size: %.2fMB", mem_size / 1024.f / 1024.f);
ggml_context* ggml_ctx = ggml_init({mem_size, nullptr, false});
gguf_context* gguf_ctx = gguf_init_empty();
auto tensor_type_rules = parse_tensor_type_rules(tensor_type_rules_str);
std::mutex tensor_mutex;
auto on_new_tensor_cb = [&](const TensorStorage& tensor_storage, ggml_tensor** dst_tensor) -> bool {
const std::string& name = tensor_storage.name;
ggml_type tensor_type = tensor_storage.type;
ggml_type dst_type = type;
for (const auto& tensor_type_rule : tensor_type_rules) {
std::regex pattern(tensor_type_rule.first);
if (std::regex_search(name, pattern)) {
dst_type = tensor_type_rule.second;
break;
}
}
if (tensor_should_be_converted(tensor_storage, dst_type)) {
tensor_type = dst_type;
}
std::lock_guard<std::mutex> lock(tensor_mutex);
ggml_tensor* tensor = ggml_new_tensor(ggml_ctx, tensor_type, tensor_storage.n_dims, tensor_storage.ne);
if (tensor == nullptr) {
LOG_ERROR("ggml_new_tensor failed");
return false;
}
ggml_set_name(tensor, name.c_str());
// LOG_DEBUG("%s %d %s %d[%d %d %d %d] %d[%d %d %d %d]", name.c_str(),
// ggml_nbytes(tensor), ggml_type_name(tensor_type),
// tensor_storage.n_dims,
// tensor_storage.ne[0], tensor_storage.ne[1], tensor_storage.ne[2], tensor_storage.ne[3],
// tensor->n_dims, tensor->ne[0], tensor->ne[1], tensor->ne[2], tensor->ne[3]);
if (!tensor->data) {
GGML_ASSERT(ggml_nelements(tensor) == 0);
// avoid crashing the gguf writer by setting a dummy pointer for zero-sized tensors
LOG_DEBUG("setting dummy pointer for zero-sized tensor %s", name.c_str());
tensor->data = ggml_get_mem_buffer(ggml_ctx);
}
*dst_tensor = tensor;
gguf_add_tensor(gguf_ctx, tensor);
return true;
};
bool success = load_tensors(on_new_tensor_cb);
ggml_backend_free(backend);
LOG_INFO("load tensors done");
LOG_INFO("trying to save tensors to %s", file_path.c_str());
if (success) {
gguf_write_to_file(gguf_ctx, file_path.c_str(), false);
}
ggml_free(ggml_ctx);
gguf_free(gguf_ctx);
return success;
}
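// Rule-matching sketch: with tensor_type_rules_str = "vae=f32,attn=q8_0"
// (assuming the "pattern=type" item syntax parsed by parse_tensor_type_rules),
// tensors whose name regex-matches "vae" keep f32, names matching "attn" are
// converted to q8_0, and everything else falls back to `type`.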
int64_t ModelLoader::get_params_mem_size(ggml_backend_t backend, ggml_type type) {
size_t alignment = 128;
if (backend != nullptr) {
@ -1127,3 +1791,29 @@ int64_t ModelLoader::get_params_mem_size(ggml_backend_t backend, ggml_type type)
return mem_size;
}
bool convert(const char* input_path,
const char* vae_path,
const char* output_path,
sd_type_t output_type,
const char* tensor_type_rules,
bool convert_name) {
ModelLoader model_loader;
if (!model_loader.init_from_file(input_path)) {
LOG_ERROR("init model loader from file failed: '%s'", input_path);
return false;
}
if (vae_path != nullptr && strlen(vae_path) > 0) {
if (!model_loader.init_from_file(vae_path, "vae.")) {
LOG_ERROR("init model loader from file failed: '%s'", vae_path);
return false;
}
}
if (convert_name) {
model_loader.convert_tensors_name();
}
bool success = model_loader.save_to_gguf_file(output_path, (ggml_type)output_type, tensor_type_rules);
return success;
}
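// Minimal usage sketch (paths and output type are illustrative):
//   bool ok = convert("sd-v1-4.ckpt", /*vae_path*/ nullptr,
//                     "sd-v1-4-q8_0.gguf", SD_TYPE_Q8_0,
//                     /*tensor_type_rules*/ "", /*convert_name*/ true);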

View File

@ -5,13 +5,20 @@
#include <map>
#include <memory>
#include <set>
#include <sstream>
#include <string>
#include <tuple>
#include <utility>
#include <vector>
#include "ggml-backend.h"
#include "ggml.h"
#include "model_io/tensor_storage.h"
#include "gguf.h"
#include "json.hpp"
#include "ordered_map.hpp"
#include "zip.h"
#define SD_MAX_DIMS 5
enum SDVersion {
VERSION_SD1,
@ -188,10 +195,116 @@ enum PMVersion {
PM_VERSION_2,
};
typedef OrderedMap<std::string, TensorStorage> String2TensorStorage;
using TensorTypeRules = std::vector<std::pair<std::string, ggml_type>>;
struct TensorStorage {
std::string name;
ggml_type type = GGML_TYPE_F32;
ggml_type expected_type = GGML_TYPE_COUNT;
bool is_f8_e4m3 = false;
bool is_f8_e5m2 = false;
bool is_f64 = false;
bool is_i64 = false;
int64_t ne[SD_MAX_DIMS] = {1, 1, 1, 1, 1};
int n_dims = 0;
TensorTypeRules parse_tensor_type_rules(const std::string& tensor_type_rules);
size_t file_index = 0;
int index_in_zip = -1; // >= 0 means stored in a zip file
uint64_t offset = 0; // offset in file
TensorStorage() = default;
TensorStorage(std::string name, ggml_type type, const int64_t* ne, int n_dims, size_t file_index, size_t offset = 0)
: name(std::move(name)), type(type), n_dims(n_dims), file_index(file_index), offset(offset) {
for (int i = 0; i < n_dims; i++) {
this->ne[i] = ne[i];
}
}
int64_t nelements() const {
int64_t n = 1;
for (int i = 0; i < SD_MAX_DIMS; i++) {
n *= ne[i];
}
return n;
}
int64_t nbytes() const {
return nelements() * ggml_type_size(type) / ggml_blck_size(type);
}
int64_t nbytes_to_read() const {
if (is_f8_e4m3 || is_f8_e5m2) {
return nbytes() / 2;
} else if (is_f64 || is_i64) {
return nbytes() * 2;
} else {
return nbytes();
}
}
void unsqueeze() {
if (n_dims == 2) {
n_dims = 4;
ne[3] = ne[1];
ne[2] = ne[0];
ne[1] = 1;
ne[0] = 1;
}
}
std::vector<TensorStorage> chunk(size_t n) {
std::vector<TensorStorage> chunks;
uint64_t chunk_size = nbytes_to_read() / n;
// printf("%d/%d\n", chunk_size, nbytes_to_read());
reverse_ne();
for (size_t i = 0; i < n; i++) {
TensorStorage chunk_i = *this;
chunk_i.ne[0] = ne[0] / n;
chunk_i.offset = offset + i * chunk_size;
chunk_i.reverse_ne();
chunks.push_back(chunk_i);
}
reverse_ne();
return chunks;
}
void reverse_ne() {
int64_t new_ne[SD_MAX_DIMS] = {1, 1, 1, 1, 1};
for (int i = 0; i < n_dims; i++) {
new_ne[i] = ne[n_dims - 1 - i];
}
for (int i = 0; i < n_dims; i++) {
ne[i] = new_ne[i];
}
}
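// reverse_ne() bridges shape conventions: checkpoint formats store shapes
// row-major (outermost dim first), while ggml's ne[] puts the innermost,
// fastest-varying dim at ne[0]; e.g. a torch [N, C, H, W] weight becomes
// ne = {W, H, C, N}.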
std::string to_string() const {
std::stringstream ss;
const char* type_name = ggml_type_name(type);
if (is_f8_e4m3) {
type_name = "f8_e4m3";
} else if (is_f8_e5m2) {
type_name = "f8_e5m2";
} else if (is_f64) {
type_name = "f64";
} else if (is_i64) {
type_name = "i64";
}
ss << name << " | " << type_name << " | ";
ss << n_dims << " [";
for (int i = 0; i < SD_MAX_DIMS; i++) {
ss << ne[i];
if (i != SD_MAX_DIMS - 1) {
ss << ", ";
}
}
ss << "]";
return ss.str();
}
};
typedef std::function<bool(const TensorStorage&, ggml_tensor**)> on_new_tensor_cb_t;
typedef OrderedMap<std::string, TensorStorage> String2TensorStorage;
class ModelLoader {
protected:
@ -201,10 +314,16 @@ protected:
void add_tensor_storage(const TensorStorage& tensor_storage);
bool parse_data_pkl(uint8_t* buffer,
size_t buffer_size,
zip_t* zip,
std::string dir,
size_t file_index,
const std::string prefix);
bool init_from_gguf_file(const std::string& file_path, const std::string& prefix = "");
bool init_from_safetensors_file(const std::string& file_path, const std::string& prefix = "");
bool init_from_torch_zip_file(const std::string& file_path, const std::string& prefix = "");
bool init_from_torch_legacy_file(const std::string& file_path, const std::string& prefix = "");
bool init_from_ckpt_file(const std::string& file_path, const std::string& prefix = "");
bool init_from_diffusers_file(const std::string& file_path, const std::string& prefix = "");
public:
@ -234,6 +353,7 @@ public:
return names;
}
bool save_to_gguf_file(const std::string& file_path, ggml_type type, const std::string& tensor_type_rules);
bool tensor_should_be_converted(const TensorStorage& tensor_storage, ggml_type type);
int64_t get_params_mem_size(ggml_backend_t backend, ggml_type type = GGML_TYPE_COUNT);
~ModelLoader() = default;

View File

@ -1,57 +0,0 @@
#ifndef __SD_MODEL_IO_BINARY_IO_H__
#define __SD_MODEL_IO_BINARY_IO_H__
#include <cstdint>
#include <ostream>
namespace model_io {
inline int32_t read_int(const uint8_t* buffer) {
uint32_t value = 0;
value |= static_cast<uint32_t>(buffer[3]) << 24;
value |= static_cast<uint32_t>(buffer[2]) << 16;
value |= static_cast<uint32_t>(buffer[1]) << 8;
value |= static_cast<uint32_t>(buffer[0]);
return static_cast<int32_t>(value);
}
inline uint16_t read_short(const uint8_t* buffer) {
uint16_t value = 0;
value |= static_cast<uint16_t>(buffer[1]) << 8;
value |= static_cast<uint16_t>(buffer[0]);
return value;
}
inline uint64_t read_u64(const uint8_t* buffer) {
uint64_t value = 0;
value |= static_cast<uint64_t>(buffer[7]) << 56;
value |= static_cast<uint64_t>(buffer[6]) << 48;
value |= static_cast<uint64_t>(buffer[5]) << 40;
value |= static_cast<uint64_t>(buffer[4]) << 32;
value |= static_cast<uint64_t>(buffer[3]) << 24;
value |= static_cast<uint64_t>(buffer[2]) << 16;
value |= static_cast<uint64_t>(buffer[1]) << 8;
value |= static_cast<uint64_t>(buffer[0]);
return value;
}
inline void write_u64(std::ostream& stream, uint64_t value) {
uint8_t buffer[8];
for (int i = 0; i < 8; ++i) {
buffer[i] = static_cast<uint8_t>((value >> (8 * i)) & 0xFF);
}
stream.write((const char*)buffer, sizeof(buffer));
}
inline int find_char(const uint8_t* buffer, int len, char c) {
for (int pos = 0; pos < len; pos++) {
if (buffer[pos] == (uint8_t)c) {
return pos;
}
}
return -1;
}
} // namespace model_io
#endif // __SD_MODEL_IO_BINARY_IO_H__

View File

@ -1,123 +0,0 @@
#include "gguf_io.h"
#include <cstdint>
#include <fstream>
#include <string>
#include <vector>
#include "gguf.h"
#include "gguf_reader_ext.h"
#include "util.h"
static void set_error(std::string* error, const std::string& message) {
if (error != nullptr) {
*error = message;
}
}
bool is_gguf_file(const std::string& file_path) {
std::ifstream file(file_path, std::ios::binary);
if (!file.is_open()) {
return false;
}
char magic[4];
file.read(magic, sizeof(magic));
if (!file) {
return false;
}
for (uint32_t i = 0; i < sizeof(magic); i++) {
if (magic[i] != GGUF_MAGIC[i]) {
return false;
}
}
return true;
}
bool read_gguf_file(const std::string& file_path,
std::vector<TensorStorage>& tensor_storages,
std::string* error) {
tensor_storages.clear();
gguf_context* ctx_gguf_ = nullptr;
ggml_context* ctx_meta_ = nullptr;
ctx_gguf_ = gguf_init_from_file(file_path.c_str(), {true, &ctx_meta_});
if (!ctx_gguf_) {
GGUFReader gguf_reader;
if (!gguf_reader.load(file_path)) {
set_error(error, "failed to open '" + file_path + "' with GGUFReader");
return false;
}
size_t data_offset = gguf_reader.data_offset();
for (const auto& gguf_tensor_info : gguf_reader.tensors()) {
TensorStorage tensor_storage(
gguf_tensor_info.name,
gguf_tensor_info.type,
gguf_tensor_info.shape.data(),
static_cast<int>(gguf_tensor_info.shape.size()),
0,
data_offset + gguf_tensor_info.offset);
tensor_storages.push_back(tensor_storage);
}
return true;
}
int n_tensors = static_cast<int>(gguf_get_n_tensors(ctx_gguf_));
size_t data_offset = gguf_get_data_offset(ctx_gguf_);
for (int i = 0; i < n_tensors; i++) {
std::string name = gguf_get_tensor_name(ctx_gguf_, i);
ggml_tensor* dummy = ggml_get_tensor(ctx_meta_, name.c_str());
size_t offset = data_offset + gguf_get_tensor_offset(ctx_gguf_, i);
TensorStorage tensor_storage(name, dummy->type, dummy->ne, ggml_n_dims(dummy), 0, offset);
if (ggml_nbytes(dummy) != tensor_storage.nbytes()) {
gguf_free(ctx_gguf_);
ggml_free(ctx_meta_);
set_error(error, "size mismatch for tensor '" + name + "'");
return false;
}
tensor_storages.push_back(tensor_storage);
}
gguf_free(ctx_gguf_);
ggml_free(ctx_meta_);
return true;
}
bool write_gguf_file(const std::string& file_path,
const std::vector<TensorWriteInfo>& tensors,
std::string* error) {
gguf_context* gguf_ctx = gguf_init_empty();
if (gguf_ctx == nullptr) {
set_error(error, "gguf_init_empty failed");
return false;
}
for (const TensorWriteInfo& write_tensor : tensors) {
ggml_tensor* tensor = write_tensor.tensor;
if (tensor == nullptr) {
set_error(error, "null tensor cannot be written to GGUF");
gguf_free(gguf_ctx);
return false;
}
gguf_add_tensor(gguf_ctx, tensor);
}
LOG_INFO("trying to save tensors to %s", file_path.c_str());
bool success = gguf_write_to_file(gguf_ctx, file_path.c_str(), false);
if (!success) {
set_error(error, "failed to write GGUF file '" + file_path + "'");
}
gguf_free(gguf_ctx);
return success;
}

View File

@ -1,17 +0,0 @@
#ifndef __SD_MODEL_IO_GGUF_IO_H__
#define __SD_MODEL_IO_GGUF_IO_H__
#include <string>
#include <vector>
#include "tensor_storage.h"
bool is_gguf_file(const std::string& file_path);
bool read_gguf_file(const std::string& file_path,
std::vector<TensorStorage>& tensor_storages,
std::string* error = nullptr);
bool write_gguf_file(const std::string& file_path,
const std::vector<TensorWriteInfo>& tensors,
std::string* error = nullptr);
#endif // __SD_MODEL_IO_GGUF_IO_H__

File diff suppressed because it is too large

View File

@ -1,21 +0,0 @@
#ifndef __SD_MODEL_IO_PICKLE_IO_H__
#define __SD_MODEL_IO_PICKLE_IO_H__
#include <cstddef>
#include <cstdint>
#include <string>
#include <unordered_map>
#include <vector>
#include "tensor_storage.h"
bool skip_pickle_object(const uint8_t* buffer, size_t buffer_size, size_t* object_size);
bool pickle_object_is_torch_magic_number(const uint8_t* buffer, size_t buffer_size);
bool parse_pickle_uint32_object(const uint8_t* buffer, size_t buffer_size, uint32_t* value);
bool parse_torch_state_dict_pickle(const uint8_t* buffer,
size_t buffer_size,
std::vector<TensorStorage>& tensor_storages,
std::unordered_map<std::string, uint64_t>& storage_nbytes,
std::string* error = nullptr);
#endif // __SD_MODEL_IO_PICKLE_IO_H__

View File

@ -1,316 +0,0 @@
#include "safetensors_io.h"
#include <cstdint>
#include <exception>
#include <fstream>
#include <string>
#include <vector>
#include "binary_io.h"
#include "json.hpp"
#include "util.h"
static constexpr size_t ST_HEADER_SIZE_LEN = 8;
static void set_error(std::string* error, const std::string& message) {
if (error != nullptr) {
*error = message;
}
}
bool is_safetensors_file(const std::string& file_path) {
std::ifstream file(file_path, std::ios::binary);
if (!file.is_open()) {
return false;
}
// get file size
file.seekg(0, file.end);
size_t file_size_ = file.tellg();
file.seekg(0, file.beg);
// read header size
if (file_size_ <= ST_HEADER_SIZE_LEN) {
return false;
}
uint8_t header_size_buf[ST_HEADER_SIZE_LEN];
file.read((char*)header_size_buf, ST_HEADER_SIZE_LEN);
if (!file) {
return false;
}
size_t header_size_ = model_io::read_u64(header_size_buf);
if (header_size_ >= file_size_ || header_size_ <= 2) {
return false;
}
// read header
std::vector<char> header_buf;
header_buf.resize(header_size_ + 1);
header_buf[header_size_] = '\0';
file.read(header_buf.data(), header_size_);
if (!file) {
return false;
}
try {
nlohmann::json header_ = nlohmann::json::parse(header_buf.data());
} catch (const std::exception&) {
return false;
}
return true;
}
static ggml_type safetensors_dtype_to_ggml_type(const std::string& dtype) {
ggml_type ttype = GGML_TYPE_COUNT;
if (dtype == "F16") {
ttype = GGML_TYPE_F16;
} else if (dtype == "BF16") {
ttype = GGML_TYPE_BF16;
} else if (dtype == "F32") {
ttype = GGML_TYPE_F32;
} else if (dtype == "F64") {
ttype = GGML_TYPE_F32;
} else if (dtype == "F8_E4M3") {
ttype = GGML_TYPE_F16;
} else if (dtype == "F8_E5M2") {
ttype = GGML_TYPE_F16;
} else if (dtype == "I32") {
ttype = GGML_TYPE_I32;
} else if (dtype == "I64") {
ttype = GGML_TYPE_I32;
}
return ttype;
}
// https://huggingface.co/docs/safetensors/index
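// File layout per the spec: [u64 header_size][JSON header][raw tensor data].
// Example header entry (illustrative):
//   {"model.weight": {"dtype": "F16", "shape": [320, 4, 3, 3],
//                     "data_offsets": [0, 23040]}}
// data_offsets are relative to the start of the data section, i.e. file byte
// offset ST_HEADER_SIZE_LEN + header_size + begin.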
bool read_safetensors_file(const std::string& file_path,
std::vector<TensorStorage>& tensor_storages,
std::string* error) {
std::ifstream file(file_path, std::ios::binary);
if (!file.is_open()) {
set_error(error, "failed to open '" + file_path + "'");
return false;
}
// get file size
file.seekg(0, file.end);
size_t file_size_ = file.tellg();
file.seekg(0, file.beg);
// read header size
if (file_size_ <= ST_HEADER_SIZE_LEN) {
set_error(error, "invalid safetensor file '" + file_path + "'");
return false;
}
uint8_t header_size_buf[ST_HEADER_SIZE_LEN];
file.read((char*)header_size_buf, ST_HEADER_SIZE_LEN);
if (!file) {
set_error(error, "read safetensors header size failed: '" + file_path + "'");
return false;
}
size_t header_size_ = model_io::read_u64(header_size_buf);
if (header_size_ >= file_size_) {
set_error(error, "invalid safetensor file '" + file_path + "'");
return false;
}
// read header
std::vector<char> header_buf;
header_buf.resize(header_size_ + 1);
header_buf[header_size_] = '\0';
file.read(header_buf.data(), header_size_);
if (!file) {
set_error(error, "read safetensors header failed: '" + file_path + "'");
return false;
}
nlohmann::json header_;
try {
header_ = nlohmann::json::parse(header_buf.data());
} catch (const std::exception&) {
set_error(error, "parsing safetensors header failed: '" + file_path + "'");
return false;
}
tensor_storages.clear();
for (auto& item : header_.items()) {
std::string name = item.key();
nlohmann::json tensor_info = item.value();
// LOG_DEBUG("%s %s\n", name.c_str(), tensor_info.dump().c_str());
if (name == "__metadata__") {
continue;
}
std::string dtype = tensor_info["dtype"];
nlohmann::json shape = tensor_info["shape"];
if (dtype == "U8") {
continue;
}
size_t begin = tensor_info["data_offsets"][0].get<size_t>();
size_t end = tensor_info["data_offsets"][1].get<size_t>();
ggml_type type = safetensors_dtype_to_ggml_type(dtype);
if (type == GGML_TYPE_COUNT) {
set_error(error, "unsupported dtype '" + dtype + "' (tensor '" + name + "')");
return false;
}
if (shape.size() > SD_MAX_DIMS) {
set_error(error, "invalid tensor '" + name + "'");
return false;
}
int n_dims = (int)shape.size();
int64_t ne[SD_MAX_DIMS] = {1, 1, 1, 1, 1};
for (int i = 0; i < n_dims; i++) {
ne[i] = shape[i].get<int64_t>();
}
if (n_dims == 5) {
n_dims = 4;
ne[0] = ne[0] * ne[1];
ne[1] = ne[2];
ne[2] = ne[3];
ne[3] = ne[4];
}
// ggml_n_dims returns 1 for scalars
if (n_dims == 0) {
n_dims = 1;
}
TensorStorage tensor_storage(name, type, ne, n_dims, 0, ST_HEADER_SIZE_LEN + header_size_ + begin);
tensor_storage.reverse_ne();
size_t tensor_data_size = end - begin;
bool tensor_size_ok;
if (dtype == "F8_E4M3") {
tensor_storage.is_f8_e4m3 = true;
// f8 -> f16
tensor_size_ok = (tensor_storage.nbytes() == tensor_data_size * 2);
} else if (dtype == "F8_E5M2") {
tensor_storage.is_f8_e5m2 = true;
// f8 -> f16
tensor_size_ok = (tensor_storage.nbytes() == tensor_data_size * 2);
} else if (dtype == "F64") {
tensor_storage.is_f64 = true;
// f64 -> f32
tensor_size_ok = (tensor_storage.nbytes() * 2 == tensor_data_size);
} else if (dtype == "I64") {
tensor_storage.is_i64 = true;
// i64 -> i32
tensor_size_ok = (tensor_storage.nbytes() * 2 == tensor_data_size);
} else {
tensor_size_ok = (tensor_storage.nbytes() == tensor_data_size);
}
if (!tensor_size_ok) {
set_error(error, "size mismatch for tensor '" + name + "' (" + dtype + ")");
return false;
}
tensor_storages.push_back(tensor_storage);
// LOG_DEBUG("%s %s", tensor_storage.to_string().c_str(), dtype.c_str());
}
return true;
}
static bool ggml_type_to_safetensors_dtype(ggml_type type, std::string* dtype) {
switch (type) {
case GGML_TYPE_F16:
*dtype = "F16";
return true;
case GGML_TYPE_BF16:
*dtype = "BF16";
return true;
case GGML_TYPE_F32:
*dtype = "F32";
return true;
case GGML_TYPE_I32:
*dtype = "I32";
return true;
default:
return false;
}
}
bool write_safetensors_file(const std::string& file_path,
const std::vector<TensorWriteInfo>& tensors,
std::string* error) {
nlohmann::ordered_json header = nlohmann::ordered_json::object();
uint64_t data_offset = 0;
for (const TensorWriteInfo& write_tensor : tensors) {
ggml_tensor* tensor = write_tensor.tensor;
if (tensor == nullptr) {
set_error(error, "null tensor cannot be written to safetensors");
return false;
}
const std::string name = ggml_get_name(tensor);
std::string dtype;
if (!ggml_type_to_safetensors_dtype(tensor->type, &dtype)) {
set_error(error,
"unsupported safetensors dtype '" + std::string(ggml_type_name(tensor->type)) +
"' for tensor '" + name + "'");
return false;
}
const uint64_t tensor_nbytes = ggml_nbytes(tensor);
nlohmann::ordered_json json_tensor_info = nlohmann::ordered_json::object();
json_tensor_info["dtype"] = dtype;
nlohmann::ordered_json shape = nlohmann::ordered_json::array();
for (int i = 0; i < write_tensor.n_dims; ++i) {
shape.push_back(write_tensor.ne[write_tensor.n_dims - 1 - i]);
}
json_tensor_info["shape"] = shape;
nlohmann::ordered_json data_offsets = nlohmann::ordered_json::array();
data_offsets.push_back(data_offset);
data_offsets.push_back(data_offset + tensor_nbytes);
json_tensor_info["data_offsets"] = data_offsets;
header[name] = json_tensor_info;
data_offset += tensor_nbytes;
}
const std::string header_str = header.dump();
std::ofstream file(file_path, std::ios::binary);
if (!file.is_open()) {
set_error(error, "failed to open '" + file_path + "' for writing");
return false;
}
LOG_INFO("trying to save tensors to %s", file_path.c_str());
model_io::write_u64(file, header_str.size());
file.write(header_str.data(), header_str.size());
if (!file) {
set_error(error, "failed to write safetensors header to '" + file_path + "'");
return false;
}
for (const TensorWriteInfo& write_tensor : tensors) {
ggml_tensor* tensor = write_tensor.tensor;
const std::string name = ggml_get_name(tensor);
const size_t tensor_nbytes = ggml_nbytes(tensor);
file.write((const char*)tensor->data, tensor_nbytes);
if (!file) {
set_error(error,
"failed to write tensor '" + name + "' to '" + file_path + "'");
return false;
}
}
return true;
}

View File

@ -1,17 +0,0 @@
#ifndef __SD_MODEL_IO_SAFETENSORS_IO_H__
#define __SD_MODEL_IO_SAFETENSORS_IO_H__
#include <string>
#include <vector>
#include "tensor_storage.h"
bool is_safetensors_file(const std::string& file_path);
bool read_safetensors_file(const std::string& file_path,
std::vector<TensorStorage>& tensor_storages,
std::string* error = nullptr);
bool write_safetensors_file(const std::string& file_path,
const std::vector<TensorWriteInfo>& tensors,
std::string* error = nullptr);
#endif // __SD_MODEL_IO_SAFETENSORS_IO_H__

View File

@ -1,132 +0,0 @@
#ifndef __SD_TENSOR_STORAGE_H__
#define __SD_TENSOR_STORAGE_H__
#include <cstddef>
#include <cstdint>
#include <functional>
#include <sstream>
#include <string>
#include <utility>
#include <vector>
#include "ggml.h"
#define SD_MAX_DIMS 5
struct TensorStorage {
std::string name;
ggml_type type = GGML_TYPE_F32;
ggml_type expected_type = GGML_TYPE_COUNT;
bool is_f8_e4m3 = false;
bool is_f8_e5m2 = false;
bool is_f64 = false;
bool is_i64 = false;
int64_t ne[SD_MAX_DIMS] = {1, 1, 1, 1, 1};
int n_dims = 0;
std::string storage_key;
size_t file_index = 0;
int index_in_zip = -1; // >= 0 means stored in a zip file
uint64_t offset = 0; // offset in file
TensorStorage() = default;
TensorStorage(std::string name, ggml_type type, const int64_t* ne, int n_dims, size_t file_index, size_t offset = 0)
: name(std::move(name)), type(type), n_dims(n_dims), file_index(file_index), offset(offset) {
for (int i = 0; i < n_dims; i++) {
this->ne[i] = ne[i];
}
}
int64_t nelements() const {
int64_t n = 1;
for (int i = 0; i < SD_MAX_DIMS; i++) {
n *= ne[i];
}
return n;
}
int64_t nbytes() const {
return nelements() * ggml_type_size(type) / ggml_blck_size(type);
}
int64_t nbytes_to_read() const {
if (is_f8_e4m3 || is_f8_e5m2) {
return nbytes() / 2;
} else if (is_f64 || is_i64) {
return nbytes() * 2;
} else {
return nbytes();
}
}
void unsqueeze() {
if (n_dims == 2) {
n_dims = 4;
ne[3] = ne[1];
ne[2] = ne[0];
ne[1] = 1;
ne[0] = 1;
}
}
std::vector<TensorStorage> chunk(size_t n) {
std::vector<TensorStorage> chunks;
uint64_t chunk_size = nbytes_to_read() / n;
// printf("%d/%d\n", chunk_size, nbytes_to_read());
reverse_ne();
for (size_t i = 0; i < n; i++) {
TensorStorage chunk_i = *this;
chunk_i.ne[0] = ne[0] / n;
chunk_i.offset = offset + i * chunk_size;
chunk_i.reverse_ne();
chunks.push_back(chunk_i);
}
reverse_ne();
return chunks;
}
void reverse_ne() {
int64_t new_ne[SD_MAX_DIMS] = {1, 1, 1, 1, 1};
for (int i = 0; i < n_dims; i++) {
new_ne[i] = ne[n_dims - 1 - i];
}
for (int i = 0; i < n_dims; i++) {
ne[i] = new_ne[i];
}
}
std::string to_string() const {
std::stringstream ss;
const char* type_name = ggml_type_name(type);
if (is_f8_e4m3) {
type_name = "f8_e4m3";
} else if (is_f8_e5m2) {
type_name = "f8_e5m2";
} else if (is_f64) {
type_name = "f64";
} else if (is_i64) {
type_name = "i64";
}
ss << name << " | " << type_name << " | ";
ss << n_dims << " [";
for (int i = 0; i < SD_MAX_DIMS; i++) {
ss << ne[i];
if (i != SD_MAX_DIMS - 1) {
ss << ", ";
}
}
ss << "]";
return ss.str();
}
};
struct TensorWriteInfo {
int64_t ne[SD_MAX_DIMS] = {1, 1, 1, 1, 1};
int n_dims = 0;
ggml_tensor* tensor = nullptr;
};
typedef std::function<bool(const TensorStorage&, ggml_tensor**)> on_new_tensor_cb_t;
#endif // __SD_TENSOR_STORAGE_H__

View File

@ -1,252 +0,0 @@
#include "torch_legacy_io.h"
#include <algorithm>
#include <cstdint>
#include <fstream>
#include <string>
#include <unordered_map>
#include <vector>
#include "pickle_io.h"
#include "util.h"
// torch.save format background:
//
// - Before PyTorch 1.6.0, torch.save used this legacy non-zip format by
// default.
// - Since PyTorch 1.6.0, torch.save defaults to an uncompressed ZIP64 archive
// containing data.pkl, data/, version, and, since PyTorch 2.1.0, byteorder.
// - The old format can still be produced explicitly with:
// torch.save(obj, path, _use_new_zipfile_serialization=False)
//
// Whether obj is a state_dict or a whole nn.Module does not change the outer
// container format selected by torch.save. It changes the pickled object inside:
//
// - state_dict: usually an OrderedDict[str, Tensor]. pickle_io.cpp supports a
// restricted subset of this layout because tensor metadata and raw storages
// can be recovered without executing pickle callables.
// - whole module/checkpoint object: arbitrary Python object graph. This may
// require importing user classes and executing pickle GLOBAL/REDUCE rebuild
// logic, so it is intentionally not supported here.
//
// Legacy non-zip PyTorch files are not a single pickle object:
//
// 1. pickle object: PyTorch legacy magic number
// 2. pickle object: legacy protocol version, expected to be 1001
// 3. pickle object: sys_info metadata, ignored by this reader
// 4. pickle object: state_dict metadata, parsed by pickle_io.cpp
// 5. pickle object: serialized storage key list, skipped here
// 6. raw storage data payloads
// - PyTorch writes storages after the pickles, ordered by storage key
// - each storage has an 8-byte legacy storage header followed by raw bytes
static constexpr size_t LEGACY_STORAGE_HEADER_SIZE = 8;
static void set_error(std::string* error, const std::string& message) {
if (error != nullptr) {
*error = message;
}
}
static std::string bytes_to_hex(const std::vector<uint8_t>& bytes) {
static const char* hex = "0123456789ABCDEF";
std::string result;
result.reserve(bytes.size() * 3);
for (size_t i = 0; i < bytes.size(); ++i) {
if (i > 0) {
result.push_back('-');
}
result.push_back(hex[(bytes[i] >> 4) & 0x0F]);
result.push_back(hex[bytes[i] & 0x0F]);
}
return result;
}
static bool is_probably_tar_file(const std::vector<uint8_t>& header) {
return header.size() >= 262 &&
header[257] == 'u' &&
header[258] == 's' &&
header[259] == 't' &&
header[260] == 'a' &&
header[261] == 'r';
}
static std::string torch_legacy_diagnostics(const std::string& file_path, const std::vector<uint8_t>& buffer) {
if (!ends_with(file_path, ".pt") && !ends_with(file_path, ".pth")) {
return "";
}
if (buffer.empty()) {
return "unsupported PyTorch file '" + file_path + "': empty file";
}
size_t short_len = std::min<size_t>(buffer.size(), 32);
std::vector<uint8_t> short_header(buffer.begin(), buffer.begin() + short_len);
const bool raw_pickle = buffer[0] == 0x80;
const bool tar_file = is_probably_tar_file(buffer);
std::string message = "unsupported PyTorch file '" + file_path + "': first bytes " +
bytes_to_hex(short_header) +
", raw_pickle=" + (raw_pickle ? "true" : "false") +
", tar=" + (tar_file ? "true" : "false");
if (raw_pickle) {
message += "; raw pickle did not match the restricted state_dict layouts currently supported";
} else if (tar_file) {
message += "; legacy tar PyTorch checkpoints are not supported yet";
}
return message;
}
bool read_torch_legacy_file(const std::string& file_path,
std::vector<TensorStorage>& tensor_storages,
std::string* error) {
std::ifstream file(file_path, std::ios::binary);
if (!file.is_open()) {
set_error(error, "failed to open '" + file_path + "'");
return false;
}
file.seekg(0, file.end);
size_t file_size = (size_t)file.tellg();
file.seekg(0, file.beg);
if (file_size == 0) {
set_error(error, "empty file '" + file_path + "'");
return false;
}
std::vector<uint8_t> buffer(file_size);
file.read((char*)buffer.data(), file_size);
if (!file) {
set_error(error, "failed to read '" + file_path + "'");
return false;
}
auto finalize_tensor_offsets = [&](size_t storage_data_offset,
const std::unordered_map<std::string, uint64_t>& legacy_storage_map) -> bool {
if (storage_data_offset > file_size) {
return false;
}
std::vector<std::string> storage_keys;
storage_keys.reserve(legacy_storage_map.size());
for (const auto& [storage_key, _] : legacy_storage_map) {
storage_keys.push_back(storage_key);
}
std::sort(storage_keys.begin(), storage_keys.end());
std::unordered_map<std::string, uint64_t> storage_offsets;
uint64_t current_offset = storage_data_offset;
for (const auto& storage_key : storage_keys) {
auto it = legacy_storage_map.find(storage_key);
if (it == legacy_storage_map.end()) {
return false;
}
if (current_offset + LEGACY_STORAGE_HEADER_SIZE + it->second > file_size) {
return false;
}
storage_offsets[storage_key] = current_offset + LEGACY_STORAGE_HEADER_SIZE;
current_offset += LEGACY_STORAGE_HEADER_SIZE + it->second;
}
for (auto& tensor_storage : tensor_storages) {
if (tensor_storage.storage_key.empty()) {
continue;
}
auto it_offset = storage_offsets.find(tensor_storage.storage_key);
auto it_size = legacy_storage_map.find(tensor_storage.storage_key);
if (it_offset == storage_offsets.end() || it_size == legacy_storage_map.end()) {
return false;
}
uint64_t base_offset = it_offset->second;
uint64_t storage_nbytes = it_size->second;
uint64_t tensor_nbytes = tensor_storage.nbytes_to_read();
if (tensor_storage.offset + tensor_nbytes > storage_nbytes) {
return false;
}
tensor_storage.offset = base_offset + tensor_storage.offset;
tensor_storage.storage_key.clear();
}
return true;
};
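// Payload region resolved above (legacy non-zip layout), starting at
// storage_data_offset:
//   [8-byte header][bytes of key A][8-byte header][bytes of key B]...
// Keys are visited in sorted order, and each tensor's final offset is its
// storage's data start plus the tensor's offset within that storage.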
auto parse_state_dict_at = [&](size_t state_dict_offset, size_t state_dict_size, size_t* storage_data_offset) -> bool {
tensor_storages.clear();
std::unordered_map<std::string, uint64_t> legacy_storage_map;
if (!parse_torch_state_dict_pickle(buffer.data() + state_dict_offset,
state_dict_size,
tensor_storages,
legacy_storage_map,
error)) {
return false;
}
size_t offset_after_state_dict = state_dict_offset + state_dict_size;
size_t storage_keys_size = 0;
if (!skip_pickle_object(buffer.data() + offset_after_state_dict,
buffer.size() - offset_after_state_dict,
&storage_keys_size)) {
return false;
}
*storage_data_offset = offset_after_state_dict + storage_keys_size;
return finalize_tensor_offsets(*storage_data_offset, legacy_storage_map);
};
size_t object_size_1 = 0;
size_t offset = 0;
if (skip_pickle_object(buffer.data(), buffer.size(), &object_size_1) &&
pickle_object_is_torch_magic_number(buffer.data(), object_size_1)) {
offset += object_size_1;
size_t object_size_2 = 0;
if (!skip_pickle_object(buffer.data() + offset, buffer.size() - offset, &object_size_2)) {
set_error(error, torch_legacy_diagnostics(file_path, buffer));
return false;
}
uint32_t protocol_version = 0;
if (!parse_pickle_uint32_object(buffer.data() + offset, object_size_2, &protocol_version) || protocol_version != 1001) {
set_error(error, torch_legacy_diagnostics(file_path, buffer));
return false;
}
offset += object_size_2;
size_t object_size_3 = 0;
if (!skip_pickle_object(buffer.data() + offset, buffer.size() - offset, &object_size_3)) {
set_error(error, torch_legacy_diagnostics(file_path, buffer));
return false;
}
offset += object_size_3;
size_t state_dict_size = 0;
if (!skip_pickle_object(buffer.data() + offset, buffer.size() - offset, &state_dict_size)) {
set_error(error, torch_legacy_diagnostics(file_path, buffer));
return false;
}
size_t storage_data_offset = 0;
if (parse_state_dict_at(offset, state_dict_size, &storage_data_offset)) {
return true;
}
if (error != nullptr && error->empty()) {
set_error(error, torch_legacy_diagnostics(file_path, buffer));
}
return false;
}
size_t state_dict_size = 0;
if (skip_pickle_object(buffer.data(), buffer.size(), &state_dict_size)) {
size_t storage_data_offset = 0;
if (parse_state_dict_at(0, state_dict_size, &storage_data_offset)) {
return true;
}
}
if (error != nullptr && error->empty()) {
set_error(error, torch_legacy_diagnostics(file_path, buffer));
}
return false;
}

View File

@ -1,13 +0,0 @@
#ifndef __SD_MODEL_IO_TORCH_LEGACY_IO_H__
#define __SD_MODEL_IO_TORCH_LEGACY_IO_H__
#include <string>
#include <vector>
#include "tensor_storage.h"
bool read_torch_legacy_file(const std::string& file_path,
std::vector<TensorStorage>& tensor_storages,
std::string* error = nullptr);
#endif // __SD_MODEL_IO_TORCH_LEGACY_IO_H__

View File

@ -1,140 +0,0 @@
#include "torch_zip_io.h"
#include <cstdint>
#include <cstdlib>
#include <string>
#include <unordered_map>
#include <vector>
#include "pickle_io.h"
#include "zip.h"
static void set_error(std::string* error, const std::string& message) {
if (error != nullptr) {
*error = message;
}
}
bool is_torch_zip_file(const std::string& file_path) {
zip_t* zip = zip_open(file_path.c_str(), 0, 'r');
if (zip == nullptr) {
return false;
}
zip_close(zip);
return true;
}
static bool find_zip_entry(zip_t* zip, const std::string& entry_name, int* index, uint64_t* size) {
size_t n = zip_entries_total(zip);
for (size_t i = 0; i < n; ++i) {
zip_entry_openbyindex(zip, i);
std::string name = zip_entry_name(zip);
if (name == entry_name) {
*index = (int)i;
*size = zip_entry_size(zip);
zip_entry_close(zip);
return true;
}
zip_entry_close(zip);
}
return false;
}
static bool parse_zip_data_pkl(const uint8_t* buffer,
size_t buffer_size,
zip_t* zip,
const std::string& dir,
std::vector<TensorStorage>& tensor_storages,
std::string* error) {
std::vector<TensorStorage> parsed_tensors;
std::unordered_map<std::string, uint64_t> storage_nbytes;
if (!parse_torch_state_dict_pickle(buffer, buffer_size, parsed_tensors, storage_nbytes, error)) {
if (error != nullptr && error->empty()) {
*error = "failed to parse torch zip pickle metadata";
}
return false;
}
for (auto& tensor_storage : parsed_tensors) {
if (tensor_storage.storage_key.empty()) {
set_error(error, "tensor '" + tensor_storage.name + "' has no storage key");
return false;
}
const std::string entry_name = dir + "data/" + tensor_storage.storage_key;
int zip_index = -1;
uint64_t entry_size = 0;
if (!find_zip_entry(zip, entry_name, &zip_index, &entry_size)) {
set_error(error, "storage entry '" + entry_name + "' was not found");
return false;
}
auto it_storage_size = storage_nbytes.find(tensor_storage.storage_key);
if (it_storage_size != storage_nbytes.end() && entry_size < it_storage_size->second) {
set_error(error, "storage entry '" + entry_name + "' is smaller than pickle metadata");
return false;
}
uint64_t tensor_nbytes = tensor_storage.nbytes_to_read();
if (tensor_storage.offset + tensor_nbytes > entry_size) {
set_error(error, "tensor '" + tensor_storage.name + "' exceeds storage entry '" + entry_name + "'");
return false;
}
tensor_storage.index_in_zip = zip_index;
tensor_storage.storage_key.clear();
tensor_storages.push_back(tensor_storage);
}
return true;
}
bool read_torch_zip_file(const std::string& file_path,
std::vector<TensorStorage>& tensor_storages,
std::string* error) {
zip_t* zip = zip_open(file_path.c_str(), 0, 'r');
if (zip == nullptr) {
set_error(error, "failed to open '" + file_path + "'");
return false;
}
tensor_storages.clear();
bool success = true;
bool found_data_pkl = false;
int n = (int)zip_entries_total(zip);
for (int i = 0; i < n; ++i) {
zip_entry_openbyindex(zip, i);
std::string name = zip_entry_name(zip);
size_t pos = name.find("data.pkl");
if (pos != std::string::npos) {
found_data_pkl = true;
std::string dir = name.substr(0, pos);
void* pkl_data = nullptr;
size_t pkl_size = 0;
zip_entry_read(zip, &pkl_data, &pkl_size);
if (pkl_data == nullptr || pkl_size == 0) {
set_error(error, "failed to read '" + name + "' from '" + file_path + "'");
success = false;
} else if (!parse_zip_data_pkl((const uint8_t*)pkl_data, pkl_size, zip, dir, tensor_storages, error)) {
success = false;
}
free(pkl_data);
}
zip_entry_close(zip);
if (!success) {
break;
}
}
if (success && !found_data_pkl) {
set_error(error, "data.pkl was not found in '" + file_path + "'");
success = false;
}
zip_close(zip);
return success;
}

View File

@ -1,14 +0,0 @@
#ifndef __SD_MODEL_IO_TORCH_ZIP_IO_H__
#define __SD_MODEL_IO_TORCH_ZIP_IO_H__
#include <string>
#include <vector>
#include "tensor_storage.h"
bool is_torch_zip_file(const std::string& file_path);
bool read_torch_zip_file(const std::string& file_path,
std::vector<TensorStorage>& tensor_storages,
std::string* error = nullptr);
#endif // __SD_MODEL_IO_TORCH_ZIP_IO_H__

View File

@ -24,75 +24,6 @@ static inline void preprocessing_set_4d(sd::Tensor<float>& tensor, float value,
tensor.values()[static_cast<size_t>(preprocessing_offset_4d(tensor, i0, i1, i2, i3))] = value;
}
static inline uint8_t preprocessing_float_to_u8(float value) {
if (value <= 0.0f) {
return 0;
}
if (value >= 1.0f) {
return 255;
}
return static_cast<uint8_t>(value * 255.0f + 0.5f);
}
static inline void preprocessing_tensor_frame_to_sd_image(const sd::Tensor<float>& tensor, int frame_index, uint8_t* image_data) {
const auto& shape = tensor.shape();
GGML_ASSERT(shape.size() == 4 || shape.size() == 5);
GGML_ASSERT(image_data != nullptr);
const int width = static_cast<int>(shape[0]);
const int height = static_cast<int>(shape[1]);
const int channel = static_cast<int>(shape[shape.size() == 5 ? 3 : 2]);
const size_t pixels = static_cast<size_t>(width) * static_cast<size_t>(height);
const float* src = tensor.data();
if (shape.size() == 4) {
GGML_ASSERT(frame_index >= 0 && frame_index < shape[3]);
const size_t frame_stride = pixels * static_cast<size_t>(channel);
const float* frame_ptr = src + static_cast<size_t>(frame_index) * frame_stride;
if (channel == 3) {
const float* c0 = frame_ptr;
const float* c1 = frame_ptr + pixels;
const float* c2 = frame_ptr + pixels * 2;
for (size_t i = 0; i < pixels; ++i) {
image_data[i * 3 + 0] = preprocessing_float_to_u8(c0[i]);
image_data[i * 3 + 1] = preprocessing_float_to_u8(c1[i]);
image_data[i * 3 + 2] = preprocessing_float_to_u8(c2[i]);
}
return;
}
for (size_t i = 0; i < pixels; ++i) {
for (int c = 0; c < channel; ++c) {
image_data[i * static_cast<size_t>(channel) + static_cast<size_t>(c)] =
preprocessing_float_to_u8(frame_ptr[i + pixels * static_cast<size_t>(c)]);
}
}
return;
}
GGML_ASSERT(frame_index >= 0 && frame_index < shape[2]);
const size_t channel_stride = pixels * static_cast<size_t>(shape[2]);
const float* frame_ptr = src + static_cast<size_t>(frame_index) * pixels;
if (channel == 3) {
const float* c0 = frame_ptr;
const float* c1 = frame_ptr + channel_stride;
const float* c2 = frame_ptr + channel_stride * 2;
for (size_t i = 0; i < pixels; ++i) {
image_data[i * 3 + 0] = preprocessing_float_to_u8(c0[i]);
image_data[i * 3 + 1] = preprocessing_float_to_u8(c1[i]);
image_data[i * 3 + 2] = preprocessing_float_to_u8(c2[i]);
}
return;
}
for (size_t i = 0; i < pixels; ++i) {
for (int c = 0; c < channel; ++c) {
image_data[i * static_cast<size_t>(channel) + static_cast<size_t>(c)] =
preprocessing_float_to_u8(frame_ptr[i + channel_stride * static_cast<size_t>(c)]);
}
}
}
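// Layout note for the conversion above: tensor frames are channel-planar
// (all of c0, then all of c1, ...), while sd_image_t pixels are interleaved,
// so for an RGB pixel i: image_data[i * 3 + c] = preprocessing_float_to_u8(plane_c[i]).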
static inline sd::Tensor<float> sd_image_to_preprocessing_tensor(sd_image_t image) {
sd::Tensor<float> tensor({static_cast<int64_t>(image.width), static_cast<int64_t>(image.height), static_cast<int64_t>(image.channel), 1});
for (uint32_t y = 0; y < image.height; ++y) {
@ -108,7 +39,20 @@ static inline sd::Tensor<float> sd_image_to_preprocessing_tensor(sd_image_t imag
static inline void preprocessing_tensor_to_sd_image(const sd::Tensor<float>& tensor, uint8_t* image_data) {
GGML_ASSERT(tensor.dim() == 4);
GGML_ASSERT(tensor.shape()[3] == 1);
preprocessing_tensor_frame_to_sd_image(tensor, 0, image_data);
GGML_ASSERT(image_data != nullptr);
int width = static_cast<int>(tensor.shape()[0]);
int height = static_cast<int>(tensor.shape()[1]);
int channel = static_cast<int>(tensor.shape()[2]);
for (int y = 0; y < height; ++y) {
for (int x = 0; x < width; ++x) {
for (int c = 0; c < channel; ++c) {
float value = preprocessing_get_4d(tensor, x, y, c, 0);
value = std::min(1.0f, std::max(0.0f, value));
image_data[(y * width + x) * channel + c] = static_cast<uint8_t>(std::round(value * 255.0f));
}
}
}
}
static inline sd::Tensor<float> gaussian_kernel_tensor(int kernel_size) {

View File

@ -95,7 +95,9 @@ namespace Qwen {
float scale = 1.f / 32.f;
bool force_prec_f32 = false;
#ifdef SD_USE_VULKAN
force_prec_f32 = true;
#endif
// The purpose of the scale here is to prevent NaN issues in certain situations.
// For example when using CUDA but the weights are k-quants (not all prompts).
blocks["to_out.0"] = std::shared_ptr<GGMLBlock>(new Linear(inner_dim, out_dim, out_bias, false, force_prec_f32, scale));
@ -122,10 +124,6 @@ namespace Qwen {
auto to_v = std::dynamic_pointer_cast<Linear>(blocks["to_v"]);
auto to_out_0 = std::dynamic_pointer_cast<Linear>(blocks["to_out.0"]);
if (sd_backend_is(ctx->backend, "Vulkan")) {
to_out_0->set_force_prec_f32(true);
}
auto norm_added_q = std::dynamic_pointer_cast<UnaryBlock>(blocks["norm_added_q"]);
auto norm_added_k = std::dynamic_pointer_cast<UnaryBlock>(blocks["norm_added_k"]);
@ -412,9 +410,6 @@ namespace Qwen {
auto img = img_in->forward(ctx, x);
auto txt = txt_norm->forward(ctx, context);
txt = txt_in->forward(ctx, txt);
sd::ggml_graph_cut::mark_graph_cut(img, "qwen_image.prelude", "img");
sd::ggml_graph_cut::mark_graph_cut(txt, "qwen_image.prelude", "txt");
// sd::ggml_graph_cut::mark_graph_cut(t_emb, "qwen_image.prelude", "t_emb");
for (int i = 0; i < params.num_layers; i++) {
auto block = std::dynamic_pointer_cast<QwenImageTransformerBlock>(blocks["transformer_blocks." + std::to_string(i)]);
@ -422,8 +417,6 @@ namespace Qwen {
auto result = block->forward(ctx, img, txt, t_emb, pe, modulate_index);
img = result.first;
txt = result.second;
sd::ggml_graph_cut::mark_graph_cut(img, "qwen_image.transformer_blocks." + std::to_string(i), "img");
sd::ggml_graph_cut::mark_graph_cut(txt, "qwen_image.transformer_blocks." + std::to_string(i), "txt");
}
if (params.zero_cond_t) {

View File

@ -17,7 +17,6 @@
#include "pmid.hpp"
#include "sample-cache.h"
#include "tae.hpp"
#include "upscaler.h"
#include "vae.hpp"
#include "latent-preview.h"
@ -144,7 +143,6 @@ public:
std::string taesd_path;
sd_tiling_params_t vae_tiling_params = {false, 0, 0, 0.5f, 0, 0};
bool offload_params_to_cpu = false;
float max_vram = 0.f;
bool use_pmid = false;
bool is_using_v_parameterization = false;
@ -173,7 +171,60 @@ public:
}
void init_backend() {
backend = sd_get_default_backend();
#ifdef SD_USE_CUDA
LOG_DEBUG("Using CUDA backend");
backend = ggml_backend_cuda_init(0);
#endif
#ifdef SD_USE_METAL
LOG_DEBUG("Using Metal backend");
backend = ggml_backend_metal_init();
#endif
#ifdef SD_USE_VULKAN
LOG_DEBUG("Using Vulkan backend");
size_t device = 0;
const int device_count = ggml_backend_vk_get_device_count();
if (device_count) {
const char* SD_VK_DEVICE = getenv("SD_VK_DEVICE");
if (SD_VK_DEVICE != nullptr) {
std::string sd_vk_device_str = SD_VK_DEVICE;
try {
device = std::stoull(sd_vk_device_str);
} catch (const std::invalid_argument&) {
LOG_WARN("SD_VK_DEVICE environment variable is not a valid integer (%s). Falling back to device 0.", SD_VK_DEVICE);
device = 0;
} catch (const std::out_of_range&) {
LOG_WARN("SD_VK_DEVICE environment variable value is out of range for `unsigned long long` type (%s). Falling back to device 0.", SD_VK_DEVICE);
device = 0;
}
if (device >= device_count) {
LOG_WARN("Cannot find targeted vulkan device (%llu). Falling back to device 0.", device);
device = 0;
}
}
LOG_INFO("Vulkan: Using device %llu", device);
backend = ggml_backend_vk_init(device);
}
if (!backend) {
LOG_WARN("Failed to initialize Vulkan backend");
}
#endif
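// Device selection sketch for Vulkan builds (command is illustrative):
//   SD_VK_DEVICE=1 ./sd-cli -m model.gguf -p "a cat"
// picks Vulkan device 1; invalid or out-of-range values fall back to device 0.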
#ifdef SD_USE_OPENCL
LOG_DEBUG("Using OpenCL backend");
// ggml_log_set(ggml_log_callback_default, nullptr); // Optional ggml logs
backend = ggml_backend_opencl_init();
if (!backend) {
LOG_WARN("Failed to initialize OpenCL backend");
}
#endif
#ifdef SD_USE_SYCL
LOG_DEBUG("Using SYCL backend");
backend = ggml_backend_sycl_init(0);
#endif
if (!backend) {
LOG_DEBUG("Using CPU backend");
backend = ggml_backend_cpu_init();
}
}
std::shared_ptr<RNG> get_rng(rng_type_t rng_type) {
@ -191,7 +242,6 @@ public:
vae_decode_only = sd_ctx_params->vae_decode_only;
free_params_immediately = sd_ctx_params->free_params_immediately;
offload_params_to_cpu = sd_ctx_params->offload_params_to_cpu;
max_vram = sd_ctx_params->max_vram;
bool use_tae = false;
@ -377,10 +427,6 @@ public:
bool clip_on_cpu = sd_ctx_params->keep_clip_on_cpu;
const size_t max_graph_vram_bytes = max_vram <= 0.f
? 0
: static_cast<size_t>(static_cast<double>(max_vram) * 1024.0 * 1024.0 * 1024.0);
{
clip_backend = backend;
if (clip_on_cpu && !ggml_backend_is_cpu(backend)) {
@ -470,7 +516,6 @@ public:
clip_vision = std::make_shared<FrozenCLIPVisionEmbedder>(backend,
offload_params_to_cpu,
tensor_storage_map);
clip_vision->set_max_graph_vram_bytes(max_graph_vram_bytes);
clip_vision->alloc_params_buffer();
clip_vision->get_param_tensors(tensors);
}
@ -547,11 +592,9 @@ public:
}
}
cond_stage_model->set_max_graph_vram_bytes(max_graph_vram_bytes);
cond_stage_model->alloc_params_buffer();
cond_stage_model->get_param_tensors(tensors);
diffusion_model->set_max_graph_vram_bytes(max_graph_vram_bytes);
diffusion_model->alloc_params_buffer();
diffusion_model->get_param_tensors(tensors);
@ -560,7 +603,6 @@ public:
}
if (high_noise_diffusion_model) {
high_noise_diffusion_model->set_max_graph_vram_bytes(max_graph_vram_bytes);
high_noise_diffusion_model->alloc_params_buffer();
high_noise_diffusion_model->get_param_tensors(tensors);
}
@ -633,19 +675,16 @@ public:
} else if (use_tae && !tae_preview_only) {
LOG_INFO("using TAE for encoding / decoding");
first_stage_model = create_tae();
first_stage_model->set_max_graph_vram_bytes(max_graph_vram_bytes);
first_stage_model->alloc_params_buffer();
first_stage_model->get_param_tensors(tensors, "tae");
} else {
LOG_INFO("using VAE for encoding / decoding");
first_stage_model = create_vae();
first_stage_model->set_max_graph_vram_bytes(max_graph_vram_bytes);
first_stage_model->alloc_params_buffer();
first_stage_model->get_param_tensors(tensors, "first_stage_model");
if (use_tae && tae_preview_only) {
LOG_INFO("using TAE for preview");
preview_vae = create_tae();
preview_vae->set_max_graph_vram_bytes(max_graph_vram_bytes);
preview_vae->alloc_params_buffer();
preview_vae->get_param_tensors(tensors, "tae");
}
@ -1117,14 +1156,9 @@ public:
cond_stage_lora_models.push_back(lora);
}
}
// Only attach the adapter when there are LoRAs targeting the cond_stage model.
// An empty MultiLoraAdapter still routes every linear/conv through
// forward_with_lora() instead of the direct kernel path — slower for no benefit.
if (!cond_stage_lora_models.empty()) {
auto multi_lora_adapter = std::make_shared<MultiLoraAdapter>(cond_stage_lora_models);
cond_stage_model->set_weight_adapter(multi_lora_adapter);
}
}
if (diffusion_model) {
std::vector<std::shared_ptr<LoraModel>> lora_models;
auto lora_state_diff = lora_state;
@ -1154,14 +1188,12 @@ public:
diffusion_lora_models.push_back(lora);
}
}
if (!diffusion_lora_models.empty()) {
auto multi_lora_adapter = std::make_shared<MultiLoraAdapter>(diffusion_lora_models);
diffusion_model->set_weight_adapter(multi_lora_adapter);
if (high_noise_diffusion_model) {
high_noise_diffusion_model->set_weight_adapter(multi_lora_adapter);
}
}
}
if (first_stage_model) {
std::vector<std::shared_ptr<LoraModel>> lora_models;
@ -1192,12 +1224,10 @@ public:
first_stage_lora_models.push_back(lora);
}
}
if (!first_stage_lora_models.empty()) {
auto multi_lora_adapter = std::make_shared<MultiLoraAdapter>(first_stage_lora_models);
first_stage_model->set_weight_adapter(multi_lora_adapter);
}
}
}
void lora_stat() {
if (!cond_stage_lora_models.empty()) {
@ -2083,35 +2113,6 @@ enum lora_apply_mode_t str_to_lora_apply_mode(const char* str) {
return LORA_APPLY_MODE_COUNT;
}
const char* hires_upscaler_to_str[] = {
"None",
"Latent",
"Latent (nearest)",
"Latent (nearest-exact)",
"Latent (antialiased)",
"Latent (bicubic)",
"Latent (bicubic antialiased)",
"Lanczos",
"Nearest",
"Model",
};
const char* sd_hires_upscaler_name(enum sd_hires_upscaler_t upscaler) {
if (upscaler >= SD_HIRES_UPSCALER_NONE && upscaler < SD_HIRES_UPSCALER_COUNT) {
return hires_upscaler_to_str[upscaler];
}
return NONE_STR;
}
enum sd_hires_upscaler_t str_to_sd_hires_upscaler(const char* str) {
for (int i = 0; i < SD_HIRES_UPSCALER_COUNT; i++) {
if (!strcmp(str, hires_upscaler_to_str[i])) {
return (enum sd_hires_upscaler_t)i;
}
}
return SD_HIRES_UPSCALER_COUNT;
}
void sd_cache_params_init(sd_cache_params_t* cache_params) {
*cache_params = {};
cache_params->mode = SD_CACHE_DISABLED;
@ -2140,19 +2141,6 @@ void sd_cache_params_init(sd_cache_params_t* cache_params) {
cache_params->spectrum_stop_percent = 0.9f;
}
void sd_hires_params_init(sd_hires_params_t* hires_params) {
*hires_params = {};
hires_params->enabled = false;
hires_params->upscaler = SD_HIRES_UPSCALER_LATENT;
hires_params->model_path = nullptr;
hires_params->scale = 2.0f;
hires_params->target_width = 0;
hires_params->target_height = 0;
hires_params->steps = 0;
hires_params->denoising_strength = 0.7f;
hires_params->upscale_tile_size = 128;
}
void sd_ctx_params_init(sd_ctx_params_t* sd_ctx_params) {
*sd_ctx_params = {};
sd_ctx_params->vae_decode_only = true;
@ -2164,7 +2152,6 @@ void sd_ctx_params_init(sd_ctx_params_t* sd_ctx_params) {
sd_ctx_params->prediction = PREDICTION_COUNT;
sd_ctx_params->lora_apply_mode = LORA_APPLY_AUTO;
sd_ctx_params->offload_params_to_cpu = false;
sd_ctx_params->max_vram = 0.f;
sd_ctx_params->enable_mmap = false;
sd_ctx_params->keep_clip_on_cpu = false;
sd_ctx_params->keep_control_net_on_cpu = false;
@ -2206,7 +2193,6 @@ char* sd_ctx_params_to_str(const sd_ctx_params_t* sd_ctx_params) {
"sampler_rng_type: %s\n"
"prediction: %s\n"
"offload_params_to_cpu: %s\n"
"max_vram: %.3f\n"
"keep_clip_on_cpu: %s\n"
"keep_control_net_on_cpu: %s\n"
"keep_vae_on_cpu: %s\n"
@ -2239,7 +2225,6 @@ char* sd_ctx_params_to_str(const sd_ctx_params_t* sd_ctx_params) {
sd_rng_type_name(sd_ctx_params->sampler_rng_type),
sd_prediction_name(sd_ctx_params->prediction),
BOOL_STR(sd_ctx_params->offload_params_to_cpu),
sd_ctx_params->max_vram,
BOOL_STR(sd_ctx_params->keep_clip_on_cpu),
BOOL_STR(sd_ctx_params->keep_control_net_on_cpu),
BOOL_STR(sd_ctx_params->keep_vae_on_cpu),
@ -2325,7 +2310,6 @@ void sd_img_gen_params_init(sd_img_gen_params_t* sd_img_gen_params) {
sd_img_gen_params->pm_params = {nullptr, 0, nullptr, 20.f};
sd_img_gen_params->vae_tiling_params = {false, 0, 0, 0.5f, 0.0f, 0.0f};
sd_cache_params_init(&sd_img_gen_params->cache);
sd_hires_params_init(&sd_img_gen_params->hires);
}
char* sd_img_gen_params_to_str(const sd_img_gen_params_t* sd_img_gen_params) {
@ -2352,8 +2336,7 @@ char* sd_img_gen_params_to_str(const sd_img_gen_params_t* sd_img_gen_params) {
"increase_ref_index: %s\n"
"control_strength: %.2f\n"
"photo maker: {style_strength = %.2f, id_images_count = %d, id_embed_path = %s}\n"
"VAE tiling: %s\n"
"hires: {enabled=%s, upscaler=%s, model_path=%s, scale=%.2f, target=%dx%d, steps=%d, denoising_strength=%.2f}\n",
"VAE tiling: %s\n",
SAFE_STR(sd_img_gen_params->prompt),
SAFE_STR(sd_img_gen_params->negative_prompt),
sd_img_gen_params->clip_skip,
@ -2370,15 +2353,7 @@ char* sd_img_gen_params_to_str(const sd_img_gen_params_t* sd_img_gen_params) {
sd_img_gen_params->pm_params.style_strength,
sd_img_gen_params->pm_params.id_images_count,
SAFE_STR(sd_img_gen_params->pm_params.id_embed_path),
BOOL_STR(sd_img_gen_params->vae_tiling_params.enabled),
BOOL_STR(sd_img_gen_params->hires.enabled),
sd_hires_upscaler_name(sd_img_gen_params->hires.upscaler),
SAFE_STR(sd_img_gen_params->hires.model_path),
sd_img_gen_params->hires.scale,
sd_img_gen_params->hires.target_width,
sd_img_gen_params->hires.target_height,
sd_img_gen_params->hires.steps,
sd_img_gen_params->hires.denoising_strength);
BOOL_STR(sd_img_gen_params->vae_tiling_params.enabled));
const char* cache_mode_str = "disabled";
if (sd_img_gen_params->cache.mode == SD_CACHE_EASYCACHE) {
cache_mode_str = "easycache";
@ -2482,10 +2457,8 @@ enum scheduler_t sd_get_default_scheduler(const sd_ctx_t* sd_ctx, enum sample_me
return EXPONENTIAL_SCHEDULER;
}
}
if (sample_method == LCM_SAMPLE_METHOD || sample_method == TCD_SAMPLE_METHOD) {
if (sample_method == LCM_SAMPLE_METHOD) {
return LCM_SCHEDULER;
} else if (sample_method == DDIM_TRAILING_SAMPLE_METHOD) {
return SIMPLE_SCHEDULER;
}
return DISCRETE_SCHEDULER;
}
@ -2559,7 +2532,6 @@ struct GenerationRequest {
sd_guidance_params_t guidance = {};
sd_guidance_params_t high_noise_guidance = {};
sd_pm_params_t pm_params = {};
sd_hires_params_t hires = {};
int frames = -1;
float vace_strength = 1.f;
@ -2581,7 +2553,6 @@ struct GenerationRequest {
auto_resize_ref_image = sd_img_gen_params->auto_resize_ref_image;
guidance = sd_img_gen_params->sample_params.guidance;
pm_params = sd_img_gen_params->pm_params;
hires = sd_img_gen_params->hires;
cache_params = &sd_img_gen_params->cache;
resolve(sd_ctx);
}
@ -2604,76 +2575,26 @@ struct GenerationRequest {
}
void align_generation_request_size() {
align_image_size(&width, &height, "generation request");
}
void align_image_size(int* target_width, int* target_height, const char* label) {
int spatial_multiple = vae_scale_factor * diffusion_model_down_factor;
int width_offset = align_up_offset(*target_width, spatial_multiple);
int height_offset = align_up_offset(*target_height, spatial_multiple);
int width_offset = align_up_offset(width, spatial_multiple);
int height_offset = align_up_offset(height, spatial_multiple);
if (width_offset <= 0 && height_offset <= 0) {
return;
}
int original_width = *target_width;
int original_height = *target_height;
int original_width = width;
int original_height = height;
*target_width += width_offset;
*target_height += height_offset;
LOG_WARN("align %s up %dx%d to %dx%d (multiple=%d)",
label,
width += width_offset;
height += height_offset;
LOG_WARN("align up %dx%d to %dx%d (multiple=%d)",
original_width,
original_height,
*target_width,
*target_height,
width,
height,
spatial_multiple);
}
void resolve_hires() {
if (!hires.enabled) {
return;
}
if (hires.upscaler == SD_HIRES_UPSCALER_NONE) {
hires.enabled = false;
return;
}
if (hires.upscaler < SD_HIRES_UPSCALER_NONE || hires.upscaler >= SD_HIRES_UPSCALER_COUNT) {
LOG_WARN("hires upscaler '%d' is invalid, disabling hires", hires.upscaler);
hires.enabled = false;
return;
}
if (hires.upscaler == SD_HIRES_UPSCALER_MODEL && strlen(SAFE_STR(hires.model_path)) == 0) {
LOG_WARN("hires model upscaler requires a model path, disabling hires");
hires.enabled = false;
return;
}
if (hires.scale <= 0.f && hires.target_width <= 0 && hires.target_height <= 0) {
LOG_WARN("hires scale must be positive when no target size is set, disabling hires");
hires.enabled = false;
return;
}
hires.denoising_strength = std::clamp(hires.denoising_strength, 0.0001f, 1.f);
hires.steps = std::max(0, hires.steps);
if (hires.target_width > 0 && hires.target_height > 0) {
// pass
} else if (hires.target_width > 0) {
hires.target_height = hires.target_width;
} else if (hires.target_height > 0) {
hires.target_width = hires.target_height;
} else {
hires.target_width = static_cast<int>(std::round(width * hires.scale));
hires.target_height = static_cast<int>(std::round(height * hires.scale));
}
if (hires.target_width <= 0 || hires.target_height <= 0) {
LOG_WARN("hires target size is not positive, disabling hires");
hires.enabled = false;
return;
}
align_image_size(&hires.target_width, &hires.target_height, "hires target");
}
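
As an illustration of the fallback ladder above (numbers are hypothetical): with width=832, height=480, scale=2.0 and no explicit target, the request resolves to 1664x960; with only target_width=1024 set, the target becomes a square 1024x1024; and in every case the result is then passed through align_image_size, so a 1000x1000 target with a spatial multiple of 64 is aligned up to 1024x1024.
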
static void resolve_guidance(sd_ctx_t* sd_ctx,
sd_guidance_params_t* guidance,
bool* use_uncond,
@ -2714,7 +2635,6 @@ struct GenerationRequest {
void resolve(sd_ctx_t* sd_ctx) {
align_generation_request_size();
resolve_hires();
seed = resolve_seed(seed);
resolve_guidance(sd_ctx, &guidance, &use_uncond, &use_img_cond);
@ -3205,7 +3125,7 @@ static sd_image_t* decode_image_outputs(sd_ctx_t* sd_ctx,
}
decoded_images.push_back(std::move(image));
int64_t t2 = ggml_time_ms();
LOG_INFO("latent %zu decoded, taking %.2fs", i + 1, (t2 - t1) * 1.0f / 1000);
LOG_INFO("latent %" PRId64 " decoded, taking %.2fs", i + 1, (t2 - t1) * 1.0f / 1000);
}
int64_t t4 = ggml_time_ms();
@ -3227,135 +3147,6 @@ static sd_image_t* decode_image_outputs(sd_ctx_t* sd_ctx,
return result_images;
}
static sd::Tensor<float> upscale_hires_latent(sd_ctx_t* sd_ctx,
const sd::Tensor<float>& latent,
const GenerationRequest& request,
UpscalerGGML* upscaler) {
auto get_hires_latent_target_shape = [&]() {
std::vector<int64_t> target_shape = latent.shape();
if (target_shape.size() < 2) {
target_shape.clear();
return target_shape;
}
target_shape[0] = request.hires.target_width / request.vae_scale_factor;
target_shape[1] = request.hires.target_height / request.vae_scale_factor;
return target_shape;
};
if (request.hires.upscaler == SD_HIRES_UPSCALER_LATENT ||
request.hires.upscaler == SD_HIRES_UPSCALER_LATENT_NEAREST ||
request.hires.upscaler == SD_HIRES_UPSCALER_LATENT_NEAREST_EXACT ||
request.hires.upscaler == SD_HIRES_UPSCALER_LATENT_ANTIALIASED ||
request.hires.upscaler == SD_HIRES_UPSCALER_LATENT_BICUBIC ||
request.hires.upscaler == SD_HIRES_UPSCALER_LATENT_BICUBIC_ANTIALIASED) {
std::vector<int64_t> target_shape = get_hires_latent_target_shape();
if (target_shape.empty()) {
LOG_ERROR("latent has invalid shape for hires upscale");
return {};
}
sd::ops::InterpolateMode mode = sd::ops::InterpolateMode::Nearest;
bool antialias = false;
switch (request.hires.upscaler) {
case SD_HIRES_UPSCALER_LATENT:
mode = sd::ops::InterpolateMode::Bilinear;
break;
case SD_HIRES_UPSCALER_LATENT_NEAREST:
mode = sd::ops::InterpolateMode::Nearest;
break;
case SD_HIRES_UPSCALER_LATENT_NEAREST_EXACT:
mode = sd::ops::InterpolateMode::NearestExact;
break;
case SD_HIRES_UPSCALER_LATENT_ANTIALIASED:
mode = sd::ops::InterpolateMode::Bilinear;
antialias = true;
break;
case SD_HIRES_UPSCALER_LATENT_BICUBIC:
mode = sd::ops::InterpolateMode::Bicubic;
break;
case SD_HIRES_UPSCALER_LATENT_BICUBIC_ANTIALIASED:
mode = sd::ops::InterpolateMode::Bicubic;
antialias = true;
break;
default:
break;
}
LOG_INFO("hires %s upscale %" PRId64 "x%" PRId64 " -> %" PRId64 "x%" PRId64,
sd_hires_upscaler_name(request.hires.upscaler),
latent.shape()[0],
latent.shape()[1],
target_shape[0],
target_shape[1]);
return sd::ops::interpolate(latent, target_shape, mode, false, antialias);
} else if (request.hires.upscaler == SD_HIRES_UPSCALER_MODEL ||
request.hires.upscaler == SD_HIRES_UPSCALER_LANCZOS ||
request.hires.upscaler == SD_HIRES_UPSCALER_NEAREST) {
if (sd_ctx->sd->vae_decode_only) {
LOG_ERROR("hires %s upscaler requires VAE encoder weights; create the context with vae_decode_only=false",
sd_hires_upscaler_name(request.hires.upscaler));
return {};
}
if (request.hires.upscaler == SD_HIRES_UPSCALER_MODEL && upscaler == nullptr) {
LOG_ERROR("hires model upscaler context is null");
return {};
}
sd::Tensor<float> decoded = sd_ctx->sd->decode_first_stage(latent);
if (decoded.empty()) {
LOG_ERROR("decode_first_stage failed before hires %s upscale",
sd_hires_upscaler_name(request.hires.upscaler));
return {};
}
sd::Tensor<float> upscaled_tensor;
if (request.hires.upscaler == SD_HIRES_UPSCALER_MODEL) {
upscaled_tensor = upscaler->upscale_tensor(decoded);
if (upscaled_tensor.empty()) {
LOG_ERROR("hires model upscale failed");
return {};
}
if (upscaled_tensor.shape()[0] != request.hires.target_width ||
upscaled_tensor.shape()[1] != request.hires.target_height) {
upscaled_tensor = sd::ops::interpolate(upscaled_tensor,
{request.hires.target_width,
request.hires.target_height,
upscaled_tensor.shape()[2],
upscaled_tensor.shape()[3]});
}
} else {
sd::ops::InterpolateMode mode = request.hires.upscaler == SD_HIRES_UPSCALER_LANCZOS
? sd::ops::InterpolateMode::Lanczos
: sd::ops::InterpolateMode::Nearest;
LOG_INFO("hires %s image upscale %" PRId64 "x%" PRId64 " -> %dx%d",
sd_hires_upscaler_name(request.hires.upscaler),
decoded.shape()[0],
decoded.shape()[1],
request.hires.target_width,
request.hires.target_height);
upscaled_tensor = sd::ops::interpolate(decoded,
{request.hires.target_width,
request.hires.target_height,
decoded.shape()[2],
decoded.shape()[3]},
mode);
upscaled_tensor = sd::ops::clamp(upscaled_tensor, 0.0f, 1.0f);
}
sd::Tensor<float> upscaled_latent = sd_ctx->sd->encode_first_stage(upscaled_tensor);
if (upscaled_latent.empty()) {
LOG_ERROR("encode_first_stage failed after hires %s upscale",
sd_hires_upscaler_name(request.hires.upscaler));
}
return upscaled_latent;
}
LOG_ERROR("unsupported hires upscaler '%s'", sd_hires_upscaler_name(request.hires.upscaler));
return {};
}
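
A minimal sketch of the latent-path resize performed by upscale_hires_latent above, assuming a VAE scale factor of 8 (the shape layout and the sd::ops::interpolate call mirror the surrounding code; the concrete numbers are illustrative only):

```cpp
// Hypothetical illustration: map the image-space hires target into latent
// space, then resize with the bilinear "latent" upscaler as the code above does.
std::vector<int64_t> target_shape = latent.shape();        // {W, H, C, N} per this file
target_shape[0] = hires.target_width / vae_scale_factor;   // e.g. 1024 / 8 = 128
target_shape[1] = hires.target_height / vae_scale_factor;  // e.g. 1024 / 8 = 128
auto upscaled = sd::ops::interpolate(latent, target_shape,
                                     sd::ops::InterpolateMode::Bilinear,
                                     /*align_corners=*/false,
                                     /*antialias=*/false);
```
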
SD_API sd_image_t* generate_image(sd_ctx_t* sd_ctx, const sd_img_gen_params_t* sd_img_gen_params) {
if (sd_ctx == nullptr || sd_img_gen_params == nullptr) {
return nullptr;
@ -3443,143 +3234,14 @@ SD_API sd_image_t* generate_image(sd_ctx_t* sd_ctx, const sd_img_gen_params_t* s
}
return nullptr;
}
if (sd_ctx->sd->free_params_immediately && !request.hires.enabled) {
if (sd_ctx->sd->free_params_immediately) {
sd_ctx->sd->diffusion_model->free_params_buffer();
}
int64_t denoise_end = ggml_time_ms();
LOG_INFO("generating %zu latent images completed, taking %.2fs",
LOG_INFO("generating %" PRId64 " latent images completed, taking %.2fs",
final_latents.size(),
(denoise_end - denoise_start) * 1.0f / 1000);
if (request.hires.enabled && request.hires.target_width > 0) {
LOG_INFO("hires fix: upscaling to %dx%d", request.hires.target_width, request.hires.target_height);
std::unique_ptr<UpscalerGGML> hires_upscaler;
if (request.hires.upscaler == SD_HIRES_UPSCALER_MODEL) {
LOG_INFO("hires fix: loading model upscaler from '%s'", request.hires.model_path);
hires_upscaler = std::make_unique<UpscalerGGML>(sd_ctx->sd->n_threads,
false,
request.hires.upscale_tile_size);
const size_t max_graph_vram_bytes = sd_ctx->sd->max_vram <= 0.f
? 0
: static_cast<size_t>(static_cast<double>(sd_ctx->sd->max_vram) * 1024.0 * 1024.0 * 1024.0);
hires_upscaler->set_max_graph_vram_bytes(max_graph_vram_bytes);
if (!hires_upscaler->load_from_file(request.hires.model_path,
sd_ctx->sd->offload_params_to_cpu,
sd_ctx->sd->n_threads)) {
LOG_ERROR("load hires model upscaler failed");
if (sd_ctx->sd->free_params_immediately) {
sd_ctx->sd->diffusion_model->free_params_buffer();
}
return nullptr;
}
}
int hires_steps = request.hires.steps > 0 ? request.hires.steps : plan.sample_steps;
// sd-webui behavior: scale up total steps so trimming by denoising_strength yields exactly hires_steps effective steps,
// unlike img2img which trims from a fixed step count
hires_steps = static_cast<int>(hires_steps / request.hires.denoising_strength);
std::vector<float> hires_sigmas = sd_ctx->sd->denoiser->get_sigmas(
hires_steps,
sd_ctx->sd->get_image_seq_len(request.hires.target_height, request.hires.target_width),
sd_img_gen_params->sample_params.scheduler,
sd_ctx->sd->version);
size_t t_enc = static_cast<size_t>(hires_steps * request.hires.denoising_strength);
if (t_enc >= static_cast<size_t>(hires_steps)) {
t_enc = static_cast<size_t>(hires_steps) - 1;
}
std::vector<float> hires_sigma_sched(hires_sigmas.begin() + hires_steps - static_cast<int>(t_enc) - 1,
hires_sigmas.end());
LOG_INFO("hires fix: %d steps, denoising_strength=%.2f, sigma_sched_size=%zu",
hires_steps,
request.hires.denoising_strength,
hires_sigma_sched.size());
std::vector<sd::Tensor<float>> hires_final_latents;
int64_t hires_denoise_start = ggml_time_ms();
for (int b = 0; b < (int)final_latents.size(); b++) {
int64_t cur_seed = request.seed + b;
sd_ctx->sd->rng->manual_seed(cur_seed);
sd_ctx->sd->sampler_rng->manual_seed(cur_seed);
sd::Tensor<float> upscaled = upscale_hires_latent(sd_ctx,
final_latents[b],
request,
hires_upscaler.get());
if (upscaled.empty()) {
if (sd_ctx->sd->free_params_immediately) {
sd_ctx->sd->diffusion_model->free_params_buffer();
}
return nullptr;
}
sd::Tensor<float> noise = sd::randn_like<float>(upscaled, sd_ctx->sd->rng);
sd::Tensor<float> hires_denoise_mask;
if (!latents.denoise_mask.empty()) {
std::vector<int64_t> mask_shape = latents.denoise_mask.shape();
mask_shape[0] = upscaled.shape()[0];
mask_shape[1] = upscaled.shape()[1];
hires_denoise_mask = sd::ops::interpolate(latents.denoise_mask,
mask_shape,
sd::ops::InterpolateMode::NearestMax);
}
int64_t hires_sample_start = ggml_time_ms();
sd::Tensor<float> x_0 = sd_ctx->sd->sample(sd_ctx->sd->diffusion_model,
true,
upscaled,
std::move(noise),
embeds.cond,
embeds.uncond,
embeds.img_cond,
embeds.id_cond,
latents.control_image,
request.control_strength,
request.guidance,
plan.eta,
request.shifted_timestep,
plan.sample_method,
sd_ctx->sd->is_flow_denoiser(),
hires_sigma_sched,
plan.start_merge_step,
latents.ref_latents,
request.increase_ref_index,
hires_denoise_mask,
sd::Tensor<float>(),
1.f,
request.cache_params);
int64_t hires_sample_end = ggml_time_ms();
if (!x_0.empty()) {
LOG_INFO("hires sampling %d/%d completed, taking %.2fs",
b + 1,
(int)final_latents.size(),
(hires_sample_end - hires_sample_start) * 1.0f / 1000);
hires_final_latents.push_back(std::move(x_0));
continue;
}
LOG_ERROR("hires sampling for image %d/%d failed after %.2fs",
b + 1,
(int)final_latents.size(),
(hires_sample_end - hires_sample_start) * 1.0f / 1000);
if (sd_ctx->sd->free_params_immediately) {
sd_ctx->sd->diffusion_model->free_params_buffer();
}
return nullptr;
}
if (sd_ctx->sd->free_params_immediately) {
sd_ctx->sd->diffusion_model->free_params_buffer();
}
int64_t hires_denoise_end = ggml_time_ms();
LOG_INFO("hires fix completed, taking %.2fs", (hires_denoise_end - hires_denoise_start) * 1.0f / 1000);
final_latents = std::move(hires_final_latents);
}
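
To make the step arithmetic in the hires block concrete (a worked example, not taken from the source): with plan.sample_steps = 20 and denoising_strength = 0.7, hires_steps becomes int(20 / 0.7) = 28 and t_enc = int(28 * 0.7) = 19. Assuming get_sigmas returns hires_steps + 1 sigmas (including the trailing zero), the slice keeps sigmas[8..28], i.e. 21 values and therefore exactly 20 effective denoising steps, which is what the sd-webui-style rescaling is meant to guarantee.
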
auto result = decode_image_outputs(sd_ctx, request, final_latents);
if (result == nullptr) {
return nullptr;


@ -251,8 +251,7 @@ public:
ggml_tensor* x,
ggml_tensor* past_bias = nullptr,
ggml_tensor* attention_mask = nullptr,
ggml_tensor* relative_position_bucket = nullptr,
const std::string& graph_cut_prefix = "") {
ggml_tensor* relative_position_bucket = nullptr) {
// x: [N, n_token, model_dim]
for (int i = 0; i < num_layers; i++) {
auto block = std::dynamic_pointer_cast<T5Block>(blocks["block." + std::to_string(i)]);
@ -260,9 +259,6 @@ public:
auto ret = block->forward(ctx, x, past_bias, attention_mask, relative_position_bucket);
x = ret.first;
past_bias = ret.second;
if (!graph_cut_prefix.empty()) {
sd::ggml_graph_cut::mark_graph_cut(x, graph_cut_prefix + ".block." + std::to_string(i), "x");
}
}
auto final_layer_norm = std::dynamic_pointer_cast<T5LayerNorm>(blocks["final_layer_norm"]);
@ -309,8 +305,7 @@ public:
auto encoder = std::dynamic_pointer_cast<T5Stack>(blocks["encoder"]);
auto x = shared->forward(ctx, input_ids);
sd::ggml_graph_cut::mark_graph_cut(x, "t5.prelude", "x");
x = encoder->forward(ctx, x, past_bias, attention_mask, relative_position_bucket, "t5");
x = encoder->forward(ctx, x, past_bias, attention_mask, relative_position_bucket);
return x;
}
};


@ -815,202 +815,11 @@ namespace sd {
namespace ops {
enum class InterpolateMode {
Nearest,
NearestExact,
NearestMax,
NearestMin,
NearestAvg,
Bilinear,
Bicubic,
Lanczos,
};
inline bool is_nearest_like_interpolate_mode(InterpolateMode mode) {
return mode == InterpolateMode::Nearest ||
mode == InterpolateMode::NearestExact ||
mode == InterpolateMode::NearestMax ||
mode == InterpolateMode::NearestMin ||
mode == InterpolateMode::NearestAvg;
}
inline bool is_2d_filter_interpolate_mode(InterpolateMode mode) {
return mode == InterpolateMode::Bilinear ||
mode == InterpolateMode::Bicubic ||
mode == InterpolateMode::Lanczos;
}
inline int64_t nearest_exact_interpolate_index(int64_t output_index,
int64_t input_size,
int64_t output_size) {
const double scale = static_cast<double>(input_size) / static_cast<double>(output_size);
const double center = (static_cast<double>(output_index) + 0.5) * scale - 0.5;
return std::min(std::max<int64_t>(static_cast<int64_t>(std::floor(center + 0.5)), 0), input_size - 1);
}
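
Because the -0.5 half-pixel shift and the +0.5 rounding offset cancel, the index above simplifies to floor((out + 0.5) * in / out_size). A quick check with in = 2 and out_size = 5: nearest-exact yields indices 0, 0, 1, 1, 1, while the plain Nearest mapping floor(out * in / out_size) yields 0, 0, 0, 1, 1; the half-pixel shift centers the samples instead of biasing them toward index 0, matching PyTorch's nearest-exact mode.
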
inline double linear_interpolate_weight(double x) {
x = std::abs(x);
return x < 1.0 ? 1.0 - x : 0.0;
}
inline double cubic_interpolate_weight(double x) {
constexpr double a = -0.75; // Match PyTorch bicubic interpolation.
x = std::abs(x);
if (x <= 1.0) {
return ((a + 2.0) * x - (a + 3.0)) * x * x + 1.0;
}
if (x < 2.0) {
return ((a * x - 5.0 * a) * x + 8.0 * a) * x - 4.0 * a;
}
return 0.0;
}
inline double sinc(double x) {
constexpr double pi = 3.14159265358979323846;
if (std::abs(x) < 1e-12) {
return 1.0;
}
const double pix = pi * x;
return std::sin(pix) / pix;
}
inline double lanczos_interpolate_weight(double x) {
constexpr double radius = 3.0;
x = std::abs(x);
if (x >= radius) {
return 0.0;
}
return sinc(x) * sinc(x / radius);
}
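
In the same notation, the two filter kernels above are: the Keys bicubic with a = -0.75, w(x) = (a+2)|x|^3 - (a+3)|x|^2 + 1 for |x| <= 1 and w(x) = a(|x|^3 - 5|x|^2 + 8|x| - 4) for 1 < |x| < 2; and Lanczos-3, w(x) = sinc(x) * sinc(x/3) for |x| < 3, with sinc(x) = sin(pi*x)/(pi*x). The Lanczos kernel does not sum exactly to 1, which is why the contributor weights for it are always renormalized below.
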
struct InterpolateContributor {
int64_t index;
double weight;
};
inline std::vector<std::vector<InterpolateContributor>> make_interpolate_contributors(
int64_t input_size,
int64_t output_size,
InterpolateMode mode,
bool antialias) {
std::vector<std::vector<InterpolateContributor>> contributors(static_cast<size_t>(output_size));
const double scale = static_cast<double>(input_size) / static_cast<double>(output_size);
const double filter_scale = antialias ? std::max(1.0, scale) : 1.0;
for (int64_t out = 0; out < output_size; ++out) {
const double center = (static_cast<double>(out) + 0.5) * scale - 0.5;
int64_t start = 0;
int64_t end = 0;
if (mode == InterpolateMode::Bilinear) {
const double support = filter_scale;
start = static_cast<int64_t>(std::ceil(center - support));
end = static_cast<int64_t>(std::floor(center + support));
} else if (mode == InterpolateMode::Bicubic) {
const double support = 2.0 * filter_scale;
start = static_cast<int64_t>(std::ceil(center - support));
end = static_cast<int64_t>(std::floor(center + support));
} else if (mode == InterpolateMode::Lanczos) {
const double support = 3.0 * filter_scale;
start = static_cast<int64_t>(std::ceil(center - support));
end = static_cast<int64_t>(std::floor(center + support));
} else {
tensor_throw_invalid_argument("Unsupported 2D filter interpolate mode: mode=" +
std::to_string(static_cast<int>(mode)));
}
double weight_sum = 0.0;
std::vector<InterpolateContributor>& axis_contributors = contributors[static_cast<size_t>(out)];
axis_contributors.reserve(static_cast<size_t>(end - start + 1));
for (int64_t in = start; in <= end; ++in) {
double weight = 0.0;
if (mode == InterpolateMode::Bilinear) {
weight = linear_interpolate_weight((center - static_cast<double>(in)) / filter_scale);
} else if (mode == InterpolateMode::Bicubic) {
weight = cubic_interpolate_weight((center - static_cast<double>(in)) / filter_scale);
} else {
weight = lanczos_interpolate_weight((center - static_cast<double>(in)) / filter_scale);
}
if (weight == 0.0) {
continue;
}
const int64_t clamped_index = std::min(std::max<int64_t>(in, 0), input_size - 1);
axis_contributors.push_back({clamped_index, weight});
weight_sum += weight;
}
if ((antialias || mode == InterpolateMode::Lanczos) &&
std::abs(weight_sum) > 1e-12) {
for (auto& contributor : axis_contributors) {
contributor.weight /= weight_sum;
}
}
if (axis_contributors.empty()) {
const int64_t nearest = std::min(
std::max<int64_t>(static_cast<int64_t>(std::floor(center + 0.5)), 0),
input_size - 1);
axis_contributors.push_back({nearest, 1.0});
}
}
return contributors;
}
template <typename T>
inline Tensor<T> interpolate_2d_filter(const Tensor<T>& input,
const std::vector<int64_t>& output_shape,
InterpolateMode mode,
bool antialias) {
if (input.dim() < 2) {
tensor_throw_invalid_argument("2D filter interpolate requires rank >= 2: input_shape=" +
tensor_shape_to_string(input.shape()) + ", output_shape=" +
tensor_shape_to_string(output_shape));
}
for (size_t i = 2; i < output_shape.size(); ++i) {
if (input.shape()[i] != output_shape[i]) {
tensor_throw_invalid_argument("2D filter interpolate only supports resizing dimensions 0 and 1: input_shape=" +
tensor_shape_to_string(input.shape()) + ", output_shape=" +
tensor_shape_to_string(output_shape));
}
}
Tensor<T> output(output_shape);
const int64_t input_width = input.shape()[0];
const int64_t input_height = input.shape()[1];
const int64_t output_width = output_shape[0];
const int64_t output_height = output_shape[1];
const int64_t input_plane = input_width * input_height;
const int64_t output_plane = output_width * output_height;
const int64_t plane_count = input.numel() / input_plane;
auto x_contributors = make_interpolate_contributors(input_width, output_width, mode, antialias);
auto y_contributors = make_interpolate_contributors(input_height, output_height, mode, antialias);
for (int64_t plane = 0; plane < plane_count; ++plane) {
const int64_t input_plane_offset = plane * input_plane;
const int64_t output_plane_offset = plane * output_plane;
for (int64_t y = 0; y < output_height; ++y) {
const auto& y_axis = y_contributors[static_cast<size_t>(y)];
for (int64_t x = 0; x < output_width; ++x) {
const auto& x_axis = x_contributors[static_cast<size_t>(x)];
double value = 0.0;
for (const auto& yc : y_axis) {
const int64_t input_row_offset = input_plane_offset + yc.index * input_width;
for (const auto& xc : x_axis) {
value += static_cast<double>(input.data()[input_row_offset + xc.index]) *
xc.weight * yc.weight;
}
}
output.data()[output_plane_offset + y * output_width + x] = static_cast<T>(value);
}
}
}
return output;
}
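
A hedged usage sketch of the separable filter above (shapes and contents are made up for illustration): contributors are precomputed once per axis, so each output pixel is a weighted sum over a small row/column window rather than a full 2D convolution.

```cpp
// Hypothetical example: downscale a 512x512 RGB tensor to 128x128 with
// antialiased bicubic filtering (width/height-first shape convention, as in this file).
sd::Tensor<float> img({512, 512, 3, 1});
// ... fill img ...
sd::Tensor<float> small = sd::ops::interpolate(
    img, {128, 128, 3, 1},
    sd::ops::InterpolateMode::Bicubic,
    /*align_corners=*/false,
    /*antialias=*/true);  // widens kernel support by max(1, in/out)
```
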
inline int64_t normalize_slice_bound(int64_t index, int64_t dim_size) {
if (index < 0) {
index += dim_size;
@ -1205,20 +1014,17 @@ namespace sd {
inline Tensor<T> interpolate(const Tensor<T>& input,
std::vector<int64_t> output_shape,
InterpolateMode mode = InterpolateMode::Nearest,
bool align_corners = false,
bool antialias = false) {
const bool is_nearest_like_mode = is_nearest_like_interpolate_mode(mode);
const bool is_2d_filter_mode = is_2d_filter_interpolate_mode(mode);
if (!is_nearest_like_mode && !is_2d_filter_mode) {
tensor_throw_invalid_argument("Unsupported interpolate mode: mode=" +
std::to_string(static_cast<int>(mode)));
}
if (antialias && !is_2d_filter_mode) {
tensor_throw_invalid_argument("Tensor interpolate antialias requires a 2D filter mode: mode=" +
bool align_corners = false) {
const bool is_nearest_like_mode = (mode == InterpolateMode::Nearest ||
mode == InterpolateMode::NearestMax ||
mode == InterpolateMode::NearestMin ||
mode == InterpolateMode::NearestAvg);
if (!is_nearest_like_mode) {
tensor_throw_invalid_argument("Only nearest-like interpolate modes are implemented, got mode=" +
std::to_string(static_cast<int>(mode)));
}
if (align_corners) {
tensor_throw_invalid_argument("align_corners is not supported for tensor interpolate: input_shape=" +
tensor_throw_invalid_argument("align_corners is not supported for nearest-like interpolate: input_shape=" +
tensor_shape_to_string(input.shape()) + ", output_shape=" +
tensor_shape_to_string(output_shape));
}
@ -1245,10 +1051,6 @@ namespace sd {
}
}
if (is_2d_filter_mode) {
return interpolate_2d_filter(input, output_shape, mode, antialias);
}
bool has_downsampling = false;
for (int64_t i = 0; i < input.dim(); ++i) {
if (input.shape()[i] > output_shape[i]) {
@ -1258,21 +1060,13 @@ namespace sd {
}
Tensor<T> output(std::move(output_shape));
if (mode == InterpolateMode::Nearest ||
mode == InterpolateMode::NearestExact ||
!has_downsampling) {
if (mode == InterpolateMode::Nearest || !has_downsampling) {
for (int64_t flat = 0; flat < output.numel(); ++flat) {
std::vector<int64_t> output_coord = tensor_unravel_index(flat, output.shape());
std::vector<int64_t> input_coord(static_cast<size_t>(input.dim()), 0);
for (size_t i = 0; i < static_cast<size_t>(input.dim()); ++i) {
if (mode == InterpolateMode::NearestExact) {
input_coord[i] = nearest_exact_interpolate_index(output_coord[i],
input.shape()[i],
output.shape()[i]);
} else {
input_coord[i] = output_coord[i] * input.shape()[i] / output.shape()[i];
}
}
output[flat] = input.index(input_coord);
}
@ -1289,12 +1083,6 @@ namespace sd {
return T(0);
case InterpolateMode::Nearest:
return T(0);
case InterpolateMode::NearestExact:
return T(0);
case InterpolateMode::Bilinear:
case InterpolateMode::Bicubic:
case InterpolateMode::Lanczos:
break;
}
tensor_throw_invalid_argument("Unsupported interpolate mode: mode=" +
@ -1314,12 +1102,6 @@ namespace sd {
break;
case InterpolateMode::Nearest:
break;
case InterpolateMode::NearestExact:
break;
case InterpolateMode::Bilinear:
case InterpolateMode::Bicubic:
case InterpolateMode::Lanczos:
break;
}
};
@ -1375,20 +1157,17 @@ namespace sd {
const std::optional<std::vector<int64_t>>& size,
const std::optional<std::vector<double>>& scale_factor,
InterpolateMode mode = InterpolateMode::Nearest,
bool align_corners = false,
bool antialias = false) {
const bool is_nearest_like_mode = is_nearest_like_interpolate_mode(mode);
const bool is_2d_filter_mode = is_2d_filter_interpolate_mode(mode);
if (!is_nearest_like_mode && !is_2d_filter_mode) {
tensor_throw_invalid_argument("Unsupported interpolate mode: mode=" +
std::to_string(static_cast<int>(mode)));
}
if (antialias && !is_2d_filter_mode) {
tensor_throw_invalid_argument("Tensor interpolate antialias requires a 2D filter mode: mode=" +
bool align_corners = false) {
const bool is_nearest_like_mode = (mode == InterpolateMode::Nearest ||
mode == InterpolateMode::NearestMax ||
mode == InterpolateMode::NearestMin ||
mode == InterpolateMode::NearestAvg);
if (!is_nearest_like_mode) {
tensor_throw_invalid_argument("Only nearest-like interpolate modes are implemented, got mode=" +
std::to_string(static_cast<int>(mode)));
}
if (align_corners) {
tensor_throw_invalid_argument("align_corners is not supported for tensor interpolate: input_shape=" +
tensor_throw_invalid_argument("align_corners is not supported for nearest-like interpolate: input_shape=" +
tensor_shape_to_string(input.shape()));
}
if (size.has_value() == scale_factor.has_value()) {
@ -1432,7 +1211,7 @@ namespace sd {
}
}
return interpolate(input, std::move(output_shape), mode, align_corners, antialias);
return interpolate(input, std::move(output_shape), mode, align_corners);
}
template <typename T>
@ -1440,14 +1219,12 @@ namespace sd {
const std::optional<std::vector<int64_t>>& size,
double scale_factor,
InterpolateMode mode = InterpolateMode::Nearest,
bool align_corners = false,
bool antialias = false) {
bool align_corners = false) {
return interpolate(input,
size,
std::vector<double>(size.has_value() ? size->size() : input.dim(), scale_factor),
mode,
align_corners,
antialias);
align_corners);
}
template <typename T>


@ -62,7 +62,7 @@ void CLIPTokenizer::load_from_merges(const std::string& merges_utf8_str) {
}
vocab.push_back(utf8_to_utf32("<|startoftext|>"));
vocab.push_back(utf8_to_utf32("<|endoftext|>"));
LOG_DEBUG("vocab size: %zu", vocab.size());
LOG_DEBUG("vocab size: %llu", vocab.size());
int i = 0;
for (const auto& token : vocab) {
encoder[token] = i;


@ -28,7 +28,7 @@ void MistralTokenizer::load_from_merges(const std::string& merges_utf8_str, cons
byte_decoder[pair.second] = pair.first;
}
std::vector<std::u32string> merges = split_utf32(merges_utf8_str);
LOG_DEBUG("merges size %zu", merges.size());
LOG_DEBUG("merges size %llu", merges.size());
std::vector<std::pair<std::u32string, std::u32string>> merge_pairs;
for (const auto& merge : merges) {
size_t space_pos = merge.find(' ');


@ -11,7 +11,7 @@ void Qwen2Tokenizer::load_from_merges(const std::string& merges_utf8_str) {
}
std::vector<std::u32string> merges = split_utf32(merges_utf8_str);
LOG_DEBUG("merges size %zu", merges.size());
LOG_DEBUG("merges size %llu", merges.size());
std::vector<std::pair<std::u32string, std::u32string>> merge_pairs;
for (const auto& merge : merges) {
size_t space_pos = merge.find(' ');

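One portability caveat about the %zu/%llu specifiers used in these tokenizer logs: vocab.size() and merges.size() return size_t, for which %zu is the standard specifier, while %llu expects unsigned long long, a type identical in width to size_t on 64-bit Windows but formally mismatched on LP64 platforms. Where %llu is required (for example by an older MSVC runtime), the conventional fix is an explicit cast, as in this hedged sketch:

```cpp
// The cast makes the %llu specifier well-defined on every platform.
LOG_DEBUG("merges size %llu", (unsigned long long)merges.size());
```
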

@ -482,14 +482,12 @@ public:
emb = ggml_add(ctx->ggml_ctx, emb, label_emb); // [N, time_embed_dim]
}
// sd::ggml_graph_cut::mark_graph_cut(emb, "unet.prelude", "emb");
// input_blocks
std::vector<ggml_tensor*> hs;
// input block 0
auto h = input_blocks_0_0->forward(ctx, x);
sd::ggml_graph_cut::mark_graph_cut(h, "unet.input_blocks.0", "h");
ggml_set_name(h, "bench-start");
hs.push_back(h);
@ -507,7 +505,6 @@ public:
std::string name = "input_blocks." + std::to_string(input_block_idx) + ".1";
h = attention_layer_forward(name, ctx, h, context, num_video_frames); // [N, mult*model_channels, h, w]
}
sd::ggml_graph_cut::mark_graph_cut(h, "unet.input_blocks." + std::to_string(input_block_idx), "h");
hs.push_back(h);
}
if (tiny_unet) {
@ -521,7 +518,6 @@ public:
auto block = std::dynamic_pointer_cast<DownSampleBlock>(blocks[name]);
h = block->forward(ctx, h); // [N, mult*model_channels, h/(2^(i+1)), w/(2^(i+1))]
// sd::ggml_graph_cut::mark_graph_cut(h, "unet.input_blocks." + std::to_string(input_block_idx), "h");
hs.push_back(h);
}
}
@ -535,7 +531,6 @@ public:
h = resblock_forward("middle_block.2", ctx, h, emb, num_video_frames); // [N, 4*model_channels, h/8, w/8]
}
}
sd::ggml_graph_cut::mark_graph_cut(h, "unet.middle_block", "h");
if (controls.size() > 0) {
auto cs = ggml_ext_scale(ctx->ggml_ctx, controls[controls.size() - 1], control_strength, true);
h = ggml_add(ctx->ggml_ctx, h, cs); // middle control
@ -586,7 +581,6 @@ public:
}
output_block_idx += 1;
sd::ggml_graph_cut::mark_graph_cut(h, "unet.output_blocks." + std::to_string(output_block_idx - 1), "h");
}
}


@ -1,31 +1,50 @@
#include "upscaler.h"
#include "esrgan.hpp"
#include "ggml_extend.hpp"
#include "model.h"
#include "stable-diffusion.h"
#include "util.h"
UpscalerGGML::UpscalerGGML(int n_threads,
bool direct,
int tile_size)
struct UpscalerGGML {
ggml_backend_t backend = nullptr; // general backend
ggml_type model_data_type = GGML_TYPE_F16;
std::shared_ptr<ESRGAN> esrgan_upscaler;
std::string esrgan_path;
int n_threads;
bool direct = false;
int tile_size = 128;
UpscalerGGML(int n_threads,
bool direct = false,
int tile_size = 128)
: n_threads(n_threads),
direct(direct),
tile_size(tile_size) {
}
void UpscalerGGML::set_max_graph_vram_bytes(size_t max_vram_bytes) {
max_graph_vram_bytes = max_vram_bytes;
if (esrgan_upscaler) {
esrgan_upscaler->set_max_graph_vram_bytes(max_vram_bytes);
}
}
bool UpscalerGGML::load_from_file(const std::string& esrgan_path,
bool load_from_file(const std::string& esrgan_path,
bool offload_params_to_cpu,
int n_threads) {
ggml_log_set(ggml_log_callback_default, nullptr);
backend = sd_get_default_backend();
#ifdef SD_USE_CUDA
LOG_DEBUG("Using CUDA backend");
backend = ggml_backend_cuda_init(0);
#endif
#ifdef SD_USE_METAL
LOG_DEBUG("Using Metal backend");
backend = ggml_backend_metal_init();
#endif
#ifdef SD_USE_VULKAN
LOG_DEBUG("Using Vulkan backend");
backend = ggml_backend_vk_init(0);
#endif
#ifdef SD_USE_OPENCL
LOG_DEBUG("Using OpenCL backend");
backend = ggml_backend_opencl_init();
#endif
#ifdef SD_USE_SYCL
LOG_DEBUG("Using SYCL backend");
backend = ggml_backend_sycl_init(0);
#endif
ModelLoader model_loader;
if (!model_loader.init_from_file_and_convert_name(esrgan_path)) {
LOG_ERROR("init model loader from file failed: '%s'", esrgan_path.c_str());
@ -37,7 +56,6 @@ bool UpscalerGGML::load_from_file(const std::string& esrgan_path,
}
LOG_INFO("Upscaler weight type: %s", ggml_type_name(model_data_type));
esrgan_upscaler = std::make_shared<ESRGAN>(backend, offload_params_to_cpu, tile_size, model_loader.get_tensor_storage_map());
esrgan_upscaler->set_max_graph_vram_bytes(max_graph_vram_bytes);
if (direct) {
esrgan_upscaler->set_conv2d_direct_enabled(true);
}
@ -47,7 +65,7 @@ bool UpscalerGGML::load_from_file(const std::string& esrgan_path,
return true;
}
sd::Tensor<float> UpscalerGGML::upscale_tensor(const sd::Tensor<float>& input_tensor) {
sd::Tensor<float> upscale_tensor(const sd::Tensor<float>& input_tensor) {
sd::Tensor<float> upscaled;
if (tile_size <= 0 || (input_tensor.shape()[0] <= tile_size && input_tensor.shape()[1] <= tile_size)) {
upscaled = esrgan_upscaler->compute(n_threads, input_tensor);
@ -80,7 +98,7 @@ sd::Tensor<float> UpscalerGGML::upscale_tensor(const sd::Tensor<float>& input_te
return upscaled;
}
sd_image_t UpscalerGGML::upscale(sd_image_t input_image, uint32_t upscale_factor) {
sd_image_t upscale(sd_image_t input_image, uint32_t upscale_factor) {
// upscale_factor is unused for RealESRGAN_x4plus_anime_6B.pth
sd_image_t upscaled_image = {0, 0, 0, nullptr};
int output_width = (int)input_image.width * esrgan_upscaler->scale;
@ -101,6 +119,7 @@ sd_image_t UpscalerGGML::upscale(sd_image_t input_image, uint32_t upscale_factor
upscaled_image = upscaled_data;
return upscaled_image;
}
};
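
A minimal usage sketch for the struct above, based only on the members shown here (the model path, thread count, and input image are placeholders):

```cpp
// Hypothetical driver code for the ESRGAN upscaler defined above.
UpscalerGGML upscaler(/*n_threads=*/8, /*direct=*/false, /*tile_size=*/128);
if (upscaler.load_from_file("realesrgan-x4plus.safetensors",  // placeholder path
                            /*offload_params_to_cpu=*/false,
                            /*n_threads=*/8)) {
    // input_image: an sd_image_t prepared elsewhere (placeholder).
    sd_image_t out = upscaler.upscale(input_image, /*upscale_factor=*/4);
    // ... consume out.data, then release it ...
}
```
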
struct upscaler_ctx_t {
UpscalerGGML* upscaler = nullptr;


@ -1,33 +0,0 @@
#ifndef __SD_UPSCALER_H__
#define __SD_UPSCALER_H__
#include "esrgan.hpp"
#include "stable-diffusion.h"
#include "tensor.hpp"
#include <memory>
#include <string>
struct UpscalerGGML {
ggml_backend_t backend = nullptr; // general backend
ggml_type model_data_type = GGML_TYPE_F16;
std::shared_ptr<ESRGAN> esrgan_upscaler;
std::string esrgan_path;
int n_threads;
bool direct = false;
int tile_size = 128;
size_t max_graph_vram_bytes = 0;
UpscalerGGML(int n_threads,
bool direct = false,
int tile_size = 128);
bool load_from_file(const std::string& esrgan_path,
bool offload_params_to_cpu,
int n_threads);
void set_max_graph_vram_bytes(size_t max_vram_bytes);
sd::Tensor<float> upscale_tensor(const sd::Tensor<float>& input_tensor);
sd_image_t upscale(sd_image_t input_image, uint32_t upscale_factor);
};
#endif // __SD_UPSCALER_H__


@ -23,9 +23,8 @@
#include <unistd.h>
#endif
#include "ggml-backend.h"
#include "ggml-cpu.h"
#include "ggml.h"
#include "ggml_extend_backend.hpp"
#include "stable-diffusion.h"
bool ends_with(const std::string& str, const std::string& ending) {
@ -120,10 +119,10 @@ std::unique_ptr<MmapWrapper> MmapWrapper::create(const std::string& filename) {
filename.c_str(),
GENERIC_READ,
FILE_SHARE_READ,
nullptr,
NULL,
OPEN_EXISTING,
FILE_ATTRIBUTE_NORMAL,
nullptr);
NULL);
if (file_handle == INVALID_HANDLE_VALUE) {
return nullptr;
@ -137,16 +136,16 @@ std::unique_ptr<MmapWrapper> MmapWrapper::create(const std::string& filename) {
file_size = static_cast<size_t>(size.QuadPart);
HANDLE mapping_handle = CreateFileMapping(file_handle, nullptr, PAGE_READONLY, 0, 0, nullptr);
HANDLE mapping_handle = CreateFileMapping(file_handle, NULL, PAGE_READONLY, 0, 0, NULL);
if (mapping_handle == nullptr) {
if (mapping_handle == NULL) {
CloseHandle(file_handle);
return nullptr;
}
mapped_data = MapViewOfFile(mapping_handle, FILE_MAP_READ, 0, 0, file_size);
if (mapped_data == nullptr) {
if (mapped_data == NULL) {
CloseHandle(mapping_handle);
CloseHandle(file_handle);
return nullptr;
@ -204,7 +203,7 @@ std::unique_ptr<MmapWrapper> MmapWrapper::create(const std::string& filename) {
size_t file_size = sb.st_size;
void* mapped_data = mmap(nullptr, file_size, PROT_READ, mmap_flags, file_descriptor, 0);
void* mapped_data = mmap(NULL, file_size, PROT_READ, mmap_flags, file_descriptor, 0);
close(file_descriptor);
@ -496,6 +495,26 @@ sd_progress_cb_t sd_get_progress_callback() {
void* sd_get_progress_callback_data() {
return sd_progress_cb_data;
}
const char* sd_get_system_info() {
static char buffer[1024];
std::stringstream ss;
ss << "System Info: \n";
ss << " SSE3 = " << ggml_cpu_has_sse3() << " | ";
ss << " AVX = " << ggml_cpu_has_avx() << " | ";
ss << " AVX2 = " << ggml_cpu_has_avx2() << " | ";
ss << " AVX512 = " << ggml_cpu_has_avx512() << " | ";
ss << " AVX512_VBMI = " << ggml_cpu_has_avx512_vbmi() << " | ";
ss << " AVX512_VNNI = " << ggml_cpu_has_avx512_vnni() << " | ";
ss << " FMA = " << ggml_cpu_has_fma() << " | ";
ss << " NEON = " << ggml_cpu_has_neon() << " | ";
ss << " ARM_FMA = " << ggml_cpu_has_arm_fma() << " | ";
ss << " F16C = " << ggml_cpu_has_f16c() << " | ";
ss << " FP16_VA = " << ggml_cpu_has_fp16_va() << " | ";
ss << " WASM_SIMD = " << ggml_cpu_has_wasm_simd() << " | ";
ss << " VSX = " << ggml_cpu_has_vsx() << " | ";
snprintf(buffer, sizeof(buffer), "%s", ss.str().c_str());
return buffer;
}
sd_image_t tensor_to_sd_image(const sd::Tensor<float>& tensor, int frame_index) {
const auto& shape = tensor.shape();
@ -505,7 +524,17 @@ sd_image_t tensor_to_sd_image(const sd::Tensor<float>& tensor, int frame_index)
int channel = static_cast<int>(shape[shape.size() == 5 ? 3 : 2]);
uint8_t* data = (uint8_t*)malloc(static_cast<size_t>(width * height * channel));
GGML_ASSERT(data != nullptr);
preprocessing_tensor_frame_to_sd_image(tensor, frame_index, data);
for (int iw = 0; iw < width; ++iw) {
for (int ih = 0; ih < height; ++ih) {
for (int ic = 0; ic < channel; ++ic) {
float value = shape.size() == 5 ? tensor.index(iw, ih, frame_index, ic, 0)
: tensor.index(iw, ih, ic, frame_index);
value = std::clamp(value, 0.0f, 1.0f);
data[(ih * width + iw) * channel + ic] = static_cast<uint8_t>(std::round(value * 255.0f));
}
}
}
return {
static_cast<uint32_t>(width),
static_cast<uint32_t>(height),
@ -689,100 +718,3 @@ std::vector<std::pair<std::string, float>> parse_prompt_attention(const std::str
return res;
}
// test if the backend is a specific one, e.g. "CUDA", "ROCm", "Vulkan" etc.
bool sd_backend_is(ggml_backend_t backend, const std::string& name) {
if (!backend) {
return false;
}
ggml_backend_dev_t dev = ggml_backend_get_device(backend);
if (!dev)
return false;
std::string dev_name = ggml_backend_dev_name(dev);
return dev_name.find(name) != std::string::npos;
}
ggml_backend_t sd_get_default_backend() {
ggml_backend_load_all_once();
static std::once_flag once;
std::call_once(once, []() {
size_t dev_count = ggml_backend_dev_count();
if (dev_count == 0) {
LOG_ERROR("No devices found!");
} else {
LOG_DEBUG("Found %zu backend devices:", dev_count);
for (size_t i = 0; i < dev_count; ++i) {
auto dev = ggml_backend_dev_get(i);
LOG_DEBUG("#%zu: %s", i, ggml_backend_dev_name(dev));
}
}
});
ggml_backend_t backend = nullptr;
const char* SD_VK_DEVICE = getenv("SD_VK_DEVICE");
if (SD_VK_DEVICE != nullptr) {
std::string sd_vk_device_str = SD_VK_DEVICE;
try {
unsigned long long device = std::stoull(sd_vk_device_str);
std::string vk_device_name = "Vulkan" + std::to_string(device);
if (backend_name_exists(vk_device_name)) {
LOG_INFO("Selecting %s as main device by env var SD_VK_DEVICE", vk_device_name.c_str());
backend = init_named_backend(vk_device_name);
if (!backend) {
LOG_WARN("Device %s requested by SD_VK_DEVICE failed to init. Falling back to the default device.", vk_device_name.c_str());
}
} else {
LOG_WARN("Device %s requested by SD_VK_DEVICE was not found. Falling back to the default device.", vk_device_name.c_str());
}
} catch (const std::invalid_argument&) {
LOG_WARN("SD_VK_DEVICE environment variable is not a valid integer (%s). Falling back to the default device.", SD_VK_DEVICE);
} catch (const std::out_of_range&) {
LOG_WARN("SD_VK_DEVICE environment variable value is out of range for `unsigned long long` type (%s). Falling back to the default device.", SD_VK_DEVICE);
}
}
if (!backend) {
std::string dev_name = get_default_backend_name();
backend = init_named_backend(dev_name);
if (!backend && !dev_name.empty()) {
LOG_WARN("device %s failed to init", dev_name.c_str());
}
}
if (!backend) {
LOG_WARN("loading CPU backend");
backend = ggml_backend_cpu_init();
}
if (ggml_backend_is_cpu(backend)) {
LOG_DEBUG("Using CPU backend");
}
return backend;
}
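
For reference, a hedged example of how these two helpers combine (mirroring the runtime device-name checks used elsewhere in this codebase):

```cpp
// Runtime device-name check: "CUDA", "ROCm", "Vulkan", etc. are matched as substrings.
ggml_backend_t backend = sd_get_default_backend();
if (sd_backend_is(backend, "Vulkan")) {
    // e.g. force f32 precision for ops that are NaN-prone on this backend
}
```
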
// namespace is needed to avoid conflicts with ggml_extend_backend.hpp
namespace ggml_cpu {
#include "ggml-cpu.h"
}
const char* sd_get_system_info() {
using namespace ggml_cpu;
static char buffer[1024];
std::stringstream ss;
ss << "System Info: \n";
ss << " SSE3 = " << ggml_cpu_has_sse3() << " | ";
ss << " AVX = " << ggml_cpu_has_avx() << " | ";
ss << " AVX2 = " << ggml_cpu_has_avx2() << " | ";
ss << " AVX512 = " << ggml_cpu_has_avx512() << " | ";
ss << " AVX512_VBMI = " << ggml_cpu_has_avx512_vbmi() << " | ";
ss << " AVX512_VNNI = " << ggml_cpu_has_avx512_vnni() << " | ";
ss << " FMA = " << ggml_cpu_has_fma() << " | ";
ss << " NEON = " << ggml_cpu_has_neon() << " | ";
ss << " ARM_FMA = " << ggml_cpu_has_arm_fma() << " | ";
ss << " F16C = " << ggml_cpu_has_f16c() << " | ";
ss << " FP16_VA = " << ggml_cpu_has_fp16_va() << " | ";
ss << " WASM_SIMD = " << ggml_cpu_has_wasm_simd() << " | ";
ss << " VSX = " << ggml_cpu_has_vsx() << " | ";
snprintf(buffer, sizeof(buffer), "%s", ss.str().c_str());
return buffer;
}


@ -6,7 +6,6 @@
#include <string>
#include <vector>
#include "ggml-backend.h"
#include "stable-diffusion.h"
#include "tensor.hpp"
@ -83,10 +82,6 @@ int sd_get_preview_interval();
bool sd_should_preview_denoised();
bool sd_should_preview_noisy();
// test if the backend is a specific one, e.g. "CUDA", "ROCm", "Vulkan" etc.
bool sd_backend_is(ggml_backend_t backend, const std::string& name);
ggml_backend_t sd_get_default_backend();
#define LOG_DEBUG(format, ...) log_printf(SD_LOG_DEBUG, __FILE__, __LINE__, format, ##__VA_ARGS__)
#define LOG_INFO(format, ...) log_printf(SD_LOG_INFO, __FILE__, __LINE__, format, ##__VA_ARGS__)
#define LOG_WARN(format, ...) log_printf(SD_LOG_WARN, __FILE__, __LINE__, format, ##__VA_ARGS__)


@ -142,9 +142,8 @@ public:
"vae encode compute failed while processing a tile");
} else {
output = _compute(n_threads, input, false);
}
free_compute_buffer();
}
if (output.empty()) {
LOG_ERROR("vae encode compute failed");


@ -692,7 +692,6 @@ namespace WAN {
} else {
x = conv1->forward(ctx, x);
}
// sd::ggml_graph_cut::mark_graph_cut(x, "wan_vae.encoder.prelude", "x");
// downsamples
std::vector<int64_t> dims = {dim};
@ -718,14 +717,12 @@ namespace WAN {
x = layer->forward(ctx, x, b, feat_cache, feat_idx, chunk_idx);
}
}
// sd::ggml_graph_cut::mark_graph_cut(x, "wan_vae.encoder.down." + std::to_string(i), "x");
}
// middle
x = middle_0->forward(ctx, x, b, feat_cache, feat_idx);
x = middle_1->forward(ctx, x, b);
x = middle_2->forward(ctx, x, b, feat_cache, feat_idx);
// sd::ggml_graph_cut::mark_graph_cut(x, "wan_vae.encoder.mid", "x");
// head
x = head_0->forward(ctx, x);
@ -866,13 +863,11 @@ namespace WAN {
} else {
x = conv1->forward(ctx, x);
}
// sd::ggml_graph_cut::mark_graph_cut(x, "wan_vae.decoder.prelude", "x");
// middle
x = middle_0->forward(ctx, x, b, feat_cache, feat_idx);
x = middle_1->forward(ctx, x, b);
x = middle_2->forward(ctx, x, b, feat_cache, feat_idx);
// sd::ggml_graph_cut::mark_graph_cut(x, "wan_vae.decoder.mid", "x");
// upsamples
std::vector<int64_t> dims = {dim_mult[dim_mult.size() - 1] * dim};
@ -898,7 +893,6 @@ namespace WAN {
x = layer->forward(ctx, x, b, feat_cache, feat_idx, chunk_idx);
}
}
// sd::ggml_graph_cut::mark_graph_cut(x, "wan_vae.decoder.up." + std::to_string(i), "x");
}
// head
@ -1037,7 +1031,6 @@ namespace WAN {
if (wan2_2) {
x = patchify(ctx->ggml_ctx, x, 2, b);
}
// sd::ggml_graph_cut::mark_graph_cut(x, "wan_vae.encode.prelude", "x");
auto encoder = std::dynamic_pointer_cast<Encoder3d>(blocks["encoder"]);
auto conv1 = std::dynamic_pointer_cast<CausalConv3d>(blocks["conv1"]);
@ -1058,7 +1051,6 @@ namespace WAN {
}
out = conv1->forward(ctx, out);
auto mu = ggml_ext_chunk(ctx->ggml_ctx, out, 2, 3)[0];
// sd::ggml_graph_cut::mark_graph_cut(mu, "wan_vae.encode.final", "mu");
clear_cache();
return mu;
}
@ -1076,7 +1068,6 @@ namespace WAN {
int64_t iter_ = z->ne[2];
auto x = conv2->forward(ctx, z);
// sd::ggml_graph_cut::mark_graph_cut(x, "wan_vae.decode.prelude", "x");
ggml_tensor* out;
for (int i = 0; i < iter_; i++) {
_conv_idx = 0;
@ -1092,7 +1083,6 @@ namespace WAN {
if (wan2_2) {
out = unpatchify(ctx->ggml_ctx, out, 2, b);
}
// sd::ggml_graph_cut::mark_graph_cut(out, "wan_vae.decode.final", "out");
clear_cache();
return out;
}
@ -1108,14 +1098,12 @@ namespace WAN {
auto conv2 = std::dynamic_pointer_cast<CausalConv3d>(blocks["conv2"]);
auto x = conv2->forward(ctx, z);
// sd::ggml_graph_cut::mark_graph_cut(x, "wan_vae.decode_partial.prelude", "x");
auto in = ggml_ext_slice(ctx->ggml_ctx, x, 2, i, i + 1); // [b*c, 1, h, w]
_conv_idx = 0;
auto out = decoder->forward(ctx, in, b, _feat_map, _conv_idx, i);
if (wan2_2) {
out = unpatchify(ctx->ggml_ctx, out, 2, b);
}
// sd::ggml_graph_cut::mark_graph_cut(out, "wan_vae.decode_partial.final", "out");
return out;
}
};
@ -1996,13 +1984,6 @@ namespace WAN {
c = ggml_reshape_3d(ctx->ggml_ctx, c, c->ne[0] * c->ne[1] * c->ne[2], c->ne[3] / N, N); // [N, dim, t_len*h_len*w_len]
c = ggml_ext_cont(ctx->ggml_ctx, ggml_ext_torch_permute(ctx->ggml_ctx, c, 1, 0, 2, 3)); // [N, t_len*h_len*w_len, dim]
}
sd::ggml_graph_cut::mark_graph_cut(x, "wan.prelude", "x");
// sd::ggml_graph_cut::mark_graph_cut(e, "wan.prelude", "e");
// sd::ggml_graph_cut::mark_graph_cut(e0, "wan.prelude", "e0");
// sd::ggml_graph_cut::mark_graph_cut(context, "wan.prelude", "context");
if (c != nullptr) {
sd::ggml_graph_cut::mark_graph_cut(c, "wan.prelude", "c");
}
auto x_orig = x;
@ -2023,10 +2004,6 @@ namespace WAN {
c_skip = ggml_ext_scale(ctx->ggml_ctx, c_skip, vace_strength);
x = ggml_add(ctx->ggml_ctx, x, c_skip);
}
sd::ggml_graph_cut::mark_graph_cut(x, "wan.blocks." + std::to_string(i), "x");
if (c != nullptr) {
sd::ggml_graph_cut::mark_graph_cut(c, "wan.blocks." + std::to_string(i), "c");
}
}
x = head->forward(ctx, x, e); // [N, t_len*h_len*w_len, pt*ph*pw*out_dim]


@ -31,6 +31,10 @@ namespace ZImage {
: head_dim(head_dim), num_heads(num_heads), num_kv_heads(num_kv_heads), qk_norm(qk_norm) {
blocks["qkv"] = std::make_shared<Linear>(hidden_size, (num_heads + num_kv_heads * 2) * head_dim, false);
float scale = 1.f;
#if GGML_USE_HIP
// Prevent NaN issues with certain ROCm setups
scale = 1.f / 16.f;
#endif
blocks["out"] = std::make_shared<Linear>(num_heads * head_dim, hidden_size, false, false, false, scale);
if (qk_norm) {
blocks["q_norm"] = std::make_shared<RMSNorm>(head_dim);
@ -48,10 +52,6 @@ namespace ZImage {
auto qkv_proj = std::dynamic_pointer_cast<Linear>(blocks["qkv"]);
auto out_proj = std::dynamic_pointer_cast<Linear>(blocks["out"]);
if (sd_backend_is(ctx->backend, "ROCm")) {
out_proj->set_scale(1.f / 16.f);
}
auto qkv = qkv_proj->forward(ctx, x); // [N, n_token, (num_heads + num_kv_heads*2)*head_dim]
qkv = ggml_reshape_4d(ctx->ggml_ctx, qkv, head_dim, num_heads + num_kv_heads * 2, qkv->ne[1], qkv->ne[2]); // [N, n_token, num_heads + num_kv_heads*2, head_dim]
@ -115,7 +115,9 @@ namespace ZImage {
bool force_prec_f32 = false;
float scale = 1.f / 128.f;
#ifdef SD_USE_VULKAN
force_prec_f32 = true;
#endif
// The purpose of the scale here is to prevent NaN issues in certain situations.
// For example, when using CUDA but the weights are k-quants.
blocks["w2"] = std::make_shared<Linear>(hidden_dim, dim, false, false, force_prec_f32, scale);
@ -127,10 +129,6 @@ namespace ZImage {
auto w2 = std::dynamic_pointer_cast<Linear>(blocks["w2"]);
auto w3 = std::dynamic_pointer_cast<Linear>(blocks["w3"]);
if (sd_backend_is(ctx->backend, "Vulkan")) {
w2->set_force_prec_f32(true);
}
auto x1 = w1->forward(ctx, x);
auto x3 = w3->forward(ctx, x);
x = ggml_swiglu_split(ctx->ggml_ctx, x1, x3);
@ -371,9 +369,6 @@ namespace ZImage {
auto txt = cap_embedder_1->forward(ctx, cap_embedder_0->forward(ctx, context)); // [N, n_txt_token, hidden_size]
auto img = x_embedder->forward(ctx, x); // [N, n_img_token, hidden_size]
sd::ggml_graph_cut::mark_graph_cut(txt, "z_image.prelude", "txt");
sd::ggml_graph_cut::mark_graph_cut(img, "z_image.prelude", "img");
sd::ggml_graph_cut::mark_graph_cut(t_emb, "z_image.prelude", "t_emb");
int64_t n_txt_pad_token = Rope::bound_mod(static_cast<int>(n_txt_token), SEQ_MULTI_OF);
if (n_txt_pad_token > 0) {
@ -396,24 +391,20 @@ namespace ZImage {
auto block = std::dynamic_pointer_cast<JointTransformerBlock>(blocks["context_refiner." + std::to_string(i)]);
txt = block->forward(ctx, txt, txt_pe, nullptr, nullptr);
sd::ggml_graph_cut::mark_graph_cut(txt, "z_image.context_refiner." + std::to_string(i), "txt");
}
for (int i = 0; i < z_image_params.num_refiner_layers; i++) {
auto block = std::dynamic_pointer_cast<JointTransformerBlock>(blocks["noise_refiner." + std::to_string(i)]);
img = block->forward(ctx, img, img_pe, nullptr, t_emb);
sd::ggml_graph_cut::mark_graph_cut(img, "z_image.noise_refiner." + std::to_string(i), "img");
}
auto txt_img = ggml_concat(ctx->ggml_ctx, txt, img, 1); // [N, n_txt_token + n_txt_pad_token + n_img_token + n_img_pad_token, hidden_size]
sd::ggml_graph_cut::mark_graph_cut(txt_img, "z_image.prelude", "txt_img");
for (int i = 0; i < z_image_params.num_layers; i++) {
auto block = std::dynamic_pointer_cast<JointTransformerBlock>(blocks["layers." + std::to_string(i)]);
txt_img = block->forward(ctx, txt_img, pe, nullptr, t_emb);
sd::ggml_graph_cut::mark_graph_cut(txt_img, "z_image.layers." + std::to_string(i), "txt_img");
}
txt_img = final_layer->forward(ctx, txt_img, t_emb); // [N, n_txt_token + n_txt_pad_token + n_img_token + n_img_pad_token, ph*pw*C]