Compare commits

No commits in common. "master" and "master-586-c97702e" have entirely different histories.

45 changed files with 582 additions and 3113 deletions

View File

@@ -72,31 +72,37 @@ option(SD_USE_SYSTEM_GGML "sd: use system-installed GGML library" OFF
 if(SD_CUDA)
     message("-- Use CUDA as backend stable-diffusion")
     set(GGML_CUDA ON)
+    add_definitions(-DSD_USE_CUDA)
 endif()
 if(SD_METAL)
     message("-- Use Metal as backend stable-diffusion")
     set(GGML_METAL ON)
+    add_definitions(-DSD_USE_METAL)
 endif()
 if (SD_VULKAN)
     message("-- Use Vulkan as backend stable-diffusion")
     set(GGML_VULKAN ON)
+    add_definitions(-DSD_USE_VULKAN)
 endif ()
 if (SD_OPENCL)
     message("-- Use OpenCL as backend stable-diffusion")
     set(GGML_OPENCL ON)
+    add_definitions(-DSD_USE_OPENCL)
 endif ()
 if (SD_HIPBLAS)
     message("-- Use HIPBLAS as backend stable-diffusion")
     set(GGML_HIP ON)
+    add_definitions(-DSD_USE_CUDA)
 endif ()
 if(SD_MUSA)
     message("-- Use MUSA as backend stable-diffusion")
     set(GGML_MUSA ON)
+    add_definitions(-DSD_USE_CUDA)
 endif()
 if(SD_WEBP)
@@ -216,6 +222,7 @@ if(SD_SYCL)
     message("-- Use SYCL as backend stable-diffusion")
     set(GGML_SYCL ON)
     set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-narrowing -fsycl")
+    add_definitions(-DSD_USE_SYCL)
     # disable fast-math on host, see:
     # https://www.intel.com/content/www/us/en/docs/cpp-compiler/developer-guide-reference/2021-10/fp-model-fp.html
     if (WIN32)
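
The restored `add_definitions(-DSD_USE_*)` lines make the backend selection visible to the C++ sources at compile time; note that the HIPBLAS and MUSA branches reuse `-DSD_USE_CUDA`. A minimal configure-and-build sketch, assuming a fresh checkout and an installed CUDA toolchain (the build directory and backend choice are illustrative, not part of this diff):

```
# SD_CUDA is the option toggled in the hunk above; other SD_* backends work the same way.
cmake -B build -DSD_CUDA=ON
cmake --build build --config Release
```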

View File

@@ -131,6 +131,8 @@ sd-cli -m model.safetensors -p "a cat" --cache-mode spectrum
 | `warmup` | Steps to always compute before caching starts | 4 |
 | `stop` | Stop caching at this fraction of total steps | 0.9 |
+```
 
 ### Performance Tips
 - Start with default thresholds and adjust based on output quality
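
The `warmup` and `stop` keys in the table map directly onto `--cache-option`. A hypothetical invocation overriding both, in the style of the usage line above (model path and prompt are placeholders):

```
sd-cli -m model.safetensors -p "a cat" --cache-mode spectrum --cache-option "warmup=4,stop=0.9"
```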

View File

@@ -4,29 +4,29 @@
 usage: ./bin/sd-cli [options]
 CLI Options:
- -o, --output <string> path to write result image to. you can use printf-style %d format specifiers for image
-     sequences (default: ./output.png) (eg. output_%03d.png). Single-file video outputs
-     support .avi, .webm, and animated .webp
- --image <string> path to the image to inspect (for metadata mode)
- --metadata-format <string> metadata output format, one of [text, json] (default: text)
- --preview-path <string> path to write preview image to (default: ./preview.png). Multi-frame previews support
-     .avi, .webm, and animated .webp
- --preview-interval <int> interval in denoising steps between consecutive updates of the image preview file
-     (default is 1, meaning updating at every step)
- --output-begin-idx <int> starting index for output image sequence, must be non-negative (default 0 if specified
-     %d in output path, 1 otherwise)
- --canny apply canny preprocessor (edge detection)
- --convert-name convert tensor name (for convert mode)
+ -o, --output <string> path to write result image to. you can use printf-style %d format specifiers for image sequences (default:
+     ./output.png) (eg. output_%03d.png). For video generation, single-file outputs support .avi, .webm, and animated .webp
+ --preview-path <string> path to write preview image to (default: ./preview.png). Multi-frame previews support .avi, .webm, and animated .webp
+ --preview-interval <int> interval in denoising steps between consecutive updates of the image preview file (default is 1, meaning updating at
+     every step)
+ --output-begin-idx <int> starting index for output image sequence, must be non-negative (default 0 if specified %d in output path, 1 otherwise)
+ --image <string> path to the image to inspect (for metadata mode)
+ --metadata-format <string> metadata output format, one of [text, json] (default: text)
+ --canny apply canny preprocessor (edge detection)
+ --convert-name convert tensor name (for convert mode)
+     convert mode writes `.gguf` or `.safetensors` based on the output extension.
+     `.safetensors` export currently supports f16, bf16, f32, and i32 tensor types only.
+     i32 is passthrough only; no f32 <-> i32 conversion is performed
  -v, --verbose print extra info
  --color colors the logging tags according to level
  --taesd-preview-only prevents usage of taesd for decoding the final image. (for use with --preview tae)
  --preview-noisy enables previewing noisy inputs of the models rather than the denoised outputs
  --metadata-raw include raw hex previews for unparsed metadata payloads
  --metadata-brief truncate long metadata text values in text output
  --metadata-all include structural/container entries such as IHDR, IDAT, and non-metadata JPEG segments
  -M, --mode run mode, one of [img_gen, vid_gen, upscale, convert, metadata], default: img_gen
  --preview preview method. must be one of the following [none, proj, tae, vae] (default is none)
  -h, --help show this help message and exit
 
 Context Options:
  -m, --model <string> path to full model
@@ -34,8 +34,7 @@ Context Options:
  --clip_g <string> path to the clip-g text encoder
  --clip_vision <string> path to the clip-vision encoder
  --t5xxl <string> path to the t5xxl text encoder
- --llm <string> path to the llm text encoder. For example: (qwenvl2.5 for qwen-image,
-     mistral-small3.2 for flux2, ...)
+ --llm <string> path to the llm text encoder. For example: (qwenvl2.5 for qwen-image, mistral-small3.2 for flux2, ...)
  --llm_vision <string> path to the llm vit
  --qwen2vl <string> alias of --llm. Deprecated.
  --qwen2vl_vision <string> alias of --llm_vision. Deprecated.
@@ -47,18 +46,16 @@ Context Options:
  --control-net <string> path to control net model
  --embd-dir <string> embeddings directory
  --lora-model-dir <string> lora model directory
- --hires-upscalers-dir <string> highres fix upscaler model directory
  --tensor-type-rules <string> weight type per tensor pattern (example: "^vae\.=f16,model\.=q8_0")
  --photo-maker <string> path to PHOTOMAKER model
  --upscale-model <string> path to esrgan model.
- -t, --threads <int> number of threads to use during computation (default: -1). If threads <= 0,
-     then threads will be set to the number of CPU physical cores
+ -t, --threads <int> number of threads to use during computation (default: -1). If threads <= 0, then threads will be set to the number of
+     CPU physical cores
  --chroma-t5-mask-pad <int> t5 mask pad size of chroma
- --max-vram <float> maximum VRAM budget in GiB for graph-cut segmented execution. 0 disables
-     graph splitting
+ --vae-tile-overlap <float> tile overlap for vae tiling, in fraction of tile size (default: 0.5)
+ --vae-tiling process vae in tiles to reduce memory usage
  --force-sdxl-vae-conv-scale force use of conv scale on sdxl vae
- --offload-to-cpu place the weights in RAM to save VRAM, and automatically load them into VRAM
-     when needed
+ --offload-to-cpu place the weights in RAM to save VRAM, and automatically load them into VRAM when needed
  --mmap whether to memory-map model
  --control-net-cpu keep controlnet in cpu (for low vram)
  --clip-on-cpu keep clip in cpu (for low vram)
@@ -73,19 +70,20 @@ Context Options:
  --chroma-disable-dit-mask disable dit mask for chroma
  --qwen-image-zero-cond-t enable zero_cond_t for qwen image
  --chroma-enable-t5-mask enable t5 mask for chroma
- --type weight type (examples: f32, f16, q4_0, q4_1, q5_0, q5_1, q8_0, q2_K, q3_K,
-     q4_K). If not specified, the default is the type of the weight file
+ --type weight type (examples: f32, f16, q4_0, q4_1, q5_0, q5_1, q8_0, q2_K, q3_K, q4_K). If not specified, the default is the
+     type of the weight file
  --rng RNG, one of [std_default, cuda, cpu], default: cuda(sd-webui), cpu(comfyui)
  --sampler-rng sampler RNG, one of [std_default, cuda, cpu]. If not specified, use --rng
- --prediction prediction type override, one of [eps, v, edm_v, sd3_flow, flux_flow,
-     flux2_flow]
- --lora-apply-mode the way to apply LoRA, one of [auto, immediately, at_runtime], default is
-     auto. In auto mode, if the model weights contain any quantized parameters,
-     the at_runtime mode will be used; otherwise, immediately will be used.The
-     immediately mode may have precision and compatibility issues with quantized
-     parameters, but it usually offers faster inference speed and, in some cases,
-     lower memory usage. The at_runtime mode, on the other hand, is exactly the
-     opposite.
+ --prediction prediction type override, one of [eps, v, edm_v, sd3_flow, flux_flow, flux2_flow]
+ --lora-apply-mode the way to apply LoRA, one of [auto, immediately, at_runtime], default is auto. In auto mode, if the model weights
+     contain any quantized parameters, the at_runtime mode will be used; otherwise,
+     immediately will be used.The immediately mode may have precision and
+     compatibility issues with quantized parameters, but it usually offers faster inference
+     speed and, in some cases, lower memory usage. The at_runtime mode, on the
+     other hand, is exactly the opposite.
+ --vae-tile-size tile size for vae tiling, format [X]x[Y] (default: 32x32)
+ --vae-relative-tile-size relative tile size for vae tiling, format [X]x[Y], in fraction of image size if < 1, in number of tiles per dim if >=1
+     (overrides --vae-tile-size)
 
 Generation Options:
  -p, --prompt <string> the prompt to render
@@ -94,99 +92,69 @@ Generation Options:
  --end-img <string> path to the end image, required by flf2v
  --mask <string> path to the mask image
  --control-image <string> path to control image, control net
- --control-video <string> path to control video frames, It must be a directory path. The video frames
-     inside should be stored as images in lexicographical (character) order. For
-     example, if the control video path is `frames`, the directory contain images
-     such as 00.png, 01.png, ... etc.
+ --control-video <string> path to control video frames, It must be a directory path. The video frames inside should be stored as images in
+     lexicographical (character) order. For example, if the control video path is
+     `frames`, the directory contain images such as 00.png, 01.png, ... etc.
  --pm-id-images-dir <string> path to PHOTOMAKER input id images dir
  --pm-id-embed-path <string> path to PHOTOMAKER v2 id embed
- --hires-upscaler <string> highres fix upscaler, Lanczos, Nearest, Latent, Latent (nearest), Latent
-     (nearest-exact), Latent (antialiased), Latent (bicubic), Latent (bicubic
-     antialiased), or a model name under --hires-upscalers-dir (default: Latent)
  -H, --height <int> image height, in pixel space (default: 512)
  -W, --width <int> image width, in pixel space (default: 512)
  --steps <int> number of sample steps (default: 20)
  --high-noise-steps <int> (high noise) number of sample steps (default: -1 = auto)
- --clip-skip <int> ignore last layers of CLIP network; 1 ignores none, 2 ignores one layer
-     (default: -1). <= 0 represents unspecified, will be 1 for SD1.x, 2 for SD2.x
+ --clip-skip <int> ignore last layers of CLIP network; 1 ignores none, 2 ignores one layer (default: -1). <= 0 represents unspecified,
+     will be 1 for SD1.x, 2 for SD2.x
  -b, --batch-count <int> batch count
  --video-frames <int> video frames (default: 1)
  --fps <int> fps (default: 24)
- --timestep-shift <int> shift timestep for NitroFusion models (default: 0). recommended N for
-     NitroSD-Realism around 250 and 500 for NitroSD-Vibrant
+ --timestep-shift <int> shift timestep for NitroFusion models (default: 0). recommended N for NitroSD-Realism around 250 and 500 for
+     NitroSD-Vibrant
  --upscale-repeats <int> Run the ESRGAN upscaler this many times (default: 1)
  --upscale-tile-size <int> tile size for ESRGAN upscaling (default: 128)
- --hires-width <int> highres fix target width, 0 to use --hires-scale (default: 0)
- --hires-height <int> highres fix target height, 0 to use --hires-scale (default: 0)
- --hires-steps <int> highres fix second pass sample steps, 0 to reuse --steps (default: 0)
- --hires-upscale-tile-size <int> highres fix upscaler tile size, reserved for model-backed upscalers (default:
-     128)
  --cfg-scale <float> unconditional guidance scale: (default: 7.0)
- --img-cfg-scale <float> image guidance scale for inpaint or instruct-pix2pix models: (default: same
-     as --cfg-scale)
+ --img-cfg-scale <float> image guidance scale for inpaint or instruct-pix2pix models: (default: same as --cfg-scale)
  --guidance <float> distilled guidance scale for models with guidance input (default: 3.5)
- --slg-scale <float> skip layer guidance (SLG) scale, only for DiT models: (default: 0). 0 means
-     disabled, a value of 2.5 is nice for sd3.5 medium
+ --slg-scale <float> skip layer guidance (SLG) scale, only for DiT models: (default: 0). 0 means disabled, a value of 2.5 is nice for sd3.5
+     medium
  --skip-layer-start <float> SLG enabling point (default: 0.01)
  --skip-layer-end <float> SLG disabling point (default: 0.2)
- --eta <float> noise multiplier (default: 0 for ddim_trailing, tcd, res_multistep and
-     res_2s; 1 for euler_a, er_sde and dpm++2s_a)
+ --eta <float> noise multiplier (default: 0 for ddim_trailing, tcd, res_multistep and res_2s; 1 for euler_a, er_sde and dpm++2s_a)
  --flow-shift <float> shift value for Flow models like SD3.x or WAN (default: auto)
  --high-noise-cfg-scale <float> (high noise) unconditional guidance scale: (default: 7.0)
- --high-noise-img-cfg-scale <float> (high noise) image guidance scale for inpaint or instruct-pix2pix models
-     (default: same as --cfg-scale)
- --high-noise-guidance <float> (high noise) distilled guidance scale for models with guidance input
-     (default: 3.5)
- --high-noise-slg-scale <float> (high noise) skip layer guidance (SLG) scale, only for DiT models: (default:
-     0)
+ --high-noise-img-cfg-scale <float> (high noise) image guidance scale for inpaint or instruct-pix2pix models (default: same as --cfg-scale)
+ --high-noise-guidance <float> (high noise) distilled guidance scale for models with guidance input (default: 3.5)
+ --high-noise-slg-scale <float> (high noise) skip layer guidance (SLG) scale, only for DiT models: (default: 0)
  --high-noise-skip-layer-start <float> (high noise) SLG enabling point (default: 0.01)
  --high-noise-skip-layer-end <float> (high noise) SLG disabling point (default: 0.2)
- --high-noise-eta <float> (high noise) noise multiplier (default: 0 for ddim_trailing, tcd,
-     res_multistep and res_2s; 1 for euler_a, er_sde and dpm++2s_a)
+ --high-noise-eta <float> (high noise) noise multiplier (default: 0 for ddim_trailing, tcd, res_multistep and res_2s; 1 for euler_a, er_sde and dpm++2s_a)
  --strength <float> strength for noising/unnoising (default: 0.75)
  --pm-style-strength <float>
- --control-strength <float> strength to apply Control Net (default: 0.9). 1.0 corresponds to full
-     destruction of information in init image
- --moe-boundary <float> timestep boundary for Wan2.2 MoE model. (default: 0.875). Only enabled if
-     `--high-noise-steps` is set to -1
+ --control-strength <float> strength to apply Control Net (default: 0.9). 1.0 corresponds to full destruction of information in init image
+ --moe-boundary <float> timestep boundary for Wan2.2 MoE model. (default: 0.875). Only enabled if `--high-noise-steps` is set to -1
  --vace-strength <float> wan vace strength
- --vae-tile-overlap <float> tile overlap for vae tiling, in fraction of tile size (default: 0.5)
- --hires-scale <float> highres fix scale when target size is not set (default: 2.0)
- --hires-denoising-strength <float> highres fix second pass denoising strength (default: 0.7)
- --increase-ref-index automatically increase the indices of references images based on the order
-     they are listed (starting with 1).
+ --increase-ref-index automatically increase the indices of references images based on the order they are listed (starting with 1).
  --disable-auto-resize-ref-image disable auto resize of ref images
  --disable-image-metadata do not embed generation metadata on image files
- --vae-tiling process vae in tiles to reduce memory usage
- --hires enable highres fix
  -s, --seed RNG seed (default: 42, use random seed for < 0)
- --sampling-method sampling method, one of [euler, euler_a, heun, dpm2, dpm++2s_a, dpm++2m,
-     dpm++2mv2, ipndm, ipndm_v, lcm, ddim_trailing, tcd, res_multistep, res_2s,
-     er_sde] (default: euler for Flux/SD3/Wan, euler_a otherwise)
- --high-noise-sampling-method (high noise) sampling method, one of [euler, euler_a, heun, dpm2, dpm++2s_a,
-     dpm++2m, dpm++2mv2, ipndm, ipndm_v, lcm, ddim_trailing, tcd, res_multistep,
-     res_2s, er_sde] default: euler for Flux/SD3/Wan, euler_a otherwise
- --scheduler denoiser sigma scheduler, one of [discrete, karras, exponential, ays, gits,
-     smoothstep, sgm_uniform, simple, kl_optimal, lcm, bong_tangent], default:
-     discrete
- --sigmas custom sigma values for the sampler, comma-separated (e.g.,
-     "14.61,7.8,3.5,0.0").
+ --sampling-method sampling method, one of [euler, euler_a, heun, dpm2, dpm++2s_a, dpm++2m, dpm++2mv2, ipndm, ipndm_v, lcm, ddim_trailing,
+     tcd, res_multistep, res_2s, er_sde] (default: euler for Flux/SD3/Wan, euler_a
+     otherwise)
+ --high-noise-sampling-method (high noise) sampling method, one of [euler, euler_a, heun, dpm2, dpm++2s_a, dpm++2m, dpm++2mv2, ipndm, ipndm_v, lcm,
+     ddim_trailing, tcd, res_multistep, res_2s, er_sde] default: euler for Flux/SD3/Wan,
+     euler_a otherwise
+ --scheduler denoiser sigma scheduler, one of [discrete, karras, exponential, ays, gits, smoothstep, sgm_uniform, simple,
+     kl_optimal, lcm, bong_tangent], default: discrete
+ --sigmas custom sigma values for the sampler, comma-separated (e.g., "14.61,7.8,3.5,0.0").
  --skip-layers layers to skip for SLG steps (default: [7,8,9])
  --high-noise-skip-layers (high noise) layers to skip for SLG steps (default: [7,8,9])
  -r, --ref-image reference image for Flux Kontext models (can be used multiple times)
- --cache-mode caching method: 'easycache' (DiT), 'ucache' (UNET),
-     'dbcache'/'taylorseer'/'cache-dit' (DiT block-level), 'spectrum' (UNET/DiT
-     Chebyshev+Taylor forecasting)
+ --cache-mode caching method: 'easycache' (DiT), 'ucache' (UNET), 'dbcache'/'taylorseer'/'cache-dit' (DiT block-level),
+     'spectrum' (UNET/DiT Chebyshev+Taylor forecasting)
  --cache-option named cache params (key=value format, comma-separated). easycache/ucache:
-     threshold=,start=,end=,decay=,relative=,reset=; dbcache/taylorseer/cache-dit:
-     Fn=,Bn=,threshold=,warmup=; spectrum: w=,m=,lam=,window=,flex=,warmup=,stop=.
-     Examples: "threshold=0.25" or "threshold=1.5,reset=0"
- --scm-mask SCM steps mask for cache-dit: comma-separated 0/1 (e.g.,
-     "1,1,1,0,0,1,0,0,1,0") - 1=compute, 0=can cache
+     threshold=,start=,end=,decay=,relative=,reset=; dbcache/taylorseer/cache-dit: Fn=,Bn=,threshold=,warmup=;
+     spectrum: w=,m=,lam=,window=,flex=,warmup=,stop=. Examples:
+     "threshold=0.25" or "threshold=1.5,reset=0" or "w=0.4,window=2"
+ --scm-mask SCM steps mask for cache-dit: comma-separated 0/1 (e.g., "1,1,1,0,0,1,0,0,1,0") - 1=compute, 0=can cache
  --scm-policy SCM policy: 'dynamic' (default) or 'static'
- --vae-tile-size tile size for vae tiling, format [X]x[Y] (default: 32x32)
- --vae-relative-tile-size relative tile size for vae tiling, format [X]x[Y], in fraction of image size
-     if < 1, in number of tiles per dim if >=1 (overrides --vae-tile-size)
 ```
 Metadata mode inspects PNG/JPEG container metadata without loading any model:
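
As a concrete illustration of metadata mode, a hypothetical run combining the `-M`, `--image`, and `--metadata-format` flags documented above (the file name is a placeholder):

```
sd-cli -M metadata --image ./output.png --metadata-format json
```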

View File

@@ -433,11 +433,10 @@ bool save_results(const SDCliParams& cli_params,
         if (!img.data)
             return false;
-        const int64_t metadata_seed = cli_params.mode == VID_GEN ? gen_params.seed : gen_params.seed + idx;
         std::string params = gen_params.embed_image_metadata
-                                 ? get_image_params(ctx_params, gen_params, metadata_seed, cli_params.mode)
+                                 ? get_image_params(ctx_params, gen_params, gen_params.seed + idx)
                                  : "";
         const bool ok = write_image_to_file(path.string(), img.data, img.width, img.height, img.channel, params, 90);
         LOG_INFO("save result image %d to '%s' (%s)", idx, path.string().c_str(), ok ? "success" : "failure");
         return ok;
     };
@@ -691,10 +690,7 @@ int main(int argc, const char* argv[]) {
         vae_decode_only = false;
     }
-    if (gen_params.hires_enabled &&
-        (gen_params.resolved_hires_upscaler == SD_HIRES_UPSCALER_MODEL ||
-         gen_params.resolved_hires_upscaler == SD_HIRES_UPSCALER_LANCZOS ||
-         gen_params.resolved_hires_upscaler == SD_HIRES_UPSCALER_NEAREST)) {
+    if (gen_params.hires_enabled && !gen_params.hires_upscaler_model_path.empty()) {
         vae_decode_only = false;
     }

View File

@@ -107,60 +107,47 @@ static bool is_absolute_path(const std::string& p) {
 std::string ArgOptions::wrap_text(const std::string& text, size_t width, size_t indent) {
     std::ostringstream oss;
-    size_t pos = 0;
     size_t line_len = 0;
+    size_t pos = 0;
     while (pos < text.size()) {
+        // Preserve manual newlines
         if (text[pos] == '\n') {
             oss << '\n'
                 << std::string(indent, ' ');
-            line_len = 0;
+            line_len = indent;
             ++pos;
             continue;
         }
-        if (std::isspace(static_cast<unsigned char>(text[pos]))) {
-            ++pos;
-            continue;
-        }
-        size_t word_start = pos;
-        while (pos < text.size() &&
-               text[pos] != '\n' &&
-               !std::isspace(static_cast<unsigned char>(text[pos]))) {
-            ++pos;
-        }
-        std::string word = text.substr(word_start, pos - word_start);
-        while (!word.empty()) {
-            size_t separator_len = line_len == 0 ? 0 : 1;
-            if (line_len + separator_len + word.size() <= width) {
-                if (separator_len > 0) {
-                    oss << ' ';
-                    ++line_len;
-                }
-                oss << word;
-                line_len += word.size();
-                word.clear();
-                continue;
-            }
-            if (line_len > 0) {
-                oss << '\n'
-                    << std::string(indent, ' ');
-                line_len = 0;
-                continue;
-            }
-            size_t chunk_len = std::min(width, word.size());
-            oss << word.substr(0, chunk_len);
-            line_len = chunk_len;
-            word.erase(0, chunk_len);
-            if (!word.empty()) {
-                oss << '\n'
-                    << std::string(indent, ' ');
-                line_len = 0;
-            }
-        }
+        // Add the character
+        oss << text[pos];
+        ++line_len;
+        ++pos;
+        // If the current line exceeds width, try to break at the last space
+        if (line_len >= width) {
+            std::string current = oss.str();
+            size_t back = current.size();
+            // Find the last space (for a clean break)
+            while (back > 0 && current[back - 1] != ' ' && current[back - 1] != '\n')
+                --back;
+            // If found a space to break on
+            if (back > 0 && current[back - 1] != '\n') {
+                std::string before = current.substr(0, back - 1);
+                std::string after = current.substr(back);
+                oss.str("");
+                oss.clear();
+                oss << before << "\n"
+                    << std::string(indent, ' ') << after;
+            } else {
+                // If no space found, just break at width
+                oss << "\n"
+                    << std::string(indent, ' ');
+            }
+            line_len = indent;
+        }
     }
@@ -394,12 +381,7 @@ ArgOptions SDContextParams::get_options() {
          &chroma_t5_mask_pad},
     };
-    options.float_options = {
-        {"",
-         "--max-vram",
-         "maximum VRAM budget in GiB for graph-cut segmented execution. 0 disables graph splitting",
-         &max_vram},
-    };
+    options.float_options = {};
 
     options.bool_options = {
         {"",
@@ -675,7 +657,6 @@ std::string SDContextParams::to_string() const {
            << " rng_type: " << sd_rng_type_name(rng_type) << ",\n"
            << " sampler_rng_type: " << sd_rng_type_name(sampler_rng_type) << ",\n"
            << " offload_params_to_cpu: " << (offload_params_to_cpu ? "true" : "false") << ",\n"
-           << " max_vram: " << max_vram << ",\n"
            << " enable_mmap: " << (enable_mmap ? "true" : "false") << ",\n"
            << " control_net_cpu: " << (control_net_cpu ? "true" : "false") << ",\n"
            << " clip_on_cpu: " << (clip_on_cpu ? "true" : "false") << ",\n"
@@ -750,7 +731,6 @@ sd_ctx_params_t SDContextParams::to_sd_ctx_params_t(bool vae_decode_only, bool f
         chroma_use_t5_mask,
         chroma_t5_mask_pad,
         qwen_image_zero_cond_t,
-        max_vram,
     };
     return sd_ctx_params;
 }
@@ -803,9 +783,7 @@ ArgOptions SDGenerationParams::get_options() {
          &pm_id_embed_path},
         {"",
          "--hires-upscaler",
-         "highres fix upscaler, Lanczos, Nearest, Latent, Latent (nearest), Latent (nearest-exact), "
-         "Latent (antialiased), Latent (bicubic), Latent (bicubic antialiased), or a model name "
-         "under --hires-upscalers-dir (default: Latent)",
+         "highres fix upscaler, Latent (nearest) or a model name/path under --hires-upscalers-dir (default: Latent (nearest))",
          &hires_upscaler},
     };
@@ -1940,7 +1918,7 @@ bool SDGenerationParams::resolve(const std::string& lora_model_dir, const std::s
     hires_upscaler_model_path.clear();
     if (hires_enabled) {
         if (hires_upscaler.empty()) {
-            hires_upscaler = "Latent";
+            hires_upscaler = "Latent (nearest)";
         }
         resolved_hires_upscaler = str_to_sd_hires_upscaler(hires_upscaler.c_str());
         if (resolved_hires_upscaler == SD_HIRES_UPSCALER_NONE) {
@@ -2288,192 +2266,7 @@ std::string version_string() {
     return std::string("stable-diffusion.cpp version ") + sd_version() + ", commit " + sd_commit();
 }
-static std::string safe_json_string(const char* value) {
-    return value ? value : "";
-}
-
-static void set_json_basename_if_not_empty(json& target, const char* key, const std::string& path) {
-    if (!path.empty()) {
-        target[key] = sd_basename(path);
-    }
-}
-
-static json build_sampling_metadata_json(const sd_sample_params_t& sample_params,
-                                         const std::vector<int>& skip_layers,
-                                         const std::vector<float>* custom_sigmas = nullptr) {
-    json sampling = {
-        {"steps", sample_params.sample_steps},
-        {"eta", sample_params.eta},
-        {"shifted_timestep", sample_params.shifted_timestep},
-        {"flow_shift", sample_params.flow_shift},
-        {"guidance",
-         {
-             {"txt_cfg", sample_params.guidance.txt_cfg},
-             {"img_cfg", sample_params.guidance.img_cfg},
-             {"distilled_guidance", sample_params.guidance.distilled_guidance},
-             {"slg",
-              {
-                  {"scale", sample_params.guidance.slg.scale},
-                  {"layers", skip_layers},
-                  {"start", sample_params.guidance.slg.layer_start},
-                  {"end", sample_params.guidance.slg.layer_end},
-              }},
-         }},
-    };
-    if (sample_params.sample_method != SAMPLE_METHOD_COUNT) {
-        sampling["method"] = safe_json_string(sd_sample_method_name(sample_params.sample_method));
-    }
-    if (sample_params.scheduler != SCHEDULER_COUNT) {
-        sampling["scheduler"] = safe_json_string(sd_scheduler_name(sample_params.scheduler));
-    }
-    if (custom_sigmas != nullptr) {
-        sampling["custom_sigmas"] = *custom_sigmas;
-    }
-    return sampling;
-}
-
-std::string build_sdcpp_image_metadata_json(const SDContextParams& ctx_params,
-                                            const SDGenerationParams& gen_params,
-                                            int64_t seed,
-                                            SDMode mode) {
-    json root;
-    root["schema"] = "sdcpp.image.params/v1";
-    root["mode"] = mode == VID_GEN ? "vid_gen" : "img_gen";
-    root["generator"] = {
-        {"name", "stable-diffusion.cpp"},
-        {"version", safe_json_string(sd_version())},
-        {"commit", safe_json_string(sd_commit())},
-    };
-    root["seed"] = seed;
-    root["width"] = gen_params.get_resolved_width();
-    root["height"] = gen_params.get_resolved_height();
-    root["prompt"] = {
-        {"positive", gen_params.prompt},
-        {"negative", gen_params.negative_prompt},
-    };
-    root["sampling"] = build_sampling_metadata_json(gen_params.sample_params,
-                                                    gen_params.skip_layers,
-                                                    &gen_params.custom_sigmas);
-    json models;
-    set_json_basename_if_not_empty(models, "model", ctx_params.model_path);
-    set_json_basename_if_not_empty(models, "clip_l", ctx_params.clip_l_path);
-    set_json_basename_if_not_empty(models, "clip_g", ctx_params.clip_g_path);
-    set_json_basename_if_not_empty(models, "clip_vision", ctx_params.clip_vision_path);
-    set_json_basename_if_not_empty(models, "t5xxl", ctx_params.t5xxl_path);
-    set_json_basename_if_not_empty(models, "llm", ctx_params.llm_path);
-    set_json_basename_if_not_empty(models, "llm_vision", ctx_params.llm_vision_path);
-    set_json_basename_if_not_empty(models, "diffusion_model", ctx_params.diffusion_model_path);
-    set_json_basename_if_not_empty(models, "high_noise_diffusion_model", ctx_params.high_noise_diffusion_model_path);
-    set_json_basename_if_not_empty(models, "vae", ctx_params.vae_path);
-    set_json_basename_if_not_empty(models, "taesd", ctx_params.taesd_path);
-    set_json_basename_if_not_empty(models, "control_net", ctx_params.control_net_path);
-    root["models"] = std::move(models);
-    root["clip_skip"] = gen_params.clip_skip;
-    root["strength"] = gen_params.strength;
-    root["control_strength"] = gen_params.control_strength;
-    root["auto_resize_ref_image"] = gen_params.auto_resize_ref_image;
-    root["increase_ref_index"] = gen_params.increase_ref_index;
-    if (mode == VID_GEN) {
-        root["video"] = {
-            {"frame_count", gen_params.video_frames},
-            {"fps", gen_params.fps},
-        };
-        root["moe_boundary"] = gen_params.moe_boundary;
-        root["vace_strength"] = gen_params.vace_strength;
-        root["high_noise_sampling"] = build_sampling_metadata_json(gen_params.high_noise_sample_params,
-                                                                   gen_params.high_noise_skip_layers);
-    }
-    root["rng"] = safe_json_string(sd_rng_type_name(ctx_params.rng_type));
-    if (ctx_params.sampler_rng_type != RNG_TYPE_COUNT) {
-        root["sampler_rng"] = safe_json_string(sd_rng_type_name(ctx_params.sampler_rng_type));
-    }
-    json loras = json::array();
-    for (const auto& entry : gen_params.lora_map) {
-        loras.push_back({
-            {"name", sd_basename(entry.first)},
-            {"multiplier", entry.second},
-            {"is_high_noise", false},
-        });
-    }
-    for (const auto& entry : gen_params.high_noise_lora_map) {
-        loras.push_back({
-            {"name", sd_basename(entry.first)},
-            {"multiplier", entry.second},
-            {"is_high_noise", true},
-        });
-    }
-    if (!loras.empty()) {
-        root["loras"] = std::move(loras);
-    }
-    if (gen_params.hires_enabled) {
-        root["hires"] = {
-            {"enabled", gen_params.hires_enabled},
-            {"upscaler", gen_params.hires_upscaler},
-            {"model", gen_params.hires_upscaler_model_path.empty() ? "" : sd_basename(gen_params.hires_upscaler_model_path)},
-            {"scale", gen_params.hires_scale},
-            {"target_width", gen_params.hires_width},
-            {"target_height", gen_params.hires_height},
-            {"steps", gen_params.hires_steps},
-            {"denoising_strength", gen_params.hires_denoising_strength},
-            {"upscale_tile_size", gen_params.hires_upscale_tile_size},
-        };
-    }
-    if (gen_params.cache_params.mode != SD_CACHE_DISABLED) {
-        root["cache"] = {
-            {"requested_mode", gen_params.cache_mode},
-            {"requested_option", gen_params.cache_option},
-            {"mode", gen_params.cache_params.mode},
-            {"scm_mask", gen_params.scm_mask},
-            {"scm_policy_dynamic", gen_params.scm_policy_dynamic},
-            {"reuse_threshold", gen_params.cache_params.reuse_threshold},
-            {"start_percent", gen_params.cache_params.start_percent},
-            {"end_percent", gen_params.cache_params.end_percent},
-            {"error_decay_rate", gen_params.cache_params.error_decay_rate},
-            {"use_relative_threshold", gen_params.cache_params.use_relative_threshold},
-            {"reset_error_on_compute", gen_params.cache_params.reset_error_on_compute},
-            {"Fn_compute_blocks", gen_params.cache_params.Fn_compute_blocks},
-            {"Bn_compute_blocks", gen_params.cache_params.Bn_compute_blocks},
-            {"residual_diff_threshold", gen_params.cache_params.residual_diff_threshold},
-            {"max_warmup_steps", gen_params.cache_params.max_warmup_steps},
-            {"max_cached_steps", gen_params.cache_params.max_cached_steps},
-            {"max_continuous_cached_steps", gen_params.cache_params.max_continuous_cached_steps},
-            {"taylorseer_n_derivatives", gen_params.cache_params.taylorseer_n_derivatives},
-            {"taylorseer_skip_interval", gen_params.cache_params.taylorseer_skip_interval},
-            {"spectrum_w", gen_params.cache_params.spectrum_w},
-            {"spectrum_m", gen_params.cache_params.spectrum_m},
-            {"spectrum_lam", gen_params.cache_params.spectrum_lam},
-            {"spectrum_window_size", gen_params.cache_params.spectrum_window_size},
-            {"spectrum_flex_window", gen_params.cache_params.spectrum_flex_window},
-            {"spectrum_warmup_steps", gen_params.cache_params.spectrum_warmup_steps},
-            {"spectrum_stop_percent", gen_params.cache_params.spectrum_stop_percent},
-        };
-    }
-    if (gen_params.vae_tiling_params.enabled) {
-        root["vae_tiling"] = {
-            {"enabled", gen_params.vae_tiling_params.enabled},
-            {"tile_size_x", gen_params.vae_tiling_params.tile_size_x},
-            {"tile_size_y", gen_params.vae_tiling_params.tile_size_y},
-            {"target_overlap", gen_params.vae_tiling_params.target_overlap},
-            {"rel_size_x", gen_params.vae_tiling_params.rel_size_x},
-            {"rel_size_y", gen_params.vae_tiling_params.rel_size_y},
-        };
-    }
-    return root.dump();
-}
-
-std::string get_image_params(const SDContextParams& ctx_params,
-                             const SDGenerationParams& gen_params,
-                             int64_t seed,
-                             SDMode mode) {
+std::string get_image_params(const SDContextParams& ctx_params, const SDGenerationParams& gen_params, int64_t seed) {
     std::string parameter_string;
     if (gen_params.prompt_with_lora.size() != 0) {
         parameter_string += gen_params.prompt_with_lora + "\n";
@@ -2486,7 +2279,7 @@ std::string get_image_params(const SDContextParams& ctx_params,
     parameter_string += "Steps: " + std::to_string(gen_params.sample_params.sample_steps) + ", ";
     parameter_string += "CFG scale: " + std::to_string(gen_params.sample_params.guidance.txt_cfg) + ", ";
     if (gen_params.sample_params.guidance.slg.scale != 0 && gen_params.skip_layers.size() != 0) {
-        parameter_string += "SLG scale: " + std::to_string(gen_params.sample_params.guidance.slg.scale) + ", ";
+        parameter_string += "SLG scale: " + std::to_string(gen_params.sample_params.guidance.txt_cfg) + ", ";
         parameter_string += "Skip layers: [";
         for (const auto& layer : gen_params.skip_layers) {
             parameter_string += std::to_string(layer) + ", ";
@@ -2539,6 +2332,5 @@ std::string get_image_params(const SDContextParams& ctx_params,
         parameter_string += "Denoising strength: " + std::to_string(gen_params.hires_denoising_strength) + ", ";
     }
     parameter_string += "Version: stable-diffusion.cpp";
-    parameter_string += ", SDCPP: " + build_sdcpp_image_metadata_json(ctx_params, gen_params, seed, mode);
     return parameter_string;
 }

View File

@@ -109,7 +109,6 @@ struct SDContextParams {
     rng_type_t rng_type = CUDA_RNG;
     rng_type_t sampler_rng_type = RNG_TYPE_COUNT;
     bool offload_params_to_cpu = false;
-    float max_vram = 0.f;
     bool enable_mmap = false;
     bool control_net_cpu = false;
     bool clip_on_cpu = false;
@@ -193,7 +192,7 @@ struct SDGenerationParams {
     int upscale_tile_size = 128;
     bool hires_enabled = false;
-    std::string hires_upscaler = "Latent";
+    std::string hires_upscaler = "Latent (nearest)";
     std::string hires_upscaler_model_path;
     float hires_scale = 2.f;
     int hires_width = 0;
@@ -250,13 +249,6 @@ struct SDGenerationParams {
 };
 
 std::string version_string();
-std::string build_sdcpp_image_metadata_json(const SDContextParams& ctx_params,
-                                            const SDGenerationParams& gen_params,
-                                            int64_t seed,
-                                            SDMode mode = IMG_GEN);
-std::string get_image_params(const SDContextParams& ctx_params,
-                             const SDGenerationParams& gen_params,
-                             int64_t seed,
-                             SDMode mode = IMG_GEN);
+std::string get_image_params(const SDContextParams& ctx_params, const SDGenerationParams& gen_params, int64_t seed);
 
 #endif  // __EXAMPLES_COMMON_COMMON_H__

View File

@ -123,11 +123,11 @@ In this case, the server will load and serve the specified `index.html` file ins
usage: ./bin/sd-server [options] usage: ./bin/sd-server [options]
Svr Options: Svr Options:
-l, --listen-ip <string> server listen ip (default: 127.0.0.1) -l, --listen-ip <string> server listen ip (default: 127.0.0.1)
--serve-html-path <string> path to HTML file to serve at root (optional) --serve-html-path <string> path to HTML file to serve at root (optional)
--listen-port <int> server listen port (default: 1234) --listen-port <int> server listen port (default: 1234)
-v, --verbose print extra info -v, --verbose print extra info
--color colors the logging tags according to level --color colors the logging tags according to level
-h, --help show this help message and exit -h, --help show this help message and exit
Context Options: Context Options:
@ -136,8 +136,7 @@ Context Options:
--clip_g <string> path to the clip-g text encoder --clip_g <string> path to the clip-g text encoder
--clip_vision <string> path to the clip-vision encoder --clip_vision <string> path to the clip-vision encoder
--t5xxl <string> path to the t5xxl text encoder --t5xxl <string> path to the t5xxl text encoder
--llm <string> path to the llm text encoder. For example: (qwenvl2.5 for qwen-image, --llm <string> path to the llm text encoder. For example: (qwenvl2.5 for qwen-image, mistral-small3.2 for flux2, ...)
mistral-small3.2 for flux2, ...)
--llm_vision <string> path to the llm vit --llm_vision <string> path to the llm vit
--qwen2vl <string> alias of --llm. Deprecated. --qwen2vl <string> alias of --llm. Deprecated.
--qwen2vl_vision <string> alias of --llm_vision. Deprecated. --qwen2vl_vision <string> alias of --llm_vision. Deprecated.
@ -149,18 +148,16 @@ Context Options:
--control-net <string> path to control net model --control-net <string> path to control net model
--embd-dir <string> embeddings directory --embd-dir <string> embeddings directory
--lora-model-dir <string> lora model directory --lora-model-dir <string> lora model directory
--hires-upscalers-dir <string> highres fix upscaler model directory
--tensor-type-rules <string> weight type per tensor pattern (example: "^vae\.=f16,model\.=q8_0") --tensor-type-rules <string> weight type per tensor pattern (example: "^vae\.=f16,model\.=q8_0")
--photo-maker <string> path to PHOTOMAKER model --photo-maker <string> path to PHOTOMAKER model
--upscale-model <string> path to esrgan model. --upscale-model <string> path to esrgan model.
-t, --threads <int> number of threads to use during computation (default: -1). If threads <= 0, -t, --threads <int> number of threads to use during computation (default: -1). If threads <= 0, then threads will be set to the number of
then threads will be set to the number of CPU physical cores CPU physical cores
--chroma-t5-mask-pad <int> t5 mask pad size of chroma --chroma-t5-mask-pad <int> t5 mask pad size of chroma
--max-vram <float> maximum VRAM budget in GiB for graph-cut segmented execution. 0 disables --vae-tile-overlap <float> tile overlap for vae tiling, in fraction of tile size (default: 0.5)
graph splitting --vae-tiling process vae in tiles to reduce memory usage
--force-sdxl-vae-conv-scale force use of conv scale on sdxl vae --force-sdxl-vae-conv-scale force use of conv scale on sdxl vae
--offload-to-cpu place the weights in RAM to save VRAM, and automatically load them into VRAM --offload-to-cpu place the weights in RAM to save VRAM, and automatically load them into VRAM when needed
when needed
--mmap whether to memory-map model --mmap whether to memory-map model
--control-net-cpu keep controlnet in cpu (for low vram) --control-net-cpu keep controlnet in cpu (for low vram)
--clip-on-cpu keep clip in cpu (for low vram) --clip-on-cpu keep clip in cpu (for low vram)
@ -175,19 +172,20 @@ Context Options:
--chroma-disable-dit-mask disable dit mask for chroma --chroma-disable-dit-mask disable dit mask for chroma
--qwen-image-zero-cond-t enable zero_cond_t for qwen image --qwen-image-zero-cond-t enable zero_cond_t for qwen image
--chroma-enable-t5-mask enable t5 mask for chroma --chroma-enable-t5-mask enable t5 mask for chroma
--type weight type (examples: f32, f16, q4_0, q4_1, q5_0, q5_1, q8_0, q2_K, q3_K, --type weight type (examples: f32, f16, q4_0, q4_1, q5_0, q5_1, q8_0, q2_K, q3_K, q4_K). If not specified, the default is the
q4_K). If not specified, the default is the type of the weight file type of the weight file
--rng RNG, one of [std_default, cuda, cpu], default: cuda(sd-webui), cpu(comfyui) --rng RNG, one of [std_default, cuda, cpu], default: cuda(sd-webui), cpu(comfyui)
--sampler-rng sampler RNG, one of [std_default, cuda, cpu]. If not specified, use --rng --sampler-rng sampler RNG, one of [std_default, cuda, cpu]. If not specified, use --rng
--prediction prediction type override, one of [eps, v, edm_v, sd3_flow, flux_flow, --prediction prediction type override, one of [eps, v, edm_v, sd3_flow, flux_flow, flux2_flow]
flux2_flow] --lora-apply-mode the way to apply LoRA, one of [auto, immediately, at_runtime], default is auto. In auto mode, if the model weights
--lora-apply-mode the way to apply LoRA, one of [auto, immediately, at_runtime], default is contain any quantized parameters, the at_runtime mode will be used; otherwise,
auto. In auto mode, if the model weights contain any quantized parameters, immediately will be used.The immediately mode may have precision and
the at_runtime mode will be used; otherwise, immediately will be used.The compatibility issues with quantized parameters, but it usually offers faster inference
immediately mode may have precision and compatibility issues with quantized speed and, in some cases, lower memory usage. The at_runtime mode, on the
parameters, but it usually offers faster inference speed and, in some cases, other hand, is exactly the opposite.
lower memory usage. The at_runtime mode, on the other hand, is exactly the --vae-tile-size tile size for vae tiling, format [X]x[Y] (default: 32x32)
opposite. --vae-relative-tile-size relative tile size for vae tiling, format [X]x[Y], in fraction of image size if < 1, in number of tiles per dim if >=1
(overrides --vae-tile-size)
Default Generation Options: Default Generation Options:
-p, --prompt <string> the prompt to render -p, --prompt <string> the prompt to render
@ -196,97 +194,65 @@ Default Generation Options:
--end-img <string> path to the end image, required by flf2v --end-img <string> path to the end image, required by flf2v
--mask <string> path to the mask image --mask <string> path to the mask image
--control-image <string> path to control image, control net --control-image <string> path to control image, control net
--control-video <string> path to control video frames, It must be a directory path. The video frames --control-video <string> path to control video frames, It must be a directory path. The video frames inside should be stored as images in
inside should be stored as images in lexicographical (character) order. For lexicographical (character) order. For example, if the control video path is
example, if the control video path is `frames`, the directory contain images `frames`, the directory contain images such as 00.png, 01.png, ... etc.
such as 00.png, 01.png, ... etc.
--pm-id-images-dir <string> path to PHOTOMAKER input id images dir --pm-id-images-dir <string> path to PHOTOMAKER input id images dir
--pm-id-embed-path <string> path to PHOTOMAKER v2 id embed --pm-id-embed-path <string> path to PHOTOMAKER v2 id embed
--hires-upscaler <string> highres fix upscaler, Lanczos, Nearest, Latent, Latent (nearest), Latent
(nearest-exact), Latent (antialiased), Latent (bicubic), Latent (bicubic
antialiased), or a model name under --hires-upscalers-dir (default: Latent)
-H, --height <int> image height, in pixel space (default: 512) -H, --height <int> image height, in pixel space (default: 512)
-W, --width <int> image width, in pixel space (default: 512) -W, --width <int> image width, in pixel space (default: 512)
--steps <int> number of sample steps (default: 20) --steps <int> number of sample steps (default: 20)
--high-noise-steps <int> (high noise) number of sample steps (default: -1 = auto) --high-noise-steps <int> (high noise) number of sample steps (default: -1 = auto)
--clip-skip <int> ignore last layers of CLIP network; 1 ignores none, 2 ignores one layer --clip-skip <int> ignore last layers of CLIP network; 1 ignores none, 2 ignores one layer (default: -1). <= 0 represents unspecified,
(default: -1). <= 0 represents unspecified, will be 1 for SD1.x, 2 for SD2.x will be 1 for SD1.x, 2 for SD2.x
-b, --batch-count <int> batch count -b, --batch-count <int> batch count
--video-frames <int> video frames (default: 1) --video-frames <int> video frames (default: 1)
--fps <int> fps (default: 24) --fps <int> fps (default: 24)
--timestep-shift <int> shift timestep for NitroFusion models (default: 0). recommended N for --timestep-shift <int> shift timestep for NitroFusion models (default: 0). recommended N for NitroSD-Realism around 250 and 500 for
NitroSD-Realism around 250 and 500 for NitroSD-Vibrant NitroSD-Vibrant
--upscale-repeats <int> Run the ESRGAN upscaler this many times (default: 1) --upscale-repeats <int> Run the ESRGAN upscaler this many times (default: 1)
--upscale-tile-size <int> tile size for ESRGAN upscaling (default: 128) --upscale-tile-size <int> tile size for ESRGAN upscaling (default: 128)
--hires-width <int> highres fix target width, 0 to use --hires-scale (default: 0)
--hires-height <int> highres fix target height, 0 to use --hires-scale (default: 0)
--hires-steps <int> highres fix second pass sample steps, 0 to reuse --steps (default: 0)
--hires-upscale-tile-size <int> highres fix upscaler tile size, reserved for model-backed upscalers (default:
128)
--cfg-scale <float> unconditional guidance scale: (default: 7.0) --cfg-scale <float> unconditional guidance scale: (default: 7.0)
--img-cfg-scale <float> image guidance scale for inpaint or instruct-pix2pix models: (default: same --img-cfg-scale <float> image guidance scale for inpaint or instruct-pix2pix models: (default: same as --cfg-scale)
as --cfg-scale)
--guidance <float> distilled guidance scale for models with guidance input (default: 3.5) --guidance <float> distilled guidance scale for models with guidance input (default: 3.5)
--slg-scale <float> skip layer guidance (SLG) scale, only for DiT models: (default: 0). 0 means --slg-scale <float> skip layer guidance (SLG) scale, only for DiT models: (default: 0). 0 means disabled, a value of 2.5 is nice for sd3.5
disabled, a value of 2.5 is nice for sd3.5 medium medium
--skip-layer-start <float> SLG enabling point (default: 0.01) --skip-layer-start <float> SLG enabling point (default: 0.01)
--skip-layer-end <float> SLG disabling point (default: 0.2) --skip-layer-end <float> SLG disabling point (default: 0.2)
--eta <float> noise multiplier (default: 0 for ddim_trailing, tcd, res_multistep and --eta <float> noise multiplier (default: 0 for ddim_trailing, tcd, res_multistep and res_2s; 1 for euler_a, er_sde and dpm++2s_a)
res_2s; 1 for euler_a, er_sde and dpm++2s_a)
--flow-shift <float> shift value for Flow models like SD3.x or WAN (default: auto) --flow-shift <float> shift value for Flow models like SD3.x or WAN (default: auto)
--high-noise-cfg-scale <float> (high noise) unconditional guidance scale: (default: 7.0) --high-noise-cfg-scale <float> (high noise) unconditional guidance scale: (default: 7.0)
--high-noise-img-cfg-scale <float> (high noise) image guidance scale for inpaint or instruct-pix2pix models --high-noise-img-cfg-scale <float> (high noise) image guidance scale for inpaint or instruct-pix2pix models (default: same as --cfg-scale)
(default: same as --cfg-scale) --high-noise-guidance <float> (high noise) distilled guidance scale for models with guidance input (default: 3.5)
-  --high-noise-guidance <float>            (high noise) distilled guidance scale for models with guidance input (default: 3.5)
   --high-noise-slg-scale <float>           (high noise) skip layer guidance (SLG) scale, only for DiT models: (default: 0)
   --high-noise-skip-layer-start <float>    (high noise) SLG enabling point (default: 0.01)
   --high-noise-skip-layer-end <float>      (high noise) SLG disabling point (default: 0.2)
   --high-noise-eta <float>                 (high noise) noise multiplier (default: 0 for ddim_trailing, tcd, res_multistep and res_2s; 1 for euler_a, er_sde and dpm++2s_a)
   --strength <float>                       strength for noising/unnoising (default: 0.75)
   --pm-style-strength <float>
   --control-strength <float>               strength to apply Control Net (default: 0.9). 1.0 corresponds to full destruction of information in init image
   --moe-boundary <float>                   timestep boundary for Wan2.2 MoE model. (default: 0.875). Only enabled if `--high-noise-steps` is set to -1
   --vace-strength <float>                  wan vace strength
-  --vae-tile-overlap <float>               tile overlap for vae tiling, in fraction of tile size (default: 0.5)
-  --hires-scale <float>                    highres fix scale when target size is not set (default: 2.0)
-  --hires-denoising-strength <float>       highres fix second pass denoising strength (default: 0.7)
   --increase-ref-index                     automatically increase the indices of references images based on the order they are listed (starting with 1).
   --disable-auto-resize-ref-image          disable auto resize of ref images
   --disable-image-metadata                 do not embed generation metadata on image files
-  --vae-tiling                             process vae in tiles to reduce memory usage
-  --hires                                  enable highres fix
   -s, --seed                               RNG seed (default: 42, use random seed for < 0)
   --sampling-method                        sampling method, one of [euler, euler_a, heun, dpm2, dpm++2s_a, dpm++2m, dpm++2mv2, ipndm, ipndm_v, lcm, ddim_trailing, tcd, res_multistep, res_2s, er_sde] (default: euler for Flux/SD3/Wan, euler_a otherwise)
   --high-noise-sampling-method             (high noise) sampling method, one of [euler, euler_a, heun, dpm2, dpm++2s_a, dpm++2m, dpm++2mv2, ipndm, ipndm_v, lcm, ddim_trailing, tcd, res_multistep, res_2s, er_sde] default: euler for Flux/SD3/Wan, euler_a otherwise
   --scheduler                              denoiser sigma scheduler, one of [discrete, karras, exponential, ays, gits, smoothstep, sgm_uniform, simple, kl_optimal, lcm, bong_tangent], default: discrete
   --sigmas                                 custom sigma values for the sampler, comma-separated (e.g., "14.61,7.8,3.5,0.0").
   --skip-layers                            layers to skip for SLG steps (default: [7,8,9])
   --high-noise-skip-layers                 (high noise) layers to skip for SLG steps (default: [7,8,9])
   -r, --ref-image                          reference image for Flux Kontext models (can be used multiple times)
   --cache-mode                             caching method: 'easycache' (DiT), 'ucache' (UNET), 'dbcache'/'taylorseer'/'cache-dit' (DiT block-level), 'spectrum' (UNET/DiT Chebyshev+Taylor forecasting)
-  --cache-option                           named cache params (key=value format, comma-separated). easycache/ucache: threshold=,start=,end=,decay=,relative=,reset=; dbcache/taylorseer/cache-dit: Fn=,Bn=,threshold=,warmup=; spectrum: w=,m=,lam=,window=,flex=,warmup=,stop=. Examples: "threshold=0.25" or "threshold=1.5,reset=0"
+  --cache-option                           named cache params (key=value format, comma-separated). easycache/ucache: threshold=,start=,end=,decay=,relative=,reset=; dbcache/taylorseer/cache-dit: Fn=,Bn=,threshold=,warmup=. Examples: "threshold=0.25" or "threshold=1.5,reset=0"
   --scm-mask                               SCM steps mask for cache-dit: comma-separated 0/1 (e.g., "1,1,1,0,0,1,0,0,1,0") - 1=compute, 0=can cache
   --scm-policy                             SCM policy: 'dynamic' (default) or 'static'
-  --vae-tile-size                          tile size for vae tiling, format [X]x[Y] (default: 32x32)
-  --vae-relative-tile-size                 relative tile size for vae tiling, format [X]x[Y], in fraction of image size if < 1, in number of tiles per dim if >=1 (overrides --vae-tile-size)
``` ```
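
For example, a hypothetical cache-dit run combining these flags (parameter values are illustrative, not tuned recommendations):

```
sd-cli -m model.safetensors -p "a cat" --cache-mode cache-dit \
  --cache-option "Fn=4,Bn=4,threshold=0.2,warmup=4" \
  --scm-mask "1,1,1,0,0,1,0,0,1,0" --scm-policy static
```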

View File

@ -219,7 +219,7 @@ Currently supported request fields:
 | `lora` | `array<object>` | Structured LoRA list |
 | `extra_images` | `array<string>` | Base64 or data URL images |
 | `enable_hr` | `boolean` | Enable highres fix for `txt2img` |
-| `hr_upscaler` | `string` | `Lanczos`, `Nearest`, a latent mode such as `Latent (nearest-exact)`, or an upscaler model name from `/sdapi/v1/upscalers` |
+| `hr_upscaler` | `string` | `Latent (nearest)` or an upscaler model name from `/sdapi/v1/upscalers` |
 | `hr_scale` | `number` | Highres scale when resize target is not set |
 | `hr_resize_x` | `integer` | Highres target width, `0` to use scale |
 | `hr_resize_y` | `integer` | Highres target height, `0` to use scale |

@ -303,8 +303,6 @@ Built-in entries include `None`, `Lanczos`, and `Nearest`. Model-backed entries

 | --- | --- | --- |
 | `[].name` | `string` | WebUI-compatible latent upscale mode name |

-Built-in latent modes include `Latent`, `Latent (nearest)`, `Latent (nearest-exact)`, `Latent (antialiased)`, `Latent (bicubic)`, and `Latent (bicubic antialiased)`.

 `GET /sdapi/v1/samplers`

 | Field | Type | Notes |

@ -464,7 +462,7 @@ Shared nested fields:

 | --- | --- | --- |
 | `upscalers[].name` | `string` | Built-in name or model stem; use this value in `hires.upscaler` |

-Built-in entries include `None`, `Lanczos`, `Nearest`, `Latent`, `Latent (nearest)`, `Latent (nearest-exact)`, `Latent (antialiased)`, `Latent (bicubic)`, and `Latent (bicubic antialiased)`. Model-backed entries are scanned from the top level of `--hires-upscalers-dir`; subdirectories are not scanned.
+Built-in entries include `None` and `Latent (nearest)`. Model-backed entries are scanned from the top level of `--hires-upscalers-dir`; subdirectories are not scanned.

 `limits`

@ -679,7 +677,7 @@ Example:
     "lora": [],
     "hires": {
       "enabled": false,
-      "upscaler": "Latent",
+      "upscaler": "Latent (nearest)",
       "scale": 2.0,
       "target_width": 0,
       "target_height": 0,

@ -806,7 +804,7 @@ Other native fields:
 | `scm_mask` | `string` |
 | `scm_policy_dynamic` | `boolean` |

-For `hires.upscaler`, use `Lanczos`, `Nearest`, `Latent`, `Latent (nearest)`, `Latent (nearest-exact)`, `Latent (antialiased)`, `Latent (bicubic)`, `Latent (bicubic antialiased)`, or an `upscalers[].name` value from `GET /sdcpp/v1/capabilities`. Model-backed upscalers are resolved as `--hires-upscalers-dir / (name + ext)` and must live directly in that directory.
+For `hires.upscaler`, use `Latent (nearest)` for latent upscale or an `upscalers[].name` value from `GET /sdcpp/v1/capabilities`. Model-backed upscalers are resolved as `--hires-upscalers-dir / (name + ext)` and must live directly in that directory.

 HTTP-only output fields:
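
To make the remaining latent mode concrete, a minimal `hires` request fragment (field values are illustrative):

```
{
  "hires": {
    "enabled": true,
    "upscaler": "Latent (nearest)",
    "scale": 2.0,
    "target_width": 0,
    "target_height": 0
  }
}
```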

View File

@ -381,8 +381,6 @@ void register_sdapi_endpoints(httplib::Server& svr, ServerRuntime& rt) {
     json result = json::array();
     result.push_back(make_builtin("None"));
-    result.push_back(make_builtin("Lanczos"));
-    result.push_back(make_builtin("Nearest"));

     {
         std::lock_guard<std::mutex> lock(*runtime->upscaler_mutex);
@ -402,12 +400,7 @@ void register_sdapi_endpoints(httplib::Server& svr, ServerRuntime& rt) {
     svr.Get("/sdapi/v1/latent-upscale-modes", [](const httplib::Request&, httplib::Response& res) {
         json result = json::array({
-            {{"name", "Latent"}},
             {{"name", "Latent (nearest)"}},
-            {{"name", "Latent (nearest-exact)"}},
-            {{"name", "Latent (antialiased)"}},
-            {{"name", "Latent (bicubic)"}},
-            {{"name", "Latent (bicubic antialiased)"}},
         });
         res.set_content(result.dump(), "application/json");
     });
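
After this change, a quick check against a running server (base URL illustrative) returns a single mode:

```
curl http://127.0.0.1:7860/sdapi/v1/latent-upscale-modes
# [{"name":"Latent (nearest)"}]
```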

View File

@ -227,30 +227,9 @@ static json make_capabilities_json(ServerRuntime& runtime) {
     available_upscalers.push_back({
         {"name", "None"},
     });
-    available_upscalers.push_back({
-        {"name", "Lanczos"},
-    });
-    available_upscalers.push_back({
-        {"name", "Nearest"},
-    });
-    available_upscalers.push_back({
-        {"name", "Latent"},
-    });
     available_upscalers.push_back({
         {"name", "Latent (nearest)"},
     });
-    available_upscalers.push_back({
-        {"name", "Latent (nearest-exact)"},
-    });
-    available_upscalers.push_back({
-        {"name", "Latent (antialiased)"},
-    });
-    available_upscalers.push_back({
-        {"name", "Latent (bicubic)"},
-    });
-    available_upscalers.push_back({
-        {"name", "Latent (bicubic antialiased)"},
-    });

     {
         std::lock_guard<std::mutex> lock(*runtime.upscaler_mutex);
         for (const auto& entry : *runtime.upscaler_cache) {

View File

@ -203,7 +203,6 @@ typedef struct {
     bool chroma_use_t5_mask;
     int chroma_t5_mask_pad;
     bool qwen_image_zero_cond_t;
-    float max_vram;
 } sd_ctx_params_t;

 typedef struct {
@ -292,14 +291,7 @@ typedef struct {

 enum sd_hires_upscaler_t {
     SD_HIRES_UPSCALER_NONE,
-    SD_HIRES_UPSCALER_LATENT,
     SD_HIRES_UPSCALER_LATENT_NEAREST,
-    SD_HIRES_UPSCALER_LATENT_NEAREST_EXACT,
-    SD_HIRES_UPSCALER_LATENT_ANTIALIASED,
-    SD_HIRES_UPSCALER_LATENT_BICUBIC,
-    SD_HIRES_UPSCALER_LATENT_BICUBIC_ANTIALIASED,
-    SD_HIRES_UPSCALER_LANCZOS,
-    SD_HIRES_UPSCALER_NEAREST,
     SD_HIRES_UPSCALER_MODEL,
     SD_HIRES_UPSCALER_COUNT,
 };
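
A hypothetical caller-side helper (`parse_hires_upscaler` is not part of the header above) showing how the reduced enum maps from the names used in the server docs:

```
#include <cstring>

// Sketch: resolve a WebUI-style upscaler name to the reduced enum; any
// unrecognized name is treated as a model file stem.
static enum sd_hires_upscaler_t parse_hires_upscaler(const char* name) {
    if (name == nullptr || std::strcmp(name, "None") == 0)
        return SD_HIRES_UPSCALER_NONE;
    if (std::strcmp(name, "Latent (nearest)") == 0)
        return SD_HIRES_UPSCALER_LATENT_NEAREST;
    return SD_HIRES_UPSCALER_MODEL;
}
```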

View File

@ -499,15 +499,9 @@ namespace Anima {
             encoder_hidden_states = adapted_context;
         }

-        sd::ggml_graph_cut::mark_graph_cut(x, "anima.prelude", "x");
-        sd::ggml_graph_cut::mark_graph_cut(embedded_timestep, "anima.prelude", "embedded_timestep");
-        sd::ggml_graph_cut::mark_graph_cut(temb, "anima.prelude", "temb");
-        sd::ggml_graph_cut::mark_graph_cut(encoder_hidden_states, "anima.prelude", "context");
-
         for (int i = 0; i < num_layers; i++) {
             auto block = std::dynamic_pointer_cast<TransformerBlock>(blocks["blocks." + std::to_string(i)]);
             x          = block->forward(ctx, x, encoder_hidden_states, embedded_timestep, temb, image_pe);
-            sd::ggml_graph_cut::mark_graph_cut(x, "anima.blocks." + std::to_string(i), "x");
         }

         x = final_layer->forward(ctx, x, embedded_timestep, temb);  // [N, h*w, ph*pw*C]

View File

@ -328,7 +328,6 @@ public:
         auto conv_out = std::dynamic_pointer_cast<Conv2d>(blocks["conv_out"]);

         auto h = conv_in->forward(ctx, x);  // [N, ch, h, w]
-        // sd::ggml_graph_cut::mark_graph_cut(h, "vae.encoder.prelude", "h");

         // downsampling
         size_t num_resolutions = ch_mult.size();
@ -338,14 +337,12 @@ public:
                 auto down_block = std::dynamic_pointer_cast<ResnetBlock>(blocks[name]);

                 h = down_block->forward(ctx, h);
-                // sd::ggml_graph_cut::mark_graph_cut(h, "vae.encoder.down." + std::to_string(i) + ".block." + std::to_string(j), "h");
             }
             if (i != num_resolutions - 1) {
                 std::string name = "down." + std::to_string(i) + ".downsample";
                 auto down_sample = std::dynamic_pointer_cast<DownSampleBlock>(blocks[name]);

                 h = down_sample->forward(ctx, h);
-                // sd::ggml_graph_cut::mark_graph_cut(h, "vae.encoder.down." + std::to_string(i) + ".downsample", "h");
             }
         }
@ -353,7 +350,6 @@ public:
         h = mid_block_1->forward(ctx, h);
         h = mid_attn_1->forward(ctx, h);
         h = mid_block_2->forward(ctx, h);  // [N, block_in, h, w]
-        // sd::ggml_graph_cut::mark_graph_cut(h, "vae.encoder.mid", "h");

         // end
         h = norm_out->forward(ctx, h);
@ -454,7 +450,6 @@ public:

         // conv_in
         auto h = conv_in->forward(ctx, z);  // [N, block_in, h, w]
-        // sd::ggml_graph_cut::mark_graph_cut(h, "vae.decoder.prelude", "h");

         // middle
         h = mid_block_1->forward(ctx, h);
@ -462,7 +457,6 @@ public:
         h = mid_attn_1->forward(ctx, h);
         h = mid_block_2->forward(ctx, h);  // [N, block_in, h, w]
-        // sd::ggml_graph_cut::mark_graph_cut(h, "vae.decoder.mid", "h");

         // upsampling
         int num_resolutions = static_cast<int>(ch_mult.size());
@ -472,14 +466,12 @@ public:
                 auto up_block = std::dynamic_pointer_cast<ResnetBlock>(blocks[name]);

                 h = up_block->forward(ctx, h);
-                // sd::ggml_graph_cut::mark_graph_cut(h, "vae.decoder.up." + std::to_string(i) + ".block." + std::to_string(j), "h");
             }
             if (i != 0) {
                 std::string name = "up." + std::to_string(i) + ".upsample";
                 auto up_sample = std::dynamic_pointer_cast<UpSampleBlock>(blocks[name]);
                 h = up_sample->forward(ctx, h);
-                // sd::ggml_graph_cut::mark_graph_cut(h, "vae.decoder.up." + std::to_string(i) + ".upsample", "h");
             }
         }
@ -607,7 +599,6 @@ public:
         if (use_quant) {
             auto post_quant_conv = std::dynamic_pointer_cast<Conv2d>(blocks["post_quant_conv"]);
             z = post_quant_conv->forward(ctx, z);  // [N, z_channels, h, w]
-            // sd::ggml_graph_cut::mark_graph_cut(z, "vae.decode.prelude", "z");
         }

         auto decoder = std::dynamic_pointer_cast<Decoder>(blocks["decoder"]);
@ -625,7 +616,6 @@ public:
         if (use_quant) {
             auto quant_conv = std::dynamic_pointer_cast<Conv2d>(blocks["quant_conv"]);
             z = quant_conv->forward(ctx, z);  // [N, 2*embed_dim, h/8, w/8]
-            // sd::ggml_graph_cut::mark_graph_cut(z, "vae.encode.final", "z");
         }
         if (sd_version_uses_flux2_vae(version)) {
             z = ggml_ext_chunk(ctx->ggml_ctx, z, 2, 2)[0];

View File

@ -95,9 +95,8 @@ public:
     ggml_tensor* forward(GGMLRunnerContext* ctx,
                          ggml_tensor* x,
                          ggml_tensor* mask = nullptr,
-                         int clip_skip     = -1,
-                         const std::string& graph_cut_prefix = "") {
+                         int clip_skip     = -1) {
         // x: [N, n_token, d_model]
         int layer_idx = n_layer - 1;
         // LOG_DEBUG("clip_skip %d", clip_skip);
@ -113,9 +112,6 @@ public:
             std::string name = "layers." + std::to_string(i);
             auto layer       = std::dynamic_pointer_cast<CLIPLayer>(blocks[name]);
             x                = layer->forward(ctx, x, mask);  // [N, n_token, d_model]
-            if (!graph_cut_prefix.empty()) {
-                sd::ggml_graph_cut::mark_graph_cut(x, graph_cut_prefix + ".layers." + std::to_string(i), "x");
-            }
             // LOG_DEBUG("layer %d", i);
         }
         return x;
@ -308,8 +304,7 @@ public:
         auto final_layer_norm = std::dynamic_pointer_cast<LayerNorm>(blocks["final_layer_norm"]);

         auto x = embeddings->forward(ctx, input_ids, tkn_embeddings);  // [N, n_token, hidden_size]
-        sd::ggml_graph_cut::mark_graph_cut(x, "clip_text.prelude", "x");
-        x = encoder->forward(ctx, x, mask, return_pooled ? -1 : clip_skip, "clip_text");
+        x = encoder->forward(ctx, x, mask, return_pooled ? -1 : clip_skip);
         if (return_pooled || with_final_ln) {
             x = final_layer_norm->forward(ctx, x);
         }
@ -373,8 +368,7 @@ public:
         auto x = embeddings->forward(ctx, pixel_values);  // [N, num_positions, embed_dim]
         x      = pre_layernorm->forward(ctx, x);
-        sd::ggml_graph_cut::mark_graph_cut(x, "clip_vision.prelude", "x");
-        x = encoder->forward(ctx, x, nullptr, clip_skip, "clip_vision");
+        x = encoder->forward(ctx, x, nullptr, clip_skip);

         auto last_hidden_state = x;

View File

@ -1,9 +1,7 @@
 #ifndef __COMMON_BLOCK_HPP__
 #define __COMMON_BLOCK_HPP__

-#include "ggml-backend.h"
 #include "ggml_extend.hpp"
-#include "util.h"

 class DownSampleBlock : public GGMLBlock {
 protected:
@ -250,6 +248,9 @@ public:
         float scale = 1.f;
         if (precision_fix) {
             scale = 1.f / 128.f;
+#ifdef SD_USE_VULKAN
+            force_prec_f32 = true;
+#endif
         }
         // The purpose of the scale here is to prevent NaN issues in certain situations.
         // For example, when using Vulkan without enabling force_prec_f32,
@ -263,9 +264,6 @@ public:
         auto net_0 = std::dynamic_pointer_cast<UnaryBlock>(blocks["net.0"]);
         auto net_2 = std::dynamic_pointer_cast<Linear>(blocks["net.2"]);

-        if (sd_backend_is(ctx->backend, "Vulkan")) {
-            net_2->set_force_prec_f32(true);
-        }
         x = net_0->forward(ctx, x);  // [ne3, ne2, ne1, inner_dim]
         x = net_2->forward(ctx, x);  // [ne3, ne2, ne1, dim_out]

View File

@ -85,8 +85,7 @@ public:
     virtual void free_params_buffer()                                            = 0;
     virtual void get_param_tensors(std::map<std::string, ggml_tensor*>& tensors) = 0;
     virtual size_t get_params_buffer_size()                                      = 0;
-    virtual void set_max_graph_vram_bytes(size_t max_vram_bytes) {}
     virtual void set_flash_attention_enabled(bool enabled)                       = 0;
     virtual void set_weight_adapter(const std::shared_ptr<WeightAdapter>& adapter) {}
     virtual std::tuple<SDCondition, std::vector<bool>> get_learned_condition_with_trigger(int n_threads,
                                                                                           const ConditionerParams& conditioner_params) {
@ -166,13 +165,6 @@ struct FrozenCLIPEmbedderWithCustomWords : public Conditioner {
         return buffer_size;
     }

-    void set_max_graph_vram_bytes(size_t max_vram_bytes) override {
-        text_model->set_max_graph_vram_bytes(max_vram_bytes);
-        if (sd_version_is_sdxl(version)) {
-            text_model2->set_max_graph_vram_bytes(max_vram_bytes);
-        }
-    }
-
     void set_flash_attention_enabled(bool enabled) override {
         text_model->set_flash_attention_enabled(enabled);
         if (sd_version_is_sdxl(version)) {
@ -789,18 +781,6 @@ struct SD3CLIPEmbedder : public Conditioner {
         return buffer_size;
     }

-    void set_max_graph_vram_bytes(size_t max_vram_bytes) override {
-        if (clip_l) {
-            clip_l->set_max_graph_vram_bytes(max_vram_bytes);
-        }
-        if (clip_g) {
-            clip_g->set_max_graph_vram_bytes(max_vram_bytes);
-        }
-        if (t5) {
-            t5->set_max_graph_vram_bytes(max_vram_bytes);
-        }
-    }
-
     void set_flash_attention_enabled(bool enabled) override {
         if (clip_l) {
             clip_l->set_flash_attention_enabled(enabled);
@ -1144,15 +1124,6 @@ struct FluxCLIPEmbedder : public Conditioner {
         return buffer_size;
     }

-    void set_max_graph_vram_bytes(size_t max_vram_bytes) override {
-        if (clip_l) {
-            clip_l->set_max_graph_vram_bytes(max_vram_bytes);
-        }
-        if (t5) {
-            t5->set_max_graph_vram_bytes(max_vram_bytes);
-        }
-    }
-
     void set_flash_attention_enabled(bool enabled) override {
         if (clip_l) {
             clip_l->set_flash_attention_enabled(enabled);
@ -1378,12 +1349,6 @@ struct T5CLIPEmbedder : public Conditioner {
         return buffer_size;
     }

-    void set_max_graph_vram_bytes(size_t max_vram_bytes) override {
-        if (t5) {
-            t5->set_max_graph_vram_bytes(max_vram_bytes);
-        }
-    }
-
     void set_flash_attention_enabled(bool enabled) override {
         if (t5) {
             t5->set_flash_attention_enabled(enabled);
@ -1560,10 +1525,6 @@ struct AnimaConditioner : public Conditioner {
         return llm->get_params_buffer_size();
     }

-    void set_max_graph_vram_bytes(size_t max_vram_bytes) override {
-        llm->set_max_graph_vram_bytes(max_vram_bytes);
-    }
-
     void set_flash_attention_enabled(bool enabled) override {
         llm->set_flash_attention_enabled(enabled);
     }
@ -1696,10 +1657,6 @@ struct LLMEmbedder : public Conditioner {
         return buffer_size;
     }

-    void set_max_graph_vram_bytes(size_t max_vram_bytes) override {
-        llm->set_max_graph_vram_bytes(max_vram_bytes);
-    }
-
     void set_flash_attention_enabled(bool enabled) override {
         llm->set_flash_attention_enabled(enabled);
     }

View File

@ -808,18 +808,6 @@ static std::tuple<float, float, float> get_ancestral_step_flow(float sigma_from,
     return {sigma_down, sigma_up, alpha_scale};
 }

-static std::tuple<float, float, float> get_ancestral_step(float sigma_from,
-                                                          float sigma_to,
-                                                          float eta,
-                                                          bool is_flow_denoiser) {
-    if (is_flow_denoiser) {
-        return get_ancestral_step_flow(sigma_from, sigma_to, eta);
-    } else {
-        auto [sigma_down, sigma_up] = get_ancestral_step(sigma_from, sigma_to, eta);
-        return {sigma_down, sigma_up, 1.0f};
-    }
-}
-
 static sd::Tensor<float> sample_euler_ancestral(denoise_cb_t model,
                                                 sd::Tensor<float> x,
                                                 const std::vector<float>& sigmas,
@ -1259,7 +1247,6 @@ static sd::Tensor<float> sample_res_multistep(denoise_cb_t model,
                                               sd::Tensor<float> x,
                                               const std::vector<float>& sigmas,
                                               std::shared_ptr<RNG> rng,
-                                              bool is_flow_denoiser,
                                               float eta) {
     sd::Tensor<float> old_denoised = x;
     bool have_old_sigma            = false;
@ -1291,8 +1278,7 @@ static sd::Tensor<float> sample_res_multistep(denoise_cb_t model,
         float sigma_from = sigmas[i];
         float sigma_to   = sigmas[i + 1];

-        auto [sigma_down, sigma_up, alpha_scale] = get_ancestral_step(sigma_from, sigma_to, eta, is_flow_denoiser);
+        auto [sigma_down, sigma_up] = get_ancestral_step(sigma_from, sigma_to, eta);

         if (sigma_down == 0.0f || !have_old_sigma) {
             x += ((x - denoised) / sigma_from) * (sigma_down - sigma_from);
@ -1319,10 +1305,7 @@ static sd::Tensor<float> sample_res_multistep(denoise_cb_t model,
             x = sigma_fn(h) * x + h * (b1 * denoised + b2 * old_denoised);
         }

-        if (sigma_to > 0.0f && sigma_up > 0.0f) {
-            if (is_flow_denoiser) {
-                x *= alpha_scale;
-            }
+        if (sigmas[i + 1] > 0 && sigma_up > 0.0f) {
             x += sd::Tensor<float>::randn_like(x, rng) * sigma_up;
         }
@ -1337,7 +1320,6 @@ static sd::Tensor<float> sample_res_2s(denoise_cb_t model,
                                        sd::Tensor<float> x,
                                        const std::vector<float>& sigmas,
                                        std::shared_ptr<RNG> rng,
-                                       bool is_flow_denoiser,
                                        float eta) {
     const float c2 = 0.5f;
     auto t_fn      = [](float sigma) -> float { return -logf(sigma); };
@ -1366,7 +1348,7 @@ static sd::Tensor<float> sample_res_2s(denoise_cb_t model,
         }
         sd::Tensor<float> denoised = std::move(denoised_opt);

-        auto [sigma_down, sigma_up, alpha_scale] = get_ancestral_step(sigma_from, sigma_to, eta, is_flow_denoiser);
+        auto [sigma_down, sigma_up] = get_ancestral_step(sigma_from, sigma_to, eta);

         sd::Tensor<float> x0 = x;
         if (sigma_down == 0.0f || sigma_from == 0.0f) {
@ -1395,10 +1377,7 @@ static sd::Tensor<float> sample_res_2s(denoise_cb_t model,
             x = x0 + h * (b1 * eps1 + b2 * eps2);
         }

-        if (sigma_to > 0.0f && sigma_up > 0.0f) {
-            if (is_flow_denoiser) {
-                x *= alpha_scale;
-            }
+        if (sigmas[i + 1] > 0 && sigma_up > 0.0f) {
             x += sd::Tensor<float>::randn_like(x, rng) * sigma_up;
         }
     }
@ -1685,9 +1664,9 @@ static sd::Tensor<float> sample_k_diffusion(sample_method_t method,
         case IPNDM_V_SAMPLE_METHOD:
             return sample_ipndm_v(model, std::move(x), sigmas);
         case RES_MULTISTEP_SAMPLE_METHOD:
-            return sample_res_multistep(model, std::move(x), sigmas, rng, is_flow_denoiser, eta);
+            return sample_res_multistep(model, std::move(x), sigmas, rng, eta);
         case RES_2S_SAMPLE_METHOD:
-            return sample_res_2s(model, std::move(x), sigmas, rng, is_flow_denoiser, eta);
+            return sample_res_2s(model, std::move(x), sigmas, rng, eta);
         case ER_SDE_SAMPLE_METHOD:
             return sample_er_sde(model, std::move(x), sigmas, rng, is_flow_denoiser, eta);
         case DDIM_TRAILING_SAMPLE_METHOD:
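
For reference, the two-value `get_ancestral_step()` kept above follows the standard k-diffusion ancestral split; a minimal sketch under that assumption (not the file's actual body):

```
#include <algorithm>
#include <cmath>
#include <tuple>

// Sketch: split a step sigma_from -> sigma_to into a deterministic part
// (down to sigma_down) plus re-injected noise of magnitude sigma_up.
static std::tuple<float, float> ancestral_step_sketch(float sigma_from, float sigma_to, float eta) {
    if (eta <= 0.0f || sigma_to <= 0.0f) {
        return {sigma_to, 0.0f};
    }
    float up = eta * std::sqrt(sigma_to * sigma_to *
                               (sigma_from * sigma_from - sigma_to * sigma_to) /
                               (sigma_from * sigma_from));
    float sigma_up   = std::min(sigma_to, up);
    float sigma_down = std::sqrt(sigma_to * sigma_to - sigma_up * sigma_up);
    return {sigma_down, sigma_up};
}
```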

View File

@ -49,7 +49,6 @@ struct DiffusionModel {
     virtual void set_weight_adapter(const std::shared_ptr<WeightAdapter>& adapter){};
     virtual int64_t get_adm_in_channels()                            = 0;
     virtual void set_flash_attention_enabled(bool enabled)          = 0;
-    virtual void set_max_graph_vram_bytes(size_t max_vram_bytes)    = 0;
     virtual void set_circular_axes(bool circular_x, bool circular_y) = 0;
 };
@ -99,10 +98,6 @@ struct UNetModel : public DiffusionModel {
         unet.set_flash_attention_enabled(enabled);
     }

-    void set_max_graph_vram_bytes(size_t max_vram_bytes) override {
-        unet.set_max_graph_vram_bytes(max_vram_bytes);
-    }
-
     void set_circular_axes(bool circular_x, bool circular_y) override {
         unet.set_circular_axes(circular_x, circular_y);
     }
@ -169,10 +164,6 @@ struct MMDiTModel : public DiffusionModel {
         mmdit.set_flash_attention_enabled(enabled);
     }

-    void set_max_graph_vram_bytes(size_t max_vram_bytes) override {
-        mmdit.set_max_graph_vram_bytes(max_vram_bytes);
-    }
-
     void set_circular_axes(bool circular_x, bool circular_y) override {
         mmdit.set_circular_axes(circular_x, circular_y);
     }
@ -238,10 +229,6 @@ struct FluxModel : public DiffusionModel {
         flux.set_flash_attention_enabled(enabled);
     }

-    void set_max_graph_vram_bytes(size_t max_vram_bytes) override {
-        flux.set_max_graph_vram_bytes(max_vram_bytes);
-    }
-
     void set_circular_axes(bool circular_x, bool circular_y) override {
         flux.set_circular_axes(circular_x, circular_y);
     }
@ -312,10 +299,6 @@ struct AnimaModel : public DiffusionModel {
         anima.set_flash_attention_enabled(enabled);
     }

-    void set_max_graph_vram_bytes(size_t max_vram_bytes) override {
-        anima.set_max_graph_vram_bytes(max_vram_bytes);
-    }
-
     void set_circular_axes(bool circular_x, bool circular_y) override {
         anima.set_circular_axes(circular_x, circular_y);
     }
@ -381,10 +364,6 @@ struct WanModel : public DiffusionModel {
         wan.set_flash_attention_enabled(enabled);
     }

-    void set_max_graph_vram_bytes(size_t max_vram_bytes) override {
-        wan.set_max_graph_vram_bytes(max_vram_bytes);
-    }
-
     void set_circular_axes(bool circular_x, bool circular_y) override {
         wan.set_circular_axes(circular_x, circular_y);
     }
@ -454,10 +433,6 @@ struct QwenImageModel : public DiffusionModel {
         qwen_image.set_flash_attention_enabled(enabled);
     }

-    void set_max_graph_vram_bytes(size_t max_vram_bytes) override {
-        qwen_image.set_max_graph_vram_bytes(max_vram_bytes);
-    }
-
     void set_circular_axes(bool circular_x, bool circular_y) override {
         qwen_image.set_circular_axes(circular_x, circular_y);
     }
@ -524,10 +499,6 @@ struct ZImageModel : public DiffusionModel {
         z_image.set_flash_attention_enabled(enabled);
     }

-    void set_max_graph_vram_bytes(size_t max_vram_bytes) override {
-        z_image.set_max_graph_vram_bytes(max_vram_bytes);
-    }
-
     void set_circular_axes(bool circular_x, bool circular_y) override {
         z_image.set_circular_axes(circular_x, circular_y);
     }
@ -593,10 +564,6 @@ struct ErnieImageModel : public DiffusionModel {
         ernie_image.set_flash_attention_enabled(enabled);
     }

-    void set_max_graph_vram_bytes(size_t max_vram_bytes) override {
-        ernie_image.set_max_graph_vram_bytes(max_vram_bytes);
-    }
-
     void set_circular_axes(bool circular_x, bool circular_y) override {
         ernie_image.set_circular_axes(circular_x, circular_y);
     }

View File

@ -295,9 +295,7 @@ namespace ErnieImage {
         auto c          = time_embedding->forward(ctx, sample);                  // [N, hidden_size]
         auto mod_params = adaLN_mod->forward(ctx, ggml_silu(ctx->ggml_ctx, c));  // [N, 6 * hidden_size]

-        sd::ggml_graph_cut::mark_graph_cut(hidden_states, "ernie_image.prelude", "hidden_states");
-        // sd::ggml_graph_cut::mark_graph_cut(mod_params, "ernie_image.prelude", "mod_params");
-
         auto chunks = ggml_ext_chunk(ctx->ggml_ctx, mod_params, 6, 0);
         std::vector<ggml_tensor*> temb;
         temb.reserve(6);
         for (auto chunk : chunks) {
@ -307,7 +305,6 @@ namespace ErnieImage {
         for (int i = 0; i < params.num_layers; i++) {
             auto layer    = std::dynamic_pointer_cast<ErnieImageSharedAdaLNBlock>(blocks["layers." + std::to_string(i)]);
             hidden_states = layer->forward(ctx, hidden_states, pe, temb);
-            sd::ggml_graph_cut::mark_graph_cut(hidden_states, "ernie_image.layers." + std::to_string(i), "hidden_states");
         }

         hidden_states = final_norm->forward(ctx, hidden_states, c);

View File

@ -124,33 +124,27 @@ public:
         auto conv_hr   = std::dynamic_pointer_cast<Conv2d>(blocks["conv_hr"]);
         auto conv_last = std::dynamic_pointer_cast<Conv2d>(blocks["conv_last"]);

         auto feat = conv_first->forward(ctx, x);
-        sd::ggml_graph_cut::mark_graph_cut(feat, "esrgan.prelude", "feat");
         auto body_feat = feat;
         for (int i = 0; i < num_block; i++) {
             std::string name = "body." + std::to_string(i);
             auto block       = std::dynamic_pointer_cast<RRDB>(blocks[name]);
             body_feat        = block->forward(ctx, body_feat);
-            sd::ggml_graph_cut::mark_graph_cut(body_feat, "esrgan.body." + std::to_string(i), "feat");
         }
         body_feat = conv_body->forward(ctx, body_feat);
         feat      = ggml_add(ctx->ggml_ctx, feat, body_feat);
-        sd::ggml_graph_cut::mark_graph_cut(feat, "esrgan.body.out", "feat");

         // upsample
         if (scale >= 2) {
             auto conv_up1 = std::dynamic_pointer_cast<Conv2d>(blocks["conv_up1"]);
             feat          = lrelu(ctx, conv_up1->forward(ctx, ggml_upscale(ctx->ggml_ctx, feat, 2, GGML_SCALE_MODE_NEAREST)));
-            sd::ggml_graph_cut::mark_graph_cut(feat, "esrgan.up1", "feat");
             if (scale == 4) {
                 auto conv_up2 = std::dynamic_pointer_cast<Conv2d>(blocks["conv_up2"]);
                 feat          = lrelu(ctx, conv_up2->forward(ctx, ggml_upscale(ctx->ggml_ctx, feat, 2, GGML_SCALE_MODE_NEAREST)));
-                sd::ggml_graph_cut::mark_graph_cut(feat, "esrgan.up2", "feat");
             }
         }

         // for all scales
         auto out = conv_last->forward(ctx, lrelu(ctx, conv_hr->forward(ctx, feat)));
-        sd::ggml_graph_cut::mark_graph_cut(out, "esrgan.final", "out");
         return out;
     }
 };
}; };

View File

@ -928,9 +928,6 @@ namespace Flux {
         }
         txt = txt_in->forward(ctx, txt);

-        sd::ggml_graph_cut::mark_graph_cut(img, "flux.prelude", "img");
-        sd::ggml_graph_cut::mark_graph_cut(txt, "flux.prelude", "txt");
-        sd::ggml_graph_cut::mark_graph_cut(vec, "flux.prelude", "vec");
         for (int i = 0; i < params.depth; i++) {
             if (skip_layers.size() > 0 && std::find(skip_layers.begin(), skip_layers.end(), i) != skip_layers.end()) {
@ -942,8 +939,6 @@ namespace Flux {
             auto img_txt = block->forward(ctx, img, txt, vec, pe, txt_img_mask, ds_img_mods, ds_txt_mods);
             img          = img_txt.first;   // [N, n_img_token, hidden_size]
             txt          = img_txt.second;  // [N, n_txt_token, hidden_size]
-            sd::ggml_graph_cut::mark_graph_cut(img, "flux.double_blocks." + std::to_string(i), "img");
-            sd::ggml_graph_cut::mark_graph_cut(txt, "flux.double_blocks." + std::to_string(i), "txt");
         }

         auto txt_img = ggml_concat(ctx->ggml_ctx, txt, img, 1);  // [N, n_txt_token + n_img_token, hidden_size]
@ -954,7 +949,6 @@ namespace Flux {
             auto block = std::dynamic_pointer_cast<SingleStreamBlock>(blocks["single_blocks." + std::to_string(i)]);

             txt_img = block->forward(ctx, txt_img, vec, pe, txt_img_mask, ss_mods);
-            sd::ggml_graph_cut::mark_graph_cut(txt_img, "flux.single_blocks." + std::to_string(i), "txt_img");
         }

         img = ggml_view_3d(ctx->ggml_ctx,

File diff suppressed because it is too large

View File

@ -1,298 +0,0 @@
#ifndef __GGML_EXTEND_BACKEND_HPP__
#define __GGML_EXTEND_BACKEND_HPP__
#include <cstring>
#include <mutex>
#include "ggml-backend.h"
#include "ggml.h"
#ifndef __STATIC_INLINE__
#define __STATIC_INLINE__ static inline
#endif
inline void ggml_backend_load_all_once() {
// If the registry already has devices and the CPU backend is present,
// assume either static registration or explicit host-side preloading has
// completed and avoid rescanning the default paths.
if (ggml_backend_dev_count() > 0 && ggml_backend_reg_by_name("CPU") != nullptr) {
return;
}
// In dynamic-backend mode the backend modules are discovered at runtime,
// so we must load them before asking for the CPU backend or its proc table.
// If the host preloaded only a subset of backends, allow one default-path
// scan so missing modules can still be discovered.
static std::once_flag once;
std::call_once(once, []() {
if (ggml_backend_dev_count() > 0 && ggml_backend_reg_by_name("CPU") != nullptr) {
return;
}
ggml_backend_load_all();
});
}
// Do not gate this branch on GGML_CPU or GGML_CPU_ALL_VARIANTS:
// those are CMake options used to configure ggml itself, but they are not
// exported as PUBLIC compile definitions to stable-diffusion in backend-DL mode.
// In practice, this target can reliably see GGML_BACKEND_DL, but not whether
// the CPU backend was compiled as a loadable module. We therefore use runtime
// backend discovery instead of compile-time assumptions.
__STATIC_INLINE__ ggml_backend_reg_t ggml_backend_cpu_reg() {
ggml_backend_reg_t reg = ggml_backend_reg_by_name("CPU");
if (reg != nullptr) {
return reg;
}
ggml_backend_load_all_once();
return ggml_backend_reg_by_name("CPU");
}
__STATIC_INLINE__ ggml_backend_reg_t ggml_backend_reg_from_backend(ggml_backend_t backend) {
if (backend != nullptr) {
ggml_backend_dev_t device = ggml_backend_get_device(backend);
if (device != nullptr) {
return ggml_backend_dev_backend_reg(device);
}
}
return ggml_backend_cpu_reg();
}
__STATIC_INLINE__ ggml_backend_t ggml_backend_cpu_init() {
ggml_backend_t backend = ggml_backend_init_by_type(GGML_BACKEND_DEVICE_TYPE_CPU, nullptr);
if (backend != nullptr) {
return backend;
}
ggml_backend_load_all_once();
return ggml_backend_init_by_type(GGML_BACKEND_DEVICE_TYPE_CPU, nullptr);
}
__STATIC_INLINE__ bool ggml_backend_is_cpu(ggml_backend_t backend) {
if (backend == nullptr) {
return false;
}
ggml_backend_dev_t device = ggml_backend_get_device(backend);
if (device != nullptr) {
return ggml_backend_dev_type(device) == GGML_BACKEND_DEVICE_TYPE_CPU;
}
const char* backend_name = ggml_backend_name(backend);
return backend_name != nullptr && std::strcmp(backend_name, "CPU") == 0;
}
__STATIC_INLINE__ void ggml_backend_cpu_set_n_threads(ggml_backend_t backend_cpu, int n_threads) {
ggml_backend_reg_t reg = ggml_backend_reg_from_backend(backend_cpu);
if (reg == nullptr) {
return;
}
auto fn = reinterpret_cast<ggml_backend_set_n_threads_t>(ggml_backend_reg_get_proc_address(reg, "ggml_backend_set_n_threads"));
if (fn != nullptr) {
fn(backend_cpu, n_threads);
}
}
using __ggml_backend_cpu_set_threadpool_t = void (*)(ggml_backend_t backend_cpu, ggml_threadpool_t threadpool);
__STATIC_INLINE__ void ggml_backend_cpu_set_threadpool(ggml_backend_t backend_cpu, ggml_threadpool_t threadpool) {
ggml_backend_reg_t reg = ggml_backend_reg_from_backend(backend_cpu);
if (reg == nullptr) {
return;
}
auto fn = reinterpret_cast<__ggml_backend_cpu_set_threadpool_t>(ggml_backend_reg_get_proc_address(reg, "ggml_backend_cpu_set_threadpool"));
if (fn != nullptr) {
fn(backend_cpu, threadpool);
}
}
__STATIC_INLINE__ void ggml_backend_cpu_set_abort_callback(ggml_backend_t backend_cpu, ggml_abort_callback abort_callback, void* abort_callback_data) {
ggml_backend_reg_t reg = ggml_backend_reg_from_backend(backend_cpu);
if (reg == nullptr) {
return;
}
auto fn = reinterpret_cast<ggml_backend_set_abort_callback_t>(ggml_backend_reg_get_proc_address(reg, "ggml_backend_set_abort_callback"));
if (fn != nullptr) {
fn(backend_cpu, abort_callback, abort_callback_data);
}
}
__STATIC_INLINE__ ggml_backend_buffer_t ggml_backend_tensor_buffer(const struct ggml_tensor* tensor) {
if (tensor == nullptr) {
return nullptr;
}
return tensor->view_src ? tensor->view_src->buffer : tensor->buffer;
}
__STATIC_INLINE__ bool ggml_backend_tensor_is_host_accessible(const struct ggml_tensor* tensor) {
if (tensor == nullptr || tensor->data == nullptr) {
return false;
}
ggml_backend_buffer_t buffer = ggml_backend_tensor_buffer(tensor);
return buffer == nullptr || ggml_backend_buffer_is_host(buffer);
}
__STATIC_INLINE__ size_t ggml_backend_tensor_offset(const struct ggml_tensor* tensor, int64_t i0, int64_t i1, int64_t i2, int64_t i3) {
return (size_t)(i0 * tensor->nb[0] + i1 * tensor->nb[1] + i2 * tensor->nb[2] + i3 * tensor->nb[3]);
}
template <typename T>
__STATIC_INLINE__ void ggml_backend_tensor_write_scalar(const struct ggml_tensor* tensor, int64_t i0, int64_t i1, int64_t i2, int64_t i3, T value) {
const size_t offset = ggml_backend_tensor_offset(tensor, i0, i1, i2, i3);
if (ggml_backend_tensor_is_host_accessible(tensor)) {
auto* dst = reinterpret_cast<T*>(reinterpret_cast<char*>(tensor->data) + offset);
*dst = value;
return;
}
ggml_backend_tensor_set(const_cast<struct ggml_tensor*>(tensor), &value, offset, sizeof(T));
}
__STATIC_INLINE__ void ggml_set_f32_nd(const struct ggml_tensor* tensor, int64_t i0, int64_t i1, int64_t i2, int64_t i3, float value) {
switch (tensor->type) {
case GGML_TYPE_I8:
ggml_backend_tensor_write_scalar(tensor, i0, i1, i2, i3, static_cast<int8_t>(value));
break;
case GGML_TYPE_I16:
ggml_backend_tensor_write_scalar(tensor, i0, i1, i2, i3, static_cast<int16_t>(value));
break;
case GGML_TYPE_I32:
ggml_backend_tensor_write_scalar(tensor, i0, i1, i2, i3, static_cast<int32_t>(value));
break;
case GGML_TYPE_F16:
ggml_backend_tensor_write_scalar(tensor, i0, i1, i2, i3, ggml_fp32_to_fp16(value));
break;
case GGML_TYPE_BF16:
ggml_backend_tensor_write_scalar(tensor, i0, i1, i2, i3, ggml_fp32_to_bf16(value));
break;
case GGML_TYPE_F32:
ggml_backend_tensor_write_scalar(tensor, i0, i1, i2, i3, value);
break;
default:
GGML_ABORT("fatal error");
}
}
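// Illustrative usage: ggml_set_f32_nd(t, 0, 1, 0, 0, 1.0f) converts the value
// to t->type via ggml_backend_tensor_write_scalar(), which writes through
// tensor->data when the buffer is host-accessible and falls back to
// ggml_backend_tensor_set() for device-resident buffers.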
__STATIC_INLINE__ void ggml_set_f32_1d(const struct ggml_tensor* tensor, int i, float value) {
if (!ggml_is_contiguous(tensor)) {
int64_t id[4] = {0, 0, 0, 0};
ggml_unravel_index(tensor, i, &id[0], &id[1], &id[2], &id[3]);
ggml_set_f32_nd(tensor, id[0], id[1], id[2], id[3], value);
return;
}
switch (tensor->type) {
case GGML_TYPE_I8:
ggml_backend_tensor_write_scalar(tensor, i, 0, 0, 0, static_cast<int8_t>(value));
break;
case GGML_TYPE_I16:
ggml_backend_tensor_write_scalar(tensor, i, 0, 0, 0, static_cast<int16_t>(value));
break;
case GGML_TYPE_I32:
ggml_backend_tensor_write_scalar(tensor, i, 0, 0, 0, static_cast<int32_t>(value));
break;
case GGML_TYPE_F16:
ggml_backend_tensor_write_scalar(tensor, i, 0, 0, 0, ggml_fp32_to_fp16(value));
break;
case GGML_TYPE_BF16:
ggml_backend_tensor_write_scalar(tensor, i, 0, 0, 0, ggml_fp32_to_bf16(value));
break;
case GGML_TYPE_F32:
ggml_backend_tensor_write_scalar(tensor, i, 0, 0, 0, value);
break;
default:
GGML_ABORT("fatal error");
}
}
__STATIC_INLINE__ enum ggml_status ggml_graph_compute_with_ctx(struct ggml_context* ctx, struct ggml_cgraph* cgraph, int n_threads) {
(void)ctx;
// The legacy ggml_graph_compute_with_ctx() symbol lives in ggml-cpu, but
// the backend proc table does not expose it in GGML_BACKEND_DL mode.
// Recreate the old behavior by initializing the CPU backend explicitly and
// executing the graph through the generic backend API.
ggml_backend_t backend = ggml_backend_cpu_init();
if (backend == nullptr) {
return GGML_STATUS_ALLOC_FAILED;
}
ggml_backend_cpu_set_n_threads(backend, n_threads);
const enum ggml_status status = ggml_backend_graph_compute(backend, cgraph);
ggml_backend_free(backend);
return status;
}
__STATIC_INLINE__ ggml_tensor* ggml_set_f32(struct ggml_tensor* tensor, float value) {
GGML_ASSERT(tensor != nullptr);
if (ggml_backend_tensor_is_host_accessible(tensor) && ggml_is_contiguous(tensor)) {
const int64_t nelements = ggml_nelements(tensor);
switch (tensor->type) {
case GGML_TYPE_I8: {
auto* data = reinterpret_cast<int8_t*>(tensor->data);
const int8_t v = static_cast<int8_t>(value);
for (int64_t i = 0; i < nelements; ++i) {
data[i] = v;
}
} break;
case GGML_TYPE_I16: {
auto* data = reinterpret_cast<int16_t*>(tensor->data);
const int16_t v = static_cast<int16_t>(value);
for (int64_t i = 0; i < nelements; ++i) {
data[i] = v;
}
} break;
case GGML_TYPE_I32: {
auto* data = reinterpret_cast<int32_t*>(tensor->data);
const int32_t v = static_cast<int32_t>(value);
for (int64_t i = 0; i < nelements; ++i) {
data[i] = v;
}
} break;
case GGML_TYPE_F16: {
auto* data = reinterpret_cast<ggml_fp16_t*>(tensor->data);
const ggml_fp16_t v = ggml_fp32_to_fp16(value);
for (int64_t i = 0; i < nelements; ++i) {
data[i] = v;
}
} break;
case GGML_TYPE_BF16: {
auto* data = reinterpret_cast<ggml_bf16_t*>(tensor->data);
const ggml_bf16_t v = ggml_fp32_to_bf16(value);
for (int64_t i = 0; i < nelements; ++i) {
data[i] = v;
}
} break;
case GGML_TYPE_F32: {
auto* data = reinterpret_cast<float*>(tensor->data);
for (int64_t i = 0; i < nelements; ++i) {
data[i] = value;
}
} break;
default:
GGML_ABORT("fatal error");
}
return tensor;
}
const int64_t nelements = ggml_nelements(tensor);
for (int64_t i = 0; i < nelements; ++i) {
ggml_set_f32_1d(tensor, static_cast<int>(i), value);
}
return tensor;
}
#endif
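
For context, a caller-side sketch of the CPU path these shims recreate (assumes the graph's tensors are already allocated on host buffers; `compute_on_cpu` is illustrative, not part of the file):

```
// Sketch: run a prebuilt graph on the CPU backend via the shims above.
static enum ggml_status compute_on_cpu(struct ggml_cgraph* gf, int n_threads) {
    ggml_backend_t backend = ggml_backend_cpu_init();  // loads backend modules on demand
    if (backend == nullptr) {
        return GGML_STATUS_ALLOC_FAILED;
    }
    ggml_backend_cpu_set_n_threads(backend, n_threads);
    enum ggml_status status = ggml_backend_graph_compute(backend, gf);
    ggml_backend_free(backend);
    return status;
}
```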

View File

@ -1,676 +0,0 @@
#include "ggml_graph_cut.h"
#include <algorithm>
#include <cstring>
#include <map>
#include <set>
#include <sstream>
#include <stack>
#include <unordered_map>
#include "ggml-alloc.h"
#include "ggml-backend.h"
#include "util.h"
#include "../ggml/src/ggml-impl.h"
namespace sd::ggml_graph_cut {
static std::string graph_cut_tensor_display_name(const ggml_tensor* tensor) {
if (tensor == nullptr) {
return "<null>";
}
if (tensor->name[0] != '\0') {
return tensor->name;
}
return sd_format("<tensor@%p>", (const void*)tensor);
}
static int graph_leaf_index(ggml_cgraph* gf, const ggml_tensor* tensor) {
GGML_ASSERT(gf != nullptr);
GGML_ASSERT(tensor != nullptr);
for (int i = 0; i < gf->n_leafs; ++i) {
if (gf->leafs[i] == tensor) {
return i;
}
}
return -1;
}
static bool is_params_tensor(const std::unordered_set<const ggml_tensor*>& params_tensor_set,
const ggml_tensor* tensor) {
if (tensor == nullptr) {
return false;
}
return params_tensor_set.find(tensor) != params_tensor_set.end();
}
static Plan::InputShape input_shape(const ggml_tensor* tensor) {
Plan::InputShape shape;
if (tensor == nullptr) {
return shape;
}
shape.type = tensor->type;
for (int i = 0; i < GGML_MAX_DIMS; ++i) {
shape.ne[static_cast<size_t>(i)] = tensor->ne[i];
}
return shape;
}
static size_t graph_cut_segment_vram_bytes(const Segment& segment) {
return segment.compute_buffer_size +
segment.input_param_bytes +
segment.input_previous_cut_bytes +
segment.output_bytes;
}
static Segment make_segment_seed(const Plan& plan,
size_t start_segment_index,
size_t end_segment_index) {
GGML_ASSERT(start_segment_index < plan.segments.size());
GGML_ASSERT(end_segment_index < plan.segments.size());
GGML_ASSERT(start_segment_index <= end_segment_index);
Segment seed;
const auto& start_segment = plan.segments[start_segment_index];
const auto& target_segment = plan.segments[end_segment_index];
std::unordered_set<int> seen_output_node_indices;
for (size_t seg_idx = start_segment_index; seg_idx <= end_segment_index; ++seg_idx) {
for (int output_node_index : plan.segments[seg_idx].output_node_indices) {
if (seen_output_node_indices.insert(output_node_index).second) {
seed.output_node_indices.push_back(output_node_index);
}
}
}
if (start_segment_index == end_segment_index) {
seed.group_name = target_segment.group_name;
} else {
seed.group_name = sd_format("%s..%s",
start_segment.group_name.c_str(),
target_segment.group_name.c_str());
}
return seed;
}
static void build_segment(ggml_cgraph* gf,
Plan& plan,
Segment& segment,
const std::unordered_map<const ggml_tensor*, int>& producer_index,
std::unordered_set<int>& available_cut_output_node_indices,
ggml_backend_t backend,
const std::unordered_set<const ggml_tensor*>& params_tensor_set,
const char* log_desc) {
std::set<int> internal_nodes;
std::unordered_set<const ggml_tensor*> input_seen;
std::vector<Segment::InputRef> input_refs;
std::stack<ggml_tensor*> work_stack;
for (int output_node_index : segment.output_node_indices) {
ggml_tensor* output = ggml_graph_node(gf, output_node_index);
if (output != nullptr) {
work_stack.push(output);
}
}
while (!work_stack.empty()) {
ggml_tensor* tensor = work_stack.top();
work_stack.pop();
if (tensor == nullptr) {
continue;
}
auto producer_it = producer_index.find(tensor);
if (producer_it == producer_index.end()) {
if (input_seen.insert(tensor).second) {
Segment::InputRef input_ref;
input_ref.type = is_params_tensor(params_tensor_set, tensor) ? Segment::INPUT_PARAM : Segment::INPUT_EXTERNAL;
input_ref.display_name = graph_cut_tensor_display_name(tensor);
input_ref.leaf_index = graph_leaf_index(gf, tensor);
input_refs.push_back(std::move(input_ref));
}
continue;
}
int node_idx = producer_it->second;
if (available_cut_output_node_indices.find(node_idx) != available_cut_output_node_indices.end()) {
if (input_seen.insert(tensor).second) {
Segment::InputRef input_ref;
input_ref.type = Segment::INPUT_PREVIOUS_CUT;
input_ref.display_name = graph_cut_tensor_display_name(tensor);
input_ref.node_index = node_idx;
input_refs.push_back(std::move(input_ref));
}
continue;
}
if (!internal_nodes.insert(node_idx).second) {
continue;
}
ggml_tensor* node = ggml_graph_node(gf, node_idx);
for (int src_idx = 0; src_idx < GGML_MAX_SRC; ++src_idx) {
if (node->src[src_idx] != nullptr) {
work_stack.push(node->src[src_idx]);
}
}
}
if (!internal_nodes.empty()) {
segment.internal_node_indices.assign(internal_nodes.begin(), internal_nodes.end());
}
std::sort(input_refs.begin(),
input_refs.end(),
[](const Segment::InputRef& a, const Segment::InputRef& b) {
if (a.type != b.type) {
return a.type < b.type;
}
return a.display_name < b.display_name;
});
segment.input_refs = input_refs;
for (const auto& input : input_refs) {
ggml_tensor* current_input = input_tensor(gf, input);
size_t tensor_bytes = current_input == nullptr
? 0
: (input.type == Segment::INPUT_PREVIOUS_CUT
? cache_tensor_bytes(current_input)
: ggml_nbytes(current_input));
switch (input.type) {
case Segment::INPUT_PREVIOUS_CUT:
segment.input_previous_cut_bytes += tensor_bytes;
break;
case Segment::INPUT_PARAM:
segment.input_param_bytes += tensor_bytes;
break;
case Segment::INPUT_EXTERNAL:
default:
segment.input_external_bytes += tensor_bytes;
break;
}
}
for (int output_node_index : segment.output_node_indices) {
ggml_tensor* output = ggml_graph_node(gf, output_node_index);
segment.output_bytes += cache_tensor_bytes(output);
}
segment.compute_buffer_size = measure_segment_compute_buffer(backend, gf, segment, log_desc);
for (int output_node_index : segment.output_node_indices) {
available_cut_output_node_indices.insert(output_node_index);
}
plan.segments.push_back(std::move(segment));
}
bool is_graph_cut_tensor(const ggml_tensor* tensor) {
if (tensor == nullptr || tensor->name[0] == '\0') {
return false;
}
return std::strncmp(tensor->name, GGML_RUNNER_CUT_PREFIX, std::strlen(GGML_RUNNER_CUT_PREFIX)) == 0;
}
std::string make_graph_cut_name(const std::string& group, const std::string& output) {
return std::string(GGML_RUNNER_CUT_PREFIX) + group + "|" + output;
}
void mark_graph_cut(ggml_tensor* tensor, const std::string& group, const std::string& output) {
if (tensor == nullptr) {
return;
}
auto name = make_graph_cut_name(group, output);
ggml_set_name(tensor, name.c_str());
}
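// Illustrative: mark_graph_cut(t, "flux.double_blocks.0", "img") renames t to
// GGML_RUNNER_CUT_PREFIX + "flux.double_blocks.0|img"; is_graph_cut_tensor()
// later recognizes such tensors purely by that name prefix.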
int leaf_count(ggml_cgraph* gf) {
GGML_ASSERT(gf != nullptr);
return gf->n_leafs;
}
ggml_tensor* leaf_tensor(ggml_cgraph* gf, int leaf_index) {
GGML_ASSERT(gf != nullptr);
if (leaf_index < 0 || leaf_index >= gf->n_leafs) {
return nullptr;
}
return gf->leafs[leaf_index];
}
ggml_backend_buffer_t tensor_buffer(const ggml_tensor* tensor) {
if (tensor == nullptr) {
return nullptr;
}
return tensor->view_src ? tensor->view_src->buffer : tensor->buffer;
}
ggml_tensor* cache_source_tensor(ggml_tensor* tensor) {
if (tensor == nullptr) {
return nullptr;
}
return tensor->view_src ? tensor->view_src : tensor;
}
size_t cache_tensor_bytes(const ggml_tensor* tensor) {
if (tensor == nullptr) {
return 0;
}
const ggml_tensor* cache_src = tensor->view_src ? tensor->view_src : tensor;
return ggml_nbytes(cache_src);
}
bool plan_matches_graph(ggml_cgraph* gf, const Plan& plan) {
GGML_ASSERT(gf != nullptr);
if (ggml_graph_n_nodes(gf) != plan.n_nodes || gf->n_leafs != plan.n_leafs) {
return false;
}
for (const auto& input_shape_ref : plan.input_shapes) {
if (input_shape_ref.leaf_index < 0 || input_shape_ref.leaf_index >= gf->n_leafs) {
return false;
}
ggml_tensor* leaf = gf->leafs[input_shape_ref.leaf_index];
if (leaf == nullptr || input_shape_ref.type != leaf->type) {
return false;
}
for (int d = 0; d < GGML_MAX_DIMS; ++d) {
if (input_shape_ref.ne[static_cast<size_t>(d)] != leaf->ne[d]) {
return false;
}
}
}
return true;
}
ggml_tensor* output_tensor(ggml_cgraph* gf, const Segment& segment, size_t output_index) {
GGML_ASSERT(gf != nullptr);
if (output_index >= segment.output_node_indices.size()) {
return nullptr;
}
int node_index = segment.output_node_indices[output_index];
if (node_index < 0 || node_index >= ggml_graph_n_nodes(gf)) {
return nullptr;
}
return ggml_graph_node(gf, node_index);
}
ggml_tensor* input_tensor(ggml_cgraph* gf, const Segment::InputRef& input_ref) {
GGML_ASSERT(gf != nullptr);
if (input_ref.type == Segment::INPUT_PREVIOUS_CUT) {
if (input_ref.node_index < 0 || input_ref.node_index >= ggml_graph_n_nodes(gf)) {
return nullptr;
}
return ggml_graph_node(gf, input_ref.node_index);
}
if (input_ref.leaf_index < 0 || input_ref.leaf_index >= gf->n_leafs) {
return nullptr;
}
return leaf_tensor(gf, input_ref.leaf_index);
}
std::vector<ggml_tensor*> param_tensors(ggml_cgraph* gf, const Segment& segment) {
GGML_ASSERT(gf != nullptr);
std::vector<ggml_tensor*> tensors;
std::unordered_set<ggml_tensor*> seen_tensors;
tensors.reserve(segment.input_refs.size());
seen_tensors.reserve(segment.input_refs.size());
for (const auto& input_ref : segment.input_refs) {
if (input_ref.type != Segment::INPUT_PARAM) {
continue;
}
ggml_tensor* tensor = input_tensor(gf, input_ref);
if (tensor == nullptr) {
continue;
}
if (seen_tensors.insert(tensor).second) {
tensors.push_back(tensor);
}
}
return tensors;
}
std::vector<ggml_tensor*> runtime_param_tensors(ggml_cgraph* gf, const Segment& segment, const char* log_desc) {
std::vector<ggml_tensor*> tensors = param_tensors(gf, segment);
std::vector<ggml_tensor*> filtered_tensors;
filtered_tensors.reserve(tensors.size());
for (ggml_tensor* tensor : tensors) {
if (tensor_buffer(tensor) == nullptr) {
LOG_WARN("%s graph cut skipping param input without buffer: segment=%s tensor=%s",
log_desc == nullptr ? "unknown" : log_desc,
segment.group_name.c_str(),
tensor->name);
continue;
}
filtered_tensors.push_back(tensor);
}
return filtered_tensors;
}
std::unordered_set<std::string> collect_future_input_names(ggml_cgraph* gf,
const Plan& plan,
size_t current_segment_index) {
GGML_ASSERT(gf != nullptr);
std::unordered_set<std::string> future_input_names;
for (size_t seg_idx = current_segment_index + 1; seg_idx < plan.segments.size(); ++seg_idx) {
const auto& segment = plan.segments[seg_idx];
for (const auto& input_ref : segment.input_refs) {
if (input_ref.type != Segment::INPUT_PREVIOUS_CUT) {
continue;
}
ggml_tensor* current_input = input_tensor(gf, input_ref);
if (current_input != nullptr && current_input->name[0] != '\0') {
future_input_names.insert(current_input->name);
}
}
}
return future_input_names;
}
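// Collects the names of cut outputs that segments after current_segment_index
// still consume as INPUT_PREVIOUS_CUT, presumably so a caller can tell which
// cached segment outputs must stay resident while earlier segments run.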
ggml_cgraph* build_segment_graph(ggml_cgraph* gf,
const Segment& segment,
ggml_context** graph_ctx_out) {
GGML_ASSERT(gf != nullptr);
GGML_ASSERT(graph_ctx_out != nullptr);
const size_t graph_size = segment.internal_node_indices.size() + segment.input_refs.size() + 8;
ggml_init_params params = {
/*.mem_size =*/ggml_graph_overhead_custom(graph_size, false) + 1024,
/*.mem_buffer =*/nullptr,
/*.no_alloc =*/true,
};
ggml_context* graph_ctx = ggml_init(params);
GGML_ASSERT(graph_ctx != nullptr);
ggml_cgraph* segment_graph = ggml_new_graph_custom(graph_ctx, graph_size, false);
GGML_ASSERT(segment_graph != nullptr);
for (const auto& input : segment.input_refs) {
ggml_tensor* current_input = input_tensor(gf, input);
if (current_input == nullptr) {
continue;
}
GGML_ASSERT(segment_graph->n_leafs < segment_graph->size);
segment_graph->leafs[segment_graph->n_leafs++] = current_input;
}
for (int output_node_index : segment.output_node_indices) {
ggml_tensor* output = ggml_graph_node(gf, output_node_index);
if (output == nullptr) {
continue;
}
ggml_set_output(output);
}
for (int node_idx : segment.internal_node_indices) {
ggml_graph_add_node(segment_graph, ggml_graph_node(gf, node_idx));
}
*graph_ctx_out = graph_ctx;
return segment_graph;
}
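// Note: the segment graph borrows node/leaf tensors from the parent graph gf;
// only the graph structure itself is allocated in *graph_ctx_out, which the
// caller must release with ggml_free() (see measure_segment_compute_buffer).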
size_t measure_segment_compute_buffer(ggml_backend_t backend,
ggml_cgraph* gf,
const Segment& segment,
const char* log_desc) {
GGML_ASSERT(backend != nullptr);
GGML_ASSERT(gf != nullptr);
if (segment.internal_node_indices.empty()) {
return 0;
}
ggml_context* graph_ctx = nullptr;
ggml_cgraph* segment_graph = build_segment_graph(gf, segment, &graph_ctx);
ggml_gallocr_t allocr = ggml_gallocr_new(ggml_backend_get_default_buffer_type(backend));
size_t sizes[1] = {0};
ggml_gallocr_reserve_n_size(
allocr,
segment_graph,
nullptr,
nullptr,
sizes);
size_t buffer_size = sizes[0];
ggml_gallocr_free(allocr);
ggml_free(graph_ctx);
return buffer_size;
}
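// Sizing goes through the allocator's reserve path, so the returned value
// should match the compute buffer ggml would actually allocate for this
// segment; a segment with no internal nodes is reported as needing no buffer.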
Plan build_plan(ggml_backend_t backend,
ggml_cgraph* gf,
const std::unordered_set<const ggml_tensor*>& params_tensor_set,
const char* log_desc) {
GGML_ASSERT(backend != nullptr);
GGML_ASSERT(gf != nullptr);
Plan plan;
plan.available = true;
const int n_nodes = ggml_graph_n_nodes(gf);
if (n_nodes <= 0) {
return plan;
}
plan.n_nodes = n_nodes;
plan.n_leafs = gf->n_leafs;
for (int i = 0; i < gf->n_leafs; ++i) {
ggml_tensor* leaf = gf->leafs[i];
if (is_params_tensor(params_tensor_set, leaf)) {
continue;
}
auto shape = input_shape(leaf);
shape.leaf_index = i;
plan.input_shapes.push_back(shape);
}
std::unordered_map<const ggml_tensor*, int> producer_index;
producer_index.reserve(static_cast<size_t>(n_nodes));
for (int i = 0; i < n_nodes; ++i) {
producer_index[ggml_graph_node(gf, i)] = i;
}
std::vector<Segment> grouped_segments;
std::unordered_map<std::string, size_t> group_to_segment;
for (int i = 0; i < n_nodes; ++i) {
ggml_tensor* node = ggml_graph_node(gf, i);
if (!is_graph_cut_tensor(node)) {
continue;
}
plan.has_cuts = true;
std::string full_name(node->name);
std::string payload = full_name.substr(std::strlen(GGML_RUNNER_CUT_PREFIX));
size_t sep = payload.find('|');
std::string group = sep == std::string::npos ? payload : payload.substr(0, sep);
auto it = group_to_segment.find(group);
if (it == group_to_segment.end()) {
Segment segment;
segment.group_name = group;
segment.output_node_indices.push_back(i);
group_to_segment[group] = grouped_segments.size();
grouped_segments.push_back(std::move(segment));
} else {
auto& segment = grouped_segments[it->second];
segment.output_node_indices.push_back(i);
}
}
if (!plan.has_cuts) {
return plan;
}
std::unordered_set<int> available_cut_output_node_indices;
available_cut_output_node_indices.reserve(static_cast<size_t>(n_nodes));
for (auto& segment : grouped_segments) {
build_segment(gf,
plan,
segment,
producer_index,
available_cut_output_node_indices,
backend,
params_tensor_set,
log_desc);
}
ggml_tensor* final_output = ggml_graph_node(gf, -1);
if (final_output != nullptr && available_cut_output_node_indices.find(n_nodes - 1) == available_cut_output_node_indices.end()) {
Segment final_segment;
final_segment.group_name = "ggml_runner.final";
final_segment.output_node_indices.push_back(n_nodes - 1);
build_segment(gf,
plan,
final_segment,
producer_index,
available_cut_output_node_indices,
backend,
params_tensor_set,
log_desc);
}
return plan;
}
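// Planning summary: every tensor named "ggml_runner_cut:<group>|<output>"
// becomes an output of the Segment for <group>, in graph order; if the last
// node is not itself a cut output, a synthetic "ggml_runner.final" segment is
// appended so the plan always covers the whole graph.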
Plan apply_max_vram_budget(ggml_cgraph* gf,
const Plan& base_plan,
size_t max_graph_vram_bytes,
ggml_backend_t backend,
const std::unordered_set<const ggml_tensor*>& params_tensor_set,
const char* log_desc) {
GGML_ASSERT(backend != nullptr);
GGML_ASSERT(gf != nullptr);
int64_t t_budget_begin = ggml_time_ms();
if (max_graph_vram_bytes == 0 || !base_plan.has_cuts || base_plan.segments.size() <= 1) {
return base_plan;
}
const int n_nodes = ggml_graph_n_nodes(gf);
std::unordered_map<const ggml_tensor*, int> producer_index;
producer_index.reserve(static_cast<size_t>(n_nodes));
for (int i = 0; i < n_nodes; ++i) {
producer_index[ggml_graph_node(gf, i)] = i;
}
Plan merged_plan;
merged_plan.available = true;
merged_plan.has_cuts = base_plan.has_cuts;
merged_plan.valid = base_plan.valid;
merged_plan.n_nodes = base_plan.n_nodes;
merged_plan.n_leafs = base_plan.n_leafs;
std::unordered_set<int> available_cut_output_node_indices;
available_cut_output_node_indices.reserve(static_cast<size_t>(n_nodes));
size_t start_segment_index = 0;
while (start_segment_index < base_plan.segments.size()) {
Plan single_plan;
auto single_available_cut_output_node_indices = available_cut_output_node_indices;
auto single_seed = make_segment_seed(base_plan,
start_segment_index,
start_segment_index);
build_segment(gf,
single_plan,
single_seed,
producer_index,
single_available_cut_output_node_indices,
backend,
params_tensor_set,
log_desc);
GGML_ASSERT(!single_plan.segments.empty());
size_t best_end_segment_index = start_segment_index;
bool can_merge_next_segment = graph_cut_segment_vram_bytes(single_plan.segments.back()) <= max_graph_vram_bytes;
while (can_merge_next_segment && best_end_segment_index + 1 < base_plan.segments.size()) {
const size_t next_end_segment_index = best_end_segment_index + 1;
Plan candidate_plan;
auto candidate_available_cut_output_node_indices = available_cut_output_node_indices;
auto candidate_seed = make_segment_seed(base_plan,
start_segment_index,
next_end_segment_index);
build_segment(gf,
candidate_plan,
candidate_seed,
producer_index,
candidate_available_cut_output_node_indices,
backend,
params_tensor_set,
log_desc);
GGML_ASSERT(!candidate_plan.segments.empty());
const auto& candidate_segment = candidate_plan.segments.back();
if (graph_cut_segment_vram_bytes(candidate_segment) > max_graph_vram_bytes) {
break;
}
best_end_segment_index = next_end_segment_index;
}
auto best_seed = make_segment_seed(base_plan,
start_segment_index,
best_end_segment_index);
build_segment(gf,
merged_plan,
best_seed,
producer_index,
available_cut_output_node_indices,
backend,
params_tensor_set,
log_desc);
start_segment_index = best_end_segment_index + 1;
}
if (log_desc != nullptr && merged_plan.segments.size() != base_plan.segments.size()) {
LOG_INFO("%s graph cut max_vram=%.2f MB merged %zu segments -> %zu segments",
log_desc,
max_graph_vram_bytes / 1024.0 / 1024.0,
base_plan.segments.size(),
merged_plan.segments.size());
}
if (log_desc != nullptr) {
LOG_INFO("%s graph cut max_vram budget merge took %lld ms",
log_desc,
ggml_time_ms() - t_budget_begin);
}
return merged_plan;
}
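// The budget pass is a greedy merge: starting at each base segment, it keeps
// folding in the next segment while the rebuilt candidate still fits within
// max_graph_vram_bytes (per graph_cut_segment_vram_bytes), then commits the
// largest span that fit and continues from the first segment after it.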
Plan resolve_plan(ggml_backend_t backend,
ggml_cgraph* gf,
PlanCache* cache,
size_t max_graph_vram_bytes,
const std::unordered_set<const ggml_tensor*>& params_tensor_set,
const char* log_desc) {
GGML_ASSERT(backend != nullptr);
GGML_ASSERT(gf != nullptr);
GGML_ASSERT(cache != nullptr);
int64_t t_prepare_begin = ggml_time_ms();
Plan base_plan;
int64_t t_plan_begin = ggml_time_ms();
if (cache->graph_cut_plan.available && plan_matches_graph(gf, cache->graph_cut_plan)) {
base_plan = cache->graph_cut_plan;
} else {
base_plan = build_plan(backend, gf, params_tensor_set, log_desc);
cache->graph_cut_plan = base_plan;
cache->graph_cut_plan.available = true;
cache->budgeted_graph_cut_plan.available = false;
if (log_desc != nullptr) {
LOG_INFO("%s build cached graph cut plan done (taking %lld ms)", log_desc, ggml_time_ms() - t_plan_begin);
}
}
Plan resolved_plan = base_plan;
if (max_graph_vram_bytes > 0 && base_plan.has_cuts) {
if (cache->budgeted_graph_cut_plan.available &&
cache->budgeted_graph_cut_plan_max_vram_bytes == max_graph_vram_bytes &&
plan_matches_graph(gf, cache->budgeted_graph_cut_plan)) {
resolved_plan = cache->budgeted_graph_cut_plan;
} else {
resolved_plan = apply_max_vram_budget(gf,
base_plan,
max_graph_vram_bytes,
backend,
params_tensor_set,
log_desc);
cache->budgeted_graph_cut_plan = resolved_plan;
cache->budgeted_graph_cut_plan.available = true;
cache->budgeted_graph_cut_plan_max_vram_bytes = max_graph_vram_bytes;
}
}
return resolved_plan;
}
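// Typical call site (a minimal sketch; the backend, graph, and 512 MiB budget
// below are illustrative, not taken from this file):
//   sd::ggml_graph_cut::PlanCache cache;
//   Plan plan = resolve_plan(backend, gf, &cache,
//                            512ull * 1024 * 1024,  // max_graph_vram_bytes
//                            params_tensor_set, "unet");
//   if (plan.has_cuts && plan.valid) { /* run the graph segment by segment */ }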
} // namespace sd::ggml_graph_cut

View File

@@ -1,104 +0,0 @@
#ifndef __SD_GGML_GRAPH_CUT_H__
#define __SD_GGML_GRAPH_CUT_H__
#include <array>
#include <string>
#include <unordered_set>
#include <vector>
#include "ggml-backend.h"
#include "ggml.h"
namespace sd::ggml_graph_cut {
struct Segment {
enum InputType {
INPUT_EXTERNAL = 0,
INPUT_PREVIOUS_CUT,
INPUT_PARAM,
};
struct InputRef {
InputType type = INPUT_EXTERNAL;
std::string display_name;
int leaf_index = -1;
int node_index = -1;
};
size_t compute_buffer_size = 0;
size_t output_bytes = 0;
size_t input_external_bytes = 0;
size_t input_previous_cut_bytes = 0;
size_t input_param_bytes = 0;
std::string group_name;
std::vector<int> internal_node_indices;
std::vector<int> output_node_indices;
std::vector<InputRef> input_refs;
};
struct Plan {
struct InputShape {
int leaf_index = -1;
ggml_type type = GGML_TYPE_COUNT;
std::array<int64_t, GGML_MAX_DIMS> ne = {0, 0, 0, 0};
};
bool available = false;
bool has_cuts = false;
bool valid = true;
int n_nodes = 0;
int n_leafs = 0;
std::vector<InputShape> input_shapes;
std::vector<Segment> segments;
};
struct PlanCache {
Plan graph_cut_plan;
Plan budgeted_graph_cut_plan;
size_t budgeted_graph_cut_plan_max_vram_bytes = 0;
};
static constexpr const char* GGML_RUNNER_CUT_PREFIX = "ggml_runner_cut:";
bool is_graph_cut_tensor(const ggml_tensor* tensor);
std::string make_graph_cut_name(const std::string& group, const std::string& output);
void mark_graph_cut(ggml_tensor* tensor, const std::string& group, const std::string& output);
int leaf_count(ggml_cgraph* gf);
ggml_tensor* leaf_tensor(ggml_cgraph* gf, int leaf_index);
ggml_backend_buffer_t tensor_buffer(const ggml_tensor* tensor);
ggml_tensor* cache_source_tensor(ggml_tensor* tensor);
size_t cache_tensor_bytes(const ggml_tensor* tensor);
bool plan_matches_graph(ggml_cgraph* gf, const Plan& plan);
ggml_tensor* output_tensor(ggml_cgraph* gf, const Segment& segment, size_t output_index);
ggml_tensor* input_tensor(ggml_cgraph* gf, const Segment::InputRef& input_ref);
std::vector<ggml_tensor*> param_tensors(ggml_cgraph* gf, const Segment& segment);
std::vector<ggml_tensor*> runtime_param_tensors(ggml_cgraph* gf, const Segment& segment, const char* log_desc);
std::unordered_set<std::string> collect_future_input_names(ggml_cgraph* gf,
const Plan& plan,
size_t current_segment_index);
ggml_cgraph* build_segment_graph(ggml_cgraph* gf,
const Segment& segment,
ggml_context** graph_ctx_out);
size_t measure_segment_compute_buffer(ggml_backend_t backend,
ggml_cgraph* gf,
const Segment& segment,
const char* log_desc);
Plan build_plan(ggml_backend_t backend,
ggml_cgraph* gf,
const std::unordered_set<const ggml_tensor*>& params_tensor_set,
const char* log_desc);
Plan apply_max_vram_budget(ggml_cgraph* gf,
const Plan& base_plan,
size_t max_graph_vram_bytes,
ggml_backend_t backend,
const std::unordered_set<const ggml_tensor*>& params_tensor_set,
const char* log_desc);
Plan resolve_plan(ggml_backend_t backend,
ggml_cgraph* gf,
PlanCache* cache,
size_t max_graph_vram_bytes,
const std::unordered_set<const ggml_tensor*>& params_tensor_set,
const char* log_desc);
} // namespace sd::ggml_graph_cut
#endif
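The cut machinery declared above is driven entirely by a tensor-naming convention. Below is a minimal, self-contained sketch of just that convention (plain C++, no ggml dependency; the group and output names are made up for illustration):

```
#include <cstring>
#include <iostream>
#include <string>

static constexpr const char* GGML_RUNNER_CUT_PREFIX = "ggml_runner_cut:";

// Compose a cut name the same way make_graph_cut_name() does.
std::string make_name(const std::string& group, const std::string& output) {
    return std::string(GGML_RUNNER_CUT_PREFIX) + group + "|" + output;
}

// Recover the group the same way build_plan() does: strip the prefix,
// then take everything before the first '|'.
std::string parse_group(const std::string& name) {
    std::string payload = name.substr(std::strlen(GGML_RUNNER_CUT_PREFIX));
    size_t sep = payload.find('|');
    return sep == std::string::npos ? payload : payload.substr(0, sep);
}

int main() {
    std::string name = make_name("unet.blocks.3", "h");  // hypothetical group/output
    std::cout << name << "\n";               // ggml_runner_cut:unet.blocks.3|h
    std::cout << parse_group(name) << "\n";  // unet.blocks.3
}
```

Group extraction deliberately ignores everything after the first '|', so several outputs marked with the same group collapse into a single segment.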

View File

@@ -346,7 +346,6 @@ namespace LLM {
         auto merger = std::dynamic_pointer_cast<PatchMerger>(blocks["merger"]);
         auto x = patch_embed->forward(ctx, pixel_values);
-        sd::ggml_graph_cut::mark_graph_cut(x, "llm.vision.prelude", "x");
         x = ggml_reshape_4d(ctx->ggml_ctx, x, x->ne[0] * spatial_merge_size * spatial_merge_size, x->ne[1] / spatial_merge_size / spatial_merge_size, x->ne[2], x->ne[3]);
         x = ggml_get_rows(ctx->ggml_ctx, x, window_index);
@@ -360,11 +359,9 @@
                 mask = nullptr;
             }
             x = block->forward(ctx, x, pe, mask);
-            sd::ggml_graph_cut::mark_graph_cut(x, "llm.vision.blocks." + std::to_string(i), "x");
         }
         x = merger->forward(ctx, x);
-        sd::ggml_graph_cut::mark_graph_cut(x, "llm.vision.final", "x");
         x = ggml_get_rows(ctx->ggml_ctx, x, window_inverse_index);
@@ -509,7 +506,6 @@
         auto norm = std::dynamic_pointer_cast<RMSNorm>(blocks["norm"]);
         auto x = embed_tokens->forward(ctx, input_ids);
-        sd::ggml_graph_cut::mark_graph_cut(x, "llm.text.prelude", "x");
         std::vector<ggml_tensor*> intermediate_outputs;
@@ -556,10 +552,6 @@
             auto block = std::dynamic_pointer_cast<TransformerBlock>(blocks["layers." + std::to_string(i)]);
             x = block->forward(ctx, x, input_pos, attention_mask);
-            if (out_layers.size() > 1) {
-                x = ggml_cont(ctx->ggml_ctx, x);
-            }
-            sd::ggml_graph_cut::mark_graph_cut(x, "llm.text.layers." + std::to_string(i), "x");
             if (out_layers.find(i + 1) != out_layers.end()) {
                 intermediate_outputs.push_back(x);
             }

View File

@@ -129,7 +129,7 @@ struct LoraModel : public GGMLRunner {
         }
     }
-    ggml_tensor* get_lora_weight_diff(const std::string& model_tensor_name, ggml_context* ctx, ggml_backend_t backend) {
+    ggml_tensor* get_lora_weight_diff(const std::string& model_tensor_name, ggml_context* ctx) {
         ggml_tensor* updown = nullptr;
         int index = 0;
         while (true) {
@@ -152,17 +152,17 @@
             auto iter = lora_tensors.find(lora_up_name);
             if (iter != lora_tensors.end()) {
-                lora_up = ggml_ext_cast_f32(ctx, backend, iter->second);
+                lora_up = ggml_ext_cast_f32(ctx, iter->second);
             }
             iter = lora_tensors.find(lora_mid_name);
             if (iter != lora_tensors.end()) {
-                lora_mid = ggml_ext_cast_f32(ctx, backend, iter->second);
+                lora_mid = ggml_ext_cast_f32(ctx, iter->second);
             }
             iter = lora_tensors.find(lora_down_name);
             if (iter != lora_tensors.end()) {
-                lora_down = ggml_ext_cast_f32(ctx, backend, iter->second);
+                lora_down = ggml_ext_cast_f32(ctx, iter->second);
             }
             if (lora_up == nullptr || lora_down == nullptr) {
@@ -208,7 +208,7 @@ struct LoraModel : public GGMLRunner {
         return updown;
     }
-    ggml_tensor* get_raw_weight_diff(const std::string& model_tensor_name, ggml_context* ctx, ggml_backend_t backend) {
+    ggml_tensor* get_raw_weight_diff(const std::string& model_tensor_name, ggml_context* ctx) {
         ggml_tensor* updown = nullptr;
         int index = 0;
         while (true) {
@@ -225,7 +225,7 @@
             auto iter = lora_tensors.find(diff_name);
             if (iter != lora_tensors.end()) {
-                curr_updown = ggml_ext_cast_f32(ctx, backend, iter->second);
+                curr_updown = ggml_ext_cast_f32(ctx, iter->second);
             } else {
                 break;
             }
@@ -248,7 +248,7 @@ struct LoraModel : public GGMLRunner {
         return updown;
     }
-    ggml_tensor* get_loha_weight_diff(const std::string& model_tensor_name, ggml_context* ctx, ggml_backend_t backend) {
+    ggml_tensor* get_loha_weight_diff(const std::string& model_tensor_name, ggml_context* ctx) {
         ggml_tensor* updown = nullptr;
         int index = 0;
         while (true) {
@@ -276,33 +276,33 @@
             auto iter = lora_tensors.find(hada_1_down_name);
             if (iter != lora_tensors.end()) {
-                hada_1_down = ggml_ext_cast_f32(ctx, backend, iter->second);
+                hada_1_down = ggml_ext_cast_f32(ctx, iter->second);
             }
             iter = lora_tensors.find(hada_1_up_name);
             if (iter != lora_tensors.end()) {
-                hada_1_up = ggml_ext_cast_f32(ctx, backend, iter->second);
+                hada_1_up = ggml_ext_cast_f32(ctx, iter->second);
             }
             iter = lora_tensors.find(hada_1_mid_name);
             if (iter != lora_tensors.end()) {
-                hada_1_mid = ggml_ext_cast_f32(ctx, backend, iter->second);
+                hada_1_mid = ggml_ext_cast_f32(ctx, iter->second);
                 hada_1_up = ggml_cont(ctx, ggml_transpose(ctx, hada_1_up));
             }
             iter = lora_tensors.find(hada_2_down_name);
             if (iter != lora_tensors.end()) {
-                hada_2_down = ggml_ext_cast_f32(ctx, backend, iter->second);
+                hada_2_down = ggml_ext_cast_f32(ctx, iter->second);
             }
             iter = lora_tensors.find(hada_2_up_name);
             if (iter != lora_tensors.end()) {
-                hada_2_up = ggml_ext_cast_f32(ctx, backend, iter->second);
+                hada_2_up = ggml_ext_cast_f32(ctx, iter->second);
             }
             iter = lora_tensors.find(hada_2_mid_name);
             if (iter != lora_tensors.end()) {
-                hada_2_mid = ggml_ext_cast_f32(ctx, backend, iter->second);
+                hada_2_mid = ggml_ext_cast_f32(ctx, iter->second);
                 hada_2_up = ggml_cont(ctx, ggml_transpose(ctx, hada_2_up));
             }
@@ -351,7 +351,7 @@ struct LoraModel : public GGMLRunner {
         return updown;
     }
-    ggml_tensor* get_lokr_weight_diff(const std::string& model_tensor_name, ggml_context* ctx, ggml_backend_t backend) {
+    ggml_tensor* get_lokr_weight_diff(const std::string& model_tensor_name, ggml_context* ctx) {
         ggml_tensor* updown = nullptr;
         int index = 0;
         while (true) {
@@ -378,24 +378,24 @@
             auto iter = lora_tensors.find(lokr_w1_name);
             if (iter != lora_tensors.end()) {
-                lokr_w1 = ggml_ext_cast_f32(ctx, backend, iter->second);
+                lokr_w1 = ggml_ext_cast_f32(ctx, iter->second);
             }
             iter = lora_tensors.find(lokr_w2_name);
             if (iter != lora_tensors.end()) {
-                lokr_w2 = ggml_ext_cast_f32(ctx, backend, iter->second);
+                lokr_w2 = ggml_ext_cast_f32(ctx, iter->second);
             }
             int64_t rank = 1;
             if (lokr_w1 == nullptr) {
                 iter = lora_tensors.find(lokr_w1_a_name);
                 if (iter != lora_tensors.end()) {
-                    lokr_w1_a = ggml_ext_cast_f32(ctx, backend, iter->second);
+                    lokr_w1_a = ggml_ext_cast_f32(ctx, iter->second);
                 }
                 iter = lora_tensors.find(lokr_w1_b_name);
                 if (iter != lora_tensors.end()) {
-                    lokr_w1_b = ggml_ext_cast_f32(ctx, backend, iter->second);
+                    lokr_w1_b = ggml_ext_cast_f32(ctx, iter->second);
                 }
                 if (lokr_w1_a == nullptr || lokr_w1_b == nullptr) {
@@ -410,12 +410,12 @@
             if (lokr_w2 == nullptr) {
                 iter = lora_tensors.find(lokr_w2_a_name);
                 if (iter != lora_tensors.end()) {
-                    lokr_w2_a = ggml_ext_cast_f32(ctx, backend, iter->second);
+                    lokr_w2_a = ggml_ext_cast_f32(ctx, iter->second);
                 }
                 iter = lora_tensors.find(lokr_w2_b_name);
                 if (iter != lora_tensors.end()) {
-                    lokr_w2_b = ggml_ext_cast_f32(ctx, backend, iter->second);
+                    lokr_w2_b = ggml_ext_cast_f32(ctx, iter->second);
                 }
                 if (lokr_w2_a == nullptr || lokr_w2_b == nullptr) {
@@ -468,23 +468,23 @@
         return updown;
     }
-    ggml_tensor* get_weight_diff(const std::string& model_tensor_name, ggml_backend_t backend, ggml_context* ctx, ggml_tensor* model_tensor, bool with_lora_and_lokr = true) {
+    ggml_tensor* get_weight_diff(const std::string& model_tensor_name, ggml_context* ctx, ggml_tensor* model_tensor, bool with_lora_and_lokr = true) {
         // lora
         ggml_tensor* diff = nullptr;
         if (with_lora_and_lokr) {
-            diff = get_lora_weight_diff(model_tensor_name, ctx, backend);
+            diff = get_lora_weight_diff(model_tensor_name, ctx);
         }
         // diff
         if (diff == nullptr) {
-            diff = get_raw_weight_diff(model_tensor_name, ctx, backend);
+            diff = get_raw_weight_diff(model_tensor_name, ctx);
        }
         // loha
         if (diff == nullptr) {
-            diff = get_loha_weight_diff(model_tensor_name, ctx, backend);
+            diff = get_loha_weight_diff(model_tensor_name, ctx);
         }
         // lokr
         if (diff == nullptr && with_lora_and_lokr) {
-            diff = get_lokr_weight_diff(model_tensor_name, ctx, backend);
+            diff = get_lokr_weight_diff(model_tensor_name, ctx);
         }
         if (diff != nullptr) {
             if (ggml_nelements(diff) < ggml_nelements(model_tensor)) {
@@ -502,7 +502,6 @@
     }
     ggml_tensor* get_out_diff(ggml_context* ctx,
-                              ggml_backend_t backend,
                               ggml_tensor* x,
                               WeightAdapter::ForwardParams forward_params,
                               const std::string& model_tensor_name) {
@@ -591,7 +590,7 @@
             }
             scale_value *= multiplier;
-            auto curr_out_diff = ggml_ext_lokr_forward(ctx, backend, x, lokr_w1, lokr_w1_a, lokr_w1_b, lokr_w2, lokr_w2_a, lokr_w2_b, is_conv2d, forward_params.conv2d, scale_value);
+            auto curr_out_diff = ggml_ext_lokr_forward(ctx, x, lokr_w1, lokr_w1_a, lokr_w1_b, lokr_w2, lokr_w2_a, lokr_w2_b, is_conv2d, forward_params.conv2d, scale_value);
             if (out_diff == nullptr) {
                 out_diff = curr_out_diff;
             } else {
@@ -762,7 +761,7 @@
             ggml_tensor* model_tensor = it.second;
             // lora
-            ggml_tensor* diff = get_weight_diff(model_tensor_name, runtime_backend, compute_ctx, model_tensor);
+            ggml_tensor* diff = get_weight_diff(model_tensor_name, compute_ctx, model_tensor);
             if (diff == nullptr) {
                 continue;
             }
@@ -775,7 +774,7 @@
             ggml_tensor* final_tensor;
             if (model_tensor->type != GGML_TYPE_F32 && model_tensor->type != GGML_TYPE_F16) {
-                final_tensor = ggml_ext_cast_f32(compute_ctx, runtime_backend, model_tensor);
+                final_tensor = ggml_ext_cast_f32(compute_ctx, model_tensor);
                 final_tensor = ggml_add_inplace(compute_ctx, final_tensor, diff);
                 final_tensor = ggml_cpy(compute_ctx, final_tensor, model_tensor);
             } else {
@@ -842,35 +841,34 @@ public:
         : lora_models(lora_models) {
     }
-    ggml_tensor* patch_weight(ggml_context* ctx, ggml_backend_t backend, ggml_tensor* weight, const std::string& weight_name, bool with_lora_and_lokr) {
+    ggml_tensor* patch_weight(ggml_context* ctx, ggml_tensor* weight, const std::string& weight_name, bool with_lora_and_lokr) {
         for (auto& lora_model : lora_models) {
-            ggml_tensor* diff = lora_model->get_weight_diff(weight_name, backend, ctx, weight, with_lora_and_lokr);
+            ggml_tensor* diff = lora_model->get_weight_diff(weight_name, ctx, weight, with_lora_and_lokr);
             if (diff == nullptr) {
                 continue;
             }
             if (weight->type != GGML_TYPE_F32 && weight->type != GGML_TYPE_F16) {
-                weight = ggml_ext_cast_f32(ctx, backend, weight);
+                weight = ggml_ext_cast_f32(ctx, weight);
             }
             weight = ggml_add(ctx, weight, diff);
         }
         return weight;
     }
-    ggml_tensor* patch_weight(ggml_context* ctx, ggml_backend_t backend, ggml_tensor* weight, const std::string& weight_name) override {
-        return patch_weight(ctx, backend, weight, weight_name, true);
+    ggml_tensor* patch_weight(ggml_context* ctx, ggml_tensor* weight, const std::string& weight_name) override {
+        return patch_weight(ctx, weight, weight_name, true);
     }
     ggml_tensor* forward_with_lora(ggml_context* ctx,
-                                   ggml_backend_t backend,
                                    ggml_tensor* x,
                                    ggml_tensor* w,
                                    ggml_tensor* b,
                                    const std::string& prefix,
                                    WeightAdapter::ForwardParams forward_params) override {
-        w = patch_weight(ctx, backend, w, prefix + "weight", false);
+        w = patch_weight(ctx, w, prefix + "weight", false);
         if (b) {
-            b = patch_weight(ctx, backend, b, prefix + "bias", false);
+            b = patch_weight(ctx, b, prefix + "bias", false);
         }
         ggml_tensor* out;
         if (forward_params.op_type == ForwardParams::op_type_t::OP_LINEAR) {
@@ -892,7 +890,7 @@ public:
                                        forward_params.conv2d.scale);
         }
         for (auto& lora_model : lora_models) {
-            ggml_tensor* out_diff = lora_model->get_out_diff(ctx, backend, x, forward_params, prefix + "weight");
+            ggml_tensor* out_diff = lora_model->get_out_diff(ctx, x, forward_params, prefix + "weight");
             if (out_diff == nullptr) {
                 continue;
             }

View File

@@ -767,8 +767,6 @@ public:
             auto context_x = block->forward(ctx, context, x, c_mod);
             context = context_x.first;
             x = context_x.second;
-            sd::ggml_graph_cut::mark_graph_cut(context, "mmdit.joint_blocks." + std::to_string(i), "context");
-            sd::ggml_graph_cut::mark_graph_cut(x, "mmdit.joint_blocks." + std::to_string(i), "x");
         }
         x = final_layer->forward(ctx, x, c_mod);  // (N, T, patch_size ** 2 * out_channels)
@@ -811,11 +809,6 @@
             context = context_embedder->forward(ctx, context);  // [N, L, D] aka [N, L, 1536]
         }
-        sd::ggml_graph_cut::mark_graph_cut(x, "mmdit.prelude", "x");
-        sd::ggml_graph_cut::mark_graph_cut(c, "mmdit.prelude", "c");
-        if (context != nullptr) {
-            sd::ggml_graph_cut::mark_graph_cut(context, "mmdit.prelude", "context");
-        }
         x = forward_core_with_concat(ctx, x, c, context, skip_layers);  // (N, H*W, patch_size ** 2 * out_channels)

View File

@@ -23,11 +23,24 @@
 #include "ggml-alloc.h"
 #include "ggml-backend.h"
+#include "ggml-cpu.h"
 #include "ggml.h"
-#include "ggml_extend_backend.hpp"
 #include "zip.h"
 #include "name_conversion.h"
+#include "stable-diffusion.h"
+#ifdef SD_USE_METAL
+#include "ggml-metal.h"
+#endif
+#ifdef SD_USE_VULKAN
+#include "ggml-vulkan.h"
+#endif
+#ifdef SD_USE_OPENCL
+#include "ggml-opencl.h"
+#endif
 /*================================================= Preprocess ==================================================*/

View File

@@ -24,75 +24,6 @@ static inline void preprocessing_set_4d(sd::Tensor<float>& tensor, float value,
     tensor.values()[static_cast<size_t>(preprocessing_offset_4d(tensor, i0, i1, i2, i3))] = value;
 }
-static inline uint8_t preprocessing_float_to_u8(float value) {
-    if (value <= 0.0f) {
-        return 0;
-    }
-    if (value >= 1.0f) {
-        return 255;
-    }
-    return static_cast<uint8_t>(value * 255.0f + 0.5f);
-}
-static inline void preprocessing_tensor_frame_to_sd_image(const sd::Tensor<float>& tensor, int frame_index, uint8_t* image_data) {
-    const auto& shape = tensor.shape();
-    GGML_ASSERT(shape.size() == 4 || shape.size() == 5);
-    GGML_ASSERT(image_data != nullptr);
-    const int width = static_cast<int>(shape[0]);
-    const int height = static_cast<int>(shape[1]);
-    const int channel = static_cast<int>(shape[shape.size() == 5 ? 3 : 2]);
-    const size_t pixels = static_cast<size_t>(width) * static_cast<size_t>(height);
-    const float* src = tensor.data();
-    if (shape.size() == 4) {
-        GGML_ASSERT(frame_index >= 0 && frame_index < shape[3]);
-        const size_t frame_stride = pixels * static_cast<size_t>(channel);
-        const float* frame_ptr = src + static_cast<size_t>(frame_index) * frame_stride;
-        if (channel == 3) {
-            const float* c0 = frame_ptr;
-            const float* c1 = frame_ptr + pixels;
-            const float* c2 = frame_ptr + pixels * 2;
-            for (size_t i = 0; i < pixels; ++i) {
-                image_data[i * 3 + 0] = preprocessing_float_to_u8(c0[i]);
-                image_data[i * 3 + 1] = preprocessing_float_to_u8(c1[i]);
-                image_data[i * 3 + 2] = preprocessing_float_to_u8(c2[i]);
-            }
-            return;
-        }
-        for (size_t i = 0; i < pixels; ++i) {
-            for (int c = 0; c < channel; ++c) {
-                image_data[i * static_cast<size_t>(channel) + static_cast<size_t>(c)] =
-                    preprocessing_float_to_u8(frame_ptr[i + pixels * static_cast<size_t>(c)]);
-            }
-        }
-        return;
-    }
-    GGML_ASSERT(frame_index >= 0 && frame_index < shape[2]);
-    const size_t channel_stride = pixels * static_cast<size_t>(shape[2]);
-    const float* frame_ptr = src + static_cast<size_t>(frame_index) * pixels;
-    if (channel == 3) {
-        const float* c0 = frame_ptr;
-        const float* c1 = frame_ptr + channel_stride;
-        const float* c2 = frame_ptr + channel_stride * 2;
-        for (size_t i = 0; i < pixels; ++i) {
-            image_data[i * 3 + 0] = preprocessing_float_to_u8(c0[i]);
-            image_data[i * 3 + 1] = preprocessing_float_to_u8(c1[i]);
-            image_data[i * 3 + 2] = preprocessing_float_to_u8(c2[i]);
-        }
-        return;
-    }
-    for (size_t i = 0; i < pixels; ++i) {
-        for (int c = 0; c < channel; ++c) {
-            image_data[i * static_cast<size_t>(channel) + static_cast<size_t>(c)] =
-                preprocessing_float_to_u8(frame_ptr[i + channel_stride * static_cast<size_t>(c)]);
-        }
-    }
-}
 static inline sd::Tensor<float> sd_image_to_preprocessing_tensor(sd_image_t image) {
     sd::Tensor<float> tensor({static_cast<int64_t>(image.width), static_cast<int64_t>(image.height), static_cast<int64_t>(image.channel), 1});
     for (uint32_t y = 0; y < image.height; ++y) {
@@ -108,7 +39,20 @@ static inline sd::Tensor<float> sd_image_to_preprocessing_tensor(sd_image_t imag
 static inline void preprocessing_tensor_to_sd_image(const sd::Tensor<float>& tensor, uint8_t* image_data) {
     GGML_ASSERT(tensor.dim() == 4);
     GGML_ASSERT(tensor.shape()[3] == 1);
-    preprocessing_tensor_frame_to_sd_image(tensor, 0, image_data);
+    GGML_ASSERT(image_data != nullptr);
+    int width = static_cast<int>(tensor.shape()[0]);
+    int height = static_cast<int>(tensor.shape()[1]);
+    int channel = static_cast<int>(tensor.shape()[2]);
+    for (int y = 0; y < height; ++y) {
+        for (int x = 0; x < width; ++x) {
+            for (int c = 0; c < channel; ++c) {
+                float value = preprocessing_get_4d(tensor, x, y, c, 0);
+                value = std::min(1.0f, std::max(0.0f, value));
+                image_data[(y * width + x) * channel + c] = static_cast<uint8_t>(std::round(value * 255.0f));
+            }
+        }
+    }
 }
 static inline sd::Tensor<float> gaussian_kernel_tensor(int kernel_size) {
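The replacement body above quantizes each normalized float sample by clamping to [0, 1] and rounding to the nearest 8-bit value. Below is a standalone sketch of just that rule (the function name is illustrative):

```
#include <algorithm>
#include <cassert>
#include <cmath>
#include <cstdint>

// Quantize a normalized float sample to an 8-bit channel value,
// matching the clamp-then-round behavior of the conversion above.
static uint8_t float_to_u8(float value) {
    value = std::min(1.0f, std::max(0.0f, value));
    return static_cast<uint8_t>(std::round(value * 255.0f));
}

int main() {
    assert(float_to_u8(-0.5f) == 0);   // clamped low
    assert(float_to_u8(0.5f) == 128);  // 127.5 rounds up
    assert(float_to_u8(1.5f) == 255);  // clamped high
}
```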

View File

@@ -95,7 +95,9 @@ namespace Qwen {
         float scale = 1.f / 32.f;
         bool force_prec_f32 = false;
+#ifdef SD_USE_VULKAN
+        force_prec_f32 = true;
+#endif
         // The purpose of the scale here is to prevent NaN issues in certain situations.
         // For example when using CUDA but the weights are k-quants (not all prompts).
         blocks["to_out.0"] = std::shared_ptr<GGMLBlock>(new Linear(inner_dim, out_dim, out_bias, false, force_prec_f32, scale));
@@ -122,10 +124,6 @@
         auto to_v = std::dynamic_pointer_cast<Linear>(blocks["to_v"]);
         auto to_out_0 = std::dynamic_pointer_cast<Linear>(blocks["to_out.0"]);
-        if (sd_backend_is(ctx->backend, "Vulkan")) {
-            to_out_0->set_force_prec_f32(true);
-        }
         auto norm_added_q = std::dynamic_pointer_cast<UnaryBlock>(blocks["norm_added_q"]);
         auto norm_added_k = std::dynamic_pointer_cast<UnaryBlock>(blocks["norm_added_k"]);
@@ -412,9 +410,6 @@
         auto img = img_in->forward(ctx, x);
         auto txt = txt_norm->forward(ctx, context);
         txt = txt_in->forward(ctx, txt);
-        sd::ggml_graph_cut::mark_graph_cut(img, "qwen_image.prelude", "img");
-        sd::ggml_graph_cut::mark_graph_cut(txt, "qwen_image.prelude", "txt");
-        // sd::ggml_graph_cut::mark_graph_cut(t_emb, "qwen_image.prelude", "t_emb");
         for (int i = 0; i < params.num_layers; i++) {
             auto block = std::dynamic_pointer_cast<QwenImageTransformerBlock>(blocks["transformer_blocks." + std::to_string(i)]);
@@ -422,8 +417,6 @@
             auto result = block->forward(ctx, img, txt, t_emb, pe, modulate_index);
             img = result.first;
             txt = result.second;
-            sd::ggml_graph_cut::mark_graph_cut(img, "qwen_image.transformer_blocks." + std::to_string(i), "img");
-            sd::ggml_graph_cut::mark_graph_cut(txt, "qwen_image.transformer_blocks." + std::to_string(i), "txt");
         }
         if (params.zero_cond_t) {

View File

@@ -144,7 +144,6 @@ public:
     std::string taesd_path;
     sd_tiling_params_t vae_tiling_params = {false, 0, 0, 0.5f, 0, 0};
     bool offload_params_to_cpu = false;
-    float max_vram = 0.f;
     bool use_pmid = false;
     bool is_using_v_parameterization = false;
@@ -173,7 +172,60 @@
     }
     void init_backend() {
-        backend = sd_get_default_backend();
+#ifdef SD_USE_CUDA
+        LOG_DEBUG("Using CUDA backend");
+        backend = ggml_backend_cuda_init(0);
+#endif
+#ifdef SD_USE_METAL
+        LOG_DEBUG("Using Metal backend");
+        backend = ggml_backend_metal_init();
+#endif
+#ifdef SD_USE_VULKAN
+        LOG_DEBUG("Using Vulkan backend");
+        size_t device = 0;
+        const int device_count = ggml_backend_vk_get_device_count();
+        if (device_count) {
+            const char* SD_VK_DEVICE = getenv("SD_VK_DEVICE");
+            if (SD_VK_DEVICE != nullptr) {
+                std::string sd_vk_device_str = SD_VK_DEVICE;
+                try {
+                    device = std::stoull(sd_vk_device_str);
+                } catch (const std::invalid_argument&) {
+                    LOG_WARN("SD_VK_DEVICE environment variable is not a valid integer (%s). Falling back to device 0.", SD_VK_DEVICE);
+                    device = 0;
+                } catch (const std::out_of_range&) {
+                    LOG_WARN("SD_VK_DEVICE environment variable value is out of range for `unsigned long long` type (%s). Falling back to device 0.", SD_VK_DEVICE);
+                    device = 0;
+                }
+                if (device >= device_count) {
+                    LOG_WARN("Cannot find targeted vulkan device (%llu). Falling back to device 0.", device);
+                    device = 0;
+                }
+            }
+            LOG_INFO("Vulkan: Using device %llu", device);
+            backend = ggml_backend_vk_init(device);
+        }
+        if (!backend) {
+            LOG_WARN("Failed to initialize Vulkan backend");
+        }
+#endif
+#ifdef SD_USE_OPENCL
+        LOG_DEBUG("Using OpenCL backend");
+        // ggml_log_set(ggml_log_callback_default, nullptr); // Optional ggml logs
+        backend = ggml_backend_opencl_init();
+        if (!backend) {
+            LOG_WARN("Failed to initialize OpenCL backend");
+        }
+#endif
+#ifdef SD_USE_SYCL
+        LOG_DEBUG("Using SYCL backend");
+        backend = ggml_backend_sycl_init(0);
+#endif
+        if (!backend) {
+            LOG_DEBUG("Using CPU backend");
+            backend = ggml_backend_cpu_init();
+        }
     }
     std::shared_ptr<RNG> get_rng(rng_type_t rng_type) {
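The new Vulkan path selects the device index from the `SD_VK_DEVICE` environment variable, falling back to device 0 on any parse failure or out-of-range value. Below is a self-contained sketch of just that selection logic (the device count is a stand-in for `ggml_backend_vk_get_device_count()`):

```
#include <cstdlib>
#include <iostream>
#include <stdexcept>
#include <string>

// Mirror of the fallback rules above: an invalid integer, an out-of-range
// value, or an index >= device_count all degrade to device 0.
static size_t pick_device(const char* env, size_t device_count) {
    size_t device = 0;
    if (env != nullptr) {
        try {
            device = std::stoull(std::string(env));
        } catch (const std::invalid_argument&) {
            device = 0;
        } catch (const std::out_of_range&) {
            device = 0;
        }
        if (device >= device_count) {
            device = 0;
        }
    }
    return device;
}

int main() {
    const size_t device_count = 2;  // stand-in for the real device query
    std::cout << pick_device(std::getenv("SD_VK_DEVICE"), device_count) << "\n";
}
```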
@@ -191,7 +243,6 @@ public:
         vae_decode_only = sd_ctx_params->vae_decode_only;
         free_params_immediately = sd_ctx_params->free_params_immediately;
         offload_params_to_cpu = sd_ctx_params->offload_params_to_cpu;
-        max_vram = sd_ctx_params->max_vram;
         bool use_tae = false;
@@ -377,10 +428,6 @@
         bool clip_on_cpu = sd_ctx_params->keep_clip_on_cpu;
-        const size_t max_graph_vram_bytes = max_vram <= 0.f
-                                                ? 0
-                                                : static_cast<size_t>(static_cast<double>(max_vram) * 1024.0 * 1024.0 * 1024.0);
         {
             clip_backend = backend;
             if (clip_on_cpu && !ggml_backend_is_cpu(backend)) {
@@ -470,7 +517,6 @@
             clip_vision = std::make_shared<FrozenCLIPVisionEmbedder>(backend,
                                                                      offload_params_to_cpu,
                                                                      tensor_storage_map);
-            clip_vision->set_max_graph_vram_bytes(max_graph_vram_bytes);
             clip_vision->alloc_params_buffer();
             clip_vision->get_param_tensors(tensors);
         }
@@ -547,11 +593,9 @@
             }
         }
-        cond_stage_model->set_max_graph_vram_bytes(max_graph_vram_bytes);
         cond_stage_model->alloc_params_buffer();
         cond_stage_model->get_param_tensors(tensors);
-        diffusion_model->set_max_graph_vram_bytes(max_graph_vram_bytes);
         diffusion_model->alloc_params_buffer();
         diffusion_model->get_param_tensors(tensors);
@@ -560,7 +604,6 @@
         }
         if (high_noise_diffusion_model) {
-            high_noise_diffusion_model->set_max_graph_vram_bytes(max_graph_vram_bytes);
            high_noise_diffusion_model->alloc_params_buffer();
            high_noise_diffusion_model->get_param_tensors(tensors);
        }
@@ -633,19 +676,16 @@
         } else if (use_tae && !tae_preview_only) {
             LOG_INFO("using TAE for encoding / decoding");
             first_stage_model = create_tae();
-            first_stage_model->set_max_graph_vram_bytes(max_graph_vram_bytes);
             first_stage_model->alloc_params_buffer();
             first_stage_model->get_param_tensors(tensors, "tae");
         } else {
             LOG_INFO("using VAE for encoding / decoding");
             first_stage_model = create_vae();
-            first_stage_model->set_max_graph_vram_bytes(max_graph_vram_bytes);
             first_stage_model->alloc_params_buffer();
             first_stage_model->get_param_tensors(tensors, "first_stage_model");
             if (use_tae && tae_preview_only) {
                 LOG_INFO("using TAE for preview");
                 preview_vae = create_tae();
-                preview_vae->set_max_graph_vram_bytes(max_graph_vram_bytes);
                 preview_vae->alloc_params_buffer();
                 preview_vae->get_param_tensors(tensors, "tae");
             }
@@ -1117,13 +1157,8 @@
                     cond_stage_lora_models.push_back(lora);
                 }
             }
-            // Only attach the adapter when there are LoRAs targeting the cond_stage model.
-            // An empty MultiLoraAdapter still routes every linear/conv through
-            // forward_with_lora() instead of the direct kernel path — slower for no benefit.
-            if (!cond_stage_lora_models.empty()) {
-                auto multi_lora_adapter = std::make_shared<MultiLoraAdapter>(cond_stage_lora_models);
-                cond_stage_model->set_weight_adapter(multi_lora_adapter);
-            }
+            auto multi_lora_adapter = std::make_shared<MultiLoraAdapter>(cond_stage_lora_models);
+            cond_stage_model->set_weight_adapter(multi_lora_adapter);
         }
         if (diffusion_model) {
             std::vector<std::shared_ptr<LoraModel>> lora_models;
@@ -1154,12 +1189,10 @@
                     diffusion_lora_models.push_back(lora);
                 }
             }
-            if (!diffusion_lora_models.empty()) {
-                auto multi_lora_adapter = std::make_shared<MultiLoraAdapter>(diffusion_lora_models);
-                diffusion_model->set_weight_adapter(multi_lora_adapter);
-                if (high_noise_diffusion_model) {
-                    high_noise_diffusion_model->set_weight_adapter(multi_lora_adapter);
-                }
-            }
+            auto multi_lora_adapter = std::make_shared<MultiLoraAdapter>(diffusion_lora_models);
+            diffusion_model->set_weight_adapter(multi_lora_adapter);
+            if (high_noise_diffusion_model) {
+                high_noise_diffusion_model->set_weight_adapter(multi_lora_adapter);
+            }
         }
@@ -1192,10 +1225,8 @@
                     first_stage_lora_models.push_back(lora);
                 }
             }
-            if (!first_stage_lora_models.empty()) {
-                auto multi_lora_adapter = std::make_shared<MultiLoraAdapter>(first_stage_lora_models);
-                first_stage_model->set_weight_adapter(multi_lora_adapter);
-            }
+            auto multi_lora_adapter = std::make_shared<MultiLoraAdapter>(first_stage_lora_models);
+            first_stage_model->set_weight_adapter(multi_lora_adapter);
         }
     }
@@ -2085,19 +2116,12 @@ enum lora_apply_mode_t str_to_lora_apply_mode(const char* str) {
 const char* hires_upscaler_to_str[] = {
     "None",
-    "Latent",
     "Latent (nearest)",
-    "Latent (nearest-exact)",
-    "Latent (antialiased)",
-    "Latent (bicubic)",
-    "Latent (bicubic antialiased)",
-    "Lanczos",
-    "Nearest",
     "Model",
 };
 const char* sd_hires_upscaler_name(enum sd_hires_upscaler_t upscaler) {
-    if (upscaler >= SD_HIRES_UPSCALER_NONE && upscaler < SD_HIRES_UPSCALER_COUNT) {
+    if (upscaler < SD_HIRES_UPSCALER_COUNT) {
         return hires_upscaler_to_str[upscaler];
     }
     return NONE_STR;
@@ -2143,7 +2167,7 @@ void sd_cache_params_init(sd_cache_params_t* cache_params) {
 void sd_hires_params_init(sd_hires_params_t* hires_params) {
     *hires_params = {};
     hires_params->enabled = false;
-    hires_params->upscaler = SD_HIRES_UPSCALER_LATENT;
+    hires_params->upscaler = SD_HIRES_UPSCALER_LATENT_NEAREST;
     hires_params->model_path = nullptr;
     hires_params->scale = 2.0f;
     hires_params->target_width = 0;
@@ -2164,7 +2188,6 @@ void sd_ctx_params_init(sd_ctx_params_t* sd_ctx_params) {
     sd_ctx_params->prediction = PREDICTION_COUNT;
     sd_ctx_params->lora_apply_mode = LORA_APPLY_AUTO;
     sd_ctx_params->offload_params_to_cpu = false;
-    sd_ctx_params->max_vram = 0.f;
     sd_ctx_params->enable_mmap = false;
     sd_ctx_params->keep_clip_on_cpu = false;
@@ -2206,7 +2229,6 @@ char* sd_ctx_params_to_str(const sd_ctx_params_t* sd_ctx_params) {
                      "sampler_rng_type: %s\n"
                      "prediction: %s\n"
                      "offload_params_to_cpu: %s\n"
-                     "max_vram: %.3f\n"
                      "keep_clip_on_cpu: %s\n"
                      "keep_control_net_on_cpu: %s\n"
                      "keep_vae_on_cpu: %s\n"
@@ -2239,7 +2261,6 @@ char* sd_ctx_params_to_str(const sd_ctx_params_t* sd_ctx_params) {
                      sd_rng_type_name(sd_ctx_params->sampler_rng_type),
                      sd_prediction_name(sd_ctx_params->prediction),
                      BOOL_STR(sd_ctx_params->offload_params_to_cpu),
-                     sd_ctx_params->max_vram,
                      BOOL_STR(sd_ctx_params->keep_clip_on_cpu),
                      BOOL_STR(sd_ctx_params->keep_control_net_on_cpu),
                      BOOL_STR(sd_ctx_params->keep_vae_on_cpu),
@@ -2637,7 +2658,7 @@ struct GenerationRequest {
         hires.enabled = false;
         return;
     }
-    if (hires.upscaler < SD_HIRES_UPSCALER_NONE || hires.upscaler >= SD_HIRES_UPSCALER_COUNT) {
+    if (hires.upscaler < SD_HIRES_UPSCALER_NONE && hires.upscaler >= SD_HIRES_UPSCALER_COUNT) {
         LOG_WARN("hires upscaler '%d' is invalid, disabling hires", hires.upscaler);
         hires.enabled = false;
         return;
@@ -3205,7 +3226,7 @@ static sd_image_t* decode_image_outputs(sd_ctx_t* sd_ctx,
         }
         decoded_images.push_back(std::move(image));
         int64_t t2 = ggml_time_ms();
-        LOG_INFO("latent %zu decoded, taking %.2fs", i + 1, (t2 - t1) * 1.0f / 1000);
+        LOG_INFO("latent %" PRId64 " decoded, taking %.2fs", i + 1, (t2 - t1) * 1.0f / 1000);
     }
     int64_t t4 = ggml_time_ms();
@@ -3231,123 +3252,55 @@ static sd::Tensor<float> upscale_hires_latent(sd_ctx_t* sd_ctx,
                                               const sd::Tensor<float>& latent,
                                               const GenerationRequest& request,
                                               UpscalerGGML* upscaler) {
-    auto get_hires_latent_target_shape = [&]() {
-        std::vector<int64_t> target_shape = latent.shape();
-        if (target_shape.size() < 2) {
-            target_shape.clear();
-            return target_shape;
-        }
-        target_shape[0] = request.hires.target_width / request.vae_scale_factor;
-        target_shape[1] = request.hires.target_height / request.vae_scale_factor;
-        return target_shape;
-    };
-    if (request.hires.upscaler == SD_HIRES_UPSCALER_LATENT ||
-        request.hires.upscaler == SD_HIRES_UPSCALER_LATENT_NEAREST ||
-        request.hires.upscaler == SD_HIRES_UPSCALER_LATENT_NEAREST_EXACT ||
-        request.hires.upscaler == SD_HIRES_UPSCALER_LATENT_ANTIALIASED ||
-        request.hires.upscaler == SD_HIRES_UPSCALER_LATENT_BICUBIC ||
-        request.hires.upscaler == SD_HIRES_UPSCALER_LATENT_BICUBIC_ANTIALIASED) {
-        std::vector<int64_t> target_shape = get_hires_latent_target_shape();
-        if (target_shape.empty()) {
-            LOG_ERROR("latent has invalid shape for hires upscale");
-            return {};
-        }
-        sd::ops::InterpolateMode mode = sd::ops::InterpolateMode::Nearest;
-        bool antialias = false;
-        switch (request.hires.upscaler) {
-            case SD_HIRES_UPSCALER_LATENT:
-                mode = sd::ops::InterpolateMode::Bilinear;
-                break;
-            case SD_HIRES_UPSCALER_LATENT_NEAREST:
-                mode = sd::ops::InterpolateMode::Nearest;
-                break;
-            case SD_HIRES_UPSCALER_LATENT_NEAREST_EXACT:
-                mode = sd::ops::InterpolateMode::NearestExact;
-                break;
-            case SD_HIRES_UPSCALER_LATENT_ANTIALIASED:
-                mode = sd::ops::InterpolateMode::Bilinear;
-                antialias = true;
-                break;
-            case SD_HIRES_UPSCALER_LATENT_BICUBIC:
-                mode = sd::ops::InterpolateMode::Bicubic;
-                break;
-            case SD_HIRES_UPSCALER_LATENT_BICUBIC_ANTIALIASED:
-                mode = sd::ops::InterpolateMode::Bicubic;
-                antialias = true;
-                break;
-            default:
-                break;
-        }
-        LOG_INFO("hires %s upscale %" PRId64 "x%" PRId64 " -> %" PRId64 "x%" PRId64,
-                 sd_hires_upscaler_name(request.hires.upscaler),
-                 latent.shape()[0],
-                 latent.shape()[1],
-                 target_shape[0],
-                 target_shape[1]);
-        return sd::ops::interpolate(latent, target_shape, mode, false, antialias);
-    } else if (request.hires.upscaler == SD_HIRES_UPSCALER_MODEL ||
-               request.hires.upscaler == SD_HIRES_UPSCALER_LANCZOS ||
-               request.hires.upscaler == SD_HIRES_UPSCALER_NEAREST) {
-        if (sd_ctx->sd->vae_decode_only) {
-            LOG_ERROR("hires %s upscaler requires VAE encoder weights; create the context with vae_decode_only=false",
-                      sd_hires_upscaler_name(request.hires.upscaler));
-            return {};
-        }
-        if (request.hires.upscaler == SD_HIRES_UPSCALER_MODEL && upscaler == nullptr) {
-            LOG_ERROR("hires model upscaler context is null");
-            return {};
-        }
+    if (request.hires.upscaler == SD_HIRES_UPSCALER_LATENT_NEAREST) {
+        std::vector<int64_t> target_shape = latent.shape();
+        if (target_shape.size() < 2) {
+            LOG_ERROR("latent has invalid shape for hires upscale");
+            return {};
+        }
+        target_shape[0] = request.hires.target_width / request.vae_scale_factor;
+        target_shape[1] = request.hires.target_height / request.vae_scale_factor;
+        LOG_INFO("hires latent upscale %" PRId64 "x%" PRId64 " -> %" PRId64 "x%" PRId64,
+                 latent.shape()[0],
+                 latent.shape()[1],
+                 target_shape[0],
+                 target_shape[1]);
+        return sd::ops::interpolate(latent, target_shape, sd::ops::InterpolateMode::Nearest);
+    } else if (request.hires.upscaler == SD_HIRES_UPSCALER_MODEL) {
+        if (upscaler == nullptr) {
+            LOG_ERROR("hires model upscaler context is null");
+            return {};
+        }
+        if (sd_ctx->sd->vae_decode_only) {
+            LOG_ERROR("hires model upscaler requires VAE encoder weights; create the context with vae_decode_only=false");
+            return {};
+        }
         sd::Tensor<float> decoded = sd_ctx->sd->decode_first_stage(latent);
         if (decoded.empty()) {
-            LOG_ERROR("decode_first_stage failed before hires %s upscale",
-                      sd_hires_upscaler_name(request.hires.upscaler));
+            LOG_ERROR("decode_first_stage failed before hires model upscale");
             return {};
         }
-        sd::Tensor<float> upscaled_tensor;
-        if (request.hires.upscaler == SD_HIRES_UPSCALER_MODEL) {
-            upscaled_tensor = upscaler->upscale_tensor(decoded);
-            if (upscaled_tensor.empty()) {
-                LOG_ERROR("hires model upscale failed");
-                return {};
-            }
-            if (upscaled_tensor.shape()[0] != request.hires.target_width ||
-                upscaled_tensor.shape()[1] != request.hires.target_height) {
-                upscaled_tensor = sd::ops::interpolate(upscaled_tensor,
-                                                       {request.hires.target_width,
-                                                        request.hires.target_height,
-                                                        upscaled_tensor.shape()[2],
-                                                        upscaled_tensor.shape()[3]});
-            }
-        } else {
-            sd::ops::InterpolateMode mode = request.hires.upscaler == SD_HIRES_UPSCALER_LANCZOS
-                                                ? sd::ops::InterpolateMode::Lanczos
-                                                : sd::ops::InterpolateMode::Nearest;
-            LOG_INFO("hires %s image upscale %" PRId64 "x%" PRId64 " -> %dx%d",
-                     sd_hires_upscaler_name(request.hires.upscaler),
-                     decoded.shape()[0],
-                     decoded.shape()[1],
-                     request.hires.target_width,
-                     request.hires.target_height);
-            upscaled_tensor = sd::ops::interpolate(decoded,
-                                                   {request.hires.target_width,
-                                                    request.hires.target_height,
-                                                    decoded.shape()[2],
-                                                    decoded.shape()[3]},
-                                                   mode);
-            upscaled_tensor = sd::ops::clamp(upscaled_tensor, 0.0f, 1.0f);
-        }
+        sd::Tensor<float> upscaled_tensor = upscaler->upscale_tensor(decoded);
+        if (upscaled_tensor.empty()) {
+            LOG_ERROR("hires model upscale failed");
+            return {};
+        }
+        if (upscaled_tensor.shape()[0] != request.hires.target_width ||
+            upscaled_tensor.shape()[1] != request.hires.target_height) {
+            upscaled_tensor = sd::ops::interpolate(upscaled_tensor,
+                                                   {request.hires.target_width,
+                                                    request.hires.target_height,
+                                                    upscaled_tensor.shape()[2],
+                                                    upscaled_tensor.shape()[3]});
+        }
         sd::Tensor<float> upscaled_latent = sd_ctx->sd->encode_first_stage(upscaled_tensor);
         if (upscaled_latent.empty()) {
-            LOG_ERROR("encode_first_stage failed after hires %s upscale",
-                      sd_hires_upscaler_name(request.hires.upscaler));
+            LOG_ERROR("encode_first_stage failed after hires model upscale");
         }
         return upscaled_latent;
     }
@@ -3447,7 +3400,7 @@ SD_API sd_image_t* generate_image(sd_ctx_t* sd_ctx, const sd_img_gen_params_t* s
         sd_ctx->sd->diffusion_model->free_params_buffer();
     }
     int64_t denoise_end = ggml_time_ms();
-    LOG_INFO("generating %zu latent images completed, taking %.2fs",
+    LOG_INFO("generating %" PRId64 " latent images completed, taking %.2fs",
              final_latents.size(),
             (denoise_end - denoise_start) * 1.0f / 1000);
@@ -3457,13 +3410,9 @@
     std::unique_ptr<UpscalerGGML> hires_upscaler;
     if (request.hires.upscaler == SD_HIRES_UPSCALER_MODEL) {
         LOG_INFO("hires fix: loading model upscaler from '%s'", request.hires.model_path);
         hires_upscaler = std::make_unique<UpscalerGGML>(sd_ctx->sd->n_threads,
                                                         false,
                                                         request.hires.upscale_tile_size);
-        const size_t max_graph_vram_bytes = sd_ctx->sd->max_vram <= 0.f
-                                                ? 0
-                                                : static_cast<size_t>(static_cast<double>(sd_ctx->sd->max_vram) * 1024.0 * 1024.0 * 1024.0);
-        hires_upscaler->set_max_graph_vram_bytes(max_graph_vram_bytes);
         if (!hires_upscaler->load_from_file(request.hires.model_path,
                                             sd_ctx->sd->offload_params_to_cpu,
                                             sd_ctx->sd->n_threads)) {

View File

@@ -251,8 +251,7 @@ public:
                          ggml_tensor* x,
                          ggml_tensor* past_bias                = nullptr,
                          ggml_tensor* attention_mask           = nullptr,
-                         ggml_tensor* relative_position_bucket = nullptr,
-                         const std::string& graph_cut_prefix   = "") {
+                         ggml_tensor* relative_position_bucket = nullptr) {
         // x: [N, n_token, model_dim]
         for (int i = 0; i < num_layers; i++) {
             auto block = std::dynamic_pointer_cast<T5Block>(blocks["block." + std::to_string(i)]);
@@ -260,9 +259,6 @@ public:
             auto ret  = block->forward(ctx, x, past_bias, attention_mask, relative_position_bucket);
             x         = ret.first;
             past_bias = ret.second;
-            if (!graph_cut_prefix.empty()) {
-                sd::ggml_graph_cut::mark_graph_cut(x, graph_cut_prefix + ".block." + std::to_string(i), "x");
-            }
         }

         auto final_layer_norm = std::dynamic_pointer_cast<T5LayerNorm>(blocks["final_layer_norm"]);
@@ -309,8 +305,7 @@ public:
         auto encoder = std::dynamic_pointer_cast<T5Stack>(blocks["encoder"]);
         auto x       = shared->forward(ctx, input_ids);
-        sd::ggml_graph_cut::mark_graph_cut(x, "t5.prelude", "x");
-        x = encoder->forward(ctx, x, past_bias, attention_mask, relative_position_bucket, "t5");
+        x = encoder->forward(ctx, x, past_bias, attention_mask, relative_position_bucket);
         return x;
     }
 };

View File

@@ -815,202 +815,11 @@
 namespace sd {
     namespace ops {
        enum class InterpolateMode {
            Nearest,
-           NearestExact,
            NearestMax,
            NearestMin,
            NearestAvg,
-           Bilinear,
-           Bicubic,
-           Lanczos,
        };

-       inline bool is_nearest_like_interpolate_mode(InterpolateMode mode) {
-           return mode == InterpolateMode::Nearest ||
-                  mode == InterpolateMode::NearestExact ||
-                  mode == InterpolateMode::NearestMax ||
-                  mode == InterpolateMode::NearestMin ||
-                  mode == InterpolateMode::NearestAvg;
-       }
-
-       inline bool is_2d_filter_interpolate_mode(InterpolateMode mode) {
-           return mode == InterpolateMode::Bilinear ||
-                  mode == InterpolateMode::Bicubic ||
-                  mode == InterpolateMode::Lanczos;
-       }
-
-       inline int64_t nearest_exact_interpolate_index(int64_t output_index,
-                                                      int64_t input_size,
-                                                      int64_t output_size) {
-           const double scale  = static_cast<double>(input_size) / static_cast<double>(output_size);
-           const double center = (static_cast<double>(output_index) + 0.5) * scale - 0.5;
-           return std::min(std::max<int64_t>(static_cast<int64_t>(std::floor(center + 0.5)), 0), input_size - 1);
-       }
-
-       inline double linear_interpolate_weight(double x) {
-           x = std::abs(x);
-           return x < 1.0 ? 1.0 - x : 0.0;
-       }
-
-       inline double cubic_interpolate_weight(double x) {
-           constexpr double a = -0.75;  // Match PyTorch bicubic interpolation.
-           x = std::abs(x);
-           if (x <= 1.0) {
-               return ((a + 2.0) * x - (a + 3.0)) * x * x + 1.0;
-           }
-           if (x < 2.0) {
-               return ((a * x - 5.0 * a) * x + 8.0 * a) * x - 4.0 * a;
-           }
-           return 0.0;
-       }
-
-       inline double sinc(double x) {
-           constexpr double pi = 3.14159265358979323846;
-           if (std::abs(x) < 1e-12) {
-               return 1.0;
-           }
-           const double pix = pi * x;
-           return std::sin(pix) / pix;
-       }
-
-       inline double lanczos_interpolate_weight(double x) {
-           constexpr double radius = 3.0;
-           x = std::abs(x);
-           if (x >= radius) {
-               return 0.0;
-           }
-           return sinc(x) * sinc(x / radius);
-       }
-
-       struct InterpolateContributor {
-           int64_t index;
-           double weight;
-       };
-
-       inline std::vector<std::vector<InterpolateContributor>> make_interpolate_contributors(
-           int64_t input_size,
-           int64_t output_size,
-           InterpolateMode mode,
-           bool antialias) {
-           std::vector<std::vector<InterpolateContributor>> contributors(static_cast<size_t>(output_size));
-           const double scale        = static_cast<double>(input_size) / static_cast<double>(output_size);
-           const double filter_scale = antialias ? std::max(1.0, scale) : 1.0;
-           for (int64_t out = 0; out < output_size; ++out) {
-               const double center = (static_cast<double>(out) + 0.5) * scale - 0.5;
-               int64_t start = 0;
-               int64_t end   = 0;
-               if (mode == InterpolateMode::Bilinear) {
-                   const double support = filter_scale;
-                   start = static_cast<int64_t>(std::ceil(center - support));
-                   end   = static_cast<int64_t>(std::floor(center + support));
-               } else if (mode == InterpolateMode::Bicubic) {
-                   const double support = 2.0 * filter_scale;
-                   start = static_cast<int64_t>(std::ceil(center - support));
-                   end   = static_cast<int64_t>(std::floor(center + support));
-               } else if (mode == InterpolateMode::Lanczos) {
-                   const double support = 3.0 * filter_scale;
-                   start = static_cast<int64_t>(std::ceil(center - support));
-                   end   = static_cast<int64_t>(std::floor(center + support));
-               } else {
-                   tensor_throw_invalid_argument("Unsupported 2D filter interpolate mode: mode=" +
-                                                 std::to_string(static_cast<int>(mode)));
-               }
-               double weight_sum = 0.0;
-               std::vector<InterpolateContributor>& axis_contributors = contributors[static_cast<size_t>(out)];
-               axis_contributors.reserve(static_cast<size_t>(end - start + 1));
-               for (int64_t in = start; in <= end; ++in) {
-                   double weight = 0.0;
-                   if (mode == InterpolateMode::Bilinear) {
-                       weight = linear_interpolate_weight((center - static_cast<double>(in)) / filter_scale);
-                   } else if (mode == InterpolateMode::Bicubic) {
-                       weight = cubic_interpolate_weight((center - static_cast<double>(in)) / filter_scale);
-                   } else {
-                       weight = lanczos_interpolate_weight((center - static_cast<double>(in)) / filter_scale);
-                   }
-                   if (weight == 0.0) {
-                       continue;
-                   }
-                   const int64_t clamped_index = std::min(std::max<int64_t>(in, 0), input_size - 1);
-                   axis_contributors.push_back({clamped_index, weight});
-                   weight_sum += weight;
-               }
-               if ((antialias || mode == InterpolateMode::Lanczos) &&
-                   std::abs(weight_sum) > 1e-12) {
-                   for (auto& contributor : axis_contributors) {
-                       contributor.weight /= weight_sum;
-                   }
-               }
-               if (axis_contributors.empty()) {
-                   const int64_t nearest = std::min(
-                       std::max<int64_t>(static_cast<int64_t>(std::floor(center + 0.5)), 0),
-                       input_size - 1);
-                   axis_contributors.push_back({nearest, 1.0});
-               }
-           }
-           return contributors;
-       }
-
-       template <typename T>
-       inline Tensor<T> interpolate_2d_filter(const Tensor<T>& input,
-                                              const std::vector<int64_t>& output_shape,
-                                              InterpolateMode mode,
-                                              bool antialias) {
-           if (input.dim() < 2) {
-               tensor_throw_invalid_argument("2D filter interpolate requires rank >= 2: input_shape=" +
-                                             tensor_shape_to_string(input.shape()) + ", output_shape=" +
-                                             tensor_shape_to_string(output_shape));
-           }
-           for (size_t i = 2; i < output_shape.size(); ++i) {
-               if (input.shape()[i] != output_shape[i]) {
-                   tensor_throw_invalid_argument("2D filter interpolate only supports resizing dimensions 0 and 1: input_shape=" +
-                                                 tensor_shape_to_string(input.shape()) + ", output_shape=" +
-                                                 tensor_shape_to_string(output_shape));
-               }
-           }
-           Tensor<T> output(output_shape);
-           const int64_t input_width   = input.shape()[0];
-           const int64_t input_height  = input.shape()[1];
-           const int64_t output_width  = output_shape[0];
-           const int64_t output_height = output_shape[1];
-           const int64_t input_plane   = input_width * input_height;
-           const int64_t output_plane  = output_width * output_height;
-           const int64_t plane_count   = input.numel() / input_plane;
-           auto x_contributors = make_interpolate_contributors(input_width, output_width, mode, antialias);
-           auto y_contributors = make_interpolate_contributors(input_height, output_height, mode, antialias);
-           for (int64_t plane = 0; plane < plane_count; ++plane) {
-               const int64_t input_plane_offset  = plane * input_plane;
-               const int64_t output_plane_offset = plane * output_plane;
-               for (int64_t y = 0; y < output_height; ++y) {
-                   const auto& y_axis = y_contributors[static_cast<size_t>(y)];
-                   for (int64_t x = 0; x < output_width; ++x) {
-                       const auto& x_axis = x_contributors[static_cast<size_t>(x)];
-                       double value = 0.0;
-                       for (const auto& yc : y_axis) {
-                           const int64_t input_row_offset = input_plane_offset + yc.index * input_width;
-                           for (const auto& xc : x_axis) {
-                               value += static_cast<double>(input.data()[input_row_offset + xc.index]) *
-                                        xc.weight * yc.weight;
-                           }
-                       }
-                       output.data()[output_plane_offset + y * output_width + x] = static_cast<T>(value);
-                   }
-               }
-           }
-           return output;
-       }
-
        inline int64_t normalize_slice_bound(int64_t index, int64_t dim_size) {
            if (index < 0) {
                index += dim_size;
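The removed `make_interpolate_contributors` above implements separable resampling with optional antialiasing: when downsampling, the kernel is stretched by `filter_scale = max(1.0, input/output)`, so each output sample draws on proportionally more input taps and the weights are then renormalised. A standalone 1D sketch of that behaviour for a bilinear 8 -> 2 downscale (illustrative only, not part of the patch):

```cpp
#include <algorithm>
#include <cmath>
#include <cstdio>

int main() {
    const long in_size = 8, out_size = 2;
    const double scale        = double(in_size) / out_size;  // 4.0
    const double filter_scale = std::max(1.0, scale);        // antialias widens the kernel
    for (long out = 0; out < out_size; ++out) {
        const double center = (out + 0.5) * scale - 0.5;
        const long start    = (long)std::ceil(center - filter_scale);
        const long end      = (long)std::floor(center + filter_scale);
        double wsum         = 0.0;
        for (long in = start; in <= end; ++in) {
            // Triangle (bilinear) kernel, evaluated at the stretched distance.
            double x = std::fabs((center - in) / filter_scale);
            double w = x < 1.0 ? 1.0 - x : 0.0;
            if (w > 0.0) {
                // Out-of-range taps are clamped to the edge, as in the removed code.
                std::printf("out %ld <- in %ld (w = %.3f)\n",
                            out, std::clamp(in, 0L, in_size - 1), w);
                wsum += w;
            }
        }
        // The removed code divides every weight by this sum when antialiasing.
        std::printf("out %ld weight sum before normalisation: %.3f\n", out, wsum);
    }
    return 0;
}
```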
@@ -1205,20 +1014,17 @@ namespace sd {
        inline Tensor<T> interpolate(const Tensor<T>& input,
                                     std::vector<int64_t> output_shape,
                                     InterpolateMode mode = InterpolateMode::Nearest,
-                                    bool align_corners   = false,
-                                    bool antialias       = false) {
-           const bool is_nearest_like_mode = is_nearest_like_interpolate_mode(mode);
-           const bool is_2d_filter_mode    = is_2d_filter_interpolate_mode(mode);
-           if (!is_nearest_like_mode && !is_2d_filter_mode) {
-               tensor_throw_invalid_argument("Unsupported interpolate mode: mode=" +
-                                             std::to_string(static_cast<int>(mode)));
-           }
-           if (antialias && !is_2d_filter_mode) {
-               tensor_throw_invalid_argument("Tensor interpolate antialias requires a 2D filter mode: mode=" +
+                                    bool align_corners = false) {
+           const bool is_nearest_like_mode = (mode == InterpolateMode::Nearest ||
+                                              mode == InterpolateMode::NearestMax ||
+                                              mode == InterpolateMode::NearestMin ||
+                                              mode == InterpolateMode::NearestAvg);
+           if (!is_nearest_like_mode) {
+               tensor_throw_invalid_argument("Only nearest-like interpolate modes are implemented, got mode=" +
                                              std::to_string(static_cast<int>(mode)));
            }
            if (align_corners) {
-               tensor_throw_invalid_argument("align_corners is not supported for tensor interpolate: input_shape=" +
+               tensor_throw_invalid_argument("align_corners is not supported for nearest-like interpolate: input_shape=" +
                                              tensor_shape_to_string(input.shape()) + ", output_shape=" +
                                              tensor_shape_to_string(output_shape));
            }
@@ -1245,10 +1051,6 @@ namespace sd {
                }
            }

-           if (is_2d_filter_mode) {
-               return interpolate_2d_filter(input, output_shape, mode, antialias);
-           }
-
            bool has_downsampling = false;
            for (int64_t i = 0; i < input.dim(); ++i) {
                if (input.shape()[i] > output_shape[i]) {
@@ -1258,20 +1060,12 @@ namespace sd {
            }
            Tensor<T> output(std::move(output_shape));

-           if (mode == InterpolateMode::Nearest ||
-               mode == InterpolateMode::NearestExact ||
-               !has_downsampling) {
+           if (mode == InterpolateMode::Nearest || !has_downsampling) {
                for (int64_t flat = 0; flat < output.numel(); ++flat) {
                    std::vector<int64_t> output_coord = tensor_unravel_index(flat, output.shape());
                    std::vector<int64_t> input_coord(static_cast<size_t>(input.dim()), 0);
                    for (size_t i = 0; i < static_cast<size_t>(input.dim()); ++i) {
-                       if (mode == InterpolateMode::NearestExact) {
-                           input_coord[i] = nearest_exact_interpolate_index(output_coord[i],
-                                                                            input.shape()[i],
-                                                                            output.shape()[i]);
-                       } else {
-                           input_coord[i] = output_coord[i] * input.shape()[i] / output.shape()[i];
-                       }
+                       input_coord[i] = output_coord[i] * input.shape()[i] / output.shape()[i];
                    }
                    output[flat] = input.index(input_coord);
                }
@@ -1289,12 +1083,6 @@ namespace sd {
                        return T(0);
                    case InterpolateMode::Nearest:
                        return T(0);
-                   case InterpolateMode::NearestExact:
-                       return T(0);
-                   case InterpolateMode::Bilinear:
-                   case InterpolateMode::Bicubic:
-                   case InterpolateMode::Lanczos:
-                       break;
                }

                tensor_throw_invalid_argument("Unsupported interpolate mode: mode=" +
@@ -1314,12 +1102,6 @@ namespace sd {
                        break;
                    case InterpolateMode::Nearest:
                        break;
-                   case InterpolateMode::NearestExact:
-                       break;
-                   case InterpolateMode::Bilinear:
-                   case InterpolateMode::Bicubic:
-                   case InterpolateMode::Lanczos:
-                       break;
                }
            };
@@ -1375,20 +1157,17 @@ namespace sd {
                                     const std::optional<std::vector<int64_t>>& size,
                                     const std::optional<std::vector<double>>& scale_factor,
                                     InterpolateMode mode = InterpolateMode::Nearest,
-                                    bool align_corners   = false,
-                                    bool antialias       = false) {
-           const bool is_nearest_like_mode = is_nearest_like_interpolate_mode(mode);
-           const bool is_2d_filter_mode    = is_2d_filter_interpolate_mode(mode);
-           if (!is_nearest_like_mode && !is_2d_filter_mode) {
-               tensor_throw_invalid_argument("Unsupported interpolate mode: mode=" +
-                                             std::to_string(static_cast<int>(mode)));
-           }
-           if (antialias && !is_2d_filter_mode) {
-               tensor_throw_invalid_argument("Tensor interpolate antialias requires a 2D filter mode: mode=" +
+                                    bool align_corners = false) {
+           const bool is_nearest_like_mode = (mode == InterpolateMode::Nearest ||
+                                              mode == InterpolateMode::NearestMax ||
+                                              mode == InterpolateMode::NearestMin ||
+                                              mode == InterpolateMode::NearestAvg);
+           if (!is_nearest_like_mode) {
+               tensor_throw_invalid_argument("Only nearest-like interpolate modes are implemented, got mode=" +
                                              std::to_string(static_cast<int>(mode)));
            }
            if (align_corners) {
-               tensor_throw_invalid_argument("align_corners is not supported for tensor interpolate: input_shape=" +
+               tensor_throw_invalid_argument("align_corners is not supported for nearest-like interpolate: input_shape=" +
                                              tensor_shape_to_string(input.shape()));
            }
            if (size.has_value() == scale_factor.has_value()) {
@@ -1432,7 +1211,7 @@ namespace sd {
                }
            }

-           return interpolate(input, std::move(output_shape), mode, align_corners, antialias);
+           return interpolate(input, std::move(output_shape), mode, align_corners);
        }

        template <typename T>
@@ -1440,14 +1219,12 @@ namespace sd {
                                     const std::optional<std::vector<int64_t>>& size,
                                     double scale_factor,
                                     InterpolateMode mode = InterpolateMode::Nearest,
-                                    bool align_corners = false,
-                                    bool antialias     = false) {
+                                    bool align_corners = false) {
            return interpolate(input,
                               size,
                               std::vector<double>(size.has_value() ? size->size() : input.dim(), scale_factor),
                               mode,
-                              align_corners,
-                              antialias);
+                              align_corners);
        }

        template <typename T>
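After this change only the nearest-like modes survive, and the index mapping is plain integer arithmetic: `input_coord = output_coord * input_size / output_size`. A quick standalone check of what that mapping produces when upscaling a 3-element axis to 5 (illustrative only):

```cpp
#include <cstdio>

int main() {
    const long in_size = 3, out_size = 5;  // upscaling 3 -> 5
    for (long out = 0; out < out_size; ++out) {
        // Integer division floors, so earlier input indices get repeated.
        std::printf("output %ld reads input %ld\n", out, out * in_size / out_size);
    }
    // Prints inputs 0, 0, 1, 1, 2: every input index is hit at least once.
    return 0;
}
```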

View File

@@ -62,7 +62,7 @@ void CLIPTokenizer::load_from_merges(const std::string& merges_utf8_str) {
     }
     vocab.push_back(utf8_to_utf32("<|startoftext|>"));
     vocab.push_back(utf8_to_utf32("<|endoftext|>"));
-    LOG_DEBUG("vocab size: %zu", vocab.size());
+    LOG_DEBUG("vocab size: %llu", vocab.size());
     int i = 0;
     for (const auto& token : vocab) {
         encoder[token] = i;
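Note that `vocab.size()` returns `size_t`, for which `%zu` (the master side) is the portable conversion specifier; `%llu` only matches exactly on platforms where `size_t` is `unsigned long long`. If `%zu` must be avoided, the usual portable form is an explicit cast, as in this standalone sketch; the same applies to the Mistral and Qwen2 tokenizer changes below:

```cpp
#include <cstdio>
#include <vector>

int main() {
    std::vector<int> vocab(49408);  // CLIP BPE vocab size, for illustration
    // %zu is the portable size_t format; %llu requires an explicit cast.
    std::printf("vocab size: %zu\n", vocab.size());
    std::printf("vocab size: %llu\n", (unsigned long long)vocab.size());
    return 0;
}
```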

View File

@@ -28,7 +28,7 @@ void MistralTokenizer::load_from_merges(const std::string& merges_utf8_str, cons
         byte_decoder[pair.second] = pair.first;
     }
     std::vector<std::u32string> merges = split_utf32(merges_utf8_str);
-    LOG_DEBUG("merges size %zu", merges.size());
+    LOG_DEBUG("merges size %llu", merges.size());
     std::vector<std::pair<std::u32string, std::u32string>> merge_pairs;
     for (const auto& merge : merges) {
         size_t space_pos = merge.find(' ');

View File

@@ -11,7 +11,7 @@ void Qwen2Tokenizer::load_from_merges(const std::string& merges_utf8_str) {
     }
     std::vector<std::u32string> merges = split_utf32(merges_utf8_str);
-    LOG_DEBUG("merges size %zu", merges.size());
+    LOG_DEBUG("merges size %llu", merges.size());
     std::vector<std::pair<std::u32string, std::u32string>> merge_pairs;
     for (const auto& merge : merges) {
         size_t space_pos = merge.find(' ');

View File

@@ -482,14 +482,12 @@ public:
             emb = ggml_add(ctx->ggml_ctx, emb, label_emb);  // [N, time_embed_dim]
         }
-        // sd::ggml_graph_cut::mark_graph_cut(emb, "unet.prelude", "emb");

         // input_blocks
         std::vector<ggml_tensor*> hs;

         // input block 0
         auto h = input_blocks_0_0->forward(ctx, x);
-        sd::ggml_graph_cut::mark_graph_cut(h, "unet.input_blocks.0", "h");
         ggml_set_name(h, "bench-start");
         hs.push_back(h);
@@ -507,7 +505,6 @@ public:
                 std::string name = "input_blocks." + std::to_string(input_block_idx) + ".1";
                 h = attention_layer_forward(name, ctx, h, context, num_video_frames);  // [N, mult*model_channels, h, w]
             }
-            sd::ggml_graph_cut::mark_graph_cut(h, "unet.input_blocks." + std::to_string(input_block_idx), "h");
             hs.push_back(h);
         }
         if (tiny_unet) {
@@ -521,7 +518,6 @@ public:
                 auto block = std::dynamic_pointer_cast<DownSampleBlock>(blocks[name]);
                 h = block->forward(ctx, h);  // [N, mult*model_channels, h/(2^(i+1)), w/(2^(i+1))]
-                // sd::ggml_graph_cut::mark_graph_cut(h, "unet.input_blocks." + std::to_string(input_block_idx), "h");
                 hs.push_back(h);
             }
         }
@@ -535,7 +531,6 @@ public:
                 h = resblock_forward("middle_block.2", ctx, h, emb, num_video_frames);  // [N, 4*model_channels, h/8, w/8]
             }
         }
-        sd::ggml_graph_cut::mark_graph_cut(h, "unet.middle_block", "h");
         if (controls.size() > 0) {
             auto cs = ggml_ext_scale(ctx->ggml_ctx, controls[controls.size() - 1], control_strength, true);
             h       = ggml_add(ctx->ggml_ctx, h, cs);  // middle control
@@ -586,7 +581,6 @@ public:
             }
             output_block_idx += 1;
-            sd::ggml_graph_cut::mark_graph_cut(h, "unet.output_blocks." + std::to_string(output_block_idx - 1), "h");
         }
     }

View File

@@ -12,20 +12,30 @@ UpscalerGGML::UpscalerGGML(int n_threads,
       tile_size(tile_size) {
 }

-void UpscalerGGML::set_max_graph_vram_bytes(size_t max_vram_bytes) {
-    max_graph_vram_bytes = max_vram_bytes;
-    if (esrgan_upscaler) {
-        esrgan_upscaler->set_max_graph_vram_bytes(max_vram_bytes);
-    }
-}
-
 bool UpscalerGGML::load_from_file(const std::string& esrgan_path,
                                   bool offload_params_to_cpu,
                                   int n_threads) {
     ggml_log_set(ggml_log_callback_default, nullptr);
-    backend = sd_get_default_backend();
+#ifdef SD_USE_CUDA
+    LOG_DEBUG("Using CUDA backend");
+    backend = ggml_backend_cuda_init(0);
+#endif
+#ifdef SD_USE_METAL
+    LOG_DEBUG("Using Metal backend");
+    backend = ggml_backend_metal_init();
+#endif
+#ifdef SD_USE_VULKAN
+    LOG_DEBUG("Using Vulkan backend");
+    backend = ggml_backend_vk_init(0);
+#endif
+#ifdef SD_USE_OPENCL
+    LOG_DEBUG("Using OpenCL backend");
+    backend = ggml_backend_opencl_init();
+#endif
+#ifdef SD_USE_SYCL
+    LOG_DEBUG("Using SYCL backend");
+    backend = ggml_backend_sycl_init(0);
+#endif
     ModelLoader model_loader;
     if (!model_loader.init_from_file_and_convert_name(esrgan_path)) {
         LOG_ERROR("init model loader from file failed: '%s'", esrgan_path.c_str());
@@ -37,7 +47,6 @@ bool UpscalerGGML::load_from_file(const std::string& esrgan_path,
     }
     LOG_INFO("Upscaler weight type: %s", ggml_type_name(model_data_type));
     esrgan_upscaler = std::make_shared<ESRGAN>(backend, offload_params_to_cpu, tile_size, model_loader.get_tensor_storage_map());
-    esrgan_upscaler->set_max_graph_vram_bytes(max_graph_vram_bytes);
     if (direct) {
         esrgan_upscaler->set_conv2d_direct_enabled(true);
     }

View File

@@ -14,9 +14,8 @@ struct UpscalerGGML {
     std::shared_ptr<ESRGAN> esrgan_upscaler;
     std::string esrgan_path;
     int n_threads;
     bool direct   = false;
     int tile_size = 128;
-    size_t max_graph_vram_bytes = 0;

     UpscalerGGML(int n_threads,
                  bool direct = false,
@@ -25,7 +24,6 @@ struct UpscalerGGML {
     bool load_from_file(const std::string& esrgan_path,
                         bool offload_params_to_cpu,
                         int n_threads);
-    void set_max_graph_vram_bytes(size_t max_vram_bytes);
     sd::Tensor<float> upscale_tensor(const sd::Tensor<float>& input_tensor);
     sd_image_t upscale(sd_image_t input_image, uint32_t upscale_factor);
 };
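A hedged usage sketch of the interface declared above; the header name, model path, and parameter values are placeholders rather than anything taken from this patch:

```cpp
#include "upscaler.hpp"  // assumed header name for the UpscalerGGML declaration

// Load an ESRGAN-style model and upscale one image 4x; error handling minimal.
sd_image_t upscale_4x(sd_image_t input) {
    UpscalerGGML upscaler(/*n_threads=*/4, /*direct=*/false, /*tile_size=*/128);
    if (!upscaler.load_from_file("RealESRGAN_x4plus.pth",  // placeholder path
                                 /*offload_params_to_cpu=*/false,
                                 /*n_threads=*/4)) {
        return input;  // loading failed; hand back the original image
    }
    return upscaler.upscale(input, /*upscale_factor=*/4);
}
```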

View File

@@ -23,9 +23,8 @@
 #include <unistd.h>
 #endif

-#include "ggml-backend.h"
+#include "ggml-cpu.h"
 #include "ggml.h"
-#include "ggml_extend_backend.hpp"
 #include "stable-diffusion.h"

 bool ends_with(const std::string& str, const std::string& ending) {
@@ -120,10 +119,10 @@ std::unique_ptr<MmapWrapper> MmapWrapper::create(const std::string& filename) {
                                    filename.c_str(),
                                    GENERIC_READ,
                                    FILE_SHARE_READ,
-                                   nullptr,
+                                   NULL,
                                    OPEN_EXISTING,
                                    FILE_ATTRIBUTE_NORMAL,
-                                   nullptr);
+                                   NULL);

     if (file_handle == INVALID_HANDLE_VALUE) {
         return nullptr;
@@ -137,16 +136,16 @@ std::unique_ptr<MmapWrapper> MmapWrapper::create(const std::string& filename) {
     file_size = static_cast<size_t>(size.QuadPart);

-    HANDLE mapping_handle = CreateFileMapping(file_handle, nullptr, PAGE_READONLY, 0, 0, nullptr);
+    HANDLE mapping_handle = CreateFileMapping(file_handle, NULL, PAGE_READONLY, 0, 0, NULL);

-    if (mapping_handle == nullptr) {
+    if (mapping_handle == NULL) {
         CloseHandle(file_handle);
         return nullptr;
     }

     mapped_data = MapViewOfFile(mapping_handle, FILE_MAP_READ, 0, 0, file_size);

-    if (mapped_data == nullptr) {
+    if (mapped_data == NULL) {
         CloseHandle(mapping_handle);
         CloseHandle(file_handle);
         return nullptr;
@@ -204,7 +203,7 @@ std::unique_ptr<MmapWrapper> MmapWrapper::create(const std::string& filename) {
     size_t file_size = sb.st_size;

-    void* mapped_data = mmap(nullptr, file_size, PROT_READ, mmap_flags, file_descriptor, 0);
+    void* mapped_data = mmap(NULL, file_size, PROT_READ, mmap_flags, file_descriptor, 0);

     close(file_descriptor);
@@ -496,6 +495,26 @@ sd_progress_cb_t sd_get_progress_callback() {
 void* sd_get_progress_callback_data() {
     return sd_progress_cb_data;
 }
+
+const char* sd_get_system_info() {
+    static char buffer[1024];
+    std::stringstream ss;
+    ss << "System Info: \n";
+    ss << " SSE3 = " << ggml_cpu_has_sse3() << " | ";
+    ss << " AVX = " << ggml_cpu_has_avx() << " | ";
+    ss << " AVX2 = " << ggml_cpu_has_avx2() << " | ";
+    ss << " AVX512 = " << ggml_cpu_has_avx512() << " | ";
+    ss << " AVX512_VBMI = " << ggml_cpu_has_avx512_vbmi() << " | ";
+    ss << " AVX512_VNNI = " << ggml_cpu_has_avx512_vnni() << " | ";
+    ss << " FMA = " << ggml_cpu_has_fma() << " | ";
+    ss << " NEON = " << ggml_cpu_has_neon() << " | ";
+    ss << " ARM_FMA = " << ggml_cpu_has_arm_fma() << " | ";
+    ss << " F16C = " << ggml_cpu_has_f16c() << " | ";
+    ss << " FP16_VA = " << ggml_cpu_has_fp16_va() << " | ";
+    ss << " WASM_SIMD = " << ggml_cpu_has_wasm_simd() << " | ";
+    ss << " VSX = " << ggml_cpu_has_vsx() << " | ";
+    snprintf(buffer, sizeof(buffer), "%s", ss.str().c_str());
+    return buffer;
+}

 sd_image_t tensor_to_sd_image(const sd::Tensor<float>& tensor, int frame_index) {
     const auto& shape = tensor.shape();
@@ -505,7 +524,17 @@ sd_image_t tensor_to_sd_image(const sd::Tensor<float>& tensor, int frame_index)
     int channel   = static_cast<int>(shape[shape.size() == 5 ? 3 : 2]);
     uint8_t* data = (uint8_t*)malloc(static_cast<size_t>(width * height * channel));
     GGML_ASSERT(data != nullptr);
-    preprocessing_tensor_frame_to_sd_image(tensor, frame_index, data);
+
+    for (int iw = 0; iw < width; ++iw) {
+        for (int ih = 0; ih < height; ++ih) {
+            for (int ic = 0; ic < channel; ++ic) {
+                float value = shape.size() == 5 ? tensor.index(iw, ih, frame_index, ic, 0)
+                                                : tensor.index(iw, ih, ic, frame_index);
+                value       = std::clamp(value, 0.0f, 1.0f);
+                data[(ih * width + iw) * channel + ic] = static_cast<uint8_t>(std::round(value * 255.0f));
+            }
+        }
+    }
     return {
         static_cast<uint32_t>(width),
         static_cast<uint32_t>(height),
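The conversion added above quantizes each float channel with clamp-plus-round. A standalone check (illustrative only) of its worst-case round-trip error, which works out to 0.5/255, roughly 1/510:

```cpp
#include <algorithm>
#include <cmath>
#include <cstdint>
#include <cstdio>

int main() {
    double worst = 0.0;
    for (double v = 0.0; v <= 1.0; v += 1e-5) {
        // Same clamp + round-to-nearest quantization as the diff above.
        uint8_t q = (uint8_t)std::lround(std::clamp(v, 0.0, 1.0) * 255.0);
        worst     = std::max(worst, std::fabs(v - q / 255.0));
    }
    std::printf("max round-trip error: %f (~1/510)\n", worst);  // ~0.00196
    return 0;
}
```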
@@ -689,100 +718,3 @@ std::vector<std::pair<std::string, float>> parse_prompt_attention(const std::str
     return res;
 }
-
-// test if the backend is a specific one, e.g. "CUDA", "ROCm", "Vulkan" etc.
-bool sd_backend_is(ggml_backend_t backend, const std::string& name) {
-    if (!backend) {
-        return false;
-    }
-    ggml_backend_dev_t dev = ggml_backend_get_device(backend);
-    if (!dev)
-        return false;
-    std::string dev_name = ggml_backend_dev_name(dev);
-    return dev_name.find(name) != std::string::npos;
-}
-
-ggml_backend_t sd_get_default_backend() {
-    ggml_backend_load_all_once();
-    static std::once_flag once;
-    std::call_once(once, []() {
-        size_t dev_count = ggml_backend_dev_count();
-        if (dev_count == 0) {
-            LOG_ERROR("No devices found!");
-        } else {
-            LOG_DEBUG("Found %zu backend devices:", dev_count);
-            for (size_t i = 0; i < dev_count; ++i) {
-                auto dev = ggml_backend_dev_get(i);
-                LOG_DEBUG("#%zu: %s", i, ggml_backend_dev_name(dev));
-            }
-        }
-    });
-    ggml_backend_t backend   = nullptr;
-    const char* SD_VK_DEVICE = getenv("SD_VK_DEVICE");
-    if (SD_VK_DEVICE != nullptr) {
-        std::string sd_vk_device_str = SD_VK_DEVICE;
-        try {
-            unsigned long long device  = std::stoull(sd_vk_device_str);
-            std::string vk_device_name = "Vulkan" + std::to_string(device);
-            if (backend_name_exists(vk_device_name)) {
-                LOG_INFO("Selecting %s as main device by env var SD_VK_DEVICE", vk_device_name.c_str());
-                backend = init_named_backend(vk_device_name);
-                if (!backend) {
-                    LOG_WARN("Device %s requested by SD_VK_DEVICE failed to init. Falling back to the default device.", vk_device_name.c_str());
-                }
-            } else {
-                LOG_WARN("Device %s requested by SD_VK_DEVICE was not found. Falling back to the default device.", vk_device_name.c_str());
-            }
-        } catch (const std::invalid_argument&) {
-            LOG_WARN("SD_VK_DEVICE environment variable is not a valid integer (%s). Falling back to the default device.", SD_VK_DEVICE);
-        } catch (const std::out_of_range&) {
-            LOG_WARN("SD_VK_DEVICE environment variable value is out of range for `unsigned long long` type (%s). Falling back to the default device.", SD_VK_DEVICE);
-        }
-    }
-    if (!backend) {
-        std::string dev_name = get_default_backend_name();
-        backend = init_named_backend(dev_name);
-        if (!backend && !dev_name.empty()) {
-            LOG_WARN("device %s failed to init", dev_name.c_str());
-        }
-    }
-    if (!backend) {
-        LOG_WARN("loading CPU backend");
-        backend = ggml_backend_cpu_init();
-    }
-    if (ggml_backend_is_cpu(backend)) {
-        LOG_DEBUG("Using CPU backend");
-    }
-    return backend;
-}
-
-// namespace is needed to avoid conflicts with ggml_backend_extend.hpp
-namespace ggml_cpu {
-#include "ggml-cpu.h"
-}
-
-const char* sd_get_system_info() {
-    using namespace ggml_cpu;
-    static char buffer[1024];
-    std::stringstream ss;
-    ss << "System Info: \n";
-    ss << " SSE3 = " << ggml_cpu_has_sse3() << " | ";
-    ss << " AVX = " << ggml_cpu_has_avx() << " | ";
-    ss << " AVX2 = " << ggml_cpu_has_avx2() << " | ";
-    ss << " AVX512 = " << ggml_cpu_has_avx512() << " | ";
-    ss << " AVX512_VBMI = " << ggml_cpu_has_avx512_vbmi() << " | ";
-    ss << " AVX512_VNNI = " << ggml_cpu_has_avx512_vnni() << " | ";
-    ss << " FMA = " << ggml_cpu_has_fma() << " | ";
-    ss << " NEON = " << ggml_cpu_has_neon() << " | ";
-    ss << " ARM_FMA = " << ggml_cpu_has_arm_fma() << " | ";
-    ss << " F16C = " << ggml_cpu_has_f16c() << " | ";
-    ss << " FP16_VA = " << ggml_cpu_has_fp16_va() << " | ";
-    ss << " WASM_SIMD = " << ggml_cpu_has_wasm_simd() << " | ";
-    ss << " VSX = " << ggml_cpu_has_vsx() << " | ";
-    snprintf(buffer, sizeof(buffer), "%s", ss.str().c_str());
-    return buffer;
-}
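The removed `sd_get_default_backend` above honoured an `SD_VK_DEVICE` environment variable by mapping its integer value to a backend device literally named `Vulkan<n>`. A standalone sketch of just that mapping (illustrative only; the `backend_name_exists`/`init_named_backend` helpers it relied on are left out):

```cpp
#include <cstdio>
#include <cstdlib>
#include <string>

int main() {
    // e.g. SD_VK_DEVICE=1 selects the device named "Vulkan1".
    if (const char* env = std::getenv("SD_VK_DEVICE")) {
        try {
            unsigned long long device = std::stoull(env);
            std::string name          = "Vulkan" + std::to_string(device);
            std::printf("would request backend device '%s'\n", name.c_str());
        } catch (const std::exception&) {
            // Covers both std::invalid_argument and std::out_of_range.
            std::printf("SD_VK_DEVICE is not a valid integer; using default\n");
        }
    }
    return 0;
}
```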

View File

@@ -6,7 +6,6 @@
 #include <string>
 #include <vector>

-#include "ggml-backend.h"
 #include "stable-diffusion.h"
 #include "tensor.hpp"
@@ -83,10 +82,6 @@ int sd_get_preview_interval();
 bool sd_should_preview_denoised();
 bool sd_should_preview_noisy();

-// test if the backend is a specific one, e.g. "CUDA", "ROCm", "Vulkan" etc.
-bool sd_backend_is(ggml_backend_t backend, const std::string& name);
-ggml_backend_t sd_get_default_backend();
-
 #define LOG_DEBUG(format, ...) log_printf(SD_LOG_DEBUG, __FILE__, __LINE__, format, ##__VA_ARGS__)
 #define LOG_INFO(format, ...) log_printf(SD_LOG_INFO, __FILE__, __LINE__, format, ##__VA_ARGS__)
 #define LOG_WARN(format, ...) log_printf(SD_LOG_WARN, __FILE__, __LINE__, format, ##__VA_ARGS__)

View File

@@ -142,10 +142,9 @@ public:
                                  "vae encode compute failed while processing a tile");
         } else {
             output = _compute(n_threads, input, false);
+            free_compute_buffer();
         }
-        free_compute_buffer();

         if (output.empty()) {
             LOG_ERROR("vae encode compute failed");
             return {};

View File

@@ -692,7 +692,6 @@ namespace WAN {
             } else {
                 x = conv1->forward(ctx, x);
             }
-            // sd::ggml_graph_cut::mark_graph_cut(x, "wan_vae.encoder.prelude", "x");

             // downsamples
             std::vector<int64_t> dims = {dim};
@@ -718,14 +717,12 @@ namespace WAN {
                         x = layer->forward(ctx, x, b, feat_cache, feat_idx, chunk_idx);
                     }
                 }
-                // sd::ggml_graph_cut::mark_graph_cut(x, "wan_vae.encoder.down." + std::to_string(i), "x");
             }

             // middle
             x = middle_0->forward(ctx, x, b, feat_cache, feat_idx);
             x = middle_1->forward(ctx, x, b);
             x = middle_2->forward(ctx, x, b, feat_cache, feat_idx);
-            // sd::ggml_graph_cut::mark_graph_cut(x, "wan_vae.encoder.mid", "x");

             // head
             x = head_0->forward(ctx, x);
@@ -866,13 +863,11 @@ namespace WAN {
             } else {
                 x = conv1->forward(ctx, x);
             }
-            // sd::ggml_graph_cut::mark_graph_cut(x, "wan_vae.decoder.prelude", "x");

             // middle
             x = middle_0->forward(ctx, x, b, feat_cache, feat_idx);
             x = middle_1->forward(ctx, x, b);
             x = middle_2->forward(ctx, x, b, feat_cache, feat_idx);
-            // sd::ggml_graph_cut::mark_graph_cut(x, "wan_vae.decoder.mid", "x");

             // upsamples
             std::vector<int64_t> dims = {dim_mult[dim_mult.size() - 1] * dim};
@@ -898,7 +893,6 @@ namespace WAN {
                         x = layer->forward(ctx, x, b, feat_cache, feat_idx, chunk_idx);
                     }
                 }
-                // sd::ggml_graph_cut::mark_graph_cut(x, "wan_vae.decoder.up." + std::to_string(i), "x");
             }

             // head
@@ -1037,7 +1031,6 @@ namespace WAN {
             if (wan2_2) {
                 x = patchify(ctx->ggml_ctx, x, 2, b);
             }
-            // sd::ggml_graph_cut::mark_graph_cut(x, "wan_vae.encode.prelude", "x");

             auto encoder = std::dynamic_pointer_cast<Encoder3d>(blocks["encoder"]);
             auto conv1   = std::dynamic_pointer_cast<CausalConv3d>(blocks["conv1"]);
@@ -1058,7 +1051,6 @@ namespace WAN {
             }
             out     = conv1->forward(ctx, out);
             auto mu = ggml_ext_chunk(ctx->ggml_ctx, out, 2, 3)[0];
-            // sd::ggml_graph_cut::mark_graph_cut(mu, "wan_vae.encode.final", "mu");
             clear_cache();
             return mu;
         }
@@ -1076,7 +1068,6 @@ namespace WAN {
             int64_t iter_ = z->ne[2];
             auto x        = conv2->forward(ctx, z);
-            // sd::ggml_graph_cut::mark_graph_cut(x, "wan_vae.decode.prelude", "x");
             ggml_tensor* out;
             for (int i = 0; i < iter_; i++) {
                 _conv_idx = 0;
@@ -1092,7 +1083,6 @@ namespace WAN {
             if (wan2_2) {
                 out = unpatchify(ctx->ggml_ctx, out, 2, b);
             }
-            // sd::ggml_graph_cut::mark_graph_cut(out, "wan_vae.decode.final", "out");
             clear_cache();
             return out;
         }
@@ -1107,15 +1097,13 @@ namespace WAN {
             auto decoder = std::dynamic_pointer_cast<Decoder3d>(blocks["decoder"]);
             auto conv2   = std::dynamic_pointer_cast<CausalConv3d>(blocks["conv2"]);
             auto x       = conv2->forward(ctx, z);
-            // sd::ggml_graph_cut::mark_graph_cut(x, "wan_vae.decode_partial.prelude", "x");

             auto in   = ggml_ext_slice(ctx->ggml_ctx, x, 2, i, i + 1);  // [b*c, 1, h, w]
             _conv_idx = 0;
             auto out  = decoder->forward(ctx, in, b, _feat_map, _conv_idx, i);
             if (wan2_2) {
                 out = unpatchify(ctx->ggml_ctx, out, 2, b);
             }
-            // sd::ggml_graph_cut::mark_graph_cut(out, "wan_vae.decode_partial.final", "out");
             return out;
         }
     };
@@ -1996,13 +1984,6 @@ namespace WAN {
                 c = ggml_reshape_3d(ctx->ggml_ctx, c, c->ne[0] * c->ne[1] * c->ne[2], c->ne[3] / N, N);  // [N, dim, t_len*h_len*w_len]
                 c = ggml_ext_cont(ctx->ggml_ctx, ggml_ext_torch_permute(ctx->ggml_ctx, c, 1, 0, 2, 3));  // [N, t_len*h_len*w_len, dim]
             }
-            sd::ggml_graph_cut::mark_graph_cut(x, "wan.prelude", "x");
-            // sd::ggml_graph_cut::mark_graph_cut(e, "wan.prelude", "e");
-            // sd::ggml_graph_cut::mark_graph_cut(e0, "wan.prelude", "e0");
-            // sd::ggml_graph_cut::mark_graph_cut(context, "wan.prelude", "context");
-            if (c != nullptr) {
-                sd::ggml_graph_cut::mark_graph_cut(c, "wan.prelude", "c");
-            }

             auto x_orig = x;
@@ -2023,10 +2004,6 @@ namespace WAN {
                     c_skip = ggml_ext_scale(ctx->ggml_ctx, c_skip, vace_strength);
                     x      = ggml_add(ctx->ggml_ctx, x, c_skip);
                 }
-                sd::ggml_graph_cut::mark_graph_cut(x, "wan.blocks." + std::to_string(i), "x");
-                if (c != nullptr) {
-                    sd::ggml_graph_cut::mark_graph_cut(c, "wan.blocks." + std::to_string(i), "c");
-                }
             }

             x = head->forward(ctx, x, e);  // [N, t_len*h_len*w_len, pt*ph*pw*out_dim]

View File

@@ -31,6 +31,10 @@ namespace ZImage {
             : head_dim(head_dim), num_heads(num_heads), num_kv_heads(num_kv_heads), qk_norm(qk_norm) {
             blocks["qkv"] = std::make_shared<Linear>(hidden_size, (num_heads + num_kv_heads * 2) * head_dim, false);
             float scale   = 1.f;
+#if GGML_USE_HIP
+            // Prevent NaN issues with certain ROCm setups
+            scale = 1.f / 16.f;
+#endif
             blocks["out"] = std::make_shared<Linear>(num_heads * head_dim, hidden_size, false, false, false, scale);
             if (qk_norm) {
                 blocks["q_norm"] = std::make_shared<RMSNorm>(head_dim);
@@ -48,10 +52,6 @@ namespace ZImage {
             auto qkv_proj = std::dynamic_pointer_cast<Linear>(blocks["qkv"]);
             auto out_proj = std::dynamic_pointer_cast<Linear>(blocks["out"]);

-            if (sd_backend_is(ctx->backend, "ROCm")) {
-                out_proj->set_scale(1.f / 16.f);
-            }
-
             auto qkv = qkv_proj->forward(ctx, x);  // [N, n_token, (num_heads + num_kv_heads*2)*head_dim]
             qkv      = ggml_reshape_4d(ctx->ggml_ctx, qkv, head_dim, num_heads + num_kv_heads * 2, qkv->ne[1], qkv->ne[2]);  // [N, n_token, num_heads + num_kv_heads*2, head_dim]
@@ -115,7 +115,9 @@ namespace ZImage {
             bool force_prec_f32 = false;
             float scale         = 1.f / 128.f;
+#ifdef SD_USE_VULKAN
+            force_prec_f32 = true;
+#endif
             // The purpose of the scale here is to prevent NaN issues in certain situations.
             // For example, when using CUDA but the weights are k-quants.
             blocks["w2"] = std::make_shared<Linear>(hidden_dim, dim, false, false, force_prec_f32, scale);
@@ -127,10 +129,6 @@ namespace ZImage {
             auto w2 = std::dynamic_pointer_cast<Linear>(blocks["w2"]);
             auto w3 = std::dynamic_pointer_cast<Linear>(blocks["w3"]);

-            if (sd_backend_is(ctx->backend, "Vulkan")) {
-                w2->set_force_prec_f32(true);
-            }
-
             auto x1 = w1->forward(ctx, x);
             auto x3 = w3->forward(ctx, x);
             x       = ggml_swiglu_split(ctx->ggml_ctx, x1, x3);
@@ -371,9 +369,6 @@ namespace ZImage {
             auto txt = cap_embedder_1->forward(ctx, cap_embedder_0->forward(ctx, context));  // [N, n_txt_token, hidden_size]
             auto img = x_embedder->forward(ctx, x);                                          // [N, n_img_token, hidden_size]

-            sd::ggml_graph_cut::mark_graph_cut(txt, "z_image.prelude", "txt");
-            sd::ggml_graph_cut::mark_graph_cut(img, "z_image.prelude", "img");
-            sd::ggml_graph_cut::mark_graph_cut(t_emb, "z_image.prelude", "t_emb");

             int64_t n_txt_pad_token = Rope::bound_mod(static_cast<int>(n_txt_token), SEQ_MULTI_OF);
             if (n_txt_pad_token > 0) {
@@ -396,24 +391,20 @@ namespace ZImage {
                 auto block = std::dynamic_pointer_cast<JointTransformerBlock>(blocks["context_refiner." + std::to_string(i)]);
                 txt        = block->forward(ctx, txt, txt_pe, nullptr, nullptr);
-                sd::ggml_graph_cut::mark_graph_cut(txt, "z_image.context_refiner." + std::to_string(i), "txt");
             }

             for (int i = 0; i < z_image_params.num_refiner_layers; i++) {
                 auto block = std::dynamic_pointer_cast<JointTransformerBlock>(blocks["noise_refiner." + std::to_string(i)]);
                 img        = block->forward(ctx, img, img_pe, nullptr, t_emb);
-                sd::ggml_graph_cut::mark_graph_cut(img, "z_image.noise_refiner." + std::to_string(i), "img");
             }

             auto txt_img = ggml_concat(ctx->ggml_ctx, txt, img, 1);  // [N, n_txt_token + n_txt_pad_token + n_img_token + n_img_pad_token, hidden_size]
-            sd::ggml_graph_cut::mark_graph_cut(txt_img, "z_image.prelude", "txt_img");

             for (int i = 0; i < z_image_params.num_layers; i++) {
                 auto block = std::dynamic_pointer_cast<JointTransformerBlock>(blocks["layers." + std::to_string(i)]);
                 txt_img    = block->forward(ctx, txt_img, pe, nullptr, t_emb);
-                sd::ggml_graph_cut::mark_graph_cut(txt_img, "z_image.layers." + std::to_string(i), "txt_img");
             }

             txt_img = final_layer->forward(ctx, txt_img, t_emb);  // [N, n_txt_token + n_txt_pad_token + n_img_token + n_img_pad_token, ph*pw*C]
txt_img = final_layer->forward(ctx, txt_img, t_emb); // [N, n_txt_token + n_txt_pad_token + n_img_token + n_img_pad_token, ph*pw*C] txt_img = final_layer->forward(ctx, txt_img, t_emb); // [N, n_txt_token + n_txt_pad_token + n_img_token + n_img_pad_token, ph*pw*C]