diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index b173ebed..749366e7 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -176,6 +176,7 @@ jobs: build-and-push-docker-images: name: Build and push container images + if: ${{ github.event_name != 'pull_request' }} runs-on: ubuntu-latest permissions: diff --git a/CMakeLists.txt b/CMakeLists.txt index a43c99f9..6a9fb104 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -11,6 +11,10 @@ endif() if (MSVC) add_compile_definitions(_CRT_SECURE_NO_WARNINGS) add_compile_definitions(_SILENCE_CXX17_CODECVT_HEADER_DEPRECATION_WARNING) + add_compile_options( + $<$<COMPILE_LANGUAGE:C>:/MP> + $<$<COMPILE_LANGUAGE:CXX>:/MP> + ) endif() set(CMAKE_LIBRARY_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/bin) @@ -152,12 +156,16 @@ endif() set(SD_LIB stable-diffusion) -file(GLOB SD_LIB_SOURCES +file(GLOB SD_LIB_SOURCES CONFIGURE_DEPENDS "src/*.h" "src/*.cpp" "src/*.hpp" - "src/vocab/*.h" - "src/vocab/*.cpp" + "src/model_io/*.h" + "src/model_io/*.cpp" + "src/tokenizers/*.h" + "src/tokenizers/*.cpp" + "src/tokenizers/vocab/*.h" + "src/tokenizers/vocab/*.cpp" ) find_program(GIT_EXE NAMES git git.exe NO_CMAKE_FIND_ROOT_PATH) @@ -250,7 +258,7 @@ endif() add_subdirectory(thirdparty) target_link_libraries(${SD_LIB} PUBLIC ggml zip) -target_include_directories(${SD_LIB} PUBLIC . include) +target_include_directories(${SD_LIB} PUBLIC . src include) target_include_directories(${SD_LIB} PUBLIC . 
thirdparty) target_compile_features(${SD_LIB} PUBLIC c_std_11 cxx_std_17) diff --git a/README.md b/README.md index fbed50d2..8afdeb20 100644 --- a/README.md +++ b/README.md @@ -57,6 +57,7 @@ API and command-line option may change frequently.*** - [Z-Image](./docs/z_image.md) - [Ovis-Image](./docs/ovis_image.md) - [Anima](./docs/anima.md) + - [ERNIE-Image](./docs/ernie_image.md) - Image Edit Models - [FLUX.1-Kontext-dev](./docs/kontext.md) - [Qwen Image Edit series](./docs/qwen_image_edit.md) @@ -76,9 +77,10 @@ API and command-line option may change frequently.*** - OpenCL - SYCL - Supported weight formats - - Pytorch checkpoint (`.ckpt` or `.pth`) + - Pytorch checkpoint (`.ckpt` or `.pth` or `.pt`) - Safetensors (`.safetensors`) - GGUF (`.gguf`) +- Convert mode supports converting model weights to `.gguf` or `.safetensors` - Supported platforms - Linux - Mac OS @@ -96,6 +98,7 @@ API and command-line option may change frequently.*** - `DPM++ 2M` - [`DPM++ 2M v2`](https://github.com/AUTOMATIC1111/stable-diffusion-webui/discussions/8457) - `DPM++ 2S a` + - `ER-SDE` - [`LCM`](https://github.com/AUTOMATIC1111/stable-diffusion-webui/issues/13952) - Cross-platform reproducibility - `--rng cuda`, default, consistent with the `stable-diffusion-webui GPU RNG` @@ -144,6 +147,7 @@ If you want to improve performance or reduce VRAM/RAM usage, please refer to [pe - [πŸ”₯Z-Image](./docs/z_image.md) - [Ovis-Image](./docs/ovis_image.md) - [Anima](./docs/anima.md) +- [ERNIE-Image](./docs/ernie_image.md) - [LoRA](./docs/lora.md) - [LCM/LCM-LoRA](./docs/lcm.md) - [Using PhotoMaker to personalize image generation](./docs/photo_maker.md) diff --git a/assets/ernie_image/example.png b/assets/ernie_image/example.png new file mode 100644 index 00000000..3f5ed652 Binary files /dev/null and b/assets/ernie_image/example.png differ diff --git a/assets/ernie_image/turbo_example.png b/assets/ernie_image/turbo_example.png new file mode 100644 index 00000000..15318b3e Binary files /dev/null and 
b/assets/ernie_image/turbo_example.png differ diff --git a/docs/distilled_sd.md b/docs/distilled_sd.md index 3174b18f..7aa8fbed 100644 --- a/docs/distilled_sd.md +++ b/docs/distilled_sd.md @@ -87,51 +87,32 @@ pipe.save_pretrained("segmindtiny-sd", safe_serialization=True) ```bash python convert_diffusers_to_original_stable_diffusion.py \ --model_path ./segmindtiny-sd \ - --checkpoint_path ./segmind_tiny-sd.ckpt --half + --checkpoint_path ./segmind_tiny-sd.safetensors --half --use_safetensors ``` -The file segmind_tiny-sd.ckpt will be generated and is now ready for use with sd.cpp. You can follow a similar process for the other models mentioned above. +The file segmind_tiny-sd.safetensors will be generated and is now ready for use with sd.cpp. You can follow a similar process for the other models mentioned above. -##### Another available .ckpt file: - - * https://huggingface.co/ClashSAN/small-sd/resolve/main/tinySDdistilled.ckpt - -To use this file, you must first adjust its non-contiguous tensors: - -```python -import torch -ckpt = torch.load("tinySDdistilled.ckpt", map_location=torch.device('cpu')) -for key, value in ckpt['state_dict'].items(): - if isinstance(value, torch.Tensor): - ckpt['state_dict'][key] = value.contiguous() -torch.save(ckpt, "tinySDdistilled_fixed.ckpt") -``` - - -### SDXS-512 +### SDXS-512-DreamShaper Another very tiny and **incredibly fast** model is SDXS by IDKiro et al. The authors refer to it as *"Real-Time One-Step Latent Diffusion Models with Image Conditions"*. For details read the paper: https://arxiv.org/pdf/2403.16627 . Once again the authors removed some more blocks of U-Net part and unlike other SD1 models they use an adjusted _AutoEncoderTiny_ instead of default _AutoEncoderKL_ for the VAE part. +##### Some ready-to-run SDXS-512 model files are available online, such as: -##### 1. 
Download the diffusers model from Hugging Face using Python: - -```python -from diffusers import StableDiffusionPipeline -pipe = StableDiffusionPipeline.from_pretrained("IDKiro/sdxs-512-dreamshaper") -pipe.save_pretrained(save_directory="sdxs") -``` -##### 2. Create a safetensors file - -```bash -python convert_diffusers_to_original_stable_diffusion.py \ - --model_path sdxs --checkpoint_path sdxs.safetensors --half --use_safetensors -``` - -##### 3. Run the model as follows: +* https://huggingface.co/akleine/sdxs-512 +* https://huggingface.co/concedo/sdxs-512-tinySDdistilled-GGUF +##### Run the model as follows: ```bash ~/stable-diffusion.cpp/build/bin/sd-cli -m sdxs.safetensors -p "portrait of a lovely cat" \ --cfg-scale 1 --steps 1 ``` +Both options: ``` --cfg-scale 1 ``` and ``` --steps 1 ``` are mandatory here. -Both options: ``` --cfg-scale 1 ``` and ``` --steps 1 ``` are mandatory here. +### SDXS-512-0.9 + +Even though the name "SDXS-512-0.9" is similar to "SDXS-512-DreamShaper", it is *completely different* but also **incredibly fast**. Sometimes it is preferred, so try it yourself. +##### Download a ready-to-run file from here: + +* https://huggingface.co/akleine/sdxs-09 + +For the use of this model, both options ``` --cfg-scale 1 ``` and ``` --steps 1 ``` are again absolutely necessary. diff --git a/docs/ernie_image.md b/docs/ernie_image.md new file mode 100644 index 00000000..d68da396 --- /dev/null +++ b/docs/ernie_image.md @@ -0,0 +1,35 @@ +# How to Use + +You can run ERNIE-Image with stable-diffusion.cpp on GPUs with 4GB of VRAM β€” or even less. 
+ +## Download weights + +- Download ERNIE-Image-Turbo + - safetensors: https://huggingface.co/Comfy-Org/ERNIE-Image/tree/main/diffusion_models + - gguf: https://huggingface.co/unsloth/ERNIE-Image-Turbo-GGUF/tree/main +- Download ERNIE-Image + - safetensors: https://huggingface.co/Comfy-Org/ERNIE-Image/tree/main/diffusion_models + - gguf: https://huggingface.co/unsloth/ERNIE-Image-GGUF/tree/main +- Download vae + - safetensors: https://huggingface.co/Comfy-Org/ERNIE-Image/tree/main/vae +- Download ministral 3b + - safetensors: https://huggingface.co/Comfy-Org/ERNIE-Image/tree/main/text_encoders + - gguf: https://huggingface.co/unsloth/Ministral-3-3B-Instruct-2512-GGUF/tree/main + +## Examples + +### ERNIE-Image-Turbo + +``` +.\bin\Release\sd-cli.exe --diffusion-model ..\..\ComfyUI\models\diffusion_models\ernie-image-turbo.safetensors --vae ..\..\ComfyUI\models\vae\flux2_ae.safetensors --llm ..\..\ComfyUI\models\text_encoders\ministral-3-3b.safetensors -p "a lovely cat" --cfg-scale 1.0 --steps 8 -v --offload-to-cpu --diffusion-fa +``` + +ERNIE-Image Turbo example + +### ERNIE-Image + +``` +.\bin\Release\sd-cli.exe --diffusion-model ..\..\ComfyUI\models\diffusion_models\ernie-image-UD-Q4_K_M.gguf --vae ..\..\ComfyUI\models\vae\flux2_ae.safetensors --llm ..\..\ComfyUI\models\text_encoders\ministral-3-3b.safetensors -p "a lovely cat" --cfg-scale 5.0 -v --offload-to-cpu --diffusion-fa +``` + +ERNIE-Image example diff --git a/examples/cli/README.md b/examples/cli/README.md index 289cb866..7b620fee 100644 --- a/examples/cli/README.md +++ b/examples/cli/README.md @@ -4,26 +4,29 @@ usage: ./bin/sd-cli [options] CLI Options: - -o, --output path to write result image to. you can use printf-style %d format specifiers for image sequences (default: - ./output.png) (eg. output_%03d.png). For video generation, single-file outputs support .avi, .webm, and animated .webp - --preview-path path to write preview image to (default: ./preview.png). 
Multi-frame previews support .avi, .webm, and animated .webp - --preview-interval interval in denoising steps between consecutive updates of the image preview file (default is 1, meaning updating at - every step) - --output-begin-idx starting index for output image sequence, must be non-negative (default 0 if specified %d in output path, 1 otherwise) - --image path to the image to inspect (for metadata mode) - --metadata-format metadata output format, one of [text, json] (default: text) - --canny apply canny preprocessor (edge detection) - --convert-name convert tensor name (for convert mode) - -v, --verbose print extra info - --color colors the logging tags according to level - --taesd-preview-only prevents usage of taesd for decoding the final image. (for use with --preview tae) - --preview-noisy enables previewing noisy inputs of the models rather than the denoised outputs - --metadata-raw include raw hex previews for unparsed metadata payloads - --metadata-brief truncate long metadata text values in text output - --metadata-all include structural/container entries such as IHDR, IDAT, and non-metadata JPEG segments - -M, --mode run mode, one of [img_gen, vid_gen, upscale, convert, metadata], default: img_gen - --preview preview method. must be one of the following [none, proj, tae, vae] (default is none) - -h, --help show this help message and exit + -o, --output path to write result image to. you can use printf-style %d format specifiers for image + sequences (default: ./output.png) (eg. output_%03d.png). Single-file video outputs + support .avi, .webm, and animated .webp + --image path to the image to inspect (for metadata mode) + --metadata-format metadata output format, one of [text, json] (default: text) + --preview-path path to write preview image to (default: ./preview.png). 
Multi-frame previews support + .avi, .webm, and animated .webp + --preview-interval interval in denoising steps between consecutive updates of the image preview file + (default is 1, meaning updating at every step) + --output-begin-idx starting index for output image sequence, must be non-negative (default 0 if specified + %d in output path, 1 otherwise) + --canny apply canny preprocessor (edge detection) + --convert-name convert tensor name (for convert mode) + -v, --verbose print extra info + --color colors the logging tags according to level + --taesd-preview-only prevents usage of taesd for decoding the final image. (for use with --preview tae) + --preview-noisy enables previewing noisy inputs of the models rather than the denoised outputs + --metadata-raw include raw hex previews for unparsed metadata payloads + --metadata-brief truncate long metadata text values in text output + --metadata-all include structural/container entries such as IHDR, IDAT, and non-metadata JPEG segments + -M, --mode run mode, one of [img_gen, vid_gen, upscale, convert, metadata], default: img_gen + --preview preview method. must be one of the following [none, proj, tae, vae] (default is none) + -h, --help show this help message and exit Context Options: -m, --model path to full model @@ -31,7 +34,8 @@ Context Options: --clip_g path to the clip-g text encoder --clip_vision path to the clip-vision encoder --t5xxl path to the t5xxl text encoder - --llm path to the llm text encoder. For example: (qwenvl2.5 for qwen-image, mistral-small3.2 for flux2, ...) + --llm path to the llm text encoder. For example: (qwenvl2.5 for qwen-image, + mistral-small3.2 for flux2, ...) --llm_vision path to the llm vit --qwen2vl alias of --llm. Deprecated. --qwen2vl_vision alias of --llm_vision. Deprecated. 
@@ -43,16 +47,16 @@ Context Options: --control-net path to control net model --embd-dir embeddings directory --lora-model-dir lora model directory + --hires-upscalers-dir highres fix upscaler model directory --tensor-type-rules weight type per tensor pattern (example: "^vae\.=f16,model\.=q8_0") --photo-maker path to PHOTOMAKER model --upscale-model path to esrgan model. - -t, --threads number of threads to use during computation (default: -1). If threads <= 0, then threads will be set to the number of - CPU physical cores + -t, --threads number of threads to use during computation (default: -1). If threads <= 0, + then threads will be set to the number of CPU physical cores --chroma-t5-mask-pad t5 mask pad size of chroma - --vae-tile-overlap tile overlap for vae tiling, in fraction of tile size (default: 0.5) - --vae-tiling process vae in tiles to reduce memory usage --force-sdxl-vae-conv-scale force use of conv scale on sdxl vae - --offload-to-cpu place the weights in RAM to save VRAM, and automatically load them into VRAM when needed + --offload-to-cpu place the weights in RAM to save VRAM, and automatically load them into VRAM + when needed --mmap whether to memory-map model --control-net-cpu keep controlnet in cpu (for low vram) --clip-on-cpu keep clip in cpu (for low vram) @@ -67,20 +71,19 @@ Context Options: --chroma-disable-dit-mask disable dit mask for chroma --qwen-image-zero-cond-t enable zero_cond_t for qwen image --chroma-enable-t5-mask enable t5 mask for chroma - --type weight type (examples: f32, f16, q4_0, q4_1, q5_0, q5_1, q8_0, q2_K, q3_K, q4_K). If not specified, the default is the - type of the weight file + --type weight type (examples: f32, f16, q4_0, q4_1, q5_0, q5_1, q8_0, q2_K, q3_K, + q4_K). If not specified, the default is the type of the weight file --rng RNG, one of [std_default, cuda, cpu], default: cuda(sd-webui), cpu(comfyui) --sampler-rng sampler RNG, one of [std_default, cuda, cpu]. 
If not specified, use --rng - --prediction prediction type override, one of [eps, v, edm_v, sd3_flow, flux_flow, flux2_flow] - --lora-apply-mode the way to apply LoRA, one of [auto, immediately, at_runtime], default is auto. In auto mode, if the model weights - contain any quantized parameters, the at_runtime mode will be used; otherwise, - immediately will be used.The immediately mode may have precision and - compatibility issues with quantized parameters, but it usually offers faster inference - speed and, in some cases, lower memory usage. The at_runtime mode, on the - other hand, is exactly the opposite. - --vae-tile-size tile size for vae tiling, format [X]x[Y] (default: 32x32) - --vae-relative-tile-size relative tile size for vae tiling, format [X]x[Y], in fraction of image size if < 1, in number of tiles per dim if >=1 - (overrides --vae-tile-size) + --prediction prediction type override, one of [eps, v, edm_v, sd3_flow, flux_flow, + flux2_flow] + --lora-apply-mode the way to apply LoRA, one of [auto, immediately, at_runtime], default is + auto. In auto mode, if the model weights contain any quantized parameters, + the at_runtime mode will be used; otherwise, immediately will be used.The + immediately mode may have precision and compatibility issues with quantized + parameters, but it usually offers faster inference speed and, in some cases, + lower memory usage. The at_runtime mode, on the other hand, is exactly the + opposite. Generation Options: -p, --prompt the prompt to render @@ -89,69 +92,99 @@ Generation Options: --end-img path to the end image, required by flf2v --mask path to the mask image --control-image path to control image, control net - --control-video path to control video frames, It must be a directory path. The video frames inside should be stored as images in - lexicographical (character) order. For example, if the control video path is - `frames`, the directory contain images such as 00.png, 01.png, ... etc. 
+ --control-video path to control video frames, It must be a directory path. The video frames + inside should be stored as images in lexicographical (character) order. For + example, if the control video path is `frames`, the directory contain images + such as 00.png, 01.png, ... etc. --pm-id-images-dir path to PHOTOMAKER input id images dir --pm-id-embed-path path to PHOTOMAKER v2 id embed + --hires-upscaler highres fix upscaler, Lanczos, Nearest, Latent, Latent (nearest), Latent + (nearest-exact), Latent (antialiased), Latent (bicubic), Latent (bicubic + antialiased), or a model name under --hires-upscalers-dir (default: Latent) -H, --height image height, in pixel space (default: 512) -W, --width image width, in pixel space (default: 512) --steps number of sample steps (default: 20) --high-noise-steps (high noise) number of sample steps (default: -1 = auto) - --clip-skip ignore last layers of CLIP network; 1 ignores none, 2 ignores one layer (default: -1). <= 0 represents unspecified, - will be 1 for SD1.x, 2 for SD2.x + --clip-skip ignore last layers of CLIP network; 1 ignores none, 2 ignores one layer + (default: -1). <= 0 represents unspecified, will be 1 for SD1.x, 2 for SD2.x -b, --batch-count batch count --video-frames video frames (default: 1) --fps fps (default: 24) - --timestep-shift shift timestep for NitroFusion models (default: 0). recommended N for NitroSD-Realism around 250 and 500 for - NitroSD-Vibrant + --timestep-shift shift timestep for NitroFusion models (default: 0). 
recommended N for + NitroSD-Realism around 250 and 500 for NitroSD-Vibrant --upscale-repeats Run the ESRGAN upscaler this many times (default: 1) --upscale-tile-size tile size for ESRGAN upscaling (default: 128) + --hires-width highres fix target width, 0 to use --hires-scale (default: 0) + --hires-height highres fix target height, 0 to use --hires-scale (default: 0) + --hires-steps highres fix second pass sample steps, 0 to reuse --steps (default: 0) + --hires-upscale-tile-size highres fix upscaler tile size, reserved for model-backed upscalers (default: + 128) --cfg-scale unconditional guidance scale: (default: 7.0) - --img-cfg-scale image guidance scale for inpaint or instruct-pix2pix models: (default: same as --cfg-scale) + --img-cfg-scale image guidance scale for inpaint or instruct-pix2pix models: (default: same + as --cfg-scale) --guidance distilled guidance scale for models with guidance input (default: 3.5) - --slg-scale skip layer guidance (SLG) scale, only for DiT models: (default: 0). 0 means disabled, a value of 2.5 is nice for sd3.5 - medium + --slg-scale skip layer guidance (SLG) scale, only for DiT models: (default: 0). 
0 means + disabled, a value of 2.5 is nice for sd3.5 medium --skip-layer-start SLG enabling point (default: 0.01) --skip-layer-end SLG disabling point (default: 0.2) - --eta noise multiplier (default: 0 for ddim_trailing, tcd, res_multistep and res_2s; 1 for euler_a and dpm++2s_a) + --eta noise multiplier (default: 0 for ddim_trailing, tcd, res_multistep and + res_2s; 1 for euler_a, er_sde and dpm++2s_a) --flow-shift shift value for Flow models like SD3.x or WAN (default: auto) --high-noise-cfg-scale (high noise) unconditional guidance scale: (default: 7.0) - --high-noise-img-cfg-scale (high noise) image guidance scale for inpaint or instruct-pix2pix models (default: same as --cfg-scale) - --high-noise-guidance (high noise) distilled guidance scale for models with guidance input (default: 3.5) - --high-noise-slg-scale (high noise) skip layer guidance (SLG) scale, only for DiT models: (default: 0) + --high-noise-img-cfg-scale (high noise) image guidance scale for inpaint or instruct-pix2pix models + (default: same as --cfg-scale) + --high-noise-guidance (high noise) distilled guidance scale for models with guidance input + (default: 3.5) + --high-noise-slg-scale (high noise) skip layer guidance (SLG) scale, only for DiT models: (default: + 0) --high-noise-skip-layer-start (high noise) SLG enabling point (default: 0.01) --high-noise-skip-layer-end (high noise) SLG disabling point (default: 0.2) - --high-noise-eta (high noise) noise multiplier (default: 0 for ddim_trailing, tcd, res_multistep and res_2s; 1 for euler_a and dpm++2s_a) + --high-noise-eta (high noise) noise multiplier (default: 0 for ddim_trailing, tcd, + res_multistep and res_2s; 1 for euler_a, er_sde and dpm++2s_a) --strength strength for noising/unnoising (default: 0.75) - --pm-style-strength - --control-strength strength to apply Control Net (default: 0.9). 1.0 corresponds to full destruction of information in init image - --moe-boundary timestep boundary for Wan2.2 MoE model. (default: 0.875). 
Only enabled if `--high-noise-steps` is set to -1 + --pm-style-strength + --control-strength strength to apply Control Net (default: 0.9). 1.0 corresponds to full + destruction of information in init image + --moe-boundary timestep boundary for Wan2.2 MoE model. (default: 0.875). Only enabled if + `--high-noise-steps` is set to -1 --vace-strength wan vace strength - --increase-ref-index automatically increase the indices of references images based on the order they are listed (starting with 1). + --vae-tile-overlap tile overlap for vae tiling, in fraction of tile size (default: 0.5) + --hires-scale highres fix scale when target size is not set (default: 2.0) + --hires-denoising-strength highres fix second pass denoising strength (default: 0.7) + --increase-ref-index automatically increase the indices of references images based on the order + they are listed (starting with 1). --disable-auto-resize-ref-image disable auto resize of ref images --disable-image-metadata do not embed generation metadata on image files + --vae-tiling process vae in tiles to reduce memory usage + --hires enable highres fix -s, --seed RNG seed (default: 42, use random seed for < 0) - --sampling-method sampling method, one of [euler, euler_a, heun, dpm2, dpm++2s_a, dpm++2m, dpm++2mv2, ipndm, ipndm_v, lcm, ddim_trailing, - tcd, res_multistep, res_2s] (default: euler for Flux/SD3/Wan, euler_a - otherwise) - --high-noise-sampling-method (high noise) sampling method, one of [euler, euler_a, heun, dpm2, dpm++2s_a, dpm++2m, dpm++2mv2, ipndm, ipndm_v, lcm, - ddim_trailing, tcd, res_multistep, res_2s] default: euler for Flux/SD3/Wan, - euler_a otherwise - --scheduler denoiser sigma scheduler, one of [discrete, karras, exponential, ays, gits, smoothstep, sgm_uniform, simple, - kl_optimal, lcm, bong_tangent], default: discrete - --sigmas custom sigma values for the sampler, comma-separated (e.g., "14.61,7.8,3.5,0.0"). 
+ --sampling-method sampling method, one of [euler, euler_a, heun, dpm2, dpm++2s_a, dpm++2m, + dpm++2mv2, ipndm, ipndm_v, lcm, ddim_trailing, tcd, res_multistep, res_2s, + er_sde] (default: euler for Flux/SD3/Wan, euler_a otherwise) + --high-noise-sampling-method (high noise) sampling method, one of [euler, euler_a, heun, dpm2, dpm++2s_a, + dpm++2m, dpm++2mv2, ipndm, ipndm_v, lcm, ddim_trailing, tcd, res_multistep, + res_2s, er_sde] default: euler for Flux/SD3/Wan, euler_a otherwise + --scheduler denoiser sigma scheduler, one of [discrete, karras, exponential, ays, gits, + smoothstep, sgm_uniform, simple, kl_optimal, lcm, bong_tangent], default: + discrete + --sigmas custom sigma values for the sampler, comma-separated (e.g., + "14.61,7.8,3.5,0.0"). --skip-layers layers to skip for SLG steps (default: [7,8,9]) --high-noise-skip-layers (high noise) layers to skip for SLG steps (default: [7,8,9]) -r, --ref-image reference image for Flux Kontext models (can be used multiple times) - --cache-mode caching method: 'easycache' (DiT), 'ucache' (UNET), 'dbcache'/'taylorseer'/'cache-dit' (DiT block-level), - 'spectrum' (UNET/DiT Chebyshev+Taylor forecasting) + --cache-mode caching method: 'easycache' (DiT), 'ucache' (UNET), + 'dbcache'/'taylorseer'/'cache-dit' (DiT block-level), 'spectrum' (UNET/DiT + Chebyshev+Taylor forecasting) --cache-option named cache params (key=value format, comma-separated). easycache/ucache: - threshold=,start=,end=,decay=,relative=,reset=; dbcache/taylorseer/cache-dit: Fn=,Bn=,threshold=,warmup=; - spectrum: w=,m=,lam=,window=,flex=,warmup=,stop=. Examples: - "threshold=0.25" or "threshold=1.5,reset=0" or "w=0.4,window=2" - --scm-mask SCM steps mask for cache-dit: comma-separated 0/1 (e.g., "1,1,1,0,0,1,0,0,1,0") - 1=compute, 0=can cache + threshold=,start=,end=,decay=,relative=,reset=; dbcache/taylorseer/cache-dit: + Fn=,Bn=,threshold=,warmup=; spectrum: w=,m=,lam=,window=,flex=,warmup=,stop=. 
+ Examples: "threshold=0.25" or "threshold=1.5,reset=0" + --scm-mask SCM steps mask for cache-dit: comma-separated 0/1 (e.g., + "1,1,1,0,0,1,0,0,1,0") - 1=compute, 0=can cache --scm-policy SCM policy: 'dynamic' (default) or 'static' + --vae-tile-size tile size for vae tiling, format [X]x[Y] (default: 32x32) + --vae-relative-tile-size relative tile size for vae tiling, format [X]x[Y], in fraction of image size + if < 1, in number of tiles per dim if >=1 (overrides --vae-tile-size) ``` Metadata mode inspects PNG/JPEG container metadata without loading any model: diff --git a/examples/cli/main.cpp b/examples/cli/main.cpp index 0a8063f4..15b04d8f 100644 --- a/examples/cli/main.cpp +++ b/examples/cli/main.cpp @@ -279,7 +279,9 @@ void parse_args(int argc, const char** argv, SDCliParams& cli_params, SDContextP bool valid = cli_params.resolve_and_validate(); if (valid && cli_params.mode != METADATA) { valid = ctx_params.resolve_and_validate(cli_params.mode) && - gen_params.resolve_and_validate(cli_params.mode, ctx_params.lora_model_dir); + gen_params.resolve_and_validate(cli_params.mode, + ctx_params.lora_model_dir, + ctx_params.hires_upscalers_dir); } if (!valid) { @@ -501,7 +503,7 @@ int main(int argc, const char* argv[]) { cli_params.verbose = true; sd_set_log_callback(sd_log_cb, (void*)&cli_params); - LLM::GemmaTokenizer tokenizer; + GemmaTokenizer tokenizer; auto tokens = tokenizer.tokenize(" δΈ€εͺε―ηˆ±ηš„ε°ηŒ«"); for (auto token : tokens) { LOG_INFO("%d", token); @@ -698,6 +700,13 @@ int main(int argc, const char* argv[]) { vae_decode_only = false; } + if (gen_params.hires_enabled && + (gen_params.resolved_hires_upscaler == SD_HIRES_UPSCALER_MODEL || + gen_params.resolved_hires_upscaler == SD_HIRES_UPSCALER_LANCZOS || + gen_params.resolved_hires_upscaler == SD_HIRES_UPSCALER_NEAREST)) { + vae_decode_only = false; + } + sd_ctx_params_t sd_ctx_params = ctx_params.to_sd_ctx_params_t(vae_decode_only, true, cli_params.taesd_preview); SDImageVec results; diff --git 
a/examples/common/common.cpp b/examples/common/common.cpp index 5826b835..2d29df26 100644 --- a/examples/common/common.cpp +++ b/examples/common/common.cpp @@ -107,47 +107,60 @@ static bool is_absolute_path(const std::string& p) { std::string ArgOptions::wrap_text(const std::string& text, size_t width, size_t indent) { std::ostringstream oss; - size_t line_len = 0; size_t pos = 0; + size_t line_len = 0; while (pos < text.size()) { - // Preserve manual newlines if (text[pos] == '\n') { oss << '\n' << std::string(indent, ' '); - line_len = indent; + line_len = 0; ++pos; continue; } - // Add the character - oss << text[pos]; - ++line_len; - ++pos; + if (std::isspace(static_cast<unsigned char>(text[pos]))) { + ++pos; + continue; + } - // If the current line exceeds width, try to break at the last space - if (line_len >= width) { - std::string current = oss.str(); - size_t back = current.size(); + size_t word_start = pos; + while (pos < text.size() && + text[pos] != '\n' && + !std::isspace(static_cast<unsigned char>(text[pos]))) { + ++pos; + } - // Find the last space (for a clean break) - while (back > 0 && current[back - 1] != ' ' && current[back - 1] != '\n') - --back; - - // If found a space to break on - if (back > 0 && current[back - 1] != '\n') { - std::string before = current.substr(0, back - 1); - std::string after = current.substr(back); - oss.str(""); - oss.clear(); - oss << before << "\n" - << std::string(indent, ' ') << after; - } else { - // If no space found, just break at width - oss << "\n" - << std::string(indent, ' '); + std::string word = text.substr(word_start, pos - word_start); + while (!word.empty()) { + size_t separator_len = line_len == 0 ? 
0 : 1; + if (line_len + separator_len + word.size() <= width) { + if (separator_len > 0) { + oss << ' '; + ++line_len; + } + oss << word; + line_len += word.size(); + word.clear(); + continue; + } + + if (line_len > 0) { + oss << '\n' + << std::string(indent, ' '); + line_len = 0; + continue; + } + + size_t chunk_len = std::min(width, word.size()); + oss << word.substr(0, chunk_len); + line_len = chunk_len; + word.erase(0, chunk_len); + if (!word.empty()) { + oss << '\n' + << std::string(indent, ' '); + line_len = 0; } - line_len = indent; } } @@ -351,7 +364,10 @@ ArgOptions SDContextParams::get_options() { "--lora-model-dir", "lora model directory", &lora_model_dir}, - + {"", + "--hires-upscalers-dir", + "highres fix upscaler model directory", + &hires_upscalers_dir}, {"", "--tensor-type-rules", "weight type per tensor pattern (example: \"^vae\\.=f16,model\\.=q8_0\")", @@ -649,6 +665,7 @@ std::string SDContextParams::to_string() const { << " wtype: " << sd_type_name(wtype) << ",\n" << " tensor_type_rules: \"" << tensor_type_rules << "\",\n" << " lora_model_dir: \"" << lora_model_dir << "\",\n" + << " hires_upscalers_dir: \"" << hires_upscalers_dir << "\",\n" << " photo_maker_path: \"" << photo_maker_path << "\",\n" << " rng_type: " << sd_rng_type_name(rng_type) << ",\n" << " sampler_rng_type: " << sd_rng_type_name(sampler_rng_type) << ",\n" @@ -777,6 +794,12 @@ ArgOptions SDGenerationParams::get_options() { "--pm-id-embed-path", "path to PHOTOMAKER v2 id embed", &pm_id_embed_path}, + {"", + "--hires-upscaler", + "highres fix upscaler, Lanczos, Nearest, Latent, Latent (nearest), Latent (nearest-exact), " + "Latent (antialiased), Latent (bicubic), Latent (bicubic antialiased), or a model name " + "under --hires-upscalers-dir (default: Latent)", + &hires_upscaler}, }; options.int_options = { @@ -826,6 +849,22 @@ ArgOptions SDGenerationParams::get_options() { "--upscale-tile-size", "tile size for ESRGAN upscaling (default: 128)", &upscale_tile_size}, + {"", + 
"--hires-width", + "highres fix target width, 0 to use --hires-scale (default: 0)", + &hires_width}, + {"", + "--hires-height", + "highres fix target height, 0 to use --hires-scale (default: 0)", + &hires_height}, + {"", + "--hires-steps", + "highres fix second pass sample steps, 0 to reuse --steps (default: 0)", + &hires_steps}, + {"", + "--hires-upscale-tile-size", + "highres fix upscaler tile size, reserved for model-backed upscalers (default: 128)", + &hires_upscale_tile_size}, }; options.float_options = { @@ -855,7 +894,7 @@ ArgOptions SDGenerationParams::get_options() { &sample_params.guidance.slg.layer_end}, {"", "--eta", - "noise multiplier (default: 0 for ddim_trailing, tcd, res_multistep and res_2s; 1 for euler_a and dpm++2s_a)", + "noise multiplier (default: 0 for ddim_trailing, tcd, res_multistep and res_2s; 1 for euler_a, er_sde and dpm++2s_a)", &sample_params.eta}, {"", "--flow-shift", @@ -887,7 +926,7 @@ ArgOptions SDGenerationParams::get_options() { &high_noise_sample_params.guidance.slg.layer_end}, {"", "--high-noise-eta", - "(high noise) noise multiplier (default: 0 for ddim_trailing, tcd, res_multistep and res_2s; 1 for euler_a and dpm++2s_a)", + "(high noise) noise multiplier (default: 0 for ddim_trailing, tcd, res_multistep and res_2s; 1 for euler_a, er_sde and dpm++2s_a)", &high_noise_sample_params.eta}, {"", "--strength", @@ -913,6 +952,14 @@ ArgOptions SDGenerationParams::get_options() { "--vae-tile-overlap", "tile overlap for vae tiling, in fraction of tile size (default: 0.5)", &vae_tiling_params.target_overlap}, + {"", + "--hires-scale", + "highres fix scale when target size is not set (default: 2.0)", + &hires_scale}, + {"", + "--hires-denoising-strength", + "highres fix second pass denoising strength (default: 0.7)", + &hires_denoising_strength}, }; options.bool_options = { @@ -936,6 +983,11 @@ ArgOptions SDGenerationParams::get_options() { "process vae in tiles to reduce memory usage", true, &vae_tiling_params.enabled}, + {"", + 
"--hires", + "enable highres fix", + true, + &hires_enabled}, }; auto on_seed_arg = [&](int argc, const char** argv, int index) { @@ -1185,12 +1237,12 @@ ArgOptions SDGenerationParams::get_options() { on_seed_arg}, {"", "--sampling-method", - "sampling method, one of [euler, euler_a, heun, dpm2, dpm++2s_a, dpm++2m, dpm++2mv2, ipndm, ipndm_v, lcm, ddim_trailing, tcd, res_multistep, res_2s] " + "sampling method, one of [euler, euler_a, heun, dpm2, dpm++2s_a, dpm++2m, dpm++2mv2, ipndm, ipndm_v, lcm, ddim_trailing, tcd, res_multistep, res_2s, er_sde] " "(default: euler for Flux/SD3/Wan, euler_a otherwise)", on_sample_method_arg}, {"", "--high-noise-sampling-method", - "(high noise) sampling method, one of [euler, euler_a, heun, dpm2, dpm++2s_a, dpm++2m, dpm++2mv2, ipndm, ipndm_v, lcm, ddim_trailing, tcd, res_multistep, res_2s]" + "(high noise) sampling method, one of [euler, euler_a, heun, dpm2, dpm++2s_a, dpm++2m, dpm++2mv2, ipndm, ipndm_v, lcm, ddim_trailing, tcd, res_multistep, res_2s, er_sde]" " default: euler for Flux/SD3/Wan, euler_a otherwise", on_high_noise_sample_method_arg}, {"", @@ -1424,6 +1476,37 @@ static bool parse_lora_json_field(const json& parent, return true; } +static bool resolve_model_file_from_dir(const std::string& model_name, + const std::string& model_dir, + const std::vector& valid_ext, + const char* label, + std::string& resolved_path) { + if (model_dir.empty()) { + LOG_ERROR("%s directory is empty", label); + return false; + } + if (model_name.empty() || + model_name.find('/') != std::string::npos || + model_name.find('\\') != std::string::npos || + fs::path(model_name).has_root_path() || + fs::path(model_name).has_extension()) { + LOG_ERROR("%s must be a model name without path or extension: %s", label, model_name.c_str()); + return false; + } + + fs::path model_dir_path = model_dir; + for (const auto& ext : valid_ext) { + fs::path try_path = model_dir_path / (model_name + ext); + if (fs::exists(try_path) && fs::is_regular_file(try_path)) 
{ + resolved_path = try_path.lexically_normal().string(); + return true; + } + } + + LOG_ERROR("can not find %s %s in %s", label, model_name.c_str(), model_dir_path.lexically_normal().string().c_str()); + return false; +} + bool SDGenerationParams::from_json_str( const std::string& json_str, const std::function& lora_path_resolver) { @@ -1487,6 +1570,34 @@ bool SDGenerationParams::from_json_str( load_if_exists("increase_ref_index", increase_ref_index); load_if_exists("embed_image_metadata", embed_image_metadata); + if (j.contains("hires") && j["hires"].is_object()) { + const json& hires_json = j["hires"]; + if (hires_json.contains("enabled") && hires_json["enabled"].is_boolean()) { + hires_enabled = hires_json["enabled"]; + } + if (hires_json.contains("upscaler") && hires_json["upscaler"].is_string()) { + hires_upscaler = hires_json["upscaler"]; + } + if (hires_json.contains("scale") && hires_json["scale"].is_number()) { + hires_scale = hires_json["scale"]; + } + if (hires_json.contains("target_width") && hires_json["target_width"].is_number_integer()) { + hires_width = hires_json["target_width"]; + } + if (hires_json.contains("target_height") && hires_json["target_height"].is_number_integer()) { + hires_height = hires_json["target_height"]; + } + if (hires_json.contains("steps") && hires_json["steps"].is_number_integer()) { + hires_steps = hires_json["steps"]; + } + if (hires_json.contains("denoising_strength") && hires_json["denoising_strength"].is_number()) { + hires_denoising_strength = hires_json["denoising_strength"]; + } + if (hires_json.contains("upscale_tile_size") && hires_json["upscale_tile_size"].is_number_integer()) { + hires_upscale_tile_size = hires_json["upscale_tile_size"]; + } + } + auto parse_sample_params_json = [&](const json& sample_json, sd_sample_params_t& target_params, std::vector& target_skip_layers, @@ -1589,10 +1700,18 @@ bool SDGenerationParams::from_json_str( LOG_ERROR("invalid init_image"); return false; } + if 
(!parse_image_json_field(j, "end_image", 3, width, height, end_image)) { + LOG_ERROR("invalid end_image"); + return false; + } if (!parse_image_array_json_field(j, "ref_images", 3, width, height, ref_images)) { LOG_ERROR("invalid ref_images"); return false; } + if (!parse_image_array_json_field(j, "control_frames", 3, width, height, control_frames)) { + LOG_ERROR("invalid control_frames"); + return false; + } if (!parse_image_json_field(j, "mask_image", 1, width, height, mask_image)) { LOG_ERROR("invalid mask_image"); return false; @@ -1792,7 +1911,7 @@ bool SDGenerationParams::initialize_cache_params() { return true; } -bool SDGenerationParams::resolve(const std::string& lora_model_dir, bool strict) { +bool SDGenerationParams::resolve(const std::string& lora_model_dir, const std::string& hires_upscalers_dir, bool strict) { if (high_noise_sample_params.sample_steps <= 0) { high_noise_sample_params.sample_steps = -1; } @@ -1811,6 +1930,27 @@ bool SDGenerationParams::resolve(const std::string& lora_model_dir, bool strict) sample_params.sample_steps = std::clamp(sample_params.sample_steps, 1, 100); } + hires_upscaler_model_path.clear(); + if (hires_enabled) { + if (hires_upscaler.empty()) { + hires_upscaler = "Latent"; + } + resolved_hires_upscaler = str_to_sd_hires_upscaler(hires_upscaler.c_str()); + if (resolved_hires_upscaler == SD_HIRES_UPSCALER_NONE) { + hires_enabled = false; + } else if (resolved_hires_upscaler == SD_HIRES_UPSCALER_COUNT) { + static const std::vector valid_ext = {".gguf", ".safetensors", ".pt", ".pth"}; + if (!resolve_model_file_from_dir(hires_upscaler, + hires_upscalers_dir, + valid_ext, + "hires upscaler", + hires_upscaler_model_path)) { + return false; + } + resolved_hires_upscaler = SD_HIRES_UPSCALER_MODEL; + } + } + prompt_with_lora = prompt; if (!lora_model_dir.empty()) { extract_and_remove_lora(lora_model_dir); @@ -1875,6 +2015,29 @@ bool SDGenerationParams::validate(SDMode mode) { return false; } + if (hires_enabled) { + if (hires_width 
< 0 || hires_height < 0) { + LOG_ERROR("error: hires target width and height must be >= 0"); + return false; + } + if (hires_scale <= 0.f && hires_width <= 0 && hires_height <= 0) { + LOG_ERROR("error: hires scale must be positive when target size is not set"); + return false; + } + if (hires_steps < 0) { + LOG_ERROR("error: hires steps must be >= 0"); + return false; + } + if (hires_denoising_strength <= 0.f || hires_denoising_strength > 1.f) { + LOG_ERROR("error: hires denoising strength must be in (0.0, 1.0]"); + return false; + } + if (hires_upscale_tile_size < 1) { + LOG_ERROR("error: hires upscale tile size must be positive"); + return false; + } + } + if (mode == UPSCALE) { if (init_image_path.length() == 0) { LOG_ERROR("error: upscale mode needs an init image (--init-img)\n"); @@ -1885,8 +2048,11 @@ bool SDGenerationParams::validate(SDMode mode) { return true; } -bool SDGenerationParams::resolve_and_validate(SDMode mode, const std::string& lora_model_dir, bool strict) { - if (!resolve(lora_model_dir, strict)) { +bool SDGenerationParams::resolve_and_validate(SDMode mode, + const std::string& lora_model_dir, + const std::string& hires_upscalers_dir, + bool strict) { + if (!resolve(lora_model_dir, hires_upscalers_dir, strict)) { return false; } if (!validate(mode)) { @@ -1957,6 +2123,16 @@ sd_img_gen_params_t SDGenerationParams::to_sd_img_gen_params_t() { params.pm_params = pm_params; params.vae_tiling_params = vae_tiling_params; params.cache = cache_params; + + params.hires.enabled = hires_enabled; + params.hires.upscaler = resolved_hires_upscaler; + params.hires.model_path = hires_upscaler_model_path.empty() ? 
nullptr : hires_upscaler_model_path.c_str(); + params.hires.scale = hires_scale; + params.hires.target_width = hires_width; + params.hires.target_height = hires_height; + params.hires.steps = hires_steps; + params.hires.denoising_strength = hires_denoising_strength; + params.hires.upscale_tile_size = hires_upscale_tile_size; return params; } @@ -2081,6 +2257,15 @@ std::string SDGenerationParams::to_string() const { << " seed: " << seed << ",\n" << " upscale_repeats: " << upscale_repeats << ",\n" << " upscale_tile_size: " << upscale_tile_size << ",\n" + << " hires: { enabled: " << (hires_enabled ? "true" : "false") + << ", upscaler: \"" << hires_upscaler << "\"" + << ", model_path: \"" << hires_upscaler_model_path << "\"" + << ", scale: " << hires_scale + << ", target_width: " << hires_width + << ", target_height: " << hires_height + << ", steps: " << hires_steps + << ", denoising_strength: " << hires_denoising_strength + << ", upscale_tile_size: " << hires_upscale_tile_size << " },\n" << " vae_tiling_params: { " << vae_tiling_params.enabled << ", " << vae_tiling_params.tile_size_x << ", " @@ -2154,6 +2339,13 @@ std::string get_image_params(const SDContextParams& ctx_params, const SDGenerati if (gen_params.clip_skip != -1) { parameter_string += "Clip skip: " + std::to_string(gen_params.clip_skip) + ", "; } + if (gen_params.hires_enabled) { + parameter_string += "Hires upscale: " + gen_params.hires_upscaler + ", "; + parameter_string += "Hires scale: " + std::to_string(gen_params.hires_scale) + ", "; + parameter_string += "Hires resize: " + std::to_string(gen_params.hires_width) + "x" + std::to_string(gen_params.hires_height) + ", "; + parameter_string += "Hires steps: " + std::to_string(gen_params.hires_steps) + ", "; + parameter_string += "Denoising strength: " + std::to_string(gen_params.hires_denoising_strength) + ", "; + } parameter_string += "Version: stable-diffusion.cpp"; return parameter_string; } diff --git a/examples/common/common.h 
b/examples/common/common.h index 5afe89b3..333d3311 100644 --- a/examples/common/common.h +++ b/examples/common/common.h @@ -101,6 +101,7 @@ struct SDContextParams { sd_type_t wtype = SD_TYPE_COUNT; std::string tensor_type_rules; std::string lora_model_dir = "."; + std::string hires_upscalers_dir; std::map embedding_map; std::vector embedding_vec; @@ -190,12 +191,23 @@ struct SDGenerationParams { int upscale_repeats = 1; int upscale_tile_size = 128; + bool hires_enabled = false; + std::string hires_upscaler = "Latent"; + std::string hires_upscaler_model_path; + float hires_scale = 2.f; + int hires_width = 0; + int hires_height = 0; + int hires_steps = 0; + float hires_denoising_strength = 0.7f; + int hires_upscale_tile_size = 128; + std::map lora_map; std::map high_noise_lora_map; // Derived and normalized fields. std::string prompt_with_lora; // for metadata record only std::vector lora_vec; + sd_hires_upscaler_t resolved_hires_upscaler; // Owned execution payload. SDImageOwner init_image; @@ -225,9 +237,12 @@ struct SDGenerationParams { void set_width_and_height_if_unset(int w, int h); int get_resolved_width() const; int get_resolved_height() const; - bool resolve(const std::string& lora_model_dir, bool strict = false); + bool resolve(const std::string& lora_model_dir, const std::string& hires_upscalers_dir, bool strict = false); bool validate(SDMode mode); - bool resolve_and_validate(SDMode mode, const std::string& lora_model_dir, bool strict = false); + bool resolve_and_validate(SDMode mode, + const std::string& lora_model_dir, + const std::string& hires_upscalers_dir, + bool strict = false); sd_img_gen_params_t to_sd_img_gen_params_t(); sd_vid_gen_params_t to_sd_vid_gen_params_t(); std::string to_string() const; diff --git a/examples/common/media_io.cpp b/examples/common/media_io.cpp index 0b8b3a27..e2e1ca5a 100644 --- a/examples/common/media_io.cpp +++ b/examples/common/media_io.cpp @@ -95,6 +95,57 @@ using WebPMuxPtr = std::unique_ptr; using 
WebPAnimEncoderPtr = std::unique_ptr; #endif +#ifdef SD_USE_WEBM +class MemoryMkvWriter : public mkvmuxer::IMkvWriter { +public: + mkvmuxer::int32 Write(const void* buf, mkvmuxer::uint32 len) override { + if (buf == nullptr && len > 0) { + return -1; + } + const size_t end_pos = position_ + static_cast(len); + if (end_pos > data_.size()) { + data_.resize(end_pos); + } + if (len > 0) { + memcpy(data_.data() + position_, buf, len); + } + position_ = end_pos; + return 0; + } + + mkvmuxer::int64 Position() const override { + return static_cast(position_); + } + + mkvmuxer::int32 Position(mkvmuxer::int64 position) override { + if (position < 0) { + return -1; + } + const size_t target = static_cast(position); + if (target > data_.size()) { + data_.resize(target); + } + position_ = target; + return 0; + } + + bool Seekable() const override { + return true; + } + + void ElementStartNotify(mkvmuxer::uint64, mkvmuxer::int64) override { + } + + const std::vector& data() const { + return data_; + } + +private: + std::vector data_; + size_t position_ = 0; +}; +#endif + bool read_binary_file_bytes(const char* path, std::vector& data) { std::ifstream fin(fs::path(path), std::ios::binary); if (!fin) { @@ -570,6 +621,32 @@ void write_u16_le(FILE* f, uint16_t val) { fwrite(&val, 2, 1, f); } +void write_u32_le(std::vector& data, uint32_t val) { + data.push_back(static_cast(val & 0xFF)); + data.push_back(static_cast((val >> 8) & 0xFF)); + data.push_back(static_cast((val >> 16) & 0xFF)); + data.push_back(static_cast((val >> 24) & 0xFF)); +} + +void write_u16_le(std::vector& data, uint16_t val) { + data.push_back(static_cast(val & 0xFF)); + data.push_back(static_cast((val >> 8) & 0xFF)); +} + +void patch_u32_le(std::vector& data, size_t offset, uint32_t val) { + if (offset + 4 > data.size()) { + return; + } + data[offset + 0] = static_cast(val & 0xFF); + data[offset + 1] = static_cast((val >> 8) & 0xFF); + data[offset + 2] = static_cast((val >> 16) & 0xFF); + data[offset + 3] = 
static_cast((val >> 24) & 0xFF); +} + +void write_fourcc(std::vector& data, const char* fourcc) { + data.insert(data.end(), fourcc, fourcc + 4); +} + EncodedImageFormat encoded_image_format_from_path(const std::string& path) { std::string ext = fs::path(path).extension().string(); std::transform(ext.begin(), ext.end(), ext.begin(), ::tolower); @@ -699,95 +776,96 @@ uint8_t* load_image_from_memory(const char* image_bytes, return load_image_common(true, image_bytes, len, width, height, expected_width, expected_height, expected_channel); } -int create_mjpg_avi_from_sd_images(const char* filename, sd_image_t* images, int num_images, int fps, int quality) { +std::vector create_mjpg_avi_from_sd_images_to_vector(sd_image_t* images, int num_images, int fps, int quality) { if (num_images == 0) { fprintf(stderr, "Error: Image array is empty.\n"); - return -1; + return {}; } - FilePtr file(fopen(filename, "wb")); - if (!file) { - perror("Error opening file for writing"); - return -1; - } - FILE* f = file.get(); - uint32_t width = images[0].width; uint32_t height = images[0].height; uint32_t channels = images[0].channel; if (channels != 3 && channels != 4) { fprintf(stderr, "Error: Unsupported channel count: %u\n", channels); - return -1; + return {}; } - fwrite("RIFF", 4, 1, f); - long riff_size_pos = ftell(f); - write_u32_le(f, 0); - fwrite("AVI ", 4, 1, f); + // stb_image_write changes JPEG sampling behavior above quality 90. + // MJPG AVI playback is more compatible when we keep the encoder on the + // <= 90 path. 
+ const int mjpg_quality = std::clamp(quality, 1, 90); - fwrite("LIST", 4, 1, f); - write_u32_le(f, 4 + 8 + 56 + 8 + 4 + 8 + 56 + 8 + 40); - fwrite("hdrl", 4, 1, f); + std::vector avi_data; + avi_data.reserve(static_cast(num_images) * 1024); - fwrite("avih", 4, 1, f); - write_u32_le(f, 56); - write_u32_le(f, 1000000 / fps); - write_u32_le(f, 0); - write_u32_le(f, 0); - write_u32_le(f, 0x110); - write_u32_le(f, num_images); - write_u32_le(f, 0); - write_u32_le(f, 1); - write_u32_le(f, width * height * 3); - write_u32_le(f, width); - write_u32_le(f, height); - write_u32_le(f, 0); - write_u32_le(f, 0); - write_u32_le(f, 0); - write_u32_le(f, 0); + write_fourcc(avi_data, "RIFF"); + const size_t riff_size_pos = avi_data.size(); + write_u32_le(avi_data, 0); + write_fourcc(avi_data, "AVI "); - fwrite("LIST", 4, 1, f); - write_u32_le(f, 4 + 8 + 56 + 8 + 40); - fwrite("strl", 4, 1, f); + write_fourcc(avi_data, "LIST"); + write_u32_le(avi_data, 4 + 8 + 56 + 8 + 4 + 8 + 56 + 8 + 40); + write_fourcc(avi_data, "hdrl"); - fwrite("strh", 4, 1, f); - write_u32_le(f, 56); - fwrite("vids", 4, 1, f); - fwrite("MJPG", 4, 1, f); - write_u32_le(f, 0); - write_u16_le(f, 0); - write_u16_le(f, 0); - write_u32_le(f, 0); - write_u32_le(f, 1); - write_u32_le(f, fps); - write_u32_le(f, 0); - write_u32_le(f, num_images); - write_u32_le(f, width * height * 3); - write_u32_le(f, (uint32_t)-1); - write_u32_le(f, 0); - write_u16_le(f, 0); - write_u16_le(f, 0); - write_u16_le(f, 0); - write_u16_le(f, 0); + write_fourcc(avi_data, "avih"); + write_u32_le(avi_data, 56); + write_u32_le(avi_data, 1000000 / fps); + write_u32_le(avi_data, 0); + write_u32_le(avi_data, 0); + write_u32_le(avi_data, 0x110); + write_u32_le(avi_data, num_images); + write_u32_le(avi_data, 0); + write_u32_le(avi_data, 1); + write_u32_le(avi_data, width * height * 3); + write_u32_le(avi_data, width); + write_u32_le(avi_data, height); + write_u32_le(avi_data, 0); + write_u32_le(avi_data, 0); + write_u32_le(avi_data, 0); + 
write_u32_le(avi_data, 0); - fwrite("strf", 4, 1, f); - write_u32_le(f, 40); - write_u32_le(f, 40); - write_u32_le(f, width); - write_u32_le(f, height); - write_u16_le(f, 1); - write_u16_le(f, 24); - fwrite("MJPG", 4, 1, f); - write_u32_le(f, width * height * 3); - write_u32_le(f, 0); - write_u32_le(f, 0); - write_u32_le(f, 0); - write_u32_le(f, 0); + write_fourcc(avi_data, "LIST"); + write_u32_le(avi_data, 4 + 8 + 56 + 8 + 40); + write_fourcc(avi_data, "strl"); - fwrite("LIST", 4, 1, f); - long movi_size_pos = ftell(f); - write_u32_le(f, 0); - fwrite("movi", 4, 1, f); + write_fourcc(avi_data, "strh"); + write_u32_le(avi_data, 56); + write_fourcc(avi_data, "vids"); + write_fourcc(avi_data, "MJPG"); + write_u32_le(avi_data, 0); + write_u16_le(avi_data, 0); + write_u16_le(avi_data, 0); + write_u32_le(avi_data, 0); + write_u32_le(avi_data, 1); + write_u32_le(avi_data, fps); + write_u32_le(avi_data, 0); + write_u32_le(avi_data, num_images); + write_u32_le(avi_data, width * height * 3); + write_u32_le(avi_data, static_cast(-1)); + write_u32_le(avi_data, 0); + write_u16_le(avi_data, 0); + write_u16_le(avi_data, 0); + write_u16_le(avi_data, 0); + write_u16_le(avi_data, 0); + + write_fourcc(avi_data, "strf"); + write_u32_le(avi_data, 40); + write_u32_le(avi_data, 40); + write_u32_le(avi_data, width); + write_u32_le(avi_data, height); + write_u16_le(avi_data, 1); + write_u16_le(avi_data, 24); + write_fourcc(avi_data, "MJPG"); + write_u32_le(avi_data, width * height * 3); + write_u32_le(avi_data, 0); + write_u32_le(avi_data, 0); + write_u32_le(avi_data, 0); + write_u32_le(avi_data, 0); + + write_fourcc(avi_data, "LIST"); + const size_t movi_size_pos = avi_data.size(); + write_u32_le(avi_data, 0); + write_fourcc(avi_data, "movi"); std::vector index(static_cast(num_images)); std::vector jpeg_data; @@ -801,55 +879,61 @@ int create_mjpg_avi_from_sd_images(const char* filename, sd_image_t* images, int buffer->insert(buffer->end(), src, src + size); }; - if 
(!stbi_write_jpg_to_func(write_to_buf, &jpeg_data, images[i].width, images[i].height, channels, images[i].data, quality)) { + if (!stbi_write_jpg_to_func(write_to_buf, &jpeg_data, images[i].width, images[i].height, channels, images[i].data, mjpg_quality)) { fprintf(stderr, "Error: Failed to encode JPEG frame.\n"); - return -1; + return {}; } - fwrite("00dc", 4, 1, f); - write_u32_le(f, (uint32_t)jpeg_data.size()); - index[i].offset = ftell(f) - 8; - index[i].size = (uint32_t)jpeg_data.size(); - fwrite(jpeg_data.data(), 1, jpeg_data.size(), f); + index[i].offset = static_cast(avi_data.size()); + write_fourcc(avi_data, "00dc"); + write_u32_le(avi_data, static_cast(jpeg_data.size())); + index[i].size = (uint32_t)jpeg_data.size(); + avi_data.insert(avi_data.end(), jpeg_data.begin(), jpeg_data.end()); if (jpeg_data.size() % 2) { - fputc(0, f); + avi_data.push_back(0); } } - long cur_pos = ftell(f); - long movi_size = cur_pos - movi_size_pos - 4; - fseek(f, movi_size_pos, SEEK_SET); - write_u32_le(f, movi_size); - fseek(f, cur_pos, SEEK_SET); + const size_t movi_size = avi_data.size() - movi_size_pos - 4; + patch_u32_le(avi_data, movi_size_pos, static_cast(movi_size)); - fwrite("idx1", 4, 1, f); - write_u32_le(f, num_images * 16); + write_fourcc(avi_data, "idx1"); + write_u32_le(avi_data, num_images * 16); for (int i = 0; i < num_images; i++) { - fwrite("00dc", 4, 1, f); - write_u32_le(f, 0x10); - write_u32_le(f, index[i].offset); - write_u32_le(f, index[i].size); + write_fourcc(avi_data, "00dc"); + write_u32_le(avi_data, 0x10); + write_u32_le(avi_data, index[i].offset); + write_u32_le(avi_data, index[i].size); } - cur_pos = ftell(f); - long file_size = cur_pos - riff_size_pos - 4; - fseek(f, riff_size_pos, SEEK_SET); - write_u32_le(f, file_size); - fseek(f, cur_pos, SEEK_SET); + const size_t file_size = avi_data.size() - riff_size_pos - 4; + patch_u32_le(avi_data, riff_size_pos, static_cast(file_size)); + return avi_data; +} + +int create_mjpg_avi_from_sd_images(const 
char* filename, sd_image_t* images, int num_images, int fps, int quality) { + std::vector avi_data = create_mjpg_avi_from_sd_images_to_vector(images, num_images, fps, quality); + if (avi_data.empty()) { + return -1; + } + if (!write_binary_file_bytes(filename, avi_data)) { + perror("Error opening file for writing"); + return -1; + } return 0; } #ifdef SD_USE_WEBP -int create_animated_webp_from_sd_images(const char* filename, sd_image_t* images, int num_images, int fps, int quality) { +std::vector create_animated_webp_from_sd_images_to_vector(sd_image_t* images, int num_images, int fps, int quality) { if (num_images == 0) { fprintf(stderr, "Error: Image array is empty.\n"); - return -1; + return {}; } if (fps <= 0) { fprintf(stderr, "Error: FPS must be positive.\n"); - return -1; + return {}; } const int width = static_cast(images[0].width); @@ -857,14 +941,14 @@ int create_animated_webp_from_sd_images(const char* filename, sd_image_t* images const int channels = static_cast(images[0].channel); if (channels != 1 && channels != 3 && channels != 4) { fprintf(stderr, "Error: Unsupported channel count: %d\n", channels); - return -1; + return {}; } WebPAnimEncoderOptions anim_options; WebPConfig config; if (!WebPAnimEncoderOptionsInit(&anim_options) || !WebPConfigInit(&config)) { fprintf(stderr, "Error: Failed to initialize WebP animation encoder.\n"); - return -1; + return {}; } config.quality = static_cast(quality); @@ -875,13 +959,13 @@ int create_animated_webp_from_sd_images(const char* filename, sd_image_t* images } if (!WebPValidateConfig(&config)) { fprintf(stderr, "Error: Invalid WebP encoder configuration.\n"); - return -1; + return {}; } WebPAnimEncoderPtr enc(WebPAnimEncoderNew(width, height, &anim_options)); if (enc == nullptr) { fprintf(stderr, "Error: Could not create WebPAnimEncoder object.\n"); - return -1; + return {}; } const int frame_duration_ms = std::max(1, static_cast(std::lround(1000.0 / static_cast(fps)))); @@ -891,13 +975,13 @@ int 
create_animated_webp_from_sd_images(const char* filename, sd_image_t* images const sd_image_t& image = images[i]; if (static_cast(image.width) != width || static_cast(image.height) != height) { fprintf(stderr, "Error: Frame dimensions do not match.\n"); - return -1; + return {}; } WebPPictureGuard picture; if (!picture.initialized) { fprintf(stderr, "Error: Failed to initialize WebPPicture.\n"); - return -1; + return {}; } picture.picture.use_argb = 1; picture.picture.width = width; @@ -921,12 +1005,12 @@ int create_animated_webp_from_sd_images(const char* filename, sd_image_t* images if (!picture_ok) { fprintf(stderr, "Error: Failed to import frame into WebPPicture.\n"); - return -1; + return {}; } if (!WebPAnimEncoderAdd(enc.get(), &picture.picture, timestamp_ms, &config)) { fprintf(stderr, "Error: Failed to add frame to animated WebP: %s\n", WebPAnimEncoderGetError(enc.get())); - return -1; + return {}; } timestamp_ms += frame_duration_ms; @@ -934,52 +1018,50 @@ int create_animated_webp_from_sd_images(const char* filename, sd_image_t* images if (!WebPAnimEncoderAdd(enc.get(), nullptr, timestamp_ms, nullptr)) { fprintf(stderr, "Error: Failed to finalize animated WebP frames: %s\n", WebPAnimEncoderGetError(enc.get())); - return -1; + return {}; } WebPDataGuard webp_data; if (!WebPAnimEncoderAssemble(enc.get(), &webp_data.data)) { fprintf(stderr, "Error: Failed to assemble animated WebP: %s\n", WebPAnimEncoderGetError(enc.get())); - return -1; + return {}; } - FilePtr f(fopen(filename, "wb")); - if (!f) { + return std::vector(webp_data.data.bytes, webp_data.data.bytes + webp_data.data.size); +} + +int create_animated_webp_from_sd_images(const char* filename, sd_image_t* images, int num_images, int fps, int quality) { + std::vector webp_data = create_animated_webp_from_sd_images_to_vector(images, num_images, fps, quality); + if (webp_data.empty()) { + return -1; + } + if (!write_binary_file_bytes(filename, webp_data)) { perror("Error opening file for writing"); 
return -1; } - if (webp_data.data.size > 0 && fwrite(webp_data.data.bytes, 1, webp_data.data.size, f.get()) != webp_data.data.size) { - fprintf(stderr, "Error: Failed to write animated WebP file.\n"); - return -1; - } - return 0; } #endif #ifdef SD_USE_WEBM -int create_webm_from_sd_images(const char* filename, sd_image_t* images, int num_images, int fps, int quality) { +std::vector create_webm_from_sd_images_to_vector(sd_image_t* images, int num_images, int fps, int quality) { if (num_images == 0) { fprintf(stderr, "Error: Image array is empty.\n"); - return -1; + return {}; } if (fps <= 0) { fprintf(stderr, "Error: FPS must be positive.\n"); - return -1; + return {}; } const int width = static_cast(images[0].width); const int height = static_cast(images[0].height); if (width <= 0 || height <= 0) { fprintf(stderr, "Error: Invalid frame dimensions.\n"); - return -1; + return {}; } - mkvmuxer::MkvWriter writer; - if (!writer.Open(filename)) { - fprintf(stderr, "Error: Could not open WebM file for writing.\n"); - return -1; - } + MemoryMkvWriter writer; const int ret = [&]() -> int { mkvmuxer::Segment segment; @@ -1045,30 +1127,63 @@ int create_webm_from_sd_images(const char* filename, sd_image_t* images, int num } return 0; }(); - writer.Close(); - return ret; + if (ret != 0) { + return {}; + } + return writer.data(); +} + +int create_webm_from_sd_images(const char* filename, sd_image_t* images, int num_images, int fps, int quality) { + std::vector webm_data = create_webm_from_sd_images_to_vector(images, num_images, fps, quality); + if (webm_data.empty()) { + return -1; + } + if (!write_binary_file_bytes(filename, webm_data)) { + perror("Error opening file for writing"); + return -1; + } + return 0; } #endif -int create_video_from_sd_images(const char* filename, sd_image_t* images, int num_images, int fps, int quality) { - std::string path = filename ? filename : ""; - auto pos = path.find_last_of('.'); - std::string ext = pos == std::string::npos ? 
"" : path.substr(pos); - for (char& ch : ext) { - ch = static_cast(tolower(static_cast(ch))); +std::vector create_video_from_sd_images_to_vector(const std::string& output_format, + sd_image_t* images, + int num_images, + int fps, + int quality) { + std::string format = output_format; + std::transform(format.begin(), format.end(), format.begin(), + [](unsigned char c) { return static_cast(tolower(c)); }); + if (!format.empty() && format[0] == '.') { + format.erase(format.begin()); } #ifdef SD_USE_WEBM - if (ext == ".webm") { - return create_webm_from_sd_images(filename, images, num_images, fps, quality); + if (format == "webm") { + return create_webm_from_sd_images_to_vector(images, num_images, fps, quality); } #endif #ifdef SD_USE_WEBP - if (ext == ".webp") { - return create_animated_webp_from_sd_images(filename, images, num_images, fps, quality); + if (format == "webp") { + return create_animated_webp_from_sd_images_to_vector(images, num_images, fps, quality); } #endif - return create_mjpg_avi_from_sd_images(filename, images, num_images, fps, quality); + return create_mjpg_avi_from_sd_images_to_vector(images, num_images, fps, quality); +} + +int create_video_from_sd_images(const char* filename, sd_image_t* images, int num_images, int fps, int quality) { + std::string path = filename ? filename : ""; + auto pos = path.find_last_of('.'); + std::string ext = pos == std::string::npos ? 
"" : path.substr(pos); + std::vector video_data = create_video_from_sd_images_to_vector(ext, images, num_images, fps, quality); + if (video_data.empty()) { + return -1; + } + if (!write_binary_file_bytes(filename, video_data)) { + perror("Error opening file for writing"); + return -1; + } + return 0; } diff --git a/examples/common/media_io.h b/examples/common/media_io.h index e6ca098d..6b3f6f88 100644 --- a/examples/common/media_io.h +++ b/examples/common/media_io.h @@ -58,6 +58,10 @@ int create_mjpg_avi_from_sd_images(const char* filename, int num_images, int fps, int quality = 90); +std::vector create_mjpg_avi_from_sd_images_to_vector(sd_image_t* images, + int num_images, + int fps, + int quality = 90); #ifdef SD_USE_WEBP int create_animated_webp_from_sd_images(const char* filename, @@ -65,6 +69,10 @@ int create_animated_webp_from_sd_images(const char* filename, int num_images, int fps, int quality = 90); +std::vector create_animated_webp_from_sd_images_to_vector(sd_image_t* images, + int num_images, + int fps, + int quality = 90); #endif #ifdef SD_USE_WEBM @@ -73,6 +81,10 @@ int create_webm_from_sd_images(const char* filename, int num_images, int fps, int quality = 90); +std::vector create_webm_from_sd_images_to_vector(sd_image_t* images, + int num_images, + int fps, + int quality = 90); #endif int create_video_from_sd_images(const char* filename, @@ -80,5 +92,10 @@ int create_video_from_sd_images(const char* filename, int num_images, int fps, int quality = 90); +std::vector create_video_from_sd_images_to_vector(const std::string& output_format, + sd_image_t* images, + int num_images, + int fps, + int quality = 90); #endif // __MEDIA_IO_H__ diff --git a/examples/server/CMakeLists.txt b/examples/server/CMakeLists.txt index f6303f3c..b70b525e 100644 --- a/examples/server/CMakeLists.txt +++ b/examples/server/CMakeLists.txt @@ -50,7 +50,13 @@ if(SD_SERVER_BUILD_FRONTEND AND EXISTS "${FRONTEND_DIR}") set_source_files_properties("${GENERATED_HTML_HEADER}" PROPERTIES 
GENERATED TRUE) else() - message(WARNING "pnpm not found, frontend build disabled") + if(EXISTS "${GENERATED_HTML_HEADER}") + message(STATUS "pnpm not found; using pre-built frontend header detected at ${GENERATED_HTML_HEADER}") + set(HAVE_FRONTEND_BUILD ON) + add_custom_target(${TARGET}_frontend) + else() + message(WARNING "pnpm not found; frontend build disabled.") + endif() endif() else() message(STATUS "Frontend disabled or directory not found: ${FRONTEND_DIR}") diff --git a/examples/server/README.md b/examples/server/README.md index e27d973f..469dd346 100644 --- a/examples/server/README.md +++ b/examples/server/README.md @@ -123,11 +123,11 @@ In this case, the server will load and serve the specified `index.html` file ins usage: ./bin/sd-server [options] Svr Options: - -l, --listen-ip server listen ip (default: 127.0.0.1) + -l, --listen-ip server listen ip (default: 127.0.0.1) --serve-html-path path to HTML file to serve at root (optional) --listen-port server listen port (default: 1234) -v, --verbose print extra info - --color colors the logging tags according to level + --color colors the logging tags according to level -h, --help show this help message and exit Context Options: @@ -136,7 +136,8 @@ Context Options: --clip_g path to the clip-g text encoder --clip_vision path to the clip-vision encoder --t5xxl path to the t5xxl text encoder - --llm path to the llm text encoder. For example: (qwenvl2.5 for qwen-image, mistral-small3.2 for flux2, ...) + --llm path to the llm text encoder. For example: (qwenvl2.5 for qwen-image, + mistral-small3.2 for flux2, ...) --llm_vision path to the llm vit --qwen2vl alias of --llm. Deprecated. --qwen2vl_vision alias of --llm_vision. Deprecated. 
@@ -148,16 +149,16 @@ Context Options: --control-net path to control net model --embd-dir embeddings directory --lora-model-dir lora model directory + --hires-upscalers-dir highres fix upscaler model directory --tensor-type-rules weight type per tensor pattern (example: "^vae\.=f16,model\.=q8_0") --photo-maker path to PHOTOMAKER model --upscale-model path to esrgan model. - -t, --threads number of threads to use during computation (default: -1). If threads <= 0, then threads will be set to the number of - CPU physical cores + -t, --threads number of threads to use during computation (default: -1). If threads <= 0, + then threads will be set to the number of CPU physical cores --chroma-t5-mask-pad t5 mask pad size of chroma - --vae-tile-overlap tile overlap for vae tiling, in fraction of tile size (default: 0.5) - --vae-tiling process vae in tiles to reduce memory usage --force-sdxl-vae-conv-scale force use of conv scale on sdxl vae - --offload-to-cpu place the weights in RAM to save VRAM, and automatically load them into VRAM when needed + --offload-to-cpu place the weights in RAM to save VRAM, and automatically load them into VRAM + when needed --mmap whether to memory-map model --control-net-cpu keep controlnet in cpu (for low vram) --clip-on-cpu keep clip in cpu (for low vram) @@ -172,20 +173,19 @@ Context Options: --chroma-disable-dit-mask disable dit mask for chroma --qwen-image-zero-cond-t enable zero_cond_t for qwen image --chroma-enable-t5-mask enable t5 mask for chroma - --type weight type (examples: f32, f16, q4_0, q4_1, q5_0, q5_1, q8_0, q2_K, q3_K, q4_K). If not specified, the default is the - type of the weight file + --type weight type (examples: f32, f16, q4_0, q4_1, q5_0, q5_1, q8_0, q2_K, q3_K, + q4_K). If not specified, the default is the type of the weight file --rng RNG, one of [std_default, cuda, cpu], default: cuda(sd-webui), cpu(comfyui) --sampler-rng sampler RNG, one of [std_default, cuda, cpu]. 
If not specified, use --rng - --prediction prediction type override, one of [eps, v, edm_v, sd3_flow, flux_flow, flux2_flow] - --lora-apply-mode the way to apply LoRA, one of [auto, immediately, at_runtime], default is auto. In auto mode, if the model weights - contain any quantized parameters, the at_runtime mode will be used; otherwise, - immediately will be used.The immediately mode may have precision and - compatibility issues with quantized parameters, but it usually offers faster inference - speed and, in some cases, lower memory usage. The at_runtime mode, on the - other hand, is exactly the opposite. - --vae-tile-size tile size for vae tiling, format [X]x[Y] (default: 32x32) - --vae-relative-tile-size relative tile size for vae tiling, format [X]x[Y], in fraction of image size if < 1, in number of tiles per dim if >=1 - (overrides --vae-tile-size) + --prediction prediction type override, one of [eps, v, edm_v, sd3_flow, flux_flow, + flux2_flow] + --lora-apply-mode the way to apply LoRA, one of [auto, immediately, at_runtime], default is + auto. In auto mode, if the model weights contain any quantized parameters, + the at_runtime mode will be used; otherwise, immediately will be used.The + immediately mode may have precision and compatibility issues with quantized + parameters, but it usually offers faster inference speed and, in some cases, + lower memory usage. The at_runtime mode, on the other hand, is exactly the + opposite. Default Generation Options: -p, --prompt the prompt to render @@ -194,65 +194,97 @@ Default Generation Options: --end-img path to the end image, required by flf2v --mask path to the mask image --control-image path to control image, control net - --control-video path to control video frames, It must be a directory path. The video frames inside should be stored as images in - lexicographical (character) order. For example, if the control video path is - `frames`, the directory contain images such as 00.png, 01.png, ... etc. 
+ --control-video path to control video frames, It must be a directory path. The video frames + inside should be stored as images in lexicographical (character) order. For + example, if the control video path is `frames`, the directory contain images + such as 00.png, 01.png, ... etc. --pm-id-images-dir path to PHOTOMAKER input id images dir --pm-id-embed-path path to PHOTOMAKER v2 id embed + --hires-upscaler highres fix upscaler, Lanczos, Nearest, Latent, Latent (nearest), Latent + (nearest-exact), Latent (antialiased), Latent (bicubic), Latent (bicubic + antialiased), or a model name under --hires-upscalers-dir (default: Latent) -H, --height image height, in pixel space (default: 512) -W, --width image width, in pixel space (default: 512) --steps number of sample steps (default: 20) --high-noise-steps (high noise) number of sample steps (default: -1 = auto) - --clip-skip ignore last layers of CLIP network; 1 ignores none, 2 ignores one layer (default: -1). <= 0 represents unspecified, - will be 1 for SD1.x, 2 for SD2.x + --clip-skip ignore last layers of CLIP network; 1 ignores none, 2 ignores one layer + (default: -1). <= 0 represents unspecified, will be 1 for SD1.x, 2 for SD2.x -b, --batch-count batch count --video-frames video frames (default: 1) --fps fps (default: 24) - --timestep-shift shift timestep for NitroFusion models (default: 0). recommended N for NitroSD-Realism around 250 and 500 for - NitroSD-Vibrant + --timestep-shift shift timestep for NitroFusion models (default: 0). 
recommended N for + NitroSD-Realism around 250 and 500 for NitroSD-Vibrant --upscale-repeats Run the ESRGAN upscaler this many times (default: 1) --upscale-tile-size tile size for ESRGAN upscaling (default: 128) + --hires-width highres fix target width, 0 to use --hires-scale (default: 0) + --hires-height highres fix target height, 0 to use --hires-scale (default: 0) + --hires-steps highres fix second pass sample steps, 0 to reuse --steps (default: 0) + --hires-upscale-tile-size highres fix upscaler tile size, reserved for model-backed upscalers (default: + 128) --cfg-scale unconditional guidance scale: (default: 7.0) - --img-cfg-scale image guidance scale for inpaint or instruct-pix2pix models: (default: same as --cfg-scale) + --img-cfg-scale image guidance scale for inpaint or instruct-pix2pix models: (default: same + as --cfg-scale) --guidance distilled guidance scale for models with guidance input (default: 3.5) - --slg-scale skip layer guidance (SLG) scale, only for DiT models: (default: 0). 0 means disabled, a value of 2.5 is nice for sd3.5 - medium + --slg-scale skip layer guidance (SLG) scale, only for DiT models: (default: 0). 
0 means + disabled, a value of 2.5 is nice for sd3.5 medium --skip-layer-start SLG enabling point (default: 0.01) --skip-layer-end SLG disabling point (default: 0.2) - --eta noise multiplier (default: 0 for ddim_trailing, tcd, res_multistep and res_2s; 1 for euler_a and dpm++2s_a) + --eta noise multiplier (default: 0 for ddim_trailing, tcd, res_multistep and + res_2s; 1 for euler_a, er_sde and dpm++2s_a) --flow-shift shift value for Flow models like SD3.x or WAN (default: auto) --high-noise-cfg-scale (high noise) unconditional guidance scale: (default: 7.0) - --high-noise-img-cfg-scale (high noise) image guidance scale for inpaint or instruct-pix2pix models (default: same as --cfg-scale) - --high-noise-guidance (high noise) distilled guidance scale for models with guidance input (default: 3.5) - --high-noise-slg-scale (high noise) skip layer guidance (SLG) scale, only for DiT models: (default: 0) + --high-noise-img-cfg-scale (high noise) image guidance scale for inpaint or instruct-pix2pix models + (default: same as --cfg-scale) + --high-noise-guidance (high noise) distilled guidance scale for models with guidance input + (default: 3.5) + --high-noise-slg-scale (high noise) skip layer guidance (SLG) scale, only for DiT models: (default: + 0) --high-noise-skip-layer-start (high noise) SLG enabling point (default: 0.01) --high-noise-skip-layer-end (high noise) SLG disabling point (default: 0.2) - --high-noise-eta (high noise) noise multiplier (default: 0 for ddim_trailing, tcd, res_multistep and res_2s; 1 for euler_a and dpm++2s_a) + --high-noise-eta (high noise) noise multiplier (default: 0 for ddim_trailing, tcd, + res_multistep and res_2s; 1 for euler_a, er_sde and dpm++2s_a) --strength strength for noising/unnoising (default: 0.75) - --pm-style-strength - --control-strength strength to apply Control Net (default: 0.9). 1.0 corresponds to full destruction of information in init image - --moe-boundary timestep boundary for Wan2.2 MoE model. (default: 0.875). 
Only enabled if `--high-noise-steps` is set to -1 + --pm-style-strength + --control-strength strength to apply Control Net (default: 0.9). 1.0 corresponds to full + destruction of information in init image + --moe-boundary timestep boundary for Wan2.2 MoE model. (default: 0.875). Only enabled if + `--high-noise-steps` is set to -1 --vace-strength wan vace strength - --increase-ref-index automatically increase the indices of references images based on the order they are listed (starting with 1). + --vae-tile-overlap tile overlap for vae tiling, in fraction of tile size (default: 0.5) + --hires-scale highres fix scale when target size is not set (default: 2.0) + --hires-denoising-strength highres fix second pass denoising strength (default: 0.7) + --increase-ref-index automatically increase the indices of references images based on the order + they are listed (starting with 1). --disable-auto-resize-ref-image disable auto resize of ref images --disable-image-metadata do not embed generation metadata on image files + --vae-tiling process vae in tiles to reduce memory usage + --hires enable highres fix -s, --seed RNG seed (default: 42, use random seed for < 0) - --sampling-method sampling method, one of [euler, euler_a, heun, dpm2, dpm++2s_a, dpm++2m, dpm++2mv2, ipndm, ipndm_v, lcm, ddim_trailing, - tcd, res_multistep, res_2s] (default: euler for Flux/SD3/Wan, euler_a - otherwise) - --high-noise-sampling-method (high noise) sampling method, one of [euler, euler_a, heun, dpm2, dpm++2s_a, dpm++2m, dpm++2mv2, ipndm, ipndm_v, lcm, - ddim_trailing, tcd, res_multistep, res_2s] default: euler for Flux/SD3/Wan, - euler_a otherwise - --scheduler denoiser sigma scheduler, one of [discrete, karras, exponential, ays, gits, smoothstep, sgm_uniform, simple, - kl_optimal, lcm, bong_tangent], default: discrete - --sigmas custom sigma values for the sampler, comma-separated (e.g., "14.61,7.8,3.5,0.0"). 
+ --sampling-method sampling method, one of [euler, euler_a, heun, dpm2, dpm++2s_a, dpm++2m, + dpm++2mv2, ipndm, ipndm_v, lcm, ddim_trailing, tcd, res_multistep, res_2s, + er_sde] (default: euler for Flux/SD3/Wan, euler_a otherwise) + --high-noise-sampling-method (high noise) sampling method, one of [euler, euler_a, heun, dpm2, dpm++2s_a, + dpm++2m, dpm++2mv2, ipndm, ipndm_v, lcm, ddim_trailing, tcd, res_multistep, + res_2s, er_sde] default: euler for Flux/SD3/Wan, euler_a otherwise + --scheduler denoiser sigma scheduler, one of [discrete, karras, exponential, ays, gits, + smoothstep, sgm_uniform, simple, kl_optimal, lcm, bong_tangent], default: + discrete + --sigmas custom sigma values for the sampler, comma-separated (e.g., + "14.61,7.8,3.5,0.0"). --skip-layers layers to skip for SLG steps (default: [7,8,9]) --high-noise-skip-layers (high noise) layers to skip for SLG steps (default: [7,8,9]) -r, --ref-image reference image for Flux Kontext models (can be used multiple times) - --cache-mode caching method: 'easycache' (DiT), 'ucache' (UNET), 'dbcache'/'taylorseer'/'cache-dit' (DiT block-level), 'spectrum' (UNET/DiT Chebyshev+Taylor forecasting) + --cache-mode caching method: 'easycache' (DiT), 'ucache' (UNET), + 'dbcache'/'taylorseer'/'cache-dit' (DiT block-level), 'spectrum' (UNET/DiT + Chebyshev+Taylor forecasting) --cache-option named cache params (key=value format, comma-separated). easycache/ucache: - threshold=,start=,end=,decay=,relative=,reset=; dbcache/taylorseer/cache-dit: Fn=,Bn=,threshold=,warmup=. Examples: - "threshold=0.25" or "threshold=1.5,reset=0" - --scm-mask SCM steps mask for cache-dit: comma-separated 0/1 (e.g., "1,1,1,0,0,1,0,0,1,0") - 1=compute, 0=can cache + threshold=,start=,end=,decay=,relative=,reset=; dbcache/taylorseer/cache-dit: + Fn=,Bn=,threshold=,warmup=; spectrum: w=,m=,lam=,window=,flex=,warmup=,stop=. 
+ Examples: "threshold=0.25" or "threshold=1.5,reset=0" + --scm-mask SCM steps mask for cache-dit: comma-separated 0/1 (e.g., + "1,1,1,0,0,1,0,0,1,0") - 1=compute, 0=can cache --scm-policy SCM policy: 'dynamic' (default) or 'static' + --vae-tile-size tile size for vae tiling, format [X]x[Y] (default: 32x32) + --vae-relative-tile-size relative tile size for vae tiling, format [X]x[Y], in fraction of image size + if < 1, in number of tiles per dim if >=1 (overrides --vae-tile-size) ``` diff --git a/examples/server/api.md b/examples/server/api.md index 8f8bf9ed..483daa04 100644 --- a/examples/server/api.md +++ b/examples/server/api.md @@ -9,7 +9,7 @@ The server currently exposes three API families: - `sdcpp API` under `/sdcpp/v1/...` The `sdcpp API` is the native API surface. -Its request schema is also the canonical schema for `sd_cpp_extra_args`. +Its request schema is the same schema used by `sd_cpp_extra_args`. Global LoRA rule: @@ -38,6 +38,8 @@ Current generation-related endpoints include: - `POST /sdapi/v1/txt2img` - `POST /sdapi/v1/img2img` - `GET /sdapi/v1/loras` +- `GET /sdapi/v1/upscalers` +- `GET /sdapi/v1/latent-upscale-modes` - `GET /sdapi/v1/samplers` - `GET /sdapi/v1/schedulers` - `GET /sdapi/v1/sd-models` @@ -55,8 +57,6 @@ Current endpoints include: - `POST /sdcpp/v1/jobs/{id}/cancel` - `POST /sdcpp/v1/vid_gen` -`POST /sdcpp/v1/vid_gen` is currently exposed but returns `501 Not Implemented`. - ## `sd_cpp_extra_args` `sd_cpp_extra_args` is an extension mechanism for the compatibility APIs. @@ -79,12 +79,12 @@ Behavior: - The JSON block is parsed using the same field rules as the `sdcpp API`. - The block is removed from the final prompt before generation. 
-Intended use: +Supported use: - extend `OpenAI API` requests with native `stable-diffusion.cpp` controls - extend `sdapi` requests with native `stable-diffusion.cpp` controls -Not intended use: +Unsupported use: - do not use `sd_cpp_extra_args` with `/sdcpp/v1/*` @@ -218,6 +218,13 @@ Currently supported request fields: | `scheduler` | `string` | Scheduler name | | `lora` | `array` | Structured LoRA list | | `extra_images` | `array` | Base64 or data URL images | +| `enable_hr` | `boolean` | Enable highres fix for `txt2img` | +| `hr_upscaler` | `string` | `Lanczos`, `Nearest`, a latent mode such as `Latent (nearest-exact)`, or an upscaler model name from `/sdapi/v1/upscalers` | +| `hr_scale` | `number` | Highres scale when resize target is not set | +| `hr_resize_x` | `integer` | Highres target width, `0` to use scale | +| `hr_resize_y` | `integer` | Highres target height, `0` to use scale | +| `hr_steps` | `integer` | Highres second-pass sample steps, `0` to reuse `steps` | +| `denoising_strength` | `number` | Highres denoising strength for `txt2img` | Native extension fields: @@ -243,6 +250,8 @@ Currently supported request fields: | `inpainting_mask_invert` | `integer` or `boolean` | Treated as invert flag | | `denoising_strength` | `number` | Clamped to `0.0..1.0` | +Highres fix fields are currently handled for `txt2img`; `img2img` uses `denoising_strength` as image-to-image strength. 
+ Native extension fields: - any `sdcpp API` fields embedded through `sd_cpp_extra_args` inside `prompt` @@ -260,6 +269,8 @@ Response fields: Currently exposed: - `GET /sdapi/v1/loras` +- `GET /sdapi/v1/upscalers` +- `GET /sdapi/v1/latent-upscale-modes` - `GET /sdapi/v1/samplers` - `GET /sdapi/v1/schedulers` - `GET /sdapi/v1/sd-models` @@ -274,6 +285,26 @@ Response fields: | `[].name` | `string` | Display name derived from file stem | | `[].path` | `string` | Relative path under the configured LoRA directory | +`GET /sdapi/v1/upscalers` + +| Field | Type | Notes | +| --- | --- | --- | +| `[].name` | `string` | Built-in name or model stem | +| `[].model_name` | `string \| null` | Model family label for model-backed upscalers | +| `[].model_path` | `string \| null` | Absolute model path for model-backed upscalers | +| `[].model_url` | `string \| null` | Currently always null | +| `[].scale` | `integer` | Currently `4` | + +Built-in entries include `None`, `Lanczos`, and `Nearest`. Model-backed entries are scanned from the top level of `--hires-upscalers-dir`; subdirectories are not scanned. + +`GET /sdapi/v1/latent-upscale-modes` + +| Field | Type | Notes | +| --- | --- | --- | +| `[].name` | `string` | WebUI-compatible latent upscale mode name | + +Built-in latent modes include `Latent`, `Latent (nearest)`, `Latent (nearest-exact)`, `Latent (antialiased)`, `Latent (bicubic)`, and `Latent (bicubic antialiased)`. + `GET /sdapi/v1/samplers` | Field | Type | Notes | @@ -372,20 +403,26 @@ Field types: Returns frontend-friendly capability metadata. -Typical contents: +The mode-aware fields are the primary interface. The top-level compatibility fields are deprecated mirrors kept for older clients. 
-| Field | Type | -| --- | --- | -| `model` | `object` | -| `defaults` | `object` | -| `loras` | `array` | -| `samplers` | `array` | -| `schedulers` | `array` | -| `output_formats` | `array` | -| `limits` | `object` | -| `features` | `object` | +Top-level fields: -Nested fields currently returned: +| Field | Type | Notes | +| --- | --- | --- | +| `model` | `object` | Loaded model metadata | +| `current_mode` | `string` | The native generation mode mirrored by top-level compatibility fields | +| `supported_modes` | `array` | Supported native modes such as `img_gen` or `vid_gen` | +| `defaults` | `object` | Deprecated compatibility mirror of `defaults_by_mode[current_mode]` | +| `output_formats` | `array` | Deprecated compatibility mirror of `output_formats_by_mode[current_mode]` | +| `features` | `object` | Deprecated compatibility mirror of `features_by_mode[current_mode]` | +| `defaults_by_mode` | `object` | Explicit defaults for each supported mode | +| `output_formats_by_mode` | `object` | Explicit output formats for each supported mode | +| `features_by_mode` | `object` | Explicit feature flags for each supported mode | +| `samplers` | `array` | Available sampling methods | +| `schedulers` | `array` | Available schedulers | +| `loras` | `array` | Available LoRA entries | +| `upscalers` | `array` | Available model-backed highres upscalers | +| `limits` | `object` | Shared queue and size limits | `model` @@ -395,50 +432,24 @@ Nested fields currently returned: | `model.stem` | `string` | | `model.path` | `string` | -`defaults` +Compatibility rules: + +- `defaults`, `output_formats`, and `features` are deprecated compatibility mirrors +- those three top-level fields always mirror `current_mode` +- `supported_modes`, `defaults_by_mode`, `output_formats_by_mode`, and `features_by_mode` are the mode-aware fields + +Mode-aware objects: | Field | Type | | --- | --- | -| `defaults.prompt` | `string` | -| `defaults.negative_prompt` | `string` | -| `defaults.clip_skip` | 
`integer` | -| `defaults.width` | `integer` | -| `defaults.height` | `integer` | -| `defaults.strength` | `number` | -| `defaults.seed` | `integer` | -| `defaults.batch_count` | `integer` | -| `defaults.auto_resize_ref_image` | `boolean` | -| `defaults.increase_ref_index` | `boolean` | -| `defaults.control_strength` | `number` | -| `defaults.sample_params` | `object` | -| `defaults.sample_params.scheduler` | `string` | -| `defaults.sample_params.sample_method` | `string` | -| `defaults.sample_params.sample_steps` | `integer` | -| `defaults.sample_params.eta` | `number \| null` | -| `defaults.sample_params.shifted_timestep` | `integer` | -| `defaults.sample_params.flow_shift` | `number \| null` | -| `defaults.sample_params.guidance` | `object` | -| `defaults.sample_params.guidance.txt_cfg` | `number` | -| `defaults.sample_params.guidance.img_cfg` | `number \| null` | -| `defaults.sample_params.guidance.distilled_guidance` | `number` | -| `defaults.sample_params.guidance.slg` | `object` | -| `defaults.sample_params.guidance.slg.layers` | `array` | -| `defaults.sample_params.guidance.slg.layer_start` | `number` | -| `defaults.sample_params.guidance.slg.layer_end` | `number` | -| `defaults.sample_params.guidance.slg.scale` | `number` | -| `defaults.vae_tiling_params` | `object` | -| `defaults.vae_tiling_params.enabled` | `boolean` | -| `defaults.vae_tiling_params.tile_size_x` | `integer` | -| `defaults.vae_tiling_params.tile_size_y` | `integer` | -| `defaults.vae_tiling_params.target_overlap` | `number` | -| `defaults.vae_tiling_params.rel_size_x` | `number` | -| `defaults.vae_tiling_params.rel_size_y` | `number` | -| `defaults.cache_mode` | `string` | -| `defaults.cache_option` | `string` | -| `defaults.scm_mask` | `string` | -| `defaults.scm_policy_dynamic` | `boolean` | -| `defaults.output_format` | `string` | -| `defaults.output_compression` | `integer` | +| `defaults_by_mode.img_gen` | `object` | +| `defaults_by_mode.vid_gen` | `object` | +| 
`output_formats_by_mode.img_gen` | `array` | +| `output_formats_by_mode.vid_gen` | `array` | +| `features_by_mode.img_gen` | `object` | +| `features_by_mode.vid_gen` | `object` | + +Shared nested fields: `loras` @@ -447,6 +458,14 @@ Nested fields currently returned: | `loras[].name` | `string` | | `loras[].path` | `string` | +`upscalers` + +| Field | Type | Notes | +| --- | --- | --- | +| `upscalers[].name` | `string` | Built-in name or model stem; use this value in `hires.upscaler` | + +Built-in entries include `None`, `Lanczos`, `Nearest`, `Latent`, `Latent (nearest)`, `Latent (nearest-exact)`, `Latent (antialiased)`, `Latent (bicubic)`, and `Latent (bicubic antialiased)`. Model-backed entries are scanned from the top level of `--hires-upscalers-dir`; subdirectories are not scanned. + `limits` | Field | Type | @@ -458,19 +477,110 @@ Nested fields currently returned: | `limits.max_batch_count` | `integer` | | `limits.max_queue_size` | `integer` | -`features` +Shared default fields used by both `img_gen` and `vid_gen`: | Field | Type | | --- | --- | -| `features.init_image` | `boolean` | -| `features.mask_image` | `boolean` | -| `features.control_image` | `boolean` | -| `features.ref_images` | `boolean` | -| `features.lora` | `boolean` | -| `features.vae_tiling` | `boolean` | -| `features.cache` | `boolean` | -| `features.cancel_queued` | `boolean` | -| `features.cancel_generating` | `boolean` | +| `prompt` | `string` | +| `negative_prompt` | `string` | +| `clip_skip` | `integer` | +| `width` | `integer` | +| `height` | `integer` | +| `strength` | `number` | +| `seed` | `integer` | +| `sample_params` | `object` | +| `sample_params.scheduler` | `string` | +| `sample_params.sample_method` | `string` | +| `sample_params.sample_steps` | `integer` | +| `sample_params.eta` | `number \| null` | +| `sample_params.shifted_timestep` | `integer` | +| `sample_params.flow_shift` | `number \| null` | +| `sample_params.guidance.txt_cfg` | `number` | +| 
`sample_params.guidance.img_cfg` | `number \| null` | +| `sample_params.guidance.distilled_guidance` | `number` | +| `sample_params.guidance.slg.layers` | `array` | +| `sample_params.guidance.slg.layer_start` | `number` | +| `sample_params.guidance.slg.layer_end` | `number` | +| `sample_params.guidance.slg.scale` | `number` | +| `vae_tiling_params` | `object` | +| `vae_tiling_params.enabled` | `boolean` | +| `vae_tiling_params.tile_size_x` | `integer` | +| `vae_tiling_params.tile_size_y` | `integer` | +| `vae_tiling_params.target_overlap` | `number` | +| `vae_tiling_params.rel_size_x` | `number` | +| `vae_tiling_params.rel_size_y` | `number` | +| `cache_mode` | `string` | +| `cache_option` | `string` | +| `scm_mask` | `string` | +| `scm_policy_dynamic` | `boolean` | +| `output_format` | `string` | +| `output_compression` | `integer` | + +`img_gen`-specific default fields: + +| Field | Type | +| --- | --- | +| `batch_count` | `integer` | +| `auto_resize_ref_image` | `boolean` | +| `increase_ref_index` | `boolean` | +| `control_strength` | `number` | +| `hires` | `object` | +| `hires.enabled` | `boolean` | +| `hires.upscaler` | `string` | +| `hires.scale` | `number` | +| `hires.target_width` | `integer` | +| `hires.target_height` | `integer` | +| `hires.steps` | `integer` | +| `hires.denoising_strength` | `number` | +| `hires.upscale_tile_size` | `integer` | + +`vid_gen`-specific default fields: + +| Field | Type | +| --- | --- | +| `video_frames` | `integer` | +| `fps` | `integer` | +| `moe_boundary` | `number` | +| `vace_strength` | `number` | +| `high_noise_sample_params` | `object` | +| `high_noise_sample_params.scheduler` | `string` | +| `high_noise_sample_params.sample_method` | `string` | +| `high_noise_sample_params.sample_steps` | `integer` | +| `high_noise_sample_params.eta` | `number \| null` | +| `high_noise_sample_params.shifted_timestep` | `integer` | +| `high_noise_sample_params.flow_shift` | `number \| null` | +| 
`high_noise_sample_params.guidance.txt_cfg` | `number` | +| `high_noise_sample_params.guidance.img_cfg` | `number \| null` | +| `high_noise_sample_params.guidance.distilled_guidance` | `number` | +| `high_noise_sample_params.guidance.slg.layers` | `array` | +| `high_noise_sample_params.guidance.slg.layer_start` | `number` | +| `high_noise_sample_params.guidance.slg.layer_end` | `number` | +| `high_noise_sample_params.guidance.slg.scale` | `number` | + +Fields returned in `features_by_mode.img_gen`: + +- `init_image` +- `mask_image` +- `control_image` +- `ref_images` +- `lora` +- `vae_tiling` +- `hires` +- `cache` +- `cancel_queued` +- `cancel_generating` + +Fields returned in `features_by_mode.vid_gen`: + +- `init_image` +- `end_image` +- `control_frames` +- `high_noise_sample_params` +- `lora` +- `vae_tiling` +- `cache` +- `cancel_queued` +- `cancel_generating` #### `POST /sdcpp/v1/img_gen` @@ -521,9 +631,7 @@ Typical status codes: - `409 Conflict` - `410 Gone` -### Canonical Request Schema - -The `sdcpp API` request body is the canonical native schema. 
+### Request Body Example: @@ -569,6 +677,16 @@ Example: }, "lora": [], + "hires": { + "enabled": false, + "upscaler": "Latent", + "scale": 2.0, + "target_width": 0, + "target_height": 0, + "steps": 0, + "denoising_strength": 0.7, + "upscale_tile_size": 128 + }, "vae_tiling_params": { "enabled": false, @@ -612,7 +730,7 @@ Channel expectations: If omitted or null: - single-image fields map to an empty `sd_image_t` -- array fields map to `nullptr + count = 0` +- array fields map to an empty C-style array, represented as `pointer = nullptr` and `count = 0` ### Field Mapping Summary @@ -673,12 +791,23 @@ Other native fields: | Field | Type | | --- | --- | +| `hires` | `object` | +| `hires.enabled` | `boolean` | +| `hires.upscaler` | `string` | +| `hires.scale` | `number` | +| `hires.target_width` | `integer` | +| `hires.target_height` | `integer` | +| `hires.steps` | `integer` | +| `hires.denoising_strength` | `number` | +| `hires.upscale_tile_size` | `integer` | | `vae_tiling_params` | `object` | | `cache_mode` | `string` | | `cache_option` | `string` | | `scm_mask` | `string` | | `scm_policy_dynamic` | `boolean` | +For `hires.upscaler`, use `Lanczos`, `Nearest`, `Latent`, `Latent (nearest)`, `Latent (nearest-exact)`, `Latent (antialiased)`, `Latent (bicubic)`, `Latent (bicubic antialiased)`, or an `upscalers[].name` value from `GET /sdcpp/v1/capabilities`. Model-backed upscalers are resolved as `--hires-upscalers-dir / (name + ext)` and must live directly in that directory. + HTTP-only output fields: | Field | Type | @@ -686,11 +815,11 @@ HTTP-only output fields: | `output_format` | `string` | | `output_compression` | `integer` | -### Optional Field Semantics +### Optional Field Handling -Clients should preserve unset semantics for optional sampling fields. +Optional sampling fields may be omitted. 
-If a user has not explicitly provided one of these fields, the client should omit it instead of injecting a guessed fallback: +When omitted, backend defaults apply to these fields: - `sample_params.scheduler` - `sample_params.sample_method` @@ -766,29 +895,394 @@ Example cancelled job: } ``` -### Validation and Retention +### Submission Errors -Recommended behavior: +`POST /sdcpp/v1/img_gen` may return: -- malformed JSON returns `400` -- invalid image payloads return `400` -- invalid parameter structure returns `400` -- queue full returns `429` or `503` -- accepted runtime failures transition the job to `failed` -- unsupported in-progress cancellation may return `409` +- `202 Accepted` when the job is created +- `400 Bad Request` for an empty body, unsupported model mode, invalid JSON, or invalid generation parameters +- `429 Too Many Requests` when the job queue is full +- `500 Internal Server Error` for unexpected server exceptions during submission -Recommended retention controls: +### `vid_gen` -- pending job limit -- completed job TTL -- failed job TTL +The following section documents the native async contract for video generation. -### Future `vid_gen` +#### `POST /sdcpp/v1/vid_gen` -Future `vid_gen` should reuse the same async job model: +Submits an async video generation job. -- `POST /sdcpp/v1/vid_gen` -- `GET /sdcpp/v1/jobs/{id}` -- `POST /sdcpp/v1/jobs/{id}/cancel` +Successful submission returns `202 Accepted`. -Its request body should mirror `sd_vid_gen_params_t` in the same way that `img_gen` mirrors `sd_img_gen_params_t`. 
+Example response: + +```json +{ + "id": "job_01HTXYZVID", + "kind": "vid_gen", + "status": "queued", + "created": 1775401200, + "poll_url": "/sdcpp/v1/jobs/job_01HTXYZVID" +} +``` + +Response fields: + +| Field | Type | +| --- | --- | +| `id` | `string` | +| `kind` | `string` | +| `status` | `string` | +| `created` | `integer` | +| `poll_url` | `string` | + +### Request Body + +Compared with `img_gen`, the `vid_gen` request body: + +- `vid_gen` is a single video sequence job, so `batch_count` is not part of the request schema +- `ref_images`, `mask_image`, `control_image`, `control_strength`, and `embed_image_metadata` are not part of the request schema +- `vid_gen` adds `end_image`, `control_frames`, `high_noise_sample_params`, `video_frames`, `fps`, `moe_boundary`, and `vace_strength` + +Example: + +```json +{ + "prompt": "a cat walking through a rainy alley", + "negative_prompt": "", + "clip_skip": -1, + "width": 832, + "height": 480, + "strength": 0.75, + "seed": -1, + "video_frames": 33, + "fps": 16, + "moe_boundary": 0.875, + "vace_strength": 1.0, + + "init_image": null, + "end_image": null, + "control_frames": [], + + "sample_params": { + "scheduler": "discrete", + "sample_method": "euler", + "sample_steps": 28, + "eta": 1.0, + "shifted_timestep": 0, + "custom_sigmas": [], + "flow_shift": 0.0, + "guidance": { + "txt_cfg": 7.0, + "img_cfg": 7.0, + "distilled_guidance": 3.5, + "slg": { + "layers": [7, 8, 9], + "layer_start": 0.01, + "layer_end": 0.2, + "scale": 0.0 + } + } + }, + + "high_noise_sample_params": { + "scheduler": "discrete", + "sample_method": "euler", + "sample_steps": -1, + "eta": 1.0, + "shifted_timestep": 0, + "flow_shift": 0.0, + "guidance": { + "txt_cfg": 7.0, + "img_cfg": 7.0, + "distilled_guidance": 3.5, + "slg": { + "layers": [7, 8, 9], + "layer_start": 0.01, + "layer_end": 0.2, + "scale": 0.0 + } + } + }, + + "lora": [], + + "vae_tiling_params": { + "enabled": false, + "tile_size_x": 0, + "tile_size_y": 0, + "target_overlap": 0.5, + 
"rel_size_x": 0.0, + "rel_size_y": 0.0 + }, + + "cache_mode": "disabled", + "cache_option": "", + "scm_mask": "", + "scm_policy_dynamic": true, + + "output_format": "webm", + "output_compression": 100 +} +``` + +### LoRA Rules + +- The server only accepts explicit LoRA entries from the `lora` field. +- Prompt-embedded `<lora:name:multiplier>` tags are intentionally unsupported. +- `lora[].is_high_noise` controls whether a LoRA applies only to the high-noise stage. + +### Image and Frame Encoding Rules + +Any image field accepts: + +- a raw base64 string, or +- a data URL such as `data:image/png;base64,...` + +Channel expectations: + +- `init_image`: 3 channels +- `end_image`: 3 channels +- `control_frames[]`: 3 channels + +Frame ordering rules: + +- `control_frames[]` order is the conditioning frame order +- `control_frames[]` is preserved in request order + +If omitted or null: + +- single-image fields map to an empty `sd_image_t` +- array fields map to an empty C-style array, represented as `pointer = nullptr` and `count = 0` + +### Field Mapping Summary + +Top-level scalar fields: + +| Field | Type | +| --- | --- | +| `prompt` | `string` | +| `negative_prompt` | `string` | +| `clip_skip` | `integer` | +| `width` | `integer` | +| `height` | `integer` | +| `strength` | `number` | +| `seed` | `integer` | +| `video_frames` | `integer` | +| `fps` | `integer` | +| `moe_boundary` | `number` | +| `vace_strength` | `number` | + +Image and frame fields: + +| Field | Type | +| --- | --- | +| `init_image` | `string \| null` | +| `end_image` | `string \| null` | +| `control_frames` | `array` | + +LoRA fields: + +| Field | Type | +| --- | --- | +| `lora[].path` | `string` | +| `lora[].multiplier` | `number` | +| `lora[].is_high_noise` | `boolean` | + +Sampling fields: + +| Field | Type | +| --- | --- | +| `sample_params.scheduler` | `string` | +| `sample_params.sample_method` | `string` | +| `sample_params.sample_steps` | `integer` | +| `sample_params.eta` | `number` | +| 
`sample_params.shifted_timestep` | `integer` | +| `sample_params.custom_sigmas` | `array` | +| `sample_params.flow_shift` | `number` | +| `sample_params.guidance.txt_cfg` | `number` | +| `sample_params.guidance.img_cfg` | `number` | +| `sample_params.guidance.distilled_guidance` | `number` | +| `sample_params.guidance.slg.layers` | `array` | +| `sample_params.guidance.slg.layer_start` | `number` | +| `sample_params.guidance.slg.layer_end` | `number` | +| `sample_params.guidance.slg.scale` | `number` | + +High-noise sampling fields: + +| Field | Type | +| --- | --- | +| `high_noise_sample_params.scheduler` | `string` | +| `high_noise_sample_params.sample_method` | `string` | +| `high_noise_sample_params.sample_steps` | `integer` | +| `high_noise_sample_params.eta` | `number` | +| `high_noise_sample_params.shifted_timestep` | `integer` | +| `high_noise_sample_params.flow_shift` | `number` | +| `high_noise_sample_params.guidance.txt_cfg` | `number` | +| `high_noise_sample_params.guidance.img_cfg` | `number` | +| `high_noise_sample_params.guidance.distilled_guidance` | `number` | +| `high_noise_sample_params.guidance.slg.layers` | `array` | +| `high_noise_sample_params.guidance.slg.layer_start` | `number` | +| `high_noise_sample_params.guidance.slg.layer_end` | `number` | +| `high_noise_sample_params.guidance.slg.scale` | `number` | + +Other native fields: + +| Field | Type | +| --- | --- | +| `vae_tiling_params` | `object` | +| `cache_mode` | `string` | +| `cache_option` | `string` | +| `scm_mask` | `string` | +| `scm_policy_dynamic` | `boolean` | + +HTTP-only output fields: + +| Field | Type | +| --- | --- | +| `output_format` | `string` | +| `output_compression` | `integer` | + +For `vid_gen`, `output_format` and `output_compression` control container encoding. +`fps` is request metadata for the generated sequence and is echoed in the completed job result. 
+ +Allowed `output_format` values: + +- `webm` +- `webp` +- `avi` + +Output format behavior: + +- `output_format` defaults to `webm` +- `webp` means animated WebP +- `avi` means MJPG AVI +- `webm` requires the server to be built with WebM support; otherwise the request returns `400` + +### Result Payload + +Completed jobs return one encoded container payload, not a list of per-frame images. + +Result fields: + +- `result.b64_json` contains the whole encoded container file as base64 +- `result.mime_type` identifies the media type +- `result.output_format` echoes the selected container format +- `result.fps` echoes the effective playback FPS +- `result.frame_count` reports the actual decoded frame count used to build the container + +Expected MIME types: + +| `output_format` | `mime_type` | +| --- | --- | +| `webm` | `video/webm` | +| `webp` | `image/webp` | +| `avi` | `video/x-msvideo` | + +### Optional Field Handling + +Optional sampling fields may be omitted. + +When omitted, backend defaults apply to these fields: + +- `sample_params.scheduler` +- `sample_params.sample_method` +- `sample_params.eta` +- `sample_params.flow_shift` +- `sample_params.guidance.img_cfg` +- `high_noise_sample_params.scheduler` +- `high_noise_sample_params.sample_method` +- `high_noise_sample_params.eta` +- `high_noise_sample_params.flow_shift` +- `high_noise_sample_params.guidance.img_cfg` + +`high_noise_sample_params` may also be omitted entirely. + +### Frame Count Semantics + +`video_frames` is the requested target length, but the current core video path internally normalizes the effective frame count to the largest `4n + 1` value that does not exceed the requested count. + +Examples: + +- `video_frames = 33` stays `33` +- `video_frames = 34` becomes `33` +- `video_frames = 32` becomes `29` + +The completed job payload includes the actual decoded `frame_count`. 
+ +### Completion Result + +Example completed job: + +```json +{ + "id": "job_01HTXYZVID", + "kind": "vid_gen", + "status": "completed", + "created": 1775401200, + "started": 1775401203, + "completed": 1775401215, + "queue_position": 0, + "result": { + "output_format": "webm", + "mime_type": "video/webm", + "fps": 16, + "frame_count": 33, + "b64_json": "GkXfo59ChoEBQveBAULygQRC84EIQo..." + }, + "error": null +} +``` + +The response returns the encoded `.webm`, animated `.webp`, or `.avi` container payload directly. + +### Failure Result + +Example failed job: + +```json +{ + "id": "job_01HTXYZVID", + "kind": "vid_gen", + "status": "failed", + "created": 1775401200, + "started": 1775401203, + "completed": 1775401204, + "queue_position": 0, + "result": null, + "error": { + "code": "generation_failed", + "message": "generate_video returned no results" + } +} +``` + +### Cancelled Result + +Example cancelled job: + +```json +{ + "id": "job_01HTXYZVID", + "kind": "vid_gen", + "status": "cancelled", + "created": 1775401200, + "started": null, + "completed": 1775401202, + "queue_position": 0, + "result": null, + "error": { + "code": "cancelled", + "message": "job cancelled by client" + } +} +``` + +### Submission Errors + +`POST /sdcpp/v1/vid_gen` may return: + +- `202 Accepted` when the job is created +- `400 Bad Request` for an empty body, unsupported model mode, invalid JSON, invalid generation parameters, or an unsupported output format +- `429 Too Many Requests` when the job queue is full +- `500 Internal Server Error` for unexpected server exceptions during submission diff --git a/examples/server/async_jobs.cpp b/examples/server/async_jobs.cpp index 39c47cfa..e8e9d8ad 100644 --- a/examples/server/async_jobs.cpp +++ b/examples/server/async_jobs.cpp @@ -95,8 +95,12 @@ bool cancel_queued_job(AsyncJobManager& manager, AsyncGenerationJob& job) { job.status = AsyncJobStatus::Cancelled; job.completed_at = unix_timestamp_now(); job.result_images_b64.clear(); - 
job.error_code = "cancelled"; - job.error_message = "job cancelled by client"; + job.result_media_b64.clear(); + job.result_media_mime_type.clear(); + job.result_frame_count = 0; + job.result_fps = 0; + job.error_code = "cancelled"; + job.error_message = "job cancelled by client"; return true; } @@ -122,14 +126,24 @@ json make_async_job_json(const AsyncJobManager& manager, const AsyncGenerationJo } if (job.status == AsyncJobStatus::Completed) { - json images = json::array(); - for (size_t i = 0; i < job.result_images_b64.size(); ++i) { - images.push_back({{"index", i}, {"b64_json", job.result_images_b64[i]}}); + if (job.kind == AsyncJobKind::VidGen) { + result["result"] = { + {"output_format", job.vid_gen.output_format}, + {"mime_type", job.result_media_mime_type}, + {"fps", job.result_fps}, + {"frame_count", job.result_frame_count}, + {"b64_json", job.result_media_b64}, + }; + } else { + json images = json::array(); + for (size_t i = 0; i < job.result_images_b64.size(); ++i) { + images.push_back({{"index", i}, {"b64_json", job.result_images_b64[i]}}); + } + result["result"] = { + {"output_format", job.img_gen.output_format}, + {"images", images}, + }; } - result["result"] = { - {"output_format", job.img_gen.output_format}, - {"images", images}, - }; result["error"] = nullptr; } else if (job.status == AsyncJobStatus::Failed || job.status == AsyncJobStatus::Cancelled) { @@ -156,16 +170,15 @@ bool execute_img_gen_job(ServerRuntime& runtime, sd_img_gen_params_t params = job.img_gen.to_sd_img_gen_params_t(); SDImageVec results; - int num_results = 0; { std::lock_guard lock(*runtime.sd_ctx_mutex); sd_image_t* raw_results = generate_image(runtime.sd_ctx, ¶ms); - num_results = params.batch_count; - results.adopt(raw_results, num_results); + results.adopt(raw_results, params.batch_count); } - if (results.empty() || num_results <= 0) { + const int num_results = results.count(); + if (num_results <= 0) { error_message = "generate_image returned no results"; return false; } 
@@ -208,6 +221,47 @@ bool execute_img_gen_job(ServerRuntime& runtime, return true; } +bool execute_vid_gen_job(ServerRuntime& runtime, + AsyncGenerationJob& job, + std::string& output_media_b64, + std::string& output_media_mime_type, + int& output_frame_count, + int& output_fps, + std::string& error_message) { + sd_vid_gen_params_t params = job.vid_gen.to_sd_vid_gen_params_t(); + + SDImageVec results; + int num_results = 0; + + { + std::lock_guard lock(*runtime.sd_ctx_mutex); + sd_image_t* raw_results = generate_video(runtime.sd_ctx, ¶ms, &num_results); + results.adopt(raw_results, num_results); + } + + num_results = results.count(); + if (num_results <= 0) { + error_message = "generate_video returned no results"; + return false; + } + + std::vector video_bytes = create_video_from_sd_images_to_vector(job.vid_gen.output_format, + results.data(), + num_results, + job.vid_gen.gen_params.fps, + job.vid_gen.output_compression); + if (video_bytes.empty()) { + error_message = "failed to encode generated video container"; + return false; + } + + output_media_b64 = base64_encode(video_bytes); + output_media_mime_type = video_mime_type(job.vid_gen.output_format); + output_frame_count = num_results; + output_fps = job.vid_gen.gen_params.fps; + return true; +} + void async_job_worker(ServerRuntime& runtime) { AsyncJobManager& manager = *runtime.async_job_manager; @@ -240,11 +294,23 @@ void async_job_worker(ServerRuntime& runtime) { } std::vector output_images; + std::string output_media_b64; + std::string output_media_mime_type; + int output_frame_count = 0; + int output_fps = 0; std::string error_message; bool ok = false; if (job->kind == AsyncJobKind::ImgGen) { ok = execute_img_gen_job(runtime, *job, output_images, error_message); + } else if (job->kind == AsyncJobKind::VidGen) { + ok = execute_vid_gen_job(runtime, + *job, + output_media_b64, + output_media_mime_type, + output_frame_count, + output_fps, + error_message); } else { error_message = "unsupported job kind"; } @@ 
-258,8 +324,12 @@ void async_job_worker(ServerRuntime& runtime) { job->completed_at = unix_timestamp_now(); if (ok) { - job->status = AsyncJobStatus::Completed; - job->result_images_b64 = std::move(output_images); + job->status = AsyncJobStatus::Completed; + job->result_images_b64 = std::move(output_images); + job->result_media_b64 = std::move(output_media_b64); + job->result_media_mime_type = std::move(output_media_mime_type); + job->result_frame_count = output_frame_count; + job->result_fps = output_fps; job->error_code.clear(); job->error_message.clear(); } else { @@ -267,6 +337,10 @@ void async_job_worker(ServerRuntime& runtime) { job->error_code = "generation_failed"; job->error_message = error_message.empty() ? "unknown generation error" : error_message; job->result_images_b64.clear(); + job->result_media_b64.clear(); + job->result_media_mime_type.clear(); + job->result_frame_count = 0; + job->result_fps = 0; } purge_expired_jobs(manager); diff --git a/examples/server/async_jobs.h b/examples/server/async_jobs.h index cb90bdd8..89997a3b 100644 --- a/examples/server/async_jobs.h +++ b/examples/server/async_jobs.h @@ -36,7 +36,12 @@ struct AsyncGenerationJob { int64_t started_at = 0; int64_t completed_at = 0; ImgGenJobRequest img_gen; + VidGenJobRequest vid_gen; std::vector result_images_b64; + std::string result_media_b64; + std::string result_media_mime_type; + int result_frame_count = 0; + int result_fps = 0; std::string error_code; std::string error_message; }; @@ -63,4 +68,11 @@ bool execute_img_gen_job(ServerRuntime& runtime, AsyncGenerationJob& job, std::vector& output_images, std::string& error_message); +bool execute_vid_gen_job(ServerRuntime& runtime, + AsyncGenerationJob& job, + std::string& output_media_b64, + std::string& output_media_mime_type, + int& output_frame_count, + int& output_fps, + std::string& error_message); void async_job_worker(ServerRuntime& runtime); diff --git a/examples/server/frontend b/examples/server/frontend index 
740475a7..797ccf80 160000 --- a/examples/server/frontend +++ b/examples/server/frontend @@ -1 +1 @@ -Subproject commit 740475a7a6794dc07fb23e8ec5dc56e7e80aa8c1 +Subproject commit 797ccf80825cc035508ba9b599b2a21953e7f835 diff --git a/examples/server/main.cpp b/examples/server/main.cpp index 11a334d5..114d526a 100644 --- a/examples/server/main.cpp +++ b/examples/server/main.cpp @@ -48,7 +48,9 @@ static void parse_args(int argc, if (!svr_params.resolve_and_validate() || !ctx_params.resolve_and_validate(IMG_GEN) || - !default_gen_params.resolve_and_validate(IMG_GEN, ctx_params.lora_model_dir)) { + !default_gen_params.resolve_and_validate(IMG_GEN, + ctx_params.lora_model_dir, + ctx_params.hires_upscalers_dir)) { print_usage(argv[0], options_vec); exit(1); } @@ -95,6 +97,8 @@ int main(int argc, const char** argv) { std::vector lora_cache; std::mutex lora_mutex; + std::vector upscaler_cache; + std::mutex upscaler_mutex; AsyncJobManager async_job_manager; ServerRuntime runtime = { sd_ctx.get(), @@ -104,6 +108,8 @@ int main(int argc, const char** argv) { &default_gen_params, &lora_cache, &lora_mutex, + &upscaler_cache, + &upscaler_mutex, &async_job_manager, }; diff --git a/examples/server/routes_openai.cpp b/examples/server/routes_openai.cpp index af121045..a24383d6 100644 --- a/examples/server/routes_openai.cpp +++ b/examples/server/routes_openai.cpp @@ -70,7 +70,7 @@ static bool build_openai_generation_request(const httplib::Request& req, } // Intentionally disable prompt-embedded LoRA tag parsing for server APIs. - if (!request.gen_params.resolve_and_validate(IMG_GEN, "", true)) { + if (!request.gen_params.resolve_and_validate(IMG_GEN, "", runtime.ctx_params->hires_upscalers_dir, true)) { error_message = "invalid params"; return false; } @@ -212,7 +212,7 @@ static bool build_openai_edit_request(const httplib::Request& req, } // Intentionally disable prompt-embedded LoRA tag parsing for server APIs. 
- if (!request.gen_params.resolve_and_validate(IMG_GEN, "", true)) { + if (!request.gen_params.resolve_and_validate(IMG_GEN, "", runtime.ctx_params->hires_upscalers_dir, true)) { error_message = "invalid params"; return false; } @@ -253,6 +253,12 @@ void register_openai_api_endpoints(httplib::Server& svr, ServerRuntime& rt) { svr.Post("/v1/images/generations", [runtime](const httplib::Request& req, httplib::Response& res) { try { + if (!runtime_supports_generation_mode(*runtime, IMG_GEN)) { + res.status = 400; + res.set_content(json({{"error", unsupported_generation_mode_error(IMG_GEN)}}).dump(), "application/json"); + return; + } + ImgGenJobRequest request; std::string error_message; if (!build_openai_generation_request(req, *runtime, request, error_message)) { @@ -319,6 +325,12 @@ void register_openai_api_endpoints(httplib::Server& svr, ServerRuntime& rt) { svr.Post("/v1/images/edits", [runtime](const httplib::Request& req, httplib::Response& res) { try { + if (!runtime_supports_generation_mode(*runtime, IMG_GEN)) { + res.status = 400; + res.set_content(json({{"error", unsupported_generation_mode_error(IMG_GEN)}}).dump(), "application/json"); + return; + } + ImgGenJobRequest request; std::string error_message; if (!build_openai_edit_request(req, *runtime, request, error_message)) { diff --git a/examples/server/routes_sdapi.cpp b/examples/server/routes_sdapi.cpp index ca6661c0..1e01d292 100644 --- a/examples/server/routes_sdapi.cpp +++ b/examples/server/routes_sdapi.cpp @@ -1,6 +1,7 @@ #include "routes.h" #include +#include #include #include #include @@ -35,14 +36,20 @@ static fs::path resolve_display_model_path(const ServerRuntime& runtime) { return {}; } +static std::string lower_ascii(std::string value) { + std::transform(value.begin(), value.end(), value.begin(), [](unsigned char c) { + return static_cast(std::tolower(c)); + }); + return value; +} + static enum sample_method_t get_sdapi_sample_method(std::string name) { enum sample_method_t result = 
str_to_sample_method(name.c_str()); if (result != SAMPLE_METHOD_COUNT) { return result; } - std::transform(name.begin(), name.end(), name.begin(), - [](unsigned char c) { return static_cast(std::tolower(c)); }); + name = lower_ascii(name); static const std::unordered_map hardcoded{ {"euler a", EULER_A_SAMPLE_METHOD}, {"k_euler_a", EULER_A_SAMPLE_METHOD}, @@ -114,6 +121,18 @@ static bool build_sdapi_img_gen_request(const json& j, request.gen_params.width = j.value("width", -1); request.gen_params.height = j.value("height", -1); + if (!img2img && j.value("enable_hr", false)) { + request.gen_params.hires_enabled = true; + request.gen_params.hires_scale = j.value("hr_scale", request.gen_params.hires_scale); + request.gen_params.hires_width = j.value("hr_resize_x", request.gen_params.hires_width); + request.gen_params.hires_height = j.value("hr_resize_y", request.gen_params.hires_height); + request.gen_params.hires_steps = j.value("hr_steps", request.gen_params.hires_steps); + request.gen_params.hires_denoising_strength = + j.value("denoising_strength", request.gen_params.hires_denoising_strength); + + request.gen_params.hires_upscaler = j.value("hr_upscaler", request.gen_params.hires_upscaler); + } + std::string sd_cpp_extra_args_str = extract_and_remove_sd_cpp_extra_args(request.gen_params.prompt); if (!sd_cpp_extra_args_str.empty() && !request.gen_params.from_json_str(sd_cpp_extra_args_str)) { error_message = "invalid sd_cpp_extra_args"; @@ -228,7 +247,7 @@ static bool build_sdapi_img_gen_request(const json& j, } // Intentionally disable prompt-embedded LoRA tag parsing for server APIs. 
- if (!request.gen_params.resolve_and_validate(IMG_GEN, "", true)) { + if (!request.gen_params.resolve_and_validate(IMG_GEN, "", runtime.ctx_params->hires_upscalers_dir, true)) { error_message = "invalid params"; return false; } @@ -246,6 +265,11 @@ void register_sdapi_endpoints(httplib::Server& svr, ServerRuntime& rt) { res.set_content(R"({"error":"empty body"})", "application/json"); return; } + if (!runtime_supports_generation_mode(*runtime, IMG_GEN)) { + res.status = 400; + res.set_content(json({{"error", unsupported_generation_mode_error(IMG_GEN)}}).dump(), "application/json"); + return; + } json j = json::parse(req.body); ImgGenJobRequest request; @@ -342,6 +366,52 @@ void register_sdapi_endpoints(httplib::Server& svr, ServerRuntime& rt) { res.set_content(result.dump(), "application/json"); }); + svr.Get("/sdapi/v1/upscalers", [runtime](const httplib::Request&, httplib::Response& res) { + refresh_upscaler_cache(*runtime); + + auto make_builtin = [](const char* name) { + json item; + item["name"] = name; + item["model_name"] = nullptr; + item["model_path"] = nullptr; + item["model_url"] = nullptr; + item["scale"] = 4; + return item; + }; + + json result = json::array(); + result.push_back(make_builtin("None")); + result.push_back(make_builtin("Lanczos")); + result.push_back(make_builtin("Nearest")); + + { + std::lock_guard lock(*runtime->upscaler_mutex); + for (const auto& e : *runtime->upscaler_cache) { + json item; + item["name"] = e.name; + item["model_name"] = e.model_name; + item["model_path"] = e.fullpath; + item["model_url"] = nullptr; + item["scale"] = e.scale; + result.push_back(item); + } + } + + res.set_content(result.dump(), "application/json"); + }); + + svr.Get("/sdapi/v1/latent-upscale-modes", [](const httplib::Request&, httplib::Response& res) { + json result = json::array({ + {{"name", "Latent"}}, + {{"name", "Latent (nearest)"}}, + {{"name", "Latent (nearest-exact)"}}, + {{"name", "Latent (antialiased)"}}, + {{"name", "Latent (bicubic)"}}, + 
{{"name", "Latent (bicubic antialiased)"}}, + }); + res.set_content(result.dump(), "application/json"); + }); + svr.Get("/sdapi/v1/samplers", [runtime](const httplib::Request&, httplib::Response& res) { std::vector sampler_names; sampler_names.push_back("default"); diff --git a/examples/server/routes_sdcpp.cpp b/examples/server/routes_sdcpp.cpp index 930033bb..16fe0af4 100644 --- a/examples/server/routes_sdcpp.cpp +++ b/examples/server/routes_sdcpp.cpp @@ -75,48 +75,33 @@ static fs::path resolve_display_model_path(const ServerRuntime& runtime) { return {}; } -static json make_capabilities_json(ServerRuntime& runtime) { - refresh_lora_cache(runtime); - - AsyncJobManager& manager = *runtime.async_job_manager; - const auto& defaults = *runtime.default_gen_params; - const auto& sample_params = defaults.sample_params; - const auto& guidance = sample_params.guidance; - const fs::path model_path = resolve_display_model_path(runtime); - json samplers = json::array(); - json schedulers = json::array(); - json output_formats = json::array({"png", "jpeg"}); - json available_loras = json::array(); - - for (int i = 0; i < SAMPLE_METHOD_COUNT; ++i) { - samplers.push_back(sd_sample_method_name((sample_method_t)i)); - } - - for (int i = 0; i < SCHEDULER_COUNT; ++i) { - schedulers.push_back(sd_scheduler_name((scheduler_t)i)); - } - -#ifdef SD_USE_WEBP - output_formats.push_back("webp"); -#endif - - { - std::lock_guard lock(*runtime.lora_mutex); - for (const auto& entry : *runtime.lora_cache) { - available_loras.push_back({ - {"name", entry.name}, - {"path", entry.path}, - }); - } - } - - json result; - result["model"] = { - {"name", model_path.filename().u8string()}, - {"stem", model_path.stem().u8string()}, - {"path", model_path.u8string()}, +static json make_sample_params_json(const sd_sample_params_t& sample_params, const std::vector& skip_layers) { + const auto& guidance = sample_params.guidance; + return { + {"scheduler", capability_scheduler_name(sample_params.scheduler)}, + 
{"sample_method", capability_sample_method_name(sample_params.sample_method)}, + {"sample_steps", sample_params.sample_steps}, + {"eta", finite_number_or_null(sample_params.eta)}, + {"shifted_timestep", sample_params.shifted_timestep}, + {"flow_shift", finite_number_or_null(sample_params.flow_shift)}, + {"guidance", + { + {"txt_cfg", guidance.txt_cfg}, + {"img_cfg", finite_number_or_null(guidance.img_cfg)}, + {"distilled_guidance", guidance.distilled_guidance}, + {"slg", + { + {"layers", skip_layers}, + {"layer_start", guidance.slg.layer_start}, + {"layer_end", guidance.slg.layer_end}, + {"scale", guidance.slg.scale}, + }}, + }}, }; - result["defaults"] = { +} + +static json make_img_gen_defaults_json(const SDGenerationParams& defaults, const std::string& output_format) { + return { {"prompt", defaults.prompt}, {"negative_prompt", defaults.negative_prompt}, {"clip_skip", defaults.clip_skip}, @@ -128,59 +113,228 @@ static json make_capabilities_json(ServerRuntime& runtime) { {"auto_resize_ref_image", defaults.auto_resize_ref_image}, {"increase_ref_index", defaults.increase_ref_index}, {"control_strength", defaults.control_strength}, - {"sample_params", + {"sample_params", make_sample_params_json(defaults.sample_params, defaults.skip_layers)}, + {"hires", { - {"scheduler", capability_scheduler_name(sample_params.scheduler)}, - {"sample_method", capability_sample_method_name(sample_params.sample_method)}, - {"sample_steps", sample_params.sample_steps}, - {"eta", finite_number_or_null(sample_params.eta)}, - {"shifted_timestep", sample_params.shifted_timestep}, - {"flow_shift", finite_number_or_null(sample_params.flow_shift)}, - {"guidance", - { - {"txt_cfg", guidance.txt_cfg}, - {"img_cfg", finite_number_or_null(guidance.img_cfg)}, - {"distilled_guidance", guidance.distilled_guidance}, - {"slg", - { - {"layers", defaults.skip_layers}, - {"layer_start", guidance.slg.layer_start}, - {"layer_end", guidance.slg.layer_end}, - {"scale", guidance.slg.scale}, - }}, - }}, + 
{"enabled", defaults.hires_enabled}, + {"upscaler", defaults.hires_upscaler}, + {"scale", defaults.hires_scale}, + {"target_width", defaults.hires_width}, + {"target_height", defaults.hires_height}, + {"steps", defaults.hires_steps}, + {"denoising_strength", defaults.hires_denoising_strength}, + {"upscale_tile_size", defaults.hires_upscale_tile_size}, }}, {"vae_tiling_params", make_vae_tiling_json(defaults.vae_tiling_params)}, {"cache_mode", defaults.cache_mode}, {"cache_option", defaults.cache_option}, {"scm_mask", defaults.scm_mask}, {"scm_policy_dynamic", defaults.scm_policy_dynamic}, - {"output_format", "png"}, + {"output_format", output_format}, {"output_compression", 100}, }; - result["limits"] = { - {"min_width", 64}, - {"max_width", 4096}, - {"min_height", 64}, - {"max_height", 4096}, - {"max_batch_count", 8}, - {"max_queue_size", manager.max_pending_jobs}, +} + +static json make_vid_gen_defaults_json(const SDGenerationParams& defaults, const std::string& output_format) { + return { + {"prompt", defaults.prompt}, + {"negative_prompt", defaults.negative_prompt}, + {"clip_skip", defaults.clip_skip}, + {"width", defaults.width > 0 ? defaults.width : 512}, + {"height", defaults.height > 0 ? 
defaults.height : 512}, + {"strength", defaults.strength}, + {"seed", defaults.seed}, + {"video_frames", defaults.video_frames}, + {"fps", defaults.fps}, + {"moe_boundary", defaults.moe_boundary}, + {"vace_strength", defaults.vace_strength}, + {"sample_params", make_sample_params_json(defaults.sample_params, defaults.skip_layers)}, + {"high_noise_sample_params", make_sample_params_json(defaults.high_noise_sample_params, defaults.high_noise_skip_layers)}, + {"vae_tiling_params", make_vae_tiling_json(defaults.vae_tiling_params)}, + {"cache_mode", defaults.cache_mode}, + {"cache_option", defaults.cache_option}, + {"scm_mask", defaults.scm_mask}, + {"scm_policy_dynamic", defaults.scm_policy_dynamic}, + {"output_format", output_format}, + {"output_compression", 100}, }; - result["samplers"] = samplers; - result["schedulers"] = schedulers; - result["output_formats"] = output_formats; - result["features"] = { - {"init_image", true}, - {"mask_image", true}, - {"control_image", true}, - {"ref_images", true}, - {"lora", true}, - {"vae_tiling", true}, - {"cache", true}, +} + +static json make_img_gen_features_json() { + return { + {"init_image", true}, + {"mask_image", true}, + {"control_image", true}, + {"ref_images", true}, + {"lora", true}, + {"vae_tiling", true}, + {"hires", true}, + {"cache", true}, + {"cancel_queued", true}, + {"cancel_generating", false}, + }; +} + +static json make_vid_gen_features_json() { + return { + {"init_image", true}, + {"end_image", true}, + {"control_frames", true}, + {"high_noise_sample_params", true}, + {"lora", true}, + {"vae_tiling", true}, + {"cache", true}, + {"cancel_queued", true}, + {"cancel_generating", false}, + }; +} + +static json make_capabilities_json(ServerRuntime& runtime) { + refresh_lora_cache(runtime); + refresh_upscaler_cache(runtime); + + AsyncJobManager& manager = *runtime.async_job_manager; + const auto& defaults = *runtime.default_gen_params; + const fs::path model_path = resolve_display_model_path(runtime); + const 
bool supports_img = runtime_supports_generation_mode(runtime, IMG_GEN); + const bool supports_vid = runtime_supports_generation_mode(runtime, VID_GEN); + json samplers = json::array(); + json schedulers = json::array(); + json image_output_formats = supported_img_output_formats(); + json video_output_formats = supported_vid_output_formats(); + json available_loras = json::array(); + json available_upscalers = json::array(); + json supported_modes = json::array(); + + for (int i = 0; i < SAMPLE_METHOD_COUNT; ++i) { + samplers.push_back(sd_sample_method_name((sample_method_t)i)); + } + + for (int i = 0; i < SCHEDULER_COUNT; ++i) { + schedulers.push_back(sd_scheduler_name((scheduler_t)i)); + } + + { + std::lock_guard lock(*runtime.lora_mutex); + for (const auto& entry : *runtime.lora_cache) { + available_loras.push_back({ + {"name", entry.name}, + {"path", entry.path}, + }); + } + } + + available_upscalers.push_back({ + {"name", "None"}, + }); + available_upscalers.push_back({ + {"name", "Lanczos"}, + }); + available_upscalers.push_back({ + {"name", "Nearest"}, + }); + available_upscalers.push_back({ + {"name", "Latent"}, + }); + available_upscalers.push_back({ + {"name", "Latent (nearest)"}, + }); + available_upscalers.push_back({ + {"name", "Latent (nearest-exact)"}, + }); + available_upscalers.push_back({ + {"name", "Latent (antialiased)"}, + }); + available_upscalers.push_back({ + {"name", "Latent (bicubic)"}, + }); + available_upscalers.push_back({ + {"name", "Latent (bicubic antialiased)"}, + }); + { + std::lock_guard lock(*runtime.upscaler_mutex); + for (const auto& entry : *runtime.upscaler_cache) { + available_upscalers.push_back({ + {"name", entry.name}, + }); + } + } + + if (supports_img) { + supported_modes.push_back("img_gen"); + } + if (supports_vid) { + supported_modes.push_back("vid_gen"); + } + + std::string default_img_output_format = "png"; + std::string default_vid_output_format = "avi"; + if (!image_output_formats.empty()) { + 
default_img_output_format = image_output_formats[0].get(); + } + if (!video_output_formats.empty()) { + default_vid_output_format = video_output_formats[0].get(); + } + + json defaults_by_mode = json::object(); + json output_formats_by_mode = json::object(); + json features_by_mode = json::object(); + if (supports_img) { + defaults_by_mode["img_gen"] = make_img_gen_defaults_json(defaults, default_img_output_format); + output_formats_by_mode["img_gen"] = image_output_formats; + features_by_mode["img_gen"] = make_img_gen_features_json(); + } + if (supports_vid) { + defaults_by_mode["vid_gen"] = make_vid_gen_defaults_json(defaults, default_vid_output_format); + output_formats_by_mode["vid_gen"] = video_output_formats; + features_by_mode["vid_gen"] = make_vid_gen_features_json(); + } + + json top_level_defaults = json::object(); + json top_level_output_formats = json::array(); + json top_level_features = { {"cancel_queued", true}, {"cancel_generating", false}, }; - result["loras"] = available_loras; + std::string current_mode = ""; + if (supports_img) { + current_mode = "img_gen"; + top_level_defaults = defaults_by_mode["img_gen"]; + top_level_output_formats = output_formats_by_mode["img_gen"]; + top_level_features = features_by_mode["img_gen"]; + } else if (supports_vid) { + current_mode = "vid_gen"; + top_level_defaults = defaults_by_mode["vid_gen"]; + top_level_output_formats = output_formats_by_mode["vid_gen"]; + top_level_features = features_by_mode["vid_gen"]; + } + + json result; + result["model"] = { + {"name", model_path.filename().u8string()}, + {"stem", model_path.stem().u8string()}, + {"path", model_path.u8string()}, + }; + result["current_mode"] = current_mode; + result["supported_modes"] = supported_modes; + result["defaults"] = top_level_defaults; + result["defaults_by_mode"] = defaults_by_mode; + result["limits"] = { + {"min_width", 64}, + {"max_width", 4096}, + {"min_height", 64}, + {"max_height", 4096}, + {"max_batch_count", 8}, + {"max_queue_size", 
manager.max_pending_jobs}, + }; + result["samplers"] = samplers; + result["schedulers"] = schedulers; + result["output_formats"] = top_level_output_formats; + result["output_formats_by_mode"] = output_formats_by_mode; + result["features"] = top_level_features; + result["features_by_mode"] = features_by_mode; + result["loras"] = available_loras; + result["upscalers"] = available_upscalers; return result; } @@ -204,7 +358,34 @@ static bool parse_img_gen_request(const json& body, return false; } // Intentionally disable prompt-embedded LoRA tag parsing for server APIs. - if (!request.gen_params.resolve_and_validate(IMG_GEN, "", true)) { + if (!request.gen_params.resolve_and_validate(IMG_GEN, "", runtime.ctx_params->hires_upscalers_dir, true)) { + error_message = "invalid generation parameters"; + return false; + } + return true; +} + +static bool parse_vid_gen_request(const json& body, + ServerRuntime& runtime, + VidGenJobRequest& request, + std::string& error_message) { + request.gen_params = *runtime.default_gen_params; + + refresh_lora_cache(runtime); + if (!request.gen_params.from_json_str(body.dump(), [&](const std::string& path) { + return get_lora_full_path(runtime, path); + })) { + error_message = "invalid generation parameters"; + return false; + } + + std::string output_format = body.value("output_format", "webm"); + int output_compression = body.value("output_compression", 100); + if (!assign_output_options(request, output_format, output_compression, error_message)) { + return false; + } + // Intentionally disable prompt-embedded LoRA tag parsing for server APIs. 
+ if (!request.gen_params.resolve_and_validate(VID_GEN, "", runtime.ctx_params->hires_upscalers_dir, true)) { error_message = "invalid generation parameters"; return false; } @@ -226,6 +407,11 @@ void register_sdcpp_api_endpoints(httplib::Server& svr, ServerRuntime& rt) { res.set_content(R"({"error":"empty body"})", "application/json"); return; } + if (!runtime_supports_generation_mode(*runtime, IMG_GEN)) { + res.status = 400; + res.set_content(json({{"error", unsupported_generation_mode_error(IMG_GEN)}}).dump(), "application/json"); + return; + } json body = json::parse(req.body); ImgGenJobRequest request; @@ -276,9 +462,66 @@ void register_sdcpp_api_endpoints(httplib::Server& svr, ServerRuntime& rt) { } }); - svr.Post("/sdcpp/v1/vid_gen", [](const httplib::Request&, httplib::Response& res) { - res.status = 501; - res.set_content(R"({"error":"vid_gen is reserved and not implemented yet"})", "application/json"); + svr.Post("/sdcpp/v1/vid_gen", [runtime](const httplib::Request& req, httplib::Response& res) { + try { + if (req.body.empty()) { + res.status = 400; + res.set_content(R"({"error":"empty body"})", "application/json"); + return; + } + if (!runtime_supports_generation_mode(*runtime, VID_GEN)) { + res.status = 400; + res.set_content(json({{"error", unsupported_generation_mode_error(VID_GEN)}}).dump(), "application/json"); + return; + } + + json body = json::parse(req.body); + VidGenJobRequest request; + std::string error_message; + if (!parse_vid_gen_request(body, *runtime, request, error_message)) { + res.status = 400; + res.set_content(json({{"error", error_message}}).dump(), "application/json"); + return; + } + + AsyncJobManager& manager = *runtime->async_job_manager; + std::shared_ptr job = std::make_shared(); + job->kind = AsyncJobKind::VidGen; + job->status = AsyncJobStatus::Queued; + job->created_at = unix_timestamp_now(); + job->vid_gen = std::move(request); + + { + std::lock_guard lock(manager.mutex); + purge_expired_jobs(manager); + if 
(count_pending_jobs(manager) >= manager.max_pending_jobs) { + res.status = 429; + res.set_content(R"({"error":"job queue is full"})", "application/json"); + return; + } + job->id = make_async_job_id(manager); + manager.jobs[job->id] = job; + manager.queue.push_back(job->id); + } + + manager.cv.notify_one(); + + json out; + out["id"] = job->id; + out["kind"] = async_job_kind_name(job->kind); + out["status"] = async_job_status_name(job->status); + out["created"] = job->created_at; + out["poll_url"] = "/sdcpp/v1/jobs/" + job->id; + + res.status = 202; + res.set_content(out.dump(), "application/json"); + } catch (const json::parse_error& e) { + res.status = 400; + res.set_content(json({{"error", "invalid json"}, {"message", e.what()}}).dump(), "application/json"); + } catch (const std::exception& e) { + res.status = 500; + res.set_content(json({{"error", "server_error"}, {"message", e.what()}}).dump(), "application/json"); + } }); svr.Get(R"(/sdcpp/v1/jobs/([A-Za-z0-9_\-]+))", [runtime](const httplib::Request& req, httplib::Response& res) { diff --git a/examples/server/runtime.cpp b/examples/server/runtime.cpp index c29799e3..afadb62a 100644 --- a/examples/server/runtime.cpp +++ b/examples/server/runtime.cpp @@ -1,6 +1,7 @@ #include "runtime.h" #include +#include #include #include #include @@ -13,6 +14,18 @@ namespace fs = std::filesystem; +static std::string lower_ascii(std::string value) { + std::transform(value.begin(), value.end(), value.begin(), [](unsigned char c) { + return static_cast(std::tolower(c)); + }); + return value; +} + +static bool is_supported_model_ext(const fs::path& p) { + auto ext = lower_ascii(p.extension().string()); + return ext == ".gguf" || ext == ".pt" || ext == ".pth" || ext == ".safetensors"; +} + static const std::string k_base64_chars = "ABCDEFGHIJKLMNOPQRSTUVWXYZ" "abcdefghijklmnopqrstuvwxyz" @@ -45,6 +58,44 @@ std::string normalize_output_format(std::string output_format) { return output_format; } +std::vector 
supported_img_output_formats(bool allow_webp) { + std::vector formats = {"png", "jpeg"}; +#ifdef SD_USE_WEBP + if (allow_webp) { + formats.push_back("webp"); + } +#else + (void)allow_webp; +#endif + return formats; +} + +std::vector supported_vid_output_formats() { + std::vector formats; +#ifdef SD_USE_WEBM + formats.push_back("webm"); +#endif +#ifdef SD_USE_WEBP + formats.push_back("webp"); +#endif + formats.push_back("avi"); + return formats; +} + +static std::string valid_vid_output_formats_message() { + const std::vector formats = supported_vid_output_formats(); + + std::string message = "invalid output_format, must be one of ["; + for (size_t i = 0; i < formats.size(); ++i) { + if (i > 0) { + message += ", "; + } + message += formats[i]; + } + message += "]"; + return message; +} + bool assign_output_options(ImgGenJobRequest& request, std::string output_format, int output_compression, @@ -53,19 +104,88 @@ bool assign_output_options(ImgGenJobRequest& request, request.output_format = normalize_output_format(std::move(output_format)); request.output_compression = std::clamp(output_compression, 0, 100); - const bool valid_format = request.output_format == "png" || - request.output_format == "jpeg" || - (allow_webp && request.output_format == "webp"); + const std::vector valid_formats = supported_img_output_formats(allow_webp); + const bool valid_format = std::find(valid_formats.begin(), + valid_formats.end(), + request.output_format) != valid_formats.end(); if (!valid_format) { - error_message = allow_webp - ? 
"invalid output_format, must be one of [png, jpeg, webp]" - : "invalid output_format, must be one of [png, jpeg]"; + error_message = "invalid output_format, must be one of ["; + for (size_t i = 0; i < valid_formats.size(); ++i) { + if (i > 0) { + error_message += ", "; + } + error_message += valid_formats[i]; + } + error_message += "]"; return false; } return true; } +bool assign_output_options(VidGenJobRequest& request, + std::string output_format, + int output_compression, + std::string& error_message) { + request.output_format = normalize_output_format(std::move(output_format)); + request.output_compression = std::clamp(output_compression, 0, 100); + + if (request.output_format == "avi") { + return true; + } + + if (request.output_format == "webm") { +#ifdef SD_USE_WEBM + return true; +#else + error_message = valid_vid_output_formats_message(); + return false; +#endif + } + + if (request.output_format == "webp") { +#ifdef SD_USE_WEBP + return true; +#else + error_message = valid_vid_output_formats_message(); + return false; +#endif + } + + error_message = valid_vid_output_formats_message(); + return false; +} + +std::string video_mime_type(const std::string& output_format) { + if (output_format == "webm") { + return "video/webm"; + } + if (output_format == "webp") { + return "image/webp"; + } + return "video/x-msvideo"; +} + +bool runtime_supports_generation_mode(const ServerRuntime& runtime, SDMode mode) { + if (mode == VID_GEN) { + return sd_ctx_supports_video_generation(runtime.sd_ctx); + } + if (mode == IMG_GEN) { + return sd_ctx_supports_image_generation(runtime.sd_ctx); + } + return true; +} + +std::string unsupported_generation_mode_error(SDMode mode) { + if (mode == VID_GEN) { + return "loaded model does not support vid_gen"; + } + if (mode == IMG_GEN) { + return "loaded model does not support img_gen"; + } + return "loaded model does not support requested mode"; +} + ArgOptions SDSvrParams::get_options() { ArgOptions options; @@ -134,20 +254,12 @@ void 
refresh_lora_cache(ServerRuntime& rt) { fs::path lora_dir = rt.ctx_params->lora_model_dir; if (fs::exists(lora_dir) && fs::is_directory(lora_dir)) { - auto is_lora_ext = [](const fs::path& p) { - auto ext = p.extension().string(); - std::transform(ext.begin(), ext.end(), ext.begin(), [](unsigned char c) { - return static_cast(std::tolower(c)); - }); - return ext == ".gguf" || ext == ".pt" || ext == ".pth" || ext == ".safetensors"; - }; - for (auto& entry : fs::recursive_directory_iterator(lora_dir)) { if (!entry.is_regular_file()) { continue; } const fs::path& p = entry.path(); - if (!is_lora_ext(p)) { + if (!is_supported_model_ext(p)) { continue; } @@ -179,6 +291,40 @@ std::string get_lora_full_path(ServerRuntime& rt, const std::string& path) { return it != rt.lora_cache->end() ? it->fullpath : ""; } +void refresh_upscaler_cache(ServerRuntime& rt) { + std::vector new_cache; + + fs::path upscaler_dir = rt.ctx_params->hires_upscalers_dir; + if (fs::exists(upscaler_dir) && fs::is_directory(upscaler_dir)) { + for (auto& entry : fs::directory_iterator(upscaler_dir)) { + if (!entry.is_regular_file()) { + continue; + } + const fs::path& p = entry.path(); + if (!is_supported_model_ext(p)) { + continue; + } + + UpscalerEntry upscaler_entry; + upscaler_entry.name = p.stem().u8string(); + upscaler_entry.fullpath = fs::absolute(p).lexically_normal().u8string(); + upscaler_entry.model_name = "ESRGAN_4x"; + upscaler_entry.path = p.filename().u8string(); + + new_cache.push_back(std::move(upscaler_entry)); + } + } + + std::sort(new_cache.begin(), new_cache.end(), [](const UpscalerEntry& a, const UpscalerEntry& b) { + return a.name < b.name; + }); + + { + std::lock_guard lock(*rt.upscaler_mutex); + *rt.upscaler_cache = std::move(new_cache); + } +} + int64_t unix_timestamp_now() { return std::chrono::duration_cast( std::chrono::system_clock::now().time_since_epoch()) diff --git a/examples/server/runtime.h b/examples/server/runtime.h index 65e93243..5c5f2d48 100644 --- 
a/examples/server/runtime.h +++ b/examples/server/runtime.h @@ -37,6 +37,14 @@ struct LoraEntry { std::string fullpath; }; +struct UpscalerEntry { + std::string name; + std::string path; + std::string fullpath; + std::string model_name; + int scale = 4; +}; + struct ServerRuntime { sd_ctx_t* sd_ctx; std::mutex* sd_ctx_mutex; @@ -45,6 +53,8 @@ struct ServerRuntime { const SDGenerationParams* default_gen_params; std::vector* lora_cache; std::mutex* lora_mutex; + std::vector* upscaler_cache; + std::mutex* upscaler_mutex; AsyncJobManager* async_job_manager; }; @@ -58,13 +68,33 @@ struct ImgGenJobRequest { } }; +struct VidGenJobRequest { + SDGenerationParams gen_params; + std::string output_format = "webm"; + int output_compression = 100; + + sd_vid_gen_params_t to_sd_vid_gen_params_t() { + return gen_params.to_sd_vid_gen_params_t(); + } +}; + std::string base64_encode(const std::vector& bytes); std::string normalize_output_format(std::string output_format); +std::vector supported_img_output_formats(bool allow_webp = true); +std::vector supported_vid_output_formats(); bool assign_output_options(ImgGenJobRequest& request, std::string output_format, int output_compression, bool allow_webp, std::string& error_message); +bool assign_output_options(VidGenJobRequest& request, + std::string output_format, + int output_compression, + std::string& error_message); +std::string video_mime_type(const std::string& output_format); +bool runtime_supports_generation_mode(const ServerRuntime& runtime, SDMode mode); +std::string unsupported_generation_mode_error(SDMode mode); void refresh_lora_cache(ServerRuntime& rt); std::string get_lora_full_path(ServerRuntime& rt, const std::string& path); +void refresh_upscaler_cache(ServerRuntime& rt); int64_t unix_timestamp_now(); diff --git a/format-code.sh b/format-code.sh index 2e87da41..8aa422bc 100644 --- a/format-code.sh +++ b/format-code.sh @@ -1,5 +1,5 @@ -for f in src/*.cpp src/*.h src/*.hpp src/vocab/*.h src/vocab/*.cpp \ - 
examples/cli/*.cpp examples/cli/*.h examples/server/*.cpp \ +for f in src/*.cpp src/*.h src/*.hpp src/tokenizers/*.h src/tokenizers/*.cpp src/tokenizers/vocab/*.h src/tokenizers/vocab/*.cpp \ + src/model_io/*.h src/model_io/*.cpp examples/cli/*.cpp examples/cli/*.h examples/server/*.cpp \ examples/common/*.hpp examples/common/*.h examples/common/*.cpp; do [[ "$f" == vocab* ]] && continue echo "formatting '$f'" diff --git a/include/stable-diffusion.h b/include/stable-diffusion.h index f093bb56..75027f8f 100644 --- a/include/stable-diffusion.h +++ b/include/stable-diffusion.h @@ -50,6 +50,7 @@ enum sample_method_t { TCD_SAMPLE_METHOD, RES_MULTISTEP_SAMPLE_METHOD, RES_2S_SAMPLE_METHOD, + ER_SDE_SAMPLE_METHOD, SAMPLE_METHOD_COUNT }; @@ -288,6 +289,32 @@ typedef struct { const char* path; } sd_lora_t; +enum sd_hires_upscaler_t { + SD_HIRES_UPSCALER_NONE, + SD_HIRES_UPSCALER_LATENT, + SD_HIRES_UPSCALER_LATENT_NEAREST, + SD_HIRES_UPSCALER_LATENT_NEAREST_EXACT, + SD_HIRES_UPSCALER_LATENT_ANTIALIASED, + SD_HIRES_UPSCALER_LATENT_BICUBIC, + SD_HIRES_UPSCALER_LATENT_BICUBIC_ANTIALIASED, + SD_HIRES_UPSCALER_LANCZOS, + SD_HIRES_UPSCALER_NEAREST, + SD_HIRES_UPSCALER_MODEL, + SD_HIRES_UPSCALER_COUNT, +}; + +typedef struct { + bool enabled; + enum sd_hires_upscaler_t upscaler; + const char* model_path; + float scale; + int target_width; + int target_height; + int steps; + float denoising_strength; + int upscale_tile_size; +} sd_hires_params_t; + typedef struct { const sd_lora_t* loras; uint32_t lora_count; @@ -311,6 +338,7 @@ typedef struct { sd_pm_params_t pm_params; sd_tiling_params_t vae_tiling_params; sd_cache_params_t cache; + sd_hires_params_t hires; } sd_img_gen_params_t; typedef struct { @@ -347,6 +375,8 @@ SD_API void sd_set_progress_callback(sd_progress_cb_t cb, void* data); SD_API void sd_set_preview_callback(sd_preview_cb_t cb, enum preview_t mode, int interval, bool denoised, bool noisy, void* data); SD_API int32_t sd_get_num_physical_cores(); SD_API const char* 
sd_get_system_info(); +SD_API bool sd_ctx_supports_image_generation(const sd_ctx_t* sd_ctx); +SD_API bool sd_ctx_supports_video_generation(const sd_ctx_t* sd_ctx); SD_API const char* sd_type_name(enum sd_type_t type); SD_API enum sd_type_t str_to_sd_type(const char* str); @@ -362,8 +392,11 @@ SD_API const char* sd_preview_name(enum preview_t preview); SD_API enum preview_t str_to_preview(const char* str); SD_API const char* sd_lora_apply_mode_name(enum lora_apply_mode_t mode); SD_API enum lora_apply_mode_t str_to_lora_apply_mode(const char* str); +SD_API const char* sd_hires_upscaler_name(enum sd_hires_upscaler_t upscaler); +SD_API enum sd_hires_upscaler_t str_to_sd_hires_upscaler(const char* str); SD_API void sd_cache_params_init(sd_cache_params_t* cache_params); +SD_API void sd_hires_params_init(sd_hires_params_t* hires_params); SD_API void sd_ctx_params_init(sd_ctx_params_t* sd_ctx_params); SD_API char* sd_ctx_params_to_str(const sd_ctx_params_t* sd_ctx_params); diff --git a/src/auto_encoder_kl.hpp b/src/auto_encoder_kl.hpp index d4283959..5cf09b88 100644 --- a/src/auto_encoder_kl.hpp +++ b/src/auto_encoder_kl.hpp @@ -533,7 +533,7 @@ public: const std::string& prefix = "") : version(version), decode_only(decode_only), use_video_decoder(use_video_decoder) { if (sd_version_is_dit(version)) { - if (sd_version_is_flux2(version)) { + if (sd_version_uses_flux2_vae(version)) { dd_config.z_channels = 32; embed_dim = 32; } else { @@ -578,7 +578,7 @@ public: ggml_tensor* decode(GGMLRunnerContext* ctx, ggml_tensor* z) { // z: [N, z_channels, h, w] - if (sd_version_is_flux2(version)) { + if (sd_version_uses_flux2_vae(version)) { // [N, C*p*p, h, w] -> [N, C, h*p, w*p] int64_t p = 2; @@ -617,7 +617,7 @@ public: auto quant_conv = std::dynamic_pointer_cast(blocks["quant_conv"]); z = quant_conv->forward(ctx, z); // [N, 2*embed_dim, h/8, w/8] } - if (sd_version_is_flux2(version)) { + if (sd_version_uses_flux2_vae(version)) { z = ggml_ext_chunk(ctx->ggml_ctx, z, 2, 2)[0]; // [N, 
C, H, W] -> [N, C*p*p, H/p, W/p] @@ -640,7 +640,7 @@ public: int get_encoder_output_channels() { int factor = dd_config.double_z ? 2 : 1; - if (sd_version_is_flux2(version)) { + if (sd_version_uses_flux2_vae(version)) { return dd_config.z_channels * 4; } return dd_config.z_channels * factor; @@ -673,7 +673,7 @@ struct AutoEncoderKL : public VAE { } else if (sd_version_is_flux(version) || sd_version_is_z_image(version)) { scale_factor = 0.3611f; shift_factor = 0.1159f; - } else if (sd_version_is_flux2(version)) { + } else if (sd_version_uses_flux2_vae(version)) { scale_factor = 1.0f; shift_factor = 0.f; } @@ -747,7 +747,7 @@ struct AutoEncoderKL : public VAE { } sd::Tensor vae_output_to_latents(const sd::Tensor& vae_output, std::shared_ptr rng) override { - if (sd_version_is_flux2(version)) { + if (sd_version_uses_flux2_vae(version)) { return vae_output; } else if (version == VERSION_SD1_PIX2PIX) { return sd::ops::chunk(vae_output, 2, 2)[0]; @@ -758,7 +758,7 @@ struct AutoEncoderKL : public VAE { std::pair, sd::Tensor> get_latents_mean_std(const sd::Tensor& latents, int channel_dim) { GGML_ASSERT(channel_dim >= 0 && static_cast(channel_dim) < static_cast(latents.dim())); - if (sd_version_is_flux2(version)) { + if (sd_version_uses_flux2_vae(version)) { GGML_ASSERT(latents.shape()[channel_dim] == 128); std::vector stats_shape(static_cast(latents.dim()), 1); stats_shape[static_cast(channel_dim)] = latents.shape()[channel_dim]; @@ -804,7 +804,7 @@ struct AutoEncoderKL : public VAE { } sd::Tensor diffusion_to_vae_latents(const sd::Tensor& latents) override { - if (sd_version_is_flux2(version)) { + if (sd_version_uses_flux2_vae(version)) { int channel_dim = 2; auto [mean_tensor, std_tensor] = get_latents_mean_std(latents, channel_dim); return (latents * std_tensor) / scale_factor + mean_tensor; @@ -813,7 +813,7 @@ struct AutoEncoderKL : public VAE { } sd::Tensor vae_to_diffusion_latents(const sd::Tensor& latents) override { - if (sd_version_is_flux2(version)) { + if 
(sd_version_uses_flux2_vae(version)) { int channel_dim = 2; auto [mean_tensor, std_tensor] = get_latents_mean_std(latents, channel_dim); return ((latents - mean_tensor) * scale_factor) / std_tensor; diff --git a/src/clip.hpp b/src/clip.hpp index 8f2ac064..8a2070e0 100644 --- a/src/clip.hpp +++ b/src/clip.hpp @@ -3,455 +3,7 @@ #include "ggml_extend.hpp" #include "model.h" -#include "tokenize_util.h" -#include "vocab/vocab.h" - -/*================================================== CLIPTokenizer ===================================================*/ - -__STATIC_INLINE__ std::vector> bytes_to_unicode() { - std::vector> byte_unicode_pairs; - std::set byte_set; - for (int b = static_cast('!'); b <= static_cast('~'); ++b) { - byte_set.insert(b); - byte_unicode_pairs.push_back(std::pair(b, unicode_value_to_utf32(b))); - } - for (int b = 161; b <= 172; ++b) { - byte_set.insert(b); - byte_unicode_pairs.push_back(std::pair(b, unicode_value_to_utf32(b))); - } - for (int b = 174; b <= 255; ++b) { - byte_set.insert(b); - byte_unicode_pairs.push_back(std::pair(b, unicode_value_to_utf32(b))); - } - int n = 0; - for (int b = 0; b < 256; ++b) { - if (byte_set.find(b) == byte_set.end()) { - byte_unicode_pairs.push_back(std::pair(b, unicode_value_to_utf32(n + 256))); - ++n; - } - } - // LOG_DEBUG("byte_unicode_pairs %d", byte_unicode_pairs.size()); - return byte_unicode_pairs; -} - -// Ref: https://github.com/openai/CLIP/blob/main/clip/simple_tokenizer.py - -typedef std::function&)> on_new_token_cb_t; - -class CLIPTokenizer { -private: - std::map byte_encoder; - std::map byte_decoder; - std::map encoder; - std::map decoder; - std::map, int> bpe_ranks; - std::regex pat; - int encoder_len; - int bpe_len; - - std::vector special_tokens; - -public: - const std::string UNK_TOKEN = "<|endoftext|>"; - const std::string BOS_TOKEN = "<|startoftext|>"; - const std::string EOS_TOKEN = "<|endoftext|>"; - const std::string PAD_TOKEN = "<|endoftext|>"; - - const int UNK_TOKEN_ID = 49407; - const int 
BOS_TOKEN_ID = 49406; - const int EOS_TOKEN_ID = 49407; - const int PAD_TOKEN_ID = 49407; - -private: - static std::string strip(const std::string& str) { - std::string::size_type start = str.find_first_not_of(" \t\n\r\v\f"); - std::string::size_type end = str.find_last_not_of(" \t\n\r\v\f"); - - if (start == std::string::npos) { - // String contains only whitespace characters - return ""; - } - - return str.substr(start, end - start + 1); - } - - static std::string whitespace_clean(std::string text) { - text = std::regex_replace(text, std::regex(R"(\s+)"), " "); - text = strip(text); - return text; - } - - static std::set> get_pairs(const std::vector& subwords) { - std::set> pairs; - if (subwords.size() == 0) { - return pairs; - } - std::u32string prev_subword = subwords[0]; - for (int i = 1; i < subwords.size(); i++) { - std::u32string subword = subwords[i]; - std::pair pair(prev_subword, subword); - pairs.insert(pair); - prev_subword = subword; - } - return pairs; - } - - bool is_special_token(const std::string& token) { - for (auto& special_token : special_tokens) { - if (special_token == token) { - return true; - } - } - return false; - } - -public: - CLIPTokenizer(int pad_token_id = 49407, const std::string& merges_utf8_str = "") - : PAD_TOKEN_ID(pad_token_id) { - if (merges_utf8_str.size() > 0) { - load_from_merges(merges_utf8_str); - } else { - load_from_merges(load_clip_merges()); - } - add_special_token("<|startoftext|>"); - add_special_token("<|endoftext|>"); - } - - void load_from_merges(const std::string& merges_utf8_str) { - auto byte_unicode_pairs = bytes_to_unicode(); - // printf("byte_unicode_pairs have %lu pairs \n", byte_unicode_pairs.size()); - byte_encoder = std::map(byte_unicode_pairs.begin(), byte_unicode_pairs.end()); - for (auto& pair : byte_unicode_pairs) { - byte_decoder[pair.second] = pair.first; - } - // for (auto & pair: byte_unicode_pairs) { - // std::cout << pair.first << ": " << pair.second << std::endl; - // } - std::vector merges; 
- size_t start = 0; - size_t pos; - std::u32string merges_utf32_str = utf8_to_utf32(merges_utf8_str); - while ((pos = merges_utf32_str.find('\n', start)) != std::string::npos) { - merges.push_back(merges_utf32_str.substr(start, pos - start)); - start = pos + 1; - } - // LOG_DEBUG("merges size %llu", merges.size()); - GGML_ASSERT(merges.size() == 48895); - merges = std::vector(merges.begin() + 1, merges.end()); - std::vector> merge_pairs; - for (const auto& merge : merges) { - size_t space_pos = merge.find(' '); - merge_pairs.emplace_back(merge.substr(0, space_pos), merge.substr(space_pos + 1)); - // LOG_DEBUG("%s", utf32_to_utf8(merge.substr(space_pos + 1)).c_str()); - // printf("%s :: %s | %s \n", utf32_to_utf8(merge).c_str(), utf32_to_utf8(merge.substr(0, space_pos)).c_str(), - // utf32_to_utf8(merge.substr(space_pos + 1)).c_str()); - } - std::vector vocab; - for (const auto& pair : byte_unicode_pairs) { - vocab.push_back(pair.second); - } - for (const auto& pair : byte_unicode_pairs) { - vocab.push_back(pair.second + utf8_to_utf32("")); - } - for (const auto& merge : merge_pairs) { - vocab.push_back(merge.first + merge.second); - } - vocab.push_back(utf8_to_utf32("<|startoftext|>")); - vocab.push_back(utf8_to_utf32("<|endoftext|>")); - LOG_DEBUG("vocab size: %llu", vocab.size()); - int i = 0; - for (const auto& token : vocab) { - encoder[token] = i; - decoder[i] = token; - i++; - } - encoder_len = i; - - auto it = encoder.find(utf8_to_utf32("img")); - if (it != encoder.end()) { - LOG_DEBUG("trigger word img already in vocab"); - } else { - LOG_DEBUG("trigger word img not in vocab yet"); - } - - int rank = 0; - for (const auto& merge : merge_pairs) { - bpe_ranks[merge] = rank++; - } - bpe_len = rank; - }; - - void add_token(const std::string& text) { - std::u32string token = utf8_to_utf32(text); - auto it = encoder.find(token); - if (it != encoder.end()) { - encoder[token] = encoder_len; - decoder[encoder_len] = token; - encoder_len++; - } - } - - void 
add_special_token(const std::string& token) { - special_tokens.push_back(token); - } - - std::u32string bpe(const std::u32string& token) { - std::vector word; - - for (int i = 0; i < token.size() - 1; i++) { - word.emplace_back(1, token[i]); - } - word.push_back(token.substr(token.size() - 1) + utf8_to_utf32("")); - - std::set> pairs = get_pairs(word); - - if (pairs.empty()) { - return token + utf8_to_utf32(""); - } - - while (true) { - auto min_pair_iter = std::min_element(pairs.begin(), - pairs.end(), - [&](const std::pair& a, - const std::pair& b) { - if (bpe_ranks.find(a) == bpe_ranks.end()) { - return false; - } else if (bpe_ranks.find(b) == bpe_ranks.end()) { - return true; - } - return bpe_ranks.at(a) < bpe_ranks.at(b); - }); - - const std::pair& bigram = *min_pair_iter; - - if (bpe_ranks.find(bigram) == bpe_ranks.end()) { - break; - } - - std::u32string first = bigram.first; - std::u32string second = bigram.second; - std::vector new_word; - int32_t i = 0; - - while (i < word.size()) { - auto it = std::find(word.begin() + i, word.end(), first); - if (it == word.end()) { - new_word.insert(new_word.end(), word.begin() + i, word.end()); - break; - } - new_word.insert(new_word.end(), word.begin() + i, it); - i = static_cast(std::distance(word.begin(), it)); - - if (word[i] == first && i < static_cast(word.size()) - 1 && word[i + 1] == second) { - new_word.push_back(first + second); - i += 2; - } else { - new_word.push_back(word[i]); - i += 1; - } - } - - word = new_word; - - if (word.size() == 1) { - break; - } - pairs = get_pairs(word); - } - - std::u32string result; - for (int i = 0; i < word.size(); i++) { - result += word[i]; - if (i != word.size() - 1) { - result += utf8_to_utf32(" "); - } - } - - return result; - } - - std::vector tokenize(std::string text, - on_new_token_cb_t on_new_token_cb, - size_t max_length = 0, - bool padding = false) { - std::vector tokens = encode(text, on_new_token_cb); - - tokens.insert(tokens.begin(), BOS_TOKEN_ID); - if 
(max_length > 0) { - if (tokens.size() > max_length - 1) { - tokens.resize(max_length - 1); - tokens.push_back(EOS_TOKEN_ID); - } else { - tokens.push_back(EOS_TOKEN_ID); - if (padding) { - tokens.insert(tokens.end(), max_length - tokens.size(), PAD_TOKEN_ID); - } - } - } - - return tokens; - } - - void pad_tokens(std::vector& tokens, - std::vector& weights, - size_t max_length = 0, - bool padding = false) { - if (max_length > 0 && padding) { - size_t n = static_cast(std::ceil(tokens.size() * 1.0 / (max_length - 2))); - if (n == 0) { - n = 1; - } - size_t length = max_length * n; - LOG_DEBUG("token length: %llu", length); - std::vector new_tokens; - std::vector new_weights; - new_tokens.push_back(BOS_TOKEN_ID); - new_weights.push_back(1.0); - int token_idx = 0; - for (int i = 1; i < length; i++) { - if (token_idx >= tokens.size()) { - break; - } - if (i % max_length == 0) { - new_tokens.push_back(BOS_TOKEN_ID); - new_weights.push_back(1.0); - } else if (i % max_length == max_length - 1) { - new_tokens.push_back(EOS_TOKEN_ID); - new_weights.push_back(1.0); - } else { - new_tokens.push_back(tokens[token_idx]); - new_weights.push_back(weights[token_idx]); - token_idx++; - } - } - - new_tokens.push_back(EOS_TOKEN_ID); - new_weights.push_back(1.0); - tokens = new_tokens; - weights = new_weights; - - if (padding) { - tokens.insert(tokens.end(), length - tokens.size(), PAD_TOKEN_ID); - weights.insert(weights.end(), length - weights.size(), 1.0); - } - } - } - - std::string clean_up_tokenization(std::string& text) { - std::regex pattern(R"( ,)"); - // Replace " ," with "," - std::string result = std::regex_replace(text, pattern, ","); - return result; - } - - std::string decode(const std::vector& tokens) { - std::string text = ""; - for (int t : tokens) { - if (t == 49406 || t == 49407) - continue; - std::u32string ts = decoder[t]; - // printf("%d, %s \n", t, utf32_to_utf8(ts).c_str()); - std::string s = utf32_to_utf8(ts); - if (s.length() >= 4) { - if (ends_with(s, "")) { 
- text += s.replace(s.length() - 4, s.length() - 1, "") + " "; - } else { - text += s; - } - } else { - text += " " + s; - } - } - // std::vector bytes; - // for (auto c : text){ - // bytes.push_back(byte_decoder[c]); - // } - - // std::string s((char *)bytes.data()); - // std::string s = ""; - text = clean_up_tokenization(text); - return trim(text); - } - - std::vector token_split(const std::string& text) { - std::regex pat(R"('s|'t|'re|'ve|'m|'ll|'d|[[:alpha:]]+|[[:digit:]]|[^[:space:][:alpha:][:digit:]]+)", - std::regex::icase); - std::sregex_iterator iter(text.begin(), text.end(), pat); - std::sregex_iterator end; - - std::vector result; - for (; iter != end; ++iter) { - result.emplace_back(iter->str()); - } - - return result; - } - - std::vector encode(std::string text, on_new_token_cb_t on_new_token_cb) { - std::string original_text = text; - std::vector bpe_tokens; - text = whitespace_clean(text); - std::transform(text.begin(), text.end(), text.begin(), [](unsigned char c) { return std::tolower(c); }); - - std::string str = text; - std::vector token_strs; - - auto splited_texts = split_with_special_tokens(text, special_tokens); - - for (auto& splited_text : splited_texts) { - LOG_DEBUG("token %s", splited_text.c_str()); - if (is_special_token(splited_text)) { - LOG_DEBUG("special %s", splited_text.c_str()); - bool skip = on_new_token_cb(splited_text, bpe_tokens); - if (skip) { - token_strs.push_back(splited_text); - continue; - } - continue; - } - - auto tokens = token_split(splited_text); - for (auto& token : tokens) { - if (on_new_token_cb != nullptr) { - bool skip = on_new_token_cb(token, bpe_tokens); - if (skip) { - token_strs.push_back(token); - continue; - } - } - - std::string token_str = token; - std::u32string utf32_token; - for (int i = 0; i < token_str.length(); i++) { - unsigned char b = token_str[i]; - utf32_token += byte_encoder[b]; - } - auto bpe_strs = bpe(utf32_token); - size_t start = 0; - size_t pos; - while ((pos = bpe_strs.find(' ', 
start)) != std::u32string::npos) { - auto bpe_str = bpe_strs.substr(start, pos - start); - bpe_tokens.push_back(encoder[bpe_str]); - token_strs.push_back(utf32_to_utf8(bpe_str)); - - start = pos + 1; - } - auto bpe_str = bpe_strs.substr(start, bpe_strs.size() - start); - bpe_tokens.push_back(encoder[bpe_str]); - token_strs.push_back(utf32_to_utf8(bpe_str)); - } - } - // std::stringstream ss; - // ss << "["; - // for (auto token : token_strs) { - // ss << "\"" << token << "\", "; - // } - // ss << "]"; - // LOG_DEBUG("split prompt \"%s\" to tokens %s", original_text.c_str(), ss.str().c_str()); - // printf("split prompt \"%s\" to tokens %s \n", original_text.c_str(), ss.str().c_str()); - return bpe_tokens; - } -}; +#include "tokenizers/clip_tokenizer.h" /*================================================ FrozenCLIPEmbedder ================================================*/ diff --git a/src/common_block.hpp b/src/common_block.hpp index 2cef389a..112a4d7a 100644 --- a/src/common_block.hpp +++ b/src/common_block.hpp @@ -277,6 +277,7 @@ protected: int64_t context_dim; int64_t n_head; int64_t d_head; + bool xtra_dim = false; public: CrossAttention(int64_t query_dim, @@ -288,7 +289,11 @@ public: query_dim(query_dim), context_dim(context_dim) { int64_t inner_dim = d_head * n_head; - + if (context_dim == 320 && d_head == 320) { + // LOG_DEBUG("CrossAttention: temp set dim to 1024 for sdxs_09"); + xtra_dim = true; + context_dim = 1024; + } blocks["to_q"] = std::shared_ptr(new Linear(query_dim, inner_dim, false)); blocks["to_k"] = std::shared_ptr(new Linear(context_dim, inner_dim, false)); blocks["to_v"] = std::shared_ptr(new Linear(context_dim, inner_dim, false)); @@ -313,10 +318,16 @@ public: int64_t n_context = context->ne[1]; int64_t inner_dim = d_head * n_head; - auto q = to_q->forward(ctx, x); // [N, n_token, inner_dim] + auto q = to_q->forward(ctx, x); // [N, n_token, inner_dim] + if (xtra_dim) { + // LOG_DEBUG("CrossAttention: temp set dim to 1024 for sdxs_09"); + 
context->ne[0] = 1024; // patch dim + } auto k = to_k->forward(ctx, context); // [N, n_context, inner_dim] auto v = to_v->forward(ctx, context); // [N, n_context, inner_dim] - + if (xtra_dim) { + context->ne[0] = 320; // reset dim to orig + } x = ggml_ext_attention_ext(ctx->ggml_ctx, ctx->backend, q, k, v, n_head, nullptr, false, ctx->flash_attn_enabled); // [N, n_token, inner_dim] x = to_out_0->forward(ctx, x); // [N, n_token, query_dim] diff --git a/src/conditioner.hpp b/src/conditioner.hpp index 5564373e..9f4d4552 100644 --- a/src/conditioner.hpp +++ b/src/conditioner.hpp @@ -256,15 +256,6 @@ struct FrozenCLIPEmbedderWithCustomWords : public Conditioner { return true; } - std::tuple, std::vector, std::vector> - tokenize_with_trigger_token(std::string text, - int num_input_imgs, - int32_t image_token, - bool padding = false) { - return tokenize_with_trigger_token(text, num_input_imgs, image_token, - text_model->model.n_token, padding); - } - std::vector convert_token_to_id(std::string text) { auto on_new_token_cb = [&](std::string& str, std::vector& bpe_tokens) -> bool { auto iter = embedding_map.find(str); @@ -288,9 +279,7 @@ struct FrozenCLIPEmbedderWithCustomWords : public Conditioner { std::tuple, std::vector, std::vector> tokenize_with_trigger_token(std::string text, int num_input_imgs, - int32_t image_token, - size_t max_length = 0, - bool padding = false) { + int32_t image_token) { auto parsed_attention = parse_prompt_attention(text); { @@ -377,7 +366,7 @@ struct FrozenCLIPEmbedderWithCustomWords : public Conditioner { // tokens.insert(tokens.begin(), tokenizer.BOS_TOKEN_ID); // weights.insert(weights.begin(), 1.0); - tokenizer.pad_tokens(tokens, weights, max_length, padding); + tokenizer.pad_tokens(tokens, &weights, nullptr, text_model->model.n_token, text_model->model.n_token, true); int offset = pm_version == PM_VERSION_2 ? 
2 * num_input_imgs : num_input_imgs; for (int i = 0; i < tokens.size(); i++) { // if (class_idx + 1 <= i && i < class_idx + 1 + 2*num_input_imgs) // photomaker V2 has num_tokens(=2)*num_input_imgs @@ -403,13 +392,9 @@ struct FrozenCLIPEmbedderWithCustomWords : public Conditioner { } std::pair, std::vector> tokenize(std::string text, - bool padding = false) { - return tokenize(text, text_model->model.n_token, padding); - } - - std::pair, std::vector> tokenize(std::string text, - size_t max_length = 0, - bool padding = false) { + size_t min_length = 0, + size_t max_length = 0, + bool allow_overflow_expand = true) { auto parsed_attention = parse_prompt_attention(text); { @@ -460,7 +445,7 @@ struct FrozenCLIPEmbedderWithCustomWords : public Conditioner { weights.insert(weights.end(), curr_tokens.size(), curr_weight); } - tokenizer.pad_tokens(tokens, weights, max_length, padding); + tokenizer.pad_tokens(tokens, &weights, nullptr, min_length, max_length, allow_overflow_expand); // for (int i = 0; i < tokens.size(); i++) { // std::cout << tokens[i] << ":" << weights[i] << ", "; @@ -603,8 +588,7 @@ struct FrozenCLIPEmbedderWithCustomWords : public Conditioner { GGML_ASSERT(image_tokens.size() == 1); auto tokens_and_weights = tokenize_with_trigger_token(conditioner_params.text, conditioner_params.num_input_imgs, - image_tokens[0], - true); + image_tokens[0]); std::vector& tokens = std::get<0>(tokens_and_weights); std::vector& weights = std::get<1>(tokens_and_weights); std::vector& clsm = std::get<2>(tokens_and_weights); @@ -630,7 +614,7 @@ struct FrozenCLIPEmbedderWithCustomWords : public Conditioner { std::string remove_trigger_from_prompt(const std::string& prompt) override { auto image_tokens = convert_token_to_id(trigger_word); GGML_ASSERT(image_tokens.size() == 1); - auto tokens_and_weights = tokenize(prompt, false); + auto tokens_and_weights = tokenize(prompt); std::vector& tokens = tokens_and_weights.first; auto it = std::find(tokens.begin(), tokens.end(), 
image_tokens[0]); GGML_ASSERT(it != tokens.end()); // prompt must have trigger word @@ -640,7 +624,7 @@ struct FrozenCLIPEmbedderWithCustomWords : public Conditioner { SDCondition get_learned_condition(int n_threads, const ConditionerParams& conditioner_params) override { - auto tokens_and_weights = tokenize(conditioner_params.text, true); + auto tokens_and_weights = tokenize(conditioner_params.text, text_model->model.n_token, text_model->model.n_token, true); std::vector& tokens = tokens_and_weights.first; std::vector& weights = tokens_and_weights.second; return get_learned_condition_common(n_threads, @@ -822,8 +806,9 @@ struct SD3CLIPEmbedder : public Conditioner { } std::vector, std::vector>> tokenize(std::string text, - size_t max_length = 0, - bool padding = false) { + size_t min_length = 0, + size_t max_length = 0, + bool allow_overflow_expand = true) { auto parsed_attention = parse_prompt_attention(text); { @@ -860,20 +845,20 @@ struct SD3CLIPEmbedder : public Conditioner { clip_g_weights.insert(clip_g_weights.end(), curr_tokens.size(), curr_weight); } if (t5) { - std::vector curr_tokens = t5_tokenizer.Encode(curr_text, true); + std::vector curr_tokens = t5_tokenizer.encode(curr_text); t5_tokens.insert(t5_tokens.end(), curr_tokens.begin(), curr_tokens.end()); t5_weights.insert(t5_weights.end(), curr_tokens.size(), curr_weight); } } if (clip_l) { - clip_l_tokenizer.pad_tokens(clip_l_tokens, clip_l_weights, max_length, padding); + clip_l_tokenizer.pad_tokens(clip_l_tokens, &clip_l_weights, nullptr, min_length, max_length, allow_overflow_expand); } if (clip_g) { - clip_g_tokenizer.pad_tokens(clip_g_tokens, clip_g_weights, max_length, padding); + clip_g_tokenizer.pad_tokens(clip_g_tokens, &clip_g_weights, nullptr, min_length, max_length, allow_overflow_expand); } if (t5) { - t5_tokenizer.pad_tokens(t5_tokens, t5_weights, nullptr, max_length, padding); + t5_tokenizer.pad_tokens(t5_tokens, &t5_weights, nullptr, min_length, max_length, true); } // for (int i = 0; i 
< clip_l_tokens.size(); i++) { @@ -1056,7 +1041,7 @@ struct SD3CLIPEmbedder : public Conditioner { SDCondition get_learned_condition(int n_threads, const ConditionerParams& conditioner_params) override { - auto tokens_and_weights = tokenize(conditioner_params.text, 77, true); + auto tokens_and_weights = tokenize(conditioner_params.text, 77, 77, true); return get_learned_condition_common(n_threads, tokens_and_weights, conditioner_params.clip_skip, @@ -1158,8 +1143,8 @@ struct FluxCLIPEmbedder : public Conditioner { } std::vector, std::vector>> tokenize(std::string text, - size_t max_length = 0, - bool padding = false) { + size_t min_length = 0, + size_t max_length = 0) { auto parsed_attention = parse_prompt_attention(text); { @@ -1189,17 +1174,17 @@ struct FluxCLIPEmbedder : public Conditioner { clip_l_weights.insert(clip_l_weights.end(), curr_tokens.size(), curr_weight); } if (t5) { - std::vector curr_tokens = t5_tokenizer.Encode(curr_text, true); + std::vector curr_tokens = t5_tokenizer.encode(curr_text); t5_tokens.insert(t5_tokens.end(), curr_tokens.begin(), curr_tokens.end()); t5_weights.insert(t5_weights.end(), curr_tokens.size(), curr_weight); } } if (clip_l) { - clip_l_tokenizer.pad_tokens(clip_l_tokens, clip_l_weights, 77, padding); + clip_l_tokenizer.pad_tokens(clip_l_tokens, &clip_l_weights, nullptr, 77, 77, true); } if (t5) { - t5_tokenizer.pad_tokens(t5_tokens, t5_weights, nullptr, max_length, padding); + t5_tokenizer.pad_tokens(t5_tokens, &t5_weights, nullptr, min_length, max_length, true); } // for (int i = 0; i < clip_l_tokens.size(); i++) { @@ -1300,7 +1285,7 @@ struct FluxCLIPEmbedder : public Conditioner { SDCondition get_learned_condition(int n_threads, const ConditionerParams& conditioner_params) override { - auto tokens_and_weights = tokenize(conditioner_params.text, chunk_len, true); + auto tokens_and_weights = tokenize(conditioner_params.text, chunk_len, chunk_len); return get_learned_condition_common(n_threads, tokens_and_weights, 
conditioner_params.clip_skip, @@ -1377,8 +1362,8 @@ struct T5CLIPEmbedder : public Conditioner { } std::tuple, std::vector, std::vector> tokenize(std::string text, - size_t max_length = 0, - bool padding = false) { + size_t min_length = 0, + size_t max_length = 0) { auto parsed_attention = parse_prompt_attention(text); { @@ -1403,12 +1388,15 @@ struct T5CLIPEmbedder : public Conditioner { const std::string& curr_text = item.first; float curr_weight = item.second; - std::vector curr_tokens = t5_tokenizer.Encode(curr_text, true); + std::vector curr_tokens = t5_tokenizer.encode(curr_text); t5_tokens.insert(t5_tokens.end(), curr_tokens.begin(), curr_tokens.end()); t5_weights.insert(t5_weights.end(), curr_tokens.size(), curr_weight); } - t5_tokenizer.pad_tokens(t5_tokens, t5_weights, &t5_mask, max_length, padding); + t5_tokenizer.pad_tokens(t5_tokens, &t5_weights, &t5_mask, min_length, max_length, true); + for (auto& mask_value : t5_mask) { + mask_value = mask_value > 0.0f ? 0.0f : -HUGE_VALF; + } } return {t5_tokens, t5_weights, t5_mask}; } @@ -1496,7 +1484,7 @@ struct T5CLIPEmbedder : public Conditioner { SDCondition get_learned_condition(int n_threads, const ConditionerParams& conditioner_params) override { - auto tokens_and_weights = tokenize(conditioner_params.text, chunk_len, true); + auto tokens_and_weights = tokenize(conditioner_params.text, chunk_len, chunk_len); return get_learned_condition_common(n_threads, tokens_and_weights, conditioner_params.clip_skip, @@ -1505,14 +1493,14 @@ struct T5CLIPEmbedder : public Conditioner { }; struct AnimaConditioner : public Conditioner { - std::shared_ptr qwen_tokenizer; + std::shared_ptr qwen_tokenizer; T5UniGramTokenizer t5_tokenizer; std::shared_ptr llm; AnimaConditioner(ggml_backend_t backend, bool offload_params_to_cpu, const String2TensorStorage& tensor_storage_map = {}) { - qwen_tokenizer = std::make_shared(); + qwen_tokenizer = std::make_shared(); llm = std::make_shared(LLM::LLMArch::QWEN3, backend, 
offload_params_to_cpu, @@ -1578,7 +1566,7 @@ struct AnimaConditioner : public Conditioner { for (const auto& item : parsed_attention) { const std::string& curr_text = item.first; float curr_weight = item.second; - std::vector curr_tokens = t5_tokenizer.Encode(curr_text, true); + std::vector curr_tokens = t5_tokenizer.tokenize(curr_text, nullptr, true); t5_tokens.insert(t5_tokens.end(), curr_tokens.begin(), curr_tokens.end()); t5_weights.insert(t5_weights.end(), curr_tokens.size(), curr_weight); } @@ -1620,7 +1608,7 @@ struct AnimaConditioner : public Conditioner { struct LLMEmbedder : public Conditioner { SDVersion version; - std::shared_ptr tokenizer; + std::shared_ptr tokenizer; std::shared_ptr llm; LLMEmbedder(ggml_backend_t backend, @@ -1633,13 +1621,15 @@ struct LLMEmbedder : public Conditioner { LLM::LLMArch arch = LLM::LLMArch::QWEN2_5_VL; if (version == VERSION_FLUX2) { arch = LLM::LLMArch::MISTRAL_SMALL_3_2; + } else if (sd_version_is_ernie_image(version)) { + arch = LLM::LLMArch::MINISTRAL_3_3B; } else if (sd_version_is_z_image(version) || version == VERSION_OVIS_IMAGE || version == VERSION_FLUX2_KLEIN) { arch = LLM::LLMArch::QWEN3; } - if (arch == LLM::LLMArch::MISTRAL_SMALL_3_2) { - tokenizer = std::make_shared(); + if (arch == LLM::LLMArch::MISTRAL_SMALL_3_2 || arch == LLM::LLMArch::MINISTRAL_3_3B) { + tokenizer = std::make_shared(); } else { - tokenizer = std::make_shared(); + tokenizer = std::make_shared(); } llm = std::make_shared(arch, backend, @@ -1677,20 +1667,24 @@ struct LLMEmbedder : public Conditioner { } } - std::tuple, std::vector> tokenize(std::string text, - const std::pair& attn_range, - size_t max_length = 0, - bool padding = false) { + std::tuple, std::vector, std::vector> tokenize(std::string text, + const std::pair& attn_range, + size_t min_length = 0, + size_t max_length = 100000000) { std::vector> parsed_attention; if (attn_range.first >= 0 && attn_range.second > 0) { - parsed_attention.emplace_back(text.substr(0, 
attn_range.first), 1.f); + if (attn_range.first > 0) { + parsed_attention.emplace_back(text.substr(0, attn_range.first), 1.f); + } if (attn_range.second - attn_range.first > 0) { auto new_parsed_attention = parse_prompt_attention(text.substr(attn_range.first, attn_range.second - attn_range.first)); parsed_attention.insert(parsed_attention.end(), new_parsed_attention.begin(), new_parsed_attention.end()); } - parsed_attention.emplace_back(text.substr(attn_range.second), 1.f); + if (attn_range.second < text.size()) { + parsed_attention.emplace_back(text.substr(attn_range.second), 1.f); + } } else { parsed_attention.emplace_back(text, 1.f); } @@ -1710,39 +1704,34 @@ struct LLMEmbedder : public Conditioner { for (const auto& item : parsed_attention) { const std::string& curr_text = item.first; float curr_weight = item.second; - std::vector curr_tokens = tokenizer->tokenize(curr_text, nullptr); + std::vector curr_tokens = tokenizer->encode(curr_text, nullptr); tokens.insert(tokens.end(), curr_tokens.begin(), curr_tokens.end()); weights.insert(weights.end(), curr_tokens.size(), curr_weight); } - tokenizer->pad_tokens(tokens, weights, max_length, padding); + std::vector mask; + tokenizer->pad_tokens(tokens, &weights, &mask, min_length, max_length); // for (int i = 0; i < tokens.size(); i++) { // std::cout << tokens[i] << ":" << weights[i] << ", " << i << std::endl; // } // std::cout << std::endl; - return {tokens, weights}; + return {tokens, weights, mask}; } sd::Tensor encode_prompt(int n_threads, const std::string prompt, const std::pair& prompt_attn_range, - int max_length, int min_length, + int hidden_states_min_length, const std::vector>>& image_embeds, const std::set& out_layers, int prompt_template_encode_start_idx) { - auto tokens_and_weights = tokenize(prompt, prompt_attn_range); - auto& tokens = std::get<0>(tokens_and_weights); - auto& weights = std::get<1>(tokens_and_weights); - std::vector mask; - - if (max_length > 0 && tokens.size() < max_length) { - 
mask.insert(mask.end(), tokens.size(), 1.f); - mask.insert(mask.end(), max_length - tokens.size(), 0.f); - tokenizer->pad_tokens(tokens, weights, max_length, true); - } + auto tokens_weights_mask = tokenize(prompt, prompt_attn_range, min_length); + auto& tokens = std::get<0>(tokens_weights_mask); + auto& weights = std::get<1>(tokens_weights_mask); + auto& mask = std::get<2>(tokens_weights_mask); sd::Tensor input_ids({static_cast(tokens.size())}, tokens); sd::Tensor attention_mask; @@ -1769,9 +1758,9 @@ struct LLMEmbedder : public Conditioner { GGML_ASSERT(hidden_states.shape()[1] > prompt_template_encode_start_idx); int64_t zero_pad_len = 0; - if (min_length > 0) { - if (hidden_states.shape()[1] - prompt_template_encode_start_idx < min_length) { - zero_pad_len = min_length - hidden_states.shape()[1] + prompt_template_encode_start_idx; + if (hidden_states_min_length > 0) { + if (hidden_states.shape()[1] - prompt_template_encode_start_idx < hidden_states_min_length) { + zero_pad_len = hidden_states_min_length - hidden_states.shape()[1] + prompt_template_encode_start_idx; } } @@ -1798,8 +1787,8 @@ struct LLMEmbedder : public Conditioner { std::vector> extra_prompts_attn_range; std::vector>> image_embeds; int prompt_template_encode_start_idx = 34; - int max_length = 0; // pad tokens - int min_length = 0; // zero pad hidden_states + int min_length = 0; // pad tokens + int hidden_states_min_length = 0; // zero pad hidden_states std::set out_layers; int64_t t0 = ggml_time_ms(); @@ -1874,7 +1863,7 @@ struct LLMEmbedder : public Conditioner { } } else if (version == VERSION_FLUX2) { prompt_template_encode_start_idx = 0; - min_length = 512; + hidden_states_min_length = 512; out_layers = {10, 20, 30}; prompt = "[SYSTEM_PROMPT]You are an AI that reasons about image descriptions. 
You give structured responses focusing on object relationships, object\nattribution and actions without speculation.[/SYSTEM_PROMPT][INST]"; @@ -1884,6 +1873,13 @@ struct LLMEmbedder : public Conditioner { prompt_attn_range.second = static_cast(prompt.size()); prompt += "[/INST]"; + } else if (sd_version_is_ernie_image(version)) { + prompt_template_encode_start_idx = 0; + out_layers = {25}; // -2 + + prompt_attn_range.first = 0; + prompt += conditioner_params.text; + prompt_attn_range.second = static_cast(prompt.size()); } else if (sd_version_is_z_image(version)) { prompt_template_encode_start_idx = 0; out_layers = {35}; // -2 @@ -1907,7 +1903,7 @@ struct LLMEmbedder : public Conditioner { } } else if (version == VERSION_FLUX2_KLEIN) { prompt_template_encode_start_idx = 0; - max_length = 512; + min_length = 512; out_layers = {9, 18, 27}; prompt = "<|im_start|>user\n"; @@ -1919,7 +1915,7 @@ struct LLMEmbedder : public Conditioner { prompt += "<|im_end|>\n<|im_start|>assistant\n\n\n\n\n"; } else if (version == VERSION_OVIS_IMAGE) { prompt_template_encode_start_idx = 28; - max_length = prompt_template_encode_start_idx + 256; + min_length = prompt_template_encode_start_idx + 256; prompt = "<|im_start|>user\nDescribe the image by detailing the color, quantity, text, shape, size, texture, spatial relationships of the objects and background:"; @@ -1935,8 +1931,8 @@ struct LLMEmbedder : public Conditioner { auto hidden_states = encode_prompt(n_threads, prompt, prompt_attn_range, - max_length, min_length, + hidden_states_min_length, image_embeds, out_layers, prompt_template_encode_start_idx); @@ -1945,8 +1941,8 @@ struct LLMEmbedder : public Conditioner { auto extra_hidden_states = encode_prompt(n_threads, extra_prompts[i], extra_prompts_attn_range[i], - max_length, min_length, + hidden_states_min_length, image_embeds, out_layers, prompt_template_encode_start_idx); diff --git a/src/convert.cpp b/src/convert.cpp new file mode 100644 index 00000000..7cae8df0 --- /dev/null +++ 
b/src/convert.cpp @@ -0,0 +1,138 @@ +#include +#include +#include +#include + +#include "model.h" +#include "model_io/gguf_io.h" +#include "model_io/safetensors_io.h" +#include "util.h" + +#include "ggml-cpu.h" + +static ggml_type get_export_tensor_type(ModelLoader& model_loader, + const TensorStorage& tensor_storage, + ggml_type type, + const TensorTypeRules& tensor_type_rules) { + const std::string& name = tensor_storage.name; + ggml_type tensor_type = tensor_storage.type; + ggml_type dst_type = type; + + for (const auto& tensor_type_rule : tensor_type_rules) { + std::regex pattern(tensor_type_rule.first); + if (std::regex_search(name, pattern)) { + dst_type = tensor_type_rule.second; + break; + } + } + + if (model_loader.tensor_should_be_converted(tensor_storage, dst_type)) { + tensor_type = dst_type; + } + + return tensor_type; +} + +static bool load_tensors_for_export(ModelLoader& model_loader, + ggml_context* ggml_ctx, + ggml_type type, + const TensorTypeRules& tensor_type_rules, + std::vector& tensors) { + std::mutex tensor_mutex; + auto on_new_tensor_cb = [&](const TensorStorage& tensor_storage, ggml_tensor** dst_tensor) -> bool { + const std::string& name = tensor_storage.name; + ggml_type tensor_type = get_export_tensor_type(model_loader, tensor_storage, type, tensor_type_rules); + + std::lock_guard lock(tensor_mutex); + ggml_tensor* tensor = ggml_new_tensor(ggml_ctx, tensor_type, tensor_storage.n_dims, tensor_storage.ne); + if (tensor == nullptr) { + LOG_ERROR("ggml_new_tensor failed"); + return false; + } + ggml_set_name(tensor, name.c_str()); + + if (!tensor->data) { + GGML_ASSERT(ggml_nelements(tensor) == 0); + // Avoid crashing writers by setting a dummy pointer for zero-sized tensors. 
+ LOG_DEBUG("setting dummy pointer for zero-sized tensor %s", name.c_str()); + tensor->data = ggml_get_mem_buffer(ggml_ctx); + } + + TensorWriteInfo write_info; + write_info.tensor = tensor; + write_info.n_dims = tensor_storage.n_dims; + for (int i = 0; i < tensor_storage.n_dims; ++i) { + write_info.ne[i] = tensor_storage.ne[i]; + } + + *dst_tensor = tensor; + tensors.push_back(std::move(write_info)); + + return true; + }; + + bool success = model_loader.load_tensors(on_new_tensor_cb); + LOG_INFO("load tensors done"); + return success; +} + +bool convert(const char* input_path, + const char* vae_path, + const char* output_path, + sd_type_t output_type, + const char* tensor_type_rules, + bool convert_name) { + ModelLoader model_loader; + + if (!model_loader.init_from_file(input_path)) { + LOG_ERROR("init model loader from file failed: '%s'", input_path); + return false; + } + + if (vae_path != nullptr && strlen(vae_path) > 0) { + if (!model_loader.init_from_file(vae_path, "vae.")) { + LOG_ERROR("init model loader from file failed: '%s'", vae_path); + return false; + } + } + if (convert_name) { + model_loader.convert_tensors_name(); + } + + ggml_type type = (ggml_type)output_type; + bool output_is_safetensors = ends_with(output_path, ".safetensors"); + TensorTypeRules type_rules = parse_tensor_type_rules(tensor_type_rules); + + auto backend = ggml_backend_cpu_init(); + size_t mem_size = 1 * 1024 * 1024; // for padding + mem_size += model_loader.get_tensor_storage_map().size() * ggml_tensor_overhead(); + mem_size += model_loader.get_params_mem_size(backend, type); + LOG_INFO("model tensors mem size: %.2fMB", mem_size / 1024.f / 1024.f); + ggml_context* ggml_ctx = ggml_init({mem_size, nullptr, false}); + + if (ggml_ctx == nullptr) { + LOG_ERROR("ggml_init failed for converter"); + ggml_backend_free(backend); + return false; + } + + std::vector tensors; + bool success = load_tensors_for_export(model_loader, ggml_ctx, type, type_rules, tensors); + 
ggml_backend_free(backend); + + std::string error; + if (success) { + if (output_is_safetensors) { + success = write_safetensors_file(output_path, tensors, &error); + } else { + success = write_gguf_file(output_path, tensors, &error); + } + } + + if (!success && !error.empty()) { + LOG_ERROR("%s", error.c_str()); + } + + ggml_free(ggml_ctx); + return success; +} diff --git a/src/denoiser.hpp b/src/denoiser.hpp index c9c9d881..a6e81d59 100644 --- a/src/denoiser.hpp +++ b/src/denoiser.hpp @@ -953,8 +953,9 @@ static sd::Tensor sample_dpmpp_2s_ancestral(denoise_cb_t model, float t_next = t_fn(sigma_down); float h = t_next - t; float s = t + 0.5f * h; - sd::Tensor x2 = (sigma_fn(s) / sigma_fn(t)) * x - (exp(-h * 0.5f) - 1) * denoised; - auto denoised2_opt = model(x2, sigmas[i + 1], i + 1); + float sigma_s = sigma_fn(s); + sd::Tensor x2 = (sigma_s / sigma_fn(t)) * x - (exp(-h * 0.5f) - 1) * denoised; + auto denoised2_opt = model(x2, sigma_s, i + 1); if (denoised2_opt.empty()) { return {}; } @@ -969,6 +970,100 @@ static sd::Tensor sample_dpmpp_2s_ancestral(denoise_cb_t model, return x; } +static sd::Tensor sample_dpmpp_2s_ancestral_flow(denoise_cb_t model, + sd::Tensor x, + const std::vector& sigmas, + std::shared_ptr rng, + float eta = 1.0f) { + int steps = static_cast(sigmas.size()) - 1; + for (int i = 0; i < steps; i++) { + float sigma = sigmas[i]; + float sigma_to = sigmas[i + 1]; + + bool opt_first_step = (1.0 - sigma < 1e-6); + + auto denoised_opt = model(x, sigma, (opt_first_step ? 
1 : -1) * (i + 1)); + if (denoised_opt.empty()) { + return {}; + } + sd::Tensor denoised = std::move(denoised_opt); + + if (sigma_to == 0.0f) { + // Euler method (final step, no noise) + // sigma_to == 0 --> sigma_down = 0, so: + // x + d * (sigma_down - sigma) + // = x + ((x - denoised) / sigma) * (sigma_down - sigma) + // = x + ((x - denoised) / sigma) * ( 0 - sigma) + // = x + ((x - denoised) ) * -1 + // = x -x + denoised + x = denoised; + + } else { + auto [sigma_down, sigma_up, alpha_scale] = get_ancestral_step_flow(sigma, sigma_to, eta); + sd::Tensor D_i; + + if (opt_first_step) { + // the reformulated exp_s calc already accounts for this, but we can avoid + // a redundant model call for the typical sigma 1 at the first step: + // exp_s = sqrt((1-sigma)/sigma * (1-sigma_down)/sigma_down) + // = sqrt((1- 1)/ 1 * (1-sigma_down)/sigma_down) + // = 0 + // so sigma_s = 1 = sigma, and sigma_s_i_ratio = sigma_s / sigma = 1 + // u = (x*sigma_s_i_ratio)+(denoised*(1.0f-sigma_s_i_ratio)) + // = (x*1)+(denoised*0) = x + // so D_i = model(u, sigma_s, i + 1) + // = model(x, sigma, i + 1) + // = denoised + D_i = denoised; + + } else { + float sigma_s; + + // ref implementation would be: + // auto lambda_fn = [](float sigma) -> float { + // return std::log((1.0f - sigma) / sigma); }; + // auto sigma_fn = [](float lbda) -> float { + // return 1.0f / (std::exp(lbda) + 1.0f); }; + // t_i = lambda_fn(sigma); + // t_down = lambda_fn(sigma_down); + // float r = 0.5f; + // h = t_down - t_i; + // s = t_i + r * h; + // sigma_s = sigma_fn(s); + + // assuming r is constant, we sidestep the singularity at sigma -> 1 by: + // s = 0.5 * (lambda_fn(sigma) + lambda_fn(sigma_down)) + // = 0.5 * (log((1-sigma)/sigma) + log((1-sigma_down)/sigma_down)) + // = 0.5 * log(((1-sigma)/sigma) * ((1-sigma_down)/sigma_down)) + // = log(sqrt (((1-sigma)/sigma) * ((1-sigma_down)/sigma_down))) + // so exp(s) = sqrt((1-sigma)/sigma * (1-sigma_down)/sigma_down) + // and sigma_s = sigma_fn(s) = 1.0f / 
(exp(s) + 1.0f) + + float exp_s = std::sqrt(((1 - sigma) / sigma) * ((1 - sigma_down) / sigma_down)); + sigma_s = 1.0f / (exp_s + 1.0f); + + float sigma_s_i_ratio = sigma_s / sigma; + sd::Tensor u = (x * sigma_s_i_ratio) + (denoised * (1.0f - sigma_s_i_ratio)); + + auto denoised2_opt = model(u, sigma_s, i + 1); + if (denoised2_opt.empty()) { + return {}; + } + D_i = std::move(denoised2_opt); + } + + float sigma_down_i_ratio = sigma_down / sigma; + x = (x * sigma_down_i_ratio) + (D_i * (1.0f - sigma_down_i_ratio)); + + if (sigma_to > 0.0f && eta > 0.0f) { + x = alpha_scale * x + sd::Tensor::randn_like(x, rng) * sigma_up; + } + } + } + + return x; +} + static sd::Tensor sample_dpmpp_2m(denoise_cb_t model, sd::Tensor x, const std::vector& sigmas) { @@ -1040,7 +1135,8 @@ static sd::Tensor sample_dpmpp_2m_v2(denoise_cb_t model, static sd::Tensor sample_lcm(denoise_cb_t model, sd::Tensor x, const std::vector& sigmas, - std::shared_ptr rng) { + std::shared_ptr rng, + bool is_flow_denoiser) { int steps = static_cast(sigmas.size()) - 1; for (int i = 0; i < steps; i++) { auto denoised_opt = model(x, sigmas[i], i + 1); @@ -1049,6 +1145,9 @@ static sd::Tensor sample_lcm(denoise_cb_t model, } x = std::move(denoised_opt); if (sigmas[i + 1] > 0) { + if (is_flow_denoiser) { + x *= (1 - sigmas[i + 1]); + } x += sd::Tensor::randn_like(x, rng) * sigmas[i + 1]; } } @@ -1285,37 +1384,149 @@ static sd::Tensor sample_res_2s(denoise_cb_t model, return x; } +static sd::Tensor sample_er_sde(denoise_cb_t model, + sd::Tensor x, + std::vector sigmas, + std::shared_ptr rng, + bool is_flow_denoiser, + float eta) { + constexpr int max_stage = 3; + constexpr int num_integration_points = 200; + constexpr float num_integration_points_f = 200.0f; + float s_noise = eta; + + auto er_sde_flow_sigma = [](float sigma) -> float { + sigma = std::max(sigma, 1e-6f); + sigma = std::min(sigma, 1.0f - 1e-4f); + return sigma; + }; + + auto sigma_to_er_sde_lambda = [&](float sigma, bool is_flow_denoiser) -> float 
{ + if (is_flow_denoiser) { + sigma = er_sde_flow_sigma(sigma); + return sigma / std::max(1.0f - sigma, 1e-6f); + } + return std::max(sigma, 1e-6f); + }; + + auto sigma_to_er_sde_alpha = [&](float sigma, bool is_flow_denoiser) -> float { + if (is_flow_denoiser) { + sigma = er_sde_flow_sigma(sigma); + return 1.0f - sigma; + } + return 1.0f; + }; + + auto er_sde_noise_scaler = [](float x) -> float { + x = std::max(x, 0.0f); + return x * (std::exp(std::pow(x, 0.3f)) + 10.0f); + }; + + if (is_flow_denoiser) { + for (size_t i = 0; i + 1 < sigmas.size(); ++i) { + if (sigmas[i] > 1.0f) { + sigmas[i] = er_sde_flow_sigma(sigmas[i]); + } + } + } + + std::vector er_lambdas(sigmas.size(), 0.0f); + for (size_t i = 0; i < sigmas.size(); ++i) { + er_lambdas[i] = sigma_to_er_sde_lambda(sigmas[i], is_flow_denoiser); + } + + sd::Tensor old_denoised = x; + sd::Tensor old_denoised_d = x; + bool have_old_denoised = false; + bool have_old_denoised_d = false; + + int steps = static_cast(sigmas.size()) - 1; + for (int i = 0; i < steps; i++) { + sd::Tensor denoised = model(x, sigmas[i], i + 1); + if (denoised.empty()) { + return {}; + } + + int stage_used = std::min(max_stage, i + 1); + + if (sigmas[i + 1] == 0.0f) { + x = denoised; + } else { + float er_lambda_s = er_lambdas[i]; + float er_lambda_t = er_lambdas[i + 1]; + float alpha_s = sigma_to_er_sde_alpha(sigmas[i], is_flow_denoiser); + float alpha_t = sigma_to_er_sde_alpha(sigmas[i + 1], is_flow_denoiser); + float scaled_s = er_sde_noise_scaler(er_lambda_s); + float scaled_t = er_sde_noise_scaler(er_lambda_t); + float r_alpha = alpha_s > 0.0f ? alpha_t / alpha_s : 0.0f; + float r = scaled_s > 0.0f ? 
scaled_t / scaled_s : 0.0f; + + x = r_alpha * r * x + alpha_t * (1.0f - r) * denoised; + + if (stage_used >= 2 && have_old_denoised) { + float dt = er_lambda_t - er_lambda_s; + float lambda_step_size = -dt / num_integration_points_f; + float s = 0.0f; + float s_u = 0.0f; + + for (int p = 0; p < num_integration_points; ++p) { + float lambda_pos = er_lambda_t + p * lambda_step_size; + float scaled_pos = er_sde_noise_scaler(lambda_pos); + if (scaled_pos <= 0.0f) { + continue; + } + + s += 1.0f / scaled_pos; + if (stage_used >= 3 && have_old_denoised_d) { + s_u += (lambda_pos - er_lambda_s) / scaled_pos; + } + } + + s *= lambda_step_size; + + float denom_d = er_lambda_s - er_lambdas[i - 1]; + if (std::fabs(denom_d) > 1e-12f) { + float coeff_d = alpha_t * (dt + s * scaled_t); + sd::Tensor denoised_d = (denoised - old_denoised) / denom_d; + x += coeff_d * denoised_d; + + if (stage_used >= 3 && have_old_denoised_d) { + float denom_u = (er_lambda_s - er_lambdas[i - 2]) * 0.5f; + if (std::fabs(denom_u) > 1e-12f) { + s_u *= lambda_step_size; + float coeff_u = alpha_t * (0.5f * dt * dt + s_u * scaled_t); + sd::Tensor denoised_u = (denoised_d - old_denoised_d) / denom_u; + x += coeff_u * denoised_u; + } + } + + old_denoised_d = denoised_d; + have_old_denoised_d = true; + } + } + + float noise_scale_sq = er_lambda_t * er_lambda_t - er_lambda_s * er_lambda_s * r * r; + if (s_noise > 0.0f && noise_scale_sq > 0.0f) { + float noise_scale = alpha_t * std::sqrt(std::max(noise_scale_sq, 0.0f)); + x += sd::Tensor::randn_like(x, rng) * noise_scale; + } + } + + old_denoised = denoised; + have_old_denoised = true; + } + return x; +} + static sd::Tensor sample_ddim_trailing(denoise_cb_t model, sd::Tensor x, const std::vector& sigmas, std::shared_ptr rng, float eta) { - float beta_start = 0.00085f; - float beta_end = 0.0120f; - std::vector alphas_cumprod(TIMESTEPS); - std::vector compvis_sigmas(TIMESTEPS); - for (int i = 0; i < TIMESTEPS; i++) { - alphas_cumprod[i] = - (i == 0 ? 
1.0f : alphas_cumprod[i - 1]) * - (1.0f - - std::pow(sqrtf(beta_start) + - (sqrtf(beta_end) - sqrtf(beta_start)) * - ((float)i / (TIMESTEPS - 1)), - 2)); - compvis_sigmas[i] = - std::sqrt((1 - alphas_cumprod[i]) / alphas_cumprod[i]); - } - int steps = static_cast(sigmas.size()) - 1; for (int i = 0; i < steps; i++) { - int timestep = static_cast(roundf(TIMESTEPS - i * ((float)TIMESTEPS / steps))) - 1; - int prev_timestep = timestep - TIMESTEPS / steps; - float sigma = static_cast(compvis_sigmas[timestep]); - if (i == 0) { - x *= std::sqrt(sigma * sigma + 1) / sigma; - } else { - x *= std::sqrt(sigma * sigma + 1); - } + float sigma = sigmas[i]; + float sigma_to = sigmas[i + 1]; auto model_output_opt = model(x, sigma, i + 1); if (model_output_opt.empty()) { @@ -1324,8 +1535,8 @@ static sd::Tensor sample_ddim_trailing(denoise_cb_t model, sd::Tensor model_output = std::move(model_output_opt); model_output = (x - model_output) * (1.0f / sigma); - float alpha_prod_t = static_cast(alphas_cumprod[timestep]); - float alpha_prod_t_prev = static_cast(prev_timestep >= 0 ? 
alphas_cumprod[prev_timestep] : alphas_cumprod[0]); + float alpha_prod_t = 1.0f / (sigma * sigma + 1.0f); + float alpha_prod_t_prev = 1.0f / (sigma_to * sigma_to + 1.0f); float beta_prod_t = 1.0f - alpha_prod_t; sd::Tensor pred_original_sample = ((x / std::sqrt(sigma * sigma + 1)) - @@ -1337,11 +1548,11 @@ static sd::Tensor sample_ddim_trailing(denoise_cb_t model, (1.0f - alpha_prod_t / alpha_prod_t_prev); float std_dev_t = eta * std::sqrt(variance); - x = std::sqrt(alpha_prod_t_prev) * pred_original_sample + - std::sqrt(1.0f - alpha_prod_t_prev - std::pow(std_dev_t, 2)) * model_output; + x = pred_original_sample + + std::sqrt((1.0f - alpha_prod_t_prev - std::pow(std_dev_t, 2)) / alpha_prod_t_prev) * model_output; if (eta > 0) { - x += std_dev_t * sd::Tensor::randn_like(x, rng); + x += std_dev_t / std::sqrt(alpha_prod_t_prev) * sd::Tensor::randn_like(x, rng); } } return x; @@ -1368,19 +1579,26 @@ static sd::Tensor sample_tcd(denoise_cb_t model, std::sqrt((1 - alphas_cumprod[i]) / alphas_cumprod[i]); } - int original_steps = 50; - int steps = static_cast(sigmas.size()) - 1; - for (int i = 0; i < steps; i++) { - int timestep = TIMESTEPS - 1 - (TIMESTEPS / original_steps) * (int)floor(i * ((float)original_steps / steps)); - int prev_timestep = i >= steps - 1 ? 
0 : TIMESTEPS - 1 - (TIMESTEPS / original_steps) * (int)floor((i + 1) * ((float)original_steps / steps)); - int timestep_s = (int)floor((1 - eta) * prev_timestep); - float sigma = static_cast(compvis_sigmas[timestep]); - - if (i == 0) { - x *= std::sqrt(sigma * sigma + 1) / sigma; - } else { - x *= std::sqrt(sigma * sigma + 1); + auto get_timestep_from_sigma = [&](float s) -> int { + auto it = std::lower_bound(compvis_sigmas.begin(), compvis_sigmas.end(), s); + if (it == compvis_sigmas.begin()) + return 0; + if (it == compvis_sigmas.end()) + return TIMESTEPS - 1; + int idx_high = static_cast(std::distance(compvis_sigmas.begin(), it)); + int idx_low = idx_high - 1; + if (std::abs(compvis_sigmas[idx_high] - s) < std::abs(compvis_sigmas[idx_low] - s)) { + return idx_high; } + return idx_low; + }; + + int steps = static_cast(sigmas.size()) - 1; + for (int i = 0; i < steps; i++) { + float sigma_to = sigmas[i + 1]; + int prev_timestep = get_timestep_from_sigma(sigma_to); + int timestep_s = (int)floor((1 - eta) * prev_timestep); + float sigma = sigmas[i]; auto model_output_opt = model(x, sigma, i + 1); if (model_output_opt.empty()) { @@ -1389,9 +1607,9 @@ static sd::Tensor sample_tcd(denoise_cb_t model, sd::Tensor model_output = std::move(model_output_opt); model_output = (x - model_output) * (1.0f / sigma); - float alpha_prod_t = static_cast(alphas_cumprod[timestep]); + float alpha_prod_t = 1.0f / (sigma * sigma + 1.0f); float beta_prod_t = 1.0f - alpha_prod_t; - float alpha_prod_t_prev = static_cast(prev_timestep >= 0 ? 
alphas_cumprod[prev_timestep] : alphas_cumprod[0]); + float alpha_prod_t_prev = 1.0f / (sigma_to * sigma_to + 1.0f); float alpha_prod_s = static_cast(alphas_cumprod[timestep_s]); float beta_prod_s = 1.0f - alpha_prod_s; @@ -1399,12 +1617,12 @@ static sd::Tensor sample_tcd(denoise_cb_t model, std::sqrt(beta_prod_t) * model_output) * (1.0f / std::sqrt(alpha_prod_t)); - x = std::sqrt(alpha_prod_s) * pred_original_sample + - std::sqrt(beta_prod_s) * model_output; + x = std::sqrt(alpha_prod_s / alpha_prod_t_prev) * pred_original_sample + + std::sqrt(beta_prod_s / alpha_prod_t_prev) * model_output; - if (eta > 0 && i != steps - 1) { + if (eta > 0 && sigma_to > 0.0f) { x = std::sqrt(alpha_prod_t_prev / alpha_prod_s) * x + - std::sqrt(1.0f - alpha_prod_t_prev / alpha_prod_s) * sd::Tensor::randn_like(x, rng); + std::sqrt(1.0f / alpha_prod_t_prev - 1.0f / alpha_prod_s) * sd::Tensor::randn_like(x, rng); } } return x; @@ -1431,13 +1649,16 @@ static sd::Tensor sample_k_diffusion(sample_method_t method, case DPM2_SAMPLE_METHOD: return sample_dpm2(model, std::move(x), sigmas); case DPMPP2S_A_SAMPLE_METHOD: - return sample_dpmpp_2s_ancestral(model, std::move(x), sigmas, rng, eta); + if (is_flow_denoiser) + return sample_dpmpp_2s_ancestral_flow(model, std::move(x), sigmas, rng, eta); + else + return sample_dpmpp_2s_ancestral(model, std::move(x), sigmas, rng, eta); case DPMPP2M_SAMPLE_METHOD: return sample_dpmpp_2m(model, std::move(x), sigmas); case DPMPP2Mv2_SAMPLE_METHOD: return sample_dpmpp_2m_v2(model, std::move(x), sigmas); case LCM_SAMPLE_METHOD: - return sample_lcm(model, std::move(x), sigmas, rng); + return sample_lcm(model, std::move(x), sigmas, rng, is_flow_denoiser); case IPNDM_SAMPLE_METHOD: return sample_ipndm(model, std::move(x), sigmas); case IPNDM_V_SAMPLE_METHOD: @@ -1446,6 +1667,8 @@ static sd::Tensor sample_k_diffusion(sample_method_t method, return sample_res_multistep(model, std::move(x), sigmas, rng, eta); case RES_2S_SAMPLE_METHOD: return sample_res_2s(model, 
std::move(x), sigmas, rng, eta); + case ER_SDE_SAMPLE_METHOD: + return sample_er_sde(model, std::move(x), sigmas, rng, is_flow_denoiser, eta); case DDIM_TRAILING_SAMPLE_METHOD: return sample_ddim_trailing(model, std::move(x), sigmas, rng, eta); case TCD_SAMPLE_METHOD: diff --git a/src/diffusion_model.hpp b/src/diffusion_model.hpp index eb0debff..c0a2a11c 100644 --- a/src/diffusion_model.hpp +++ b/src/diffusion_model.hpp @@ -3,6 +3,7 @@ #include #include "anima.hpp" +#include "ernie_image.hpp" #include "flux.hpp" #include "mmdit.hpp" #include "qwen_image.hpp" @@ -516,4 +517,66 @@ struct ZImageModel : public DiffusionModel { } }; +struct ErnieImageModel : public DiffusionModel { + std::string prefix; + ErnieImage::ErnieImageRunner ernie_image; + + ErnieImageModel(ggml_backend_t backend, + bool offload_params_to_cpu, + const String2TensorStorage& tensor_storage_map = {}, + const std::string prefix = "model.diffusion_model") + : prefix(prefix), ernie_image(backend, offload_params_to_cpu, tensor_storage_map, prefix) { + } + + std::string get_desc() override { + return ernie_image.get_desc(); + } + + void alloc_params_buffer() override { + ernie_image.alloc_params_buffer(); + } + + void free_params_buffer() override { + ernie_image.free_params_buffer(); + } + + void free_compute_buffer() override { + ernie_image.free_compute_buffer(); + } + + void get_param_tensors(std::map& tensors) override { + ernie_image.get_param_tensors(tensors, prefix); + } + + size_t get_params_buffer_size() override { + return ernie_image.get_params_buffer_size(); + } + + void set_weight_adapter(const std::shared_ptr& adapter) override { + ernie_image.set_weight_adapter(adapter); + } + + int64_t get_adm_in_channels() override { + return 768; + } + + void set_flash_attention_enabled(bool enabled) { + ernie_image.set_flash_attention_enabled(enabled); + } + + void set_circular_axes(bool circular_x, bool circular_y) override { + ernie_image.set_circular_axes(circular_x, circular_y); + } + + 
sd::Tensor compute(int n_threads, + const DiffusionParams& diffusion_params) override { + GGML_ASSERT(diffusion_params.x != nullptr); + GGML_ASSERT(diffusion_params.timesteps != nullptr); + return ernie_image.compute(n_threads, + *diffusion_params.x, + *diffusion_params.timesteps, + tensor_or_empty(diffusion_params.context)); + } +}; + #endif diff --git a/src/ernie_image.hpp b/src/ernie_image.hpp new file mode 100644 index 00000000..d17648d2 --- /dev/null +++ b/src/ernie_image.hpp @@ -0,0 +1,438 @@ +#ifndef __SD_ERNIE_IMAGE_HPP__ +#define __SD_ERNIE_IMAGE_HPP__ + +#include +#include + +#include "common_dit.hpp" +#include "flux.hpp" +#include "qwen_image.hpp" +#include "rope.hpp" + +namespace ErnieImage { + constexpr int ERNIE_IMAGE_GRAPH_SIZE = 40960; + + __STATIC_INLINE__ ggml_tensor* timestep_embedding_sin_cos(ggml_context* ctx, + ggml_tensor* timesteps, + int dim, + int max_period = 10000) { + auto emb = ggml_ext_timestep_embedding(ctx, timesteps, dim, max_period, 1.0f); + int64_t half = dim / 2; + auto cos_part = ggml_view_2d(ctx, emb, half, emb->ne[1], emb->nb[1], 0); + auto sin_part = ggml_view_2d(ctx, emb, half, emb->ne[1], emb->nb[1], half * emb->nb[0]); + auto sin_first = ggml_concat(ctx, sin_part, cos_part, 0); + return sin_first; + } + + __STATIC_INLINE__ ggml_tensor* apply_rotary_emb(ggml_context* ctx, ggml_tensor* x, ggml_tensor* pe) { + // x: [N, S, heads, head_dim] + // pe: [2, S, 1, head_dim], stored as ggml [head_dim, 1, S, 2]. + int64_t head_dim = x->ne[0]; + int64_t heads = x->ne[1]; + int64_t S = x->ne[2]; + int64_t N = x->ne[3]; + int64_t rot_dim = pe->ne[0]; + GGML_ASSERT(rot_dim <= head_dim); + GGML_ASSERT(rot_dim % 2 == 0); + GGML_ASSERT(pe->ne[1] == 1 && pe->ne[2] == S && pe->ne[3] == 2); + + x = ggml_cont(ctx, x); + auto x_rot = ggml_ext_slice(ctx, x, 0, 0, rot_dim, false); + auto x_pass = rot_dim < head_dim ? 
ggml_ext_slice(ctx, x, 0, rot_dim, head_dim, false) : nullptr; + + int64_t half = rot_dim / 2; + auto x1 = ggml_view_4d(ctx, x_rot, half, heads, S, N, x_rot->nb[1], x_rot->nb[2], x_rot->nb[3], 0); + auto x2 = ggml_view_4d(ctx, x_rot, half, heads, S, N, x_rot->nb[1], x_rot->nb[2], x_rot->nb[3], half * x_rot->nb[0]); + x1 = ggml_cont(ctx, x1); + x2 = ggml_cont(ctx, x2); + auto rotated = ggml_concat(ctx, ggml_neg(ctx, x2), x1, 0); + + auto cos_emb = ggml_ext_slice(ctx, pe, 3, 0, 1, false); + auto sin_emb = ggml_ext_slice(ctx, pe, 3, 1, 2, false); + + auto out = ggml_add(ctx, ggml_mul(ctx, x_rot, cos_emb), ggml_mul(ctx, rotated, sin_emb)); + if (x_pass != nullptr) { + out = ggml_concat(ctx, out, x_pass, 0); + } + return out; + } + + struct ErnieImageAttention : public GGMLBlock { + int64_t num_heads; + int64_t head_dim; + + ErnieImageAttention(int64_t query_dim, + int64_t heads, + int64_t dim_head, + float eps = 1e-6f) + : num_heads(heads), head_dim(dim_head) { + int64_t inner_dim = heads * dim_head; + blocks["to_q"] = std::make_shared(query_dim, inner_dim, false); + blocks["to_k"] = std::make_shared(query_dim, inner_dim, false); + blocks["to_v"] = std::make_shared(query_dim, inner_dim, false); + blocks["norm_q"] = std::make_shared(dim_head, eps); + blocks["norm_k"] = std::make_shared(dim_head, eps); + blocks["to_out.0"] = std::make_shared(inner_dim, query_dim, false); + } + + ggml_tensor* forward(GGMLRunnerContext* ctx, + ggml_tensor* x, + ggml_tensor* pe, + ggml_tensor* attention_mask = nullptr) { + // x: [N, S, hidden_size] + // pe: [S, head_dim/2, 2, 2], generated in image-token-first order. 
+ auto to_q = std::dynamic_pointer_cast(blocks["to_q"]); + auto to_k = std::dynamic_pointer_cast(blocks["to_k"]); + auto to_v = std::dynamic_pointer_cast(blocks["to_v"]); + auto norm_q = std::dynamic_pointer_cast(blocks["norm_q"]); + auto norm_k = std::dynamic_pointer_cast(blocks["norm_k"]); + auto to_out_0 = std::dynamic_pointer_cast(blocks["to_out.0"]); + + int64_t S = x->ne[1]; + int64_t N = x->ne[2]; + + auto q = to_q->forward(ctx, x); + auto k = to_k->forward(ctx, x); + auto v = to_v->forward(ctx, x); + + q = ggml_reshape_4d(ctx->ggml_ctx, q, head_dim, num_heads, S, N); // [N, S, heads, head_dim] + k = ggml_reshape_4d(ctx->ggml_ctx, k, head_dim, num_heads, S, N); // [N, S, heads, head_dim] + v = ggml_reshape_4d(ctx->ggml_ctx, v, head_dim, num_heads, S, N); // [N, S, heads, head_dim] + + q = norm_q->forward(ctx, q); + k = norm_k->forward(ctx, k); + + q = apply_rotary_emb(ctx->ggml_ctx, q, pe); + k = apply_rotary_emb(ctx->ggml_ctx, k, pe); + + q = ggml_cont(ctx->ggml_ctx, ggml_permute(ctx->ggml_ctx, q, 0, 2, 1, 3)); // [N, heads, S, head_dim] + q = ggml_reshape_3d(ctx->ggml_ctx, q, q->ne[0], q->ne[1], q->ne[2] * q->ne[3]); + + k = ggml_cont(ctx->ggml_ctx, ggml_permute(ctx->ggml_ctx, k, 0, 2, 1, 3)); // [N, heads, S, head_dim] + k = ggml_reshape_3d(ctx->ggml_ctx, k, k->ne[0], k->ne[1], k->ne[2] * k->ne[3]); + + x = ggml_ext_attention_ext(ctx->ggml_ctx, ctx->backend, q, k, v, num_heads, attention_mask, true, ctx->flash_attn_enabled); // [N, S, hidden_size] + x = to_out_0->forward(ctx, x); + return x; + } + }; + + struct ErnieImageFeedForward : public GGMLBlock { + public: + ErnieImageFeedForward(int64_t hidden_size, int64_t ffn_hidden_size) { + blocks["gate_proj"] = std::make_shared(hidden_size, ffn_hidden_size, false); + blocks["up_proj"] = std::make_shared(hidden_size, ffn_hidden_size, false); + blocks["linear_fc2"] = std::make_shared(ffn_hidden_size, hidden_size, false); + } + + ggml_tensor* forward(GGMLRunnerContext* ctx, ggml_tensor* x) { + auto gate_proj = 
std::dynamic_pointer_cast(blocks["gate_proj"]); + auto up_proj = std::dynamic_pointer_cast(blocks["up_proj"]); + auto linear_fc2 = std::dynamic_pointer_cast(blocks["linear_fc2"]); + + auto gate = gate_proj->forward(ctx, x); + gate = ggml_ext_gelu(ctx->ggml_ctx, gate); + x = up_proj->forward(ctx, x); + x = ggml_mul(ctx->ggml_ctx, x, gate); + x = linear_fc2->forward(ctx, x); + return x; + } + }; + + struct ErnieImageSharedAdaLNBlock : public GGMLBlock { + public: + ErnieImageSharedAdaLNBlock(int64_t hidden_size, + int64_t num_heads, + int64_t ffn_hidden_size, + float eps = 1e-6f) { + blocks["adaLN_sa_ln"] = std::make_shared(hidden_size, eps); + blocks["self_attention"] = std::make_shared(hidden_size, + num_heads, + hidden_size / num_heads, + eps); + blocks["adaLN_mlp_ln"] = std::make_shared(hidden_size, eps); + blocks["mlp"] = std::make_shared(hidden_size, ffn_hidden_size); + } + + ggml_tensor* forward(GGMLRunnerContext* ctx, + ggml_tensor* x, + ggml_tensor* pe, + const std::vector& temb, + ggml_tensor* attention_mask = nullptr) { + // x: [N, image_tokens + text_tokens, hidden_size] + auto adaLN_sa_ln = std::dynamic_pointer_cast(blocks["adaLN_sa_ln"]); + auto self_attention = std::dynamic_pointer_cast(blocks["self_attention"]); + auto adaLN_mlp_ln = std::dynamic_pointer_cast(blocks["adaLN_mlp_ln"]); + auto mlp = std::dynamic_pointer_cast(blocks["mlp"]); + + auto shift_msa = temb[0]; + auto scale_msa = temb[1]; + auto gate_msa = temb[2]; + auto shift_mlp = temb[3]; + auto scale_mlp = temb[4]; + auto gate_mlp = temb[5]; + + auto residual = x; + x = adaLN_sa_ln->forward(ctx, x); + x = Flux::modulate(ctx->ggml_ctx, x, shift_msa, scale_msa, true); + auto attn_out = self_attention->forward(ctx, x, pe, attention_mask); + x = ggml_add(ctx->ggml_ctx, residual, ggml_mul(ctx->ggml_ctx, attn_out, gate_msa)); + + residual = x; + x = adaLN_mlp_ln->forward(ctx, x); + x = Flux::modulate(ctx->ggml_ctx, x, shift_mlp, scale_mlp, true); + x = ggml_add(ctx->ggml_ctx, residual, 
ggml_mul(ctx->ggml_ctx, mlp->forward(ctx, x), gate_mlp)); + return x; + } + }; + + struct ErnieImageAdaLNContinuous : public GGMLBlock { + public: + ErnieImageAdaLNContinuous(int64_t hidden_size, float eps = 1e-6f) { + blocks["norm"] = std::make_shared(hidden_size, eps, false); + blocks["linear"] = std::make_shared(hidden_size, hidden_size * 2, true); + } + + ggml_tensor* forward(GGMLRunnerContext* ctx, ggml_tensor* x, ggml_tensor* conditioning) { + auto norm = std::dynamic_pointer_cast(blocks["norm"]); + auto linear = std::dynamic_pointer_cast(blocks["linear"]); + + auto mods = ggml_ext_chunk(ctx->ggml_ctx, linear->forward(ctx, conditioning), 2, 0); + auto scale = mods[0]; + auto shift = mods[1]; + + x = norm->forward(ctx, x); + x = Flux::modulate(ctx->ggml_ctx, x, shift, scale); + return x; + } + }; + + struct ErnieImageParams { + int64_t hidden_size = 4096; + int64_t num_heads = 32; + int64_t num_layers = 36; + int64_t ffn_hidden_size = 12288; + int64_t in_channels = 128; + int64_t out_channels = 128; + int patch_size = 1; + int64_t text_in_dim = 3072; + int theta = 256; + std::vector axes_dim = {32, 48, 48}; + int axes_dim_sum = 128; + float eps = 1e-6f; + }; + + class ErnieImageModel : public GGMLBlock { + public: + ErnieImageParams params; + + ErnieImageModel() = default; + ErnieImageModel(ErnieImageParams params) + : params(params) { + blocks["x_embedder.proj"] = std::make_shared(params.in_channels, + params.hidden_size, + std::pair{params.patch_size, params.patch_size}, + std::pair{params.patch_size, params.patch_size}, + std::pair{0, 0}, + std::pair{1, 1}, + true); + if (params.text_in_dim != params.hidden_size) { + blocks["text_proj"] = std::make_shared(params.text_in_dim, params.hidden_size, false); + } + blocks["time_embedding"] = std::make_shared(params.hidden_size, params.hidden_size); + blocks["adaLN_modulation.1"] = std::make_shared(params.hidden_size, 6 * params.hidden_size, true); + + for (int i = 0; i < params.num_layers; i++) { + 
blocks["layers." + std::to_string(i)] = std::make_shared(params.hidden_size, + params.num_heads, + params.ffn_hidden_size, + params.eps); + } + + blocks["final_norm"] = std::make_shared(params.hidden_size, params.eps); + blocks["final_linear"] = std::make_shared(params.hidden_size, + params.patch_size * params.patch_size * params.out_channels, + true); + } + + ggml_tensor* forward(GGMLRunnerContext* ctx, + ggml_tensor* x, + ggml_tensor* timestep, + ggml_tensor* context, + ggml_tensor* pe) { + // x: [N, C, H, W] + // context: [N, text_tokens, 3072] + // pe: [image_tokens + text_tokens, head_dim/2, 2, 2] + GGML_ASSERT(context != nullptr); + GGML_ASSERT(x->ne[1] % params.patch_size == 0 && x->ne[0] % params.patch_size == 0); + + int64_t W = x->ne[0]; + int64_t H = x->ne[1]; + int64_t Hp = H / params.patch_size; + int64_t Wp = W / params.patch_size; + int64_t n_img = Hp * Wp; + int64_t N = x->ne[3]; + + auto x_embedder_proj = std::dynamic_pointer_cast(blocks["x_embedder.proj"]); + auto time_embedding = std::dynamic_pointer_cast(blocks["time_embedding"]); + auto adaLN_mod = std::dynamic_pointer_cast(blocks["adaLN_modulation.1"]); + auto final_norm = std::dynamic_pointer_cast(blocks["final_norm"]); + auto final_linear = std::dynamic_pointer_cast(blocks["final_linear"]); + + auto img = x_embedder_proj->forward(ctx, x); // [N, hidden_size, Hp, Wp] + img = ggml_reshape_3d(ctx->ggml_ctx, img, img->ne[0] * img->ne[1], img->ne[2], N); // [N, hidden_size, image_tokens] + img = ggml_cont(ctx->ggml_ctx, ggml_ext_torch_permute(ctx->ggml_ctx, img, 1, 0, 2, 3)); // [N, image_tokens, hidden_size] + + auto txt = context; + auto text_proj = std::dynamic_pointer_cast(blocks["text_proj"]); + if (text_proj) { + txt = text_proj->forward(ctx, txt); + } + + auto hidden_states = ggml_concat(ctx->ggml_ctx, img, txt, 1); // [N, image_tokens + text_tokens, hidden_size] + + auto sample = timestep_embedding_sin_cos(ctx->ggml_ctx, timestep, static_cast(params.hidden_size)); + auto c = 
time_embedding->forward(ctx, sample); // [N, hidden_size] + + auto mod_params = adaLN_mod->forward(ctx, ggml_silu(ctx->ggml_ctx, c)); // [N, 6 * hidden_size] + auto chunks = ggml_ext_chunk(ctx->ggml_ctx, mod_params, 6, 0); + std::vector temb; + temb.reserve(6); + for (auto chunk : chunks) { + temb.push_back(ggml_reshape_3d(ctx->ggml_ctx, chunk, chunk->ne[0], 1, chunk->ne[1])); // [N, 1, hidden_size] + } + + for (int i = 0; i < params.num_layers; i++) { + auto layer = std::dynamic_pointer_cast(blocks["layers." + std::to_string(i)]); + hidden_states = layer->forward(ctx, hidden_states, pe, temb); + } + + hidden_states = final_norm->forward(ctx, hidden_states, c); + hidden_states = final_linear->forward(ctx, hidden_states); // [N, image_tokens, p*p*out_channels] + auto patches = ggml_ext_slice(ctx->ggml_ctx, hidden_states, 1, 0, n_img); // [N, image_tokens, hidden_size] + + auto out = DiT::unpatchify(ctx->ggml_ctx, + patches, + Hp, + Wp, + params.patch_size, + params.patch_size, + false); // [N, out_channels, H, W] + return out; + } + }; + + struct ErnieImageRunner : public GGMLRunner { + ErnieImageParams ernie_params; + ErnieImageModel ernie_image; + std::vector pe_vec; + + ErnieImageRunner(ggml_backend_t backend, + bool offload_params_to_cpu, + const String2TensorStorage& tensor_storage_map = {}, + const std::string prefix = "") + : GGMLRunner(backend, offload_params_to_cpu) { + ernie_params.num_layers = 0; + for (const auto& [name, tensor_storage] : tensor_storage_map) { + if (!starts_with(name, prefix)) { + continue; + } + if (ends_with(name, "x_embedder.proj.weight") && tensor_storage.n_dims == 4) { + ernie_params.patch_size = static_cast(tensor_storage.ne[0]); + ernie_params.in_channels = tensor_storage.ne[2]; + ernie_params.hidden_size = tensor_storage.ne[3]; + } else if (ends_with(name, "text_proj.weight") && tensor_storage.n_dims == 2) { + ernie_params.text_in_dim = tensor_storage.ne[0]; + } else if (ends_with(name, "layers.0.self_attention.norm_q.weight")) { 
+ int64_t head_dim = tensor_storage.ne[0]; + ernie_params.num_heads = ernie_params.hidden_size / head_dim; + } else if (ends_with(name, "layers.0.mlp.gate_proj.weight") && tensor_storage.n_dims == 2) { + ernie_params.ffn_hidden_size = tensor_storage.ne[1]; + } else if (ends_with(name, "final_linear.weight") && tensor_storage.n_dims == 2) { + int64_t out_dim = tensor_storage.ne[1]; + ernie_params.out_channels = out_dim / ernie_params.patch_size / ernie_params.patch_size; + } + + size_t pos = name.find("layers."); + if (pos != std::string::npos) { + std::string layer_name = name.substr(pos); + auto items = split_string(layer_name, '.'); + if (items.size() > 1) { + int block_index = atoi(items[1].c_str()); + if (block_index + 1 > ernie_params.num_layers) { + ernie_params.num_layers = block_index + 1; + } + } + } + } + if (ernie_params.num_layers == 0) { + ernie_params.num_layers = 36; + } + ernie_params.axes_dim_sum = 0; + for (int axis_dim : ernie_params.axes_dim) { + ernie_params.axes_dim_sum += axis_dim; + } + + LOG_INFO("ernie_image: layers = %" PRId64 ", hidden_size = %" PRId64 ", heads = %" PRId64 + ", ffn_hidden_size = %" PRId64 ", in_channels = %" PRId64 ", out_channels = %" PRId64, + ernie_params.num_layers, + ernie_params.hidden_size, + ernie_params.num_heads, + ernie_params.ffn_hidden_size, + ernie_params.in_channels, + ernie_params.out_channels); + + ernie_image = ErnieImageModel(ernie_params); + ernie_image.init(params_ctx, tensor_storage_map, prefix); + } + + std::string get_desc() override { + return "ernie_image"; + } + + void get_param_tensors(std::map& tensors, const std::string prefix) { + ernie_image.get_param_tensors(tensors, prefix); + } + + ggml_cgraph* build_graph(const sd::Tensor& x_tensor, + const sd::Tensor& timesteps_tensor, + const sd::Tensor& context_tensor) { + ggml_cgraph* gf = new_graph_custom(ERNIE_IMAGE_GRAPH_SIZE); + ggml_tensor* x = make_input(x_tensor); + ggml_tensor* timesteps = make_input(timesteps_tensor); + 
GGML_ASSERT(x->ne[3] == 1); + GGML_ASSERT(!context_tensor.empty()); + ggml_tensor* context = make_input(context_tensor); + + pe_vec = Rope::gen_ernie_image_pe(static_cast(x->ne[1]), + static_cast(x->ne[0]), + ernie_params.patch_size, + static_cast(x->ne[3]), + static_cast(context->ne[1]), + ernie_params.theta, + circular_y_enabled, + circular_x_enabled, + ernie_params.axes_dim); + int pos_len = static_cast(pe_vec.size() / ernie_params.axes_dim_sum / 2); + auto pe = ggml_new_tensor_4d(compute_ctx, GGML_TYPE_F32, ernie_params.axes_dim_sum, 1, pos_len, 2); + set_backend_tensor_data(pe, pe_vec.data()); + + auto runner_ctx = get_context(); + ggml_tensor* out = ernie_image.forward(&runner_ctx, x, timesteps, context, pe); + ggml_build_forward_expand(gf, out); + return gf; + } + + sd::Tensor compute(int n_threads, + const sd::Tensor& x, + const sd::Tensor& timesteps, + const sd::Tensor& context) { + auto get_graph = [&]() -> ggml_cgraph* { + return build_graph(x, timesteps, context); + }; + return restore_trailing_singleton_dims(GGMLRunner::compute(get_graph, n_threads, false), x.dim()); + } + }; +} // namespace ErnieImage + +#endif // __SD_ERNIE_IMAGE_HPP__ diff --git a/src/llm.hpp b/src/llm.hpp index 17743396..95030385 100644 --- a/src/llm.hpp +++ b/src/llm.hpp @@ -14,652 +14,22 @@ #include #include -#include "clip.hpp" #include "ggml_extend.hpp" #include "json.hpp" #include "rope.hpp" -#include "tokenize_util.h" -#include "vocab/vocab.h" +#include "tokenizers/bpe_tokenizer.h" +#include "tokenizers/gemma_tokenizer.h" +#include "tokenizers/mistral_tokenizer.h" +#include "tokenizers/qwen2_tokenizer.h" namespace LLM { constexpr int LLM_GRAPH_SIZE = 10240; - class BPETokenizer { - protected: - std::map byte_encoder; - std::map byte_decoder; - std::map encoder; - std::map decoder; - std::map, int> bpe_ranks; - std::regex pat; - int encoder_len; - int bpe_len; - - std::string UNK_TOKEN; - std::string BOS_TOKEN; - std::string EOS_TOKEN; - std::string PAD_TOKEN; - - int 
UNK_TOKEN_ID; - int BOS_TOKEN_ID; - int EOS_TOKEN_ID; - int PAD_TOKEN_ID; - - std::vector special_tokens; - - bool add_bos_token = false; - bool byte_level_bpe = true; - bool byte_fallback = false; - - protected: - virtual std::string preprocess(const std::string& text) const { - return text; - } - - static std::set> get_pairs(const std::vector& subwords) { - std::set> pairs; - if (subwords.size() == 0) { - return pairs; - } - std::u32string prev_subword = subwords[0]; - for (int i = 1; i < subwords.size(); i++) { - std::u32string subword = subwords[i]; - std::pair pair(prev_subword, subword); - pairs.insert(pair); - prev_subword = subword; - } - return pairs; - } - - bool is_special_token(const std::string& token) { - for (auto& special_token : special_tokens) { - if (special_token == token) { - return true; - } - } - return false; - } - - static std::vector split_utf32(const std::u32string& s, char32_t delim) { - std::vector result; - size_t start = 0; - - while (true) { - size_t pos = s.find(delim, start); - if (pos == std::u32string::npos) { - result.emplace_back(s.substr(start)); - break; - } - result.emplace_back(s.substr(start, pos - start)); - start = pos + 1; - } - return result; - } - - public: - BPETokenizer() = default; - - std::u32string bpe(const std::u32string& token) { - std::vector word; - - for (int i = 0; i < token.size(); i++) { - word.emplace_back(1, token[i]); - } - - std::set> pairs = get_pairs(word); - - if (pairs.empty()) { - return token; - } - - while (true) { - auto min_pair_iter = std::min_element(pairs.begin(), - pairs.end(), - [&](const std::pair& a, - const std::pair& b) { - if (bpe_ranks.find(a) == bpe_ranks.end()) { - return false; - } else if (bpe_ranks.find(b) == bpe_ranks.end()) { - return true; - } - return bpe_ranks.at(a) < bpe_ranks.at(b); - }); - - const std::pair& bigram = *min_pair_iter; - - if (bpe_ranks.find(bigram) == bpe_ranks.end()) { - break; - } - - std::u32string first = bigram.first; - std::u32string second = 
bigram.second; - std::vector new_word; - int32_t i = 0; - - while (i < word.size()) { - auto it = std::find(word.begin() + i, word.end(), first); - if (it == word.end()) { - new_word.insert(new_word.end(), word.begin() + i, word.end()); - break; - } - new_word.insert(new_word.end(), word.begin() + i, it); - i = static_cast(std::distance(word.begin(), it)); - - if (word[i] == first && i < static_cast(word.size()) - 1 && word[i + 1] == second) { - new_word.push_back(first + second); - i += 2; - } else { - new_word.push_back(word[i]); - i += 1; - } - } - - word = new_word; - - if (word.size() == 1) { - break; - } - pairs = get_pairs(word); - } - - std::u32string result; - for (int i = 0; i < word.size(); i++) { - result += word[i]; - if (i != word.size() - 1) { - result += utf8_to_utf32(" "); - } - } - - return result; - } - - std::vector tokenize(std::string text, - on_new_token_cb_t on_new_token_cb = nullptr, - size_t max_length = 0, - bool padding = false) { - std::vector tokens = encode(text, on_new_token_cb); - - if (max_length > 0) { - if (tokens.size() < max_length) { - tokens.resize(max_length); - } else { - if (padding) { - tokens.insert(tokens.end(), max_length - tokens.size(), PAD_TOKEN_ID); - } - } - } - - return tokens; - } - - void pad_tokens(std::vector& tokens, - std::vector& weights, - size_t max_length = 0, - bool padding = false) { - if (add_bos_token) { - tokens.insert(tokens.begin(), BOS_TOKEN_ID); - weights.insert(weights.begin(), 1.f); - } - if (max_length > 0 && padding) { - size_t n = static_cast(std::ceil(tokens.size() * 1.f / max_length)); - if (n == 0) { - n = 1; - } - size_t length = max_length * n; - LOG_DEBUG("token length: %llu", length); - tokens.insert(tokens.end(), length - tokens.size(), PAD_TOKEN_ID); - weights.insert(weights.end(), length - weights.size(), 1.f); - } - } - - virtual std::vector encode(std::string text, on_new_token_cb_t on_new_token_cb = nullptr) { - std::string original_text = text; - std::vector bpe_tokens; - 
std::vector token_strs; - - auto splited_texts = split_with_special_tokens(text, special_tokens); - - for (auto& splited_text : splited_texts) { - if (is_special_token(splited_text)) { - bpe_tokens.push_back(encoder[utf8_to_utf32(splited_text)]); - token_strs.push_back(splited_text); - continue; - } - auto tokens = token_split(splited_text); - for (auto& token : tokens) { - if (on_new_token_cb != nullptr) { - bool skip = on_new_token_cb(token, bpe_tokens); - if (skip) { - continue; - } - } - - std::string token_str = preprocess(token); - std::u32string utf32_token; - if (byte_level_bpe) { - for (int i = 0; i < token_str.length(); i++) { - unsigned char b = token_str[i]; - utf32_token += byte_encoder[b]; - } - } else { - utf32_token = utf8_to_utf32(token_str); - } - - auto bpe_strs = bpe(utf32_token); - for (const auto& bpe_str : split_utf32(bpe_strs, U' ')) { - int token_id; - auto iter = encoder.find(bpe_str); - if (iter != encoder.end()) { - token_id = iter->second; - } else { - if (byte_fallback) { - auto utf8_token_str = utf32_to_utf8(bpe_str); - for (int i = 0; i < utf8_token_str.length(); i++) { - unsigned char b = utf8_token_str[i]; - char hex_buf[16]; - snprintf(hex_buf, sizeof(hex_buf), "<0x%02X>", b); - iter = encoder.find(utf8_to_utf32(hex_buf)); - GGML_ASSERT(iter != encoder.end()); - bpe_tokens.push_back(token_id); - token_strs.push_back(hex_buf); - } - continue; - } else { - token_id = UNK_TOKEN_ID; - } - } - bpe_tokens.push_back(token_id); - token_strs.push_back(utf32_to_utf8(bpe_str)); - } - } - } - - std::stringstream ss; - ss << "["; - for (auto token : token_strs) { - ss << "\"" << token << "\", "; - } - ss << "]"; - LOG_DEBUG("split prompt \"%s\" to tokens %s", original_text.c_str(), ss.str().c_str()); - return bpe_tokens; - } - }; - - class Qwen2Tokenizer : public BPETokenizer { - protected: - void load_from_merges(const std::string& merges_utf8_str) { - auto byte_unicode_pairs = bytes_to_unicode(); - byte_encoder = 
std::map(byte_unicode_pairs.begin(), byte_unicode_pairs.end()); - for (auto& pair : byte_unicode_pairs) { - byte_decoder[pair.second] = pair.first; - } - std::u32string merges_utf32_str = utf8_to_utf32(merges_utf8_str); - std::vector merges = split_utf32(merges_utf32_str, U'\n'); - std::vector> merge_pairs; - for (const auto& merge : merges) { - size_t space_pos = merge.find(' '); - merge_pairs.emplace_back(merge.substr(0, space_pos), merge.substr(space_pos + 1)); - } - LOG_DEBUG("merges size %zu", merge_pairs.size()); - - std::vector tokens; - for (const auto& pair : byte_unicode_pairs) { - tokens.push_back(pair.second); - } - for (const auto& merge : merge_pairs) { - tokens.push_back(merge.first + merge.second); - } - for (auto& special_token : special_tokens) { - tokens.push_back(utf8_to_utf32(special_token)); - } - - int i = 0; - for (const auto& token : tokens) { - encoder[token] = i; - decoder[i] = token; - i++; - } - encoder_len = i; - LOG_DEBUG("vocab size: %d", encoder_len); - - int rank = 0; - for (const auto& merge : merge_pairs) { - bpe_ranks[merge] = rank++; - } - bpe_len = rank; - }; - - public: - explicit Qwen2Tokenizer(const std::string& merges_utf8_str = "") { - UNK_TOKEN = "<|endoftext|>"; - EOS_TOKEN = "<|endoftext|>"; - PAD_TOKEN = "<|endoftext|>"; - - UNK_TOKEN_ID = 151643; - EOS_TOKEN_ID = 151643; - PAD_TOKEN_ID = 151643; - - special_tokens = { - "<|endoftext|>", - "<|im_start|>", - "<|im_end|>", - "<|object_ref_start|>", - "<|object_ref_end|>", - "<|box_start|>", - "<|box_end|>", - "<|quad_start|>", - "<|quad_end|>", - "<|vision_start|>", - "<|vision_end|>", - "<|vision_pad|>", - "<|image_pad|>", - "<|video_pad|>", - "", - "", - "<|fim_prefix|>", - "<|fim_middle|>", - "<|fim_suffix|>", - "<|fim_pad|>", - "<|repo_name|>", - "<|file_sep|>", - "", - "", - "", - "", - }; - - if (merges_utf8_str.size() > 0) { - load_from_merges(merges_utf8_str); - } else { - load_from_merges(load_qwen2_merges()); - } - } - }; - - class MistralTokenizer : public 
BPETokenizer { - protected: - void load_from_merges(const std::string& merges_utf8_str, const std::string& vocab_utf8_str) { - nlohmann::json vocab; - - try { - vocab = nlohmann::json::parse(vocab_utf8_str); - } catch (const nlohmann::json::parse_error&) { - GGML_ABORT("invalid vocab json str"); - } - for (const auto& [key, value] : vocab.items()) { - std::u32string token = utf8_to_utf32(key); - int i = value; - encoder[token] = i; - decoder[i] = token; - } - encoder_len = static_cast(vocab.size()); - LOG_DEBUG("vocab size: %d", encoder_len); - - auto byte_unicode_pairs = bytes_to_unicode(); - byte_encoder = std::map(byte_unicode_pairs.begin(), byte_unicode_pairs.end()); - for (auto& pair : byte_unicode_pairs) { - byte_decoder[pair.second] = pair.first; - } - std::u32string merges_utf32_str = utf8_to_utf32(merges_utf8_str); - std::vector merges = split_utf32(merges_utf32_str, U'\n'); - std::vector> merge_pairs; - for (const auto& merge : merges) { - size_t space_pos = merge.find(' '); - merge_pairs.emplace_back(merge.substr(0, space_pos), merge.substr(space_pos + 1)); - } - LOG_DEBUG("merges size %zu", merge_pairs.size()); - - int rank = 0; - for (const auto& merge : merge_pairs) { - bpe_ranks[merge] = rank++; - } - bpe_len = rank; - }; - - public: - explicit MistralTokenizer(const std::string& merges_utf8_str = "", const std::string& vocab_utf8_str = "") { - add_bos_token = true; - - UNK_TOKEN = ""; - BOS_TOKEN = ""; - EOS_TOKEN = ""; - PAD_TOKEN = ""; - - UNK_TOKEN_ID = 0; - BOS_TOKEN_ID = 1; - EOS_TOKEN_ID = 2; - PAD_TOKEN_ID = 11; - - special_tokens = { - "", - "", - "", - "[INST]", - "[/INST]", - "[AVAILABLE_TOOLS]", - "[/AVAILABLE_TOOLS]", - "[TOOL_RESULTS]", - "[/TOOL_RESULTS]", - "[TOOL_CALLS]", - "[IMG]", - "", - "[IMG_BREAK]", - "[IMG_END]", - "[PREFIX]", - "[MIDDLE]", - "[SUFFIX]", - "[SYSTEM_PROMPT]", - "[/SYSTEM_PROMPT]", - "[TOOL_CONTENT]", - }; - for (int i = 20; i < 1000; i++) { - special_tokens.push_back(""); - } - - if (merges_utf8_str.size() > 0 
&& vocab_utf8_str.size() > 0) { - load_from_merges(merges_utf8_str, vocab_utf8_str); - } else { - load_from_merges(load_mistral_merges(), load_mistral_vocab_json()); - } - } - }; - - class GemmaTokenizer : public BPETokenizer { - protected: - std::vector special_tokens_before_merge; - std::vector special_tokens_after_merge; - - std::string preprocess(const std::string& text) const override { - std::string normalized = text; - size_t pos = 0; - while ((pos = normalized.find(' ', pos)) != std::string::npos) { - normalized.replace(pos, 1, "\xE2\x96\x81"); - pos += 3; - } - return normalized; - } - - void load_from_merges(const std::string& merges_utf8_str, const std::string& vocab_utf8_str) { - nlohmann::json vocab; - try { - vocab = nlohmann::json::parse(vocab_utf8_str); - } catch (const nlohmann::json::parse_error&) { - GGML_ABORT("invalid vocab json str"); - } - for (const auto& [key, value] : vocab.items()) { - std::u32string token = utf8_to_utf32(key); - int i = value; - encoder[token] = i; - decoder[i] = token; - } - encoder_len = static_cast(vocab.size()); - LOG_DEBUG("vocab size: %d", encoder_len); - - std::u32string merges_utf32_str = utf8_to_utf32(merges_utf8_str); - std::vector merges = split_utf32(merges_utf32_str, U'\n'); - std::vector> merge_pairs; - for (const auto& merge : merges) { - size_t space_pos = merge.find(' '); - merge_pairs.emplace_back(merge.substr(0, space_pos), merge.substr(space_pos + 1)); - } - LOG_DEBUG("merges size %zu", merge_pairs.size()); - - int rank = 0; - for (const auto& merge : merge_pairs) { - bpe_ranks[merge] = rank++; - } - bpe_len = rank; - }; - - public: - explicit GemmaTokenizer(const std::string& merges_utf8_str = "", const std::string& vocab_json_utf8_str = "") { - byte_level_bpe = false; - byte_fallback = true; - add_bos_token = true; - PAD_TOKEN = ""; - EOS_TOKEN = ""; - BOS_TOKEN = ""; - UNK_TOKEN = ""; - - PAD_TOKEN_ID = 0; - EOS_TOKEN_ID = 1; - BOS_TOKEN_ID = 2; - UNK_TOKEN_ID = 3; - - special_tokens_before_merge = 
{ - PAD_TOKEN, - EOS_TOKEN, - BOS_TOKEN, - UNK_TOKEN, - "", - "[multimodal]", - }; - for (int i = 0; i <= 98; i++) { - special_tokens_before_merge.push_back(""); - } - special_tokens_before_merge.push_back(""); - special_tokens_before_merge.push_back(""); - for (int i = 1; i <= 31; i++) { - special_tokens_before_merge.push_back(std::string(i, '\n')); - } - for (int i = 2; i <= 31; i++) { - std::string whitespace_token; - for (int j = 0; j < i; j++) { - whitespace_token += "\xE2\x96\x81"; - } - special_tokens_before_merge.push_back(whitespace_token); - } - std::vector html_tokens = { - "", - "", - "", - "", - "", - "
", - "
", - "", - "
", - "", - "", - "", - "", - "", - "", - "", - "

", - "

", - "

", - "

", - "

", - "
", - "
", - "
", - "", - "", - "", - "", - "", - "", - "", - "", - "", - "", - "", - "", - "", - "", - "", - "", - "", - "", - "", - "", - "", - "", - "", - "", - "", - "", - "", - "", - "", - "", - "