Compare commits

..

No commits in common. "50134e51dd8693c155dad5883c31fd16c8037cbd" and "bd17f53b7386fb5f60e8587b75e73c4b2fed3426" have entirely different histories.

48 changed files with 369 additions and 7120 deletions

View File

@ -135,7 +135,7 @@ jobs:
id: depends
run: |
sudo apt-get update
sudo apt-get install build-essential libvulkan-dev glslc spirv-headers
sudo apt-get install build-essential libvulkan-dev glslc
- name: Build
id: cmake_build
@ -448,7 +448,7 @@ jobs:
runs-on: windows-2022
env:
ROCM_VERSION: "7.13.0"
ROCM_VERSION: "7.12.0"
GPU_TARGETS: "gfx906;gfx908;gfx90a;gfx942;gfx950;gfx1100;gfx1101;gfx1102;gfx1150;gfx1151;gfx1200;gfx1201"
steps:
@ -516,6 +516,9 @@ jobs:
- name: Pack artifacts
if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
run: |
cp "${env:HIP_PATH}\bin\hipblas.dll" "build\bin\"
cp "${env:HIP_PATH}\bin\libhipblaslt.dll" "build\bin\"
cp "${env:HIP_PATH}\bin\rocblas.dll" "build\bin\"
7z a sd-${{ env.BRANCH_NAME }}-${{ steps.commit.outputs.short }}-bin-win-rocm-${{ env.ROCM_VERSION }}-x64.zip .\build\bin\*
- name: Upload artifacts
@ -647,7 +650,7 @@ jobs:
- ROCM_VERSION: "7.2.1"
gpu_targets: "gfx908;gfx90a;gfx942;gfx1030;gfx1031;gfx1032;gfx1100;gfx1101;gfx1102;gfx1151;gfx1150;gfx1200;gfx1201"
build: 'x64'
- ROCM_VERSION: "7.13.0"
- ROCM_VERSION: "7.12.0"
gpu_targets: "gfx906;gfx908;gfx90a;gfx942;gfx950;gfx1100;gfx1101;gfx1102;gfx1150;gfx1151;gfx1200;gfx1201"
build: x64

2
.gitmodules vendored
View File

@ -1,6 +1,6 @@
[submodule "ggml"]
path = ggml
url = https://github.com/leejet/ggml.git
url = https://github.com/ggml-org/ggml.git
[submodule "examples/server/frontend"]
path = examples/server/frontend
url = https://github.com/leejet/sdcpp-webui.git

View File

@ -13,9 +13,7 @@ if (MSVC)
add_compile_definitions(_SILENCE_CXX17_CODECVT_HEADER_DEPRECATION_WARNING)
add_compile_options(
$<$<COMPILE_LANGUAGE:C>:/MP>
$<$<COMPILE_LANGUAGE:C>:/utf-8>
$<$<COMPILE_LANGUAGE:CXX>:/MP>
$<$<COMPILE_LANGUAGE:CXX>:/utf-8>
)
endif()

View File

@ -2,7 +2,7 @@ ARG UBUNTU_VERSION=24.04
FROM ubuntu:$UBUNTU_VERSION AS build
RUN apt-get update && apt-get install -y --no-install-recommends build-essential git cmake libvulkan-dev glslc spirv-headers
RUN apt-get update && apt-get install -y --no-install-recommends build-essential git cmake libvulkan-dev glslc
WORKDIR /sd.cpp

View File

@ -64,7 +64,6 @@ API and command-line option may change frequently.***
- [Qwen Image Edit series](./docs/qwen_image_edit.md)
- Video Models
- [Wan2.1/Wan2.2](./docs/wan.md)
- [LTX-2.3](./docs/ltx2.md)
- [PhotoMaker](https://github.com/TencentARC/PhotoMaker) support.
- Control Net support with SD 1.5
- LoRA support, same as [stable-diffusion-webui](https://github.com/AUTOMATIC1111/stable-diffusion-webui/wiki/Features#lora)
@ -148,7 +147,6 @@ For runtime and parameter backend placement, see the [backend selection guide](.
- [🔥Qwen Image](./docs/qwen_image.md)
- [🔥Qwen Image Edit series](./docs/qwen_image_edit.md)
- [🔥Wan2.1/Wan2.2](./docs/wan.md)
- [🔥LTX-2.3](./docs/ltx2.md)
- [🔥Z-Image](./docs/z_image.md)
- [Ovis-Image](./docs/ovis_image.md)
- [Anima](./docs/anima.md)

Binary file not shown.

Binary file not shown.

Binary file not shown.

View File

@ -102,11 +102,6 @@ cmake --build . --config Release
## Build with Vulkan
Install Vulkan SDK from https://www.lunarg.com/vulkan-sdk/.
On Ubuntu, install the Vulkan development packages and SPIR-V headers:
```shell
sudo apt-get install build-essential libvulkan-dev glslc spirv-headers
```
```shell
mkdir build && cd build

View File

@ -1,53 +0,0 @@
# How to Use
## Download weights
- Download LTX-2.3
- safetensors: https://huggingface.co/Kijai/LTX2.3_comfy/tree/main/diffusion_models
- gguf: https://huggingface.co/unsloth/LTX-2.3-GGUF/tree/main
- Download gemma-3-12b-it
- gguf: https://huggingface.co/unsloth/gemma-3-12b-it-GGUF/tree/main
- Download embeddings connectors
- safetensors: https://huggingface.co/unsloth/LTX-2.3-GGUF/tree/main/text_encoders
- Download VAE
- safetensors: https://huggingface.co/unsloth/LTX-2.3-GGUF/tree/main/vae
- Download audio VAE
- safetensors: https://huggingface.co/unsloth/LTX-2.3-GGUF/tree/main/vae
## Examples
### LTX-2.3 dev T2V
```
.\bin\Release\sd-cli.exe -M vid_gen --diffusion-model ..\..\ComfyUI\models\diffusion_models\ltx-2.3-22b-dev-UD-Q4_K_M.gguf --vae ..\..\ComfyUI\models\vae\ltx-2.3-22b-dev_video_vae.safetensors --audio-vae ..\..\ComfyUI\models\vae\ltx-2.3-22b-dev_audio_vae.safetensors --llm ..\..\ComfyUI\models\text_encoders\gemma-3-12b-it-qat-UD-Q4_K_XL.gguf --embeddings-connectors ..\..\ComfyUI\models\text_encoders\ltx-2.3-22b-dev_embeddings_connectors.safetensors -p "a lovely cat" --cfg-scale 6.0 --sampling-method euler -v -n "worst quality, low quality, blurry, distorted, artifacts" -W 1280 -H 720 --diffusion-fa --offload-to-cpu --video-frames 33 --fps 24 -o t2v.webm
```
<video
src="../assets/ltx2/t2v.webm"
controls
muted
style="max-width: 100%; height: auto;"></video>
### LTX-2.3 dev I2V
```
.\bin\Release\sd-cli.exe -M vid_gen --diffusion-model ..\..\ComfyUI\models\diffusion_models\ltx-2.3-22b-dev-UD-Q4_K_M.gguf --vae ..\..\ComfyUI\models\vae\ltx-2.3-22b-dev_video_vae.safetensors --audio-vae ..\..\ComfyUI\models\vae\ltx-2.3-22b-dev_audio_vae.safetensors --llm ..\..\ComfyUI\models\text_encoders\gemma-3-12b-it-qat-UD-Q4_K_XL.gguf --embeddings-connectors ..\..\ComfyUI\models\text_encoders\ltx-2.3-22b-dev_embeddings_connectors.safetensors -p "a lovely cat" --cfg-scale 6.0 --sampling-method euler -v -W 1280 -H 720 --diffusion-fa --offload-to-cpu --video-frames 33 -i ..\assets\ernie_image\turbo_example.png -o i2v.webm
```
<video
src="../assets/ltx2/i2v.webm"
controls
muted
style="max-width: 100%; height: auto;"></video>
### LTX-2.3 dev FLF2V
```
.\bin\Release\sd-cli.exe -M vid_gen --diffusion-model ..\..\ComfyUI\models\diffusion_models\ltx-2.3-22b-dev-UD-Q4_K_M.gguf --vae ..\..\ComfyUI\models\vae\ltx-2.3-22b-dev_video_vae.safetensors --audio-vae ..\..\ComfyUI\models\vae\ltx-2.3-22b-dev_audio_vae.safetensors --llm ..\..\ComfyUI\models\text_encoders\gemma-3-12b-it-qat-UD-Q4_K_XL.gguf --embeddings-connectors ..\..\ComfyUI\models\text_encoders\ltx-2.3-22b-dev_embeddings_connectors.safetensors -p "glass flower blossom" --cfg-scale 6.0 --sampling-method euler -v -W 1280 -H 720 --diffusion-fa --offload-to-cpu --video-frames 33 --init-img ..\..\ComfyUI\input\start_image.png --end-img ..\..\ComfyUI\input\end_image.png -o flf2v.webm
```
<video
src="../assets/ltx2/flf2v.webm"
controls
muted
style="max-width: 100%; height: auto;"></video>

View File

@ -7,10 +7,6 @@ add_executable(${TARGET}
image_metadata.cpp
main.cpp
)
target_include_directories(${TARGET} PRIVATE
"${CMAKE_CURRENT_SOURCE_DIR}/.."
"${PROJECT_SOURCE_DIR}/src"
)
install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE stable-diffusion zip ${CMAKE_THREAD_LIBS_INIT})
if(SD_WEBP)

View File

@ -103,9 +103,8 @@ Generation Options:
--hires-upscaler <string> highres fix upscaler, Lanczos, Nearest, Latent, Latent (nearest), Latent
(nearest-exact), Latent (antialiased), Latent (bicubic), Latent (bicubic
antialiased), or a model name under --hires-upscalers-dir (default: Latent)
--extra-sample-args <string> extra sampler/scheduler args, key=value list. lcm supports noise_clip_std,
noise_scale_start, noise_scale_end; ltx2 supports max_shift, base_shift,
stretch, terminal
--extra-sample-args <string> extra sampler args, key=value list. Currently lcm supports noise_clip_std,
noise_scale_start, noise_scale_end
-H, --height <int> image height, in pixel space (default: 512)
-W, --width <int> image width, in pixel space (default: 512)
--steps <int> number of sample steps (default: 20)
@ -161,7 +160,6 @@ Generation Options:
--disable-auto-resize-ref-image disable auto resize of ref images
--disable-image-metadata do not embed generation metadata on image files
--vae-tiling process vae in tiles to reduce memory usage
--temporal-tiling enable temporal tiling for LTX video VAE decode
--hires enable highres fix
-s, --seed RNG seed (default: 42, use random seed for < 0)
--sampling-method sampling method, one of [euler, euler_a, heun, dpm2, dpm++2s_a, dpm++2m,
@ -171,8 +169,8 @@ Generation Options:
dpm++2m, dpm++2mv2, ipndm, ipndm_v, lcm, ddim_trailing, tcd, res_multistep,
res_2s, er_sde, euler_cfg_pp, euler_a_cfg_pp] default: euler for Flux/SD3/Wan, euler_a otherwise
--scheduler denoiser sigma scheduler, one of [discrete, karras, exponential, ays, gits,
smoothstep, sgm_uniform, simple, kl_optimal, lcm, bong_tangent, ltx2], default:
model-specific
smoothstep, sgm_uniform, simple, kl_optimal, lcm, bong_tangent], default:
discrete
--sigmas custom sigma values for the sampler, comma-separated (e.g.,
"14.61,7.8,3.5,0.0").
--skip-layers layers to skip for SLG steps (default: [7,8,9])

View File

@ -385,32 +385,11 @@ std::string format_frame_idx(std::string pattern, int frame_idx) {
return result;
}
// Build the path of the ".wav" sidecar file that accompanies the CLI output.
//
// If the output path ends in an extension that encoded_image_format_from_path
// classifies as JPEG, PNG or WEBP, or in ".avi"/".webm" (compared
// case-insensitively), that extension is removed before ".wav" is appended.
// Any other extension is kept, so ".wav" is appended after it; a path without
// an extension simply gets ".wav" appended.
static fs::path get_video_audio_sidecar_path(const SDCliParams& cli_params) {
    const fs::path out_path = cli_params.output_path;
    const fs::path ext      = out_path.has_extension() ? out_path.extension() : fs::path{};

    std::string ext_lower = ext.string();
    // tolower() is undefined for negative char values, so route every byte
    // through unsigned char before lowering it.
    std::transform(ext_lower.begin(), ext_lower.end(), ext_lower.begin(),
                   [](unsigned char c) { return static_cast<char>(::tolower(c)); });

    const EncodedImageFormat output_format = encoded_image_format_from_path(out_path.string());

    const bool is_image_ext = output_format == EncodedImageFormat::JPEG ||
                              output_format == EncodedImageFormat::PNG ||
                              output_format == EncodedImageFormat::WEBP;
    const bool is_video_ext = ext_lower == ".avi" || ext_lower == ".webm";

    fs::path sidecar_path = out_path;
    if (!ext.empty() && (is_image_ext || is_video_ext)) {
        sidecar_path.replace_extension();
    }
    sidecar_path += ".wav";
    return sidecar_path;
}
bool save_results(const SDCliParams& cli_params,
const SDContextParams& ctx_params,
const SDGenerationParams& gen_params,
sd_image_t* results,
int num_results,
const sd_audio_t* generated_audio = nullptr) {
int num_results) {
if (results == nullptr || num_results <= 0) {
return false;
}
@ -463,21 +442,6 @@ bool save_results(const SDCliParams& cli_params,
return ok;
};
auto write_audio_sidecar = [&](const fs::path& wav_path) {
if (generated_audio == nullptr) {
return;
}
if (write_wav_to_file(wav_path.string(),
generated_audio->data,
generated_audio->sample_count,
generated_audio->channels,
generated_audio->sample_rate)) {
LOG_INFO("save result audio to '%s'", wav_path.string().c_str());
} else {
LOG_WARN("failed to save result audio to '%s'", wav_path.string().c_str());
}
};
int sucessful_reults = 0;
if (std::regex_search(cli_params.output_path, format_specifier_regex)) {
@ -501,16 +465,8 @@ bool save_results(const SDCliParams& cli_params,
ext = ".avi";
fs::path video_path = base_path;
video_path += ext;
std::string final_ext_lower = ext.string();
std::transform(final_ext_lower.begin(), final_ext_lower.end(), final_ext_lower.begin(), ::tolower);
const bool mux_audio = generated_audio != nullptr && (final_ext_lower == ".avi" || final_ext_lower == ".webm");
if (create_video_from_sd_images(video_path.string().c_str(), results, num_results, gen_params.fps, 90, mux_audio ? generated_audio : nullptr) == 0) {
if (create_video_from_sd_images(video_path.string().c_str(), results, num_results, gen_params.fps) == 0) {
LOG_INFO("save result video to '%s'", video_path.string().c_str());
if (generated_audio != nullptr && !mux_audio) {
fs::path wav_path = video_path;
wav_path.replace_extension(".wav");
write_audio_sidecar(wav_path);
}
return true;
} else {
LOG_ERROR("Failed to save result video to '%s'", video_path.string().c_str());
@ -532,9 +488,6 @@ bool save_results(const SDCliParams& cli_params,
}
}
LOG_INFO("%d/%d images saved", sucessful_reults, num_results);
if (generated_audio != nullptr) {
write_audio_sidecar(get_video_audio_sidecar_path(cli_params));
}
return sucessful_reults != 0;
}
@ -748,8 +701,7 @@ int main(int argc, const char* argv[]) {
sd_ctx_params_t sd_ctx_params = ctx_params.to_sd_ctx_params_t(vae_decode_only, true, cli_params.taesd_preview);
SDImageVec results;
int num_results = 0;
sd_audio_t* generated_audio = nullptr;
int num_results = 0;
if (cli_params.mode == UPSCALE) {
num_results = 1;
@ -781,10 +733,7 @@ int main(int argc, const char* argv[]) {
results.adopt(generate_image(sd_ctx.get(), &img_gen_params), num_results);
} else if (cli_params.mode == VID_GEN) {
sd_vid_gen_params_t vid_gen_params = gen_params.to_sd_vid_gen_params_t();
sd_image_t* generated_video = nullptr;
if (!generate_video(sd_ctx.get(), &vid_gen_params, &generated_video, &num_results, &generated_audio)) {
generated_video = nullptr;
}
sd_image_t* generated_video = generate_video(sd_ctx.get(), &vid_gen_params, &num_results);
results.adopt(generated_video, num_results);
}
@ -826,12 +775,9 @@ int main(int argc, const char* argv[]) {
}
}
if (!save_results(cli_params, ctx_params, gen_params, results.data(), num_results, generated_audio)) {
free_sd_audio(generated_audio);
if (!save_results(cli_params, ctx_params, gen_params, results.data(), num_results)) {
return 1;
}
free_sd_audio(generated_audio);
return 0;
}

View File

@ -340,18 +340,10 @@ ArgOptions SDContextParams::get_options() {
"--high-noise-diffusion-model",
"path to the standalone high noise diffusion model",
&high_noise_diffusion_model_path},
{"",
"--embeddings-connectors",
"path to LTXAV embeddings connectors",
&embeddings_connectors_path},
{"",
"--vae",
"path to standalone vae model",
&vae_path},
{"",
"--audio-vae",
"path to standalone LTX audio vae model",
&audio_vae_path},
{"",
"--taesd",
"path to taesd. Using Tiny AutoEncoder for fast decoding (low quality)",
@ -677,9 +669,7 @@ std::string SDContextParams::to_string() const {
<< " llm_vision_path: \"" << llm_vision_path << "\",\n"
<< " diffusion_model_path: \"" << diffusion_model_path << "\",\n"
<< " high_noise_diffusion_model_path: \"" << high_noise_diffusion_model_path << "\",\n"
<< " embeddings_connectors_path: \"" << embeddings_connectors_path << "\",\n"
<< " vae_path: \"" << vae_path << "\",\n"
<< " audio_vae_path: \"" << audio_vae_path << "\",\n"
<< " taesd_path: \"" << taesd_path << "\",\n"
<< " esrgan_path: \"" << esrgan_path << "\",\n"
<< " control_net_path: \"" << control_net_path << "\",\n"
@ -738,9 +728,7 @@ sd_ctx_params_t SDContextParams::to_sd_ctx_params_t(bool vae_decode_only, bool f
llm_vision_path.c_str(),
diffusion_model_path.c_str(),
high_noise_diffusion_model_path.c_str(),
embeddings_connectors_path.c_str(),
vae_path.c_str(),
audio_vae_path.c_str(),
taesd_path.c_str(),
control_net_path.c_str(),
embedding_vec.data(),
@ -833,7 +821,7 @@ ArgOptions SDGenerationParams::get_options() {
&hires_upscaler},
{"",
"--extra-sample-args",
"extra sampler/scheduler args, key=value list. lcm supports noise_clip_std, noise_scale_start, noise_scale_end; ltx2 supports max_shift, base_shift, stretch, terminal",
"extra sampler args, key=value list. Currently lcm supports noise_clip_std, noise_scale_start, noise_scale_end",
&extra_sample_args},
};
@ -1018,11 +1006,6 @@ ArgOptions SDGenerationParams::get_options() {
"process vae in tiles to reduce memory usage",
true,
&vae_tiling_params.enabled},
{"",
"--temporal-tiling",
"enable temporal tiling for LTX video VAE decode",
true,
&vae_tiling_params.temporal_tiling},
{"",
"--hires",
"enable highres fix",
@ -1287,7 +1270,7 @@ ArgOptions SDGenerationParams::get_options() {
on_high_noise_sample_method_arg},
{"",
"--scheduler",
"denoiser sigma scheduler, one of [discrete, karras, exponential, ays, gits, smoothstep, sgm_uniform, simple, kl_optimal, lcm, bong_tangent, ltx2], default: model-specific",
"denoiser sigma scheduler, one of [discrete, karras, exponential, ays, gits, smoothstep, sgm_uniform, simple, kl_optimal, lcm, bong_tangent], default: discrete",
on_scheduler_arg},
{"",
"--sigmas",
@ -1720,9 +1703,6 @@ bool SDGenerationParams::from_json_str(
if (tiling_json.contains("enabled") && tiling_json["enabled"].is_boolean()) {
vae_tiling_params.enabled = tiling_json["enabled"];
}
if (tiling_json.contains("temporal_tiling") && tiling_json["temporal_tiling"].is_boolean()) {
vae_tiling_params.temporal_tiling = tiling_json["temporal_tiling"];
}
if (tiling_json.contains("tile_size_x") && tiling_json["tile_size_x"].is_number_integer()) {
vae_tiling_params.tile_size_x = tiling_json["tile_size_x"];
}
@ -2232,7 +2212,6 @@ sd_vid_gen_params_t SDGenerationParams::to_sd_vid_gen_params_t() {
params.strength = strength;
params.seed = seed;
params.video_frames = video_frames;
params.fps = fps;
params.vace_strength = vace_strength;
params.vae_tiling_params = vae_tiling_params;
params.cache = cache_params;
@ -2321,7 +2300,6 @@ std::string SDGenerationParams::to_string() const {
<< ", upscale_tile_size: " << hires_upscale_tile_size << " },\n"
<< " vae_tiling_params: { "
<< vae_tiling_params.enabled << ", "
<< vae_tiling_params.temporal_tiling << ", "
<< vae_tiling_params.tile_size_x << ", "
<< vae_tiling_params.tile_size_y << ", "
<< vae_tiling_params.target_overlap << ", "

View File

@ -92,9 +92,7 @@ struct SDContextParams {
std::string llm_vision_path;
std::string diffusion_model_path;
std::string high_noise_diffusion_model_path;
std::string embeddings_connectors_path;
std::string vae_path;
std::string audio_vae_path;
std::string taesd_path;
std::string esrgan_path;
std::string control_net_path;
@ -189,7 +187,7 @@ struct SDGenerationParams {
int video_frames = 1;
int fps = 16;
float vace_strength = 1.f;
sd_tiling_params_t vae_tiling_params = {false, false, 0, 0, 0.5f, 0.0f, 0.0f};
sd_tiling_params_t vae_tiling_params = {false, 0, 0, 0.5f, 0.0f, 0.0f};
std::string pm_id_images_dir;
std::string pm_id_embed_path;

View File

@ -613,13 +613,6 @@ typedef struct {
uint32_t size;
} avi_index_entry;
// One record of the AVI "idx1" index. An entry is collected for every chunk
// appended to the "movi" list and later serialized as fourcc, flags, offset, size.
typedef struct {
// Chunk id copied into the index: "00dc" for a video frame, "01wb" for the audio chunk.
char fourcc[4];
// Index flags: 0x10 is stored for video frames, 0 for the audio chunk.
uint32_t flags;
// Size of the output buffer captured right before the chunk header is written.
uint32_t offset;
// Payload size in bytes; the chunk header and any trailing pad byte are not counted.
uint32_t size;
} avi_chunk_index_entry;
// Write `val` to `f` as four bytes, least-significant byte first.
//
// The bytes are produced by shifting rather than by dumping the in-memory
// representation, so the output is little-endian on every host. As before,
// a short write is not reported to the caller.
void write_u32_le(FILE* f, uint32_t val) {
    const uint8_t bytes[4] = {
        static_cast<uint8_t>(val & 0xFFu),
        static_cast<uint8_t>((val >> 8) & 0xFFu),
        static_cast<uint8_t>((val >> 16) & 0xFFu),
        static_cast<uint8_t>((val >> 24) & 0xFFu),
    };
    fwrite(bytes, 1, sizeof(bytes), f);
}
@ -654,33 +647,6 @@ void write_fourcc(std::vector<uint8_t>& data, const char* fourcc) {
data.insert(data.end(), fourcc, fourcc + 4);
}
// Convert the float samples of `audio` into 16-bit signed PCM, returned as raw
// bytes in little-endian order.
//
// Returns an empty vector when `audio` is null or when its data pointer,
// sample count, channel count or sample rate is zero. Otherwise
// sample_count * channels values are read from audio->data; each one is
// clamped to [-1, 1], scaled by 32767 and rounded with std::lrint.
static std::vector<uint8_t> audio_to_pcm16_bytes(const sd_audio_t* audio) {
    if (audio == nullptr || audio->data == nullptr || audio->sample_count == 0 || audio->channels == 0 || audio->sample_rate == 0) {
        return {};
    }

    const size_t pcm_samples = static_cast<size_t>(audio->sample_count) * static_cast<size_t>(audio->channels);
    std::vector<uint8_t> bytes(pcm_samples * sizeof(int16_t));

    for (size_t i = 0; i < pcm_samples; ++i) {
        const float sample = std::clamp(audio->data[i], -1.0f, 1.0f);
        const auto quantized = static_cast<int16_t>(std::lrint(sample * 32767.0f));
        // Emit the two bytes explicitly instead of aliasing the byte buffer as
        // int16_t: no type punning, and the layout no longer depends on the host.
        const auto raw               = static_cast<uint16_t>(quantized);
        bytes[i * sizeof(int16_t)]     = static_cast<uint8_t>(raw & 0xFFu);
        bytes[i * sizeof(int16_t) + 1] = static_cast<uint8_t>(raw >> 8);
    }
    return bytes;
}
// Return the half-open range [begin, end) of audio sample indices that belongs
// to video frame `frame_idx` when audio->sample_count samples are spread evenly
// over `num_frames` frames.
//
// The last frame always ends at sample_count, so rounding never drops trailing
// samples. `fps` is only validated; it does not enter the computation.
// Returns {0, 0} when `audio` is null, when `fps` or `num_frames` is not
// positive, or when `frame_idx` lies outside [0, num_frames).
static std::pair<uint64_t, uint64_t> audio_sample_range_for_video_frame(const sd_audio_t* audio, int frame_idx, int num_frames, int fps) {
    if (audio == nullptr || fps <= 0 || num_frames <= 0) {
        return {0, 0};
    }
    // A negative index would make the float -> unsigned conversion below
    // undefined, and an index past the end would produce a range beyond
    // sample_count; treat both as "no samples".
    if (frame_idx < 0 || frame_idx >= num_frames) {
        return {0, 0};
    }

    const uint64_t total = audio->sample_count;
    const auto boundary  = [total, num_frames](int idx) -> uint64_t {
        return static_cast<uint64_t>((static_cast<long double>(idx) * total) / num_frames);
    };

    const uint64_t begin = boundary(frame_idx);
    const uint64_t end   = (frame_idx + 1 == num_frames) ? total : boundary(frame_idx + 1);
    return {begin, std::max(begin, end)};
}
EncodedImageFormat encoded_image_format_from_path(const std::string& path) {
std::string ext = fs::path(path).extension().string();
std::transform(ext.begin(), ext.end(), ext.begin(), ::tolower);
@ -810,7 +776,7 @@ uint8_t* load_image_from_memory(const char* image_bytes,
return load_image_common(true, image_bytes, len, width, height, expected_width, expected_height, expected_channel);
}
std::vector<uint8_t> create_mjpg_avi_from_sd_images_to_vector(sd_image_t* images, int num_images, int fps, int quality, const sd_audio_t* audio) {
std::vector<uint8_t> create_mjpg_avi_from_sd_images_to_vector(sd_image_t* images, int num_images, int fps, int quality) {
if (num_images == 0) {
fprintf(stderr, "Error: Image array is empty.\n");
return {};
@ -827,13 +793,7 @@ std::vector<uint8_t> create_mjpg_avi_from_sd_images_to_vector(sd_image_t* images
// stb_image_write changes JPEG sampling behavior above quality 90.
// MJPG AVI playback is more compatible when we keep the encoder on the
// <= 90 path.
const int mjpg_quality = std::clamp(quality, 1, 90);
const bool has_audio = audio != nullptr && audio->data != nullptr && audio->sample_count > 0 && audio->channels > 0 && audio->sample_rate > 0;
const std::vector<uint8_t> audio_pcm = audio_to_pcm16_bytes(audio);
const uint16_t audio_bits_per_sample = 16;
const uint16_t audio_block_align = has_audio ? static_cast<uint16_t>(audio->channels * (audio_bits_per_sample / 8)) : 0;
const uint32_t audio_byte_rate = has_audio ? static_cast<uint32_t>(audio->sample_rate * audio_block_align) : 0;
const uint32_t audio_data_size = has_audio ? static_cast<uint32_t>(audio_pcm.size()) : 0;
const int mjpg_quality = std::clamp(quality, 1, 90);
std::vector<uint8_t> avi_data;
avi_data.reserve(static_cast<size_t>(num_images) * 1024);
@ -844,11 +804,7 @@ std::vector<uint8_t> create_mjpg_avi_from_sd_images_to_vector(sd_image_t* images
write_fourcc(avi_data, "AVI ");
write_fourcc(avi_data, "LIST");
uint32_t hdrl_size = 4 + 8 + 56 + 8 + 4 + 8 + 56 + 8 + 40;
if (has_audio) {
hdrl_size += 8 + (4 + 8 + 56 + 8 + 16);
}
write_u32_le(avi_data, hdrl_size);
write_u32_le(avi_data, 4 + 8 + 56 + 8 + 4 + 8 + 56 + 8 + 40);
write_fourcc(avi_data, "hdrl");
write_fourcc(avi_data, "avih");
@ -859,7 +815,7 @@ std::vector<uint8_t> create_mjpg_avi_from_sd_images_to_vector(sd_image_t* images
write_u32_le(avi_data, 0x110);
write_u32_le(avi_data, num_images);
write_u32_le(avi_data, 0);
write_u32_le(avi_data, has_audio ? 2 : 1);
write_u32_le(avi_data, 1);
write_u32_le(avi_data, width * height * 3);
write_u32_le(avi_data, width);
write_u32_le(avi_data, height);
@ -906,48 +862,12 @@ std::vector<uint8_t> create_mjpg_avi_from_sd_images_to_vector(sd_image_t* images
write_u32_le(avi_data, 0);
write_u32_le(avi_data, 0);
if (has_audio) {
write_fourcc(avi_data, "LIST");
write_u32_le(avi_data, 4 + 8 + 56 + 8 + 16);
write_fourcc(avi_data, "strl");
write_fourcc(avi_data, "strh");
write_u32_le(avi_data, 56);
write_fourcc(avi_data, "auds");
write_u32_le(avi_data, 0);
write_u32_le(avi_data, 0);
write_u16_le(avi_data, 0);
write_u16_le(avi_data, 0);
write_u32_le(avi_data, 0);
write_u32_le(avi_data, audio_block_align);
write_u32_le(avi_data, audio_byte_rate);
write_u32_le(avi_data, 0);
write_u32_le(avi_data, static_cast<uint32_t>(audio->sample_count));
write_u32_le(avi_data, audio_data_size);
write_u32_le(avi_data, static_cast<uint32_t>(-1));
write_u32_le(avi_data, audio_block_align);
write_u16_le(avi_data, 0);
write_u16_le(avi_data, 0);
write_u16_le(avi_data, 0);
write_u16_le(avi_data, 0);
write_fourcc(avi_data, "strf");
write_u32_le(avi_data, 16);
write_u16_le(avi_data, 1);
write_u16_le(avi_data, static_cast<uint16_t>(audio->channels));
write_u32_le(avi_data, audio->sample_rate);
write_u32_le(avi_data, audio_byte_rate);
write_u16_le(avi_data, audio_block_align);
write_u16_le(avi_data, audio_bits_per_sample);
}
write_fourcc(avi_data, "LIST");
const size_t movi_size_pos = avi_data.size();
write_u32_le(avi_data, 0);
write_fourcc(avi_data, "movi");
std::vector<avi_chunk_index_entry> index;
index.reserve(static_cast<size_t>(num_images) + (has_audio ? 1 : 0));
std::vector<avi_index_entry> index(static_cast<size_t>(num_images));
std::vector<uint8_t> jpeg_data;
for (int i = 0; i < num_images; i++) {
@ -964,46 +884,27 @@ std::vector<uint8_t> create_mjpg_avi_from_sd_images_to_vector(sd_image_t* images
return {};
}
avi_chunk_index_entry video_entry = {};
memcpy(video_entry.fourcc, "00dc", 4);
video_entry.flags = 0x10;
video_entry.offset = static_cast<uint32_t>(avi_data.size());
index[i].offset = static_cast<uint32_t>(avi_data.size());
write_fourcc(avi_data, "00dc");
write_u32_le(avi_data, static_cast<uint32_t>(jpeg_data.size()));
video_entry.size = static_cast<uint32_t>(jpeg_data.size());
index[i].size = (uint32_t)jpeg_data.size();
avi_data.insert(avi_data.end(), jpeg_data.begin(), jpeg_data.end());
index.push_back(video_entry);
if (jpeg_data.size() % 2) {
avi_data.push_back(0);
}
}
if (has_audio && !audio_pcm.empty()) {
avi_chunk_index_entry audio_entry = {};
memcpy(audio_entry.fourcc, "01wb", 4);
audio_entry.flags = 0;
audio_entry.offset = static_cast<uint32_t>(avi_data.size());
audio_entry.size = static_cast<uint32_t>(audio_pcm.size());
write_fourcc(avi_data, "01wb");
write_u32_le(avi_data, static_cast<uint32_t>(audio_pcm.size()));
avi_data.insert(avi_data.end(), audio_pcm.begin(), audio_pcm.end());
index.push_back(audio_entry);
if (audio_pcm.size() % 2 != 0) {
avi_data.push_back(0);
}
}
const size_t movi_size = avi_data.size() - movi_size_pos - 4;
patch_u32_le(avi_data, movi_size_pos, static_cast<uint32_t>(movi_size));
write_fourcc(avi_data, "idx1");
write_u32_le(avi_data, static_cast<uint32_t>(index.size() * 16));
for (const auto& entry : index) {
write_fourcc(avi_data, entry.fourcc);
write_u32_le(avi_data, entry.flags);
write_u32_le(avi_data, entry.offset);
write_u32_le(avi_data, entry.size);
write_u32_le(avi_data, num_images * 16);
for (int i = 0; i < num_images; i++) {
write_fourcc(avi_data, "00dc");
write_u32_le(avi_data, 0x10);
write_u32_le(avi_data, index[i].offset);
write_u32_le(avi_data, index[i].size);
}
const size_t file_size = avi_data.size() - riff_size_pos - 4;
@ -1012,8 +913,8 @@ std::vector<uint8_t> create_mjpg_avi_from_sd_images_to_vector(sd_image_t* images
return avi_data;
}
int create_mjpg_avi_from_sd_images(const char* filename, sd_image_t* images, int num_images, int fps, int quality, const sd_audio_t* audio) {
std::vector<uint8_t> avi_data = create_mjpg_avi_from_sd_images_to_vector(images, num_images, fps, quality, audio);
int create_mjpg_avi_from_sd_images(const char* filename, sd_image_t* images, int num_images, int fps, int quality) {
std::vector<uint8_t> avi_data = create_mjpg_avi_from_sd_images_to_vector(images, num_images, fps, quality);
if (avi_data.empty()) {
return -1;
}
@ -1143,7 +1044,7 @@ int create_animated_webp_from_sd_images(const char* filename, sd_image_t* images
#endif
#ifdef SD_USE_WEBM
std::vector<uint8_t> create_webm_from_sd_images_to_vector(sd_image_t* images, int num_images, int fps, int quality, const sd_audio_t* audio) {
std::vector<uint8_t> create_webm_from_sd_images_to_vector(sd_image_t* images, int num_images, int fps, int quality) {
if (num_images == 0) {
fprintf(stderr, "Error: Image array is empty.\n");
return {};
@ -1188,25 +1089,6 @@ std::vector<uint8_t> create_webm_from_sd_images_to_vector(sd_image_t* images, in
video_track->set_display_height(static_cast<uint64_t>(height));
video_track->set_frame_rate(static_cast<double>(fps));
}
uint64_t audio_track_number = 0;
std::vector<uint8_t> audio_pcm = audio_to_pcm16_bytes(audio);
if (audio != nullptr && !audio_pcm.empty()) {
audio_track_number = segment.AddAudioTrack(static_cast<int32_t>(audio->sample_rate), static_cast<int32_t>(audio->channels), 0);
if (audio_track_number == 0) {
fprintf(stderr, "Error: Failed to add audio track.\n");
return -1;
}
auto* audio_track = static_cast<mkvmuxer::AudioTrack*>(segment.GetTrackByNumber(audio_track_number));
if (audio_track == nullptr) {
fprintf(stderr, "Error: Failed to get audio track.\n");
return -1;
}
audio_track->set_codec_id("A_PCM/INT/LIT");
audio_track->set_bit_depth(16);
audio_track->set_sample_rate(static_cast<double>(audio->sample_rate));
audio_track->set_channels(audio->channels);
}
segment.GetSegmentInfo()->set_writing_app("stable-diffusion.cpp");
segment.GetSegmentInfo()->set_muxing_app("stable-diffusion.cpp");
@ -1236,23 +1118,6 @@ std::vector<uint8_t> create_webm_from_sd_images_to_vector(sd_image_t* images, in
return -1;
}
if (audio_track_number != 0) {
auto [audio_begin, audio_end] = audio_sample_range_for_video_frame(audio, i, num_images, fps);
const uint64_t frame_samples = audio_end - audio_begin;
if (frame_samples > 0) {
const uint64_t frame_bytes = frame_samples * audio->channels * sizeof(int16_t);
const uint8_t* frame_ptr = audio_pcm.data() + audio_begin * audio->channels * sizeof(int16_t);
if (!segment.AddFrame(frame_ptr,
frame_bytes,
audio_track_number,
timestamp_ns,
true)) {
fprintf(stderr, "Error: Failed to mux audio chunk %d into WebM.\n", i);
return -1;
}
}
}
timestamp_ns += frame_duration_ns;
}
@ -1268,8 +1133,8 @@ std::vector<uint8_t> create_webm_from_sd_images_to_vector(sd_image_t* images, in
return writer.data();
}
int create_webm_from_sd_images(const char* filename, sd_image_t* images, int num_images, int fps, int quality, const sd_audio_t* audio) {
std::vector<uint8_t> webm_data = create_webm_from_sd_images_to_vector(images, num_images, fps, quality, audio);
int create_webm_from_sd_images(const char* filename, sd_image_t* images, int num_images, int fps, int quality) {
std::vector<uint8_t> webm_data = create_webm_from_sd_images_to_vector(images, num_images, fps, quality);
if (webm_data.empty()) {
return -1;
}
@ -1285,8 +1150,7 @@ std::vector<uint8_t> create_video_from_sd_images_to_vector(const std::string& ou
sd_image_t* images,
int num_images,
int fps,
int quality,
const sd_audio_t* audio) {
int quality) {
std::string format = output_format;
std::transform(format.begin(), format.end(), format.begin(),
[](unsigned char c) { return static_cast<char>(tolower(c)); });
@ -1296,7 +1160,7 @@ std::vector<uint8_t> create_video_from_sd_images_to_vector(const std::string& ou
#ifdef SD_USE_WEBM
if (format == "webm") {
return create_webm_from_sd_images_to_vector(images, num_images, fps, quality, audio);
return create_webm_from_sd_images_to_vector(images, num_images, fps, quality);
}
#endif
@ -1306,14 +1170,14 @@ std::vector<uint8_t> create_video_from_sd_images_to_vector(const std::string& ou
}
#endif
return create_mjpg_avi_from_sd_images_to_vector(images, num_images, fps, quality, audio);
return create_mjpg_avi_from_sd_images_to_vector(images, num_images, fps, quality);
}
int create_video_from_sd_images(const char* filename, sd_image_t* images, int num_images, int fps, int quality, const sd_audio_t* audio) {
int create_video_from_sd_images(const char* filename, sd_image_t* images, int num_images, int fps, int quality) {
std::string path = filename ? filename : "";
auto pos = path.find_last_of('.');
std::string ext = pos == std::string::npos ? "" : path.substr(pos);
std::vector<uint8_t> video_data = create_video_from_sd_images_to_vector(ext, images, num_images, fps, quality, audio);
std::vector<uint8_t> video_data = create_video_from_sd_images_to_vector(ext, images, num_images, fps, quality);
if (video_data.empty()) {
return -1;
}
@ -1323,54 +1187,3 @@ int create_video_from_sd_images(const char* filename, sd_image_t* images, int nu
}
return 0;
}
// Write float audio to `path` as a 16-bit PCM RIFF/WAVE file.
//
//   path                 destination file; created or truncated.
//   interleaved_samples  sample_count * channels floats.
//   sample_count         number of samples per channel.
//   channels             channel count.
//   sample_rate          samples per second per channel.
//
// Each value is clamped to [-1, 1], scaled by 32767 and rounded with
// std::lrint. All header fields and samples are written little-endian
// regardless of the host byte order.
//
// Returns false when an argument is null or zero, when the stream does not fit
// the 16-bit/32-bit fields of the WAVE header (the file is not created in that
// case), when the file cannot be opened, or when writing fails.
bool write_wav_to_file(const std::string& path,
                       const float* interleaved_samples,
                       uint64_t sample_count,
                       uint32_t channels,
                       uint32_t sample_rate) {
    if (interleaved_samples == nullptr || sample_count == 0 || channels == 0 || sample_rate == 0) {
        return false;
    }

    const uint32_t bits_per_sample  = 16;
    const uint32_t bytes_per_sample = bits_per_sample / 8;
    const uint32_t header_tail_size = 36;  // bytes counted by the RIFF size field besides the sample data

    // The fmt chunk stores the channel count and the block alignment in
    // 16-bit fields; refuse anything that would be truncated.
    if (channels > UINT16_MAX / bytes_per_sample) {
        return false;
    }
    const uint32_t block_align = channels * bytes_per_sample;

    if (sample_rate > UINT32_MAX / block_align) {
        return false;
    }
    const uint32_t byte_rate = sample_rate * block_align;

    // Both the data chunk size and the RIFF size (36 + data) are 32-bit.
    const uint32_t max_data_size = UINT32_MAX - header_tail_size;
    if (sample_count > max_data_size / block_align) {
        return false;
    }
    const uint32_t data_size = static_cast<uint32_t>(sample_count * block_align);
    const uint32_t riff_size = header_tail_size + data_size;

    std::ofstream file(path, std::ios::binary);
    if (!file.is_open()) {
        return false;
    }

    const auto put_u16 = [&file](uint16_t v) {
        const char b[2] = {static_cast<char>(v & 0xFFu), static_cast<char>((v >> 8) & 0xFFu)};
        file.write(b, sizeof(b));
    };
    const auto put_u32 = [&file](uint32_t v) {
        const char b[4] = {static_cast<char>(v & 0xFFu),
                           static_cast<char>((v >> 8) & 0xFFu),
                           static_cast<char>((v >> 16) & 0xFFu),
                           static_cast<char>((v >> 24) & 0xFFu)};
        file.write(b, sizeof(b));
    };

    file.write("RIFF", 4);
    put_u32(riff_size);
    file.write("WAVE", 4);

    file.write("fmt ", 4);
    put_u32(16);  // fmt chunk size
    put_u16(1);   // audio format 1 = PCM
    put_u16(static_cast<uint16_t>(channels));
    put_u32(sample_rate);
    put_u32(byte_rate);
    put_u16(static_cast<uint16_t>(block_align));
    put_u16(static_cast<uint16_t>(bits_per_sample));

    file.write("data", 4);
    put_u32(data_size);

    std::vector<char> pcm(data_size);
    const size_t value_count = pcm.size() / bytes_per_sample;
    for (size_t i = 0; i < value_count; ++i) {
        const float sample   = std::max(-1.0f, std::min(1.0f, interleaved_samples[i]));
        const auto quantized = static_cast<int16_t>(std::lrint(sample * 32767.0f));
        const auto raw       = static_cast<uint16_t>(quantized);
        pcm[2 * i]           = static_cast<char>(raw & 0xFFu);
        pcm[2 * i + 1]       = static_cast<char>(raw >> 8);
    }
    file.write(pcm.data(), static_cast<std::streamsize>(pcm.size()));

    return file.good();
}

View File

@ -57,13 +57,11 @@ int create_mjpg_avi_from_sd_images(const char* filename,
sd_image_t* images,
int num_images,
int fps,
int quality = 90,
const sd_audio_t* audio = nullptr);
int quality = 90);
std::vector<uint8_t> create_mjpg_avi_from_sd_images_to_vector(sd_image_t* images,
int num_images,
int fps,
int quality = 90,
const sd_audio_t* audio = nullptr);
int quality = 90);
#ifdef SD_USE_WEBP
int create_animated_webp_from_sd_images(const char* filename,
@ -82,32 +80,22 @@ int create_webm_from_sd_images(const char* filename,
sd_image_t* images,
int num_images,
int fps,
int quality = 90,
const sd_audio_t* audio = nullptr);
int quality = 90);
std::vector<uint8_t> create_webm_from_sd_images_to_vector(sd_image_t* images,
int num_images,
int fps,
int quality = 90,
const sd_audio_t* audio = nullptr);
int quality = 90);
#endif
int create_video_from_sd_images(const char* filename,
sd_image_t* images,
int num_images,
int fps,
int quality = 90,
const sd_audio_t* audio = nullptr);
int quality = 90);
std::vector<uint8_t> create_video_from_sd_images_to_vector(const std::string& output_format,
sd_image_t* images,
int num_images,
int fps,
int quality = 90,
const sd_audio_t* audio = nullptr);
bool write_wav_to_file(const std::string& path,
const float* interleaved_samples,
uint64_t sample_count,
uint32_t channels,
uint32_t sample_rate);
int quality = 90);
#endif // __MEDIA_IO_H__

View File

@ -205,9 +205,8 @@ Default Generation Options:
--hires-upscaler <string> highres fix upscaler, Lanczos, Nearest, Latent, Latent (nearest), Latent
(nearest-exact), Latent (antialiased), Latent (bicubic), Latent (bicubic
antialiased), or a model name under --hires-upscalers-dir (default: Latent)
--extra-sample-args <string> extra sampler/scheduler args, key=value list. lcm supports noise_clip_std,
noise_scale_start, noise_scale_end; ltx2 supports max_shift, base_shift,
stretch, terminal
--extra-sample-args <string> extra sampler args, key=value list. Currently lcm supports noise_clip_std,
noise_scale_start, noise_scale_end
-H, --height <int> image height, in pixel space (default: 512)
-W, --width <int> image width, in pixel space (default: 512)
--steps <int> number of sample steps (default: 20)
@ -272,8 +271,8 @@ Default Generation Options:
dpm++2m, dpm++2mv2, ipndm, ipndm_v, lcm, ddim_trailing, tcd, res_multistep,
res_2s, er_sde, euler_cfg_pp, euler_a_cfg_pp] default: euler for Flux/SD3/Wan, euler_a otherwise
--scheduler denoiser sigma scheduler, one of [discrete, karras, exponential, ays, gits,
smoothstep, sgm_uniform, simple, kl_optimal, lcm, bong_tangent, ltx2], default:
model-specific
smoothstep, sgm_uniform, simple, kl_optimal, lcm, bong_tangent], default:
discrete
--sigmas custom sigma values for the sampler, comma-separated (e.g.,
"14.61,7.8,3.5,0.0").
--skip-layers layers to skip for SLG steps (default: [7,8,9])

View File

@ -231,21 +231,16 @@ bool execute_vid_gen_job(ServerRuntime& runtime,
sd_vid_gen_params_t params = job.vid_gen.to_sd_vid_gen_params_t();
SDImageVec results;
int num_results = 0;
sd_audio_t* generated_audio = nullptr;
int num_results = 0;
{
std::lock_guard<std::mutex> lock(*runtime.sd_ctx_mutex);
sd_image_t* raw_results = nullptr;
if (!generate_video(runtime.sd_ctx, &params, &raw_results, &num_results, &generated_audio)) {
raw_results = nullptr;
}
sd_image_t* raw_results = generate_video(runtime.sd_ctx, &params, &num_results);
results.adopt(raw_results, num_results);
}
num_results = results.count();
if (num_results <= 0) {
free_sd_audio(generated_audio);
error_message = "generate_video returned no results";
return false;
}
@ -254,9 +249,7 @@ bool execute_vid_gen_job(ServerRuntime& runtime,
results.data(),
num_results,
job.vid_gen.gen_params.fps,
job.vid_gen.output_compression,
generated_audio);
free_sd_audio(generated_audio);
job.vid_gen.output_compression);
if (video_bytes.empty()) {
error_message = "failed to encode generated video container";
return false;

2
ggml

@ -1 +1 @@
Subproject commit 7f4ab364b2843921e795d6890d0f42dd5e5d6b63
Subproject commit 404fcb9d7c96989569e68c9e7881ee3465a05c50

View File

@ -68,7 +68,6 @@ enum scheduler_t {
KL_OPTIMAL_SCHEDULER,
LCM_SCHEDULER,
BONG_TANGENT_SCHEDULER,
LTX2_SCHEDULER,
SCHEDULER_COUNT
};
@ -152,7 +151,6 @@ enum lora_apply_mode_t {
typedef struct {
bool enabled;
bool temporal_tiling;
int tile_size_x;
int tile_size_y;
float target_overlap;
@ -175,9 +173,7 @@ typedef struct {
const char* llm_vision_path;
const char* diffusion_model_path;
const char* high_noise_diffusion_model_path;
const char* embeddings_connectors_path;
const char* vae_path;
const char* audio_vae_path;
const char* taesd_path;
const char* control_net_path;
const sd_embedding_t* embeddings;
@ -214,13 +210,6 @@ typedef struct {
const char* params_backend;
} sd_ctx_params_t;
typedef struct {
uint32_t sample_rate;
uint32_t channels;
uint64_t sample_count;
float* data;
} sd_audio_t;
typedef struct {
uint32_t width;
uint32_t height;
@ -376,7 +365,6 @@ typedef struct {
float strength;
int64_t seed;
int video_frames;
int fps;
float vace_strength;
sd_tiling_params_t vae_tiling_params;
sd_cache_params_t cache;
@ -421,7 +409,6 @@ SD_API char* sd_ctx_params_to_str(const sd_ctx_params_t* sd_ctx_params);
SD_API sd_ctx_t* new_sd_ctx(const sd_ctx_params_t* sd_ctx_params);
SD_API void free_sd_ctx(sd_ctx_t* sd_ctx);
SD_API void free_sd_audio(sd_audio_t* audio);
SD_API void sd_sample_params_init(sd_sample_params_t* sample_params);
SD_API char* sd_sample_params_to_str(const sd_sample_params_t* sample_params);
@ -434,11 +421,7 @@ SD_API char* sd_img_gen_params_to_str(const sd_img_gen_params_t* sd_img_gen_para
SD_API sd_image_t* generate_image(sd_ctx_t* sd_ctx, const sd_img_gen_params_t* sd_img_gen_params);
SD_API void sd_vid_gen_params_init(sd_vid_gen_params_t* sd_vid_gen_params);
SD_API bool generate_video(sd_ctx_t* sd_ctx,
const sd_vid_gen_params_t* sd_vid_gen_params,
sd_image_t** frames_out,
int* num_frames_out,
sd_audio_t** audio_out);
SD_API sd_image_t* generate_video(sd_ctx_t* sd_ctx, const sd_vid_gen_params_t* sd_vid_gen_params, int* num_frames_out);
typedef struct upscaler_ctx_t upscaler_ctx_t;

View File

@ -103,64 +103,6 @@ namespace DiT {
x = ggml_ext_slice(ctx, x, 0, 0, W); // [N, C, H, W]
return x;
}
// Cut a video latent into non-overlapping pt x ph x pw patches and flatten every
// patch, together with the channel axis, into one token.
//
//   x:      [N*C, T, H, W]
//   return: [N, t_len*h_len*w_len, C*pt*ph*pw]   (t_len = T/pt, h_len = H/ph, w_len = W/pw)
//
// T, H and W must be exact multiples of pt, ph and pw, and x->ne[3] must be an
// exact multiple of N; both conditions are asserted.
inline ggml_tensor* patchify(ggml_context* ctx,
                             ggml_tensor* x,
                             int pt,
                             int ph,
                             int pw,
                             int64_t N = 1) {
    const int64_t channels = x->ne[3] / N;
    const int64_t frames   = x->ne[2];
    const int64_t height   = x->ne[1];
    const int64_t width    = x->ne[0];

    const int64_t t_len = frames / pt;
    const int64_t h_len = height / ph;
    const int64_t w_len = width / pw;

    GGML_ASSERT(channels * N == x->ne[3]);
    GGML_ASSERT(t_len * pt == frames && h_len * ph == height && w_len * pw == width);

    // Every step below is "reshape, then swap the two middle (torch-order) axes".
    const auto swap_middle = [ctx](ggml_tensor* t) {
        return ggml_ext_cont(ctx, ggml_ext_torch_permute(ctx, t, 0, 2, 1, 3));
    };

    const int64_t row_len      = pw * w_len;
    const int64_t patch_volume = static_cast<int64_t>(pw) * ph * pt;
    const int64_t token_count  = w_len * h_len * t_len;
    const int64_t outer_t      = t_len * channels * N;
    const int64_t outer_th     = h_len * t_len * channels * N;

    x = ggml_reshape_4d(ctx, x, row_len, ph * h_len, pt, outer_t);  // [N*C*t_len, pt, h_len*ph, w_len*pw]
    x = swap_middle(x);                                             // [N*C*t_len, h_len*ph, pt, w_len*pw]
    x = ggml_reshape_4d(ctx, x, row_len, pt, ph, outer_th);         // [N*C*t_len*h_len, ph, pt, w_len*pw]
    x = swap_middle(x);                                             // [N*C*t_len*h_len, pt, ph, w_len*pw]
    x = ggml_reshape_4d(ctx, x, pw, w_len, ph * pt, outer_th);      // [N*C*t_len*h_len, pt*ph, w_len, pw]
    x = swap_middle(x);                                             // [N*C*t_len*h_len, w_len, pt*ph, pw]
    x = ggml_reshape_4d(ctx, x, patch_volume, token_count, channels, N);  // [N, C, t_len*h_len*w_len, pt*ph*pw]
    x = swap_middle(x);                                                   // [N, t_len*h_len*w_len, C, pt*ph*pw]
    return ggml_reshape_4d(ctx, x, patch_volume * channels, token_count, N, 1);  // [N, t_len*h_len*w_len, C*pt*ph*pw]
}
// Inverse of the patch flattening: scatter tokens of size pt*ph*pw*C back onto a
// [t_len*pt, h_len*ph, w_len*pw] grid.
//
//   x:      [N, t_len*h_len*w_len, pt*ph*pw*C]
//   return: [N*C, t_len*pt, h_len*ph, w_len*pw]
//
// x->ne[0] must be an exact multiple of pt*ph*pw (asserted); the quotient is the
// channel count C.
inline ggml_tensor* unpatchify(ggml_context* ctx,
                               ggml_tensor* x,
                               int64_t t_len,
                               int64_t h_len,
                               int64_t w_len,
                               int pt,
                               int ph,
                               int pw) {
    const int64_t batch    = x->ne[3];
    const int64_t channels = x->ne[0] / pt / ph / pw;
    GGML_ASSERT(channels * pt * ph * pw == x->ne[0]);

    // All but the first permutation swap the two middle (torch-order) axes.
    const auto swap_middle = [ctx](ggml_tensor* t) {
        return ggml_ext_cont(ctx, ggml_ext_torch_permute(ctx, t, 0, 2, 1, 3));
    };

    const int64_t row_len      = pw * w_len;
    const int64_t patch_volume = static_cast<int64_t>(pw) * ph * pt;
    const int64_t token_count  = w_len * h_len * t_len;
    const int64_t outer_t      = t_len * channels * batch;
    const int64_t outer_th     = h_len * t_len * channels * batch;

    x = ggml_reshape_4d(ctx, x, channels, patch_volume, token_count, batch);  // [N, t_len*h_len*w_len, pt*ph*pw, C]
    x = ggml_ext_cont(ctx, ggml_ext_torch_permute(ctx, x, 1, 2, 0, 3));       // [N, C, t_len*h_len*w_len, pt*ph*pw]
    x = ggml_reshape_4d(ctx, x, pw, ph * pt, w_len, outer_th);                // [N*C*t_len*h_len, w_len, pt*ph, pw]
    x = swap_middle(x);                                                       // [N*C*t_len*h_len, pt*ph, w_len, pw]
    x = ggml_reshape_4d(ctx, x, row_len, ph, pt, outer_th);                   // [N*C*t_len*h_len, pt, ph, w_len*pw]
    x = swap_middle(x);                                                       // [N*C*t_len*h_len, ph, pt, w_len*pw]
    x = ggml_reshape_4d(ctx, x, row_len, pt, ph * h_len, outer_t);            // [N*C*t_len, h_len*ph, pt, w_len*pw]
    x = swap_middle(x);                                                       // [N*C*t_len, pt, h_len*ph, w_len*pw]
    return ggml_reshape_4d(ctx, x, row_len, ph * h_len, pt * t_len, channels * batch);  // [N*C, t_len*pt, h_len*ph, w_len*pw]
}
} // namespace DiT
#endif // __COMMON_DIT_HPP__

View File

@ -1,8 +1,6 @@
#ifndef __CONDITIONER_HPP__
#define __CONDITIONER_HPP__
#include <cmath>
#include <limits>
#include <optional>
#include "clip.hpp"
@ -68,17 +66,6 @@ static inline sd::Tensor<float> apply_token_weights(sd::Tensor<float> hidden_sta
return hidden_states;
}
bool all_one = true;
for (float weight : weights) {
if (weight != 1.0f) {
all_one = false;
break;
}
}
if (all_one) {
return hidden_states;
}
if (hidden_states.dim() == 1) {
hidden_states.unsqueeze_(1);
}
@ -90,7 +77,7 @@ static inline sd::Tensor<float> apply_token_weights(sd::Tensor<float> hidden_sta
chunk_weights.reshape_({1, static_cast<int64_t>(weights.size())});
hidden_states *= chunk_weights;
float new_mean = hidden_states.mean();
if (std::isfinite(original_mean) && std::isfinite(new_mean) && new_mean != 0.0f) {
if (new_mean != 0.0f) {
hidden_states *= (original_mean / new_mean);
}
@ -2035,277 +2022,4 @@ struct LLMEmbedder : public Conditioner {
}
};
// Projection applied to the flattened text features (kHiddenSize * kNumStates
// inputs per token).
//
// Two layouts are supported:
//  - single: one bias-free Linear "projection" down to kHiddenSize;
//  - dual:   two biased Linears, "video_aggregate_embed" (4096 outputs) and
//            "audio_aggregate_embed" (2048 outputs), each fed a rescaled copy of
//            the input; their outputs are concatenated along dimension 0.
struct LTXAVTextProjection : public GGMLBlock {
    static constexpr int64_t kHiddenSize = 3840;
    static constexpr int64_t kNumStates  = 49;
    bool dual_projection                 = false;

    LTXAVTextProjection(bool dual_projection = false)
        : dual_projection(dual_projection) {
        if (!dual_projection) {
            blocks["projection"] = std::make_shared<Linear>(kHiddenSize * kNumStates, kHiddenSize, false);
            return;
        }
        blocks["video_aggregate_embed"] = std::make_shared<Linear>(kHiddenSize * kNumStates, 4096, true);
        blocks["audio_aggregate_embed"] = std::make_shared<Linear>(kHiddenSize * kNumStates, 2048, true);
    }

    // Runs the configured projection(s) on x and returns the resulting tensor.
    ggml_tensor* forward(GGMLRunnerContext* ctx, ggml_tensor* x) {
        const auto linear_block = [this](const char* name) {
            return std::dynamic_pointer_cast<Linear>(blocks[name]);
        };

        if (!dual_projection) {
            return linear_block("projection")->forward(ctx, x);
        }

        auto video_layer = linear_block("video_aggregate_embed");
        auto audio_layer = linear_block("audio_aggregate_embed");

        // Each branch sees the input scaled by sqrt(out_features / kHiddenSize).
        const float video_scale = std::sqrt(4096.f / static_cast<float>(kHiddenSize));
        const float audio_scale = std::sqrt(2048.f / static_cast<float>(kHiddenSize));

        auto video_input  = ggml_ext_scale(ctx->ggml_ctx, x, video_scale);
        auto audio_input  = ggml_ext_scale(ctx->ggml_ctx, x, audio_scale);
        auto video_output = video_layer->forward(ctx, video_input);
        auto audio_output = audio_layer->forward(ctx, audio_input);
        return ggml_concat(ctx->ggml_ctx, video_output, audio_output, 0);
    }
};
// GGMLRunner wrapper that owns an LTXAVTextProjection and evaluates it on a
// single input tensor. The dual-projection layout is selected when
// "<prefix>.video_aggregate_embed.weight" is present in the tensor storage map.
struct LTXAVTextProjectionRunner : public GGMLRunner {
    LTXAVTextProjection model;

    LTXAVTextProjectionRunner(ggml_backend_t backend,
                              ggml_backend_t params_backend,
                              const String2TensorStorage& tensor_storage_map = {},
                              const std::string& prefix                      = "")
        : GGMLRunner(backend, params_backend),
          model(tensor_storage_map.find(prefix + ".video_aggregate_embed.weight") != tensor_storage_map.end()) {
        model.init(params_ctx, tensor_storage_map, prefix);
    }

    std::string get_desc() override {
        return "ltxav_text_projection";
    }

    // Registers the projection's parameter tensors under `prefix`.
    void get_param_tensors(std::map<std::string, ggml_tensor*>& tensors, const std::string& prefix) {
        model.get_param_tensors(tensors, prefix);
    }

    // Builds the forward graph: input tensor -> model.forward -> graph output.
    ggml_cgraph* build_graph(const sd::Tensor<float>& x_tensor) {
        ggml_cgraph* graph = ggml_new_graph(compute_ctx);
        auto input         = make_input(x_tensor);
        auto context       = get_context();
        auto projected     = model.forward(&context, input);
        ggml_build_forward_expand(graph, projected);
        return graph;
    }

    // Evaluates the projection on x; yields an empty tensor when the run produces no result.
    sd::Tensor<float> compute(int n_threads, const sd::Tensor<float>& x) {
        return take_or_empty(GGMLRunner::compute<float>(
            [&]() -> ggml_cgraph* { return build_graph(x); },
            n_threads,
            true));
    }
};
// Text conditioner: tokenizes the prompt with GemmaTokenizer, runs it through a
// GEMMA3_12B LLMRunner, normalizes the returned hidden states per state and
// feeds them to an LTXAVTextProjectionRunner. The projected tensor is returned
// as SDCondition::c_crossattn.
struct LTXAVEmbedder : public Conditioner {
// The LLM output for the kept tokens is reshaped to {kHiddenSize, kNumStates, valid_tokens}.
static constexpr int64_t kHiddenSize = 3840;
static constexpr int64_t kNumStates = 49;
// Passed to tokenizer->pad_tokens; presumably the minimum padded sequence length — TODO confirm.
static constexpr int64_t kMinLength = 1024;
std::shared_ptr<GemmaTokenizer> tokenizer;
std::shared_ptr<LLM::LLMRunner> llm;
std::shared_ptr<LTXAVTextProjectionRunner> projector;
// True when the checkpoint provides "<projector_prefix>.video_aggregate_embed.weight";
// selects the normalization used in encode_prompt.
bool dual_projection = false;
LTXAVEmbedder(ggml_backend_t backend,
ggml_backend_t params_backend,
const String2TensorStorage& tensor_storage_map = {},
const std::string& llm_prefix = "text_encoders.llm",
const std::string& projector_prefix = "text_embedding_projection") {
tokenizer = std::make_shared<GemmaTokenizer>();
llm = std::make_shared<LLM::LLMRunner>(LLM::LLMArch::GEMMA3_12B,
backend,
params_backend,
tensor_storage_map,
llm_prefix,
false);
dual_projection = tensor_storage_map.find(projector_prefix + ".video_aggregate_embed.weight") != tensor_storage_map.end();
projector = std::make_shared<LTXAVTextProjectionRunner>(backend,
params_backend,
tensor_storage_map,
projector_prefix);
}
// NOTE(review): the prefixes here are string literals rather than the
// llm_prefix / projector_prefix constructor arguments; they only agree when the
// constructor defaults are used — confirm that callers never override them.
void get_param_tensors(std::map<std::string, ggml_tensor*>& tensors) override {
llm->get_param_tensors(tensors, "text_encoders.llm");
projector->get_param_tensors(tensors, "text_embedding_projection");
}
// The overrides below forward the call to both the LLM and the projector.
void alloc_params_buffer() override {
llm->alloc_params_buffer();
projector->alloc_params_buffer();
}
void free_params_buffer() override {
llm->free_params_buffer();
projector->free_params_buffer();
}
size_t get_params_buffer_size() override {
return llm->get_params_buffer_size() + projector->get_params_buffer_size();
}
void set_flash_attention_enabled(bool enabled) override {
llm->set_flash_attention_enabled(enabled);
projector->set_flash_attention_enabled(enabled);
}
void set_weight_adapter(const std::shared_ptr<WeightAdapter>& adapter) override {
llm->set_weight_adapter(adapter);
projector->set_weight_adapter(adapter);
}
// Tokenizes `text` and returns {tokens, per-token weights, padding mask}.
// Only the byte range [attn_range.first, attn_range.second) is run through
// parse_prompt_attention; text before and after it gets weight 1. When the
// range is not usable the whole text gets weight 1.
std::tuple<std::vector<int>, std::vector<float>, std::vector<float>> tokenize(std::string text,
const std::pair<int, int>& attn_range) {
std::vector<std::pair<std::string, float>> parsed_attention;
if (attn_range.first >= 0 && attn_range.second > 0) {
if (attn_range.first > 0) {
parsed_attention.emplace_back(text.substr(0, attn_range.first), 1.f);
}
if (attn_range.second - attn_range.first > 0) {
auto new_parsed_attention = parse_prompt_attention(text.substr(attn_range.first, attn_range.second - attn_range.first));
parsed_attention.insert(parsed_attention.end(), new_parsed_attention.begin(), new_parsed_attention.end());
}
if (static_cast<size_t>(attn_range.second) < text.size()) {
parsed_attention.emplace_back(text.substr(attn_range.second), 1.f);
}
} else {
parsed_attention.emplace_back(text, 1.f);
}
std::vector<int> tokens;
std::vector<float> weights;
// Each segment is encoded separately; all of its tokens inherit the segment weight.
for (const auto& item : parsed_attention) {
auto curr_tokens = tokenizer->encode(item.first, nullptr);
tokens.insert(tokens.end(), curr_tokens.begin(), curr_tokens.end());
weights.insert(weights.end(), curr_tokens.size(), item.second);
}
std::vector<float> mask;
tokenizer->pad_tokens(tokens, &weights, &mask, kMinLength);
return {tokens, weights, mask};
}
// Encodes one prompt: tokenize -> LLM hidden states -> token weights ->
// keep the valid tokens -> per-state normalization -> projector.
sd::Tensor<float> encode_prompt(int n_threads,
const std::string& prompt,
const std::pair<int, int>& prompt_attn_range) {
auto tokens_weights_mask = tokenize(prompt, prompt_attn_range);
auto& tokens = std::get<0>(tokens_weights_mask);
auto& weights = std::get<1>(tokens_weights_mask);
auto& mask = std::get<2>(tokens_weights_mask);
sd::Tensor<int32_t> input_ids({static_cast<int64_t>(tokens.size())}, std::vector<int32_t>(tokens.begin(), tokens.end()));
sd::Tensor<float> attention_mask;
if (!mask.empty()) {
// Additive mask of size mask.size() x mask.size(), stored at i0 + size * i1.
// A large negative value is added when column i0 is padding (mask == 0) and
// again when i0 > i1, so an entry can receive the penalty twice.
// Presumably the /4 keeps that doubled penalty finite — TODO confirm.
const float mask_min = std::numeric_limits<float>::lowest() / 4.0f;
attention_mask = sd::Tensor<float>({static_cast<int64_t>(mask.size()), static_cast<int64_t>(mask.size())});
for (size_t i1 = 0; i1 < mask.size(); ++i1) {
for (size_t i0 = 0; i0 < mask.size(); ++i0) {
float value = 0.0f;
if (mask[i0] == 0.0f) {
value += mask_min;
}
if (i0 > i1) {
value += mask_min;
}
attention_mask[static_cast<int64_t>(i0 + mask.size() * i1)] = value;
}
}
}
auto hidden_states = llm->compute(n_threads,
input_ids,
attention_mask,
{},
{},
true);
GGML_ASSERT(!hidden_states.empty());
hidden_states = apply_token_weights(std::move(hidden_states), weights);
// Count the mask entries greater than zero.
int64_t valid_tokens = 0;
for (float value : mask) {
valid_tokens += static_cast<int64_t>(value > 0.0f);
}
GGML_ASSERT(valid_tokens > 0);
// Keep only the LAST valid_tokens entries along dimension 1.
// NOTE(review): this assumes pad_tokens places padding before the real
// tokens (left padding) — confirm against GemmaTokenizer::pad_tokens.
hidden_states = sd::ops::slice(hidden_states,
1,
hidden_states.shape()[1] - valid_tokens,
hidden_states.shape()[1]);
hidden_states.reshape_({kHiddenSize, kNumStates, valid_tokens});
hidden_states = hidden_states.permute({1, 0, 2});
// After the permute the tensor is indexed as (state, hidden, token).
if (dual_projection) {
// Dual layout: RMS-normalize every (state, token) vector over the hidden
// axis, with 1e-6 added under the square root.
for (int64_t state_idx = 0; state_idx < kNumStates; ++state_idx) {
for (int64_t token_idx = 0; token_idx < valid_tokens; ++token_idx) {
double sq_sum = 0.0;
for (int64_t hidden_idx = 0; hidden_idx < kHiddenSize; ++hidden_idx) {
float value = hidden_states.index(state_idx, hidden_idx, token_idx);
sq_sum += static_cast<double>(value) * static_cast<double>(value);
}
float inv_rms = 1.0f / std::sqrt(static_cast<float>(sq_sum / static_cast<double>(kHiddenSize)) + 1e-6f);
for (int64_t hidden_idx = 0; hidden_idx < kHiddenSize; ++hidden_idx) {
hidden_states.index(state_idx, hidden_idx, token_idx) *= inv_rms;
}
}
}
} else {
// Single layout: per state, subtract the mean taken over all tokens and
// hidden units, then scale by 8 / (max - min + 1e-6).
for (int64_t state_idx = 0; state_idx < kNumStates; ++state_idx) {
double sum = 0.0;
float min_value = std::numeric_limits<float>::infinity();
float max_value = -std::numeric_limits<float>::infinity();
for (int64_t token_idx = 0; token_idx < valid_tokens; ++token_idx) {
for (int64_t hidden_idx = 0; hidden_idx < kHiddenSize; ++hidden_idx) {
float value = hidden_states.index(state_idx, hidden_idx, token_idx);
sum += value;
min_value = std::min(min_value, value);
max_value = std::max(max_value, value);
}
}
float mean_value = static_cast<float>(sum / static_cast<double>(kHiddenSize * valid_tokens));
float denom = max_value - min_value + 1e-6f;
float scale_value = 8.0f / denom;
for (int64_t token_idx = 0; token_idx < valid_tokens; ++token_idx) {
for (int64_t hidden_idx = 0; hidden_idx < kHiddenSize; ++hidden_idx) {
float value = hidden_states.index(state_idx, hidden_idx, token_idx);
hidden_states.index(state_idx, hidden_idx, token_idx) = (value - mean_value) * scale_value;
}
}
}
}
// Flatten the state and hidden axes into one feature axis per token for the projector.
hidden_states.reshape_({kNumStates * kHiddenSize, valid_tokens});
return projector->compute(n_threads, hidden_states);
}
// Conditioner entry point: encodes conditioner_params.text and returns the
// projected states as c_crossattn. The prompt string starts empty, so the
// attention range spans the entire text.
SDCondition get_learned_condition(int n_threads,
const ConditionerParams& conditioner_params) override {
int64_t t0 = ggml_time_ms();
std::string prompt;
std::pair<int, int> prompt_attn_range;
prompt_attn_range.first = static_cast<int>(prompt.size());
prompt += conditioner_params.text;
prompt_attn_range.second = static_cast<int>(prompt.size());
auto hidden_states = encode_prompt(n_threads, prompt, prompt_attn_range);
GGML_ASSERT(!hidden_states.empty());
int64_t t1 = ggml_time_ms();
LOG_DEBUG("computing LTXAV condition graph completed, taking %" PRId64 " ms", t1 - t0);
SDCondition result;
result.c_crossattn = std::move(hidden_states);
return result;
}
};
#endif

View File

@ -1,16 +1,12 @@
#ifndef __DENOISER_HPP__
#define __DENOISER_HPP__
#include <algorithm>
#include <cctype>
#include <cmath>
#include <functional>
#include <string>
#include <utility>
#include "ggml_extend.hpp"
#include "gits_noise.inl"
#include "guidance.h"
#include "tensor.hpp"
/*================================================= CompVisDenoiser ==================================================*/
@ -484,141 +480,6 @@ struct KLOptimalScheduler : SigmaScheduler {
}
};
// Sigma schedule whose shift depends on the token count.
//
// The shift is the value at `token_count` of the straight line through
// (1024, base_shift) and (4096, max_shift). A linear ramp from 1 to 0 is warped
// with exp(shift) and, when `stretch` is set, rescaled so that the last non-zero
// sigma equals `terminal` (clamped to [0, 0.99]).
//
// Tunables can be overridden with a "key=value" list separated by ',' or ';'.
// Recognised keys: max_shift, base_shift, terminal (floats) and stretch
// (1/0, true/false, yes/no, on/off). Invalid or unknown entries are logged and
// ignored, leaving the previous value in place.
struct LTX2Scheduler : SigmaScheduler {
    int token_count  = 4096;
    float max_shift  = 2.05f;
    float base_shift = 0.95f;
    bool stretch     = true;
    float terminal   = 0.1f;

    // A non-positive token_count falls back to 4096.
    explicit LTX2Scheduler(int token_count, const char* extra_sample_args = nullptr)
        : token_count(token_count > 0 ? token_count : 4096) {
        parse_extra_sample_args(extra_sample_args);
    }

    // Returns `value` without leading/trailing spaces, tabs, CR or LF.
    static std::string trim(std::string value) {
        const char* whitespace = " \t\r\n";
        size_t begin           = value.find_first_not_of(whitespace);
        if (begin == std::string::npos) {
            return "";
        }
        size_t end = value.find_last_not_of(whitespace);
        return value.substr(begin, end - begin + 1);
    }

    // Applies the overrides found in `extra_sample_args` (may be null or empty).
    void parse_extra_sample_args(const char* extra_sample_args) {
        if (extra_sample_args == nullptr || extra_sample_args[0] == '\0') {
            return;
        }

        std::string raw(extra_sample_args);

        auto parse_arg = [&](const std::string& item) {
            std::string token = trim(item);
            if (token.empty()) {
                return;
            }

            size_t eq = token.find('=');
            if (eq == std::string::npos) {
                LOG_WARN("ignoring invalid ltx2 scheduler arg '%s'", token.c_str());
                return;
            }

            std::string key   = trim(token.substr(0, eq));
            std::string value = trim(token.substr(eq + 1));

            // Parses `value` as a float. Rejects trailing garbage and non-finite
            // results: std::stof accepts "nan" and "inf", which would otherwise
            // propagate into every sigma produced by get_sigmas.
            auto parse_float = [&](float* out) -> bool {
                try {
                    size_t consumed = 0;
                    float parsed    = std::stof(value, &consumed);
                    if (!std::isfinite(parsed) || !trim(value.substr(consumed)).empty()) {
                        return false;
                    }
                    *out = parsed;
                    return true;
                } catch (const std::exception&) {
                    return false;
                }
            };

            float* float_target = nullptr;
            if (key == "max_shift") {
                float_target = &max_shift;
            } else if (key == "base_shift") {
                float_target = &base_shift;
            } else if (key == "terminal") {
                float_target = &terminal;
            }

            if (float_target != nullptr) {
                if (!parse_float(float_target)) {
                    LOG_WARN("ignoring invalid ltx2 scheduler arg '%s'", token.c_str());
                }
                return;
            }

            if (key == "stretch") {
                std::string v = value;
                std::transform(v.begin(), v.end(), v.begin(), [](unsigned char c) { return static_cast<char>(std::tolower(c)); });
                if (v == "1" || v == "true" || v == "yes" || v == "on") {
                    stretch = true;
                } else if (v == "0" || v == "false" || v == "no" || v == "off") {
                    stretch = false;
                } else {
                    LOG_WARN("ignoring invalid ltx2 scheduler arg '%s'", token.c_str());
                }
                return;
            }

            LOG_WARN("ignoring unknown ltx2 scheduler arg '%s'", key.c_str());
        };

        // Split on ',' or ';'; the end of the string terminates the final entry.
        size_t start = 0;
        for (size_t pos = 0; pos <= raw.size(); ++pos) {
            if (pos == raw.size() || raw[pos] == ',' || raw[pos] == ';') {
                parse_arg(raw.substr(start, pos - start));
                start = pos + 1;
            }
        }
    }

    // Returns n + 1 sigmas ending in 0 (just {0} when n == 0). The sigma range
    // and t_to_sigma arguments are not used by this scheduler.
    std::vector<float> get_sigmas(uint32_t n, float /*sigma_min*/, float /*sigma_max*/, t_to_sigma_t /*t_to_sigma*/) override {
        std::vector<float> sigmas;
        if (n == 0) {
            sigmas.push_back(0.0f);
            return sigmas;
        }

        constexpr float base_shift_anchor = 1024.0f;
        constexpr float max_shift_anchor  = 4096.0f;

        // Line through (base_shift_anchor, base_shift) and (max_shift_anchor, max_shift).
        float m               = (max_shift - base_shift) / (max_shift_anchor - base_shift_anchor);
        float b               = base_shift - m * base_shift_anchor;
        float sigma_shift     = static_cast<float>(token_count) * m + b;
        float exp_shift       = std::exp(sigma_shift);
        float target_terminal = std::clamp(terminal, 0.0f, 0.99f);

        LOG_DEBUG("LTX2 scheduler: tokens=%d, shift=%.4f, stretch=%d, terminal=%.4f", token_count, sigma_shift, stretch ? 1 : 0, target_terminal);

        sigmas.reserve(n + 1);
        for (uint32_t i = 0; i <= n; ++i) {
            float sigma = 1.0f - static_cast<float>(i) / static_cast<float>(n);
            if (sigma != 0.0f) {  // sigma == 0 would divide by zero below; it stays 0
                sigma = exp_shift / (exp_shift + (1.0f / sigma - 1.0f));
            }
            sigmas.push_back(sigma);
        }

        if (stretch && sigmas.size() > 2) {
            // Rescale (1 - sigma) so that sigmas[n - 1] becomes target_terminal.
            float one_minus_last = 1.0f - sigmas[n - 1];
            float scale_factor   = one_minus_last / (1.0f - target_terminal);
            if (scale_factor > 1e-8f) {
                for (uint32_t i = 0; i < n; ++i) {
                    sigmas[i] = 1.0f - (1.0f - sigmas[i]) / scale_factor;
                }
            }
        }

        sigmas[n] = 0.0f;
        return sigmas;
    }
};
struct Denoiser {
virtual float sigma_min() = 0;
virtual float sigma_max() = 0;
@ -631,7 +492,7 @@ struct Denoiser {
virtual sd::Tensor<float> inverse_noise_scaling(float sigma,
const sd::Tensor<float>& latent) = 0;
virtual std::vector<float> get_sigmas(uint32_t n, int image_seq_len, scheduler_t scheduler_type, SDVersion version, const char* extra_sample_args = nullptr) {
virtual std::vector<float> get_sigmas(uint32_t n, int /*image_seq_len*/, scheduler_t scheduler_type, SDVersion version) {
auto bound_t_to_sigma = std::bind(&Denoiser::t_to_sigma, this, std::placeholders::_1);
std::shared_ptr<SigmaScheduler> scheduler;
switch (scheduler_type) {
@ -679,10 +540,6 @@ struct Denoiser {
LOG_INFO("get_sigmas with LCM scheduler");
scheduler = std::make_shared<LCMScheduler>();
break;
case LTX2_SCHEDULER:
LOG_INFO("get_sigmas with LTX2 scheduler");
scheduler = std::make_shared<LTX2Scheduler>(image_seq_len, extra_sample_args);
break;
default:
LOG_INFO("get_sigmas with discrete scheduler (default)");
scheduler = std::make_shared<DiscreteScheduler>();
@ -888,15 +745,15 @@ struct Flux2FlowDenoiser : public FluxFlowDenoiser {
return mu;
}
std::vector<float> get_sigmas(uint32_t n, int image_seq_len, scheduler_t scheduler_type, SDVersion version, const char* extra_sample_args = nullptr) override {
std::vector<float> get_sigmas(uint32_t n, int image_seq_len, scheduler_t scheduler_type, SDVersion version) override {
float mu = compute_empirical_mu(n, image_seq_len);
LOG_DEBUG("Flux2FlowDenoiser: set shift to %.3f", mu);
set_shift(mu);
return Denoiser::get_sigmas(n, image_seq_len, scheduler_type, version, extra_sample_args);
return Denoiser::get_sigmas(n, image_seq_len, scheduler_type, version);
}
};
typedef std::function<sd::guidance::GuiderOutput(const sd::Tensor<float>&, float, int)> denoise_cb_t;
typedef std::function<sd::Tensor<float>(const sd::Tensor<float>&, float, int, sd::Tensor<float>*)> denoise_cb_t;
static std::pair<float, float> get_ancestral_step(float sigma_from,
float sigma_to,
@ -974,11 +831,11 @@ static sd::Tensor<float> sample_euler_ancestral(denoise_cb_t model,
for (int i = 0; i < steps; i++) {
float sigma = sigmas[i];
float sigma_to = sigmas[i + 1];
auto denoised_opt = model(x, sigma, i + 1);
if (denoised_opt.pred.empty()) {
auto denoised_opt = model(x, sigma, i + 1, nullptr);
if (denoised_opt.empty()) {
return {};
}
sd::Tensor<float> denoised = std::move(denoised_opt.pred);
sd::Tensor<float> denoised = std::move(denoised_opt);
if (sigma_to == 0.f) {
x = denoised;
} else if (eta == 0.f) {
@ -1005,11 +862,11 @@ static sd::Tensor<float> sample_euler(denoise_cb_t model,
int steps = static_cast<int>(sigmas.size()) - 1;
for (int i = 0; i < steps; i++) {
float sigma = sigmas[i];
auto denoised_opt = model(x, sigma, i + 1);
if (denoised_opt.pred.empty()) {
auto denoised_opt = model(x, sigma, i + 1, nullptr);
if (denoised_opt.empty()) {
return {};
}
sd::Tensor<float> denoised = std::move(denoised_opt.pred);
sd::Tensor<float> denoised = std::move(denoised_opt);
sd::Tensor<float> d = (x - denoised) / sigma;
x += d * (sigmas[i + 1] - sigma);
}
@ -1021,22 +878,22 @@ static sd::Tensor<float> sample_heun(denoise_cb_t model,
const std::vector<float>& sigmas) {
int steps = static_cast<int>(sigmas.size()) - 1;
for (int i = 0; i < steps; i++) {
auto denoised_opt = model(x, sigmas[i], -(i + 1));
if (denoised_opt.pred.empty()) {
auto denoised_opt = model(x, sigmas[i], -(i + 1), nullptr);
if (denoised_opt.empty()) {
return {};
}
sd::Tensor<float> denoised = std::move(denoised_opt.pred);
sd::Tensor<float> denoised = std::move(denoised_opt);
sd::Tensor<float> d = (x - denoised) / sigmas[i];
float dt = sigmas[i + 1] - sigmas[i];
if (sigmas[i + 1] == 0) {
x += d * dt;
} else {
sd::Tensor<float> x2 = x + d * dt;
auto denoised2_opt = model(x2, sigmas[i + 1], i + 1);
if (denoised2_opt.pred.empty()) {
auto denoised2_opt = model(x2, sigmas[i + 1], i + 1, nullptr);
if (denoised2_opt.empty()) {
return {};
}
sd::Tensor<float> denoised2 = std::move(denoised2_opt.pred);
sd::Tensor<float> denoised2 = std::move(denoised2_opt);
d = (d + (x2 - denoised2) / sigmas[i + 1]) / 2.0f;
x += d * dt;
}
@ -1049,11 +906,11 @@ static sd::Tensor<float> sample_dpm2(denoise_cb_t model,
const std::vector<float>& sigmas) {
int steps = static_cast<int>(sigmas.size()) - 1;
for (int i = 0; i < steps; i++) {
auto denoised_opt = model(x, sigmas[i], -(i + 1));
if (denoised_opt.pred.empty()) {
auto denoised_opt = model(x, sigmas[i], -(i + 1), nullptr);
if (denoised_opt.empty()) {
return {};
}
sd::Tensor<float> denoised = std::move(denoised_opt.pred);
sd::Tensor<float> denoised = std::move(denoised_opt);
sd::Tensor<float> d = (x - denoised) / sigmas[i];
if (sigmas[i + 1] == 0) {
x += d * (sigmas[i + 1] - sigmas[i]);
@ -1062,11 +919,11 @@ static sd::Tensor<float> sample_dpm2(denoise_cb_t model,
float dt_1 = sigma_mid - sigmas[i];
float dt_2 = sigmas[i + 1] - sigmas[i];
sd::Tensor<float> x2 = x + d * dt_1;
auto denoised2_opt = model(x2, sigma_mid, i + 1);
if (denoised2_opt.pred.empty()) {
auto denoised2_opt = model(x2, sigma_mid, i + 1, nullptr);
if (denoised2_opt.empty()) {
return {};
}
sd::Tensor<float> denoised2 = std::move(denoised2_opt.pred);
sd::Tensor<float> denoised2 = std::move(denoised2_opt);
x += ((x2 - denoised2) / sigma_mid) * dt_2;
}
}
@ -1083,11 +940,11 @@ static sd::Tensor<float> sample_dpmpp_2s_ancestral(denoise_cb_t model,
int steps = static_cast<int>(sigmas.size()) - 1;
for (int i = 0; i < steps; i++) {
auto denoised_opt = model(x, sigmas[i], -(i + 1));
if (denoised_opt.pred.empty()) {
auto denoised_opt = model(x, sigmas[i], -(i + 1), nullptr);
if (denoised_opt.empty()) {
return {};
}
sd::Tensor<float> denoised = std::move(denoised_opt.pred);
sd::Tensor<float> denoised = std::move(denoised_opt);
auto [sigma_down, sigma_up] = get_ancestral_step(sigmas[i], sigmas[i + 1], eta);
if (sigma_down == 0) {
@ -1099,11 +956,11 @@ static sd::Tensor<float> sample_dpmpp_2s_ancestral(denoise_cb_t model,
float s = t + 0.5f * h;
float sigma_s = sigma_fn(s);
sd::Tensor<float> x2 = (sigma_s / sigma_fn(t)) * x - (exp(-h * 0.5f) - 1) * denoised;
auto denoised2_opt = model(x2, sigma_s, i + 1);
if (denoised2_opt.pred.empty()) {
auto denoised2_opt = model(x2, sigma_s, i + 1, nullptr);
if (denoised2_opt.empty()) {
return {};
}
sd::Tensor<float> denoised2 = std::move(denoised2_opt.pred);
sd::Tensor<float> denoised2 = std::move(denoised2_opt);
x = (sigma_fn(t_next) / sigma_fn(t)) * x - (exp(-h) - 1) * denoised2;
}
@ -1126,11 +983,11 @@ static sd::Tensor<float> sample_dpmpp_2s_ancestral_flow(denoise_cb_t model,
bool opt_first_step = (1.0 - sigma < 1e-6);
auto denoised_opt = model(x, sigma, (opt_first_step ? 1 : -1) * (i + 1));
if (denoised_opt.pred.empty()) {
auto denoised_opt = model(x, sigma, (opt_first_step ? 1 : -1) * (i + 1), nullptr);
if (denoised_opt.empty()) {
return {};
}
sd::Tensor<float> denoised = std::move(denoised_opt.pred);
sd::Tensor<float> denoised = std::move(denoised_opt);
if (sigma_to == 0.0f) {
// Euler method (final step, no noise)
@ -1155,8 +1012,8 @@ static sd::Tensor<float> sample_dpmpp_2s_ancestral_flow(denoise_cb_t model,
// so sigma_s = 1 = sigma, and sigma_s_i_ratio = sigma_s / sigma = 1
// u = (x*sigma_s_i_ratio)+(denoised*(1.0f-sigma_s_i_ratio))
// = (x*1)+(denoised*0) = x
// so D_i = model(u, sigma_s, i + 1)
// = model(x, sigma, i + 1)
// so D_i = model(u, sigma_s, i + 1, nullptr)
// = model(x, sigma, i + 1, nullptr)
// = denoised
D_i = denoised;
@ -1189,11 +1046,11 @@ static sd::Tensor<float> sample_dpmpp_2s_ancestral_flow(denoise_cb_t model,
float sigma_s_i_ratio = sigma_s / sigma;
sd::Tensor<float> u = (x * sigma_s_i_ratio) + (denoised * (1.0f - sigma_s_i_ratio));
auto denoised2_opt = model(u, sigma_s, i + 1);
if (denoised2_opt.pred.empty()) {
auto denoised2_opt = model(u, sigma_s, i + 1, nullptr);
if (denoised2_opt.empty()) {
return {};
}
D_i = std::move(denoised2_opt.pred);
D_i = std::move(denoised2_opt);
}
float sigma_down_i_ratio = sigma_down / sigma;
@ -1216,11 +1073,11 @@ static sd::Tensor<float> sample_dpmpp_2m(denoise_cb_t model,
int steps = static_cast<int>(sigmas.size()) - 1;
for (int i = 0; i < steps; i++) {
auto denoised_opt = model(x, sigmas[i], i + 1);
if (denoised_opt.pred.empty()) {
auto denoised_opt = model(x, sigmas[i], i + 1, nullptr);
if (denoised_opt.empty()) {
return {};
}
sd::Tensor<float> denoised = std::move(denoised_opt.pred);
sd::Tensor<float> denoised = std::move(denoised_opt);
float t = t_fn(sigmas[i]);
float t_next = t_fn(sigmas[i + 1]);
float h = t_next - t;
@ -1248,11 +1105,11 @@ static sd::Tensor<float> sample_dpmpp_2m_v2(denoise_cb_t model,
int steps = static_cast<int>(sigmas.size()) - 1;
for (int i = 0; i < steps; i++) {
auto denoised_opt = model(x, sigmas[i], i + 1);
if (denoised_opt.pred.empty()) {
auto denoised_opt = model(x, sigmas[i], i + 1, nullptr);
if (denoised_opt.empty()) {
return {};
}
sd::Tensor<float> denoised = std::move(denoised_opt.pred);
sd::Tensor<float> denoised = std::move(denoised_opt);
float t = t_fn(sigmas[i]);
float t_next = t_fn(sigmas[i + 1]);
float h = t_next - t;
@ -1356,11 +1213,11 @@ static sd::Tensor<float> sample_lcm(denoise_cb_t model,
int steps = static_cast<int>(sigmas.size()) - 1;
for (int i = 0; i < steps; i++) {
auto denoised_opt = model(x, sigmas[i], i + 1);
if (denoised_opt.pred.empty()) {
auto denoised_opt = model(x, sigmas[i], i + 1, nullptr);
if (denoised_opt.empty()) {
return {};
}
x = std::move(denoised_opt.pred);
x = std::move(denoised_opt);
if (sigmas[i + 1] > 0) {
if (is_flow_denoiser) {
x *= (1 - sigmas[i + 1]);
@ -1402,11 +1259,11 @@ static sd::Tensor<float> sample_ipndm(denoise_cb_t model,
float sigma = sigmas[i];
float sigma_next = sigmas[i + 1];
auto denoised_opt = model(x, sigma, i + 1);
if (denoised_opt.pred.empty()) {
auto denoised_opt = model(x, sigma, i + 1, nullptr);
if (denoised_opt.empty()) {
return {};
}
sd::Tensor<float> denoised = std::move(denoised_opt.pred);
sd::Tensor<float> denoised = std::move(denoised_opt);
sd::Tensor<float> d_cur = (x - denoised) / sigma;
int order = std::min(max_order, i + 1);
@ -1446,11 +1303,11 @@ static sd::Tensor<float> sample_ipndm_v(denoise_cb_t model,
float sigma = sigmas[i];
float t_next = sigmas[i + 1];
auto denoised_opt = model(x, sigma, i + 1);
if (denoised_opt.pred.empty()) {
auto denoised_opt = model(x, sigma, i + 1, nullptr);
if (denoised_opt.empty()) {
return {};
}
sd::Tensor<float> denoised = std::move(denoised_opt.pred);
sd::Tensor<float> denoised = std::move(denoised_opt);
sd::Tensor<float> d_cur = (x - denoised) / sigma;
int order = std::min(max_order, i + 1);
@ -1508,11 +1365,11 @@ static sd::Tensor<float> sample_res_multistep(denoise_cb_t model,
int steps = static_cast<int>(sigmas.size()) - 1;
for (int i = 0; i < steps; i++) {
auto denoised_opt = model(x, sigmas[i], i + 1);
if (denoised_opt.pred.empty()) {
auto denoised_opt = model(x, sigmas[i], i + 1, nullptr);
if (denoised_opt.empty()) {
return {};
}
sd::Tensor<float> denoised = std::move(denoised_opt.pred);
sd::Tensor<float> denoised = std::move(denoised_opt);
float sigma_from = sigmas[i];
float sigma_to = sigmas[i + 1];
@ -1585,11 +1442,11 @@ static sd::Tensor<float> sample_res_2s(denoise_cb_t model,
float sigma_from = sigmas[i];
float sigma_to = sigmas[i + 1];
auto denoised_opt = model(x, sigma_from, -(i + 1));
if (denoised_opt.pred.empty()) {
auto denoised_opt = model(x, sigma_from, -(i + 1), nullptr);
if (denoised_opt.empty()) {
return {};
}
sd::Tensor<float> denoised = std::move(denoised_opt.pred);
sd::Tensor<float> denoised = std::move(denoised_opt);
auto [sigma_down, sigma_up, alpha_scale] = get_ancestral_step(sigma_from, sigma_to, eta, is_flow_denoiser);
@ -1611,11 +1468,11 @@ static sd::Tensor<float> sample_res_2s(denoise_cb_t model,
sd::Tensor<float> eps1 = denoised - x0;
sd::Tensor<float> x2 = x0 + eps1 * (h * a21);
auto denoised2_opt = model(x2, sigma_c2, i + 1);
if (denoised2_opt.pred.empty()) {
auto denoised2_opt = model(x2, sigma_c2, i + 1, nullptr);
if (denoised2_opt.empty()) {
return {};
}
sd::Tensor<float> denoised2 = std::move(denoised2_opt.pred);
sd::Tensor<float> denoised2 = std::move(denoised2_opt);
sd::Tensor<float> eps2 = denoised2 - x0;
x = x0 + h * (b1 * eps1 + b2 * eps2);
}
@ -1688,11 +1545,10 @@ static sd::Tensor<float> sample_er_sde(denoise_cb_t model,
int steps = static_cast<int>(sigmas.size()) - 1;
for (int i = 0; i < steps; i++) {
auto denoised_opt = model(x, sigmas[i], i + 1);
if (denoised_opt.pred.empty()) {
sd::Tensor<float> denoised = model(x, sigmas[i], i + 1, nullptr);
if (denoised.empty()) {
return {};
}
sd::Tensor<float> denoised = std::move(denoised_opt.pred);
int stage_used = std::min(max_stage, i + 1);
@ -1807,11 +1663,11 @@ static sd::Tensor<float> sample_tcd(denoise_cb_t model,
int timestep_s = (int)floor((1 - eta) * prev_timestep);
float sigma = sigmas[i];
auto denoised_opt = model(x, sigma, i + 1);
if (denoised_opt.pred.empty()) {
auto denoised_opt = model(x, sigma, i + 1, nullptr);
if (denoised_opt.empty()) {
return {};
}
sd::Tensor<float> denoised = std::move(denoised_opt.pred);
sd::Tensor<float> denoised = std::move(denoised_opt);
sd::Tensor<float> d = (x - denoised) / sigma;
float alpha_prod_t = 1.0f / (sigma * sigma + 1.0f);
@ -1836,15 +1692,16 @@ static sd::Tensor<float> sample_euler_cfg_pp(denoise_cb_t model,
const std::vector<float>& sigmas) {
int steps = static_cast<int>(sigmas.size()) - 1;
for (int i = 0; i < steps; i++) {
float sigma = sigmas[i];
auto denoised_opt = model(x, sigma, i + 1);
if (denoised_opt.pred.empty() || denoised_opt.pred_uncond.empty()) {
float sigma = sigmas[i];
sd::Tensor<float> uncond_denoised;
auto denoised_opt = model(x, sigma, i + 1, &uncond_denoised);
if (denoised_opt.empty() || uncond_denoised.empty()) {
return {};
}
sd::Tensor<float> denoised = std::move(denoised_opt.pred);
sd::Tensor<float> uncond_denoised = std::move(denoised_opt.pred_uncond);
sd::Tensor<float> d = (x - uncond_denoised) / sigma;
sd::Tensor<float> denoised = std::move(denoised_opt);
sd::Tensor<float> d = (x - uncond_denoised) / sigma;
x = denoised + d * sigmas[i + 1];
}
@ -1858,15 +1715,16 @@ static sd::Tensor<float> sample_euler_ancestral_cfg_pp(denoise_cb_t model,
float eta) {
int steps = static_cast<int>(sigmas.size()) - 1;
for (int i = 0; i < steps; i++) {
float sigma = sigmas[i];
auto denoised_opt = model(x, sigma, i + 1);
if (denoised_opt.pred.empty() || denoised_opt.pred_uncond.empty()) {
float sigma = sigmas[i];
sd::Tensor<float> uncond_denoised;
auto denoised_opt = model(x, sigma, i + 1, &uncond_denoised);
if (denoised_opt.empty() || uncond_denoised.empty()) {
return {};
}
sd::Tensor<float> denoised = std::move(denoised_opt.pred);
sd::Tensor<float> uncond_denoised = std::move(denoised_opt.pred_uncond);
sd::Tensor<float> d = (x - uncond_denoised) / sigma;
sd::Tensor<float> denoised = std::move(denoised_opt);
sd::Tensor<float> d = (x - uncond_denoised) / sigma;
auto [sigma_down, sigma_up] = get_ancestral_step(sigmas[i], sigmas[i + 1], eta);

View File

@ -6,7 +6,6 @@
#include "ernie_image.hpp"
#include "flux.hpp"
#include "hidream_o1.hpp"
#include "ltxv.hpp"
#include "mmdit.hpp"
#include "qwen_image.hpp"
#include "tensor_ggml.hpp"
@ -17,8 +16,6 @@
struct DiffusionParams {
const sd::Tensor<float>* x = nullptr;
const sd::Tensor<float>* timesteps = nullptr;
const sd::Tensor<float>* audio_x = nullptr;
const sd::Tensor<float>* audio_timesteps = nullptr;
const sd::Tensor<float>* context = nullptr;
const sd::Tensor<float>* c_concat = nullptr;
const sd::Tensor<float>* y = nullptr;
@ -38,9 +35,6 @@ struct DiffusionParams {
float control_strength = 0.f;
const sd::Tensor<float>* vace_context = nullptr;
float vace_strength = 1.f;
int audio_length = 0;
float frame_rate = 24.f;
const sd::Tensor<float>* video_positions = nullptr;
const std::vector<int>* skip_layers = nullptr;
};
@ -701,75 +695,4 @@ struct ErnieImageModel : public DiffusionModel {
}
};
// DiffusionModel implementation that forwards every call to an owned
// LTXV::LTXAVRunner instance.
struct LTXAVModel : public DiffusionModel {
    std::string prefix;       // tensor-name prefix handed to get_param_tensors()
    LTXV::LTXAVRunner ltxav;  // runner that receives all forwarded calls

    // Stores `prefix` and constructs the runner from the same four arguments.
    LTXAVModel(ggml_backend_t backend,
               ggml_backend_t params_backend,
               const String2TensorStorage& tensor_storage_map = {},
               const std::string prefix                       = "model.diffusion_model")
        : prefix(prefix),
          ltxav(backend, params_backend, tensor_storage_map, prefix) {}

    // --- straight pass-throughs to the runner -------------------------------

    std::string get_desc() override { return ltxav.get_desc(); }

    void alloc_params_buffer() override { ltxav.alloc_params_buffer(); }

    void free_params_buffer() override { ltxav.free_params_buffer(); }

    void free_compute_buffer() override { ltxav.free_compute_buffer(); }

    // Collects the runner's parameter tensors under the stored prefix.
    void get_param_tensors(std::map<std::string, ggml_tensor*>& tensors) override {
        ltxav.get_param_tensors(tensors, prefix);
    }

    size_t get_params_buffer_size() override { return ltxav.get_params_buffer_size(); }

    void set_weight_adapter(const std::shared_ptr<WeightAdapter>& adapter) override {
        ltxav.set_weight_adapter(adapter);
    }

    // This model always reports zero ADM input channels.
    int64_t get_adm_in_channels() override { return 0; }

    void set_flash_attention_enabled(bool enabled) override {
        ltxav.set_flash_attention_enabled(enabled);
    }

    void set_max_graph_vram_bytes(size_t max_vram_bytes) override {
        ltxav.set_max_graph_vram_bytes(max_vram_bytes);
    }

    void set_circular_axes(bool circular_x, bool circular_y) override {
        ltxav.set_circular_axes(circular_x, circular_y);
    }

    // Runs the runner on the inputs carried by `diffusion_params`.
    // `x` and `timesteps` are mandatory (asserted); the remaining tensor
    // pointers go through tensor_or_empty() before being handed over.
    sd::Tensor<float> compute(int n_threads,
                              const DiffusionParams& diffusion_params) override {
        GGML_ASSERT(diffusion_params.x != nullptr);
        GGML_ASSERT(diffusion_params.timesteps != nullptr);

        const auto& context         = tensor_or_empty(diffusion_params.context);
        const auto& audio_x         = tensor_or_empty(diffusion_params.audio_x);
        const auto& audio_timesteps = tensor_or_empty(diffusion_params.audio_timesteps);
        const auto& video_positions = tensor_or_empty(diffusion_params.video_positions);

        return ltxav.compute(n_threads,
                             *diffusion_params.x,
                             *diffusion_params.timesteps,
                             context,
                             audio_x,
                             audio_timesteps,
                             diffusion_params.audio_length,
                             diffusion_params.frame_rate,
                             video_positions);
    }
};
#endif

View File

@ -1127,33 +1127,18 @@ __STATIC_INLINE__ ggml_tensor* ggml_ext_conv_3d(ggml_context* ctx,
ggml_tensor* w,
ggml_tensor* b,
int64_t IC,
int s0 = 1,
int s1 = 1,
int s2 = 1,
int p0 = 0,
int p1 = 0,
int p2 = 0,
int d0 = 1,
int d1 = 1,
int d2 = 1,
bool force_prec_f32 = false) {
if (force_prec_f32) {
ggml_tensor* im2col = ggml_im2col_3d(ctx, w, x, IC, s0, s1, s2, p0, p1, p2, d0, d1, d2, w->type);
int64_t OC = w->ne[3] / IC;
int64_t N = x->ne[3] / IC;
x = ggml_mul_mat(ctx,
ggml_reshape_2d(ctx, im2col, im2col->ne[0], im2col->ne[3] * im2col->ne[2] * im2col->ne[1]),
ggml_reshape_2d(ctx, w, w->ne[0] * w->ne[1] * w->ne[2] * IC, OC));
ggml_mul_mat_set_prec(x, GGML_PREC_F32);
int64_t OD = im2col->ne[3] / N;
x = ggml_reshape_4d(ctx, x, im2col->ne[1] * im2col->ne[2], OD, N, OC);
x = ggml_cont(ctx, ggml_permute(ctx, x, 0, 1, 3, 2));
x = ggml_reshape_4d(ctx, x, im2col->ne[1], im2col->ne[2], OD, OC * N);
} else {
x = ggml_conv_3d(ctx, w, x, IC, s0, s1, s2, p0, p1, p2, d0, d1, d2);
}
int s0 = 1,
int s1 = 1,
int s2 = 1,
int p0 = 0,
int p1 = 0,
int p2 = 0,
int d0 = 1,
int d1 = 1,
int d2 = 1) {
int64_t OC = w->ne[3] / IC;
int64_t N = x->ne[3] / IC;
x = ggml_conv_3d(ctx, w, x, IC, s0, s1, s2, p0, p1, p2, d0, d1, d2);
if (b != nullptr) {
b = ggml_reshape_4d(ctx, b, 1, 1, 1, b->ne[0]); // [OC, 1, 1, 1]
@ -3148,7 +3133,6 @@ protected:
std::tuple<int, int, int> padding;
std::tuple<int, int, int> dilation;
bool bias;
bool force_prec_f32;
std::string prefix;
void init_params(ggml_context* ctx, const String2TensorStorage& tensor_storage_map, const std::string prefix = "") override {
@ -3172,16 +3156,14 @@ public:
std::tuple<int, int, int> stride = {1, 1, 1},
std::tuple<int, int, int> padding = {0, 0, 0},
std::tuple<int, int, int> dilation = {1, 1, 1},
bool bias = true,
bool force_prec_f32 = false)
bool bias = true)
: in_channels(in_channels),
out_channels(out_channels),
kernel_size(kernel_size),
stride(stride),
padding(padding),
dilation(dilation),
bias(bias),
force_prec_f32(force_prec_f32) {}
bias(bias) {}
ggml_tensor* forward(GGMLRunnerContext* ctx, ggml_tensor* x) {
ggml_tensor* w = params["weight"];
@ -3201,8 +3183,7 @@ public:
return ggml_ext_conv_3d(ctx->ggml_ctx, x, w, b, in_channels,
std::get<2>(stride), std::get<1>(stride), std::get<0>(stride),
std::get<2>(padding), std::get<1>(padding), std::get<0>(padding),
std::get<2>(dilation), std::get<1>(dilation), std::get<0>(dilation),
force_prec_f32);
std::get<2>(dilation), std::get<1>(dilation), std::get<0>(dilation));
}
};

View File

@ -1,89 +0,0 @@
#include "guidance.h"
#include <utility>
namespace sd::guidance {

    // Returns true only when `tensor` is non-null and reports itself non-empty.
    static bool has_tensor(const sd::Tensor<float>* tensor) {
        return tensor != nullptr && !tensor->empty();
    }

    // Stores the two scales that forward() applies.
    ClassifierFreeGuidance::ClassifierFreeGuidance(float guidance_scale,
                                                   float image_guidance_scale)
        : guidance_scale_(guidance_scale),
          image_guidance_scale_(image_guidance_scale) {
    }

    // Blends the available predictions into `output.pred`.
    //
    //   cond + uncond + img_cond : uncond + image_scale * (img_cond - uncond)
    //                                     + scale * (cond - img_cond)
    //   cond + uncond            : uncond + scale * (cond - uncond)
    //   cond + img_cond          : img_cond + scale * (cond - img_cond)
    //   cond only                : cond (copied unchanged)
    //
    // Returns a default-constructed GuiderOutput (empty `pred`) when the
    // conditional prediction is missing or empty. Only `pred` is written.
    GuiderOutput ClassifierFreeGuidance::forward(const GuidanceInput& input,
                                                 GuiderOutput previous) const {
        (void)previous;  // intentionally unused: the result is rebuilt from `input` alone

        GuiderOutput output;
        if (!has_tensor(input.pred_cond)) {
            return output;
        }

        const sd::Tensor<float>& pred_cond = *input.pred_cond;
        const bool have_uncond             = has_tensor(input.pred_uncond);
        const bool have_img_cond           = has_tensor(input.pred_img_cond);

        if (have_uncond && have_img_cond) {
            const sd::Tensor<float>& pred_uncond   = *input.pred_uncond;
            const sd::Tensor<float>& pred_img_cond = *input.pred_img_cond;
            output.pred                            = pred_uncond +
                          image_guidance_scale_ * (pred_img_cond - pred_uncond) +
                          guidance_scale_ * (pred_cond - pred_img_cond);
        } else if (have_uncond) {
            const sd::Tensor<float>& pred_uncond = *input.pred_uncond;
            output.pred                          = pred_uncond + guidance_scale_ * (pred_cond - pred_uncond);
        } else if (have_img_cond) {
            const sd::Tensor<float>& pred_img_cond = *input.pred_img_cond;
            output.pred                            = pred_img_cond + guidance_scale_ * (pred_cond - pred_img_cond);
        } else {
            // Nothing to blend with: this is the only branch that needs a copy
            // of the conditional prediction.
            output.pred = pred_cond;
        }
        return output;
    }

    // Takes ownership of the layer list and stores the scale and the
    // start/stop fractions used by is_enabled_for_step().
    SkipLayerGuidance::SkipLayerGuidance(std::vector<int> layers,
                                         float scale,
                                         float start,
                                         float stop)
        : layers_(std::move(layers)),
          scale_(scale),
          start_(start),
          stop_(stop) {
    }

    // Reports whether skip-layer guidance applies at `input.step`.
    // It is off when the scale is zero, the layer list is empty or the
    // schedule size is zero. Otherwise the step must lie strictly between
    // start_ * schedule_size and stop_ * schedule_size, both truncated to int.
    bool SkipLayerGuidance::is_enabled_for_step(const GuidanceInput& input) const {
        if (scale_ == 0.0f || layers_.empty() || input.schedule_size == 0) {
            return false;
        }
        const float schedule_size = static_cast<float>(input.schedule_size);
        const int start_step      = static_cast<int>(start_ * schedule_size);
        const int stop_step       = static_cast<int>(stop_ * schedule_size);
        return input.step > start_step && input.step < stop_step;
    }

    // Read-only access to the stored layer list.
    const std::vector<int>& SkipLayerGuidance::layers() const {
        return layers_;
    }

    // Adds the skip-layer correction to an already guided prediction.
    //
    // Returns `output` untouched when guidance is inactive for this step or
    // no `predict_skip_layer` callback was supplied. Returns an empty
    // GuiderOutput when `output.pred` is empty, the conditional prediction is
    // missing, or the callback yields an empty tensor. Otherwise stores the
    // callback result in `pred_skip_layer` and adds
    // (pred_cond - pred_skip_layer) * scale_ to `pred`.
    GuiderOutput SkipLayerGuidance::forward(const GuidanceInput& input,
                                            GuiderOutput output) const {
        if (!is_enabled_for_step(input) || !input.predict_skip_layer) {
            return output;
        }
        if (output.pred.empty() || !has_tensor(input.pred_cond)) {
            return GuiderOutput();
        }

        // The callback is only invoked once every cheaper precondition holds.
        output.pred_skip_layer = input.predict_skip_layer();
        if (output.pred_skip_layer.empty()) {
            return GuiderOutput();
        }

        output.pred += (*input.pred_cond - output.pred_skip_layer) * scale_;
        return output;
    }

}  // namespace sd::guidance

View File

@ -1,70 +0,0 @@
#ifndef __SD_GUIDANCE_H__
#define __SD_GUIDANCE_H__
#include <cstddef>
#include <functional>
#include <vector>
#include "tensor.hpp"
// Guidance stages: each one takes the per-condition model predictions and a
// previous GuiderOutput and returns a new GuiderOutput.
namespace sd::guidance {
// Bundle of predictions that a guidance stage receives (as `previous`) and returns.
struct GuiderOutput {
// Final guided prediction.
sd::Tensor<float> pred;
// NOTE(review): the three fields below are not written by the forward()
// implementations in guidance.cpp; presumably the caller fills them — confirm.
sd::Tensor<float> pred_cond;
sd::Tensor<float> pred_uncond;
sd::Tensor<float> pred_img_cond;
// Result of GuidanceInput::predict_skip_layer, stored by SkipLayerGuidance::forward.
sd::Tensor<float> pred_skip_layer;
};
// Read-only inputs for one guidance evaluation. The tensor pointers are
// non-owning and may be null.
struct GuidanceInput {
// Current step and schedule length; SkipLayerGuidance compares `step`
// against fractions of `schedule_size`.
int step = 0;
size_t schedule_size = 0;
// Model predictions for the individual conditions; null means "not available".
const sd::Tensor<float>* pred_cond = nullptr;
const sd::Tensor<float>* pred_uncond = nullptr;
const sd::Tensor<float>* pred_img_cond = nullptr;
// Optional callback that produces the skip-layer prediction on demand;
// an empty std::function disables skip-layer guidance.
std::function<sd::Tensor<float>()> predict_skip_layer;
};
// Common interface of all guidance stages.
class BaseGuidance {
public:
virtual ~BaseGuidance() = default;
// Produces a GuiderOutput from `input` and the output of an earlier stage.
virtual GuiderOutput forward(const GuidanceInput& input,
GuiderOutput previous) const = 0;
};
// Classifier-free guidance with a text scale and an image-condition scale.
class ClassifierFreeGuidance : public BaseGuidance {
float guidance_scale_ = 1.0f;
float image_guidance_scale_ = 1.0f;
public:
ClassifierFreeGuidance(float guidance_scale,
float image_guidance_scale);
// Ignores `previous`; returns an empty output when `input.pred_cond` is missing.
GuiderOutput forward(const GuidanceInput& input,
GuiderOutput previous) const override;
};
// Skip-layer guidance, active only inside a (start, stop) window of the schedule.
class SkipLayerGuidance : public BaseGuidance {
std::vector<int> layers_;
float scale_ = 0.0f;
// Window bounds expressed as fractions of GuidanceInput::schedule_size.
float start_ = 0.0f;
float stop_ = 1.0f;
public:
SkipLayerGuidance(std::vector<int> layers,
float scale,
float start,
float stop);
// True when the scale is non-zero, layers are configured, the schedule is
// non-empty and `input.step` lies strictly inside the window.
bool is_enabled_for_step(const GuidanceInput& input) const;
// Layer list passed to the constructor.
const std::vector<int>& layers() const;
// Adjusts `previous.pred` using `input.predict_skip_layer` when enabled.
GuiderOutput forward(const GuidanceInput& input,
GuiderOutput previous) const override;
};
} // namespace sd::guidance
#endif // __SD_GUIDANCE_H__

View File

@ -4,138 +4,6 @@
#include "ggml.h"
#include "tensor.hpp"
// Constant 128 x 3 float table. Judging by the name, each row maps one of 128
// LTX-AV latent channels onto (R, G, B) for latent previews — TODO confirm
// against the code that reads it.
const float ltxav_latent_rgb_proj[128][3] = {
{-0.0293802f, -0.0362516f, -0.0291386f},
{0.0117735f, 0.0223435f, 0.018856f},
{0.00922335f, 0.0145666f, 0.0038772f},
{0.0227299f, 0.0109122f, 0.0131384f},
{0.00192413f, 0.0024648f, 0.00689245f},
{-0.0105576f, -0.0135933f, -0.00873841f},
{-0.0310222f, -0.0396358f, -0.0408445f},
{0.0149737f, 0.0316323f, 0.03415f},
{0.0027752f, 0.00814889f, 0.0108575f},
{-0.000678017f, -0.00180589f, -0.0161684f},
{0.0153964f, 0.0159774f, 0.0186479f},
{-0.0222799f, -0.0202068f, -0.0181082f},
{0.0128696f, 0.00754416f, -0.00673279f},
{0.0142729f, 0.00448099f, -0.00193934f},
{-0.014066f, -0.0193755f, -0.0160104f},
{-0.0176785f, -0.015903f, -0.0152621f},
{0.0307381f, 0.0292082f, 0.0328668f},
{0.0332928f, 0.0368629f, 0.0440893f},
{0.0186304f, 0.0124069f, 0.0160734f},
{0.00477787f, -0.00315658f, -0.000145702f},
{0.0183099f, 0.0122593f, 0.00599732f},
{-0.0194551f, -0.0183924f, -0.0147465f},
{0.0025732f, 0.00442582f, 0.0173176f},
{-0.0169423f, -0.0293863f, -0.0225908f},
{-0.021228f, -0.0265094f, -0.0253049f},
{0.0327111f, 0.0187133f, 0.0266184f},
{-0.0226425f, -0.0313781f, -0.0414356f},
{-0.0163142f, -0.0146144f, -0.0171793f},
{0.0192183f, 0.0108411f, 0.00829186f},
{-0.032246f, -0.0274846f, -0.0287434f},
{0.00345399f, 0.0115567f, 0.015288f},
{0.000972292f, 0.00331303f, 0.0110501f},
{0.000939494f, -0.00705084f, -0.00979449f},
{0.0405155f, 0.0339534f, 0.0419513f},
{0.0198596f, 0.0186626f, 0.0213766f},
{-0.00982375f, -0.00880439f, -0.00470429f},
{-0.0313707f, -0.0258098f, -0.0211663f},
{0.0144159f, 0.0117896f, 0.0141573f},
{0.0164571f, 0.0149178f, 0.00921599f},
{0.0436184f, 0.0346583f, 0.0360647f},
{-0.00289744f, -0.000752502f, 0.000675415f},
{-0.00621715f, -0.000558851f, 0.0135814f},
{-0.00817579f, -0.0113584f, -0.00556793f},
{0.00965067f, 0.0178221f, 0.015821f},
{0.0211832f, 0.0180827f, 0.0154707f},
{-0.00412858f, -0.00374182f, 0.0029568f},
{-0.0175603f, -0.0226242f, -0.0279012f},
{-0.00437471f, -0.00668329f, 0.000164887f},
{-0.0355983f, -0.0419093f, -0.0383065f},
{0.0144314f, 0.0192514f, 0.0175639f},
{-0.0130693f, -0.00569884f, -0.00341647f},
{-0.00184689f, 0.00189034f, -0.00190561f},
{0.019457f, 0.00842282f, 0.0123738f},
{-0.00477146f, -0.00206932f, 0.00283336f},
{-0.0364544f, -0.0256141f, -0.0322336f},
{-0.0295634f, -0.0295048f, -0.021057f},
{0.0144484f, 0.0191862f, 0.0112445f},
{0.0536406f, 0.0582376f, 0.0570966f},
{0.0085178f, 0.00748455f, 0.00995162f},
{-0.0136637f, -0.0172914f, -0.0195978f},
{-0.0339128f, -0.0392692f, -0.0355216f},
{0.00612855f, 0.00568303f, -0.00212333f},
{-0.0029225f, 0.00668819f, 0.0122131f},
{0.00841843f, 0.000181587f, -0.00650644f},
{-0.00514432f, 0.0127043f, 0.0168049f},
{-0.00997384f, -0.00602262f, -0.0164031f},
{0.0233226f, 0.033254f, 0.0307266f},
{-0.0110201f, -0.0164169f, -0.0161829f},
{-0.0195952f, -0.0177943f, -0.0115377f},
{-0.00523918f, -0.00452043f, 0.00267397f},
{0.0313464f, 0.0288241f, 0.0262496f},
{0.0324018f, 0.0339792f, 0.0312209f},
{-0.0163247f, -0.0230503f, -0.0263239f},
{0.000420577f, -0.00535659f, -0.00663426f},
{-0.012897f, -0.00203767f, -0.000622678f},
{-0.0632956f, -0.0651325f, -0.0584479f},
{-0.00426634f, -0.0150098f, -0.00719348f},
{0.00476109f, 0.00674315f, 0.00895472f},
{0.0129384f, 0.0158352f, 0.00963773f},
{-0.0333379f, -0.0410522f, -0.0317462f},
{0.00344054f, 0.00275915f, 0.00355732f},
{0.0209062f, 0.0273453f, 0.0222967f},
{0.00827287f, 0.00223045f, 0.00325844f},
{-0.0149132f, -0.0183973f, -0.0199781f},
{-0.0100786f, -0.0103681f, -0.00218224f},
{-0.00791409f, -0.00405153f, -0.00599893f},
{0.0176126f, 0.00618342f, -6.6569e-05f},
{0.00942486f, -0.00206494f, -0.00580324f},
{0.00678093f, -0.00291742f, -0.000921195f},
{-0.0221992f, -0.00483162f, -0.000848514f},
{-0.0151587f, -0.0157166f, -0.0107302f},
{0.00909646f, 0.0171985f, 0.0169785f},
{0.0127224f, 0.0170612f, 0.0303428f},
{0.0196562f, 0.00212451f, 0.0127744f},
{0.0233013f, 0.0228994f, 0.0108387f},
{0.00520761f, 0.00992992f, 0.0066267f},
{-3.77736e-05f, 0.00460229f, -0.00475132f},
{-0.0311763f, -0.0453566f, -0.0486901f},
{0.0195798f, 0.0281246f, 0.0180102f},
{-0.0174149f, -0.0240867f, -0.0188785f},
{0.000104658f, 0.00659008f, 0.0144594f},
{-0.00311086f, -0.0241426f, -0.0244164f},
{0.0336462f, 0.0305173f, 0.0331101f},
{0.0613625f, 0.066561f, 0.0610198f},
{-0.0286757f, -0.0325401f, -0.0338036f},
{0.0141534f, 0.0188266f, 0.0253059f},
{-0.00548197f, -0.00170198f, 0.00561745f},
{-0.0117872f, -0.00763218f, -0.0145037f},
{-0.0253304f, -0.0245217f, -0.0144905f},
{-0.00393624f, 0.00350048f, 0.00765561f},
{0.0113625f, 0.00561576f, -0.0113672f},
{-0.0301278f, -0.0261472f, -0.0301903f},
{0.016863f, 0.0173781f, 0.0170916f},
{-0.00495108f, 0.00686749f, 0.00282767f},
{0.00125409f, -0.00378072f, -0.00264117f},
{-0.00264001f, -0.00529772f, -0.0113109f},
{-0.054888f, -0.0575461f, -0.0509146f},
{-0.019442f, -0.0232916f, -0.0258637f},
{0.0133362f, 0.0161808f, 0.00917951f},
{-0.0349002f, -0.0372642f, -0.0466206f},
{-0.00216926f, 0.00208738f, 0.00766492f},
{0.0268528f, 0.0301179f, 0.0228579f},
{0.0226176f, 0.021536f, 0.023152f},
{-0.0110646f, -0.00511349f, -0.0137346f},
{-0.0098424f, -0.00218176f, 0.00414545f},
{0.00200216f, 0.00441732f, -0.0136515f},
{0.00695946f, 0.00313109f, -0.00379435f},
{0.0188377f, 0.0144059f, 0.0229724f},
};
// Three floats; presumably the (R, G, B) offset that accompanies
// ltxav_latent_rgb_proj — TODO confirm against the code that reads it.
// NOTE(review): this array is not const, so it has external linkage; if this
// file is a header included from more than one translation unit, that would
// yield duplicate definitions — confirm how the file is included.
float ltxav_latent_rgb_bias[3] = {0.043849f, 0.0201085f, 0.0150286f};
const float wan_21_latent_rgb_proj[16][3] = {
{0.015123f, -0.148418f, 0.479828f},
{0.003652f, -0.010680f, -0.037142f},

View File

@ -7,7 +7,6 @@
#include <fstream>
#include <functional>
#include <iostream>
#include <limits>
#include <map>
#include <memory>
#include <optional>
@ -22,7 +21,6 @@
#include "json.hpp"
#include "rope.hpp"
#include "tokenizers/bpe_tokenizer.h"
#include "tokenizers/gemma_tokenizer.h"
#include "tokenizers/mistral_tokenizer.h"
#include "tokenizers/qwen2_tokenizer.h"
@ -35,7 +33,6 @@ namespace LLM {
QWEN3_VL,
MISTRAL_SMALL_3_2,
MINISTRAL_3_3B,
GEMMA3_12B,
ARCH_COUNT,
};
@ -45,12 +42,6 @@ namespace LLM {
"qwen3vl",
"mistral_small3.2",
"ministral3.3b",
"gemma3_12b",
};
enum class MLPActivation {
SILU,
GELU_TANH,
};
enum class LLMVisionArch {
@ -75,71 +66,23 @@ namespace LLM {
};
struct LLMParams {
LLMArch arch = LLMArch::QWEN2_5_VL;
int64_t num_layers = 28;
int64_t hidden_size = 3584;
int64_t intermediate_size = 18944;
int num_heads = 28;
int num_kv_heads = 4;
int head_dim = 128;
bool qkv_bias = true;
bool qk_norm = false;
bool rms_norm_add = false;
bool normalize_input = false;
int64_t vocab_size = 152064;
int64_t max_position_embeddings = 128000;
float rms_norm_eps = 1e-06f;
MLPActivation mlp_activation = MLPActivation::SILU;
std::vector<float> rope_thetas = {1000000.f};
std::vector<float> rope_scales = {1.f};
std::vector<int> sliding_attention;
LLMArch arch = LLMArch::QWEN2_5_VL;
int64_t num_layers = 28;
int64_t hidden_size = 3584;
int64_t intermediate_size = 18944;
int num_heads = 28;
int num_kv_heads = 4;
int head_dim = 128;
bool qkv_bias = true;
bool qk_norm = false;
int64_t vocab_size = 152064;
float rms_norm_eps = 1e-06f;
LLMVisionParams vision;
};
// RMS normalisation block with a learned per-feature weight and an optional
// "unit offset" mode in which the normalised input is added back to the
// weighted result.
struct LLMRMSNorm : public UnaryBlock {
protected:
    int64_t hidden_size;   // length of the 1-D weight tensor
    float eps;             // epsilon forwarded to ggml_rms_norm
    bool add_unit_offset;  // when true, forward() returns norm * weight + norm
    std::string prefix;    // tensor-name prefix captured in init_params()

    // Remembers the prefix and registers the F32 "weight" parameter.
    void init_params(ggml_context* ctx,
                     const String2TensorStorage& tensor_storage_map = {},
                     std::string prefix                             = "") override {
        this->prefix     = prefix;
        params["weight"] = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, hidden_size);
    }

public:
    LLMRMSNorm(int64_t hidden_size,
               float eps            = 1e-06f,
               bool add_unit_offset = false)
        : hidden_size(hidden_size),
          eps(eps),
          add_unit_offset(add_unit_offset) {}

    // Builds the graph nodes for the normalisation of `x` and returns the result.
    ggml_tensor* forward(GGMLRunnerContext* ctx, ggml_tensor* x) {
        auto gctx = ctx->ggml_ctx;

        // Let an installed weight adapter substitute the stored weight.
        ggml_tensor* weight = params["weight"];
        if (ctx->weight_adapter) {
            weight = ctx->weight_adapter->patch_weight(gctx, ctx->backend, weight, prefix + "weight");
        }

        ggml_tensor* normed = ggml_rms_norm(gctx, x, eps);
        ggml_tensor* result = ggml_mul(gctx, normed, weight);
        if (!add_unit_offset) {
            return result;
        }
        // norm * weight + norm
        return ggml_add_inplace(gctx, result, normed);
    }
};
struct MLP : public GGMLBlock {
protected:
MLPActivation activation;
public:
MLP(int64_t hidden_size,
int64_t intermediate_size,
bool bias = false,
MLPActivation activation_ = MLPActivation::SILU)
: activation(activation_) {
MLP(int64_t hidden_size, int64_t intermediate_size, bool bias = false) {
blocks["gate_proj"] = std::shared_ptr<GGMLBlock>(new Linear(hidden_size, intermediate_size, bias));
blocks["up_proj"] = std::shared_ptr<GGMLBlock>(new Linear(hidden_size, intermediate_size, bias));
blocks["down_proj"] = std::shared_ptr<GGMLBlock>(new Linear(intermediate_size, hidden_size, bias));
@ -152,13 +95,9 @@ namespace LLM {
auto down_proj = std::dynamic_pointer_cast<Linear>(blocks["down_proj"]);
auto h = gate_proj->forward(ctx, x);
if (activation == MLPActivation::GELU_TANH) {
h = ggml_ext_gelu(ctx->ggml_ctx, h, true);
} else {
h = ggml_silu_inplace(ctx->ggml_ctx, h);
}
h = ggml_mul_inplace(ctx->ggml_ctx, h, up_proj->forward(ctx, x));
h = down_proj->forward(ctx, h);
h = ggml_silu_inplace(ctx->ggml_ctx, h);
h = ggml_mul_inplace(ctx->ggml_ctx, h, up_proj->forward(ctx, x));
h = down_proj->forward(ctx, h);
return h;
}
};
@ -598,35 +537,24 @@ namespace LLM {
int64_t num_heads;
int64_t num_kv_heads;
bool qk_norm;
int64_t max_position_embeddings;
std::vector<float> rope_thetas;
std::vector<float> rope_scales;
public:
Attention(const LLMParams& params)
: arch(params.arch),
num_heads(params.num_heads),
num_kv_heads(params.num_kv_heads),
head_dim(params.head_dim),
qk_norm(params.qk_norm),
max_position_embeddings(params.max_position_embeddings),
rope_thetas(params.rope_thetas),
rope_scales(params.rope_scales) {
: arch(params.arch), num_heads(params.num_heads), num_kv_heads(params.num_kv_heads), head_dim(params.head_dim), qk_norm(params.qk_norm) {
blocks["q_proj"] = std::make_shared<Linear>(params.hidden_size, num_heads * head_dim, params.qkv_bias);
blocks["k_proj"] = std::make_shared<Linear>(params.hidden_size, num_kv_heads * head_dim, params.qkv_bias);
blocks["v_proj"] = std::make_shared<Linear>(params.hidden_size, num_kv_heads * head_dim, params.qkv_bias);
blocks["o_proj"] = std::make_shared<Linear>(num_heads * head_dim, params.hidden_size, false);
if (params.qk_norm) {
blocks["q_norm"] = std::make_shared<LLMRMSNorm>(head_dim, params.rms_norm_eps, params.rms_norm_add);
blocks["k_norm"] = std::make_shared<LLMRMSNorm>(head_dim, params.rms_norm_eps, params.rms_norm_add);
blocks["q_norm"] = std::make_shared<RMSNorm>(head_dim, params.rms_norm_eps);
blocks["k_norm"] = std::make_shared<RMSNorm>(head_dim, params.rms_norm_eps);
}
}
ggml_tensor* forward(GGMLRunnerContext* ctx,
ggml_tensor* x,
ggml_tensor* input_pos,
ggml_tensor* attention_mask = nullptr,
int rope_index = 0) {
ggml_tensor* attention_mask = nullptr) {
// x: [N, n_token, hidden_size]
int64_t n_token = x->ne[1];
int64_t N = x->ne[2];
@ -644,8 +572,8 @@ namespace LLM {
v = ggml_reshape_4d(ctx->ggml_ctx, v, head_dim, num_kv_heads, n_token, N); // [N, n_token, num_kv_heads, head_dim]
if (qk_norm) {
auto q_norm = std::dynamic_pointer_cast<LLMRMSNorm>(blocks["q_norm"]);
auto k_norm = std::dynamic_pointer_cast<LLMRMSNorm>(blocks["k_norm"]);
auto q_norm = std::dynamic_pointer_cast<RMSNorm>(blocks["q_norm"]);
auto k_norm = std::dynamic_pointer_cast<RMSNorm>(blocks["k_norm"]);
q = q_norm->forward(ctx, q);
k = k_norm->forward(ctx, k);
@ -660,36 +588,6 @@ namespace LLM {
} else if (arch == LLMArch::QWEN3) {
q = ggml_rope_ext(ctx->ggml_ctx, q, input_pos, nullptr, 128, GGML_ROPE_TYPE_NEOX, 40960, 1000000.f, 1.f, 0.f, 1.f, 32.f, 1.f);
k = ggml_rope_ext(ctx->ggml_ctx, k, input_pos, nullptr, 128, GGML_ROPE_TYPE_NEOX, 40960, 1000000.f, 1.f, 0.f, 1.f, 32.f, 1.f);
} else if (arch == LLMArch::GEMMA3_12B) {
float rope_theta = (rope_index == 1 ? 10000.0f : 1000000.0f);
float rope_scale = (rope_index == 1 ? 1.f : 8.f);
float freq_scale = 1.f / rope_scale;
q = ggml_rope_ext(ctx->ggml_ctx,
q,
input_pos,
nullptr,
head_dim,
GGML_ROPE_TYPE_NORMAL,
0,
rope_theta,
freq_scale,
0.f,
1.f,
32.f,
1.f);
k = ggml_rope_ext(ctx->ggml_ctx,
k,
input_pos,
nullptr,
head_dim,
GGML_ROPE_TYPE_NORMAL,
0,
rope_theta,
freq_scale,
0.f,
1.f,
32.f,
1.f);
} else if (arch == LLMArch::QWEN3_VL) {
int sections[4] = {24, 20, 20, 0};
q = ggml_rope_multi(ctx->ggml_ctx, q, input_pos, nullptr, head_dim, sections, GGML_ROPE_TYPE_IMROPE, 262144, 5000000.f, 1.f, 0.f, 1.f, 32.f, 1.f);
@ -714,76 +612,33 @@ namespace LLM {
};
struct TransformerBlock : public GGMLBlock {
protected:
LLMArch arch;
int sliding_attention;
bool has_post_attention_norm;
bool has_post_ffw_norm;
public:
TransformerBlock(const LLMParams& params, int layer_index)
: arch(params.arch),
sliding_attention(0),
has_post_attention_norm(params.arch == LLMArch::GEMMA3_12B),
has_post_ffw_norm(params.arch == LLMArch::GEMMA3_12B) {
TransformerBlock(const LLMParams& params) {
blocks["self_attn"] = std::make_shared<Attention>(params);
blocks["mlp"] = std::make_shared<MLP>(params.hidden_size,
params.intermediate_size,
false,
params.mlp_activation);
blocks["input_layernorm"] = std::make_shared<LLMRMSNorm>(params.hidden_size, params.rms_norm_eps, params.rms_norm_add);
blocks["post_attention_layernorm"] = std::make_shared<LLMRMSNorm>(params.hidden_size, params.rms_norm_eps, params.rms_norm_add);
if (has_post_attention_norm) {
blocks["post_attention_norm"] = std::make_shared<LLMRMSNorm>(params.hidden_size, params.rms_norm_eps, params.rms_norm_add);
}
if (has_post_ffw_norm) {
blocks["post_ffw_norm"] = std::make_shared<LLMRMSNorm>(params.hidden_size, params.rms_norm_eps, params.rms_norm_add);
}
if (!params.sliding_attention.empty()) {
sliding_attention = params.sliding_attention[layer_index % params.sliding_attention.size()];
}
blocks["mlp"] = std::make_shared<MLP>(params.hidden_size, params.intermediate_size);
blocks["input_layernorm"] = std::make_shared<RMSNorm>(params.hidden_size, params.rms_norm_eps);
blocks["post_attention_layernorm"] = std::make_shared<RMSNorm>(params.hidden_size, params.rms_norm_eps);
}
ggml_tensor* forward(GGMLRunnerContext* ctx,
ggml_tensor* x,
ggml_tensor* input_pos,
ggml_tensor* attention_mask = nullptr,
ggml_tensor* sliding_attention_mask = nullptr) {
ggml_tensor* attention_mask = nullptr) {
// x: [N, n_token, hidden_size]
auto self_attn = std::dynamic_pointer_cast<Attention>(blocks["self_attn"]);
auto mlp = std::dynamic_pointer_cast<MLP>(blocks["mlp"]);
auto input_layernorm = std::dynamic_pointer_cast<LLMRMSNorm>(blocks["input_layernorm"]);
auto post_attention_layernorm = std::dynamic_pointer_cast<LLMRMSNorm>(blocks["post_attention_layernorm"]);
std::shared_ptr<LLMRMSNorm> post_attention_norm = nullptr;
std::shared_ptr<LLMRMSNorm> post_ffw_norm = nullptr;
if (has_post_attention_norm) {
post_attention_norm = std::dynamic_pointer_cast<LLMRMSNorm>(blocks["post_attention_norm"]);
}
if (has_post_ffw_norm) {
post_ffw_norm = std::dynamic_pointer_cast<LLMRMSNorm>(blocks["post_ffw_norm"]);
}
ggml_tensor* block_attention_mask = attention_mask;
int rope_index = 0;
if (arch == LLMArch::GEMMA3_12B && sliding_attention > 0) {
block_attention_mask = sliding_attention_mask;
rope_index = 1;
}
auto self_attn = std::dynamic_pointer_cast<Attention>(blocks["self_attn"]);
auto mlp = std::dynamic_pointer_cast<MLP>(blocks["mlp"]);
auto input_layernorm = std::dynamic_pointer_cast<RMSNorm>(blocks["input_layernorm"]);
auto post_attention_layernorm = std::dynamic_pointer_cast<RMSNorm>(blocks["post_attention_layernorm"]);
auto residual = x;
x = input_layernorm->forward(ctx, x);
x = self_attn->forward(ctx, x, input_pos, block_attention_mask, rope_index);
if (post_attention_norm != nullptr) {
x = post_attention_norm->forward(ctx, x);
}
x = ggml_add_inplace(ctx->ggml_ctx, x, residual);
x = self_attn->forward(ctx, x, input_pos, attention_mask);
x = ggml_add_inplace(ctx->ggml_ctx, x, residual);
residual = x;
x = post_attention_layernorm->forward(ctx, x);
x = mlp->forward(ctx, x);
if (post_ffw_norm != nullptr) {
x = post_ffw_norm->forward(ctx, x);
}
x = ggml_add_inplace(ctx->ggml_ctx, x, residual);
x = ggml_add_inplace(ctx->ggml_ctx, x, residual);
return x;
}
@ -799,9 +654,9 @@ namespace LLM {
: num_layers(params.num_layers), params(params) {
blocks["embed_tokens"] = std::shared_ptr<GGMLBlock>(new Embedding(params.vocab_size, params.hidden_size));
for (int i = 0; i < num_layers; i++) {
blocks["layers." + std::to_string(i)] = std::shared_ptr<GGMLBlock>(new TransformerBlock(params, i));
blocks["layers." + std::to_string(i)] = std::shared_ptr<GGMLBlock>(new TransformerBlock(params));
}
blocks["norm"] = std::shared_ptr<GGMLBlock>(new LLMRMSNorm(params.hidden_size, params.rms_norm_eps, params.rms_norm_add));
blocks["norm"] = std::shared_ptr<GGMLBlock>(new RMSNorm(params.hidden_size, params.rms_norm_eps));
}
ggml_tensor* embed(GGMLRunnerContext* ctx,
@ -815,78 +670,46 @@ namespace LLM {
ggml_tensor* x,
ggml_tensor* input_pos,
ggml_tensor* attention_mask,
std::set<int> out_layers,
ggml_tensor* sliding_attention_mask = nullptr,
bool return_all_hidden_states = false) {
auto norm = std::dynamic_pointer_cast<LLMRMSNorm>(blocks["norm"]);
std::set<int> out_layers) {
auto norm = std::dynamic_pointer_cast<RMSNorm>(blocks["norm"]);
std::vector<ggml_tensor*> intermediate_outputs;
if (params.normalize_input) {
x = ggml_ext_scale(ctx->ggml_ctx, x, std::sqrt(static_cast<float>(params.hidden_size)), true);
}
if (return_all_hidden_states) {
intermediate_outputs.push_back(x);
}
sd::ggml_graph_cut::mark_graph_cut(x, "llm.text.prelude", "x");
for (int i = 0; i < num_layers; i++) {
auto block = std::dynamic_pointer_cast<TransformerBlock>(blocks["layers." + std::to_string(i)]);
x = block->forward(ctx, x, input_pos, attention_mask, sliding_attention_mask);
if (return_all_hidden_states || out_layers.size() > 1) {
x = block->forward(ctx, x, input_pos, attention_mask);
if (out_layers.size() > 1) {
x = ggml_cont(ctx->ggml_ctx, x);
}
sd::ggml_graph_cut::mark_graph_cut(x, "llm.text.layers." + std::to_string(i), "x");
if (return_all_hidden_states) {
if (i + 1 < num_layers) {
intermediate_outputs.push_back(x);
}
} else if (out_layers.find(i + 1) != out_layers.end()) {
if (out_layers.find(i + 1) != out_layers.end()) {
intermediate_outputs.push_back(x);
}
}
auto normed_x = norm->forward(ctx, x);
if (return_all_hidden_states) {
intermediate_outputs.push_back(normed_x);
if (!intermediate_outputs.empty()) {
x = intermediate_outputs[0];
for (int i = 1; i < intermediate_outputs.size(); i++) {
x = ggml_concat(ctx->ggml_ctx, x, intermediate_outputs[i], 0);
}
} else if (!intermediate_outputs.empty()) {
if (out_layers.find(static_cast<int>(num_layers + 1)) != out_layers.end()) {
intermediate_outputs.push_back(normed_x);
}
x = intermediate_outputs[0];
for (int i = 1; i < intermediate_outputs.size(); i++) {
x = ggml_concat(ctx->ggml_ctx, x, intermediate_outputs[i], 0);
}
} else {
x = normed_x;
return x;
}
return x;
return norm->forward(ctx, x);
}
ggml_tensor* forward(GGMLRunnerContext* ctx,
ggml_tensor* input_ids,
ggml_tensor* input_pos,
ggml_tensor* attention_mask,
ggml_tensor* sliding_attention_mask,
std::vector<std::pair<int, ggml_tensor*>> image_embeds,
std::set<int> out_layers,
bool return_all_hidden_states = false) {
std::set<int> out_layers) {
// input_ids: [N, n_token]
// return: [N, n_token, hidden_size]
auto x = embed(ctx, input_ids);
x = splice_image_embeds(ctx, x, image_embeds);
return forward_embeds(ctx,
x,
input_pos,
attention_mask,
std::move(out_layers),
sliding_attention_mask,
return_all_hidden_states);
return forward_embeds(ctx, x, input_pos, attention_mask, std::move(out_layers));
}
};
@ -908,21 +731,12 @@ namespace LLM {
ggml_tensor* input_ids,
ggml_tensor* input_pos,
ggml_tensor* attention_mask,
ggml_tensor* sliding_attention_mask,
std::vector<std::pair<int, ggml_tensor*>> image_embeds,
std::set<int> out_layers,
bool return_all_hidden_states = false) {
std::set<int> out_layers) {
// input_ids: [N, n_token]
auto model = std::dynamic_pointer_cast<TextModel>(blocks["model"]);
auto x = model->forward(ctx,
input_ids,
input_pos,
attention_mask,
sliding_attention_mask,
image_embeds,
out_layers,
return_all_hidden_states);
auto x = model->forward(ctx, input_ids, input_pos, attention_mask, image_embeds, out_layers);
return x;
}
@ -950,7 +764,6 @@ namespace LLM {
std::vector<int> input_pos_vec;
std::vector<float> attention_mask_vec;
std::vector<float> sliding_attention_mask_vec;
std::vector<float> window_mask_vec;
std::vector<int> window_index_vec;
std::vector<int> window_inverse_index_vec;
@ -1185,23 +998,6 @@ namespace LLM {
params.qkv_bias = false;
params.qk_norm = true;
params.rms_norm_eps = 1e-6f;
} else if (arch == LLMArch::GEMMA3_12B) {
params.head_dim = 256;
params.num_heads = 16;
params.num_kv_heads = 8;
params.qkv_bias = false;
params.qk_norm = true;
params.rms_norm_eps = 1e-6f;
// llama.cpp adds +1 to Gemma3 norm.weight when exporting GGUF, so GGUF loading
// must keep rms_norm_add disabled here or the offset gets applied twice.
// Convenient for the converter, less convenient for whoever gets to debug it later.
params.rms_norm_add = false;
params.normalize_input = true;
params.max_position_embeddings = 131072;
params.mlp_activation = MLPActivation::GELU_TANH;
params.rope_thetas = {1000000.f, 10000.f};
params.rope_scales = {8.f, 1.f};
params.sliding_attention = {1024, 1024, 1024, 1024, 1024, 0};
}
bool have_vision_weight = false;
bool llama_cpp_style = false;
@ -1271,18 +1067,9 @@ namespace LLM {
ggml_tensor* input_ids,
ggml_tensor* input_pos,
ggml_tensor* attention_mask,
ggml_tensor* sliding_attention_mask,
std::vector<std::pair<int, ggml_tensor*>> image_embeds,
std::set<int> out_layers,
bool return_all_hidden_states = false) {
auto hidden_states = model.forward(ctx,
input_ids,
input_pos,
attention_mask,
sliding_attention_mask,
image_embeds,
out_layers,
return_all_hidden_states); // [N, n_token, hidden_size]
std::set<int> out_layers) {
auto hidden_states = model.forward(ctx, input_ids, input_pos, attention_mask, image_embeds, out_layers); // [N, n_token, hidden_size]
return hidden_states;
}
@ -1300,9 +1087,8 @@ namespace LLM {
ggml_cgraph* build_graph(const sd::Tensor<int32_t>& input_ids_tensor,
const sd::Tensor<float>& attention_mask_tensor,
const std::vector<std::pair<int, sd::Tensor<float>>>& image_embeds_tensor,
std::set<int> out_layers,
bool return_all_hidden_states = false) {
ggml_cgraph* gf = new_graph_custom(LLM_GRAPH_SIZE);
std::set<int> out_layers) {
ggml_cgraph* gf = ggml_new_graph(compute_ctx);
ggml_tensor* input_ids = make_input(input_ids_tensor);
std::vector<std::pair<int, ggml_tensor*>> image_embeds;
image_embeds.reserve(image_embeds_tensor.size());
@ -1312,10 +1098,7 @@ namespace LLM {
}
int64_t n_tokens = input_ids->ne[0];
if (params.arch == LLMArch::MISTRAL_SMALL_3_2 ||
params.arch == LLMArch::MINISTRAL_3_3B ||
params.arch == LLMArch::QWEN3 ||
params.arch == LLMArch::GEMMA3_12B) {
if (params.arch == LLMArch::MISTRAL_SMALL_3_2 || params.arch == LLMArch::MINISTRAL_3_3B || params.arch == LLMArch::QWEN3) {
input_pos_vec.resize(n_tokens);
for (int i = 0; i < n_tokens; ++i) {
input_pos_vec[i] = i;
@ -1335,8 +1118,7 @@ namespace LLM {
input_pos_vec.size());
set_backend_tensor_data(input_pos, input_pos_vec.data());
ggml_tensor* attention_mask = nullptr;
ggml_tensor* sliding_attention_mask = nullptr;
ggml_tensor* attention_mask = nullptr;
if (!attention_mask_tensor.empty()) {
attention_mask = make_input(attention_mask_tensor);
} else {
@ -1354,36 +1136,9 @@ namespace LLM {
set_backend_tensor_data(attention_mask, attention_mask_vec.data());
}
if (params.arch == LLMArch::GEMMA3_12B) {
sliding_attention_mask_vec.resize(n_tokens * n_tokens);
if (!attention_mask_tensor.empty()) {
GGML_ASSERT(attention_mask_tensor.numel() == n_tokens * n_tokens);
sliding_attention_mask_vec = attention_mask_tensor.values();
} else {
sliding_attention_mask_vec = attention_mask_vec;
}
for (int i0 = 0; i0 < n_tokens; i0++) {
for (int i1 = 0; i1 < n_tokens; i1++) {
if (i0 + 1024 <= i1) {
LOG_DEBUG("xxxxxxxxxxxxxx");
sliding_attention_mask_vec[i1 * n_tokens + i0] = -INFINITY;
}
}
}
sliding_attention_mask = ggml_new_tensor_2d(compute_ctx, GGML_TYPE_F32, n_tokens, n_tokens);
set_backend_tensor_data(sliding_attention_mask, sliding_attention_mask_vec.data());
}
auto runner_ctx = get_context();
ggml_tensor* hidden_states = forward(&runner_ctx,
input_ids,
input_pos,
attention_mask,
sliding_attention_mask,
image_embeds,
out_layers,
return_all_hidden_states);
ggml_tensor* hidden_states = forward(&runner_ctx, input_ids, input_pos, attention_mask, image_embeds, out_layers);
ggml_build_forward_expand(gf, hidden_states);
@ -1394,14 +1149,9 @@ namespace LLM {
const sd::Tensor<int32_t>& input_ids,
const sd::Tensor<float>& attention_mask,
const std::vector<std::pair<int, sd::Tensor<float>>>& image_embeds,
std::set<int> out_layers,
bool return_all_hidden_states = false) {
std::set<int> out_layers) {
auto get_graph = [&]() -> ggml_cgraph* {
return build_graph(input_ids,
attention_mask,
image_embeds,
out_layers,
return_all_hidden_states);
return build_graph(input_ids, attention_mask, image_embeds, out_layers);
};
return take_or_empty(GGMLRunner::compute<float>(get_graph, n_threads, true));
}

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@ -462,9 +462,6 @@ SDVersion ModelLoader::get_sd_version() {
if (tensor_storage.name.find("model.diffusion_model.layers.0.adaLN_sa_ln.weight") != std::string::npos) {
return VERSION_ERNIE_IMAGE;
}
if (tensor_storage.name.find("model.diffusion_model.adaln_single.emb.timestep_embedder.linear_1.bias") != std::string::npos) {
return VERSION_LTXAV;
}
if (tensor_storage.name.find("model.diffusion_model.blocks.0.cross_attn.norm_k.weight") != std::string::npos) {
is_wan = true;
}

View File

@ -42,7 +42,6 @@ enum SDVersion {
VERSION_ANIMA,
VERSION_FLUX2,
VERSION_FLUX2_KLEIN,
VERSION_LTXAV,
VERSION_HIDREAM_O1,
VERSION_Z_IMAGE,
VERSION_OVIS_IMAGE,
@ -106,13 +105,6 @@ static inline bool sd_version_is_flux2(SDVersion version) {
return false;
}
// Returns true only when `version` equals VERSION_LTXAV.
static inline bool sd_version_is_ltxav(SDVersion version) {
    return version == VERSION_LTXAV;
}
static inline bool sd_version_is_wan(SDVersion version) {
if (version == VERSION_WAN2 || version == VERSION_WAN2_2_I2V || version == VERSION_WAN2_2_TI2V) {
return true;
@ -169,7 +161,6 @@ static inline bool sd_version_is_inpaint(SDVersion version) {
static inline bool sd_version_is_dit(SDVersion version) {
if (sd_version_is_flux(version) ||
sd_version_is_flux2(version) ||
sd_version_is_ltxav(version) ||
sd_version_is_sd3(version) ||
sd_version_is_wan(version) ||
sd_version_is_qwen_image(version) ||

File diff suppressed because it is too large Load Diff

View File

@ -2,6 +2,7 @@
#define __TAE_HPP__
#include "ggml_extend.hpp"
#include "model.h"
/*

View File

@ -104,7 +104,7 @@ namespace sd {
throw std::invalid_argument("tensor file type does not match requested sd::Tensor type");
}
std::vector<int64_t> shape(n_dims, 1);
std::vector<int64_t> shape(4, 1);
for (int i = 0; i < n_dims; ++i) {
int32_t dim = 1;
file.read(reinterpret_cast<char*>(&dim), sizeof(dim));

View File

@ -162,37 +162,13 @@ std::vector<int> BPETokenizer::encode(const std::string& text, on_new_token_cb_t
std::string token_str = token;
std::u32string utf32_token;
if (byte_level_bpe) {
for (int i = 0; i < token_str.length(); i++) {
unsigned char b = token_str[i];
utf32_token += byte_encoder[b];
}
} else {
utf32_token = utf8_to_utf32(token_str);
for (int i = 0; i < static_cast<int>(token_str.length()); i++) {
unsigned char b = token_str[i];
utf32_token += byte_encoder[b];
}
auto bpe_strs = bpe(utf32_token);
for (auto bpe_str : bpe_strs) {
int token_id;
auto iter = encoder.find(bpe_str);
if (iter != encoder.end()) {
token_id = iter->second;
} else {
if (byte_fallback) {
auto utf8_token_str = utf32_to_utf8(bpe_str);
for (int i = 0; i < utf8_token_str.length(); i++) {
unsigned char b = utf8_token_str[i];
char hex_buf[16];
snprintf(hex_buf, sizeof(hex_buf), "<0x%02X>", b);
iter = encoder.find(utf8_to_utf32(hex_buf));
bpe_tokens.push_back(token_id);
token_strs.push_back(hex_buf);
}
continue;
} else {
token_id = UNK_TOKEN_ID;
}
}
bpe_tokens.push_back(token_id);
bpe_tokens.push_back(encoder[bpe_str]);
token_strs.push_back(utf32_to_utf8(bpe_str));
}
}

View File

@ -20,10 +20,8 @@ protected:
std::map<std::u32string, int> encoder;
std::map<int, std::u32string> decoder;
std::map<std::pair<std::u32string, std::u32string>, int> bpe_ranks;
int encoder_len = 0;
int bpe_len = 0;
bool byte_level_bpe = true;
bool byte_fallback = false;
int encoder_len = 0;
int bpe_len = 0;
protected:
static std::vector<std::pair<int, std::u32string>> bytes_to_unicode();

View File

@ -1,191 +0,0 @@
#include "gemma_tokenizer.h"
#include "ggml.h"
#include "json.hpp"
#include "util.h"
#include "vocab/vocab.h"
// Returns a copy of `text` in which every ASCII space has been replaced by
// the three-byte UTF-8 sequence E2 96 81 (U+2581); every other byte is copied
// through unchanged.
std::string GemmaTokenizer::normalize(const std::string& text) const {
    std::string result;
    result.reserve(text.size());
    for (const char ch : text) {
        if (ch == ' ') {
            result += "\xE2\x96\x81";
        } else {
            result += ch;
        }
    }
    return result;
}
void GemmaTokenizer::load_from_merges(const std::string& merges_utf8_str, const std::string& vocab_utf8_str) {
nlohmann::json vocab;
try {
vocab = nlohmann::json::parse(vocab_utf8_str);
} catch (const nlohmann::json::parse_error&) {
GGML_ABORT("invalid vocab json str");
}
for (const auto& [key, value] : vocab.items()) {
std::u32string token = utf8_to_utf32(key);
int i = value;
encoder[token] = i;
decoder[i] = token;
}
encoder_len = static_cast<int>(vocab.size());
LOG_DEBUG("vocab size: %d", encoder_len);
std::vector<std::u32string> merges = split_utf32(merges_utf8_str);
std::vector<std::pair<std::u32string, std::u32string>> merge_pairs;
for (const auto& merge : merges) {
size_t space_pos = merge.find(' ');
merge_pairs.emplace_back(merge.substr(0, space_pos), merge.substr(space_pos + 1));
}
LOG_DEBUG("merges size %zu", merge_pairs.size());
int rank = 0;
for (const auto& merge : merge_pairs) {
bpe_ranks[merge] = rank++;
}
bpe_len = rank;
}
// Sets up the tokenizer flags, the pad/eos/bos/unk tokens and their ids, the
// ordered `special_tokens` list, and finally the vocab and merge tables.
//
// merges_utf8_str / vocab_utf8_str: custom merges and vocab JSON. They are
// used only when BOTH are non-empty; otherwise the data returned by
// load_gemma_merges() / load_gemma_vocab_json() is loaded instead.
GemmaTokenizer::GemmaTokenizer(const std::string& merges_utf8_str, const std::string& vocab_utf8_str) {
    // Tokenizer behaviour flags.
    byte_level_bpe = false;
    byte_fallback  = true;
    add_bos_token  = true;
    pad_left       = true;

    // Core control tokens and their ids.
    PAD_TOKEN    = "<pad>";
    PAD_TOKEN_ID = 0;
    EOS_TOKEN    = "<eos>";
    EOS_TOKEN_ID = 1;
    BOS_TOKEN    = "<bos>";
    BOS_TOKEN_ID = 2;
    UNK_TOKEN    = "<unk>";
    UNK_TOKEN_ID = 3;

    // NOTE(review): the list below is emitted in a fixed order; presumably
    // consumers of `special_tokens` depend on it -- confirm before reordering.
    std::vector<std::string> ordered_tokens = {
        PAD_TOKEN,
        EOS_TOKEN,
        BOS_TOKEN,
        UNK_TOKEN,
        "<mask>",
        "[multimodal]",
    };

    // Appends "<unusedN>" for every N in [first, last].
    const auto append_unused = [&ordered_tokens](int first, int last) {
        for (int n = first; n <= last; n++) {
            ordered_tokens.push_back("<unused" + std::to_string(n) + ">");
        }
    };
    // Appends runs of `ch` with lengths 1..max_len.
    const auto append_char_runs = [&ordered_tokens](char ch, int max_len) {
        for (int len = 1; len <= max_len; len++) {
            ordered_tokens.push_back(std::string(len, ch));
        }
    };

    append_unused(0, 98);
    ordered_tokens.push_back("<start_of_turn>");
    ordered_tokens.push_back("<end_of_turn>");

    // Newline runs of length 1..31.
    append_char_runs('\n', 31);

    // Runs of 2..31 U+2581 markers (the single marker is not listed here).
    for (int count = 2; count <= 31; count++) {
        std::string marker_run;
        for (int k = 0; k < count; k++) {
            marker_run += "\xE2\x96\x81";
        }
        ordered_tokens.push_back(marker_run);
    }

    // HTML-like tags, grouped as opening tags followed by their closing tags.
    const std::vector<std::string> html_tokens = {
        "<table>", "<caption>", "<thead>", "<tbody>", "<tfoot>", "<tr>", "<th>", "<td>",
        "</table>", "</caption>", "</thead>", "</tbody>", "</tfoot>", "</tr>", "</th>", "</td>",
        "<h1>", "<h2>", "<h3>", "<h4>", "<h5>", "<h6>", "<blockquote>",
        "</h1>", "</h2>", "</h3>", "</h4>", "</h5>", "</h6>", "</blockquote>",
        "<strong>", "<em>", "<b>", "<i>", "<u>", "<s>", "<sub>", "<sup>", "<code>",
        "</strong>", "</em>", "</b>", "</i>", "</u>", "</s>", "</sub>", "</sup>", "</code>",
        "<a>", "<html>", "<body>", "<img>", "<span>", "<bbox>", "<ul>", "<li>", "<div>", "<iframe>", "<footer>",
        "</a>", "</html>", "</body>", "</img>", "</span>", "</bbox>", "</ul>", "</li>", "</div>", "</iframe>", "</footer>",
    };
    for (const auto& tag : html_tokens) {
        ordered_tokens.push_back(tag);
    }

    // One "<0xNN>" token per byte value, 0x00 through 0xFF.
    for (int byte_value = 0; byte_value <= 0xFF; byte_value++) {
        char hex_buf[16];
        snprintf(hex_buf, sizeof(hex_buf), "<0x%02X>", byte_value);
        ordered_tokens.push_back(hex_buf);
    }

    // Tab runs of length 1..31 come directly before the image markers.
    append_char_runs('\t', 31);
    ordered_tokens.push_back("<start_of_image>");
    ordered_tokens.push_back("<end_of_image>");
    append_unused(99, 6241);
    ordered_tokens.push_back("<image_soft_token>");

    special_tokens = ordered_tokens;

    // Custom data is honoured only when both strings are supplied.
    const bool has_custom_data = !merges_utf8_str.empty() && !vocab_utf8_str.empty();
    if (has_custom_data) {
        load_from_merges(merges_utf8_str, vocab_utf8_str);
    } else {
        load_from_merges(load_gemma_merges(), load_gemma_vocab_json());
    }
}

View File

@ -1,17 +0,0 @@
#ifndef __SD_TOKENIZERS_GEMMA_TOKENIZER_H__
#define __SD_TOKENIZERS_GEMMA_TOKENIZER_H__
#include <string>
#include "bpe_tokenizer.h"
// Tokenizer for Gemma, derived from BPETokenizer.
class GemmaTokenizer : public BPETokenizer {
protected:
// Loads the vocabulary from a JSON string (token text -> id) and the BPE merge
// ranks from a merges listing.
void load_from_merges(const std::string& merges_utf8_str, const std::string& vocab_utf8_str);
// Overrides the base-class normalization step for input text.
std::string normalize(const std::string& text) const override;
public:
// Both arguments default to the empty string. `explicit` prevents implicit
// conversion from a single std::string.
explicit GemmaTokenizer(const std::string& merges_utf8_str = "", const std::string& vocab_utf8_str = "");
};
#endif // __SD_TOKENIZERS_GEMMA_TOKENIZER_H__

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

View File

@ -1,7 +1,5 @@
#include "vocab.h"
#include "clip_t5.hpp"
#include "gemma_merges.hpp"
#include "gemma_vocab.hpp"
#include "mistral.hpp"
#include "qwen.hpp"
#include "umt5.hpp"
@ -35,13 +33,3 @@ std::string load_umt5_tokenizer_json() {
std::string json_str(reinterpret_cast<const char*>(umt5_tokenizer_json_str), sizeof(umt5_tokenizer_json_str));
return json_str;
}
std::string load_gemma_merges() {
std::string merges_utf8_str(reinterpret_cast<const char*>(gemma_merges_utf8_c_str), sizeof(gemma_merges_utf8_c_str));
return merges_utf8_str;
}
// Returns a std::string holding sizeof(gemma_vocab_json_utf8_c_str) bytes
// copied from gemma_vocab_json_utf8_c_str.
std::string load_gemma_vocab_json() {
    const char* bytes = reinterpret_cast<const char*>(gemma_vocab_json_utf8_c_str);
    return std::string(bytes, sizeof(gemma_vocab_json_utf8_c_str));
}

View File

@ -9,7 +9,5 @@ std::string load_mistral_merges();
std::string load_mistral_vocab_json();
std::string load_t5_tokenizer_json();
std::string load_umt5_tokenizer_json();
std::string load_gemma_merges();
std::string load_gemma_vocab_json();
#endif // __SD_TOKENIZERS_VOCAB_VOCAB_H__

View File

@ -67,9 +67,7 @@ public:
int get_scale_factor() {
int scale_factor = 8;
if (version == VERSION_LTXAV) {
scale_factor = 32;
} else if (version == VERSION_WAN2_2_TI2V) {
if (version == VERSION_WAN2_2_TI2V) {
scale_factor = 16;
} else if (sd_version_uses_flux2_vae(version)) {
scale_factor = 16;
@ -215,7 +213,6 @@ public:
virtual sd::Tensor<float> vae_to_diffusion_latents(const sd::Tensor<float>& latents) = 0;
virtual void get_param_tensors(std::map<std::string, ggml_tensor*>& tensors, const std::string prefix) = 0;
virtual void set_conv2d_scale(float scale) { SD_UNUSED(scale); };
virtual void set_temporal_tiling_enabled(bool enabled) { SD_UNUSED(enabled); };
};
struct FakeVAE : public VAE {

View File

@ -972,10 +972,10 @@ namespace WAN {
blocks["conv2"] = std::shared_ptr<GGMLBlock>(new CausalConv3d(z_dim, z_dim, {1, 1, 1}));
}
static ggml_tensor* patchify(ggml_context* ctx,
ggml_tensor* x,
int64_t patch_size,
int64_t b = 1) {
ggml_tensor* patchify(ggml_context* ctx,
ggml_tensor* x,
int64_t patch_size,
int64_t b = 1) {
// x: [b*c, f, h*q, w*r]
// return: [b*c*r*q, f, h, w]
if (patch_size == 1) {
@ -999,10 +999,10 @@ namespace WAN {
return x;
}
static ggml_tensor* unpatchify(ggml_context* ctx,
ggml_tensor* x,
int64_t patch_size,
int64_t b = 1) {
ggml_tensor* unpatchify(ggml_context* ctx,
ggml_tensor* x,
int64_t patch_size,
int64_t b = 1) {
// x: [b*c*r*q, f, h, w]
// return: [b*c, f, h*q, w*r]
if (patch_size == 1) {