From f2ec08f4b15f4c352118e83b6072c0c06c50839a Mon Sep 17 00:00:00 2001
From: leejet
Date: Thu, 13 Nov 2025 21:43:09 +0800
Subject: [PATCH] update lora docs

---
 docs/lora.md           | 41 +++++++++--------------------------------
 examples/cli/README.md |  6 ++++++
 examples/cli/main.cpp  |  5 +++--
 3 files changed, 18 insertions(+), 34 deletions(-)

diff --git a/docs/lora.md b/docs/lora.md
index 9885ae5..fe4fbc0 100644
--- a/docs/lora.md
+++ b/docs/lora.md
@@ -12,38 +12,15 @@ Here's a simple example:
 
 `../models/marblesh.safetensors` or `../models/marblesh.ckpt` will be applied to the model
 
-# Support matrix
+# LoRA Apply Mode
 
-> ℹ️ CUDA `get_rows` support is defined here:
-> [ggml-org/ggml/src/ggml-cuda/getrows.cu#L156](https://github.com/ggml-org/ggml/blob/7dee1d6a1e7611f238d09be96738388da97c88ed/src/ggml-cuda/getrows.cu#L156)
-> Currently only the basic types + Q4/Q5/Q8 are implemented. K-quants are **not** supported.
+There are two ways to apply a LoRA: **immediately** and **at_runtime**. You can select the mode with the `--lora-apply-mode` parameter.
 
-NOTE: The other backends may have different support.
+By default, the mode is selected automatically:
+
+* If the model weights contain any quantized parameters, the **at_runtime** mode is used;
+* Otherwise, the **immediately** mode is used.
+
+The **immediately** mode may have precision and compatibility issues with quantized parameters, but it usually offers faster inference speed and, in some cases, lower memory usage.
+In contrast, the **at_runtime** mode provides better compatibility and higher precision, but inference may be slower and memory usage may be higher in some cases.
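+
+For example, assuming the `marblesh` LoRA above lives in `../models`, forcing the **at_runtime** mode could look like this (the binary path and model file are placeholders; adjust them to your setup):
+
+```
+# hypothetical invocation: use your own checkpoint; the LoRA itself is selected via the <lora:name:strength> prompt tag
+./bin/sd -m ../models/v1-5-pruned-emaonly-Q8_0.gguf -p "a lovely cat<lora:marblesh:1>" --lora-model-dir ../models --lora-apply-mode at_runtime
+```
+
+Passing `--lora-apply-mode immediately` forces the faster mode instead, while the default `auto` keeps the selection rule described above.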
-| Quant / Type | CUDA | Vulkan |
-|--------------|------|--------|
-| F32          | ✔️   | ✔️     |
-| F16          | ✔️   | ✔️     |
-| BF16         | ✔️   | ✔️     |
-| I32          | ✔️   | ❌     |
-| Q4_0         | ✔️   | ✔️     |
-| Q4_1         | ✔️   | ✔️     |
-| Q5_0         | ✔️   | ✔️     |
-| Q5_1         | ✔️   | ✔️     |
-| Q8_0         | ✔️   | ✔️     |
-| Q2_K         | ❌   | ❌     |
-| Q3_K         | ❌   | ❌     |
-| Q4_K         | ❌   | ❌     |
-| Q5_K         | ❌   | ❌     |
-| Q6_K         | ❌   | ❌     |
-| Q8_K         | ❌   | ❌     |
-| IQ1_S        | ❌   | ✔️     |
-| IQ1_M        | ❌   | ✔️     |
-| IQ2_XXS      | ❌   | ✔️     |
-| IQ2_XS       | ❌   | ✔️     |
-| IQ2_S        | ❌   | ✔️     |
-| IQ3_XXS      | ❌   | ✔️     |
-| IQ3_S        | ❌   | ✔️     |
-| IQ4_XS       | ❌   | ✔️     |
-| IQ4_NL       | ❌   | ✔️     |
-| MXFP4        | ❌   | ✔️     |
diff --git a/examples/cli/README.md b/examples/cli/README.md
index 00e0942..84df1a1 100644
--- a/examples/cli/README.md
+++ b/examples/cli/README.md
@@ -99,6 +99,12 @@
   --sampling-method                  sampling method, one of [euler, euler_a, heun, dpm2, dpm++2s_a, dpm++2m, dpm++2mv2, ipndm, ipndm_v, lcm, ddim_trailing, tcd]
                                      (default: euler for Flux/SD3/Wan, euler_a otherwise)
   --prediction                       prediction type override, one of [eps, v, edm_v, sd3_flow, flux_flow]
+  --lora-apply-mode                  the way to apply LoRA, one of [auto, immediately, at_runtime], default is auto. In auto mode, if the model weights
+                                     contain any quantized parameters, the at_runtime mode will be used; otherwise, immediately will be used.
+                                     The immediately mode may have precision and compatibility issues with quantized parameters, but it usually
+                                     offers faster inference speed and, in some cases, lower memory usage. The at_runtime mode, in contrast, offers
+                                     better compatibility and higher precision, but inference may be slower and memory usage may be higher in
+                                     some cases.
   --scheduler                        denoiser sigma scheduler, one of [discrete, karras, exponential, ays, gits, smoothstep, sgm_uniform, simple],
                                      default: discrete
   --skip-layers                      layers to skip for SLG steps (default: [7,8,9])
diff --git a/examples/cli/main.cpp b/examples/cli/main.cpp
index d88ba86..a2df094 100644
--- a/examples/cli/main.cpp
+++ b/examples/cli/main.cpp
@@ -1143,8 +1143,9 @@ void parse_args(int argc, const char** argv, SDParams& params) {
      "--lora-apply-mode",
      "the way to apply LoRA, one of [auto, immediately, at_runtime], default is auto. "
      "In auto mode, if the model weights contain any quantized parameters, the at_runtime mode will be used; otherwise, immediately will be used."
-     "The immediately mode consumes less GPU memory but may lead to precision loss and compatibility issues. "
-     "The at_runtime mode uses slightly more memory, but offers better compatibility and higher precision.",
+     " The immediately mode may have precision and compatibility issues with quantized parameters, "
+     "but it usually offers faster inference speed and, in some cases, lower memory usage. "
+     "The at_runtime mode, in contrast, offers better compatibility and higher precision, but inference may be slower and memory usage may be higher in some cases.",
      on_lora_apply_mode_arg},
     {"",
      "--scheduler",