From f2ec08f4b15f4c352118e83b6072c0c06c50839a Mon Sep 17 00:00:00 2001
From: leejet
Date: Thu, 13 Nov 2025 21:43:09 +0800
Subject: [PATCH] update lora docs

---
 docs/lora.md           | 41 +++++++++--------------------------------
 examples/cli/README.md |  6 ++++++
 examples/cli/main.cpp  |  5 +++--
 3 files changed, 18 insertions(+), 34 deletions(-)

diff --git a/docs/lora.md b/docs/lora.md
index 9885ae5..fe4fbc0 100644
--- a/docs/lora.md
+++ b/docs/lora.md
@@ -12,38 +12,15 @@ Here's a simple example:
 
 `../models/marblesh.safetensors` or `../models/marblesh.ckpt` will be applied to the model
 
-# Support matrix
+# LoRA Apply Mode
 
-> ℹ️ CUDA `get_rows` support is defined here:
-> [ggml-org/ggml/src/ggml-cuda/getrows.cu#L156](https://github.com/ggml-org/ggml/blob/7dee1d6a1e7611f238d09be96738388da97c88ed/src/ggml-cuda/getrows.cu#L156)
-> Currently only the basic types + Q4/Q5/Q8 are implemented. K-quants are **not** supported.
+There are two ways to apply a LoRA: **immediately** and **at_runtime**. You can select the mode with the `--lora-apply-mode` parameter.
 
-NOTE: The other backends may have different support.
+By default, the mode is selected automatically:
+
+* If the model weights contain any quantized parameters, the **at_runtime** mode is used;
+* Otherwise, the **immediately** mode is used.
+
+The **immediately** mode may have precision and compatibility issues with quantized parameters, but it usually offers faster inference speed and, in some cases, lower memory usage.
+In contrast, the **at_runtime** mode provides better compatibility and higher precision, but inference may be slower and memory usage may be higher in some cases.
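+
+For example, assuming the `marblesh` LoRA above lives in `../models`, forcing the **at_runtime** mode could look like this (the binary path and model file are placeholders; adjust them to your setup):
+
+```
+# hypothetical invocation: use your own checkpoint; the LoRA itself is selected via the <lora:name:strength> prompt tag
+./bin/sd -m ../models/v1-5-pruned-emaonly-Q8_0.gguf -p "a lovely cat<lora:marblesh:1>" --lora-model-dir ../models --lora-apply-mode at_runtime
+```
+
+Passing `--lora-apply-mode immediately` forces the faster mode instead, while the default `auto` keeps the selection rule described above.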
-| Quant / Type | CUDA | Vulkan |
-|--------------|------|--------|
-| F32          | ✔️   | ✔️     |
-| F16          | ✔️   | ✔️     |
-| BF16         | ✔️   | ✔️     |
-| I32          | ✔️   | ❌     |
-| Q4_0         | ✔️   | ✔️     |
-| Q4_1         | ✔️   | ✔️     |
-| Q5_0         | ✔️   | ✔️     |
-| Q5_1         | ✔️   | ✔️     |
-| Q8_0         | ✔️   | ✔️     |
-| Q2_K         | ❌   | ❌     |
-| Q3_K         | ❌   | ❌     |
-| Q4_K         | ❌   | ❌     |
-| Q5_K         | ❌   | ❌     |
-| Q6_K         | ❌   | ❌     |
-| Q8_K         | ❌   | ❌     |
-| IQ1_S        | ❌   | ✔️     |
-| IQ1_M        | ❌   | ✔️     |
-| IQ2_XXS      | ❌   | ✔️     |
-| IQ2_XS       | ❌   | ✔️     |
-| IQ2_S        | ❌   | ✔️     |
-| IQ3_XXS      | ❌   | ✔️     |
-| IQ3_S        | ❌   | ✔️     |
-| IQ4_XS       | ❌   | ✔️     |
-| IQ4_NL       | ❌   | ✔️     |
-| MXFP4        | ❌   | ✔️     |
diff --git a/examples/cli/README.md b/examples/cli/README.md
index 00e0942..84df1a1 100644
--- a/examples/cli/README.md
+++ b/examples/cli/README.md
@@ -99,6 +99,12 @@
   --sampling-method                  sampling method, one of [euler, euler_a, heun, dpm2, dpm++2s_a, dpm++2m, dpm++2mv2, ipndm, ipndm_v, lcm, ddim_trailing, tcd]
                                      (default: euler for Flux/SD3/Wan, euler_a otherwise)
   --prediction                       prediction type override, one of [eps, v, edm_v, sd3_flow, flux_flow]
+  --lora-apply-mode                  the way to apply LoRA, one of [auto, immediately, at_runtime], default is auto. In auto mode, if the model weights
+                                     contain any quantized parameters, the at_runtime mode will be used; otherwise, immediately will be used.
+                                     The immediately mode may have precision and compatibility issues with quantized parameters, but it usually
+                                     offers faster inference speed and, in some cases, lower memory usage. The at_runtime mode, in contrast, offers
+                                     better compatibility and higher precision, but inference may be slower and memory usage may be higher in
+                                     some cases.
   --scheduler                        denoiser sigma scheduler, one of [discrete, karras, exponential, ays, gits, smoothstep, sgm_uniform, simple],
                                      default: discrete
   --skip-layers                      layers to skip for SLG steps (default: [7,8,9])
diff --git a/examples/cli/main.cpp b/examples/cli/main.cpp
index d88ba86..a2df094 100644
--- a/examples/cli/main.cpp
+++ b/examples/cli/main.cpp
@@ -1143,8 +1143,9 @@ void parse_args(int argc, const char** argv, SDParams& params) {
      "--lora-apply-mode",
      "the way to apply LoRA, one of [auto, immediately, at_runtime], default is auto. "
      "In auto mode, if the model weights contain any quantized parameters, the at_runtime mode will be used; otherwise, immediately will be used."
-     "The immediately mode consumes less GPU memory but may lead to precision loss and compatibility issues. "
-     "The at_runtime mode uses slightly more memory, but offers better compatibility and higher precision.",
+     " The immediately mode may have precision and compatibility issues with quantized parameters, "
+     "but it usually offers faster inference speed and, in some cases, lower memory usage. "
+     "The at_runtime mode, in contrast, offers better compatibility and higher precision, but inference may be slower and memory usage may be higher in some cases.",
      on_lora_apply_mode_arg},
     {"",
      "--scheduler",