refactor: update c api (#728)

leejet authored 2025-07-13 18:48:42 +08:00, committed by GitHub
parent a772dca27a
commit ca0bd9396e
7 changed files with 1200 additions and 1318 deletions


@@ -282,14 +282,14 @@ usage: ./bin/sd [arguments]
arguments:
-h, --help show this help message and exit
-M, --mode [MODEL] run mode (txt2img or img2img or convert, default: txt2img)
-M, --mode [MODE] run mode, one of: [img_gen, convert], default: img_gen
-t, --threads N number of threads to use during computation (default: -1)
If threads <= 0, then threads will be set to the number of CPU physical cores
-m, --model [MODEL] path to full model
--diffusion-model path to the standalone diffusion model
--clip_l path to the clip-l text encoder
--clip_g path to the clip-g text encoder
--t5xxl path to the the t5xxl text encoder
--t5xxl path to the t5xxl text encoder
--vae [VAE] path to vae
--taesd [TAESD_PATH] path to taesd. Using Tiny AutoEncoder for fast decoding (low quality)
--control-net [CONTROL_PATH] path to control net model
@@ -301,16 +301,18 @@ arguments:
--upscale-repeats Run the ESRGAN upscaler this many times (default 1)
--type [TYPE] weight type (examples: f32, f16, q4_0, q4_1, q5_0, q5_1, q8_0, q2_K, q3_K, q4_K)
If not specified, the default is the type of the weight file
--tensor-type-rules [EXPRESSION] weight type per tensor pattern (example: "^vae\.=f16,model\.=q8_0")
--lora-model-dir [DIR] lora model directory
-i, --init-img [IMAGE] path to the input image, required by img2img
--mask [MASK] path to the mask image, required by img2img with mask
--control-image [IMAGE] path to image condition, control net
-r, --ref_image [PATH] reference image for Flux Kontext models (can be used multiple times)
-r, --ref-image [PATH] reference image for Flux Kontext models (can be used multiple times)
-o, --output OUTPUT path to write result image to (default: ./output.png)
-p, --prompt [PROMPT] the prompt to render
-n, --negative-prompt PROMPT the negative prompt (default: "")
--cfg-scale SCALE unconditional guidance scale: (default: 7.0)
--guidance SCALE guidance scale for img2img (default: 3.5)
--img-cfg-scale SCALE image guidance scale for inpaint or instruct-pix2pix models: (default: same as --cfg-scale)
--guidance SCALE distilled guidance scale for models with guidance input (default: 3.5)
--slg-scale SCALE skip layer guidance (SLG) scale, only for DiT models: (default: 0)
0 means disabled, a value of 2.5 is nice for sd3.5 medium
--eta SCALE eta in DDIM, only for DDIM and TCD: (default: 0)
@@ -319,7 +321,7 @@ arguments:
--skip-layer-end END SLG disabling point: (default: 0.2)
SLG will be enabled at step int([STEPS]*[START]) and disabled at int([STEPS]*[END])
--strength STRENGTH strength for noising/unnoising (default: 0.75)
--style-ratio STYLE-RATIO strength for keeping input identity (default: 20%)
--style-ratio STYLE-RATIO strength for keeping input identity (default: 20)
--control-strength STRENGTH strength to apply Control Net (default: 0.9)
1.0 corresponds to full destruction of information in init image
-H, --height H image height, in pixel space (default: 512)
@@ -371,7 +373,7 @@ Using formats of different precisions will yield results of varying quality.
```
./bin/sd --mode img2img -m ../models/sd-v1-4.ckpt -p "cat with blue eyes" -i ./output.png -o ./img2img_output.png --strength 0.4
./bin/sd -m ../models/sd-v1-4.ckpt -p "cat with blue eyes" -i ./output.png -o ./img2img_output.png --strength 0.4
```
<p align="center">


@@ -27,7 +27,7 @@ You can download the preconverted gguf weights from [FLUX.1-Kontext-dev-GGUF](ht
For example:
```
.\bin\Release\sd.exe -M edit -r .\flux1-dev-q8_0.png --diffusion-model ..\models\flux1-kontext-dev-q8_0.gguf --vae ..\models\ae.sft --clip_l ..\models\clip_l.safetensors --t5xxl ..\models\t5xxl_fp16.safetensors -p "change 'flux.cpp' to 'kontext.cpp'" --cfg-scale 1.0 --sampling-method euler -v
.\bin\Release\sd.exe -r .\flux1-dev-q8_0.png --diffusion-model ..\models\flux1-kontext-dev-q8_0.gguf --vae ..\models\ae.sft --clip_l ..\models\clip_l.safetensors --t5xxl ..\models\t5xxl_fp16.safetensors -p "change 'flux.cpp' to 'kontext.cpp'" --cfg-scale 1.0 --sampling-method euler -v
```

File diff suppressed because it is too large.

File diff suppressed because it is too large.


@@ -30,7 +30,8 @@ extern "C" {
enum rng_type_t {
STD_DEFAULT_RNG,
CUDA_RNG
CUDA_RNG,
RNG_TYPE_COUNT
};
enum sample_method_t {
@@ -46,7 +47,7 @@ enum sample_method_t {
LCM,
DDIM_TRAILING,
TCD,
N_SAMPLE_METHODS
SAMPLE_METHOD_COUNT
};
enum schedule_t {
@@ -56,7 +57,7 @@ enum schedule_t {
EXPONENTIAL,
AYS,
GITS,
N_SCHEDULES
SCHEDULE_COUNT
};
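The renamed `*_COUNT` sentinels (replacing the old `N_*` spellings) pair with the name/str helper functions declared further down in this header. A minimal sketch of listing the available samplers, assuming the header installs as `stable-diffusion.h`:

```
#include <stdio.h>
#include "stable-diffusion.h"

int main(void) {
    // SAMPLE_METHOD_COUNT keeps the loop in sync with the enum,
    // with no hardcoded upper bound.
    for (int i = 0; i < SAMPLE_METHOD_COUNT; i++) {
        printf("%d: %s\n", i, sd_sample_method_name((enum sample_method_t)i));
    }
    return 0;
}
```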
// same as enum ggml_type
@@ -103,8 +104,6 @@ enum sd_type_t {
SD_TYPE_COUNT = 39,
};
SD_API const char* sd_type_name(enum sd_type_t type);
enum sd_log_level_t {
SD_LOG_DEBUG,
SD_LOG_INFO,
@@ -112,13 +111,33 @@ enum sd_log_level_t {
SD_LOG_ERROR
};
typedef void (*sd_log_cb_t)(enum sd_log_level_t level, const char* text, void* data);
typedef void (*sd_progress_cb_t)(int step, int steps, float time, void* data);
SD_API void sd_set_log_callback(sd_log_cb_t sd_log_cb, void* data);
SD_API void sd_set_progress_callback(sd_progress_cb_t cb, void* data);
SD_API int32_t get_num_physical_cores();
SD_API const char* sd_get_system_info();
typedef struct {
const char* model_path;
const char* clip_l_path;
const char* clip_g_path;
const char* t5xxl_path;
const char* diffusion_model_path;
const char* vae_path;
const char* taesd_path;
const char* control_net_path;
const char* lora_model_dir;
const char* embedding_dir;
const char* stacked_id_embed_dir;
bool vae_decode_only;
bool vae_tiling;
bool free_params_immediately;
int n_threads;
enum sd_type_t wtype;
enum rng_type_t rng_type;
enum schedule_t schedule;
bool keep_clip_on_cpu;
bool keep_control_net_on_cpu;
bool keep_vae_on_cpu;
bool diffusion_flash_attn;
bool chroma_use_dit_mask;
bool chroma_use_t5_mask;
int chroma_t5_mask_pad;
} sd_ctx_params_t;
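This struct replaces the long positional argument list of new_sd_ctx. A hedged sketch of the intended call pattern, using the sd_ctx_params_init / new_sd_ctx / free_sd_ctx declarations further down in this diff (the model path is a placeholder):

```
#include <stdio.h>
#include "stable-diffusion.h"

int main(void) {
    sd_ctx_params_t params;
    sd_ctx_params_init(&params);          // fill every field with defaults first
    params.model_path = "sd-v1-4.ckpt";   // placeholder path
    params.n_threads  = -1;               // <= 0: use physical core count
    sd_ctx_t* ctx = new_sd_ctx(&params);
    if (!ctx) {                           // NULL on failure is an assumption
        fprintf(stderr, "failed to create context\n");
        return 1;
    }
    free_sd_ctx(ctx);
    return 0;
}
```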
typedef struct {
uint32_t width;
@@ -127,8 +146,6 @@ typedef struct {
uint8_t* data;
} sd_image_t;
typedef struct sd_ctx_t sd_ctx_t;
typedef struct {
int* layers;
size_t layer_count;
@@ -145,106 +162,76 @@ typedef struct {
sd_slg_params_t slg;
} sd_guidance_params_t;
SD_API sd_ctx_t* new_sd_ctx(const char* model_path,
const char* clip_l_path,
const char* clip_g_path,
const char* t5xxl_path,
const char* diffusion_model_path,
const char* vae_path,
const char* taesd_path,
const char* control_net_path_c_str,
const char* lora_model_dir,
const char* embed_dir_c_str,
const char* stacked_id_embed_dir_c_str,
bool vae_decode_only,
bool vae_tiling,
bool free_params_immediately,
int n_threads,
enum sd_type_t wtype,
enum rng_type_t rng_type,
enum schedule_t s,
bool keep_clip_on_cpu,
bool keep_control_net_cpu,
bool keep_vae_on_cpu,
bool diffusion_flash_attn,
bool chroma_use_dit_mask,
bool chroma_use_t5_mask,
int chroma_t5_mask_pad);
typedef struct {
const char* prompt;
const char* negative_prompt;
int clip_skip;
sd_guidance_params_t guidance;
sd_image_t init_image;
sd_image_t* ref_images;
int ref_images_count;
sd_image_t mask_image;
int width;
int height;
enum sample_method_t sample_method;
int sample_steps;
float eta;
float strength;
int64_t seed;
int batch_count;
const sd_image_t* control_cond;
float control_strength;
float style_strength;
bool normalize_input;
const char* input_id_images_path;
} sd_img_gen_params_t;
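The old txt2img, img2img, and edit entry points collapse into this one parameter struct plus generate_image (declared below). A sketch of a plain txt2img call under that assumption:

```
#include "stable-diffusion.h"

// Sketch: txt2img through the consolidated generate_image entry point.
// sd_ctx is assumed to come from new_sd_ctx as above.
sd_image_t* generate(sd_ctx_t* sd_ctx) {
    sd_img_gen_params_t p;
    sd_img_gen_params_init(&p);   // defaults for every field
    p.prompt       = "a lovely cat";
    p.width        = 512;
    p.height       = 512;
    p.sample_steps = 20;
    p.seed         = 42;
    p.batch_count  = 1;
    // Returns batch_count images; caller-owned result is an assumption.
    return generate_image(sd_ctx, &p);
}
```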
typedef struct {
sd_image_t init_image;
int width;
int height;
sd_guidance_params_t guidance;
enum sample_method_t sample_method;
int sample_steps;
float strength;
int64_t seed;
int video_frames;
int motion_bucket_id;
int fps;
float augmentation_level;
} sd_vid_gen_params_t;
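The video path follows the same init-then-call pattern, though the header below marks generate_video as broken; this is illustrative only and the field values are assumptions, not documented defaults:

```
#include "stable-diffusion.h"

// Illustrative only: generate_video is marked "// broken" below.
sd_image_t* make_video(sd_ctx_t* sd_ctx, sd_image_t init) {
    sd_vid_gen_params_t p;
    sd_vid_gen_params_init(&p);
    p.init_image   = init;
    p.width        = 512;
    p.height       = 512;
    p.video_frames = 14;   // frame count chosen for illustration
    return generate_video(sd_ctx, &p);
}
```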
typedef struct sd_ctx_t sd_ctx_t;
typedef void (*sd_log_cb_t)(enum sd_log_level_t level, const char* text, void* data);
typedef void (*sd_progress_cb_t)(int step, int steps, float time, void* data);
SD_API void sd_set_log_callback(sd_log_cb_t sd_log_cb, void* data);
SD_API void sd_set_progress_callback(sd_progress_cb_t cb, void* data);
SD_API int32_t get_num_physical_cores();
SD_API const char* sd_get_system_info();
SD_API const char* sd_type_name(enum sd_type_t type);
SD_API enum sd_type_t str_to_sd_type(const char* str);
SD_API const char* sd_rng_type_name(enum rng_type_t rng_type);
SD_API enum rng_type_t str_to_rng_type(const char* str);
SD_API const char* sd_sample_method_name(enum sample_method_t sample_method);
SD_API enum sample_method_t str_to_sample_method(const char* str);
SD_API const char* sd_schedule_name(enum schedule_t schedule);
SD_API enum schedule_t str_to_schedule(const char* str);
SD_API void sd_ctx_params_init(sd_ctx_params_t* sd_ctx_params);
SD_API char* sd_ctx_params_to_str(const sd_ctx_params_t* sd_ctx_params);
SD_API sd_ctx_t* new_sd_ctx(const sd_ctx_params_t* sd_ctx_params);
SD_API void free_sd_ctx(sd_ctx_t* sd_ctx);
SD_API sd_image_t* txt2img(sd_ctx_t* sd_ctx,
const char* prompt,
const char* negative_prompt,
int clip_skip,
sd_guidance_params_t guidance,
float eta,
int width,
int height,
enum sample_method_t sample_method,
int sample_steps,
int64_t seed,
int batch_count,
const sd_image_t* control_cond,
float control_strength,
float style_strength,
bool normalize_input,
const char* input_id_images_path);
SD_API void sd_img_gen_params_init(sd_img_gen_params_t* sd_img_gen_params);
SD_API char* sd_img_gen_params_to_str(const sd_img_gen_params_t* sd_img_gen_params);
SD_API sd_image_t* generate_image(sd_ctx_t* sd_ctx, const sd_img_gen_params_t* sd_img_gen_params);
SD_API sd_image_t* img2img(sd_ctx_t* sd_ctx,
sd_image_t init_image,
sd_image_t mask_image,
const char* prompt,
const char* negative_prompt,
int clip_skip,
sd_guidance_params_t guidance,
float eta,
int width,
int height,
enum sample_method_t sample_method,
int sample_steps,
float strength,
int64_t seed,
int batch_count,
const sd_image_t* control_cond,
float control_strength,
float style_strength,
bool normalize_input,
const char* input_id_images_path);
SD_API sd_image_t* img2vid(sd_ctx_t* sd_ctx,
sd_image_t init_image,
int width,
int height,
int video_frames,
int motion_bucket_id,
int fps,
float augmentation_level,
sd_guidance_params_t guidance,
enum sample_method_t sample_method,
int sample_steps,
float strength,
int64_t seed);
SD_API sd_image_t* edit(sd_ctx_t* sd_ctx,
sd_image_t* ref_images,
int ref_images_count,
const char* prompt,
const char* negative_prompt,
int clip_skip,
sd_guidance_params_t guidance,
float eta,
int width,
int height,
enum sample_method_t sample_method,
int sample_steps,
int64_t seed,
int batch_count,
const sd_image_t* control_cond,
float control_strength,
float style_strength,
bool normalize_input,
const char* input_id_images_path);
SD_API void sd_vid_gen_params_init(sd_vid_gen_params_t* sd_vid_gen_params);
SD_API sd_image_t* generate_video(sd_ctx_t* sd_ctx, const sd_vid_gen_params_t* sd_vid_gen_params); // broken
typedef struct upscaler_ctx_t upscaler_ctx_t;
@@ -254,7 +241,11 @@ SD_API void free_upscaler_ctx(upscaler_ctx_t* upscaler_ctx);
SD_API sd_image_t upscale(upscaler_ctx_t* upscaler_ctx, sd_image_t input_image, uint32_t upscale_factor);
SD_API bool convert(const char* input_path, const char* vae_path, const char* output_path, enum sd_type_t output_type, const char* tensor_type_rules);
SD_API bool convert(const char* input_path,
const char* vae_path,
const char* output_path,
enum sd_type_t output_type,
const char* tensor_type_rules);
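convert now takes a tensor_type_rules string alongside the output type, matching the new --tensor-type-rules CLI flag shown above. A sketch with placeholder paths, assuming the q8_0 member of sd_type_t and that a NULL vae_path means no external VAE is merged in:

```
#include <stdbool.h>
#include "stable-diffusion.h"

int main(void) {
    // Paths are placeholders; the rules string follows the
    // --tensor-type-rules syntax from the CLI help above.
    bool ok = convert("sd-v1-4.ckpt", /*vae_path*/ NULL,
                      "sd-v1-4-q8_0.gguf", SD_TYPE_Q8_0, "^vae\\.=f16");
    return ok ? 0 : 1;
}
```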
SD_API uint8_t* preprocess_canny(uint8_t* img,
int width,


@@ -441,10 +441,6 @@ const char* sd_get_system_info() {
return buffer;
}
const char* sd_type_name(enum sd_type_t type) {
return ggml_type_name((ggml_type)type);
}
sd_image_f32_t sd_image_t_to_sd_image_f32_t(sd_image_t image) {
sd_image_f32_t converted_image;
converted_image.width = image.width;

util.h

@@ -7,6 +7,9 @@
#include "stable-diffusion.h"
#define SAFE_STR(s) ((s) ? (s) : "")
#define BOOL_STR(b) ((b) ? "true" : "false")
bool ends_with(const std::string& str, const std::string& ending);
bool starts_with(const std::string& str, const std::string& start);
bool contains(const std::string& str, const std::string& substr);