mirror of
https://github.com/leejet/stable-diffusion.cpp.git
synced 2025-12-13 05:48:56 +00:00
Compare commits
No commits in common. "d7f430cd693f2e12ecbaa0ce881746cf305c3b1f" and "cb1d975e96bbcd0ec65df9a11331224960cf17fd" have entirely different histories.
d7f430cd69
...
cb1d975e96
10
README.md
10
README.md
@ -7,7 +7,7 @@
|
|||||||
Diffusion model(SD,Flux,Wan,...) inference in pure C/C++
|
Diffusion model(SD,Flux,Wan,...) inference in pure C/C++
|
||||||
|
|
||||||
***Note that this project is under active development. \
|
***Note that this project is under active development. \
|
||||||
API and command-line option may change frequently.***
|
API and command-line parameters may change frequently.***
|
||||||
|
|
||||||
## Features
|
## Features
|
||||||
|
|
||||||
@ -290,10 +290,9 @@ usage: ./bin/sd [arguments]
|
|||||||
|
|
||||||
arguments:
|
arguments:
|
||||||
-h, --help show this help message and exit
|
-h, --help show this help message and exit
|
||||||
-M, --mode [MODE] run mode, one of: [img_gen, vid_gen, convert], default: img_gen
|
-M, --mode [MODE] run mode, one of: [img_gen, convert], default: img_gen
|
||||||
-t, --threads N number of threads to use during computation (default: -1)
|
-t, --threads N number of threads to use during computation (default: -1)
|
||||||
If threads <= 0, then threads will be set to the number of CPU physical cores
|
If threads <= 0, then threads will be set to the number of CPU physical cores
|
||||||
--offload-to-cpu place the weights in RAM to save VRAM, and automatically load them into VRAM when needed
|
|
||||||
-m, --model [MODEL] path to full model
|
-m, --model [MODEL] path to full model
|
||||||
--diffusion-model path to the standalone diffusion model
|
--diffusion-model path to the standalone diffusion model
|
||||||
--high-noise-diffusion-model path to the standalone high noise diffusion model
|
--high-noise-diffusion-model path to the standalone high noise diffusion model
|
||||||
@ -347,7 +346,7 @@ arguments:
|
|||||||
--high-noise-scheduler {discrete, karras, exponential, ays, gits} Denoiser sigma scheduler (default: discrete)
|
--high-noise-scheduler {discrete, karras, exponential, ays, gits} Denoiser sigma scheduler (default: discrete)
|
||||||
--high-noise-sampling-method {euler, euler_a, heun, dpm2, dpm++2s_a, dpm++2m, dpm++2mv2, ipndm, ipndm_v, lcm, ddim_trailing, tcd}
|
--high-noise-sampling-method {euler, euler_a, heun, dpm2, dpm++2s_a, dpm++2m, dpm++2mv2, ipndm, ipndm_v, lcm, ddim_trailing, tcd}
|
||||||
(high noise) sampling method (default: "euler_a")
|
(high noise) sampling method (default: "euler_a")
|
||||||
--high-noise-steps STEPS (high noise) number of sample steps (default: -1 = auto)
|
--high-noise-steps STEPS (high noise) number of sample steps (default: 20)
|
||||||
SLG will be enabled at step int([STEPS]*[START]) and disabled at int([STEPS]*[END])
|
SLG will be enabled at step int([STEPS]*[START]) and disabled at int([STEPS]*[END])
|
||||||
--strength STRENGTH strength for noising/unnoising (default: 0.75)
|
--strength STRENGTH strength for noising/unnoising (default: 0.75)
|
||||||
--style-ratio STYLE-RATIO strength for keeping input identity (default: 20)
|
--style-ratio STYLE-RATIO strength for keeping input identity (default: 20)
|
||||||
@ -378,9 +377,6 @@ arguments:
|
|||||||
--chroma-t5-mask-pad PAD_SIZE t5 mask pad size of chroma
|
--chroma-t5-mask-pad PAD_SIZE t5 mask pad size of chroma
|
||||||
--video-frames video frames (default: 1)
|
--video-frames video frames (default: 1)
|
||||||
--fps fps (default: 24)
|
--fps fps (default: 24)
|
||||||
--moe-boundary BOUNDARY timestep boundary for Wan2.2 MoE model. (default: 0.875)
|
|
||||||
only enabled if `--high-noise-steps` is set to -1
|
|
||||||
--flow-shift SHIFT shift value for Flow models like SD3.x or WAN (default: auto)
|
|
||||||
-v, --verbose print extra info
|
-v, --verbose print extra info
|
||||||
```
|
```
|
||||||
|
|
||||||
|
|||||||
@ -382,8 +382,7 @@ struct DiscreteFlowDenoiser : public Denoiser {
|
|||||||
|
|
||||||
float sigma_data = 1.0f;
|
float sigma_data = 1.0f;
|
||||||
|
|
||||||
DiscreteFlowDenoiser(float shift = 3.0f)
|
DiscreteFlowDenoiser() {
|
||||||
: shift(shift) {
|
|
||||||
set_parameters();
|
set_parameters();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@ -43,6 +43,8 @@
|
|||||||
|
|
||||||
## Examples
|
## Examples
|
||||||
|
|
||||||
|
Since GitHub does not support AVI files, the file I uploaded was converted from AVI to MP4.
|
||||||
|
|
||||||
### Wan2.1 T2V 1.3B
|
### Wan2.1 T2V 1.3B
|
||||||
|
|
||||||
```
|
```
|
||||||
|
|||||||
@ -89,8 +89,6 @@ struct SDParams {
|
|||||||
std::vector<int> high_noise_skip_layers = {7, 8, 9};
|
std::vector<int> high_noise_skip_layers = {7, 8, 9};
|
||||||
sd_sample_params_t high_noise_sample_params;
|
sd_sample_params_t high_noise_sample_params;
|
||||||
|
|
||||||
float moe_boundary = 0.875f;
|
|
||||||
|
|
||||||
int video_frames = 1;
|
int video_frames = 1;
|
||||||
int fps = 16;
|
int fps = 16;
|
||||||
|
|
||||||
@ -115,12 +113,10 @@ struct SDParams {
|
|||||||
bool chroma_use_dit_mask = true;
|
bool chroma_use_dit_mask = true;
|
||||||
bool chroma_use_t5_mask = false;
|
bool chroma_use_t5_mask = false;
|
||||||
int chroma_t5_mask_pad = 1;
|
int chroma_t5_mask_pad = 1;
|
||||||
float flow_shift = INFINITY;
|
|
||||||
|
|
||||||
SDParams() {
|
SDParams() {
|
||||||
sd_sample_params_init(&sample_params);
|
sd_sample_params_init(&sample_params);
|
||||||
sd_sample_params_init(&high_noise_sample_params);
|
sd_sample_params_init(&high_noise_sample_params);
|
||||||
high_noise_sample_params.sample_steps = -1;
|
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
@ -171,8 +167,6 @@ void print_params(SDParams params) {
|
|||||||
printf(" height: %d\n", params.height);
|
printf(" height: %d\n", params.height);
|
||||||
printf(" sample_params: %s\n", SAFE_STR(sample_params_str));
|
printf(" sample_params: %s\n", SAFE_STR(sample_params_str));
|
||||||
printf(" high_noise_sample_params: %s\n", SAFE_STR(high_noise_sample_params_str));
|
printf(" high_noise_sample_params: %s\n", SAFE_STR(high_noise_sample_params_str));
|
||||||
printf(" moe_boundary: %.3f\n", params.moe_boundary);
|
|
||||||
printf(" flow_shift: %.2f\n", params.flow_shift);
|
|
||||||
printf(" strength(img2img): %.2f\n", params.strength);
|
printf(" strength(img2img): %.2f\n", params.strength);
|
||||||
printf(" rng: %s\n", sd_rng_type_name(params.rng_type));
|
printf(" rng: %s\n", sd_rng_type_name(params.rng_type));
|
||||||
printf(" seed: %ld\n", params.seed);
|
printf(" seed: %ld\n", params.seed);
|
||||||
@ -193,10 +187,9 @@ void print_usage(int argc, const char* argv[]) {
|
|||||||
printf("\n");
|
printf("\n");
|
||||||
printf("arguments:\n");
|
printf("arguments:\n");
|
||||||
printf(" -h, --help show this help message and exit\n");
|
printf(" -h, --help show this help message and exit\n");
|
||||||
printf(" -M, --mode [MODE] run mode, one of: [img_gen, vid_gen, convert], default: img_gen\n");
|
printf(" -M, --mode [MODE] run mode, one of: [img_gen, convert], default: img_gen\n");
|
||||||
printf(" -t, --threads N number of threads to use during computation (default: -1)\n");
|
printf(" -t, --threads N number of threads to use during computation (default: -1)\n");
|
||||||
printf(" If threads <= 0, then threads will be set to the number of CPU physical cores\n");
|
printf(" If threads <= 0, then threads will be set to the number of CPU physical cores\n");
|
||||||
printf(" --offload-to-cpu place the weights in RAM to save VRAM, and automatically load them into VRAM when needed\n");
|
|
||||||
printf(" -m, --model [MODEL] path to full model\n");
|
printf(" -m, --model [MODEL] path to full model\n");
|
||||||
printf(" --diffusion-model path to the standalone diffusion model\n");
|
printf(" --diffusion-model path to the standalone diffusion model\n");
|
||||||
printf(" --high-noise-diffusion-model path to the standalone high noise diffusion model\n");
|
printf(" --high-noise-diffusion-model path to the standalone high noise diffusion model\n");
|
||||||
@ -250,7 +243,7 @@ void print_usage(int argc, const char* argv[]) {
|
|||||||
printf(" --high-noise-scheduler {discrete, karras, exponential, ays, gits} Denoiser sigma scheduler (default: discrete)\n");
|
printf(" --high-noise-scheduler {discrete, karras, exponential, ays, gits} Denoiser sigma scheduler (default: discrete)\n");
|
||||||
printf(" --high-noise-sampling-method {euler, euler_a, heun, dpm2, dpm++2s_a, dpm++2m, dpm++2mv2, ipndm, ipndm_v, lcm, ddim_trailing, tcd}\n");
|
printf(" --high-noise-sampling-method {euler, euler_a, heun, dpm2, dpm++2s_a, dpm++2m, dpm++2mv2, ipndm, ipndm_v, lcm, ddim_trailing, tcd}\n");
|
||||||
printf(" (high noise) sampling method (default: \"euler_a\")\n");
|
printf(" (high noise) sampling method (default: \"euler_a\")\n");
|
||||||
printf(" --high-noise-steps STEPS (high noise) number of sample steps (default: -1 = auto)\n");
|
printf(" --high-noise-steps STEPS (high noise) number of sample steps (default: 20)\n");
|
||||||
printf(" SLG will be enabled at step int([STEPS]*[START]) and disabled at int([STEPS]*[END])\n");
|
printf(" SLG will be enabled at step int([STEPS]*[START]) and disabled at int([STEPS]*[END])\n");
|
||||||
printf(" --strength STRENGTH strength for noising/unnoising (default: 0.75)\n");
|
printf(" --strength STRENGTH strength for noising/unnoising (default: 0.75)\n");
|
||||||
printf(" --style-ratio STYLE-RATIO strength for keeping input identity (default: 20)\n");
|
printf(" --style-ratio STYLE-RATIO strength for keeping input identity (default: 20)\n");
|
||||||
@ -281,9 +274,6 @@ void print_usage(int argc, const char* argv[]) {
|
|||||||
printf(" --chroma-t5-mask-pad PAD_SIZE t5 mask pad size of chroma\n");
|
printf(" --chroma-t5-mask-pad PAD_SIZE t5 mask pad size of chroma\n");
|
||||||
printf(" --video-frames video frames (default: 1)\n");
|
printf(" --video-frames video frames (default: 1)\n");
|
||||||
printf(" --fps fps (default: 24)\n");
|
printf(" --fps fps (default: 24)\n");
|
||||||
printf(" --moe-boundary BOUNDARY timestep boundary for Wan2.2 MoE model. (default: 0.875)\n");
|
|
||||||
printf(" only enabled if `--high-noise-steps` is set to -1\n");
|
|
||||||
printf(" --flow-shift SHIFT shift value for Flow models like SD3.x or WAN (default: auto)\n");
|
|
||||||
printf(" -v, --verbose print extra info\n");
|
printf(" -v, --verbose print extra info\n");
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -517,8 +507,6 @@ void parse_args(int argc, const char** argv, SDParams& params) {
|
|||||||
{"", "--strength", "", ¶ms.strength},
|
{"", "--strength", "", ¶ms.strength},
|
||||||
{"", "--style-ratio", "", ¶ms.style_ratio},
|
{"", "--style-ratio", "", ¶ms.style_ratio},
|
||||||
{"", "--control-strength", "", ¶ms.control_strength},
|
{"", "--control-strength", "", ¶ms.control_strength},
|
||||||
{"", "--moe-boundary", "", ¶ms.moe_boundary},
|
|
||||||
{"", "--flow-shift", "", ¶ms.flow_shift},
|
|
||||||
};
|
};
|
||||||
|
|
||||||
options.bool_options = {
|
options.bool_options = {
|
||||||
@ -779,7 +767,8 @@ void parse_args(int argc, const char** argv, SDParams& params) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
if (params.high_noise_sample_params.sample_steps <= 0) {
|
if (params.high_noise_sample_params.sample_steps <= 0) {
|
||||||
params.high_noise_sample_params.sample_steps = -1;
|
fprintf(stderr, "error: the high_noise_sample_steps must be greater than 0\n");
|
||||||
|
exit(1);
|
||||||
}
|
}
|
||||||
|
|
||||||
if (params.strength < 0.f || params.strength > 1.f) {
|
if (params.strength < 0.f || params.strength > 1.f) {
|
||||||
@ -1186,7 +1175,6 @@ int main(int argc, const char* argv[]) {
|
|||||||
params.chroma_use_dit_mask,
|
params.chroma_use_dit_mask,
|
||||||
params.chroma_use_t5_mask,
|
params.chroma_use_t5_mask,
|
||||||
params.chroma_t5_mask_pad,
|
params.chroma_t5_mask_pad,
|
||||||
params.flow_shift,
|
|
||||||
};
|
};
|
||||||
|
|
||||||
sd_ctx_t* sd_ctx = new_sd_ctx(&sd_ctx_params);
|
sd_ctx_t* sd_ctx = new_sd_ctx(&sd_ctx_params);
|
||||||
@ -1234,7 +1222,6 @@ int main(int argc, const char* argv[]) {
|
|||||||
params.height,
|
params.height,
|
||||||
params.sample_params,
|
params.sample_params,
|
||||||
params.high_noise_sample_params,
|
params.high_noise_sample_params,
|
||||||
params.moe_boundary,
|
|
||||||
params.strength,
|
params.strength,
|
||||||
params.seed,
|
params.seed,
|
||||||
params.video_frames,
|
params.video_frames,
|
||||||
|
|||||||
@ -681,11 +681,7 @@ public:
|
|||||||
|
|
||||||
if (sd_version_is_sd3(version)) {
|
if (sd_version_is_sd3(version)) {
|
||||||
LOG_INFO("running in FLOW mode");
|
LOG_INFO("running in FLOW mode");
|
||||||
float shift = sd_ctx_params->flow_shift;
|
denoiser = std::make_shared<DiscreteFlowDenoiser>();
|
||||||
if (shift == INFINITY) {
|
|
||||||
shift = 3.0;
|
|
||||||
}
|
|
||||||
denoiser = std::make_shared<DiscreteFlowDenoiser>(shift);
|
|
||||||
} else if (sd_version_is_flux(version)) {
|
} else if (sd_version_is_flux(version)) {
|
||||||
LOG_INFO("running in Flux FLOW mode");
|
LOG_INFO("running in Flux FLOW mode");
|
||||||
float shift = 1.0f; // TODO: validate
|
float shift = 1.0f; // TODO: validate
|
||||||
@ -698,11 +694,7 @@ public:
|
|||||||
denoiser = std::make_shared<FluxFlowDenoiser>(shift);
|
denoiser = std::make_shared<FluxFlowDenoiser>(shift);
|
||||||
} else if (sd_version_is_wan(version)) {
|
} else if (sd_version_is_wan(version)) {
|
||||||
LOG_INFO("running in FLOW mode");
|
LOG_INFO("running in FLOW mode");
|
||||||
float shift = sd_ctx_params->flow_shift;
|
denoiser = std::make_shared<DiscreteFlowDenoiser>();
|
||||||
if (shift == INFINITY) {
|
|
||||||
shift = 5.0;
|
|
||||||
}
|
|
||||||
denoiser = std::make_shared<DiscreteFlowDenoiser>(shift);
|
|
||||||
} else if (is_using_v_parameterization) {
|
} else if (is_using_v_parameterization) {
|
||||||
LOG_INFO("running in v-prediction mode");
|
LOG_INFO("running in v-prediction mode");
|
||||||
denoiser = std::make_shared<CompVisVDenoiser>();
|
denoiser = std::make_shared<CompVisVDenoiser>();
|
||||||
@ -1561,7 +1553,6 @@ void sd_ctx_params_init(sd_ctx_params_t* sd_ctx_params) {
|
|||||||
sd_ctx_params->chroma_use_dit_mask = true;
|
sd_ctx_params->chroma_use_dit_mask = true;
|
||||||
sd_ctx_params->chroma_use_t5_mask = false;
|
sd_ctx_params->chroma_use_t5_mask = false;
|
||||||
sd_ctx_params->chroma_t5_mask_pad = 1;
|
sd_ctx_params->chroma_t5_mask_pad = 1;
|
||||||
sd_ctx_params->flow_shift = INFINITY;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
char* sd_ctx_params_to_str(const sd_ctx_params_t* sd_ctx_params) {
|
char* sd_ctx_params_to_str(const sd_ctx_params_t* sd_ctx_params) {
|
||||||
@ -1736,13 +1727,11 @@ void sd_vid_gen_params_init(sd_vid_gen_params_t* sd_vid_gen_params) {
|
|||||||
memset((void*)sd_vid_gen_params, 0, sizeof(sd_vid_gen_params_t));
|
memset((void*)sd_vid_gen_params, 0, sizeof(sd_vid_gen_params_t));
|
||||||
sd_sample_params_init(&sd_vid_gen_params->sample_params);
|
sd_sample_params_init(&sd_vid_gen_params->sample_params);
|
||||||
sd_sample_params_init(&sd_vid_gen_params->high_noise_sample_params);
|
sd_sample_params_init(&sd_vid_gen_params->high_noise_sample_params);
|
||||||
sd_vid_gen_params->high_noise_sample_params.sample_steps = -1;
|
|
||||||
sd_vid_gen_params->width = 512;
|
sd_vid_gen_params->width = 512;
|
||||||
sd_vid_gen_params->height = 512;
|
sd_vid_gen_params->height = 512;
|
||||||
sd_vid_gen_params->strength = 0.75f;
|
sd_vid_gen_params->strength = 0.75f;
|
||||||
sd_vid_gen_params->seed = -1;
|
sd_vid_gen_params->seed = -1;
|
||||||
sd_vid_gen_params->video_frames = 6;
|
sd_vid_gen_params->video_frames = 6;
|
||||||
sd_vid_gen_params->moe_boundary = 0.875f;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
struct sd_ctx_t {
|
struct sd_ctx_t {
|
||||||
@ -2392,24 +2381,7 @@ SD_API sd_image_t* generate_video(sd_ctx_t* sd_ctx, const sd_vid_gen_params_t* s
|
|||||||
high_noise_sample_steps = sd_vid_gen_params->high_noise_sample_params.sample_steps;
|
high_noise_sample_steps = sd_vid_gen_params->high_noise_sample_params.sample_steps;
|
||||||
}
|
}
|
||||||
|
|
||||||
int total_steps = sample_steps;
|
std::vector<float> sigmas = sd_ctx->sd->denoiser->get_sigmas(sample_steps + high_noise_sample_steps);
|
||||||
|
|
||||||
if (high_noise_sample_steps > 0) {
|
|
||||||
total_steps += high_noise_sample_steps;
|
|
||||||
}
|
|
||||||
std::vector<float> sigmas = sd_ctx->sd->denoiser->get_sigmas(total_steps);
|
|
||||||
|
|
||||||
if (high_noise_sample_steps < 0) {
|
|
||||||
// timesteps ∝ sigmas for Flow models (like wan2.2 a14b)
|
|
||||||
for (size_t i = 0; i < sigmas.size(); ++i) {
|
|
||||||
if (sigmas[i] < sd_vid_gen_params->moe_boundary) {
|
|
||||||
high_noise_sample_steps = i;
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
LOG_DEBUG("switching from high noise model at step %d", high_noise_sample_steps);
|
|
||||||
sample_steps = total_steps - high_noise_sample_steps;
|
|
||||||
}
|
|
||||||
|
|
||||||
struct ggml_init_params params;
|
struct ggml_init_params params;
|
||||||
params.mem_size = static_cast<size_t>(200 * 1024) * 1024; // 200 MB
|
params.mem_size = static_cast<size_t>(200 * 1024) * 1024; // 200 MB
|
||||||
|
|||||||
@ -142,7 +142,6 @@ typedef struct {
|
|||||||
bool chroma_use_dit_mask;
|
bool chroma_use_dit_mask;
|
||||||
bool chroma_use_t5_mask;
|
bool chroma_use_t5_mask;
|
||||||
int chroma_t5_mask_pad;
|
int chroma_t5_mask_pad;
|
||||||
float flow_shift;
|
|
||||||
} sd_ctx_params_t;
|
} sd_ctx_params_t;
|
||||||
|
|
||||||
typedef struct {
|
typedef struct {
|
||||||
@ -206,7 +205,6 @@ typedef struct {
|
|||||||
int height;
|
int height;
|
||||||
sd_sample_params_t sample_params;
|
sd_sample_params_t sample_params;
|
||||||
sd_sample_params_t high_noise_sample_params;
|
sd_sample_params_t high_noise_sample_params;
|
||||||
float moe_boundary;
|
|
||||||
float strength;
|
float strength;
|
||||||
int64_t seed;
|
int64_t seed;
|
||||||
int video_frames;
|
int video_frames;
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user