mirror of
https://github.com/leejet/stable-diffusion.cpp.git
synced 2025-12-13 05:48:56 +00:00
Compare commits
2 Commits
10c6501bd0
...
884e23eeeb
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
884e23eeeb | ||
|
|
c9b5735116 |
18
README.md
18
README.md
@ -13,7 +13,7 @@ Inference of Stable Diffusion and Flux in pure C/C++
|
||||
- SD1.x, SD2.x, SDXL and [SD3/SD3.5](./docs/sd3.md) support
|
||||
- !!!The VAE in SDXL encounters NaN issues under FP16, but unfortunately, the ggml_conv_2d only operates under FP16. Hence, a parameter is needed to specify the VAE that has fixed the FP16 NaN issue. You can find it here: [SDXL VAE FP16 Fix](https://huggingface.co/madebyollin/sdxl-vae-fp16-fix/blob/main/sdxl_vae.safetensors).
|
||||
- [Flux-dev/Flux-schnell Support](./docs/flux.md)
|
||||
|
||||
- [FLUX.1-Kontext-dev](./docs/kontext.md)
|
||||
- [SD-Turbo](https://huggingface.co/stabilityai/sd-turbo) and [SDXL-Turbo](https://huggingface.co/stabilityai/sdxl-turbo) support
|
||||
- [PhotoMaker](https://github.com/TencentARC/PhotoMaker) support.
|
||||
- 16-bit, 32-bit float support
|
||||
@ -220,7 +220,7 @@ arguments:
|
||||
-m, --model [MODEL] path to full model
|
||||
--diffusion-model path to the standalone diffusion model
|
||||
--clip_l path to the clip-l text encoder
|
||||
--clip_g path to the clip-l text encoder
|
||||
--clip_g path to the clip-g text encoder
|
||||
--t5xxl path to the t5xxl text encoder
|
||||
--vae [VAE] path to vae
|
||||
--taesd [TAESD_PATH] path to taesd. Using Tiny AutoEncoder for fast decoding (low quality)
|
||||
@ -231,26 +231,32 @@ arguments:
|
||||
--normalize-input normalize PHOTOMAKER input id images
|
||||
--upscale-model [ESRGAN_PATH] path to esrgan model. Upscales images after generation; only RealESRGAN_x4plus_anime_6B is supported for now
|
||||
--upscale-repeats Run the ESRGAN upscaler this many times (default 1)
|
||||
--type [TYPE] weight type (f32, f16, q4_0, q4_1, q5_0, q5_1, q8_0, q2_k, q3_k, q4_k)
|
||||
--type [TYPE] weight type (examples: f32, f16, q4_0, q4_1, q5_0, q5_1, q8_0, q2_K, q3_K, q4_K)
|
||||
If not specified, the default is the type of the weight file
|
||||
--lora-model-dir [DIR] lora model directory
|
||||
-i, --init-img [IMAGE] path to the input image, required by img2img
|
||||
--mask [MASK] path to the mask image, required by img2img with mask
|
||||
--control-image [IMAGE] path to image condition, control net
|
||||
-r, --ref-image [PATH] reference image for Flux Kontext models (can be used multiple times)
|
||||
-o, --output OUTPUT path to write result image to (default: ./output.png)
|
||||
-p, --prompt [PROMPT] the prompt to render
|
||||
-n, --negative-prompt PROMPT the negative prompt (default: "")
|
||||
--cfg-scale SCALE unconditional guidance scale: (default: 7.0)
|
||||
--guidance SCALE guidance scale for img2img (default: 3.5)
|
||||
--slg-scale SCALE skip layer guidance (SLG) scale, only for DiT models: (default: 0)
|
||||
0 means disabled, a value of 2.5 is nice for sd3.5 medium
|
||||
--eta SCALE eta in DDIM, only for DDIM and TCD: (default: 0)
|
||||
--skip-layers LAYERS Layers to skip for SLG steps: (default: [7,8,9])
|
||||
--skip-layer-start START SLG enabling point: (default: 0.01)
|
||||
--skip-layer-end END SLG disabling point: (default: 0.2)
|
||||
SLG will be enabled at step int([STEPS]*[START]) and disabled at int([STEPS]*[END])
|
||||
SLG will be enabled at step int([STEPS]*[START]) and disabled at int([STEPS]*[END])
|
||||
--strength STRENGTH strength for noising/unnoising (default: 0.75)
|
||||
--style-ratio STYLE-RATIO strength for keeping input identity (default: 20%)
|
||||
--control-strength STRENGTH strength to apply Control Net (default: 0.9)
|
||||
1.0 corresponds to full destruction of information in init image
|
||||
-H, --height H image height, in pixel space (default: 512)
|
||||
-W, --width W image width, in pixel space (default: 512)
|
||||
--sampling-method {euler, euler_a, heun, dpm2, dpm++2s_a, dpm++2m, dpm++2mv2, ipndm, ipndm_v, lcm}
|
||||
--sampling-method {euler, euler_a, heun, dpm2, dpm++2s_a, dpm++2m, dpm++2mv2, ipndm, ipndm_v, lcm, ddim_trailing, tcd}
|
||||
sampling method (default: "euler_a")
|
||||
--steps STEPS number of sample steps (default: 20)
|
||||
--rng {std_default, cuda} RNG (default: cuda)
|
||||
@ -267,7 +273,7 @@ arguments:
|
||||
This might crash if it is not supported by the backend.
|
||||
--control-net-cpu keep controlnet in cpu (for low vram)
|
||||
--canny apply canny preprocessor (edge detection)
|
||||
--color Colors the logging tags according to level
|
||||
--color colors the logging tags according to level
|
||||
-v, --verbose print extra info
|
||||
```
|
||||
|
||||
|
||||
BIN
assets/flux/kontext1_dev_output.png
Normal file
BIN
assets/flux/kontext1_dev_output.png
Normal file
Binary file not shown.
|
After Width: | Height: | Size: 496 KiB |
@ -13,6 +13,7 @@ struct DiffusionModel {
|
||||
struct ggml_tensor* c_concat,
|
||||
struct ggml_tensor* y,
|
||||
struct ggml_tensor* guidance,
|
||||
std::vector<ggml_tensor*> ref_latents = {},
|
||||
int num_video_frames = -1,
|
||||
std::vector<struct ggml_tensor*> controls = {},
|
||||
float control_strength = 0.f,
|
||||
@ -68,6 +69,7 @@ struct UNetModel : public DiffusionModel {
|
||||
struct ggml_tensor* c_concat,
|
||||
struct ggml_tensor* y,
|
||||
struct ggml_tensor* guidance,
|
||||
std::vector<ggml_tensor*> ref_latents = {},
|
||||
int num_video_frames = -1,
|
||||
std::vector<struct ggml_tensor*> controls = {},
|
||||
float control_strength = 0.f,
|
||||
@ -118,6 +120,7 @@ struct MMDiTModel : public DiffusionModel {
|
||||
struct ggml_tensor* c_concat,
|
||||
struct ggml_tensor* y,
|
||||
struct ggml_tensor* guidance,
|
||||
std::vector<ggml_tensor*> ref_latents = {},
|
||||
int num_video_frames = -1,
|
||||
std::vector<struct ggml_tensor*> controls = {},
|
||||
float control_strength = 0.f,
|
||||
@ -169,13 +172,14 @@ struct FluxModel : public DiffusionModel {
|
||||
struct ggml_tensor* c_concat,
|
||||
struct ggml_tensor* y,
|
||||
struct ggml_tensor* guidance,
|
||||
std::vector<ggml_tensor*> ref_latents = {},
|
||||
int num_video_frames = -1,
|
||||
std::vector<struct ggml_tensor*> controls = {},
|
||||
float control_strength = 0.f,
|
||||
struct ggml_tensor** output = NULL,
|
||||
struct ggml_context* output_ctx = NULL,
|
||||
std::vector<int> skip_layers = std::vector<int>()) {
|
||||
return flux.compute(n_threads, x, timesteps, context, c_concat, y, guidance, output, output_ctx, skip_layers);
|
||||
return flux.compute(n_threads, x, timesteps, context, c_concat, y, guidance, ref_latents, output, output_ctx, skip_layers);
|
||||
}
|
||||
};
|
||||
|
||||
|
||||
39
docs/kontext.md
Normal file
39
docs/kontext.md
Normal file
@ -0,0 +1,39 @@
|
||||
# How to Use
|
||||
|
||||
You can run Kontext using stable-diffusion.cpp with a GPU that has 6GB or even 4GB of VRAM, without needing to offload to RAM.
|
||||
|
||||
## Download weights
|
||||
|
||||
- Download Kontext
|
||||
- If you don't want to do the conversion yourself, download the preconverted gguf model from [FLUX.1-Kontext-dev-GGUF](https://huggingface.co/QuantStack/FLUX.1-Kontext-dev-GGUF)
|
||||
- Otherwise, download FLUX.1-Kontext-dev from https://huggingface.co/black-forest-labs/FLUX.1-Kontext-dev/blob/main/flux1-kontext-dev.safetensors
|
||||
- Download vae from https://huggingface.co/black-forest-labs/FLUX.1-dev/blob/main/ae.safetensors
|
||||
- Download clip_l from https://huggingface.co/comfyanonymous/flux_text_encoders/blob/main/clip_l.safetensors
|
||||
- Download t5xxl from https://huggingface.co/comfyanonymous/flux_text_encoders/blob/main/t5xxl_fp16.safetensors
|
||||
|
||||
## Convert Kontext weights
|
||||
|
||||
You can download the preconverted gguf weights from [FLUX.1-Kontext-dev-GGUF](https://huggingface.co/QuantStack/FLUX.1-Kontext-dev-GGUF); this way you don't have to do the conversion yourself.
|
||||
|
||||
```
|
||||
.\bin\Release\sd.exe -M convert -m ..\..\ComfyUI\models\unet\flux1-kontext-dev.safetensors -o ..\models\flux1-kontext-dev-q8_0.gguf -v --type q8_0
|
||||
```
|
||||
|
||||
## Run
|
||||
|
||||
- `--cfg-scale` is recommended to be set to 1.
|
||||
|
||||
### Example
|
||||
For example:
|
||||
|
||||
```
|
||||
.\bin\Release\sd.exe -M edit -r .\flux1-dev-q8_0.png --diffusion-model ..\models\flux1-kontext-dev-q8_0.gguf --vae ..\models\ae.sft --clip_l ..\models\clip_l.safetensors --t5xxl ..\models\t5xxl_fp16.safetensors -p "change 'flux.cpp' to 'kontext.cpp'" --cfg-scale 1.0 --sampling-method euler -v
|
||||
```
|
||||
|
||||
|
||||
| ref_image | prompt | output |
|
||||
| ---- | ---- |---- |
|
||||
|  | change 'flux.cpp' to 'kontext.cpp' | |
|
||||
|
||||
|
||||
|
||||
@ -57,6 +57,7 @@ const char* modes_str[] = {
|
||||
"txt2img",
|
||||
"img2img",
|
||||
"img2vid",
|
||||
"edit",
|
||||
"convert",
|
||||
};
|
||||
|
||||
@ -64,6 +65,7 @@ enum SDMode {
|
||||
TXT2IMG,
|
||||
IMG2IMG,
|
||||
IMG2VID,
|
||||
EDIT,
|
||||
CONVERT,
|
||||
MODE_COUNT
|
||||
};
|
||||
@ -89,6 +91,7 @@ struct SDParams {
|
||||
std::string input_path;
|
||||
std::string mask_path;
|
||||
std::string control_image_path;
|
||||
std::vector<std::string> ref_image_paths;
|
||||
|
||||
std::string prompt;
|
||||
std::string negative_prompt;
|
||||
@ -154,6 +157,10 @@ void print_params(SDParams params) {
|
||||
printf(" init_img: %s\n", params.input_path.c_str());
|
||||
printf(" mask_img: %s\n", params.mask_path.c_str());
|
||||
printf(" control_image: %s\n", params.control_image_path.c_str());
|
||||
printf(" ref_images_paths:\n");
|
||||
for (auto& path : params.ref_image_paths) {
|
||||
printf(" %s\n", path.c_str());
|
||||
};
|
||||
printf(" clip on cpu: %s\n", params.clip_on_cpu ? "true" : "false");
|
||||
printf(" controlnet cpu: %s\n", params.control_net_cpu ? "true" : "false");
|
||||
printf(" vae decoder on cpu:%s\n", params.vae_on_cpu ? "true" : "false");
|
||||
@ -208,6 +215,7 @@ void print_usage(int argc, const char* argv[]) {
|
||||
printf(" -i, --init-img [IMAGE] path to the input image, required by img2img\n");
|
||||
printf(" --mask [MASK] path to the mask image, required by img2img with mask\n");
|
||||
printf(" --control-image [IMAGE] path to image condition, control net\n");
|
||||
printf(" -r, --ref_image [PATH] reference image for Flux Kontext models (can be used multiple times) \n");
|
||||
printf(" -o, --output OUTPUT path to write result image to (default: ./output.png)\n");
|
||||
printf(" -p, --prompt [PROMPT] the prompt to render\n");
|
||||
printf(" -n, --negative-prompt PROMPT the negative prompt (default: \"\")\n");
|
||||
@ -243,7 +251,7 @@ void print_usage(int argc, const char* argv[]) {
|
||||
printf(" This might crash if it is not supported by the backend.\n");
|
||||
printf(" --control-net-cpu keep controlnet in cpu (for low vram)\n");
|
||||
printf(" --canny apply canny preprocessor (edge detection)\n");
|
||||
printf(" --color Colors the logging tags according to level\n");
|
||||
printf(" --color colors the logging tags according to level\n");
|
||||
printf(" -v, --verbose print extra info\n");
|
||||
}
|
||||
|
||||
@ -629,6 +637,12 @@ void parse_args(int argc, const char** argv, SDParams& params) {
|
||||
break;
|
||||
}
|
||||
params.skip_layer_end = std::stof(argv[i]);
|
||||
} else if (arg == "-r" || arg == "--ref-image") {
|
||||
if (++i >= argc) {
|
||||
invalid_arg = true;
|
||||
break;
|
||||
}
|
||||
params.ref_image_paths.push_back(argv[i]);
|
||||
} else {
|
||||
fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
|
||||
print_usage(argc, argv);
|
||||
@ -657,7 +671,13 @@ void parse_args(int argc, const char** argv, SDParams& params) {
|
||||
}
|
||||
|
||||
if ((params.mode == IMG2IMG || params.mode == IMG2VID) && params.input_path.length() == 0) {
|
||||
fprintf(stderr, "error: when using the img2img mode, the following arguments are required: init-img\n");
|
||||
fprintf(stderr, "error: when using the img2img/img2vid mode, the following arguments are required: init-img\n");
|
||||
print_usage(argc, argv);
|
||||
exit(1);
|
||||
}
|
||||
|
||||
if (params.mode == EDIT && params.ref_image_paths.size() == 0) {
|
||||
fprintf(stderr, "error: when using the edit mode, the following arguments are required: ref-image\n");
|
||||
print_usage(argc, argv);
|
||||
exit(1);
|
||||
}
|
||||
@ -826,6 +846,7 @@ int main(int argc, const char* argv[]) {
|
||||
uint8_t* input_image_buffer = NULL;
|
||||
uint8_t* control_image_buffer = NULL;
|
||||
uint8_t* mask_image_buffer = NULL;
|
||||
std::vector<sd_image_t> ref_images;
|
||||
|
||||
if (params.mode == IMG2IMG || params.mode == IMG2VID) {
|
||||
vae_decode_only = false;
|
||||
@ -877,6 +898,37 @@ int main(int argc, const char* argv[]) {
|
||||
free(input_image_buffer);
|
||||
input_image_buffer = resized_image_buffer;
|
||||
}
|
||||
} else if (params.mode == EDIT) {
|
||||
vae_decode_only = false;
|
||||
for (auto& path : params.ref_image_paths) {
|
||||
int c = 0;
|
||||
int width = 0;
|
||||
int height = 0;
|
||||
uint8_t* image_buffer = stbi_load(path.c_str(), &width, &height, &c, 3);
|
||||
if (image_buffer == NULL) {
|
||||
fprintf(stderr, "load image from '%s' failed\n", path.c_str());
|
||||
return 1;
|
||||
}
|
||||
if (c < 3) {
|
||||
fprintf(stderr, "the number of channels for the input image must be >= 3, but got %d channels\n", c);
|
||||
free(image_buffer);
|
||||
return 1;
|
||||
}
|
||||
if (width <= 0) {
|
||||
fprintf(stderr, "error: the width of image must be greater than 0\n");
|
||||
free(image_buffer);
|
||||
return 1;
|
||||
}
|
||||
if (height <= 0) {
|
||||
fprintf(stderr, "error: the height of image must be greater than 0\n");
|
||||
free(image_buffer);
|
||||
return 1;
|
||||
}
|
||||
ref_images.push_back({(uint32_t)width,
|
||||
(uint32_t)height,
|
||||
3,
|
||||
image_buffer});
|
||||
}
|
||||
}
|
||||
|
||||
sd_ctx_t* sd_ctx = new_sd_ctx(params.model_path.c_str(),
|
||||
@ -968,7 +1020,7 @@ int main(int argc, const char* argv[]) {
|
||||
params.slg_scale,
|
||||
params.skip_layer_start,
|
||||
params.skip_layer_end);
|
||||
} else {
|
||||
} else if (params.mode == IMG2IMG || params.mode == IMG2VID) {
|
||||
sd_image_t input_image = {(uint32_t)params.width,
|
||||
(uint32_t)params.height,
|
||||
3,
|
||||
@ -1038,6 +1090,32 @@ int main(int argc, const char* argv[]) {
|
||||
params.skip_layer_start,
|
||||
params.skip_layer_end);
|
||||
}
|
||||
} else { // EDIT
|
||||
results = edit(sd_ctx,
|
||||
ref_images.data(),
|
||||
ref_images.size(),
|
||||
params.prompt.c_str(),
|
||||
params.negative_prompt.c_str(),
|
||||
params.clip_skip,
|
||||
params.cfg_scale,
|
||||
params.guidance,
|
||||
params.eta,
|
||||
params.width,
|
||||
params.height,
|
||||
params.sample_method,
|
||||
params.sample_steps,
|
||||
params.strength,
|
||||
params.seed,
|
||||
params.batch_count,
|
||||
control_image,
|
||||
params.control_strength,
|
||||
params.style_ratio,
|
||||
params.normalize_input,
|
||||
params.skip_layers.data(),
|
||||
params.skip_layers.size(),
|
||||
params.slg_scale,
|
||||
params.skip_layer_start,
|
||||
params.skip_layer_end);
|
||||
}
|
||||
|
||||
if (results == NULL) {
|
||||
@ -1117,4 +1195,4 @@ int main(int argc, const char* argv[]) {
|
||||
free(input_image_buffer);
|
||||
|
||||
return 0;
|
||||
}
|
||||
}
|
||||
115
flux.hpp
115
flux.hpp
@ -570,17 +570,22 @@ namespace Flux {
|
||||
}
|
||||
|
||||
// Generate IDs for image patches and text
|
||||
std::vector<std::vector<float>> gen_ids(int h, int w, int patch_size, int bs, int context_len) {
|
||||
std::vector<std::vector<float>> gen_txt_ids(int bs, int context_len) {
|
||||
return std::vector<std::vector<float>>(bs * context_len, std::vector<float>(3, 0.0));
|
||||
}
|
||||
|
||||
std::vector<std::vector<float>> gen_img_ids(int h, int w, int patch_size, int bs, int index = 0, int h_offset = 0, int w_offset = 0) {
|
||||
int h_len = (h + (patch_size / 2)) / patch_size;
|
||||
int w_len = (w + (patch_size / 2)) / patch_size;
|
||||
|
||||
std::vector<std::vector<float>> img_ids(h_len * w_len, std::vector<float>(3, 0.0));
|
||||
|
||||
std::vector<float> row_ids = linspace(0, h_len - 1, h_len);
|
||||
std::vector<float> col_ids = linspace(0, w_len - 1, w_len);
|
||||
std::vector<float> row_ids = linspace(h_offset, h_len - 1 + h_offset, h_len);
|
||||
std::vector<float> col_ids = linspace(w_offset, w_len - 1 + w_offset, w_len);
|
||||
|
||||
for (int i = 0; i < h_len; ++i) {
|
||||
for (int j = 0; j < w_len; ++j) {
|
||||
img_ids[i * w_len + j][0] = index;
|
||||
img_ids[i * w_len + j][1] = row_ids[i];
|
||||
img_ids[i * w_len + j][2] = col_ids[j];
|
||||
}
|
||||
@ -592,24 +597,54 @@ namespace Flux {
|
||||
img_ids_repeated[i * img_ids.size() + j] = img_ids[j];
|
||||
}
|
||||
}
|
||||
return img_ids_repeated;
|
||||
}
|
||||
|
||||
std::vector<std::vector<float>> txt_ids(bs * context_len, std::vector<float>(3, 0.0));
|
||||
std::vector<std::vector<float>> ids(bs * (context_len + img_ids.size()), std::vector<float>(3));
|
||||
std::vector<std::vector<float>> concat_ids(const std::vector<std::vector<float>>& a,
|
||||
const std::vector<std::vector<float>>& b,
|
||||
int bs) {
|
||||
size_t a_len = a.size() / bs;
|
||||
size_t b_len = b.size() / bs;
|
||||
std::vector<std::vector<float>> ids(a.size() + b.size(), std::vector<float>(3));
|
||||
for (int i = 0; i < bs; ++i) {
|
||||
for (int j = 0; j < context_len; ++j) {
|
||||
ids[i * (context_len + img_ids.size()) + j] = txt_ids[j];
|
||||
for (int j = 0; j < a_len; ++j) {
|
||||
ids[i * (a_len + b_len) + j] = a[i * a_len + j];
|
||||
}
|
||||
for (int j = 0; j < img_ids.size(); ++j) {
|
||||
ids[i * (context_len + img_ids.size()) + context_len + j] = img_ids_repeated[i * img_ids.size() + j];
|
||||
for (int j = 0; j < b_len; ++j) {
|
||||
ids[i * (a_len + b_len) + a_len + j] = b[i * b_len + j];
|
||||
}
|
||||
}
|
||||
return ids;
|
||||
}
|
||||
|
||||
std::vector<std::vector<float>> gen_ids(int h, int w, int patch_size, int bs, int context_len, std::vector<ggml_tensor*> ref_latents) {
|
||||
auto txt_ids = gen_txt_ids(bs, context_len);
|
||||
auto img_ids = gen_img_ids(h, w, patch_size, bs);
|
||||
|
||||
auto ids = concat_ids(txt_ids, img_ids, bs);
|
||||
uint64_t curr_h_offset = 0;
|
||||
uint64_t curr_w_offset = 0;
|
||||
for (ggml_tensor* ref : ref_latents) {
|
||||
uint64_t h_offset = 0;
|
||||
uint64_t w_offset = 0;
|
||||
if (ref->ne[1] + curr_h_offset > ref->ne[0] + curr_w_offset) {
|
||||
w_offset = curr_w_offset;
|
||||
} else {
|
||||
h_offset = curr_h_offset;
|
||||
}
|
||||
|
||||
auto ref_ids = gen_img_ids(ref->ne[1], ref->ne[0], patch_size, bs, 1, h_offset, w_offset);
|
||||
ids = concat_ids(ids, ref_ids, bs);
|
||||
|
||||
curr_h_offset = std::max(curr_h_offset, ref->ne[1] + h_offset);
|
||||
curr_w_offset = std::max(curr_w_offset, ref->ne[0] + w_offset);
|
||||
}
|
||||
return ids;
|
||||
}
|
||||
|
||||
// Generate positional embeddings
|
||||
std::vector<float> gen_pe(int h, int w, int patch_size, int bs, int context_len, int theta, const std::vector<int>& axes_dim) {
|
||||
std::vector<std::vector<float>> ids = gen_ids(h, w, patch_size, bs, context_len);
|
||||
std::vector<float> gen_pe(int h, int w, int patch_size, int bs, int context_len, std::vector<ggml_tensor*> ref_latents, int theta, const std::vector<int>& axes_dim) {
|
||||
std::vector<std::vector<float>> ids = gen_ids(h, w, patch_size, bs, context_len, ref_latents);
|
||||
std::vector<std::vector<float>> trans_ids = transpose(ids);
|
||||
size_t pos_len = ids.size();
|
||||
int num_axes = axes_dim.size();
|
||||
@ -726,7 +761,7 @@ namespace Flux {
|
||||
struct ggml_tensor* y,
|
||||
struct ggml_tensor* guidance,
|
||||
struct ggml_tensor* pe,
|
||||
std::vector<int> skip_layers = std::vector<int>()) {
|
||||
std::vector<int> skip_layers = {}) {
|
||||
auto img_in = std::dynamic_pointer_cast<Linear>(blocks["img_in"]);
|
||||
auto time_in = std::dynamic_pointer_cast<MLPEmbedder>(blocks["time_in"]);
|
||||
auto vector_in = std::dynamic_pointer_cast<MLPEmbedder>(blocks["vector_in"]);
|
||||
@ -785,6 +820,21 @@ namespace Flux {
|
||||
return img;
|
||||
}
|
||||
|
||||
struct ggml_tensor* process_img(struct ggml_context* ctx,
|
||||
struct ggml_tensor* x) {
|
||||
|
||||
int64_t W = x->ne[0];
|
||||
int64_t H = x->ne[1];
|
||||
int64_t patch_size = 2;
|
||||
int pad_h = (patch_size - H % patch_size) % patch_size;
|
||||
int pad_w = (patch_size - W % patch_size) % patch_size;
|
||||
x = ggml_pad(ctx, x, pad_w, pad_h, 0, 0); // [N, C, H + pad_h, W + pad_w]
|
||||
|
||||
// img = rearrange(x, "b c (h ph) (w pw) -> b (h w) (c ph pw)", ph=patch_size, pw=patch_size)
|
||||
auto img = patchify(ctx, x, patch_size); // [N, h*w, C * patch_size * patch_size]
|
||||
return img;
|
||||
}
|
||||
|
||||
struct ggml_tensor* forward(struct ggml_context* ctx,
|
||||
struct ggml_tensor* x,
|
||||
struct ggml_tensor* timestep,
|
||||
@ -793,7 +843,8 @@ namespace Flux {
|
||||
struct ggml_tensor* y,
|
||||
struct ggml_tensor* guidance,
|
||||
struct ggml_tensor* pe,
|
||||
std::vector<int> skip_layers = std::vector<int>()) {
|
||||
std::vector<ggml_tensor*> ref_latents = {},
|
||||
std::vector<int> skip_layers = {}) {
|
||||
// Forward pass of DiT.
|
||||
// x: (N, C, H, W) tensor of spatial inputs (images or latent representations of images)
|
||||
// timestep: (N,) tensor of diffusion timesteps
|
||||
@ -812,25 +863,33 @@ namespace Flux {
|
||||
int64_t patch_size = 2;
|
||||
int pad_h = (patch_size - H % patch_size) % patch_size;
|
||||
int pad_w = (patch_size - W % patch_size) % patch_size;
|
||||
x = ggml_pad(ctx, x, pad_w, pad_h, 0, 0); // [N, C, H + pad_h, W + pad_w]
|
||||
|
||||
// img = rearrange(x, "b c (h ph) (w pw) -> b (h w) (c ph pw)", ph=patch_size, pw=patch_size)
|
||||
auto img = patchify(ctx, x, patch_size); // [N, h*w, C * patch_size * patch_size]
|
||||
auto img = process_img(ctx, x);
|
||||
uint64_t img_tokens = img->ne[1];
|
||||
|
||||
if (c_concat != NULL) {
|
||||
ggml_tensor* masked = ggml_view_4d(ctx, c_concat, c_concat->ne[0], c_concat->ne[1], C, 1, c_concat->nb[1], c_concat->nb[2], c_concat->nb[3], 0);
|
||||
ggml_tensor* mask = ggml_view_4d(ctx, c_concat, c_concat->ne[0], c_concat->ne[1], 8 * 8, 1, c_concat->nb[1], c_concat->nb[2], c_concat->nb[3], c_concat->nb[2] * C);
|
||||
|
||||
masked = ggml_pad(ctx, masked, pad_w, pad_h, 0, 0);
|
||||
mask = ggml_pad(ctx, mask, pad_w, pad_h, 0, 0);
|
||||
|
||||
masked = patchify(ctx, masked, patch_size);
|
||||
mask = patchify(ctx, mask, patch_size);
|
||||
masked = process_img(ctx, masked);
|
||||
mask = process_img(ctx, mask);
|
||||
|
||||
img = ggml_concat(ctx, img, ggml_concat(ctx, masked, mask, 0), 0);
|
||||
}
|
||||
|
||||
auto out = forward_orig(ctx, img, context, timestep, y, guidance, pe, skip_layers); // [N, h*w, C * patch_size * patch_size]
|
||||
if (ref_latents.size() > 0) {
|
||||
for (ggml_tensor* ref : ref_latents) {
|
||||
ref = process_img(ctx, ref);
|
||||
img = ggml_concat(ctx, img, ref, 1);
|
||||
}
|
||||
}
|
||||
|
||||
auto out = forward_orig(ctx, img, context, timestep, y, guidance, pe, skip_layers); // [N, num_tokens, C * patch_size * patch_size]
|
||||
if (out->ne[1] > img_tokens) {
|
||||
out = ggml_cont(ctx, ggml_permute(ctx, out, 0, 2, 1, 3)); // [num_tokens, N, C * patch_size * patch_size]
|
||||
out = ggml_view_3d(ctx, out, out->ne[0], out->ne[1], img_tokens, out->nb[1], out->nb[2], 0);
|
||||
out = ggml_cont(ctx, ggml_permute(ctx, out, 0, 2, 1, 3)); // [N, h*w, C * patch_size * patch_size]
|
||||
}
|
||||
|
||||
// rearrange(out, "b (h w) (c ph pw) -> b c (h ph) (w pw)", h=h_len, w=w_len, ph=2, pw=2)
|
||||
out = unpatchify(ctx, out, (H + pad_h) / patch_size, (W + pad_w) / patch_size, patch_size); // [N, C, H + pad_h, W + pad_w]
|
||||
@ -909,6 +968,7 @@ namespace Flux {
|
||||
struct ggml_tensor* c_concat,
|
||||
struct ggml_tensor* y,
|
||||
struct ggml_tensor* guidance,
|
||||
std::vector<ggml_tensor*> ref_latents = {},
|
||||
std::vector<int> skip_layers = std::vector<int>()) {
|
||||
GGML_ASSERT(x->ne[3] == 1);
|
||||
struct ggml_cgraph* gf = ggml_new_graph_custom(compute_ctx, FLUX_GRAPH_SIZE, false);
|
||||
@ -923,8 +983,11 @@ namespace Flux {
|
||||
if (flux_params.guidance_embed) {
|
||||
guidance = to_backend(guidance);
|
||||
}
|
||||
for (int i = 0; i < ref_latents.size(); i++) {
|
||||
ref_latents[i] = to_backend(ref_latents[i]);
|
||||
}
|
||||
|
||||
pe_vec = flux.gen_pe(x->ne[1], x->ne[0], 2, x->ne[3], context->ne[1], flux_params.theta, flux_params.axes_dim);
|
||||
pe_vec = flux.gen_pe(x->ne[1], x->ne[0], 2, x->ne[3], context->ne[1], ref_latents, flux_params.theta, flux_params.axes_dim);
|
||||
int pos_len = pe_vec.size() / flux_params.axes_dim_sum / 2;
|
||||
// LOG_DEBUG("pos_len %d", pos_len);
|
||||
auto pe = ggml_new_tensor_4d(compute_ctx, GGML_TYPE_F32, 2, 2, flux_params.axes_dim_sum / 2, pos_len);
|
||||
@ -941,6 +1004,7 @@ namespace Flux {
|
||||
y,
|
||||
guidance,
|
||||
pe,
|
||||
ref_latents,
|
||||
skip_layers);
|
||||
|
||||
ggml_build_forward_expand(gf, out);
|
||||
@ -955,6 +1019,7 @@ namespace Flux {
|
||||
struct ggml_tensor* c_concat,
|
||||
struct ggml_tensor* y,
|
||||
struct ggml_tensor* guidance,
|
||||
std::vector<ggml_tensor*> ref_latents = {},
|
||||
struct ggml_tensor** output = NULL,
|
||||
struct ggml_context* output_ctx = NULL,
|
||||
std::vector<int> skip_layers = std::vector<int>()) {
|
||||
@ -964,7 +1029,7 @@ namespace Flux {
|
||||
// y: [N, adm_in_channels] or [1, adm_in_channels]
|
||||
// guidance: [N, ]
|
||||
auto get_graph = [&]() -> struct ggml_cgraph* {
|
||||
return build_graph(x, timesteps, context, c_concat, y, guidance, skip_layers);
|
||||
return build_graph(x, timesteps, context, c_concat, y, guidance, ref_latents, skip_layers);
|
||||
};
|
||||
|
||||
GGMLRunner::compute(get_graph, n_threads, false, output, output_ctx);
|
||||
@ -1004,7 +1069,7 @@ namespace Flux {
|
||||
struct ggml_tensor* out = NULL;
|
||||
|
||||
int t0 = ggml_time_ms();
|
||||
compute(8, x, timesteps, context, NULL, y, guidance, &out, work_ctx);
|
||||
compute(8, x, timesteps, context, NULL, y, guidance, {}, &out, work_ctx);
|
||||
int t1 = ggml_time_ms();
|
||||
|
||||
print_ggml_tensor(out);
|
||||
|
||||
@ -618,7 +618,7 @@ public:
|
||||
|
||||
int64_t t0 = ggml_time_ms();
|
||||
struct ggml_tensor* out = ggml_dup_tensor(work_ctx, x_t);
|
||||
diffusion_model->compute(n_threads, x_t, timesteps, c, concat, NULL, NULL, -1, {}, 0.f, &out);
|
||||
diffusion_model->compute(n_threads, x_t, timesteps, c, concat, NULL, NULL, {}, -1, {}, 0.f, &out);
|
||||
diffusion_model->free_compute_buffer();
|
||||
|
||||
double result = 0.f;
|
||||
@ -800,6 +800,7 @@ public:
|
||||
const std::vector<float>& sigmas,
|
||||
int start_merge_step,
|
||||
SDCondition id_cond,
|
||||
std::vector<ggml_tensor*> ref_latents = {},
|
||||
std::vector<int> skip_layers = {},
|
||||
float slg_scale = 0,
|
||||
float skip_layer_start = 0.01,
|
||||
@ -887,6 +888,7 @@ public:
|
||||
cond.c_concat,
|
||||
cond.c_vector,
|
||||
guidance_tensor,
|
||||
ref_latents,
|
||||
-1,
|
||||
controls,
|
||||
control_strength,
|
||||
@ -899,6 +901,7 @@ public:
|
||||
cond.c_concat,
|
||||
id_cond.c_vector,
|
||||
guidance_tensor,
|
||||
ref_latents,
|
||||
-1,
|
||||
controls,
|
||||
control_strength,
|
||||
@ -919,6 +922,7 @@ public:
|
||||
uncond.c_concat,
|
||||
uncond.c_vector,
|
||||
guidance_tensor,
|
||||
ref_latents,
|
||||
-1,
|
||||
controls,
|
||||
control_strength,
|
||||
@ -939,6 +943,7 @@ public:
|
||||
cond.c_concat,
|
||||
cond.c_vector,
|
||||
guidance_tensor,
|
||||
ref_latents,
|
||||
-1,
|
||||
controls,
|
||||
control_strength,
|
||||
@ -1209,6 +1214,7 @@ sd_image_t* generate_image(sd_ctx_t* sd_ctx,
|
||||
float style_ratio,
|
||||
bool normalize_input,
|
||||
std::string input_id_images_path,
|
||||
std::vector<ggml_tensor*> ref_latents,
|
||||
std::vector<int> skip_layers = {},
|
||||
float slg_scale = 0,
|
||||
float skip_layer_start = 0.01,
|
||||
@ -1466,6 +1472,7 @@ sd_image_t* generate_image(sd_ctx_t* sd_ctx,
|
||||
sigmas,
|
||||
start_merge_step,
|
||||
id_cond,
|
||||
ref_latents,
|
||||
skip_layers,
|
||||
slg_scale,
|
||||
skip_layer_start,
|
||||
@ -1618,6 +1625,7 @@ sd_image_t* txt2img(sd_ctx_t* sd_ctx,
|
||||
style_ratio,
|
||||
normalize_input,
|
||||
input_id_images_path_c_str,
|
||||
{},
|
||||
skip_layers_vec,
|
||||
slg_scale,
|
||||
skip_layer_start,
|
||||
@ -1798,6 +1806,7 @@ sd_image_t* img2img(sd_ctx_t* sd_ctx,
|
||||
style_ratio,
|
||||
normalize_input,
|
||||
input_id_images_path_c_str,
|
||||
{},
|
||||
skip_layers_vec,
|
||||
slg_scale,
|
||||
skip_layer_start,
|
||||
@ -1943,3 +1952,132 @@ SD_API sd_image_t* img2vid(sd_ctx_t* sd_ctx,
|
||||
|
||||
return result_images;
|
||||
}
|
||||
|
||||
|
||||
sd_image_t* edit(sd_ctx_t* sd_ctx,
|
||||
sd_image_t* ref_images,
|
||||
int ref_images_count,
|
||||
const char* prompt_c_str,
|
||||
const char* negative_prompt_c_str,
|
||||
int clip_skip,
|
||||
float cfg_scale,
|
||||
float guidance,
|
||||
float eta,
|
||||
int width,
|
||||
int height,
|
||||
sample_method_t sample_method,
|
||||
int sample_steps,
|
||||
float strength,
|
||||
int64_t seed,
|
||||
int batch_count,
|
||||
const sd_image_t* control_cond,
|
||||
float control_strength,
|
||||
float style_ratio,
|
||||
bool normalize_input,
|
||||
int* skip_layers = NULL,
|
||||
size_t skip_layers_count = 0,
|
||||
float slg_scale = 0,
|
||||
float skip_layer_start = 0.01,
|
||||
float skip_layer_end = 0.2) {
|
||||
std::vector<int> skip_layers_vec(skip_layers, skip_layers + skip_layers_count);
|
||||
LOG_DEBUG("edit %dx%d", width, height);
|
||||
if (sd_ctx == NULL) {
|
||||
return NULL;
|
||||
}
|
||||
if (ref_images_count <= 0) {
|
||||
LOG_ERROR("ref images count should > 0");
|
||||
return NULL;
|
||||
}
|
||||
|
||||
struct ggml_init_params params;
|
||||
params.mem_size = static_cast<size_t>(30 * 1024 * 1024); // 30 MB
|
||||
params.mem_size += width * height * 3 * sizeof(float) * 3 * ref_images_count;
|
||||
params.mem_size *= batch_count;
|
||||
params.mem_buffer = NULL;
|
||||
params.no_alloc = false;
|
||||
// LOG_DEBUG("mem_size %u ", params.mem_size);
|
||||
|
||||
struct ggml_context* work_ctx = ggml_init(params);
|
||||
if (!work_ctx) {
|
||||
LOG_ERROR("ggml_init() failed");
|
||||
return NULL;
|
||||
}
|
||||
|
||||
if (seed < 0) {
|
||||
srand((int)time(NULL));
|
||||
seed = rand();
|
||||
}
|
||||
sd_ctx->sd->rng->manual_seed(seed);
|
||||
|
||||
int C = 4;
|
||||
if (sd_version_is_sd3(sd_ctx->sd->version)) {
|
||||
C = 16;
|
||||
} else if (sd_version_is_flux(sd_ctx->sd->version)) {
|
||||
C = 16;
|
||||
}
|
||||
int W = width / 8;
|
||||
int H = height / 8;
|
||||
ggml_tensor* init_latent = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, W, H, C, 1);
|
||||
if (sd_version_is_sd3(sd_ctx->sd->version)) {
|
||||
ggml_set_f32(init_latent, 0.0609f);
|
||||
} else if (sd_version_is_flux(sd_ctx->sd->version)) {
|
||||
ggml_set_f32(init_latent, 0.1159f);
|
||||
} else {
|
||||
ggml_set_f32(init_latent, 0.f);
|
||||
}
|
||||
|
||||
size_t t0 = ggml_time_ms();
|
||||
|
||||
std::vector<struct ggml_tensor*> ref_latents;
|
||||
for (int i = 0; i < ref_images_count; i++) {
|
||||
ggml_tensor* img = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, ref_images[i].width, ref_images[i].height, 3, 1);
|
||||
sd_image_to_tensor(ref_images[i].data, img);
|
||||
|
||||
ggml_tensor* latent = NULL;
|
||||
if (!sd_ctx->sd->use_tiny_autoencoder) {
|
||||
ggml_tensor* moments = sd_ctx->sd->encode_first_stage(work_ctx, img);
|
||||
latent = sd_ctx->sd->get_first_stage_encoding(work_ctx, moments);
|
||||
} else {
|
||||
latent = sd_ctx->sd->encode_first_stage(work_ctx, img);
|
||||
}
|
||||
ref_latents.push_back(latent);
|
||||
}
|
||||
|
||||
size_t t1 = ggml_time_ms();
|
||||
LOG_INFO("encode_first_stage completed, taking %.2fs", (t1 - t0) * 1.0f / 1000);
|
||||
|
||||
std::vector<float> sigmas = sd_ctx->sd->denoiser->get_sigmas(sample_steps);
|
||||
|
||||
sd_image_t* result_images = generate_image(sd_ctx,
|
||||
work_ctx,
|
||||
init_latent,
|
||||
prompt_c_str,
|
||||
negative_prompt_c_str,
|
||||
clip_skip,
|
||||
cfg_scale,
|
||||
guidance,
|
||||
eta,
|
||||
width,
|
||||
height,
|
||||
sample_method,
|
||||
sigmas,
|
||||
seed,
|
||||
batch_count,
|
||||
control_cond,
|
||||
control_strength,
|
||||
style_ratio,
|
||||
normalize_input,
|
||||
"",
|
||||
ref_latents,
|
||||
skip_layers_vec,
|
||||
slg_scale,
|
||||
skip_layer_start,
|
||||
skip_layer_end,
|
||||
NULL);
|
||||
|
||||
size_t t2 = ggml_time_ms();
|
||||
|
||||
LOG_INFO("edit completed in %.2fs", (t2 - t0) * 1.0f / 1000);
|
||||
|
||||
return result_images;
|
||||
}
|
||||
@ -220,6 +220,32 @@ SD_API sd_image_t* img2vid(sd_ctx_t* sd_ctx,
|
||||
float strength,
|
||||
int64_t seed);
|
||||
|
||||
SD_API sd_image_t* edit(sd_ctx_t* sd_ctx,
|
||||
sd_image_t* ref_images,
|
||||
int ref_images_count,
|
||||
const char* prompt,
|
||||
const char* negative_prompt,
|
||||
int clip_skip,
|
||||
float cfg_scale,
|
||||
float guidance,
|
||||
float eta,
|
||||
int width,
|
||||
int height,
|
||||
enum sample_method_t sample_method,
|
||||
int sample_steps,
|
||||
float strength,
|
||||
int64_t seed,
|
||||
int batch_count,
|
||||
const sd_image_t* control_cond,
|
||||
float control_strength,
|
||||
float style_strength,
|
||||
bool normalize_input,
|
||||
int* skip_layers,
|
||||
size_t skip_layers_count,
|
||||
float slg_scale,
|
||||
float skip_layer_start,
|
||||
float skip_layer_end);
|
||||
|
||||
typedef struct upscaler_ctx_t upscaler_ctx_t;
|
||||
|
||||
SD_API upscaler_ctx_t* new_upscaler_ctx(const char* esrgan_path,
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user