diff --git a/README.md b/README.md index 4524903..c1636c9 100644 --- a/README.md +++ b/README.md @@ -15,6 +15,9 @@ API and command-line option may change frequently.*** ## 🔥Important News +* **2025/10/13** 🚀 stable-diffusion.cpp now supports **FLUX.2-dev** + 👉 Details: [PR #1016](https://github.com/leejet/stable-diffusion.cpp/pull/1016) + * **2025/10/13** 🚀 stable-diffusion.cpp now supports **Qwen-Image-Edit / Qwen-Image-Edit 2509** 👉 Details: [PR #877](https://github.com/leejet/stable-diffusion.cpp/pull/877) diff --git a/conditioner.hpp b/conditioner.hpp index d581c56..e28e6e1 100644 --- a/conditioner.hpp +++ b/conditioner.hpp @@ -1814,6 +1814,17 @@ struct LLMEmbedder : public Conditioner { prompt_attn_range.second = static_cast(prompt.size()); prompt += "<|im_end|>\n<|im_start|>assistant\n"; + } else if (sd_version_is_flux2(version)) { + prompt_template_encode_start_idx = 0; + out_layers = {10, 20, 30}; + + prompt = "[SYSTEM_PROMPT]You are an AI that reasons about image descriptions. 
You give structured responses focusing on object relationships, object\nattribution and actions without speculation.[/SYSTEM_PROMPT][INST]"; + + prompt_attn_range.first = prompt.size(); + prompt += conditioner_params.text; + prompt_attn_range.second = prompt.size(); + + prompt += "[/INST]"; } else { prompt_template_encode_start_idx = 34; diff --git a/examples/cli/main.cpp b/examples/cli/main.cpp index e9995d5..5119ac7 100644 --- a/examples/cli/main.cpp +++ b/examples/cli/main.cpp @@ -151,6 +151,7 @@ struct SDParams { preview_t preview_method = PREVIEW_NONE; int preview_interval = 1; std::string preview_path = "preview.png"; + float preview_fps = 16; bool taesd_preview = false; bool preview_noisy = false; @@ -1638,18 +1639,16 @@ bool load_images_from_dir(const std::string dir, return true; } -std::string preview_path; -float preview_fps; - -void step_callback(int step, int frame_count, sd_image_t* image, bool is_noisy) { +void step_callback(int step, int frame_count, sd_image_t* image, bool is_noisy, void* data) { (void)step; (void)is_noisy; + SDParams* params = (SDParams*)data; // is_noisy is set to true if the preview corresponds to noisy latents, false if it's denoised latents // unused in this app, it will either be always noisy or always denoised here if (frame_count == 1) { - stbi_write_png(preview_path.c_str(), image->width, image->height, image->channel, image->data, 0); + stbi_write_png(params->preview_path.c_str(), image->width, image->height, image->channel, image->data, 0); } else { - create_mjpg_avi_from_sd_images(preview_path.c_str(), image, frame_count, preview_fps); + create_mjpg_avi_from_sd_images(params->preview_path.c_str(), image, frame_count, params->preview_fps); } } @@ -1662,7 +1661,6 @@ int main(int argc, const char* argv[]) { // ZImage::ZImageRunner::load_from_file_and_test(argv[1]); // return 1; parse_args(argc, argv, params); - preview_path = params.preview_path; if (params.video_frames > 4) { size_t last_dot_pos = 
params.preview_path.find_last_of("."); std::string base_path = params.preview_path; @@ -1673,12 +1671,12 @@ int main(int argc, const char* argv[]) { std::transform(file_ext.begin(), file_ext.end(), file_ext.begin(), ::tolower); } if (file_ext == ".png") { - preview_path = base_path + ".avi"; + params.preview_path = base_path + ".avi"; } } - preview_fps = params.fps; + params.preview_fps = params.fps; if (params.preview_method == PREVIEW_PROJ) - preview_fps /= 4.0f; + params.preview_fps /= 4.0f; params.sample_params.guidance.slg.layers = params.skip_layers.data(); params.sample_params.guidance.slg.layer_count = params.skip_layers.size(); @@ -1686,7 +1684,7 @@ int main(int argc, const char* argv[]) { params.high_noise_sample_params.guidance.slg.layer_count = params.high_noise_skip_layers.size(); sd_set_log_callback(sd_log_cb, (void*)&params); - sd_set_preview_callback((sd_preview_cb_t)step_callback, params.preview_method, params.preview_interval, !params.preview_noisy, params.preview_noisy); + sd_set_preview_callback(step_callback, params.preview_method, params.preview_interval, !params.preview_noisy, params.preview_noisy, (void*)&params); if (params.verbose) { print_params(params); diff --git a/ggml_extend.hpp b/ggml_extend.hpp index 3ff654a..1a0bd44 100644 --- a/ggml_extend.hpp +++ b/ggml_extend.hpp @@ -1141,6 +1141,14 @@ __STATIC_INLINE__ struct ggml_tensor* ggml_ext_ones(struct ggml_context* ctx, } __STATIC_INLINE__ ggml_tensor* ggml_ext_cast_f32(ggml_context* ctx, ggml_tensor* a) { +#ifdef SD_USE_VULKAN + auto zero_index = ggml_get_tensor(ctx, "ggml_runner_build_in_tensor:zero_int"); + auto out = ggml_reshape_1d(ctx, a, ggml_nelements(a)); + out = ggml_get_rows(ctx, out, zero_index); + out = ggml_reshape(ctx, out, a); + // auto out = ggml_cast(ctx, a, GGML_TYPE_F32); + return out; +#else auto out = ggml_reshape_2d(ctx, a, 1, ggml_nelements(a)); ggml_tensor* one = ggml_ext_ones(ctx, 1, 1, 1, 1); // [1,] if (ggml_is_transposed(out)) { @@ -1148,7 +1156,8 @@ 
__STATIC_INLINE__ ggml_tensor* ggml_ext_cast_f32(ggml_context* ctx, ggml_tensor* } else { out = ggml_mul_mat(ctx, out, one); } - out = ggml_reshape(ctx, out, a); + out = ggml_reshape(ctx, out, a); +#endif return out; } @@ -1556,6 +1565,9 @@ protected: std::vector one_vec = {1.f}; ggml_tensor* one_tensor = nullptr; + std::vector zero_int_vec = {0}; + ggml_tensor* zero_int_tensor = nullptr; + std::map backend_tensor_data_map; std::map cache_tensor_map; // name -> tensor const std::string final_result_name = "ggml_runner_final_result_tensor"; @@ -1626,10 +1638,15 @@ protected: one_tensor = ggml_new_tensor_1d(compute_ctx, GGML_TYPE_F32, 1); ggml_set_name(one_tensor, "ggml_runner_build_in_tensor:one"); set_backend_tensor_data(one_tensor, one_vec.data()); + + zero_int_tensor = ggml_new_tensor_1d(compute_ctx, GGML_TYPE_I32, 1); + ggml_set_name(zero_int_tensor, "ggml_runner_build_in_tensor:zero_int"); + set_backend_tensor_data(zero_int_tensor, zero_int_vec.data()); } void prepare_build_in_tensor_after(struct ggml_cgraph* gf) { ggml_build_forward_expand(gf, one_tensor); + ggml_build_forward_expand(gf, zero_int_tensor); } struct ggml_cgraph* new_graph_custom(size_t graph_size) { diff --git a/stable-diffusion.cpp b/stable-diffusion.cpp index 58ee945..2e873c2 100644 --- a/stable-diffusion.cpp +++ b/stable-diffusion.cpp @@ -45,7 +45,7 @@ const char* model_version_to_str[] = { "Wan 2.2 TI2V", "Qwen Image", "Flux.2", - "Z Image", + "Z-Image", }; const char* sampling_methods_str[] = { @@ -1325,7 +1325,8 @@ public: enum SDVersion version, preview_t preview_mode, ggml_tensor* result, - std::function step_callback, + std::function step_callback, + void* step_callback_data, bool is_noisy) { const uint32_t channel = 3; uint32_t width = latents->ne[0]; @@ -1396,7 +1397,7 @@ public: for (int i = 0; i < frames; i++) { images[i] = {width, height, channel, data + i * width * height * channel}; } - step_callback(step, frames, images, is_noisy); + step_callback(step, frames, images, is_noisy, 
step_callback_data); free(data); free(images); } else { @@ -1450,7 +1451,7 @@ public: images[i].data = ggml_tensor_to_sd_image(result, i, ggml_n_dims(latents) == 4); } - step_callback(step, frames, images, is_noisy); + step_callback(step, frames, images, is_noisy, step_callback_data); ggml_ext_tensor_scale_inplace(result, 0); for (int i = 0; i < frames; i++) { @@ -1599,8 +1600,9 @@ public: } auto denoise = [&](ggml_tensor* input, float sigma, int step) -> ggml_tensor* { - auto sd_preview_cb = sd_get_preview_callback(); - auto sd_preview_mode = sd_get_preview_mode(); + auto sd_preview_cb = sd_get_preview_callback(); + auto sd_preview_cb_data = sd_get_preview_callback_data(); + auto sd_preview_mode = sd_get_preview_mode(); if (step == 1 || step == -1) { pretty_progress(0, (int)steps, 0); } @@ -1671,7 +1673,7 @@ public: } if (sd_preview_cb != nullptr && sd_should_preview_noisy()) { if (step % sd_get_preview_interval() == 0) { - preview_image(work_ctx, step, noised_input, version, sd_preview_mode, preview_tensor, sd_preview_cb, true); + preview_image(work_ctx, step, noised_input, version, sd_preview_mode, preview_tensor, sd_preview_cb, sd_preview_cb_data, true); } } @@ -1819,7 +1821,7 @@ public: if (sd_preview_cb != nullptr && sd_should_preview_denoised()) { if (step % sd_get_preview_interval() == 0) { - preview_image(work_ctx, step, denoised, version, sd_preview_mode, preview_tensor, sd_preview_cb, false); + preview_image(work_ctx, step, denoised, version, sd_preview_mode, preview_tensor, sd_preview_cb, sd_preview_cb_data, false); } } diff --git a/stable-diffusion.h b/stable-diffusion.h index 4e3f8ea..0cbefd9 100644 --- a/stable-diffusion.h +++ b/stable-diffusion.h @@ -283,11 +283,11 @@ typedef struct sd_ctx_t sd_ctx_t; typedef void (*sd_log_cb_t)(enum sd_log_level_t level, const char* text, void* data); typedef void (*sd_progress_cb_t)(int step, int steps, float time, void* data); -typedef void (*sd_preview_cb_t)(int step, int frame_count, sd_image_t* frames, bool 
is_noisy); +typedef void (*sd_preview_cb_t)(int step, int frame_count, sd_image_t* frames, bool is_noisy, void* data); SD_API void sd_set_log_callback(sd_log_cb_t sd_log_cb, void* data); SD_API void sd_set_progress_callback(sd_progress_cb_t cb, void* data); -SD_API void sd_set_preview_callback(sd_preview_cb_t cb, enum preview_t mode, int interval, bool denoised, bool noisy); +SD_API void sd_set_preview_callback(sd_preview_cb_t cb, enum preview_t mode, int interval, bool denoised, bool noisy, void* data); SD_API int32_t get_num_physical_cores(); SD_API const char* sd_get_system_info(); diff --git a/util.cpp b/util.cpp index c462166..f101c3b 100644 --- a/util.cpp +++ b/util.cpp @@ -187,6 +187,7 @@ static sd_progress_cb_t sd_progress_cb = nullptr; void* sd_progress_cb_data = nullptr; static sd_preview_cb_t sd_preview_cb = nullptr; +static void* sd_preview_cb_data = nullptr; preview_t sd_preview_mode = PREVIEW_NONE; int sd_preview_interval = 1; bool sd_preview_denoised = true; @@ -273,13 +274,16 @@ void pretty_progress(int step, int steps, float time) { } } progress += "|"; - printf(time > 1.0f ? "\r%s %i/%i - %.2fs/it" : "\r%s %i/%i - %.2fit/s\033[K", - progress.c_str(), step, steps, - time > 1.0f || time == 0 ? time : (1.0f / time)); - fflush(stdout); // for linux - if (step == steps) { - printf("\n"); + + const char* lf = (step == steps ? 
"\n" : ""); + const char* unit = "s/it"; + float speed = time; + if (speed < 1.0f && speed > 0.f) { + speed = 1.0f / speed; + unit = "it/s"; } + printf("\r%s %i/%i - %.2f%s\033[K%s", progress.c_str(), step, steps, speed, unit, lf); + fflush(stdout); // for linux } std::string ltrim(const std::string& s) { @@ -335,8 +339,9 @@ void sd_set_progress_callback(sd_progress_cb_t cb, void* data) { sd_progress_cb = cb; sd_progress_cb_data = data; } -void sd_set_preview_callback(sd_preview_cb_t cb, preview_t mode = PREVIEW_PROJ, int interval = 1, bool denoised = true, bool noisy = false) { +void sd_set_preview_callback(sd_preview_cb_t cb, preview_t mode, int interval, bool denoised, bool noisy, void* data) { sd_preview_cb = cb; + sd_preview_cb_data = data; sd_preview_mode = mode; sd_preview_interval = interval; sd_preview_denoised = denoised; @@ -346,6 +351,9 @@ void sd_set_preview_callback(sd_preview_cb_t cb, preview_t mode = PREVIEW_PROJ, sd_preview_cb_t sd_get_preview_callback() { return sd_preview_cb; } +void* sd_get_preview_callback_data() { + return sd_preview_cb_data; +} preview_t sd_get_preview_mode() { return sd_preview_mode; diff --git a/util.h b/util.h index 5bd69a6..2721f29 100644 --- a/util.h +++ b/util.h @@ -58,6 +58,7 @@ sd_progress_cb_t sd_get_progress_callback(); void* sd_get_progress_callback_data(); sd_preview_cb_t sd_get_preview_callback(); +void* sd_get_preview_callback_data(); preview_t sd_get_preview_mode(); int sd_get_preview_interval(); bool sd_should_preview_denoised();