diff --git a/README.md b/README.md
index 4524903..c1636c9 100644
--- a/README.md
+++ b/README.md
@@ -15,6 +15,9 @@ API and command-line option may change frequently.***
 
 ## 🔥Important News
 
+* **2025/11/30** 🚀 stable-diffusion.cpp now supports **FLUX.2-dev**  
+  👉 Details: [PR #1016](https://github.com/leejet/stable-diffusion.cpp/pull/1016)
+
 * **2025/10/13** 🚀 stable-diffusion.cpp now supports **Qwen-Image-Edit / Qwen-Image-Edit 2509**  
   👉 Details: [PR #877](https://github.com/leejet/stable-diffusion.cpp/pull/877)
 
diff --git a/conditioner.hpp b/conditioner.hpp
index d581c56..e28e6e1 100644
--- a/conditioner.hpp
+++ b/conditioner.hpp
@@ -1814,6 +1814,17 @@ struct LLMEmbedder : public Conditioner {
             prompt_attn_range.second = static_cast<int>(prompt.size());
 
             prompt += "<|im_end|>\n<|im_start|>assistant\n";
+        } else if (sd_version_is_flux2(version)) {
+            prompt_template_encode_start_idx = 0;
+            out_layers                       = {10, 20, 30};
+
+            prompt = "[SYSTEM_PROMPT]You are an AI that reasons about image descriptions. You give structured responses focusing on object relationships, object\nattribution and actions without speculation.[/SYSTEM_PROMPT][INST]";
+
+            prompt_attn_range.first = prompt.size();
+            prompt += conditioner_params.text;
+            prompt_attn_range.second = prompt.size();
+
+            prompt += "[/INST]";
         } else {
             prompt_template_encode_start_idx = 34;
 
diff --git a/examples/cli/main.cpp b/examples/cli/main.cpp
index e9995d5..5119ac7 100644
--- a/examples/cli/main.cpp
+++ b/examples/cli/main.cpp
@@ -151,6 +151,7 @@ struct SDParams {
     preview_t preview_method = PREVIEW_NONE;
     int preview_interval     = 1;
     std::string preview_path = "preview.png";
+    float preview_fps        = 16;
     bool taesd_preview       = false;
     bool preview_noisy       = false;
 
@@ -1638,18 +1639,16 @@ bool load_images_from_dir(const std::string dir,
     return true;
 }
 
-std::string preview_path;
-float preview_fps;
-
-void step_callback(int step, int frame_count, sd_image_t* image, bool is_noisy) {
+void step_callback(int step, int frame_count, sd_image_t* image, bool is_noisy, void* data) {
     (void)step;
     (void)is_noisy;
+    SDParams* params = (SDParams*)data;  // data: the pointer main() passes to sd_set_preview_callback (&params)
     // is_noisy is set to true if the preview corresponds to noisy latents, false if it's denoised latents
     // unused in this app, it will either be always noisy or always denoised here
     if (frame_count == 1) {
-        stbi_write_png(preview_path.c_str(), image->width, image->height, image->channel, image->data, 0);
+        stbi_write_png(params->preview_path.c_str(), image->width, image->height, image->channel, image->data, 0);
     } else {
-        create_mjpg_avi_from_sd_images(preview_path.c_str(), image, frame_count, preview_fps);
+        create_mjpg_avi_from_sd_images(params->preview_path.c_str(), image, frame_count, params->preview_fps);
     }
 }
 
@@ -1662,7 +1661,6 @@ int main(int argc, const char* argv[]) {
     // ZImage::ZImageRunner::load_from_file_and_test(argv[1]);
     // return 1;
     parse_args(argc, argv, params);
-    preview_path = params.preview_path;
     if (params.video_frames > 4) {
         size_t last_dot_pos   = params.preview_path.find_last_of(".");
         std::string base_path = params.preview_path;
@@ -1673,12 +1671,12 @@ int main(int argc, const char* argv[]) {
             std::transform(file_ext.begin(), file_ext.end(), file_ext.begin(), ::tolower);
         }
         if (file_ext == ".png") {
-            preview_path = base_path + ".avi";
+            params.preview_path = base_path + ".avi";
         }
     }
-    preview_fps = params.fps;
+    params.preview_fps = params.fps;
     if (params.preview_method == PREVIEW_PROJ)
-        preview_fps /= 4.0f;
+        params.preview_fps /= 4.0f;
 
     params.sample_params.guidance.slg.layers                 = params.skip_layers.data();
     params.sample_params.guidance.slg.layer_count            = params.skip_layers.size();
@@ -1686,7 +1684,7 @@ int main(int argc, const char* argv[]) {
     params.high_noise_sample_params.guidance.slg.layer_count = params.high_noise_skip_layers.size();
 
     sd_set_log_callback(sd_log_cb, (void*)&params);
-    sd_set_preview_callback((sd_preview_cb_t)step_callback, params.preview_method, params.preview_interval, !params.preview_noisy, params.preview_noisy);
+    sd_set_preview_callback(step_callback, params.preview_method, params.preview_interval, !params.preview_noisy, params.preview_noisy, (void*)&params);
 
     if (params.verbose) {
         print_params(params);
diff --git a/ggml_extend.hpp b/ggml_extend.hpp
index 3ff654a..1a0bd44 100644
--- a/ggml_extend.hpp
+++ b/ggml_extend.hpp
@@ -1141,6 +1141,14 @@ __STATIC_INLINE__ struct ggml_tensor* ggml_ext_ones(struct ggml_context* ctx,
 }
 
 __STATIC_INLINE__ ggml_tensor* ggml_ext_cast_f32(ggml_context* ctx, ggml_tensor* a) {
+#ifdef SD_USE_VULKAN
+    auto zero_index = ggml_get_tensor(ctx, "ggml_runner_build_in_tensor:zero_int");
+    auto out        = ggml_reshape_1d(ctx, a, ggml_nelements(a));
+    out             = ggml_get_rows(ctx, out, zero_index);
+    out             = ggml_reshape(ctx, out, a);
+    // auto out = ggml_cast(ctx, a, GGML_TYPE_F32);
+    return out;
+#else
     auto out         = ggml_reshape_2d(ctx, a, 1, ggml_nelements(a));
     ggml_tensor* one = ggml_ext_ones(ctx, 1, 1, 1, 1);  // [1,]
     if (ggml_is_transposed(out)) {
@@ -1148,7 +1156,8 @@ __STATIC_INLINE__ ggml_tensor* ggml_ext_cast_f32(ggml_context* ctx, ggml_tensor*
     } else {
         out = ggml_mul_mat(ctx, out, one);
     }
-    out = ggml_reshape(ctx, out, a);
+    out                    = ggml_reshape(ctx, out, a);
+#endif
     return out;
 }
 
@@ -1556,6 +1565,9 @@ protected:
     std::vector<float> one_vec = {1.f};
     ggml_tensor* one_tensor    = nullptr;
 
+    std::vector<int> zero_int_vec = {0};
+    ggml_tensor* zero_int_tensor  = nullptr;
+
     std::map<struct ggml_tensor*, const void*> backend_tensor_data_map;
     std::map<std::string, struct ggml_tensor*> cache_tensor_map;  // name -> tensor
     const std::string final_result_name = "ggml_runner_final_result_tensor";
@@ -1626,10 +1638,15 @@ protected:
         one_tensor = ggml_new_tensor_1d(compute_ctx, GGML_TYPE_F32, 1);
         ggml_set_name(one_tensor, "ggml_runner_build_in_tensor:one");
         set_backend_tensor_data(one_tensor, one_vec.data());
+
+        zero_int_tensor = ggml_new_tensor_1d(compute_ctx, GGML_TYPE_I32, 1);
+        ggml_set_name(zero_int_tensor, "ggml_runner_build_in_tensor:zero_int");
+        set_backend_tensor_data(zero_int_tensor, zero_int_vec.data());
     }
 
     void prepare_build_in_tensor_after(struct ggml_cgraph* gf) {
         ggml_build_forward_expand(gf, one_tensor);
+        ggml_build_forward_expand(gf, zero_int_tensor);
     }
 
     struct ggml_cgraph* new_graph_custom(size_t graph_size) {
diff --git a/stable-diffusion.cpp b/stable-diffusion.cpp
index 58ee945..2e873c2 100644
--- a/stable-diffusion.cpp
+++ b/stable-diffusion.cpp
@@ -45,7 +45,7 @@ const char* model_version_to_str[] = {
     "Wan 2.2 TI2V",
     "Qwen Image",
     "Flux.2",
-    "Z Image",
+    "Z-Image",
 };
 
 const char* sampling_methods_str[] = {
@@ -1325,7 +1325,8 @@ public:
                        enum SDVersion version,
                        preview_t preview_mode,
                        ggml_tensor* result,
-                       std::function<void(int, int, sd_image_t*, bool)> step_callback,
+                       std::function<void(int, int, sd_image_t*, bool, void*)> step_callback,
+                       void* step_callback_data,
                        bool is_noisy) {
         const uint32_t channel = 3;
         uint32_t width         = latents->ne[0];
@@ -1396,7 +1397,7 @@ public:
             for (int i = 0; i < frames; i++) {
                 images[i] = {width, height, channel, data + i * width * height * channel};
             }
-            step_callback(step, frames, images, is_noisy);
+            step_callback(step, frames, images, is_noisy, step_callback_data);
             free(data);
             free(images);
         } else {
@@ -1450,7 +1451,7 @@ public:
                 images[i].data    = ggml_tensor_to_sd_image(result, i, ggml_n_dims(latents) == 4);
             }
 
-            step_callback(step, frames, images, is_noisy);
+            step_callback(step, frames, images, is_noisy, step_callback_data);
 
             ggml_ext_tensor_scale_inplace(result, 0);
             for (int i = 0; i < frames; i++) {
@@ -1599,8 +1600,9 @@ public:
         }
 
         auto denoise = [&](ggml_tensor* input, float sigma, int step) -> ggml_tensor* {
-            auto sd_preview_cb   = sd_get_preview_callback();
-            auto sd_preview_mode = sd_get_preview_mode();
+            auto sd_preview_cb      = sd_get_preview_callback();
+            auto sd_preview_cb_data = sd_get_preview_callback_data();
+            auto sd_preview_mode    = sd_get_preview_mode();
             if (step == 1 || step == -1) {
                 pretty_progress(0, (int)steps, 0);
             }
@@ -1671,7 +1673,7 @@ public:
             }
             if (sd_preview_cb != nullptr && sd_should_preview_noisy()) {
                 if (step % sd_get_preview_interval() == 0) {
-                    preview_image(work_ctx, step, noised_input, version, sd_preview_mode, preview_tensor, sd_preview_cb, true);
+                    preview_image(work_ctx, step, noised_input, version, sd_preview_mode, preview_tensor, sd_preview_cb, sd_preview_cb_data, true);
                 }
             }
 
@@ -1819,7 +1821,7 @@ public:
 
             if (sd_preview_cb != nullptr && sd_should_preview_denoised()) {
                 if (step % sd_get_preview_interval() == 0) {
-                    preview_image(work_ctx, step, denoised, version, sd_preview_mode, preview_tensor, sd_preview_cb, false);
+                    preview_image(work_ctx, step, denoised, version, sd_preview_mode, preview_tensor, sd_preview_cb, sd_preview_cb_data, false);
                 }
             }
 
diff --git a/stable-diffusion.h b/stable-diffusion.h
index 4e3f8ea..0cbefd9 100644
--- a/stable-diffusion.h
+++ b/stable-diffusion.h
@@ -283,11 +283,11 @@ typedef struct sd_ctx_t sd_ctx_t;
 
 typedef void (*sd_log_cb_t)(enum sd_log_level_t level, const char* text, void* data);
 typedef void (*sd_progress_cb_t)(int step, int steps, float time, void* data);
-typedef void (*sd_preview_cb_t)(int step, int frame_count, sd_image_t* frames, bool is_noisy);
+typedef void (*sd_preview_cb_t)(int step, int frame_count, sd_image_t* frames, bool is_noisy, void* data);
 
 SD_API void sd_set_log_callback(sd_log_cb_t sd_log_cb, void* data);
 SD_API void sd_set_progress_callback(sd_progress_cb_t cb, void* data);
-SD_API void sd_set_preview_callback(sd_preview_cb_t cb, enum preview_t mode, int interval, bool denoised, bool noisy);
+SD_API void sd_set_preview_callback(sd_preview_cb_t cb, enum preview_t mode, int interval, bool denoised, bool noisy, void* data);
 SD_API int32_t get_num_physical_cores();
 SD_API const char* sd_get_system_info();
 
diff --git a/util.cpp b/util.cpp
index c462166..f101c3b 100644
--- a/util.cpp
+++ b/util.cpp
@@ -187,6 +187,7 @@ static sd_progress_cb_t sd_progress_cb = nullptr;
 void* sd_progress_cb_data              = nullptr;
 
 static sd_preview_cb_t sd_preview_cb = nullptr;
+static void* sd_preview_cb_data      = nullptr;
 preview_t sd_preview_mode            = PREVIEW_NONE;
 int sd_preview_interval              = 1;
 bool sd_preview_denoised             = true;
@@ -273,13 +274,16 @@ void pretty_progress(int step, int steps, float time) {
         }
     }
     progress += "|";
-    printf(time > 1.0f ? "\r%s %i/%i - %.2fs/it" : "\r%s %i/%i - %.2fit/s\033[K",
-           progress.c_str(), step, steps,
-           time > 1.0f || time == 0 ? time : (1.0f / time));
-    fflush(stdout);  // for linux
-    if (step == steps) {
-        printf("\n");
+
+    const char* lf   = (step == steps ? "\n" : "");
+    const char* unit = "s/it";
+    float speed      = time;
+    if (speed < 1.0f && speed > 0.f) {
+        speed = 1.0f / speed;
+        unit  = "it/s";
     }
+    printf("\r%s %i/%i - %.2f%s\033[K%s", progress.c_str(), step, steps, speed, unit, lf);
+    fflush(stdout);  // for linux
 }
 
 std::string ltrim(const std::string& s) {
@@ -335,8 +339,9 @@ void sd_set_progress_callback(sd_progress_cb_t cb, void* data) {
     sd_progress_cb      = cb;
     sd_progress_cb_data = data;
 }
-void sd_set_preview_callback(sd_preview_cb_t cb, preview_t mode = PREVIEW_PROJ, int interval = 1, bool denoised = true, bool noisy = false) {
+void sd_set_preview_callback(sd_preview_cb_t cb, preview_t mode, int interval, bool denoised, bool noisy, void* data) {
     sd_preview_cb       = cb;
+    sd_preview_cb_data  = data;
     sd_preview_mode     = mode;
     sd_preview_interval = interval;
     sd_preview_denoised = denoised;
@@ -346,6 +351,9 @@ void sd_set_preview_callback(sd_preview_cb_t cb, preview_t mode = PREVIEW_PROJ,
 sd_preview_cb_t sd_get_preview_callback() {
     return sd_preview_cb;
 }
+void* sd_get_preview_callback_data() {
+    return sd_preview_cb_data;  // pointer last stored by sd_set_preview_callback; nullptr until it is called
+}
 
 preview_t sd_get_preview_mode() {
     return sd_preview_mode;
diff --git a/util.h b/util.h
index 5bd69a6..2721f29 100644
--- a/util.h
+++ b/util.h
@@ -58,6 +58,7 @@ sd_progress_cb_t sd_get_progress_callback();
 void* sd_get_progress_callback_data();
 
 sd_preview_cb_t sd_get_preview_callback();
+void* sd_get_preview_callback_data();
 preview_t sd_get_preview_mode();
 int sd_get_preview_interval();
 bool sd_should_preview_denoised();