diff --git a/clip.hpp b/clip.hpp index 7ca565d..321f5f8 100644 --- a/clip.hpp +++ b/clip.hpp @@ -179,9 +179,9 @@ public: auto it = encoder.find(utf8_to_utf32("img")); if (it != encoder.end()) { - LOG_DEBUG(" trigger word img already in vocab"); + LOG_DEBUG("trigger word img already in vocab"); } else { - LOG_DEBUG(" trigger word img not in vocab yet"); + LOG_DEBUG("trigger word img not in vocab yet"); } int rank = 0; @@ -733,7 +733,7 @@ public: if (text_projection != NULL) { pooled = ggml_nn_linear(ctx, pooled, text_projection, NULL); } else { - LOG_DEBUG("Missing text_projection matrix, assuming identity..."); + LOG_DEBUG("identity projection"); } return pooled; // [hidden_size, 1, 1] } diff --git a/conditioner.hpp b/conditioner.hpp index 6cf7ca3..e63169b 100644 --- a/conditioner.hpp +++ b/conditioner.hpp @@ -21,12 +21,12 @@ struct Conditioner { int clip_skip, int width, int height, - int adm_in_channels = -1, - bool force_zero_embeddings = false) = 0; - virtual void alloc_params_buffer() = 0; - virtual void free_params_buffer() = 0; - virtual void get_param_tensors(std::map<std::string, struct ggml_tensor*>& tensors) = 0; - virtual size_t get_params_buffer_size() = 0; + int adm_in_channels = -1, + bool zero_out_masked = false) = 0; + virtual void alloc_params_buffer() = 0; + virtual void free_params_buffer() = 0; + virtual void get_param_tensors(std::map<std::string, struct ggml_tensor*>& tensors) = 0; + virtual size_t get_params_buffer_size() = 0; virtual std::tuple<SDCondition, std::vector<bool>> get_learned_condition_with_trigger(ggml_context* work_ctx, int n_threads, const std::string& text, @@ -34,10 +34,10 @@ struct Conditioner { int width, int height, int num_input_imgs, - int adm_in_channels = -1, - bool force_zero_embeddings = false) = 0; + int adm_in_channels = -1, + bool zero_out_masked = false) = 0; virtual std::string remove_trigger_from_prompt(ggml_context* work_ctx, - const std::string& prompt) = 0; + const std::string& prompt) = 0; }; // ldm.modules.encoders.modules.FrozenCLIPEmbedder @@ -409,8 +409,8 @@ struct FrozenCLIPEmbedderWithCustomWords : public Conditioner { int clip_skip, int width, int height, - int adm_in_channels = -1, - bool force_zero_embeddings = false) { + int adm_in_channels = -1, + bool zero_out_masked = false) { set_clip_skip(clip_skip); int64_t t0 = ggml_time_ms(); struct ggml_tensor* hidden_states = NULL; // [N, n_token, hidden_size] @@ -499,7 +499,7 @@ struct FrozenCLIPEmbedderWithCustomWords : public Conditioner { float new_mean = ggml_tensor_mean(result); ggml_tensor_scale(result, (original_mean / new_mean)); } - if (force_zero_embeddings) { + if (zero_out_masked) { float* vec = (float*)result->data; for (int i = 0; i < ggml_nelements(result); i++) { vec[i] = 0; } @@ -562,8 +562,8 @@ struct FrozenCLIPEmbedderWithCustomWords : public Conditioner { int width, int height, int num_input_imgs, - int adm_in_channels = -1, - bool force_zero_embeddings = false) { + int adm_in_channels = -1, + bool zero_out_masked = false) { auto image_tokens = convert_token_to_id(trigger_word); // if(image_tokens.size() == 1){ // printf(" image token id is: %d \n", image_tokens[0]); @@ -584,7 +584,7 @@ struct FrozenCLIPEmbedderWithCustomWords : public Conditioner { // for(int i = 0; i < clsm.size(); ++i) // printf("%d ", clsm[i]?1:0); // printf("\n"); - auto cond = get_learned_condition_common(work_ctx, n_threads, tokens, weights, clip_skip, width, height, adm_in_channels, force_zero_embeddings); + auto cond = get_learned_condition_common(work_ctx, n_threads, tokens, weights, clip_skip, width, height, adm_in_channels, zero_out_masked); return std::make_tuple(cond, clsm);
} @@ -606,12 +606,12 @@ struct FrozenCLIPEmbedderWithCustomWords : public Conditioner { int clip_skip, int width, int height, - int adm_in_channels = -1, - bool force_zero_embeddings = false) { + int adm_in_channels = -1, + bool zero_out_masked = false) { auto tokens_and_weights = tokenize(text, true); std::vector<int>& tokens = tokens_and_weights.first; std::vector<float>& weights = tokens_and_weights.second; - return get_learned_condition_common(work_ctx, n_threads, tokens, weights, clip_skip, width, height, adm_in_channels, force_zero_embeddings); + return get_learned_condition_common(work_ctx, n_threads, tokens, weights, clip_skip, width, height, adm_in_channels, zero_out_masked); } }; @@ -773,7 +773,7 @@ struct SD3CLIPEmbedder : public Conditioner { int n_threads, std::vector<std::pair<std::vector<int>, std::vector<float>>> token_and_weights, int clip_skip, - bool force_zero_embeddings = false) { + bool zero_out_masked = false) { set_clip_skip(clip_skip); auto& clip_l_tokens = token_and_weights[0].first; auto& clip_l_weights = token_and_weights[0].second; @@ -952,7 +952,7 @@ struct SD3CLIPEmbedder : public Conditioner { int64_t t1 = ggml_time_ms(); LOG_DEBUG("computing condition graph completed, taking %" PRId64 " ms", t1 - t0); - if (force_zero_embeddings) { + if (zero_out_masked) { float* vec = (float*)chunk_hidden_states->data; for (int i = 0; i < ggml_nelements(chunk_hidden_states); i++) { vec[i] = 0; } @@ -978,10 +978,10 @@ struct SD3CLIPEmbedder : public Conditioner { int clip_skip, int width, int height, - int adm_in_channels = -1, - bool force_zero_embeddings = false) { + int adm_in_channels = -1, + bool zero_out_masked = false) { auto tokens_and_weights = tokenize(text, 77, true); - return get_learned_condition_common(work_ctx, n_threads, tokens_and_weights, clip_skip, force_zero_embeddings); + return get_learned_condition_common(work_ctx, n_threads, tokens_and_weights, clip_skip, zero_out_masked); } std::tuple<SDCondition, std::vector<bool>> get_learned_condition_with_trigger(ggml_context* work_ctx, @@ -991,8 +991,8 @@ struct SD3CLIPEmbedder : public Conditioner { int width, int height, int num_input_imgs, - int adm_in_channels = -1, - bool force_zero_embeddings = false) { + int adm_in_channels = -1, + bool zero_out_masked = false) { GGML_ASSERT(0 && "Not implemented yet!"); } @@ -1101,7 +1101,7 @@ struct FluxCLIPEmbedder : public Conditioner { int n_threads, std::vector<std::pair<std::vector<int>, std::vector<float>>> token_and_weights, int clip_skip, - bool force_zero_embeddings = false) { + bool zero_out_masked = false) { set_clip_skip(clip_skip); auto& clip_l_tokens = token_and_weights[0].first; auto& clip_l_weights = token_and_weights[0].second; @@ -1173,7 +1173,7 @@ struct FluxCLIPEmbedder : public Conditioner { int64_t t1 = ggml_time_ms(); LOG_DEBUG("computing condition graph completed, taking %" PRId64 " ms", t1 - t0); - if (force_zero_embeddings) { + if (zero_out_masked) { float* vec = (float*)chunk_hidden_states->data; for (int i = 0; i < ggml_nelements(chunk_hidden_states); i++) { vec[i] = 0; } @@ -1199,10 +1199,10 @@ struct FluxCLIPEmbedder : public Conditioner { int clip_skip, int width, int height, - int adm_in_channels = -1, - bool force_zero_embeddings = false) { + int adm_in_channels = -1, + bool zero_out_masked = false) { auto tokens_and_weights = tokenize(text, chunk_len, true); - return get_learned_condition_common(work_ctx, n_threads, tokens_and_weights, clip_skip, force_zero_embeddings); + return get_learned_condition_common(work_ctx, n_threads, tokens_and_weights, clip_skip, zero_out_masked); } std::tuple<SDCondition, std::vector<bool>> get_learned_condition_with_trigger(ggml_context* work_ctx,
@@ -1212,8 +1212,8 @@ struct FluxCLIPEmbedder : public Conditioner { int width, int height, int num_input_imgs, - int adm_in_channels = -1, - bool force_zero_embeddings = false) { + int adm_in_channels = -1, + bool zero_out_masked = false) { GGML_ASSERT(0 && "Not implemented yet!"); } @@ -1229,6 +1229,7 @@ struct T5CLIPEmbedder : public Conditioner { size_t chunk_len = 512; bool use_mask = false; int mask_pad = 1; + bool is_umt5 = false; T5CLIPEmbedder(ggml_backend_t backend, const String2GGMLType& tensor_types = {}, @@ -1318,16 +1319,16 @@ struct T5CLIPEmbedder : public Conditioner { int n_threads, std::tuple<std::vector<int>, std::vector<float>, std::vector<float>> token_and_weights, int clip_skip, - bool force_zero_embeddings = false) { + bool zero_out_masked = false) { auto& t5_tokens = std::get<0>(token_and_weights); auto& t5_weights = std::get<1>(token_and_weights); auto& t5_attn_mask_vec = std::get<2>(token_and_weights); int64_t t0 = ggml_time_ms(); - struct ggml_tensor* hidden_states = NULL; // [N, n_token, 4096] - struct ggml_tensor* chunk_hidden_states = NULL; // [n_token, 4096] - struct ggml_tensor* pooled = NULL; // [768,] - struct ggml_tensor* t5_attn_mask = vector_to_ggml_tensor(work_ctx, t5_attn_mask_vec); // [768,] + struct ggml_tensor* hidden_states = NULL; // [N, n_token, 4096] + struct ggml_tensor* chunk_hidden_states = NULL; // [n_token, 4096] + struct ggml_tensor* pooled = NULL; + struct ggml_tensor* t5_attn_mask = vector_to_ggml_tensor(work_ctx, t5_attn_mask_vec); // [n_token] std::vector<float> hidden_states_vec; @@ -1368,10 +1369,16 @@ struct T5CLIPEmbedder : public Conditioner { int64_t t1 = ggml_time_ms(); LOG_DEBUG("computing condition graph completed, taking %" PRId64 " ms", t1 - t0); - if (force_zero_embeddings) { - float* vec = (float*)chunk_hidden_states->data; - for (int i = 0; i < ggml_nelements(chunk_hidden_states); i++) { - vec[i] = 0; + if (zero_out_masked) { + auto tensor = chunk_hidden_states; + for (int i2 = 0; i2 < tensor->ne[2]; i2++) { + for (int i1 = 0; i1 < tensor->ne[1]; i1++) { + for (int i0 = 0; i0 < tensor->ne[0]; i0++) { + if (chunk_mask[i1] < 0.f) { + ggml_tensor_set_f32(tensor, 0.f, i0, i1, i2); + } + } + } } } @@ -1380,16 +1387,12 @@ struct T5CLIPEmbedder : public Conditioner { ((float*)chunk_hidden_states->data) + ggml_nelements(chunk_hidden_states)); } - if (hidden_states_vec.size() > 0) { - hidden_states = vector_to_ggml_tensor(work_ctx, hidden_states_vec); - hidden_states = ggml_reshape_2d(work_ctx, - hidden_states, - chunk_hidden_states->ne[0], - ggml_nelements(hidden_states) / chunk_hidden_states->ne[0]); - } else { - hidden_states = ggml_new_tensor_2d(work_ctx, GGML_TYPE_F32, 4096, 256); - ggml_set_f32(hidden_states, 0.f); - } + GGML_ASSERT(hidden_states_vec.size() > 0); + hidden_states = vector_to_ggml_tensor(work_ctx, hidden_states_vec); + hidden_states = ggml_reshape_2d(work_ctx, + hidden_states, + chunk_hidden_states->ne[0], + ggml_nelements(hidden_states) / chunk_hidden_states->ne[0]); modify_mask_to_attend_padding(t5_attn_mask, ggml_nelements(t5_attn_mask), mask_pad); @@ -1402,10 +1405,10 @@ struct T5CLIPEmbedder : public Conditioner { int clip_skip, int width, int height, - int adm_in_channels = -1, - bool force_zero_embeddings = false) { + int adm_in_channels = -1, + bool zero_out_masked = false) { auto tokens_and_weights = tokenize(text, chunk_len, true); - return get_learned_condition_common(work_ctx, n_threads, tokens_and_weights, clip_skip, force_zero_embeddings); + return get_learned_condition_common(work_ctx, n_threads, tokens_and_weights, clip_skip,
zero_out_masked); } std::tuple<SDCondition, std::vector<bool>> get_learned_condition_with_trigger(ggml_context* work_ctx, @@ -1415,8 +1418,8 @@ struct T5CLIPEmbedder : public Conditioner { int width, int height, int num_input_imgs, - int adm_in_channels = -1, - bool force_zero_embeddings = false) { + int adm_in_channels = -1, + bool zero_out_masked = false) { GGML_ASSERT(0 && "Not implemented yet!"); } diff --git a/diffusion_model.hpp b/diffusion_model.hpp index 787a4fa..6ac5c9b 100644 --- a/diffusion_model.hpp +++ b/diffusion_model.hpp @@ -4,6 +4,7 @@ #include "flux.hpp" #include "mmdit.hpp" #include "unet.hpp" +#include "wan.hpp" struct DiffusionModel { virtual void compute(int n_threads, @@ -184,4 +185,56 @@ struct FluxModel : public DiffusionModel { } }; +struct WanModel : public DiffusionModel { + WAN::WanRunner wan; + + WanModel(ggml_backend_t backend, + const String2GGMLType& tensor_types = {}, + SDVersion version = VERSION_WAN2, + bool flash_attn = false) + : wan(backend, tensor_types, "model.diffusion_model", version, flash_attn) { + } + + void alloc_params_buffer() { + wan.alloc_params_buffer(); + } + + void free_params_buffer() { + wan.free_params_buffer(); + } + + void free_compute_buffer() { + wan.free_compute_buffer(); + } + + void get_param_tensors(std::map<std::string, struct ggml_tensor*>& tensors) { + wan.get_param_tensors(tensors, "model.diffusion_model"); + } + + size_t get_params_buffer_size() { + return wan.get_params_buffer_size(); + } + + int64_t get_adm_in_channels() { + return 768; + } + + void compute(int n_threads, + struct ggml_tensor* x, + struct ggml_tensor* timesteps, + struct ggml_tensor* context, + struct ggml_tensor* c_concat, + struct ggml_tensor* y, + struct ggml_tensor* guidance, + std::vector<ggml_tensor*> ref_latents = {}, + int num_video_frames = -1, + std::vector<struct ggml_tensor*> controls = {}, + float control_strength = 0.f, + struct ggml_tensor** output = NULL, + struct ggml_context* output_ctx = NULL, + std::vector<int> skip_layers = std::vector<int>()) { + return wan.compute(n_threads, x, timesteps, context, NULL, NULL, output, output_ctx); + } +}; + #endif diff --git a/examples/cli/main.cpp b/examples/cli/main.cpp index ec9f74a..3f0cce1 100644 --- a/examples/cli/main.cpp +++ b/examples/cli/main.cpp @@ -24,11 +24,14 @@ #define STB_IMAGE_RESIZE_STATIC #include "stb_image_resize.h" +#if defined(_WIN32) +#define NOMINMAX +#include <windows.h> +#endif // _WIN32 + #define SAFE_STR(s) ((s) ? (s) : "") #define BOOL_STR(b) ((b) ?
"true" : "false") -#include "t5.hpp" - const char* modes_str[] = { "img_gen", "vid_gen", @@ -69,7 +72,6 @@ struct SDParams { std::string prompt; std::string negative_prompt; - float min_cfg = 1.0f; float cfg_scale = 7.0f; float img_cfg_scale = INFINITY; float guidance = 3.5f; @@ -80,10 +82,7 @@ struct SDParams { int height = 512; int batch_count = 1; - int video_frames = 6; - int motion_bucket_id = 127; - int fps = 6; - float augmentation_level = 0.f; + int video_frames = 1; sample_method_t sample_method = EULER_A; schedule_t schedule = DEFAULT; @@ -147,7 +146,6 @@ void print_params(SDParams params) { printf(" strength(control): %.2f\n", params.control_strength); printf(" prompt: %s\n", params.prompt.c_str()); printf(" negative_prompt: %s\n", params.negative_prompt.c_str()); - printf(" min_cfg: %.2f\n", params.min_cfg); printf(" cfg_scale: %.2f\n", params.cfg_scale); printf(" img_cfg_scale: %.2f\n", params.img_cfg_scale); printf(" slg_scale: %.2f\n", params.slg_scale); @@ -243,6 +241,42 @@ void print_usage(int argc, const char* argv[]) { printf(" -v, --verbose print extra info\n"); } +#if defined(_WIN32) +static std::string utf16_to_utf8(const std::wstring& wstr) { + if (wstr.empty()) + return {}; + int size_needed = WideCharToMultiByte(CP_UTF8, 0, wstr.data(), (int)wstr.size(), + nullptr, 0, nullptr, nullptr); + if (size_needed <= 0) + throw std::runtime_error("UTF-16 to UTF-8 conversion failed"); + + std::string utf8(size_needed, 0); + WideCharToMultiByte(CP_UTF8, 0, wstr.data(), (int)wstr.size(), + (char*)utf8.data(), size_needed, nullptr, nullptr); + return utf8; +} + +static std::string argv_to_utf8(int index, const char** argv) { + int argc; + wchar_t** argv_w = CommandLineToArgvW(GetCommandLineW(), &argc); + if (!argv_w) + throw std::runtime_error("Failed to parse command line"); + + std::string result; + if (index < argc) { + result = utf16_to_utf8(argv_w[index]); + } + LocalFree(argv_w); + return result; +} + +#else // Linux / macOS +static std::string argv_to_utf8(int index, const char** argv) { + return std::string(argv[index]); +} + +#endif + struct StringOption { std::string short_name; std::string long_name; @@ -299,7 +333,7 @@ bool parse_options(int argc, const char** argv, ArgOptions& options) { invalid_arg = true; break; } - *option.target = std::string(argv[i]); + *option.target = argv_to_utf8(i, argv); } } if (invalid_arg) { @@ -746,17 +780,9 @@ void sd_log_cb(enum sd_log_level_t level, const char* log, void* data) { int main(int argc, const char* argv[]) { SDParams params; - // params.verbose = true; - // sd_set_log_callback(sd_log_cb, (void*)¶ms); - - // T5Embedder::load_from_file_and_test(argv[1]); - // return 0; - parse_args(argc, argv, params); - sd_guidance_params_t guidance_params = {params.cfg_scale, params.img_cfg_scale, - params.min_cfg, params.guidance, { params.skip_layers.data(), @@ -791,11 +817,6 @@ int main(int argc, const char* argv[]) { } } - if (params.mode == VID_GEN) { - fprintf(stderr, "SVD support is broken, do not use it!!!\n"); - return 1; - } - bool vae_decode_only = true; uint8_t* input_image_buffer = NULL; uint8_t* control_image_buffer = NULL; @@ -992,18 +1013,19 @@ int main(int argc, const char* argv[]) { expected_num_results = params.batch_count; } else if (params.mode == VID_GEN) { sd_vid_gen_params_t vid_gen_params = { + params.prompt.c_str(), + params.negative_prompt.c_str(), + params.clip_skip, + guidance_params, input_image, params.width, params.height, - guidance_params, params.sample_method, params.sample_steps, + params.eta, 
params.strength, params.seed, params.video_frames, - params.motion_bucket_id, - params.fps, - params.augmentation_level, }; results = generate_video(sd_ctx, &vid_gen_params); diff --git a/ggml_extend.hpp b/ggml_extend.hpp index 5d6248d..b5f4274 100644 --- a/ggml_extend.hpp +++ b/ggml_extend.hpp @@ -323,17 +323,27 @@ __STATIC_INLINE__ uint8_t* sd_tensor_to_image(struct ggml_tensor* input) { return image_data; } -__STATIC_INLINE__ uint8_t* sd_tensor_to_mul_image(struct ggml_tensor* input, int idx) { - int64_t width = input->ne[0]; - int64_t height = input->ne[1]; - int64_t channels = input->ne[2]; +__STATIC_INLINE__ uint8_t* sd_tensor_to_image(struct ggml_tensor* input, int idx, bool video = false) { + int64_t width = input->ne[0]; + int64_t height = input->ne[1]; + int64_t channels; + if (video) { + channels = input->ne[3]; + } else { + channels = input->ne[2]; + } GGML_ASSERT(channels == 3 && input->type == GGML_TYPE_F32); uint8_t* image_data = (uint8_t*)malloc(width * height * channels); - for (int iy = 0; iy < height; iy++) { - for (int ix = 0; ix < width; ix++) { - for (int k = 0; k < channels; k++) { - float value = ggml_tensor_get_f32(input, ix, iy, k, idx); - *(image_data + iy * width * channels + ix * channels + k) = (uint8_t)(value * 255.0f); + for (int ih = 0; ih < height; ih++) { + for (int iw = 0; iw < width; iw++) { + for (int ic = 0; ic < channels; ic++) { + float value; + if (video) { + value = ggml_tensor_get_f32(input, iw, ih, idx, ic); + } else { + value = ggml_tensor_get_f32(input, iw, ih, ic, idx); + } + *(image_data + ih * width * channels + iw * channels + ic) = (uint8_t)(value * 255.0f); } } } diff --git a/model.cpp b/model.cpp index 44efa92..7791ded 100644 --- a/model.cpp +++ b/model.cpp @@ -1055,7 +1055,11 @@ bool ModelLoader::init_from_gguf_file(const std::string& file_path, const std::s // LOG_DEBUG("%s", name.c_str()); - TensorStorage tensor_storage(prefix + name, dummy->type, dummy->ne, ggml_n_dims(dummy), file_index, offset); + if (!starts_with(name, prefix)) { + name = prefix + name; + } + + TensorStorage tensor_storage(name, dummy->type, dummy->ne, ggml_n_dims(dummy), file_index, offset); GGML_ASSERT(ggml_nbytes(dummy) == tensor_storage.nbytes()); @@ -1195,7 +1199,11 @@ bool ModelLoader::init_from_safetensors_file(const std::string& file_path, const n_dims = 1; } - TensorStorage tensor_storage(prefix + name, type, ne, n_dims, file_index, ST_HEADER_SIZE_LEN + header_size_ + begin); + if (!starts_with(name, prefix)) { + name = prefix + name; + } + + TensorStorage tensor_storage(name, type, ne, n_dims, file_index, ST_HEADER_SIZE_LEN + header_size_ + begin); tensor_storage.reverse_ne(); size_t tensor_data_size = end - begin; @@ -1580,7 +1588,11 @@ bool ModelLoader::parse_data_pkl(uint8_t* buffer, reader.tensor_storage.file_index = file_index; // if(strcmp(prefix.c_str(), "scarlett") == 0) // printf(" ZIP got tensor %s \n ", reader.tensor_storage.name.c_str()); - reader.tensor_storage.name = prefix + reader.tensor_storage.name; + std::string name = reader.tensor_storage.name; + if (!starts_with(name, prefix)) { + name = prefix + name; + } + reader.tensor_storage.name = name; tensor_storages.push_back(reader.tensor_storage); add_preprocess_tensor_storage_types(tensor_storages_types, reader.tensor_storage.name, reader.tensor_storage.type); @@ -1654,10 +1666,10 @@ SDVersion ModelLoader::get_sd_version() { bool is_xl = false; bool is_flux = false; + bool is_wan = false; -#define found_family (is_xl || is_flux) for (auto& tensor_storage : tensor_storages) { - if 
(!found_family) { + if (!(is_xl || is_flux)) { if (tensor_storage.name.find("model.diffusion_model.double_blocks.") != std::string::npos) { is_flux = true; if (input_block_checked) { @@ -1667,6 +1679,9 @@ SDVersion ModelLoader::get_sd_version() { if (tensor_storage.name.find("model.diffusion_model.joint_blocks.") != std::string::npos) { return VERSION_SD3; } + if (tensor_storage.name.find("model.diffusion_model.blocks.0.cross_attn.norm_k.weight") != std::string::npos) { + return VERSION_WAN2; + } if (tensor_storage.name.find("model.diffusion_model.input_blocks.") != std::string::npos || tensor_storage.name.find("unet.down_blocks.") != std::string::npos) { is_unet = true; if (has_multiple_encoders) { @@ -1701,7 +1716,7 @@ SDVersion ModelLoader::get_sd_version() { if (tensor_storage.name == "model.diffusion_model.input_blocks.0.0.weight" || tensor_storage.name == "model.diffusion_model.img_in.weight" || tensor_storage.name == "unet.conv_in.weight") { input_block_weight = tensor_storage; input_block_checked = true; - if (found_family) { + if (is_xl || is_flux) { break; } } diff --git a/model.h b/model.h index 38eb929..10a7449 100644 --- a/model.h +++ b/model.h @@ -31,8 +31,7 @@ enum SDVersion { VERSION_SD3, VERSION_FLUX, VERSION_FLUX_FILL, - VERSION_WAN_2_1, - VERSION_WAN_2_2, + VERSION_WAN2, VERSION_COUNT, }; @@ -72,7 +71,7 @@ static inline bool sd_version_is_flux(SDVersion version) { } static inline bool sd_version_is_wan(SDVersion version) { - if (version == VERSION_WAN_2_1 || version == VERSION_WAN_2_2) { + if (version == VERSION_WAN2) { return true; } return false; } diff --git a/stable-diffusion.cpp b/stable-diffusion.cpp index 4df23ca..c9d5c28 100644 --- a/stable-diffusion.cpp +++ b/stable-diffusion.cpp @@ -36,7 +36,9 @@ const char* model_version_to_str[] = { "SVD", "SD3.x", "Flux", - "Flux Fill"}; + "Flux Fill", + "Wan 2.x", +}; const char* sampling_methods_str[] = { "Euler A", @@ -50,7 +52,8 @@ "iPNDM_v", "LCM", "DDIM \"trailing\"", - "TCD"}; + "TCD", +}; /*================================================== Helper Functions ================================================*/ @@ -93,7 +96,7 @@ public: std::shared_ptr<Conditioner> cond_stage_model; std::shared_ptr<FrozenCLIPVisionEmbedder> clip_vision; // for svd std::shared_ptr<DiffusionModel> diffusion_model; - std::shared_ptr<AutoEncoderKL> first_stage_model; + std::shared_ptr<VAE> first_stage_model; std::shared_ptr<TinyAutoEncoder> tae_first_stage; std::shared_ptr<ControlNet> control_net; std::shared_ptr<PhotoMakerIDEncoder> pmid_model; @@ -274,10 +277,10 @@ public: model_loader.set_wtype_override(GGML_TYPE_F32, "vae."); } - LOG_INFO("Weight type: %s", model_wtype != GGML_TYPE_COUNT ? ggml_type_name(model_wtype) : "??"); - LOG_INFO("Conditioner weight type: %s", conditioner_wtype != GGML_TYPE_COUNT ? ggml_type_name(conditioner_wtype) : "??"); - LOG_INFO("Diffusion model weight type: %s", diffusion_model_wtype != GGML_TYPE_COUNT ? ggml_type_name(diffusion_model_wtype) : "??"); - LOG_INFO("VAE weight type: %s", vae_wtype != GGML_TYPE_COUNT ?
ggml_type_name(vae_wtype) : "??"); + LOG_INFO("Weight type: %s", ggml_type_name(model_wtype)); + LOG_INFO("Conditioner weight type: %s", ggml_type_name(conditioner_wtype)); + LOG_INFO("Diffusion model weight type: %s", ggml_type_name(diffusion_model_wtype)); + LOG_INFO("VAE weight type: %s", ggml_type_name(vae_wtype)); LOG_DEBUG("ggml tensor size = %d bytes", (int)sizeof(ggml_tensor)); @@ -293,34 +296,25 @@ public: } else if (sd_version_is_sd3(version)) { scale_factor = 1.5305f; } else if (sd_version_is_flux(version)) { - scale_factor = 0.3611; + scale_factor = 0.3611f; // TODO: shift_factor + } else if (sd_version_is_wan(version)) { + scale_factor = 1.0f; } bool clip_on_cpu = sd_ctx_params->keep_clip_on_cpu; - if (version == VERSION_SVD) { - clip_vision = std::make_shared<FrozenCLIPVisionEmbedder>(backend, model_loader.tensor_storages_types); - clip_vision->alloc_params_buffer(); - clip_vision->get_param_tensors(tensors); - - diffusion_model = std::make_shared<UNetModel>(backend, model_loader.tensor_storages_types, version); - diffusion_model->alloc_params_buffer(); - diffusion_model->get_param_tensors(tensors); - - first_stage_model = std::make_shared<AutoEncoderKL>(backend, model_loader.tensor_storages_types, "first_stage_model", vae_decode_only, true, version); - LOG_DEBUG("vae_decode_only %d", vae_decode_only); - first_stage_model->alloc_params_buffer(); - first_stage_model->get_param_tensors(tensors, "first_stage_model"); - } else { + { clip_backend = backend; bool use_t5xxl = false; if (sd_version_is_dit(version)) { use_t5xxl = true; } - if (!ggml_backend_is_cpu(backend) && use_t5xxl && conditioner_wtype != GGML_TYPE_F32) { - clip_on_cpu = true; - LOG_INFO("set clip_on_cpu to true"); + if (!ggml_backend_is_cpu(backend) && use_t5xxl) { + LOG_WARN( + "!!!It appears that you are using the T5 model. Some backends may encounter issues with it. "
+ "If you notice that the generated images are completely black," + "try running the T5 model on the CPU using the --clip-on-cpu parameter."); } if (clip_on_cpu && !ggml_backend_is_cpu(backend)) { LOG_INFO("CLIP: Using CPU backend"); @@ -357,7 +351,18 @@ public: version, sd_ctx_params->diffusion_flash_attn, sd_ctx_params->chroma_use_dit_mask); - } else { + } else if (sd_version_is_wan(version)) { + cond_stage_model = std::make_shared(clip_backend, + model_loader.tensor_storages_types, + -1, + true, + 1, + true); + diffusion_model = std::make_shared(backend, + model_loader.tensor_storages_types, + version, + sd_ctx_params->diffusion_flash_attn); + } else { // SD1.x SD2.x SDXL if (strstr(SAFE_STR(sd_ctx_params->stacked_id_embed_dir), "v2")) { cond_stage_model = std::make_shared(clip_backend, model_loader.tensor_storages_types, @@ -382,13 +387,21 @@ public: diffusion_model->alloc_params_buffer(); diffusion_model->get_param_tensors(tensors); - if (!use_tiny_autoencoder) { - if (sd_ctx_params->keep_vae_on_cpu && !ggml_backend_is_cpu(backend)) { - LOG_INFO("VAE Autoencoder: Using CPU backend"); - vae_backend = ggml_backend_cpu_init(); - } else { - vae_backend = backend; - } + if (sd_ctx_params->keep_vae_on_cpu && !ggml_backend_is_cpu(backend)) { + LOG_INFO("VAE Autoencoder: Using CPU backend"); + vae_backend = ggml_backend_cpu_init(); + } else { + vae_backend = backend; + } + + if (sd_version_is_wan(version)) { + first_stage_model = std::make_shared(vae_backend, + model_loader.tensor_storages_types, + "first_stage_model", + vae_decode_only); + first_stage_model->alloc_params_buffer(); + first_stage_model->get_param_tensors(tensors, "first_stage_model"); + } else if (!use_tiny_autoencoder) { first_stage_model = std::make_shared(vae_backend, model_loader.tensor_storages_types, "first_stage_model", @@ -398,7 +411,7 @@ public: first_stage_model->alloc_params_buffer(); first_stage_model->get_param_tensors(tensors, "first_stage_model"); } else { - tae_first_stage = std::make_shared(backend, + tae_first_stage = std::make_shared(vae_backend, model_loader.tensor_storages_types, "decoder.layers", vae_decode_only, @@ -485,11 +498,7 @@ public: // LOG_DEBUG("model size = %.2fMB", total_size / 1024.0 / 1024.0); - if (version == VERSION_SVD) { - // diffusion_model->test(); - // first_stage_model->test(); - // return false; - } else { + { size_t clip_params_mem_size = cond_stage_model->get_params_buffer_size(); size_t unet_params_mem_size = diffusion_model->get_params_buffer_size(); size_t vae_params_mem_size = 0; @@ -594,6 +603,9 @@ public: } } denoiser = std::make_shared(shift); + } else if (sd_version_is_wan(version)) { + LOG_INFO("running in FLOW mode"); + denoiser = std::make_shared(); } else if (is_using_v_parameterization) { LOG_INFO("running in v-prediction mode"); denoiser = std::make_shared(); @@ -733,9 +745,9 @@ public: size_t rm = lora_state_diff.size() - lora_state.size(); if (rm != 0) { - LOG_INFO("Attempting to apply %lu LoRAs (removing %lu applied LoRAs)", lora_state.size(), rm); + LOG_INFO("attempting to apply %lu LoRAs (removing %lu applied LoRAs)", lora_state.size(), rm); } else { - LOG_INFO("Attempting to apply %lu LoRAs", lora_state.size()); + LOG_INFO("attempting to apply %lu LoRAs", lora_state.size()); } for (auto& kv : lora_state_diff) { @@ -745,6 +757,21 @@ public: curr_lora_state = lora_state; } + std::string apply_loras_from_prompt(const std::string& prompt) { + auto result_pair = extract_and_remove_lora(prompt); + std::unordered_map lora_f2m = result_pair.first; // lora_name -> 
multiplier + + for (auto& kv : lora_f2m) { + LOG_DEBUG("lora %s:%.2f", kv.first.c_str(), kv.second); + } + int64_t t0 = ggml_time_ms(); + apply_loras(lora_f2m); + int64_t t1 = ggml_time_ms(); + LOG_INFO("apply_loras completed, taking %.2fs", (t1 - t0) * 1.0f / 1000); + LOG_DEBUG("prompt after extract and remove lora: \"%s\"", result_pair.second.c_str()); + return result_pair.second; + } + ggml_tensor* id_encoder(ggml_context* work_ctx, ggml_tensor* init_img, ggml_tensor* prompts_embeds, @@ -759,15 +786,15 @@ public: sd_image_t init_image, int width, int height, - int fps = 6, - int motion_bucket_id = 127, - float augmentation_level = 0.f, - bool force_zero_embeddings = false) { + int fps = 6, + int motion_bucket_id = 127, + float augmentation_level = 0.f, + bool zero_out_masked = false) { // c_crossattn int64_t t0 = ggml_time_ms(); struct ggml_tensor* c_crossattn = NULL; { - if (force_zero_embeddings) { + if (zero_out_masked) { c_crossattn = ggml_new_tensor_1d(work_ctx, GGML_TYPE_F32, clip_vision->vision_model.projection_dim); ggml_set_f32(c_crossattn, 0.f); } else { @@ -790,7 +817,7 @@ public: // c_concat struct ggml_tensor* c_concat = NULL; { - if (force_zero_embeddings) { + if (zero_out_masked) { c_concat = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, width / 8, height / 8, 4, 1); ggml_set_f32(c_concat, 0.f); } else { @@ -855,28 +882,14 @@ public: float img_cfg_scale = guidance.img_cfg; float slg_scale = guidance.slg.scale; - float min_cfg = guidance.min_cfg; + LOG_DEBUG("cfg_scale %.2f", cfg_scale); if (img_cfg_scale != cfg_scale && !sd_version_is_inpaint_or_unet_edit(version)) { LOG_WARN("2-conditioning CFG is not supported with this model, disabling it for better performance..."); img_cfg_scale = cfg_scale; } - LOG_DEBUG("Sample"); - struct ggml_init_params params; - size_t data_size = ggml_row_size(init_latent->type, init_latent->ne[0]); - for (int i = 1; i < 4; i++) { - data_size *= init_latent->ne[i]; - } - data_size += 1024; - params.mem_size = data_size * 3; - params.mem_buffer = NULL; - params.no_alloc = false; - ggml_context* tmp_ctx = ggml_init(params); - - size_t steps = sigmas.size() - 1; - // noise = load_tensor_from_file(work_ctx, "./rand0.bin"); - // print_ggml_tensor(noise); + size_t steps = sigmas.size() - 1; struct ggml_tensor* x = ggml_dup_tensor(work_ctx, init_latent); copy_ggml_tensor(x, init_latent); x = denoiser->noise_scaling(sigmas[0], noise, x); @@ -922,9 +935,9 @@ public: float c_in = scaling[2]; float t = denoiser->sigma_to_t(sigma); - std::vector<float> timesteps_vec(x->ne[3], t); // [N, ] + std::vector<float> timesteps_vec(1, t); // [N, ] auto timesteps = vector_to_ggml_tensor(work_ctx, timesteps_vec); - std::vector<float> guidance_vec(x->ne[3], guidance.distilled_guidance); + std::vector<float> guidance_vec(1, guidance.distilled_guidance); auto guidance_tensor = vector_to_ggml_tensor(work_ctx, guidance_vec); copy_ggml_tensor(noised_input, input); @@ -1038,18 +1051,12 @@ public: float latent_result = positive_data[i]; if (has_unconditioned) { // out_uncond + cfg_scale * (out_cond - out_uncond) - int64_t ne3 = out_cond->ne[3]; - if (min_cfg != cfg_scale && ne3 != 1) { - int64_t i3 = i / out_cond->ne[0] * out_cond->ne[1] * out_cond->ne[2]; - float scale = min_cfg + (cfg_scale - min_cfg) * (i3 * 1.0f / ne3); + if (has_img_cond) { + // out_uncond + text_cfg_scale * (out_cond - out_img_cond) + image_cfg_scale * (out_img_cond - out_uncond) + latent_result = negative_data[i] + img_cfg_scale * (img_cond_data[i] - negative_data[i]) + cfg_scale * (positive_data[i] - img_cond_data[i]); } else {
- if (has_img_cond) { - // out_uncond + text_cfg_scale * (out_cond - out_img_cond) + image_cfg_scale * (out_img_cond - out_uncond) - latent_result = negative_data[i] + img_cfg_scale * (img_cond_data[i] - negative_data[i]) + cfg_scale * (positive_data[i] - img_cond_data[i]); - } else { - // img_cfg_scale == cfg_scale - latent_result = negative_data[i] + cfg_scale * (positive_data[i] - negative_data[i]); - } + // img_cfg_scale == cfg_scale + latent_result = negative_data[i] + cfg_scale * (positive_data[i] - negative_data[i]); } } else if (has_img_cond) { // img_cfg_scale == 1 @@ -1085,6 +1092,7 @@ public: sample_k_diffusion(method, denoise, work_ctx, x, sigmas, rng, eta); + LOG_DEBUG("sigmas[sigmas.size() - 1] %f", sigmas[sigmas.size() - 1]); x = denoiser->inverse_noise_scaling(sigmas[sigmas.size() - 1], x); if (control_net) { @@ -1101,7 +1109,6 @@ public: ggml_tensor* latent = ggml_new_tensor_4d(work_ctx, moments->type, moments->ne[0], moments->ne[1], moments->ne[2] / 2, moments->ne[3]); struct ggml_tensor* noise = ggml_dup_tensor(work_ctx, latent); ggml_tensor_set_f32_randn(noise, rng); - // noise = load_tensor_from_file(work_ctx, "noise.bin"); { float mean = 0; float logvar = 0; @@ -1127,9 +1134,9 @@ public: return latent; } - ggml_tensor* compute_first_stage(ggml_context* work_ctx, ggml_tensor* x, bool decode) { - int64_t W = x->ne[0]; - int64_t H = x->ne[1]; + ggml_tensor* encode_first_stage(ggml_context* work_ctx, ggml_tensor* x) { + int64_t W = x->ne[0] / 8; + int64_t H = x->ne[1] / 8; int64_t C = 8; if (use_tiny_autoencoder) { C = 4; @@ -1140,59 +1147,106 @@ public: C = 32; } } - ggml_tensor* result = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, - decode ? (W * 8) : (W / 8), // width - decode ? (H * 8) : (H / 8), // height - decode ? 3 : C, - x->ne[3]); // channels + ggml_tensor* result = ggml_new_tensor_4d(work_ctx, + GGML_TYPE_F32, + W, + H, + C, + x->ne[3]); int64_t t0 = ggml_time_ms(); if (!use_tiny_autoencoder) { - if (decode) { - ggml_tensor_scale(x, 1.0f / scale_factor); - } else { - ggml_tensor_scale_input(x); + ggml_tensor_scale_input(x); + first_stage_model->compute(n_threads, x, false, &result, NULL); + first_stage_model->free_compute_buffer(); + } else { + tae_first_stage->compute(n_threads, x, false, &result, NULL); + tae_first_stage->free_compute_buffer(); + } + + int64_t t1 = ggml_time_ms(); + LOG_DEBUG("computing vae encode graph completed, taking %.2fs", (t1 - t0) * 1.0f / 1000); + return result; + } + + void process_latent_out(ggml_tensor* latent) { + if (sd_version_is_wan(version)) { + GGML_ASSERT(latent->ne[3] == 16); + std::vector<float> latents_mean_vec = {-0.7571f, -0.7089f, -0.9113f, 0.1075f, -0.1745f, 0.9653f, -0.1517f, 1.5508f, + 0.4134f, -0.0715f, 0.5517f, -0.3632f, -0.1922f, -0.9497f, 0.2503f, -0.2921f}; + std::vector<float> latents_std_vec = {2.8184f, 1.4541f, 2.3275f, 2.6558f, 1.2196f, 1.7708f, 2.6052f, 2.0743f, + 3.2687f, 2.1526f, 2.8652f, 1.5579f, 1.6382f, 1.1253f, 2.8251f, 1.9160f}; + for (int i = 0; i < latent->ne[3]; i++) { + float mean = latents_mean_vec[i]; + float std_ = latents_std_vec[i]; + for (int j = 0; j < latent->ne[2]; j++) { + for (int k = 0; k < latent->ne[1]; k++) { + for (int l = 0; l < latent->ne[0]; l++) { + float value = ggml_tensor_get_f32(latent, l, k, j, i); + value = value * std_ / scale_factor + mean; + ggml_tensor_set_f32(latent, value, l, k, j, i); + } + } + } } - if (vae_tiling && decode) { // TODO: support tiling vae encode + } else { + ggml_tensor_scale(latent, 1.0f / scale_factor); + } + } + + ggml_tensor*
decode_first_stage(ggml_context* work_ctx, ggml_tensor* x, bool decode_video = false) { + int64_t W = x->ne[0] * 8; + int64_t H = x->ne[1] * 8; + int64_t C = 3; + ggml_tensor* result; + if (decode_video) { + result = ggml_new_tensor_4d(work_ctx, + GGML_TYPE_F32, + W, + H, + x->ne[2], + 3); + } else { + result = ggml_new_tensor_4d(work_ctx, + GGML_TYPE_F32, + W, + H, + C, + x->ne[3]); + } + + int64_t t0 = ggml_time_ms(); + if (!use_tiny_autoencoder) { + LOG_DEBUG("scale_factor %.2f", scale_factor); + process_latent_out(x); + if (vae_tiling && !decode_video) { // split latent in 32x32 tiles and compute in several steps auto on_tiling = [&](ggml_tensor* in, ggml_tensor* out, bool init) { - first_stage_model->compute(n_threads, in, decode, &out); + first_stage_model->compute(n_threads, in, true, &out, NULL); }; sd_tiling(x, result, 8, 32, 0.5f, on_tiling); } else { - first_stage_model->compute(n_threads, x, decode, &result); + first_stage_model->compute(n_threads, x, true, &result, NULL); } first_stage_model->free_compute_buffer(); - if (decode) { - ggml_tensor_scale_output(result); - } + ggml_tensor_scale_output(result); } else { - if (vae_tiling && decode) { // TODO: support tiling vae encode + if (vae_tiling && !decode_video) { // split latent in 64x64 tiles and compute in several steps auto on_tiling = [&](ggml_tensor* in, ggml_tensor* out, bool init) { - tae_first_stage->compute(n_threads, in, decode, &out); + tae_first_stage->compute(n_threads, in, true, &out); }; sd_tiling(x, result, 8, 64, 0.5f, on_tiling); } else { - tae_first_stage->compute(n_threads, x, decode, &result); + tae_first_stage->compute(n_threads, x, true, &result); } tae_first_stage->free_compute_buffer(); } int64_t t1 = ggml_time_ms(); - LOG_DEBUG("computing vae [mode: %s] graph completed, taking %.2fs", decode ? 
"DECODE" : "ENCODE", (t1 - t0) * 1.0f / 1000); - if (decode) { - ggml_tensor_clamp(result, 0.0f, 1.0f); - } + LOG_DEBUG("computing vae decode graph completed, taking %.2fs", (t1 - t0) * 1.0f / 1000); + ggml_tensor_clamp(result, 0.0f, 1.0f); return result; } - - ggml_tensor* encode_first_stage(ggml_context* work_ctx, ggml_tensor* x) { - return compute_first_stage(work_ctx, x, false); - } - - ggml_tensor* decode_first_stage(ggml_context* work_ctx, ggml_tensor* x) { - return compute_first_stage(work_ctx, x, true); - } }; /*================================================= SD API ==================================================*/ @@ -1373,7 +1427,6 @@ void sd_img_gen_params_init(sd_img_gen_params_t* sd_img_gen_params) { memset((void*)sd_img_gen_params, 0, sizeof(sd_img_gen_params_t)); sd_img_gen_params->clip_skip = -1; sd_img_gen_params->guidance.txt_cfg = 7.0f; - sd_img_gen_params->guidance.min_cfg = 1.0f; sd_img_gen_params->guidance.img_cfg = INFINITY; sd_img_gen_params->guidance.distilled_guidance = 3.5f; sd_img_gen_params->guidance.slg.layer_count = 0; @@ -1406,7 +1459,6 @@ char* sd_img_gen_params_to_str(const sd_img_gen_params_t* sd_img_gen_params) { "clip_skip: %d\n" "txt_cfg: %.2f\n" "img_cfg: %.2f\n" - "min_cfg: %.2f\n" "distilled_guidance: %.2f\n" "slg.layer_count: %zu\n" "slg.layer_start: %.2f\n" @@ -1431,7 +1483,6 @@ char* sd_img_gen_params_to_str(const sd_img_gen_params_t* sd_img_gen_params) { sd_img_gen_params->clip_skip, sd_img_gen_params->guidance.txt_cfg, sd_img_gen_params->guidance.img_cfg, - sd_img_gen_params->guidance.min_cfg, sd_img_gen_params->guidance.distilled_guidance, sd_img_gen_params->guidance.slg.layer_count, sd_img_gen_params->guidance.slg.layer_start, @@ -1457,7 +1508,6 @@ char* sd_img_gen_params_to_str(const sd_img_gen_params_t* sd_img_gen_params) { void sd_vid_gen_params_init(sd_vid_gen_params_t* sd_vid_gen_params) { memset((void*)sd_vid_gen_params, 0, sizeof(sd_vid_gen_params_t)); sd_vid_gen_params->guidance.txt_cfg = 7.0f; - sd_vid_gen_params->guidance.min_cfg = 1.0f; sd_vid_gen_params->guidance.img_cfg = INFINITY; sd_vid_gen_params->guidance.distilled_guidance = 3.5f; sd_vid_gen_params->guidance.slg.layer_count = 0; @@ -1471,9 +1521,6 @@ void sd_vid_gen_params_init(sd_vid_gen_params_t* sd_vid_gen_params) { sd_vid_gen_params->strength = 0.75f; sd_vid_gen_params->seed = -1; sd_vid_gen_params->video_frames = 6; - sd_vid_gen_params->motion_bucket_id = 127; - sd_vid_gen_params->fps = 6; - sd_vid_gen_params->augmentation_level = 0.f; } struct sd_ctx_t { @@ -1545,21 +1592,9 @@ sd_image_t* generate_image_internal(sd_ctx_t* sd_ctx, int sample_steps = sigmas.size() - 1; - // Apply lora - auto result_pair = extract_and_remove_lora(prompt); - std::unordered_map lora_f2m = result_pair.first; // lora_name -> multiplier - - for (auto& kv : lora_f2m) { - LOG_DEBUG("lora %s:%.2f", kv.first.c_str(), kv.second); - } - - prompt = result_pair.second; - LOG_DEBUG("prompt after extract and remove lora: \"%s\"", prompt.c_str()); - int64_t t0 = ggml_time_ms(); - sd_ctx->sd->apply_loras(lora_f2m); - int64_t t1 = ggml_time_ms(); - LOG_INFO("apply_loras completed, taking %.2fs", (t1 - t0) * 1.0f / 1000); + // Apply lora + prompt = sd_ctx->sd->apply_loras_from_prompt(prompt); // Photo Maker std::string prompt_text_only; @@ -1568,9 +1603,9 @@ sd_image_t* generate_image_internal(sd_ctx_t* sd_ctx, std::vector class_tokens_mask; if (sd_ctx->sd->stacked_id) { if (!sd_ctx->sd->pmid_lora->applied) { - t0 = ggml_time_ms(); + int64_t t0 = ggml_time_ms(); 
sd_ctx->sd->pmid_lora->apply(sd_ctx->sd->tensors, sd_ctx->sd->version, sd_ctx->sd->n_threads); - t1 = ggml_time_ms(); + int64_t t1 = ggml_time_ms(); sd_ctx->sd->pmid_lora->applied = true; LOG_INFO("pmid_lora apply completed, taking %.2fs", (t1 - t0) * 1.0f / 1000); if (sd_ctx->sd->free_params_immediately) { @@ -1625,7 +1660,7 @@ sd_image_t* generate_image_internal(sd_ctx_t* sd_ctx, else sd_mul_images_to_tensor(init_image->data, init_img, i, NULL, NULL); } - t0 = ggml_time_ms(); + int64_t t0 = ggml_time_ms(); auto cond_tup = sd_ctx->sd->cond_stage_model->get_learned_condition_with_trigger(work_ctx, sd_ctx->sd->n_threads, prompt, clip_skip, @@ -1642,7 +1677,7 @@ sd_image_t* generate_image_internal(sd_ctx_t* sd_ctx, // print_ggml_tensor(id_embeds, true, "id_embeds:"); } id_cond.c_crossattn = sd_ctx->sd->id_encoder(work_ctx, init_img, id_cond.c_crossattn, id_embeds, class_tokens_mask); - t1 = ggml_time_ms(); + int64_t t1 = ggml_time_ms(); LOG_INFO("Photomaker ID Stacking, taking %" PRId64 " ms", t1 - t0); if (sd_ctx->sd->free_params_immediately) { sd_ctx->sd->pmid_model->free_params_buffer(); @@ -1679,9 +1714,9 @@ sd_image_t* generate_image_internal(sd_ctx_t* sd_ctx, SDCondition uncond; if (guidance.txt_cfg != 1.0 || (sd_version_is_inpaint_or_unet_edit(sd_ctx->sd->version) && guidance.txt_cfg != guidance.img_cfg)) { - bool force_zero_embeddings = false; + bool zero_out_masked = false; if (sd_version_is_sdxl(sd_ctx->sd->version) && negative_prompt.size() == 0 && !sd_ctx->sd->is_using_edm_v_parameterization) { - force_zero_embeddings = true; + zero_out_masked = true; } uncond = sd_ctx->sd->cond_stage_model->get_learned_condition(work_ctx, sd_ctx->sd->n_threads, @@ -1690,9 +1725,9 @@ sd_image_t* generate_image_internal(sd_ctx_t* sd_ctx, width, height, sd_ctx->sd->diffusion_model->get_adm_in_channels(), - force_zero_embeddings); + zero_out_masked); } - t1 = ggml_time_ms(); + int64_t t1 = ggml_time_ms(); LOG_INFO("get_learned_condition completed, taking %" PRId64 " ms", t1 - t0); if (sd_ctx->sd->free_params_immediately) { @@ -1780,9 +1815,6 @@ sd_image_t* generate_image_internal(sd_ctx_t* sd_ctx, LOG_INFO("PHOTOMAKER: start_merge_step: %d", start_merge_step); } - // Disable min_cfg - guidance.min_cfg = guidance.txt_cfg; - struct ggml_tensor* x_0 = sd_ctx->sd->sample(work_ctx, x_t, noise, @@ -1799,8 +1831,6 @@ sd_image_t* generate_image_internal(sd_ctx_t* sd_ctx, id_cond, ref_latents, denoise_mask); - - // struct ggml_tensor* x_0 = load_tensor_from_file(ctx, "samples_ddim.bin"); // print_ggml_tensor(x_0); int64_t sampling_end = ggml_time_ms(); LOG_INFO("sampling completed, taking %.2fs", (sampling_end - sampling_start) * 1.0f / 1000); @@ -1852,16 +1882,25 @@ sd_image_t* generate_image_internal(sd_ctx_t* sd_ctx, ggml_tensor* generate_init_latent(sd_ctx_t* sd_ctx, ggml_context* work_ctx, int width, - int height) { + int height, + int frames = 1, + bool video = false) { int C = 4; if (sd_version_is_sd3(sd_ctx->sd->version)) { C = 16; } else if (sd_version_is_flux(sd_ctx->sd->version)) { C = 16; + } else if (sd_version_is_wan(sd_ctx->sd->version)) { + C = 16; + } + int W = width / 8; + int H = height / 8; + ggml_tensor* init_latent; + if (video) { + init_latent = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, W, H, frames, C); + } else { + init_latent = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, W, H, C, 1); } - int W = width / 8; - int H = height / 8; - ggml_tensor* init_latent = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, W, H, C, 1); if (sd_version_is_sd3(sd_ctx->sd->version)) { 
ggml_set_f32(init_latent, 0.0609f); } else if (sd_version_is_flux(sd_ctx->sd->version)) { @@ -1877,11 +1916,17 @@ sd_image_t* generate_image(sd_ctx_t* sd_ctx, const sd_img_gen_params_t* sd_img_g int height = sd_img_gen_params->height; if (sd_version_is_dit(sd_ctx->sd->version)) { if (width % 16 || height % 16) { - LOG_ERROR("Image dimensions must be must be a multiple of 16 on each axis for %s models. (Got %dx%d)", model_version_to_str[sd_ctx->sd->version], width, height); + LOG_ERROR("Image dimensions must be a multiple of 16 on each axis for %s models. (Got %dx%d)", + model_version_to_str[sd_ctx->sd->version], + width, + height); return NULL; } } else if (width % 64 || height % 64) { - LOG_ERROR("Image dimensions must be must be a multiple of 64 on each axis for %s models. (Got %dx%d)", model_version_to_str[sd_ctx->sd->version], width, height); + LOG_ERROR("Image dimensions must be a multiple of 64 on each axis for %s models. (Got %dx%d)", + model_version_to_str[sd_ctx->sd->version], + width, + height); return NULL; } LOG_DEBUG("generate_image %dx%d", width, height); @@ -2095,20 +2140,23 @@ SD_API sd_image_t* generate_video(sd_ctx_t* sd_ctx, const sd_vid_gen_params_t* s return NULL; } + std::string prompt = SAFE_STR(sd_vid_gen_params->prompt); + std::string negative_prompt = SAFE_STR(sd_vid_gen_params->negative_prompt); + int width = sd_vid_gen_params->width; int height = sd_vid_gen_params->height; - LOG_INFO("img2vid %dx%d", width, height); + int frames = sd_vid_gen_params->video_frames; + LOG_INFO("img2vid %dx%dx%d", width, height, frames); std::vector<float> sigmas = sd_ctx->sd->denoiser->get_sigmas(sd_vid_gen_params->sample_steps); struct ggml_init_params params; - params.mem_size = static_cast<size_t>(10 * 1024) * 1024; // 10 MB - params.mem_size += width * height * 3 * sizeof(float) * sd_vid_gen_params->video_frames; + params.mem_size = static_cast<size_t>(100 * 1024) * 1024; // 100 MB + params.mem_size += width * height * frames * 3 * sizeof(float); params.mem_buffer = NULL; params.no_alloc = false; // LOG_DEBUG("mem_size %u ", params.mem_size); - // draft context struct ggml_context* work_ctx = ggml_init(params); if (!work_ctx) { LOG_ERROR("ggml_init() failed"); @@ -2124,90 +2172,100 @@ SD_API sd_image_t* generate_video(sd_ctx_t* sd_ctx, const sd_vid_gen_params_t* s int64_t t0 = ggml_time_ms(); - SDCondition cond = sd_ctx->sd->get_svd_condition(work_ctx, - sd_vid_gen_params->init_image, - width, - height, - sd_vid_gen_params->fps, - sd_vid_gen_params->motion_bucket_id, - sd_vid_gen_params->augmentation_level); - - auto uc_crossattn = ggml_dup_tensor(work_ctx, cond.c_crossattn); - ggml_set_f32(uc_crossattn, 0.f); - - auto uc_concat = ggml_dup_tensor(work_ctx, cond.c_concat); - ggml_set_f32(uc_concat, 0.f); - - auto uc_vector = ggml_dup_tensor(work_ctx, cond.c_vector); - - SDCondition uncond = SDCondition(uc_crossattn, uc_vector, uc_concat); + ggml_tensor* init_latent = generate_init_latent(sd_ctx, work_ctx, width, height, frames, true); + int sample_steps = sigmas.size() - 1; + // Apply lora + prompt = sd_ctx->sd->apply_loras_from_prompt(prompt); + // Get learned condition + bool zero_out_masked = true; + t0 = ggml_time_ms(); + SDCondition cond = sd_ctx->sd->cond_stage_model->get_learned_condition(work_ctx, + sd_ctx->sd->n_threads, + prompt, + sd_vid_gen_params->clip_skip, + width, + height, + sd_ctx->sd->diffusion_model->get_adm_in_channels(), + zero_out_masked); + SDCondition uncond; + if (sd_vid_gen_params->guidance.txt_cfg != 1.0) { + uncond =
sd_ctx->sd->cond_stage_model->get_learned_condition(work_ctx, + sd_ctx->sd->n_threads, + negative_prompt, + sd_vid_gen_params->clip_skip, + width, + height, + sd_ctx->sd->diffusion_model->get_adm_in_channels(), + zero_out_masked); + } int64_t t1 = ggml_time_ms(); LOG_INFO("get_learned_condition completed, taking %" PRId64 " ms", t1 - t0); + if (sd_ctx->sd->free_params_immediately) { - sd_ctx->sd->clip_vision->free_params_buffer(); + sd_ctx->sd->cond_stage_model->free_params_buffer(); } - sd_ctx->sd->rng->manual_seed(seed); - int C = 4; - int W = width / 8; - int H = height / 8; - struct ggml_tensor* x_t = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, W, H, C, sd_vid_gen_params->video_frames); - ggml_set_f32(x_t, 0.f); + int W = width / 8; + int H = height / 8; + int T = frames; + int C = 16; - struct ggml_tensor* noise = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, W, H, C, sd_vid_gen_params->video_frames); - ggml_tensor_set_f32_randn(noise, sd_ctx->sd->rng); + struct ggml_tensor* final_latent; + // Sample + { + int64_t sampling_start = ggml_time_ms(); + struct ggml_tensor* x_t = init_latent; + struct ggml_tensor* noise = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, W, H, T, C); + ggml_tensor_set_f32_randn(noise, sd_ctx->sd->rng); - LOG_INFO("sampling using %s method", sampling_methods_str[sd_vid_gen_params->sample_method]); - struct ggml_tensor* x_0 = sd_ctx->sd->sample(work_ctx, - x_t, - noise, - cond, - uncond, - {}, - {}, - 0.f, - sd_vid_gen_params->guidance, - 0.f, - sd_vid_gen_params->sample_method, - sigmas, - -1, - SDCondition(NULL, NULL, NULL)); + final_latent = sd_ctx->sd->sample(work_ctx, + x_t, + noise, + cond, + uncond, + {}, + NULL, + 0, + sd_vid_gen_params->guidance, + sd_vid_gen_params->eta, + sd_vid_gen_params->sample_method, + sigmas, + -1, + {}); + + int64_t sampling_end = ggml_time_ms(); + LOG_INFO("sampling completed, taking %.2fs", (sampling_end - sampling_start) * 1.0f / 1000); + } - int64_t t2 = ggml_time_ms(); - LOG_INFO("sampling completed, taking %.2fs", (t2 - t1) * 1.0f / 1000); if (sd_ctx->sd->free_params_immediately) { sd_ctx->sd->diffusion_model->free_params_buffer(); } - struct ggml_tensor* img = sd_ctx->sd->decode_first_stage(work_ctx, x_0); + int64_t t3 = ggml_time_ms(); + LOG_INFO("generating latent video completed, taking %.2fs", (t3 - t1) * 1.0f / 1000); + struct ggml_tensor* vid = sd_ctx->sd->decode_first_stage(work_ctx, final_latent, true); + int64_t t4 = ggml_time_ms(); + LOG_INFO("decode_first_stage completed, taking %.2fs", (t4 - t3) * 1.0f / 1000); if (sd_ctx->sd->free_params_immediately) { sd_ctx->sd->first_stage_model->free_params_buffer(); } - if (img == NULL) { - ggml_free(work_ctx); - return NULL; - } - sd_image_t* result_images = (sd_image_t*)calloc(sd_vid_gen_params->video_frames, sizeof(sd_image_t)); + sd_image_t* result_images = (sd_image_t*)calloc(T, sizeof(sd_image_t)); if (result_images == NULL) { ggml_free(work_ctx); return NULL; } - for (size_t i = 0; i < sd_vid_gen_params->video_frames; i++) { - auto img_i = ggml_view_3d(work_ctx, img, img->ne[0], img->ne[1], img->ne[2], img->nb[1], img->nb[2], img->nb[3] * i); - - result_images[i].width = width; - result_images[i].height = height; + for (size_t i = 0; i < T; i++) { + result_images[i].width = final_latent->ne[0] * 8; + result_images[i].height = final_latent->ne[1] * 8; result_images[i].channel = 3; - result_images[i].data = sd_tensor_to_image(img_i); + result_images[i].data = sd_tensor_to_image(vid, i, true); } ggml_free(work_ctx); - int64_t t3 = ggml_time_ms(); - - LOG_INFO("img2vid 
completed in %.2fs", (t3 - t0) * 1.0f / 1000); + LOG_INFO("img2vid completed in %.2fs", (t4 - t0) * 1.0f / 1000); return result_images; } diff --git a/stable-diffusion.h b/stable-diffusion.h index a603259..6c4cc96 100644 --- a/stable-diffusion.h +++ b/stable-diffusion.h @@ -157,7 +157,6 @@ typedef struct { typedef struct { float txt_cfg; float img_cfg; - float min_cfg; float distilled_guidance; sd_slg_params_t slg; } sd_guidance_params_t; @@ -187,18 +186,19 @@ typedef struct { } sd_img_gen_params_t; typedef struct { + const char* prompt; + const char* negative_prompt; + int clip_skip; + sd_guidance_params_t guidance; sd_image_t init_image; int width; int height; - sd_guidance_params_t guidance; enum sample_method_t sample_method; int sample_steps; + float eta; float strength; int64_t seed; int video_frames; - int motion_bucket_id; - int fps; - float augmentation_level; } sd_vid_gen_params_t; typedef struct sd_ctx_t sd_ctx_t; diff --git a/vae.hpp b/vae.hpp index 41f53ee..fcbe091 100644 --- a/vae.hpp +++ b/vae.hpp @@ -520,7 +520,18 @@ public: } }; -struct AutoEncoderKL : public GGMLRunner { +struct VAE : public GGMLRunner { + VAE(ggml_backend_t backend) + : GGMLRunner(backend) {} + virtual void compute(const int n_threads, + struct ggml_tensor* z, + bool decode_graph, + struct ggml_tensor** output, + struct ggml_context* output_ctx) = 0; + virtual void get_param_tensors(std::map& tensors, const std::string prefix) = 0; +}; + +struct AutoEncoderKL : public VAE { bool decode_only = true; AutoencodingEngine ae; @@ -530,7 +541,7 @@ struct AutoEncoderKL : public GGMLRunner { bool decode_only = false, bool use_video_decoder = false, SDVersion version = VERSION_SD1) - : decode_only(decode_only), ae(decode_only, use_video_decoder, version), GGMLRunner(backend) { + : decode_only(decode_only), ae(decode_only, use_video_decoder, version), VAE(backend) { ae.init(params_ctx, tensor_types, prefix); } diff --git a/wan.hpp b/wan.hpp index 3882a01..18dc07a 100644 --- a/wan.hpp +++ b/wan.hpp @@ -7,6 +7,7 @@ #include "flux.hpp" #include "ggml_extend.hpp" #include "rope.hpp" +#include "vae.hpp" namespace WAN { @@ -522,7 +523,6 @@ namespace WAN { for (int i = 0; i < dims.size() - 1; i++) { in_dim = dims[i]; out_dim = dims[i + 1]; - LOG_DEBUG("in_dim %u out_dim %u", in_dim, out_dim); if (i == 1 || i == 2 || i == 3) { in_dim = in_dim / 2; } @@ -726,7 +726,7 @@ namespace WAN { } }; - struct WanVAERunner : public GGMLRunner { + struct WanVAERunner : public VAE { bool decode_only = true; WanVAE ae; @@ -734,7 +734,7 @@ namespace WAN { const String2GGMLType& tensor_types = {}, const std::string prefix = "", bool decode_only = false) - : decode_only(decode_only), ae(decode_only), GGMLRunner(backend) { + : decode_only(decode_only), ae(decode_only), VAE(backend) { ae.init(params_ctx, tensor_types, prefix); } @@ -1217,13 +1217,13 @@ namespace WAN { int64_t axes_dim_sum = 128; }; - class WanModel : public GGMLBlock { + class Wan : public GGMLBlock { protected: WanParams params; public: - WanModel() {} - WanModel(WanParams params) + Wan() {} + Wan(WanParams params) : params(params) { // patch_embedding blocks["patch_embedding"] = std::shared_ptr(new Conv3d(params.in_dim, params.dim, params.patch_size, params.patch_size)); @@ -1418,14 +1418,15 @@ namespace WAN { struct WanRunner : public GGMLRunner { public: WanParams wan_params; - WanModel wan; + Wan wan; std::vector pe_vec; SDVersion version; WanRunner(ggml_backend_t backend, const String2GGMLType& tensor_types = {}, const std::string prefix = "", - SDVersion version = 
VERSION_WAN_2_1) + SDVersion version = VERSION_WAN2, + bool flash_attn = false) : GGMLRunner(backend) { wan_params.num_layers = 0; for (auto pair : tensor_types) { @@ -1476,7 +1477,7 @@ namespace WAN { GGML_ABORT("invalid num_layers(%d) of wan", wan_params.num_layers); } - wan = WanModel(wan_params); + wan = Wan(wan_params); wan.init(params_ctx, tensor_types, prefix); }
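Reviewer note on the guidance change: with min_cfg removed, the per-element CFG combination in generate_image_internal reduces to the two cases below. A minimal standalone sketch of that blend; the helper name and signature are illustrative, not part of the patch:

// Per-element guidance blend left after the min_cfg removal.
// cond/uncond/img_cond are the model outputs for one element.
static float cfg_blend(float cond, float uncond, float img_cond,
                       float cfg_scale, float img_cfg_scale, bool has_img_cond) {
    if (has_img_cond) {
        // out_uncond + img_cfg * (out_img_cond - out_uncond) + cfg * (out_cond - out_img_cond)
        return uncond + img_cfg_scale * (img_cond - uncond) + cfg_scale * (cond - img_cond);
    }
    // img_cfg_scale == cfg_scale collapses to plain CFG
    return uncond + cfg_scale * (cond - uncond);
}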
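Reviewer note on the sd_tensor_to_image(input, idx, video) overload in ggml_extend.hpp: image batches keep channels on ne[2] and the batch index on ne[3], while decoded video keeps frames on ne[2] and channels on ne[3]. A hypothetical accessor spelling out the two layouts the function now walks:

// image batch: ne = [W, H, C, N] -> element (x, y, c) of image n at (x, y, c, n)
// video:       ne = [W, H, T, C] -> element (x, y, c) of frame t at (x, y, t, c)
static float get_pixel(struct ggml_tensor* t, int x, int y, int c, int idx, bool video) {
    return video ? ggml_tensor_get_f32(t, x, y, idx, c)   // idx = frame index
                 : ggml_tensor_get_f32(t, x, y, c, idx);  // idx = batch index
}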
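Reviewer note on the reworked public video API: sd_vid_gen_params_t now carries prompt, negative_prompt, clip_skip, and eta directly, and the SVD-only fields (motion_bucket_id, fps, augmentation_level) are gone, as is guidance.min_cfg. A minimal caller sketch under those assumptions; the prompt and sizes are placeholders:

#include "stable-diffusion.h"

sd_image_t* run_text_to_video(sd_ctx_t* sd_ctx) {
    sd_vid_gen_params_t p;
    sd_vid_gen_params_init(&p);  // defaults: txt_cfg 7.0, strength 0.75, seed -1, video_frames 6
    p.prompt          = "a cat surfing a wave";
    p.negative_prompt = "";
    p.clip_skip       = -1;
    p.width           = 480;     // must respect the model's dimension constraints
    p.height          = 480;
    p.video_frames    = 16;
    p.sample_steps    = 20;
    // returns video_frames frames (or NULL); caller frees each frame's data and the array
    return generate_video(sd_ctx, &p);
}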