diff --git a/ggml_extend.hpp b/ggml_extend.hpp
index 3d50207..1a630ac 100644
--- a/ggml_extend.hpp
+++ b/ggml_extend.hpp
@@ -2238,15 +2238,15 @@ public:
             forward_params.linear.scale          = scale;
             return ctx->weight_adapter->forward_with_lora(ctx->ggml_ctx, x, w, b, prefix, forward_params);
         }
-        auto x0 = ggml_ext_linear(ctx->ggml_ctx, x, w, b, force_prec_f32, scale);
+        auto out = ggml_ext_linear(ctx->ggml_ctx, x, w, b, force_prec_f32, scale);
         for (int i = 1; i < out_features_vec.size(); i++) {
-            auto wi = params["weight." + std::to_string(i)];
-            auto bi = bias ? params["bias." + std::to_string(i)] : nullptr;
-            auto xi = ggml_ext_linear(ctx->ggml_ctx, x, wi, bi, force_prec_f32, scale);
-            x0 = ggml_concat(ctx->ggml_ctx, x0, xi, 0);
+            auto wi       = params["weight." + std::to_string(i)];
+            auto bi       = bias ? params["bias." + std::to_string(i)] : nullptr;
+            auto curr_out = ggml_ext_linear(ctx->ggml_ctx, x, wi, bi, force_prec_f32, scale);
+            out           = ggml_concat(ctx->ggml_ctx, out, curr_out, 0);
         }
 
-        return x0;
+        return out;
     }
 };
 
diff --git a/rope.hpp b/rope.hpp
index 95def62..0c18c0a 100644
--- a/rope.hpp
+++ b/rope.hpp
@@ -180,10 +180,11 @@ namespace Rope {
                                                                    int start_index,
                                                                    const std::vector<ggml_tensor*>& ref_latents,
                                                                    bool increase_ref_index,
-                                                                   float ref_index_scale) {
+                                                                   float ref_index_scale,
+                                                                    int base_offset = 0) {
         std::vector<std::vector<float>> ids;
-        uint64_t curr_h_offset = 0;
-        uint64_t curr_w_offset = 0;
+        uint64_t curr_h_offset = base_offset;
+        uint64_t curr_w_offset = base_offset;
         int index              = start_index;
         for (ggml_tensor* ref : ref_latents) {
             uint64_t h_offset = 0;
@@ -227,15 +228,15 @@ namespace Rope {
                                                                    bool increase_ref_index,
                                                                    float ref_index_scale,
                                                                    bool is_longcat) {
-        int start_index = is_longcat ? 1 : 0;
+        int x_index = is_longcat ? 1 : 0;
 
         auto txt_ids = is_longcat ? gen_longcat_txt_ids(bs, context_len, axes_dim_num) : gen_flux_txt_ids(bs, context_len, axes_dim_num, txt_arange_dims);
         int offset   = is_longcat ? context_len : 0;
-        auto img_ids = gen_flux_img_ids(h, w, patch_size, bs, axes_dim_num, start_index, offset, offset);
+        auto img_ids = gen_flux_img_ids(h, w, patch_size, bs, axes_dim_num, x_index, offset, offset);
 
         auto ids = concat_ids(txt_ids, img_ids, bs);
         if (ref_latents.size() > 0) {
-            auto refs_ids = gen_refs_ids(patch_size, bs, axes_dim_num, start_index + 1, ref_latents, increase_ref_index, ref_index_scale);
+            auto refs_ids = gen_refs_ids(patch_size, bs, axes_dim_num, x_index + 1, ref_latents, increase_ref_index, ref_index_scale, offset);
             ids           = concat_ids(ids, refs_ids, bs);
         }
         return ids;
diff --git a/stable-diffusion.cpp b/stable-diffusion.cpp
index eed5b0d..1e8f04a 100644
--- a/stable-diffusion.cpp
+++ b/stable-diffusion.cpp
@@ -456,6 +456,9 @@ public:
                                                                 sd_ctx_params->chroma_use_dit_mask);
             } else if (sd_version_is_longcat(version)) {
                 bool enable_vision = false;
+                if (!vae_decode_only) {
+                    enable_vision = true;
+                }
                 cond_stage_model = std::make_shared<LLMEmbedder>(clip_backend,
                                                                  offload_params_to_cpu,
                                                                  tensor_storage_map,
@@ -850,7 +853,7 @@ public:
                                 flow_shift = 1.15f;
                             }
                         }
-                        if(sd_version_is_longcat(version)) {
+                        if (sd_version_is_longcat(version)) {
                             flow_shift = 3.0f;
                         }
                     }
@@ -2244,6 +2247,7 @@ public:
             sd_version_is_qwen_image(version) ||
             sd_version_is_wan(version) ||
             sd_version_is_flux2(version) ||
+            sd_version_is_longcat(version) ||
             version == VERSION_CHROMA_RADIANCE) {
             latent = vae_output;
         } else if (version == VERSION_SD1_PIX2PIX) {