refactor: reorganize src model layout (#1615)

2026-06-09 15:56:39 +00:00 · 2026-06-07 03:21:12 +08:00 · 2026-06-07 03:21:12 +08:00 · f3fd359b58
commit f3fd359b58
parent dfb2390dd4
81 changed files with 407 additions and 385 deletions
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@ -210,6 +210,19 @@ file(GLOB SD_LIB_SOURCES CONFIGURE_DEPENDS
    "src/*.h"
    "src/*.cpp"
    "src/*.hpp"
+    "src/conditioning/*.h"
+    "src/conditioning/*.cpp"
+    "src/conditioning/*.hpp"
+    "src/core/*.h"
+    "src/core/*.cpp"
+    "src/core/*.hpp"
+    "src/model/*/*.h"
+    "src/model/*/*.cpp"
+    "src/model/*/*.hpp"
+    "src/runtime/*.h"
+    "src/runtime/*.cpp"
+    "src/runtime/*.hpp"
+    "src/runtime/*.inl"
    "src/model_io/*.h"
    "src/model_io/*.cpp"
    "src/tokenizers/*.h"
@ -312,6 +325,7 @@ add_subdirectory(thirdparty)

 target_link_libraries(${SD_LIB} PUBLIC ggml zip)
 target_include_directories(${SD_LIB} PUBLIC . src include)
+target_include_directories(${SD_LIB} PRIVATE src/core)
 target_include_directories(${SD_LIB} PUBLIC . thirdparty)
 target_compile_features(${SD_LIB} PUBLIC c_std_11 cxx_std_17)

--- a/format-code.sh
+++ b/format-code.sh
@ -1,10 +1,16 @@
-for f in src/*.cpp src/*.h src/*.hpp src/tokenizers/*.h src/tokenizers/*.cpp src/tokenizers/vocab/*.h src/tokenizers/vocab/*.cpp \
+for f in src/*.cpp src/*.h src/*.hpp \
+         src/conditioning/*.cpp src/conditioning/*.h src/conditioning/*.hpp \
+         src/core/*.cpp src/core/*.h src/core/*.hpp \
+         src/runtime/*.cpp src/runtime/*.h src/runtime/*.hpp \
+         src/model/*/*.cpp src/model/*/*.h src/model/*/*.hpp \
+         src/tokenizers/*.h src/tokenizers/*.cpp src/tokenizers/vocab/*.h src/tokenizers/vocab/*.cpp \
         src/model_io/*.h src/model_io/*.cpp examples/cli/*.cpp examples/cli/*.h examples/server/*.cpp \
         examples/common/*.hpp examples/common/*.h examples/common/*.cpp; do
+  [[ -e "$f" ]] || continue
  [[ "$f" == vocab* ]] && continue
  echo "formatting '$f'"
  # if [ "$f" != "stable-diffusion.h" ]; then
  #   clang-tidy -fix -p build_linux/ "$f"
  # fi
  clang-format -style=file -i "$f"
-done
+done
--- a/src/conditioning/conditioner.hpp
+++ b/src/conditioning/conditioner.hpp
@ -1,14 +1,14 @@
-#ifndef __CONDITIONER_HPP__
-#define __CONDITIONER_HPP__
+#ifndef __SD_CONDITIONING_CONDITIONER_HPP__
+#define __SD_CONDITIONING_CONDITIONER_HPP__

 #include <cmath>
 #include <limits>
 #include <optional>

-#include "clip.hpp"
-#include "llm.hpp"
-#include "t5.hpp"
-#include "tensor_ggml.hpp"
+#include "core/tensor_ggml.hpp"
+#include "model/te/clip.hpp"
+#include "model/te/llm.hpp"
+#include "model/te/t5.hpp"

 struct SDCondition {
    sd::Tensor<float> c_crossattn;
@ -2554,4 +2554,4 @@ struct LTXAVEmbedder : public Conditioner {
    }
 };

-#endif
+#endif  // __SD_CONDITIONING_CONDITIONER_HPP__
--- a/src/core/ggml_extend.hpp
+++ b/src/core/ggml_extend.hpp
@ -1,5 +1,5 @@
-#ifndef __GGML_EXTEND_HPP__
-#define __GGML_EXTEND_HPP__
+#ifndef __SD_CORE_GGML_EXTEND_HPP__
+#define __SD_CORE_GGML_EXTEND_HPP__

 #include <assert.h>
 #include <inttypes.h>
@ -23,19 +23,19 @@
 #include <unordered_map>
 #include <vector>

+#include "core/ggml_extend_backend.h"
+#include "core/ggml_graph_cut.h"
+#include "core/layer_registry.h"
 #include "ggml-alloc.h"
 #include "ggml-backend.h"
 #include "ggml.h"
-#include "ggml_extend_backend.h"
-#include "ggml_graph_cut.h"
-#include "layer_registry.h"

+#include "core/tensor.hpp"
 #include "model.h"
-#include "tensor.hpp"

-#include "rng.hpp"
-#include "tensor_ggml.hpp"
-#include "util.h"
+#include "core/rng.hpp"
+#include "core/tensor_ggml.hpp"
+#include "core/util.h"

 #define EPS 1e-05f

@ -4145,4 +4145,4 @@ __STATIC_INLINE__ ggml_tensor* ggml_ext_lokr_forward(
    }
 }

-#endif  // __GGML_EXTEND__HPP__
+#endif  // __SD_CORE_GGML_EXTEND_HPP__
--- a/src/core/ggml_extend_backend.cpp
+++ b/src/core/ggml_extend_backend.cpp
@ -1,4 +1,4 @@
-#include "ggml_extend_backend.h"
+#include "core/ggml_extend_backend.h"

 #include <algorithm>
 #include <cctype>
@ -8,8 +8,8 @@
 #include <stdexcept>
 #include <vector>

+#include "core/util.h"
 #include "stable-diffusion.h"
-#include "util.h"

 static std::string trim_copy(const std::string& value) {
    size_t begin = 0;
--- a/src/core/ggml_extend_backend.h
+++ b/src/core/ggml_extend_backend.h
@ -1,5 +1,5 @@
-#ifndef __SD_GGML_EXTEND_BACKEND_H__
-#define __SD_GGML_EXTEND_BACKEND_H__
+#ifndef __SD_CORE_GGML_EXTEND_BACKEND_H__
+#define __SD_CORE_GGML_EXTEND_BACKEND_H__

 #include <cstdint>
 #include <cstring>
@ -76,4 +76,4 @@ ggml_backend_t sd_backend_cpu_init();
 bool sd_backend_cpu_set_n_threads(ggml_backend_t backend_cpu, int n_threads);
 const char* sd_backend_module_name(SDBackendModule module);
 void ggml_ext_im_set_f32_1d(const struct ggml_tensor* tensor, int i, float value);
-#endif
+#endif  // __SD_CORE_GGML_EXTEND_BACKEND_H__
--- a/src/core/ggml_graph_cut.cpp
+++ b/src/core/ggml_graph_cut.cpp
@ -1,4 +1,4 @@
-#include "ggml_graph_cut.h"
+#include "core/ggml_graph_cut.h"

 #include <algorithm>
 #include <cstring>
@ -8,11 +8,11 @@
 #include <stack>
 #include <unordered_map>

+#include "core/util.h"
 #include "ggml-alloc.h"
 #include "ggml-backend.h"
-#include "util.h"

-#include "../ggml/src/ggml-impl.h"
+#include "ggml/src/ggml-impl.h"

 namespace sd::ggml_graph_cut {

--- a/src/core/ggml_graph_cut.h
+++ b/src/core/ggml_graph_cut.h
@ -1,5 +1,5 @@
-#ifndef __SD_GGML_GRAPH_CUT_H__
-#define __SD_GGML_GRAPH_CUT_H__
+#ifndef __SD_CORE_GGML_GRAPH_CUT_H__
+#define __SD_CORE_GGML_GRAPH_CUT_H__

 #include <array>
 #include <cstdint>
@ -114,4 +114,4 @@ namespace sd::ggml_graph_cut {
    void annotate_residency(Plan& plan, size_t max_graph_vram_bytes);
 }  // namespace sd::ggml_graph_cut

-#endif
+#endif  // __SD_CORE_GGML_GRAPH_CUT_H__
--- a/src/core/layer_registry.cpp
+++ b/src/core/layer_registry.cpp
@ -1,8 +1,8 @@
-#include "layer_registry.h"
+#include "core/layer_registry.h"

 #include <utility>

-#include "util.h"
+#include "core/util.h"

 namespace sd::layer_registry {

--- a/src/core/layer_registry.h
+++ b/src/core/layer_registry.h
@ -1,5 +1,5 @@
-#ifndef __LAYER_REGISTRY_H__
-#define __LAYER_REGISTRY_H__
+#ifndef __SD_CORE_LAYER_REGISTRY_H__
+#define __SD_CORE_LAYER_REGISTRY_H__

 #include <map>
 #include <set>
@ -47,4 +47,4 @@ namespace sd::layer_registry {

 }  // namespace sd::layer_registry

-#endif
+#endif  // __SD_CORE_LAYER_REGISTRY_H__
--- a/src/core/ordered_map.hpp
+++ b/src/core/ordered_map.hpp
@ -1,5 +1,5 @@
-#ifndef __ORDERED_MAP_HPP__
-#define __ORDERED_MAP_HPP__
+#ifndef __SD_CORE_ORDERED_MAP_HPP__
+#define __SD_CORE_ORDERED_MAP_HPP__

 #include <iostream>
 #include <list>
@ -174,4 +174,4 @@ public:
    }
 };

-#endif  // __ORDERED_MAP_HPP__
+#endif  // __SD_CORE_ORDERED_MAP_HPP__
--- a/src/core/rng.hpp
+++ b/src/core/rng.hpp
@ -1,5 +1,5 @@
-#ifndef __RNG_H__
-#define __RNG_H__
+#ifndef __SD_CORE_RNG_HPP__
+#define __SD_CORE_RNG_HPP__

 #include <random>
 #include <vector>
@ -32,4 +32,4 @@ public:
    }
 };

-#endif  // __RNG_H__
+#endif  // __SD_CORE_RNG_HPP__
--- a/src/core/rng_mt19937.hpp
+++ b/src/core/rng_mt19937.hpp
@ -1,10 +1,10 @@
-#ifndef __RNG_MT19937_HPP__
-#define __RNG_MT19937_HPP__
+#ifndef __SD_CORE_RNG_MT19937_HPP__
+#define __SD_CORE_RNG_MT19937_HPP__

 #include <cmath>
 #include <vector>

-#include "rng.hpp"
+#include "core/rng.hpp"

 // RNG imitiating torch cpu randn on CPU.
 // Port from pytorch, original license: https://github.com/pytorch/pytorch/blob/d01a7b0241ed1c4cded7e7ca097249feb343f072/LICENSE
@ -144,4 +144,4 @@ public:
    }
 };

-#endif  // __RNG_MT19937_HPP__
+#endif  // __SD_CORE_RNG_MT19937_HPP__
--- a/src/core/rng_philox.hpp
+++ b/src/core/rng_philox.hpp
@ -1,10 +1,10 @@
-#ifndef __RNG_PHILOX_H__
-#define __RNG_PHILOX_H__
+#ifndef __SD_CORE_RNG_PHILOX_HPP__
+#define __SD_CORE_RNG_PHILOX_HPP__

 #include <cmath>
 #include <vector>

-#include "rng.hpp"
+#include "core/rng.hpp"

 // RNG imitiating torch cuda randn on CPU.
 // Port from: https://github.com/AUTOMATIC1111/stable-diffusion-webui/blob/5ef669de080814067961f28357256e8fe27544f4/modules/rng_philox.py
@ -122,4 +122,4 @@ public:
    }
 };

-#endif  // __RNG_PHILOX_H__
+#endif  // __SD_CORE_RNG_PHILOX_HPP__
--- a/src/core/tensor.hpp
+++ b/src/core/tensor.hpp
@ -1,5 +1,5 @@
-#ifndef __SD_TENSOR_HPP__
-#define __SD_TENSOR_HPP__
+#ifndef __SD_CORE_TENSOR_HPP__
+#define __SD_CORE_TENSOR_HPP__

 #include <algorithm>
 #include <cmath>
@ -16,7 +16,7 @@
 #include <utility>
 #include <vector>

-#include "rng.hpp"
+#include "core/rng.hpp"

 namespace sd {

@ -1661,4 +1661,4 @@ namespace sd {

 }  // namespace sd

-#endif
+#endif  // __SD_CORE_TENSOR_HPP__
--- a/src/core/tensor_ggml.hpp
+++ b/src/core/tensor_ggml.hpp
@ -1,5 +1,5 @@
-#ifndef __SD_TENSOR_GGML_HPP__
-#define __SD_TENSOR_GGML_HPP__
+#ifndef __SD_CORE_TENSOR_GGML_HPP__
+#define __SD_CORE_TENSOR_GGML_HPP__

 #include <array>
 #include <cstring>
@ -8,8 +8,8 @@
 #include <string>
 #include <type_traits>

+#include "core/tensor.hpp"
 #include "ggml.h"
-#include "tensor.hpp"

 namespace sd {

@ -124,4 +124,4 @@ namespace sd {

 }  // namespace sd

-#endif
+#endif  // __SD_CORE_TENSOR_GGML_HPP__
--- a/src/core/util.cpp
+++ b/src/core/util.cpp
@ -1,4 +1,4 @@
-#include "util.h"
+#include "core/util.h"
 #include <algorithm>
 #include <cctype>
 #include <cmath>
@ -13,7 +13,7 @@
 #include <thread>
 #include <unordered_set>
 #include <vector>
-#include "preprocessing.hpp"
+#include "runtime/preprocessing.hpp"

 #if defined(__APPLE__) && defined(__MACH__)
 #include <sys/sysctl.h>
--- a/src/core/util.h
+++ b/src/core/util.h
@ -1,5 +1,5 @@
-#ifndef __UTIL_H__
-#define __UTIL_H__
+#ifndef __SD_CORE_UTIL_H__
+#define __SD_CORE_UTIL_H__

 #include <cstdint>
 #include <memory>
@ -7,9 +7,9 @@
 #include <utility>
 #include <vector>

+#include "core/tensor.hpp"
 #include "ggml-backend.h"
 #include "stable-diffusion.h"
-#include "tensor.hpp"

 #define SAFE_STR(s) ((s) ? (s) : "")
 #define BOOL_STR(b) ((b) ? "true" : "false")
@ -103,4 +103,4 @@ bool sd_backend_is(ggml_backend_t backend, const std::string& name);
 #define LOG_INFO(format, ...) log_printf(SD_LOG_INFO, __FILE__, __LINE__, format, ##__VA_ARGS__)
 #define LOG_WARN(format, ...) log_printf(SD_LOG_WARN, __FILE__, __LINE__, format, ##__VA_ARGS__)
 #define LOG_ERROR(format, ...) log_printf(SD_LOG_ERROR, __FILE__, __LINE__, format, ##__VA_ARGS__)
-#endif  // __UTIL_H__
+#endif  // __SD_CORE_UTIL_H__
--- a/src/model.cpp
+++ b/src/model.cpp
@ -13,18 +13,18 @@
 #include <unordered_map>
 #include <vector>

+#include "core/util.h"
 #include "model.h"
 #include "model_io/gguf_io.h"
 #include "model_io/safetensors_io.h"
 #include "model_io/torch_legacy_io.h"
 #include "model_io/torch_zip_io.h"
 #include "stable-diffusion.h"
-#include "util.h"

+#include "core/ggml_extend_backend.h"
 #include "ggml-alloc.h"
 #include "ggml-backend.h"
 #include "ggml.h"
-#include "ggml_extend_backend.h"
 #include "zip.h"

 #include "name_conversion.h"
--- a/src/model.h
+++ b/src/model.h
@ -8,10 +8,10 @@
 #include <string>
 #include <vector>

+#include "core/ordered_map.hpp"
 #include "ggml-backend.h"
 #include "ggml.h"
 #include "model_io/tensor_storage.h"
-#include "ordered_map.hpp"

 enum SDVersion {
    VERSION_SD1,
--- a/src/model/adapter/lora.hpp
+++ b/src/model/adapter/lora.hpp
@ -1,8 +1,8 @@
-#ifndef __LORA_HPP__
-#define __LORA_HPP__
+#ifndef __SD_MODEL_ADAPTER_LORA_HPP__
+#define __SD_MODEL_ADAPTER_LORA_HPP__

 #include <mutex>
-#include "ggml_extend.hpp"
+#include "core/ggml_extend.hpp"

 #define LORA_GRAPH_BASE_SIZE 10240

@ -914,4 +914,4 @@ public:
    }
 };

-#endif  // __LORA_HPP__
+#endif  // __SD_MODEL_ADAPTER_LORA_HPP__
--- a/src/model/adapter/pmid.hpp
+++ b/src/model/adapter/pmid.hpp
@ -1,10 +1,11 @@
-#ifndef __PMI_HPP__
-#define __PMI_HPP__
+#ifndef __SD_MODEL_ADAPTER_PMID_HPP__
+#define __SD_MODEL_ADAPTER_PMID_HPP__

-#include "ggml_extend.hpp"
+#include "core/ggml_extend.hpp"

-#include "clip.hpp"
-#include "lora.hpp"
+#include "model/adapter/lora.hpp"
+#include "model/common/block.hpp"
+#include "model/te/clip.hpp"

 struct FuseBlock : public GGMLBlock {
    // network hparams
@ -636,4 +637,4 @@ struct PhotoMakerIDEmbed : public GGMLRunner {
    }
 };

-#endif  // __PMI_HPP__
+#endif  // __SD_MODEL_ADAPTER_PMID_HPP__
--- a/src/model/common/block.hpp
+++ b/src/model/common/block.hpp
@ -1,9 +1,9 @@
-#ifndef __COMMON_BLOCK_HPP__
-#define __COMMON_BLOCK_HPP__
+#ifndef __SD_MODEL_COMMON_BLOCK_HPP__
+#define __SD_MODEL_COMMON_BLOCK_HPP__

+#include "core/ggml_extend.hpp"
+#include "core/util.h"
 #include "ggml-backend.h"
-#include "ggml_extend.hpp"
-#include "util.h"

 class DownSampleBlock : public GGMLBlock {
 protected:
@ -227,6 +227,37 @@ public:
    }
 };

+struct Mlp : public GGMLBlock {
+public:
+    Mlp(int64_t in_features,
+        int64_t hidden_features = -1,
+        int64_t out_features    = -1,
+        bool bias               = true) {
+        // Registers two Linear sub-blocks, "fc1" and "fc2"; a width of -1 falls back to in_features.
+        // Reference model: act_layer is always lambda: nn.GELU(approximate="tanh"),
+        // norm_layer is always None, and use_conv is always False.
+        if (hidden_features == -1) {
+            hidden_features = in_features;
+        }
+        if (out_features == -1) {
+            out_features = in_features;
+        }
+        blocks["fc1"] = std::shared_ptr<GGMLBlock>(new Linear(in_features, hidden_features, bias));
+        blocks["fc2"] = std::shared_ptr<GGMLBlock>(new Linear(hidden_features, out_features, bias));
+    }
+
+    ggml_tensor* forward(GGMLRunnerContext* ctx, ggml_tensor* x) {
+        // x: [N, n_token, in_features]; applied in order: fc1, ggml_ext_gelu, fc2
+        auto fc1 = std::dynamic_pointer_cast<Linear>(blocks["fc1"]);
+        auto fc2 = std::dynamic_pointer_cast<Linear>(blocks["fc2"]);
+
+        x = fc1->forward(ctx, x);
+        x = ggml_ext_gelu(ctx->ggml_ctx, x, true);
+        x = fc2->forward(ctx, x);
+        return x;
+    }
+};
+
 class FeedForward : public GGMLBlock {
 public:
    enum class Activation {
@ -603,4 +634,4 @@ public:
    }
 };

-#endif  // __COMMON_BLOCK_HPP__
+#endif  // __SD_MODEL_COMMON_BLOCK_HPP__
--- a/src/model/common/rope.hpp
+++ b/src/model/common/rope.hpp
@ -1,10 +1,10 @@
-#ifndef __ROPE_HPP__
-#define __ROPE_HPP__
+#ifndef __SD_MODEL_COMMON_ROPE_HPP__
+#define __SD_MODEL_COMMON_ROPE_HPP__

 #include <algorithm>
 #include <cmath>
 #include <vector>
-#include "ggml_extend.hpp"
+#include "core/ggml_extend.hpp"

 namespace Rope {
    enum class EmbedNDLayout {
@ -902,4 +902,4 @@ namespace Rope {
    }
 };  // namespace Rope

-#endif  // __ROPE_HPP__
+#endif  // __SD_MODEL_COMMON_ROPE_HPP__
--- a/src/model/diffusion/anima.hpp
+++ b/src/model/diffusion/anima.hpp
@ -1,5 +1,5 @@
-#ifndef __ANIMA_HPP__
-#define __ANIMA_HPP__
+#ifndef __SD_MODEL_DIFFUSION_ANIMA_HPP__
+#define __SD_MODEL_DIFFUSION_ANIMA_HPP__

 #include <algorithm>
 #include <cmath>
@ -7,10 +7,10 @@
 #include <utility>
 #include <vector>

-#include "common_block.hpp"
-#include "diffusion_model.hpp"
-#include "flux.hpp"
-#include "rope.hpp"
+#include "model/common/block.hpp"
+#include "model/common/rope.hpp"
+#include "model/diffusion/flux.hpp"
+#include "model/diffusion/model.hpp"

 namespace Anima {
    constexpr int ANIMA_GRAPH_SIZE = 65536;
@ -715,4 +715,4 @@ namespace Anima {
    };
 }  // namespace Anima

-#endif  // __ANIMA_HPP__
+#endif  // __SD_MODEL_DIFFUSION_ANIMA_HPP__
--- a/src/model/diffusion/control.hpp
+++ b/src/model/diffusion/control.hpp
@ -1,8 +1,8 @@
-#ifndef __CONTROL_HPP__
-#define __CONTROL_HPP__
+#ifndef __SD_MODEL_DIFFUSION_CONTROL_HPP__
+#define __SD_MODEL_DIFFUSION_CONTROL_HPP__

-#include "common_block.hpp"
 #include "model.h"
+#include "model/common/block.hpp"

 #define CONTROL_NET_GRAPH_SIZE 1536

@ -484,4 +484,4 @@ struct ControlNet : public GGMLRunner {
    }
 };

-#endif  // __CONTROL_HPP__
+#endif  // __SD_MODEL_DIFFUSION_CONTROL_HPP__
--- a/src/model/diffusion/dit.hpp
+++ b/src/model/diffusion/dit.hpp
@ -1,7 +1,7 @@
-#ifndef __COMMON_DIT_HPP__
-#define __COMMON_DIT_HPP__
+#ifndef __SD_MODEL_DIFFUSION_DIT_HPP__
+#define __SD_MODEL_DIFFUSION_DIT_HPP__

-#include "ggml_extend.hpp"
+#include "core/ggml_extend.hpp"

 namespace DiT {
    inline ggml_tensor* patchify(ggml_context* ctx,
@ -163,4 +163,4 @@ namespace DiT {
    }
 }  // namespace DiT

-#endif  // __COMMON_DIT_HPP__
+#endif  // __SD_MODEL_DIFFUSION_DIT_HPP__
--- a/src/model/diffusion/ernie_image.hpp
+++ b/src/model/diffusion/ernie_image.hpp
@ -1,14 +1,14 @@
-#ifndef __SD_ERNIE_IMAGE_HPP__
-#define __SD_ERNIE_IMAGE_HPP__
+#ifndef __SD_MODEL_DIFFUSION_ERNIE_IMAGE_HPP__
+#define __SD_MODEL_DIFFUSION_ERNIE_IMAGE_HPP__

 #include <memory>
 #include <vector>

-#include "common_dit.hpp"
-#include "diffusion_model.hpp"
-#include "flux.hpp"
-#include "qwen_image.hpp"
-#include "rope.hpp"
+#include "model/common/rope.hpp"
+#include "model/diffusion/dit.hpp"
+#include "model/diffusion/flux.hpp"
+#include "model/diffusion/model.hpp"
+#include "model/diffusion/qwen_image.hpp"

 namespace ErnieImage {
    constexpr int ERNIE_IMAGE_GRAPH_SIZE = 40960;
@ -455,4 +455,4 @@ namespace ErnieImage {
    };
 }  // namespace ErnieImage

-#endif  // __SD_ERNIE_IMAGE_HPP__
+#endif  // __SD_MODEL_DIFFUSION_ERNIE_IMAGE_HPP__
--- a/src/model/diffusion/flux.hpp
+++ b/src/model/diffusion/flux.hpp
@ -1,13 +1,13 @@
-#ifndef __FLUX_HPP__
-#define __FLUX_HPP__
+#ifndef __SD_MODEL_DIFFUSION_FLUX_HPP__
+#define __SD_MODEL_DIFFUSION_FLUX_HPP__

 #include <memory>
 #include <vector>

-#include "common_dit.hpp"
-#include "diffusion_model.hpp"
 #include "model.h"
-#include "rope.hpp"
+#include "model/common/rope.hpp"
+#include "model/diffusion/dit.hpp"
+#include "model/diffusion/model.hpp"

 #define FLUX_GRAPH_SIZE 10240

@ -1627,4 +1627,4 @@ namespace Flux {

 }  // namespace Flux

-#endif  // __FLUX_HPP__
+#endif  // __SD_MODEL_DIFFUSION_FLUX_HPP__
--- a/src/model/diffusion/hidream_o1.hpp
+++ b/src/model/diffusion/hidream_o1.hpp
@ -1,5 +1,5 @@
-#ifndef __SD_HIDREAM_O1_H__
-#define __SD_HIDREAM_O1_H__
+#ifndef __SD_MODEL_DIFFUSION_HIDREAM_O1_HPP__
+#define __SD_MODEL_DIFFUSION_HIDREAM_O1_HPP__

 #include <algorithm>
 #include <array>
@ -10,11 +10,11 @@
 #include <utility>
 #include <vector>

-#include "common_dit.hpp"
-#include "conditioner.hpp"
-#include "diffusion_model.hpp"
-#include "llm.hpp"
-#include "util.h"
+#include "conditioning/conditioner.hpp"
+#include "core/util.h"
+#include "model/diffusion/dit.hpp"
+#include "model/diffusion/model.hpp"
+#include "model/te/llm.hpp"

 namespace HiDreamO1 {
    constexpr int HIDREAM_O1_GRAPH_SIZE = 32768;
@ -678,4 +678,4 @@ namespace HiDreamO1 {
    };
 }  // namespace HiDreamO1

-#endif  // __SD_HIDREAM_O1_H__
+#endif  // __SD_MODEL_DIFFUSION_HIDREAM_O1_HPP__
--- a/src/model/diffusion/ideogram4.hpp
+++ b/src/model/diffusion/ideogram4.hpp
@ -1,5 +1,5 @@
-#ifndef __IDEOGRAM4_HPP__
-#define __IDEOGRAM4_HPP__
+#ifndef __SD_MODEL_DIFFUSION_IDEOGRAM4_HPP__
+#define __SD_MODEL_DIFFUSION_IDEOGRAM4_HPP__

 #include <algorithm>
 #include <cmath>
@ -8,10 +8,10 @@
 #include <string>
 #include <vector>

-#include "diffusion_model.hpp"
-#include "ggml_extend.hpp"
-#include "ggml_graph_cut.h"
-#include "rope.hpp"
+#include "core/ggml_extend.hpp"
+#include "core/ggml_graph_cut.h"
+#include "model/common/rope.hpp"
+#include "model/diffusion/model.hpp"

 namespace Ideogram4 {
    constexpr int IDEOGRAM4_GRAPH_SIZE    = 65536;
@ -528,4 +528,4 @@ namespace Ideogram4 {
    };
 }  // namespace Ideogram4

-#endif  // __IDEOGRAM4_HPP__
+#endif  // __SD_MODEL_DIFFUSION_IDEOGRAM4_HPP__
--- a/src/model/diffusion/lens.hpp
+++ b/src/model/diffusion/lens.hpp
@ -1,14 +1,14 @@
-#ifndef __SD_LENS_HPP__
-#define __SD_LENS_HPP__
+#ifndef __SD_MODEL_DIFFUSION_LENS_HPP__
+#define __SD_MODEL_DIFFUSION_LENS_HPP__

 #include <memory>
 #include <vector>

-#include "common_block.hpp"
-#include "diffusion_model.hpp"
-#include "flux.hpp"
-#include "qwen_image.hpp"
-#include "rope.hpp"
+#include "model/common/block.hpp"
+#include "model/common/rope.hpp"
+#include "model/diffusion/flux.hpp"
+#include "model/diffusion/model.hpp"
+#include "model/diffusion/qwen_image.hpp"

 namespace Lens {
    constexpr int LENS_GRAPH_SIZE = 40960;
@ -423,4 +423,4 @@ namespace Lens {
    };
 }  // namespace Lens

-#endif  // __SD_LENS_HPP__
+#endif  // __SD_MODEL_DIFFUSION_LENS_HPP__
--- a/src/model/diffusion/ltxv.hpp
+++ b/src/model/diffusion/ltxv.hpp
@ -1,5 +1,5 @@
-#ifndef __SD_LTXV_HPP__
-#define __SD_LTXV_HPP__
+#ifndef __SD_MODEL_DIFFUSION_LTXV_HPP__
+#define __SD_MODEL_DIFFUSION_LTXV_HPP__

 #include <algorithm>
 #include <cmath>
@ -9,10 +9,10 @@
 #include <utility>
 #include <vector>

-#include "common_block.hpp"
-#include "diffusion_model.hpp"
-#include "flux.hpp"
-#include "rope.hpp"
+#include "model/common/block.hpp"
+#include "model/common/rope.hpp"
+#include "model/diffusion/flux.hpp"
+#include "model/diffusion/model.hpp"

 namespace LTXV {

@ -2062,4 +2062,4 @@ namespace LTXV {

 };  // namespace LTXV

-#endif
+#endif  // __SD_MODEL_DIFFUSION_LTXV_HPP__
--- a/src/model/diffusion/mmdit.hpp
+++ b/src/model/diffusion/mmdit.hpp
@ -1,14 +1,15 @@
-#ifndef __MMDIT_HPP__
-#define __MMDIT_HPP__
+#ifndef __SD_MODEL_DIFFUSION_MMDIT_HPP__
+#define __SD_MODEL_DIFFUSION_MMDIT_HPP__

 #include <algorithm>
 #include <memory>
 #include <string>
 #include <vector>

-#include "diffusion_model.hpp"
-#include "ggml_extend.hpp"
+#include "core/ggml_extend.hpp"
 #include "model.h"
+#include "model/common/block.hpp"
+#include "model/diffusion/model.hpp"

 #define MMDIT_GRAPH_SIZE 10240

@ -134,37 +135,6 @@ struct MMDiTConfig {
    }
 };

-struct Mlp : public GGMLBlock {
-public:
-    Mlp(int64_t in_features,
-        int64_t hidden_features = -1,
-        int64_t out_features    = -1,
-        bool bias               = true) {
-        // act_layer is always lambda: nn.GELU(approximate="tanh")
-        // norm_layer is always None
-        // use_conv is always False
-        if (hidden_features == -1) {
-            hidden_features = in_features;
-        }
-        if (out_features == -1) {
-            out_features = in_features;
-        }
-        blocks["fc1"] = std::shared_ptr<GGMLBlock>(new Linear(in_features, hidden_features, bias));
-        blocks["fc2"] = std::shared_ptr<GGMLBlock>(new Linear(hidden_features, out_features, bias));
-    }
-
-    ggml_tensor* forward(GGMLRunnerContext* ctx, ggml_tensor* x) {
-        // x: [N, n_token, in_features]
-        auto fc1 = std::dynamic_pointer_cast<Linear>(blocks["fc1"]);
-        auto fc2 = std::dynamic_pointer_cast<Linear>(blocks["fc2"]);
-
-        x = fc1->forward(ctx, x);
-        x = ggml_ext_gelu(ctx->ggml_ctx, x, true);
-        x = fc2->forward(ctx, x);
-        return x;
-    }
-};
-
 struct PatchEmbed : public GGMLBlock {
    // 2D Image to Patch Embedding
 protected:
@ -1062,4 +1032,4 @@ struct MMDiTRunner : public DiffusionModelRunner {
    }
 };

-#endif
+#endif  // __SD_MODEL_DIFFUSION_MMDIT_HPP__
--- a/src/model/diffusion/model.hpp
+++ b/src/model/diffusion/model.hpp
@ -1,12 +1,12 @@
-#ifndef __DIFFUSION_MODEL_H__
-#define __DIFFUSION_MODEL_H__
+#ifndef __SD_MODEL_DIFFUSION_MODEL_HPP__
+#define __SD_MODEL_DIFFUSION_MODEL_HPP__

 #include <string>
 #include <utility>
 #include <variant>

-#include "ggml_extend.hpp"
-#include "tensor_ggml.hpp"
+#include "core/ggml_extend.hpp"
+#include "core/tensor_ggml.hpp"

 struct UNetDiffusionExtra {
    int num_video_frames                           = -1;
@ -104,4 +104,4 @@ public:
                                   const std::string& prefix) = 0;
 };

-#endif
+#endif  // __SD_MODEL_DIFFUSION_MODEL_HPP__
--- a/src/model/diffusion/pid.hpp
+++ b/src/model/diffusion/pid.hpp
@ -1,5 +1,5 @@
-#ifndef __SD_PID_HPP__
-#define __SD_PID_HPP__
+#ifndef __SD_MODEL_DIFFUSION_PID_HPP__
+#define __SD_MODEL_DIFFUSION_PID_HPP__

 #include <cmath>
 #include <cstdlib>
@ -7,10 +7,10 @@
 #include <string>
 #include <vector>

-#include "common_dit.hpp"
-#include "ggml_extend.hpp"
-#include "mmdit.hpp"
-#include "rope.hpp"
+#include "core/ggml_extend.hpp"
+#include "model/common/rope.hpp"
+#include "model/diffusion/dit.hpp"
+#include "model/diffusion/mmdit.hpp"

 namespace Pid {
    constexpr int PID_GRAPH_SIZE = 196608;
@ -844,4 +844,4 @@ namespace Pid {
    };
 }  // namespace Pid

-#endif  // __SD_PID_HPP__
+#endif  // __SD_MODEL_DIFFUSION_PID_HPP__
--- a/src/model/diffusion/qwen_image.hpp
+++ b/src/model/diffusion/qwen_image.hpp
@ -1,11 +1,11 @@
-#ifndef __QWEN_IMAGE_HPP__
-#define __QWEN_IMAGE_HPP__
+#ifndef __SD_MODEL_DIFFUSION_QWEN_IMAGE_HPP__
+#define __SD_MODEL_DIFFUSION_QWEN_IMAGE_HPP__

 #include <memory>

-#include "common_block.hpp"
-#include "diffusion_model.hpp"
-#include "flux.hpp"
+#include "model/common/block.hpp"
+#include "model/diffusion/flux.hpp"
+#include "model/diffusion/model.hpp"

 namespace Qwen {
    constexpr int QWEN_IMAGE_GRAPH_SIZE = 20480;
@ -731,4 +731,4 @@ namespace Qwen {

 }  // namespace name

-#endif  // __QWEN_IMAGE_HPP__
+#endif  // __SD_MODEL_DIFFUSION_QWEN_IMAGE_HPP__
--- a/src/model/diffusion/unet.hpp
+++ b/src/model/diffusion/unet.hpp
@ -1,12 +1,12 @@
-#ifndef __UNET_HPP__
-#define __UNET_HPP__
+#ifndef __SD_MODEL_DIFFUSION_UNET_HPP__
+#define __SD_MODEL_DIFFUSION_UNET_HPP__

 #include <algorithm>
 #include <vector>

-#include "common_block.hpp"
-#include "diffusion_model.hpp"
 #include "model.h"
+#include "model/common/block.hpp"
+#include "model/diffusion/model.hpp"

 /*==================================================== UnetModel =====================================================*/

@ -844,4 +844,4 @@ struct UNetModelRunner : public DiffusionModelRunner {
    }
 };

-#endif  // __UNET_HPP__
+#endif  // __SD_MODEL_DIFFUSION_UNET_HPP__
--- a/src/model/diffusion/wan.hpp
+++ b/src/model/diffusion/wan.hpp
@ -1,14 +1,14 @@
-#ifndef __WAN_HPP__
-#define __WAN_HPP__
+#ifndef __SD_MODEL_DIFFUSION_WAN_HPP__
+#define __SD_MODEL_DIFFUSION_WAN_HPP__

 #include <map>
 #include <memory>
 #include <utility>

-#include "common_block.hpp"
-#include "diffusion_model.hpp"
-#include "flux.hpp"
-#include "rope.hpp"
+#include "model/common/block.hpp"
+#include "model/common/rope.hpp"
+#include "model/diffusion/flux.hpp"
+#include "model/diffusion/model.hpp"

 namespace WAN {

@ -1058,4 +1058,4 @@ namespace WAN {

 }  // namespace WAN

-#endif  // __WAN_HPP__
+#endif  // __SD_MODEL_DIFFUSION_WAN_HPP__
--- a/src/model/diffusion/z_image.hpp
+++ b/src/model/diffusion/z_image.hpp
@ -1,14 +1,14 @@
-#ifndef __Z_IMAGE_HPP__
-#define __Z_IMAGE_HPP__
+#ifndef __SD_MODEL_DIFFUSION_Z_IMAGE_HPP__
+#define __SD_MODEL_DIFFUSION_Z_IMAGE_HPP__

 #include <algorithm>

-#include "diffusion_model.hpp"
-#include "flux.hpp"
-#include "ggml_extend.hpp"
-#include "mmdit.hpp"
+#include "core/ggml_extend.hpp"
+#include "model/diffusion/flux.hpp"
+#include "model/diffusion/mmdit.hpp"
+#include "model/diffusion/model.hpp"

-// Ref: https://github.com/Alpha-VLLM/Lumina-Image-2.0/blob/main/models/model.py
+// Ref: https://github.com/Alpha-VLLM/Lumina-Image-2.0/blob/main/models/model.py
 // Ref: https://github.com/huggingface/diffusers/pull/12703

 #ifndef MIN
@ -739,4 +739,4 @@ namespace ZImage {

 }  // namespace ZImage

-#endif  // __Z_IMAGE_HPP__
+#endif  // __SD_MODEL_DIFFUSION_Z_IMAGE_HPP__
--- a/src/model/te/clip.hpp
+++ b/src/model/te/clip.hpp
@ -1,13 +1,13 @@
-#ifndef __CLIP_HPP__
-#define __CLIP_HPP__
+#ifndef __SD_MODEL_TE_CLIP_HPP__
+#define __SD_MODEL_TE_CLIP_HPP__

-#include "ggml_extend.hpp"
+#include "core/ggml_extend.hpp"
 #include "model.h"
 #include "tokenizers/clip_tokenizer.h"

 /*================================================ FrozenCLIPEmbedder ================================================*/

-// Ref: https://github.com/huggingface/transformers/blob/main/src/transformers/models/clip/modeling_clip.py
+// Ref: https://github.com/huggingface/transformers/blob/main/src/transformers/models/clip/modeling_clip.py

 struct CLIPMLP : public GGMLBlock {
 protected:
@ -579,4 +579,4 @@ struct CLIPTextModelRunner : public GGMLRunner {
    }
 };

-#endif  // __CLIP_HPP__
+#endif  // __SD_MODEL_TE_CLIP_HPP__
--- a/src/model/te/llm.hpp
+++ b/src/model/te/llm.hpp
@ -1,5 +1,5 @@
-#ifndef __LLM_HPP__
-#define __LLM_HPP__
+#ifndef __SD_MODEL_TE_LLM_HPP__
+#define __SD_MODEL_TE_LLM_HPP__

 #include <algorithm>
 #include <array>
@ -18,9 +18,9 @@
 #include <utility>
 #include <vector>

-#include "ggml_extend.hpp"
+#include "core/ggml_extend.hpp"
 #include "json.hpp"
-#include "rope.hpp"
+#include "model/common/rope.hpp"
 #include "tokenizers/bpe_tokenizer.h"
 #include "tokenizers/gemma_tokenizer.h"
 #include "tokenizers/gpt_oss_tokenizer.h"
@ -2100,4 +2100,4 @@ namespace LLM {
    };
 };  // LLM

-#endif  // __LLM_HPP__
+#endif  // __SD_MODEL_TE_LLM_HPP__
--- a/src/model/te/t5.hpp
+++ b/src/model/te/t5.hpp
@ -1,5 +1,5 @@
-#ifndef __T5_HPP__
-#define __T5_HPP__
+#ifndef __SD_MODEL_TE_T5_HPP__
+#define __SD_MODEL_TE_T5_HPP__

 #include <cfloat>
 #include <limits>
@ -10,7 +10,7 @@
 #include <string>
 #include <unordered_map>

-#include "ggml_extend.hpp"
+#include "core/ggml_extend.hpp"
 #include "model.h"
 #include "tokenizers/t5_unigram_tokenizer.h"

@ -610,4 +610,4 @@ struct T5Embedder {
    }
 };

-#endif  // __T5_HPP__
+#endif  // __SD_MODEL_TE_T5_HPP__
--- a/src/model/upscaler/esrgan.hpp
+++ b/src/model/upscaler/esrgan.hpp
@ -1,7 +1,7 @@
-#ifndef __ESRGAN_HPP__
-#define __ESRGAN_HPP__
+#ifndef __SD_MODEL_UPSCALER_ESRGAN_HPP__
+#define __SD_MODEL_UPSCALER_ESRGAN_HPP__

-#include "ggml_extend.hpp"
+#include "core/ggml_extend.hpp"
 #include "model.h"

 /*
@ -372,4 +372,4 @@ struct ESRGAN : public GGMLRunner {
    }
 };

-#endif  // __ESRGAN_HPP__
+#endif  // __SD_MODEL_UPSCALER_ESRGAN_HPP__
--- a/src/model/upscaler/ltx_latent_upscaler.hpp
+++ b/src/model/upscaler/ltx_latent_upscaler.hpp
@ -1,5 +1,5 @@
-#ifndef __SD_LTX_LATENT_UPSCALER_HPP__
-#define __SD_LTX_LATENT_UPSCALER_HPP__
+#ifndef __SD_MODEL_UPSCALER_LTX_LATENT_UPSCALER_HPP__
+#define __SD_MODEL_UPSCALER_LTX_LATENT_UPSCALER_HPP__

 #include <cinttypes>
 #include <cmath>
@ -11,11 +11,11 @@
 #include <utility>
 #include <vector>

-#include "common_dit.hpp"
-#include "ggml_extend.hpp"
-#include "ggml_graph_cut.h"
+#include "core/ggml_extend.hpp"
+#include "core/ggml_graph_cut.h"
+#include "core/util.h"
 #include "model.h"
-#include "util.h"
+#include "model/diffusion/dit.hpp"

 namespace LTXVUpsampler {
    constexpr int LTX_UPSAMPLER_GRAPH_SIZE = 10240;
@ -548,4 +548,4 @@ namespace LTXVUpsampler {

 }  // namespace LTXVUpsampler

-#endif  // __SD_LTX_LATENT_UPSCALER_HPP__
+#endif  // __SD_MODEL_UPSCALER_LTX_LATENT_UPSCALER_HPP__
--- a/src/model/vae/auto_encoder_kl.hpp
+++ b/src/model/vae/auto_encoder_kl.hpp
@ -1,7 +1,7 @@
-#ifndef __AUTO_ENCODER_KL_HPP__
-#define __AUTO_ENCODER_KL_HPP__
+#ifndef __SD_MODEL_VAE_AUTO_ENCODER_KL_HPP__
+#define __SD_MODEL_VAE_AUTO_ENCODER_KL_HPP__

-#include "vae.hpp"
+#include "model/vae/vae.hpp"

 /*================================================== AutoEncoderKL ===================================================*/

@ -886,4 +886,4 @@ struct AutoEncoderKL : public VAE {
    };
 };

-#endif  // __AUTO_ENCODER_KL_HPP__
+#endif  // __SD_MODEL_VAE_AUTO_ENCODER_KL_HPP__
--- a/src/model/vae/ltx_audio_vae.hpp
+++ b/src/model/vae/ltx_audio_vae.hpp
@ -1,5 +1,5 @@
-#ifndef __SD_LTX_AUDIO_VAE_H__
-#define __SD_LTX_AUDIO_VAE_H__
+#ifndef __SD_MODEL_VAE_LTX_AUDIO_VAE_HPP__
+#define __SD_MODEL_VAE_LTX_AUDIO_VAE_HPP__

 #include <cmath>
 #include <limits>
@ -7,7 +7,7 @@
 #include <string>
 #include <vector>

-#include "ggml_extend.hpp"
+#include "core/ggml_extend.hpp"

 namespace LTXV {

@ -1095,4 +1095,4 @@ namespace LTXV {

 }  // namespace LTXV

-#endif  // __SD_LTX_AUDIO_VAE_H__
+#endif  // __SD_MODEL_VAE_LTX_AUDIO_VAE_HPP__
--- a/src/model/vae/ltx_vae.hpp
+++ b/src/model/vae/ltx_vae.hpp
@ -1,5 +1,5 @@
-#ifndef __SD_LTX_VAE_HPP__
-#define __SD_LTX_VAE_HPP__
+#ifndef __SD_MODEL_VAE_LTX_VAE_HPP__
+#define __SD_MODEL_VAE_LTX_VAE_HPP__

 #include <algorithm>
 #include <fstream>
@ -9,9 +9,9 @@
 #include <utility>
 #include <vector>

-#include "ltxv.hpp"
-#include "vae.hpp"
-#include "wan_vae.hpp"
+#include "model/diffusion/ltxv.hpp"
+#include "model/vae/vae.hpp"
+#include "model/vae/wan_vae.hpp"

 namespace LTXVAE {

@ -1552,4 +1552,4 @@ struct LTXVideoVAE : public VAE {
    }
 };

-#endif  // __SD_LTX_VAE_HPP__
+#endif  // __SD_MODEL_VAE_LTX_VAE_HPP__
--- a/src/model/vae/tae.hpp
+++ b/src/model/vae/tae.hpp
@ -1,13 +1,13 @@
-#ifndef __TAE_HPP__
-#define __TAE_HPP__
+#ifndef __SD_MODEL_VAE_TAE_HPP__
+#define __SD_MODEL_VAE_TAE_HPP__

-#include "ggml_extend.hpp"
+#include "core/ggml_extend.hpp"
 #include "model.h"

 /*
    ===================================    TinyAutoEncoder  ===================================
    References:
    https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/autoencoders/vae.py
    https://github.com/madebyollin/taesd/blob/main/taesd.py

 */
@ -750,4 +750,4 @@ struct TinyVideoAutoEncoder : public VAE {
    }
 };

-#endif  // __TAE_HPP__
+#endif  // __SD_MODEL_VAE_TAE_HPP__
--- a/src/model/vae/vae.hpp
+++ b/src/model/vae/vae.hpp
@ -1,8 +1,8 @@
-#ifndef __VAE_HPP__
-#define __VAE_HPP__
+#ifndef __SD_MODEL_VAE_VAE_HPP__
+#define __SD_MODEL_VAE_VAE_HPP__

-#include "common_block.hpp"
-#include "tensor_ggml.hpp"
+#include "core/tensor_ggml.hpp"
+#include "model/common/block.hpp"

 struct VAE : public GGMLRunner {
 protected:
@ -258,4 +258,4 @@ struct FakeVAE : public VAE {
    }
 };

-#endif  // __VAE_HPP__
+#endif  // __SD_MODEL_VAE_VAE_HPP__
--- a/src/model/vae/wan_vae.hpp
+++ b/src/model/vae/wan_vae.hpp
@ -1,12 +1,12 @@
-#ifndef __WAN_VAE_HPP__
-#define __WAN_VAE_HPP__
+#ifndef __SD_MODEL_VAE_WAN_VAE_HPP__
+#define __SD_MODEL_VAE_WAN_VAE_HPP__

 #include <map>
 #include <memory>
 #include <utility>

-#include "common_block.hpp"
-#include "vae.hpp"
+#include "model/common/block.hpp"
+#include "model/vae/vae.hpp"

 namespace WAN {

@ -1358,4 +1358,4 @@ namespace WAN {

 }  // namespace WAN

-#endif  // __WAN_VAE_HPP__
+#endif  // __SD_MODEL_VAE_WAN_VAE_HPP__
--- a/src/model_io/gguf_io.cpp
+++ b/src/model_io/gguf_io.cpp
@ -5,9 +5,9 @@
 #include <string>
 #include <vector>

+#include "core/util.h"
 #include "gguf.h"
 #include "gguf_reader_ext.h"
-#include "util.h"

 static void set_error(std::string* error, const std::string& message) {
    if (error != nullptr) {
--- a/src/model_io/gguf_reader_ext.h
+++ b/src/model_io/gguf_reader_ext.h
@ -6,8 +6,8 @@
 #include <string>
 #include <vector>

+#include "core/util.h"
 #include "ggml.h"
-#include "util.h"

 struct GGUFTensorInfo {
    std::string name;
--- a/src/model_io/pickle_io.cpp
+++ b/src/model_io/pickle_io.cpp
@ -8,7 +8,7 @@
 #include <vector>

 #include "binary_io.h"
-#include "util.h"
+#include "core/util.h"

 // $ python -m pickletools sd-v1-4/archive/data.pkl | head -n 100
 //     0: \x80 PROTO      2
--- a/src/model_io/safetensors_io.cpp
+++ b/src/model_io/safetensors_io.cpp
@ -7,8 +7,8 @@
 #include <vector>

 #include "binary_io.h"
+#include "core/util.h"
 #include "json.hpp"
-#include "util.h"

 static constexpr size_t ST_HEADER_SIZE_LEN = 8;

--- a/src/model_io/torch_legacy_io.cpp
+++ b/src/model_io/torch_legacy_io.cpp
@ -7,8 +7,8 @@
 #include <unordered_map>
 #include <vector>

+#include "core/util.h"
 #include "pickle_io.h"
-#include "util.h"

 // torch.save format background:
 //
--- a/src/name_conversion.cpp
+++ b/src/name_conversion.cpp
@ -1,8 +1,8 @@
 #include <unordered_map>
 #include <unordered_set>

+#include "core/util.h"
 #include "name_conversion.h"
-#include "util.h"

 void replace_with_name_map(std::string& name, const std::vector<std::pair<std::string, std::string>>& name_map) {
    for (auto kv : name_map) {
--- a/src/runtime/cache_dit.hpp
+++ b/src/runtime/cache_dit.hpp
@ -1,5 +1,5 @@
-#ifndef __CACHE_DIT_HPP__
-#define __CACHE_DIT_HPP__
+#ifndef __SD_RUNTIME_CACHE_DIT_HPP__
+#define __SD_RUNTIME_CACHE_DIT_HPP__

 #include <algorithm>
 #include <cmath>
@ -8,9 +8,9 @@
 #include <unordered_map>
 #include <vector>

-#include "condition_cache_utils.hpp"
-#include "ggml_extend.hpp"
-#include "tensor.hpp"
+#include "core/ggml_extend.hpp"
+#include "core/tensor.hpp"
+#include "runtime/condition_cache_utils.hpp"

 struct DBCacheConfig {
    bool enabled                        = false;
@ -893,4 +893,4 @@ struct CacheDitConditionState {
    }
 };

-#endif
+#endif  // __SD_RUNTIME_CACHE_DIT_HPP__
--- a/src/runtime/condition_cache_utils.hpp
+++ b/src/runtime/condition_cache_utils.hpp
@ -1,9 +1,9 @@
-#ifndef __CONDITION_CACHE_UTILS_HPP__
-#define __CONDITION_CACHE_UTILS_HPP__
+#ifndef __SD_RUNTIME_CONDITION_CACHE_UTILS_HPP__
+#define __SD_RUNTIME_CONDITION_CACHE_UTILS_HPP__

 #include <vector>

-#include "tensor.hpp"
+#include "core/tensor.hpp"

 namespace sd {

@ -61,4 +61,4 @@ namespace sd {

 }  // namespace sd

-#endif  // __CONDITION_CACHE_UTILS_HPP__
+#endif  // __SD_RUNTIME_CONDITION_CACHE_UTILS_HPP__
--- a/src/runtime/denoiser.hpp
+++ b/src/runtime/denoiser.hpp
@ -1,5 +1,5 @@
-#ifndef __DENOISER_HPP__
-#define __DENOISER_HPP__
+#ifndef __SD_RUNTIME_DENOISER_HPP__
+#define __SD_RUNTIME_DENOISER_HPP__

 #include <algorithm>
 #include <cctype>
@ -8,10 +8,10 @@
 #include <string>
 #include <utility>

-#include "ggml_extend.hpp"
-#include "gits_noise.inl"
-#include "guidance.h"
-#include "tensor.hpp"
+#include "core/ggml_extend.hpp"
+#include "core/tensor.hpp"
+#include "runtime/gits_noise.inl"
+#include "runtime/guidance.h"

 /*================================================= CompVisDenoiser ==================================================*/

@ -1902,4 +1902,4 @@ static sd::Tensor<float> sample_k_diffusion(sample_method_t method,
    }
 }

-#endif  // __DENOISER_HPP__
+#endif  // __SD_RUNTIME_DENOISER_HPP__
--- a/src/runtime/easycache.hpp
+++ b/src/runtime/easycache.hpp
@ -1,15 +1,15 @@
-#ifndef __EASYCACHE_HPP__
-#define __EASYCACHE_HPP__
+#ifndef __SD_RUNTIME_EASYCACHE_HPP__
+#define __SD_RUNTIME_EASYCACHE_HPP__

 #include <cmath>
 #include <limits>
 #include <unordered_map>
 #include <vector>

-#include "condition_cache_utils.hpp"
-#include "denoiser.hpp"
-#include "ggml_extend.hpp"
-#include "tensor.hpp"
+#include "core/ggml_extend.hpp"
+#include "core/tensor.hpp"
+#include "runtime/condition_cache_utils.hpp"
+#include "runtime/denoiser.hpp"

 struct EasyCacheConfig {
    bool enabled          = false;
@ -258,4 +258,4 @@ struct EasyCacheState {
    }
 };

-#endif
+#endif  // __SD_RUNTIME_EASYCACHE_HPP__
--- a/src/runtime/gits_noise.inl
+++ b/src/runtime/gits_noise.inl
--- a/src/runtime/guidance.cpp
+++ b/src/runtime/guidance.cpp
@ -1,4 +1,4 @@
-#include "guidance.h"
+#include "runtime/guidance.h"

 #include <algorithm>
 #include <cmath>
@ -6,7 +6,7 @@
 #include <string>
 #include <utility>

-#include "util.h"
+#include "core/util.h"

 namespace sd::guidance {

--- a/src/runtime/guidance.h
+++ b/src/runtime/guidance.h
@ -1,11 +1,11 @@
-#ifndef __SD_GUIDANCE_H__
-#define __SD_GUIDANCE_H__
+#ifndef __SD_RUNTIME_GUIDANCE_H__
+#define __SD_RUNTIME_GUIDANCE_H__

 #include <cstddef>
 #include <functional>
 #include <vector>

-#include "tensor.hpp"
+#include "core/tensor.hpp"

 namespace sd::guidance {

@ -93,4 +93,4 @@ namespace sd::guidance {

 }  // namespace sd::guidance

-#endif  // __SD_GUIDANCE_H__
+#endif  // __SD_RUNTIME_GUIDANCE_H__
--- a/src/runtime/latent-preview.h
+++ b/src/runtime/latent-preview.h
@ -1,8 +1,8 @@
 #include <algorithm>
 #include <cstddef>
 #include <cstdint>
+#include "core/tensor.hpp"
 #include "ggml.h"
-#include "tensor.hpp"

 const float ltxav_latent_rgb_proj[128][3] = {
    {-0.0293802f, -0.0362516f, -0.0291386f},
--- a/src/runtime/preprocessing.hpp
+++ b/src/runtime/preprocessing.hpp
@ -1,10 +1,10 @@
-#ifndef __PREPROCESSING_HPP__
-#define __PREPROCESSING_HPP__
+#ifndef __SD_RUNTIME_PREPROCESSING_HPP__
+#define __SD_RUNTIME_PREPROCESSING_HPP__

 #include <cmath>
 #include <limits>

-#include "ggml_extend.hpp"
+#include "core/ggml_extend.hpp"

 #define M_PI_ 3.14159265358979323846f

@ -331,4 +331,4 @@ bool preprocess_canny(sd_image_t img, float high_threshold, float low_threshold,
    return true;
 }

-#endif  // __PREPROCESSING_HPP__
+#endif  // __SD_RUNTIME_PREPROCESSING_HPP__
--- a/src/runtime/sample-cache.cpp
+++ b/src/runtime/sample-cache.cpp
@ -1,4 +1,4 @@
-#include "sample-cache.h"
+#include "runtime/sample-cache.h"

 namespace sd_sample {

--- a/src/runtime/sample-cache.h
+++ b/src/runtime/sample-cache.h
@ -1,16 +1,16 @@
-#ifndef __SAMPLE_CACHE_H__
-#define __SAMPLE_CACHE_H__
+#ifndef __SD_RUNTIME_SAMPLE_CACHE_H__
+#define __SD_RUNTIME_SAMPLE_CACHE_H__

 #include <vector>

-#include "cache_dit.hpp"
-#include "denoiser.hpp"
-#include "easycache.hpp"
+#include "core/tensor.hpp"
+#include "core/util.h"
 #include "model.h"
-#include "spectrum.hpp"
-#include "tensor.hpp"
-#include "ucache.hpp"
-#include "util.h"
+#include "runtime/cache_dit.hpp"
+#include "runtime/denoiser.hpp"
+#include "runtime/easycache.hpp"
+#include "runtime/spectrum.hpp"
+#include "runtime/ucache.hpp"

 namespace sd_sample {

@ -58,4 +58,4 @@ namespace sd_sample {

 }  // namespace sd_sample

-#endif  // __SAMPLE_CACHE_H__
+#endif  // __SD_RUNTIME_SAMPLE_CACHE_H__
--- a/src/runtime/spectrum.hpp
+++ b/src/runtime/spectrum.hpp
@ -1,12 +1,12 @@
-#ifndef __SPECTRUM_HPP__
-#define __SPECTRUM_HPP__
+#ifndef __SD_RUNTIME_SPECTRUM_HPP__
+#define __SD_RUNTIME_SPECTRUM_HPP__

 #include <cmath>
 #include <cstring>
 #include <vector>

-#include "ggml_extend.hpp"
-#include "tensor.hpp"
+#include "core/ggml_extend.hpp"
+#include "core/tensor.hpp"

 struct SpectrumConfig {
    float w            = 0.40f;
@ -184,4 +184,4 @@ private:
    }
 };

-#endif  // __SPECTRUM_HPP__
+#endif  // __SD_RUNTIME_SPECTRUM_HPP__
--- a/src/runtime/ucache.hpp
+++ b/src/runtime/ucache.hpp
@ -1,15 +1,15 @@
-#ifndef __UCACHE_HPP__
-#define __UCACHE_HPP__
+#ifndef __SD_RUNTIME_UCACHE_HPP__
+#define __SD_RUNTIME_UCACHE_HPP__

 #include <cmath>
 #include <limits>
 #include <unordered_map>
 #include <vector>

-#include "condition_cache_utils.hpp"
-#include "denoiser.hpp"
-#include "ggml_extend.hpp"
-#include "tensor.hpp"
+#include "core/ggml_extend.hpp"
+#include "core/tensor.hpp"
+#include "runtime/condition_cache_utils.hpp"
+#include "runtime/denoiser.hpp"

 struct UCacheConfig {
    bool enabled                = false;
@ -420,4 +420,4 @@ struct UCacheState {
    }
 };

-#endif  // __UCACHE_HPP__
+#endif  // __SD_RUNTIME_UCACHE_HPP__
--- a/src/stable-diffusion.cpp
+++ b/src/stable-diffusion.cpp
@ -2,49 +2,49 @@
 #include <cmath>
 #include <cstdlib>

-#include "ggml_extend.hpp"
-#include "ggml_graph_cut.h"
+#include "core/ggml_extend.hpp"
+#include "core/ggml_graph_cut.h"

+#include "core/rng.hpp"
+#include "core/rng_mt19937.hpp"
+#include "core/rng_philox.hpp"
+#include "core/util.h"
 #include "model.h"
-#include "rng.hpp"
-#include "rng_mt19937.hpp"
-#include "rng_philox.hpp"
 #include "stable-diffusion.h"
-#include "util.h"

-#include "anima.hpp"
-#include "auto_encoder_kl.hpp"
-#include "conditioner.hpp"
-#include "control.hpp"
-#include "denoiser.hpp"
-#include "diffusion_model.hpp"
-#include "ernie_image.hpp"
-#include "esrgan.hpp"
-#include "flux.hpp"
-#include "guidance.h"
-#include "hidream_o1.hpp"
-#include "ideogram4.hpp"
-#include "lens.hpp"
-#include "lora.hpp"
-#include "ltx_audio_vae.h"
-#include "ltx_latent_upscaler.hpp"
-#include "ltx_vae.hpp"
-#include "ltxv.hpp"
-#include "mmdit.hpp"
-#include "pid.hpp"
-#include "pmid.hpp"
-#include "qwen_image.hpp"
-#include "sample-cache.h"
-#include "tae.hpp"
-#include "unet.hpp"
+#include "conditioning/conditioner.hpp"
+#include "model/adapter/lora.hpp"
+#include "model/adapter/pmid.hpp"
+#include "model/diffusion/anima.hpp"
+#include "model/diffusion/control.hpp"
+#include "model/diffusion/ernie_image.hpp"
+#include "model/diffusion/flux.hpp"
+#include "model/diffusion/hidream_o1.hpp"
+#include "model/diffusion/ideogram4.hpp"
+#include "model/diffusion/lens.hpp"
+#include "model/diffusion/ltxv.hpp"
+#include "model/diffusion/mmdit.hpp"
+#include "model/diffusion/model.hpp"
+#include "model/diffusion/pid.hpp"
+#include "model/diffusion/qwen_image.hpp"
+#include "model/diffusion/unet.hpp"
+#include "model/diffusion/wan.hpp"
+#include "model/diffusion/z_image.hpp"
+#include "model/upscaler/esrgan.hpp"
+#include "model/upscaler/ltx_latent_upscaler.hpp"
+#include "model/vae/auto_encoder_kl.hpp"
+#include "model/vae/ltx_audio_vae.hpp"
+#include "model/vae/ltx_vae.hpp"
+#include "model/vae/tae.hpp"
+#include "model/vae/vae.hpp"
+#include "model/vae/wan_vae.hpp"
+#include "runtime/denoiser.hpp"
+#include "runtime/guidance.h"
+#include "runtime/sample-cache.h"
 #include "upscaler.h"
-#include "vae.hpp"
-#include "wan.hpp"
-#include "wan_vae.hpp"
-#include "z_image.hpp"

-#include "latent-preview.h"
 #include "name_conversion.h"
+#include "runtime/latent-preview.h"

 const char* sd_vae_format_name(enum sd_vae_format_t format);
 static SDVersion sd_vae_format_to_version(enum sd_vae_format_t format, SDVersion fallback);
--- a/src/tokenizers/bpe_tokenizer.cpp
+++ b/src/tokenizers/bpe_tokenizer.cpp
@ -3,8 +3,8 @@
 #include <algorithm>
 #include <sstream>

+#include "core/util.h"
 #include "tokenize_util.h"
-#include "util.h"

 std::vector<std::pair<int, std::u32string>> BPETokenizer::bytes_to_unicode() {
    std::vector<std::pair<int, std::u32string>> byte_unicode_pairs;
--- a/src/tokenizers/clip_tokenizer.cpp
+++ b/src/tokenizers/clip_tokenizer.cpp
@ -6,9 +6,9 @@
 #include <regex>
 #include <set>

+#include "core/util.h"
 #include "ggml.h"
 #include "tokenize_util.h"
-#include "util.h"
 #include "vocab/vocab.h"

 CLIPTokenizer::CLIPTokenizer(int pad_token_id, const std::string& merges_utf8_str) {
--- a/src/tokenizers/gemma_tokenizer.cpp
+++ b/src/tokenizers/gemma_tokenizer.cpp
@ -1,8 +1,8 @@
 #include "gemma_tokenizer.h"

+#include "core/util.h"
 #include "ggml.h"
 #include "json.hpp"
-#include "util.h"
 #include "vocab/vocab.h"

 std::string GemmaTokenizer::normalize(const std::string& text) const {
--- a/src/tokenizers/gpt_oss_tokenizer.cpp
+++ b/src/tokenizers/gpt_oss_tokenizer.cpp
@ -1,7 +1,7 @@
 #include "gpt_oss_tokenizer.h"

+#include "core/util.h"
 #include "json.hpp"
-#include "util.h"
 #include "vocab/vocab.h"

 void GPTOSSTokenizer::load_from_merges(const std::string& merges_utf8_str, const std::string& vocab_utf8_str) {
--- a/src/tokenizers/mistral_tokenizer.cpp
+++ b/src/tokenizers/mistral_tokenizer.cpp
@ -1,8 +1,8 @@
 #include "mistral_tokenizer.h"

+#include "core/util.h"
 #include "ggml.h"
 #include "json.hpp"
-#include "util.h"
 #include "vocab/vocab.h"

 void MistralTokenizer::load_from_merges(const std::string& merges_utf8_str, const std::string& vocab_utf8_str) {
--- a/src/tokenizers/qwen2_tokenizer.cpp
+++ b/src/tokenizers/qwen2_tokenizer.cpp
@ -1,6 +1,6 @@
 #include "qwen2_tokenizer.h"

-#include "util.h"
+#include "core/util.h"
 #include "vocab/vocab.h"

 void Qwen2Tokenizer::load_from_merges(const std::string& merges_utf8_str) {
--- a/src/tokenizers/t5_unigram_tokenizer.cpp
+++ b/src/tokenizers/t5_unigram_tokenizer.cpp
@ -6,9 +6,9 @@
 #include <regex>
 #include <sstream>

+#include "core/util.h"
 #include "json.hpp"
 #include "tokenize_util.h"
-#include "util.h"
 #include "vocab/vocab.h"

 // Port from: https://github.com/google/sentencepiece/blob/master/src/unigram_model.h
--- a/src/tokenizers/tokenizer.cpp
+++ b/src/tokenizers/tokenizer.cpp
@ -4,7 +4,7 @@
 #include <cmath>
 #include <regex>

-#include "util.h"
+#include "core/util.h"

 void Tokenizer::add_special_token(const std::string& token) {
    special_tokens.push_back(token);
--- a/src/upscaler.cpp
+++ b/src/upscaler.cpp
@ -1,8 +1,8 @@
 #include "upscaler.h"
-#include "ggml_extend.hpp"
+#include "core/ggml_extend.hpp"
+#include "core/util.h"
 #include "model.h"
 #include "stable-diffusion.h"
-#include "util.h"

 #include <utility>

--- a/src/upscaler.h
+++ b/src/upscaler.h
@ -1,10 +1,10 @@
 #ifndef __SD_UPSCALER_H__
 #define __SD_UPSCALER_H__

-#include "esrgan.hpp"
-#include "ggml_extend_backend.h"
+#include "core/ggml_extend_backend.h"
+#include "core/tensor.hpp"
+#include "model/upscaler/esrgan.hpp"
 #include "stable-diffusion.h"
-#include "tensor.hpp"

 #include <memory>
 #include <string>