Compare commits

...

3 Commits

Author SHA1 Message Date
leejet
f3fd359b58
refactor: reorganize src model layout (#1615) 2026-06-07 03:21:12 +08:00
leejet
dfb2390dd4
refactor: extract Wan VAE implementation (#1614) 2026-06-07 01:33:49 +08:00
leejet
cfbc19d186
refactor: unify model config detection (#1613) 2026-06-07 01:05:12 +08:00
83 changed files with 3409 additions and 2884 deletions

View File

@ -210,6 +210,19 @@ file(GLOB SD_LIB_SOURCES CONFIGURE_DEPENDS
"src/*.h"
"src/*.cpp"
"src/*.hpp"
"src/conditioning/*.h"
"src/conditioning/*.cpp"
"src/conditioning/*.hpp"
"src/core/*.h"
"src/core/*.cpp"
"src/core/*.hpp"
"src/model/*/*.h"
"src/model/*/*.cpp"
"src/model/*/*.hpp"
"src/runtime/*.h"
"src/runtime/*.cpp"
"src/runtime/*.hpp"
"src/runtime/*.inl"
"src/model_io/*.h"
"src/model_io/*.cpp"
"src/tokenizers/*.h"
@ -312,6 +325,7 @@ add_subdirectory(thirdparty)
target_link_libraries(${SD_LIB} PUBLIC ggml zip)
target_include_directories(${SD_LIB} PUBLIC . src include)
target_include_directories(${SD_LIB} PRIVATE src/core)
target_include_directories(${SD_LIB} PUBLIC . thirdparty)
target_compile_features(${SD_LIB} PUBLIC c_std_11 cxx_std_17)

View File

@ -44,6 +44,8 @@ Naming conventions:
Some older code in the project may not fully follow the current conventions. Please do not submit PRs that only rewrite existing code to match style rules.
When adding or modifying model implementations, follow the model config and weight detection conventions in [docs/model_config.md](docs/model_config.md).
## AI-Assisted Contributions
AI tools may be used to assist development, but contributors are responsible for the quality and correctness of the submitted code.

118
docs/model_config.md Normal file
View File

@ -0,0 +1,118 @@
# Model Configuration Conventions
This document describes the conventions for model configuration structs and
weight-based configuration detection.
## Config Types
Model configuration should live in a model-specific `*Config` struct.
Examples:
- `ZImageConfig`
- `UNetConfig`
- `MMDiTConfig`
- `LLMConfig`
Preserve established acronym casing in type names, such as `UNet`, `MMDiT`,
`LLM`, `VAE`, and `T5`.
Place the config struct near the top of the model header, before the main model
blocks and runner types that consume it.
## Config Variables
Variables and members that hold a config should be named `config`.
Examples:
```cpp
UNetConfig config;
UnetModelBlock unet;
MMDiTRunner(...)
: DiffusionModelRunner(backend, params_backend, prefix),
config(MMDiTConfig::detect_from_weights(tensor_storage_map, prefix)),
mmdit(config) {
}
```
Avoid alternate names such as `params`, `params_cfg`, `model_params`, or
model-specific aliases unless an existing public API requires them.
## Weight Detection
If a model can derive configuration from loaded weight metadata, expose that
logic as a static method on the config type:
```cpp
static XxxConfig detect_from_weights(const String2TensorStorage& tensor_storage_map,
const std::string& prefix);
```
Additional selector arguments are allowed when required by an existing model
family, for example `SDVersion version` or an architecture enum:
```cpp
static UNetConfig detect_from_weights(const String2TensorStorage& tensor_storage_map,
const std::string& prefix,
SDVersion version = VERSION_SD1);
```
Use `TensorStorage` metadata, especially `n_dims` and `ne`, to infer shapes.
Do not load or parse tensor data for config detection.
Detection should respect `prefix`. For nested weights, construct full names from
`prefix + "." + suffix` or filter entries with `starts_with(name, prefix)`.
Do not add persistent config fields such as `inferred_from_weights` only to
record whether detection happened. If the function needs to decide whether to
print a debug line, keep that as local control flow inside `detect_from_weights`.
## Logging
When config values are inferred from weights, print one `LOG_DEBUG` line at the
end of `detect_from_weights`.
Example:
```cpp
LOG_DEBUG("llm: num_layers = %" PRId64 ", vocab_size = %" PRId64 ", hidden_size = %" PRId64 ", intermediate_size = %" PRId64,
config.num_layers,
config.vocab_size,
config.hidden_size,
config.intermediate_size);
```
Only print the config detection log when the function actually inferred values
from weights. Do not duplicate the same config summary in runner constructors or
model loading code.
Use the correct format specifiers for field types, such as `%" PRId64 "` for
`int64_t` and `%d` for `int`.
## Runner And Model Responsibilities
Runners should detect the config once and pass it into the model block:
```cpp
struct XxxRunner : public DiffusionModelRunner {
XxxConfig config;
XxxModel model;
XxxRunner(..., const String2TensorStorage& tensor_storage_map, const std::string prefix)
: DiffusionModelRunner(backend, params_backend, prefix),
config(XxxConfig::detect_from_weights(tensor_storage_map, prefix)),
model(config) {
model.init(params_ctx, tensor_storage_map, prefix);
}
};
```
Model blocks should consume `config` directly instead of re-scanning weights in
their constructors. Keep config-derived behavior centralized in the config
struct.
If a model has no weight-derived config today, it may still provide
`detect_from_weights` for API consistency, but it should not print a config
detection log unless it actually derives values from weights.

View File

@ -1,10 +1,16 @@
for f in src/*.cpp src/*.h src/*.hpp src/tokenizers/*.h src/tokenizers/*.cpp src/tokenizers/vocab/*.h src/tokenizers/vocab/*.cpp \
for f in src/*.cpp src/*.h src/*.hpp \
src/conditioning/*.cpp src/conditioning/*.h src/conditioning/*.hpp \
src/core/*.cpp src/core/*.h src/core/*.hpp \
src/runtime/*.cpp src/runtime/*.h src/runtime/*.hpp \
src/model/*/*.cpp src/model/*/*.h src/model/*/*.hpp \
src/tokenizers/*.h src/tokenizers/*.cpp src/tokenizers/vocab/*.h src/tokenizers/vocab/*.cpp \
src/model_io/*.h src/model_io/*.cpp examples/cli/*.cpp examples/cli/*.h examples/server/*.cpp \
examples/common/*.hpp examples/common/*.h examples/common/*.cpp; do
[[ -e "$f" ]] || continue
[[ "$f" == vocab* ]] && continue
echo "formatting '$f'"
# if [ "$f" != "stable-diffusion.h" ]; then
# clang-tidy -fix -p build_linux/ "$f"
# fi
clang-format -style=file -i "$f"
done
done

View File

@ -1,14 +1,14 @@
#ifndef __CONDITIONER_HPP__
#define __CONDITIONER_HPP__
#ifndef __SD_CONDITIONING_CONDITIONER_HPP__
#define __SD_CONDITIONING_CONDITIONER_HPP__
#include <cmath>
#include <limits>
#include <optional>
#include "clip.hpp"
#include "llm.hpp"
#include "t5.hpp"
#include "tensor_ggml.hpp"
#include "core/tensor_ggml.hpp"
#include "model/te/clip.hpp"
#include "model/te/llm.hpp"
#include "model/te/t5.hpp"
struct SDCondition {
sd::Tensor<float> c_crossattn;
@ -1971,7 +1971,7 @@ struct LLMEmbedder : public Conditioner {
for (int i = 0; i < conditioner_params.ref_images->size(); i++) {
const auto& image = (*conditioner_params.ref_images)[i];
double factor = llm->params.vision.patch_size * llm->params.vision.spatial_merge_size;
double factor = llm->config.vision.patch_size * llm->config.vision.spatial_merge_size;
int height = static_cast<int>(image.shape()[1]);
int width = static_cast<int>(image.shape()[0]);
int h_bar = static_cast<int>(std::round(height / factor) * factor);
@ -2042,7 +2042,7 @@ struct LLMEmbedder : public Conditioner {
for (int i = 0; i < conditioner_params.ref_images->size(); i++) {
const auto& image = (*conditioner_params.ref_images)[i];
double factor = llm->params.vision.patch_size * llm->params.vision.spatial_merge_size;
double factor = llm->config.vision.patch_size * llm->config.vision.spatial_merge_size;
int height = static_cast<int>(image.shape()[1]);
int width = static_cast<int>(image.shape()[0]);
int h_bar = static_cast<int>(std::round(height / factor) * factor);
@ -2554,4 +2554,4 @@ struct LTXAVEmbedder : public Conditioner {
}
};
#endif
#endif // __SD_CONDITIONING_CONDITIONER_HPP__

View File

@ -1,5 +1,5 @@
#ifndef __GGML_EXTEND_HPP__
#define __GGML_EXTEND_HPP__
#ifndef __SD_CORE_GGML_EXTEND_HPP__
#define __SD_CORE_GGML_EXTEND_HPP__
#include <assert.h>
#include <inttypes.h>
@ -23,19 +23,19 @@
#include <unordered_map>
#include <vector>
#include "core/ggml_extend_backend.h"
#include "core/ggml_graph_cut.h"
#include "core/layer_registry.h"
#include "ggml-alloc.h"
#include "ggml-backend.h"
#include "ggml.h"
#include "ggml_extend_backend.h"
#include "ggml_graph_cut.h"
#include "layer_registry.h"
#include "core/tensor.hpp"
#include "model.h"
#include "tensor.hpp"
#include "rng.hpp"
#include "tensor_ggml.hpp"
#include "util.h"
#include "core/rng.hpp"
#include "core/tensor_ggml.hpp"
#include "core/util.h"
#define EPS 1e-05f
@ -1707,7 +1707,7 @@ protected:
uint64_t resident_state_token = 0;
size_t max_graph_vram_bytes = 0;
bool stream_layers_enabled = false;
bool stream_layers_enabled = false;
size_t observed_max_effective_budget_ = 0;
sd::layer_registry::LayerRegistry layer_registry_;
@ -4145,4 +4145,4 @@ __STATIC_INLINE__ ggml_tensor* ggml_ext_lokr_forward(
}
}
#endif // __GGML_EXTEND__HPP__
#endif // __SD_CORE_GGML_EXTEND_HPP__

View File

@ -1,4 +1,4 @@
#include "ggml_extend_backend.h"
#include "core/ggml_extend_backend.h"
#include <algorithm>
#include <cctype>
@ -8,8 +8,8 @@
#include <stdexcept>
#include <vector>
#include "core/util.h"
#include "stable-diffusion.h"
#include "util.h"
static std::string trim_copy(const std::string& value) {
size_t begin = 0;

View File

@ -1,5 +1,5 @@
#ifndef __SD_GGML_EXTEND_BACKEND_H__
#define __SD_GGML_EXTEND_BACKEND_H__
#ifndef __SD_CORE_GGML_EXTEND_BACKEND_H__
#define __SD_CORE_GGML_EXTEND_BACKEND_H__
#include <cstdint>
#include <cstring>
@ -76,4 +76,4 @@ ggml_backend_t sd_backend_cpu_init();
bool sd_backend_cpu_set_n_threads(ggml_backend_t backend_cpu, int n_threads);
const char* sd_backend_module_name(SDBackendModule module);
void ggml_ext_im_set_f32_1d(const struct ggml_tensor* tensor, int i, float value);
#endif
#endif // __SD_CORE_GGML_EXTEND_BACKEND_H__

View File

@ -1,4 +1,4 @@
#include "ggml_graph_cut.h"
#include "core/ggml_graph_cut.h"
#include <algorithm>
#include <cstring>
@ -8,11 +8,11 @@
#include <stack>
#include <unordered_map>
#include "core/util.h"
#include "ggml-alloc.h"
#include "ggml-backend.h"
#include "util.h"
#include "../ggml/src/ggml-impl.h"
#include "ggml/src/ggml-impl.h"
namespace sd::ggml_graph_cut {

View File

@ -1,5 +1,5 @@
#ifndef __SD_GGML_GRAPH_CUT_H__
#define __SD_GGML_GRAPH_CUT_H__
#ifndef __SD_CORE_GGML_GRAPH_CUT_H__
#define __SD_CORE_GGML_GRAPH_CUT_H__
#include <array>
#include <cstdint>
@ -114,4 +114,4 @@ namespace sd::ggml_graph_cut {
void annotate_residency(Plan& plan, size_t max_graph_vram_bytes);
} // namespace sd::ggml_graph_cut
#endif
#endif // __SD_CORE_GGML_GRAPH_CUT_H__

View File

@ -1,8 +1,8 @@
#include "layer_registry.h"
#include "core/layer_registry.h"
#include <utility>
#include "util.h"
#include "core/util.h"
namespace sd::layer_registry {

View File

@ -1,5 +1,5 @@
#ifndef __LAYER_REGISTRY_H__
#define __LAYER_REGISTRY_H__
#ifndef __SD_CORE_LAYER_REGISTRY_H__
#define __SD_CORE_LAYER_REGISTRY_H__
#include <map>
#include <set>
@ -47,4 +47,4 @@ namespace sd::layer_registry {
} // namespace sd::layer_registry
#endif
#endif // __SD_CORE_LAYER_REGISTRY_H__

View File

@ -1,5 +1,5 @@
#ifndef __ORDERED_MAP_HPP__
#define __ORDERED_MAP_HPP__
#ifndef __SD_CORE_ORDERED_MAP_HPP__
#define __SD_CORE_ORDERED_MAP_HPP__
#include <iostream>
#include <list>
@ -174,4 +174,4 @@ public:
}
};
#endif // __ORDERED_MAP_HPP__
#endif // __SD_CORE_ORDERED_MAP_HPP__

View File

@ -1,5 +1,5 @@
#ifndef __RNG_H__
#define __RNG_H__
#ifndef __SD_CORE_RNG_HPP__
#define __SD_CORE_RNG_HPP__
#include <random>
#include <vector>
@ -32,4 +32,4 @@ public:
}
};
#endif // __RNG_H__
#endif // __SD_CORE_RNG_HPP__

View File

@ -1,10 +1,10 @@
#ifndef __RNG_MT19937_HPP__
#define __RNG_MT19937_HPP__
#ifndef __SD_CORE_RNG_MT19937_HPP__
#define __SD_CORE_RNG_MT19937_HPP__
#include <cmath>
#include <vector>
#include "rng.hpp"
#include "core/rng.hpp"
// RNG imitiating torch cpu randn on CPU.
// Port from pytorch, original license: https://github.com/pytorch/pytorch/blob/d01a7b0241ed1c4cded7e7ca097249feb343f072/LICENSE
@ -144,4 +144,4 @@ public:
}
};
#endif // __RNG_MT19937_HPP__
#endif // __SD_CORE_RNG_MT19937_HPP__

View File

@ -1,10 +1,10 @@
#ifndef __RNG_PHILOX_H__
#define __RNG_PHILOX_H__
#ifndef __SD_CORE_RNG_PHILOX_HPP__
#define __SD_CORE_RNG_PHILOX_HPP__
#include <cmath>
#include <vector>
#include "rng.hpp"
#include "core/rng.hpp"
// RNG imitiating torch cuda randn on CPU.
// Port from: https://github.com/AUTOMATIC1111/stable-diffusion-webui/blob/5ef669de080814067961f28357256e8fe27544f4/modules/rng_philox.py
@ -122,4 +122,4 @@ public:
}
};
#endif // __RNG_PHILOX_H__
#endif // __SD_CORE_RNG_PHILOX_HPP__

View File

@ -1,5 +1,5 @@
#ifndef __SD_TENSOR_HPP__
#define __SD_TENSOR_HPP__
#ifndef __SD_CORE_TENSOR_HPP__
#define __SD_CORE_TENSOR_HPP__
#include <algorithm>
#include <cmath>
@ -16,7 +16,7 @@
#include <utility>
#include <vector>
#include "rng.hpp"
#include "core/rng.hpp"
namespace sd {
@ -1661,4 +1661,4 @@ namespace sd {
} // namespace sd
#endif
#endif // __SD_CORE_TENSOR_HPP__

View File

@ -1,5 +1,5 @@
#ifndef __SD_TENSOR_GGML_HPP__
#define __SD_TENSOR_GGML_HPP__
#ifndef __SD_CORE_TENSOR_GGML_HPP__
#define __SD_CORE_TENSOR_GGML_HPP__
#include <array>
#include <cstring>
@ -8,8 +8,8 @@
#include <string>
#include <type_traits>
#include "core/tensor.hpp"
#include "ggml.h"
#include "tensor.hpp"
namespace sd {
@ -124,4 +124,4 @@ namespace sd {
} // namespace sd
#endif
#endif // __SD_CORE_TENSOR_GGML_HPP__

View File

@ -1,4 +1,4 @@
#include "util.h"
#include "core/util.h"
#include <algorithm>
#include <cctype>
#include <cmath>
@ -13,7 +13,7 @@
#include <thread>
#include <unordered_set>
#include <vector>
#include "preprocessing.hpp"
#include "runtime/preprocessing.hpp"
#if defined(__APPLE__) && defined(__MACH__)
#include <sys/sysctl.h>

View File

@ -1,5 +1,5 @@
#ifndef __UTIL_H__
#define __UTIL_H__
#ifndef __SD_CORE_UTIL_H__
#define __SD_CORE_UTIL_H__
#include <cstdint>
#include <memory>
@ -7,9 +7,9 @@
#include <utility>
#include <vector>
#include "core/tensor.hpp"
#include "ggml-backend.h"
#include "stable-diffusion.h"
#include "tensor.hpp"
#define SAFE_STR(s) ((s) ? (s) : "")
#define BOOL_STR(b) ((b) ? "true" : "false")
@ -103,4 +103,4 @@ bool sd_backend_is(ggml_backend_t backend, const std::string& name);
#define LOG_INFO(format, ...) log_printf(SD_LOG_INFO, __FILE__, __LINE__, format, ##__VA_ARGS__)
#define LOG_WARN(format, ...) log_printf(SD_LOG_WARN, __FILE__, __LINE__, format, ##__VA_ARGS__)
#define LOG_ERROR(format, ...) log_printf(SD_LOG_ERROR, __FILE__, __LINE__, format, ##__VA_ARGS__)
#endif // __UTIL_H__
#endif // __SD_CORE_UTIL_H__

View File

@ -13,18 +13,18 @@
#include <unordered_map>
#include <vector>
#include "core/util.h"
#include "model.h"
#include "model_io/gguf_io.h"
#include "model_io/safetensors_io.h"
#include "model_io/torch_legacy_io.h"
#include "model_io/torch_zip_io.h"
#include "stable-diffusion.h"
#include "util.h"
#include "core/ggml_extend_backend.h"
#include "ggml-alloc.h"
#include "ggml-backend.h"
#include "ggml.h"
#include "ggml_extend_backend.h"
#include "zip.h"
#include "name_conversion.h"

View File

@ -8,10 +8,10 @@
#include <string>
#include <vector>
#include "core/ordered_map.hpp"
#include "ggml-backend.h"
#include "ggml.h"
#include "model_io/tensor_storage.h"
#include "ordered_map.hpp"
enum SDVersion {
VERSION_SD1,

View File

@ -1,8 +1,8 @@
#ifndef __LORA_HPP__
#define __LORA_HPP__
#ifndef __SD_MODEL_ADAPTER_LORA_HPP__
#define __SD_MODEL_ADAPTER_LORA_HPP__
#include <mutex>
#include "ggml_extend.hpp"
#include "core/ggml_extend.hpp"
#define LORA_GRAPH_BASE_SIZE 10240
@ -914,4 +914,4 @@ public:
}
};
#endif // __LORA_HPP__
#endif // __SD_MODEL_ADAPTER_LORA_HPP__

View File

@ -1,10 +1,11 @@
#ifndef __PMI_HPP__
#define __PMI_HPP__
#ifndef __SD_MODEL_ADAPTER_PMID_HPP__
#define __SD_MODEL_ADAPTER_PMID_HPP__
#include "ggml_extend.hpp"
#include "core/ggml_extend.hpp"
#include "clip.hpp"
#include "lora.hpp"
#include "model/adapter/lora.hpp"
#include "model/common/block.hpp"
#include "model/te/clip.hpp"
struct FuseBlock : public GGMLBlock {
// network hparams
@ -636,4 +637,4 @@ struct PhotoMakerIDEmbed : public GGMLRunner {
}
};
#endif // __PMI_HPP__
#endif // __SD_MODEL_ADAPTER_PMID_HPP__

View File

@ -1,9 +1,9 @@
#ifndef __COMMON_BLOCK_HPP__
#define __COMMON_BLOCK_HPP__
#ifndef __SD_MODEL_COMMON_BLOCK_HPP__
#define __SD_MODEL_COMMON_BLOCK_HPP__
#include "core/ggml_extend.hpp"
#include "core/util.h"
#include "ggml-backend.h"
#include "ggml_extend.hpp"
#include "util.h"
class DownSampleBlock : public GGMLBlock {
protected:
@ -227,6 +227,37 @@ public:
}
};
struct Mlp : public GGMLBlock {
public:
Mlp(int64_t in_features,
int64_t hidden_features = -1,
int64_t out_features = -1,
bool bias = true) {
// act_layer is always lambda: nn.GELU(approximate="tanh")
// norm_layer is always None
// use_conv is always False
if (hidden_features == -1) {
hidden_features = in_features;
}
if (out_features == -1) {
out_features = in_features;
}
blocks["fc1"] = std::shared_ptr<GGMLBlock>(new Linear(in_features, hidden_features, bias));
blocks["fc2"] = std::shared_ptr<GGMLBlock>(new Linear(hidden_features, out_features, bias));
}
ggml_tensor* forward(GGMLRunnerContext* ctx, ggml_tensor* x) {
// x: [N, n_token, in_features]
auto fc1 = std::dynamic_pointer_cast<Linear>(blocks["fc1"]);
auto fc2 = std::dynamic_pointer_cast<Linear>(blocks["fc2"]);
x = fc1->forward(ctx, x);
x = ggml_ext_gelu(ctx->ggml_ctx, x, true);
x = fc2->forward(ctx, x);
return x;
}
};
class FeedForward : public GGMLBlock {
public:
enum class Activation {
@ -603,4 +634,4 @@ public:
}
};
#endif // __COMMON_BLOCK_HPP__
#endif // __SD_MODEL_COMMON_BLOCK_HPP__

View File

@ -1,10 +1,10 @@
#ifndef __ROPE_HPP__
#define __ROPE_HPP__
#ifndef __SD_MODEL_COMMON_ROPE_HPP__
#define __SD_MODEL_COMMON_ROPE_HPP__
#include <algorithm>
#include <cmath>
#include <vector>
#include "ggml_extend.hpp"
#include "core/ggml_extend.hpp"
namespace Rope {
enum class EmbedNDLayout {
@ -902,4 +902,4 @@ namespace Rope {
}
}; // namespace Rope
#endif // __ROPE_HPP__
#endif // __SD_MODEL_COMMON_ROPE_HPP__

View File

@ -1,19 +1,61 @@
#ifndef __ANIMA_HPP__
#define __ANIMA_HPP__
#ifndef __SD_MODEL_DIFFUSION_ANIMA_HPP__
#define __SD_MODEL_DIFFUSION_ANIMA_HPP__
#include <algorithm>
#include <cmath>
#include <memory>
#include <utility>
#include <vector>
#include "common_block.hpp"
#include "diffusion_model.hpp"
#include "flux.hpp"
#include "rope.hpp"
#include "model/common/block.hpp"
#include "model/common/rope.hpp"
#include "model/diffusion/flux.hpp"
#include "model/diffusion/model.hpp"
namespace Anima {
constexpr int ANIMA_GRAPH_SIZE = 65536;
struct AnimaConfig {
int64_t in_channels = 16;
int64_t out_channels = 16;
int64_t hidden_size = 2048;
int64_t text_embed_dim = 1024;
int64_t num_heads = 16;
int64_t head_dim = 128;
int patch_size = 2;
int64_t num_layers = 28;
std::vector<int> axes_dim = {44, 42, 42};
int theta = 10000;
static AnimaConfig detect_from_weights(const String2TensorStorage& tensor_storage_map, const std::string& prefix) {
AnimaConfig config;
int64_t detected_layers = 0;
std::string layer_tag = prefix.empty() ? "blocks." : prefix + ".blocks.";
for (const auto& [name, _] : tensor_storage_map) {
size_t pos = name.find(layer_tag);
if (pos == std::string::npos) {
continue;
}
size_t start = pos + layer_tag.size();
size_t end = name.find('.', start);
if (end == std::string::npos) {
continue;
}
int64_t layer_id = atoll(name.substr(start, end - start).c_str());
detected_layers = std::max(detected_layers, layer_id + 1);
}
if (detected_layers > 0) {
config.num_layers = detected_layers;
LOG_DEBUG("anima: num_layers = %" PRId64 ", hidden_size = %" PRId64 ", num_heads = %" PRId64 ", head_dim = %" PRId64,
config.num_layers,
config.hidden_size,
config.num_heads,
config.head_dim);
}
return config;
}
};
__STATIC_INLINE__ ggml_tensor* apply_gate(ggml_context* ctx,
ggml_tensor* x,
ggml_tensor* gate) {
@ -418,31 +460,22 @@ namespace Anima {
struct AnimaNet : public GGMLBlock {
public:
int64_t in_channels = 16;
int64_t out_channels = 16;
int64_t hidden_size = 2048;
int64_t text_embed_dim = 1024;
int64_t num_heads = 16;
int64_t head_dim = 128;
int patch_size = 2;
int64_t num_layers = 28;
std::vector<int> axes_dim = {44, 42, 42};
int theta = 10000;
AnimaConfig config;
public:
AnimaNet() = default;
explicit AnimaNet(int64_t num_layers)
: num_layers(num_layers) {
blocks["x_embedder"] = std::make_shared<XEmbedder>((in_channels + 1) * patch_size * patch_size, hidden_size);
blocks["t_embedder"] = std::make_shared<TimestepEmbedder>(hidden_size, hidden_size * 3);
blocks["t_embedding_norm"] = std::make_shared<RMSNorm>(hidden_size, 1e-6f);
for (int i = 0; i < num_layers; i++) {
blocks["blocks." + std::to_string(i)] = std::make_shared<TransformerBlock>(hidden_size,
text_embed_dim,
num_heads,
head_dim);
explicit AnimaNet(AnimaConfig config)
: config(config) {
blocks["x_embedder"] = std::make_shared<XEmbedder>((config.in_channels + 1) * config.patch_size * config.patch_size, config.hidden_size);
blocks["t_embedder"] = std::make_shared<TimestepEmbedder>(config.hidden_size, config.hidden_size * 3);
blocks["t_embedding_norm"] = std::make_shared<RMSNorm>(config.hidden_size, 1e-6f);
for (int i = 0; i < config.num_layers; i++) {
blocks["blocks." + std::to_string(i)] = std::make_shared<TransformerBlock>(config.hidden_size,
config.text_embed_dim,
config.num_heads,
config.head_dim);
}
blocks["final_layer"] = std::make_shared<FinalLayer>(hidden_size, patch_size, out_channels);
blocks["final_layer"] = std::make_shared<FinalLayer>(config.hidden_size, config.patch_size, config.out_channels);
blocks["llm_adapter"] = std::make_shared<LLMAdapter>(1024, 1024, 1024, 6, 16);
}
@ -469,11 +502,11 @@ namespace Anima {
auto padding_mask = ggml_ext_zeros(ctx->ggml_ctx, x->ne[0], x->ne[1], 1, x->ne[3]);
x = ggml_concat(ctx->ggml_ctx, x, padding_mask, 2); // [N, C + 1, H, W]
x = DiT::pad_and_patchify(ctx, x, patch_size, patch_size); // [N, h*w, (C+1)*ph*pw]
x = DiT::pad_and_patchify(ctx, x, config.patch_size, config.patch_size); // [N, h*w, (C+1)*ph*pw]
x = x_embedder->forward(ctx, x);
auto timestep_proj = ggml_ext_timestep_embedding(ctx->ggml_ctx, timestep, static_cast<int>(hidden_size));
auto timestep_proj = ggml_ext_timestep_embedding(ctx->ggml_ctx, timestep, static_cast<int>(config.hidden_size));
auto temb = t_embedder->forward(ctx, timestep_proj);
auto embedded_timestep = t_embedding_norm->forward(ctx, timestep_proj);
@ -505,7 +538,7 @@ namespace Anima {
sd::ggml_graph_cut::mark_graph_cut(temb, "anima.prelude", "temb");
sd::ggml_graph_cut::mark_graph_cut(encoder_hidden_states, "anima.prelude", "context");
for (int i = 0; i < num_layers; i++) {
for (int i = 0; i < config.num_layers; i++) {
auto block = std::dynamic_pointer_cast<TransformerBlock>(blocks["blocks." + std::to_string(i)]);
x = block->forward(ctx, x, encoder_hidden_states, embedded_timestep, temb, image_pe);
sd::ggml_graph_cut::mark_graph_cut(x, "anima.blocks." + std::to_string(i), "x");
@ -513,7 +546,7 @@ namespace Anima {
x = final_layer->forward(ctx, x, embedded_timestep, temb); // [N, h*w, ph*pw*C]
x = DiT::unpatchify_and_crop(ctx->ggml_ctx, x, H, W, patch_size, patch_size, false); // [N, C, H, W]
x = DiT::unpatchify_and_crop(ctx->ggml_ctx, x, H, W, config.patch_size, config.patch_size, false); // [N, C, H, W]
return x;
}
@ -524,35 +557,16 @@ namespace Anima {
std::vector<float> image_pe_vec;
std::vector<float> adapter_q_pe_vec;
std::vector<float> adapter_k_pe_vec;
AnimaConfig config;
AnimaNet net;
AnimaRunner(ggml_backend_t backend,
ggml_backend_t params_backend,
const String2TensorStorage& tensor_storage_map = {},
const std::string prefix = "model.diffusion_model")
: DiffusionModelRunner(backend, params_backend, prefix) {
int64_t num_layers = 0;
std::string layer_tag = prefix + ".net.blocks.";
for (const auto& kv : tensor_storage_map) {
const std::string& tensor_name = kv.first;
size_t pos = tensor_name.find(layer_tag);
if (pos == std::string::npos) {
continue;
}
size_t start = pos + layer_tag.size();
size_t end = tensor_name.find('.', start);
if (end == std::string::npos) {
continue;
}
int64_t layer_id = atoll(tensor_name.substr(start, end - start).c_str());
num_layers = std::max(num_layers, layer_id + 1);
}
if (num_layers <= 0) {
num_layers = 28;
}
LOG_INFO("anima net layers: %" PRId64, num_layers);
net = AnimaNet(num_layers);
: DiffusionModelRunner(backend, params_backend, prefix),
config(AnimaConfig::detect_from_weights(tensor_storage_map, prefix + ".net")) {
net = AnimaNet(config);
net.init(params_ctx, tensor_storage_map, prefix + ".net");
}
@ -623,22 +637,22 @@ namespace Anima {
GGML_ASSERT(x->ne[3] == 1);
ggml_cgraph* gf = new_graph_custom(ANIMA_GRAPH_SIZE);
int64_t pad_h = (net.patch_size - x->ne[1] % net.patch_size) % net.patch_size;
int64_t pad_w = (net.patch_size - x->ne[0] % net.patch_size) % net.patch_size;
int64_t pad_h = (config.patch_size - x->ne[1] % config.patch_size) % config.patch_size;
int64_t pad_w = (config.patch_size - x->ne[0] % config.patch_size) % config.patch_size;
int64_t h_pad = x->ne[1] + pad_h;
int64_t w_pad = x->ne[0] + pad_w;
image_pe_vec = gen_anima_image_pe_vec(1,
static_cast<int>(h_pad),
static_cast<int>(w_pad),
static_cast<int>(net.patch_size),
net.theta,
net.axes_dim,
static_cast<int>(config.patch_size),
config.theta,
config.axes_dim,
4.0f,
4.0f,
1.0f);
int64_t image_pos_len = static_cast<int64_t>(image_pe_vec.size()) / (2 * 2 * (net.head_dim / 2));
auto image_pe = ggml_new_tensor_4d(compute_ctx, GGML_TYPE_F32, 2, 2, net.head_dim / 2, image_pos_len);
int64_t image_pos_len = static_cast<int64_t>(image_pe_vec.size()) / (2 * 2 * (config.head_dim / 2));
auto image_pe = ggml_new_tensor_4d(compute_ctx, GGML_TYPE_F32, 2, 2, config.head_dim / 2, image_pos_len);
set_backend_tensor_data(image_pe, image_pe_vec.data());
ggml_tensor* adapter_q_pe = nullptr;
@ -701,4 +715,4 @@ namespace Anima {
};
} // namespace Anima
#endif // __ANIMA_HPP__
#endif // __SD_MODEL_DIFFUSION_ANIMA_HPP__

View File

@ -1,8 +1,8 @@
#ifndef __CONTROL_HPP__
#define __CONTROL_HPP__
#ifndef __SD_MODEL_DIFFUSION_CONTROL_HPP__
#define __SD_MODEL_DIFFUSION_CONTROL_HPP__
#include "common_block.hpp"
#include "model.h"
#include "model/common/block.hpp"
#define CONTROL_NET_GRAPH_SIZE 1536
@ -484,4 +484,4 @@ struct ControlNet : public GGMLRunner {
}
};
#endif // __CONTROL_HPP__
#endif // __SD_MODEL_DIFFUSION_CONTROL_HPP__

View File

@ -1,7 +1,7 @@
#ifndef __COMMON_DIT_HPP__
#define __COMMON_DIT_HPP__
#ifndef __SD_MODEL_DIFFUSION_DIT_HPP__
#define __SD_MODEL_DIFFUSION_DIT_HPP__
#include "ggml_extend.hpp"
#include "core/ggml_extend.hpp"
namespace DiT {
inline ggml_tensor* patchify(ggml_context* ctx,
@ -163,4 +163,4 @@ namespace DiT {
}
} // namespace DiT
#endif // __COMMON_DIT_HPP__
#endif // __SD_MODEL_DIFFUSION_DIT_HPP__

View File

@ -1,18 +1,88 @@
#ifndef __SD_ERNIE_IMAGE_HPP__
#define __SD_ERNIE_IMAGE_HPP__
#ifndef __SD_MODEL_DIFFUSION_ERNIE_IMAGE_HPP__
#define __SD_MODEL_DIFFUSION_ERNIE_IMAGE_HPP__
#include <memory>
#include <vector>
#include "common_dit.hpp"
#include "diffusion_model.hpp"
#include "flux.hpp"
#include "qwen_image.hpp"
#include "rope.hpp"
#include "model/common/rope.hpp"
#include "model/diffusion/dit.hpp"
#include "model/diffusion/flux.hpp"
#include "model/diffusion/model.hpp"
#include "model/diffusion/qwen_image.hpp"
namespace ErnieImage {
constexpr int ERNIE_IMAGE_GRAPH_SIZE = 40960;
struct ErnieImageConfig {
int64_t hidden_size = 4096;
int64_t num_heads = 32;
int64_t num_layers = 36;
int64_t ffn_hidden_size = 12288;
int64_t in_channels = 128;
int64_t out_channels = 128;
int patch_size = 1;
int64_t text_in_dim = 3072;
int theta = 256;
std::vector<int> axes_dim = {32, 48, 48};
int axes_dim_sum = 128;
float eps = 1e-6f;
static ErnieImageConfig detect_from_weights(const String2TensorStorage& tensor_storage_map, const std::string& prefix) {
ErnieImageConfig config;
config.num_layers = 0;
int64_t detected_head_dim = 0;
for (const auto& [name, tensor_storage] : tensor_storage_map) {
if (!starts_with(name, prefix)) {
continue;
}
if (ends_with(name, "x_embedder.proj.weight") && tensor_storage.n_dims == 4) {
config.patch_size = static_cast<int>(tensor_storage.ne[0]);
config.in_channels = tensor_storage.ne[2];
config.hidden_size = tensor_storage.ne[3];
} else if (ends_with(name, "text_proj.weight") && tensor_storage.n_dims == 2) {
config.text_in_dim = tensor_storage.ne[0];
} else if (ends_with(name, "layers.0.self_attention.norm_q.weight")) {
detected_head_dim = tensor_storage.ne[0];
} else if (ends_with(name, "layers.0.mlp.gate_proj.weight") && tensor_storage.n_dims == 2) {
config.ffn_hidden_size = tensor_storage.ne[1];
} else if (ends_with(name, "final_linear.weight") && tensor_storage.n_dims == 2) {
int64_t out_dim = tensor_storage.ne[1];
int64_t patch_area = config.patch_size * config.patch_size;
config.out_channels = out_dim / patch_area;
}
size_t pos = name.find("layers.");
if (pos != std::string::npos) {
auto items = split_string(name.substr(pos), '.');
if (items.size() > 1) {
int block_index = atoi(items[1].c_str());
if (block_index + 1 > config.num_layers) {
config.num_layers = block_index + 1;
}
}
}
}
if (config.num_layers == 0) {
config.num_layers = 36;
}
if (detected_head_dim > 0) {
config.num_heads = config.hidden_size / detected_head_dim;
}
config.axes_dim_sum = 0;
for (int axis_dim : config.axes_dim) {
config.axes_dim_sum += axis_dim;
}
LOG_DEBUG("ernie_image: num_layers = %" PRId64 ", hidden_size = %" PRId64 ", num_heads = %" PRId64 ", ffn_hidden_size = %" PRId64 ", in_channels = %" PRId64 ", out_channels = %" PRId64,
config.num_layers,
config.hidden_size,
config.num_heads,
config.ffn_hidden_size,
config.in_channels,
config.out_channels);
return config;
}
};
__STATIC_INLINE__ ggml_tensor* timestep_embedding_sin_cos(ggml_context* ctx,
ggml_tensor* timesteps,
int dim,
@ -208,51 +278,36 @@ namespace ErnieImage {
}
};
struct ErnieImageParams {
int64_t hidden_size = 4096;
int64_t num_heads = 32;
int64_t num_layers = 36;
int64_t ffn_hidden_size = 12288;
int64_t in_channels = 128;
int64_t out_channels = 128;
int patch_size = 1;
int64_t text_in_dim = 3072;
int theta = 256;
std::vector<int> axes_dim = {32, 48, 48};
int axes_dim_sum = 128;
float eps = 1e-6f;
};
class ErnieImageModel : public GGMLBlock {
public:
ErnieImageParams params;
ErnieImageConfig config;
ErnieImageModel() = default;
ErnieImageModel(ErnieImageParams params)
: params(params) {
blocks["x_embedder.proj"] = std::make_shared<Conv2d>(params.in_channels,
params.hidden_size,
std::pair<int, int>{params.patch_size, params.patch_size},
std::pair<int, int>{params.patch_size, params.patch_size},
ErnieImageModel(ErnieImageConfig config)
: config(config) {
blocks["x_embedder.proj"] = std::make_shared<Conv2d>(config.in_channels,
config.hidden_size,
std::pair<int, int>{config.patch_size, config.patch_size},
std::pair<int, int>{config.patch_size, config.patch_size},
std::pair<int, int>{0, 0},
std::pair<int, int>{1, 1},
true);
if (params.text_in_dim != params.hidden_size) {
blocks["text_proj"] = std::make_shared<Linear>(params.text_in_dim, params.hidden_size, false);
if (config.text_in_dim != config.hidden_size) {
blocks["text_proj"] = std::make_shared<Linear>(config.text_in_dim, config.hidden_size, false);
}
blocks["time_embedding"] = std::make_shared<Qwen::TimestepEmbedding>(params.hidden_size, params.hidden_size);
blocks["adaLN_modulation.1"] = std::make_shared<Linear>(params.hidden_size, 6 * params.hidden_size, true);
blocks["time_embedding"] = std::make_shared<Qwen::TimestepEmbedding>(config.hidden_size, config.hidden_size);
blocks["adaLN_modulation.1"] = std::make_shared<Linear>(config.hidden_size, 6 * config.hidden_size, true);
for (int i = 0; i < params.num_layers; i++) {
blocks["layers." + std::to_string(i)] = std::make_shared<ErnieImageSharedAdaLNBlock>(params.hidden_size,
params.num_heads,
params.ffn_hidden_size,
params.eps);
for (int i = 0; i < config.num_layers; i++) {
blocks["layers." + std::to_string(i)] = std::make_shared<ErnieImageSharedAdaLNBlock>(config.hidden_size,
config.num_heads,
config.ffn_hidden_size,
config.eps);
}
blocks["final_norm"] = std::make_shared<ErnieImageAdaLNContinuous>(params.hidden_size, params.eps);
blocks["final_linear"] = std::make_shared<Linear>(params.hidden_size,
params.patch_size * params.patch_size * params.out_channels,
blocks["final_norm"] = std::make_shared<ErnieImageAdaLNContinuous>(config.hidden_size, config.eps);
blocks["final_linear"] = std::make_shared<Linear>(config.hidden_size,
config.patch_size * config.patch_size * config.out_channels,
true);
}
@ -265,12 +320,12 @@ namespace ErnieImage {
// context: [N, text_tokens, 3072]
// pe: [image_tokens + text_tokens, head_dim/2, 2, 2]
GGML_ASSERT(context != nullptr);
GGML_ASSERT(x->ne[1] % params.patch_size == 0 && x->ne[0] % params.patch_size == 0);
GGML_ASSERT(x->ne[1] % config.patch_size == 0 && x->ne[0] % config.patch_size == 0);
int64_t W = x->ne[0];
int64_t H = x->ne[1];
int64_t Hp = H / params.patch_size;
int64_t Wp = W / params.patch_size;
int64_t Hp = H / config.patch_size;
int64_t Wp = W / config.patch_size;
int64_t n_img = Hp * Wp;
int64_t N = x->ne[3];
@ -292,7 +347,7 @@ namespace ErnieImage {
auto hidden_states = ggml_concat(ctx->ggml_ctx, img, txt, 1); // [N, image_tokens + text_tokens, hidden_size]
auto sample = timestep_embedding_sin_cos(ctx->ggml_ctx, timestep, static_cast<int>(params.hidden_size));
auto sample = timestep_embedding_sin_cos(ctx->ggml_ctx, timestep, static_cast<int>(config.hidden_size));
auto c = time_embedding->forward(ctx, sample); // [N, hidden_size]
auto mod_params = adaLN_mod->forward(ctx, ggml_silu(ctx->ggml_ctx, c)); // [N, 6 * hidden_size]
@ -305,7 +360,7 @@ namespace ErnieImage {
temb.push_back(ggml_reshape_3d(ctx->ggml_ctx, chunk, chunk->ne[0], 1, chunk->ne[1])); // [N, 1, hidden_size]
}
for (int i = 0; i < params.num_layers; i++) {
for (int i = 0; i < config.num_layers; i++) {
auto layer = std::dynamic_pointer_cast<ErnieImageSharedAdaLNBlock>(blocks["layers." + std::to_string(i)]);
hidden_states = layer->forward(ctx, hidden_states, pe, temb);
sd::ggml_graph_cut::mark_graph_cut(hidden_states, "ernie_image.layers." + std::to_string(i), "hidden_states");
@ -319,15 +374,15 @@ namespace ErnieImage {
patches,
Hp,
Wp,
params.patch_size,
params.patch_size,
config.patch_size,
config.patch_size,
false); // [N, out_channels, H, W]
return out;
}
};
struct ErnieImageRunner : public DiffusionModelRunner {
ErnieImageParams ernie_params;
ErnieImageConfig config;
ErnieImageModel ernie_image;
std::vector<float> pe_vec;
@ -335,58 +390,9 @@ namespace ErnieImage {
ggml_backend_t params_backend,
const String2TensorStorage& tensor_storage_map = {},
const std::string prefix = "")
: DiffusionModelRunner(backend, params_backend, prefix) {
ernie_params.num_layers = 0;
for (const auto& [name, tensor_storage] : tensor_storage_map) {
if (!starts_with(name, prefix)) {
continue;
}
if (ends_with(name, "x_embedder.proj.weight") && tensor_storage.n_dims == 4) {
ernie_params.patch_size = static_cast<int>(tensor_storage.ne[0]);
ernie_params.in_channels = tensor_storage.ne[2];
ernie_params.hidden_size = tensor_storage.ne[3];
} else if (ends_with(name, "text_proj.weight") && tensor_storage.n_dims == 2) {
ernie_params.text_in_dim = tensor_storage.ne[0];
} else if (ends_with(name, "layers.0.self_attention.norm_q.weight")) {
int64_t head_dim = tensor_storage.ne[0];
ernie_params.num_heads = ernie_params.hidden_size / head_dim;
} else if (ends_with(name, "layers.0.mlp.gate_proj.weight") && tensor_storage.n_dims == 2) {
ernie_params.ffn_hidden_size = tensor_storage.ne[1];
} else if (ends_with(name, "final_linear.weight") && tensor_storage.n_dims == 2) {
int64_t out_dim = tensor_storage.ne[1];
ernie_params.out_channels = out_dim / ernie_params.patch_size / ernie_params.patch_size;
}
size_t pos = name.find("layers.");
if (pos != std::string::npos) {
std::string layer_name = name.substr(pos);
auto items = split_string(layer_name, '.');
if (items.size() > 1) {
int block_index = atoi(items[1].c_str());
if (block_index + 1 > ernie_params.num_layers) {
ernie_params.num_layers = block_index + 1;
}
}
}
}
if (ernie_params.num_layers == 0) {
ernie_params.num_layers = 36;
}
ernie_params.axes_dim_sum = 0;
for (int axis_dim : ernie_params.axes_dim) {
ernie_params.axes_dim_sum += axis_dim;
}
LOG_INFO("ernie_image: layers = %" PRId64 ", hidden_size = %" PRId64 ", heads = %" PRId64
", ffn_hidden_size = %" PRId64 ", in_channels = %" PRId64 ", out_channels = %" PRId64,
ernie_params.num_layers,
ernie_params.hidden_size,
ernie_params.num_heads,
ernie_params.ffn_hidden_size,
ernie_params.in_channels,
ernie_params.out_channels);
ernie_image = ErnieImageModel(ernie_params);
: DiffusionModelRunner(backend, params_backend, prefix),
config(ErnieImageConfig::detect_from_weights(tensor_storage_map, prefix)) {
ernie_image = ErnieImageModel(config);
ernie_image.init(params_ctx, tensor_storage_map, prefix);
}
@ -410,15 +416,15 @@ namespace ErnieImage {
pe_vec = Rope::gen_ernie_image_pe(static_cast<int>(x->ne[1]),
static_cast<int>(x->ne[0]),
ernie_params.patch_size,
config.patch_size,
static_cast<int>(x->ne[3]),
static_cast<int>(context->ne[1]),
ernie_params.theta,
config.theta,
circular_y_enabled,
circular_x_enabled,
ernie_params.axes_dim);
int pos_len = static_cast<int>(pe_vec.size() / ernie_params.axes_dim_sum / 2);
auto pe = ggml_new_tensor_4d(compute_ctx, GGML_TYPE_F32, ernie_params.axes_dim_sum, 1, pos_len, 2);
config.axes_dim);
int pos_len = static_cast<int>(pe_vec.size() / config.axes_dim_sum / 2);
auto pe = ggml_new_tensor_4d(compute_ctx, GGML_TYPE_F32, config.axes_dim_sum, 1, pos_len, 2);
set_backend_tensor_data(pe, pe_vec.data());
auto runner_ctx = get_context();
@ -449,4 +455,4 @@ namespace ErnieImage {
};
} // namespace ErnieImage
#endif // __SD_ERNIE_IMAGE_HPP__
#endif // __SD_MODEL_DIFFUSION_ERNIE_IMAGE_HPP__

View File

@ -1,18 +1,167 @@
#ifndef __FLUX_HPP__
#define __FLUX_HPP__
#ifndef __SD_MODEL_DIFFUSION_FLUX_HPP__
#define __SD_MODEL_DIFFUSION_FLUX_HPP__
#include <memory>
#include <vector>
#include "common_dit.hpp"
#include "diffusion_model.hpp"
#include "model.h"
#include "rope.hpp"
#include "model/common/rope.hpp"
#include "model/diffusion/dit.hpp"
#include "model/diffusion/model.hpp"
#define FLUX_GRAPH_SIZE 10240
namespace Flux {
struct ChromaRadianceConfig {
int64_t nerf_hidden_size = 64;
int nerf_mlp_ratio = 4;
int nerf_depth = 4;
int nerf_max_freqs = 8;
bool use_x0 = false;
bool fake_patch_size_x2 = false;
};
struct FluxConfig {
SDVersion version = VERSION_FLUX;
bool is_chroma = false;
int patch_size = 2;
int64_t in_channels = 64;
int64_t out_channels = 64;
int64_t vec_in_dim = 768;
int64_t context_in_dim = 4096;
int64_t hidden_size = 3072;
float mlp_ratio = 4.0f;
int num_heads = 24;
int depth = 19;
int depth_single_blocks = 38;
std::vector<int> axes_dim = {16, 56, 56};
int axes_dim_sum = 128;
int theta = 10000;
bool qkv_bias = true;
bool guidance_embed = true;
int64_t in_dim = 64;
bool disable_bias = false;
bool share_modulation = false;
bool semantic_txt_norm = false;
bool use_yak_mlp = false;
bool use_mlp_silu_act = false;
float ref_index_scale = 1.f;
ChromaRadianceConfig chroma_radiance_params;
static FluxConfig detect_from_weights(const String2TensorStorage& tensor_storage_map,
const std::string& prefix,
SDVersion version = VERSION_FLUX) {
FluxConfig config;
config.version = version;
config.guidance_embed = false;
config.depth = 0;
config.depth_single_blocks = 0;
if (version == VERSION_FLUX_FILL) {
config.in_channels = 384;
} else if (version == VERSION_FLUX_CONTROLS) {
config.in_channels = 128;
} else if (version == VERSION_FLEX_2) {
config.in_channels = 196;
} else if (version == VERSION_CHROMA_RADIANCE) {
config.in_channels = 3;
config.patch_size = 16;
} else if (version == VERSION_OVIS_IMAGE) {
config.semantic_txt_norm = true;
config.use_yak_mlp = true;
config.vec_in_dim = 0;
} else if (sd_version_is_flux2(version)) {
config.in_channels = 128;
config.patch_size = 1;
config.out_channels = 128;
config.mlp_ratio = 3.f;
config.theta = 2000;
config.axes_dim = {32, 32, 32, 32};
config.vec_in_dim = 0;
config.qkv_bias = false;
config.disable_bias = true;
config.share_modulation = true;
config.ref_index_scale = 10.f;
config.use_mlp_silu_act = true;
} else if (sd_version_is_longcat(version)) {
config.context_in_dim = 3584;
config.vec_in_dim = 0;
}
int64_t head_dim = 0;
int64_t actual_radiance_patch_size = -1;
for (const auto& [name, tensor_storage] : tensor_storage_map) {
if (!starts_with(name, prefix)) {
continue;
}
if (name.find("guidance_in.in_layer.weight") != std::string::npos) {
config.guidance_embed = true;
}
if (name.find("__x0__") != std::string::npos) {
LOG_DEBUG("using x0 prediction");
config.chroma_radiance_params.use_x0 = true;
}
if (name.find("__32x32__") != std::string::npos) {
LOG_DEBUG("using patch size 32");
config.patch_size = 32;
}
if (name.find("img_in_patch.weight") != std::string::npos) {
actual_radiance_patch_size = tensor_storage.ne[0];
LOG_DEBUG("actual radiance patch size: %" PRId64, actual_radiance_patch_size);
}
if (name.find("distilled_guidance_layer.in_proj.weight") != std::string::npos) {
config.is_chroma = true;
}
size_t db = name.find("double_blocks.");
if (db != std::string::npos) {
std::string block_name = name.substr(db);
int block_depth = atoi(block_name.substr(14, block_name.find(".", 14)).c_str());
if (block_depth + 1 > config.depth) {
config.depth = block_depth + 1;
}
}
size_t sb = name.find("single_blocks.");
if (sb != std::string::npos) {
std::string block_name = name.substr(sb);
int block_depth = atoi(block_name.substr(14, block_name.find(".", 14)).c_str());
if (block_depth + 1 > config.depth_single_blocks) {
config.depth_single_blocks = block_depth + 1;
}
}
if (ends_with(name, "txt_in.weight")) {
config.context_in_dim = tensor_storage.ne[0];
config.hidden_size = tensor_storage.ne[1];
}
if (ends_with(name, "single_blocks.0.norm.key_norm.scale")) {
head_dim = tensor_storage.ne[0];
}
if (ends_with(name, "double_blocks.0.txt_attn.norm.key_norm.scale")) {
head_dim = tensor_storage.ne[0];
}
}
if (actual_radiance_patch_size > 0 && actual_radiance_patch_size != config.patch_size) {
GGML_ASSERT(config.patch_size == 2 * actual_radiance_patch_size);
LOG_DEBUG("using fake x2 patch size");
config.chroma_radiance_params.fake_patch_size_x2 = true;
}
if (head_dim > 0) {
config.num_heads = static_cast<int>(config.hidden_size / head_dim);
}
config.axes_dim_sum = 0;
for (int axis_dim : config.axes_dim) {
config.axes_dim_sum += axis_dim;
}
LOG_DEBUG("flux: depth = %d, depth_single_blocks = %d, guidance_embed = %s, context_in_dim = %" PRId64 ", hidden_size = %" PRId64 ", num_heads = %d",
config.depth,
config.depth_single_blocks,
config.guidance_embed ? "true" : "false",
config.context_in_dim,
config.hidden_size,
config.num_heads);
return config;
}
};
struct MLPEmbedder : public UnaryBlock {
public:
MLPEmbedder(int64_t in_dim, int64_t hidden_dim, bool bias = true) {
@ -723,127 +872,90 @@ namespace Flux {
}
};
struct ChromaRadianceParams {
int64_t nerf_hidden_size = 64;
int nerf_mlp_ratio = 4;
int nerf_depth = 4;
int nerf_max_freqs = 8;
bool use_x0 = false;
bool fake_patch_size_x2 = false;
};
struct FluxParams {
SDVersion version = VERSION_FLUX;
bool is_chroma = false;
int patch_size = 2;
int64_t in_channels = 64;
int64_t out_channels = 64;
int64_t vec_in_dim = 768;
int64_t context_in_dim = 4096;
int64_t hidden_size = 3072;
float mlp_ratio = 4.0f;
int num_heads = 24;
int depth = 19;
int depth_single_blocks = 38;
std::vector<int> axes_dim = {16, 56, 56};
int axes_dim_sum = 128;
int theta = 10000;
bool qkv_bias = true;
bool guidance_embed = true;
int64_t in_dim = 64;
bool disable_bias = false;
bool share_modulation = false;
bool semantic_txt_norm = false;
bool use_yak_mlp = false;
bool use_mlp_silu_act = false;
float ref_index_scale = 1.f;
ChromaRadianceParams chroma_radiance_params;
};
struct Flux : public GGMLBlock {
public:
FluxParams params;
FluxConfig config;
Flux() {}
Flux(FluxParams params)
: params(params) {
if (params.version == VERSION_CHROMA_RADIANCE) {
std::pair<int, int> kernel_size = {params.patch_size, params.patch_size};
if (params.chroma_radiance_params.fake_patch_size_x2) {
kernel_size = {params.patch_size / 2, params.patch_size / 2};
Flux(FluxConfig config)
: config(config) {
if (config.version == VERSION_CHROMA_RADIANCE) {
std::pair<int, int> kernel_size = {config.patch_size, config.patch_size};
if (config.chroma_radiance_params.fake_patch_size_x2) {
kernel_size = {config.patch_size / 2, config.patch_size / 2};
}
std::pair<int, int> stride = kernel_size;
blocks["img_in_patch"] = std::make_shared<Conv2d>(params.in_channels,
params.hidden_size,
blocks["img_in_patch"] = std::make_shared<Conv2d>(config.in_channels,
config.hidden_size,
kernel_size,
stride);
} else {
blocks["img_in"] = std::make_shared<Linear>(params.in_channels, params.hidden_size, !params.disable_bias);
blocks["img_in"] = std::make_shared<Linear>(config.in_channels, config.hidden_size, !config.disable_bias);
}
if (params.is_chroma) {
blocks["distilled_guidance_layer"] = std::make_shared<ChromaApproximator>(params.in_dim, params.hidden_size);
if (config.is_chroma) {
blocks["distilled_guidance_layer"] = std::make_shared<ChromaApproximator>(config.in_dim, config.hidden_size);
} else {
blocks["time_in"] = std::make_shared<MLPEmbedder>(256, params.hidden_size, !params.disable_bias);
if (params.vec_in_dim > 0) {
blocks["vector_in"] = std::make_shared<MLPEmbedder>(params.vec_in_dim, params.hidden_size, !params.disable_bias);
blocks["time_in"] = std::make_shared<MLPEmbedder>(256, config.hidden_size, !config.disable_bias);
if (config.vec_in_dim > 0) {
blocks["vector_in"] = std::make_shared<MLPEmbedder>(config.vec_in_dim, config.hidden_size, !config.disable_bias);
}
if (params.guidance_embed) {
blocks["guidance_in"] = std::make_shared<MLPEmbedder>(256, params.hidden_size, !params.disable_bias);
if (config.guidance_embed) {
blocks["guidance_in"] = std::make_shared<MLPEmbedder>(256, config.hidden_size, !config.disable_bias);
}
}
if (params.semantic_txt_norm) {
blocks["txt_norm"] = std::make_shared<RMSNorm>(params.context_in_dim);
if (config.semantic_txt_norm) {
blocks["txt_norm"] = std::make_shared<RMSNorm>(config.context_in_dim);
}
blocks["txt_in"] = std::make_shared<Linear>(params.context_in_dim, params.hidden_size, !params.disable_bias);
blocks["txt_in"] = std::make_shared<Linear>(config.context_in_dim, config.hidden_size, !config.disable_bias);
for (int i = 0; i < params.depth; i++) {
blocks["double_blocks." + std::to_string(i)] = std::make_shared<DoubleStreamBlock>(params.hidden_size,
params.num_heads,
params.mlp_ratio,
for (int i = 0; i < config.depth; i++) {
blocks["double_blocks." + std::to_string(i)] = std::make_shared<DoubleStreamBlock>(config.hidden_size,
config.num_heads,
config.mlp_ratio,
i,
params.qkv_bias,
params.is_chroma,
params.share_modulation,
!params.disable_bias,
params.use_yak_mlp,
params.use_mlp_silu_act);
config.qkv_bias,
config.is_chroma,
config.share_modulation,
!config.disable_bias,
config.use_yak_mlp,
config.use_mlp_silu_act);
}
for (int i = 0; i < params.depth_single_blocks; i++) {
blocks["single_blocks." + std::to_string(i)] = std::make_shared<SingleStreamBlock>(params.hidden_size,
params.num_heads,
params.mlp_ratio,
for (int i = 0; i < config.depth_single_blocks; i++) {
blocks["single_blocks." + std::to_string(i)] = std::make_shared<SingleStreamBlock>(config.hidden_size,
config.num_heads,
config.mlp_ratio,
i,
0.f,
params.is_chroma,
params.share_modulation,
!params.disable_bias,
params.use_yak_mlp,
params.use_mlp_silu_act);
config.is_chroma,
config.share_modulation,
!config.disable_bias,
config.use_yak_mlp,
config.use_mlp_silu_act);
}
if (params.version == VERSION_CHROMA_RADIANCE) {
blocks["nerf_image_embedder"] = std::make_shared<NerfEmbedder>(params.in_channels,
params.chroma_radiance_params.nerf_hidden_size,
params.chroma_radiance_params.nerf_max_freqs);
if (config.version == VERSION_CHROMA_RADIANCE) {
blocks["nerf_image_embedder"] = std::make_shared<NerfEmbedder>(config.in_channels,
config.chroma_radiance_params.nerf_hidden_size,
config.chroma_radiance_params.nerf_max_freqs);
for (int i = 0; i < params.chroma_radiance_params.nerf_depth; i++) {
blocks["nerf_blocks." + std::to_string(i)] = std::make_shared<NerfGLUBlock>(params.hidden_size,
params.chroma_radiance_params.nerf_hidden_size,
params.chroma_radiance_params.nerf_mlp_ratio);
for (int i = 0; i < config.chroma_radiance_params.nerf_depth; i++) {
blocks["nerf_blocks." + std::to_string(i)] = std::make_shared<NerfGLUBlock>(config.hidden_size,
config.chroma_radiance_params.nerf_hidden_size,
config.chroma_radiance_params.nerf_mlp_ratio);
}
blocks["nerf_final_layer_conv"] = std::make_shared<NerfFinalLayerConv>(params.chroma_radiance_params.nerf_hidden_size,
params.in_channels);
blocks["nerf_final_layer_conv"] = std::make_shared<NerfFinalLayerConv>(config.chroma_radiance_params.nerf_hidden_size,
config.in_channels);
} else {
blocks["final_layer"] = std::make_shared<LastLayer>(params.hidden_size, 1, params.out_channels, params.is_chroma, !params.disable_bias);
blocks["final_layer"] = std::make_shared<LastLayer>(config.hidden_size, 1, config.out_channels, config.is_chroma, !config.disable_bias);
}
if (params.share_modulation) {
blocks["double_stream_modulation_img"] = std::make_shared<Modulation>(params.hidden_size, true, !params.disable_bias);
blocks["double_stream_modulation_txt"] = std::make_shared<Modulation>(params.hidden_size, true, !params.disable_bias);
blocks["single_stream_modulation"] = std::make_shared<Modulation>(params.hidden_size, false, !params.disable_bias);
if (config.share_modulation) {
blocks["double_stream_modulation_img"] = std::make_shared<Modulation>(config.hidden_size, true, !config.disable_bias);
blocks["double_stream_modulation_txt"] = std::make_shared<Modulation>(config.hidden_size, true, !config.disable_bias);
blocks["single_stream_modulation"] = std::make_shared<Modulation>(config.hidden_size, false, !config.disable_bias);
}
}
@ -866,7 +978,7 @@ namespace Flux {
ggml_tensor* vec;
ggml_tensor* txt_img_mask = nullptr;
if (params.is_chroma) {
if (config.is_chroma) {
int64_t mod_index_length = 344;
auto approx = std::dynamic_pointer_cast<ChromaApproximator>(blocks["distilled_guidance_layer"]);
auto distill_timestep = ggml_ext_timestep_embedding(ctx->ggml_ctx, timesteps, 16, 10000, 1000.f);
@ -894,7 +1006,7 @@ namespace Flux {
} else {
auto time_in = std::dynamic_pointer_cast<MLPEmbedder>(blocks["time_in"]);
vec = time_in->forward(ctx, ggml_ext_timestep_embedding(ctx->ggml_ctx, timesteps, 256, 10000, 1000.f));
if (params.guidance_embed) {
if (config.guidance_embed) {
GGML_ASSERT(guidance != nullptr);
auto guidance_in = std::dynamic_pointer_cast<MLPEmbedder>(blocks["guidance_in"]);
// bf16 and fp16 result is different
@ -902,7 +1014,7 @@ namespace Flux {
vec = ggml_add(ctx->ggml_ctx, vec, guidance_in->forward(ctx, g_in));
}
if (params.vec_in_dim > 0) {
if (config.vec_in_dim > 0) {
auto vector_in = std::dynamic_pointer_cast<MLPEmbedder>(blocks["vector_in"]);
vec = ggml_add(ctx->ggml_ctx, vec, vector_in->forward(ctx, y));
}
@ -911,7 +1023,7 @@ namespace Flux {
std::vector<ModulationOut> ds_img_mods;
std::vector<ModulationOut> ds_txt_mods;
std::vector<ModulationOut> ss_mods;
if (params.share_modulation) {
if (config.share_modulation) {
auto double_stream_modulation_img = std::dynamic_pointer_cast<Modulation>(blocks["double_stream_modulation_img"]);
auto double_stream_modulation_txt = std::dynamic_pointer_cast<Modulation>(blocks["double_stream_modulation_txt"]);
auto single_stream_modulation = std::dynamic_pointer_cast<Modulation>(blocks["single_stream_modulation"]);
@ -921,7 +1033,7 @@ namespace Flux {
ss_mods = single_stream_modulation->forward(ctx, vec);
}
if (params.semantic_txt_norm) {
if (config.semantic_txt_norm) {
auto semantic_txt_norm = std::dynamic_pointer_cast<RMSNorm>(blocks["txt_norm"]);
txt = semantic_txt_norm->forward(ctx, txt);
@ -932,7 +1044,7 @@ namespace Flux {
sd::ggml_graph_cut::mark_graph_cut(txt, "flux.prelude", "txt");
sd::ggml_graph_cut::mark_graph_cut(vec, "flux.prelude", "vec");
for (int i = 0; i < params.depth; i++) {
for (int i = 0; i < config.depth; i++) {
if (skip_layers.size() > 0 && std::find(skip_layers.begin(), skip_layers.end(), i) != skip_layers.end()) {
continue;
}
@ -947,8 +1059,8 @@ namespace Flux {
}
auto txt_img = ggml_concat(ctx->ggml_ctx, txt, img, 1); // [N, n_txt_token + n_img_token, hidden_size]
for (int i = 0; i < params.depth_single_blocks; i++) {
if (skip_layers.size() > 0 && std::find(skip_layers.begin(), skip_layers.end(), i + params.depth) != skip_layers.end()) {
for (int i = 0; i < config.depth_single_blocks; i++) {
if (skip_layers.size() > 0 && std::find(skip_layers.begin(), skip_layers.end(), i + config.depth) != skip_layers.end()) {
continue;
}
auto block = std::dynamic_pointer_cast<SingleStreamBlock>(blocks["single_blocks." + std::to_string(i)]);
@ -999,14 +1111,14 @@ namespace Flux {
int64_t W = x->ne[0];
int64_t H = x->ne[1];
int64_t C = x->ne[2];
int patch_size = params.patch_size;
int patch_size = config.patch_size;
int pad_h = (patch_size - H % patch_size) % patch_size;
int pad_w = (patch_size - W % patch_size) % patch_size;
auto img = DiT::pad_to_patch_size(ctx, x, params.patch_size, params.patch_size);
auto img = DiT::pad_to_patch_size(ctx, x, config.patch_size, config.patch_size);
auto orig_img = img;
if (params.chroma_radiance_params.fake_patch_size_x2) {
if (config.chroma_radiance_params.fake_patch_size_x2) {
// It's supposed to be using GGML_SCALE_MODE_NEAREST, but this seems more stable
// Maybe the implementation of nearest-neighbor interpolation in ggml behaves differently than the one in PyTorch?
// img = F.interpolate(img, size=(H//2, W//2), mode="nearest")
@ -1037,7 +1149,7 @@ namespace Flux {
auto nerf_hidden = ggml_reshape_2d(ctx->ggml_ctx, out, out->ne[0], out->ne[1] * out->ne[2]); // [N*num_patches, hidden_size]
auto img_dct = nerf_image_embedder->forward(ctx, nerf_pixels, dct); // [N*num_patches, patch_size*patch_size, nerf_hidden_size]
for (int i = 0; i < params.chroma_radiance_params.nerf_depth; i++) {
for (int i = 0; i < config.chroma_radiance_params.nerf_depth; i++) {
auto block = std::dynamic_pointer_cast<NerfGLUBlock>(blocks["nerf_blocks." + std::to_string(i)]);
img_dct = block->forward(ctx, img_dct, nerf_hidden);
@ -1049,7 +1161,7 @@ namespace Flux {
out = nerf_final_layer_conv->forward(ctx, img_dct); // [N, C, H, W]
if (params.chroma_radiance_params.use_x0) {
if (config.chroma_radiance_params.use_x0) {
out = _apply_x0_residual(ctx, out, orig_img, timestep);
}
@ -1073,14 +1185,14 @@ namespace Flux {
int64_t W = x->ne[0];
int64_t H = x->ne[1];
int64_t C = x->ne[2];
int patch_size = params.patch_size;
int patch_size = config.patch_size;
int pad_h = (patch_size - H % patch_size) % patch_size;
int pad_w = (patch_size - W % patch_size) % patch_size;
auto img = DiT::pad_and_patchify(ctx, x, patch_size, patch_size);
int64_t img_tokens = img->ne[1];
if (params.version == VERSION_FLUX_FILL) {
if (config.version == VERSION_FLUX_FILL) {
GGML_ASSERT(c_concat != nullptr);
ggml_tensor* masked = ggml_view_4d(ctx->ggml_ctx, c_concat, c_concat->ne[0], c_concat->ne[1], C, 1, c_concat->nb[1], c_concat->nb[2], c_concat->nb[3], 0);
ggml_tensor* mask = ggml_view_4d(ctx->ggml_ctx, c_concat, c_concat->ne[0], c_concat->ne[1], 8 * 8, 1, c_concat->nb[1], c_concat->nb[2], c_concat->nb[3], c_concat->nb[2] * C);
@ -1089,7 +1201,7 @@ namespace Flux {
mask = DiT::pad_and_patchify(ctx, mask, patch_size, patch_size);
img = ggml_concat(ctx->ggml_ctx, img, ggml_concat(ctx->ggml_ctx, masked, mask, 0), 0);
} else if (params.version == VERSION_FLEX_2) {
} else if (config.version == VERSION_FLEX_2) {
GGML_ASSERT(c_concat != nullptr);
ggml_tensor* masked = ggml_view_4d(ctx->ggml_ctx, c_concat, c_concat->ne[0], c_concat->ne[1], C, 1, c_concat->nb[1], c_concat->nb[2], c_concat->nb[3], 0);
ggml_tensor* mask = ggml_view_4d(ctx->ggml_ctx, c_concat, c_concat->ne[0], c_concat->ne[1], 1, 1, c_concat->nb[1], c_concat->nb[2], c_concat->nb[3], c_concat->nb[2] * C);
@ -1100,7 +1212,7 @@ namespace Flux {
control = DiT::pad_and_patchify(ctx, control, patch_size, patch_size);
img = ggml_concat(ctx->ggml_ctx, img, ggml_concat(ctx->ggml_ctx, ggml_concat(ctx->ggml_ctx, masked, mask, 0), control, 0), 0);
} else if (params.version == VERSION_FLUX_CONTROLS) {
} else if (config.version == VERSION_FLUX_CONTROLS) {
GGML_ASSERT(c_concat != nullptr);
auto control = DiT::pad_and_patchify(ctx, c_concat, patch_size, patch_size);
@ -1147,7 +1259,7 @@ namespace Flux {
// pe: (L, d_head/2, 2, 2)
// return: (N, C, H, W)
if (params.version == VERSION_CHROMA_RADIANCE) {
if (config.version == VERSION_CHROMA_RADIANCE) {
return forward_chroma_radiance(ctx,
x,
timestep,
@ -1179,7 +1291,7 @@ namespace Flux {
struct FluxRunner : public DiffusionModelRunner {
public:
FluxParams flux_params;
FluxConfig config;
Flux flux;
std::vector<float> pe_vec;
std::vector<float> mod_index_arange_vec;
@ -1194,114 +1306,15 @@ namespace Flux {
const std::string prefix = "",
SDVersion version = VERSION_FLUX,
bool use_mask = false)
: DiffusionModelRunner(backend, params_backend, prefix), version(version), use_mask(use_mask) {
flux_params.version = version;
flux_params.guidance_embed = false;
flux_params.depth = 0;
flux_params.depth_single_blocks = 0;
if (version == VERSION_FLUX_FILL) {
flux_params.in_channels = 384;
} else if (version == VERSION_FLUX_CONTROLS) {
flux_params.in_channels = 128;
} else if (version == VERSION_FLEX_2) {
flux_params.in_channels = 196;
} else if (version == VERSION_CHROMA_RADIANCE) {
flux_params.in_channels = 3;
flux_params.patch_size = 16;
} else if (version == VERSION_OVIS_IMAGE) {
flux_params.semantic_txt_norm = true;
flux_params.use_yak_mlp = true;
flux_params.vec_in_dim = 0;
} else if (sd_version_is_flux2(version)) {
flux_params.in_channels = 128;
flux_params.patch_size = 1;
flux_params.out_channels = 128;
flux_params.mlp_ratio = 3.f;
flux_params.theta = 2000;
flux_params.axes_dim = {32, 32, 32, 32};
flux_params.vec_in_dim = 0;
flux_params.qkv_bias = false;
flux_params.disable_bias = true;
flux_params.share_modulation = true;
flux_params.ref_index_scale = 10.f;
flux_params.use_mlp_silu_act = true;
} else if (sd_version_is_longcat(version)) {
flux_params.context_in_dim = 3584;
flux_params.vec_in_dim = 0;
}
int64_t head_dim = 0;
int64_t actual_radiance_patch_size = -1;
for (auto pair : tensor_storage_map) {
std::string tensor_name = pair.first;
if (!starts_with(tensor_name, prefix))
continue;
if (tensor_name.find("guidance_in.in_layer.weight") != std::string::npos) {
flux_params.guidance_embed = true;
}
if (tensor_name.find("__x0__") != std::string::npos) {
LOG_DEBUG("using x0 prediction");
flux_params.chroma_radiance_params.use_x0 = true;
}
if (tensor_name.find("__32x32__") != std::string::npos) {
LOG_DEBUG("using patch size 32");
flux_params.patch_size = 32;
}
if (tensor_name.find("img_in_patch.weight") != std::string::npos) {
actual_radiance_patch_size = pair.second.ne[0];
LOG_DEBUG("actual radiance patch size: %d", actual_radiance_patch_size);
}
if (tensor_name.find("distilled_guidance_layer.in_proj.weight") != std::string::npos) {
// Chroma
flux_params.is_chroma = true;
}
size_t db = tensor_name.find("double_blocks.");
if (db != std::string::npos) {
tensor_name = tensor_name.substr(db); // remove prefix
int block_depth = atoi(tensor_name.substr(14, tensor_name.find(".", 14)).c_str());
if (block_depth + 1 > flux_params.depth) {
flux_params.depth = block_depth + 1;
}
}
size_t sb = tensor_name.find("single_blocks.");
if (sb != std::string::npos) {
tensor_name = tensor_name.substr(sb); // remove prefix
int block_depth = atoi(tensor_name.substr(14, tensor_name.find(".", 14)).c_str());
if (block_depth + 1 > flux_params.depth_single_blocks) {
flux_params.depth_single_blocks = block_depth + 1;
}
}
if (ends_with(tensor_name, "txt_in.weight")) {
flux_params.context_in_dim = pair.second.ne[0];
flux_params.hidden_size = pair.second.ne[1];
}
if (ends_with(tensor_name, "single_blocks.0.norm.key_norm.scale")) {
head_dim = pair.second.ne[0];
}
if (ends_with(tensor_name, "double_blocks.0.txt_attn.norm.key_norm.scale")) {
head_dim = pair.second.ne[0];
}
}
if (actual_radiance_patch_size > 0 && actual_radiance_patch_size != flux_params.patch_size) {
GGML_ASSERT(flux_params.patch_size == 2 * actual_radiance_patch_size);
LOG_DEBUG("using fake x2 patch size");
flux_params.chroma_radiance_params.fake_patch_size_x2 = true;
}
flux_params.num_heads = static_cast<int>(flux_params.hidden_size / head_dim);
LOG_INFO("flux: depth = %d, depth_single_blocks = %d, guidance_embed = %s, context_in_dim = %" PRId64
", hidden_size = %" PRId64 ", num_heads = %d",
flux_params.depth,
flux_params.depth_single_blocks,
flux_params.guidance_embed ? "true" : "false",
flux_params.context_in_dim,
flux_params.hidden_size,
flux_params.num_heads);
if (flux_params.is_chroma) {
: DiffusionModelRunner(backend, params_backend, prefix),
config(FluxConfig::detect_from_weights(tensor_storage_map, prefix, version)),
version(version),
use_mask(use_mask) {
if (config.is_chroma) {
LOG_INFO("Using pruned modulation (Chroma)");
}
flux = Flux(flux_params);
flux = Flux(config);
flux.init(params_ctx, tensor_storage_map, prefix);
}
@ -1377,10 +1390,10 @@ namespace Flux {
ggml_tensor* context = make_optional_input(context_tensor);
ggml_tensor* c_concat = make_optional_input(c_concat_tensor);
ggml_tensor* y = make_optional_input(y_tensor);
if (flux_params.guidance_embed || flux_params.is_chroma) {
if (config.guidance_embed || config.is_chroma) {
if (!guidance_tensor.empty()) {
this->guidance_tensor = guidance_tensor;
if (flux_params.is_chroma) {
if (config.is_chroma) {
this->guidance_tensor.fill_(0.f);
}
}
@ -1398,7 +1411,7 @@ namespace Flux {
ggml_tensor* mod_index_arange = nullptr;
ggml_tensor* dct = nullptr; // for chroma radiance
if (flux_params.is_chroma) {
if (config.is_chroma) {
if (!use_mask) {
y = nullptr;
}
@ -1417,29 +1430,29 @@ namespace Flux {
}
pe_vec = Rope::gen_flux_pe(static_cast<int>(x->ne[1]),
static_cast<int>(x->ne[0]),
flux_params.patch_size,
config.patch_size,
static_cast<int>(x->ne[3]),
static_cast<int>(context->ne[1]),
txt_arange_dims,
ref_latents,
increase_ref_index,
flux_params.ref_index_scale,
flux_params.theta,
config.ref_index_scale,
config.theta,
circular_y_enabled,
circular_x_enabled,
flux_params.axes_dim,
config.axes_dim,
sd_version_is_longcat(version));
int pos_len = static_cast<int>(pe_vec.size() / flux_params.axes_dim_sum / 2);
int pos_len = static_cast<int>(pe_vec.size() / config.axes_dim_sum / 2);
// LOG_DEBUG("pos_len %d", pos_len);
auto pe = ggml_new_tensor_4d(compute_ctx, GGML_TYPE_F32, 2, 2, flux_params.axes_dim_sum / 2, pos_len);
auto pe = ggml_new_tensor_4d(compute_ctx, GGML_TYPE_F32, 2, 2, config.axes_dim_sum / 2, pos_len);
// pe->data = pe_vec.data();
// print_ggml_tensor(pe);
// pe->data = nullptr;
set_backend_tensor_data(pe, pe_vec.data());
if (version == VERSION_CHROMA_RADIANCE) {
int patch_size = flux_params.patch_size;
int nerf_max_freqs = flux_params.chroma_radiance_params.nerf_max_freqs;
int patch_size = config.patch_size;
int nerf_max_freqs = config.chroma_radiance_params.nerf_max_freqs;
dct_vec = fetch_dct_pos(patch_size, nerf_max_freqs);
dct = ggml_new_tensor_2d(compute_ctx, GGML_TYPE_F32, nerf_max_freqs * nerf_max_freqs, patch_size * patch_size);
// dct->data = dct_vec.data();
@ -1614,4 +1627,4 @@ namespace Flux {
} // namespace Flux
#endif // __FLUX_HPP__
#endif // __SD_MODEL_DIFFUSION_FLUX_HPP__

View File

@ -1,5 +1,5 @@
#ifndef __SD_HIDREAM_O1_H__
#define __SD_HIDREAM_O1_H__
#ifndef __SD_MODEL_DIFFUSION_HIDREAM_O1_HPP__
#define __SD_MODEL_DIFFUSION_HIDREAM_O1_HPP__
#include <algorithm>
#include <array>
@ -10,11 +10,11 @@
#include <utility>
#include <vector>
#include "common_dit.hpp"
#include "conditioner.hpp"
#include "diffusion_model.hpp"
#include "llm.hpp"
#include "util.h"
#include "conditioning/conditioner.hpp"
#include "core/util.h"
#include "model/diffusion/dit.hpp"
#include "model/diffusion/model.hpp"
#include "model/te/llm.hpp"
namespace HiDreamO1 {
constexpr int HIDREAM_O1_GRAPH_SIZE = 32768;
@ -23,6 +23,39 @@ namespace HiDreamO1 {
constexpr int IMAGE_TOKEN_ID = 151655;
constexpr int VISION_START_TOKEN_ID = 151652;
struct HiDreamO1Config {
LLM::LLMConfig llm;
int patch_size = PATCH_SIZE;
static HiDreamO1Config detect_from_weights(const String2TensorStorage& tensor_storage_map, const std::string& prefix) {
(void)tensor_storage_map;
(void)prefix;
HiDreamO1Config config;
config.llm.arch = LLM::LLMArch::QWEN3_VL;
config.llm.hidden_size = 4096;
config.llm.intermediate_size = 12288;
config.llm.num_layers = 36;
config.llm.num_heads = 32;
config.llm.num_kv_heads = 8;
config.llm.head_dim = 128;
config.llm.qkv_bias = false;
config.llm.qk_norm = true;
config.llm.vocab_size = 151936;
config.llm.rms_norm_eps = 1e-6f;
config.llm.vision.arch = LLM::LLMVisionArch::QWEN3_VL;
config.llm.vision.num_layers = 27;
config.llm.vision.hidden_size = 1152;
config.llm.vision.intermediate_size = 4304;
config.llm.vision.num_heads = 16;
config.llm.vision.out_hidden_size = 4096;
config.llm.vision.patch_size = 16;
config.llm.vision.spatial_merge_size = 2;
config.llm.vision.temporal_patch_size = 2;
config.llm.vision.num_position_embeddings = 2304;
return config;
}
};
static inline std::string repeat_special_token(const std::string& token, int64_t count) {
std::string out;
out.reserve(static_cast<size_t>(count) * token.size());
@ -205,50 +238,19 @@ namespace HiDreamO1 {
}
};
struct HiDreamO1Params {
LLM::LLMParams llm;
int patch_size = PATCH_SIZE;
};
static inline HiDreamO1Params make_hidream_o1_params() {
HiDreamO1Params params;
params.llm.arch = LLM::LLMArch::QWEN3_VL;
params.llm.hidden_size = 4096;
params.llm.intermediate_size = 12288;
params.llm.num_layers = 36;
params.llm.num_heads = 32;
params.llm.num_kv_heads = 8;
params.llm.head_dim = 128;
params.llm.qkv_bias = false;
params.llm.qk_norm = true;
params.llm.vocab_size = 151936;
params.llm.rms_norm_eps = 1e-6f;
params.llm.vision.arch = LLM::LLMVisionArch::QWEN3_VL;
params.llm.vision.num_layers = 27;
params.llm.vision.hidden_size = 1152;
params.llm.vision.intermediate_size = 4304;
params.llm.vision.num_heads = 16;
params.llm.vision.out_hidden_size = 4096;
params.llm.vision.patch_size = 16;
params.llm.vision.spatial_merge_size = 2;
params.llm.vision.temporal_patch_size = 2;
params.llm.vision.num_position_embeddings = 2304;
return params;
}
struct HiDreamO1Model : public GGMLBlock {
HiDreamO1Params params;
HiDreamO1Config config;
HiDreamO1Model() = default;
explicit HiDreamO1Model(HiDreamO1Params params)
: params(std::move(params)) {
blocks["language_model"] = std::make_shared<LLM::TextModel>(this->params.llm);
blocks["t_embedder1"] = std::make_shared<TimestepEmbedder>(this->params.llm.hidden_size);
blocks["x_embedder"] = std::make_shared<BottleneckPatchEmbed>(this->params.patch_size * this->params.patch_size * 3,
this->params.llm.hidden_size / 4,
this->params.llm.hidden_size);
blocks["final_layer2"] = std::make_shared<FinalLayer>(this->params.llm.hidden_size,
this->params.patch_size * this->params.patch_size * 3);
explicit HiDreamO1Model(HiDreamO1Config config)
: config(std::move(config)) {
blocks["language_model"] = std::make_shared<LLM::TextModel>(this->config.llm);
blocks["t_embedder1"] = std::make_shared<TimestepEmbedder>(this->config.llm.hidden_size);
blocks["x_embedder"] = std::make_shared<BottleneckPatchEmbed>(this->config.patch_size * this->config.patch_size * 3,
this->config.llm.hidden_size / 4,
this->config.llm.hidden_size);
blocks["final_layer2"] = std::make_shared<FinalLayer>(this->config.llm.hidden_size,
this->config.patch_size * this->config.patch_size * 3);
}
std::shared_ptr<LLM::TextModel> text_model() {
@ -269,7 +271,7 @@ namespace HiDreamO1 {
};
struct HiDreamO1VisionRunner : public GGMLRunner {
HiDreamO1Params params;
HiDreamO1Config config;
std::shared_ptr<LLM::VisionModel> model;
std::vector<int> window_index_vec;
@ -284,8 +286,8 @@ namespace HiDreamO1 {
const String2TensorStorage& tensor_storage_map = {},
const std::string& prefix = "model.visual")
: GGMLRunner(backend, params_backend),
params(make_hidream_o1_params()),
model(std::make_shared<LLM::VisionModel>(false, params.llm.vision)) {
config(HiDreamO1Config::detect_from_weights(tensor_storage_map, prefix)),
model(std::make_shared<LLM::VisionModel>(false, config.llm.vision)) {
model->init(params_ctx, tensor_storage_map, prefix);
}
@ -302,7 +304,7 @@ namespace HiDreamO1 {
compute_ctx,
runner_ctx,
image,
params.llm.vision,
config.llm.vision,
model,
window_index_vec,
window_inverse_index_vec,
@ -331,7 +333,7 @@ namespace HiDreamO1 {
};
struct HiDreamO1Runner : public DiffusionModelRunner {
HiDreamO1Params params;
HiDreamO1Config config;
HiDreamO1Model model;
std::vector<float> attention_mask_vec;
@ -341,8 +343,8 @@ namespace HiDreamO1 {
const String2TensorStorage& tensor_storage_map = {},
const std::string& prefix = "model")
: DiffusionModelRunner(backend, params_backend, prefix),
params(make_hidream_o1_params()) {
model = HiDreamO1Model(params);
config(HiDreamO1Config::detect_from_weights(tensor_storage_map, prefix)) {
model = HiDreamO1Model(config);
model.init(params_ctx, tensor_storage_map, prefix);
}
@ -676,4 +678,4 @@ namespace HiDreamO1 {
};
} // namespace HiDreamO1
#endif // __SD_HIDREAM_O1_H__
#endif // __SD_MODEL_DIFFUSION_HIDREAM_O1_HPP__

View File

@ -1,5 +1,5 @@
#ifndef __IDEOGRAM4_HPP__
#define __IDEOGRAM4_HPP__
#ifndef __SD_MODEL_DIFFUSION_IDEOGRAM4_HPP__
#define __SD_MODEL_DIFFUSION_IDEOGRAM4_HPP__
#include <algorithm>
#include <cmath>
@ -8,10 +8,10 @@
#include <string>
#include <vector>
#include "diffusion_model.hpp"
#include "ggml_extend.hpp"
#include "ggml_graph_cut.h"
#include "rope.hpp"
#include "core/ggml_extend.hpp"
#include "core/ggml_graph_cut.h"
#include "model/common/rope.hpp"
#include "model/diffusion/model.hpp"
namespace Ideogram4 {
constexpr int IDEOGRAM4_GRAPH_SIZE = 65536;
@ -38,6 +38,34 @@ namespace Ideogram4 {
std::vector<int> mrope_section = {DEFAULT_MROPE_SECTION_T,
DEFAULT_MROPE_SECTION_H,
DEFAULT_MROPE_SECTION_W};
static Ideogram4Config detect_from_weights(const String2TensorStorage& tensor_storage_map,
const std::string& prefix) {
Ideogram4Config config;
int64_t detected_layers = 0;
std::string layer_prefix = prefix.empty() ? "layers." : prefix + ".layers.";
for (const auto& [name, _] : tensor_storage_map) {
if (name.find(layer_prefix) != 0) {
continue;
}
std::string tail = name.substr(layer_prefix.size());
size_t dot = tail.find('.');
if (dot == std::string::npos) {
continue;
}
int layer_idx = std::atoi(tail.substr(0, dot).c_str());
detected_layers = std::max<int64_t>(detected_layers, layer_idx + 1);
}
if (detected_layers > 0) {
config.num_layers = detected_layers;
LOG_DEBUG("ideogram4: num_layers = %" PRId64 ", emb_dim = %" PRId64 ", num_heads = %" PRId64 ", intermediate_size = %" PRId64,
config.num_layers,
config.emb_dim,
config.num_heads,
config.intermediate_size);
}
return config;
}
};
__STATIC_INLINE__ ggml_tensor* timestep_embedding_sin_cos(ggml_context* ctx,
@ -380,26 +408,6 @@ namespace Ideogram4 {
class Ideogram4Runner : public DiffusionModelRunner {
protected:
static int64_t detect_num_layers(const String2TensorStorage& tensor_storage_map,
const std::string& prefix) {
int64_t detected_layers = 0;
std::string layer_prefix = prefix.empty() ? "layers." : prefix + ".layers.";
for (const auto& pair : tensor_storage_map) {
const std::string& name = pair.first;
if (name.find(layer_prefix) != 0) {
continue;
}
std::string tail = name.substr(layer_prefix.size());
size_t dot = tail.find('.');
if (dot == std::string::npos) {
continue;
}
int layer_idx = std::atoi(tail.substr(0, dot).c_str());
detected_layers = std::max<int64_t>(detected_layers, layer_idx + 1);
}
return detected_layers;
}
bool should_use_uncond_model(const DiffusionParams& diffusion_params) const {
return has_uncond_model &&
diffusion_params.context == nullptr &&
@ -421,12 +429,8 @@ namespace Ideogram4 {
const String2TensorStorage& tensor_storage_map = {},
const std::string prefix = "")
: DiffusionModelRunner(backend, params_backend, prefix),
config(Ideogram4Config::detect_from_weights(tensor_storage_map, prefix)),
uncond_prefix(prefix + ".uncond") {
int64_t detected_layers = detect_num_layers(tensor_storage_map, prefix);
if (detected_layers > 0) {
config.num_layers = detected_layers;
}
model = Ideogram4Transformer(config);
model.init(params_ctx, tensor_storage_map, prefix);
for (const auto& pair : tensor_storage_map) {
@ -524,4 +528,4 @@ namespace Ideogram4 {
};
} // namespace Ideogram4
#endif // __IDEOGRAM4_HPP__
#endif // __SD_MODEL_DIFFUSION_IDEOGRAM4_HPP__

View File

@ -1,18 +1,83 @@
#ifndef __SD_LENS_HPP__
#define __SD_LENS_HPP__
#ifndef __SD_MODEL_DIFFUSION_LENS_HPP__
#define __SD_MODEL_DIFFUSION_LENS_HPP__
#include <memory>
#include <vector>
#include "common_block.hpp"
#include "diffusion_model.hpp"
#include "flux.hpp"
#include "qwen_image.hpp"
#include "rope.hpp"
#include "model/common/block.hpp"
#include "model/common/rope.hpp"
#include "model/diffusion/flux.hpp"
#include "model/diffusion/model.hpp"
#include "model/diffusion/qwen_image.hpp"
namespace Lens {
constexpr int LENS_GRAPH_SIZE = 40960;
struct LensConfig {
int patch_size = 2;
int64_t in_channels = 128;
int64_t out_channels = 32;
int num_layers = 48;
int64_t attention_head_dim = 64;
int64_t num_attention_heads = 24;
int64_t joint_attention_dim = 2880;
int selected_layer_count = 4;
int theta = 10000;
std::vector<int> axes_dim = {8, 28, 28};
int axes_dim_sum = 64;
static LensConfig detect_from_weights(const String2TensorStorage& tensor_storage_map, const std::string& prefix) {
LensConfig config;
config.num_layers = 0;
for (const auto& [name, tensor_storage] : tensor_storage_map) {
if (!starts_with(name, prefix)) {
continue;
}
if (ends_with(name, "img_in.weight") && tensor_storage.n_dims == 2) {
config.in_channels = tensor_storage.ne[0];
int64_t inner_dim = tensor_storage.ne[1];
if (config.attention_head_dim > 0) {
config.num_attention_heads = inner_dim / config.attention_head_dim;
}
} else if (ends_with(name, "txt_in.weight") && tensor_storage.n_dims == 2) {
config.selected_layer_count = static_cast<int>(tensor_storage.ne[0] / config.joint_attention_dim);
} else if (ends_with(name, "proj_out.weight") && tensor_storage.n_dims == 2) {
int64_t patch_area = config.patch_size * config.patch_size;
config.out_channels = tensor_storage.ne[1] / patch_area;
} else if (ends_with(name, "transformer_blocks.0.attn.norm_q.weight") && tensor_storage.n_dims == 1) {
config.attention_head_dim = tensor_storage.ne[0];
}
size_t pos = name.find("transformer_blocks.");
if (pos != std::string::npos) {
auto items = split_string(name.substr(pos), '.');
if (items.size() > 1) {
int block_index = atoi(items[1].c_str());
if (block_index + 1 > config.num_layers) {
config.num_layers = block_index + 1;
}
}
}
}
if (config.num_layers == 0) {
config.num_layers = 48;
}
config.axes_dim_sum = 0;
for (int axis_dim : config.axes_dim) {
config.axes_dim_sum += axis_dim;
}
LOG_DEBUG("lens: num_layers = %d, selected_layer_count = %d, hidden_size = %" PRId64 ", num_attention_heads = %" PRId64 ", attention_head_dim = %" PRId64 ", in_channels = %" PRId64 ", out_channels = %" PRId64,
config.num_layers,
config.selected_layer_count,
config.num_attention_heads * config.attention_head_dim,
config.num_attention_heads,
config.attention_head_dim,
config.in_channels,
config.out_channels);
return config;
}
};
struct LensTimestepProjEmbeddings : public GGMLBlock {
LensTimestepProjEmbeddings(int64_t embedding_dim) {
blocks["timestep_embedder"] = std::make_shared<Qwen::TimestepEmbedding>(256, embedding_dim);
@ -209,41 +274,27 @@ namespace Lens {
}
};
struct LensParams {
int patch_size = 2;
int64_t in_channels = 128;
int64_t out_channels = 32;
int num_layers = 48;
int64_t attention_head_dim = 64;
int64_t num_attention_heads = 24;
int64_t joint_attention_dim = 2880;
int selected_layer_count = 4;
int theta = 10000;
std::vector<int> axes_dim = {8, 28, 28};
int axes_dim_sum = 64;
};
class LensModel : public GGMLBlock {
public:
LensParams params;
LensConfig config;
LensModel() = default;
LensModel(LensParams params)
: params(params) {
int64_t inner_dim = params.num_attention_heads * params.attention_head_dim;
LensModel(LensConfig config)
: config(config) {
int64_t inner_dim = config.num_attention_heads * config.attention_head_dim;
blocks["time_text_embed"] = std::make_shared<LensTimestepProjEmbeddings>(inner_dim);
blocks["img_in"] = std::make_shared<Linear>(params.in_channels, inner_dim, true);
blocks["txt_in"] = std::make_shared<Linear>(params.joint_attention_dim * params.selected_layer_count, inner_dim, true);
for (int i = 0; i < params.selected_layer_count; ++i) {
blocks["txt_norm." + std::to_string(i)] = std::make_shared<RMSNorm>(params.joint_attention_dim, 1e-5f);
blocks["img_in"] = std::make_shared<Linear>(config.in_channels, inner_dim, true);
blocks["txt_in"] = std::make_shared<Linear>(config.joint_attention_dim * config.selected_layer_count, inner_dim, true);
for (int i = 0; i < config.selected_layer_count; ++i) {
blocks["txt_norm." + std::to_string(i)] = std::make_shared<RMSNorm>(config.joint_attention_dim, 1e-5f);
}
for (int i = 0; i < params.num_layers; ++i) {
for (int i = 0; i < config.num_layers; ++i) {
blocks["transformer_blocks." + std::to_string(i)] = std::make_shared<LensTransformerBlock>(inner_dim,
params.num_attention_heads,
params.attention_head_dim);
config.num_attention_heads,
config.attention_head_dim);
}
blocks["norm_out"] = std::make_shared<LensAdaLayerNormContinuous>(inner_dim, 1e-6f);
blocks["proj_out"] = std::make_shared<Linear>(inner_dim, params.patch_size * params.patch_size * params.out_channels, true);
blocks["proj_out"] = std::make_shared<Linear>(inner_dim, config.patch_size * config.patch_size * config.out_channels, true);
}
ggml_tensor* forward(GGMLRunnerContext* ctx,
@ -269,9 +320,9 @@ namespace Lens {
img = ggml_cont(ctx->ggml_ctx, ggml_ext_torch_permute(ctx->ggml_ctx, img, 1, 0, 2, 3));
img = img_in->forward(ctx, img);
std::vector<ggml_tensor*> txt_chunks = ggml_ext_chunk(ctx->ggml_ctx, context, params.selected_layer_count, 0);
std::vector<ggml_tensor*> txt_chunks = ggml_ext_chunk(ctx->ggml_ctx, context, config.selected_layer_count, 0);
ggml_tensor* txt = nullptr;
for (int i = 0; i < params.selected_layer_count; ++i) {
for (int i = 0; i < config.selected_layer_count; ++i) {
auto txt_norm = std::dynamic_pointer_cast<RMSNorm>(blocks["txt_norm." + std::to_string(i)]);
auto chunk = txt_norm->forward(ctx, txt_chunks[i]);
txt = txt == nullptr ? chunk : ggml_concat(ctx->ggml_ctx, txt, chunk, 0);
@ -281,7 +332,7 @@ namespace Lens {
sd::ggml_graph_cut::mark_graph_cut(img, "lens.prelude", "img");
sd::ggml_graph_cut::mark_graph_cut(txt, "lens.prelude", "txt");
for (int i = 0; i < params.num_layers; ++i) {
for (int i = 0; i < config.num_layers; ++i) {
auto block = std::dynamic_pointer_cast<LensTransformerBlock>(blocks["transformer_blocks." + std::to_string(i)]);
auto out = block->forward(ctx, img, txt, t_emb, pe);
img = out.first;
@ -294,13 +345,13 @@ namespace Lens {
img = proj_out->forward(ctx, img);
auto out = ggml_cont(ctx->ggml_ctx, ggml_ext_torch_permute(ctx->ggml_ctx, img, 1, 0, 2, 3));
out = ggml_reshape_4d(ctx->ggml_ctx, out, W, H, params.patch_size * params.patch_size * params.out_channels, N);
out = ggml_reshape_4d(ctx->ggml_ctx, out, W, H, config.patch_size * config.patch_size * config.out_channels, N);
return out;
}
};
struct LensRunner : public DiffusionModelRunner {
LensParams lens_params;
LensConfig config;
LensModel lens;
std::vector<float> pe_vec;
@ -308,53 +359,9 @@ namespace Lens {
ggml_backend_t params_backend,
const String2TensorStorage& tensor_storage_map = {},
const std::string prefix = "")
: DiffusionModelRunner(backend, params_backend, prefix) {
lens_params.num_layers = 0;
for (const auto& [name, tensor_storage] : tensor_storage_map) {
if (!starts_with(name, prefix)) {
continue;
}
if (ends_with(name, "img_in.weight") && tensor_storage.n_dims == 2) {
lens_params.in_channels = tensor_storage.ne[0];
int64_t inner_dim = tensor_storage.ne[1];
lens_params.num_attention_heads = inner_dim / lens_params.attention_head_dim;
} else if (ends_with(name, "txt_in.weight") && tensor_storage.n_dims == 2) {
lens_params.selected_layer_count = static_cast<int>(tensor_storage.ne[0] / lens_params.joint_attention_dim);
} else if (ends_with(name, "proj_out.weight") && tensor_storage.n_dims == 2) {
lens_params.out_channels = tensor_storage.ne[1] / lens_params.patch_size / lens_params.patch_size;
} else if (ends_with(name, "transformer_blocks.0.attn.norm_q.weight") && tensor_storage.n_dims == 1) {
lens_params.attention_head_dim = tensor_storage.ne[0];
}
size_t pos = name.find("transformer_blocks.");
if (pos != std::string::npos) {
std::string layer_name = name.substr(pos);
auto items = split_string(layer_name, '.');
if (items.size() > 1) {
int block_index = atoi(items[1].c_str());
if (block_index + 1 > lens_params.num_layers) {
lens_params.num_layers = block_index + 1;
}
}
}
}
if (lens_params.num_layers == 0) {
lens_params.num_layers = 48;
}
lens_params.axes_dim_sum = 0;
for (int axis_dim : lens_params.axes_dim) {
lens_params.axes_dim_sum += axis_dim;
}
LOG_INFO("lens: layers = %d, in_channels = %" PRId64 ", out_channels = %" PRId64
", heads = %" PRId64 ", head_dim = %" PRId64,
lens_params.num_layers,
lens_params.in_channels,
lens_params.out_channels,
lens_params.num_attention_heads,
lens_params.attention_head_dim);
lens = LensModel(lens_params);
: DiffusionModelRunner(backend, params_backend, prefix),
config(LensConfig::detect_from_weights(tensor_storage_map, prefix)) {
lens = LensModel(config);
lens.init(params_ctx, tensor_storage_map, prefix);
}
@ -380,12 +387,12 @@ namespace Lens {
static_cast<int>(x->ne[0]),
static_cast<int>(x->ne[3]),
static_cast<int>(context->ne[1]),
lens_params.theta,
config.theta,
circular_y_enabled,
circular_x_enabled,
lens_params.axes_dim);
int pos_len = static_cast<int>(pe_vec.size() / lens_params.axes_dim_sum / 2);
auto pe = ggml_new_tensor_4d(compute_ctx, GGML_TYPE_F32, 2, 2, lens_params.axes_dim_sum / 2, pos_len);
config.axes_dim);
int pos_len = static_cast<int>(pe_vec.size() / config.axes_dim_sum / 2);
auto pe = ggml_new_tensor_4d(compute_ctx, GGML_TYPE_F32, 2, 2, config.axes_dim_sum / 2, pos_len);
set_backend_tensor_data(pe, pe_vec.data());
auto runner_ctx = get_context();
@ -416,4 +423,4 @@ namespace Lens {
};
} // namespace Lens
#endif // __SD_LENS_HPP__
#endif // __SD_MODEL_DIFFUSION_LENS_HPP__

View File

@ -1,5 +1,5 @@
#ifndef __SD_LTXV_HPP__
#define __SD_LTXV_HPP__
#ifndef __SD_MODEL_DIFFUSION_LTXV_HPP__
#define __SD_MODEL_DIFFUSION_LTXV_HPP__
#include <algorithm>
#include <cmath>
@ -9,10 +9,10 @@
#include <utility>
#include <vector>
#include "common_block.hpp"
#include "diffusion_model.hpp"
#include "flux.hpp"
#include "rope.hpp"
#include "model/common/block.hpp"
#include "model/common/rope.hpp"
#include "model/diffusion/flux.hpp"
#include "model/diffusion/model.hpp"
namespace LTXV {
@ -72,6 +72,200 @@ namespace LTXV {
return max_block + 1;
}
struct LTXAVConfig {
int64_t in_channels = 128;
int64_t out_channels = 128;
int64_t hidden_size = 3840;
int64_t cross_attention_dim = 4096;
int64_t caption_channels = 3840;
int64_t num_attention_heads = 30;
int64_t attention_head_dim = 128;
int64_t num_layers = 28;
float positional_embedding_theta = 10000.f;
std::vector<int> positional_embedding_max_pos = {20, 2048, 2048};
std::tuple<int, int, int> vae_scale_factors = {8, 32, 32};
bool causal_temporal_positioning = true;
float timestep_scale_multiplier = 1000.f;
int64_t audio_in_channels = 128;
int64_t audio_out_channels = 128;
int64_t audio_hidden_size = 2048;
int64_t audio_cross_attention_dim = 2048;
int64_t audio_num_attention_heads = 32;
int64_t audio_attention_head_dim = 64;
std::vector<int> audio_positional_embedding_max_pos = {20};
float av_ca_timestep_scale_multiplier = 1000.f;
int64_t num_audio_channels = 8;
int64_t audio_frequency_bins = 16;
bool use_connector = false;
int64_t connector_hidden_size = 3840;
int64_t connector_num_heads = 30;
int64_t connector_head_dim = 128;
int64_t connector_num_layers = 2;
int64_t connector_num_registers = 128;
bool connector_rope_interleaved = false;
bool connector_apply_gated_attention = false;
bool use_audio_connector = false;
int64_t audio_connector_hidden_size = 2048;
int64_t audio_connector_num_heads = 32;
int64_t audio_connector_head_dim = 64;
int64_t audio_connector_num_layers = 2;
int64_t audio_connector_num_registers = 128;
bool audio_connector_rope_interleaved = false;
bool audio_connector_apply_gated_attention = false;
bool video_rope_interleaved = false;
bool use_middle_indices_grid = true;
bool cross_attention_adaln = false;
bool use_caption_projection = true;
bool use_audio_caption_projection = true;
bool caption_proj_before_connector = true;
bool caption_projection_first_linear = false;
bool self_attention_gated = false;
bool cross_attention_gated = false;
static std::pair<int64_t, int64_t> infer_attention_layout(int64_t hidden_size,
int64_t preferred_heads = -1) {
if (preferred_heads > 0 && hidden_size % preferred_heads == 0) {
return {preferred_heads, hidden_size / preferred_heads};
}
const int candidates[] = {128, 96, 80, 64, 48, 40, 32};
for (int head_dim : candidates) {
if (hidden_size % head_dim == 0) {
int64_t heads = hidden_size / head_dim;
if (heads >= 8 && heads <= 64) {
return {heads, head_dim};
}
}
}
return {32, hidden_size / 32};
}
static int64_t infer_gate_heads(const String2TensorStorage& tensor_storage_map,
const std::string& bias_name,
int64_t fallback_heads) {
auto it = tensor_storage_map.find(bias_name);
if (it != tensor_storage_map.end()) {
return it->second.ne[0];
}
return fallback_heads;
}
static LTXAVConfig detect_from_weights(const String2TensorStorage& tensor_storage_map, const std::string& prefix) {
LTXAVConfig config;
auto patchify_proj_iter = tensor_storage_map.find(prefix + ".patchify_proj.weight");
if (patchify_proj_iter != tensor_storage_map.end()) {
config.in_channels = patchify_proj_iter->second.ne[0];
config.hidden_size = patchify_proj_iter->second.ne[1];
int64_t video_heads = infer_gate_heads(tensor_storage_map, prefix + ".transformer_blocks.0.attn1.to_gate_logits.bias", 32);
auto attn_layout = infer_attention_layout(config.hidden_size, video_heads);
config.num_attention_heads = attn_layout.first;
config.attention_head_dim = attn_layout.second;
}
auto audio_patchify_proj_iter = tensor_storage_map.find(prefix + ".audio_patchify_proj.weight");
if (audio_patchify_proj_iter != tensor_storage_map.end()) {
config.audio_in_channels = audio_patchify_proj_iter->second.ne[0];
config.audio_hidden_size = audio_patchify_proj_iter->second.ne[1];
config.audio_out_channels = config.audio_in_channels;
int64_t audio_heads = infer_gate_heads(tensor_storage_map, prefix + ".transformer_blocks.0.audio_attn1.to_gate_logits.bias", 32);
auto audio_attn_layout = infer_attention_layout(config.audio_hidden_size, audio_heads);
config.audio_num_attention_heads = audio_attn_layout.first;
config.audio_attention_head_dim = audio_attn_layout.second;
}
auto proj_out_iter = tensor_storage_map.find(prefix + ".proj_out.weight");
if (proj_out_iter != tensor_storage_map.end()) {
config.out_channels = proj_out_iter->second.ne[1];
}
auto audio_proj_out_iter = tensor_storage_map.find(prefix + ".audio_proj_out.weight");
if (audio_proj_out_iter != tensor_storage_map.end()) {
config.audio_out_channels = audio_proj_out_iter->second.ne[1];
}
auto attn2_iter = tensor_storage_map.find(prefix + ".transformer_blocks.0.attn2.to_k.weight");
if (attn2_iter != tensor_storage_map.end()) {
config.cross_attention_dim = attn2_iter->second.ne[0];
}
auto audio_attn2_iter = tensor_storage_map.find(prefix + ".transformer_blocks.0.audio_attn2.to_k.weight");
if (audio_attn2_iter != tensor_storage_map.end()) {
config.audio_cross_attention_dim = audio_attn2_iter->second.ne[0];
}
if (tensor_storage_map.find(prefix + ".transformer_blocks.0.prompt_scale_shift_table") != tensor_storage_map.end()) {
config.cross_attention_adaln = true;
}
if (tensor_storage_map.find(prefix + ".transformer_blocks.0.attn1.to_gate_logits.weight") != tensor_storage_map.end() ||
tensor_storage_map.find(prefix + ".transformer_blocks.0.audio_attn1.to_gate_logits.weight") != tensor_storage_map.end()) {
config.self_attention_gated = true;
}
if (tensor_storage_map.find(prefix + ".transformer_blocks.0.attn2.to_gate_logits.weight") != tensor_storage_map.end() ||
tensor_storage_map.find(prefix + ".transformer_blocks.0.audio_attn2.to_gate_logits.weight") != tensor_storage_map.end()) {
config.cross_attention_gated = true;
}
if (tensor_storage_map.find(prefix + ".caption_projection.linear_1.weight") == tensor_storage_map.end() &&
tensor_storage_map.find(prefix + ".caption_projection.linear_2.weight") == tensor_storage_map.end()) {
config.use_caption_projection = false;
}
if (tensor_storage_map.find(prefix + ".audio_caption_projection.linear_1.weight") == tensor_storage_map.end() &&
tensor_storage_map.find(prefix + ".audio_caption_projection.linear_2.weight") == tensor_storage_map.end()) {
config.use_audio_caption_projection = false;
}
config.num_layers = count_prefix_blocks(tensor_storage_map, prefix + ".", "transformer_blocks.");
auto connector_iter = tensor_storage_map.find(prefix + ".video_embeddings_connector.transformer_1d_blocks.0.attn1.to_q.weight");
if (connector_iter != tensor_storage_map.end()) {
config.use_connector = true;
config.connector_hidden_size = connector_iter->second.ne[1];
int64_t connector_heads = infer_gate_heads(tensor_storage_map,
prefix + ".video_embeddings_connector.transformer_1d_blocks.0.attn1.to_gate_logits.bias",
32);
auto connector_layout = infer_attention_layout(config.connector_hidden_size, connector_heads);
config.connector_num_heads = connector_layout.first;
config.connector_head_dim = connector_layout.second;
config.connector_num_layers = count_prefix_blocks(tensor_storage_map, prefix + ".video_embeddings_connector.", "transformer_1d_blocks.");
auto register_iter = tensor_storage_map.find(prefix + ".video_embeddings_connector.learnable_registers");
if (register_iter != tensor_storage_map.end()) {
config.connector_num_registers = register_iter->second.ne[1];
}
if (tensor_storage_map.find(prefix + ".video_embeddings_connector.transformer_1d_blocks.0.attn1.to_gate_logits.weight") != tensor_storage_map.end()) {
config.connector_apply_gated_attention = true;
}
}
auto audio_connector_iter = tensor_storage_map.find(prefix + ".audio_embeddings_connector.transformer_1d_blocks.0.attn1.to_q.weight");
if (audio_connector_iter != tensor_storage_map.end()) {
config.use_audio_connector = true;
config.audio_connector_hidden_size = audio_connector_iter->second.ne[1];
int64_t connector_heads = infer_gate_heads(tensor_storage_map,
prefix + ".audio_embeddings_connector.transformer_1d_blocks.0.attn1.to_gate_logits.bias",
32);
auto connector_layout = infer_attention_layout(config.audio_connector_hidden_size, connector_heads);
config.audio_connector_num_heads = connector_layout.first;
config.audio_connector_head_dim = connector_layout.second;
config.audio_connector_num_layers = count_prefix_blocks(tensor_storage_map, prefix + ".audio_embeddings_connector.", "transformer_1d_blocks.");
auto register_iter = tensor_storage_map.find(prefix + ".audio_embeddings_connector.learnable_registers");
if (register_iter != tensor_storage_map.end()) {
config.audio_connector_num_registers = register_iter->second.ne[1];
}
if (tensor_storage_map.find(prefix + ".audio_embeddings_connector.transformer_1d_blocks.0.attn1.to_gate_logits.weight") != tensor_storage_map.end()) {
config.audio_connector_apply_gated_attention = true;
}
}
LOG_DEBUG("ltxav: num_layers = %" PRId64 ", hidden_size = %" PRId64 ", num_attention_heads = %" PRId64 ", audio_hidden_size = %" PRId64 ", audio_num_attention_heads = %" PRId64,
config.num_layers,
config.hidden_size,
config.num_attention_heads,
config.audio_hidden_size,
config.audio_num_attention_heads);
return config;
}
};
__STATIC_INLINE__ std::vector<float> generate_freq_grid(float theta,
int positional_dims,
int dim) {
@ -749,63 +943,6 @@ namespace LTXV {
}
};
struct LTXAVParams {
int64_t in_channels = 128;
int64_t out_channels = 128;
int64_t hidden_size = 3840;
int64_t cross_attention_dim = 4096;
int64_t caption_channels = 3840;
int64_t num_attention_heads = 30;
int64_t attention_head_dim = 128;
int64_t num_layers = 28;
float positional_embedding_theta = 10000.f;
std::vector<int> positional_embedding_max_pos = {20, 2048, 2048};
std::tuple<int, int, int> vae_scale_factors = {8, 32, 32};
bool causal_temporal_positioning = true;
float timestep_scale_multiplier = 1000.f;
int64_t audio_in_channels = 128;
int64_t audio_out_channels = 128;
int64_t audio_hidden_size = 2048;
int64_t audio_cross_attention_dim = 2048;
int64_t audio_num_attention_heads = 32;
int64_t audio_attention_head_dim = 64;
std::vector<int> audio_positional_embedding_max_pos = {20};
float av_ca_timestep_scale_multiplier = 1000.f;
int64_t num_audio_channels = 8;
int64_t audio_frequency_bins = 16;
bool use_connector = false;
int64_t connector_hidden_size = 3840;
int64_t connector_num_heads = 30;
int64_t connector_head_dim = 128;
int64_t connector_num_layers = 2;
int64_t connector_num_registers = 128;
bool connector_rope_interleaved = false;
bool connector_apply_gated_attention = false;
bool use_audio_connector = false;
int64_t audio_connector_hidden_size = 2048;
int64_t audio_connector_num_heads = 32;
int64_t audio_connector_head_dim = 64;
int64_t audio_connector_num_layers = 2;
int64_t audio_connector_num_registers = 128;
bool audio_connector_rope_interleaved = false;
bool audio_connector_apply_gated_attention = false;
bool video_rope_interleaved = false;
bool use_middle_indices_grid = true;
bool cross_attention_adaln = false;
bool use_caption_projection = true;
bool use_audio_caption_projection = true;
bool caption_proj_before_connector = true;
bool caption_projection_first_linear = false;
bool self_attention_gated = false;
bool cross_attention_gated = false;
};
__STATIC_INLINE__ std::pair<int64_t, int64_t> infer_attention_layout(int64_t hidden_size,
int64_t preferred_heads = -1) {
if (preferred_heads > 0 && hidden_size % preferred_heads == 0) {
@ -1169,92 +1306,92 @@ namespace LTXV {
};
struct LTXAVModelBlock : public GGMLBlock {
LTXAVParams cfg;
LTXAVConfig config;
void init_params(ggml_context* ctx,
const String2TensorStorage& tensor_storage_map = {},
const std::string prefix = "") override {
params["scale_shift_table"] = ggml_new_tensor_2d(ctx,
get_type(prefix + "scale_shift_table", tensor_storage_map, GGML_TYPE_F32),
cfg.hidden_size,
config.hidden_size,
2);
params["audio_scale_shift_table"] = ggml_new_tensor_2d(ctx,
get_type(prefix + "audio_scale_shift_table", tensor_storage_map, GGML_TYPE_F32),
cfg.audio_hidden_size,
config.audio_hidden_size,
2);
}
LTXAVModelBlock(const LTXAVParams& params)
: cfg(params) {
blocks["patchify_proj"] = std::make_shared<Linear>(cfg.in_channels, cfg.hidden_size, true, true);
blocks["audio_patchify_proj"] = std::make_shared<Linear>(cfg.audio_in_channels, cfg.audio_hidden_size, true, true);
blocks["adaln_single"] = std::make_shared<AdaLayerNormSingle>(cfg.hidden_size, cfg.cross_attention_adaln ? 9 : 6);
blocks["audio_adaln_single"] = std::make_shared<AdaLayerNormSingle>(cfg.audio_hidden_size, cfg.cross_attention_adaln ? 9 : 6);
if (cfg.cross_attention_adaln) {
blocks["prompt_adaln_single"] = std::make_shared<AdaLayerNormSingle>(cfg.hidden_size, 2);
blocks["audio_prompt_adaln_single"] = std::make_shared<AdaLayerNormSingle>(cfg.audio_hidden_size, 2);
LTXAVModelBlock(const LTXAVConfig& config)
: config(config) {
blocks["patchify_proj"] = std::make_shared<Linear>(config.in_channels, config.hidden_size, true, true);
blocks["audio_patchify_proj"] = std::make_shared<Linear>(config.audio_in_channels, config.audio_hidden_size, true, true);
blocks["adaln_single"] = std::make_shared<AdaLayerNormSingle>(config.hidden_size, config.cross_attention_adaln ? 9 : 6);
blocks["audio_adaln_single"] = std::make_shared<AdaLayerNormSingle>(config.audio_hidden_size, config.cross_attention_adaln ? 9 : 6);
if (config.cross_attention_adaln) {
blocks["prompt_adaln_single"] = std::make_shared<AdaLayerNormSingle>(config.hidden_size, 2);
blocks["audio_prompt_adaln_single"] = std::make_shared<AdaLayerNormSingle>(config.audio_hidden_size, 2);
}
blocks["av_ca_video_scale_shift_adaln_single"] = std::make_shared<AdaLayerNormSingle>(cfg.hidden_size, 4);
blocks["av_ca_a2v_gate_adaln_single"] = std::make_shared<AdaLayerNormSingle>(cfg.hidden_size, 1);
blocks["av_ca_audio_scale_shift_adaln_single"] = std::make_shared<AdaLayerNormSingle>(cfg.audio_hidden_size, 4);
blocks["av_ca_v2a_gate_adaln_single"] = std::make_shared<AdaLayerNormSingle>(cfg.audio_hidden_size, 1);
blocks["av_ca_video_scale_shift_adaln_single"] = std::make_shared<AdaLayerNormSingle>(config.hidden_size, 4);
blocks["av_ca_a2v_gate_adaln_single"] = std::make_shared<AdaLayerNormSingle>(config.hidden_size, 1);
blocks["av_ca_audio_scale_shift_adaln_single"] = std::make_shared<AdaLayerNormSingle>(config.audio_hidden_size, 4);
blocks["av_ca_v2a_gate_adaln_single"] = std::make_shared<AdaLayerNormSingle>(config.audio_hidden_size, 1);
if (cfg.use_caption_projection) {
if (cfg.caption_proj_before_connector) {
if (cfg.caption_projection_first_linear) {
blocks["caption_projection"] = std::make_shared<NormSingleLinearTextProjection>(cfg.caption_channels, cfg.hidden_size);
if (config.use_caption_projection) {
if (config.caption_proj_before_connector) {
if (config.caption_projection_first_linear) {
blocks["caption_projection"] = std::make_shared<NormSingleLinearTextProjection>(config.caption_channels, config.hidden_size);
}
} else {
blocks["caption_projection"] = std::make_shared<PixArtAlphaTextProjection>(cfg.caption_channels, cfg.hidden_size, cfg.hidden_size);
blocks["caption_projection"] = std::make_shared<PixArtAlphaTextProjection>(config.caption_channels, config.hidden_size, config.hidden_size);
}
}
if (cfg.use_audio_caption_projection) {
if (cfg.caption_proj_before_connector) {
if (cfg.caption_projection_first_linear) {
blocks["audio_caption_projection"] = std::make_shared<NormSingleLinearTextProjection>(cfg.caption_channels, cfg.audio_hidden_size);
if (config.use_audio_caption_projection) {
if (config.caption_proj_before_connector) {
if (config.caption_projection_first_linear) {
blocks["audio_caption_projection"] = std::make_shared<NormSingleLinearTextProjection>(config.caption_channels, config.audio_hidden_size);
}
} else {
blocks["audio_caption_projection"] = std::make_shared<PixArtAlphaTextProjection>(cfg.caption_channels, cfg.audio_hidden_size, cfg.audio_hidden_size);
blocks["audio_caption_projection"] = std::make_shared<PixArtAlphaTextProjection>(config.caption_channels, config.audio_hidden_size, config.audio_hidden_size);
}
}
if (cfg.use_connector) {
blocks["video_embeddings_connector"] = std::make_shared<Embeddings1DConnector>(cfg.connector_hidden_size,
cfg.connector_num_heads,
cfg.connector_head_dim,
cfg.connector_num_layers,
cfg.connector_num_registers,
cfg.connector_rope_interleaved,
cfg.connector_apply_gated_attention);
if (config.use_connector) {
blocks["video_embeddings_connector"] = std::make_shared<Embeddings1DConnector>(config.connector_hidden_size,
config.connector_num_heads,
config.connector_head_dim,
config.connector_num_layers,
config.connector_num_registers,
config.connector_rope_interleaved,
config.connector_apply_gated_attention);
}
if (cfg.use_audio_connector) {
blocks["audio_embeddings_connector"] = std::make_shared<Embeddings1DConnector>(cfg.audio_connector_hidden_size,
cfg.audio_connector_num_heads,
cfg.audio_connector_head_dim,
cfg.audio_connector_num_layers,
cfg.audio_connector_num_registers,
cfg.audio_connector_rope_interleaved,
cfg.audio_connector_apply_gated_attention);
if (config.use_audio_connector) {
blocks["audio_embeddings_connector"] = std::make_shared<Embeddings1DConnector>(config.audio_connector_hidden_size,
config.audio_connector_num_heads,
config.audio_connector_head_dim,
config.audio_connector_num_layers,
config.audio_connector_num_registers,
config.audio_connector_rope_interleaved,
config.audio_connector_apply_gated_attention);
}
for (int i = 0; i < cfg.num_layers; i++) {
blocks["transformer_blocks." + std::to_string(i)] = std::make_shared<BasicAVTransformerBlock>(cfg.hidden_size,
cfg.audio_hidden_size,
cfg.num_attention_heads,
cfg.audio_num_attention_heads,
cfg.attention_head_dim,
cfg.audio_attention_head_dim,
cfg.cross_attention_dim,
cfg.audio_cross_attention_dim,
cfg.self_attention_gated || cfg.cross_attention_gated,
cfg.cross_attention_adaln,
cfg.video_rope_interleaved);
for (int i = 0; i < config.num_layers; i++) {
blocks["transformer_blocks." + std::to_string(i)] = std::make_shared<BasicAVTransformerBlock>(config.hidden_size,
config.audio_hidden_size,
config.num_attention_heads,
config.audio_num_attention_heads,
config.attention_head_dim,
config.audio_attention_head_dim,
config.cross_attention_dim,
config.audio_cross_attention_dim,
config.self_attention_gated || config.cross_attention_gated,
config.cross_attention_adaln,
config.video_rope_interleaved);
}
blocks["norm_out"] = std::make_shared<LayerNorm>(cfg.hidden_size, 1e-6f, false);
blocks["proj_out"] = std::make_shared<Linear>(cfg.hidden_size, cfg.out_channels, true, true);
blocks["audio_norm_out"] = std::make_shared<LayerNorm>(cfg.audio_hidden_size, 1e-6f, false);
blocks["audio_proj_out"] = std::make_shared<Linear>(cfg.audio_hidden_size, cfg.audio_out_channels, true, true);
blocks["norm_out"] = std::make_shared<LayerNorm>(config.hidden_size, 1e-6f, false);
blocks["proj_out"] = std::make_shared<Linear>(config.hidden_size, config.out_channels, true, true);
blocks["audio_norm_out"] = std::make_shared<LayerNorm>(config.audio_hidden_size, 1e-6f, false);
blocks["audio_proj_out"] = std::make_shared<Linear>(config.audio_hidden_size, config.audio_out_channels, true, true);
}
ggml_tensor* patchify_video(GGMLRunnerContext* ctx, ggml_tensor* x, int64_t n) {
@ -1293,8 +1430,8 @@ namespace LTXV {
if (ax == nullptr) {
return nullptr;
}
ax = ggml_reshape_4d(ctx->ggml_ctx, ax, cfg.audio_frequency_bins, cfg.num_audio_channels, audio_length, ax->ne[2]); // [b, t, c, f]
ax = ggml_cont(ctx->ggml_ctx, ggml_ext_torch_permute(ctx->ggml_ctx, ax, 0, 2, 1, 3)); // [b, c, t, f]
ax = ggml_reshape_4d(ctx->ggml_ctx, ax, config.audio_frequency_bins, config.num_audio_channels, audio_length, ax->ne[2]); // [b, t, c, f]
ax = ggml_cont(ctx->ggml_ctx, ggml_ext_torch_permute(ctx->ggml_ctx, ax, 0, 2, 1, 3)); // [b, c, t, f]
return ax;
}
@ -1308,17 +1445,17 @@ namespace LTXV {
}
bool is_fully_processed_context =
context->ne[0] == cfg.cross_attention_dim + cfg.audio_cross_attention_dim &&
context->ne[0] == config.cross_attention_dim + config.audio_cross_attention_dim &&
context->ne[1] >= 1024;
bool is_unprocessed_dual_context =
context->ne[0] == cfg.cross_attention_dim + cfg.audio_cross_attention_dim &&
context->ne[0] == config.cross_attention_dim + config.audio_cross_attention_dim &&
context->ne[1] < 1024;
if (is_fully_processed_context) {
auto v_context = ggml_ext_slice(ctx->ggml_ctx, context, 0, 0, cfg.cross_attention_dim);
auto v_context = ggml_ext_slice(ctx->ggml_ctx, context, 0, 0, config.cross_attention_dim);
ggml_tensor* a_context = nullptr;
if (process_audio_context) {
a_context = ggml_ext_slice(ctx->ggml_ctx, context, 0, cfg.cross_attention_dim, cfg.cross_attention_dim + cfg.audio_cross_attention_dim);
a_context = ggml_ext_slice(ctx->ggml_ctx, context, 0, config.cross_attention_dim, config.cross_attention_dim + config.audio_cross_attention_dim);
}
return {v_context, a_context};
}
@ -1326,32 +1463,32 @@ namespace LTXV {
ggml_tensor* v_context = context;
ggml_tensor* a_context = process_audio_context ? context : nullptr;
if (is_unprocessed_dual_context) {
v_context = ggml_ext_slice(ctx->ggml_ctx, context, 0, 0, cfg.cross_attention_dim);
v_context = ggml_ext_slice(ctx->ggml_ctx, context, 0, 0, config.cross_attention_dim);
if (process_audio_context) {
a_context = ggml_ext_slice(ctx->ggml_ctx, context, 0, cfg.cross_attention_dim, cfg.cross_attention_dim + cfg.audio_cross_attention_dim);
a_context = ggml_ext_slice(ctx->ggml_ctx, context, 0, config.cross_attention_dim, config.cross_attention_dim + config.audio_cross_attention_dim);
}
} else if (context->ne[0] == cfg.caption_channels * 2) {
v_context = ggml_ext_slice(ctx->ggml_ctx, context, 0, 0, cfg.caption_channels);
} else if (context->ne[0] == config.caption_channels * 2) {
v_context = ggml_ext_slice(ctx->ggml_ctx, context, 0, 0, config.caption_channels);
if (process_audio_context) {
a_context = ggml_ext_slice(ctx->ggml_ctx, context, 0, cfg.caption_channels, cfg.caption_channels * 2);
a_context = ggml_ext_slice(ctx->ggml_ctx, context, 0, config.caption_channels, config.caption_channels * 2);
}
}
if (cfg.caption_proj_before_connector) {
if (cfg.use_caption_projection &&
if (config.caption_proj_before_connector) {
if (config.use_caption_projection &&
blocks.count("caption_projection") > 0 &&
v_context != nullptr &&
v_context->ne[0] == cfg.caption_channels) {
v_context->ne[0] == config.caption_channels) {
auto caption_projection = std::dynamic_pointer_cast<NormSingleLinearTextProjection>(blocks["caption_projection"]);
if (caption_projection != nullptr) {
v_context = caption_projection->forward(ctx, v_context);
}
}
if (process_audio_context &&
cfg.use_audio_caption_projection &&
config.use_audio_caption_projection &&
blocks.count("audio_caption_projection") > 0 &&
a_context != nullptr &&
a_context->ne[0] == cfg.caption_channels) {
a_context->ne[0] == config.caption_channels) {
auto caption_projection = std::dynamic_pointer_cast<NormSingleLinearTextProjection>(blocks["audio_caption_projection"]);
if (caption_projection != nullptr) {
a_context = caption_projection->forward(ctx, a_context);
@ -1359,34 +1496,34 @@ namespace LTXV {
}
}
if (cfg.use_connector && v_context != nullptr && v_context->ne[0] == cfg.connector_hidden_size) {
if (config.use_connector && v_context != nullptr && v_context->ne[0] == config.connector_hidden_size) {
auto connector = std::dynamic_pointer_cast<Embeddings1DConnector>(blocks["video_embeddings_connector"]);
v_context = connector->forward(ctx, v_context, video_connector_pe);
}
if (process_audio_context &&
cfg.use_audio_connector &&
config.use_audio_connector &&
a_context != nullptr &&
a_context->ne[0] == cfg.audio_connector_hidden_size) {
a_context->ne[0] == config.audio_connector_hidden_size) {
auto connector = std::dynamic_pointer_cast<Embeddings1DConnector>(blocks["audio_embeddings_connector"]);
a_context = connector->forward(ctx, a_context, audio_connector_pe);
}
if (!cfg.caption_proj_before_connector &&
cfg.use_caption_projection &&
if (!config.caption_proj_before_connector &&
config.use_caption_projection &&
blocks.count("caption_projection") > 0 &&
v_context != nullptr &&
v_context->ne[0] == cfg.caption_channels) {
v_context->ne[0] == config.caption_channels) {
auto caption_projection = std::dynamic_pointer_cast<PixArtAlphaTextProjection>(blocks["caption_projection"]);
if (caption_projection != nullptr) {
v_context = caption_projection->forward(ctx, v_context);
}
}
if (process_audio_context &&
!cfg.caption_proj_before_connector &&
cfg.use_audio_caption_projection &&
!config.caption_proj_before_connector &&
config.use_audio_caption_projection &&
blocks.count("audio_caption_projection") > 0 &&
a_context != nullptr &&
a_context->ne[0] == cfg.caption_channels) {
a_context->ne[0] == config.caption_channels) {
auto caption_projection = std::dynamic_pointer_cast<PixArtAlphaTextProjection>(blocks["audio_caption_projection"]);
if (caption_projection != nullptr) {
a_context = caption_projection->forward(ctx, a_context);
@ -1428,8 +1565,8 @@ namespace LTXV {
auto audio_norm_out = std::dynamic_pointer_cast<LayerNorm>(blocks["audio_norm_out"]);
auto audio_proj_out = std::dynamic_pointer_cast<Linear>(blocks["audio_proj_out"]);
GGML_ASSERT(vx->ne[3] % cfg.in_channels == 0);
int64_t n = vx->ne[3] / cfg.in_channels;
GGML_ASSERT(vx->ne[3] % config.in_channels == 0);
int64_t n = vx->ne[3] / config.in_channels;
int64_t width = vx->ne[0];
int64_t height = vx->ne[1];
int64_t frames = vx->ne[2];
@ -1452,20 +1589,20 @@ namespace LTXV {
a_context = ggml_cont(ctx->ggml_ctx, a_context);
}
auto v_timestep_scaled = ggml_ext_scale(ctx->ggml_ctx, timestep, cfg.timestep_scale_multiplier);
auto v_timestep_scaled = ggml_ext_scale(ctx->ggml_ctx, timestep, config.timestep_scale_multiplier);
auto v_pair = adaln_single->forward(ctx, v_timestep_scaled);
auto v_timestep_mod = v_pair.first;
auto v_embedded_time = v_pair.second;
ggml_tensor* effective_audio_timestep = audio_timestep != nullptr ? audio_timestep : timestep;
auto a_timestep_scaled = ggml_ext_scale(ctx->ggml_ctx, effective_audio_timestep, cfg.timestep_scale_multiplier);
auto a_timestep_scaled = ggml_ext_scale(ctx->ggml_ctx, effective_audio_timestep, config.timestep_scale_multiplier);
auto a_pair = audio_adaln_single->forward(ctx, a_timestep_scaled);
auto a_timestep_mod = a_pair.first;
auto a_embedded_time = a_pair.second;
ggml_tensor* v_prompt_timestep_mod = nullptr;
ggml_tensor* a_prompt_timestep_mod = nullptr;
if (cfg.cross_attention_adaln) {
if (config.cross_attention_adaln) {
auto prompt_adaln_single = std::dynamic_pointer_cast<AdaLayerNormSingle>(blocks["prompt_adaln_single"]);
auto audio_prompt_adaln_single = std::dynamic_pointer_cast<AdaLayerNormSingle>(blocks["audio_prompt_adaln_single"]);
v_prompt_timestep_mod = prompt_adaln_single->forward(ctx, a_timestep_scaled).first;
@ -1474,7 +1611,7 @@ namespace LTXV {
auto av_ca_video_timestep = repeat_scalar_timestep_like(ctx, effective_audio_timestep, timestep);
auto av_ca_audio_timestep = effective_audio_timestep;
auto av_ca_factor = cfg.av_ca_timestep_scale_multiplier / cfg.timestep_scale_multiplier;
auto av_ca_factor = config.av_ca_timestep_scale_multiplier / config.timestep_scale_multiplier;
auto av_ca_video_scale_shift_timestep =
std::dynamic_pointer_cast<AdaLayerNormSingle>(blocks["av_ca_video_scale_shift_adaln_single"])->forward(ctx, av_ca_video_timestep).first;
auto av_ca_a2v_gate_noise_timestep =
@ -1491,7 +1628,7 @@ namespace LTXV {
sd::ggml_graph_cut::mark_graph_cut(vx, "ltxav.prelude", "vx");
sd::ggml_graph_cut::mark_graph_cut(ax, "ltxav.prelude", "ax");
for (int i = 0; i < cfg.num_layers; i++) {
for (int i = 0; i < config.num_layers; i++) {
auto block = std::dynamic_pointer_cast<BasicAVTransformerBlock>(blocks["transformer_blocks." + std::to_string(i)]);
auto out = block->forward(ctx,
vx,
@ -1517,14 +1654,14 @@ namespace LTXV {
sd::ggml_graph_cut::mark_graph_cut(ax, "ltxav.transformer_blocks." + std::to_string(i), "ax");
}
auto v_shift_scale = get_output_scale_shift(ctx, params["scale_shift_table"], v_embedded_time, cfg.hidden_size);
auto v_shift_scale = get_output_scale_shift(ctx, params["scale_shift_table"], v_embedded_time, config.hidden_size);
vx = norm_out->forward(ctx, vx);
vx = modulate(ctx->ggml_ctx, vx, v_shift_scale[0], v_shift_scale[1]);
vx = proj_out->forward(ctx, vx);
vx = unpatchify_video(ctx, vx, width, height, frames);
if (ax != nullptr && audio_time > 0) {
auto a_shift_scale = get_output_scale_shift(ctx, params["audio_scale_shift_table"], a_embedded_time, cfg.audio_hidden_size);
auto a_shift_scale = get_output_scale_shift(ctx, params["audio_scale_shift_table"], a_embedded_time, config.audio_hidden_size);
ax = audio_norm_out->forward(ctx, ax);
ax = modulate(ctx->ggml_ctx, ax, a_shift_scale[0], a_shift_scale[1]);
ax = audio_proj_out->forward(ctx, ax);
@ -1536,7 +1673,7 @@ namespace LTXV {
};
struct LTXAVRunner : public DiffusionModelRunner {
LTXAVParams params;
LTXAVConfig config;
LTXAVModelBlock model;
std::vector<float> video_pe_vec;
std::vector<float> audio_pe_vec;
@ -1547,124 +1684,13 @@ namespace LTXV {
sd::Tensor<float> vx_input_cache;
sd::Tensor<float> ax_input_cache;
static int64_t infer_gate_heads(const String2TensorStorage& tensor_storage_map,
const std::string& bias_name,
int64_t fallback_heads) {
auto it = tensor_storage_map.find(bias_name);
if (it != tensor_storage_map.end()) {
return it->second.ne[0];
}
return fallback_heads;
}
LTXAVRunner(ggml_backend_t backend,
ggml_backend_t params_backend,
const String2TensorStorage& tensor_storage_map = {},
const std::string& prefix = "model.diffusion_model")
: DiffusionModelRunner(backend, params_backend, prefix),
params(),
model(params) {
auto patchify_proj_iter = tensor_storage_map.find(prefix + ".patchify_proj.weight");
if (patchify_proj_iter != tensor_storage_map.end()) {
params.in_channels = patchify_proj_iter->second.ne[0];
params.hidden_size = patchify_proj_iter->second.ne[1];
int64_t video_heads = infer_gate_heads(tensor_storage_map, prefix + ".transformer_blocks.0.attn1.to_gate_logits.bias", 32);
auto attn_layout = infer_attention_layout(params.hidden_size, video_heads);
params.num_attention_heads = attn_layout.first;
params.attention_head_dim = attn_layout.second;
}
auto audio_patchify_proj_iter = tensor_storage_map.find(prefix + ".audio_patchify_proj.weight");
if (audio_patchify_proj_iter != tensor_storage_map.end()) {
params.audio_in_channels = audio_patchify_proj_iter->second.ne[0];
params.audio_hidden_size = audio_patchify_proj_iter->second.ne[1];
params.audio_out_channels = params.audio_in_channels;
int64_t audio_heads = infer_gate_heads(tensor_storage_map, prefix + ".transformer_blocks.0.audio_attn1.to_gate_logits.bias", 32);
auto audio_attn_layout = infer_attention_layout(params.audio_hidden_size, audio_heads);
params.audio_num_attention_heads = audio_attn_layout.first;
params.audio_attention_head_dim = audio_attn_layout.second;
}
auto proj_out_iter = tensor_storage_map.find(prefix + ".proj_out.weight");
if (proj_out_iter != tensor_storage_map.end()) {
params.out_channels = proj_out_iter->second.ne[1];
}
auto audio_proj_out_iter = tensor_storage_map.find(prefix + ".audio_proj_out.weight");
if (audio_proj_out_iter != tensor_storage_map.end()) {
params.audio_out_channels = audio_proj_out_iter->second.ne[1];
}
auto attn2_iter = tensor_storage_map.find(prefix + ".transformer_blocks.0.attn2.to_k.weight");
if (attn2_iter != tensor_storage_map.end()) {
params.cross_attention_dim = attn2_iter->second.ne[0];
}
auto audio_attn2_iter = tensor_storage_map.find(prefix + ".transformer_blocks.0.audio_attn2.to_k.weight");
if (audio_attn2_iter != tensor_storage_map.end()) {
params.audio_cross_attention_dim = audio_attn2_iter->second.ne[0];
}
if (tensor_storage_map.find(prefix + ".transformer_blocks.0.prompt_scale_shift_table") != tensor_storage_map.end()) {
params.cross_attention_adaln = true;
}
if (tensor_storage_map.find(prefix + ".transformer_blocks.0.attn1.to_gate_logits.weight") != tensor_storage_map.end() ||
tensor_storage_map.find(prefix + ".transformer_blocks.0.audio_attn1.to_gate_logits.weight") != tensor_storage_map.end()) {
params.self_attention_gated = true;
}
if (tensor_storage_map.find(prefix + ".transformer_blocks.0.attn2.to_gate_logits.weight") != tensor_storage_map.end() ||
tensor_storage_map.find(prefix + ".transformer_blocks.0.audio_attn2.to_gate_logits.weight") != tensor_storage_map.end()) {
params.cross_attention_gated = true;
}
if (tensor_storage_map.find(prefix + ".caption_projection.linear_1.weight") == tensor_storage_map.end() &&
tensor_storage_map.find(prefix + ".caption_projection.linear_2.weight") == tensor_storage_map.end()) {
params.use_caption_projection = false;
}
if (tensor_storage_map.find(prefix + ".audio_caption_projection.linear_1.weight") == tensor_storage_map.end() &&
tensor_storage_map.find(prefix + ".audio_caption_projection.linear_2.weight") == tensor_storage_map.end()) {
params.use_audio_caption_projection = false;
}
params.num_layers = count_prefix_blocks(tensor_storage_map, prefix + ".", "transformer_blocks.");
auto connector_iter = tensor_storage_map.find(prefix + ".video_embeddings_connector.transformer_1d_blocks.0.attn1.to_q.weight");
if (connector_iter != tensor_storage_map.end()) {
params.use_connector = true;
params.connector_hidden_size = connector_iter->second.ne[1];
int64_t connector_heads = infer_gate_heads(tensor_storage_map,
prefix + ".video_embeddings_connector.transformer_1d_blocks.0.attn1.to_gate_logits.bias",
32);
auto connector_layout = infer_attention_layout(params.connector_hidden_size, connector_heads);
params.connector_num_heads = connector_layout.first;
params.connector_head_dim = connector_layout.second;
params.connector_num_layers = count_prefix_blocks(tensor_storage_map, prefix + ".video_embeddings_connector.", "transformer_1d_blocks.");
auto register_iter = tensor_storage_map.find(prefix + ".video_embeddings_connector.learnable_registers");
if (register_iter != tensor_storage_map.end()) {
params.connector_num_registers = register_iter->second.ne[1];
}
if (tensor_storage_map.find(prefix + ".video_embeddings_connector.transformer_1d_blocks.0.attn1.to_gate_logits.weight") != tensor_storage_map.end()) {
params.connector_apply_gated_attention = true;
}
}
auto audio_connector_iter = tensor_storage_map.find(prefix + ".audio_embeddings_connector.transformer_1d_blocks.0.attn1.to_q.weight");
if (audio_connector_iter != tensor_storage_map.end()) {
params.use_audio_connector = true;
params.audio_connector_hidden_size = audio_connector_iter->second.ne[1];
int64_t connector_heads = infer_gate_heads(tensor_storage_map,
prefix + ".audio_embeddings_connector.transformer_1d_blocks.0.attn1.to_gate_logits.bias",
32);
auto connector_layout = infer_attention_layout(params.audio_connector_hidden_size, connector_heads);
params.audio_connector_num_heads = connector_layout.first;
params.audio_connector_head_dim = connector_layout.second;
params.audio_connector_num_layers = count_prefix_blocks(tensor_storage_map, prefix + ".audio_embeddings_connector.", "transformer_1d_blocks.");
auto register_iter = tensor_storage_map.find(prefix + ".audio_embeddings_connector.learnable_registers");
if (register_iter != tensor_storage_map.end()) {
params.audio_connector_num_registers = register_iter->second.ne[1];
}
if (tensor_storage_map.find(prefix + ".audio_embeddings_connector.transformer_1d_blocks.0.attn1.to_gate_logits.weight") != tensor_storage_map.end()) {
params.audio_connector_apply_gated_attention = true;
}
}
model = LTXAVModelBlock(params);
config(LTXAVConfig::detect_from_weights(tensor_storage_map, prefix)),
model(config) {
model.init(params_ctx, tensor_storage_map, prefix);
}
@ -1692,21 +1718,21 @@ namespace LTXV {
int64_t total_channels = x_tensor.shape()[3];
int64_t spatial_size = width * height * frames;
GGML_ASSERT(total_channels >= params.in_channels);
GGML_ASSERT(total_channels >= config.in_channels);
sd::Tensor<float> vx({width, height, frames, params.in_channels});
size_t video_values = static_cast<size_t>(params.in_channels * spatial_size);
sd::Tensor<float> vx({width, height, frames, config.in_channels});
size_t video_values = static_cast<size_t>(config.in_channels * spatial_size);
std::copy_n(x_tensor.data(), video_values, vx.data());
if (audio_length <= 0 || total_channels == params.in_channels) {
if (audio_length <= 0 || total_channels == config.in_channels) {
return {vx, {}};
}
int64_t needed_audio_values = static_cast<int64_t>(audio_length) * params.num_audio_channels * params.audio_frequency_bins;
int64_t packed_audio_values = (total_channels - params.in_channels) * spatial_size;
int64_t needed_audio_values = static_cast<int64_t>(audio_length) * config.num_audio_channels * config.audio_frequency_bins;
int64_t packed_audio_values = (total_channels - config.in_channels) * spatial_size;
GGML_ASSERT(packed_audio_values >= needed_audio_values);
sd::Tensor<float> ax({params.audio_frequency_bins, audio_length, params.num_audio_channels, 1});
sd::Tensor<float> ax({config.audio_frequency_bins, audio_length, config.num_audio_channels, 1});
const float* audio_src = x_tensor.data() + video_values;
std::copy_n(audio_src, static_cast<size_t>(needed_audio_values), ax.data());
return {vx, ax};
@ -1767,25 +1793,25 @@ namespace LTXV {
if (has_video_positions) {
GGML_ASSERT(video_positions_tensor.shape()[2] == video_token_count);
video_pe_vec = build_video_rope_matrix_from_positions(video_positions_tensor,
static_cast<int>(params.hidden_size),
static_cast<int>(params.num_attention_heads),
params.positional_embedding_theta,
params.positional_embedding_max_pos,
params.use_middle_indices_grid);
static_cast<int>(config.hidden_size),
static_cast<int>(config.num_attention_heads),
config.positional_embedding_theta,
config.positional_embedding_max_pos,
config.use_middle_indices_grid);
} else {
video_pe_vec = build_video_rope_matrix(vx->ne[0],
vx->ne[1],
vx->ne[2],
static_cast<int>(params.hidden_size),
static_cast<int>(params.num_attention_heads),
static_cast<int>(config.hidden_size),
static_cast<int>(config.num_attention_heads),
video_frame_rate,
params.positional_embedding_theta,
params.positional_embedding_max_pos,
params.vae_scale_factors,
params.causal_temporal_positioning,
params.use_middle_indices_grid);
config.positional_embedding_theta,
config.positional_embedding_max_pos,
config.vae_scale_factors,
config.causal_temporal_positioning,
config.use_middle_indices_grid);
}
auto video_pe = ggml_new_tensor_4d(compute_ctx, GGML_TYPE_F32, 2, 2, params.attention_head_dim / 2, video_token_count * params.num_attention_heads);
auto video_pe = ggml_new_tensor_4d(compute_ctx, GGML_TYPE_F32, 2, 2, config.attention_head_dim / 2, video_token_count * config.num_attention_heads);
ggml_set_name(video_pe, "ltxav_video_pe");
set_backend_tensor_data(video_pe, video_pe_vec.data());
@ -1794,66 +1820,66 @@ namespace LTXV {
ggml_tensor* audio_cross_pe = nullptr;
if (ax != nullptr && ggml_nelements(ax) > 0 && ax->ne[1] > 0) {
audio_pe_vec = build_audio_rope_matrix(ax->ne[1],
static_cast<int>(params.audio_hidden_size),
static_cast<int>(params.audio_num_attention_heads),
params.positional_embedding_theta,
params.audio_positional_embedding_max_pos[0],
params.use_middle_indices_grid);
audio_pe = ggml_new_tensor_4d(compute_ctx, GGML_TYPE_F32, 2, 2, params.audio_attention_head_dim / 2, ax->ne[1] * params.audio_num_attention_heads);
static_cast<int>(config.audio_hidden_size),
static_cast<int>(config.audio_num_attention_heads),
config.positional_embedding_theta,
config.audio_positional_embedding_max_pos[0],
config.use_middle_indices_grid);
audio_pe = ggml_new_tensor_4d(compute_ctx, GGML_TYPE_F32, 2, 2, config.audio_attention_head_dim / 2, ax->ne[1] * config.audio_num_attention_heads);
ggml_set_name(audio_pe, "ltxav_audio_pe");
set_backend_tensor_data(audio_pe, audio_pe_vec.data());
int temporal_max_pos = std::max(params.positional_embedding_max_pos[0], params.audio_positional_embedding_max_pos[0]);
int temporal_max_pos = std::max(config.positional_embedding_max_pos[0], config.audio_positional_embedding_max_pos[0]);
if (has_video_positions) {
video_cross_pe_vec = build_video_temporal_rope_matrix_from_positions(video_positions_tensor,
static_cast<int>(params.audio_cross_attention_dim),
static_cast<int>(params.audio_num_attention_heads),
params.positional_embedding_theta,
static_cast<int>(config.audio_cross_attention_dim),
static_cast<int>(config.audio_num_attention_heads),
config.positional_embedding_theta,
temporal_max_pos,
true);
} else {
video_cross_pe_vec = build_video_temporal_rope_matrix(vx->ne[0],
vx->ne[1],
vx->ne[2],
static_cast<int>(params.audio_cross_attention_dim),
static_cast<int>(params.audio_num_attention_heads),
static_cast<int>(config.audio_cross_attention_dim),
static_cast<int>(config.audio_num_attention_heads),
video_frame_rate,
params.positional_embedding_theta,
config.positional_embedding_theta,
temporal_max_pos,
std::get<0>(params.vae_scale_factors),
params.causal_temporal_positioning,
std::get<0>(config.vae_scale_factors),
config.causal_temporal_positioning,
true);
}
video_cross_pe = ggml_new_tensor_4d(compute_ctx, GGML_TYPE_F32, 2, 2, params.audio_attention_head_dim / 2, video_token_count * params.audio_num_attention_heads);
video_cross_pe = ggml_new_tensor_4d(compute_ctx, GGML_TYPE_F32, 2, 2, config.audio_attention_head_dim / 2, video_token_count * config.audio_num_attention_heads);
ggml_set_name(video_cross_pe, "ltxav_video_cross_pe");
set_backend_tensor_data(video_cross_pe, video_cross_pe_vec.data());
audio_cross_pe_vec = build_audio_rope_matrix(ax->ne[1],
static_cast<int>(params.audio_cross_attention_dim),
static_cast<int>(params.audio_num_attention_heads),
params.positional_embedding_theta,
static_cast<int>(config.audio_cross_attention_dim),
static_cast<int>(config.audio_num_attention_heads),
config.positional_embedding_theta,
temporal_max_pos,
true);
audio_cross_pe = ggml_new_tensor_4d(compute_ctx, GGML_TYPE_F32, 2, 2, params.audio_attention_head_dim / 2, ax->ne[1] * params.audio_num_attention_heads);
audio_cross_pe = ggml_new_tensor_4d(compute_ctx, GGML_TYPE_F32, 2, 2, config.audio_attention_head_dim / 2, ax->ne[1] * config.audio_num_attention_heads);
ggml_set_name(audio_cross_pe, "ltxav_audio_cross_pe");
set_backend_tensor_data(audio_cross_pe, audio_cross_pe_vec.data());
}
bool needs_video_connector_pe =
params.use_connector &&
config.use_connector &&
context != nullptr &&
(context->ne[0] == params.connector_hidden_size ||
((context->ne[0] == params.cross_attention_dim + params.audio_cross_attention_dim ||
context->ne[0] == params.caption_channels * 2) &&
(context->ne[0] == config.connector_hidden_size ||
((context->ne[0] == config.cross_attention_dim + config.audio_cross_attention_dim ||
context->ne[0] == config.caption_channels * 2) &&
context->ne[1] < 1024));
ggml_tensor* video_connector_pe = nullptr;
if (needs_video_connector_pe) {
int64_t seq_len = context->ne[1];
int64_t target_len = std::max<int64_t>(1024, seq_len);
int64_t duplications = (target_len + params.connector_num_registers - 1) / params.connector_num_registers;
int64_t full_len = seq_len + duplications * params.connector_num_registers - seq_len;
connector_pe_vec = build_1d_rope_matrix(full_len, static_cast<int>(params.connector_hidden_size), static_cast<int>(params.connector_num_heads), 10000.f, 4096.f, true);
video_connector_pe = ggml_new_tensor_4d(compute_ctx, GGML_TYPE_F32, 2, 2, params.connector_head_dim / 2, full_len * params.connector_num_heads);
int64_t duplications = (target_len + config.connector_num_registers - 1) / config.connector_num_registers;
int64_t full_len = seq_len + duplications * config.connector_num_registers - seq_len;
connector_pe_vec = build_1d_rope_matrix(full_len, static_cast<int>(config.connector_hidden_size), static_cast<int>(config.connector_num_heads), 10000.f, 4096.f, true);
video_connector_pe = ggml_new_tensor_4d(compute_ctx, GGML_TYPE_F32, 2, 2, config.connector_head_dim / 2, full_len * config.connector_num_heads);
ggml_set_name(video_connector_pe, "ltxav_video_connector_pe");
set_backend_tensor_data(video_connector_pe, connector_pe_vec.data());
}
@ -1864,20 +1890,20 @@ namespace LTXV {
ax->ne[1] > 0;
bool needs_audio_connector_pe =
run_audio_context &&
params.use_audio_connector &&
config.use_audio_connector &&
context != nullptr &&
(context->ne[0] == params.audio_connector_hidden_size ||
((context->ne[0] == params.cross_attention_dim + params.audio_cross_attention_dim ||
context->ne[0] == params.caption_channels * 2) &&
(context->ne[0] == config.audio_connector_hidden_size ||
((context->ne[0] == config.cross_attention_dim + config.audio_cross_attention_dim ||
context->ne[0] == config.caption_channels * 2) &&
context->ne[1] < 1024));
ggml_tensor* audio_connector_pe = nullptr;
if (needs_audio_connector_pe) {
int64_t seq_len = context->ne[1];
int64_t target_len = std::max<int64_t>(1024, seq_len);
int64_t duplications = (target_len + params.audio_connector_num_registers - 1) / params.audio_connector_num_registers;
int64_t full_len = seq_len + duplications * params.audio_connector_num_registers - seq_len;
audio_connector_pe_vec = build_1d_rope_matrix(full_len, static_cast<int>(params.audio_connector_hidden_size), static_cast<int>(params.audio_connector_num_heads), 10000.f, 4096.f, true);
audio_connector_pe = ggml_new_tensor_4d(compute_ctx, GGML_TYPE_F32, 2, 2, params.audio_connector_head_dim / 2, full_len * params.audio_connector_num_heads);
int64_t duplications = (target_len + config.audio_connector_num_registers - 1) / config.audio_connector_num_registers;
int64_t full_len = seq_len + duplications * config.audio_connector_num_registers - seq_len;
audio_connector_pe_vec = build_1d_rope_matrix(full_len, static_cast<int>(config.audio_connector_hidden_size), static_cast<int>(config.audio_connector_num_heads), 10000.f, 4096.f, true);
audio_connector_pe = ggml_new_tensor_4d(compute_ctx, GGML_TYPE_F32, 2, 2, config.audio_connector_head_dim / 2, full_len * config.audio_connector_num_heads);
ggml_set_name(audio_connector_pe, "ltxav_audio_connector_pe");
set_backend_tensor_data(audio_connector_pe, audio_connector_pe_vec.data());
}
@ -2036,4 +2062,4 @@ namespace LTXV {
}; // namespace LTXV
#endif
#endif // __SD_MODEL_DIFFUSION_LTXV_HPP__

View File

@ -1,42 +1,137 @@
#ifndef __MMDIT_HPP__
#define __MMDIT_HPP__
#ifndef __SD_MODEL_DIFFUSION_MMDIT_HPP__
#define __SD_MODEL_DIFFUSION_MMDIT_HPP__
#include <algorithm>
#include <memory>
#include <string>
#include <vector>
#include "diffusion_model.hpp"
#include "ggml_extend.hpp"
#include "core/ggml_extend.hpp"
#include "model.h"
#include "model/common/block.hpp"
#include "model/diffusion/model.hpp"
#define MMDIT_GRAPH_SIZE 10240
struct Mlp : public GGMLBlock {
public:
Mlp(int64_t in_features,
int64_t hidden_features = -1,
int64_t out_features = -1,
bool bias = true) {
// act_layer is always lambda: nn.GELU(approximate="tanh")
// norm_layer is always None
// use_conv is always False
if (hidden_features == -1) {
hidden_features = in_features;
}
if (out_features == -1) {
out_features = in_features;
}
blocks["fc1"] = std::shared_ptr<GGMLBlock>(new Linear(in_features, hidden_features, bias));
blocks["fc2"] = std::shared_ptr<GGMLBlock>(new Linear(hidden_features, out_features, bias));
}
struct MMDiTConfig {
int64_t input_size = -1;
int patch_size = 2;
int64_t in_channels = 16;
int64_t d_self = -1; // >=0 for MMdiT-X
int64_t depth = 24;
float mlp_ratio = 4.0f;
int64_t adm_in_channels = 2048;
int64_t out_channels = 16;
int64_t pos_embed_max_size = 192;
int64_t num_patches = 36864; // 192 * 192
int64_t context_size = 4096;
int64_t context_embedder_out_dim = 1536;
int64_t hidden_size = 1536;
std::string qk_norm;
ggml_tensor* forward(GGMLRunnerContext* ctx, ggml_tensor* x) {
// x: [N, n_token, in_features]
auto fc1 = std::dynamic_pointer_cast<Linear>(blocks["fc1"]);
auto fc2 = std::dynamic_pointer_cast<Linear>(blocks["fc2"]);
static MMDiTConfig detect_from_weights(const String2TensorStorage& tensor_storage_map, const std::string& prefix) {
MMDiTConfig config;
bool has_weight_config = false;
bool has_pos_embed = false;
bool has_hidden_size = false;
bool has_context_embed = false;
x = fc1->forward(ctx, x);
x = ggml_ext_gelu(ctx->ggml_ctx, x, true);
x = fc2->forward(ctx, x);
return x;
for (const auto& [name, tensor_storage] : tensor_storage_map) {
if (!starts_with(name, prefix)) {
continue;
}
if (name.find("x_embedder.proj.weight") != std::string::npos && tensor_storage.n_dims == 4) {
has_weight_config = true;
has_hidden_size = true;
config.patch_size = static_cast<int>(tensor_storage.ne[0]);
config.in_channels = tensor_storage.ne[2];
config.hidden_size = tensor_storage.ne[3];
} else if (name.find("t_embedder.mlp.0.weight") != std::string::npos && tensor_storage.n_dims == 2) {
has_weight_config = true;
has_hidden_size = true;
config.hidden_size = tensor_storage.ne[1];
} else if (name.find("y_embedder.mlp.0.weight") != std::string::npos && tensor_storage.n_dims == 2) {
has_weight_config = true;
has_hidden_size = true;
config.adm_in_channels = tensor_storage.ne[0];
config.hidden_size = tensor_storage.ne[1];
} else if (name.find("context_embedder.weight") != std::string::npos && tensor_storage.n_dims == 2) {
has_weight_config = true;
has_context_embed = true;
config.context_size = tensor_storage.ne[0];
config.context_embedder_out_dim = tensor_storage.ne[1];
} else if (name.find("final_layer.linear.weight") != std::string::npos && tensor_storage.n_dims == 2) {
has_weight_config = true;
has_hidden_size = true;
config.hidden_size = tensor_storage.ne[0];
int64_t patch_area = static_cast<int64_t>(config.patch_size) * config.patch_size;
if (patch_area > 0) {
config.out_channels = tensor_storage.ne[1] / patch_area;
}
} else if (name.find("pos_embed") != std::string::npos && tensor_storage.n_dims == 3) {
has_weight_config = true;
has_pos_embed = true;
has_hidden_size = true;
config.hidden_size = tensor_storage.ne[0];
config.num_patches = tensor_storage.ne[1];
for (int64_t size = 1; size * size <= config.num_patches; size++) {
if (size * size == config.num_patches) {
config.pos_embed_max_size = size;
break;
}
}
}
size_t jb = name.find("joint_blocks.");
if (jb == std::string::npos) {
continue;
}
has_weight_config = true;
std::string block_name = name.substr(jb);
int64_t block_depth = atoi(block_name.substr(13, block_name.find(".", 13)).c_str());
if (block_depth + 1 > config.depth) {
config.depth = block_depth + 1;
}
if (block_name.find("attn.ln") != std::string::npos) {
if (block_name.find(".bias") != std::string::npos) {
config.qk_norm = "ln";
} else {
config.qk_norm = "rms";
}
}
if (block_name.find("attn2") != std::string::npos) {
if (block_depth > config.d_self) {
config.d_self = block_depth;
}
}
}
if (!has_pos_embed && config.d_self >= 0) {
config.pos_embed_max_size *= 2;
config.num_patches *= 4;
}
if (!has_hidden_size || config.hidden_size <= 0) {
config.hidden_size = 64 * config.depth;
}
if (!has_context_embed || config.context_embedder_out_dim <= 0) {
config.context_embedder_out_dim = config.hidden_size;
}
if (has_weight_config) {
LOG_DEBUG("mmdit: num_layers = %" PRId64 ", num_mmdit_x_layers = %" PRId64 ", hidden_size = %" PRId64 ", patch_size = %d, in_channels = %" PRId64 ", out_channels = %" PRId64 ", context_size = %" PRId64 ", adm_in_channels = %" PRId64 ", qk_norm = %s",
config.depth,
config.d_self + 1,
config.hidden_size,
config.patch_size,
config.in_channels,
config.out_channels,
config.context_size,
config.adm_in_channels,
config.qk_norm.empty() ? "none" : config.qk_norm.c_str());
}
return config;
}
};
@ -612,28 +707,16 @@ public:
struct MMDiT : public GGMLBlock {
// Diffusion model with a Transformer backbone.
protected:
int64_t input_size = -1;
int patch_size = 2;
int64_t in_channels = 16;
int64_t d_self = -1; // >=0 for MMdiT-X
int64_t depth = 24;
float mlp_ratio = 4.0f;
int64_t adm_in_channels = 2048;
int64_t out_channels = 16;
int64_t pos_embed_max_size = 192;
int64_t num_patchs = 36864; // 192 * 192
int64_t context_size = 4096;
int64_t context_embedder_out_dim = 1536;
int64_t hidden_size;
std::string qk_norm;
void init_params(ggml_context* ctx, const String2TensorStorage& tensor_storage_map = {}, std::string prefix = "") override {
enum ggml_type wtype = GGML_TYPE_F32;
params["pos_embed"] = ggml_new_tensor_3d(ctx, wtype, hidden_size, num_patchs, 1);
params["pos_embed"] = ggml_new_tensor_3d(ctx, wtype, config.hidden_size, config.num_patches, 1);
}
public:
MMDiT(const String2TensorStorage& tensor_storage_map = {}) {
MMDiTConfig config;
explicit MMDiT(MMDiTConfig config = {})
: config(config) {
// input_size is always None
// learn_sigma is always False
// register_length is alwalys 0
@ -646,64 +729,30 @@ public:
// pos_embed_offset is not used
// context_embedder_config is always {'target': 'torch.nn.Linear', 'params': {'in_features': 4096, 'out_features': 1536}}
for (auto pair : tensor_storage_map) {
std::string tensor_name = pair.first;
if (tensor_name.find("model.diffusion_model.") == std::string::npos)
continue;
size_t jb = tensor_name.find("joint_blocks.");
if (jb != std::string::npos) {
tensor_name = tensor_name.substr(jb); // remove prefix
int block_depth = atoi(tensor_name.substr(13, tensor_name.find(".", 13)).c_str());
if (block_depth + 1 > depth) {
depth = block_depth + 1;
}
if (tensor_name.find("attn.ln") != std::string::npos) {
if (tensor_name.find(".bias") != std::string::npos) {
qk_norm = "ln";
} else {
qk_norm = "rms";
}
}
if (tensor_name.find("attn2") != std::string::npos) {
if (block_depth > d_self) {
d_self = block_depth;
}
}
}
blocks["x_embedder"] = std::shared_ptr<GGMLBlock>(new PatchEmbed(config.input_size,
config.patch_size,
config.in_channels,
config.hidden_size,
true));
blocks["t_embedder"] = std::shared_ptr<GGMLBlock>(new TimestepEmbedder(config.hidden_size));
if (config.adm_in_channels != -1) {
blocks["y_embedder"] = std::shared_ptr<GGMLBlock>(new VectorEmbedder(config.adm_in_channels, config.hidden_size));
}
if (d_self >= 0) {
pos_embed_max_size *= 2;
num_patchs *= 4;
}
blocks["context_embedder"] = std::shared_ptr<GGMLBlock>(new Linear(config.context_size, config.context_embedder_out_dim, true, true));
LOG_INFO("MMDiT layers: %d (including %d MMDiT-x layers)", depth, d_self + 1);
int64_t default_out_channels = in_channels;
hidden_size = 64 * depth;
context_embedder_out_dim = 64 * depth;
int64_t num_heads = depth;
blocks["x_embedder"] = std::shared_ptr<GGMLBlock>(new PatchEmbed(input_size, patch_size, in_channels, hidden_size, true));
blocks["t_embedder"] = std::shared_ptr<GGMLBlock>(new TimestepEmbedder(hidden_size));
if (adm_in_channels != -1) {
blocks["y_embedder"] = std::shared_ptr<GGMLBlock>(new VectorEmbedder(adm_in_channels, hidden_size));
}
blocks["context_embedder"] = std::shared_ptr<GGMLBlock>(new Linear(4096, context_embedder_out_dim, true, true));
for (int i = 0; i < depth; i++) {
blocks["joint_blocks." + std::to_string(i)] = std::shared_ptr<GGMLBlock>(new JointBlock(hidden_size,
num_heads,
mlp_ratio,
qk_norm,
for (int i = 0; i < config.depth; i++) {
blocks["joint_blocks." + std::to_string(i)] = std::shared_ptr<GGMLBlock>(new JointBlock(config.hidden_size,
config.depth,
config.mlp_ratio,
config.qk_norm,
true,
i == depth - 1,
i <= d_self));
i == config.depth - 1,
i <= config.d_self));
}
blocks["final_layer"] = std::shared_ptr<GGMLBlock>(new FinalLayer(hidden_size, patch_size, out_channels));
blocks["final_layer"] = std::shared_ptr<GGMLBlock>(new FinalLayer(config.hidden_size, config.patch_size, config.out_channels));
}
ggml_tensor*
@ -712,22 +761,22 @@ public:
int64_t w) {
auto pos_embed = params["pos_embed"];
h = (h + 1) / patch_size;
w = (w + 1) / patch_size;
h = (h + 1) / config.patch_size;
w = (w + 1) / config.patch_size;
GGML_ASSERT(h <= pos_embed_max_size && h > 0);
GGML_ASSERT(w <= pos_embed_max_size && w > 0);
GGML_ASSERT(h <= config.pos_embed_max_size && h > 0);
GGML_ASSERT(w <= config.pos_embed_max_size && w > 0);
int64_t top = (pos_embed_max_size - h) / 2;
int64_t left = (pos_embed_max_size - w) / 2;
int64_t top = (config.pos_embed_max_size - h) / 2;
int64_t left = (config.pos_embed_max_size - w) / 2;
auto spatial_pos_embed = ggml_reshape_3d(ctx, pos_embed, hidden_size, pos_embed_max_size, pos_embed_max_size);
auto spatial_pos_embed = ggml_reshape_3d(ctx, pos_embed, config.hidden_size, config.pos_embed_max_size, config.pos_embed_max_size);
// spatial_pos_embed = spatial_pos_embed[:, top : top + h, left : left + w, :]
spatial_pos_embed = ggml_view_3d(ctx,
spatial_pos_embed,
hidden_size,
pos_embed_max_size,
config.hidden_size,
config.pos_embed_max_size,
h,
spatial_pos_embed->nb[1],
spatial_pos_embed->nb[2],
@ -735,14 +784,14 @@ public:
spatial_pos_embed = ggml_cont(ctx, ggml_permute(ctx, spatial_pos_embed, 0, 2, 1, 3)); // [pos_embed_max_size, h, hidden_size]
spatial_pos_embed = ggml_view_3d(ctx,
spatial_pos_embed,
hidden_size,
config.hidden_size,
h,
w,
spatial_pos_embed->nb[1],
spatial_pos_embed->nb[2],
spatial_pos_embed->nb[2] * left); // [w, h, hidden_size]
spatial_pos_embed = ggml_cont(ctx, ggml_permute(ctx, spatial_pos_embed, 0, 2, 1, 3)); // [h, w, hidden_size]
spatial_pos_embed = ggml_reshape_3d(ctx, spatial_pos_embed, hidden_size, h * w, 1); // [1, h*w, hidden_size]
spatial_pos_embed->nb[2] * left); // [w, h, hidden_size]
spatial_pos_embed = ggml_cont(ctx, ggml_permute(ctx, spatial_pos_embed, 0, 2, 1, 3)); // [h, w, hidden_size]
spatial_pos_embed = ggml_reshape_3d(ctx, spatial_pos_embed, config.hidden_size, h * w, 1); // [1, h*w, hidden_size]
return spatial_pos_embed;
}
@ -757,7 +806,7 @@ public:
// return: [N, N*W, patch_size * patch_size * out_channels]
auto final_layer = std::dynamic_pointer_cast<FinalLayer>(blocks["final_layer"]);
for (int i = 0; i < depth; i++) {
for (int i = 0; i < config.depth; i++) {
// skip iteration if i is in skip_layers
if (skip_layers.size() > 0 && std::find(skip_layers.begin(), skip_layers.end(), i) != skip_layers.end()) {
continue;
@ -800,7 +849,7 @@ public:
x = ggml_add(ctx->ggml_ctx, patch_embed, pos_embed); // [N, H*W, hidden_size]
auto c = t_embedder->forward(ctx, t); // [N, hidden_size]
if (y != nullptr && adm_in_channels != -1) {
if (y != nullptr && config.adm_in_channels != -1) {
auto y_embedder = std::dynamic_pointer_cast<VectorEmbedder>(blocks["y_embedder"]);
y = y_embedder->forward(ctx, y); // [N, hidden_size]
@ -820,19 +869,22 @@ public:
x = forward_core_with_concat(ctx, x, c, context, skip_layers); // (N, H*W, patch_size ** 2 * out_channels)
x = DiT::unpatchify_and_crop(ctx->ggml_ctx, x, H, W, patch_size, patch_size, /*patch_last*/ false); // [N, C, H, W]
x = DiT::unpatchify_and_crop(ctx->ggml_ctx, x, H, W, config.patch_size, config.patch_size, /*patch_last*/ false); // [N, C, H, W]
return x;
}
};
struct MMDiTRunner : public DiffusionModelRunner {
MMDiTConfig config;
MMDiT mmdit;
MMDiTRunner(ggml_backend_t backend,
ggml_backend_t params_backend,
const String2TensorStorage& tensor_storage_map = {},
const std::string prefix = "")
: DiffusionModelRunner(backend, params_backend, prefix), mmdit(tensor_storage_map) {
: DiffusionModelRunner(backend, params_backend, prefix),
config(MMDiTConfig::detect_from_weights(tensor_storage_map, prefix)),
mmdit(config) {
mmdit.init(params_ctx, tensor_storage_map, prefix);
}
@ -980,4 +1032,4 @@ struct MMDiTRunner : public DiffusionModelRunner {
}
};
#endif
#endif // __SD_MODEL_DIFFUSION_MMDIT_HPP__

View File

@ -1,12 +1,12 @@
#ifndef __DIFFUSION_MODEL_H__
#define __DIFFUSION_MODEL_H__
#ifndef __SD_MODEL_DIFFUSION_MODEL_HPP__
#define __SD_MODEL_DIFFUSION_MODEL_HPP__
#include <string>
#include <utility>
#include <variant>
#include "ggml_extend.hpp"
#include "tensor_ggml.hpp"
#include "core/ggml_extend.hpp"
#include "core/tensor_ggml.hpp"
struct UNetDiffusionExtra {
int num_video_frames = -1;
@ -104,4 +104,4 @@ public:
const std::string& prefix) = 0;
};
#endif
#endif // __SD_MODEL_DIFFUSION_MODEL_HPP__

View File

@ -1,5 +1,5 @@
#ifndef __SD_PID_HPP__
#define __SD_PID_HPP__
#ifndef __SD_MODEL_DIFFUSION_PID_HPP__
#define __SD_MODEL_DIFFUSION_PID_HPP__
#include <cmath>
#include <cstdlib>
@ -7,16 +7,16 @@
#include <string>
#include <vector>
#include "common_dit.hpp"
#include "ggml_extend.hpp"
#include "mmdit.hpp"
#include "rope.hpp"
#include "core/ggml_extend.hpp"
#include "model/common/rope.hpp"
#include "model/diffusion/dit.hpp"
#include "model/diffusion/mmdit.hpp"
namespace Pid {
constexpr int PID_GRAPH_SIZE = 196608;
constexpr float PID_PI = 3.14159265358979323846f;
struct PixelDiTParams {
struct PixelDiTConfig {
int64_t in_channels = 3;
int64_t hidden_size = 1536;
int64_t num_groups = 24;
@ -38,6 +38,45 @@ namespace Pid {
int64_t lq_latent_down_factor = 8;
int64_t rope_ref_grid_h = 64;
int64_t rope_ref_grid_w = 64;
static PixelDiTConfig detect_from_weights(const String2TensorStorage& tensor_storage_map, const std::string& prefix) {
PixelDiTConfig config;
for (const auto& [name, tensor_storage] : tensor_storage_map) {
if (!starts_with(name, prefix)) {
continue;
}
size_t pos = name.find("patch_blocks.");
if (pos != std::string::npos) {
auto items = split_string(name.substr(pos), '.');
if (items.size() > 1) {
int block_index = atoi(items[1].c_str());
config.patch_depth = std::max<int64_t>(config.patch_depth, block_index + 1);
}
}
pos = name.find("pixel_blocks.");
if (pos != std::string::npos) {
auto items = split_string(name.substr(pos), '.');
if (items.size() > 1) {
int block_index = atoi(items[1].c_str());
config.pixel_depth = std::max<int64_t>(config.pixel_depth, block_index + 1);
}
}
if (name.find("lq_proj.latent_proj.0.weight") != std::string::npos) {
config.lq_latent_channels = tensor_storage.ne[2];
config.lq_latent_down_factor = config.lq_latent_channels >= 64 ? 16 : 8;
}
if (name.find("patch_blocks.0.mlp_x.w1.weight") != std::string::npos) {
config.patch_mlp_hidden_dim = tensor_storage.ne[1];
}
}
LOG_DEBUG("pid: patch_depth = %" PRId64 ", pixel_depth = %" PRId64 ", patch_mlp_hidden_dim = %" PRId64 ", lq_latent_channels = %" PRId64 ", lq_latent_down_factor = %" PRId64,
config.patch_depth,
config.pixel_depth,
config.patch_mlp_hidden_dim,
config.lq_latent_channels,
config.lq_latent_down_factor);
return config;
}
};
inline std::vector<float> make_rope_1d(int length,
@ -466,29 +505,29 @@ namespace Pid {
};
struct LQProjection2D : public GGMLBlock {
PixelDiTParams params_cfg;
PixelDiTConfig config;
LQProjection2D(const PixelDiTParams& params_cfg)
: params_cfg(params_cfg) {
blocks["latent_proj.0"] = std::make_shared<Conv2d>(params_cfg.lq_latent_channels, params_cfg.lq_hidden_dim, std::pair<int, int>{3, 3}, std::pair<int, int>{1, 1}, std::pair<int, int>{1, 1});
blocks["latent_proj.2"] = std::make_shared<Conv2d>(params_cfg.lq_hidden_dim, params_cfg.lq_hidden_dim, std::pair<int, int>{3, 3}, std::pair<int, int>{1, 1}, std::pair<int, int>{1, 1});
for (int i = 0; i < params_cfg.lq_num_res_blocks; ++i) {
blocks["latent_proj." + std::to_string(3 + i)] = std::make_shared<PiDResBlock>(params_cfg.lq_hidden_dim);
LQProjection2D(const PixelDiTConfig& config)
: config(config) {
blocks["latent_proj.0"] = std::make_shared<Conv2d>(config.lq_latent_channels, config.lq_hidden_dim, std::pair<int, int>{3, 3}, std::pair<int, int>{1, 1}, std::pair<int, int>{1, 1});
blocks["latent_proj.2"] = std::make_shared<Conv2d>(config.lq_hidden_dim, config.lq_hidden_dim, std::pair<int, int>{3, 3}, std::pair<int, int>{1, 1}, std::pair<int, int>{1, 1});
for (int i = 0; i < config.lq_num_res_blocks; ++i) {
blocks["latent_proj." + std::to_string(3 + i)] = std::make_shared<PiDResBlock>(config.lq_hidden_dim);
}
int num_outputs = static_cast<int>((params_cfg.patch_depth + params_cfg.lq_interval - 1) / params_cfg.lq_interval);
int num_outputs = static_cast<int>((config.patch_depth + config.lq_interval - 1) / config.lq_interval);
for (int i = 0; i < num_outputs; ++i) {
blocks["output_heads." + std::to_string(i)] = std::make_shared<Linear>(params_cfg.lq_hidden_dim, params_cfg.hidden_size, true);
blocks["gate_modules." + std::to_string(i)] = std::make_shared<SigmaAwareGate>(params_cfg.hidden_size);
blocks["output_heads." + std::to_string(i)] = std::make_shared<Linear>(config.lq_hidden_dim, config.hidden_size, true);
blocks["gate_modules." + std::to_string(i)] = std::make_shared<SigmaAwareGate>(config.hidden_size);
}
}
bool is_gate_active(int block_idx) const {
return block_idx % params_cfg.lq_interval == 0;
return block_idx % config.lq_interval == 0;
}
int get_output_index(int block_idx) const {
return block_idx / static_cast<int>(params_cfg.lq_interval);
return block_idx / static_cast<int>(config.lq_interval);
}
ggml_tensor* gate(GGMLRunnerContext* ctx,
@ -506,8 +545,8 @@ namespace Pid {
int64_t target_pW) {
auto conv0 = std::dynamic_pointer_cast<Conv2d>(blocks["latent_proj.0"]);
auto conv2 = std::dynamic_pointer_cast<Conv2d>(blocks["latent_proj.2"]);
float z_to_patch_ratio = static_cast<float>(params_cfg.lq_sr_scale * params_cfg.lq_latent_down_factor) /
static_cast<float>(params_cfg.patch_size);
float z_to_patch_ratio = static_cast<float>(config.lq_sr_scale * config.lq_latent_down_factor) /
static_cast<float>(config.patch_size);
GGML_ASSERT(z_to_patch_ratio >= 1.0f);
if (lq_latent->ne[0] != target_pW || lq_latent->ne[1] != target_pH) {
lq_latent = ggml_interpolate(ctx->ggml_ctx,
@ -522,7 +561,7 @@ namespace Pid {
auto feat = conv0->forward(ctx, lq_latent);
feat = ggml_silu_inplace(ctx->ggml_ctx, feat);
feat = conv2->forward(ctx, feat);
for (int i = 0; i < params_cfg.lq_num_res_blocks; ++i) {
for (int i = 0; i < config.lq_num_res_blocks; ++i) {
auto block = std::dynamic_pointer_cast<PiDResBlock>(blocks["latent_proj." + std::to_string(3 + i)]);
feat = block->forward(ctx, feat);
}
@ -533,7 +572,7 @@ namespace Pid {
auto tokens = ggml_cont(ctx->ggml_ctx, ggml_ext_torch_permute(ctx->ggml_ctx, feat, 2, 0, 1, 3));
tokens = ggml_reshape_3d(ctx->ggml_ctx, tokens, C, L, B);
int num_outputs = static_cast<int>((params_cfg.patch_depth + params_cfg.lq_interval - 1) / params_cfg.lq_interval);
int num_outputs = static_cast<int>((config.patch_depth + config.lq_interval - 1) / config.lq_interval);
std::vector<ggml_tensor*> outputs;
outputs.reserve(num_outputs);
for (int i = 0; i < num_outputs; ++i) {
@ -545,34 +584,34 @@ namespace Pid {
};
struct PixelDiT : public GGMLBlock {
PixelDiTParams params_cfg;
PixelDiTConfig config;
PixelDiT() = default;
PixelDiT(const PixelDiTParams& params_cfg)
: params_cfg(params_cfg) {
blocks["pixel_embedder"] = std::make_shared<PixelTokenEmbedder>(params_cfg.in_channels, params_cfg.pixel_hidden_size);
blocks["s_embedder"] = std::make_shared<PatchTokenEmbedder>(params_cfg.in_channels * params_cfg.patch_size * params_cfg.patch_size, params_cfg.hidden_size, false, true);
blocks["t_embedder"] = std::make_shared<PixelDiTTimestepEmbedder>(params_cfg.hidden_size);
blocks["y_embedder"] = std::make_shared<PatchTokenEmbedder>(params_cfg.txt_embed_dim, params_cfg.hidden_size, true, true);
for (int i = 0; i < params_cfg.patch_depth; ++i) {
blocks["patch_blocks." + std::to_string(i)] = std::make_shared<MMDiTBlockT2I>(params_cfg.hidden_size, params_cfg.num_groups, params_cfg.patch_mlp_hidden_dim);
PixelDiT(const PixelDiTConfig& config)
: config(config) {
blocks["pixel_embedder"] = std::make_shared<PixelTokenEmbedder>(config.in_channels, config.pixel_hidden_size);
blocks["s_embedder"] = std::make_shared<PatchTokenEmbedder>(config.in_channels * config.patch_size * config.patch_size, config.hidden_size, false, true);
blocks["t_embedder"] = std::make_shared<PixelDiTTimestepEmbedder>(config.hidden_size);
blocks["y_embedder"] = std::make_shared<PatchTokenEmbedder>(config.txt_embed_dim, config.hidden_size, true, true);
for (int i = 0; i < config.patch_depth; ++i) {
blocks["patch_blocks." + std::to_string(i)] = std::make_shared<MMDiTBlockT2I>(config.hidden_size, config.num_groups, config.patch_mlp_hidden_dim);
}
for (int i = 0; i < params_cfg.pixel_depth; ++i) {
blocks["pixel_blocks." + std::to_string(i)] = std::make_shared<PiTBlock>(params_cfg.pixel_hidden_size,
params_cfg.hidden_size,
params_cfg.patch_size,
params_cfg.pixel_attn_hidden_size,
params_cfg.pixel_num_groups);
for (int i = 0; i < config.pixel_depth; ++i) {
blocks["pixel_blocks." + std::to_string(i)] = std::make_shared<PiTBlock>(config.pixel_hidden_size,
config.hidden_size,
config.patch_size,
config.pixel_attn_hidden_size,
config.pixel_num_groups);
}
blocks["final_layer"] = std::make_shared<FinalLayer>(params_cfg.pixel_hidden_size, params_cfg.in_channels);
blocks["lq_proj"] = std::make_shared<LQProjection2D>(params_cfg);
blocks["final_layer"] = std::make_shared<FinalLayer>(config.pixel_hidden_size, config.in_channels);
blocks["lq_proj"] = std::make_shared<LQProjection2D>(config);
}
void init_params(ggml_context* ctx,
const String2TensorStorage& tensor_storage_map = {},
std::string prefix = "") override {
params["y_pos_embedding"] = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, params_cfg.hidden_size, params_cfg.txt_max_length, 1);
params["y_pos_embedding"] = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, config.hidden_size, config.txt_max_length, 1);
}
ggml_tensor* forward(GGMLRunnerContext* ctx,
@ -594,21 +633,21 @@ namespace Pid {
int64_t W_orig = x->ne[0];
int64_t H_orig = x->ne[1];
x = DiT::pad_to_patch_size(ctx, x, static_cast<int>(params_cfg.patch_size), static_cast<int>(params_cfg.patch_size));
x = DiT::pad_to_patch_size(ctx, x, static_cast<int>(config.patch_size), static_cast<int>(config.patch_size));
int64_t W = x->ne[0];
int64_t H = x->ne[1];
int64_t B = x->ne[3];
int64_t Hs = H / params_cfg.patch_size;
int64_t Ws = W / params_cfg.patch_size;
int64_t Hs = H / config.patch_size;
int64_t Ws = W / config.patch_size;
int64_t L = Hs * Ws;
int64_t P2 = params_cfg.patch_size * params_cfg.patch_size;
int64_t P2 = config.patch_size * config.patch_size;
auto x_patches = DiT::patchify(ctx->ggml_ctx, x, static_cast<int>(params_cfg.patch_size), static_cast<int>(params_cfg.patch_size), true);
auto x_patches = DiT::patchify(ctx->ggml_ctx, x, static_cast<int>(config.patch_size), static_cast<int>(config.patch_size), true);
auto t_emb = t_embedder->forward(ctx, timesteps);
auto condition = ggml_silu(ctx->ggml_ctx, t_emb);
GGML_ASSERT(context != nullptr);
int64_t Ltxt = std::min<int64_t>(context->ne[1], params_cfg.txt_max_length);
int64_t Ltxt = std::min<int64_t>(context->ne[1], config.txt_max_length);
auto y = ggml_ext_slice(ctx->ggml_ctx, context, 1, 0, Ltxt);
auto y_emb = y_embedder->forward(ctx, y);
auto y_pos = ggml_ext_slice(ctx->ggml_ctx, params["y_pos_embedding"], 1, 0, Ltxt);
@ -618,7 +657,7 @@ namespace Pid {
auto s = s_embedder->forward(ctx, x_patches);
for (int i = 0; i < params_cfg.patch_depth; ++i) {
for (int i = 0; i < config.patch_depth; ++i) {
if (lq_proj->is_gate_active(i)) {
int out_idx = lq_proj->get_output_index(i);
if (out_idx < static_cast<int>(lq_features.size())) {
@ -639,22 +678,22 @@ namespace Pid {
}
s = ggml_silu(ctx->ggml_ctx, ggml_add(ctx->ggml_ctx, s, t_emb));
auto s_cond = ggml_reshape_2d(ctx->ggml_ctx, s, params_cfg.hidden_size, L * B);
auto pixels = pixel_embedder->forward(ctx, x, params_cfg.patch_size, pixel_pos_full);
for (int i = 0; i < params_cfg.pixel_depth; ++i) {
auto s_cond = ggml_reshape_2d(ctx->ggml_ctx, s, config.hidden_size, L * B);
auto pixels = pixel_embedder->forward(ctx, x, config.patch_size, pixel_pos_full);
for (int i = 0; i < config.pixel_depth; ++i) {
auto block = std::dynamic_pointer_cast<PiTBlock>(blocks["pixel_blocks." + std::to_string(i)]);
pixels = block->forward(ctx, pixels, s_cond, H, W, pixel_pos_comp);
sd::ggml_graph_cut::mark_graph_cut(pixels, "pid.pixel_blocks." + std::to_string(i), "pixels");
}
pixels = final_layer->forward(ctx, pixels);
pixels = ggml_reshape_3d(ctx->ggml_ctx, pixels, params_cfg.in_channels * P2, L, B);
pixels = ggml_reshape_3d(ctx->ggml_ctx, pixels, config.in_channels * P2, L, B);
auto out = DiT::unpatchify(ctx->ggml_ctx,
pixels,
Hs,
Ws,
static_cast<int>(params_cfg.patch_size),
static_cast<int>(params_cfg.patch_size),
static_cast<int>(config.patch_size),
static_cast<int>(config.patch_size),
false);
out = ggml_ext_slice(ctx->ggml_ctx, out, 1, 0, H_orig);
out = ggml_ext_slice(ctx->ggml_ctx, out, 0, 0, W_orig);
@ -663,7 +702,7 @@ namespace Pid {
};
struct PiDRunner : public DiffusionModelRunner {
PixelDiTParams params_cfg;
PixelDiTConfig config;
PixelDiT model;
std::vector<float> pos_img_vec;
std::vector<float> pos_txt_vec;
@ -674,43 +713,9 @@ namespace Pid {
ggml_backend_t params_backend,
const String2TensorStorage& tensor_storage_map,
const std::string prefix = "model.diffusion_model")
: DiffusionModelRunner(backend, params_backend, prefix) {
for (const auto& pair : tensor_storage_map) {
const std::string& tensor_name = pair.first;
if (tensor_name.find(prefix) == std::string::npos) {
continue;
}
size_t pos = tensor_name.find("patch_blocks.");
if (pos != std::string::npos) {
auto items = split_string(tensor_name.substr(pos), '.');
if (items.size() > 1) {
int block_index = atoi(items[1].c_str());
params_cfg.patch_depth = std::max<int64_t>(params_cfg.patch_depth, block_index + 1);
}
}
pos = tensor_name.find("pixel_blocks.");
if (pos != std::string::npos) {
auto items = split_string(tensor_name.substr(pos), '.');
if (items.size() > 1) {
int block_index = atoi(items[1].c_str());
params_cfg.pixel_depth = std::max<int64_t>(params_cfg.pixel_depth, block_index + 1);
}
}
if (tensor_name.find("lq_proj.latent_proj.0.weight") != std::string::npos) {
params_cfg.lq_latent_channels = pair.second.ne[2];
params_cfg.lq_latent_down_factor = params_cfg.lq_latent_channels >= 64 ? 16 : 8;
}
if (tensor_name.find("patch_blocks.0.mlp_x.w1.weight") != std::string::npos) {
params_cfg.patch_mlp_hidden_dim = pair.second.ne[1];
}
}
LOG_INFO("PiD params: patch_depth=%" PRId64 ", pixel_depth=%" PRId64 ", patch_mlp_hidden_dim=%" PRId64 ", lq_latent_channels=%" PRId64 ", lq_latent_down_factor=%" PRId64,
params_cfg.patch_depth,
params_cfg.pixel_depth,
params_cfg.patch_mlp_hidden_dim,
params_cfg.lq_latent_channels,
params_cfg.lq_latent_down_factor);
model = PixelDiT(params_cfg);
: DiffusionModelRunner(backend, params_backend, prefix),
config(PixelDiTConfig::detect_from_weights(tensor_storage_map, prefix)) {
model = PixelDiT(config);
model.init(params_ctx, tensor_storage_map, prefix);
}
@ -737,60 +742,60 @@ namespace Pid {
int64_t W = x->ne[0];
int64_t H = x->ne[1];
int64_t B = x->ne[3];
int64_t Wp = align_up(static_cast<int>(W), static_cast<int>(params_cfg.patch_size));
int64_t Hp = align_up(static_cast<int>(H), static_cast<int>(params_cfg.patch_size));
int64_t Hs = Hp / params_cfg.patch_size;
int64_t Ws = Wp / params_cfg.patch_size;
int64_t Wp = align_up(static_cast<int>(W), static_cast<int>(config.patch_size));
int64_t Hp = align_up(static_cast<int>(H), static_cast<int>(config.patch_size));
int64_t Hs = Hp / config.patch_size;
int64_t Ws = Wp / config.patch_size;
pos_img_vec = make_rope_2d(static_cast<int>(Hs),
static_cast<int>(Ws),
static_cast<int>(params_cfg.hidden_size / params_cfg.num_groups),
static_cast<int>(config.hidden_size / config.num_groups),
10000.f,
16.f,
static_cast<int>(params_cfg.rope_ref_grid_h),
static_cast<int>(params_cfg.rope_ref_grid_w));
static_cast<int>(config.rope_ref_grid_h),
static_cast<int>(config.rope_ref_grid_w));
auto pos_img = ggml_new_tensor_4d(compute_ctx,
GGML_TYPE_F32,
2,
2,
params_cfg.hidden_size / params_cfg.num_groups / 2,
config.hidden_size / config.num_groups / 2,
Hs * Ws);
set_backend_tensor_data(pos_img, pos_img_vec.data());
int64_t Ltxt = std::min<int64_t>(context->ne[1], params_cfg.txt_max_length);
int64_t Ltxt = std::min<int64_t>(context->ne[1], config.txt_max_length);
pos_txt_vec = make_rope_1d(static_cast<int>(Ltxt),
static_cast<int>(params_cfg.hidden_size / params_cfg.num_groups),
params_cfg.text_rope_theta);
static_cast<int>(config.hidden_size / config.num_groups),
config.text_rope_theta);
auto pos_txt = ggml_new_tensor_4d(compute_ctx,
GGML_TYPE_F32,
2,
2,
params_cfg.hidden_size / params_cfg.num_groups / 2,
config.hidden_size / config.num_groups / 2,
Ltxt);
set_backend_tensor_data(pos_txt, pos_txt_vec.data());
pixel_pos_vec = make_pixel_abs_pos(static_cast<int>(Hp),
static_cast<int>(Wp),
static_cast<int>(params_cfg.pixel_hidden_size));
static_cast<int>(config.pixel_hidden_size));
auto pixel_pos = ggml_new_tensor_3d(compute_ctx,
GGML_TYPE_F32,
params_cfg.pixel_hidden_size,
config.pixel_hidden_size,
Wp * Hp,
1);
set_backend_tensor_data(pixel_pos, pixel_pos_vec.data());
pixel_pos_comp_vec = make_rope_2d(static_cast<int>(Hs),
static_cast<int>(Ws),
static_cast<int>(params_cfg.pixel_attn_hidden_size / params_cfg.pixel_num_groups),
static_cast<int>(config.pixel_attn_hidden_size / config.pixel_num_groups),
10000.f,
16.f,
static_cast<int>(params_cfg.rope_ref_grid_h),
static_cast<int>(params_cfg.rope_ref_grid_w));
static_cast<int>(config.rope_ref_grid_h),
static_cast<int>(config.rope_ref_grid_w));
auto pixel_pos_comp = ggml_new_tensor_4d(compute_ctx,
GGML_TYPE_F32,
2,
2,
params_cfg.pixel_attn_hidden_size / params_cfg.pixel_num_groups / 2,
config.pixel_attn_hidden_size / config.pixel_num_groups / 2,
Hs * Ws);
set_backend_tensor_data(pixel_pos_comp, pixel_pos_comp_vec.data());
@ -839,4 +844,4 @@ namespace Pid {
};
} // namespace Pid
#endif // __SD_PID_HPP__
#endif // __SD_MODEL_DIFFUSION_PID_HPP__

View File

@ -1,15 +1,57 @@
#ifndef __QWEN_IMAGE_HPP__
#define __QWEN_IMAGE_HPP__
#ifndef __SD_MODEL_DIFFUSION_QWEN_IMAGE_HPP__
#define __SD_MODEL_DIFFUSION_QWEN_IMAGE_HPP__
#include <memory>
#include "common_block.hpp"
#include "diffusion_model.hpp"
#include "flux.hpp"
#include "model/common/block.hpp"
#include "model/diffusion/flux.hpp"
#include "model/diffusion/model.hpp"
namespace Qwen {
constexpr int QWEN_IMAGE_GRAPH_SIZE = 20480;
struct QwenImageConfig {
int patch_size = 2;
int64_t in_channels = 64;
int64_t out_channels = 16;
int num_layers = 60;
int64_t attention_head_dim = 128;
int64_t num_attention_heads = 24;
int64_t joint_attention_dim = 3584;
int theta = 10000;
std::vector<int> axes_dim = {16, 56, 56};
int axes_dim_sum = 128;
bool zero_cond_t = false;
static QwenImageConfig detect_from_weights(const String2TensorStorage& tensor_storage_map, const std::string& prefix) {
QwenImageConfig config;
config.num_layers = 0;
for (const auto& [name, _] : tensor_storage_map) {
if (!starts_with(name, prefix)) {
continue;
}
if (name.find("__index_timestep_zero__") != std::string::npos) {
config.zero_cond_t = true;
}
size_t pos = name.find("transformer_blocks.");
if (pos == std::string::npos) {
continue;
}
auto items = split_string(name.substr(pos), '.');
if (items.size() > 1) {
int block_index = atoi(items[1].c_str());
if (block_index + 1 > config.num_layers) {
config.num_layers = block_index + 1;
}
}
}
LOG_DEBUG("qwen_image: num_layers = %d, zero_cond_t = %s",
config.num_layers,
config.zero_cond_t ? "true" : "false");
return config;
}
};
struct TimestepEmbedding : public GGMLBlock {
public:
TimestepEmbedding(int64_t in_channels,
@ -350,46 +392,32 @@ namespace Qwen {
}
};
struct QwenImageParams {
int patch_size = 2;
int64_t in_channels = 64;
int64_t out_channels = 16;
int num_layers = 60;
int64_t attention_head_dim = 128;
int64_t num_attention_heads = 24;
int64_t joint_attention_dim = 3584;
int theta = 10000;
std::vector<int> axes_dim = {16, 56, 56};
int axes_dim_sum = 128;
bool zero_cond_t = false;
};
class QwenImageModel : public GGMLBlock {
protected:
QwenImageParams params;
QwenImageConfig config;
public:
QwenImageModel() {}
QwenImageModel(QwenImageParams params)
: params(params) {
int64_t inner_dim = params.num_attention_heads * params.attention_head_dim;
QwenImageModel(QwenImageConfig config)
: config(config) {
int64_t inner_dim = config.num_attention_heads * config.attention_head_dim;
blocks["time_text_embed"] = std::shared_ptr<GGMLBlock>(new QwenTimestepProjEmbeddings(inner_dim));
blocks["txt_norm"] = std::shared_ptr<GGMLBlock>(new RMSNorm(params.joint_attention_dim, 1e-6f));
blocks["img_in"] = std::shared_ptr<GGMLBlock>(new Linear(params.in_channels, inner_dim));
blocks["txt_in"] = std::shared_ptr<GGMLBlock>(new Linear(params.joint_attention_dim, inner_dim));
blocks["txt_norm"] = std::shared_ptr<GGMLBlock>(new RMSNorm(config.joint_attention_dim, 1e-6f));
blocks["img_in"] = std::shared_ptr<GGMLBlock>(new Linear(config.in_channels, inner_dim));
blocks["txt_in"] = std::shared_ptr<GGMLBlock>(new Linear(config.joint_attention_dim, inner_dim));
// blocks
for (int i = 0; i < params.num_layers; i++) {
for (int i = 0; i < config.num_layers; i++) {
auto block = std::shared_ptr<GGMLBlock>(new QwenImageTransformerBlock(inner_dim,
params.num_attention_heads,
params.attention_head_dim,
config.num_attention_heads,
config.attention_head_dim,
1e-6f,
params.zero_cond_t));
config.zero_cond_t));
blocks["transformer_blocks." + std::to_string(i)] = block;
}
blocks["norm_out"] = std::shared_ptr<GGMLBlock>(new AdaLayerNormContinuous(inner_dim, inner_dim, false, 1e-6f));
blocks["proj_out"] = std::shared_ptr<GGMLBlock>(new Linear(inner_dim, params.patch_size * params.patch_size * params.out_channels));
blocks["proj_out"] = std::shared_ptr<GGMLBlock>(new Linear(inner_dim, config.patch_size * config.patch_size * config.out_channels));
}
ggml_tensor* forward_orig(GGMLRunnerContext* ctx,
@ -406,7 +434,7 @@ namespace Qwen {
auto proj_out = std::dynamic_pointer_cast<Linear>(blocks["proj_out"]);
auto t_emb = time_text_embed->forward(ctx, timestep);
if (params.zero_cond_t) {
if (config.zero_cond_t) {
auto t_emb_0 = time_text_embed->forward(ctx, ggml_ext_zeros_like(ctx->ggml_ctx, timestep));
t_emb = ggml_concat(ctx->ggml_ctx, t_emb, t_emb_0, 1);
}
@ -417,7 +445,7 @@ namespace Qwen {
sd::ggml_graph_cut::mark_graph_cut(txt, "qwen_image.prelude", "txt");
// sd::ggml_graph_cut::mark_graph_cut(t_emb, "qwen_image.prelude", "t_emb");
for (int i = 0; i < params.num_layers; i++) {
for (int i = 0; i < config.num_layers; i++) {
auto block = std::dynamic_pointer_cast<QwenImageTransformerBlock>(blocks["transformer_blocks." + std::to_string(i)]);
auto result = block->forward(ctx, img, txt, t_emb, pe, modulate_index);
@ -427,7 +455,7 @@ namespace Qwen {
sd::ggml_graph_cut::mark_graph_cut(txt, "qwen_image.transformer_blocks." + std::to_string(i), "txt");
}
if (params.zero_cond_t) {
if (config.zero_cond_t) {
t_emb = ggml_ext_chunk(ctx->ggml_ctx, t_emb, 2, 1)[0];
}
@ -456,12 +484,12 @@ namespace Qwen {
int64_t C = x->ne[2];
int64_t N = x->ne[3];
auto img = DiT::pad_and_patchify(ctx, x, params.patch_size, params.patch_size);
auto img = DiT::pad_and_patchify(ctx, x, config.patch_size, config.patch_size);
int64_t img_tokens = img->ne[1];
if (ref_latents.size() > 0) {
for (ggml_tensor* ref : ref_latents) {
ref = DiT::pad_and_patchify(ctx, ref, params.patch_size, params.patch_size);
ref = DiT::pad_and_patchify(ctx, ref, config.patch_size, config.patch_size);
img = ggml_concat(ctx->ggml_ctx, img, ref, 1);
}
}
@ -474,7 +502,7 @@ namespace Qwen {
out = ggml_cont(ctx->ggml_ctx, ggml_permute(ctx->ggml_ctx, out, 0, 2, 1, 3)); // [N, h*w, C * patch_size * patch_size]
}
out = DiT::unpatchify_and_crop(ctx->ggml_ctx, out, H, W, params.patch_size, params.patch_size); // [N, C, H, W]
out = DiT::unpatchify_and_crop(ctx->ggml_ctx, out, H, W, config.patch_size, config.patch_size); // [N, C, H, W]
return out;
}
@ -482,7 +510,7 @@ namespace Qwen {
struct QwenImageRunner : public DiffusionModelRunner {
public:
QwenImageParams qwen_image_params;
QwenImageConfig config;
QwenImageModel qwen_image;
std::vector<float> pe_vec;
std::vector<float> modulate_index_vec;
@ -494,34 +522,10 @@ namespace Qwen {
const std::string prefix = "",
SDVersion version = VERSION_QWEN_IMAGE,
bool zero_cond_t = false)
: DiffusionModelRunner(backend, params_backend, prefix) {
qwen_image_params.num_layers = 0;
qwen_image_params.zero_cond_t = zero_cond_t;
for (auto pair : tensor_storage_map) {
std::string tensor_name = pair.first;
if (tensor_name.find(prefix) == std::string::npos)
continue;
if (tensor_name.find("__index_timestep_zero__") != std::string::npos) {
qwen_image_params.zero_cond_t = true;
}
size_t pos = tensor_name.find("transformer_blocks.");
if (pos != std::string::npos) {
tensor_name = tensor_name.substr(pos); // remove prefix
auto items = split_string(tensor_name, '.');
if (items.size() > 1) {
int block_index = atoi(items[1].c_str());
if (block_index + 1 > qwen_image_params.num_layers) {
qwen_image_params.num_layers = block_index + 1;
}
}
continue;
}
}
LOG_INFO("qwen_image_params.num_layers: %ld", qwen_image_params.num_layers);
if (qwen_image_params.zero_cond_t) {
LOG_INFO("use zero_cond_t");
}
qwen_image = QwenImageModel(qwen_image_params);
: DiffusionModelRunner(backend, params_backend, prefix),
config(QwenImageConfig::detect_from_weights(tensor_storage_map, prefix)) {
config.zero_cond_t = config.zero_cond_t || zero_cond_t;
qwen_image = QwenImageModel(config);
qwen_image.init(params_ctx, tensor_storage_map, prefix);
}
@ -552,36 +556,36 @@ namespace Qwen {
pe_vec = Rope::gen_qwen_image_pe(static_cast<int>(x->ne[1]),
static_cast<int>(x->ne[0]),
qwen_image_params.patch_size,
config.patch_size,
static_cast<int>(x->ne[3]),
static_cast<int>(context->ne[1]),
ref_latents,
increase_ref_index,
qwen_image_params.theta,
config.theta,
circular_y_enabled,
circular_x_enabled,
qwen_image_params.axes_dim);
int pos_len = static_cast<int>(pe_vec.size() / qwen_image_params.axes_dim_sum / 2);
config.axes_dim);
int pos_len = static_cast<int>(pe_vec.size() / config.axes_dim_sum / 2);
// LOG_DEBUG("pos_len %d", pos_len);
auto pe = ggml_new_tensor_4d(compute_ctx, GGML_TYPE_F32, 2, 2, qwen_image_params.axes_dim_sum / 2, pos_len);
auto pe = ggml_new_tensor_4d(compute_ctx, GGML_TYPE_F32, 2, 2, config.axes_dim_sum / 2, pos_len);
// pe->data = pe_vec.data();
// print_ggml_tensor(pe, true, "pe");
// pe->data = nullptr;
set_backend_tensor_data(pe, pe_vec.data());
ggml_tensor* modulate_index = nullptr;
if (qwen_image_params.zero_cond_t) {
if (config.zero_cond_t) {
modulate_index_vec.clear();
int64_t h_len = ((x->ne[1] + (qwen_image_params.patch_size / 2)) / qwen_image_params.patch_size);
int64_t w_len = ((x->ne[0] + (qwen_image_params.patch_size / 2)) / qwen_image_params.patch_size);
int64_t h_len = ((x->ne[1] + (config.patch_size / 2)) / config.patch_size);
int64_t w_len = ((x->ne[0] + (config.patch_size / 2)) / config.patch_size);
int64_t num_img_tokens = h_len * w_len;
modulate_index_vec.insert(modulate_index_vec.end(), num_img_tokens, 0.f);
int64_t num_ref_img_tokens = 0;
for (ggml_tensor* ref : ref_latents) {
int64_t h_len = ((ref->ne[1] + (qwen_image_params.patch_size / 2)) / qwen_image_params.patch_size);
int64_t w_len = ((ref->ne[0] + (qwen_image_params.patch_size / 2)) / qwen_image_params.patch_size);
int64_t h_len = ((ref->ne[1] + (config.patch_size / 2)) / config.patch_size);
int64_t w_len = ((ref->ne[0] + (config.patch_size / 2)) / config.patch_size);
num_ref_img_tokens += h_len * w_len;
}
@ -727,4 +731,4 @@ namespace Qwen {
} // namespace name
#endif // __QWEN_IMAGE_HPP__
#endif // __SD_MODEL_DIFFUSION_QWEN_IMAGE_HPP__

View File

@ -1,14 +1,136 @@
#ifndef __UNET_HPP__
#define __UNET_HPP__
#ifndef __SD_MODEL_DIFFUSION_UNET_HPP__
#define __SD_MODEL_DIFFUSION_UNET_HPP__
#include <algorithm>
#include <vector>
#include "common_block.hpp"
#include "diffusion_model.hpp"
#include "model.h"
#include "model/common/block.hpp"
#include "model/diffusion/model.hpp"
/*==================================================== UnetModel =====================================================*/
#define UNET_GRAPH_SIZE 102400
struct UNetConfig {
SDVersion version = VERSION_SD1;
// network hparams
int in_channels = 4;
int out_channels = 4;
int num_res_blocks = 2;
std::vector<int> attention_resolutions = {4, 2, 1};
std::vector<int> channel_mult = {1, 2, 4, 4};
std::vector<int> transformer_depth = {1, 1, 1, 1};
int time_embed_dim = 1280; // model_channels*4
int num_heads = 8;
int num_head_channels = -1; // channels // num_heads
int context_dim = 768; // 1024 for VERSION_SD2, 2048 for VERSION_SDXL
bool use_linear_projection = false;
bool tiny_unet = false;
int model_channels = 320;
int adm_in_channels = 2816; // only for VERSION_SDXL/SVD
static UNetConfig detect_from_weights(const String2TensorStorage& tensor_storage_map,
const std::string& prefix,
SDVersion version = VERSION_SD1) {
UNetConfig config;
config.version = version;
if (sd_version_is_sd2(version)) {
config.context_dim = 1024;
config.num_head_channels = 64;
config.num_heads = -1;
config.use_linear_projection = true;
} else if (sd_version_is_sdxl(version)) {
config.context_dim = 2048;
config.attention_resolutions = {4, 2};
config.channel_mult = {1, 2, 4};
config.transformer_depth = {1, 2, 10};
config.num_head_channels = 64;
config.num_heads = -1;
config.use_linear_projection = true;
if (version == VERSION_SDXL_VEGA) {
config.transformer_depth = {1, 1, 2};
}
} else if (version == VERSION_SVD) {
config.in_channels = 8;
config.out_channels = 4;
config.context_dim = 1024;
config.adm_in_channels = 768;
config.num_head_channels = 64;
config.num_heads = -1;
config.use_linear_projection = true;
}
if (sd_version_is_inpaint(version)) {
config.in_channels = 9;
} else if (sd_version_is_unet_edit(version)) {
config.in_channels = 8;
}
if (version == VERSION_SD1_TINY_UNET || version == VERSION_SD2_TINY_UNET || version == VERSION_SDXS_512_DS || version == VERSION_SDXS_09) {
config.num_res_blocks = 1;
config.channel_mult = {1, 2, 4};
config.tiny_unet = true;
if (version == VERSION_SDXS_512_DS) {
config.attention_resolutions = {4, 2}; // here just like SDXL
}
}
auto find_weight = [&](const std::string& suffix) -> const TensorStorage* {
std::string name = prefix.empty() ? suffix : prefix + "." + suffix;
auto it = tensor_storage_map.find(name);
if (it == tensor_storage_map.end()) {
return nullptr;
}
return &it->second;
};
if (const TensorStorage* input = find_weight("input_blocks.0.0.weight")) {
if (input->n_dims == 4) {
config.in_channels = static_cast<int>(input->ne[2]);
config.model_channels = static_cast<int>(input->ne[3]);
config.time_embed_dim = config.model_channels * 4;
}
}
if (const TensorStorage* time_embed = find_weight("time_embed.0.weight")) {
if (time_embed->n_dims == 2) {
config.model_channels = static_cast<int>(time_embed->ne[0]);
config.time_embed_dim = static_cast<int>(time_embed->ne[1]);
}
}
if (const TensorStorage* label_emb = find_weight("label_emb.0.0.weight")) {
if (label_emb->n_dims == 2) {
config.adm_in_channels = static_cast<int>(label_emb->ne[0]);
config.time_embed_dim = static_cast<int>(label_emb->ne[1]);
}
}
if (const TensorStorage* out = find_weight("out.2.weight")) {
if (out->n_dims == 4) {
config.out_channels = static_cast<int>(out->ne[3]);
}
}
for (const auto& [name, tensor_storage] : tensor_storage_map) {
if (!starts_with(name, prefix)) {
continue;
}
if (name.find("attn2.to_k.weight") != std::string::npos && tensor_storage.n_dims == 2) {
config.context_dim = static_cast<int>(tensor_storage.ne[0]);
break;
}
}
LOG_DEBUG("unet: in_channels = %d, out_channels = %d, model_channels = %d, time_embed_dim = %d, context_dim = %d, adm_in_channels = %d, num_res_blocks = %d, tiny_unet = %s",
config.in_channels,
config.out_channels,
config.model_channels,
config.time_embed_dim,
config.context_dim,
config.adm_in_channels,
config.num_res_blocks,
config.tiny_unet ? "true" : "false");
return config;
}
};
class SpatialVideoTransformer : public SpatialTransformer {
protected:
int64_t time_depth;
@ -166,66 +288,26 @@ public:
// ldm.modules.diffusionmodules.openaimodel.UNetModel
class UnetModelBlock : public GGMLBlock {
protected:
SDVersion version = VERSION_SD1;
// network hparams
int in_channels = 4;
int out_channels = 4;
int num_res_blocks = 2;
std::vector<int> attention_resolutions = {4, 2, 1};
std::vector<int> channel_mult = {1, 2, 4, 4};
std::vector<int> transformer_depth = {1, 1, 1, 1};
int time_embed_dim = 1280; // model_channels*4
int num_heads = 8;
int num_head_channels = -1; // channels // num_heads
int context_dim = 768; // 1024 for VERSION_SD2, 2048 for VERSION_SDXL
bool use_linear_projection = false;
bool tiny_unet = false;
public:
int model_channels = 320;
int adm_in_channels = 2816; // only for VERSION_SDXL/SVD
UNetConfig config;
UnetModelBlock(SDVersion version = VERSION_SD1, const String2TensorStorage& tensor_storage_map = {})
: version(version) {
if (sd_version_is_sd2(version)) {
context_dim = 1024;
num_head_channels = 64;
num_heads = -1;
use_linear_projection = true;
} else if (sd_version_is_sdxl(version)) {
context_dim = 2048;
attention_resolutions = {4, 2};
channel_mult = {1, 2, 4};
transformer_depth = {1, 2, 10};
num_head_channels = 64;
num_heads = -1;
use_linear_projection = true;
if (version == VERSION_SDXL_VEGA) {
transformer_depth = {1, 1, 2};
}
} else if (version == VERSION_SVD) {
in_channels = 8;
out_channels = 4;
context_dim = 1024;
adm_in_channels = 768;
num_head_channels = 64;
num_heads = -1;
use_linear_projection = true;
}
if (sd_version_is_inpaint(version)) {
in_channels = 9;
} else if (sd_version_is_unet_edit(version)) {
in_channels = 8;
}
if (version == VERSION_SD1_TINY_UNET || version == VERSION_SD2_TINY_UNET || version == VERSION_SDXS_512_DS || version == VERSION_SDXS_09) {
num_res_blocks = 1;
channel_mult = {1, 2, 4};
tiny_unet = true;
if (version == VERSION_SDXS_512_DS) {
attention_resolutions = {4, 2}; // here just like SDXL
}
}
explicit UnetModelBlock(UNetConfig config = {})
: config(config) {
const SDVersion version = this->config.version;
const int in_channels = this->config.in_channels;
const int out_channels = this->config.out_channels;
const int num_res_blocks = this->config.num_res_blocks;
const auto& attention_resolutions = this->config.attention_resolutions;
const auto& channel_mult = this->config.channel_mult;
const auto& transformer_depth = this->config.transformer_depth;
const int time_embed_dim = this->config.time_embed_dim;
const int num_heads = this->config.num_heads;
const int num_head_channels = this->config.num_head_channels;
const int context_dim = this->config.context_dim;
const bool use_linear_projection = this->config.use_linear_projection;
const bool tiny_unet = this->config.tiny_unet;
const int model_channels = this->config.model_channels;
const int adm_in_channels = this->config.adm_in_channels;
// dims is always 2
// use_temporal_attention is always True for SVD
@ -398,7 +480,7 @@ public:
ggml_tensor* x,
ggml_tensor* emb,
int num_video_frames) {
if (version == VERSION_SVD) {
if (config.version == VERSION_SVD) {
auto block = std::dynamic_pointer_cast<VideoResBlock>(blocks[name]);
return block->forward(ctx, x, emb, num_video_frames);
@ -414,7 +496,7 @@ public:
ggml_tensor* x,
ggml_tensor* context,
int timesteps) {
if (version == VERSION_SVD) {
if (config.version == VERSION_SVD) {
auto block = std::dynamic_pointer_cast<SpatialVideoTransformer>(blocks[name]);
return block->forward(ctx, x, context, timesteps);
@ -440,6 +522,13 @@ public:
// c_concat: [N, in_channels, h, w] or [1, in_channels, h, w]
// y: [N, adm_in_channels] or [1, adm_in_channels]
// return: [N, out_channels, h, w]
const SDVersion version = config.version;
const int model_channels = config.model_channels;
const int num_res_blocks = config.num_res_blocks;
const auto& attention_resolutions = config.attention_resolutions;
const auto& channel_mult = config.channel_mult;
const bool tiny_unet = config.tiny_unet;
if (context != nullptr) {
if (context->ne[2] != x->ne[3]) {
context = ggml_repeat(ctx->ggml_ctx, context, ggml_new_tensor_3d(ctx->ggml_ctx, GGML_TYPE_F32, context->ne[0], context->ne[1], x->ne[3]));
@ -601,6 +690,7 @@ public:
};
struct UNetModelRunner : public DiffusionModelRunner {
UNetConfig config;
UnetModelBlock unet;
UNetModelRunner(ggml_backend_t backend,
@ -608,7 +698,9 @@ struct UNetModelRunner : public DiffusionModelRunner {
const String2TensorStorage& tensor_storage_map,
const std::string prefix,
SDVersion version = VERSION_SD1)
: DiffusionModelRunner(backend, params_backend, prefix), unet(version, tensor_storage_map) {
: DiffusionModelRunner(backend, params_backend, prefix),
config(UNetConfig::detect_from_weights(tensor_storage_map, prefix, version)),
unet(config) {
unet.init(params_ctx, tensor_storage_map, prefix);
}
@ -752,4 +844,4 @@ struct UNetModelRunner : public DiffusionModelRunner {
}
};
#endif // __UNET_HPP__
#endif // __SD_MODEL_DIFFUSION_UNET_HPP__

1061
src/model/diffusion/wan.hpp Normal file

File diff suppressed because it is too large Load Diff

View File

@ -1,14 +1,14 @@
#ifndef __Z_IMAGE_HPP__
#define __Z_IMAGE_HPP__
#ifndef __SD_MODEL_DIFFUSION_Z_IMAGE_HPP__
#define __SD_MODEL_DIFFUSION_Z_IMAGE_HPP__
#include <algorithm>
#include "diffusion_model.hpp"
#include "flux.hpp"
#include "ggml_extend.hpp"
#include "mmdit.hpp"
#include "core/ggml_extend.hpp"
#include "model/diffusion/flux.hpp"
#include "model/diffusion/mmdit.hpp"
#include "model/diffusion/model.hpp"
// Ref: https://github.com/Alpha-VLLM/Lumina-Image-2.0/blob/main/models/model.py
// Ref: https://github.com/Alpha-VLLM/Lumina-Image-2.0/blob/main/model/model.py
// Ref: https://github.com/huggingface/diffusers/pull/12703
#ifndef MIN
@ -20,6 +20,104 @@ namespace ZImage {
constexpr int ADALN_EMBED_DIM = 256;
constexpr int SEQ_MULTI_OF = 32;
struct ZImageConfig {
int patch_size = 2;
int64_t hidden_size = 3840;
int64_t in_channels = 16;
int64_t out_channels = 16;
int64_t num_layers = 30;
int64_t num_refiner_layers = 2;
int64_t head_dim = 128;
int64_t num_heads = 30;
int64_t num_kv_heads = 30;
int64_t multiple_of = 256;
float ffn_dim_multiplier = 8.0f / 3.0f;
float norm_eps = 1e-5f;
bool qk_norm = true;
int64_t cap_feat_dim = 2560;
int theta = 256;
std::vector<int> axes_dim = {32, 48, 48};
int64_t axes_dim_sum = 128;
static ZImageConfig detect_from_weights(const String2TensorStorage& tensor_storage_map, const std::string& prefix) {
ZImageConfig config;
int64_t detected_layers = 0;
int64_t detected_refiner_layers = 0;
int64_t detected_context_refiner = 0;
int64_t detected_head_dim = 0;
int64_t detected_qkv_dim = 0;
for (const auto& [name, tensor_storage] : tensor_storage_map) {
if (!starts_with(name, prefix)) {
continue;
}
if (ends_with(name, "x_embedder.weight") && tensor_storage.n_dims == 2) {
int64_t patch_area = config.patch_size * config.patch_size;
config.in_channels = tensor_storage.ne[0] / patch_area;
config.hidden_size = tensor_storage.ne[1];
} else if (ends_with(name, "cap_embedder.1.weight") && tensor_storage.n_dims == 2) {
config.cap_feat_dim = tensor_storage.ne[0];
config.hidden_size = tensor_storage.ne[1];
} else if (ends_with(name, "layers.0.attention.q_norm.weight") && tensor_storage.n_dims == 1) {
detected_head_dim = tensor_storage.ne[0];
} else if (ends_with(name, "layers.0.attention.qkv.weight") && tensor_storage.n_dims == 2) {
detected_qkv_dim = tensor_storage.ne[1];
} else if (ends_with(name, "final_layer.linear.weight") && tensor_storage.n_dims == 2) {
int64_t patch_area = config.patch_size * config.patch_size;
config.out_channels = tensor_storage.ne[1] / patch_area;
}
size_t pos = name.find("layers.");
if (pos != std::string::npos) {
auto items = split_string(name.substr(pos), '.');
if (items.size() > 1) {
int block_index = atoi(items[1].c_str());
detected_layers = std::max<int64_t>(detected_layers, block_index + 1);
}
}
pos = name.find("noise_refiner.");
if (pos != std::string::npos) {
auto items = split_string(name.substr(pos), '.');
if (items.size() > 1) {
int block_index = atoi(items[1].c_str());
detected_refiner_layers = std::max<int64_t>(detected_refiner_layers, block_index + 1);
}
}
pos = name.find("context_refiner.");
if (pos != std::string::npos) {
auto items = split_string(name.substr(pos), '.');
if (items.size() > 1) {
int block_index = atoi(items[1].c_str());
detected_context_refiner = std::max<int64_t>(detected_context_refiner, block_index + 1);
}
}
}
if (detected_layers > 0) {
config.num_layers = detected_layers;
}
if (detected_refiner_layers > 0 || detected_context_refiner > 0) {
config.num_refiner_layers = std::max(detected_refiner_layers, detected_context_refiner);
}
if (detected_head_dim > 0) {
config.head_dim = detected_head_dim;
config.num_heads = config.hidden_size / config.head_dim;
if (detected_qkv_dim > 0) {
int64_t qkv_heads = detected_qkv_dim / config.head_dim;
config.num_kv_heads = std::max<int64_t>(1, (qkv_heads - config.num_heads) / 2);
}
}
LOG_DEBUG("z_image: num_layers = %" PRId64 ", num_refiner_layers = %" PRId64 ", hidden_size = %" PRId64 ", num_heads = %" PRId64 ", num_kv_heads = %" PRId64 ", in_channels = %" PRId64 ", out_channels = %" PRId64,
config.num_layers,
config.num_refiner_layers,
config.hidden_size,
config.num_heads,
config.num_kv_heads,
config.in_channels,
config.out_channels);
return config;
}
};
struct JointAttention : public GGMLBlock {
protected:
int64_t head_dim;
@ -263,90 +361,70 @@ namespace ZImage {
}
};
struct ZImageParams {
int patch_size = 2;
int64_t hidden_size = 3840;
int64_t in_channels = 16;
int64_t out_channels = 16;
int64_t num_layers = 30;
int64_t num_refiner_layers = 2;
int64_t head_dim = 128;
int64_t num_heads = 30;
int64_t num_kv_heads = 30;
int64_t multiple_of = 256;
float ffn_dim_multiplier = 8.0f / 3.0f;
float norm_eps = 1e-5f;
bool qk_norm = true;
int64_t cap_feat_dim = 2560;
int theta = 256;
std::vector<int> axes_dim = {32, 48, 48};
int64_t axes_dim_sum = 128;
};
class ZImageModel : public GGMLBlock {
protected:
ZImageParams z_image_params;
ZImageConfig config;
void init_params(ggml_context* ctx, const String2TensorStorage& tensor_storage_map = {}, const std::string prefix = "") override {
params["cap_pad_token"] = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, z_image_params.hidden_size);
params["x_pad_token"] = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, z_image_params.hidden_size);
params["cap_pad_token"] = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, config.hidden_size);
params["x_pad_token"] = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, config.hidden_size);
}
public:
ZImageModel() = default;
ZImageModel(ZImageParams z_image_params)
: z_image_params(z_image_params) {
blocks["x_embedder"] = std::make_shared<Linear>(z_image_params.patch_size * z_image_params.patch_size * z_image_params.in_channels, z_image_params.hidden_size);
blocks["t_embedder"] = std::make_shared<TimestepEmbedder>(MIN(z_image_params.hidden_size, 1024), 256, 256);
blocks["cap_embedder.0"] = std::make_shared<RMSNorm>(z_image_params.cap_feat_dim, z_image_params.norm_eps);
blocks["cap_embedder.1"] = std::make_shared<Linear>(z_image_params.cap_feat_dim, z_image_params.hidden_size);
ZImageModel(ZImageConfig config)
: config(config) {
blocks["x_embedder"] = std::make_shared<Linear>(config.patch_size * config.patch_size * config.in_channels, config.hidden_size);
blocks["t_embedder"] = std::make_shared<TimestepEmbedder>(MIN(config.hidden_size, 1024), 256, 256);
blocks["cap_embedder.0"] = std::make_shared<RMSNorm>(config.cap_feat_dim, config.norm_eps);
blocks["cap_embedder.1"] = std::make_shared<Linear>(config.cap_feat_dim, config.hidden_size);
for (int i = 0; i < z_image_params.num_refiner_layers; i++) {
for (int i = 0; i < config.num_refiner_layers; i++) {
auto block = std::make_shared<JointTransformerBlock>(i,
z_image_params.hidden_size,
z_image_params.head_dim,
z_image_params.num_heads,
z_image_params.num_kv_heads,
z_image_params.multiple_of,
z_image_params.ffn_dim_multiplier,
z_image_params.norm_eps,
z_image_params.qk_norm,
config.hidden_size,
config.head_dim,
config.num_heads,
config.num_kv_heads,
config.multiple_of,
config.ffn_dim_multiplier,
config.norm_eps,
config.qk_norm,
true);
blocks["noise_refiner." + std::to_string(i)] = block;
}
for (int i = 0; i < z_image_params.num_refiner_layers; i++) {
for (int i = 0; i < config.num_refiner_layers; i++) {
auto block = std::make_shared<JointTransformerBlock>(i,
z_image_params.hidden_size,
z_image_params.head_dim,
z_image_params.num_heads,
z_image_params.num_kv_heads,
z_image_params.multiple_of,
z_image_params.ffn_dim_multiplier,
z_image_params.norm_eps,
z_image_params.qk_norm,
config.hidden_size,
config.head_dim,
config.num_heads,
config.num_kv_heads,
config.multiple_of,
config.ffn_dim_multiplier,
config.norm_eps,
config.qk_norm,
false);
blocks["context_refiner." + std::to_string(i)] = block;
}
for (int i = 0; i < z_image_params.num_layers; i++) {
for (int i = 0; i < config.num_layers; i++) {
auto block = std::make_shared<JointTransformerBlock>(i,
z_image_params.hidden_size,
z_image_params.head_dim,
z_image_params.num_heads,
z_image_params.num_kv_heads,
z_image_params.multiple_of,
z_image_params.ffn_dim_multiplier,
z_image_params.norm_eps,
z_image_params.qk_norm,
config.hidden_size,
config.head_dim,
config.num_heads,
config.num_kv_heads,
config.multiple_of,
config.ffn_dim_multiplier,
config.norm_eps,
config.qk_norm,
true);
blocks["layers." + std::to_string(i)] = block;
}
blocks["final_layer"] = std::make_shared<FinalLayer>(z_image_params.hidden_size, z_image_params.patch_size, z_image_params.out_channels);
blocks["final_layer"] = std::make_shared<FinalLayer>(config.hidden_size, config.patch_size, config.out_channels);
}
ggml_tensor* forward_core(GGMLRunnerContext* ctx,
@ -393,14 +471,14 @@ namespace ZImage {
auto txt_pe = ggml_ext_slice(ctx->ggml_ctx, pe, 3, 0, txt->ne[1]);
auto img_pe = ggml_ext_slice(ctx->ggml_ctx, pe, 3, txt->ne[1], pe->ne[3]);
for (int i = 0; i < z_image_params.num_refiner_layers; i++) {
for (int i = 0; i < config.num_refiner_layers; i++) {
auto block = std::dynamic_pointer_cast<JointTransformerBlock>(blocks["context_refiner." + std::to_string(i)]);
txt = block->forward(ctx, txt, txt_pe, nullptr, nullptr);
sd::ggml_graph_cut::mark_graph_cut(txt, "z_image.context_refiner." + std::to_string(i), "txt");
}
for (int i = 0; i < z_image_params.num_refiner_layers; i++) {
for (int i = 0; i < config.num_refiner_layers; i++) {
auto block = std::dynamic_pointer_cast<JointTransformerBlock>(blocks["noise_refiner." + std::to_string(i)]);
img = block->forward(ctx, img, img_pe, nullptr, t_emb);
@ -410,7 +488,7 @@ namespace ZImage {
auto txt_img = ggml_concat(ctx->ggml_ctx, txt, img, 1); // [N, n_txt_token + n_txt_pad_token + n_img_token + n_img_pad_token, hidden_size]
sd::ggml_graph_cut::mark_graph_cut(txt_img, "z_image.prelude", "txt_img");
for (int i = 0; i < z_image_params.num_layers; i++) {
for (int i = 0; i < config.num_layers; i++) {
auto block = std::dynamic_pointer_cast<JointTransformerBlock>(blocks["layers." + std::to_string(i)]);
txt_img = block->forward(ctx, txt_img, pe, nullptr, t_emb);
@ -442,7 +520,7 @@ namespace ZImage {
int64_t C = x->ne[2];
int64_t N = x->ne[3];
int patch_size = z_image_params.patch_size;
int patch_size = config.patch_size;
auto img = DiT::pad_and_patchify(ctx, x, patch_size, patch_size, false);
uint64_t n_img_token = img->ne[1];
@ -467,7 +545,7 @@ namespace ZImage {
struct ZImageRunner : public DiffusionModelRunner {
public:
ZImageParams z_image_params;
ZImageConfig config;
ZImageModel z_image;
std::vector<float> pe_vec;
std::vector<float> timestep_vec;
@ -478,8 +556,9 @@ namespace ZImage {
const String2TensorStorage& tensor_storage_map = {},
const std::string prefix = "",
SDVersion version = VERSION_Z_IMAGE)
: DiffusionModelRunner(backend, params_backend, prefix) {
z_image = ZImageModel(z_image_params);
: DiffusionModelRunner(backend, params_backend, prefix),
config(ZImageConfig::detect_from_weights(tensor_storage_map, prefix)) {
z_image = ZImageModel(config);
z_image.init(params_ctx, tensor_storage_map, prefix);
}
@ -510,19 +589,19 @@ namespace ZImage {
pe_vec = Rope::gen_z_image_pe(static_cast<int>(x->ne[1]),
static_cast<int>(x->ne[0]),
z_image_params.patch_size,
config.patch_size,
static_cast<int>(x->ne[3]),
static_cast<int>(context->ne[1]),
SEQ_MULTI_OF,
ref_latents,
increase_ref_index,
z_image_params.theta,
config.theta,
circular_y_enabled,
circular_x_enabled,
z_image_params.axes_dim);
int pos_len = static_cast<int>(pe_vec.size() / z_image_params.axes_dim_sum / 2);
config.axes_dim);
int pos_len = static_cast<int>(pe_vec.size() / config.axes_dim_sum / 2);
// LOG_DEBUG("pos_len %d", pos_len);
auto pe = ggml_new_tensor_4d(compute_ctx, GGML_TYPE_F32, 2, 2, z_image_params.axes_dim_sum / 2, pos_len);
auto pe = ggml_new_tensor_4d(compute_ctx, GGML_TYPE_F32, 2, 2, config.axes_dim_sum / 2, pos_len);
// pe->data = pe_vec.data();
// print_ggml_tensor(pe, true, "pe");
// pe->data = nullptr;
@ -660,4 +739,4 @@ namespace ZImage {
} // namespace ZImage
#endif // __Z_IMAGE_HPP__
#endif // __SD_MODEL_DIFFUSION_Z_IMAGE_HPP__

View File

@ -1,13 +1,13 @@
#ifndef __CLIP_HPP__
#define __CLIP_HPP__
#ifndef __SD_MODEL_TE_CLIP_HPP__
#define __SD_MODEL_TE_CLIP_HPP__
#include "ggml_extend.hpp"
#include "core/ggml_extend.hpp"
#include "model.h"
#include "tokenizers/clip_tokenizer.h"
/*================================================ FrozenCLIPEmbedder ================================================*/
// Ref: https://github.com/huggingface/transformers/blob/main/src/transformers/models/clip/modeling_clip.py
// Ref: https://github.com/huggingface/transformers/blob/main/src/transformers/model/clip/modeling_clip.py
struct CLIPMLP : public GGMLBlock {
protected:
@ -579,4 +579,4 @@ struct CLIPTextModelRunner : public GGMLRunner {
}
};
#endif // __CLIP_HPP__
#endif // __SD_MODEL_TE_CLIP_HPP__

View File

@ -1,5 +1,5 @@
#ifndef __LLM_HPP__
#define __LLM_HPP__
#ifndef __SD_MODEL_TE_LLM_HPP__
#define __SD_MODEL_TE_LLM_HPP__
#include <algorithm>
#include <array>
@ -18,9 +18,9 @@
#include <utility>
#include <vector>
#include "ggml_extend.hpp"
#include "core/ggml_extend.hpp"
#include "json.hpp"
#include "rope.hpp"
#include "model/common/rope.hpp"
#include "tokenizers/bpe_tokenizer.h"
#include "tokenizers/gemma_tokenizer.h"
#include "tokenizers/gpt_oss_tokenizer.h"
@ -63,7 +63,7 @@ namespace LLM {
QWEN3_VL,
};
struct LLMVisionParams {
struct LLMVisionConfig {
LLMVisionArch arch = LLMVisionArch::QWEN2_5_VL;
int num_layers = 32;
int64_t hidden_size = 1280;
@ -79,7 +79,7 @@ namespace LLM {
std::set<int> fullatt_block_indexes = {7, 15, 23, 31};
};
struct LLMParams {
struct LLMConfig {
LLMArch arch = LLMArch::QWEN2_5_VL;
int64_t num_layers = 28;
int64_t hidden_size = 3584;
@ -101,7 +101,129 @@ namespace LLM {
std::vector<int> sliding_attention;
int64_t num_experts = 0;
int64_t num_experts_per_tok = 0;
LLMVisionParams vision;
LLMVisionConfig vision;
bool have_vision_weight = false;
bool llama_cpp_style = false;
static LLMConfig detect_from_weights(const String2TensorStorage& tensor_storage_map,
const std::string& prefix,
LLMArch arch) {
LLMConfig config;
config.arch = arch;
if (arch == LLMArch::MISTRAL_SMALL_3_2 || arch == LLMArch::MINISTRAL_3_3B) {
config.head_dim = 128;
config.num_heads = 32;
config.num_kv_heads = 8;
config.qkv_bias = false;
config.rms_norm_eps = 1e-5f;
} else if (arch == LLMArch::QWEN3 || arch == LLMArch::QWEN3_VL) {
config.head_dim = 128;
config.num_heads = 32;
config.num_kv_heads = 8;
config.qkv_bias = false;
config.qk_norm = true;
config.rms_norm_eps = 1e-6f;
if (arch == LLMArch::QWEN3_VL) {
config.max_position_embeddings = 262144;
config.rope_thetas = {5000000.f};
config.vision.arch = LLMVisionArch::QWEN3_VL;
}
} else if (arch == LLMArch::GEMMA3_12B) {
config.head_dim = 256;
config.num_heads = 16;
config.num_kv_heads = 8;
config.qkv_bias = false;
config.qk_norm = true;
config.rms_norm_eps = 1e-6f;
config.rms_norm_add = false;
config.normalize_input = true;
config.max_position_embeddings = 131072;
config.mlp_activation = MLPActivation::GELU_TANH;
config.rope_thetas = {1000000.f, 10000.f};
config.rope_scales = {8.f, 1.f};
config.sliding_attention = {1024, 1024, 1024, 1024, 1024, 0};
} else if (arch == LLMArch::GEMMA2_2B) {
config.head_dim = 256;
config.num_heads = 8;
config.num_kv_heads = 4;
config.qkv_bias = false;
config.qk_norm = false;
config.rms_norm_eps = 1e-6f;
config.rms_norm_add = true;
config.normalize_input = true;
config.max_position_embeddings = 8192;
config.mlp_activation = MLPActivation::GELU_TANH;
config.hidden_size = 2304;
config.intermediate_size = 9216;
config.num_layers = 26;
config.vocab_size = 256000;
} else if (arch == LLMArch::GPT_OSS_20B) {
config.head_dim = 64;
config.num_heads = 64;
config.num_kv_heads = 8;
config.qkv_bias = true;
config.attention_out_bias = true;
config.qk_norm = false;
config.rms_norm_eps = 1e-5f;
config.hidden_size = 2880;
config.intermediate_size = 2880;
config.num_layers = 24;
config.vocab_size = 201088;
config.max_position_embeddings = 131072;
config.rope_thetas = {150000.f};
config.rope_scales = {32.f};
config.sliding_attention = {128, 0};
config.num_experts = 32;
config.num_experts_per_tok = 4;
}
config.num_layers = 0;
for (const auto& [name, tensor_storage] : tensor_storage_map) {
if (!starts_with(name, prefix)) {
continue;
}
size_t pos = name.find("visual.");
if (pos != std::string::npos) {
config.have_vision_weight = true;
if (contains(name, "attn.q_proj")) {
config.llama_cpp_style = true;
}
continue;
}
pos = name.find("layers.");
if (pos != std::string::npos) {
auto items = split_string(name.substr(pos), '.');
if (items.size() > 1) {
int block_index = atoi(items[1].c_str());
if (block_index + 1 > config.num_layers) {
config.num_layers = block_index + 1;
}
}
}
if (contains(name, "embed_tokens.weight")) {
config.hidden_size = tensor_storage.ne[0];
config.vocab_size = tensor_storage.ne[1];
}
if (contains(name, "layers.0.mlp.gate_proj.weight")) {
config.intermediate_size = tensor_storage.ne[1];
}
if (contains(name, "layers.0.mlp.experts.gate_up_proj.weight")) {
config.intermediate_size = tensor_storage.ne[1] / 2;
}
if (contains(name, "layers.0.mlp.experts.gate_proj.weight")) {
config.intermediate_size = tensor_storage.ne[1];
}
}
if (arch == LLMArch::QWEN3 && config.num_layers == 28) {
config.num_heads = 16;
}
LOG_DEBUG("llm: num_layers = %" PRId64 ", vocab_size = %" PRId64 ", hidden_size = %" PRId64 ", intermediate_size = %" PRId64,
config.num_layers,
config.vocab_size,
config.hidden_size,
config.intermediate_size);
return config;
}
};
struct LLMRMSNorm : public UnaryBlock {
@ -232,11 +354,11 @@ namespace LLM {
}
public:
GPTOSSMLP(const LLMParams& params)
: hidden_size(params.hidden_size),
intermediate_size(params.intermediate_size),
num_experts(params.num_experts),
num_experts_per_tok(params.num_experts_per_tok) {}
GPTOSSMLP(const LLMConfig& config)
: hidden_size(config.hidden_size),
intermediate_size(config.intermediate_size),
num_experts(config.num_experts),
num_experts_per_tok(config.num_experts_per_tok) {}
ggml_tensor* forward(GGMLRunnerContext* ctx, ggml_tensor* x) {
// x: [N, n_token, hidden_size]
@ -667,7 +789,7 @@ namespace LLM {
public:
VisionModel(bool llama_cpp_style,
const LLMVisionParams& vision_params,
const LLMVisionConfig& vision_params,
float eps = 1e-6f)
: arch_(vision_params.arch),
num_layers(vision_params.num_layers),
@ -784,23 +906,23 @@ namespace LLM {
}
public:
Attention(const LLMParams& params)
: arch(params.arch),
num_heads(params.num_heads),
num_kv_heads(params.num_kv_heads),
head_dim(params.head_dim),
qk_norm(params.qk_norm),
max_position_embeddings(params.max_position_embeddings),
rope_thetas(params.rope_thetas),
rope_scales(params.rope_scales),
has_attention_sinks(params.arch == LLMArch::GPT_OSS_20B) {
blocks["q_proj"] = std::make_shared<Linear>(params.hidden_size, num_heads * head_dim, params.qkv_bias);
blocks["k_proj"] = std::make_shared<Linear>(params.hidden_size, num_kv_heads * head_dim, params.qkv_bias);
blocks["v_proj"] = std::make_shared<Linear>(params.hidden_size, num_kv_heads * head_dim, params.qkv_bias);
blocks["o_proj"] = std::make_shared<Linear>(num_heads * head_dim, params.hidden_size, params.attention_out_bias);
if (params.qk_norm) {
blocks["q_norm"] = std::make_shared<LLMRMSNorm>(head_dim, params.rms_norm_eps, params.rms_norm_add);
blocks["k_norm"] = std::make_shared<LLMRMSNorm>(head_dim, params.rms_norm_eps, params.rms_norm_add);
Attention(const LLMConfig& config)
: arch(config.arch),
num_heads(config.num_heads),
num_kv_heads(config.num_kv_heads),
head_dim(config.head_dim),
qk_norm(config.qk_norm),
max_position_embeddings(config.max_position_embeddings),
rope_thetas(config.rope_thetas),
rope_scales(config.rope_scales),
has_attention_sinks(config.arch == LLMArch::GPT_OSS_20B) {
blocks["q_proj"] = std::make_shared<Linear>(config.hidden_size, num_heads * head_dim, config.qkv_bias);
blocks["k_proj"] = std::make_shared<Linear>(config.hidden_size, num_kv_heads * head_dim, config.qkv_bias);
blocks["v_proj"] = std::make_shared<Linear>(config.hidden_size, num_kv_heads * head_dim, config.qkv_bias);
blocks["o_proj"] = std::make_shared<Linear>(num_heads * head_dim, config.hidden_size, config.attention_out_bias);
if (config.qk_norm) {
blocks["q_norm"] = std::make_shared<LLMRMSNorm>(head_dim, config.rms_norm_eps, config.rms_norm_add);
blocks["k_norm"] = std::make_shared<LLMRMSNorm>(head_dim, config.rms_norm_eps, config.rms_norm_add);
}
}
@ -982,42 +1104,42 @@ namespace LLM {
std::string post_ffw_norm_name;
public:
TransformerBlock(const LLMParams& params, int layer_index)
: arch(params.arch),
TransformerBlock(const LLMConfig& config, int layer_index)
: arch(config.arch),
sliding_attention(0) {
if (params.arch == LLMArch::GEMMA3_12B) {
if (config.arch == LLMArch::GEMMA3_12B) {
post_attention_norm_name = "post_attention_norm"; // attn_post_norm
pre_ffw_norm_name = "post_attention_layernorm"; // ffn_norm
post_ffw_norm_name = "post_ffw_norm"; // ffn_post_norm
} else if (params.arch == LLMArch::GEMMA2_2B) {
} else if (config.arch == LLMArch::GEMMA2_2B) {
post_attention_norm_name = "post_attention_layernorm"; // ffn_norm
pre_ffw_norm_name = "pre_feedforward_layernorm";
post_ffw_norm_name = "post_feedforward_layernorm";
} else if (params.arch == LLMArch::GPT_OSS_20B) {
} else if (config.arch == LLMArch::GPT_OSS_20B) {
pre_ffw_norm_name = "post_attention_norm"; // attn_post_norm
} else {
pre_ffw_norm_name = "post_attention_layernorm"; // ffn_norm
}
blocks["self_attn"] = std::make_shared<Attention>(params);
if (params.arch == LLMArch::GPT_OSS_20B) {
blocks["mlp"] = std::make_shared<GPTOSSMLP>(params);
blocks["self_attn"] = std::make_shared<Attention>(config);
if (config.arch == LLMArch::GPT_OSS_20B) {
blocks["mlp"] = std::make_shared<GPTOSSMLP>(config);
} else {
blocks["mlp"] = std::make_shared<MLP>(params.hidden_size,
params.intermediate_size,
blocks["mlp"] = std::make_shared<MLP>(config.hidden_size,
config.intermediate_size,
false,
params.mlp_activation);
config.mlp_activation);
}
blocks["input_layernorm"] = std::make_shared<LLMRMSNorm>(params.hidden_size, params.rms_norm_eps, params.rms_norm_add);
blocks[pre_ffw_norm_name] = std::make_shared<LLMRMSNorm>(params.hidden_size, params.rms_norm_eps, params.rms_norm_add);
blocks["input_layernorm"] = std::make_shared<LLMRMSNorm>(config.hidden_size, config.rms_norm_eps, config.rms_norm_add);
blocks[pre_ffw_norm_name] = std::make_shared<LLMRMSNorm>(config.hidden_size, config.rms_norm_eps, config.rms_norm_add);
if (!post_attention_norm_name.empty()) {
blocks[post_attention_norm_name] = std::make_shared<LLMRMSNorm>(params.hidden_size, params.rms_norm_eps, params.rms_norm_add);
blocks[post_attention_norm_name] = std::make_shared<LLMRMSNorm>(config.hidden_size, config.rms_norm_eps, config.rms_norm_add);
}
if (!post_ffw_norm_name.empty()) {
blocks[post_ffw_norm_name] = std::make_shared<LLMRMSNorm>(params.hidden_size, params.rms_norm_eps, params.rms_norm_add);
blocks[post_ffw_norm_name] = std::make_shared<LLMRMSNorm>(config.hidden_size, config.rms_norm_eps, config.rms_norm_add);
}
if (!params.sliding_attention.empty()) {
sliding_attention = params.sliding_attention[layer_index % params.sliding_attention.size()];
if (!config.sliding_attention.empty()) {
sliding_attention = config.sliding_attention[layer_index % config.sliding_attention.size()];
}
}
@ -1074,16 +1196,16 @@ namespace LLM {
struct TextModel : public GGMLBlock {
protected:
int64_t num_layers;
LLMParams params;
LLMConfig config;
public:
TextModel(const LLMParams& params)
: num_layers(params.num_layers), params(params) {
blocks["embed_tokens"] = std::shared_ptr<GGMLBlock>(new Embedding(params.vocab_size, params.hidden_size));
TextModel(const LLMConfig& config)
: num_layers(config.num_layers), config(config) {
blocks["embed_tokens"] = std::shared_ptr<GGMLBlock>(new Embedding(config.vocab_size, config.hidden_size));
for (int i = 0; i < num_layers; i++) {
blocks["layers." + std::to_string(i)] = std::shared_ptr<GGMLBlock>(new TransformerBlock(params, i));
blocks["layers." + std::to_string(i)] = std::shared_ptr<GGMLBlock>(new TransformerBlock(config, i));
}
blocks["norm"] = std::shared_ptr<GGMLBlock>(new LLMRMSNorm(params.hidden_size, params.rms_norm_eps, params.rms_norm_add));
blocks["norm"] = std::shared_ptr<GGMLBlock>(new LLMRMSNorm(config.hidden_size, config.rms_norm_eps, config.rms_norm_add));
}
ggml_tensor* embed(GGMLRunnerContext* ctx,
@ -1103,8 +1225,8 @@ namespace LLM {
auto norm = std::dynamic_pointer_cast<LLMRMSNorm>(blocks["norm"]);
std::vector<ggml_tensor*> intermediate_outputs;
if (params.normalize_input) {
x = ggml_ext_scale(ctx->ggml_ctx, x, std::sqrt(static_cast<float>(params.hidden_size)), true);
if (config.normalize_input) {
x = ggml_ext_scale(ctx->ggml_ctx, x, std::sqrt(static_cast<float>(config.hidden_size)), true);
}
if (return_all_hidden_states) {
intermediate_outputs.push_back(x);
@ -1174,15 +1296,15 @@ namespace LLM {
struct LLM : public GGMLBlock {
bool enable_vision;
LLMParams params;
LLMConfig config;
public:
LLM() = default;
LLM(LLMParams params, bool enable_vision = false, bool llama_cpp_style = false)
: enable_vision(enable_vision), params(params) {
blocks["model"] = std::shared_ptr<GGMLBlock>(new TextModel(params));
LLM(LLMConfig config, bool enable_vision = false, bool llama_cpp_style = false)
: enable_vision(enable_vision), config(config) {
blocks["model"] = std::shared_ptr<GGMLBlock>(new TextModel(config));
if (enable_vision) {
blocks["visual"] = std::shared_ptr<GGMLBlock>(new VisionModel(llama_cpp_style, params.vision));
blocks["visual"] = std::shared_ptr<GGMLBlock>(new VisionModel(llama_cpp_style, config.vision));
}
}
@ -1226,7 +1348,7 @@ namespace LLM {
};
struct LLMRunner : public GGMLRunner {
LLMParams params;
LLMConfig config;
bool enable_vision;
LLM model;
@ -1242,7 +1364,7 @@ namespace LLM {
static ggml_tensor* process_image_common(ggml_context* ctx,
ggml_tensor* image,
const LLMVisionParams& vision_params) {
const LLMVisionConfig& vision_params) {
// image: [C, H, W]
// return: [grid_t*(H/mh/ph)*(W/mw/pw)*mh*mw, C*pt*ph*pw], grid_t == 1
int64_t C = image->ne[2];
@ -1337,7 +1459,7 @@ namespace LLM {
ggml_context* compute_ctx,
GGMLRunnerContext* runner_ctx,
ggml_tensor* image,
const LLMVisionParams& vision_params,
const LLMVisionConfig& vision_params,
std::shared_ptr<VisionModel> vision_model,
std::vector<int>& window_index_vec,
std::vector<int>& window_inverse_index_vec,
@ -1452,141 +1574,25 @@ namespace LLM {
const String2TensorStorage& tensor_storage_map,
const std::string prefix,
bool enable_vision_ = false)
: GGMLRunner(backend, params_backend), enable_vision(enable_vision_) {
params.arch = arch;
if (arch == LLMArch::MISTRAL_SMALL_3_2 || arch == LLMArch::MINISTRAL_3_3B) {
params.head_dim = 128;
params.num_heads = 32;
params.num_kv_heads = 8;
params.qkv_bias = false;
params.rms_norm_eps = 1e-5f;
} else if (arch == LLMArch::QWEN3 || arch == LLMArch::QWEN3_VL) {
params.head_dim = 128;
params.num_heads = 32;
params.num_kv_heads = 8;
params.qkv_bias = false;
params.qk_norm = true;
params.rms_norm_eps = 1e-6f;
if (arch == LLMArch::QWEN3_VL) {
params.max_position_embeddings = 262144;
params.rope_thetas = {5000000.f};
params.vision.arch = LLMVisionArch::QWEN3_VL;
}
} else if (arch == LLMArch::GEMMA3_12B) {
params.head_dim = 256;
params.num_heads = 16;
params.num_kv_heads = 8;
params.qkv_bias = false;
params.qk_norm = true;
params.rms_norm_eps = 1e-6f;
// llama.cpp adds +1 to Gemma3 norm.weight when exporting GGUF, so GGUF loading
// must keep rms_norm_add disabled here or the offset gets applied twice.
// Convenient for the converter, less convenient for whoever gets to debug it later.
params.rms_norm_add = false;
params.normalize_input = true;
params.max_position_embeddings = 131072;
params.mlp_activation = MLPActivation::GELU_TANH;
params.rope_thetas = {1000000.f, 10000.f};
params.rope_scales = {8.f, 1.f};
params.sliding_attention = {1024, 1024, 1024, 1024, 1024, 0};
} else if (arch == LLMArch::GEMMA2_2B) {
params.head_dim = 256;
params.num_heads = 8;
params.num_kv_heads = 4;
params.qkv_bias = false;
params.qk_norm = false;
params.rms_norm_eps = 1e-6f;
params.rms_norm_add = true;
params.normalize_input = true;
params.max_position_embeddings = 8192;
params.mlp_activation = MLPActivation::GELU_TANH;
params.hidden_size = 2304;
params.intermediate_size = 9216;
params.num_layers = 26;
params.vocab_size = 256000;
} else if (arch == LLMArch::GPT_OSS_20B) {
params.head_dim = 64;
params.num_heads = 64;
params.num_kv_heads = 8;
params.qkv_bias = true;
params.attention_out_bias = true;
params.qk_norm = false;
params.rms_norm_eps = 1e-5f;
params.hidden_size = 2880;
params.intermediate_size = 2880;
params.num_layers = 24;
params.vocab_size = 201088;
params.max_position_embeddings = 131072;
params.rope_thetas = {150000.f};
params.rope_scales = {32.f};
params.sliding_attention = {128, 0};
params.num_experts = 32;
params.num_experts_per_tok = 4;
}
bool have_vision_weight = false;
bool llama_cpp_style = false;
params.num_layers = 0;
for (auto pair : tensor_storage_map) {
std::string tensor_name = pair.first;
if (tensor_name.find(prefix) == std::string::npos)
continue;
size_t pos = tensor_name.find("visual.");
if (pos != std::string::npos) {
have_vision_weight = true;
if (contains(tensor_name, "attn.q_proj")) {
llama_cpp_style = true;
}
continue;
}
pos = tensor_name.find("layers.");
if (pos != std::string::npos) {
tensor_name = tensor_name.substr(pos); // remove prefix
auto items = split_string(tensor_name, '.');
if (items.size() > 1) {
int block_index = atoi(items[1].c_str());
if (block_index + 1 > params.num_layers) {
params.num_layers = block_index + 1;
}
}
}
if (contains(tensor_name, "embed_tokens.weight")) {
params.hidden_size = pair.second.ne[0];
params.vocab_size = pair.second.ne[1];
}
if (contains(tensor_name, "layers.0.mlp.gate_proj.weight")) {
params.intermediate_size = pair.second.ne[1];
}
if (contains(tensor_name, "layers.0.mlp.experts.gate_up_proj.weight")) {
params.intermediate_size = pair.second.ne[1] / 2;
}
if (contains(tensor_name, "layers.0.mlp.experts.gate_proj.weight")) {
params.intermediate_size = pair.second.ne[1];
}
}
if (arch == LLMArch::QWEN3 && params.num_layers == 28) { // Qwen3 2B
params.num_heads = 16;
}
LOG_DEBUG("llm: num_layers = %" PRId64 ", vocab_size = %" PRId64 ", hidden_size = %" PRId64 ", intermediate_size = %" PRId64,
params.num_layers,
params.vocab_size,
params.hidden_size,
params.intermediate_size);
if (enable_vision && !have_vision_weight) {
: GGMLRunner(backend, params_backend),
config(LLMConfig::detect_from_weights(tensor_storage_map, prefix, arch)),
enable_vision(enable_vision_) {
if (enable_vision && !config.have_vision_weight) {
LOG_WARN("no vision weights detected, vision disabled");
enable_vision = false;
}
if (enable_vision) {
LOG_DEBUG("enable llm vision");
if (llama_cpp_style) {
if (config.llama_cpp_style) {
LOG_DEBUG("llama.cpp style vision weight");
}
}
model = LLM(params, enable_vision, llama_cpp_style);
model = LLM(config, enable_vision, config.llama_cpp_style);
model.init(params_ctx, tensor_storage_map, prefix);
}
std::string get_desc() override {
return llm_arch_to_str[static_cast<int>(params.arch)];
return llm_arch_to_str[static_cast<int>(config.arch)];
}
void get_param_tensors(std::map<std::string, ggml_tensor*>& tensors, const std::string prefix) {
@ -1638,12 +1644,12 @@ namespace LLM {
}
int64_t n_tokens = input_ids->ne[0];
if (params.arch == LLMArch::MISTRAL_SMALL_3_2 ||
params.arch == LLMArch::MINISTRAL_3_3B ||
params.arch == LLMArch::QWEN3 ||
params.arch == LLMArch::GEMMA3_12B ||
params.arch == LLMArch::GEMMA2_2B ||
params.arch == LLMArch::GPT_OSS_20B) {
if (config.arch == LLMArch::MISTRAL_SMALL_3_2 ||
config.arch == LLMArch::MINISTRAL_3_3B ||
config.arch == LLMArch::QWEN3 ||
config.arch == LLMArch::GEMMA3_12B ||
config.arch == LLMArch::GEMMA2_2B ||
config.arch == LLMArch::GPT_OSS_20B) {
input_pos_vec.resize(n_tokens);
for (int i = 0; i < n_tokens; ++i) {
input_pos_vec[i] = i;
@ -1682,9 +1688,9 @@ namespace LLM {
set_backend_tensor_data(attention_mask, attention_mask_vec.data());
}
if (params.arch == LLMArch::GEMMA3_12B || params.arch == LLMArch::GPT_OSS_20B) {
if (config.arch == LLMArch::GEMMA3_12B || config.arch == LLMArch::GPT_OSS_20B) {
int sliding_window = 0;
for (int window : params.sliding_attention) {
for (int window : config.sliding_attention) {
sliding_window = std::max(sliding_window, window);
}
sliding_attention_mask_vec.resize(n_tokens * n_tokens);
@ -1740,15 +1746,15 @@ namespace LLM {
int64_t get_num_image_tokens(int64_t t, int64_t h, int64_t w) {
int64_t grid_t = 1;
int64_t grid_h = h / params.vision.patch_size;
int64_t grid_w = w / params.vision.patch_size;
int64_t llm_grid_h = grid_h / params.vision.spatial_merge_size;
int64_t llm_grid_w = grid_w / params.vision.spatial_merge_size;
int64_t grid_h = h / config.vision.patch_size;
int64_t grid_w = w / config.vision.patch_size;
int64_t llm_grid_h = grid_h / config.vision.spatial_merge_size;
int64_t llm_grid_w = grid_w / config.vision.spatial_merge_size;
return grid_t * grid_h * grid_w;
}
ggml_tensor* process_image(ggml_context* ctx, ggml_tensor* image) {
return process_image_common(ctx, image, params.vision);
return process_image_common(ctx, image, config.vision);
}
ggml_tensor* build_patch_pos_embeds(GGMLRunnerContext* runner_ctx,
@ -1770,7 +1776,7 @@ namespace LLM {
compute_ctx,
runner_ctx,
image,
params.vision,
config.vision,
model.vision_model(),
window_index_vec,
window_inverse_index_vec,
@ -1784,8 +1790,8 @@ namespace LLM {
ggml_cgraph* gf = new_graph_custom(LLM_GRAPH_SIZE);
ggml_tensor* image = make_input(image_tensor);
GGML_ASSERT(image->ne[1] % (params.vision.patch_size * params.vision.spatial_merge_size) == 0);
GGML_ASSERT(image->ne[0] % (params.vision.patch_size * params.vision.spatial_merge_size) == 0);
GGML_ASSERT(image->ne[1] % (config.vision.patch_size * config.vision.spatial_merge_size) == 0);
GGML_ASSERT(image->ne[0] % (config.vision.patch_size * config.vision.spatial_merge_size) == 0);
auto runnter_ctx = get_context();
ggml_tensor* hidden_states = encode_image(&runnter_ctx, image);
@ -2094,4 +2100,4 @@ namespace LLM {
};
}; // LLM
#endif // __LLM_HPP__
#endif // __SD_MODEL_TE_LLM_HPP__

View File

@ -1,5 +1,5 @@
#ifndef __T5_HPP__
#define __T5_HPP__
#ifndef __SD_MODEL_TE_T5_HPP__
#define __SD_MODEL_TE_T5_HPP__
#include <cfloat>
#include <limits>
@ -10,10 +10,32 @@
#include <string>
#include <unordered_map>
#include "ggml_extend.hpp"
#include "core/ggml_extend.hpp"
#include "model.h"
#include "tokenizers/t5_unigram_tokenizer.h"
struct T5Config {
int64_t num_layers = 24;
int64_t model_dim = 4096;
int64_t ff_dim = 10240;
int64_t num_heads = 64;
int64_t vocab_size = 32128;
bool relative_attention = true;
static T5Config detect_from_weights(const String2TensorStorage& tensor_storage_map,
const std::string& prefix,
bool is_umt5 = false) {
(void)tensor_storage_map;
(void)prefix;
T5Config config;
if (is_umt5) {
config.vocab_size = 256384;
config.relative_attention = false;
}
return config;
}
};
class T5LayerNorm : public UnaryBlock {
protected:
int64_t hidden_size;
@ -272,30 +294,21 @@ public:
}
};
struct T5Params {
int64_t num_layers = 24;
int64_t model_dim = 4096;
int64_t ff_dim = 10240;
int64_t num_heads = 64;
int64_t vocab_size = 32128;
bool relative_attention = true;
};
struct T5 : public GGMLBlock {
T5Params params;
T5Config config;
public:
T5() {}
T5(T5Params params)
: params(params) {
blocks["encoder"] = std::shared_ptr<GGMLBlock>(new T5Stack(params.num_layers,
params.model_dim,
params.model_dim,
params.ff_dim,
params.num_heads,
params.relative_attention));
blocks["shared"] = std::shared_ptr<GGMLBlock>(new Embedding(params.vocab_size,
params.model_dim));
T5(T5Config config)
: config(config) {
blocks["encoder"] = std::shared_ptr<GGMLBlock>(new T5Stack(config.num_layers,
config.model_dim,
config.model_dim,
config.ff_dim,
config.num_heads,
config.relative_attention));
blocks["shared"] = std::shared_ptr<GGMLBlock>(new Embedding(config.vocab_size,
config.model_dim));
}
ggml_tensor* forward(GGMLRunnerContext* ctx,
@ -316,7 +329,7 @@ public:
};
struct T5Runner : public GGMLRunner {
T5Params params;
T5Config config;
T5 model;
std::vector<int> relative_position_bucket_vec;
@ -325,12 +338,9 @@ struct T5Runner : public GGMLRunner {
const String2TensorStorage& tensor_storage_map,
const std::string prefix,
bool is_umt5 = false)
: GGMLRunner(backend, params_backend) {
if (is_umt5) {
params.vocab_size = 256384;
params.relative_attention = false;
}
model = T5(params);
: GGMLRunner(backend, params_backend),
config(T5Config::detect_from_weights(tensor_storage_map, prefix, is_umt5)) {
model = T5(config);
model.init(params_ctx, tensor_storage_map, prefix);
}
@ -600,4 +610,4 @@ struct T5Embedder {
}
};
#endif // __T5_HPP__
#endif // __SD_MODEL_TE_T5_HPP__

View File

@ -1,7 +1,7 @@
#ifndef __ESRGAN_HPP__
#define __ESRGAN_HPP__
#ifndef __SD_MODEL_UPSCALER_ESRGAN_HPP__
#define __SD_MODEL_UPSCALER_ESRGAN_HPP__
#include "ggml_extend.hpp"
#include "core/ggml_extend.hpp"
#include "model.h"
/*
@ -372,4 +372,4 @@ struct ESRGAN : public GGMLRunner {
}
};
#endif // __ESRGAN_HPP__
#endif // __SD_MODEL_UPSCALER_ESRGAN_HPP__

View File

@ -1,5 +1,5 @@
#ifndef __SD_LTX_LATENT_UPSCALER_HPP__
#define __SD_LTX_LATENT_UPSCALER_HPP__
#ifndef __SD_MODEL_UPSCALER_LTX_LATENT_UPSCALER_HPP__
#define __SD_MODEL_UPSCALER_LTX_LATENT_UPSCALER_HPP__
#include <cinttypes>
#include <cmath>
@ -11,11 +11,11 @@
#include <utility>
#include <vector>
#include "common_dit.hpp"
#include "ggml_extend.hpp"
#include "ggml_graph_cut.h"
#include "core/ggml_extend.hpp"
#include "core/ggml_graph_cut.h"
#include "core/util.h"
#include "model.h"
#include "util.h"
#include "model/diffusion/dit.hpp"
namespace LTXVUpsampler {
constexpr int LTX_UPSAMPLER_GRAPH_SIZE = 10240;
@ -548,4 +548,4 @@ namespace LTXVUpsampler {
} // namespace LTXVUpsampler
#endif // __SD_LTX_LATENT_UPSCALER_HPP__
#endif // __SD_MODEL_UPSCALER_LTX_LATENT_UPSCALER_HPP__

View File

@ -1,7 +1,7 @@
#ifndef __AUTO_ENCODER_KL_HPP__
#define __AUTO_ENCODER_KL_HPP__
#ifndef __SD_MODEL_VAE_AUTO_ENCODER_KL_HPP__
#define __SD_MODEL_VAE_AUTO_ENCODER_KL_HPP__
#include "vae.hpp"
#include "model/vae/vae.hpp"
/*================================================== AutoEncoderKL ===================================================*/
@ -886,4 +886,4 @@ struct AutoEncoderKL : public VAE {
};
};
#endif // __AUTO_ENCODER_KL_HPP__
#endif // __SD_MODEL_VAE_AUTO_ENCODER_KL_HPP__

View File

@ -1,5 +1,5 @@
#ifndef __SD_LTX_AUDIO_VAE_H__
#define __SD_LTX_AUDIO_VAE_H__
#ifndef __SD_MODEL_VAE_LTX_AUDIO_VAE_HPP__
#define __SD_MODEL_VAE_LTX_AUDIO_VAE_HPP__
#include <cmath>
#include <limits>
@ -7,7 +7,7 @@
#include <string>
#include <vector>
#include "ggml_extend.hpp"
#include "core/ggml_extend.hpp"
namespace LTXV {
@ -58,11 +58,12 @@ namespace LTXV {
return base_output_sample_rate();
}
static LTXAudioVAEConfig detect_from_weights(const String2TensorStorage& tensor_storage_map) {
static LTXAudioVAEConfig detect_from_weights(const String2TensorStorage& tensor_storage_map, const std::string& prefix = "") {
LTXAudioVAEConfig config;
auto require = [&](const std::string& name) -> const TensorStorage* {
auto iter = tensor_storage_map.find(name);
std::string tensor_name = prefix.empty() ? name : prefix + "." + name;
auto iter = tensor_storage_map.find(tensor_name);
if (iter == tensor_storage_map.end()) {
return nullptr;
}
@ -168,6 +169,12 @@ namespace LTXV {
if (config.audio_channels != 2 || config.latent_channels != 8 || config.mel_bins != 64) {
return config;
}
LOG_DEBUG("ltx_audio_vae: sample_rate = %d, mel_bins = %d, latent_channels = %d, latent_frequency_bins = %d, has_bwe = %s",
config.sample_rate,
config.mel_bins,
config.latent_channels,
config.latent_frequency_bins,
config.has_bwe ? "true" : "false");
return config;
}
};
@ -1088,4 +1095,4 @@ namespace LTXV {
} // namespace LTXV
#endif // __SD_LTX_AUDIO_VAE_H__
#endif // __SD_MODEL_VAE_LTX_AUDIO_VAE_HPP__

View File

@ -1,5 +1,5 @@
#ifndef __SD_LTX_VAE_HPP__
#define __SD_LTX_VAE_HPP__
#ifndef __SD_MODEL_VAE_LTX_VAE_HPP__
#define __SD_MODEL_VAE_LTX_VAE_HPP__
#include <algorithm>
#include <fstream>
@ -9,9 +9,9 @@
#include <utility>
#include <vector>
#include "ltxv.hpp"
#include "vae.hpp"
#include "wan.hpp"
#include "model/diffusion/ltxv.hpp"
#include "model/vae/vae.hpp"
#include "model/vae/wan_vae.hpp"
namespace LTXVAE {
@ -1552,4 +1552,4 @@ struct LTXVideoVAE : public VAE {
}
};
#endif // __SD_LTX_VAE_HPP__
#endif // __SD_MODEL_VAE_LTX_VAE_HPP__

View File

@ -1,13 +1,13 @@
#ifndef __TAE_HPP__
#define __TAE_HPP__
#ifndef __SD_MODEL_VAE_TAE_HPP__
#define __SD_MODEL_VAE_TAE_HPP__
#include "ggml_extend.hpp"
#include "core/ggml_extend.hpp"
#include "model.h"
/*
=================================== TinyAutoEncoder ===================================
References:
https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/autoencoders/vae.py
https://github.com/huggingface/diffusers/blob/main/src/diffusers/model/autoencoders/vae.py
https://github.com/madebyollin/taesd/blob/main/taesd.py
*/
@ -750,4 +750,4 @@ struct TinyVideoAutoEncoder : public VAE {
}
};
#endif // __TAE_HPP__
#endif // __SD_MODEL_VAE_TAE_HPP__

View File

@ -1,8 +1,8 @@
#ifndef __VAE_HPP__
#define __VAE_HPP__
#ifndef __SD_MODEL_VAE_VAE_HPP__
#define __SD_MODEL_VAE_VAE_HPP__
#include "common_block.hpp"
#include "tensor_ggml.hpp"
#include "core/tensor_ggml.hpp"
#include "model/common/block.hpp"
struct VAE : public GGMLRunner {
protected:
@ -258,4 +258,4 @@ struct FakeVAE : public VAE {
}
};
#endif // __VAE_HPP__
#endif // __SD_MODEL_VAE_VAE_HPP__

File diff suppressed because it is too large Load Diff

View File

@ -5,9 +5,9 @@
#include <string>
#include <vector>
#include "core/util.h"
#include "gguf.h"
#include "gguf_reader_ext.h"
#include "util.h"
static void set_error(std::string* error, const std::string& message) {
if (error != nullptr) {

View File

@ -6,8 +6,8 @@
#include <string>
#include <vector>
#include "core/util.h"
#include "ggml.h"
#include "util.h"
struct GGUFTensorInfo {
std::string name;

View File

@ -8,7 +8,7 @@
#include <vector>
#include "binary_io.h"
#include "util.h"
#include "core/util.h"
// $ python -m pickletools sd-v1-4/archive/data.pkl | head -n 100
// 0: \x80 PROTO 2

View File

@ -7,8 +7,8 @@
#include <vector>
#include "binary_io.h"
#include "core/util.h"
#include "json.hpp"
#include "util.h"
static constexpr size_t ST_HEADER_SIZE_LEN = 8;

View File

@ -7,8 +7,8 @@
#include <unordered_map>
#include <vector>
#include "core/util.h"
#include "pickle_io.h"
#include "util.h"
// torch.save format background:
//

View File

@ -1,8 +1,8 @@
#include <unordered_map>
#include <unordered_set>
#include "core/util.h"
#include "name_conversion.h"
#include "util.h"
void replace_with_name_map(std::string& name, const std::vector<std::pair<std::string, std::string>>& name_map) {
for (auto kv : name_map) {

View File

@ -1,5 +1,5 @@
#ifndef __CACHE_DIT_HPP__
#define __CACHE_DIT_HPP__
#ifndef __SD_RUNTIME_CACHE_DIT_HPP__
#define __SD_RUNTIME_CACHE_DIT_HPP__
#include <algorithm>
#include <cmath>
@ -8,9 +8,9 @@
#include <unordered_map>
#include <vector>
#include "condition_cache_utils.hpp"
#include "ggml_extend.hpp"
#include "tensor.hpp"
#include "core/ggml_extend.hpp"
#include "core/tensor.hpp"
#include "runtime/condition_cache_utils.hpp"
struct DBCacheConfig {
bool enabled = false;
@ -893,4 +893,4 @@ struct CacheDitConditionState {
}
};
#endif
#endif // __SD_RUNTIME_CACHE_DIT_HPP__

View File

@ -1,9 +1,9 @@
#ifndef __CONDITION_CACHE_UTILS_HPP__
#define __CONDITION_CACHE_UTILS_HPP__
#ifndef __SD_RUNTIME_CONDITION_CACHE_UTILS_HPP__
#define __SD_RUNTIME_CONDITION_CACHE_UTILS_HPP__
#include <vector>
#include "tensor.hpp"
#include "core/tensor.hpp"
namespace sd {
@ -61,4 +61,4 @@ namespace sd {
} // namespace sd
#endif // __CONDITION_CACHE_UTILS_HPP__
#endif // __SD_RUNTIME_CONDITION_CACHE_UTILS_HPP__

View File

@ -1,5 +1,5 @@
#ifndef __DENOISER_HPP__
#define __DENOISER_HPP__
#ifndef __SD_RUNTIME_DENOISER_HPP__
#define __SD_RUNTIME_DENOISER_HPP__
#include <algorithm>
#include <cctype>
@ -8,10 +8,10 @@
#include <string>
#include <utility>
#include "ggml_extend.hpp"
#include "gits_noise.inl"
#include "guidance.h"
#include "tensor.hpp"
#include "core/ggml_extend.hpp"
#include "core/tensor.hpp"
#include "runtime/gits_noise.inl"
#include "runtime/guidance.h"
/*================================================= CompVisDenoiser ==================================================*/
@ -1902,4 +1902,4 @@ static sd::Tensor<float> sample_k_diffusion(sample_method_t method,
}
}
#endif // __DENOISER_HPP__
#endif // __SD_RUNTIME_DENOISER_HPP__

View File

@ -1,15 +1,15 @@
#ifndef __EASYCACHE_HPP__
#define __EASYCACHE_HPP__
#ifndef __SD_RUNTIME_EASYCACHE_HPP__
#define __SD_RUNTIME_EASYCACHE_HPP__
#include <cmath>
#include <limits>
#include <unordered_map>
#include <vector>
#include "condition_cache_utils.hpp"
#include "denoiser.hpp"
#include "ggml_extend.hpp"
#include "tensor.hpp"
#include "core/ggml_extend.hpp"
#include "core/tensor.hpp"
#include "runtime/condition_cache_utils.hpp"
#include "runtime/denoiser.hpp"
struct EasyCacheConfig {
bool enabled = false;
@ -258,4 +258,4 @@ struct EasyCacheState {
}
};
#endif
#endif // __SD_RUNTIME_EASYCACHE_HPP__

View File

@ -1,4 +1,4 @@
#include "guidance.h"
#include "runtime/guidance.h"
#include <algorithm>
#include <cmath>
@ -6,7 +6,7 @@
#include <string>
#include <utility>
#include "util.h"
#include "core/util.h"
namespace sd::guidance {

View File

@ -1,11 +1,11 @@
#ifndef __SD_GUIDANCE_H__
#define __SD_GUIDANCE_H__
#ifndef __SD_RUNTIME_GUIDANCE_H__
#define __SD_RUNTIME_GUIDANCE_H__
#include <cstddef>
#include <functional>
#include <vector>
#include "tensor.hpp"
#include "core/tensor.hpp"
namespace sd::guidance {
@ -93,4 +93,4 @@ namespace sd::guidance {
} // namespace sd::guidance
#endif // __SD_GUIDANCE_H__
#endif // __SD_RUNTIME_GUIDANCE_H__

View File

@ -1,8 +1,8 @@
#include <algorithm>
#include <cstddef>
#include <cstdint>
#include "core/tensor.hpp"
#include "ggml.h"
#include "tensor.hpp"
const float ltxav_latent_rgb_proj[128][3] = {
{-0.0293802f, -0.0362516f, -0.0291386f},

View File

@ -1,10 +1,10 @@
#ifndef __PREPROCESSING_HPP__
#define __PREPROCESSING_HPP__
#ifndef __SD_RUNTIME_PREPROCESSING_HPP__
#define __SD_RUNTIME_PREPROCESSING_HPP__
#include <cmath>
#include <limits>
#include "ggml_extend.hpp"
#include "core/ggml_extend.hpp"
#define M_PI_ 3.14159265358979323846f
@ -331,4 +331,4 @@ bool preprocess_canny(sd_image_t img, float high_threshold, float low_threshold,
return true;
}
#endif // __PREPROCESSING_HPP__
#endif // __SD_RUNTIME_PREPROCESSING_HPP__

View File

@ -1,4 +1,4 @@
#include "sample-cache.h"
#include "runtime/sample-cache.h"
namespace sd_sample {

View File

@ -1,16 +1,16 @@
#ifndef __SAMPLE_CACHE_H__
#define __SAMPLE_CACHE_H__
#ifndef __SD_RUNTIME_SAMPLE_CACHE_H__
#define __SD_RUNTIME_SAMPLE_CACHE_H__
#include <vector>
#include "cache_dit.hpp"
#include "denoiser.hpp"
#include "easycache.hpp"
#include "core/tensor.hpp"
#include "core/util.h"
#include "model.h"
#include "spectrum.hpp"
#include "tensor.hpp"
#include "ucache.hpp"
#include "util.h"
#include "runtime/cache_dit.hpp"
#include "runtime/denoiser.hpp"
#include "runtime/easycache.hpp"
#include "runtime/spectrum.hpp"
#include "runtime/ucache.hpp"
namespace sd_sample {
@ -58,4 +58,4 @@ namespace sd_sample {
} // namespace sd_sample
#endif // __SAMPLE_CACHE_H__
#endif // __SD_RUNTIME_SAMPLE_CACHE_H__

View File

@ -1,12 +1,12 @@
#ifndef __SPECTRUM_HPP__
#define __SPECTRUM_HPP__
#ifndef __SD_RUNTIME_SPECTRUM_HPP__
#define __SD_RUNTIME_SPECTRUM_HPP__
#include <cmath>
#include <cstring>
#include <vector>
#include "ggml_extend.hpp"
#include "tensor.hpp"
#include "core/ggml_extend.hpp"
#include "core/tensor.hpp"
struct SpectrumConfig {
float w = 0.40f;
@ -184,4 +184,4 @@ private:
}
};
#endif // __SPECTRUM_HPP__
#endif // __SD_RUNTIME_SPECTRUM_HPP__

View File

@ -1,15 +1,15 @@
#ifndef __UCACHE_HPP__
#define __UCACHE_HPP__
#ifndef __SD_RUNTIME_UCACHE_HPP__
#define __SD_RUNTIME_UCACHE_HPP__
#include <cmath>
#include <limits>
#include <unordered_map>
#include <vector>
#include "condition_cache_utils.hpp"
#include "denoiser.hpp"
#include "ggml_extend.hpp"
#include "tensor.hpp"
#include "core/ggml_extend.hpp"
#include "core/tensor.hpp"
#include "runtime/condition_cache_utils.hpp"
#include "runtime/denoiser.hpp"
struct UCacheConfig {
bool enabled = false;
@ -420,4 +420,4 @@ struct UCacheState {
}
};
#endif // __UCACHE_HPP__
#endif // __SD_RUNTIME_UCACHE_HPP__

View File

@ -2,48 +2,49 @@
#include <cmath>
#include <cstdlib>
#include "ggml_extend.hpp"
#include "ggml_graph_cut.h"
#include "core/ggml_extend.hpp"
#include "core/ggml_graph_cut.h"
#include "core/rng.hpp"
#include "core/rng_mt19937.hpp"
#include "core/rng_philox.hpp"
#include "core/util.h"
#include "model.h"
#include "rng.hpp"
#include "rng_mt19937.hpp"
#include "rng_philox.hpp"
#include "stable-diffusion.h"
#include "util.h"
#include "anima.hpp"
#include "auto_encoder_kl.hpp"
#include "conditioner.hpp"
#include "control.hpp"
#include "denoiser.hpp"
#include "diffusion_model.hpp"
#include "ernie_image.hpp"
#include "esrgan.hpp"
#include "flux.hpp"
#include "guidance.h"
#include "hidream_o1.hpp"
#include "ideogram4.hpp"
#include "lens.hpp"
#include "lora.hpp"
#include "ltx_audio_vae.h"
#include "ltx_latent_upscaler.hpp"
#include "ltx_vae.hpp"
#include "ltxv.hpp"
#include "mmdit.hpp"
#include "pid.hpp"
#include "pmid.hpp"
#include "qwen_image.hpp"
#include "sample-cache.h"
#include "tae.hpp"
#include "unet.hpp"
#include "conditioning/conditioner.hpp"
#include "model/adapter/lora.hpp"
#include "model/adapter/pmid.hpp"
#include "model/diffusion/anima.hpp"
#include "model/diffusion/control.hpp"
#include "model/diffusion/ernie_image.hpp"
#include "model/diffusion/flux.hpp"
#include "model/diffusion/hidream_o1.hpp"
#include "model/diffusion/ideogram4.hpp"
#include "model/diffusion/lens.hpp"
#include "model/diffusion/ltxv.hpp"
#include "model/diffusion/mmdit.hpp"
#include "model/diffusion/model.hpp"
#include "model/diffusion/pid.hpp"
#include "model/diffusion/qwen_image.hpp"
#include "model/diffusion/unet.hpp"
#include "model/diffusion/wan.hpp"
#include "model/diffusion/z_image.hpp"
#include "model/upscaler/esrgan.hpp"
#include "model/upscaler/ltx_latent_upscaler.hpp"
#include "model/vae/auto_encoder_kl.hpp"
#include "model/vae/ltx_audio_vae.hpp"
#include "model/vae/ltx_vae.hpp"
#include "model/vae/tae.hpp"
#include "model/vae/vae.hpp"
#include "model/vae/wan_vae.hpp"
#include "runtime/denoiser.hpp"
#include "runtime/guidance.h"
#include "runtime/sample-cache.h"
#include "upscaler.h"
#include "vae.hpp"
#include "wan.hpp"
#include "z_image.hpp"
#include "latent-preview.h"
#include "name_conversion.h"
#include "runtime/latent-preview.h"
const char* sd_vae_format_name(enum sd_vae_format_t format);
static SDVersion sd_vae_format_to_version(enum sd_vae_format_t format, SDVersion fallback);

View File

@ -3,8 +3,8 @@
#include <algorithm>
#include <sstream>
#include "core/util.h"
#include "tokenize_util.h"
#include "util.h"
std::vector<std::pair<int, std::u32string>> BPETokenizer::bytes_to_unicode() {
std::vector<std::pair<int, std::u32string>> byte_unicode_pairs;

View File

@ -6,9 +6,9 @@
#include <regex>
#include <set>
#include "core/util.h"
#include "ggml.h"
#include "tokenize_util.h"
#include "util.h"
#include "vocab/vocab.h"
CLIPTokenizer::CLIPTokenizer(int pad_token_id, const std::string& merges_utf8_str) {

View File

@ -1,8 +1,8 @@
#include "gemma_tokenizer.h"
#include "core/util.h"
#include "ggml.h"
#include "json.hpp"
#include "util.h"
#include "vocab/vocab.h"
std::string GemmaTokenizer::normalize(const std::string& text) const {

View File

@ -1,7 +1,7 @@
#include "gpt_oss_tokenizer.h"
#include "core/util.h"
#include "json.hpp"
#include "util.h"
#include "vocab/vocab.h"
void GPTOSSTokenizer::load_from_merges(const std::string& merges_utf8_str, const std::string& vocab_utf8_str) {

View File

@ -1,8 +1,8 @@
#include "mistral_tokenizer.h"
#include "core/util.h"
#include "ggml.h"
#include "json.hpp"
#include "util.h"
#include "vocab/vocab.h"
void MistralTokenizer::load_from_merges(const std::string& merges_utf8_str, const std::string& vocab_utf8_str) {

View File

@ -1,6 +1,6 @@
#include "qwen2_tokenizer.h"
#include "util.h"
#include "core/util.h"
#include "vocab/vocab.h"
void Qwen2Tokenizer::load_from_merges(const std::string& merges_utf8_str) {

View File

@ -6,9 +6,9 @@
#include <regex>
#include <sstream>
#include "core/util.h"
#include "json.hpp"
#include "tokenize_util.h"
#include "util.h"
#include "vocab/vocab.h"
// Port from: https://github.com/google/sentencepiece/blob/master/src/unigram_model.h

View File

@ -4,7 +4,7 @@
#include <cmath>
#include <regex>
#include "util.h"
#include "core/util.h"
void Tokenizer::add_special_token(const std::string& token) {
special_tokens.push_back(token);

View File

@ -1,8 +1,8 @@
#include "upscaler.h"
#include "ggml_extend.hpp"
#include "core/ggml_extend.hpp"
#include "core/util.h"
#include "model.h"
#include "stable-diffusion.h"
#include "util.h"
#include <utility>

View File

@ -1,10 +1,10 @@
#ifndef __SD_UPSCALER_H__
#define __SD_UPSCALER_H__
#include "esrgan.hpp"
#include "ggml_extend_backend.h"
#include "core/ggml_extend_backend.h"
#include "core/tensor.hpp"
#include "model/upscaler/esrgan.hpp"
#include "stable-diffusion.h"
#include "tensor.hpp"
#include <memory>
#include <string>