mirror of
https://github.com/leejet/stable-diffusion.cpp.git
synced 2026-06-23 22:56:42 +00:00
Compare commits
3 Commits
b9254dda0d
...
f3fd359b58
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
f3fd359b58 | ||
|
|
dfb2390dd4 | ||
|
|
cfbc19d186 |
@ -210,6 +210,19 @@ file(GLOB SD_LIB_SOURCES CONFIGURE_DEPENDS
|
||||
"src/*.h"
|
||||
"src/*.cpp"
|
||||
"src/*.hpp"
|
||||
"src/conditioning/*.h"
|
||||
"src/conditioning/*.cpp"
|
||||
"src/conditioning/*.hpp"
|
||||
"src/core/*.h"
|
||||
"src/core/*.cpp"
|
||||
"src/core/*.hpp"
|
||||
"src/model/*/*.h"
|
||||
"src/model/*/*.cpp"
|
||||
"src/model/*/*.hpp"
|
||||
"src/runtime/*.h"
|
||||
"src/runtime/*.cpp"
|
||||
"src/runtime/*.hpp"
|
||||
"src/runtime/*.inl"
|
||||
"src/model_io/*.h"
|
||||
"src/model_io/*.cpp"
|
||||
"src/tokenizers/*.h"
|
||||
@ -312,6 +325,7 @@ add_subdirectory(thirdparty)
|
||||
|
||||
target_link_libraries(${SD_LIB} PUBLIC ggml zip)
|
||||
target_include_directories(${SD_LIB} PUBLIC . src include)
|
||||
target_include_directories(${SD_LIB} PRIVATE src/core)
|
||||
target_include_directories(${SD_LIB} PUBLIC . thirdparty)
|
||||
target_compile_features(${SD_LIB} PUBLIC c_std_11 cxx_std_17)
|
||||
|
||||
|
||||
@ -44,6 +44,8 @@ Naming conventions:
|
||||
|
||||
Some older code in the project may not fully follow the current conventions. Please do not submit PRs that only rewrite existing code to match style rules.
|
||||
|
||||
When adding or modifying model implementations, follow the model config and weight detection conventions in [docs/model_config.md](docs/model_config.md).
|
||||
|
||||
## AI-Assisted Contributions
|
||||
|
||||
AI tools may be used to assist development, but contributors are responsible for the quality and correctness of the submitted code.
|
||||
|
||||
118
docs/model_config.md
Normal file
118
docs/model_config.md
Normal file
@ -0,0 +1,118 @@
|
||||
# Model Configuration Conventions
|
||||
|
||||
This document describes the conventions for model configuration structs and
|
||||
weight-based configuration detection.
|
||||
|
||||
## Config Types
|
||||
|
||||
Model configuration should live in a model-specific `*Config` struct.
|
||||
|
||||
Examples:
|
||||
|
||||
- `ZImageConfig`
|
||||
- `UNetConfig`
|
||||
- `MMDiTConfig`
|
||||
- `LLMConfig`
|
||||
|
||||
Preserve established acronym casing in type names, such as `UNet`, `MMDiT`,
|
||||
`LLM`, `VAE`, and `T5`.
|
||||
|
||||
Place the config struct near the top of the model header, before the main model
|
||||
blocks and runner types that consume it.
|
||||
|
||||
## Config Variables
|
||||
|
||||
Variables and members that hold a config should be named `config`.
|
||||
|
||||
Examples:
|
||||
|
||||
```cpp
|
||||
UNetConfig config;
|
||||
UnetModelBlock unet;
|
||||
|
||||
MMDiTRunner(...)
|
||||
: DiffusionModelRunner(backend, params_backend, prefix),
|
||||
config(MMDiTConfig::detect_from_weights(tensor_storage_map, prefix)),
|
||||
mmdit(config) {
|
||||
}
|
||||
```
|
||||
|
||||
Avoid alternate names such as `params`, `params_cfg`, `model_params`, or
|
||||
model-specific aliases unless an existing public API requires them.
|
||||
|
||||
## Weight Detection
|
||||
|
||||
If a model can derive configuration from loaded weight metadata, expose that
|
||||
logic as a static method on the config type:
|
||||
|
||||
```cpp
|
||||
static XxxConfig detect_from_weights(const String2TensorStorage& tensor_storage_map,
|
||||
const std::string& prefix);
|
||||
```
|
||||
|
||||
Additional selector arguments are allowed when required by an existing model
|
||||
family, for example `SDVersion version` or an architecture enum:
|
||||
|
||||
```cpp
|
||||
static UNetConfig detect_from_weights(const String2TensorStorage& tensor_storage_map,
|
||||
const std::string& prefix,
|
||||
SDVersion version = VERSION_SD1);
|
||||
```
|
||||
|
||||
Use `TensorStorage` metadata, especially `n_dims` and `ne`, to infer shapes.
|
||||
Do not load or parse tensor data for config detection.
|
||||
|
||||
Detection should respect `prefix`. For nested weights, construct full names from
|
||||
`prefix + "." + suffix` or filter entries with `starts_with(name, prefix)`.
|
||||
|
||||
Do not add persistent config fields such as `inferred_from_weights` only to
|
||||
record whether detection happened. If the function needs to decide whether to
|
||||
print a debug line, keep that as local control flow inside `detect_from_weights`.
|
||||
|
||||
## Logging
|
||||
|
||||
When config values are inferred from weights, print one `LOG_DEBUG` line at the
|
||||
end of `detect_from_weights`.
|
||||
|
||||
Example:
|
||||
|
||||
```cpp
|
||||
LOG_DEBUG("llm: num_layers = %" PRId64 ", vocab_size = %" PRId64 ", hidden_size = %" PRId64 ", intermediate_size = %" PRId64,
|
||||
config.num_layers,
|
||||
config.vocab_size,
|
||||
config.hidden_size,
|
||||
config.intermediate_size);
|
||||
```
|
||||
|
||||
Only print the config detection log when the function actually inferred values
|
||||
from weights. Do not duplicate the same config summary in runner constructors or
|
||||
model loading code.
|
||||
|
||||
Use the correct format specifiers for field types, such as `%" PRId64 "` for
|
||||
`int64_t` and `%d` for `int`.
|
||||
|
||||
## Runner And Model Responsibilities
|
||||
|
||||
Runners should detect the config once and pass it into the model block:
|
||||
|
||||
```cpp
|
||||
struct XxxRunner : public DiffusionModelRunner {
|
||||
XxxConfig config;
|
||||
XxxModel model;
|
||||
|
||||
XxxRunner(..., const String2TensorStorage& tensor_storage_map, const std::string prefix)
|
||||
: DiffusionModelRunner(backend, params_backend, prefix),
|
||||
config(XxxConfig::detect_from_weights(tensor_storage_map, prefix)),
|
||||
model(config) {
|
||||
model.init(params_ctx, tensor_storage_map, prefix);
|
||||
}
|
||||
};
|
||||
```
|
||||
|
||||
Model blocks should consume `config` directly instead of re-scanning weights in
|
||||
their constructors. Keep config-derived behavior centralized in the config
|
||||
struct.
|
||||
|
||||
If a model has no weight-derived config today, it may still provide
|
||||
`detect_from_weights` for API consistency, but it should not print a config
|
||||
detection log unless it actually derives values from weights.
|
||||
@ -1,10 +1,16 @@
|
||||
for f in src/*.cpp src/*.h src/*.hpp src/tokenizers/*.h src/tokenizers/*.cpp src/tokenizers/vocab/*.h src/tokenizers/vocab/*.cpp \
|
||||
for f in src/*.cpp src/*.h src/*.hpp \
|
||||
src/conditioning/*.cpp src/conditioning/*.h src/conditioning/*.hpp \
|
||||
src/core/*.cpp src/core/*.h src/core/*.hpp \
|
||||
src/runtime/*.cpp src/runtime/*.h src/runtime/*.hpp \
|
||||
src/model/*/*.cpp src/model/*/*.h src/model/*/*.hpp \
|
||||
src/tokenizers/*.h src/tokenizers/*.cpp src/tokenizers/vocab/*.h src/tokenizers/vocab/*.cpp \
|
||||
src/model_io/*.h src/model_io/*.cpp examples/cli/*.cpp examples/cli/*.h examples/server/*.cpp \
|
||||
examples/common/*.hpp examples/common/*.h examples/common/*.cpp; do
|
||||
[[ -e "$f" ]] || continue
|
||||
[[ "$f" == vocab* ]] && continue
|
||||
echo "formatting '$f'"
|
||||
# if [ "$f" != "stable-diffusion.h" ]; then
|
||||
# clang-tidy -fix -p build_linux/ "$f"
|
||||
# fi
|
||||
clang-format -style=file -i "$f"
|
||||
done
|
||||
done
|
||||
|
||||
@ -1,14 +1,14 @@
|
||||
#ifndef __CONDITIONER_HPP__
|
||||
#define __CONDITIONER_HPP__
|
||||
#ifndef __SD_CONDITIONING_CONDITIONER_HPP__
|
||||
#define __SD_CONDITIONING_CONDITIONER_HPP__
|
||||
|
||||
#include <cmath>
|
||||
#include <limits>
|
||||
#include <optional>
|
||||
|
||||
#include "clip.hpp"
|
||||
#include "llm.hpp"
|
||||
#include "t5.hpp"
|
||||
#include "tensor_ggml.hpp"
|
||||
#include "core/tensor_ggml.hpp"
|
||||
#include "model/te/clip.hpp"
|
||||
#include "model/te/llm.hpp"
|
||||
#include "model/te/t5.hpp"
|
||||
|
||||
struct SDCondition {
|
||||
sd::Tensor<float> c_crossattn;
|
||||
@ -1971,7 +1971,7 @@ struct LLMEmbedder : public Conditioner {
|
||||
|
||||
for (int i = 0; i < conditioner_params.ref_images->size(); i++) {
|
||||
const auto& image = (*conditioner_params.ref_images)[i];
|
||||
double factor = llm->params.vision.patch_size * llm->params.vision.spatial_merge_size;
|
||||
double factor = llm->config.vision.patch_size * llm->config.vision.spatial_merge_size;
|
||||
int height = static_cast<int>(image.shape()[1]);
|
||||
int width = static_cast<int>(image.shape()[0]);
|
||||
int h_bar = static_cast<int>(std::round(height / factor) * factor);
|
||||
@ -2042,7 +2042,7 @@ struct LLMEmbedder : public Conditioner {
|
||||
|
||||
for (int i = 0; i < conditioner_params.ref_images->size(); i++) {
|
||||
const auto& image = (*conditioner_params.ref_images)[i];
|
||||
double factor = llm->params.vision.patch_size * llm->params.vision.spatial_merge_size;
|
||||
double factor = llm->config.vision.patch_size * llm->config.vision.spatial_merge_size;
|
||||
int height = static_cast<int>(image.shape()[1]);
|
||||
int width = static_cast<int>(image.shape()[0]);
|
||||
int h_bar = static_cast<int>(std::round(height / factor) * factor);
|
||||
@ -2554,4 +2554,4 @@ struct LTXAVEmbedder : public Conditioner {
|
||||
}
|
||||
};
|
||||
|
||||
#endif
|
||||
#endif // __SD_CONDITIONING_CONDITIONER_HPP__
|
||||
@ -1,5 +1,5 @@
|
||||
#ifndef __GGML_EXTEND_HPP__
|
||||
#define __GGML_EXTEND_HPP__
|
||||
#ifndef __SD_CORE_GGML_EXTEND_HPP__
|
||||
#define __SD_CORE_GGML_EXTEND_HPP__
|
||||
|
||||
#include <assert.h>
|
||||
#include <inttypes.h>
|
||||
@ -23,19 +23,19 @@
|
||||
#include <unordered_map>
|
||||
#include <vector>
|
||||
|
||||
#include "core/ggml_extend_backend.h"
|
||||
#include "core/ggml_graph_cut.h"
|
||||
#include "core/layer_registry.h"
|
||||
#include "ggml-alloc.h"
|
||||
#include "ggml-backend.h"
|
||||
#include "ggml.h"
|
||||
#include "ggml_extend_backend.h"
|
||||
#include "ggml_graph_cut.h"
|
||||
#include "layer_registry.h"
|
||||
|
||||
#include "core/tensor.hpp"
|
||||
#include "model.h"
|
||||
#include "tensor.hpp"
|
||||
|
||||
#include "rng.hpp"
|
||||
#include "tensor_ggml.hpp"
|
||||
#include "util.h"
|
||||
#include "core/rng.hpp"
|
||||
#include "core/tensor_ggml.hpp"
|
||||
#include "core/util.h"
|
||||
|
||||
#define EPS 1e-05f
|
||||
|
||||
@ -1707,7 +1707,7 @@ protected:
|
||||
uint64_t resident_state_token = 0;
|
||||
|
||||
size_t max_graph_vram_bytes = 0;
|
||||
bool stream_layers_enabled = false;
|
||||
bool stream_layers_enabled = false;
|
||||
size_t observed_max_effective_budget_ = 0;
|
||||
|
||||
sd::layer_registry::LayerRegistry layer_registry_;
|
||||
@ -4145,4 +4145,4 @@ __STATIC_INLINE__ ggml_tensor* ggml_ext_lokr_forward(
|
||||
}
|
||||
}
|
||||
|
||||
#endif // __GGML_EXTEND__HPP__
|
||||
#endif // __SD_CORE_GGML_EXTEND_HPP__
|
||||
@ -1,4 +1,4 @@
|
||||
#include "ggml_extend_backend.h"
|
||||
#include "core/ggml_extend_backend.h"
|
||||
|
||||
#include <algorithm>
|
||||
#include <cctype>
|
||||
@ -8,8 +8,8 @@
|
||||
#include <stdexcept>
|
||||
#include <vector>
|
||||
|
||||
#include "core/util.h"
|
||||
#include "stable-diffusion.h"
|
||||
#include "util.h"
|
||||
|
||||
static std::string trim_copy(const std::string& value) {
|
||||
size_t begin = 0;
|
||||
@ -1,5 +1,5 @@
|
||||
#ifndef __SD_GGML_EXTEND_BACKEND_H__
|
||||
#define __SD_GGML_EXTEND_BACKEND_H__
|
||||
#ifndef __SD_CORE_GGML_EXTEND_BACKEND_H__
|
||||
#define __SD_CORE_GGML_EXTEND_BACKEND_H__
|
||||
|
||||
#include <cstdint>
|
||||
#include <cstring>
|
||||
@ -76,4 +76,4 @@ ggml_backend_t sd_backend_cpu_init();
|
||||
bool sd_backend_cpu_set_n_threads(ggml_backend_t backend_cpu, int n_threads);
|
||||
const char* sd_backend_module_name(SDBackendModule module);
|
||||
void ggml_ext_im_set_f32_1d(const struct ggml_tensor* tensor, int i, float value);
|
||||
#endif
|
||||
#endif // __SD_CORE_GGML_EXTEND_BACKEND_H__
|
||||
@ -1,4 +1,4 @@
|
||||
#include "ggml_graph_cut.h"
|
||||
#include "core/ggml_graph_cut.h"
|
||||
|
||||
#include <algorithm>
|
||||
#include <cstring>
|
||||
@ -8,11 +8,11 @@
|
||||
#include <stack>
|
||||
#include <unordered_map>
|
||||
|
||||
#include "core/util.h"
|
||||
#include "ggml-alloc.h"
|
||||
#include "ggml-backend.h"
|
||||
#include "util.h"
|
||||
|
||||
#include "../ggml/src/ggml-impl.h"
|
||||
#include "ggml/src/ggml-impl.h"
|
||||
|
||||
namespace sd::ggml_graph_cut {
|
||||
|
||||
@ -1,5 +1,5 @@
|
||||
#ifndef __SD_GGML_GRAPH_CUT_H__
|
||||
#define __SD_GGML_GRAPH_CUT_H__
|
||||
#ifndef __SD_CORE_GGML_GRAPH_CUT_H__
|
||||
#define __SD_CORE_GGML_GRAPH_CUT_H__
|
||||
|
||||
#include <array>
|
||||
#include <cstdint>
|
||||
@ -114,4 +114,4 @@ namespace sd::ggml_graph_cut {
|
||||
void annotate_residency(Plan& plan, size_t max_graph_vram_bytes);
|
||||
} // namespace sd::ggml_graph_cut
|
||||
|
||||
#endif
|
||||
#endif // __SD_CORE_GGML_GRAPH_CUT_H__
|
||||
@ -1,8 +1,8 @@
|
||||
#include "layer_registry.h"
|
||||
#include "core/layer_registry.h"
|
||||
|
||||
#include <utility>
|
||||
|
||||
#include "util.h"
|
||||
#include "core/util.h"
|
||||
|
||||
namespace sd::layer_registry {
|
||||
|
||||
@ -1,5 +1,5 @@
|
||||
#ifndef __LAYER_REGISTRY_H__
|
||||
#define __LAYER_REGISTRY_H__
|
||||
#ifndef __SD_CORE_LAYER_REGISTRY_H__
|
||||
#define __SD_CORE_LAYER_REGISTRY_H__
|
||||
|
||||
#include <map>
|
||||
#include <set>
|
||||
@ -47,4 +47,4 @@ namespace sd::layer_registry {
|
||||
|
||||
} // namespace sd::layer_registry
|
||||
|
||||
#endif
|
||||
#endif // __SD_CORE_LAYER_REGISTRY_H__
|
||||
@ -1,5 +1,5 @@
|
||||
#ifndef __ORDERED_MAP_HPP__
|
||||
#define __ORDERED_MAP_HPP__
|
||||
#ifndef __SD_CORE_ORDERED_MAP_HPP__
|
||||
#define __SD_CORE_ORDERED_MAP_HPP__
|
||||
|
||||
#include <iostream>
|
||||
#include <list>
|
||||
@ -174,4 +174,4 @@ public:
|
||||
}
|
||||
};
|
||||
|
||||
#endif // __ORDERED_MAP_HPP__
|
||||
#endif // __SD_CORE_ORDERED_MAP_HPP__
|
||||
@ -1,5 +1,5 @@
|
||||
#ifndef __RNG_H__
|
||||
#define __RNG_H__
|
||||
#ifndef __SD_CORE_RNG_HPP__
|
||||
#define __SD_CORE_RNG_HPP__
|
||||
|
||||
#include <random>
|
||||
#include <vector>
|
||||
@ -32,4 +32,4 @@ public:
|
||||
}
|
||||
};
|
||||
|
||||
#endif // __RNG_H__
|
||||
#endif // __SD_CORE_RNG_HPP__
|
||||
@ -1,10 +1,10 @@
|
||||
#ifndef __RNG_MT19937_HPP__
|
||||
#define __RNG_MT19937_HPP__
|
||||
#ifndef __SD_CORE_RNG_MT19937_HPP__
|
||||
#define __SD_CORE_RNG_MT19937_HPP__
|
||||
|
||||
#include <cmath>
|
||||
#include <vector>
|
||||
|
||||
#include "rng.hpp"
|
||||
#include "core/rng.hpp"
|
||||
|
||||
// RNG imitiating torch cpu randn on CPU.
|
||||
// Port from pytorch, original license: https://github.com/pytorch/pytorch/blob/d01a7b0241ed1c4cded7e7ca097249feb343f072/LICENSE
|
||||
@ -144,4 +144,4 @@ public:
|
||||
}
|
||||
};
|
||||
|
||||
#endif // __RNG_MT19937_HPP__
|
||||
#endif // __SD_CORE_RNG_MT19937_HPP__
|
||||
@ -1,10 +1,10 @@
|
||||
#ifndef __RNG_PHILOX_H__
|
||||
#define __RNG_PHILOX_H__
|
||||
#ifndef __SD_CORE_RNG_PHILOX_HPP__
|
||||
#define __SD_CORE_RNG_PHILOX_HPP__
|
||||
|
||||
#include <cmath>
|
||||
#include <vector>
|
||||
|
||||
#include "rng.hpp"
|
||||
#include "core/rng.hpp"
|
||||
|
||||
// RNG imitiating torch cuda randn on CPU.
|
||||
// Port from: https://github.com/AUTOMATIC1111/stable-diffusion-webui/blob/5ef669de080814067961f28357256e8fe27544f4/modules/rng_philox.py
|
||||
@ -122,4 +122,4 @@ public:
|
||||
}
|
||||
};
|
||||
|
||||
#endif // __RNG_PHILOX_H__
|
||||
#endif // __SD_CORE_RNG_PHILOX_HPP__
|
||||
@ -1,5 +1,5 @@
|
||||
#ifndef __SD_TENSOR_HPP__
|
||||
#define __SD_TENSOR_HPP__
|
||||
#ifndef __SD_CORE_TENSOR_HPP__
|
||||
#define __SD_CORE_TENSOR_HPP__
|
||||
|
||||
#include <algorithm>
|
||||
#include <cmath>
|
||||
@ -16,7 +16,7 @@
|
||||
#include <utility>
|
||||
#include <vector>
|
||||
|
||||
#include "rng.hpp"
|
||||
#include "core/rng.hpp"
|
||||
|
||||
namespace sd {
|
||||
|
||||
@ -1661,4 +1661,4 @@ namespace sd {
|
||||
|
||||
} // namespace sd
|
||||
|
||||
#endif
|
||||
#endif // __SD_CORE_TENSOR_HPP__
|
||||
@ -1,5 +1,5 @@
|
||||
#ifndef __SD_TENSOR_GGML_HPP__
|
||||
#define __SD_TENSOR_GGML_HPP__
|
||||
#ifndef __SD_CORE_TENSOR_GGML_HPP__
|
||||
#define __SD_CORE_TENSOR_GGML_HPP__
|
||||
|
||||
#include <array>
|
||||
#include <cstring>
|
||||
@ -8,8 +8,8 @@
|
||||
#include <string>
|
||||
#include <type_traits>
|
||||
|
||||
#include "core/tensor.hpp"
|
||||
#include "ggml.h"
|
||||
#include "tensor.hpp"
|
||||
|
||||
namespace sd {
|
||||
|
||||
@ -124,4 +124,4 @@ namespace sd {
|
||||
|
||||
} // namespace sd
|
||||
|
||||
#endif
|
||||
#endif // __SD_CORE_TENSOR_GGML_HPP__
|
||||
@ -1,4 +1,4 @@
|
||||
#include "util.h"
|
||||
#include "core/util.h"
|
||||
#include <algorithm>
|
||||
#include <cctype>
|
||||
#include <cmath>
|
||||
@ -13,7 +13,7 @@
|
||||
#include <thread>
|
||||
#include <unordered_set>
|
||||
#include <vector>
|
||||
#include "preprocessing.hpp"
|
||||
#include "runtime/preprocessing.hpp"
|
||||
|
||||
#if defined(__APPLE__) && defined(__MACH__)
|
||||
#include <sys/sysctl.h>
|
||||
@ -1,5 +1,5 @@
|
||||
#ifndef __UTIL_H__
|
||||
#define __UTIL_H__
|
||||
#ifndef __SD_CORE_UTIL_H__
|
||||
#define __SD_CORE_UTIL_H__
|
||||
|
||||
#include <cstdint>
|
||||
#include <memory>
|
||||
@ -7,9 +7,9 @@
|
||||
#include <utility>
|
||||
#include <vector>
|
||||
|
||||
#include "core/tensor.hpp"
|
||||
#include "ggml-backend.h"
|
||||
#include "stable-diffusion.h"
|
||||
#include "tensor.hpp"
|
||||
|
||||
#define SAFE_STR(s) ((s) ? (s) : "")
|
||||
#define BOOL_STR(b) ((b) ? "true" : "false")
|
||||
@ -103,4 +103,4 @@ bool sd_backend_is(ggml_backend_t backend, const std::string& name);
|
||||
#define LOG_INFO(format, ...) log_printf(SD_LOG_INFO, __FILE__, __LINE__, format, ##__VA_ARGS__)
|
||||
#define LOG_WARN(format, ...) log_printf(SD_LOG_WARN, __FILE__, __LINE__, format, ##__VA_ARGS__)
|
||||
#define LOG_ERROR(format, ...) log_printf(SD_LOG_ERROR, __FILE__, __LINE__, format, ##__VA_ARGS__)
|
||||
#endif // __UTIL_H__
|
||||
#endif // __SD_CORE_UTIL_H__
|
||||
@ -13,18 +13,18 @@
|
||||
#include <unordered_map>
|
||||
#include <vector>
|
||||
|
||||
#include "core/util.h"
|
||||
#include "model.h"
|
||||
#include "model_io/gguf_io.h"
|
||||
#include "model_io/safetensors_io.h"
|
||||
#include "model_io/torch_legacy_io.h"
|
||||
#include "model_io/torch_zip_io.h"
|
||||
#include "stable-diffusion.h"
|
||||
#include "util.h"
|
||||
|
||||
#include "core/ggml_extend_backend.h"
|
||||
#include "ggml-alloc.h"
|
||||
#include "ggml-backend.h"
|
||||
#include "ggml.h"
|
||||
#include "ggml_extend_backend.h"
|
||||
#include "zip.h"
|
||||
|
||||
#include "name_conversion.h"
|
||||
|
||||
@ -8,10 +8,10 @@
|
||||
#include <string>
|
||||
#include <vector>
|
||||
|
||||
#include "core/ordered_map.hpp"
|
||||
#include "ggml-backend.h"
|
||||
#include "ggml.h"
|
||||
#include "model_io/tensor_storage.h"
|
||||
#include "ordered_map.hpp"
|
||||
|
||||
enum SDVersion {
|
||||
VERSION_SD1,
|
||||
|
||||
@ -1,8 +1,8 @@
|
||||
#ifndef __LORA_HPP__
|
||||
#define __LORA_HPP__
|
||||
#ifndef __SD_MODEL_ADAPTER_LORA_HPP__
|
||||
#define __SD_MODEL_ADAPTER_LORA_HPP__
|
||||
|
||||
#include <mutex>
|
||||
#include "ggml_extend.hpp"
|
||||
#include "core/ggml_extend.hpp"
|
||||
|
||||
#define LORA_GRAPH_BASE_SIZE 10240
|
||||
|
||||
@ -914,4 +914,4 @@ public:
|
||||
}
|
||||
};
|
||||
|
||||
#endif // __LORA_HPP__
|
||||
#endif // __SD_MODEL_ADAPTER_LORA_HPP__
|
||||
@ -1,10 +1,11 @@
|
||||
#ifndef __PMI_HPP__
|
||||
#define __PMI_HPP__
|
||||
#ifndef __SD_MODEL_ADAPTER_PMID_HPP__
|
||||
#define __SD_MODEL_ADAPTER_PMID_HPP__
|
||||
|
||||
#include "ggml_extend.hpp"
|
||||
#include "core/ggml_extend.hpp"
|
||||
|
||||
#include "clip.hpp"
|
||||
#include "lora.hpp"
|
||||
#include "model/adapter/lora.hpp"
|
||||
#include "model/common/block.hpp"
|
||||
#include "model/te/clip.hpp"
|
||||
|
||||
struct FuseBlock : public GGMLBlock {
|
||||
// network hparams
|
||||
@ -636,4 +637,4 @@ struct PhotoMakerIDEmbed : public GGMLRunner {
|
||||
}
|
||||
};
|
||||
|
||||
#endif // __PMI_HPP__
|
||||
#endif // __SD_MODEL_ADAPTER_PMID_HPP__
|
||||
@ -1,9 +1,9 @@
|
||||
#ifndef __COMMON_BLOCK_HPP__
|
||||
#define __COMMON_BLOCK_HPP__
|
||||
#ifndef __SD_MODEL_COMMON_BLOCK_HPP__
|
||||
#define __SD_MODEL_COMMON_BLOCK_HPP__
|
||||
|
||||
#include "core/ggml_extend.hpp"
|
||||
#include "core/util.h"
|
||||
#include "ggml-backend.h"
|
||||
#include "ggml_extend.hpp"
|
||||
#include "util.h"
|
||||
|
||||
class DownSampleBlock : public GGMLBlock {
|
||||
protected:
|
||||
@ -227,6 +227,37 @@ public:
|
||||
}
|
||||
};
|
||||
|
||||
struct Mlp : public GGMLBlock {
|
||||
public:
|
||||
Mlp(int64_t in_features,
|
||||
int64_t hidden_features = -1,
|
||||
int64_t out_features = -1,
|
||||
bool bias = true) {
|
||||
// act_layer is always lambda: nn.GELU(approximate="tanh")
|
||||
// norm_layer is always None
|
||||
// use_conv is always False
|
||||
if (hidden_features == -1) {
|
||||
hidden_features = in_features;
|
||||
}
|
||||
if (out_features == -1) {
|
||||
out_features = in_features;
|
||||
}
|
||||
blocks["fc1"] = std::shared_ptr<GGMLBlock>(new Linear(in_features, hidden_features, bias));
|
||||
blocks["fc2"] = std::shared_ptr<GGMLBlock>(new Linear(hidden_features, out_features, bias));
|
||||
}
|
||||
|
||||
ggml_tensor* forward(GGMLRunnerContext* ctx, ggml_tensor* x) {
|
||||
// x: [N, n_token, in_features]
|
||||
auto fc1 = std::dynamic_pointer_cast<Linear>(blocks["fc1"]);
|
||||
auto fc2 = std::dynamic_pointer_cast<Linear>(blocks["fc2"]);
|
||||
|
||||
x = fc1->forward(ctx, x);
|
||||
x = ggml_ext_gelu(ctx->ggml_ctx, x, true);
|
||||
x = fc2->forward(ctx, x);
|
||||
return x;
|
||||
}
|
||||
};
|
||||
|
||||
class FeedForward : public GGMLBlock {
|
||||
public:
|
||||
enum class Activation {
|
||||
@ -603,4 +634,4 @@ public:
|
||||
}
|
||||
};
|
||||
|
||||
#endif // __COMMON_BLOCK_HPP__
|
||||
#endif // __SD_MODEL_COMMON_BLOCK_HPP__
|
||||
@ -1,10 +1,10 @@
|
||||
#ifndef __ROPE_HPP__
|
||||
#define __ROPE_HPP__
|
||||
#ifndef __SD_MODEL_COMMON_ROPE_HPP__
|
||||
#define __SD_MODEL_COMMON_ROPE_HPP__
|
||||
|
||||
#include <algorithm>
|
||||
#include <cmath>
|
||||
#include <vector>
|
||||
#include "ggml_extend.hpp"
|
||||
#include "core/ggml_extend.hpp"
|
||||
|
||||
namespace Rope {
|
||||
enum class EmbedNDLayout {
|
||||
@ -902,4 +902,4 @@ namespace Rope {
|
||||
}
|
||||
}; // namespace Rope
|
||||
|
||||
#endif // __ROPE_HPP__
|
||||
#endif // __SD_MODEL_COMMON_ROPE_HPP__
|
||||
@ -1,19 +1,61 @@
|
||||
#ifndef __ANIMA_HPP__
|
||||
#define __ANIMA_HPP__
|
||||
#ifndef __SD_MODEL_DIFFUSION_ANIMA_HPP__
|
||||
#define __SD_MODEL_DIFFUSION_ANIMA_HPP__
|
||||
|
||||
#include <algorithm>
|
||||
#include <cmath>
|
||||
#include <memory>
|
||||
#include <utility>
|
||||
#include <vector>
|
||||
|
||||
#include "common_block.hpp"
|
||||
#include "diffusion_model.hpp"
|
||||
#include "flux.hpp"
|
||||
#include "rope.hpp"
|
||||
#include "model/common/block.hpp"
|
||||
#include "model/common/rope.hpp"
|
||||
#include "model/diffusion/flux.hpp"
|
||||
#include "model/diffusion/model.hpp"
|
||||
|
||||
namespace Anima {
|
||||
constexpr int ANIMA_GRAPH_SIZE = 65536;
|
||||
|
||||
struct AnimaConfig {
|
||||
int64_t in_channels = 16;
|
||||
int64_t out_channels = 16;
|
||||
int64_t hidden_size = 2048;
|
||||
int64_t text_embed_dim = 1024;
|
||||
int64_t num_heads = 16;
|
||||
int64_t head_dim = 128;
|
||||
int patch_size = 2;
|
||||
int64_t num_layers = 28;
|
||||
std::vector<int> axes_dim = {44, 42, 42};
|
||||
int theta = 10000;
|
||||
|
||||
static AnimaConfig detect_from_weights(const String2TensorStorage& tensor_storage_map, const std::string& prefix) {
|
||||
AnimaConfig config;
|
||||
int64_t detected_layers = 0;
|
||||
std::string layer_tag = prefix.empty() ? "blocks." : prefix + ".blocks.";
|
||||
for (const auto& [name, _] : tensor_storage_map) {
|
||||
size_t pos = name.find(layer_tag);
|
||||
if (pos == std::string::npos) {
|
||||
continue;
|
||||
}
|
||||
size_t start = pos + layer_tag.size();
|
||||
size_t end = name.find('.', start);
|
||||
if (end == std::string::npos) {
|
||||
continue;
|
||||
}
|
||||
int64_t layer_id = atoll(name.substr(start, end - start).c_str());
|
||||
detected_layers = std::max(detected_layers, layer_id + 1);
|
||||
}
|
||||
if (detected_layers > 0) {
|
||||
config.num_layers = detected_layers;
|
||||
LOG_DEBUG("anima: num_layers = %" PRId64 ", hidden_size = %" PRId64 ", num_heads = %" PRId64 ", head_dim = %" PRId64,
|
||||
config.num_layers,
|
||||
config.hidden_size,
|
||||
config.num_heads,
|
||||
config.head_dim);
|
||||
}
|
||||
return config;
|
||||
}
|
||||
};
|
||||
|
||||
__STATIC_INLINE__ ggml_tensor* apply_gate(ggml_context* ctx,
|
||||
ggml_tensor* x,
|
||||
ggml_tensor* gate) {
|
||||
@ -418,31 +460,22 @@ namespace Anima {
|
||||
|
||||
struct AnimaNet : public GGMLBlock {
|
||||
public:
|
||||
int64_t in_channels = 16;
|
||||
int64_t out_channels = 16;
|
||||
int64_t hidden_size = 2048;
|
||||
int64_t text_embed_dim = 1024;
|
||||
int64_t num_heads = 16;
|
||||
int64_t head_dim = 128;
|
||||
int patch_size = 2;
|
||||
int64_t num_layers = 28;
|
||||
std::vector<int> axes_dim = {44, 42, 42};
|
||||
int theta = 10000;
|
||||
AnimaConfig config;
|
||||
|
||||
public:
|
||||
AnimaNet() = default;
|
||||
explicit AnimaNet(int64_t num_layers)
|
||||
: num_layers(num_layers) {
|
||||
blocks["x_embedder"] = std::make_shared<XEmbedder>((in_channels + 1) * patch_size * patch_size, hidden_size);
|
||||
blocks["t_embedder"] = std::make_shared<TimestepEmbedder>(hidden_size, hidden_size * 3);
|
||||
blocks["t_embedding_norm"] = std::make_shared<RMSNorm>(hidden_size, 1e-6f);
|
||||
for (int i = 0; i < num_layers; i++) {
|
||||
blocks["blocks." + std::to_string(i)] = std::make_shared<TransformerBlock>(hidden_size,
|
||||
text_embed_dim,
|
||||
num_heads,
|
||||
head_dim);
|
||||
explicit AnimaNet(AnimaConfig config)
|
||||
: config(config) {
|
||||
blocks["x_embedder"] = std::make_shared<XEmbedder>((config.in_channels + 1) * config.patch_size * config.patch_size, config.hidden_size);
|
||||
blocks["t_embedder"] = std::make_shared<TimestepEmbedder>(config.hidden_size, config.hidden_size * 3);
|
||||
blocks["t_embedding_norm"] = std::make_shared<RMSNorm>(config.hidden_size, 1e-6f);
|
||||
for (int i = 0; i < config.num_layers; i++) {
|
||||
blocks["blocks." + std::to_string(i)] = std::make_shared<TransformerBlock>(config.hidden_size,
|
||||
config.text_embed_dim,
|
||||
config.num_heads,
|
||||
config.head_dim);
|
||||
}
|
||||
blocks["final_layer"] = std::make_shared<FinalLayer>(hidden_size, patch_size, out_channels);
|
||||
blocks["final_layer"] = std::make_shared<FinalLayer>(config.hidden_size, config.patch_size, config.out_channels);
|
||||
blocks["llm_adapter"] = std::make_shared<LLMAdapter>(1024, 1024, 1024, 6, 16);
|
||||
}
|
||||
|
||||
@ -469,11 +502,11 @@ namespace Anima {
|
||||
auto padding_mask = ggml_ext_zeros(ctx->ggml_ctx, x->ne[0], x->ne[1], 1, x->ne[3]);
|
||||
x = ggml_concat(ctx->ggml_ctx, x, padding_mask, 2); // [N, C + 1, H, W]
|
||||
|
||||
x = DiT::pad_and_patchify(ctx, x, patch_size, patch_size); // [N, h*w, (C+1)*ph*pw]
|
||||
x = DiT::pad_and_patchify(ctx, x, config.patch_size, config.patch_size); // [N, h*w, (C+1)*ph*pw]
|
||||
|
||||
x = x_embedder->forward(ctx, x);
|
||||
|
||||
auto timestep_proj = ggml_ext_timestep_embedding(ctx->ggml_ctx, timestep, static_cast<int>(hidden_size));
|
||||
auto timestep_proj = ggml_ext_timestep_embedding(ctx->ggml_ctx, timestep, static_cast<int>(config.hidden_size));
|
||||
auto temb = t_embedder->forward(ctx, timestep_proj);
|
||||
auto embedded_timestep = t_embedding_norm->forward(ctx, timestep_proj);
|
||||
|
||||
@ -505,7 +538,7 @@ namespace Anima {
|
||||
sd::ggml_graph_cut::mark_graph_cut(temb, "anima.prelude", "temb");
|
||||
sd::ggml_graph_cut::mark_graph_cut(encoder_hidden_states, "anima.prelude", "context");
|
||||
|
||||
for (int i = 0; i < num_layers; i++) {
|
||||
for (int i = 0; i < config.num_layers; i++) {
|
||||
auto block = std::dynamic_pointer_cast<TransformerBlock>(blocks["blocks." + std::to_string(i)]);
|
||||
x = block->forward(ctx, x, encoder_hidden_states, embedded_timestep, temb, image_pe);
|
||||
sd::ggml_graph_cut::mark_graph_cut(x, "anima.blocks." + std::to_string(i), "x");
|
||||
@ -513,7 +546,7 @@ namespace Anima {
|
||||
|
||||
x = final_layer->forward(ctx, x, embedded_timestep, temb); // [N, h*w, ph*pw*C]
|
||||
|
||||
x = DiT::unpatchify_and_crop(ctx->ggml_ctx, x, H, W, patch_size, patch_size, false); // [N, C, H, W]
|
||||
x = DiT::unpatchify_and_crop(ctx->ggml_ctx, x, H, W, config.patch_size, config.patch_size, false); // [N, C, H, W]
|
||||
|
||||
return x;
|
||||
}
|
||||
@ -524,35 +557,16 @@ namespace Anima {
|
||||
std::vector<float> image_pe_vec;
|
||||
std::vector<float> adapter_q_pe_vec;
|
||||
std::vector<float> adapter_k_pe_vec;
|
||||
AnimaConfig config;
|
||||
AnimaNet net;
|
||||
|
||||
AnimaRunner(ggml_backend_t backend,
|
||||
ggml_backend_t params_backend,
|
||||
const String2TensorStorage& tensor_storage_map = {},
|
||||
const std::string prefix = "model.diffusion_model")
|
||||
: DiffusionModelRunner(backend, params_backend, prefix) {
|
||||
int64_t num_layers = 0;
|
||||
std::string layer_tag = prefix + ".net.blocks.";
|
||||
for (const auto& kv : tensor_storage_map) {
|
||||
const std::string& tensor_name = kv.first;
|
||||
size_t pos = tensor_name.find(layer_tag);
|
||||
if (pos == std::string::npos) {
|
||||
continue;
|
||||
}
|
||||
size_t start = pos + layer_tag.size();
|
||||
size_t end = tensor_name.find('.', start);
|
||||
if (end == std::string::npos) {
|
||||
continue;
|
||||
}
|
||||
int64_t layer_id = atoll(tensor_name.substr(start, end - start).c_str());
|
||||
num_layers = std::max(num_layers, layer_id + 1);
|
||||
}
|
||||
if (num_layers <= 0) {
|
||||
num_layers = 28;
|
||||
}
|
||||
LOG_INFO("anima net layers: %" PRId64, num_layers);
|
||||
|
||||
net = AnimaNet(num_layers);
|
||||
: DiffusionModelRunner(backend, params_backend, prefix),
|
||||
config(AnimaConfig::detect_from_weights(tensor_storage_map, prefix + ".net")) {
|
||||
net = AnimaNet(config);
|
||||
net.init(params_ctx, tensor_storage_map, prefix + ".net");
|
||||
}
|
||||
|
||||
@ -623,22 +637,22 @@ namespace Anima {
|
||||
GGML_ASSERT(x->ne[3] == 1);
|
||||
ggml_cgraph* gf = new_graph_custom(ANIMA_GRAPH_SIZE);
|
||||
|
||||
int64_t pad_h = (net.patch_size - x->ne[1] % net.patch_size) % net.patch_size;
|
||||
int64_t pad_w = (net.patch_size - x->ne[0] % net.patch_size) % net.patch_size;
|
||||
int64_t pad_h = (config.patch_size - x->ne[1] % config.patch_size) % config.patch_size;
|
||||
int64_t pad_w = (config.patch_size - x->ne[0] % config.patch_size) % config.patch_size;
|
||||
int64_t h_pad = x->ne[1] + pad_h;
|
||||
int64_t w_pad = x->ne[0] + pad_w;
|
||||
|
||||
image_pe_vec = gen_anima_image_pe_vec(1,
|
||||
static_cast<int>(h_pad),
|
||||
static_cast<int>(w_pad),
|
||||
static_cast<int>(net.patch_size),
|
||||
net.theta,
|
||||
net.axes_dim,
|
||||
static_cast<int>(config.patch_size),
|
||||
config.theta,
|
||||
config.axes_dim,
|
||||
4.0f,
|
||||
4.0f,
|
||||
1.0f);
|
||||
int64_t image_pos_len = static_cast<int64_t>(image_pe_vec.size()) / (2 * 2 * (net.head_dim / 2));
|
||||
auto image_pe = ggml_new_tensor_4d(compute_ctx, GGML_TYPE_F32, 2, 2, net.head_dim / 2, image_pos_len);
|
||||
int64_t image_pos_len = static_cast<int64_t>(image_pe_vec.size()) / (2 * 2 * (config.head_dim / 2));
|
||||
auto image_pe = ggml_new_tensor_4d(compute_ctx, GGML_TYPE_F32, 2, 2, config.head_dim / 2, image_pos_len);
|
||||
set_backend_tensor_data(image_pe, image_pe_vec.data());
|
||||
|
||||
ggml_tensor* adapter_q_pe = nullptr;
|
||||
@ -701,4 +715,4 @@ namespace Anima {
|
||||
};
|
||||
} // namespace Anima
|
||||
|
||||
#endif // __ANIMA_HPP__
|
||||
#endif // __SD_MODEL_DIFFUSION_ANIMA_HPP__
|
||||
@ -1,8 +1,8 @@
|
||||
#ifndef __CONTROL_HPP__
|
||||
#define __CONTROL_HPP__
|
||||
#ifndef __SD_MODEL_DIFFUSION_CONTROL_HPP__
|
||||
#define __SD_MODEL_DIFFUSION_CONTROL_HPP__
|
||||
|
||||
#include "common_block.hpp"
|
||||
#include "model.h"
|
||||
#include "model/common/block.hpp"
|
||||
|
||||
#define CONTROL_NET_GRAPH_SIZE 1536
|
||||
|
||||
@ -484,4 +484,4 @@ struct ControlNet : public GGMLRunner {
|
||||
}
|
||||
};
|
||||
|
||||
#endif // __CONTROL_HPP__
|
||||
#endif // __SD_MODEL_DIFFUSION_CONTROL_HPP__
|
||||
@ -1,7 +1,7 @@
|
||||
#ifndef __COMMON_DIT_HPP__
|
||||
#define __COMMON_DIT_HPP__
|
||||
#ifndef __SD_MODEL_DIFFUSION_DIT_HPP__
|
||||
#define __SD_MODEL_DIFFUSION_DIT_HPP__
|
||||
|
||||
#include "ggml_extend.hpp"
|
||||
#include "core/ggml_extend.hpp"
|
||||
|
||||
namespace DiT {
|
||||
inline ggml_tensor* patchify(ggml_context* ctx,
|
||||
@ -163,4 +163,4 @@ namespace DiT {
|
||||
}
|
||||
} // namespace DiT
|
||||
|
||||
#endif // __COMMON_DIT_HPP__
|
||||
#endif // __SD_MODEL_DIFFUSION_DIT_HPP__
|
||||
@ -1,18 +1,88 @@
|
||||
#ifndef __SD_ERNIE_IMAGE_HPP__
|
||||
#define __SD_ERNIE_IMAGE_HPP__
|
||||
#ifndef __SD_MODEL_DIFFUSION_ERNIE_IMAGE_HPP__
|
||||
#define __SD_MODEL_DIFFUSION_ERNIE_IMAGE_HPP__
|
||||
|
||||
#include <memory>
|
||||
#include <vector>
|
||||
|
||||
#include "common_dit.hpp"
|
||||
#include "diffusion_model.hpp"
|
||||
#include "flux.hpp"
|
||||
#include "qwen_image.hpp"
|
||||
#include "rope.hpp"
|
||||
#include "model/common/rope.hpp"
|
||||
#include "model/diffusion/dit.hpp"
|
||||
#include "model/diffusion/flux.hpp"
|
||||
#include "model/diffusion/model.hpp"
|
||||
#include "model/diffusion/qwen_image.hpp"
|
||||
|
||||
namespace ErnieImage {
|
||||
constexpr int ERNIE_IMAGE_GRAPH_SIZE = 40960;
|
||||
|
||||
struct ErnieImageConfig {
|
||||
int64_t hidden_size = 4096;
|
||||
int64_t num_heads = 32;
|
||||
int64_t num_layers = 36;
|
||||
int64_t ffn_hidden_size = 12288;
|
||||
int64_t in_channels = 128;
|
||||
int64_t out_channels = 128;
|
||||
int patch_size = 1;
|
||||
int64_t text_in_dim = 3072;
|
||||
int theta = 256;
|
||||
std::vector<int> axes_dim = {32, 48, 48};
|
||||
int axes_dim_sum = 128;
|
||||
float eps = 1e-6f;
|
||||
|
||||
static ErnieImageConfig detect_from_weights(const String2TensorStorage& tensor_storage_map, const std::string& prefix) {
|
||||
ErnieImageConfig config;
|
||||
config.num_layers = 0;
|
||||
int64_t detected_head_dim = 0;
|
||||
for (const auto& [name, tensor_storage] : tensor_storage_map) {
|
||||
if (!starts_with(name, prefix)) {
|
||||
continue;
|
||||
}
|
||||
if (ends_with(name, "x_embedder.proj.weight") && tensor_storage.n_dims == 4) {
|
||||
config.patch_size = static_cast<int>(tensor_storage.ne[0]);
|
||||
config.in_channels = tensor_storage.ne[2];
|
||||
config.hidden_size = tensor_storage.ne[3];
|
||||
} else if (ends_with(name, "text_proj.weight") && tensor_storage.n_dims == 2) {
|
||||
config.text_in_dim = tensor_storage.ne[0];
|
||||
} else if (ends_with(name, "layers.0.self_attention.norm_q.weight")) {
|
||||
detected_head_dim = tensor_storage.ne[0];
|
||||
} else if (ends_with(name, "layers.0.mlp.gate_proj.weight") && tensor_storage.n_dims == 2) {
|
||||
config.ffn_hidden_size = tensor_storage.ne[1];
|
||||
} else if (ends_with(name, "final_linear.weight") && tensor_storage.n_dims == 2) {
|
||||
int64_t out_dim = tensor_storage.ne[1];
|
||||
int64_t patch_area = config.patch_size * config.patch_size;
|
||||
config.out_channels = out_dim / patch_area;
|
||||
}
|
||||
|
||||
size_t pos = name.find("layers.");
|
||||
if (pos != std::string::npos) {
|
||||
auto items = split_string(name.substr(pos), '.');
|
||||
if (items.size() > 1) {
|
||||
int block_index = atoi(items[1].c_str());
|
||||
if (block_index + 1 > config.num_layers) {
|
||||
config.num_layers = block_index + 1;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
if (config.num_layers == 0) {
|
||||
config.num_layers = 36;
|
||||
}
|
||||
if (detected_head_dim > 0) {
|
||||
config.num_heads = config.hidden_size / detected_head_dim;
|
||||
}
|
||||
config.axes_dim_sum = 0;
|
||||
for (int axis_dim : config.axes_dim) {
|
||||
config.axes_dim_sum += axis_dim;
|
||||
}
|
||||
LOG_DEBUG("ernie_image: num_layers = %" PRId64 ", hidden_size = %" PRId64 ", num_heads = %" PRId64 ", ffn_hidden_size = %" PRId64 ", in_channels = %" PRId64 ", out_channels = %" PRId64,
|
||||
config.num_layers,
|
||||
config.hidden_size,
|
||||
config.num_heads,
|
||||
config.ffn_hidden_size,
|
||||
config.in_channels,
|
||||
config.out_channels);
|
||||
return config;
|
||||
}
|
||||
};
|
||||
|
||||
__STATIC_INLINE__ ggml_tensor* timestep_embedding_sin_cos(ggml_context* ctx,
|
||||
ggml_tensor* timesteps,
|
||||
int dim,
|
||||
@ -208,51 +278,36 @@ namespace ErnieImage {
|
||||
}
|
||||
};
|
||||
|
||||
struct ErnieImageParams {
|
||||
int64_t hidden_size = 4096;
|
||||
int64_t num_heads = 32;
|
||||
int64_t num_layers = 36;
|
||||
int64_t ffn_hidden_size = 12288;
|
||||
int64_t in_channels = 128;
|
||||
int64_t out_channels = 128;
|
||||
int patch_size = 1;
|
||||
int64_t text_in_dim = 3072;
|
||||
int theta = 256;
|
||||
std::vector<int> axes_dim = {32, 48, 48};
|
||||
int axes_dim_sum = 128;
|
||||
float eps = 1e-6f;
|
||||
};
|
||||
|
||||
class ErnieImageModel : public GGMLBlock {
|
||||
public:
|
||||
ErnieImageParams params;
|
||||
ErnieImageConfig config;
|
||||
|
||||
ErnieImageModel() = default;
|
||||
ErnieImageModel(ErnieImageParams params)
|
||||
: params(params) {
|
||||
blocks["x_embedder.proj"] = std::make_shared<Conv2d>(params.in_channels,
|
||||
params.hidden_size,
|
||||
std::pair<int, int>{params.patch_size, params.patch_size},
|
||||
std::pair<int, int>{params.patch_size, params.patch_size},
|
||||
ErnieImageModel(ErnieImageConfig config)
|
||||
: config(config) {
|
||||
blocks["x_embedder.proj"] = std::make_shared<Conv2d>(config.in_channels,
|
||||
config.hidden_size,
|
||||
std::pair<int, int>{config.patch_size, config.patch_size},
|
||||
std::pair<int, int>{config.patch_size, config.patch_size},
|
||||
std::pair<int, int>{0, 0},
|
||||
std::pair<int, int>{1, 1},
|
||||
true);
|
||||
if (params.text_in_dim != params.hidden_size) {
|
||||
blocks["text_proj"] = std::make_shared<Linear>(params.text_in_dim, params.hidden_size, false);
|
||||
if (config.text_in_dim != config.hidden_size) {
|
||||
blocks["text_proj"] = std::make_shared<Linear>(config.text_in_dim, config.hidden_size, false);
|
||||
}
|
||||
blocks["time_embedding"] = std::make_shared<Qwen::TimestepEmbedding>(params.hidden_size, params.hidden_size);
|
||||
blocks["adaLN_modulation.1"] = std::make_shared<Linear>(params.hidden_size, 6 * params.hidden_size, true);
|
||||
blocks["time_embedding"] = std::make_shared<Qwen::TimestepEmbedding>(config.hidden_size, config.hidden_size);
|
||||
blocks["adaLN_modulation.1"] = std::make_shared<Linear>(config.hidden_size, 6 * config.hidden_size, true);
|
||||
|
||||
for (int i = 0; i < params.num_layers; i++) {
|
||||
blocks["layers." + std::to_string(i)] = std::make_shared<ErnieImageSharedAdaLNBlock>(params.hidden_size,
|
||||
params.num_heads,
|
||||
params.ffn_hidden_size,
|
||||
params.eps);
|
||||
for (int i = 0; i < config.num_layers; i++) {
|
||||
blocks["layers." + std::to_string(i)] = std::make_shared<ErnieImageSharedAdaLNBlock>(config.hidden_size,
|
||||
config.num_heads,
|
||||
config.ffn_hidden_size,
|
||||
config.eps);
|
||||
}
|
||||
|
||||
blocks["final_norm"] = std::make_shared<ErnieImageAdaLNContinuous>(params.hidden_size, params.eps);
|
||||
blocks["final_linear"] = std::make_shared<Linear>(params.hidden_size,
|
||||
params.patch_size * params.patch_size * params.out_channels,
|
||||
blocks["final_norm"] = std::make_shared<ErnieImageAdaLNContinuous>(config.hidden_size, config.eps);
|
||||
blocks["final_linear"] = std::make_shared<Linear>(config.hidden_size,
|
||||
config.patch_size * config.patch_size * config.out_channels,
|
||||
true);
|
||||
}
|
||||
|
||||
@ -265,12 +320,12 @@ namespace ErnieImage {
|
||||
// context: [N, text_tokens, 3072]
|
||||
// pe: [image_tokens + text_tokens, head_dim/2, 2, 2]
|
||||
GGML_ASSERT(context != nullptr);
|
||||
GGML_ASSERT(x->ne[1] % params.patch_size == 0 && x->ne[0] % params.patch_size == 0);
|
||||
GGML_ASSERT(x->ne[1] % config.patch_size == 0 && x->ne[0] % config.patch_size == 0);
|
||||
|
||||
int64_t W = x->ne[0];
|
||||
int64_t H = x->ne[1];
|
||||
int64_t Hp = H / params.patch_size;
|
||||
int64_t Wp = W / params.patch_size;
|
||||
int64_t Hp = H / config.patch_size;
|
||||
int64_t Wp = W / config.patch_size;
|
||||
int64_t n_img = Hp * Wp;
|
||||
int64_t N = x->ne[3];
|
||||
|
||||
@ -292,7 +347,7 @@ namespace ErnieImage {
|
||||
|
||||
auto hidden_states = ggml_concat(ctx->ggml_ctx, img, txt, 1); // [N, image_tokens + text_tokens, hidden_size]
|
||||
|
||||
auto sample = timestep_embedding_sin_cos(ctx->ggml_ctx, timestep, static_cast<int>(params.hidden_size));
|
||||
auto sample = timestep_embedding_sin_cos(ctx->ggml_ctx, timestep, static_cast<int>(config.hidden_size));
|
||||
auto c = time_embedding->forward(ctx, sample); // [N, hidden_size]
|
||||
|
||||
auto mod_params = adaLN_mod->forward(ctx, ggml_silu(ctx->ggml_ctx, c)); // [N, 6 * hidden_size]
|
||||
@ -305,7 +360,7 @@ namespace ErnieImage {
|
||||
temb.push_back(ggml_reshape_3d(ctx->ggml_ctx, chunk, chunk->ne[0], 1, chunk->ne[1])); // [N, 1, hidden_size]
|
||||
}
|
||||
|
||||
for (int i = 0; i < params.num_layers; i++) {
|
||||
for (int i = 0; i < config.num_layers; i++) {
|
||||
auto layer = std::dynamic_pointer_cast<ErnieImageSharedAdaLNBlock>(blocks["layers." + std::to_string(i)]);
|
||||
hidden_states = layer->forward(ctx, hidden_states, pe, temb);
|
||||
sd::ggml_graph_cut::mark_graph_cut(hidden_states, "ernie_image.layers." + std::to_string(i), "hidden_states");
|
||||
@ -319,15 +374,15 @@ namespace ErnieImage {
|
||||
patches,
|
||||
Hp,
|
||||
Wp,
|
||||
params.patch_size,
|
||||
params.patch_size,
|
||||
config.patch_size,
|
||||
config.patch_size,
|
||||
false); // [N, out_channels, H, W]
|
||||
return out;
|
||||
}
|
||||
};
|
||||
|
||||
struct ErnieImageRunner : public DiffusionModelRunner {
|
||||
ErnieImageParams ernie_params;
|
||||
ErnieImageConfig config;
|
||||
ErnieImageModel ernie_image;
|
||||
std::vector<float> pe_vec;
|
||||
|
||||
@ -335,58 +390,9 @@ namespace ErnieImage {
|
||||
ggml_backend_t params_backend,
|
||||
const String2TensorStorage& tensor_storage_map = {},
|
||||
const std::string prefix = "")
|
||||
: DiffusionModelRunner(backend, params_backend, prefix) {
|
||||
ernie_params.num_layers = 0;
|
||||
for (const auto& [name, tensor_storage] : tensor_storage_map) {
|
||||
if (!starts_with(name, prefix)) {
|
||||
continue;
|
||||
}
|
||||
if (ends_with(name, "x_embedder.proj.weight") && tensor_storage.n_dims == 4) {
|
||||
ernie_params.patch_size = static_cast<int>(tensor_storage.ne[0]);
|
||||
ernie_params.in_channels = tensor_storage.ne[2];
|
||||
ernie_params.hidden_size = tensor_storage.ne[3];
|
||||
} else if (ends_with(name, "text_proj.weight") && tensor_storage.n_dims == 2) {
|
||||
ernie_params.text_in_dim = tensor_storage.ne[0];
|
||||
} else if (ends_with(name, "layers.0.self_attention.norm_q.weight")) {
|
||||
int64_t head_dim = tensor_storage.ne[0];
|
||||
ernie_params.num_heads = ernie_params.hidden_size / head_dim;
|
||||
} else if (ends_with(name, "layers.0.mlp.gate_proj.weight") && tensor_storage.n_dims == 2) {
|
||||
ernie_params.ffn_hidden_size = tensor_storage.ne[1];
|
||||
} else if (ends_with(name, "final_linear.weight") && tensor_storage.n_dims == 2) {
|
||||
int64_t out_dim = tensor_storage.ne[1];
|
||||
ernie_params.out_channels = out_dim / ernie_params.patch_size / ernie_params.patch_size;
|
||||
}
|
||||
|
||||
size_t pos = name.find("layers.");
|
||||
if (pos != std::string::npos) {
|
||||
std::string layer_name = name.substr(pos);
|
||||
auto items = split_string(layer_name, '.');
|
||||
if (items.size() > 1) {
|
||||
int block_index = atoi(items[1].c_str());
|
||||
if (block_index + 1 > ernie_params.num_layers) {
|
||||
ernie_params.num_layers = block_index + 1;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
if (ernie_params.num_layers == 0) {
|
||||
ernie_params.num_layers = 36;
|
||||
}
|
||||
ernie_params.axes_dim_sum = 0;
|
||||
for (int axis_dim : ernie_params.axes_dim) {
|
||||
ernie_params.axes_dim_sum += axis_dim;
|
||||
}
|
||||
|
||||
LOG_INFO("ernie_image: layers = %" PRId64 ", hidden_size = %" PRId64 ", heads = %" PRId64
|
||||
", ffn_hidden_size = %" PRId64 ", in_channels = %" PRId64 ", out_channels = %" PRId64,
|
||||
ernie_params.num_layers,
|
||||
ernie_params.hidden_size,
|
||||
ernie_params.num_heads,
|
||||
ernie_params.ffn_hidden_size,
|
||||
ernie_params.in_channels,
|
||||
ernie_params.out_channels);
|
||||
|
||||
ernie_image = ErnieImageModel(ernie_params);
|
||||
: DiffusionModelRunner(backend, params_backend, prefix),
|
||||
config(ErnieImageConfig::detect_from_weights(tensor_storage_map, prefix)) {
|
||||
ernie_image = ErnieImageModel(config);
|
||||
ernie_image.init(params_ctx, tensor_storage_map, prefix);
|
||||
}
|
||||
|
||||
@ -410,15 +416,15 @@ namespace ErnieImage {
|
||||
|
||||
pe_vec = Rope::gen_ernie_image_pe(static_cast<int>(x->ne[1]),
|
||||
static_cast<int>(x->ne[0]),
|
||||
ernie_params.patch_size,
|
||||
config.patch_size,
|
||||
static_cast<int>(x->ne[3]),
|
||||
static_cast<int>(context->ne[1]),
|
||||
ernie_params.theta,
|
||||
config.theta,
|
||||
circular_y_enabled,
|
||||
circular_x_enabled,
|
||||
ernie_params.axes_dim);
|
||||
int pos_len = static_cast<int>(pe_vec.size() / ernie_params.axes_dim_sum / 2);
|
||||
auto pe = ggml_new_tensor_4d(compute_ctx, GGML_TYPE_F32, ernie_params.axes_dim_sum, 1, pos_len, 2);
|
||||
config.axes_dim);
|
||||
int pos_len = static_cast<int>(pe_vec.size() / config.axes_dim_sum / 2);
|
||||
auto pe = ggml_new_tensor_4d(compute_ctx, GGML_TYPE_F32, config.axes_dim_sum, 1, pos_len, 2);
|
||||
set_backend_tensor_data(pe, pe_vec.data());
|
||||
|
||||
auto runner_ctx = get_context();
|
||||
@ -449,4 +455,4 @@ namespace ErnieImage {
|
||||
};
|
||||
} // namespace ErnieImage
|
||||
|
||||
#endif // __SD_ERNIE_IMAGE_HPP__
|
||||
#endif // __SD_MODEL_DIFFUSION_ERNIE_IMAGE_HPP__
|
||||
@ -1,18 +1,167 @@
|
||||
#ifndef __FLUX_HPP__
|
||||
#define __FLUX_HPP__
|
||||
#ifndef __SD_MODEL_DIFFUSION_FLUX_HPP__
|
||||
#define __SD_MODEL_DIFFUSION_FLUX_HPP__
|
||||
|
||||
#include <memory>
|
||||
#include <vector>
|
||||
|
||||
#include "common_dit.hpp"
|
||||
#include "diffusion_model.hpp"
|
||||
#include "model.h"
|
||||
#include "rope.hpp"
|
||||
#include "model/common/rope.hpp"
|
||||
#include "model/diffusion/dit.hpp"
|
||||
#include "model/diffusion/model.hpp"
|
||||
|
||||
#define FLUX_GRAPH_SIZE 10240
|
||||
|
||||
namespace Flux {
|
||||
|
||||
struct ChromaRadianceConfig {
|
||||
int64_t nerf_hidden_size = 64;
|
||||
int nerf_mlp_ratio = 4;
|
||||
int nerf_depth = 4;
|
||||
int nerf_max_freqs = 8;
|
||||
bool use_x0 = false;
|
||||
bool fake_patch_size_x2 = false;
|
||||
};
|
||||
|
||||
struct FluxConfig {
|
||||
SDVersion version = VERSION_FLUX;
|
||||
bool is_chroma = false;
|
||||
int patch_size = 2;
|
||||
int64_t in_channels = 64;
|
||||
int64_t out_channels = 64;
|
||||
int64_t vec_in_dim = 768;
|
||||
int64_t context_in_dim = 4096;
|
||||
int64_t hidden_size = 3072;
|
||||
float mlp_ratio = 4.0f;
|
||||
int num_heads = 24;
|
||||
int depth = 19;
|
||||
int depth_single_blocks = 38;
|
||||
std::vector<int> axes_dim = {16, 56, 56};
|
||||
int axes_dim_sum = 128;
|
||||
int theta = 10000;
|
||||
bool qkv_bias = true;
|
||||
bool guidance_embed = true;
|
||||
int64_t in_dim = 64;
|
||||
bool disable_bias = false;
|
||||
bool share_modulation = false;
|
||||
bool semantic_txt_norm = false;
|
||||
bool use_yak_mlp = false;
|
||||
bool use_mlp_silu_act = false;
|
||||
float ref_index_scale = 1.f;
|
||||
ChromaRadianceConfig chroma_radiance_params;
|
||||
|
||||
static FluxConfig detect_from_weights(const String2TensorStorage& tensor_storage_map,
|
||||
const std::string& prefix,
|
||||
SDVersion version = VERSION_FLUX) {
|
||||
FluxConfig config;
|
||||
config.version = version;
|
||||
config.guidance_embed = false;
|
||||
config.depth = 0;
|
||||
config.depth_single_blocks = 0;
|
||||
if (version == VERSION_FLUX_FILL) {
|
||||
config.in_channels = 384;
|
||||
} else if (version == VERSION_FLUX_CONTROLS) {
|
||||
config.in_channels = 128;
|
||||
} else if (version == VERSION_FLEX_2) {
|
||||
config.in_channels = 196;
|
||||
} else if (version == VERSION_CHROMA_RADIANCE) {
|
||||
config.in_channels = 3;
|
||||
config.patch_size = 16;
|
||||
} else if (version == VERSION_OVIS_IMAGE) {
|
||||
config.semantic_txt_norm = true;
|
||||
config.use_yak_mlp = true;
|
||||
config.vec_in_dim = 0;
|
||||
} else if (sd_version_is_flux2(version)) {
|
||||
config.in_channels = 128;
|
||||
config.patch_size = 1;
|
||||
config.out_channels = 128;
|
||||
config.mlp_ratio = 3.f;
|
||||
config.theta = 2000;
|
||||
config.axes_dim = {32, 32, 32, 32};
|
||||
config.vec_in_dim = 0;
|
||||
config.qkv_bias = false;
|
||||
config.disable_bias = true;
|
||||
config.share_modulation = true;
|
||||
config.ref_index_scale = 10.f;
|
||||
config.use_mlp_silu_act = true;
|
||||
} else if (sd_version_is_longcat(version)) {
|
||||
config.context_in_dim = 3584;
|
||||
config.vec_in_dim = 0;
|
||||
}
|
||||
|
||||
int64_t head_dim = 0;
|
||||
int64_t actual_radiance_patch_size = -1;
|
||||
for (const auto& [name, tensor_storage] : tensor_storage_map) {
|
||||
if (!starts_with(name, prefix)) {
|
||||
continue;
|
||||
}
|
||||
if (name.find("guidance_in.in_layer.weight") != std::string::npos) {
|
||||
config.guidance_embed = true;
|
||||
}
|
||||
if (name.find("__x0__") != std::string::npos) {
|
||||
LOG_DEBUG("using x0 prediction");
|
||||
config.chroma_radiance_params.use_x0 = true;
|
||||
}
|
||||
if (name.find("__32x32__") != std::string::npos) {
|
||||
LOG_DEBUG("using patch size 32");
|
||||
config.patch_size = 32;
|
||||
}
|
||||
if (name.find("img_in_patch.weight") != std::string::npos) {
|
||||
actual_radiance_patch_size = tensor_storage.ne[0];
|
||||
LOG_DEBUG("actual radiance patch size: %" PRId64, actual_radiance_patch_size);
|
||||
}
|
||||
if (name.find("distilled_guidance_layer.in_proj.weight") != std::string::npos) {
|
||||
config.is_chroma = true;
|
||||
}
|
||||
size_t db = name.find("double_blocks.");
|
||||
if (db != std::string::npos) {
|
||||
std::string block_name = name.substr(db);
|
||||
int block_depth = atoi(block_name.substr(14, block_name.find(".", 14)).c_str());
|
||||
if (block_depth + 1 > config.depth) {
|
||||
config.depth = block_depth + 1;
|
||||
}
|
||||
}
|
||||
size_t sb = name.find("single_blocks.");
|
||||
if (sb != std::string::npos) {
|
||||
std::string block_name = name.substr(sb);
|
||||
int block_depth = atoi(block_name.substr(14, block_name.find(".", 14)).c_str());
|
||||
if (block_depth + 1 > config.depth_single_blocks) {
|
||||
config.depth_single_blocks = block_depth + 1;
|
||||
}
|
||||
}
|
||||
if (ends_with(name, "txt_in.weight")) {
|
||||
config.context_in_dim = tensor_storage.ne[0];
|
||||
config.hidden_size = tensor_storage.ne[1];
|
||||
}
|
||||
if (ends_with(name, "single_blocks.0.norm.key_norm.scale")) {
|
||||
head_dim = tensor_storage.ne[0];
|
||||
}
|
||||
if (ends_with(name, "double_blocks.0.txt_attn.norm.key_norm.scale")) {
|
||||
head_dim = tensor_storage.ne[0];
|
||||
}
|
||||
}
|
||||
if (actual_radiance_patch_size > 0 && actual_radiance_patch_size != config.patch_size) {
|
||||
GGML_ASSERT(config.patch_size == 2 * actual_radiance_patch_size);
|
||||
LOG_DEBUG("using fake x2 patch size");
|
||||
config.chroma_radiance_params.fake_patch_size_x2 = true;
|
||||
}
|
||||
if (head_dim > 0) {
|
||||
config.num_heads = static_cast<int>(config.hidden_size / head_dim);
|
||||
}
|
||||
config.axes_dim_sum = 0;
|
||||
for (int axis_dim : config.axes_dim) {
|
||||
config.axes_dim_sum += axis_dim;
|
||||
}
|
||||
LOG_DEBUG("flux: depth = %d, depth_single_blocks = %d, guidance_embed = %s, context_in_dim = %" PRId64 ", hidden_size = %" PRId64 ", num_heads = %d",
|
||||
config.depth,
|
||||
config.depth_single_blocks,
|
||||
config.guidance_embed ? "true" : "false",
|
||||
config.context_in_dim,
|
||||
config.hidden_size,
|
||||
config.num_heads);
|
||||
return config;
|
||||
}
|
||||
};
|
||||
|
||||
struct MLPEmbedder : public UnaryBlock {
|
||||
public:
|
||||
MLPEmbedder(int64_t in_dim, int64_t hidden_dim, bool bias = true) {
|
||||
@ -723,127 +872,90 @@ namespace Flux {
|
||||
}
|
||||
};
|
||||
|
||||
struct ChromaRadianceParams {
|
||||
int64_t nerf_hidden_size = 64;
|
||||
int nerf_mlp_ratio = 4;
|
||||
int nerf_depth = 4;
|
||||
int nerf_max_freqs = 8;
|
||||
bool use_x0 = false;
|
||||
bool fake_patch_size_x2 = false;
|
||||
};
|
||||
|
||||
struct FluxParams {
|
||||
SDVersion version = VERSION_FLUX;
|
||||
bool is_chroma = false;
|
||||
int patch_size = 2;
|
||||
int64_t in_channels = 64;
|
||||
int64_t out_channels = 64;
|
||||
int64_t vec_in_dim = 768;
|
||||
int64_t context_in_dim = 4096;
|
||||
int64_t hidden_size = 3072;
|
||||
float mlp_ratio = 4.0f;
|
||||
int num_heads = 24;
|
||||
int depth = 19;
|
||||
int depth_single_blocks = 38;
|
||||
std::vector<int> axes_dim = {16, 56, 56};
|
||||
int axes_dim_sum = 128;
|
||||
int theta = 10000;
|
||||
bool qkv_bias = true;
|
||||
bool guidance_embed = true;
|
||||
int64_t in_dim = 64;
|
||||
bool disable_bias = false;
|
||||
bool share_modulation = false;
|
||||
bool semantic_txt_norm = false;
|
||||
bool use_yak_mlp = false;
|
||||
bool use_mlp_silu_act = false;
|
||||
float ref_index_scale = 1.f;
|
||||
ChromaRadianceParams chroma_radiance_params;
|
||||
};
|
||||
|
||||
struct Flux : public GGMLBlock {
|
||||
public:
|
||||
FluxParams params;
|
||||
FluxConfig config;
|
||||
Flux() {}
|
||||
Flux(FluxParams params)
|
||||
: params(params) {
|
||||
if (params.version == VERSION_CHROMA_RADIANCE) {
|
||||
std::pair<int, int> kernel_size = {params.patch_size, params.patch_size};
|
||||
if (params.chroma_radiance_params.fake_patch_size_x2) {
|
||||
kernel_size = {params.patch_size / 2, params.patch_size / 2};
|
||||
Flux(FluxConfig config)
|
||||
: config(config) {
|
||||
if (config.version == VERSION_CHROMA_RADIANCE) {
|
||||
std::pair<int, int> kernel_size = {config.patch_size, config.patch_size};
|
||||
if (config.chroma_radiance_params.fake_patch_size_x2) {
|
||||
kernel_size = {config.patch_size / 2, config.patch_size / 2};
|
||||
}
|
||||
std::pair<int, int> stride = kernel_size;
|
||||
|
||||
blocks["img_in_patch"] = std::make_shared<Conv2d>(params.in_channels,
|
||||
params.hidden_size,
|
||||
blocks["img_in_patch"] = std::make_shared<Conv2d>(config.in_channels,
|
||||
config.hidden_size,
|
||||
kernel_size,
|
||||
stride);
|
||||
} else {
|
||||
blocks["img_in"] = std::make_shared<Linear>(params.in_channels, params.hidden_size, !params.disable_bias);
|
||||
blocks["img_in"] = std::make_shared<Linear>(config.in_channels, config.hidden_size, !config.disable_bias);
|
||||
}
|
||||
if (params.is_chroma) {
|
||||
blocks["distilled_guidance_layer"] = std::make_shared<ChromaApproximator>(params.in_dim, params.hidden_size);
|
||||
if (config.is_chroma) {
|
||||
blocks["distilled_guidance_layer"] = std::make_shared<ChromaApproximator>(config.in_dim, config.hidden_size);
|
||||
} else {
|
||||
blocks["time_in"] = std::make_shared<MLPEmbedder>(256, params.hidden_size, !params.disable_bias);
|
||||
if (params.vec_in_dim > 0) {
|
||||
blocks["vector_in"] = std::make_shared<MLPEmbedder>(params.vec_in_dim, params.hidden_size, !params.disable_bias);
|
||||
blocks["time_in"] = std::make_shared<MLPEmbedder>(256, config.hidden_size, !config.disable_bias);
|
||||
if (config.vec_in_dim > 0) {
|
||||
blocks["vector_in"] = std::make_shared<MLPEmbedder>(config.vec_in_dim, config.hidden_size, !config.disable_bias);
|
||||
}
|
||||
if (params.guidance_embed) {
|
||||
blocks["guidance_in"] = std::make_shared<MLPEmbedder>(256, params.hidden_size, !params.disable_bias);
|
||||
if (config.guidance_embed) {
|
||||
blocks["guidance_in"] = std::make_shared<MLPEmbedder>(256, config.hidden_size, !config.disable_bias);
|
||||
}
|
||||
}
|
||||
if (params.semantic_txt_norm) {
|
||||
blocks["txt_norm"] = std::make_shared<RMSNorm>(params.context_in_dim);
|
||||
if (config.semantic_txt_norm) {
|
||||
blocks["txt_norm"] = std::make_shared<RMSNorm>(config.context_in_dim);
|
||||
}
|
||||
blocks["txt_in"] = std::make_shared<Linear>(params.context_in_dim, params.hidden_size, !params.disable_bias);
|
||||
blocks["txt_in"] = std::make_shared<Linear>(config.context_in_dim, config.hidden_size, !config.disable_bias);
|
||||
|
||||
for (int i = 0; i < params.depth; i++) {
|
||||
blocks["double_blocks." + std::to_string(i)] = std::make_shared<DoubleStreamBlock>(params.hidden_size,
|
||||
params.num_heads,
|
||||
params.mlp_ratio,
|
||||
for (int i = 0; i < config.depth; i++) {
|
||||
blocks["double_blocks." + std::to_string(i)] = std::make_shared<DoubleStreamBlock>(config.hidden_size,
|
||||
config.num_heads,
|
||||
config.mlp_ratio,
|
||||
i,
|
||||
params.qkv_bias,
|
||||
params.is_chroma,
|
||||
params.share_modulation,
|
||||
!params.disable_bias,
|
||||
params.use_yak_mlp,
|
||||
params.use_mlp_silu_act);
|
||||
config.qkv_bias,
|
||||
config.is_chroma,
|
||||
config.share_modulation,
|
||||
!config.disable_bias,
|
||||
config.use_yak_mlp,
|
||||
config.use_mlp_silu_act);
|
||||
}
|
||||
|
||||
for (int i = 0; i < params.depth_single_blocks; i++) {
|
||||
blocks["single_blocks." + std::to_string(i)] = std::make_shared<SingleStreamBlock>(params.hidden_size,
|
||||
params.num_heads,
|
||||
params.mlp_ratio,
|
||||
for (int i = 0; i < config.depth_single_blocks; i++) {
|
||||
blocks["single_blocks." + std::to_string(i)] = std::make_shared<SingleStreamBlock>(config.hidden_size,
|
||||
config.num_heads,
|
||||
config.mlp_ratio,
|
||||
i,
|
||||
0.f,
|
||||
params.is_chroma,
|
||||
params.share_modulation,
|
||||
!params.disable_bias,
|
||||
params.use_yak_mlp,
|
||||
params.use_mlp_silu_act);
|
||||
config.is_chroma,
|
||||
config.share_modulation,
|
||||
!config.disable_bias,
|
||||
config.use_yak_mlp,
|
||||
config.use_mlp_silu_act);
|
||||
}
|
||||
|
||||
if (params.version == VERSION_CHROMA_RADIANCE) {
|
||||
blocks["nerf_image_embedder"] = std::make_shared<NerfEmbedder>(params.in_channels,
|
||||
params.chroma_radiance_params.nerf_hidden_size,
|
||||
params.chroma_radiance_params.nerf_max_freqs);
|
||||
if (config.version == VERSION_CHROMA_RADIANCE) {
|
||||
blocks["nerf_image_embedder"] = std::make_shared<NerfEmbedder>(config.in_channels,
|
||||
config.chroma_radiance_params.nerf_hidden_size,
|
||||
config.chroma_radiance_params.nerf_max_freqs);
|
||||
|
||||
for (int i = 0; i < params.chroma_radiance_params.nerf_depth; i++) {
|
||||
blocks["nerf_blocks." + std::to_string(i)] = std::make_shared<NerfGLUBlock>(params.hidden_size,
|
||||
params.chroma_radiance_params.nerf_hidden_size,
|
||||
params.chroma_radiance_params.nerf_mlp_ratio);
|
||||
for (int i = 0; i < config.chroma_radiance_params.nerf_depth; i++) {
|
||||
blocks["nerf_blocks." + std::to_string(i)] = std::make_shared<NerfGLUBlock>(config.hidden_size,
|
||||
config.chroma_radiance_params.nerf_hidden_size,
|
||||
config.chroma_radiance_params.nerf_mlp_ratio);
|
||||
}
|
||||
|
||||
blocks["nerf_final_layer_conv"] = std::make_shared<NerfFinalLayerConv>(params.chroma_radiance_params.nerf_hidden_size,
|
||||
params.in_channels);
|
||||
blocks["nerf_final_layer_conv"] = std::make_shared<NerfFinalLayerConv>(config.chroma_radiance_params.nerf_hidden_size,
|
||||
config.in_channels);
|
||||
|
||||
} else {
|
||||
blocks["final_layer"] = std::make_shared<LastLayer>(params.hidden_size, 1, params.out_channels, params.is_chroma, !params.disable_bias);
|
||||
blocks["final_layer"] = std::make_shared<LastLayer>(config.hidden_size, 1, config.out_channels, config.is_chroma, !config.disable_bias);
|
||||
}
|
||||
|
||||
if (params.share_modulation) {
|
||||
blocks["double_stream_modulation_img"] = std::make_shared<Modulation>(params.hidden_size, true, !params.disable_bias);
|
||||
blocks["double_stream_modulation_txt"] = std::make_shared<Modulation>(params.hidden_size, true, !params.disable_bias);
|
||||
blocks["single_stream_modulation"] = std::make_shared<Modulation>(params.hidden_size, false, !params.disable_bias);
|
||||
if (config.share_modulation) {
|
||||
blocks["double_stream_modulation_img"] = std::make_shared<Modulation>(config.hidden_size, true, !config.disable_bias);
|
||||
blocks["double_stream_modulation_txt"] = std::make_shared<Modulation>(config.hidden_size, true, !config.disable_bias);
|
||||
blocks["single_stream_modulation"] = std::make_shared<Modulation>(config.hidden_size, false, !config.disable_bias);
|
||||
}
|
||||
}
|
||||
|
||||
@ -866,7 +978,7 @@ namespace Flux {
|
||||
|
||||
ggml_tensor* vec;
|
||||
ggml_tensor* txt_img_mask = nullptr;
|
||||
if (params.is_chroma) {
|
||||
if (config.is_chroma) {
|
||||
int64_t mod_index_length = 344;
|
||||
auto approx = std::dynamic_pointer_cast<ChromaApproximator>(blocks["distilled_guidance_layer"]);
|
||||
auto distill_timestep = ggml_ext_timestep_embedding(ctx->ggml_ctx, timesteps, 16, 10000, 1000.f);
|
||||
@ -894,7 +1006,7 @@ namespace Flux {
|
||||
} else {
|
||||
auto time_in = std::dynamic_pointer_cast<MLPEmbedder>(blocks["time_in"]);
|
||||
vec = time_in->forward(ctx, ggml_ext_timestep_embedding(ctx->ggml_ctx, timesteps, 256, 10000, 1000.f));
|
||||
if (params.guidance_embed) {
|
||||
if (config.guidance_embed) {
|
||||
GGML_ASSERT(guidance != nullptr);
|
||||
auto guidance_in = std::dynamic_pointer_cast<MLPEmbedder>(blocks["guidance_in"]);
|
||||
// bf16 and fp16 result is different
|
||||
@ -902,7 +1014,7 @@ namespace Flux {
|
||||
vec = ggml_add(ctx->ggml_ctx, vec, guidance_in->forward(ctx, g_in));
|
||||
}
|
||||
|
||||
if (params.vec_in_dim > 0) {
|
||||
if (config.vec_in_dim > 0) {
|
||||
auto vector_in = std::dynamic_pointer_cast<MLPEmbedder>(blocks["vector_in"]);
|
||||
vec = ggml_add(ctx->ggml_ctx, vec, vector_in->forward(ctx, y));
|
||||
}
|
||||
@ -911,7 +1023,7 @@ namespace Flux {
|
||||
std::vector<ModulationOut> ds_img_mods;
|
||||
std::vector<ModulationOut> ds_txt_mods;
|
||||
std::vector<ModulationOut> ss_mods;
|
||||
if (params.share_modulation) {
|
||||
if (config.share_modulation) {
|
||||
auto double_stream_modulation_img = std::dynamic_pointer_cast<Modulation>(blocks["double_stream_modulation_img"]);
|
||||
auto double_stream_modulation_txt = std::dynamic_pointer_cast<Modulation>(blocks["double_stream_modulation_txt"]);
|
||||
auto single_stream_modulation = std::dynamic_pointer_cast<Modulation>(blocks["single_stream_modulation"]);
|
||||
@ -921,7 +1033,7 @@ namespace Flux {
|
||||
ss_mods = single_stream_modulation->forward(ctx, vec);
|
||||
}
|
||||
|
||||
if (params.semantic_txt_norm) {
|
||||
if (config.semantic_txt_norm) {
|
||||
auto semantic_txt_norm = std::dynamic_pointer_cast<RMSNorm>(blocks["txt_norm"]);
|
||||
|
||||
txt = semantic_txt_norm->forward(ctx, txt);
|
||||
@ -932,7 +1044,7 @@ namespace Flux {
|
||||
sd::ggml_graph_cut::mark_graph_cut(txt, "flux.prelude", "txt");
|
||||
sd::ggml_graph_cut::mark_graph_cut(vec, "flux.prelude", "vec");
|
||||
|
||||
for (int i = 0; i < params.depth; i++) {
|
||||
for (int i = 0; i < config.depth; i++) {
|
||||
if (skip_layers.size() > 0 && std::find(skip_layers.begin(), skip_layers.end(), i) != skip_layers.end()) {
|
||||
continue;
|
||||
}
|
||||
@ -947,8 +1059,8 @@ namespace Flux {
|
||||
}
|
||||
|
||||
auto txt_img = ggml_concat(ctx->ggml_ctx, txt, img, 1); // [N, n_txt_token + n_img_token, hidden_size]
|
||||
for (int i = 0; i < params.depth_single_blocks; i++) {
|
||||
if (skip_layers.size() > 0 && std::find(skip_layers.begin(), skip_layers.end(), i + params.depth) != skip_layers.end()) {
|
||||
for (int i = 0; i < config.depth_single_blocks; i++) {
|
||||
if (skip_layers.size() > 0 && std::find(skip_layers.begin(), skip_layers.end(), i + config.depth) != skip_layers.end()) {
|
||||
continue;
|
||||
}
|
||||
auto block = std::dynamic_pointer_cast<SingleStreamBlock>(blocks["single_blocks." + std::to_string(i)]);
|
||||
@ -999,14 +1111,14 @@ namespace Flux {
|
||||
int64_t W = x->ne[0];
|
||||
int64_t H = x->ne[1];
|
||||
int64_t C = x->ne[2];
|
||||
int patch_size = params.patch_size;
|
||||
int patch_size = config.patch_size;
|
||||
int pad_h = (patch_size - H % patch_size) % patch_size;
|
||||
int pad_w = (patch_size - W % patch_size) % patch_size;
|
||||
|
||||
auto img = DiT::pad_to_patch_size(ctx, x, params.patch_size, params.patch_size);
|
||||
auto img = DiT::pad_to_patch_size(ctx, x, config.patch_size, config.patch_size);
|
||||
auto orig_img = img;
|
||||
|
||||
if (params.chroma_radiance_params.fake_patch_size_x2) {
|
||||
if (config.chroma_radiance_params.fake_patch_size_x2) {
|
||||
// It's supposed to be using GGML_SCALE_MODE_NEAREST, but this seems more stable
|
||||
// Maybe the implementation of nearest-neighbor interpolation in ggml behaves differently than the one in PyTorch?
|
||||
// img = F.interpolate(img, size=(H//2, W//2), mode="nearest")
|
||||
@ -1037,7 +1149,7 @@ namespace Flux {
|
||||
auto nerf_hidden = ggml_reshape_2d(ctx->ggml_ctx, out, out->ne[0], out->ne[1] * out->ne[2]); // [N*num_patches, hidden_size]
|
||||
auto img_dct = nerf_image_embedder->forward(ctx, nerf_pixels, dct); // [N*num_patches, patch_size*patch_size, nerf_hidden_size]
|
||||
|
||||
for (int i = 0; i < params.chroma_radiance_params.nerf_depth; i++) {
|
||||
for (int i = 0; i < config.chroma_radiance_params.nerf_depth; i++) {
|
||||
auto block = std::dynamic_pointer_cast<NerfGLUBlock>(blocks["nerf_blocks." + std::to_string(i)]);
|
||||
|
||||
img_dct = block->forward(ctx, img_dct, nerf_hidden);
|
||||
@ -1049,7 +1161,7 @@ namespace Flux {
|
||||
|
||||
out = nerf_final_layer_conv->forward(ctx, img_dct); // [N, C, H, W]
|
||||
|
||||
if (params.chroma_radiance_params.use_x0) {
|
||||
if (config.chroma_radiance_params.use_x0) {
|
||||
out = _apply_x0_residual(ctx, out, orig_img, timestep);
|
||||
}
|
||||
|
||||
@ -1073,14 +1185,14 @@ namespace Flux {
|
||||
int64_t W = x->ne[0];
|
||||
int64_t H = x->ne[1];
|
||||
int64_t C = x->ne[2];
|
||||
int patch_size = params.patch_size;
|
||||
int patch_size = config.patch_size;
|
||||
int pad_h = (patch_size - H % patch_size) % patch_size;
|
||||
int pad_w = (patch_size - W % patch_size) % patch_size;
|
||||
|
||||
auto img = DiT::pad_and_patchify(ctx, x, patch_size, patch_size);
|
||||
int64_t img_tokens = img->ne[1];
|
||||
|
||||
if (params.version == VERSION_FLUX_FILL) {
|
||||
if (config.version == VERSION_FLUX_FILL) {
|
||||
GGML_ASSERT(c_concat != nullptr);
|
||||
ggml_tensor* masked = ggml_view_4d(ctx->ggml_ctx, c_concat, c_concat->ne[0], c_concat->ne[1], C, 1, c_concat->nb[1], c_concat->nb[2], c_concat->nb[3], 0);
|
||||
ggml_tensor* mask = ggml_view_4d(ctx->ggml_ctx, c_concat, c_concat->ne[0], c_concat->ne[1], 8 * 8, 1, c_concat->nb[1], c_concat->nb[2], c_concat->nb[3], c_concat->nb[2] * C);
|
||||
@ -1089,7 +1201,7 @@ namespace Flux {
|
||||
mask = DiT::pad_and_patchify(ctx, mask, patch_size, patch_size);
|
||||
|
||||
img = ggml_concat(ctx->ggml_ctx, img, ggml_concat(ctx->ggml_ctx, masked, mask, 0), 0);
|
||||
} else if (params.version == VERSION_FLEX_2) {
|
||||
} else if (config.version == VERSION_FLEX_2) {
|
||||
GGML_ASSERT(c_concat != nullptr);
|
||||
ggml_tensor* masked = ggml_view_4d(ctx->ggml_ctx, c_concat, c_concat->ne[0], c_concat->ne[1], C, 1, c_concat->nb[1], c_concat->nb[2], c_concat->nb[3], 0);
|
||||
ggml_tensor* mask = ggml_view_4d(ctx->ggml_ctx, c_concat, c_concat->ne[0], c_concat->ne[1], 1, 1, c_concat->nb[1], c_concat->nb[2], c_concat->nb[3], c_concat->nb[2] * C);
|
||||
@ -1100,7 +1212,7 @@ namespace Flux {
|
||||
control = DiT::pad_and_patchify(ctx, control, patch_size, patch_size);
|
||||
|
||||
img = ggml_concat(ctx->ggml_ctx, img, ggml_concat(ctx->ggml_ctx, ggml_concat(ctx->ggml_ctx, masked, mask, 0), control, 0), 0);
|
||||
} else if (params.version == VERSION_FLUX_CONTROLS) {
|
||||
} else if (config.version == VERSION_FLUX_CONTROLS) {
|
||||
GGML_ASSERT(c_concat != nullptr);
|
||||
|
||||
auto control = DiT::pad_and_patchify(ctx, c_concat, patch_size, patch_size);
|
||||
@ -1147,7 +1259,7 @@ namespace Flux {
|
||||
// pe: (L, d_head/2, 2, 2)
|
||||
// return: (N, C, H, W)
|
||||
|
||||
if (params.version == VERSION_CHROMA_RADIANCE) {
|
||||
if (config.version == VERSION_CHROMA_RADIANCE) {
|
||||
return forward_chroma_radiance(ctx,
|
||||
x,
|
||||
timestep,
|
||||
@ -1179,7 +1291,7 @@ namespace Flux {
|
||||
|
||||
struct FluxRunner : public DiffusionModelRunner {
|
||||
public:
|
||||
FluxParams flux_params;
|
||||
FluxConfig config;
|
||||
Flux flux;
|
||||
std::vector<float> pe_vec;
|
||||
std::vector<float> mod_index_arange_vec;
|
||||
@ -1194,114 +1306,15 @@ namespace Flux {
|
||||
const std::string prefix = "",
|
||||
SDVersion version = VERSION_FLUX,
|
||||
bool use_mask = false)
|
||||
: DiffusionModelRunner(backend, params_backend, prefix), version(version), use_mask(use_mask) {
|
||||
flux_params.version = version;
|
||||
flux_params.guidance_embed = false;
|
||||
flux_params.depth = 0;
|
||||
flux_params.depth_single_blocks = 0;
|
||||
if (version == VERSION_FLUX_FILL) {
|
||||
flux_params.in_channels = 384;
|
||||
} else if (version == VERSION_FLUX_CONTROLS) {
|
||||
flux_params.in_channels = 128;
|
||||
} else if (version == VERSION_FLEX_2) {
|
||||
flux_params.in_channels = 196;
|
||||
} else if (version == VERSION_CHROMA_RADIANCE) {
|
||||
flux_params.in_channels = 3;
|
||||
flux_params.patch_size = 16;
|
||||
} else if (version == VERSION_OVIS_IMAGE) {
|
||||
flux_params.semantic_txt_norm = true;
|
||||
flux_params.use_yak_mlp = true;
|
||||
flux_params.vec_in_dim = 0;
|
||||
} else if (sd_version_is_flux2(version)) {
|
||||
flux_params.in_channels = 128;
|
||||
flux_params.patch_size = 1;
|
||||
flux_params.out_channels = 128;
|
||||
flux_params.mlp_ratio = 3.f;
|
||||
flux_params.theta = 2000;
|
||||
flux_params.axes_dim = {32, 32, 32, 32};
|
||||
flux_params.vec_in_dim = 0;
|
||||
flux_params.qkv_bias = false;
|
||||
flux_params.disable_bias = true;
|
||||
flux_params.share_modulation = true;
|
||||
flux_params.ref_index_scale = 10.f;
|
||||
flux_params.use_mlp_silu_act = true;
|
||||
} else if (sd_version_is_longcat(version)) {
|
||||
flux_params.context_in_dim = 3584;
|
||||
flux_params.vec_in_dim = 0;
|
||||
}
|
||||
int64_t head_dim = 0;
|
||||
int64_t actual_radiance_patch_size = -1;
|
||||
for (auto pair : tensor_storage_map) {
|
||||
std::string tensor_name = pair.first;
|
||||
if (!starts_with(tensor_name, prefix))
|
||||
continue;
|
||||
if (tensor_name.find("guidance_in.in_layer.weight") != std::string::npos) {
|
||||
flux_params.guidance_embed = true;
|
||||
}
|
||||
if (tensor_name.find("__x0__") != std::string::npos) {
|
||||
LOG_DEBUG("using x0 prediction");
|
||||
flux_params.chroma_radiance_params.use_x0 = true;
|
||||
}
|
||||
if (tensor_name.find("__32x32__") != std::string::npos) {
|
||||
LOG_DEBUG("using patch size 32");
|
||||
flux_params.patch_size = 32;
|
||||
}
|
||||
if (tensor_name.find("img_in_patch.weight") != std::string::npos) {
|
||||
actual_radiance_patch_size = pair.second.ne[0];
|
||||
LOG_DEBUG("actual radiance patch size: %d", actual_radiance_patch_size);
|
||||
}
|
||||
if (tensor_name.find("distilled_guidance_layer.in_proj.weight") != std::string::npos) {
|
||||
// Chroma
|
||||
flux_params.is_chroma = true;
|
||||
}
|
||||
size_t db = tensor_name.find("double_blocks.");
|
||||
if (db != std::string::npos) {
|
||||
tensor_name = tensor_name.substr(db); // remove prefix
|
||||
int block_depth = atoi(tensor_name.substr(14, tensor_name.find(".", 14)).c_str());
|
||||
if (block_depth + 1 > flux_params.depth) {
|
||||
flux_params.depth = block_depth + 1;
|
||||
}
|
||||
}
|
||||
size_t sb = tensor_name.find("single_blocks.");
|
||||
if (sb != std::string::npos) {
|
||||
tensor_name = tensor_name.substr(sb); // remove prefix
|
||||
int block_depth = atoi(tensor_name.substr(14, tensor_name.find(".", 14)).c_str());
|
||||
if (block_depth + 1 > flux_params.depth_single_blocks) {
|
||||
flux_params.depth_single_blocks = block_depth + 1;
|
||||
}
|
||||
}
|
||||
if (ends_with(tensor_name, "txt_in.weight")) {
|
||||
flux_params.context_in_dim = pair.second.ne[0];
|
||||
flux_params.hidden_size = pair.second.ne[1];
|
||||
}
|
||||
if (ends_with(tensor_name, "single_blocks.0.norm.key_norm.scale")) {
|
||||
head_dim = pair.second.ne[0];
|
||||
}
|
||||
if (ends_with(tensor_name, "double_blocks.0.txt_attn.norm.key_norm.scale")) {
|
||||
head_dim = pair.second.ne[0];
|
||||
}
|
||||
}
|
||||
if (actual_radiance_patch_size > 0 && actual_radiance_patch_size != flux_params.patch_size) {
|
||||
GGML_ASSERT(flux_params.patch_size == 2 * actual_radiance_patch_size);
|
||||
LOG_DEBUG("using fake x2 patch size");
|
||||
flux_params.chroma_radiance_params.fake_patch_size_x2 = true;
|
||||
}
|
||||
|
||||
flux_params.num_heads = static_cast<int>(flux_params.hidden_size / head_dim);
|
||||
|
||||
LOG_INFO("flux: depth = %d, depth_single_blocks = %d, guidance_embed = %s, context_in_dim = %" PRId64
|
||||
", hidden_size = %" PRId64 ", num_heads = %d",
|
||||
flux_params.depth,
|
||||
flux_params.depth_single_blocks,
|
||||
flux_params.guidance_embed ? "true" : "false",
|
||||
flux_params.context_in_dim,
|
||||
flux_params.hidden_size,
|
||||
flux_params.num_heads);
|
||||
if (flux_params.is_chroma) {
|
||||
: DiffusionModelRunner(backend, params_backend, prefix),
|
||||
config(FluxConfig::detect_from_weights(tensor_storage_map, prefix, version)),
|
||||
version(version),
|
||||
use_mask(use_mask) {
|
||||
if (config.is_chroma) {
|
||||
LOG_INFO("Using pruned modulation (Chroma)");
|
||||
}
|
||||
|
||||
flux = Flux(flux_params);
|
||||
flux = Flux(config);
|
||||
flux.init(params_ctx, tensor_storage_map, prefix);
|
||||
}
|
||||
|
||||
@ -1377,10 +1390,10 @@ namespace Flux {
|
||||
ggml_tensor* context = make_optional_input(context_tensor);
|
||||
ggml_tensor* c_concat = make_optional_input(c_concat_tensor);
|
||||
ggml_tensor* y = make_optional_input(y_tensor);
|
||||
if (flux_params.guidance_embed || flux_params.is_chroma) {
|
||||
if (config.guidance_embed || config.is_chroma) {
|
||||
if (!guidance_tensor.empty()) {
|
||||
this->guidance_tensor = guidance_tensor;
|
||||
if (flux_params.is_chroma) {
|
||||
if (config.is_chroma) {
|
||||
this->guidance_tensor.fill_(0.f);
|
||||
}
|
||||
}
|
||||
@ -1398,7 +1411,7 @@ namespace Flux {
|
||||
ggml_tensor* mod_index_arange = nullptr;
|
||||
ggml_tensor* dct = nullptr; // for chroma radiance
|
||||
|
||||
if (flux_params.is_chroma) {
|
||||
if (config.is_chroma) {
|
||||
if (!use_mask) {
|
||||
y = nullptr;
|
||||
}
|
||||
@ -1417,29 +1430,29 @@ namespace Flux {
|
||||
}
|
||||
pe_vec = Rope::gen_flux_pe(static_cast<int>(x->ne[1]),
|
||||
static_cast<int>(x->ne[0]),
|
||||
flux_params.patch_size,
|
||||
config.patch_size,
|
||||
static_cast<int>(x->ne[3]),
|
||||
static_cast<int>(context->ne[1]),
|
||||
txt_arange_dims,
|
||||
ref_latents,
|
||||
increase_ref_index,
|
||||
flux_params.ref_index_scale,
|
||||
flux_params.theta,
|
||||
config.ref_index_scale,
|
||||
config.theta,
|
||||
circular_y_enabled,
|
||||
circular_x_enabled,
|
||||
flux_params.axes_dim,
|
||||
config.axes_dim,
|
||||
sd_version_is_longcat(version));
|
||||
int pos_len = static_cast<int>(pe_vec.size() / flux_params.axes_dim_sum / 2);
|
||||
int pos_len = static_cast<int>(pe_vec.size() / config.axes_dim_sum / 2);
|
||||
// LOG_DEBUG("pos_len %d", pos_len);
|
||||
auto pe = ggml_new_tensor_4d(compute_ctx, GGML_TYPE_F32, 2, 2, flux_params.axes_dim_sum / 2, pos_len);
|
||||
auto pe = ggml_new_tensor_4d(compute_ctx, GGML_TYPE_F32, 2, 2, config.axes_dim_sum / 2, pos_len);
|
||||
// pe->data = pe_vec.data();
|
||||
// print_ggml_tensor(pe);
|
||||
// pe->data = nullptr;
|
||||
set_backend_tensor_data(pe, pe_vec.data());
|
||||
|
||||
if (version == VERSION_CHROMA_RADIANCE) {
|
||||
int patch_size = flux_params.patch_size;
|
||||
int nerf_max_freqs = flux_params.chroma_radiance_params.nerf_max_freqs;
|
||||
int patch_size = config.patch_size;
|
||||
int nerf_max_freqs = config.chroma_radiance_params.nerf_max_freqs;
|
||||
dct_vec = fetch_dct_pos(patch_size, nerf_max_freqs);
|
||||
dct = ggml_new_tensor_2d(compute_ctx, GGML_TYPE_F32, nerf_max_freqs * nerf_max_freqs, patch_size * patch_size);
|
||||
// dct->data = dct_vec.data();
|
||||
@ -1614,4 +1627,4 @@ namespace Flux {
|
||||
|
||||
} // namespace Flux
|
||||
|
||||
#endif // __FLUX_HPP__
|
||||
#endif // __SD_MODEL_DIFFUSION_FLUX_HPP__
|
||||
@ -1,5 +1,5 @@
|
||||
#ifndef __SD_HIDREAM_O1_H__
|
||||
#define __SD_HIDREAM_O1_H__
|
||||
#ifndef __SD_MODEL_DIFFUSION_HIDREAM_O1_HPP__
|
||||
#define __SD_MODEL_DIFFUSION_HIDREAM_O1_HPP__
|
||||
|
||||
#include <algorithm>
|
||||
#include <array>
|
||||
@ -10,11 +10,11 @@
|
||||
#include <utility>
|
||||
#include <vector>
|
||||
|
||||
#include "common_dit.hpp"
|
||||
#include "conditioner.hpp"
|
||||
#include "diffusion_model.hpp"
|
||||
#include "llm.hpp"
|
||||
#include "util.h"
|
||||
#include "conditioning/conditioner.hpp"
|
||||
#include "core/util.h"
|
||||
#include "model/diffusion/dit.hpp"
|
||||
#include "model/diffusion/model.hpp"
|
||||
#include "model/te/llm.hpp"
|
||||
|
||||
namespace HiDreamO1 {
|
||||
constexpr int HIDREAM_O1_GRAPH_SIZE = 32768;
|
||||
@ -23,6 +23,39 @@ namespace HiDreamO1 {
|
||||
constexpr int IMAGE_TOKEN_ID = 151655;
|
||||
constexpr int VISION_START_TOKEN_ID = 151652;
|
||||
|
||||
struct HiDreamO1Config {
|
||||
LLM::LLMConfig llm;
|
||||
int patch_size = PATCH_SIZE;
|
||||
|
||||
static HiDreamO1Config detect_from_weights(const String2TensorStorage& tensor_storage_map, const std::string& prefix) {
|
||||
(void)tensor_storage_map;
|
||||
(void)prefix;
|
||||
HiDreamO1Config config;
|
||||
config.llm.arch = LLM::LLMArch::QWEN3_VL;
|
||||
config.llm.hidden_size = 4096;
|
||||
config.llm.intermediate_size = 12288;
|
||||
config.llm.num_layers = 36;
|
||||
config.llm.num_heads = 32;
|
||||
config.llm.num_kv_heads = 8;
|
||||
config.llm.head_dim = 128;
|
||||
config.llm.qkv_bias = false;
|
||||
config.llm.qk_norm = true;
|
||||
config.llm.vocab_size = 151936;
|
||||
config.llm.rms_norm_eps = 1e-6f;
|
||||
config.llm.vision.arch = LLM::LLMVisionArch::QWEN3_VL;
|
||||
config.llm.vision.num_layers = 27;
|
||||
config.llm.vision.hidden_size = 1152;
|
||||
config.llm.vision.intermediate_size = 4304;
|
||||
config.llm.vision.num_heads = 16;
|
||||
config.llm.vision.out_hidden_size = 4096;
|
||||
config.llm.vision.patch_size = 16;
|
||||
config.llm.vision.spatial_merge_size = 2;
|
||||
config.llm.vision.temporal_patch_size = 2;
|
||||
config.llm.vision.num_position_embeddings = 2304;
|
||||
return config;
|
||||
}
|
||||
};
|
||||
|
||||
static inline std::string repeat_special_token(const std::string& token, int64_t count) {
|
||||
std::string out;
|
||||
out.reserve(static_cast<size_t>(count) * token.size());
|
||||
@ -205,50 +238,19 @@ namespace HiDreamO1 {
|
||||
}
|
||||
};
|
||||
|
||||
struct HiDreamO1Params {
|
||||
LLM::LLMParams llm;
|
||||
int patch_size = PATCH_SIZE;
|
||||
};
|
||||
|
||||
static inline HiDreamO1Params make_hidream_o1_params() {
|
||||
HiDreamO1Params params;
|
||||
params.llm.arch = LLM::LLMArch::QWEN3_VL;
|
||||
params.llm.hidden_size = 4096;
|
||||
params.llm.intermediate_size = 12288;
|
||||
params.llm.num_layers = 36;
|
||||
params.llm.num_heads = 32;
|
||||
params.llm.num_kv_heads = 8;
|
||||
params.llm.head_dim = 128;
|
||||
params.llm.qkv_bias = false;
|
||||
params.llm.qk_norm = true;
|
||||
params.llm.vocab_size = 151936;
|
||||
params.llm.rms_norm_eps = 1e-6f;
|
||||
params.llm.vision.arch = LLM::LLMVisionArch::QWEN3_VL;
|
||||
params.llm.vision.num_layers = 27;
|
||||
params.llm.vision.hidden_size = 1152;
|
||||
params.llm.vision.intermediate_size = 4304;
|
||||
params.llm.vision.num_heads = 16;
|
||||
params.llm.vision.out_hidden_size = 4096;
|
||||
params.llm.vision.patch_size = 16;
|
||||
params.llm.vision.spatial_merge_size = 2;
|
||||
params.llm.vision.temporal_patch_size = 2;
|
||||
params.llm.vision.num_position_embeddings = 2304;
|
||||
return params;
|
||||
}
|
||||
|
||||
struct HiDreamO1Model : public GGMLBlock {
|
||||
HiDreamO1Params params;
|
||||
HiDreamO1Config config;
|
||||
|
||||
HiDreamO1Model() = default;
|
||||
explicit HiDreamO1Model(HiDreamO1Params params)
|
||||
: params(std::move(params)) {
|
||||
blocks["language_model"] = std::make_shared<LLM::TextModel>(this->params.llm);
|
||||
blocks["t_embedder1"] = std::make_shared<TimestepEmbedder>(this->params.llm.hidden_size);
|
||||
blocks["x_embedder"] = std::make_shared<BottleneckPatchEmbed>(this->params.patch_size * this->params.patch_size * 3,
|
||||
this->params.llm.hidden_size / 4,
|
||||
this->params.llm.hidden_size);
|
||||
blocks["final_layer2"] = std::make_shared<FinalLayer>(this->params.llm.hidden_size,
|
||||
this->params.patch_size * this->params.patch_size * 3);
|
||||
explicit HiDreamO1Model(HiDreamO1Config config)
|
||||
: config(std::move(config)) {
|
||||
blocks["language_model"] = std::make_shared<LLM::TextModel>(this->config.llm);
|
||||
blocks["t_embedder1"] = std::make_shared<TimestepEmbedder>(this->config.llm.hidden_size);
|
||||
blocks["x_embedder"] = std::make_shared<BottleneckPatchEmbed>(this->config.patch_size * this->config.patch_size * 3,
|
||||
this->config.llm.hidden_size / 4,
|
||||
this->config.llm.hidden_size);
|
||||
blocks["final_layer2"] = std::make_shared<FinalLayer>(this->config.llm.hidden_size,
|
||||
this->config.patch_size * this->config.patch_size * 3);
|
||||
}
|
||||
|
||||
std::shared_ptr<LLM::TextModel> text_model() {
|
||||
@ -269,7 +271,7 @@ namespace HiDreamO1 {
|
||||
};
|
||||
|
||||
struct HiDreamO1VisionRunner : public GGMLRunner {
|
||||
HiDreamO1Params params;
|
||||
HiDreamO1Config config;
|
||||
std::shared_ptr<LLM::VisionModel> model;
|
||||
|
||||
std::vector<int> window_index_vec;
|
||||
@ -284,8 +286,8 @@ namespace HiDreamO1 {
|
||||
const String2TensorStorage& tensor_storage_map = {},
|
||||
const std::string& prefix = "model.visual")
|
||||
: GGMLRunner(backend, params_backend),
|
||||
params(make_hidream_o1_params()),
|
||||
model(std::make_shared<LLM::VisionModel>(false, params.llm.vision)) {
|
||||
config(HiDreamO1Config::detect_from_weights(tensor_storage_map, prefix)),
|
||||
model(std::make_shared<LLM::VisionModel>(false, config.llm.vision)) {
|
||||
model->init(params_ctx, tensor_storage_map, prefix);
|
||||
}
|
||||
|
||||
@ -302,7 +304,7 @@ namespace HiDreamO1 {
|
||||
compute_ctx,
|
||||
runner_ctx,
|
||||
image,
|
||||
params.llm.vision,
|
||||
config.llm.vision,
|
||||
model,
|
||||
window_index_vec,
|
||||
window_inverse_index_vec,
|
||||
@ -331,7 +333,7 @@ namespace HiDreamO1 {
|
||||
};
|
||||
|
||||
struct HiDreamO1Runner : public DiffusionModelRunner {
|
||||
HiDreamO1Params params;
|
||||
HiDreamO1Config config;
|
||||
HiDreamO1Model model;
|
||||
|
||||
std::vector<float> attention_mask_vec;
|
||||
@ -341,8 +343,8 @@ namespace HiDreamO1 {
|
||||
const String2TensorStorage& tensor_storage_map = {},
|
||||
const std::string& prefix = "model")
|
||||
: DiffusionModelRunner(backend, params_backend, prefix),
|
||||
params(make_hidream_o1_params()) {
|
||||
model = HiDreamO1Model(params);
|
||||
config(HiDreamO1Config::detect_from_weights(tensor_storage_map, prefix)) {
|
||||
model = HiDreamO1Model(config);
|
||||
model.init(params_ctx, tensor_storage_map, prefix);
|
||||
}
|
||||
|
||||
@ -676,4 +678,4 @@ namespace HiDreamO1 {
|
||||
};
|
||||
} // namespace HiDreamO1
|
||||
|
||||
#endif // __SD_HIDREAM_O1_H__
|
||||
#endif // __SD_MODEL_DIFFUSION_HIDREAM_O1_HPP__
|
||||
@ -1,5 +1,5 @@
|
||||
#ifndef __IDEOGRAM4_HPP__
|
||||
#define __IDEOGRAM4_HPP__
|
||||
#ifndef __SD_MODEL_DIFFUSION_IDEOGRAM4_HPP__
|
||||
#define __SD_MODEL_DIFFUSION_IDEOGRAM4_HPP__
|
||||
|
||||
#include <algorithm>
|
||||
#include <cmath>
|
||||
@ -8,10 +8,10 @@
|
||||
#include <string>
|
||||
#include <vector>
|
||||
|
||||
#include "diffusion_model.hpp"
|
||||
#include "ggml_extend.hpp"
|
||||
#include "ggml_graph_cut.h"
|
||||
#include "rope.hpp"
|
||||
#include "core/ggml_extend.hpp"
|
||||
#include "core/ggml_graph_cut.h"
|
||||
#include "model/common/rope.hpp"
|
||||
#include "model/diffusion/model.hpp"
|
||||
|
||||
namespace Ideogram4 {
|
||||
constexpr int IDEOGRAM4_GRAPH_SIZE = 65536;
|
||||
@ -38,6 +38,34 @@ namespace Ideogram4 {
|
||||
std::vector<int> mrope_section = {DEFAULT_MROPE_SECTION_T,
|
||||
DEFAULT_MROPE_SECTION_H,
|
||||
DEFAULT_MROPE_SECTION_W};
|
||||
|
||||
static Ideogram4Config detect_from_weights(const String2TensorStorage& tensor_storage_map,
|
||||
const std::string& prefix) {
|
||||
Ideogram4Config config;
|
||||
int64_t detected_layers = 0;
|
||||
std::string layer_prefix = prefix.empty() ? "layers." : prefix + ".layers.";
|
||||
for (const auto& [name, _] : tensor_storage_map) {
|
||||
if (name.find(layer_prefix) != 0) {
|
||||
continue;
|
||||
}
|
||||
std::string tail = name.substr(layer_prefix.size());
|
||||
size_t dot = tail.find('.');
|
||||
if (dot == std::string::npos) {
|
||||
continue;
|
||||
}
|
||||
int layer_idx = std::atoi(tail.substr(0, dot).c_str());
|
||||
detected_layers = std::max<int64_t>(detected_layers, layer_idx + 1);
|
||||
}
|
||||
if (detected_layers > 0) {
|
||||
config.num_layers = detected_layers;
|
||||
LOG_DEBUG("ideogram4: num_layers = %" PRId64 ", emb_dim = %" PRId64 ", num_heads = %" PRId64 ", intermediate_size = %" PRId64,
|
||||
config.num_layers,
|
||||
config.emb_dim,
|
||||
config.num_heads,
|
||||
config.intermediate_size);
|
||||
}
|
||||
return config;
|
||||
}
|
||||
};
|
||||
|
||||
__STATIC_INLINE__ ggml_tensor* timestep_embedding_sin_cos(ggml_context* ctx,
|
||||
@ -380,26 +408,6 @@ namespace Ideogram4 {
|
||||
|
||||
class Ideogram4Runner : public DiffusionModelRunner {
|
||||
protected:
|
||||
static int64_t detect_num_layers(const String2TensorStorage& tensor_storage_map,
|
||||
const std::string& prefix) {
|
||||
int64_t detected_layers = 0;
|
||||
std::string layer_prefix = prefix.empty() ? "layers." : prefix + ".layers.";
|
||||
for (const auto& pair : tensor_storage_map) {
|
||||
const std::string& name = pair.first;
|
||||
if (name.find(layer_prefix) != 0) {
|
||||
continue;
|
||||
}
|
||||
std::string tail = name.substr(layer_prefix.size());
|
||||
size_t dot = tail.find('.');
|
||||
if (dot == std::string::npos) {
|
||||
continue;
|
||||
}
|
||||
int layer_idx = std::atoi(tail.substr(0, dot).c_str());
|
||||
detected_layers = std::max<int64_t>(detected_layers, layer_idx + 1);
|
||||
}
|
||||
return detected_layers;
|
||||
}
|
||||
|
||||
bool should_use_uncond_model(const DiffusionParams& diffusion_params) const {
|
||||
return has_uncond_model &&
|
||||
diffusion_params.context == nullptr &&
|
||||
@ -421,12 +429,8 @@ namespace Ideogram4 {
|
||||
const String2TensorStorage& tensor_storage_map = {},
|
||||
const std::string prefix = "")
|
||||
: DiffusionModelRunner(backend, params_backend, prefix),
|
||||
config(Ideogram4Config::detect_from_weights(tensor_storage_map, prefix)),
|
||||
uncond_prefix(prefix + ".uncond") {
|
||||
int64_t detected_layers = detect_num_layers(tensor_storage_map, prefix);
|
||||
if (detected_layers > 0) {
|
||||
config.num_layers = detected_layers;
|
||||
}
|
||||
|
||||
model = Ideogram4Transformer(config);
|
||||
model.init(params_ctx, tensor_storage_map, prefix);
|
||||
for (const auto& pair : tensor_storage_map) {
|
||||
@ -524,4 +528,4 @@ namespace Ideogram4 {
|
||||
};
|
||||
} // namespace Ideogram4
|
||||
|
||||
#endif // __IDEOGRAM4_HPP__
|
||||
#endif // __SD_MODEL_DIFFUSION_IDEOGRAM4_HPP__
|
||||
@ -1,18 +1,83 @@
|
||||
#ifndef __SD_LENS_HPP__
|
||||
#define __SD_LENS_HPP__
|
||||
#ifndef __SD_MODEL_DIFFUSION_LENS_HPP__
|
||||
#define __SD_MODEL_DIFFUSION_LENS_HPP__
|
||||
|
||||
#include <memory>
|
||||
#include <vector>
|
||||
|
||||
#include "common_block.hpp"
|
||||
#include "diffusion_model.hpp"
|
||||
#include "flux.hpp"
|
||||
#include "qwen_image.hpp"
|
||||
#include "rope.hpp"
|
||||
#include "model/common/block.hpp"
|
||||
#include "model/common/rope.hpp"
|
||||
#include "model/diffusion/flux.hpp"
|
||||
#include "model/diffusion/model.hpp"
|
||||
#include "model/diffusion/qwen_image.hpp"
|
||||
|
||||
namespace Lens {
|
||||
constexpr int LENS_GRAPH_SIZE = 40960;
|
||||
|
||||
struct LensConfig {
|
||||
int patch_size = 2;
|
||||
int64_t in_channels = 128;
|
||||
int64_t out_channels = 32;
|
||||
int num_layers = 48;
|
||||
int64_t attention_head_dim = 64;
|
||||
int64_t num_attention_heads = 24;
|
||||
int64_t joint_attention_dim = 2880;
|
||||
int selected_layer_count = 4;
|
||||
int theta = 10000;
|
||||
std::vector<int> axes_dim = {8, 28, 28};
|
||||
int axes_dim_sum = 64;
|
||||
|
||||
static LensConfig detect_from_weights(const String2TensorStorage& tensor_storage_map, const std::string& prefix) {
|
||||
LensConfig config;
|
||||
config.num_layers = 0;
|
||||
for (const auto& [name, tensor_storage] : tensor_storage_map) {
|
||||
if (!starts_with(name, prefix)) {
|
||||
continue;
|
||||
}
|
||||
if (ends_with(name, "img_in.weight") && tensor_storage.n_dims == 2) {
|
||||
config.in_channels = tensor_storage.ne[0];
|
||||
int64_t inner_dim = tensor_storage.ne[1];
|
||||
if (config.attention_head_dim > 0) {
|
||||
config.num_attention_heads = inner_dim / config.attention_head_dim;
|
||||
}
|
||||
} else if (ends_with(name, "txt_in.weight") && tensor_storage.n_dims == 2) {
|
||||
config.selected_layer_count = static_cast<int>(tensor_storage.ne[0] / config.joint_attention_dim);
|
||||
} else if (ends_with(name, "proj_out.weight") && tensor_storage.n_dims == 2) {
|
||||
int64_t patch_area = config.patch_size * config.patch_size;
|
||||
config.out_channels = tensor_storage.ne[1] / patch_area;
|
||||
} else if (ends_with(name, "transformer_blocks.0.attn.norm_q.weight") && tensor_storage.n_dims == 1) {
|
||||
config.attention_head_dim = tensor_storage.ne[0];
|
||||
}
|
||||
|
||||
size_t pos = name.find("transformer_blocks.");
|
||||
if (pos != std::string::npos) {
|
||||
auto items = split_string(name.substr(pos), '.');
|
||||
if (items.size() > 1) {
|
||||
int block_index = atoi(items[1].c_str());
|
||||
if (block_index + 1 > config.num_layers) {
|
||||
config.num_layers = block_index + 1;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
if (config.num_layers == 0) {
|
||||
config.num_layers = 48;
|
||||
}
|
||||
config.axes_dim_sum = 0;
|
||||
for (int axis_dim : config.axes_dim) {
|
||||
config.axes_dim_sum += axis_dim;
|
||||
}
|
||||
LOG_DEBUG("lens: num_layers = %d, selected_layer_count = %d, hidden_size = %" PRId64 ", num_attention_heads = %" PRId64 ", attention_head_dim = %" PRId64 ", in_channels = %" PRId64 ", out_channels = %" PRId64,
|
||||
config.num_layers,
|
||||
config.selected_layer_count,
|
||||
config.num_attention_heads * config.attention_head_dim,
|
||||
config.num_attention_heads,
|
||||
config.attention_head_dim,
|
||||
config.in_channels,
|
||||
config.out_channels);
|
||||
return config;
|
||||
}
|
||||
};
|
||||
|
||||
struct LensTimestepProjEmbeddings : public GGMLBlock {
|
||||
LensTimestepProjEmbeddings(int64_t embedding_dim) {
|
||||
blocks["timestep_embedder"] = std::make_shared<Qwen::TimestepEmbedding>(256, embedding_dim);
|
||||
@ -209,41 +274,27 @@ namespace Lens {
|
||||
}
|
||||
};
|
||||
|
||||
struct LensParams {
|
||||
int patch_size = 2;
|
||||
int64_t in_channels = 128;
|
||||
int64_t out_channels = 32;
|
||||
int num_layers = 48;
|
||||
int64_t attention_head_dim = 64;
|
||||
int64_t num_attention_heads = 24;
|
||||
int64_t joint_attention_dim = 2880;
|
||||
int selected_layer_count = 4;
|
||||
int theta = 10000;
|
||||
std::vector<int> axes_dim = {8, 28, 28};
|
||||
int axes_dim_sum = 64;
|
||||
};
|
||||
|
||||
class LensModel : public GGMLBlock {
|
||||
public:
|
||||
LensParams params;
|
||||
LensConfig config;
|
||||
|
||||
LensModel() = default;
|
||||
LensModel(LensParams params)
|
||||
: params(params) {
|
||||
int64_t inner_dim = params.num_attention_heads * params.attention_head_dim;
|
||||
LensModel(LensConfig config)
|
||||
: config(config) {
|
||||
int64_t inner_dim = config.num_attention_heads * config.attention_head_dim;
|
||||
blocks["time_text_embed"] = std::make_shared<LensTimestepProjEmbeddings>(inner_dim);
|
||||
blocks["img_in"] = std::make_shared<Linear>(params.in_channels, inner_dim, true);
|
||||
blocks["txt_in"] = std::make_shared<Linear>(params.joint_attention_dim * params.selected_layer_count, inner_dim, true);
|
||||
for (int i = 0; i < params.selected_layer_count; ++i) {
|
||||
blocks["txt_norm." + std::to_string(i)] = std::make_shared<RMSNorm>(params.joint_attention_dim, 1e-5f);
|
||||
blocks["img_in"] = std::make_shared<Linear>(config.in_channels, inner_dim, true);
|
||||
blocks["txt_in"] = std::make_shared<Linear>(config.joint_attention_dim * config.selected_layer_count, inner_dim, true);
|
||||
for (int i = 0; i < config.selected_layer_count; ++i) {
|
||||
blocks["txt_norm." + std::to_string(i)] = std::make_shared<RMSNorm>(config.joint_attention_dim, 1e-5f);
|
||||
}
|
||||
for (int i = 0; i < params.num_layers; ++i) {
|
||||
for (int i = 0; i < config.num_layers; ++i) {
|
||||
blocks["transformer_blocks." + std::to_string(i)] = std::make_shared<LensTransformerBlock>(inner_dim,
|
||||
params.num_attention_heads,
|
||||
params.attention_head_dim);
|
||||
config.num_attention_heads,
|
||||
config.attention_head_dim);
|
||||
}
|
||||
blocks["norm_out"] = std::make_shared<LensAdaLayerNormContinuous>(inner_dim, 1e-6f);
|
||||
blocks["proj_out"] = std::make_shared<Linear>(inner_dim, params.patch_size * params.patch_size * params.out_channels, true);
|
||||
blocks["proj_out"] = std::make_shared<Linear>(inner_dim, config.patch_size * config.patch_size * config.out_channels, true);
|
||||
}
|
||||
|
||||
ggml_tensor* forward(GGMLRunnerContext* ctx,
|
||||
@ -269,9 +320,9 @@ namespace Lens {
|
||||
img = ggml_cont(ctx->ggml_ctx, ggml_ext_torch_permute(ctx->ggml_ctx, img, 1, 0, 2, 3));
|
||||
img = img_in->forward(ctx, img);
|
||||
|
||||
std::vector<ggml_tensor*> txt_chunks = ggml_ext_chunk(ctx->ggml_ctx, context, params.selected_layer_count, 0);
|
||||
std::vector<ggml_tensor*> txt_chunks = ggml_ext_chunk(ctx->ggml_ctx, context, config.selected_layer_count, 0);
|
||||
ggml_tensor* txt = nullptr;
|
||||
for (int i = 0; i < params.selected_layer_count; ++i) {
|
||||
for (int i = 0; i < config.selected_layer_count; ++i) {
|
||||
auto txt_norm = std::dynamic_pointer_cast<RMSNorm>(blocks["txt_norm." + std::to_string(i)]);
|
||||
auto chunk = txt_norm->forward(ctx, txt_chunks[i]);
|
||||
txt = txt == nullptr ? chunk : ggml_concat(ctx->ggml_ctx, txt, chunk, 0);
|
||||
@ -281,7 +332,7 @@ namespace Lens {
|
||||
sd::ggml_graph_cut::mark_graph_cut(img, "lens.prelude", "img");
|
||||
sd::ggml_graph_cut::mark_graph_cut(txt, "lens.prelude", "txt");
|
||||
|
||||
for (int i = 0; i < params.num_layers; ++i) {
|
||||
for (int i = 0; i < config.num_layers; ++i) {
|
||||
auto block = std::dynamic_pointer_cast<LensTransformerBlock>(blocks["transformer_blocks." + std::to_string(i)]);
|
||||
auto out = block->forward(ctx, img, txt, t_emb, pe);
|
||||
img = out.first;
|
||||
@ -294,13 +345,13 @@ namespace Lens {
|
||||
img = proj_out->forward(ctx, img);
|
||||
|
||||
auto out = ggml_cont(ctx->ggml_ctx, ggml_ext_torch_permute(ctx->ggml_ctx, img, 1, 0, 2, 3));
|
||||
out = ggml_reshape_4d(ctx->ggml_ctx, out, W, H, params.patch_size * params.patch_size * params.out_channels, N);
|
||||
out = ggml_reshape_4d(ctx->ggml_ctx, out, W, H, config.patch_size * config.patch_size * config.out_channels, N);
|
||||
return out;
|
||||
}
|
||||
};
|
||||
|
||||
struct LensRunner : public DiffusionModelRunner {
|
||||
LensParams lens_params;
|
||||
LensConfig config;
|
||||
LensModel lens;
|
||||
std::vector<float> pe_vec;
|
||||
|
||||
@ -308,53 +359,9 @@ namespace Lens {
|
||||
ggml_backend_t params_backend,
|
||||
const String2TensorStorage& tensor_storage_map = {},
|
||||
const std::string prefix = "")
|
||||
: DiffusionModelRunner(backend, params_backend, prefix) {
|
||||
lens_params.num_layers = 0;
|
||||
for (const auto& [name, tensor_storage] : tensor_storage_map) {
|
||||
if (!starts_with(name, prefix)) {
|
||||
continue;
|
||||
}
|
||||
if (ends_with(name, "img_in.weight") && tensor_storage.n_dims == 2) {
|
||||
lens_params.in_channels = tensor_storage.ne[0];
|
||||
int64_t inner_dim = tensor_storage.ne[1];
|
||||
lens_params.num_attention_heads = inner_dim / lens_params.attention_head_dim;
|
||||
} else if (ends_with(name, "txt_in.weight") && tensor_storage.n_dims == 2) {
|
||||
lens_params.selected_layer_count = static_cast<int>(tensor_storage.ne[0] / lens_params.joint_attention_dim);
|
||||
} else if (ends_with(name, "proj_out.weight") && tensor_storage.n_dims == 2) {
|
||||
lens_params.out_channels = tensor_storage.ne[1] / lens_params.patch_size / lens_params.patch_size;
|
||||
} else if (ends_with(name, "transformer_blocks.0.attn.norm_q.weight") && tensor_storage.n_dims == 1) {
|
||||
lens_params.attention_head_dim = tensor_storage.ne[0];
|
||||
}
|
||||
|
||||
size_t pos = name.find("transformer_blocks.");
|
||||
if (pos != std::string::npos) {
|
||||
std::string layer_name = name.substr(pos);
|
||||
auto items = split_string(layer_name, '.');
|
||||
if (items.size() > 1) {
|
||||
int block_index = atoi(items[1].c_str());
|
||||
if (block_index + 1 > lens_params.num_layers) {
|
||||
lens_params.num_layers = block_index + 1;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
if (lens_params.num_layers == 0) {
|
||||
lens_params.num_layers = 48;
|
||||
}
|
||||
lens_params.axes_dim_sum = 0;
|
||||
for (int axis_dim : lens_params.axes_dim) {
|
||||
lens_params.axes_dim_sum += axis_dim;
|
||||
}
|
||||
|
||||
LOG_INFO("lens: layers = %d, in_channels = %" PRId64 ", out_channels = %" PRId64
|
||||
", heads = %" PRId64 ", head_dim = %" PRId64,
|
||||
lens_params.num_layers,
|
||||
lens_params.in_channels,
|
||||
lens_params.out_channels,
|
||||
lens_params.num_attention_heads,
|
||||
lens_params.attention_head_dim);
|
||||
|
||||
lens = LensModel(lens_params);
|
||||
: DiffusionModelRunner(backend, params_backend, prefix),
|
||||
config(LensConfig::detect_from_weights(tensor_storage_map, prefix)) {
|
||||
lens = LensModel(config);
|
||||
lens.init(params_ctx, tensor_storage_map, prefix);
|
||||
}
|
||||
|
||||
@ -380,12 +387,12 @@ namespace Lens {
|
||||
static_cast<int>(x->ne[0]),
|
||||
static_cast<int>(x->ne[3]),
|
||||
static_cast<int>(context->ne[1]),
|
||||
lens_params.theta,
|
||||
config.theta,
|
||||
circular_y_enabled,
|
||||
circular_x_enabled,
|
||||
lens_params.axes_dim);
|
||||
int pos_len = static_cast<int>(pe_vec.size() / lens_params.axes_dim_sum / 2);
|
||||
auto pe = ggml_new_tensor_4d(compute_ctx, GGML_TYPE_F32, 2, 2, lens_params.axes_dim_sum / 2, pos_len);
|
||||
config.axes_dim);
|
||||
int pos_len = static_cast<int>(pe_vec.size() / config.axes_dim_sum / 2);
|
||||
auto pe = ggml_new_tensor_4d(compute_ctx, GGML_TYPE_F32, 2, 2, config.axes_dim_sum / 2, pos_len);
|
||||
set_backend_tensor_data(pe, pe_vec.data());
|
||||
|
||||
auto runner_ctx = get_context();
|
||||
@ -416,4 +423,4 @@ namespace Lens {
|
||||
};
|
||||
} // namespace Lens
|
||||
|
||||
#endif // __SD_LENS_HPP__
|
||||
#endif // __SD_MODEL_DIFFUSION_LENS_HPP__
|
||||
@ -1,5 +1,5 @@
|
||||
#ifndef __SD_LTXV_HPP__
|
||||
#define __SD_LTXV_HPP__
|
||||
#ifndef __SD_MODEL_DIFFUSION_LTXV_HPP__
|
||||
#define __SD_MODEL_DIFFUSION_LTXV_HPP__
|
||||
|
||||
#include <algorithm>
|
||||
#include <cmath>
|
||||
@ -9,10 +9,10 @@
|
||||
#include <utility>
|
||||
#include <vector>
|
||||
|
||||
#include "common_block.hpp"
|
||||
#include "diffusion_model.hpp"
|
||||
#include "flux.hpp"
|
||||
#include "rope.hpp"
|
||||
#include "model/common/block.hpp"
|
||||
#include "model/common/rope.hpp"
|
||||
#include "model/diffusion/flux.hpp"
|
||||
#include "model/diffusion/model.hpp"
|
||||
|
||||
namespace LTXV {
|
||||
|
||||
@ -72,6 +72,200 @@ namespace LTXV {
|
||||
return max_block + 1;
|
||||
}
|
||||
|
||||
struct LTXAVConfig {
|
||||
int64_t in_channels = 128;
|
||||
int64_t out_channels = 128;
|
||||
int64_t hidden_size = 3840;
|
||||
int64_t cross_attention_dim = 4096;
|
||||
int64_t caption_channels = 3840;
|
||||
int64_t num_attention_heads = 30;
|
||||
int64_t attention_head_dim = 128;
|
||||
int64_t num_layers = 28;
|
||||
float positional_embedding_theta = 10000.f;
|
||||
std::vector<int> positional_embedding_max_pos = {20, 2048, 2048};
|
||||
std::tuple<int, int, int> vae_scale_factors = {8, 32, 32};
|
||||
bool causal_temporal_positioning = true;
|
||||
float timestep_scale_multiplier = 1000.f;
|
||||
|
||||
int64_t audio_in_channels = 128;
|
||||
int64_t audio_out_channels = 128;
|
||||
int64_t audio_hidden_size = 2048;
|
||||
int64_t audio_cross_attention_dim = 2048;
|
||||
int64_t audio_num_attention_heads = 32;
|
||||
int64_t audio_attention_head_dim = 64;
|
||||
std::vector<int> audio_positional_embedding_max_pos = {20};
|
||||
float av_ca_timestep_scale_multiplier = 1000.f;
|
||||
int64_t num_audio_channels = 8;
|
||||
int64_t audio_frequency_bins = 16;
|
||||
|
||||
bool use_connector = false;
|
||||
int64_t connector_hidden_size = 3840;
|
||||
int64_t connector_num_heads = 30;
|
||||
int64_t connector_head_dim = 128;
|
||||
int64_t connector_num_layers = 2;
|
||||
int64_t connector_num_registers = 128;
|
||||
bool connector_rope_interleaved = false;
|
||||
bool connector_apply_gated_attention = false;
|
||||
|
||||
bool use_audio_connector = false;
|
||||
int64_t audio_connector_hidden_size = 2048;
|
||||
int64_t audio_connector_num_heads = 32;
|
||||
int64_t audio_connector_head_dim = 64;
|
||||
int64_t audio_connector_num_layers = 2;
|
||||
int64_t audio_connector_num_registers = 128;
|
||||
bool audio_connector_rope_interleaved = false;
|
||||
bool audio_connector_apply_gated_attention = false;
|
||||
|
||||
bool video_rope_interleaved = false;
|
||||
bool use_middle_indices_grid = true;
|
||||
bool cross_attention_adaln = false;
|
||||
|
||||
bool use_caption_projection = true;
|
||||
bool use_audio_caption_projection = true;
|
||||
bool caption_proj_before_connector = true;
|
||||
bool caption_projection_first_linear = false;
|
||||
|
||||
bool self_attention_gated = false;
|
||||
bool cross_attention_gated = false;
|
||||
|
||||
static std::pair<int64_t, int64_t> infer_attention_layout(int64_t hidden_size,
|
||||
int64_t preferred_heads = -1) {
|
||||
if (preferred_heads > 0 && hidden_size % preferred_heads == 0) {
|
||||
return {preferred_heads, hidden_size / preferred_heads};
|
||||
}
|
||||
const int candidates[] = {128, 96, 80, 64, 48, 40, 32};
|
||||
for (int head_dim : candidates) {
|
||||
if (hidden_size % head_dim == 0) {
|
||||
int64_t heads = hidden_size / head_dim;
|
||||
if (heads >= 8 && heads <= 64) {
|
||||
return {heads, head_dim};
|
||||
}
|
||||
}
|
||||
}
|
||||
return {32, hidden_size / 32};
|
||||
}
|
||||
|
||||
static int64_t infer_gate_heads(const String2TensorStorage& tensor_storage_map,
|
||||
const std::string& bias_name,
|
||||
int64_t fallback_heads) {
|
||||
auto it = tensor_storage_map.find(bias_name);
|
||||
if (it != tensor_storage_map.end()) {
|
||||
return it->second.ne[0];
|
||||
}
|
||||
return fallback_heads;
|
||||
}
|
||||
|
||||
static LTXAVConfig detect_from_weights(const String2TensorStorage& tensor_storage_map, const std::string& prefix) {
|
||||
LTXAVConfig config;
|
||||
auto patchify_proj_iter = tensor_storage_map.find(prefix + ".patchify_proj.weight");
|
||||
if (patchify_proj_iter != tensor_storage_map.end()) {
|
||||
config.in_channels = patchify_proj_iter->second.ne[0];
|
||||
config.hidden_size = patchify_proj_iter->second.ne[1];
|
||||
int64_t video_heads = infer_gate_heads(tensor_storage_map, prefix + ".transformer_blocks.0.attn1.to_gate_logits.bias", 32);
|
||||
auto attn_layout = infer_attention_layout(config.hidden_size, video_heads);
|
||||
config.num_attention_heads = attn_layout.first;
|
||||
config.attention_head_dim = attn_layout.second;
|
||||
}
|
||||
|
||||
auto audio_patchify_proj_iter = tensor_storage_map.find(prefix + ".audio_patchify_proj.weight");
|
||||
if (audio_patchify_proj_iter != tensor_storage_map.end()) {
|
||||
config.audio_in_channels = audio_patchify_proj_iter->second.ne[0];
|
||||
config.audio_hidden_size = audio_patchify_proj_iter->second.ne[1];
|
||||
config.audio_out_channels = config.audio_in_channels;
|
||||
int64_t audio_heads = infer_gate_heads(tensor_storage_map, prefix + ".transformer_blocks.0.audio_attn1.to_gate_logits.bias", 32);
|
||||
auto audio_attn_layout = infer_attention_layout(config.audio_hidden_size, audio_heads);
|
||||
config.audio_num_attention_heads = audio_attn_layout.first;
|
||||
config.audio_attention_head_dim = audio_attn_layout.second;
|
||||
}
|
||||
|
||||
auto proj_out_iter = tensor_storage_map.find(prefix + ".proj_out.weight");
|
||||
if (proj_out_iter != tensor_storage_map.end()) {
|
||||
config.out_channels = proj_out_iter->second.ne[1];
|
||||
}
|
||||
auto audio_proj_out_iter = tensor_storage_map.find(prefix + ".audio_proj_out.weight");
|
||||
if (audio_proj_out_iter != tensor_storage_map.end()) {
|
||||
config.audio_out_channels = audio_proj_out_iter->second.ne[1];
|
||||
}
|
||||
|
||||
auto attn2_iter = tensor_storage_map.find(prefix + ".transformer_blocks.0.attn2.to_k.weight");
|
||||
if (attn2_iter != tensor_storage_map.end()) {
|
||||
config.cross_attention_dim = attn2_iter->second.ne[0];
|
||||
}
|
||||
auto audio_attn2_iter = tensor_storage_map.find(prefix + ".transformer_blocks.0.audio_attn2.to_k.weight");
|
||||
if (audio_attn2_iter != tensor_storage_map.end()) {
|
||||
config.audio_cross_attention_dim = audio_attn2_iter->second.ne[0];
|
||||
}
|
||||
if (tensor_storage_map.find(prefix + ".transformer_blocks.0.prompt_scale_shift_table") != tensor_storage_map.end()) {
|
||||
config.cross_attention_adaln = true;
|
||||
}
|
||||
if (tensor_storage_map.find(prefix + ".transformer_blocks.0.attn1.to_gate_logits.weight") != tensor_storage_map.end() ||
|
||||
tensor_storage_map.find(prefix + ".transformer_blocks.0.audio_attn1.to_gate_logits.weight") != tensor_storage_map.end()) {
|
||||
config.self_attention_gated = true;
|
||||
}
|
||||
if (tensor_storage_map.find(prefix + ".transformer_blocks.0.attn2.to_gate_logits.weight") != tensor_storage_map.end() ||
|
||||
tensor_storage_map.find(prefix + ".transformer_blocks.0.audio_attn2.to_gate_logits.weight") != tensor_storage_map.end()) {
|
||||
config.cross_attention_gated = true;
|
||||
}
|
||||
if (tensor_storage_map.find(prefix + ".caption_projection.linear_1.weight") == tensor_storage_map.end() &&
|
||||
tensor_storage_map.find(prefix + ".caption_projection.linear_2.weight") == tensor_storage_map.end()) {
|
||||
config.use_caption_projection = false;
|
||||
}
|
||||
if (tensor_storage_map.find(prefix + ".audio_caption_projection.linear_1.weight") == tensor_storage_map.end() &&
|
||||
tensor_storage_map.find(prefix + ".audio_caption_projection.linear_2.weight") == tensor_storage_map.end()) {
|
||||
config.use_audio_caption_projection = false;
|
||||
}
|
||||
|
||||
config.num_layers = count_prefix_blocks(tensor_storage_map, prefix + ".", "transformer_blocks.");
|
||||
|
||||
auto connector_iter = tensor_storage_map.find(prefix + ".video_embeddings_connector.transformer_1d_blocks.0.attn1.to_q.weight");
|
||||
if (connector_iter != tensor_storage_map.end()) {
|
||||
config.use_connector = true;
|
||||
config.connector_hidden_size = connector_iter->second.ne[1];
|
||||
int64_t connector_heads = infer_gate_heads(tensor_storage_map,
|
||||
prefix + ".video_embeddings_connector.transformer_1d_blocks.0.attn1.to_gate_logits.bias",
|
||||
32);
|
||||
auto connector_layout = infer_attention_layout(config.connector_hidden_size, connector_heads);
|
||||
config.connector_num_heads = connector_layout.first;
|
||||
config.connector_head_dim = connector_layout.second;
|
||||
config.connector_num_layers = count_prefix_blocks(tensor_storage_map, prefix + ".video_embeddings_connector.", "transformer_1d_blocks.");
|
||||
auto register_iter = tensor_storage_map.find(prefix + ".video_embeddings_connector.learnable_registers");
|
||||
if (register_iter != tensor_storage_map.end()) {
|
||||
config.connector_num_registers = register_iter->second.ne[1];
|
||||
}
|
||||
if (tensor_storage_map.find(prefix + ".video_embeddings_connector.transformer_1d_blocks.0.attn1.to_gate_logits.weight") != tensor_storage_map.end()) {
|
||||
config.connector_apply_gated_attention = true;
|
||||
}
|
||||
}
|
||||
|
||||
auto audio_connector_iter = tensor_storage_map.find(prefix + ".audio_embeddings_connector.transformer_1d_blocks.0.attn1.to_q.weight");
|
||||
if (audio_connector_iter != tensor_storage_map.end()) {
|
||||
config.use_audio_connector = true;
|
||||
config.audio_connector_hidden_size = audio_connector_iter->second.ne[1];
|
||||
int64_t connector_heads = infer_gate_heads(tensor_storage_map,
|
||||
prefix + ".audio_embeddings_connector.transformer_1d_blocks.0.attn1.to_gate_logits.bias",
|
||||
32);
|
||||
auto connector_layout = infer_attention_layout(config.audio_connector_hidden_size, connector_heads);
|
||||
config.audio_connector_num_heads = connector_layout.first;
|
||||
config.audio_connector_head_dim = connector_layout.second;
|
||||
config.audio_connector_num_layers = count_prefix_blocks(tensor_storage_map, prefix + ".audio_embeddings_connector.", "transformer_1d_blocks.");
|
||||
auto register_iter = tensor_storage_map.find(prefix + ".audio_embeddings_connector.learnable_registers");
|
||||
if (register_iter != tensor_storage_map.end()) {
|
||||
config.audio_connector_num_registers = register_iter->second.ne[1];
|
||||
}
|
||||
if (tensor_storage_map.find(prefix + ".audio_embeddings_connector.transformer_1d_blocks.0.attn1.to_gate_logits.weight") != tensor_storage_map.end()) {
|
||||
config.audio_connector_apply_gated_attention = true;
|
||||
}
|
||||
}
|
||||
LOG_DEBUG("ltxav: num_layers = %" PRId64 ", hidden_size = %" PRId64 ", num_attention_heads = %" PRId64 ", audio_hidden_size = %" PRId64 ", audio_num_attention_heads = %" PRId64,
|
||||
config.num_layers,
|
||||
config.hidden_size,
|
||||
config.num_attention_heads,
|
||||
config.audio_hidden_size,
|
||||
config.audio_num_attention_heads);
|
||||
return config;
|
||||
}
|
||||
};
|
||||
|
||||
__STATIC_INLINE__ std::vector<float> generate_freq_grid(float theta,
|
||||
int positional_dims,
|
||||
int dim) {
|
||||
@ -749,63 +943,6 @@ namespace LTXV {
|
||||
}
|
||||
};
|
||||
|
||||
struct LTXAVParams {
|
||||
int64_t in_channels = 128;
|
||||
int64_t out_channels = 128;
|
||||
int64_t hidden_size = 3840;
|
||||
int64_t cross_attention_dim = 4096;
|
||||
int64_t caption_channels = 3840;
|
||||
int64_t num_attention_heads = 30;
|
||||
int64_t attention_head_dim = 128;
|
||||
int64_t num_layers = 28;
|
||||
float positional_embedding_theta = 10000.f;
|
||||
std::vector<int> positional_embedding_max_pos = {20, 2048, 2048};
|
||||
std::tuple<int, int, int> vae_scale_factors = {8, 32, 32};
|
||||
bool causal_temporal_positioning = true;
|
||||
float timestep_scale_multiplier = 1000.f;
|
||||
|
||||
int64_t audio_in_channels = 128;
|
||||
int64_t audio_out_channels = 128;
|
||||
int64_t audio_hidden_size = 2048;
|
||||
int64_t audio_cross_attention_dim = 2048;
|
||||
int64_t audio_num_attention_heads = 32;
|
||||
int64_t audio_attention_head_dim = 64;
|
||||
std::vector<int> audio_positional_embedding_max_pos = {20};
|
||||
float av_ca_timestep_scale_multiplier = 1000.f;
|
||||
int64_t num_audio_channels = 8;
|
||||
int64_t audio_frequency_bins = 16;
|
||||
|
||||
bool use_connector = false;
|
||||
int64_t connector_hidden_size = 3840;
|
||||
int64_t connector_num_heads = 30;
|
||||
int64_t connector_head_dim = 128;
|
||||
int64_t connector_num_layers = 2;
|
||||
int64_t connector_num_registers = 128;
|
||||
bool connector_rope_interleaved = false;
|
||||
bool connector_apply_gated_attention = false;
|
||||
|
||||
bool use_audio_connector = false;
|
||||
int64_t audio_connector_hidden_size = 2048;
|
||||
int64_t audio_connector_num_heads = 32;
|
||||
int64_t audio_connector_head_dim = 64;
|
||||
int64_t audio_connector_num_layers = 2;
|
||||
int64_t audio_connector_num_registers = 128;
|
||||
bool audio_connector_rope_interleaved = false;
|
||||
bool audio_connector_apply_gated_attention = false;
|
||||
|
||||
bool video_rope_interleaved = false;
|
||||
bool use_middle_indices_grid = true;
|
||||
bool cross_attention_adaln = false;
|
||||
|
||||
bool use_caption_projection = true;
|
||||
bool use_audio_caption_projection = true;
|
||||
bool caption_proj_before_connector = true;
|
||||
bool caption_projection_first_linear = false;
|
||||
|
||||
bool self_attention_gated = false;
|
||||
bool cross_attention_gated = false;
|
||||
};
|
||||
|
||||
__STATIC_INLINE__ std::pair<int64_t, int64_t> infer_attention_layout(int64_t hidden_size,
|
||||
int64_t preferred_heads = -1) {
|
||||
if (preferred_heads > 0 && hidden_size % preferred_heads == 0) {
|
||||
@ -1169,92 +1306,92 @@ namespace LTXV {
|
||||
};
|
||||
|
||||
struct LTXAVModelBlock : public GGMLBlock {
|
||||
LTXAVParams cfg;
|
||||
LTXAVConfig config;
|
||||
|
||||
void init_params(ggml_context* ctx,
|
||||
const String2TensorStorage& tensor_storage_map = {},
|
||||
const std::string prefix = "") override {
|
||||
params["scale_shift_table"] = ggml_new_tensor_2d(ctx,
|
||||
get_type(prefix + "scale_shift_table", tensor_storage_map, GGML_TYPE_F32),
|
||||
cfg.hidden_size,
|
||||
config.hidden_size,
|
||||
2);
|
||||
params["audio_scale_shift_table"] = ggml_new_tensor_2d(ctx,
|
||||
get_type(prefix + "audio_scale_shift_table", tensor_storage_map, GGML_TYPE_F32),
|
||||
cfg.audio_hidden_size,
|
||||
config.audio_hidden_size,
|
||||
2);
|
||||
}
|
||||
|
||||
LTXAVModelBlock(const LTXAVParams& params)
|
||||
: cfg(params) {
|
||||
blocks["patchify_proj"] = std::make_shared<Linear>(cfg.in_channels, cfg.hidden_size, true, true);
|
||||
blocks["audio_patchify_proj"] = std::make_shared<Linear>(cfg.audio_in_channels, cfg.audio_hidden_size, true, true);
|
||||
blocks["adaln_single"] = std::make_shared<AdaLayerNormSingle>(cfg.hidden_size, cfg.cross_attention_adaln ? 9 : 6);
|
||||
blocks["audio_adaln_single"] = std::make_shared<AdaLayerNormSingle>(cfg.audio_hidden_size, cfg.cross_attention_adaln ? 9 : 6);
|
||||
if (cfg.cross_attention_adaln) {
|
||||
blocks["prompt_adaln_single"] = std::make_shared<AdaLayerNormSingle>(cfg.hidden_size, 2);
|
||||
blocks["audio_prompt_adaln_single"] = std::make_shared<AdaLayerNormSingle>(cfg.audio_hidden_size, 2);
|
||||
LTXAVModelBlock(const LTXAVConfig& config)
|
||||
: config(config) {
|
||||
blocks["patchify_proj"] = std::make_shared<Linear>(config.in_channels, config.hidden_size, true, true);
|
||||
blocks["audio_patchify_proj"] = std::make_shared<Linear>(config.audio_in_channels, config.audio_hidden_size, true, true);
|
||||
blocks["adaln_single"] = std::make_shared<AdaLayerNormSingle>(config.hidden_size, config.cross_attention_adaln ? 9 : 6);
|
||||
blocks["audio_adaln_single"] = std::make_shared<AdaLayerNormSingle>(config.audio_hidden_size, config.cross_attention_adaln ? 9 : 6);
|
||||
if (config.cross_attention_adaln) {
|
||||
blocks["prompt_adaln_single"] = std::make_shared<AdaLayerNormSingle>(config.hidden_size, 2);
|
||||
blocks["audio_prompt_adaln_single"] = std::make_shared<AdaLayerNormSingle>(config.audio_hidden_size, 2);
|
||||
}
|
||||
blocks["av_ca_video_scale_shift_adaln_single"] = std::make_shared<AdaLayerNormSingle>(cfg.hidden_size, 4);
|
||||
blocks["av_ca_a2v_gate_adaln_single"] = std::make_shared<AdaLayerNormSingle>(cfg.hidden_size, 1);
|
||||
blocks["av_ca_audio_scale_shift_adaln_single"] = std::make_shared<AdaLayerNormSingle>(cfg.audio_hidden_size, 4);
|
||||
blocks["av_ca_v2a_gate_adaln_single"] = std::make_shared<AdaLayerNormSingle>(cfg.audio_hidden_size, 1);
|
||||
blocks["av_ca_video_scale_shift_adaln_single"] = std::make_shared<AdaLayerNormSingle>(config.hidden_size, 4);
|
||||
blocks["av_ca_a2v_gate_adaln_single"] = std::make_shared<AdaLayerNormSingle>(config.hidden_size, 1);
|
||||
blocks["av_ca_audio_scale_shift_adaln_single"] = std::make_shared<AdaLayerNormSingle>(config.audio_hidden_size, 4);
|
||||
blocks["av_ca_v2a_gate_adaln_single"] = std::make_shared<AdaLayerNormSingle>(config.audio_hidden_size, 1);
|
||||
|
||||
if (cfg.use_caption_projection) {
|
||||
if (cfg.caption_proj_before_connector) {
|
||||
if (cfg.caption_projection_first_linear) {
|
||||
blocks["caption_projection"] = std::make_shared<NormSingleLinearTextProjection>(cfg.caption_channels, cfg.hidden_size);
|
||||
if (config.use_caption_projection) {
|
||||
if (config.caption_proj_before_connector) {
|
||||
if (config.caption_projection_first_linear) {
|
||||
blocks["caption_projection"] = std::make_shared<NormSingleLinearTextProjection>(config.caption_channels, config.hidden_size);
|
||||
}
|
||||
} else {
|
||||
blocks["caption_projection"] = std::make_shared<PixArtAlphaTextProjection>(cfg.caption_channels, cfg.hidden_size, cfg.hidden_size);
|
||||
blocks["caption_projection"] = std::make_shared<PixArtAlphaTextProjection>(config.caption_channels, config.hidden_size, config.hidden_size);
|
||||
}
|
||||
}
|
||||
if (cfg.use_audio_caption_projection) {
|
||||
if (cfg.caption_proj_before_connector) {
|
||||
if (cfg.caption_projection_first_linear) {
|
||||
blocks["audio_caption_projection"] = std::make_shared<NormSingleLinearTextProjection>(cfg.caption_channels, cfg.audio_hidden_size);
|
||||
if (config.use_audio_caption_projection) {
|
||||
if (config.caption_proj_before_connector) {
|
||||
if (config.caption_projection_first_linear) {
|
||||
blocks["audio_caption_projection"] = std::make_shared<NormSingleLinearTextProjection>(config.caption_channels, config.audio_hidden_size);
|
||||
}
|
||||
} else {
|
||||
blocks["audio_caption_projection"] = std::make_shared<PixArtAlphaTextProjection>(cfg.caption_channels, cfg.audio_hidden_size, cfg.audio_hidden_size);
|
||||
blocks["audio_caption_projection"] = std::make_shared<PixArtAlphaTextProjection>(config.caption_channels, config.audio_hidden_size, config.audio_hidden_size);
|
||||
}
|
||||
}
|
||||
|
||||
if (cfg.use_connector) {
|
||||
blocks["video_embeddings_connector"] = std::make_shared<Embeddings1DConnector>(cfg.connector_hidden_size,
|
||||
cfg.connector_num_heads,
|
||||
cfg.connector_head_dim,
|
||||
cfg.connector_num_layers,
|
||||
cfg.connector_num_registers,
|
||||
cfg.connector_rope_interleaved,
|
||||
cfg.connector_apply_gated_attention);
|
||||
if (config.use_connector) {
|
||||
blocks["video_embeddings_connector"] = std::make_shared<Embeddings1DConnector>(config.connector_hidden_size,
|
||||
config.connector_num_heads,
|
||||
config.connector_head_dim,
|
||||
config.connector_num_layers,
|
||||
config.connector_num_registers,
|
||||
config.connector_rope_interleaved,
|
||||
config.connector_apply_gated_attention);
|
||||
}
|
||||
if (cfg.use_audio_connector) {
|
||||
blocks["audio_embeddings_connector"] = std::make_shared<Embeddings1DConnector>(cfg.audio_connector_hidden_size,
|
||||
cfg.audio_connector_num_heads,
|
||||
cfg.audio_connector_head_dim,
|
||||
cfg.audio_connector_num_layers,
|
||||
cfg.audio_connector_num_registers,
|
||||
cfg.audio_connector_rope_interleaved,
|
||||
cfg.audio_connector_apply_gated_attention);
|
||||
if (config.use_audio_connector) {
|
||||
blocks["audio_embeddings_connector"] = std::make_shared<Embeddings1DConnector>(config.audio_connector_hidden_size,
|
||||
config.audio_connector_num_heads,
|
||||
config.audio_connector_head_dim,
|
||||
config.audio_connector_num_layers,
|
||||
config.audio_connector_num_registers,
|
||||
config.audio_connector_rope_interleaved,
|
||||
config.audio_connector_apply_gated_attention);
|
||||
}
|
||||
|
||||
for (int i = 0; i < cfg.num_layers; i++) {
|
||||
blocks["transformer_blocks." + std::to_string(i)] = std::make_shared<BasicAVTransformerBlock>(cfg.hidden_size,
|
||||
cfg.audio_hidden_size,
|
||||
cfg.num_attention_heads,
|
||||
cfg.audio_num_attention_heads,
|
||||
cfg.attention_head_dim,
|
||||
cfg.audio_attention_head_dim,
|
||||
cfg.cross_attention_dim,
|
||||
cfg.audio_cross_attention_dim,
|
||||
cfg.self_attention_gated || cfg.cross_attention_gated,
|
||||
cfg.cross_attention_adaln,
|
||||
cfg.video_rope_interleaved);
|
||||
for (int i = 0; i < config.num_layers; i++) {
|
||||
blocks["transformer_blocks." + std::to_string(i)] = std::make_shared<BasicAVTransformerBlock>(config.hidden_size,
|
||||
config.audio_hidden_size,
|
||||
config.num_attention_heads,
|
||||
config.audio_num_attention_heads,
|
||||
config.attention_head_dim,
|
||||
config.audio_attention_head_dim,
|
||||
config.cross_attention_dim,
|
||||
config.audio_cross_attention_dim,
|
||||
config.self_attention_gated || config.cross_attention_gated,
|
||||
config.cross_attention_adaln,
|
||||
config.video_rope_interleaved);
|
||||
}
|
||||
|
||||
blocks["norm_out"] = std::make_shared<LayerNorm>(cfg.hidden_size, 1e-6f, false);
|
||||
blocks["proj_out"] = std::make_shared<Linear>(cfg.hidden_size, cfg.out_channels, true, true);
|
||||
blocks["audio_norm_out"] = std::make_shared<LayerNorm>(cfg.audio_hidden_size, 1e-6f, false);
|
||||
blocks["audio_proj_out"] = std::make_shared<Linear>(cfg.audio_hidden_size, cfg.audio_out_channels, true, true);
|
||||
blocks["norm_out"] = std::make_shared<LayerNorm>(config.hidden_size, 1e-6f, false);
|
||||
blocks["proj_out"] = std::make_shared<Linear>(config.hidden_size, config.out_channels, true, true);
|
||||
blocks["audio_norm_out"] = std::make_shared<LayerNorm>(config.audio_hidden_size, 1e-6f, false);
|
||||
blocks["audio_proj_out"] = std::make_shared<Linear>(config.audio_hidden_size, config.audio_out_channels, true, true);
|
||||
}
|
||||
|
||||
ggml_tensor* patchify_video(GGMLRunnerContext* ctx, ggml_tensor* x, int64_t n) {
|
||||
@ -1293,8 +1430,8 @@ namespace LTXV {
|
||||
if (ax == nullptr) {
|
||||
return nullptr;
|
||||
}
|
||||
ax = ggml_reshape_4d(ctx->ggml_ctx, ax, cfg.audio_frequency_bins, cfg.num_audio_channels, audio_length, ax->ne[2]); // [b, t, c, f]
|
||||
ax = ggml_cont(ctx->ggml_ctx, ggml_ext_torch_permute(ctx->ggml_ctx, ax, 0, 2, 1, 3)); // [b, c, t, f]
|
||||
ax = ggml_reshape_4d(ctx->ggml_ctx, ax, config.audio_frequency_bins, config.num_audio_channels, audio_length, ax->ne[2]); // [b, t, c, f]
|
||||
ax = ggml_cont(ctx->ggml_ctx, ggml_ext_torch_permute(ctx->ggml_ctx, ax, 0, 2, 1, 3)); // [b, c, t, f]
|
||||
return ax;
|
||||
}
|
||||
|
||||
@ -1308,17 +1445,17 @@ namespace LTXV {
|
||||
}
|
||||
|
||||
bool is_fully_processed_context =
|
||||
context->ne[0] == cfg.cross_attention_dim + cfg.audio_cross_attention_dim &&
|
||||
context->ne[0] == config.cross_attention_dim + config.audio_cross_attention_dim &&
|
||||
context->ne[1] >= 1024;
|
||||
bool is_unprocessed_dual_context =
|
||||
context->ne[0] == cfg.cross_attention_dim + cfg.audio_cross_attention_dim &&
|
||||
context->ne[0] == config.cross_attention_dim + config.audio_cross_attention_dim &&
|
||||
context->ne[1] < 1024;
|
||||
|
||||
if (is_fully_processed_context) {
|
||||
auto v_context = ggml_ext_slice(ctx->ggml_ctx, context, 0, 0, cfg.cross_attention_dim);
|
||||
auto v_context = ggml_ext_slice(ctx->ggml_ctx, context, 0, 0, config.cross_attention_dim);
|
||||
ggml_tensor* a_context = nullptr;
|
||||
if (process_audio_context) {
|
||||
a_context = ggml_ext_slice(ctx->ggml_ctx, context, 0, cfg.cross_attention_dim, cfg.cross_attention_dim + cfg.audio_cross_attention_dim);
|
||||
a_context = ggml_ext_slice(ctx->ggml_ctx, context, 0, config.cross_attention_dim, config.cross_attention_dim + config.audio_cross_attention_dim);
|
||||
}
|
||||
return {v_context, a_context};
|
||||
}
|
||||
@ -1326,32 +1463,32 @@ namespace LTXV {
|
||||
ggml_tensor* v_context = context;
|
||||
ggml_tensor* a_context = process_audio_context ? context : nullptr;
|
||||
if (is_unprocessed_dual_context) {
|
||||
v_context = ggml_ext_slice(ctx->ggml_ctx, context, 0, 0, cfg.cross_attention_dim);
|
||||
v_context = ggml_ext_slice(ctx->ggml_ctx, context, 0, 0, config.cross_attention_dim);
|
||||
if (process_audio_context) {
|
||||
a_context = ggml_ext_slice(ctx->ggml_ctx, context, 0, cfg.cross_attention_dim, cfg.cross_attention_dim + cfg.audio_cross_attention_dim);
|
||||
a_context = ggml_ext_slice(ctx->ggml_ctx, context, 0, config.cross_attention_dim, config.cross_attention_dim + config.audio_cross_attention_dim);
|
||||
}
|
||||
} else if (context->ne[0] == cfg.caption_channels * 2) {
|
||||
v_context = ggml_ext_slice(ctx->ggml_ctx, context, 0, 0, cfg.caption_channels);
|
||||
} else if (context->ne[0] == config.caption_channels * 2) {
|
||||
v_context = ggml_ext_slice(ctx->ggml_ctx, context, 0, 0, config.caption_channels);
|
||||
if (process_audio_context) {
|
||||
a_context = ggml_ext_slice(ctx->ggml_ctx, context, 0, cfg.caption_channels, cfg.caption_channels * 2);
|
||||
a_context = ggml_ext_slice(ctx->ggml_ctx, context, 0, config.caption_channels, config.caption_channels * 2);
|
||||
}
|
||||
}
|
||||
|
||||
if (cfg.caption_proj_before_connector) {
|
||||
if (cfg.use_caption_projection &&
|
||||
if (config.caption_proj_before_connector) {
|
||||
if (config.use_caption_projection &&
|
||||
blocks.count("caption_projection") > 0 &&
|
||||
v_context != nullptr &&
|
||||
v_context->ne[0] == cfg.caption_channels) {
|
||||
v_context->ne[0] == config.caption_channels) {
|
||||
auto caption_projection = std::dynamic_pointer_cast<NormSingleLinearTextProjection>(blocks["caption_projection"]);
|
||||
if (caption_projection != nullptr) {
|
||||
v_context = caption_projection->forward(ctx, v_context);
|
||||
}
|
||||
}
|
||||
if (process_audio_context &&
|
||||
cfg.use_audio_caption_projection &&
|
||||
config.use_audio_caption_projection &&
|
||||
blocks.count("audio_caption_projection") > 0 &&
|
||||
a_context != nullptr &&
|
||||
a_context->ne[0] == cfg.caption_channels) {
|
||||
a_context->ne[0] == config.caption_channels) {
|
||||
auto caption_projection = std::dynamic_pointer_cast<NormSingleLinearTextProjection>(blocks["audio_caption_projection"]);
|
||||
if (caption_projection != nullptr) {
|
||||
a_context = caption_projection->forward(ctx, a_context);
|
||||
@ -1359,34 +1496,34 @@ namespace LTXV {
|
||||
}
|
||||
}
|
||||
|
||||
if (cfg.use_connector && v_context != nullptr && v_context->ne[0] == cfg.connector_hidden_size) {
|
||||
if (config.use_connector && v_context != nullptr && v_context->ne[0] == config.connector_hidden_size) {
|
||||
auto connector = std::dynamic_pointer_cast<Embeddings1DConnector>(blocks["video_embeddings_connector"]);
|
||||
v_context = connector->forward(ctx, v_context, video_connector_pe);
|
||||
}
|
||||
if (process_audio_context &&
|
||||
cfg.use_audio_connector &&
|
||||
config.use_audio_connector &&
|
||||
a_context != nullptr &&
|
||||
a_context->ne[0] == cfg.audio_connector_hidden_size) {
|
||||
a_context->ne[0] == config.audio_connector_hidden_size) {
|
||||
auto connector = std::dynamic_pointer_cast<Embeddings1DConnector>(blocks["audio_embeddings_connector"]);
|
||||
a_context = connector->forward(ctx, a_context, audio_connector_pe);
|
||||
}
|
||||
|
||||
if (!cfg.caption_proj_before_connector &&
|
||||
cfg.use_caption_projection &&
|
||||
if (!config.caption_proj_before_connector &&
|
||||
config.use_caption_projection &&
|
||||
blocks.count("caption_projection") > 0 &&
|
||||
v_context != nullptr &&
|
||||
v_context->ne[0] == cfg.caption_channels) {
|
||||
v_context->ne[0] == config.caption_channels) {
|
||||
auto caption_projection = std::dynamic_pointer_cast<PixArtAlphaTextProjection>(blocks["caption_projection"]);
|
||||
if (caption_projection != nullptr) {
|
||||
v_context = caption_projection->forward(ctx, v_context);
|
||||
}
|
||||
}
|
||||
if (process_audio_context &&
|
||||
!cfg.caption_proj_before_connector &&
|
||||
cfg.use_audio_caption_projection &&
|
||||
!config.caption_proj_before_connector &&
|
||||
config.use_audio_caption_projection &&
|
||||
blocks.count("audio_caption_projection") > 0 &&
|
||||
a_context != nullptr &&
|
||||
a_context->ne[0] == cfg.caption_channels) {
|
||||
a_context->ne[0] == config.caption_channels) {
|
||||
auto caption_projection = std::dynamic_pointer_cast<PixArtAlphaTextProjection>(blocks["audio_caption_projection"]);
|
||||
if (caption_projection != nullptr) {
|
||||
a_context = caption_projection->forward(ctx, a_context);
|
||||
@ -1428,8 +1565,8 @@ namespace LTXV {
|
||||
auto audio_norm_out = std::dynamic_pointer_cast<LayerNorm>(blocks["audio_norm_out"]);
|
||||
auto audio_proj_out = std::dynamic_pointer_cast<Linear>(blocks["audio_proj_out"]);
|
||||
|
||||
GGML_ASSERT(vx->ne[3] % cfg.in_channels == 0);
|
||||
int64_t n = vx->ne[3] / cfg.in_channels;
|
||||
GGML_ASSERT(vx->ne[3] % config.in_channels == 0);
|
||||
int64_t n = vx->ne[3] / config.in_channels;
|
||||
int64_t width = vx->ne[0];
|
||||
int64_t height = vx->ne[1];
|
||||
int64_t frames = vx->ne[2];
|
||||
@ -1452,20 +1589,20 @@ namespace LTXV {
|
||||
a_context = ggml_cont(ctx->ggml_ctx, a_context);
|
||||
}
|
||||
|
||||
auto v_timestep_scaled = ggml_ext_scale(ctx->ggml_ctx, timestep, cfg.timestep_scale_multiplier);
|
||||
auto v_timestep_scaled = ggml_ext_scale(ctx->ggml_ctx, timestep, config.timestep_scale_multiplier);
|
||||
auto v_pair = adaln_single->forward(ctx, v_timestep_scaled);
|
||||
auto v_timestep_mod = v_pair.first;
|
||||
auto v_embedded_time = v_pair.second;
|
||||
|
||||
ggml_tensor* effective_audio_timestep = audio_timestep != nullptr ? audio_timestep : timestep;
|
||||
auto a_timestep_scaled = ggml_ext_scale(ctx->ggml_ctx, effective_audio_timestep, cfg.timestep_scale_multiplier);
|
||||
auto a_timestep_scaled = ggml_ext_scale(ctx->ggml_ctx, effective_audio_timestep, config.timestep_scale_multiplier);
|
||||
auto a_pair = audio_adaln_single->forward(ctx, a_timestep_scaled);
|
||||
auto a_timestep_mod = a_pair.first;
|
||||
auto a_embedded_time = a_pair.second;
|
||||
|
||||
ggml_tensor* v_prompt_timestep_mod = nullptr;
|
||||
ggml_tensor* a_prompt_timestep_mod = nullptr;
|
||||
if (cfg.cross_attention_adaln) {
|
||||
if (config.cross_attention_adaln) {
|
||||
auto prompt_adaln_single = std::dynamic_pointer_cast<AdaLayerNormSingle>(blocks["prompt_adaln_single"]);
|
||||
auto audio_prompt_adaln_single = std::dynamic_pointer_cast<AdaLayerNormSingle>(blocks["audio_prompt_adaln_single"]);
|
||||
v_prompt_timestep_mod = prompt_adaln_single->forward(ctx, a_timestep_scaled).first;
|
||||
@ -1474,7 +1611,7 @@ namespace LTXV {
|
||||
|
||||
auto av_ca_video_timestep = repeat_scalar_timestep_like(ctx, effective_audio_timestep, timestep);
|
||||
auto av_ca_audio_timestep = effective_audio_timestep;
|
||||
auto av_ca_factor = cfg.av_ca_timestep_scale_multiplier / cfg.timestep_scale_multiplier;
|
||||
auto av_ca_factor = config.av_ca_timestep_scale_multiplier / config.timestep_scale_multiplier;
|
||||
auto av_ca_video_scale_shift_timestep =
|
||||
std::dynamic_pointer_cast<AdaLayerNormSingle>(blocks["av_ca_video_scale_shift_adaln_single"])->forward(ctx, av_ca_video_timestep).first;
|
||||
auto av_ca_a2v_gate_noise_timestep =
|
||||
@ -1491,7 +1628,7 @@ namespace LTXV {
|
||||
sd::ggml_graph_cut::mark_graph_cut(vx, "ltxav.prelude", "vx");
|
||||
sd::ggml_graph_cut::mark_graph_cut(ax, "ltxav.prelude", "ax");
|
||||
|
||||
for (int i = 0; i < cfg.num_layers; i++) {
|
||||
for (int i = 0; i < config.num_layers; i++) {
|
||||
auto block = std::dynamic_pointer_cast<BasicAVTransformerBlock>(blocks["transformer_blocks." + std::to_string(i)]);
|
||||
auto out = block->forward(ctx,
|
||||
vx,
|
||||
@ -1517,14 +1654,14 @@ namespace LTXV {
|
||||
sd::ggml_graph_cut::mark_graph_cut(ax, "ltxav.transformer_blocks." + std::to_string(i), "ax");
|
||||
}
|
||||
|
||||
auto v_shift_scale = get_output_scale_shift(ctx, params["scale_shift_table"], v_embedded_time, cfg.hidden_size);
|
||||
auto v_shift_scale = get_output_scale_shift(ctx, params["scale_shift_table"], v_embedded_time, config.hidden_size);
|
||||
vx = norm_out->forward(ctx, vx);
|
||||
vx = modulate(ctx->ggml_ctx, vx, v_shift_scale[0], v_shift_scale[1]);
|
||||
vx = proj_out->forward(ctx, vx);
|
||||
vx = unpatchify_video(ctx, vx, width, height, frames);
|
||||
|
||||
if (ax != nullptr && audio_time > 0) {
|
||||
auto a_shift_scale = get_output_scale_shift(ctx, params["audio_scale_shift_table"], a_embedded_time, cfg.audio_hidden_size);
|
||||
auto a_shift_scale = get_output_scale_shift(ctx, params["audio_scale_shift_table"], a_embedded_time, config.audio_hidden_size);
|
||||
ax = audio_norm_out->forward(ctx, ax);
|
||||
ax = modulate(ctx->ggml_ctx, ax, a_shift_scale[0], a_shift_scale[1]);
|
||||
ax = audio_proj_out->forward(ctx, ax);
|
||||
@ -1536,7 +1673,7 @@ namespace LTXV {
|
||||
};
|
||||
|
||||
struct LTXAVRunner : public DiffusionModelRunner {
|
||||
LTXAVParams params;
|
||||
LTXAVConfig config;
|
||||
LTXAVModelBlock model;
|
||||
std::vector<float> video_pe_vec;
|
||||
std::vector<float> audio_pe_vec;
|
||||
@ -1547,124 +1684,13 @@ namespace LTXV {
|
||||
sd::Tensor<float> vx_input_cache;
|
||||
sd::Tensor<float> ax_input_cache;
|
||||
|
||||
static int64_t infer_gate_heads(const String2TensorStorage& tensor_storage_map,
|
||||
const std::string& bias_name,
|
||||
int64_t fallback_heads) {
|
||||
auto it = tensor_storage_map.find(bias_name);
|
||||
if (it != tensor_storage_map.end()) {
|
||||
return it->second.ne[0];
|
||||
}
|
||||
return fallback_heads;
|
||||
}
|
||||
|
||||
LTXAVRunner(ggml_backend_t backend,
|
||||
ggml_backend_t params_backend,
|
||||
const String2TensorStorage& tensor_storage_map = {},
|
||||
const std::string& prefix = "model.diffusion_model")
|
||||
: DiffusionModelRunner(backend, params_backend, prefix),
|
||||
params(),
|
||||
model(params) {
|
||||
auto patchify_proj_iter = tensor_storage_map.find(prefix + ".patchify_proj.weight");
|
||||
if (patchify_proj_iter != tensor_storage_map.end()) {
|
||||
params.in_channels = patchify_proj_iter->second.ne[0];
|
||||
params.hidden_size = patchify_proj_iter->second.ne[1];
|
||||
int64_t video_heads = infer_gate_heads(tensor_storage_map, prefix + ".transformer_blocks.0.attn1.to_gate_logits.bias", 32);
|
||||
auto attn_layout = infer_attention_layout(params.hidden_size, video_heads);
|
||||
params.num_attention_heads = attn_layout.first;
|
||||
params.attention_head_dim = attn_layout.second;
|
||||
}
|
||||
|
||||
auto audio_patchify_proj_iter = tensor_storage_map.find(prefix + ".audio_patchify_proj.weight");
|
||||
if (audio_patchify_proj_iter != tensor_storage_map.end()) {
|
||||
params.audio_in_channels = audio_patchify_proj_iter->second.ne[0];
|
||||
params.audio_hidden_size = audio_patchify_proj_iter->second.ne[1];
|
||||
params.audio_out_channels = params.audio_in_channels;
|
||||
int64_t audio_heads = infer_gate_heads(tensor_storage_map, prefix + ".transformer_blocks.0.audio_attn1.to_gate_logits.bias", 32);
|
||||
auto audio_attn_layout = infer_attention_layout(params.audio_hidden_size, audio_heads);
|
||||
params.audio_num_attention_heads = audio_attn_layout.first;
|
||||
params.audio_attention_head_dim = audio_attn_layout.second;
|
||||
}
|
||||
|
||||
auto proj_out_iter = tensor_storage_map.find(prefix + ".proj_out.weight");
|
||||
if (proj_out_iter != tensor_storage_map.end()) {
|
||||
params.out_channels = proj_out_iter->second.ne[1];
|
||||
}
|
||||
auto audio_proj_out_iter = tensor_storage_map.find(prefix + ".audio_proj_out.weight");
|
||||
if (audio_proj_out_iter != tensor_storage_map.end()) {
|
||||
params.audio_out_channels = audio_proj_out_iter->second.ne[1];
|
||||
}
|
||||
|
||||
auto attn2_iter = tensor_storage_map.find(prefix + ".transformer_blocks.0.attn2.to_k.weight");
|
||||
if (attn2_iter != tensor_storage_map.end()) {
|
||||
params.cross_attention_dim = attn2_iter->second.ne[0];
|
||||
}
|
||||
auto audio_attn2_iter = tensor_storage_map.find(prefix + ".transformer_blocks.0.audio_attn2.to_k.weight");
|
||||
if (audio_attn2_iter != tensor_storage_map.end()) {
|
||||
params.audio_cross_attention_dim = audio_attn2_iter->second.ne[0];
|
||||
}
|
||||
if (tensor_storage_map.find(prefix + ".transformer_blocks.0.prompt_scale_shift_table") != tensor_storage_map.end()) {
|
||||
params.cross_attention_adaln = true;
|
||||
}
|
||||
if (tensor_storage_map.find(prefix + ".transformer_blocks.0.attn1.to_gate_logits.weight") != tensor_storage_map.end() ||
|
||||
tensor_storage_map.find(prefix + ".transformer_blocks.0.audio_attn1.to_gate_logits.weight") != tensor_storage_map.end()) {
|
||||
params.self_attention_gated = true;
|
||||
}
|
||||
if (tensor_storage_map.find(prefix + ".transformer_blocks.0.attn2.to_gate_logits.weight") != tensor_storage_map.end() ||
|
||||
tensor_storage_map.find(prefix + ".transformer_blocks.0.audio_attn2.to_gate_logits.weight") != tensor_storage_map.end()) {
|
||||
params.cross_attention_gated = true;
|
||||
}
|
||||
if (tensor_storage_map.find(prefix + ".caption_projection.linear_1.weight") == tensor_storage_map.end() &&
|
||||
tensor_storage_map.find(prefix + ".caption_projection.linear_2.weight") == tensor_storage_map.end()) {
|
||||
params.use_caption_projection = false;
|
||||
}
|
||||
if (tensor_storage_map.find(prefix + ".audio_caption_projection.linear_1.weight") == tensor_storage_map.end() &&
|
||||
tensor_storage_map.find(prefix + ".audio_caption_projection.linear_2.weight") == tensor_storage_map.end()) {
|
||||
params.use_audio_caption_projection = false;
|
||||
}
|
||||
|
||||
params.num_layers = count_prefix_blocks(tensor_storage_map, prefix + ".", "transformer_blocks.");
|
||||
|
||||
auto connector_iter = tensor_storage_map.find(prefix + ".video_embeddings_connector.transformer_1d_blocks.0.attn1.to_q.weight");
|
||||
if (connector_iter != tensor_storage_map.end()) {
|
||||
params.use_connector = true;
|
||||
params.connector_hidden_size = connector_iter->second.ne[1];
|
||||
int64_t connector_heads = infer_gate_heads(tensor_storage_map,
|
||||
prefix + ".video_embeddings_connector.transformer_1d_blocks.0.attn1.to_gate_logits.bias",
|
||||
32);
|
||||
auto connector_layout = infer_attention_layout(params.connector_hidden_size, connector_heads);
|
||||
params.connector_num_heads = connector_layout.first;
|
||||
params.connector_head_dim = connector_layout.second;
|
||||
params.connector_num_layers = count_prefix_blocks(tensor_storage_map, prefix + ".video_embeddings_connector.", "transformer_1d_blocks.");
|
||||
auto register_iter = tensor_storage_map.find(prefix + ".video_embeddings_connector.learnable_registers");
|
||||
if (register_iter != tensor_storage_map.end()) {
|
||||
params.connector_num_registers = register_iter->second.ne[1];
|
||||
}
|
||||
if (tensor_storage_map.find(prefix + ".video_embeddings_connector.transformer_1d_blocks.0.attn1.to_gate_logits.weight") != tensor_storage_map.end()) {
|
||||
params.connector_apply_gated_attention = true;
|
||||
}
|
||||
}
|
||||
|
||||
auto audio_connector_iter = tensor_storage_map.find(prefix + ".audio_embeddings_connector.transformer_1d_blocks.0.attn1.to_q.weight");
|
||||
if (audio_connector_iter != tensor_storage_map.end()) {
|
||||
params.use_audio_connector = true;
|
||||
params.audio_connector_hidden_size = audio_connector_iter->second.ne[1];
|
||||
int64_t connector_heads = infer_gate_heads(tensor_storage_map,
|
||||
prefix + ".audio_embeddings_connector.transformer_1d_blocks.0.attn1.to_gate_logits.bias",
|
||||
32);
|
||||
auto connector_layout = infer_attention_layout(params.audio_connector_hidden_size, connector_heads);
|
||||
params.audio_connector_num_heads = connector_layout.first;
|
||||
params.audio_connector_head_dim = connector_layout.second;
|
||||
params.audio_connector_num_layers = count_prefix_blocks(tensor_storage_map, prefix + ".audio_embeddings_connector.", "transformer_1d_blocks.");
|
||||
auto register_iter = tensor_storage_map.find(prefix + ".audio_embeddings_connector.learnable_registers");
|
||||
if (register_iter != tensor_storage_map.end()) {
|
||||
params.audio_connector_num_registers = register_iter->second.ne[1];
|
||||
}
|
||||
if (tensor_storage_map.find(prefix + ".audio_embeddings_connector.transformer_1d_blocks.0.attn1.to_gate_logits.weight") != tensor_storage_map.end()) {
|
||||
params.audio_connector_apply_gated_attention = true;
|
||||
}
|
||||
}
|
||||
|
||||
model = LTXAVModelBlock(params);
|
||||
config(LTXAVConfig::detect_from_weights(tensor_storage_map, prefix)),
|
||||
model(config) {
|
||||
model.init(params_ctx, tensor_storage_map, prefix);
|
||||
}
|
||||
|
||||
@ -1692,21 +1718,21 @@ namespace LTXV {
|
||||
int64_t total_channels = x_tensor.shape()[3];
|
||||
int64_t spatial_size = width * height * frames;
|
||||
|
||||
GGML_ASSERT(total_channels >= params.in_channels);
|
||||
GGML_ASSERT(total_channels >= config.in_channels);
|
||||
|
||||
sd::Tensor<float> vx({width, height, frames, params.in_channels});
|
||||
size_t video_values = static_cast<size_t>(params.in_channels * spatial_size);
|
||||
sd::Tensor<float> vx({width, height, frames, config.in_channels});
|
||||
size_t video_values = static_cast<size_t>(config.in_channels * spatial_size);
|
||||
std::copy_n(x_tensor.data(), video_values, vx.data());
|
||||
|
||||
if (audio_length <= 0 || total_channels == params.in_channels) {
|
||||
if (audio_length <= 0 || total_channels == config.in_channels) {
|
||||
return {vx, {}};
|
||||
}
|
||||
|
||||
int64_t needed_audio_values = static_cast<int64_t>(audio_length) * params.num_audio_channels * params.audio_frequency_bins;
|
||||
int64_t packed_audio_values = (total_channels - params.in_channels) * spatial_size;
|
||||
int64_t needed_audio_values = static_cast<int64_t>(audio_length) * config.num_audio_channels * config.audio_frequency_bins;
|
||||
int64_t packed_audio_values = (total_channels - config.in_channels) * spatial_size;
|
||||
GGML_ASSERT(packed_audio_values >= needed_audio_values);
|
||||
|
||||
sd::Tensor<float> ax({params.audio_frequency_bins, audio_length, params.num_audio_channels, 1});
|
||||
sd::Tensor<float> ax({config.audio_frequency_bins, audio_length, config.num_audio_channels, 1});
|
||||
const float* audio_src = x_tensor.data() + video_values;
|
||||
std::copy_n(audio_src, static_cast<size_t>(needed_audio_values), ax.data());
|
||||
return {vx, ax};
|
||||
@ -1767,25 +1793,25 @@ namespace LTXV {
|
||||
if (has_video_positions) {
|
||||
GGML_ASSERT(video_positions_tensor.shape()[2] == video_token_count);
|
||||
video_pe_vec = build_video_rope_matrix_from_positions(video_positions_tensor,
|
||||
static_cast<int>(params.hidden_size),
|
||||
static_cast<int>(params.num_attention_heads),
|
||||
params.positional_embedding_theta,
|
||||
params.positional_embedding_max_pos,
|
||||
params.use_middle_indices_grid);
|
||||
static_cast<int>(config.hidden_size),
|
||||
static_cast<int>(config.num_attention_heads),
|
||||
config.positional_embedding_theta,
|
||||
config.positional_embedding_max_pos,
|
||||
config.use_middle_indices_grid);
|
||||
} else {
|
||||
video_pe_vec = build_video_rope_matrix(vx->ne[0],
|
||||
vx->ne[1],
|
||||
vx->ne[2],
|
||||
static_cast<int>(params.hidden_size),
|
||||
static_cast<int>(params.num_attention_heads),
|
||||
static_cast<int>(config.hidden_size),
|
||||
static_cast<int>(config.num_attention_heads),
|
||||
video_frame_rate,
|
||||
params.positional_embedding_theta,
|
||||
params.positional_embedding_max_pos,
|
||||
params.vae_scale_factors,
|
||||
params.causal_temporal_positioning,
|
||||
params.use_middle_indices_grid);
|
||||
config.positional_embedding_theta,
|
||||
config.positional_embedding_max_pos,
|
||||
config.vae_scale_factors,
|
||||
config.causal_temporal_positioning,
|
||||
config.use_middle_indices_grid);
|
||||
}
|
||||
auto video_pe = ggml_new_tensor_4d(compute_ctx, GGML_TYPE_F32, 2, 2, params.attention_head_dim / 2, video_token_count * params.num_attention_heads);
|
||||
auto video_pe = ggml_new_tensor_4d(compute_ctx, GGML_TYPE_F32, 2, 2, config.attention_head_dim / 2, video_token_count * config.num_attention_heads);
|
||||
ggml_set_name(video_pe, "ltxav_video_pe");
|
||||
set_backend_tensor_data(video_pe, video_pe_vec.data());
|
||||
|
||||
@ -1794,66 +1820,66 @@ namespace LTXV {
|
||||
ggml_tensor* audio_cross_pe = nullptr;
|
||||
if (ax != nullptr && ggml_nelements(ax) > 0 && ax->ne[1] > 0) {
|
||||
audio_pe_vec = build_audio_rope_matrix(ax->ne[1],
|
||||
static_cast<int>(params.audio_hidden_size),
|
||||
static_cast<int>(params.audio_num_attention_heads),
|
||||
params.positional_embedding_theta,
|
||||
params.audio_positional_embedding_max_pos[0],
|
||||
params.use_middle_indices_grid);
|
||||
audio_pe = ggml_new_tensor_4d(compute_ctx, GGML_TYPE_F32, 2, 2, params.audio_attention_head_dim / 2, ax->ne[1] * params.audio_num_attention_heads);
|
||||
static_cast<int>(config.audio_hidden_size),
|
||||
static_cast<int>(config.audio_num_attention_heads),
|
||||
config.positional_embedding_theta,
|
||||
config.audio_positional_embedding_max_pos[0],
|
||||
config.use_middle_indices_grid);
|
||||
audio_pe = ggml_new_tensor_4d(compute_ctx, GGML_TYPE_F32, 2, 2, config.audio_attention_head_dim / 2, ax->ne[1] * config.audio_num_attention_heads);
|
||||
ggml_set_name(audio_pe, "ltxav_audio_pe");
|
||||
set_backend_tensor_data(audio_pe, audio_pe_vec.data());
|
||||
|
||||
int temporal_max_pos = std::max(params.positional_embedding_max_pos[0], params.audio_positional_embedding_max_pos[0]);
|
||||
int temporal_max_pos = std::max(config.positional_embedding_max_pos[0], config.audio_positional_embedding_max_pos[0]);
|
||||
if (has_video_positions) {
|
||||
video_cross_pe_vec = build_video_temporal_rope_matrix_from_positions(video_positions_tensor,
|
||||
static_cast<int>(params.audio_cross_attention_dim),
|
||||
static_cast<int>(params.audio_num_attention_heads),
|
||||
params.positional_embedding_theta,
|
||||
static_cast<int>(config.audio_cross_attention_dim),
|
||||
static_cast<int>(config.audio_num_attention_heads),
|
||||
config.positional_embedding_theta,
|
||||
temporal_max_pos,
|
||||
true);
|
||||
} else {
|
||||
video_cross_pe_vec = build_video_temporal_rope_matrix(vx->ne[0],
|
||||
vx->ne[1],
|
||||
vx->ne[2],
|
||||
static_cast<int>(params.audio_cross_attention_dim),
|
||||
static_cast<int>(params.audio_num_attention_heads),
|
||||
static_cast<int>(config.audio_cross_attention_dim),
|
||||
static_cast<int>(config.audio_num_attention_heads),
|
||||
video_frame_rate,
|
||||
params.positional_embedding_theta,
|
||||
config.positional_embedding_theta,
|
||||
temporal_max_pos,
|
||||
std::get<0>(params.vae_scale_factors),
|
||||
params.causal_temporal_positioning,
|
||||
std::get<0>(config.vae_scale_factors),
|
||||
config.causal_temporal_positioning,
|
||||
true);
|
||||
}
|
||||
video_cross_pe = ggml_new_tensor_4d(compute_ctx, GGML_TYPE_F32, 2, 2, params.audio_attention_head_dim / 2, video_token_count * params.audio_num_attention_heads);
|
||||
video_cross_pe = ggml_new_tensor_4d(compute_ctx, GGML_TYPE_F32, 2, 2, config.audio_attention_head_dim / 2, video_token_count * config.audio_num_attention_heads);
|
||||
ggml_set_name(video_cross_pe, "ltxav_video_cross_pe");
|
||||
set_backend_tensor_data(video_cross_pe, video_cross_pe_vec.data());
|
||||
|
||||
audio_cross_pe_vec = build_audio_rope_matrix(ax->ne[1],
|
||||
static_cast<int>(params.audio_cross_attention_dim),
|
||||
static_cast<int>(params.audio_num_attention_heads),
|
||||
params.positional_embedding_theta,
|
||||
static_cast<int>(config.audio_cross_attention_dim),
|
||||
static_cast<int>(config.audio_num_attention_heads),
|
||||
config.positional_embedding_theta,
|
||||
temporal_max_pos,
|
||||
true);
|
||||
audio_cross_pe = ggml_new_tensor_4d(compute_ctx, GGML_TYPE_F32, 2, 2, params.audio_attention_head_dim / 2, ax->ne[1] * params.audio_num_attention_heads);
|
||||
audio_cross_pe = ggml_new_tensor_4d(compute_ctx, GGML_TYPE_F32, 2, 2, config.audio_attention_head_dim / 2, ax->ne[1] * config.audio_num_attention_heads);
|
||||
ggml_set_name(audio_cross_pe, "ltxav_audio_cross_pe");
|
||||
set_backend_tensor_data(audio_cross_pe, audio_cross_pe_vec.data());
|
||||
}
|
||||
|
||||
bool needs_video_connector_pe =
|
||||
params.use_connector &&
|
||||
config.use_connector &&
|
||||
context != nullptr &&
|
||||
(context->ne[0] == params.connector_hidden_size ||
|
||||
((context->ne[0] == params.cross_attention_dim + params.audio_cross_attention_dim ||
|
||||
context->ne[0] == params.caption_channels * 2) &&
|
||||
(context->ne[0] == config.connector_hidden_size ||
|
||||
((context->ne[0] == config.cross_attention_dim + config.audio_cross_attention_dim ||
|
||||
context->ne[0] == config.caption_channels * 2) &&
|
||||
context->ne[1] < 1024));
|
||||
ggml_tensor* video_connector_pe = nullptr;
|
||||
if (needs_video_connector_pe) {
|
||||
int64_t seq_len = context->ne[1];
|
||||
int64_t target_len = std::max<int64_t>(1024, seq_len);
|
||||
int64_t duplications = (target_len + params.connector_num_registers - 1) / params.connector_num_registers;
|
||||
int64_t full_len = seq_len + duplications * params.connector_num_registers - seq_len;
|
||||
connector_pe_vec = build_1d_rope_matrix(full_len, static_cast<int>(params.connector_hidden_size), static_cast<int>(params.connector_num_heads), 10000.f, 4096.f, true);
|
||||
video_connector_pe = ggml_new_tensor_4d(compute_ctx, GGML_TYPE_F32, 2, 2, params.connector_head_dim / 2, full_len * params.connector_num_heads);
|
||||
int64_t duplications = (target_len + config.connector_num_registers - 1) / config.connector_num_registers;
|
||||
int64_t full_len = seq_len + duplications * config.connector_num_registers - seq_len;
|
||||
connector_pe_vec = build_1d_rope_matrix(full_len, static_cast<int>(config.connector_hidden_size), static_cast<int>(config.connector_num_heads), 10000.f, 4096.f, true);
|
||||
video_connector_pe = ggml_new_tensor_4d(compute_ctx, GGML_TYPE_F32, 2, 2, config.connector_head_dim / 2, full_len * config.connector_num_heads);
|
||||
ggml_set_name(video_connector_pe, "ltxav_video_connector_pe");
|
||||
set_backend_tensor_data(video_connector_pe, connector_pe_vec.data());
|
||||
}
|
||||
@ -1864,20 +1890,20 @@ namespace LTXV {
|
||||
ax->ne[1] > 0;
|
||||
bool needs_audio_connector_pe =
|
||||
run_audio_context &&
|
||||
params.use_audio_connector &&
|
||||
config.use_audio_connector &&
|
||||
context != nullptr &&
|
||||
(context->ne[0] == params.audio_connector_hidden_size ||
|
||||
((context->ne[0] == params.cross_attention_dim + params.audio_cross_attention_dim ||
|
||||
context->ne[0] == params.caption_channels * 2) &&
|
||||
(context->ne[0] == config.audio_connector_hidden_size ||
|
||||
((context->ne[0] == config.cross_attention_dim + config.audio_cross_attention_dim ||
|
||||
context->ne[0] == config.caption_channels * 2) &&
|
||||
context->ne[1] < 1024));
|
||||
ggml_tensor* audio_connector_pe = nullptr;
|
||||
if (needs_audio_connector_pe) {
|
||||
int64_t seq_len = context->ne[1];
|
||||
int64_t target_len = std::max<int64_t>(1024, seq_len);
|
||||
int64_t duplications = (target_len + params.audio_connector_num_registers - 1) / params.audio_connector_num_registers;
|
||||
int64_t full_len = seq_len + duplications * params.audio_connector_num_registers - seq_len;
|
||||
audio_connector_pe_vec = build_1d_rope_matrix(full_len, static_cast<int>(params.audio_connector_hidden_size), static_cast<int>(params.audio_connector_num_heads), 10000.f, 4096.f, true);
|
||||
audio_connector_pe = ggml_new_tensor_4d(compute_ctx, GGML_TYPE_F32, 2, 2, params.audio_connector_head_dim / 2, full_len * params.audio_connector_num_heads);
|
||||
int64_t duplications = (target_len + config.audio_connector_num_registers - 1) / config.audio_connector_num_registers;
|
||||
int64_t full_len = seq_len + duplications * config.audio_connector_num_registers - seq_len;
|
||||
audio_connector_pe_vec = build_1d_rope_matrix(full_len, static_cast<int>(config.audio_connector_hidden_size), static_cast<int>(config.audio_connector_num_heads), 10000.f, 4096.f, true);
|
||||
audio_connector_pe = ggml_new_tensor_4d(compute_ctx, GGML_TYPE_F32, 2, 2, config.audio_connector_head_dim / 2, full_len * config.audio_connector_num_heads);
|
||||
ggml_set_name(audio_connector_pe, "ltxav_audio_connector_pe");
|
||||
set_backend_tensor_data(audio_connector_pe, audio_connector_pe_vec.data());
|
||||
}
|
||||
@ -2036,4 +2062,4 @@ namespace LTXV {
|
||||
|
||||
}; // namespace LTXV
|
||||
|
||||
#endif
|
||||
#endif // __SD_MODEL_DIFFUSION_LTXV_HPP__
|
||||
@ -1,42 +1,137 @@
|
||||
#ifndef __MMDIT_HPP__
|
||||
#define __MMDIT_HPP__
|
||||
#ifndef __SD_MODEL_DIFFUSION_MMDIT_HPP__
|
||||
#define __SD_MODEL_DIFFUSION_MMDIT_HPP__
|
||||
|
||||
#include <algorithm>
|
||||
#include <memory>
|
||||
#include <string>
|
||||
#include <vector>
|
||||
|
||||
#include "diffusion_model.hpp"
|
||||
#include "ggml_extend.hpp"
|
||||
#include "core/ggml_extend.hpp"
|
||||
#include "model.h"
|
||||
#include "model/common/block.hpp"
|
||||
#include "model/diffusion/model.hpp"
|
||||
|
||||
#define MMDIT_GRAPH_SIZE 10240
|
||||
|
||||
struct Mlp : public GGMLBlock {
|
||||
public:
|
||||
Mlp(int64_t in_features,
|
||||
int64_t hidden_features = -1,
|
||||
int64_t out_features = -1,
|
||||
bool bias = true) {
|
||||
// act_layer is always lambda: nn.GELU(approximate="tanh")
|
||||
// norm_layer is always None
|
||||
// use_conv is always False
|
||||
if (hidden_features == -1) {
|
||||
hidden_features = in_features;
|
||||
}
|
||||
if (out_features == -1) {
|
||||
out_features = in_features;
|
||||
}
|
||||
blocks["fc1"] = std::shared_ptr<GGMLBlock>(new Linear(in_features, hidden_features, bias));
|
||||
blocks["fc2"] = std::shared_ptr<GGMLBlock>(new Linear(hidden_features, out_features, bias));
|
||||
}
|
||||
struct MMDiTConfig {
|
||||
int64_t input_size = -1;
|
||||
int patch_size = 2;
|
||||
int64_t in_channels = 16;
|
||||
int64_t d_self = -1; // >=0 for MMdiT-X
|
||||
int64_t depth = 24;
|
||||
float mlp_ratio = 4.0f;
|
||||
int64_t adm_in_channels = 2048;
|
||||
int64_t out_channels = 16;
|
||||
int64_t pos_embed_max_size = 192;
|
||||
int64_t num_patches = 36864; // 192 * 192
|
||||
int64_t context_size = 4096;
|
||||
int64_t context_embedder_out_dim = 1536;
|
||||
int64_t hidden_size = 1536;
|
||||
std::string qk_norm;
|
||||
|
||||
ggml_tensor* forward(GGMLRunnerContext* ctx, ggml_tensor* x) {
|
||||
// x: [N, n_token, in_features]
|
||||
auto fc1 = std::dynamic_pointer_cast<Linear>(blocks["fc1"]);
|
||||
auto fc2 = std::dynamic_pointer_cast<Linear>(blocks["fc2"]);
|
||||
static MMDiTConfig detect_from_weights(const String2TensorStorage& tensor_storage_map, const std::string& prefix) {
|
||||
MMDiTConfig config;
|
||||
bool has_weight_config = false;
|
||||
bool has_pos_embed = false;
|
||||
bool has_hidden_size = false;
|
||||
bool has_context_embed = false;
|
||||
|
||||
x = fc1->forward(ctx, x);
|
||||
x = ggml_ext_gelu(ctx->ggml_ctx, x, true);
|
||||
x = fc2->forward(ctx, x);
|
||||
return x;
|
||||
for (const auto& [name, tensor_storage] : tensor_storage_map) {
|
||||
if (!starts_with(name, prefix)) {
|
||||
continue;
|
||||
}
|
||||
|
||||
if (name.find("x_embedder.proj.weight") != std::string::npos && tensor_storage.n_dims == 4) {
|
||||
has_weight_config = true;
|
||||
has_hidden_size = true;
|
||||
config.patch_size = static_cast<int>(tensor_storage.ne[0]);
|
||||
config.in_channels = tensor_storage.ne[2];
|
||||
config.hidden_size = tensor_storage.ne[3];
|
||||
} else if (name.find("t_embedder.mlp.0.weight") != std::string::npos && tensor_storage.n_dims == 2) {
|
||||
has_weight_config = true;
|
||||
has_hidden_size = true;
|
||||
config.hidden_size = tensor_storage.ne[1];
|
||||
} else if (name.find("y_embedder.mlp.0.weight") != std::string::npos && tensor_storage.n_dims == 2) {
|
||||
has_weight_config = true;
|
||||
has_hidden_size = true;
|
||||
config.adm_in_channels = tensor_storage.ne[0];
|
||||
config.hidden_size = tensor_storage.ne[1];
|
||||
} else if (name.find("context_embedder.weight") != std::string::npos && tensor_storage.n_dims == 2) {
|
||||
has_weight_config = true;
|
||||
has_context_embed = true;
|
||||
config.context_size = tensor_storage.ne[0];
|
||||
config.context_embedder_out_dim = tensor_storage.ne[1];
|
||||
} else if (name.find("final_layer.linear.weight") != std::string::npos && tensor_storage.n_dims == 2) {
|
||||
has_weight_config = true;
|
||||
has_hidden_size = true;
|
||||
config.hidden_size = tensor_storage.ne[0];
|
||||
int64_t patch_area = static_cast<int64_t>(config.patch_size) * config.patch_size;
|
||||
if (patch_area > 0) {
|
||||
config.out_channels = tensor_storage.ne[1] / patch_area;
|
||||
}
|
||||
} else if (name.find("pos_embed") != std::string::npos && tensor_storage.n_dims == 3) {
|
||||
has_weight_config = true;
|
||||
has_pos_embed = true;
|
||||
has_hidden_size = true;
|
||||
config.hidden_size = tensor_storage.ne[0];
|
||||
config.num_patches = tensor_storage.ne[1];
|
||||
for (int64_t size = 1; size * size <= config.num_patches; size++) {
|
||||
if (size * size == config.num_patches) {
|
||||
config.pos_embed_max_size = size;
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
size_t jb = name.find("joint_blocks.");
|
||||
if (jb == std::string::npos) {
|
||||
continue;
|
||||
}
|
||||
|
||||
has_weight_config = true;
|
||||
std::string block_name = name.substr(jb);
|
||||
int64_t block_depth = atoi(block_name.substr(13, block_name.find(".", 13)).c_str());
|
||||
if (block_depth + 1 > config.depth) {
|
||||
config.depth = block_depth + 1;
|
||||
}
|
||||
if (block_name.find("attn.ln") != std::string::npos) {
|
||||
if (block_name.find(".bias") != std::string::npos) {
|
||||
config.qk_norm = "ln";
|
||||
} else {
|
||||
config.qk_norm = "rms";
|
||||
}
|
||||
}
|
||||
if (block_name.find("attn2") != std::string::npos) {
|
||||
if (block_depth > config.d_self) {
|
||||
config.d_self = block_depth;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (!has_pos_embed && config.d_self >= 0) {
|
||||
config.pos_embed_max_size *= 2;
|
||||
config.num_patches *= 4;
|
||||
}
|
||||
if (!has_hidden_size || config.hidden_size <= 0) {
|
||||
config.hidden_size = 64 * config.depth;
|
||||
}
|
||||
if (!has_context_embed || config.context_embedder_out_dim <= 0) {
|
||||
config.context_embedder_out_dim = config.hidden_size;
|
||||
}
|
||||
|
||||
if (has_weight_config) {
|
||||
LOG_DEBUG("mmdit: num_layers = %" PRId64 ", num_mmdit_x_layers = %" PRId64 ", hidden_size = %" PRId64 ", patch_size = %d, in_channels = %" PRId64 ", out_channels = %" PRId64 ", context_size = %" PRId64 ", adm_in_channels = %" PRId64 ", qk_norm = %s",
|
||||
config.depth,
|
||||
config.d_self + 1,
|
||||
config.hidden_size,
|
||||
config.patch_size,
|
||||
config.in_channels,
|
||||
config.out_channels,
|
||||
config.context_size,
|
||||
config.adm_in_channels,
|
||||
config.qk_norm.empty() ? "none" : config.qk_norm.c_str());
|
||||
}
|
||||
return config;
|
||||
}
|
||||
};
|
||||
|
||||
@ -612,28 +707,16 @@ public:
|
||||
struct MMDiT : public GGMLBlock {
|
||||
// Diffusion model with a Transformer backbone.
|
||||
protected:
|
||||
int64_t input_size = -1;
|
||||
int patch_size = 2;
|
||||
int64_t in_channels = 16;
|
||||
int64_t d_self = -1; // >=0 for MMdiT-X
|
||||
int64_t depth = 24;
|
||||
float mlp_ratio = 4.0f;
|
||||
int64_t adm_in_channels = 2048;
|
||||
int64_t out_channels = 16;
|
||||
int64_t pos_embed_max_size = 192;
|
||||
int64_t num_patchs = 36864; // 192 * 192
|
||||
int64_t context_size = 4096;
|
||||
int64_t context_embedder_out_dim = 1536;
|
||||
int64_t hidden_size;
|
||||
std::string qk_norm;
|
||||
|
||||
void init_params(ggml_context* ctx, const String2TensorStorage& tensor_storage_map = {}, std::string prefix = "") override {
|
||||
enum ggml_type wtype = GGML_TYPE_F32;
|
||||
params["pos_embed"] = ggml_new_tensor_3d(ctx, wtype, hidden_size, num_patchs, 1);
|
||||
params["pos_embed"] = ggml_new_tensor_3d(ctx, wtype, config.hidden_size, config.num_patches, 1);
|
||||
}
|
||||
|
||||
public:
|
||||
MMDiT(const String2TensorStorage& tensor_storage_map = {}) {
|
||||
MMDiTConfig config;
|
||||
|
||||
explicit MMDiT(MMDiTConfig config = {})
|
||||
: config(config) {
|
||||
// input_size is always None
|
||||
// learn_sigma is always False
|
||||
// register_length is alwalys 0
|
||||
@ -646,64 +729,30 @@ public:
|
||||
// pos_embed_offset is not used
|
||||
// context_embedder_config is always {'target': 'torch.nn.Linear', 'params': {'in_features': 4096, 'out_features': 1536}}
|
||||
|
||||
for (auto pair : tensor_storage_map) {
|
||||
std::string tensor_name = pair.first;
|
||||
if (tensor_name.find("model.diffusion_model.") == std::string::npos)
|
||||
continue;
|
||||
size_t jb = tensor_name.find("joint_blocks.");
|
||||
if (jb != std::string::npos) {
|
||||
tensor_name = tensor_name.substr(jb); // remove prefix
|
||||
int block_depth = atoi(tensor_name.substr(13, tensor_name.find(".", 13)).c_str());
|
||||
if (block_depth + 1 > depth) {
|
||||
depth = block_depth + 1;
|
||||
}
|
||||
if (tensor_name.find("attn.ln") != std::string::npos) {
|
||||
if (tensor_name.find(".bias") != std::string::npos) {
|
||||
qk_norm = "ln";
|
||||
} else {
|
||||
qk_norm = "rms";
|
||||
}
|
||||
}
|
||||
if (tensor_name.find("attn2") != std::string::npos) {
|
||||
if (block_depth > d_self) {
|
||||
d_self = block_depth;
|
||||
}
|
||||
}
|
||||
}
|
||||
blocks["x_embedder"] = std::shared_ptr<GGMLBlock>(new PatchEmbed(config.input_size,
|
||||
config.patch_size,
|
||||
config.in_channels,
|
||||
config.hidden_size,
|
||||
true));
|
||||
blocks["t_embedder"] = std::shared_ptr<GGMLBlock>(new TimestepEmbedder(config.hidden_size));
|
||||
|
||||
if (config.adm_in_channels != -1) {
|
||||
blocks["y_embedder"] = std::shared_ptr<GGMLBlock>(new VectorEmbedder(config.adm_in_channels, config.hidden_size));
|
||||
}
|
||||
|
||||
if (d_self >= 0) {
|
||||
pos_embed_max_size *= 2;
|
||||
num_patchs *= 4;
|
||||
}
|
||||
blocks["context_embedder"] = std::shared_ptr<GGMLBlock>(new Linear(config.context_size, config.context_embedder_out_dim, true, true));
|
||||
|
||||
LOG_INFO("MMDiT layers: %d (including %d MMDiT-x layers)", depth, d_self + 1);
|
||||
|
||||
int64_t default_out_channels = in_channels;
|
||||
hidden_size = 64 * depth;
|
||||
context_embedder_out_dim = 64 * depth;
|
||||
int64_t num_heads = depth;
|
||||
|
||||
blocks["x_embedder"] = std::shared_ptr<GGMLBlock>(new PatchEmbed(input_size, patch_size, in_channels, hidden_size, true));
|
||||
blocks["t_embedder"] = std::shared_ptr<GGMLBlock>(new TimestepEmbedder(hidden_size));
|
||||
|
||||
if (adm_in_channels != -1) {
|
||||
blocks["y_embedder"] = std::shared_ptr<GGMLBlock>(new VectorEmbedder(adm_in_channels, hidden_size));
|
||||
}
|
||||
|
||||
blocks["context_embedder"] = std::shared_ptr<GGMLBlock>(new Linear(4096, context_embedder_out_dim, true, true));
|
||||
|
||||
for (int i = 0; i < depth; i++) {
|
||||
blocks["joint_blocks." + std::to_string(i)] = std::shared_ptr<GGMLBlock>(new JointBlock(hidden_size,
|
||||
num_heads,
|
||||
mlp_ratio,
|
||||
qk_norm,
|
||||
for (int i = 0; i < config.depth; i++) {
|
||||
blocks["joint_blocks." + std::to_string(i)] = std::shared_ptr<GGMLBlock>(new JointBlock(config.hidden_size,
|
||||
config.depth,
|
||||
config.mlp_ratio,
|
||||
config.qk_norm,
|
||||
true,
|
||||
i == depth - 1,
|
||||
i <= d_self));
|
||||
i == config.depth - 1,
|
||||
i <= config.d_self));
|
||||
}
|
||||
|
||||
blocks["final_layer"] = std::shared_ptr<GGMLBlock>(new FinalLayer(hidden_size, patch_size, out_channels));
|
||||
blocks["final_layer"] = std::shared_ptr<GGMLBlock>(new FinalLayer(config.hidden_size, config.patch_size, config.out_channels));
|
||||
}
|
||||
|
||||
ggml_tensor*
|
||||
@ -712,22 +761,22 @@ public:
|
||||
int64_t w) {
|
||||
auto pos_embed = params["pos_embed"];
|
||||
|
||||
h = (h + 1) / patch_size;
|
||||
w = (w + 1) / patch_size;
|
||||
h = (h + 1) / config.patch_size;
|
||||
w = (w + 1) / config.patch_size;
|
||||
|
||||
GGML_ASSERT(h <= pos_embed_max_size && h > 0);
|
||||
GGML_ASSERT(w <= pos_embed_max_size && w > 0);
|
||||
GGML_ASSERT(h <= config.pos_embed_max_size && h > 0);
|
||||
GGML_ASSERT(w <= config.pos_embed_max_size && w > 0);
|
||||
|
||||
int64_t top = (pos_embed_max_size - h) / 2;
|
||||
int64_t left = (pos_embed_max_size - w) / 2;
|
||||
int64_t top = (config.pos_embed_max_size - h) / 2;
|
||||
int64_t left = (config.pos_embed_max_size - w) / 2;
|
||||
|
||||
auto spatial_pos_embed = ggml_reshape_3d(ctx, pos_embed, hidden_size, pos_embed_max_size, pos_embed_max_size);
|
||||
auto spatial_pos_embed = ggml_reshape_3d(ctx, pos_embed, config.hidden_size, config.pos_embed_max_size, config.pos_embed_max_size);
|
||||
|
||||
// spatial_pos_embed = spatial_pos_embed[:, top : top + h, left : left + w, :]
|
||||
spatial_pos_embed = ggml_view_3d(ctx,
|
||||
spatial_pos_embed,
|
||||
hidden_size,
|
||||
pos_embed_max_size,
|
||||
config.hidden_size,
|
||||
config.pos_embed_max_size,
|
||||
h,
|
||||
spatial_pos_embed->nb[1],
|
||||
spatial_pos_embed->nb[2],
|
||||
@ -735,14 +784,14 @@ public:
|
||||
spatial_pos_embed = ggml_cont(ctx, ggml_permute(ctx, spatial_pos_embed, 0, 2, 1, 3)); // [pos_embed_max_size, h, hidden_size]
|
||||
spatial_pos_embed = ggml_view_3d(ctx,
|
||||
spatial_pos_embed,
|
||||
hidden_size,
|
||||
config.hidden_size,
|
||||
h,
|
||||
w,
|
||||
spatial_pos_embed->nb[1],
|
||||
spatial_pos_embed->nb[2],
|
||||
spatial_pos_embed->nb[2] * left); // [w, h, hidden_size]
|
||||
spatial_pos_embed = ggml_cont(ctx, ggml_permute(ctx, spatial_pos_embed, 0, 2, 1, 3)); // [h, w, hidden_size]
|
||||
spatial_pos_embed = ggml_reshape_3d(ctx, spatial_pos_embed, hidden_size, h * w, 1); // [1, h*w, hidden_size]
|
||||
spatial_pos_embed->nb[2] * left); // [w, h, hidden_size]
|
||||
spatial_pos_embed = ggml_cont(ctx, ggml_permute(ctx, spatial_pos_embed, 0, 2, 1, 3)); // [h, w, hidden_size]
|
||||
spatial_pos_embed = ggml_reshape_3d(ctx, spatial_pos_embed, config.hidden_size, h * w, 1); // [1, h*w, hidden_size]
|
||||
return spatial_pos_embed;
|
||||
}
|
||||
|
||||
@ -757,7 +806,7 @@ public:
|
||||
// return: [N, N*W, patch_size * patch_size * out_channels]
|
||||
auto final_layer = std::dynamic_pointer_cast<FinalLayer>(blocks["final_layer"]);
|
||||
|
||||
for (int i = 0; i < depth; i++) {
|
||||
for (int i = 0; i < config.depth; i++) {
|
||||
// skip iteration if i is in skip_layers
|
||||
if (skip_layers.size() > 0 && std::find(skip_layers.begin(), skip_layers.end(), i) != skip_layers.end()) {
|
||||
continue;
|
||||
@ -800,7 +849,7 @@ public:
|
||||
x = ggml_add(ctx->ggml_ctx, patch_embed, pos_embed); // [N, H*W, hidden_size]
|
||||
|
||||
auto c = t_embedder->forward(ctx, t); // [N, hidden_size]
|
||||
if (y != nullptr && adm_in_channels != -1) {
|
||||
if (y != nullptr && config.adm_in_channels != -1) {
|
||||
auto y_embedder = std::dynamic_pointer_cast<VectorEmbedder>(blocks["y_embedder"]);
|
||||
|
||||
y = y_embedder->forward(ctx, y); // [N, hidden_size]
|
||||
@ -820,19 +869,22 @@ public:
|
||||
|
||||
x = forward_core_with_concat(ctx, x, c, context, skip_layers); // (N, H*W, patch_size ** 2 * out_channels)
|
||||
|
||||
x = DiT::unpatchify_and_crop(ctx->ggml_ctx, x, H, W, patch_size, patch_size, /*patch_last*/ false); // [N, C, H, W]
|
||||
x = DiT::unpatchify_and_crop(ctx->ggml_ctx, x, H, W, config.patch_size, config.patch_size, /*patch_last*/ false); // [N, C, H, W]
|
||||
|
||||
return x;
|
||||
}
|
||||
};
|
||||
struct MMDiTRunner : public DiffusionModelRunner {
|
||||
MMDiTConfig config;
|
||||
MMDiT mmdit;
|
||||
|
||||
MMDiTRunner(ggml_backend_t backend,
|
||||
ggml_backend_t params_backend,
|
||||
const String2TensorStorage& tensor_storage_map = {},
|
||||
const std::string prefix = "")
|
||||
: DiffusionModelRunner(backend, params_backend, prefix), mmdit(tensor_storage_map) {
|
||||
: DiffusionModelRunner(backend, params_backend, prefix),
|
||||
config(MMDiTConfig::detect_from_weights(tensor_storage_map, prefix)),
|
||||
mmdit(config) {
|
||||
mmdit.init(params_ctx, tensor_storage_map, prefix);
|
||||
}
|
||||
|
||||
@ -980,4 +1032,4 @@ struct MMDiTRunner : public DiffusionModelRunner {
|
||||
}
|
||||
};
|
||||
|
||||
#endif
|
||||
#endif // __SD_MODEL_DIFFUSION_MMDIT_HPP__
|
||||
@ -1,12 +1,12 @@
|
||||
#ifndef __DIFFUSION_MODEL_H__
|
||||
#define __DIFFUSION_MODEL_H__
|
||||
#ifndef __SD_MODEL_DIFFUSION_MODEL_HPP__
|
||||
#define __SD_MODEL_DIFFUSION_MODEL_HPP__
|
||||
|
||||
#include <string>
|
||||
#include <utility>
|
||||
#include <variant>
|
||||
|
||||
#include "ggml_extend.hpp"
|
||||
#include "tensor_ggml.hpp"
|
||||
#include "core/ggml_extend.hpp"
|
||||
#include "core/tensor_ggml.hpp"
|
||||
|
||||
struct UNetDiffusionExtra {
|
||||
int num_video_frames = -1;
|
||||
@ -104,4 +104,4 @@ public:
|
||||
const std::string& prefix) = 0;
|
||||
};
|
||||
|
||||
#endif
|
||||
#endif // __SD_MODEL_DIFFUSION_MODEL_HPP__
|
||||
@ -1,5 +1,5 @@
|
||||
#ifndef __SD_PID_HPP__
|
||||
#define __SD_PID_HPP__
|
||||
#ifndef __SD_MODEL_DIFFUSION_PID_HPP__
|
||||
#define __SD_MODEL_DIFFUSION_PID_HPP__
|
||||
|
||||
#include <cmath>
|
||||
#include <cstdlib>
|
||||
@ -7,16 +7,16 @@
|
||||
#include <string>
|
||||
#include <vector>
|
||||
|
||||
#include "common_dit.hpp"
|
||||
#include "ggml_extend.hpp"
|
||||
#include "mmdit.hpp"
|
||||
#include "rope.hpp"
|
||||
#include "core/ggml_extend.hpp"
|
||||
#include "model/common/rope.hpp"
|
||||
#include "model/diffusion/dit.hpp"
|
||||
#include "model/diffusion/mmdit.hpp"
|
||||
|
||||
namespace Pid {
|
||||
constexpr int PID_GRAPH_SIZE = 196608;
|
||||
constexpr float PID_PI = 3.14159265358979323846f;
|
||||
|
||||
struct PixelDiTParams {
|
||||
struct PixelDiTConfig {
|
||||
int64_t in_channels = 3;
|
||||
int64_t hidden_size = 1536;
|
||||
int64_t num_groups = 24;
|
||||
@ -38,6 +38,45 @@ namespace Pid {
|
||||
int64_t lq_latent_down_factor = 8;
|
||||
int64_t rope_ref_grid_h = 64;
|
||||
int64_t rope_ref_grid_w = 64;
|
||||
|
||||
static PixelDiTConfig detect_from_weights(const String2TensorStorage& tensor_storage_map, const std::string& prefix) {
|
||||
PixelDiTConfig config;
|
||||
for (const auto& [name, tensor_storage] : tensor_storage_map) {
|
||||
if (!starts_with(name, prefix)) {
|
||||
continue;
|
||||
}
|
||||
size_t pos = name.find("patch_blocks.");
|
||||
if (pos != std::string::npos) {
|
||||
auto items = split_string(name.substr(pos), '.');
|
||||
if (items.size() > 1) {
|
||||
int block_index = atoi(items[1].c_str());
|
||||
config.patch_depth = std::max<int64_t>(config.patch_depth, block_index + 1);
|
||||
}
|
||||
}
|
||||
pos = name.find("pixel_blocks.");
|
||||
if (pos != std::string::npos) {
|
||||
auto items = split_string(name.substr(pos), '.');
|
||||
if (items.size() > 1) {
|
||||
int block_index = atoi(items[1].c_str());
|
||||
config.pixel_depth = std::max<int64_t>(config.pixel_depth, block_index + 1);
|
||||
}
|
||||
}
|
||||
if (name.find("lq_proj.latent_proj.0.weight") != std::string::npos) {
|
||||
config.lq_latent_channels = tensor_storage.ne[2];
|
||||
config.lq_latent_down_factor = config.lq_latent_channels >= 64 ? 16 : 8;
|
||||
}
|
||||
if (name.find("patch_blocks.0.mlp_x.w1.weight") != std::string::npos) {
|
||||
config.patch_mlp_hidden_dim = tensor_storage.ne[1];
|
||||
}
|
||||
}
|
||||
LOG_DEBUG("pid: patch_depth = %" PRId64 ", pixel_depth = %" PRId64 ", patch_mlp_hidden_dim = %" PRId64 ", lq_latent_channels = %" PRId64 ", lq_latent_down_factor = %" PRId64,
|
||||
config.patch_depth,
|
||||
config.pixel_depth,
|
||||
config.patch_mlp_hidden_dim,
|
||||
config.lq_latent_channels,
|
||||
config.lq_latent_down_factor);
|
||||
return config;
|
||||
}
|
||||
};
|
||||
|
||||
inline std::vector<float> make_rope_1d(int length,
|
||||
@ -466,29 +505,29 @@ namespace Pid {
|
||||
};
|
||||
|
||||
struct LQProjection2D : public GGMLBlock {
|
||||
PixelDiTParams params_cfg;
|
||||
PixelDiTConfig config;
|
||||
|
||||
LQProjection2D(const PixelDiTParams& params_cfg)
|
||||
: params_cfg(params_cfg) {
|
||||
blocks["latent_proj.0"] = std::make_shared<Conv2d>(params_cfg.lq_latent_channels, params_cfg.lq_hidden_dim, std::pair<int, int>{3, 3}, std::pair<int, int>{1, 1}, std::pair<int, int>{1, 1});
|
||||
blocks["latent_proj.2"] = std::make_shared<Conv2d>(params_cfg.lq_hidden_dim, params_cfg.lq_hidden_dim, std::pair<int, int>{3, 3}, std::pair<int, int>{1, 1}, std::pair<int, int>{1, 1});
|
||||
for (int i = 0; i < params_cfg.lq_num_res_blocks; ++i) {
|
||||
blocks["latent_proj." + std::to_string(3 + i)] = std::make_shared<PiDResBlock>(params_cfg.lq_hidden_dim);
|
||||
LQProjection2D(const PixelDiTConfig& config)
|
||||
: config(config) {
|
||||
blocks["latent_proj.0"] = std::make_shared<Conv2d>(config.lq_latent_channels, config.lq_hidden_dim, std::pair<int, int>{3, 3}, std::pair<int, int>{1, 1}, std::pair<int, int>{1, 1});
|
||||
blocks["latent_proj.2"] = std::make_shared<Conv2d>(config.lq_hidden_dim, config.lq_hidden_dim, std::pair<int, int>{3, 3}, std::pair<int, int>{1, 1}, std::pair<int, int>{1, 1});
|
||||
for (int i = 0; i < config.lq_num_res_blocks; ++i) {
|
||||
blocks["latent_proj." + std::to_string(3 + i)] = std::make_shared<PiDResBlock>(config.lq_hidden_dim);
|
||||
}
|
||||
|
||||
int num_outputs = static_cast<int>((params_cfg.patch_depth + params_cfg.lq_interval - 1) / params_cfg.lq_interval);
|
||||
int num_outputs = static_cast<int>((config.patch_depth + config.lq_interval - 1) / config.lq_interval);
|
||||
for (int i = 0; i < num_outputs; ++i) {
|
||||
blocks["output_heads." + std::to_string(i)] = std::make_shared<Linear>(params_cfg.lq_hidden_dim, params_cfg.hidden_size, true);
|
||||
blocks["gate_modules." + std::to_string(i)] = std::make_shared<SigmaAwareGate>(params_cfg.hidden_size);
|
||||
blocks["output_heads." + std::to_string(i)] = std::make_shared<Linear>(config.lq_hidden_dim, config.hidden_size, true);
|
||||
blocks["gate_modules." + std::to_string(i)] = std::make_shared<SigmaAwareGate>(config.hidden_size);
|
||||
}
|
||||
}
|
||||
|
||||
bool is_gate_active(int block_idx) const {
|
||||
return block_idx % params_cfg.lq_interval == 0;
|
||||
return block_idx % config.lq_interval == 0;
|
||||
}
|
||||
|
||||
int get_output_index(int block_idx) const {
|
||||
return block_idx / static_cast<int>(params_cfg.lq_interval);
|
||||
return block_idx / static_cast<int>(config.lq_interval);
|
||||
}
|
||||
|
||||
ggml_tensor* gate(GGMLRunnerContext* ctx,
|
||||
@ -506,8 +545,8 @@ namespace Pid {
|
||||
int64_t target_pW) {
|
||||
auto conv0 = std::dynamic_pointer_cast<Conv2d>(blocks["latent_proj.0"]);
|
||||
auto conv2 = std::dynamic_pointer_cast<Conv2d>(blocks["latent_proj.2"]);
|
||||
float z_to_patch_ratio = static_cast<float>(params_cfg.lq_sr_scale * params_cfg.lq_latent_down_factor) /
|
||||
static_cast<float>(params_cfg.patch_size);
|
||||
float z_to_patch_ratio = static_cast<float>(config.lq_sr_scale * config.lq_latent_down_factor) /
|
||||
static_cast<float>(config.patch_size);
|
||||
GGML_ASSERT(z_to_patch_ratio >= 1.0f);
|
||||
if (lq_latent->ne[0] != target_pW || lq_latent->ne[1] != target_pH) {
|
||||
lq_latent = ggml_interpolate(ctx->ggml_ctx,
|
||||
@ -522,7 +561,7 @@ namespace Pid {
|
||||
auto feat = conv0->forward(ctx, lq_latent);
|
||||
feat = ggml_silu_inplace(ctx->ggml_ctx, feat);
|
||||
feat = conv2->forward(ctx, feat);
|
||||
for (int i = 0; i < params_cfg.lq_num_res_blocks; ++i) {
|
||||
for (int i = 0; i < config.lq_num_res_blocks; ++i) {
|
||||
auto block = std::dynamic_pointer_cast<PiDResBlock>(blocks["latent_proj." + std::to_string(3 + i)]);
|
||||
feat = block->forward(ctx, feat);
|
||||
}
|
||||
@ -533,7 +572,7 @@ namespace Pid {
|
||||
auto tokens = ggml_cont(ctx->ggml_ctx, ggml_ext_torch_permute(ctx->ggml_ctx, feat, 2, 0, 1, 3));
|
||||
tokens = ggml_reshape_3d(ctx->ggml_ctx, tokens, C, L, B);
|
||||
|
||||
int num_outputs = static_cast<int>((params_cfg.patch_depth + params_cfg.lq_interval - 1) / params_cfg.lq_interval);
|
||||
int num_outputs = static_cast<int>((config.patch_depth + config.lq_interval - 1) / config.lq_interval);
|
||||
std::vector<ggml_tensor*> outputs;
|
||||
outputs.reserve(num_outputs);
|
||||
for (int i = 0; i < num_outputs; ++i) {
|
||||
@ -545,34 +584,34 @@ namespace Pid {
|
||||
};
|
||||
|
||||
struct PixelDiT : public GGMLBlock {
|
||||
PixelDiTParams params_cfg;
|
||||
PixelDiTConfig config;
|
||||
|
||||
PixelDiT() = default;
|
||||
|
||||
PixelDiT(const PixelDiTParams& params_cfg)
|
||||
: params_cfg(params_cfg) {
|
||||
blocks["pixel_embedder"] = std::make_shared<PixelTokenEmbedder>(params_cfg.in_channels, params_cfg.pixel_hidden_size);
|
||||
blocks["s_embedder"] = std::make_shared<PatchTokenEmbedder>(params_cfg.in_channels * params_cfg.patch_size * params_cfg.patch_size, params_cfg.hidden_size, false, true);
|
||||
blocks["t_embedder"] = std::make_shared<PixelDiTTimestepEmbedder>(params_cfg.hidden_size);
|
||||
blocks["y_embedder"] = std::make_shared<PatchTokenEmbedder>(params_cfg.txt_embed_dim, params_cfg.hidden_size, true, true);
|
||||
for (int i = 0; i < params_cfg.patch_depth; ++i) {
|
||||
blocks["patch_blocks." + std::to_string(i)] = std::make_shared<MMDiTBlockT2I>(params_cfg.hidden_size, params_cfg.num_groups, params_cfg.patch_mlp_hidden_dim);
|
||||
PixelDiT(const PixelDiTConfig& config)
|
||||
: config(config) {
|
||||
blocks["pixel_embedder"] = std::make_shared<PixelTokenEmbedder>(config.in_channels, config.pixel_hidden_size);
|
||||
blocks["s_embedder"] = std::make_shared<PatchTokenEmbedder>(config.in_channels * config.patch_size * config.patch_size, config.hidden_size, false, true);
|
||||
blocks["t_embedder"] = std::make_shared<PixelDiTTimestepEmbedder>(config.hidden_size);
|
||||
blocks["y_embedder"] = std::make_shared<PatchTokenEmbedder>(config.txt_embed_dim, config.hidden_size, true, true);
|
||||
for (int i = 0; i < config.patch_depth; ++i) {
|
||||
blocks["patch_blocks." + std::to_string(i)] = std::make_shared<MMDiTBlockT2I>(config.hidden_size, config.num_groups, config.patch_mlp_hidden_dim);
|
||||
}
|
||||
for (int i = 0; i < params_cfg.pixel_depth; ++i) {
|
||||
blocks["pixel_blocks." + std::to_string(i)] = std::make_shared<PiTBlock>(params_cfg.pixel_hidden_size,
|
||||
params_cfg.hidden_size,
|
||||
params_cfg.patch_size,
|
||||
params_cfg.pixel_attn_hidden_size,
|
||||
params_cfg.pixel_num_groups);
|
||||
for (int i = 0; i < config.pixel_depth; ++i) {
|
||||
blocks["pixel_blocks." + std::to_string(i)] = std::make_shared<PiTBlock>(config.pixel_hidden_size,
|
||||
config.hidden_size,
|
||||
config.patch_size,
|
||||
config.pixel_attn_hidden_size,
|
||||
config.pixel_num_groups);
|
||||
}
|
||||
blocks["final_layer"] = std::make_shared<FinalLayer>(params_cfg.pixel_hidden_size, params_cfg.in_channels);
|
||||
blocks["lq_proj"] = std::make_shared<LQProjection2D>(params_cfg);
|
||||
blocks["final_layer"] = std::make_shared<FinalLayer>(config.pixel_hidden_size, config.in_channels);
|
||||
blocks["lq_proj"] = std::make_shared<LQProjection2D>(config);
|
||||
}
|
||||
|
||||
void init_params(ggml_context* ctx,
|
||||
const String2TensorStorage& tensor_storage_map = {},
|
||||
std::string prefix = "") override {
|
||||
params["y_pos_embedding"] = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, params_cfg.hidden_size, params_cfg.txt_max_length, 1);
|
||||
params["y_pos_embedding"] = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, config.hidden_size, config.txt_max_length, 1);
|
||||
}
|
||||
|
||||
ggml_tensor* forward(GGMLRunnerContext* ctx,
|
||||
@ -594,21 +633,21 @@ namespace Pid {
|
||||
|
||||
int64_t W_orig = x->ne[0];
|
||||
int64_t H_orig = x->ne[1];
|
||||
x = DiT::pad_to_patch_size(ctx, x, static_cast<int>(params_cfg.patch_size), static_cast<int>(params_cfg.patch_size));
|
||||
x = DiT::pad_to_patch_size(ctx, x, static_cast<int>(config.patch_size), static_cast<int>(config.patch_size));
|
||||
int64_t W = x->ne[0];
|
||||
int64_t H = x->ne[1];
|
||||
int64_t B = x->ne[3];
|
||||
int64_t Hs = H / params_cfg.patch_size;
|
||||
int64_t Ws = W / params_cfg.patch_size;
|
||||
int64_t Hs = H / config.patch_size;
|
||||
int64_t Ws = W / config.patch_size;
|
||||
int64_t L = Hs * Ws;
|
||||
int64_t P2 = params_cfg.patch_size * params_cfg.patch_size;
|
||||
int64_t P2 = config.patch_size * config.patch_size;
|
||||
|
||||
auto x_patches = DiT::patchify(ctx->ggml_ctx, x, static_cast<int>(params_cfg.patch_size), static_cast<int>(params_cfg.patch_size), true);
|
||||
auto x_patches = DiT::patchify(ctx->ggml_ctx, x, static_cast<int>(config.patch_size), static_cast<int>(config.patch_size), true);
|
||||
auto t_emb = t_embedder->forward(ctx, timesteps);
|
||||
auto condition = ggml_silu(ctx->ggml_ctx, t_emb);
|
||||
|
||||
GGML_ASSERT(context != nullptr);
|
||||
int64_t Ltxt = std::min<int64_t>(context->ne[1], params_cfg.txt_max_length);
|
||||
int64_t Ltxt = std::min<int64_t>(context->ne[1], config.txt_max_length);
|
||||
auto y = ggml_ext_slice(ctx->ggml_ctx, context, 1, 0, Ltxt);
|
||||
auto y_emb = y_embedder->forward(ctx, y);
|
||||
auto y_pos = ggml_ext_slice(ctx->ggml_ctx, params["y_pos_embedding"], 1, 0, Ltxt);
|
||||
@ -618,7 +657,7 @@ namespace Pid {
|
||||
|
||||
auto s = s_embedder->forward(ctx, x_patches);
|
||||
|
||||
for (int i = 0; i < params_cfg.patch_depth; ++i) {
|
||||
for (int i = 0; i < config.patch_depth; ++i) {
|
||||
if (lq_proj->is_gate_active(i)) {
|
||||
int out_idx = lq_proj->get_output_index(i);
|
||||
if (out_idx < static_cast<int>(lq_features.size())) {
|
||||
@ -639,22 +678,22 @@ namespace Pid {
|
||||
}
|
||||
s = ggml_silu(ctx->ggml_ctx, ggml_add(ctx->ggml_ctx, s, t_emb));
|
||||
|
||||
auto s_cond = ggml_reshape_2d(ctx->ggml_ctx, s, params_cfg.hidden_size, L * B);
|
||||
auto pixels = pixel_embedder->forward(ctx, x, params_cfg.patch_size, pixel_pos_full);
|
||||
for (int i = 0; i < params_cfg.pixel_depth; ++i) {
|
||||
auto s_cond = ggml_reshape_2d(ctx->ggml_ctx, s, config.hidden_size, L * B);
|
||||
auto pixels = pixel_embedder->forward(ctx, x, config.patch_size, pixel_pos_full);
|
||||
for (int i = 0; i < config.pixel_depth; ++i) {
|
||||
auto block = std::dynamic_pointer_cast<PiTBlock>(blocks["pixel_blocks." + std::to_string(i)]);
|
||||
pixels = block->forward(ctx, pixels, s_cond, H, W, pixel_pos_comp);
|
||||
sd::ggml_graph_cut::mark_graph_cut(pixels, "pid.pixel_blocks." + std::to_string(i), "pixels");
|
||||
}
|
||||
|
||||
pixels = final_layer->forward(ctx, pixels);
|
||||
pixels = ggml_reshape_3d(ctx->ggml_ctx, pixels, params_cfg.in_channels * P2, L, B);
|
||||
pixels = ggml_reshape_3d(ctx->ggml_ctx, pixels, config.in_channels * P2, L, B);
|
||||
auto out = DiT::unpatchify(ctx->ggml_ctx,
|
||||
pixels,
|
||||
Hs,
|
||||
Ws,
|
||||
static_cast<int>(params_cfg.patch_size),
|
||||
static_cast<int>(params_cfg.patch_size),
|
||||
static_cast<int>(config.patch_size),
|
||||
static_cast<int>(config.patch_size),
|
||||
false);
|
||||
out = ggml_ext_slice(ctx->ggml_ctx, out, 1, 0, H_orig);
|
||||
out = ggml_ext_slice(ctx->ggml_ctx, out, 0, 0, W_orig);
|
||||
@ -663,7 +702,7 @@ namespace Pid {
|
||||
};
|
||||
|
||||
struct PiDRunner : public DiffusionModelRunner {
|
||||
PixelDiTParams params_cfg;
|
||||
PixelDiTConfig config;
|
||||
PixelDiT model;
|
||||
std::vector<float> pos_img_vec;
|
||||
std::vector<float> pos_txt_vec;
|
||||
@ -674,43 +713,9 @@ namespace Pid {
|
||||
ggml_backend_t params_backend,
|
||||
const String2TensorStorage& tensor_storage_map,
|
||||
const std::string prefix = "model.diffusion_model")
|
||||
: DiffusionModelRunner(backend, params_backend, prefix) {
|
||||
for (const auto& pair : tensor_storage_map) {
|
||||
const std::string& tensor_name = pair.first;
|
||||
if (tensor_name.find(prefix) == std::string::npos) {
|
||||
continue;
|
||||
}
|
||||
size_t pos = tensor_name.find("patch_blocks.");
|
||||
if (pos != std::string::npos) {
|
||||
auto items = split_string(tensor_name.substr(pos), '.');
|
||||
if (items.size() > 1) {
|
||||
int block_index = atoi(items[1].c_str());
|
||||
params_cfg.patch_depth = std::max<int64_t>(params_cfg.patch_depth, block_index + 1);
|
||||
}
|
||||
}
|
||||
pos = tensor_name.find("pixel_blocks.");
|
||||
if (pos != std::string::npos) {
|
||||
auto items = split_string(tensor_name.substr(pos), '.');
|
||||
if (items.size() > 1) {
|
||||
int block_index = atoi(items[1].c_str());
|
||||
params_cfg.pixel_depth = std::max<int64_t>(params_cfg.pixel_depth, block_index + 1);
|
||||
}
|
||||
}
|
||||
if (tensor_name.find("lq_proj.latent_proj.0.weight") != std::string::npos) {
|
||||
params_cfg.lq_latent_channels = pair.second.ne[2];
|
||||
params_cfg.lq_latent_down_factor = params_cfg.lq_latent_channels >= 64 ? 16 : 8;
|
||||
}
|
||||
if (tensor_name.find("patch_blocks.0.mlp_x.w1.weight") != std::string::npos) {
|
||||
params_cfg.patch_mlp_hidden_dim = pair.second.ne[1];
|
||||
}
|
||||
}
|
||||
LOG_INFO("PiD params: patch_depth=%" PRId64 ", pixel_depth=%" PRId64 ", patch_mlp_hidden_dim=%" PRId64 ", lq_latent_channels=%" PRId64 ", lq_latent_down_factor=%" PRId64,
|
||||
params_cfg.patch_depth,
|
||||
params_cfg.pixel_depth,
|
||||
params_cfg.patch_mlp_hidden_dim,
|
||||
params_cfg.lq_latent_channels,
|
||||
params_cfg.lq_latent_down_factor);
|
||||
model = PixelDiT(params_cfg);
|
||||
: DiffusionModelRunner(backend, params_backend, prefix),
|
||||
config(PixelDiTConfig::detect_from_weights(tensor_storage_map, prefix)) {
|
||||
model = PixelDiT(config);
|
||||
model.init(params_ctx, tensor_storage_map, prefix);
|
||||
}
|
||||
|
||||
@ -737,60 +742,60 @@ namespace Pid {
|
||||
int64_t W = x->ne[0];
|
||||
int64_t H = x->ne[1];
|
||||
int64_t B = x->ne[3];
|
||||
int64_t Wp = align_up(static_cast<int>(W), static_cast<int>(params_cfg.patch_size));
|
||||
int64_t Hp = align_up(static_cast<int>(H), static_cast<int>(params_cfg.patch_size));
|
||||
int64_t Hs = Hp / params_cfg.patch_size;
|
||||
int64_t Ws = Wp / params_cfg.patch_size;
|
||||
int64_t Wp = align_up(static_cast<int>(W), static_cast<int>(config.patch_size));
|
||||
int64_t Hp = align_up(static_cast<int>(H), static_cast<int>(config.patch_size));
|
||||
int64_t Hs = Hp / config.patch_size;
|
||||
int64_t Ws = Wp / config.patch_size;
|
||||
|
||||
pos_img_vec = make_rope_2d(static_cast<int>(Hs),
|
||||
static_cast<int>(Ws),
|
||||
static_cast<int>(params_cfg.hidden_size / params_cfg.num_groups),
|
||||
static_cast<int>(config.hidden_size / config.num_groups),
|
||||
10000.f,
|
||||
16.f,
|
||||
static_cast<int>(params_cfg.rope_ref_grid_h),
|
||||
static_cast<int>(params_cfg.rope_ref_grid_w));
|
||||
static_cast<int>(config.rope_ref_grid_h),
|
||||
static_cast<int>(config.rope_ref_grid_w));
|
||||
auto pos_img = ggml_new_tensor_4d(compute_ctx,
|
||||
GGML_TYPE_F32,
|
||||
2,
|
||||
2,
|
||||
params_cfg.hidden_size / params_cfg.num_groups / 2,
|
||||
config.hidden_size / config.num_groups / 2,
|
||||
Hs * Ws);
|
||||
set_backend_tensor_data(pos_img, pos_img_vec.data());
|
||||
|
||||
int64_t Ltxt = std::min<int64_t>(context->ne[1], params_cfg.txt_max_length);
|
||||
int64_t Ltxt = std::min<int64_t>(context->ne[1], config.txt_max_length);
|
||||
pos_txt_vec = make_rope_1d(static_cast<int>(Ltxt),
|
||||
static_cast<int>(params_cfg.hidden_size / params_cfg.num_groups),
|
||||
params_cfg.text_rope_theta);
|
||||
static_cast<int>(config.hidden_size / config.num_groups),
|
||||
config.text_rope_theta);
|
||||
auto pos_txt = ggml_new_tensor_4d(compute_ctx,
|
||||
GGML_TYPE_F32,
|
||||
2,
|
||||
2,
|
||||
params_cfg.hidden_size / params_cfg.num_groups / 2,
|
||||
config.hidden_size / config.num_groups / 2,
|
||||
Ltxt);
|
||||
set_backend_tensor_data(pos_txt, pos_txt_vec.data());
|
||||
|
||||
pixel_pos_vec = make_pixel_abs_pos(static_cast<int>(Hp),
|
||||
static_cast<int>(Wp),
|
||||
static_cast<int>(params_cfg.pixel_hidden_size));
|
||||
static_cast<int>(config.pixel_hidden_size));
|
||||
auto pixel_pos = ggml_new_tensor_3d(compute_ctx,
|
||||
GGML_TYPE_F32,
|
||||
params_cfg.pixel_hidden_size,
|
||||
config.pixel_hidden_size,
|
||||
Wp * Hp,
|
||||
1);
|
||||
set_backend_tensor_data(pixel_pos, pixel_pos_vec.data());
|
||||
|
||||
pixel_pos_comp_vec = make_rope_2d(static_cast<int>(Hs),
|
||||
static_cast<int>(Ws),
|
||||
static_cast<int>(params_cfg.pixel_attn_hidden_size / params_cfg.pixel_num_groups),
|
||||
static_cast<int>(config.pixel_attn_hidden_size / config.pixel_num_groups),
|
||||
10000.f,
|
||||
16.f,
|
||||
static_cast<int>(params_cfg.rope_ref_grid_h),
|
||||
static_cast<int>(params_cfg.rope_ref_grid_w));
|
||||
static_cast<int>(config.rope_ref_grid_h),
|
||||
static_cast<int>(config.rope_ref_grid_w));
|
||||
auto pixel_pos_comp = ggml_new_tensor_4d(compute_ctx,
|
||||
GGML_TYPE_F32,
|
||||
2,
|
||||
2,
|
||||
params_cfg.pixel_attn_hidden_size / params_cfg.pixel_num_groups / 2,
|
||||
config.pixel_attn_hidden_size / config.pixel_num_groups / 2,
|
||||
Hs * Ws);
|
||||
set_backend_tensor_data(pixel_pos_comp, pixel_pos_comp_vec.data());
|
||||
|
||||
@ -839,4 +844,4 @@ namespace Pid {
|
||||
};
|
||||
} // namespace Pid
|
||||
|
||||
#endif // __SD_PID_HPP__
|
||||
#endif // __SD_MODEL_DIFFUSION_PID_HPP__
|
||||
@ -1,15 +1,57 @@
|
||||
#ifndef __QWEN_IMAGE_HPP__
|
||||
#define __QWEN_IMAGE_HPP__
|
||||
#ifndef __SD_MODEL_DIFFUSION_QWEN_IMAGE_HPP__
|
||||
#define __SD_MODEL_DIFFUSION_QWEN_IMAGE_HPP__
|
||||
|
||||
#include <memory>
|
||||
|
||||
#include "common_block.hpp"
|
||||
#include "diffusion_model.hpp"
|
||||
#include "flux.hpp"
|
||||
#include "model/common/block.hpp"
|
||||
#include "model/diffusion/flux.hpp"
|
||||
#include "model/diffusion/model.hpp"
|
||||
|
||||
namespace Qwen {
|
||||
constexpr int QWEN_IMAGE_GRAPH_SIZE = 20480;
|
||||
|
||||
struct QwenImageConfig {
|
||||
int patch_size = 2;
|
||||
int64_t in_channels = 64;
|
||||
int64_t out_channels = 16;
|
||||
int num_layers = 60;
|
||||
int64_t attention_head_dim = 128;
|
||||
int64_t num_attention_heads = 24;
|
||||
int64_t joint_attention_dim = 3584;
|
||||
int theta = 10000;
|
||||
std::vector<int> axes_dim = {16, 56, 56};
|
||||
int axes_dim_sum = 128;
|
||||
bool zero_cond_t = false;
|
||||
|
||||
static QwenImageConfig detect_from_weights(const String2TensorStorage& tensor_storage_map, const std::string& prefix) {
|
||||
QwenImageConfig config;
|
||||
config.num_layers = 0;
|
||||
for (const auto& [name, _] : tensor_storage_map) {
|
||||
if (!starts_with(name, prefix)) {
|
||||
continue;
|
||||
}
|
||||
if (name.find("__index_timestep_zero__") != std::string::npos) {
|
||||
config.zero_cond_t = true;
|
||||
}
|
||||
size_t pos = name.find("transformer_blocks.");
|
||||
if (pos == std::string::npos) {
|
||||
continue;
|
||||
}
|
||||
auto items = split_string(name.substr(pos), '.');
|
||||
if (items.size() > 1) {
|
||||
int block_index = atoi(items[1].c_str());
|
||||
if (block_index + 1 > config.num_layers) {
|
||||
config.num_layers = block_index + 1;
|
||||
}
|
||||
}
|
||||
}
|
||||
LOG_DEBUG("qwen_image: num_layers = %d, zero_cond_t = %s",
|
||||
config.num_layers,
|
||||
config.zero_cond_t ? "true" : "false");
|
||||
return config;
|
||||
}
|
||||
};
|
||||
|
||||
struct TimestepEmbedding : public GGMLBlock {
|
||||
public:
|
||||
TimestepEmbedding(int64_t in_channels,
|
||||
@ -350,46 +392,32 @@ namespace Qwen {
|
||||
}
|
||||
};
|
||||
|
||||
struct QwenImageParams {
|
||||
int patch_size = 2;
|
||||
int64_t in_channels = 64;
|
||||
int64_t out_channels = 16;
|
||||
int num_layers = 60;
|
||||
int64_t attention_head_dim = 128;
|
||||
int64_t num_attention_heads = 24;
|
||||
int64_t joint_attention_dim = 3584;
|
||||
int theta = 10000;
|
||||
std::vector<int> axes_dim = {16, 56, 56};
|
||||
int axes_dim_sum = 128;
|
||||
bool zero_cond_t = false;
|
||||
};
|
||||
|
||||
class QwenImageModel : public GGMLBlock {
|
||||
protected:
|
||||
QwenImageParams params;
|
||||
QwenImageConfig config;
|
||||
|
||||
public:
|
||||
QwenImageModel() {}
|
||||
QwenImageModel(QwenImageParams params)
|
||||
: params(params) {
|
||||
int64_t inner_dim = params.num_attention_heads * params.attention_head_dim;
|
||||
QwenImageModel(QwenImageConfig config)
|
||||
: config(config) {
|
||||
int64_t inner_dim = config.num_attention_heads * config.attention_head_dim;
|
||||
blocks["time_text_embed"] = std::shared_ptr<GGMLBlock>(new QwenTimestepProjEmbeddings(inner_dim));
|
||||
blocks["txt_norm"] = std::shared_ptr<GGMLBlock>(new RMSNorm(params.joint_attention_dim, 1e-6f));
|
||||
blocks["img_in"] = std::shared_ptr<GGMLBlock>(new Linear(params.in_channels, inner_dim));
|
||||
blocks["txt_in"] = std::shared_ptr<GGMLBlock>(new Linear(params.joint_attention_dim, inner_dim));
|
||||
blocks["txt_norm"] = std::shared_ptr<GGMLBlock>(new RMSNorm(config.joint_attention_dim, 1e-6f));
|
||||
blocks["img_in"] = std::shared_ptr<GGMLBlock>(new Linear(config.in_channels, inner_dim));
|
||||
blocks["txt_in"] = std::shared_ptr<GGMLBlock>(new Linear(config.joint_attention_dim, inner_dim));
|
||||
|
||||
// blocks
|
||||
for (int i = 0; i < params.num_layers; i++) {
|
||||
for (int i = 0; i < config.num_layers; i++) {
|
||||
auto block = std::shared_ptr<GGMLBlock>(new QwenImageTransformerBlock(inner_dim,
|
||||
params.num_attention_heads,
|
||||
params.attention_head_dim,
|
||||
config.num_attention_heads,
|
||||
config.attention_head_dim,
|
||||
1e-6f,
|
||||
params.zero_cond_t));
|
||||
config.zero_cond_t));
|
||||
blocks["transformer_blocks." + std::to_string(i)] = block;
|
||||
}
|
||||
|
||||
blocks["norm_out"] = std::shared_ptr<GGMLBlock>(new AdaLayerNormContinuous(inner_dim, inner_dim, false, 1e-6f));
|
||||
blocks["proj_out"] = std::shared_ptr<GGMLBlock>(new Linear(inner_dim, params.patch_size * params.patch_size * params.out_channels));
|
||||
blocks["proj_out"] = std::shared_ptr<GGMLBlock>(new Linear(inner_dim, config.patch_size * config.patch_size * config.out_channels));
|
||||
}
|
||||
|
||||
ggml_tensor* forward_orig(GGMLRunnerContext* ctx,
|
||||
@ -406,7 +434,7 @@ namespace Qwen {
|
||||
auto proj_out = std::dynamic_pointer_cast<Linear>(blocks["proj_out"]);
|
||||
|
||||
auto t_emb = time_text_embed->forward(ctx, timestep);
|
||||
if (params.zero_cond_t) {
|
||||
if (config.zero_cond_t) {
|
||||
auto t_emb_0 = time_text_embed->forward(ctx, ggml_ext_zeros_like(ctx->ggml_ctx, timestep));
|
||||
t_emb = ggml_concat(ctx->ggml_ctx, t_emb, t_emb_0, 1);
|
||||
}
|
||||
@ -417,7 +445,7 @@ namespace Qwen {
|
||||
sd::ggml_graph_cut::mark_graph_cut(txt, "qwen_image.prelude", "txt");
|
||||
// sd::ggml_graph_cut::mark_graph_cut(t_emb, "qwen_image.prelude", "t_emb");
|
||||
|
||||
for (int i = 0; i < params.num_layers; i++) {
|
||||
for (int i = 0; i < config.num_layers; i++) {
|
||||
auto block = std::dynamic_pointer_cast<QwenImageTransformerBlock>(blocks["transformer_blocks." + std::to_string(i)]);
|
||||
|
||||
auto result = block->forward(ctx, img, txt, t_emb, pe, modulate_index);
|
||||
@ -427,7 +455,7 @@ namespace Qwen {
|
||||
sd::ggml_graph_cut::mark_graph_cut(txt, "qwen_image.transformer_blocks." + std::to_string(i), "txt");
|
||||
}
|
||||
|
||||
if (params.zero_cond_t) {
|
||||
if (config.zero_cond_t) {
|
||||
t_emb = ggml_ext_chunk(ctx->ggml_ctx, t_emb, 2, 1)[0];
|
||||
}
|
||||
|
||||
@ -456,12 +484,12 @@ namespace Qwen {
|
||||
int64_t C = x->ne[2];
|
||||
int64_t N = x->ne[3];
|
||||
|
||||
auto img = DiT::pad_and_patchify(ctx, x, params.patch_size, params.patch_size);
|
||||
auto img = DiT::pad_and_patchify(ctx, x, config.patch_size, config.patch_size);
|
||||
int64_t img_tokens = img->ne[1];
|
||||
|
||||
if (ref_latents.size() > 0) {
|
||||
for (ggml_tensor* ref : ref_latents) {
|
||||
ref = DiT::pad_and_patchify(ctx, ref, params.patch_size, params.patch_size);
|
||||
ref = DiT::pad_and_patchify(ctx, ref, config.patch_size, config.patch_size);
|
||||
img = ggml_concat(ctx->ggml_ctx, img, ref, 1);
|
||||
}
|
||||
}
|
||||
@ -474,7 +502,7 @@ namespace Qwen {
|
||||
out = ggml_cont(ctx->ggml_ctx, ggml_permute(ctx->ggml_ctx, out, 0, 2, 1, 3)); // [N, h*w, C * patch_size * patch_size]
|
||||
}
|
||||
|
||||
out = DiT::unpatchify_and_crop(ctx->ggml_ctx, out, H, W, params.patch_size, params.patch_size); // [N, C, H, W]
|
||||
out = DiT::unpatchify_and_crop(ctx->ggml_ctx, out, H, W, config.patch_size, config.patch_size); // [N, C, H, W]
|
||||
|
||||
return out;
|
||||
}
|
||||
@ -482,7 +510,7 @@ namespace Qwen {
|
||||
|
||||
struct QwenImageRunner : public DiffusionModelRunner {
|
||||
public:
|
||||
QwenImageParams qwen_image_params;
|
||||
QwenImageConfig config;
|
||||
QwenImageModel qwen_image;
|
||||
std::vector<float> pe_vec;
|
||||
std::vector<float> modulate_index_vec;
|
||||
@ -494,34 +522,10 @@ namespace Qwen {
|
||||
const std::string prefix = "",
|
||||
SDVersion version = VERSION_QWEN_IMAGE,
|
||||
bool zero_cond_t = false)
|
||||
: DiffusionModelRunner(backend, params_backend, prefix) {
|
||||
qwen_image_params.num_layers = 0;
|
||||
qwen_image_params.zero_cond_t = zero_cond_t;
|
||||
for (auto pair : tensor_storage_map) {
|
||||
std::string tensor_name = pair.first;
|
||||
if (tensor_name.find(prefix) == std::string::npos)
|
||||
continue;
|
||||
if (tensor_name.find("__index_timestep_zero__") != std::string::npos) {
|
||||
qwen_image_params.zero_cond_t = true;
|
||||
}
|
||||
size_t pos = tensor_name.find("transformer_blocks.");
|
||||
if (pos != std::string::npos) {
|
||||
tensor_name = tensor_name.substr(pos); // remove prefix
|
||||
auto items = split_string(tensor_name, '.');
|
||||
if (items.size() > 1) {
|
||||
int block_index = atoi(items[1].c_str());
|
||||
if (block_index + 1 > qwen_image_params.num_layers) {
|
||||
qwen_image_params.num_layers = block_index + 1;
|
||||
}
|
||||
}
|
||||
continue;
|
||||
}
|
||||
}
|
||||
LOG_INFO("qwen_image_params.num_layers: %ld", qwen_image_params.num_layers);
|
||||
if (qwen_image_params.zero_cond_t) {
|
||||
LOG_INFO("use zero_cond_t");
|
||||
}
|
||||
qwen_image = QwenImageModel(qwen_image_params);
|
||||
: DiffusionModelRunner(backend, params_backend, prefix),
|
||||
config(QwenImageConfig::detect_from_weights(tensor_storage_map, prefix)) {
|
||||
config.zero_cond_t = config.zero_cond_t || zero_cond_t;
|
||||
qwen_image = QwenImageModel(config);
|
||||
qwen_image.init(params_ctx, tensor_storage_map, prefix);
|
||||
}
|
||||
|
||||
@ -552,36 +556,36 @@ namespace Qwen {
|
||||
|
||||
pe_vec = Rope::gen_qwen_image_pe(static_cast<int>(x->ne[1]),
|
||||
static_cast<int>(x->ne[0]),
|
||||
qwen_image_params.patch_size,
|
||||
config.patch_size,
|
||||
static_cast<int>(x->ne[3]),
|
||||
static_cast<int>(context->ne[1]),
|
||||
ref_latents,
|
||||
increase_ref_index,
|
||||
qwen_image_params.theta,
|
||||
config.theta,
|
||||
circular_y_enabled,
|
||||
circular_x_enabled,
|
||||
qwen_image_params.axes_dim);
|
||||
int pos_len = static_cast<int>(pe_vec.size() / qwen_image_params.axes_dim_sum / 2);
|
||||
config.axes_dim);
|
||||
int pos_len = static_cast<int>(pe_vec.size() / config.axes_dim_sum / 2);
|
||||
// LOG_DEBUG("pos_len %d", pos_len);
|
||||
auto pe = ggml_new_tensor_4d(compute_ctx, GGML_TYPE_F32, 2, 2, qwen_image_params.axes_dim_sum / 2, pos_len);
|
||||
auto pe = ggml_new_tensor_4d(compute_ctx, GGML_TYPE_F32, 2, 2, config.axes_dim_sum / 2, pos_len);
|
||||
// pe->data = pe_vec.data();
|
||||
// print_ggml_tensor(pe, true, "pe");
|
||||
// pe->data = nullptr;
|
||||
set_backend_tensor_data(pe, pe_vec.data());
|
||||
|
||||
ggml_tensor* modulate_index = nullptr;
|
||||
if (qwen_image_params.zero_cond_t) {
|
||||
if (config.zero_cond_t) {
|
||||
modulate_index_vec.clear();
|
||||
|
||||
int64_t h_len = ((x->ne[1] + (qwen_image_params.patch_size / 2)) / qwen_image_params.patch_size);
|
||||
int64_t w_len = ((x->ne[0] + (qwen_image_params.patch_size / 2)) / qwen_image_params.patch_size);
|
||||
int64_t h_len = ((x->ne[1] + (config.patch_size / 2)) / config.patch_size);
|
||||
int64_t w_len = ((x->ne[0] + (config.patch_size / 2)) / config.patch_size);
|
||||
int64_t num_img_tokens = h_len * w_len;
|
||||
|
||||
modulate_index_vec.insert(modulate_index_vec.end(), num_img_tokens, 0.f);
|
||||
int64_t num_ref_img_tokens = 0;
|
||||
for (ggml_tensor* ref : ref_latents) {
|
||||
int64_t h_len = ((ref->ne[1] + (qwen_image_params.patch_size / 2)) / qwen_image_params.patch_size);
|
||||
int64_t w_len = ((ref->ne[0] + (qwen_image_params.patch_size / 2)) / qwen_image_params.patch_size);
|
||||
int64_t h_len = ((ref->ne[1] + (config.patch_size / 2)) / config.patch_size);
|
||||
int64_t w_len = ((ref->ne[0] + (config.patch_size / 2)) / config.patch_size);
|
||||
|
||||
num_ref_img_tokens += h_len * w_len;
|
||||
}
|
||||
@ -727,4 +731,4 @@ namespace Qwen {
|
||||
|
||||
} // namespace name
|
||||
|
||||
#endif // __QWEN_IMAGE_HPP__
|
||||
#endif // __SD_MODEL_DIFFUSION_QWEN_IMAGE_HPP__
|
||||
@ -1,14 +1,136 @@
|
||||
#ifndef __UNET_HPP__
|
||||
#define __UNET_HPP__
|
||||
#ifndef __SD_MODEL_DIFFUSION_UNET_HPP__
|
||||
#define __SD_MODEL_DIFFUSION_UNET_HPP__
|
||||
|
||||
#include <algorithm>
|
||||
#include <vector>
|
||||
|
||||
#include "common_block.hpp"
|
||||
#include "diffusion_model.hpp"
|
||||
#include "model.h"
|
||||
#include "model/common/block.hpp"
|
||||
#include "model/diffusion/model.hpp"
|
||||
|
||||
/*==================================================== UnetModel =====================================================*/
|
||||
|
||||
#define UNET_GRAPH_SIZE 102400
|
||||
|
||||
struct UNetConfig {
|
||||
SDVersion version = VERSION_SD1;
|
||||
// network hparams
|
||||
int in_channels = 4;
|
||||
int out_channels = 4;
|
||||
int num_res_blocks = 2;
|
||||
std::vector<int> attention_resolutions = {4, 2, 1};
|
||||
std::vector<int> channel_mult = {1, 2, 4, 4};
|
||||
std::vector<int> transformer_depth = {1, 1, 1, 1};
|
||||
int time_embed_dim = 1280; // model_channels*4
|
||||
int num_heads = 8;
|
||||
int num_head_channels = -1; // channels // num_heads
|
||||
int context_dim = 768; // 1024 for VERSION_SD2, 2048 for VERSION_SDXL
|
||||
bool use_linear_projection = false;
|
||||
bool tiny_unet = false;
|
||||
int model_channels = 320;
|
||||
int adm_in_channels = 2816; // only for VERSION_SDXL/SVD
|
||||
|
||||
static UNetConfig detect_from_weights(const String2TensorStorage& tensor_storage_map,
|
||||
const std::string& prefix,
|
||||
SDVersion version = VERSION_SD1) {
|
||||
UNetConfig config;
|
||||
config.version = version;
|
||||
|
||||
if (sd_version_is_sd2(version)) {
|
||||
config.context_dim = 1024;
|
||||
config.num_head_channels = 64;
|
||||
config.num_heads = -1;
|
||||
config.use_linear_projection = true;
|
||||
} else if (sd_version_is_sdxl(version)) {
|
||||
config.context_dim = 2048;
|
||||
config.attention_resolutions = {4, 2};
|
||||
config.channel_mult = {1, 2, 4};
|
||||
config.transformer_depth = {1, 2, 10};
|
||||
config.num_head_channels = 64;
|
||||
config.num_heads = -1;
|
||||
config.use_linear_projection = true;
|
||||
if (version == VERSION_SDXL_VEGA) {
|
||||
config.transformer_depth = {1, 1, 2};
|
||||
}
|
||||
} else if (version == VERSION_SVD) {
|
||||
config.in_channels = 8;
|
||||
config.out_channels = 4;
|
||||
config.context_dim = 1024;
|
||||
config.adm_in_channels = 768;
|
||||
config.num_head_channels = 64;
|
||||
config.num_heads = -1;
|
||||
config.use_linear_projection = true;
|
||||
}
|
||||
if (sd_version_is_inpaint(version)) {
|
||||
config.in_channels = 9;
|
||||
} else if (sd_version_is_unet_edit(version)) {
|
||||
config.in_channels = 8;
|
||||
}
|
||||
if (version == VERSION_SD1_TINY_UNET || version == VERSION_SD2_TINY_UNET || version == VERSION_SDXS_512_DS || version == VERSION_SDXS_09) {
|
||||
config.num_res_blocks = 1;
|
||||
config.channel_mult = {1, 2, 4};
|
||||
config.tiny_unet = true;
|
||||
if (version == VERSION_SDXS_512_DS) {
|
||||
config.attention_resolutions = {4, 2}; // here just like SDXL
|
||||
}
|
||||
}
|
||||
|
||||
auto find_weight = [&](const std::string& suffix) -> const TensorStorage* {
|
||||
std::string name = prefix.empty() ? suffix : prefix + "." + suffix;
|
||||
auto it = tensor_storage_map.find(name);
|
||||
if (it == tensor_storage_map.end()) {
|
||||
return nullptr;
|
||||
}
|
||||
return &it->second;
|
||||
};
|
||||
|
||||
if (const TensorStorage* input = find_weight("input_blocks.0.0.weight")) {
|
||||
if (input->n_dims == 4) {
|
||||
config.in_channels = static_cast<int>(input->ne[2]);
|
||||
config.model_channels = static_cast<int>(input->ne[3]);
|
||||
config.time_embed_dim = config.model_channels * 4;
|
||||
}
|
||||
}
|
||||
if (const TensorStorage* time_embed = find_weight("time_embed.0.weight")) {
|
||||
if (time_embed->n_dims == 2) {
|
||||
config.model_channels = static_cast<int>(time_embed->ne[0]);
|
||||
config.time_embed_dim = static_cast<int>(time_embed->ne[1]);
|
||||
}
|
||||
}
|
||||
if (const TensorStorage* label_emb = find_weight("label_emb.0.0.weight")) {
|
||||
if (label_emb->n_dims == 2) {
|
||||
config.adm_in_channels = static_cast<int>(label_emb->ne[0]);
|
||||
config.time_embed_dim = static_cast<int>(label_emb->ne[1]);
|
||||
}
|
||||
}
|
||||
if (const TensorStorage* out = find_weight("out.2.weight")) {
|
||||
if (out->n_dims == 4) {
|
||||
config.out_channels = static_cast<int>(out->ne[3]);
|
||||
}
|
||||
}
|
||||
for (const auto& [name, tensor_storage] : tensor_storage_map) {
|
||||
if (!starts_with(name, prefix)) {
|
||||
continue;
|
||||
}
|
||||
if (name.find("attn2.to_k.weight") != std::string::npos && tensor_storage.n_dims == 2) {
|
||||
config.context_dim = static_cast<int>(tensor_storage.ne[0]);
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
LOG_DEBUG("unet: in_channels = %d, out_channels = %d, model_channels = %d, time_embed_dim = %d, context_dim = %d, adm_in_channels = %d, num_res_blocks = %d, tiny_unet = %s",
|
||||
config.in_channels,
|
||||
config.out_channels,
|
||||
config.model_channels,
|
||||
config.time_embed_dim,
|
||||
config.context_dim,
|
||||
config.adm_in_channels,
|
||||
config.num_res_blocks,
|
||||
config.tiny_unet ? "true" : "false");
|
||||
return config;
|
||||
}
|
||||
};
|
||||
|
||||
class SpatialVideoTransformer : public SpatialTransformer {
|
||||
protected:
|
||||
int64_t time_depth;
|
||||
@ -166,66 +288,26 @@ public:
|
||||
|
||||
// ldm.modules.diffusionmodules.openaimodel.UNetModel
|
||||
class UnetModelBlock : public GGMLBlock {
|
||||
protected:
|
||||
SDVersion version = VERSION_SD1;
|
||||
// network hparams
|
||||
int in_channels = 4;
|
||||
int out_channels = 4;
|
||||
int num_res_blocks = 2;
|
||||
std::vector<int> attention_resolutions = {4, 2, 1};
|
||||
std::vector<int> channel_mult = {1, 2, 4, 4};
|
||||
std::vector<int> transformer_depth = {1, 1, 1, 1};
|
||||
int time_embed_dim = 1280; // model_channels*4
|
||||
int num_heads = 8;
|
||||
int num_head_channels = -1; // channels // num_heads
|
||||
int context_dim = 768; // 1024 for VERSION_SD2, 2048 for VERSION_SDXL
|
||||
bool use_linear_projection = false;
|
||||
bool tiny_unet = false;
|
||||
|
||||
public:
|
||||
int model_channels = 320;
|
||||
int adm_in_channels = 2816; // only for VERSION_SDXL/SVD
|
||||
UNetConfig config;
|
||||
|
||||
UnetModelBlock(SDVersion version = VERSION_SD1, const String2TensorStorage& tensor_storage_map = {})
|
||||
: version(version) {
|
||||
if (sd_version_is_sd2(version)) {
|
||||
context_dim = 1024;
|
||||
num_head_channels = 64;
|
||||
num_heads = -1;
|
||||
use_linear_projection = true;
|
||||
} else if (sd_version_is_sdxl(version)) {
|
||||
context_dim = 2048;
|
||||
attention_resolutions = {4, 2};
|
||||
channel_mult = {1, 2, 4};
|
||||
transformer_depth = {1, 2, 10};
|
||||
num_head_channels = 64;
|
||||
num_heads = -1;
|
||||
use_linear_projection = true;
|
||||
if (version == VERSION_SDXL_VEGA) {
|
||||
transformer_depth = {1, 1, 2};
|
||||
}
|
||||
} else if (version == VERSION_SVD) {
|
||||
in_channels = 8;
|
||||
out_channels = 4;
|
||||
context_dim = 1024;
|
||||
adm_in_channels = 768;
|
||||
num_head_channels = 64;
|
||||
num_heads = -1;
|
||||
use_linear_projection = true;
|
||||
}
|
||||
if (sd_version_is_inpaint(version)) {
|
||||
in_channels = 9;
|
||||
} else if (sd_version_is_unet_edit(version)) {
|
||||
in_channels = 8;
|
||||
}
|
||||
if (version == VERSION_SD1_TINY_UNET || version == VERSION_SD2_TINY_UNET || version == VERSION_SDXS_512_DS || version == VERSION_SDXS_09) {
|
||||
num_res_blocks = 1;
|
||||
channel_mult = {1, 2, 4};
|
||||
tiny_unet = true;
|
||||
if (version == VERSION_SDXS_512_DS) {
|
||||
attention_resolutions = {4, 2}; // here just like SDXL
|
||||
}
|
||||
}
|
||||
explicit UnetModelBlock(UNetConfig config = {})
|
||||
: config(config) {
|
||||
const SDVersion version = this->config.version;
|
||||
const int in_channels = this->config.in_channels;
|
||||
const int out_channels = this->config.out_channels;
|
||||
const int num_res_blocks = this->config.num_res_blocks;
|
||||
const auto& attention_resolutions = this->config.attention_resolutions;
|
||||
const auto& channel_mult = this->config.channel_mult;
|
||||
const auto& transformer_depth = this->config.transformer_depth;
|
||||
const int time_embed_dim = this->config.time_embed_dim;
|
||||
const int num_heads = this->config.num_heads;
|
||||
const int num_head_channels = this->config.num_head_channels;
|
||||
const int context_dim = this->config.context_dim;
|
||||
const bool use_linear_projection = this->config.use_linear_projection;
|
||||
const bool tiny_unet = this->config.tiny_unet;
|
||||
const int model_channels = this->config.model_channels;
|
||||
const int adm_in_channels = this->config.adm_in_channels;
|
||||
|
||||
// dims is always 2
|
||||
// use_temporal_attention is always True for SVD
|
||||
@ -398,7 +480,7 @@ public:
|
||||
ggml_tensor* x,
|
||||
ggml_tensor* emb,
|
||||
int num_video_frames) {
|
||||
if (version == VERSION_SVD) {
|
||||
if (config.version == VERSION_SVD) {
|
||||
auto block = std::dynamic_pointer_cast<VideoResBlock>(blocks[name]);
|
||||
|
||||
return block->forward(ctx, x, emb, num_video_frames);
|
||||
@ -414,7 +496,7 @@ public:
|
||||
ggml_tensor* x,
|
||||
ggml_tensor* context,
|
||||
int timesteps) {
|
||||
if (version == VERSION_SVD) {
|
||||
if (config.version == VERSION_SVD) {
|
||||
auto block = std::dynamic_pointer_cast<SpatialVideoTransformer>(blocks[name]);
|
||||
|
||||
return block->forward(ctx, x, context, timesteps);
|
||||
@ -440,6 +522,13 @@ public:
|
||||
// c_concat: [N, in_channels, h, w] or [1, in_channels, h, w]
|
||||
// y: [N, adm_in_channels] or [1, adm_in_channels]
|
||||
// return: [N, out_channels, h, w]
|
||||
const SDVersion version = config.version;
|
||||
const int model_channels = config.model_channels;
|
||||
const int num_res_blocks = config.num_res_blocks;
|
||||
const auto& attention_resolutions = config.attention_resolutions;
|
||||
const auto& channel_mult = config.channel_mult;
|
||||
const bool tiny_unet = config.tiny_unet;
|
||||
|
||||
if (context != nullptr) {
|
||||
if (context->ne[2] != x->ne[3]) {
|
||||
context = ggml_repeat(ctx->ggml_ctx, context, ggml_new_tensor_3d(ctx->ggml_ctx, GGML_TYPE_F32, context->ne[0], context->ne[1], x->ne[3]));
|
||||
@ -601,6 +690,7 @@ public:
|
||||
};
|
||||
|
||||
struct UNetModelRunner : public DiffusionModelRunner {
|
||||
UNetConfig config;
|
||||
UnetModelBlock unet;
|
||||
|
||||
UNetModelRunner(ggml_backend_t backend,
|
||||
@ -608,7 +698,9 @@ struct UNetModelRunner : public DiffusionModelRunner {
|
||||
const String2TensorStorage& tensor_storage_map,
|
||||
const std::string prefix,
|
||||
SDVersion version = VERSION_SD1)
|
||||
: DiffusionModelRunner(backend, params_backend, prefix), unet(version, tensor_storage_map) {
|
||||
: DiffusionModelRunner(backend, params_backend, prefix),
|
||||
config(UNetConfig::detect_from_weights(tensor_storage_map, prefix, version)),
|
||||
unet(config) {
|
||||
unet.init(params_ctx, tensor_storage_map, prefix);
|
||||
}
|
||||
|
||||
@ -752,4 +844,4 @@ struct UNetModelRunner : public DiffusionModelRunner {
|
||||
}
|
||||
};
|
||||
|
||||
#endif // __UNET_HPP__
|
||||
#endif // __SD_MODEL_DIFFUSION_UNET_HPP__
|
||||
1061
src/model/diffusion/wan.hpp
Normal file
1061
src/model/diffusion/wan.hpp
Normal file
File diff suppressed because it is too large
Load Diff
@ -1,14 +1,14 @@
|
||||
#ifndef __Z_IMAGE_HPP__
|
||||
#define __Z_IMAGE_HPP__
|
||||
#ifndef __SD_MODEL_DIFFUSION_Z_IMAGE_HPP__
|
||||
#define __SD_MODEL_DIFFUSION_Z_IMAGE_HPP__
|
||||
|
||||
#include <algorithm>
|
||||
|
||||
#include "diffusion_model.hpp"
|
||||
#include "flux.hpp"
|
||||
#include "ggml_extend.hpp"
|
||||
#include "mmdit.hpp"
|
||||
#include "core/ggml_extend.hpp"
|
||||
#include "model/diffusion/flux.hpp"
|
||||
#include "model/diffusion/mmdit.hpp"
|
||||
#include "model/diffusion/model.hpp"
|
||||
|
||||
// Ref: https://github.com/Alpha-VLLM/Lumina-Image-2.0/blob/main/models/model.py
|
||||
// Ref: https://github.com/Alpha-VLLM/Lumina-Image-2.0/blob/main/model/model.py
|
||||
// Ref: https://github.com/huggingface/diffusers/pull/12703
|
||||
|
||||
#ifndef MIN
|
||||
@ -20,6 +20,104 @@ namespace ZImage {
|
||||
constexpr int ADALN_EMBED_DIM = 256;
|
||||
constexpr int SEQ_MULTI_OF = 32;
|
||||
|
||||
struct ZImageConfig {
|
||||
int patch_size = 2;
|
||||
int64_t hidden_size = 3840;
|
||||
int64_t in_channels = 16;
|
||||
int64_t out_channels = 16;
|
||||
int64_t num_layers = 30;
|
||||
int64_t num_refiner_layers = 2;
|
||||
int64_t head_dim = 128;
|
||||
int64_t num_heads = 30;
|
||||
int64_t num_kv_heads = 30;
|
||||
int64_t multiple_of = 256;
|
||||
float ffn_dim_multiplier = 8.0f / 3.0f;
|
||||
float norm_eps = 1e-5f;
|
||||
bool qk_norm = true;
|
||||
int64_t cap_feat_dim = 2560;
|
||||
int theta = 256;
|
||||
std::vector<int> axes_dim = {32, 48, 48};
|
||||
int64_t axes_dim_sum = 128;
|
||||
|
||||
static ZImageConfig detect_from_weights(const String2TensorStorage& tensor_storage_map, const std::string& prefix) {
|
||||
ZImageConfig config;
|
||||
int64_t detected_layers = 0;
|
||||
int64_t detected_refiner_layers = 0;
|
||||
int64_t detected_context_refiner = 0;
|
||||
int64_t detected_head_dim = 0;
|
||||
int64_t detected_qkv_dim = 0;
|
||||
|
||||
for (const auto& [name, tensor_storage] : tensor_storage_map) {
|
||||
if (!starts_with(name, prefix)) {
|
||||
continue;
|
||||
}
|
||||
if (ends_with(name, "x_embedder.weight") && tensor_storage.n_dims == 2) {
|
||||
int64_t patch_area = config.patch_size * config.patch_size;
|
||||
config.in_channels = tensor_storage.ne[0] / patch_area;
|
||||
config.hidden_size = tensor_storage.ne[1];
|
||||
} else if (ends_with(name, "cap_embedder.1.weight") && tensor_storage.n_dims == 2) {
|
||||
config.cap_feat_dim = tensor_storage.ne[0];
|
||||
config.hidden_size = tensor_storage.ne[1];
|
||||
} else if (ends_with(name, "layers.0.attention.q_norm.weight") && tensor_storage.n_dims == 1) {
|
||||
detected_head_dim = tensor_storage.ne[0];
|
||||
} else if (ends_with(name, "layers.0.attention.qkv.weight") && tensor_storage.n_dims == 2) {
|
||||
detected_qkv_dim = tensor_storage.ne[1];
|
||||
} else if (ends_with(name, "final_layer.linear.weight") && tensor_storage.n_dims == 2) {
|
||||
int64_t patch_area = config.patch_size * config.patch_size;
|
||||
config.out_channels = tensor_storage.ne[1] / patch_area;
|
||||
}
|
||||
|
||||
size_t pos = name.find("layers.");
|
||||
if (pos != std::string::npos) {
|
||||
auto items = split_string(name.substr(pos), '.');
|
||||
if (items.size() > 1) {
|
||||
int block_index = atoi(items[1].c_str());
|
||||
detected_layers = std::max<int64_t>(detected_layers, block_index + 1);
|
||||
}
|
||||
}
|
||||
pos = name.find("noise_refiner.");
|
||||
if (pos != std::string::npos) {
|
||||
auto items = split_string(name.substr(pos), '.');
|
||||
if (items.size() > 1) {
|
||||
int block_index = atoi(items[1].c_str());
|
||||
detected_refiner_layers = std::max<int64_t>(detected_refiner_layers, block_index + 1);
|
||||
}
|
||||
}
|
||||
pos = name.find("context_refiner.");
|
||||
if (pos != std::string::npos) {
|
||||
auto items = split_string(name.substr(pos), '.');
|
||||
if (items.size() > 1) {
|
||||
int block_index = atoi(items[1].c_str());
|
||||
detected_context_refiner = std::max<int64_t>(detected_context_refiner, block_index + 1);
|
||||
}
|
||||
}
|
||||
}
|
||||
if (detected_layers > 0) {
|
||||
config.num_layers = detected_layers;
|
||||
}
|
||||
if (detected_refiner_layers > 0 || detected_context_refiner > 0) {
|
||||
config.num_refiner_layers = std::max(detected_refiner_layers, detected_context_refiner);
|
||||
}
|
||||
if (detected_head_dim > 0) {
|
||||
config.head_dim = detected_head_dim;
|
||||
config.num_heads = config.hidden_size / config.head_dim;
|
||||
if (detected_qkv_dim > 0) {
|
||||
int64_t qkv_heads = detected_qkv_dim / config.head_dim;
|
||||
config.num_kv_heads = std::max<int64_t>(1, (qkv_heads - config.num_heads) / 2);
|
||||
}
|
||||
}
|
||||
LOG_DEBUG("z_image: num_layers = %" PRId64 ", num_refiner_layers = %" PRId64 ", hidden_size = %" PRId64 ", num_heads = %" PRId64 ", num_kv_heads = %" PRId64 ", in_channels = %" PRId64 ", out_channels = %" PRId64,
|
||||
config.num_layers,
|
||||
config.num_refiner_layers,
|
||||
config.hidden_size,
|
||||
config.num_heads,
|
||||
config.num_kv_heads,
|
||||
config.in_channels,
|
||||
config.out_channels);
|
||||
return config;
|
||||
}
|
||||
};
|
||||
|
||||
struct JointAttention : public GGMLBlock {
|
||||
protected:
|
||||
int64_t head_dim;
|
||||
@ -263,90 +361,70 @@ namespace ZImage {
|
||||
}
|
||||
};
|
||||
|
||||
struct ZImageParams {
|
||||
int patch_size = 2;
|
||||
int64_t hidden_size = 3840;
|
||||
int64_t in_channels = 16;
|
||||
int64_t out_channels = 16;
|
||||
int64_t num_layers = 30;
|
||||
int64_t num_refiner_layers = 2;
|
||||
int64_t head_dim = 128;
|
||||
int64_t num_heads = 30;
|
||||
int64_t num_kv_heads = 30;
|
||||
int64_t multiple_of = 256;
|
||||
float ffn_dim_multiplier = 8.0f / 3.0f;
|
||||
float norm_eps = 1e-5f;
|
||||
bool qk_norm = true;
|
||||
int64_t cap_feat_dim = 2560;
|
||||
int theta = 256;
|
||||
std::vector<int> axes_dim = {32, 48, 48};
|
||||
int64_t axes_dim_sum = 128;
|
||||
};
|
||||
|
||||
class ZImageModel : public GGMLBlock {
|
||||
protected:
|
||||
ZImageParams z_image_params;
|
||||
ZImageConfig config;
|
||||
|
||||
void init_params(ggml_context* ctx, const String2TensorStorage& tensor_storage_map = {}, const std::string prefix = "") override {
|
||||
params["cap_pad_token"] = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, z_image_params.hidden_size);
|
||||
params["x_pad_token"] = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, z_image_params.hidden_size);
|
||||
params["cap_pad_token"] = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, config.hidden_size);
|
||||
params["x_pad_token"] = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, config.hidden_size);
|
||||
}
|
||||
|
||||
public:
|
||||
ZImageModel() = default;
|
||||
ZImageModel(ZImageParams z_image_params)
|
||||
: z_image_params(z_image_params) {
|
||||
blocks["x_embedder"] = std::make_shared<Linear>(z_image_params.patch_size * z_image_params.patch_size * z_image_params.in_channels, z_image_params.hidden_size);
|
||||
blocks["t_embedder"] = std::make_shared<TimestepEmbedder>(MIN(z_image_params.hidden_size, 1024), 256, 256);
|
||||
blocks["cap_embedder.0"] = std::make_shared<RMSNorm>(z_image_params.cap_feat_dim, z_image_params.norm_eps);
|
||||
blocks["cap_embedder.1"] = std::make_shared<Linear>(z_image_params.cap_feat_dim, z_image_params.hidden_size);
|
||||
ZImageModel(ZImageConfig config)
|
||||
: config(config) {
|
||||
blocks["x_embedder"] = std::make_shared<Linear>(config.patch_size * config.patch_size * config.in_channels, config.hidden_size);
|
||||
blocks["t_embedder"] = std::make_shared<TimestepEmbedder>(MIN(config.hidden_size, 1024), 256, 256);
|
||||
blocks["cap_embedder.0"] = std::make_shared<RMSNorm>(config.cap_feat_dim, config.norm_eps);
|
||||
blocks["cap_embedder.1"] = std::make_shared<Linear>(config.cap_feat_dim, config.hidden_size);
|
||||
|
||||
for (int i = 0; i < z_image_params.num_refiner_layers; i++) {
|
||||
for (int i = 0; i < config.num_refiner_layers; i++) {
|
||||
auto block = std::make_shared<JointTransformerBlock>(i,
|
||||
z_image_params.hidden_size,
|
||||
z_image_params.head_dim,
|
||||
z_image_params.num_heads,
|
||||
z_image_params.num_kv_heads,
|
||||
z_image_params.multiple_of,
|
||||
z_image_params.ffn_dim_multiplier,
|
||||
z_image_params.norm_eps,
|
||||
z_image_params.qk_norm,
|
||||
config.hidden_size,
|
||||
config.head_dim,
|
||||
config.num_heads,
|
||||
config.num_kv_heads,
|
||||
config.multiple_of,
|
||||
config.ffn_dim_multiplier,
|
||||
config.norm_eps,
|
||||
config.qk_norm,
|
||||
true);
|
||||
|
||||
blocks["noise_refiner." + std::to_string(i)] = block;
|
||||
}
|
||||
|
||||
for (int i = 0; i < z_image_params.num_refiner_layers; i++) {
|
||||
for (int i = 0; i < config.num_refiner_layers; i++) {
|
||||
auto block = std::make_shared<JointTransformerBlock>(i,
|
||||
z_image_params.hidden_size,
|
||||
z_image_params.head_dim,
|
||||
z_image_params.num_heads,
|
||||
z_image_params.num_kv_heads,
|
||||
z_image_params.multiple_of,
|
||||
z_image_params.ffn_dim_multiplier,
|
||||
z_image_params.norm_eps,
|
||||
z_image_params.qk_norm,
|
||||
config.hidden_size,
|
||||
config.head_dim,
|
||||
config.num_heads,
|
||||
config.num_kv_heads,
|
||||
config.multiple_of,
|
||||
config.ffn_dim_multiplier,
|
||||
config.norm_eps,
|
||||
config.qk_norm,
|
||||
false);
|
||||
|
||||
blocks["context_refiner." + std::to_string(i)] = block;
|
||||
}
|
||||
|
||||
for (int i = 0; i < z_image_params.num_layers; i++) {
|
||||
for (int i = 0; i < config.num_layers; i++) {
|
||||
auto block = std::make_shared<JointTransformerBlock>(i,
|
||||
z_image_params.hidden_size,
|
||||
z_image_params.head_dim,
|
||||
z_image_params.num_heads,
|
||||
z_image_params.num_kv_heads,
|
||||
z_image_params.multiple_of,
|
||||
z_image_params.ffn_dim_multiplier,
|
||||
z_image_params.norm_eps,
|
||||
z_image_params.qk_norm,
|
||||
config.hidden_size,
|
||||
config.head_dim,
|
||||
config.num_heads,
|
||||
config.num_kv_heads,
|
||||
config.multiple_of,
|
||||
config.ffn_dim_multiplier,
|
||||
config.norm_eps,
|
||||
config.qk_norm,
|
||||
true);
|
||||
|
||||
blocks["layers." + std::to_string(i)] = block;
|
||||
}
|
||||
|
||||
blocks["final_layer"] = std::make_shared<FinalLayer>(z_image_params.hidden_size, z_image_params.patch_size, z_image_params.out_channels);
|
||||
blocks["final_layer"] = std::make_shared<FinalLayer>(config.hidden_size, config.patch_size, config.out_channels);
|
||||
}
|
||||
|
||||
ggml_tensor* forward_core(GGMLRunnerContext* ctx,
|
||||
@ -393,14 +471,14 @@ namespace ZImage {
|
||||
auto txt_pe = ggml_ext_slice(ctx->ggml_ctx, pe, 3, 0, txt->ne[1]);
|
||||
auto img_pe = ggml_ext_slice(ctx->ggml_ctx, pe, 3, txt->ne[1], pe->ne[3]);
|
||||
|
||||
for (int i = 0; i < z_image_params.num_refiner_layers; i++) {
|
||||
for (int i = 0; i < config.num_refiner_layers; i++) {
|
||||
auto block = std::dynamic_pointer_cast<JointTransformerBlock>(blocks["context_refiner." + std::to_string(i)]);
|
||||
|
||||
txt = block->forward(ctx, txt, txt_pe, nullptr, nullptr);
|
||||
sd::ggml_graph_cut::mark_graph_cut(txt, "z_image.context_refiner." + std::to_string(i), "txt");
|
||||
}
|
||||
|
||||
for (int i = 0; i < z_image_params.num_refiner_layers; i++) {
|
||||
for (int i = 0; i < config.num_refiner_layers; i++) {
|
||||
auto block = std::dynamic_pointer_cast<JointTransformerBlock>(blocks["noise_refiner." + std::to_string(i)]);
|
||||
|
||||
img = block->forward(ctx, img, img_pe, nullptr, t_emb);
|
||||
@ -410,7 +488,7 @@ namespace ZImage {
|
||||
auto txt_img = ggml_concat(ctx->ggml_ctx, txt, img, 1); // [N, n_txt_token + n_txt_pad_token + n_img_token + n_img_pad_token, hidden_size]
|
||||
sd::ggml_graph_cut::mark_graph_cut(txt_img, "z_image.prelude", "txt_img");
|
||||
|
||||
for (int i = 0; i < z_image_params.num_layers; i++) {
|
||||
for (int i = 0; i < config.num_layers; i++) {
|
||||
auto block = std::dynamic_pointer_cast<JointTransformerBlock>(blocks["layers." + std::to_string(i)]);
|
||||
|
||||
txt_img = block->forward(ctx, txt_img, pe, nullptr, t_emb);
|
||||
@ -442,7 +520,7 @@ namespace ZImage {
|
||||
int64_t C = x->ne[2];
|
||||
int64_t N = x->ne[3];
|
||||
|
||||
int patch_size = z_image_params.patch_size;
|
||||
int patch_size = config.patch_size;
|
||||
|
||||
auto img = DiT::pad_and_patchify(ctx, x, patch_size, patch_size, false);
|
||||
uint64_t n_img_token = img->ne[1];
|
||||
@ -467,7 +545,7 @@ namespace ZImage {
|
||||
|
||||
struct ZImageRunner : public DiffusionModelRunner {
|
||||
public:
|
||||
ZImageParams z_image_params;
|
||||
ZImageConfig config;
|
||||
ZImageModel z_image;
|
||||
std::vector<float> pe_vec;
|
||||
std::vector<float> timestep_vec;
|
||||
@ -478,8 +556,9 @@ namespace ZImage {
|
||||
const String2TensorStorage& tensor_storage_map = {},
|
||||
const std::string prefix = "",
|
||||
SDVersion version = VERSION_Z_IMAGE)
|
||||
: DiffusionModelRunner(backend, params_backend, prefix) {
|
||||
z_image = ZImageModel(z_image_params);
|
||||
: DiffusionModelRunner(backend, params_backend, prefix),
|
||||
config(ZImageConfig::detect_from_weights(tensor_storage_map, prefix)) {
|
||||
z_image = ZImageModel(config);
|
||||
z_image.init(params_ctx, tensor_storage_map, prefix);
|
||||
}
|
||||
|
||||
@ -510,19 +589,19 @@ namespace ZImage {
|
||||
|
||||
pe_vec = Rope::gen_z_image_pe(static_cast<int>(x->ne[1]),
|
||||
static_cast<int>(x->ne[0]),
|
||||
z_image_params.patch_size,
|
||||
config.patch_size,
|
||||
static_cast<int>(x->ne[3]),
|
||||
static_cast<int>(context->ne[1]),
|
||||
SEQ_MULTI_OF,
|
||||
ref_latents,
|
||||
increase_ref_index,
|
||||
z_image_params.theta,
|
||||
config.theta,
|
||||
circular_y_enabled,
|
||||
circular_x_enabled,
|
||||
z_image_params.axes_dim);
|
||||
int pos_len = static_cast<int>(pe_vec.size() / z_image_params.axes_dim_sum / 2);
|
||||
config.axes_dim);
|
||||
int pos_len = static_cast<int>(pe_vec.size() / config.axes_dim_sum / 2);
|
||||
// LOG_DEBUG("pos_len %d", pos_len);
|
||||
auto pe = ggml_new_tensor_4d(compute_ctx, GGML_TYPE_F32, 2, 2, z_image_params.axes_dim_sum / 2, pos_len);
|
||||
auto pe = ggml_new_tensor_4d(compute_ctx, GGML_TYPE_F32, 2, 2, config.axes_dim_sum / 2, pos_len);
|
||||
// pe->data = pe_vec.data();
|
||||
// print_ggml_tensor(pe, true, "pe");
|
||||
// pe->data = nullptr;
|
||||
@ -660,4 +739,4 @@ namespace ZImage {
|
||||
|
||||
} // namespace ZImage
|
||||
|
||||
#endif // __Z_IMAGE_HPP__
|
||||
#endif // __SD_MODEL_DIFFUSION_Z_IMAGE_HPP__
|
||||
@ -1,13 +1,13 @@
|
||||
#ifndef __CLIP_HPP__
|
||||
#define __CLIP_HPP__
|
||||
#ifndef __SD_MODEL_TE_CLIP_HPP__
|
||||
#define __SD_MODEL_TE_CLIP_HPP__
|
||||
|
||||
#include "ggml_extend.hpp"
|
||||
#include "core/ggml_extend.hpp"
|
||||
#include "model.h"
|
||||
#include "tokenizers/clip_tokenizer.h"
|
||||
|
||||
/*================================================ FrozenCLIPEmbedder ================================================*/
|
||||
|
||||
// Ref: https://github.com/huggingface/transformers/blob/main/src/transformers/models/clip/modeling_clip.py
|
||||
// Ref: https://github.com/huggingface/transformers/blob/main/src/transformers/model/clip/modeling_clip.py
|
||||
|
||||
struct CLIPMLP : public GGMLBlock {
|
||||
protected:
|
||||
@ -579,4 +579,4 @@ struct CLIPTextModelRunner : public GGMLRunner {
|
||||
}
|
||||
};
|
||||
|
||||
#endif // __CLIP_HPP__
|
||||
#endif // __SD_MODEL_TE_CLIP_HPP__
|
||||
@ -1,5 +1,5 @@
|
||||
#ifndef __LLM_HPP__
|
||||
#define __LLM_HPP__
|
||||
#ifndef __SD_MODEL_TE_LLM_HPP__
|
||||
#define __SD_MODEL_TE_LLM_HPP__
|
||||
|
||||
#include <algorithm>
|
||||
#include <array>
|
||||
@ -18,9 +18,9 @@
|
||||
#include <utility>
|
||||
#include <vector>
|
||||
|
||||
#include "ggml_extend.hpp"
|
||||
#include "core/ggml_extend.hpp"
|
||||
#include "json.hpp"
|
||||
#include "rope.hpp"
|
||||
#include "model/common/rope.hpp"
|
||||
#include "tokenizers/bpe_tokenizer.h"
|
||||
#include "tokenizers/gemma_tokenizer.h"
|
||||
#include "tokenizers/gpt_oss_tokenizer.h"
|
||||
@ -63,7 +63,7 @@ namespace LLM {
|
||||
QWEN3_VL,
|
||||
};
|
||||
|
||||
struct LLMVisionParams {
|
||||
struct LLMVisionConfig {
|
||||
LLMVisionArch arch = LLMVisionArch::QWEN2_5_VL;
|
||||
int num_layers = 32;
|
||||
int64_t hidden_size = 1280;
|
||||
@ -79,7 +79,7 @@ namespace LLM {
|
||||
std::set<int> fullatt_block_indexes = {7, 15, 23, 31};
|
||||
};
|
||||
|
||||
struct LLMParams {
|
||||
struct LLMConfig {
|
||||
LLMArch arch = LLMArch::QWEN2_5_VL;
|
||||
int64_t num_layers = 28;
|
||||
int64_t hidden_size = 3584;
|
||||
@ -101,7 +101,129 @@ namespace LLM {
|
||||
std::vector<int> sliding_attention;
|
||||
int64_t num_experts = 0;
|
||||
int64_t num_experts_per_tok = 0;
|
||||
LLMVisionParams vision;
|
||||
LLMVisionConfig vision;
|
||||
bool have_vision_weight = false;
|
||||
bool llama_cpp_style = false;
|
||||
|
||||
static LLMConfig detect_from_weights(const String2TensorStorage& tensor_storage_map,
|
||||
const std::string& prefix,
|
||||
LLMArch arch) {
|
||||
LLMConfig config;
|
||||
config.arch = arch;
|
||||
if (arch == LLMArch::MISTRAL_SMALL_3_2 || arch == LLMArch::MINISTRAL_3_3B) {
|
||||
config.head_dim = 128;
|
||||
config.num_heads = 32;
|
||||
config.num_kv_heads = 8;
|
||||
config.qkv_bias = false;
|
||||
config.rms_norm_eps = 1e-5f;
|
||||
} else if (arch == LLMArch::QWEN3 || arch == LLMArch::QWEN3_VL) {
|
||||
config.head_dim = 128;
|
||||
config.num_heads = 32;
|
||||
config.num_kv_heads = 8;
|
||||
config.qkv_bias = false;
|
||||
config.qk_norm = true;
|
||||
config.rms_norm_eps = 1e-6f;
|
||||
if (arch == LLMArch::QWEN3_VL) {
|
||||
config.max_position_embeddings = 262144;
|
||||
config.rope_thetas = {5000000.f};
|
||||
config.vision.arch = LLMVisionArch::QWEN3_VL;
|
||||
}
|
||||
} else if (arch == LLMArch::GEMMA3_12B) {
|
||||
config.head_dim = 256;
|
||||
config.num_heads = 16;
|
||||
config.num_kv_heads = 8;
|
||||
config.qkv_bias = false;
|
||||
config.qk_norm = true;
|
||||
config.rms_norm_eps = 1e-6f;
|
||||
config.rms_norm_add = false;
|
||||
config.normalize_input = true;
|
||||
config.max_position_embeddings = 131072;
|
||||
config.mlp_activation = MLPActivation::GELU_TANH;
|
||||
config.rope_thetas = {1000000.f, 10000.f};
|
||||
config.rope_scales = {8.f, 1.f};
|
||||
config.sliding_attention = {1024, 1024, 1024, 1024, 1024, 0};
|
||||
} else if (arch == LLMArch::GEMMA2_2B) {
|
||||
config.head_dim = 256;
|
||||
config.num_heads = 8;
|
||||
config.num_kv_heads = 4;
|
||||
config.qkv_bias = false;
|
||||
config.qk_norm = false;
|
||||
config.rms_norm_eps = 1e-6f;
|
||||
config.rms_norm_add = true;
|
||||
config.normalize_input = true;
|
||||
config.max_position_embeddings = 8192;
|
||||
config.mlp_activation = MLPActivation::GELU_TANH;
|
||||
config.hidden_size = 2304;
|
||||
config.intermediate_size = 9216;
|
||||
config.num_layers = 26;
|
||||
config.vocab_size = 256000;
|
||||
} else if (arch == LLMArch::GPT_OSS_20B) {
|
||||
config.head_dim = 64;
|
||||
config.num_heads = 64;
|
||||
config.num_kv_heads = 8;
|
||||
config.qkv_bias = true;
|
||||
config.attention_out_bias = true;
|
||||
config.qk_norm = false;
|
||||
config.rms_norm_eps = 1e-5f;
|
||||
config.hidden_size = 2880;
|
||||
config.intermediate_size = 2880;
|
||||
config.num_layers = 24;
|
||||
config.vocab_size = 201088;
|
||||
config.max_position_embeddings = 131072;
|
||||
config.rope_thetas = {150000.f};
|
||||
config.rope_scales = {32.f};
|
||||
config.sliding_attention = {128, 0};
|
||||
config.num_experts = 32;
|
||||
config.num_experts_per_tok = 4;
|
||||
}
|
||||
|
||||
config.num_layers = 0;
|
||||
for (const auto& [name, tensor_storage] : tensor_storage_map) {
|
||||
if (!starts_with(name, prefix)) {
|
||||
continue;
|
||||
}
|
||||
size_t pos = name.find("visual.");
|
||||
if (pos != std::string::npos) {
|
||||
config.have_vision_weight = true;
|
||||
if (contains(name, "attn.q_proj")) {
|
||||
config.llama_cpp_style = true;
|
||||
}
|
||||
continue;
|
||||
}
|
||||
pos = name.find("layers.");
|
||||
if (pos != std::string::npos) {
|
||||
auto items = split_string(name.substr(pos), '.');
|
||||
if (items.size() > 1) {
|
||||
int block_index = atoi(items[1].c_str());
|
||||
if (block_index + 1 > config.num_layers) {
|
||||
config.num_layers = block_index + 1;
|
||||
}
|
||||
}
|
||||
}
|
||||
if (contains(name, "embed_tokens.weight")) {
|
||||
config.hidden_size = tensor_storage.ne[0];
|
||||
config.vocab_size = tensor_storage.ne[1];
|
||||
}
|
||||
if (contains(name, "layers.0.mlp.gate_proj.weight")) {
|
||||
config.intermediate_size = tensor_storage.ne[1];
|
||||
}
|
||||
if (contains(name, "layers.0.mlp.experts.gate_up_proj.weight")) {
|
||||
config.intermediate_size = tensor_storage.ne[1] / 2;
|
||||
}
|
||||
if (contains(name, "layers.0.mlp.experts.gate_proj.weight")) {
|
||||
config.intermediate_size = tensor_storage.ne[1];
|
||||
}
|
||||
}
|
||||
if (arch == LLMArch::QWEN3 && config.num_layers == 28) {
|
||||
config.num_heads = 16;
|
||||
}
|
||||
LOG_DEBUG("llm: num_layers = %" PRId64 ", vocab_size = %" PRId64 ", hidden_size = %" PRId64 ", intermediate_size = %" PRId64,
|
||||
config.num_layers,
|
||||
config.vocab_size,
|
||||
config.hidden_size,
|
||||
config.intermediate_size);
|
||||
return config;
|
||||
}
|
||||
};
|
||||
|
||||
struct LLMRMSNorm : public UnaryBlock {
|
||||
@ -232,11 +354,11 @@ namespace LLM {
|
||||
}
|
||||
|
||||
public:
|
||||
GPTOSSMLP(const LLMParams& params)
|
||||
: hidden_size(params.hidden_size),
|
||||
intermediate_size(params.intermediate_size),
|
||||
num_experts(params.num_experts),
|
||||
num_experts_per_tok(params.num_experts_per_tok) {}
|
||||
GPTOSSMLP(const LLMConfig& config)
|
||||
: hidden_size(config.hidden_size),
|
||||
intermediate_size(config.intermediate_size),
|
||||
num_experts(config.num_experts),
|
||||
num_experts_per_tok(config.num_experts_per_tok) {}
|
||||
|
||||
ggml_tensor* forward(GGMLRunnerContext* ctx, ggml_tensor* x) {
|
||||
// x: [N, n_token, hidden_size]
|
||||
@ -667,7 +789,7 @@ namespace LLM {
|
||||
|
||||
public:
|
||||
VisionModel(bool llama_cpp_style,
|
||||
const LLMVisionParams& vision_params,
|
||||
const LLMVisionConfig& vision_params,
|
||||
float eps = 1e-6f)
|
||||
: arch_(vision_params.arch),
|
||||
num_layers(vision_params.num_layers),
|
||||
@ -784,23 +906,23 @@ namespace LLM {
|
||||
}
|
||||
|
||||
public:
|
||||
Attention(const LLMParams& params)
|
||||
: arch(params.arch),
|
||||
num_heads(params.num_heads),
|
||||
num_kv_heads(params.num_kv_heads),
|
||||
head_dim(params.head_dim),
|
||||
qk_norm(params.qk_norm),
|
||||
max_position_embeddings(params.max_position_embeddings),
|
||||
rope_thetas(params.rope_thetas),
|
||||
rope_scales(params.rope_scales),
|
||||
has_attention_sinks(params.arch == LLMArch::GPT_OSS_20B) {
|
||||
blocks["q_proj"] = std::make_shared<Linear>(params.hidden_size, num_heads * head_dim, params.qkv_bias);
|
||||
blocks["k_proj"] = std::make_shared<Linear>(params.hidden_size, num_kv_heads * head_dim, params.qkv_bias);
|
||||
blocks["v_proj"] = std::make_shared<Linear>(params.hidden_size, num_kv_heads * head_dim, params.qkv_bias);
|
||||
blocks["o_proj"] = std::make_shared<Linear>(num_heads * head_dim, params.hidden_size, params.attention_out_bias);
|
||||
if (params.qk_norm) {
|
||||
blocks["q_norm"] = std::make_shared<LLMRMSNorm>(head_dim, params.rms_norm_eps, params.rms_norm_add);
|
||||
blocks["k_norm"] = std::make_shared<LLMRMSNorm>(head_dim, params.rms_norm_eps, params.rms_norm_add);
|
||||
Attention(const LLMConfig& config)
|
||||
: arch(config.arch),
|
||||
num_heads(config.num_heads),
|
||||
num_kv_heads(config.num_kv_heads),
|
||||
head_dim(config.head_dim),
|
||||
qk_norm(config.qk_norm),
|
||||
max_position_embeddings(config.max_position_embeddings),
|
||||
rope_thetas(config.rope_thetas),
|
||||
rope_scales(config.rope_scales),
|
||||
has_attention_sinks(config.arch == LLMArch::GPT_OSS_20B) {
|
||||
blocks["q_proj"] = std::make_shared<Linear>(config.hidden_size, num_heads * head_dim, config.qkv_bias);
|
||||
blocks["k_proj"] = std::make_shared<Linear>(config.hidden_size, num_kv_heads * head_dim, config.qkv_bias);
|
||||
blocks["v_proj"] = std::make_shared<Linear>(config.hidden_size, num_kv_heads * head_dim, config.qkv_bias);
|
||||
blocks["o_proj"] = std::make_shared<Linear>(num_heads * head_dim, config.hidden_size, config.attention_out_bias);
|
||||
if (config.qk_norm) {
|
||||
blocks["q_norm"] = std::make_shared<LLMRMSNorm>(head_dim, config.rms_norm_eps, config.rms_norm_add);
|
||||
blocks["k_norm"] = std::make_shared<LLMRMSNorm>(head_dim, config.rms_norm_eps, config.rms_norm_add);
|
||||
}
|
||||
}
|
||||
|
||||
@ -982,42 +1104,42 @@ namespace LLM {
|
||||
std::string post_ffw_norm_name;
|
||||
|
||||
public:
|
||||
TransformerBlock(const LLMParams& params, int layer_index)
|
||||
: arch(params.arch),
|
||||
TransformerBlock(const LLMConfig& config, int layer_index)
|
||||
: arch(config.arch),
|
||||
sliding_attention(0) {
|
||||
if (params.arch == LLMArch::GEMMA3_12B) {
|
||||
if (config.arch == LLMArch::GEMMA3_12B) {
|
||||
post_attention_norm_name = "post_attention_norm"; // attn_post_norm
|
||||
pre_ffw_norm_name = "post_attention_layernorm"; // ffn_norm
|
||||
post_ffw_norm_name = "post_ffw_norm"; // ffn_post_norm
|
||||
} else if (params.arch == LLMArch::GEMMA2_2B) {
|
||||
} else if (config.arch == LLMArch::GEMMA2_2B) {
|
||||
post_attention_norm_name = "post_attention_layernorm"; // ffn_norm
|
||||
pre_ffw_norm_name = "pre_feedforward_layernorm";
|
||||
post_ffw_norm_name = "post_feedforward_layernorm";
|
||||
} else if (params.arch == LLMArch::GPT_OSS_20B) {
|
||||
} else if (config.arch == LLMArch::GPT_OSS_20B) {
|
||||
pre_ffw_norm_name = "post_attention_norm"; // attn_post_norm
|
||||
} else {
|
||||
pre_ffw_norm_name = "post_attention_layernorm"; // ffn_norm
|
||||
}
|
||||
|
||||
blocks["self_attn"] = std::make_shared<Attention>(params);
|
||||
if (params.arch == LLMArch::GPT_OSS_20B) {
|
||||
blocks["mlp"] = std::make_shared<GPTOSSMLP>(params);
|
||||
blocks["self_attn"] = std::make_shared<Attention>(config);
|
||||
if (config.arch == LLMArch::GPT_OSS_20B) {
|
||||
blocks["mlp"] = std::make_shared<GPTOSSMLP>(config);
|
||||
} else {
|
||||
blocks["mlp"] = std::make_shared<MLP>(params.hidden_size,
|
||||
params.intermediate_size,
|
||||
blocks["mlp"] = std::make_shared<MLP>(config.hidden_size,
|
||||
config.intermediate_size,
|
||||
false,
|
||||
params.mlp_activation);
|
||||
config.mlp_activation);
|
||||
}
|
||||
blocks["input_layernorm"] = std::make_shared<LLMRMSNorm>(params.hidden_size, params.rms_norm_eps, params.rms_norm_add);
|
||||
blocks[pre_ffw_norm_name] = std::make_shared<LLMRMSNorm>(params.hidden_size, params.rms_norm_eps, params.rms_norm_add);
|
||||
blocks["input_layernorm"] = std::make_shared<LLMRMSNorm>(config.hidden_size, config.rms_norm_eps, config.rms_norm_add);
|
||||
blocks[pre_ffw_norm_name] = std::make_shared<LLMRMSNorm>(config.hidden_size, config.rms_norm_eps, config.rms_norm_add);
|
||||
if (!post_attention_norm_name.empty()) {
|
||||
blocks[post_attention_norm_name] = std::make_shared<LLMRMSNorm>(params.hidden_size, params.rms_norm_eps, params.rms_norm_add);
|
||||
blocks[post_attention_norm_name] = std::make_shared<LLMRMSNorm>(config.hidden_size, config.rms_norm_eps, config.rms_norm_add);
|
||||
}
|
||||
if (!post_ffw_norm_name.empty()) {
|
||||
blocks[post_ffw_norm_name] = std::make_shared<LLMRMSNorm>(params.hidden_size, params.rms_norm_eps, params.rms_norm_add);
|
||||
blocks[post_ffw_norm_name] = std::make_shared<LLMRMSNorm>(config.hidden_size, config.rms_norm_eps, config.rms_norm_add);
|
||||
}
|
||||
if (!params.sliding_attention.empty()) {
|
||||
sliding_attention = params.sliding_attention[layer_index % params.sliding_attention.size()];
|
||||
if (!config.sliding_attention.empty()) {
|
||||
sliding_attention = config.sliding_attention[layer_index % config.sliding_attention.size()];
|
||||
}
|
||||
}
|
||||
|
||||
@ -1074,16 +1196,16 @@ namespace LLM {
|
||||
struct TextModel : public GGMLBlock {
|
||||
protected:
|
||||
int64_t num_layers;
|
||||
LLMParams params;
|
||||
LLMConfig config;
|
||||
|
||||
public:
|
||||
TextModel(const LLMParams& params)
|
||||
: num_layers(params.num_layers), params(params) {
|
||||
blocks["embed_tokens"] = std::shared_ptr<GGMLBlock>(new Embedding(params.vocab_size, params.hidden_size));
|
||||
TextModel(const LLMConfig& config)
|
||||
: num_layers(config.num_layers), config(config) {
|
||||
blocks["embed_tokens"] = std::shared_ptr<GGMLBlock>(new Embedding(config.vocab_size, config.hidden_size));
|
||||
for (int i = 0; i < num_layers; i++) {
|
||||
blocks["layers." + std::to_string(i)] = std::shared_ptr<GGMLBlock>(new TransformerBlock(params, i));
|
||||
blocks["layers." + std::to_string(i)] = std::shared_ptr<GGMLBlock>(new TransformerBlock(config, i));
|
||||
}
|
||||
blocks["norm"] = std::shared_ptr<GGMLBlock>(new LLMRMSNorm(params.hidden_size, params.rms_norm_eps, params.rms_norm_add));
|
||||
blocks["norm"] = std::shared_ptr<GGMLBlock>(new LLMRMSNorm(config.hidden_size, config.rms_norm_eps, config.rms_norm_add));
|
||||
}
|
||||
|
||||
ggml_tensor* embed(GGMLRunnerContext* ctx,
|
||||
@ -1103,8 +1225,8 @@ namespace LLM {
|
||||
auto norm = std::dynamic_pointer_cast<LLMRMSNorm>(blocks["norm"]);
|
||||
std::vector<ggml_tensor*> intermediate_outputs;
|
||||
|
||||
if (params.normalize_input) {
|
||||
x = ggml_ext_scale(ctx->ggml_ctx, x, std::sqrt(static_cast<float>(params.hidden_size)), true);
|
||||
if (config.normalize_input) {
|
||||
x = ggml_ext_scale(ctx->ggml_ctx, x, std::sqrt(static_cast<float>(config.hidden_size)), true);
|
||||
}
|
||||
if (return_all_hidden_states) {
|
||||
intermediate_outputs.push_back(x);
|
||||
@ -1174,15 +1296,15 @@ namespace LLM {
|
||||
|
||||
struct LLM : public GGMLBlock {
|
||||
bool enable_vision;
|
||||
LLMParams params;
|
||||
LLMConfig config;
|
||||
|
||||
public:
|
||||
LLM() = default;
|
||||
LLM(LLMParams params, bool enable_vision = false, bool llama_cpp_style = false)
|
||||
: enable_vision(enable_vision), params(params) {
|
||||
blocks["model"] = std::shared_ptr<GGMLBlock>(new TextModel(params));
|
||||
LLM(LLMConfig config, bool enable_vision = false, bool llama_cpp_style = false)
|
||||
: enable_vision(enable_vision), config(config) {
|
||||
blocks["model"] = std::shared_ptr<GGMLBlock>(new TextModel(config));
|
||||
if (enable_vision) {
|
||||
blocks["visual"] = std::shared_ptr<GGMLBlock>(new VisionModel(llama_cpp_style, params.vision));
|
||||
blocks["visual"] = std::shared_ptr<GGMLBlock>(new VisionModel(llama_cpp_style, config.vision));
|
||||
}
|
||||
}
|
||||
|
||||
@ -1226,7 +1348,7 @@ namespace LLM {
|
||||
};
|
||||
|
||||
struct LLMRunner : public GGMLRunner {
|
||||
LLMParams params;
|
||||
LLMConfig config;
|
||||
bool enable_vision;
|
||||
LLM model;
|
||||
|
||||
@ -1242,7 +1364,7 @@ namespace LLM {
|
||||
|
||||
static ggml_tensor* process_image_common(ggml_context* ctx,
|
||||
ggml_tensor* image,
|
||||
const LLMVisionParams& vision_params) {
|
||||
const LLMVisionConfig& vision_params) {
|
||||
// image: [C, H, W]
|
||||
// return: [grid_t*(H/mh/ph)*(W/mw/pw)*mh*mw, C*pt*ph*pw], grid_t == 1
|
||||
int64_t C = image->ne[2];
|
||||
@ -1337,7 +1459,7 @@ namespace LLM {
|
||||
ggml_context* compute_ctx,
|
||||
GGMLRunnerContext* runner_ctx,
|
||||
ggml_tensor* image,
|
||||
const LLMVisionParams& vision_params,
|
||||
const LLMVisionConfig& vision_params,
|
||||
std::shared_ptr<VisionModel> vision_model,
|
||||
std::vector<int>& window_index_vec,
|
||||
std::vector<int>& window_inverse_index_vec,
|
||||
@ -1452,141 +1574,25 @@ namespace LLM {
|
||||
const String2TensorStorage& tensor_storage_map,
|
||||
const std::string prefix,
|
||||
bool enable_vision_ = false)
|
||||
: GGMLRunner(backend, params_backend), enable_vision(enable_vision_) {
|
||||
params.arch = arch;
|
||||
if (arch == LLMArch::MISTRAL_SMALL_3_2 || arch == LLMArch::MINISTRAL_3_3B) {
|
||||
params.head_dim = 128;
|
||||
params.num_heads = 32;
|
||||
params.num_kv_heads = 8;
|
||||
params.qkv_bias = false;
|
||||
params.rms_norm_eps = 1e-5f;
|
||||
} else if (arch == LLMArch::QWEN3 || arch == LLMArch::QWEN3_VL) {
|
||||
params.head_dim = 128;
|
||||
params.num_heads = 32;
|
||||
params.num_kv_heads = 8;
|
||||
params.qkv_bias = false;
|
||||
params.qk_norm = true;
|
||||
params.rms_norm_eps = 1e-6f;
|
||||
if (arch == LLMArch::QWEN3_VL) {
|
||||
params.max_position_embeddings = 262144;
|
||||
params.rope_thetas = {5000000.f};
|
||||
params.vision.arch = LLMVisionArch::QWEN3_VL;
|
||||
}
|
||||
} else if (arch == LLMArch::GEMMA3_12B) {
|
||||
params.head_dim = 256;
|
||||
params.num_heads = 16;
|
||||
params.num_kv_heads = 8;
|
||||
params.qkv_bias = false;
|
||||
params.qk_norm = true;
|
||||
params.rms_norm_eps = 1e-6f;
|
||||
// llama.cpp adds +1 to Gemma3 norm.weight when exporting GGUF, so GGUF loading
|
||||
// must keep rms_norm_add disabled here or the offset gets applied twice.
|
||||
// Convenient for the converter, less convenient for whoever gets to debug it later.
|
||||
params.rms_norm_add = false;
|
||||
params.normalize_input = true;
|
||||
params.max_position_embeddings = 131072;
|
||||
params.mlp_activation = MLPActivation::GELU_TANH;
|
||||
params.rope_thetas = {1000000.f, 10000.f};
|
||||
params.rope_scales = {8.f, 1.f};
|
||||
params.sliding_attention = {1024, 1024, 1024, 1024, 1024, 0};
|
||||
} else if (arch == LLMArch::GEMMA2_2B) {
|
||||
params.head_dim = 256;
|
||||
params.num_heads = 8;
|
||||
params.num_kv_heads = 4;
|
||||
params.qkv_bias = false;
|
||||
params.qk_norm = false;
|
||||
params.rms_norm_eps = 1e-6f;
|
||||
params.rms_norm_add = true;
|
||||
params.normalize_input = true;
|
||||
params.max_position_embeddings = 8192;
|
||||
params.mlp_activation = MLPActivation::GELU_TANH;
|
||||
params.hidden_size = 2304;
|
||||
params.intermediate_size = 9216;
|
||||
params.num_layers = 26;
|
||||
params.vocab_size = 256000;
|
||||
} else if (arch == LLMArch::GPT_OSS_20B) {
|
||||
params.head_dim = 64;
|
||||
params.num_heads = 64;
|
||||
params.num_kv_heads = 8;
|
||||
params.qkv_bias = true;
|
||||
params.attention_out_bias = true;
|
||||
params.qk_norm = false;
|
||||
params.rms_norm_eps = 1e-5f;
|
||||
params.hidden_size = 2880;
|
||||
params.intermediate_size = 2880;
|
||||
params.num_layers = 24;
|
||||
params.vocab_size = 201088;
|
||||
params.max_position_embeddings = 131072;
|
||||
params.rope_thetas = {150000.f};
|
||||
params.rope_scales = {32.f};
|
||||
params.sliding_attention = {128, 0};
|
||||
params.num_experts = 32;
|
||||
params.num_experts_per_tok = 4;
|
||||
}
|
||||
bool have_vision_weight = false;
|
||||
bool llama_cpp_style = false;
|
||||
params.num_layers = 0;
|
||||
for (auto pair : tensor_storage_map) {
|
||||
std::string tensor_name = pair.first;
|
||||
if (tensor_name.find(prefix) == std::string::npos)
|
||||
continue;
|
||||
size_t pos = tensor_name.find("visual.");
|
||||
if (pos != std::string::npos) {
|
||||
have_vision_weight = true;
|
||||
if (contains(tensor_name, "attn.q_proj")) {
|
||||
llama_cpp_style = true;
|
||||
}
|
||||
continue;
|
||||
}
|
||||
pos = tensor_name.find("layers.");
|
||||
if (pos != std::string::npos) {
|
||||
tensor_name = tensor_name.substr(pos); // remove prefix
|
||||
auto items = split_string(tensor_name, '.');
|
||||
if (items.size() > 1) {
|
||||
int block_index = atoi(items[1].c_str());
|
||||
if (block_index + 1 > params.num_layers) {
|
||||
params.num_layers = block_index + 1;
|
||||
}
|
||||
}
|
||||
}
|
||||
if (contains(tensor_name, "embed_tokens.weight")) {
|
||||
params.hidden_size = pair.second.ne[0];
|
||||
params.vocab_size = pair.second.ne[1];
|
||||
}
|
||||
if (contains(tensor_name, "layers.0.mlp.gate_proj.weight")) {
|
||||
params.intermediate_size = pair.second.ne[1];
|
||||
}
|
||||
if (contains(tensor_name, "layers.0.mlp.experts.gate_up_proj.weight")) {
|
||||
params.intermediate_size = pair.second.ne[1] / 2;
|
||||
}
|
||||
if (contains(tensor_name, "layers.0.mlp.experts.gate_proj.weight")) {
|
||||
params.intermediate_size = pair.second.ne[1];
|
||||
}
|
||||
}
|
||||
if (arch == LLMArch::QWEN3 && params.num_layers == 28) { // Qwen3 2B
|
||||
params.num_heads = 16;
|
||||
}
|
||||
LOG_DEBUG("llm: num_layers = %" PRId64 ", vocab_size = %" PRId64 ", hidden_size = %" PRId64 ", intermediate_size = %" PRId64,
|
||||
params.num_layers,
|
||||
params.vocab_size,
|
||||
params.hidden_size,
|
||||
params.intermediate_size);
|
||||
if (enable_vision && !have_vision_weight) {
|
||||
: GGMLRunner(backend, params_backend),
|
||||
config(LLMConfig::detect_from_weights(tensor_storage_map, prefix, arch)),
|
||||
enable_vision(enable_vision_) {
|
||||
if (enable_vision && !config.have_vision_weight) {
|
||||
LOG_WARN("no vision weights detected, vision disabled");
|
||||
enable_vision = false;
|
||||
}
|
||||
if (enable_vision) {
|
||||
LOG_DEBUG("enable llm vision");
|
||||
if (llama_cpp_style) {
|
||||
if (config.llama_cpp_style) {
|
||||
LOG_DEBUG("llama.cpp style vision weight");
|
||||
}
|
||||
}
|
||||
model = LLM(params, enable_vision, llama_cpp_style);
|
||||
model = LLM(config, enable_vision, config.llama_cpp_style);
|
||||
model.init(params_ctx, tensor_storage_map, prefix);
|
||||
}
|
||||
|
||||
std::string get_desc() override {
|
||||
return llm_arch_to_str[static_cast<int>(params.arch)];
|
||||
return llm_arch_to_str[static_cast<int>(config.arch)];
|
||||
}
|
||||
|
||||
void get_param_tensors(std::map<std::string, ggml_tensor*>& tensors, const std::string prefix) {
|
||||
@ -1638,12 +1644,12 @@ namespace LLM {
|
||||
}
|
||||
|
||||
int64_t n_tokens = input_ids->ne[0];
|
||||
if (params.arch == LLMArch::MISTRAL_SMALL_3_2 ||
|
||||
params.arch == LLMArch::MINISTRAL_3_3B ||
|
||||
params.arch == LLMArch::QWEN3 ||
|
||||
params.arch == LLMArch::GEMMA3_12B ||
|
||||
params.arch == LLMArch::GEMMA2_2B ||
|
||||
params.arch == LLMArch::GPT_OSS_20B) {
|
||||
if (config.arch == LLMArch::MISTRAL_SMALL_3_2 ||
|
||||
config.arch == LLMArch::MINISTRAL_3_3B ||
|
||||
config.arch == LLMArch::QWEN3 ||
|
||||
config.arch == LLMArch::GEMMA3_12B ||
|
||||
config.arch == LLMArch::GEMMA2_2B ||
|
||||
config.arch == LLMArch::GPT_OSS_20B) {
|
||||
input_pos_vec.resize(n_tokens);
|
||||
for (int i = 0; i < n_tokens; ++i) {
|
||||
input_pos_vec[i] = i;
|
||||
@ -1682,9 +1688,9 @@ namespace LLM {
|
||||
set_backend_tensor_data(attention_mask, attention_mask_vec.data());
|
||||
}
|
||||
|
||||
if (params.arch == LLMArch::GEMMA3_12B || params.arch == LLMArch::GPT_OSS_20B) {
|
||||
if (config.arch == LLMArch::GEMMA3_12B || config.arch == LLMArch::GPT_OSS_20B) {
|
||||
int sliding_window = 0;
|
||||
for (int window : params.sliding_attention) {
|
||||
for (int window : config.sliding_attention) {
|
||||
sliding_window = std::max(sliding_window, window);
|
||||
}
|
||||
sliding_attention_mask_vec.resize(n_tokens * n_tokens);
|
||||
@ -1740,15 +1746,15 @@ namespace LLM {
|
||||
|
||||
int64_t get_num_image_tokens(int64_t t, int64_t h, int64_t w) {
|
||||
int64_t grid_t = 1;
|
||||
int64_t grid_h = h / params.vision.patch_size;
|
||||
int64_t grid_w = w / params.vision.patch_size;
|
||||
int64_t llm_grid_h = grid_h / params.vision.spatial_merge_size;
|
||||
int64_t llm_grid_w = grid_w / params.vision.spatial_merge_size;
|
||||
int64_t grid_h = h / config.vision.patch_size;
|
||||
int64_t grid_w = w / config.vision.patch_size;
|
||||
int64_t llm_grid_h = grid_h / config.vision.spatial_merge_size;
|
||||
int64_t llm_grid_w = grid_w / config.vision.spatial_merge_size;
|
||||
return grid_t * grid_h * grid_w;
|
||||
}
|
||||
|
||||
ggml_tensor* process_image(ggml_context* ctx, ggml_tensor* image) {
|
||||
return process_image_common(ctx, image, params.vision);
|
||||
return process_image_common(ctx, image, config.vision);
|
||||
}
|
||||
|
||||
ggml_tensor* build_patch_pos_embeds(GGMLRunnerContext* runner_ctx,
|
||||
@ -1770,7 +1776,7 @@ namespace LLM {
|
||||
compute_ctx,
|
||||
runner_ctx,
|
||||
image,
|
||||
params.vision,
|
||||
config.vision,
|
||||
model.vision_model(),
|
||||
window_index_vec,
|
||||
window_inverse_index_vec,
|
||||
@ -1784,8 +1790,8 @@ namespace LLM {
|
||||
ggml_cgraph* gf = new_graph_custom(LLM_GRAPH_SIZE);
|
||||
ggml_tensor* image = make_input(image_tensor);
|
||||
|
||||
GGML_ASSERT(image->ne[1] % (params.vision.patch_size * params.vision.spatial_merge_size) == 0);
|
||||
GGML_ASSERT(image->ne[0] % (params.vision.patch_size * params.vision.spatial_merge_size) == 0);
|
||||
GGML_ASSERT(image->ne[1] % (config.vision.patch_size * config.vision.spatial_merge_size) == 0);
|
||||
GGML_ASSERT(image->ne[0] % (config.vision.patch_size * config.vision.spatial_merge_size) == 0);
|
||||
|
||||
auto runnter_ctx = get_context();
|
||||
ggml_tensor* hidden_states = encode_image(&runnter_ctx, image);
|
||||
@ -2094,4 +2100,4 @@ namespace LLM {
|
||||
};
|
||||
}; // LLM
|
||||
|
||||
#endif // __LLM_HPP__
|
||||
#endif // __SD_MODEL_TE_LLM_HPP__
|
||||
@ -1,5 +1,5 @@
|
||||
#ifndef __T5_HPP__
|
||||
#define __T5_HPP__
|
||||
#ifndef __SD_MODEL_TE_T5_HPP__
|
||||
#define __SD_MODEL_TE_T5_HPP__
|
||||
|
||||
#include <cfloat>
|
||||
#include <limits>
|
||||
@ -10,10 +10,32 @@
|
||||
#include <string>
|
||||
#include <unordered_map>
|
||||
|
||||
#include "ggml_extend.hpp"
|
||||
#include "core/ggml_extend.hpp"
|
||||
#include "model.h"
|
||||
#include "tokenizers/t5_unigram_tokenizer.h"
|
||||
|
||||
struct T5Config {
|
||||
int64_t num_layers = 24;
|
||||
int64_t model_dim = 4096;
|
||||
int64_t ff_dim = 10240;
|
||||
int64_t num_heads = 64;
|
||||
int64_t vocab_size = 32128;
|
||||
bool relative_attention = true;
|
||||
|
||||
static T5Config detect_from_weights(const String2TensorStorage& tensor_storage_map,
|
||||
const std::string& prefix,
|
||||
bool is_umt5 = false) {
|
||||
(void)tensor_storage_map;
|
||||
(void)prefix;
|
||||
T5Config config;
|
||||
if (is_umt5) {
|
||||
config.vocab_size = 256384;
|
||||
config.relative_attention = false;
|
||||
}
|
||||
return config;
|
||||
}
|
||||
};
|
||||
|
||||
class T5LayerNorm : public UnaryBlock {
|
||||
protected:
|
||||
int64_t hidden_size;
|
||||
@ -272,30 +294,21 @@ public:
|
||||
}
|
||||
};
|
||||
|
||||
struct T5Params {
|
||||
int64_t num_layers = 24;
|
||||
int64_t model_dim = 4096;
|
||||
int64_t ff_dim = 10240;
|
||||
int64_t num_heads = 64;
|
||||
int64_t vocab_size = 32128;
|
||||
bool relative_attention = true;
|
||||
};
|
||||
|
||||
struct T5 : public GGMLBlock {
|
||||
T5Params params;
|
||||
T5Config config;
|
||||
|
||||
public:
|
||||
T5() {}
|
||||
T5(T5Params params)
|
||||
: params(params) {
|
||||
blocks["encoder"] = std::shared_ptr<GGMLBlock>(new T5Stack(params.num_layers,
|
||||
params.model_dim,
|
||||
params.model_dim,
|
||||
params.ff_dim,
|
||||
params.num_heads,
|
||||
params.relative_attention));
|
||||
blocks["shared"] = std::shared_ptr<GGMLBlock>(new Embedding(params.vocab_size,
|
||||
params.model_dim));
|
||||
T5(T5Config config)
|
||||
: config(config) {
|
||||
blocks["encoder"] = std::shared_ptr<GGMLBlock>(new T5Stack(config.num_layers,
|
||||
config.model_dim,
|
||||
config.model_dim,
|
||||
config.ff_dim,
|
||||
config.num_heads,
|
||||
config.relative_attention));
|
||||
blocks["shared"] = std::shared_ptr<GGMLBlock>(new Embedding(config.vocab_size,
|
||||
config.model_dim));
|
||||
}
|
||||
|
||||
ggml_tensor* forward(GGMLRunnerContext* ctx,
|
||||
@ -316,7 +329,7 @@ public:
|
||||
};
|
||||
|
||||
struct T5Runner : public GGMLRunner {
|
||||
T5Params params;
|
||||
T5Config config;
|
||||
T5 model;
|
||||
std::vector<int> relative_position_bucket_vec;
|
||||
|
||||
@ -325,12 +338,9 @@ struct T5Runner : public GGMLRunner {
|
||||
const String2TensorStorage& tensor_storage_map,
|
||||
const std::string prefix,
|
||||
bool is_umt5 = false)
|
||||
: GGMLRunner(backend, params_backend) {
|
||||
if (is_umt5) {
|
||||
params.vocab_size = 256384;
|
||||
params.relative_attention = false;
|
||||
}
|
||||
model = T5(params);
|
||||
: GGMLRunner(backend, params_backend),
|
||||
config(T5Config::detect_from_weights(tensor_storage_map, prefix, is_umt5)) {
|
||||
model = T5(config);
|
||||
model.init(params_ctx, tensor_storage_map, prefix);
|
||||
}
|
||||
|
||||
@ -600,4 +610,4 @@ struct T5Embedder {
|
||||
}
|
||||
};
|
||||
|
||||
#endif // __T5_HPP__
|
||||
#endif // __SD_MODEL_TE_T5_HPP__
|
||||
@ -1,7 +1,7 @@
|
||||
#ifndef __ESRGAN_HPP__
|
||||
#define __ESRGAN_HPP__
|
||||
#ifndef __SD_MODEL_UPSCALER_ESRGAN_HPP__
|
||||
#define __SD_MODEL_UPSCALER_ESRGAN_HPP__
|
||||
|
||||
#include "ggml_extend.hpp"
|
||||
#include "core/ggml_extend.hpp"
|
||||
#include "model.h"
|
||||
|
||||
/*
|
||||
@ -372,4 +372,4 @@ struct ESRGAN : public GGMLRunner {
|
||||
}
|
||||
};
|
||||
|
||||
#endif // __ESRGAN_HPP__
|
||||
#endif // __SD_MODEL_UPSCALER_ESRGAN_HPP__
|
||||
@ -1,5 +1,5 @@
|
||||
#ifndef __SD_LTX_LATENT_UPSCALER_HPP__
|
||||
#define __SD_LTX_LATENT_UPSCALER_HPP__
|
||||
#ifndef __SD_MODEL_UPSCALER_LTX_LATENT_UPSCALER_HPP__
|
||||
#define __SD_MODEL_UPSCALER_LTX_LATENT_UPSCALER_HPP__
|
||||
|
||||
#include <cinttypes>
|
||||
#include <cmath>
|
||||
@ -11,11 +11,11 @@
|
||||
#include <utility>
|
||||
#include <vector>
|
||||
|
||||
#include "common_dit.hpp"
|
||||
#include "ggml_extend.hpp"
|
||||
#include "ggml_graph_cut.h"
|
||||
#include "core/ggml_extend.hpp"
|
||||
#include "core/ggml_graph_cut.h"
|
||||
#include "core/util.h"
|
||||
#include "model.h"
|
||||
#include "util.h"
|
||||
#include "model/diffusion/dit.hpp"
|
||||
|
||||
namespace LTXVUpsampler {
|
||||
constexpr int LTX_UPSAMPLER_GRAPH_SIZE = 10240;
|
||||
@ -548,4 +548,4 @@ namespace LTXVUpsampler {
|
||||
|
||||
} // namespace LTXVUpsampler
|
||||
|
||||
#endif // __SD_LTX_LATENT_UPSCALER_HPP__
|
||||
#endif // __SD_MODEL_UPSCALER_LTX_LATENT_UPSCALER_HPP__
|
||||
@ -1,7 +1,7 @@
|
||||
#ifndef __AUTO_ENCODER_KL_HPP__
|
||||
#define __AUTO_ENCODER_KL_HPP__
|
||||
#ifndef __SD_MODEL_VAE_AUTO_ENCODER_KL_HPP__
|
||||
#define __SD_MODEL_VAE_AUTO_ENCODER_KL_HPP__
|
||||
|
||||
#include "vae.hpp"
|
||||
#include "model/vae/vae.hpp"
|
||||
|
||||
/*================================================== AutoEncoderKL ===================================================*/
|
||||
|
||||
@ -886,4 +886,4 @@ struct AutoEncoderKL : public VAE {
|
||||
};
|
||||
};
|
||||
|
||||
#endif // __AUTO_ENCODER_KL_HPP__
|
||||
#endif // __SD_MODEL_VAE_AUTO_ENCODER_KL_HPP__
|
||||
@ -1,5 +1,5 @@
|
||||
#ifndef __SD_LTX_AUDIO_VAE_H__
|
||||
#define __SD_LTX_AUDIO_VAE_H__
|
||||
#ifndef __SD_MODEL_VAE_LTX_AUDIO_VAE_HPP__
|
||||
#define __SD_MODEL_VAE_LTX_AUDIO_VAE_HPP__
|
||||
|
||||
#include <cmath>
|
||||
#include <limits>
|
||||
@ -7,7 +7,7 @@
|
||||
#include <string>
|
||||
#include <vector>
|
||||
|
||||
#include "ggml_extend.hpp"
|
||||
#include "core/ggml_extend.hpp"
|
||||
|
||||
namespace LTXV {
|
||||
|
||||
@ -58,11 +58,12 @@ namespace LTXV {
|
||||
return base_output_sample_rate();
|
||||
}
|
||||
|
||||
static LTXAudioVAEConfig detect_from_weights(const String2TensorStorage& tensor_storage_map) {
|
||||
static LTXAudioVAEConfig detect_from_weights(const String2TensorStorage& tensor_storage_map, const std::string& prefix = "") {
|
||||
LTXAudioVAEConfig config;
|
||||
|
||||
auto require = [&](const std::string& name) -> const TensorStorage* {
|
||||
auto iter = tensor_storage_map.find(name);
|
||||
std::string tensor_name = prefix.empty() ? name : prefix + "." + name;
|
||||
auto iter = tensor_storage_map.find(tensor_name);
|
||||
if (iter == tensor_storage_map.end()) {
|
||||
return nullptr;
|
||||
}
|
||||
@ -168,6 +169,12 @@ namespace LTXV {
|
||||
if (config.audio_channels != 2 || config.latent_channels != 8 || config.mel_bins != 64) {
|
||||
return config;
|
||||
}
|
||||
LOG_DEBUG("ltx_audio_vae: sample_rate = %d, mel_bins = %d, latent_channels = %d, latent_frequency_bins = %d, has_bwe = %s",
|
||||
config.sample_rate,
|
||||
config.mel_bins,
|
||||
config.latent_channels,
|
||||
config.latent_frequency_bins,
|
||||
config.has_bwe ? "true" : "false");
|
||||
return config;
|
||||
}
|
||||
};
|
||||
@ -1088,4 +1095,4 @@ namespace LTXV {
|
||||
|
||||
} // namespace LTXV
|
||||
|
||||
#endif // __SD_LTX_AUDIO_VAE_H__
|
||||
#endif // __SD_MODEL_VAE_LTX_AUDIO_VAE_HPP__
|
||||
@ -1,5 +1,5 @@
|
||||
#ifndef __SD_LTX_VAE_HPP__
|
||||
#define __SD_LTX_VAE_HPP__
|
||||
#ifndef __SD_MODEL_VAE_LTX_VAE_HPP__
|
||||
#define __SD_MODEL_VAE_LTX_VAE_HPP__
|
||||
|
||||
#include <algorithm>
|
||||
#include <fstream>
|
||||
@ -9,9 +9,9 @@
|
||||
#include <utility>
|
||||
#include <vector>
|
||||
|
||||
#include "ltxv.hpp"
|
||||
#include "vae.hpp"
|
||||
#include "wan.hpp"
|
||||
#include "model/diffusion/ltxv.hpp"
|
||||
#include "model/vae/vae.hpp"
|
||||
#include "model/vae/wan_vae.hpp"
|
||||
|
||||
namespace LTXVAE {
|
||||
|
||||
@ -1552,4 +1552,4 @@ struct LTXVideoVAE : public VAE {
|
||||
}
|
||||
};
|
||||
|
||||
#endif // __SD_LTX_VAE_HPP__
|
||||
#endif // __SD_MODEL_VAE_LTX_VAE_HPP__
|
||||
@ -1,13 +1,13 @@
|
||||
#ifndef __TAE_HPP__
|
||||
#define __TAE_HPP__
|
||||
#ifndef __SD_MODEL_VAE_TAE_HPP__
|
||||
#define __SD_MODEL_VAE_TAE_HPP__
|
||||
|
||||
#include "ggml_extend.hpp"
|
||||
#include "core/ggml_extend.hpp"
|
||||
#include "model.h"
|
||||
|
||||
/*
|
||||
=================================== TinyAutoEncoder ===================================
|
||||
References:
|
||||
https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/autoencoders/vae.py
|
||||
https://github.com/huggingface/diffusers/blob/main/src/diffusers/model/autoencoders/vae.py
|
||||
https://github.com/madebyollin/taesd/blob/main/taesd.py
|
||||
|
||||
*/
|
||||
@ -750,4 +750,4 @@ struct TinyVideoAutoEncoder : public VAE {
|
||||
}
|
||||
};
|
||||
|
||||
#endif // __TAE_HPP__
|
||||
#endif // __SD_MODEL_VAE_TAE_HPP__
|
||||
@ -1,8 +1,8 @@
|
||||
#ifndef __VAE_HPP__
|
||||
#define __VAE_HPP__
|
||||
#ifndef __SD_MODEL_VAE_VAE_HPP__
|
||||
#define __SD_MODEL_VAE_VAE_HPP__
|
||||
|
||||
#include "common_block.hpp"
|
||||
#include "tensor_ggml.hpp"
|
||||
#include "core/tensor_ggml.hpp"
|
||||
#include "model/common/block.hpp"
|
||||
|
||||
struct VAE : public GGMLRunner {
|
||||
protected:
|
||||
@ -258,4 +258,4 @@ struct FakeVAE : public VAE {
|
||||
}
|
||||
};
|
||||
|
||||
#endif // __VAE_HPP__
|
||||
#endif // __SD_MODEL_VAE_VAE_HPP__
|
||||
File diff suppressed because it is too large
Load Diff
@ -5,9 +5,9 @@
|
||||
#include <string>
|
||||
#include <vector>
|
||||
|
||||
#include "core/util.h"
|
||||
#include "gguf.h"
|
||||
#include "gguf_reader_ext.h"
|
||||
#include "util.h"
|
||||
|
||||
static void set_error(std::string* error, const std::string& message) {
|
||||
if (error != nullptr) {
|
||||
|
||||
@ -6,8 +6,8 @@
|
||||
#include <string>
|
||||
#include <vector>
|
||||
|
||||
#include "core/util.h"
|
||||
#include "ggml.h"
|
||||
#include "util.h"
|
||||
|
||||
struct GGUFTensorInfo {
|
||||
std::string name;
|
||||
|
||||
@ -8,7 +8,7 @@
|
||||
#include <vector>
|
||||
|
||||
#include "binary_io.h"
|
||||
#include "util.h"
|
||||
#include "core/util.h"
|
||||
|
||||
// $ python -m pickletools sd-v1-4/archive/data.pkl | head -n 100
|
||||
// 0: \x80 PROTO 2
|
||||
|
||||
@ -7,8 +7,8 @@
|
||||
#include <vector>
|
||||
|
||||
#include "binary_io.h"
|
||||
#include "core/util.h"
|
||||
#include "json.hpp"
|
||||
#include "util.h"
|
||||
|
||||
static constexpr size_t ST_HEADER_SIZE_LEN = 8;
|
||||
|
||||
|
||||
@ -7,8 +7,8 @@
|
||||
#include <unordered_map>
|
||||
#include <vector>
|
||||
|
||||
#include "core/util.h"
|
||||
#include "pickle_io.h"
|
||||
#include "util.h"
|
||||
|
||||
// torch.save format background:
|
||||
//
|
||||
|
||||
@ -1,8 +1,8 @@
|
||||
#include <unordered_map>
|
||||
#include <unordered_set>
|
||||
|
||||
#include "core/util.h"
|
||||
#include "name_conversion.h"
|
||||
#include "util.h"
|
||||
|
||||
void replace_with_name_map(std::string& name, const std::vector<std::pair<std::string, std::string>>& name_map) {
|
||||
for (auto kv : name_map) {
|
||||
|
||||
@ -1,5 +1,5 @@
|
||||
#ifndef __CACHE_DIT_HPP__
|
||||
#define __CACHE_DIT_HPP__
|
||||
#ifndef __SD_RUNTIME_CACHE_DIT_HPP__
|
||||
#define __SD_RUNTIME_CACHE_DIT_HPP__
|
||||
|
||||
#include <algorithm>
|
||||
#include <cmath>
|
||||
@ -8,9 +8,9 @@
|
||||
#include <unordered_map>
|
||||
#include <vector>
|
||||
|
||||
#include "condition_cache_utils.hpp"
|
||||
#include "ggml_extend.hpp"
|
||||
#include "tensor.hpp"
|
||||
#include "core/ggml_extend.hpp"
|
||||
#include "core/tensor.hpp"
|
||||
#include "runtime/condition_cache_utils.hpp"
|
||||
|
||||
struct DBCacheConfig {
|
||||
bool enabled = false;
|
||||
@ -893,4 +893,4 @@ struct CacheDitConditionState {
|
||||
}
|
||||
};
|
||||
|
||||
#endif
|
||||
#endif // __SD_RUNTIME_CACHE_DIT_HPP__
|
||||
@ -1,9 +1,9 @@
|
||||
#ifndef __CONDITION_CACHE_UTILS_HPP__
|
||||
#define __CONDITION_CACHE_UTILS_HPP__
|
||||
#ifndef __SD_RUNTIME_CONDITION_CACHE_UTILS_HPP__
|
||||
#define __SD_RUNTIME_CONDITION_CACHE_UTILS_HPP__
|
||||
|
||||
#include <vector>
|
||||
|
||||
#include "tensor.hpp"
|
||||
#include "core/tensor.hpp"
|
||||
|
||||
namespace sd {
|
||||
|
||||
@ -61,4 +61,4 @@ namespace sd {
|
||||
|
||||
} // namespace sd
|
||||
|
||||
#endif // __CONDITION_CACHE_UTILS_HPP__
|
||||
#endif // __SD_RUNTIME_CONDITION_CACHE_UTILS_HPP__
|
||||
@ -1,5 +1,5 @@
|
||||
#ifndef __DENOISER_HPP__
|
||||
#define __DENOISER_HPP__
|
||||
#ifndef __SD_RUNTIME_DENOISER_HPP__
|
||||
#define __SD_RUNTIME_DENOISER_HPP__
|
||||
|
||||
#include <algorithm>
|
||||
#include <cctype>
|
||||
@ -8,10 +8,10 @@
|
||||
#include <string>
|
||||
#include <utility>
|
||||
|
||||
#include "ggml_extend.hpp"
|
||||
#include "gits_noise.inl"
|
||||
#include "guidance.h"
|
||||
#include "tensor.hpp"
|
||||
#include "core/ggml_extend.hpp"
|
||||
#include "core/tensor.hpp"
|
||||
#include "runtime/gits_noise.inl"
|
||||
#include "runtime/guidance.h"
|
||||
|
||||
/*================================================= CompVisDenoiser ==================================================*/
|
||||
|
||||
@ -1902,4 +1902,4 @@ static sd::Tensor<float> sample_k_diffusion(sample_method_t method,
|
||||
}
|
||||
}
|
||||
|
||||
#endif // __DENOISER_HPP__
|
||||
#endif // __SD_RUNTIME_DENOISER_HPP__
|
||||
@ -1,15 +1,15 @@
|
||||
#ifndef __EASYCACHE_HPP__
|
||||
#define __EASYCACHE_HPP__
|
||||
#ifndef __SD_RUNTIME_EASYCACHE_HPP__
|
||||
#define __SD_RUNTIME_EASYCACHE_HPP__
|
||||
|
||||
#include <cmath>
|
||||
#include <limits>
|
||||
#include <unordered_map>
|
||||
#include <vector>
|
||||
|
||||
#include "condition_cache_utils.hpp"
|
||||
#include "denoiser.hpp"
|
||||
#include "ggml_extend.hpp"
|
||||
#include "tensor.hpp"
|
||||
#include "core/ggml_extend.hpp"
|
||||
#include "core/tensor.hpp"
|
||||
#include "runtime/condition_cache_utils.hpp"
|
||||
#include "runtime/denoiser.hpp"
|
||||
|
||||
struct EasyCacheConfig {
|
||||
bool enabled = false;
|
||||
@ -258,4 +258,4 @@ struct EasyCacheState {
|
||||
}
|
||||
};
|
||||
|
||||
#endif
|
||||
#endif // __SD_RUNTIME_EASYCACHE_HPP__
|
||||
@ -1,4 +1,4 @@
|
||||
#include "guidance.h"
|
||||
#include "runtime/guidance.h"
|
||||
|
||||
#include <algorithm>
|
||||
#include <cmath>
|
||||
@ -6,7 +6,7 @@
|
||||
#include <string>
|
||||
#include <utility>
|
||||
|
||||
#include "util.h"
|
||||
#include "core/util.h"
|
||||
|
||||
namespace sd::guidance {
|
||||
|
||||
@ -1,11 +1,11 @@
|
||||
#ifndef __SD_GUIDANCE_H__
|
||||
#define __SD_GUIDANCE_H__
|
||||
#ifndef __SD_RUNTIME_GUIDANCE_H__
|
||||
#define __SD_RUNTIME_GUIDANCE_H__
|
||||
|
||||
#include <cstddef>
|
||||
#include <functional>
|
||||
#include <vector>
|
||||
|
||||
#include "tensor.hpp"
|
||||
#include "core/tensor.hpp"
|
||||
|
||||
namespace sd::guidance {
|
||||
|
||||
@ -93,4 +93,4 @@ namespace sd::guidance {
|
||||
|
||||
} // namespace sd::guidance
|
||||
|
||||
#endif // __SD_GUIDANCE_H__
|
||||
#endif // __SD_RUNTIME_GUIDANCE_H__
|
||||
@ -1,8 +1,8 @@
|
||||
#include <algorithm>
|
||||
#include <cstddef>
|
||||
#include <cstdint>
|
||||
#include "core/tensor.hpp"
|
||||
#include "ggml.h"
|
||||
#include "tensor.hpp"
|
||||
|
||||
const float ltxav_latent_rgb_proj[128][3] = {
|
||||
{-0.0293802f, -0.0362516f, -0.0291386f},
|
||||
@ -1,10 +1,10 @@
|
||||
#ifndef __PREPROCESSING_HPP__
|
||||
#define __PREPROCESSING_HPP__
|
||||
#ifndef __SD_RUNTIME_PREPROCESSING_HPP__
|
||||
#define __SD_RUNTIME_PREPROCESSING_HPP__
|
||||
|
||||
#include <cmath>
|
||||
#include <limits>
|
||||
|
||||
#include "ggml_extend.hpp"
|
||||
#include "core/ggml_extend.hpp"
|
||||
|
||||
#define M_PI_ 3.14159265358979323846f
|
||||
|
||||
@ -331,4 +331,4 @@ bool preprocess_canny(sd_image_t img, float high_threshold, float low_threshold,
|
||||
return true;
|
||||
}
|
||||
|
||||
#endif // __PREPROCESSING_HPP__
|
||||
#endif // __SD_RUNTIME_PREPROCESSING_HPP__
|
||||
@ -1,4 +1,4 @@
|
||||
#include "sample-cache.h"
|
||||
#include "runtime/sample-cache.h"
|
||||
|
||||
namespace sd_sample {
|
||||
|
||||
@ -1,16 +1,16 @@
|
||||
#ifndef __SAMPLE_CACHE_H__
|
||||
#define __SAMPLE_CACHE_H__
|
||||
#ifndef __SD_RUNTIME_SAMPLE_CACHE_H__
|
||||
#define __SD_RUNTIME_SAMPLE_CACHE_H__
|
||||
|
||||
#include <vector>
|
||||
|
||||
#include "cache_dit.hpp"
|
||||
#include "denoiser.hpp"
|
||||
#include "easycache.hpp"
|
||||
#include "core/tensor.hpp"
|
||||
#include "core/util.h"
|
||||
#include "model.h"
|
||||
#include "spectrum.hpp"
|
||||
#include "tensor.hpp"
|
||||
#include "ucache.hpp"
|
||||
#include "util.h"
|
||||
#include "runtime/cache_dit.hpp"
|
||||
#include "runtime/denoiser.hpp"
|
||||
#include "runtime/easycache.hpp"
|
||||
#include "runtime/spectrum.hpp"
|
||||
#include "runtime/ucache.hpp"
|
||||
|
||||
namespace sd_sample {
|
||||
|
||||
@ -58,4 +58,4 @@ namespace sd_sample {
|
||||
|
||||
} // namespace sd_sample
|
||||
|
||||
#endif // __SAMPLE_CACHE_H__
|
||||
#endif // __SD_RUNTIME_SAMPLE_CACHE_H__
|
||||
@ -1,12 +1,12 @@
|
||||
#ifndef __SPECTRUM_HPP__
|
||||
#define __SPECTRUM_HPP__
|
||||
#ifndef __SD_RUNTIME_SPECTRUM_HPP__
|
||||
#define __SD_RUNTIME_SPECTRUM_HPP__
|
||||
|
||||
#include <cmath>
|
||||
#include <cstring>
|
||||
#include <vector>
|
||||
|
||||
#include "ggml_extend.hpp"
|
||||
#include "tensor.hpp"
|
||||
#include "core/ggml_extend.hpp"
|
||||
#include "core/tensor.hpp"
|
||||
|
||||
struct SpectrumConfig {
|
||||
float w = 0.40f;
|
||||
@ -184,4 +184,4 @@ private:
|
||||
}
|
||||
};
|
||||
|
||||
#endif // __SPECTRUM_HPP__
|
||||
#endif // __SD_RUNTIME_SPECTRUM_HPP__
|
||||
@ -1,15 +1,15 @@
|
||||
#ifndef __UCACHE_HPP__
|
||||
#define __UCACHE_HPP__
|
||||
#ifndef __SD_RUNTIME_UCACHE_HPP__
|
||||
#define __SD_RUNTIME_UCACHE_HPP__
|
||||
|
||||
#include <cmath>
|
||||
#include <limits>
|
||||
#include <unordered_map>
|
||||
#include <vector>
|
||||
|
||||
#include "condition_cache_utils.hpp"
|
||||
#include "denoiser.hpp"
|
||||
#include "ggml_extend.hpp"
|
||||
#include "tensor.hpp"
|
||||
#include "core/ggml_extend.hpp"
|
||||
#include "core/tensor.hpp"
|
||||
#include "runtime/condition_cache_utils.hpp"
|
||||
#include "runtime/denoiser.hpp"
|
||||
|
||||
struct UCacheConfig {
|
||||
bool enabled = false;
|
||||
@ -420,4 +420,4 @@ struct UCacheState {
|
||||
}
|
||||
};
|
||||
|
||||
#endif // __UCACHE_HPP__
|
||||
#endif // __SD_RUNTIME_UCACHE_HPP__
|
||||
@ -2,48 +2,49 @@
|
||||
#include <cmath>
|
||||
#include <cstdlib>
|
||||
|
||||
#include "ggml_extend.hpp"
|
||||
#include "ggml_graph_cut.h"
|
||||
#include "core/ggml_extend.hpp"
|
||||
#include "core/ggml_graph_cut.h"
|
||||
|
||||
#include "core/rng.hpp"
|
||||
#include "core/rng_mt19937.hpp"
|
||||
#include "core/rng_philox.hpp"
|
||||
#include "core/util.h"
|
||||
#include "model.h"
|
||||
#include "rng.hpp"
|
||||
#include "rng_mt19937.hpp"
|
||||
#include "rng_philox.hpp"
|
||||
#include "stable-diffusion.h"
|
||||
#include "util.h"
|
||||
|
||||
#include "anima.hpp"
|
||||
#include "auto_encoder_kl.hpp"
|
||||
#include "conditioner.hpp"
|
||||
#include "control.hpp"
|
||||
#include "denoiser.hpp"
|
||||
#include "diffusion_model.hpp"
|
||||
#include "ernie_image.hpp"
|
||||
#include "esrgan.hpp"
|
||||
#include "flux.hpp"
|
||||
#include "guidance.h"
|
||||
#include "hidream_o1.hpp"
|
||||
#include "ideogram4.hpp"
|
||||
#include "lens.hpp"
|
||||
#include "lora.hpp"
|
||||
#include "ltx_audio_vae.h"
|
||||
#include "ltx_latent_upscaler.hpp"
|
||||
#include "ltx_vae.hpp"
|
||||
#include "ltxv.hpp"
|
||||
#include "mmdit.hpp"
|
||||
#include "pid.hpp"
|
||||
#include "pmid.hpp"
|
||||
#include "qwen_image.hpp"
|
||||
#include "sample-cache.h"
|
||||
#include "tae.hpp"
|
||||
#include "unet.hpp"
|
||||
#include "conditioning/conditioner.hpp"
|
||||
#include "model/adapter/lora.hpp"
|
||||
#include "model/adapter/pmid.hpp"
|
||||
#include "model/diffusion/anima.hpp"
|
||||
#include "model/diffusion/control.hpp"
|
||||
#include "model/diffusion/ernie_image.hpp"
|
||||
#include "model/diffusion/flux.hpp"
|
||||
#include "model/diffusion/hidream_o1.hpp"
|
||||
#include "model/diffusion/ideogram4.hpp"
|
||||
#include "model/diffusion/lens.hpp"
|
||||
#include "model/diffusion/ltxv.hpp"
|
||||
#include "model/diffusion/mmdit.hpp"
|
||||
#include "model/diffusion/model.hpp"
|
||||
#include "model/diffusion/pid.hpp"
|
||||
#include "model/diffusion/qwen_image.hpp"
|
||||
#include "model/diffusion/unet.hpp"
|
||||
#include "model/diffusion/wan.hpp"
|
||||
#include "model/diffusion/z_image.hpp"
|
||||
#include "model/upscaler/esrgan.hpp"
|
||||
#include "model/upscaler/ltx_latent_upscaler.hpp"
|
||||
#include "model/vae/auto_encoder_kl.hpp"
|
||||
#include "model/vae/ltx_audio_vae.hpp"
|
||||
#include "model/vae/ltx_vae.hpp"
|
||||
#include "model/vae/tae.hpp"
|
||||
#include "model/vae/vae.hpp"
|
||||
#include "model/vae/wan_vae.hpp"
|
||||
#include "runtime/denoiser.hpp"
|
||||
#include "runtime/guidance.h"
|
||||
#include "runtime/sample-cache.h"
|
||||
#include "upscaler.h"
|
||||
#include "vae.hpp"
|
||||
#include "wan.hpp"
|
||||
#include "z_image.hpp"
|
||||
|
||||
#include "latent-preview.h"
|
||||
#include "name_conversion.h"
|
||||
#include "runtime/latent-preview.h"
|
||||
|
||||
const char* sd_vae_format_name(enum sd_vae_format_t format);
|
||||
static SDVersion sd_vae_format_to_version(enum sd_vae_format_t format, SDVersion fallback);
|
||||
|
||||
@ -3,8 +3,8 @@
|
||||
#include <algorithm>
|
||||
#include <sstream>
|
||||
|
||||
#include "core/util.h"
|
||||
#include "tokenize_util.h"
|
||||
#include "util.h"
|
||||
|
||||
std::vector<std::pair<int, std::u32string>> BPETokenizer::bytes_to_unicode() {
|
||||
std::vector<std::pair<int, std::u32string>> byte_unicode_pairs;
|
||||
|
||||
@ -6,9 +6,9 @@
|
||||
#include <regex>
|
||||
#include <set>
|
||||
|
||||
#include "core/util.h"
|
||||
#include "ggml.h"
|
||||
#include "tokenize_util.h"
|
||||
#include "util.h"
|
||||
#include "vocab/vocab.h"
|
||||
|
||||
CLIPTokenizer::CLIPTokenizer(int pad_token_id, const std::string& merges_utf8_str) {
|
||||
|
||||
@ -1,8 +1,8 @@
|
||||
#include "gemma_tokenizer.h"
|
||||
|
||||
#include "core/util.h"
|
||||
#include "ggml.h"
|
||||
#include "json.hpp"
|
||||
#include "util.h"
|
||||
#include "vocab/vocab.h"
|
||||
|
||||
std::string GemmaTokenizer::normalize(const std::string& text) const {
|
||||
|
||||
@ -1,7 +1,7 @@
|
||||
#include "gpt_oss_tokenizer.h"
|
||||
|
||||
#include "core/util.h"
|
||||
#include "json.hpp"
|
||||
#include "util.h"
|
||||
#include "vocab/vocab.h"
|
||||
|
||||
void GPTOSSTokenizer::load_from_merges(const std::string& merges_utf8_str, const std::string& vocab_utf8_str) {
|
||||
|
||||
@ -1,8 +1,8 @@
|
||||
#include "mistral_tokenizer.h"
|
||||
|
||||
#include "core/util.h"
|
||||
#include "ggml.h"
|
||||
#include "json.hpp"
|
||||
#include "util.h"
|
||||
#include "vocab/vocab.h"
|
||||
|
||||
void MistralTokenizer::load_from_merges(const std::string& merges_utf8_str, const std::string& vocab_utf8_str) {
|
||||
|
||||
@ -1,6 +1,6 @@
|
||||
#include "qwen2_tokenizer.h"
|
||||
|
||||
#include "util.h"
|
||||
#include "core/util.h"
|
||||
#include "vocab/vocab.h"
|
||||
|
||||
void Qwen2Tokenizer::load_from_merges(const std::string& merges_utf8_str) {
|
||||
|
||||
@ -6,9 +6,9 @@
|
||||
#include <regex>
|
||||
#include <sstream>
|
||||
|
||||
#include "core/util.h"
|
||||
#include "json.hpp"
|
||||
#include "tokenize_util.h"
|
||||
#include "util.h"
|
||||
#include "vocab/vocab.h"
|
||||
|
||||
// Port from: https://github.com/google/sentencepiece/blob/master/src/unigram_model.h
|
||||
|
||||
@ -4,7 +4,7 @@
|
||||
#include <cmath>
|
||||
#include <regex>
|
||||
|
||||
#include "util.h"
|
||||
#include "core/util.h"
|
||||
|
||||
void Tokenizer::add_special_token(const std::string& token) {
|
||||
special_tokens.push_back(token);
|
||||
|
||||
@ -1,8 +1,8 @@
|
||||
#include "upscaler.h"
|
||||
#include "ggml_extend.hpp"
|
||||
#include "core/ggml_extend.hpp"
|
||||
#include "core/util.h"
|
||||
#include "model.h"
|
||||
#include "stable-diffusion.h"
|
||||
#include "util.h"
|
||||
|
||||
#include <utility>
|
||||
|
||||
|
||||
@ -1,10 +1,10 @@
|
||||
#ifndef __SD_UPSCALER_H__
|
||||
#define __SD_UPSCALER_H__
|
||||
|
||||
#include "esrgan.hpp"
|
||||
#include "ggml_extend_backend.h"
|
||||
#include "core/ggml_extend_backend.h"
|
||||
#include "core/tensor.hpp"
|
||||
#include "model/upscaler/esrgan.hpp"
|
||||
#include "stable-diffusion.h"
|
||||
#include "tensor.hpp"
|
||||
|
||||
#include <memory>
|
||||
#include <string>
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user