Mirror of https://github.com/leejet/stable-diffusion.cpp.git
Synced 2025-12-13 05:48:56 +00:00

Compare commits: 5af0bb0aee...a3a2b2d721 (8 commits)
| SHA1 |
|---|
| a3a2b2d721 |
| a8d3aa0415 |
| 1ba30ce005 |
| 2abe9451c4 |
| f3140eadbb |
| 98ba155fc6 |
| 513f36d495 |
| 1e0d2821bb |
@@ -125,13 +125,14 @@ cmake --build . --config Release

 ##### Using HipBLAS

 This provides BLAS acceleration using the ROCm cores of your AMD GPU. Make sure you have the ROCm toolkit installed.

 To build for a GPU architecture other than the one installed in your system, set `$GFX_NAME` manually to the desired architecture (replacing the first command below). This is also necessary if your GPU is not officially supported by ROCm; for example, you have to set `$GFX_NAME` manually to `gfx1030` for consumer RDNA2 cards.

 Windows users: refer to [docs/hipBLAS_on_Windows.md](docs/hipBLAS_on_Windows.md) for a comprehensive guide.

 ```
-export GFX_NAME=$(rocminfo | grep -m 1 -E "gfx[^0]{1}" | sed -e 's/ *Name: *//' | awk '{$1=$1; print}' || echo "rocminfo missing")
-echo $GFX_NAME
-cmake .. -G "Ninja" -DCMAKE_C_COMPILER=clang -DCMAKE_CXX_COMPILER=clang++ -DSD_HIPBLAS=ON -DCMAKE_BUILD_TYPE=Release -DGPU_TARGETS=$GFX_NAME -DCMAKE_BUILD_WITH_INSTALL_RPATH=ON
+if command -v rocminfo; then export GFX_NAME=$(rocminfo | awk '/ *Name: +gfx[1-9]/ {print $2; exit}'); else echo "rocminfo missing!"; fi
+if [ -z "${GFX_NAME}" ]; then echo "Error: Couldn't detect GPU!"; else echo "Building for GPU: ${GFX_NAME}"; fi
+cmake .. -G "Ninja" -DCMAKE_C_COMPILER=clang -DCMAKE_CXX_COMPILER=clang++ -DSD_HIPBLAS=ON -DCMAKE_BUILD_TYPE=Release -DGPU_TARGETS=$GFX_NAME -DAMDGPU_TARGETS=$GFX_NAME -DCMAKE_BUILD_WITH_INSTALL_RPATH=ON -DCMAKE_POSITION_INDEPENDENT_CODE=ON
 cmake --build . --config Release
 ```
clip.hpp (9 changed lines)

@@ -553,12 +553,13 @@ protected:
     void init_params(struct ggml_context* ctx, const String2GGMLType& tensor_types = {}, const std::string prefix = "") {
         enum ggml_type token_wtype = GGML_TYPE_F32;
         if (!force_clip_f32) {
             auto tensor_type = tensor_types.find(prefix + "token_embedding.weight");
-            if (tensor_type != tensor_types.end())
+            std::set<ggml_type> allow_types = {GGML_TYPE_F16, GGML_TYPE_Q8_0, GGML_TYPE_Q5_1, GGML_TYPE_Q5_0, GGML_TYPE_Q4_1, GGML_TYPE_Q4_0};
+            if (tensor_type != tensor_types.end() && allow_types.find(tensor_type->second) != allow_types.end()) {
                 token_wtype = tensor_type->second;
+            }
         }

         enum ggml_type position_wtype = GGML_TYPE_F32;
         params["token_embedding.weight"]    = ggml_new_tensor_2d(ctx, token_wtype, embed_dim, vocab_size);
         params["position_embedding.weight"] = ggml_new_tensor_2d(ctx, position_wtype, embed_dim, num_positions);
     }
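Read in isolation, the intent of the new allow-list is easier to see in a small standalone sketch (assuming the ggml headers are on the include path; the helper name `pick_token_wtype` is illustrative and not part of the upstream code): the token-embedding weight keeps `GGML_TYPE_F32` unless the checkpoint stores it in one of the few quantizations listed.

```cpp
#include <set>

#include "ggml.h"

// Illustrative sketch only: condenses the allow-list logic from the hunk above.
// The embedding weight stays in GGML_TYPE_F32 unless the stored type is one of
// the quantizations the CLIP token-embedding path accepts.
static ggml_type pick_token_wtype(ggml_type stored_type, bool force_clip_f32) {
    static const std::set<ggml_type> allow_types = {
        GGML_TYPE_F16, GGML_TYPE_Q8_0, GGML_TYPE_Q5_1,
        GGML_TYPE_Q5_0, GGML_TYPE_Q4_1, GGML_TYPE_Q4_0};
    if (!force_clip_f32 && allow_types.count(stored_type) > 0) {
        return stored_type;
    }
    return GGML_TYPE_F32;
}
```

Under this logic, any type outside the set (for example the K-quants) falls back to F32 for the embedding table.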
@@ -146,7 +146,7 @@ struct FrozenCLIPEmbedderWithCustomWords : public Conditioner {
             }
             return true;
         };
-        model_loader.load_tensors(on_load);
+        model_loader.load_tensors(on_load, 1);
         readed_embeddings.push_back(embd_name);
         if (embd) {
             int64_t hidden_size = text_model->model.hidden_size;
@@ -445,7 +445,7 @@ struct ControlNet : public GGMLRunner {
         guided_hint_cached = true;
     }

-    bool load_from_file(const std::string& file_path) {
+    bool load_from_file(const std::string& file_path, int n_threads) {
         LOG_INFO("loading control net from '%s'", file_path.c_str());
         alloc_params_buffer();
         std::map<std::string, ggml_tensor*> tensors;
@@ -458,7 +458,7 @@ struct ControlNet : public GGMLRunner {
             return false;
         }

-        bool success = model_loader.load_tensors(tensors, ignore_tensors);
+        bool success = model_loader.load_tensors(tensors, ignore_tensors, n_threads);

         if (!success) {
             LOG_ERROR("load control net tensors from model loader failed");
docs/lora.md (44 changed lines)

@@ -20,20 +20,30 @@ Here's a simple example:

 NOTE: The other backends may have different support.

-| Quant / Type | CUDA |
-|--------------|------|
-| F32          | ✔️   |
-| F16          | ✔️   |
-| BF16         | ✔️   |
-| I32          | ✔️   |
-| Q4_0         | ✔️   |
-| Q4_1         | ✔️   |
-| Q5_0         | ✔️   |
-| Q5_1         | ✔️   |
-| Q8_0         | ✔️   |
-| Q2_K         | ❌   |
-| Q3_K         | ❌   |
-| Q4_K         | ❌   |
-| Q5_K         | ❌   |
-| Q6_K         | ❌   |
-| Q8_K         | ❌   |
+| Quant / Type | CUDA | Vulkan |
+|--------------|------|--------|
+| F32          | ✔️   | ✔️     |
+| F16          | ✔️   | ✔️     |
+| BF16         | ✔️   | ✔️     |
+| I32          | ✔️   | ❌     |
+| Q4_0         | ✔️   | ✔️     |
+| Q4_1         | ✔️   | ✔️     |
+| Q5_0         | ✔️   | ✔️     |
+| Q5_1         | ✔️   | ✔️     |
+| Q8_0         | ✔️   | ✔️     |
+| Q2_K         | ❌   | ❌     |
+| Q3_K         | ❌   | ❌     |
+| Q4_K         | ❌   | ❌     |
+| Q5_K         | ❌   | ❌     |
+| Q6_K         | ❌   | ❌     |
+| Q8_K         | ❌   | ❌     |
+| IQ1_S        | ❌   | ✔️     |
+| IQ1_M        | ❌   | ✔️     |
+| IQ2_XXS      | ❌   | ✔️     |
+| IQ2_XS       | ❌   | ✔️     |
+| IQ2_S        | ❌   | ✔️     |
+| IQ3_XXS      | ❌   | ✔️     |
+| IQ3_S        | ❌   | ✔️     |
+| IQ4_XS       | ❌   | ✔️     |
+| IQ4_NL       | ❌   | ✔️     |
+| MXFP4        | ❌   | ✔️     |
@@ -164,7 +164,7 @@ struct ESRGAN : public GGMLRunner {
         return "esrgan";
     }

-    bool load_from_file(const std::string& file_path) {
+    bool load_from_file(const std::string& file_path, int n_threads) {
         LOG_INFO("loading esrgan from '%s'", file_path.c_str());

         alloc_params_buffer();
@@ -177,7 +177,7 @@ struct ESRGAN : public GGMLRunner {
             return false;
         }

-        bool success = model_loader.load_tensors(esrgan_tensors);
+        bool success = model_loader.load_tensors(esrgan_tensors, {}, n_threads);

         if (!success) {
             LOG_ERROR("load esrgan tensors from model loader failed");
ggml (2 changed lines)

@@ -1 +1 @@
-Subproject commit 5fdc78fff274094e2a1b155928131983362d8a71
+Subproject commit 553c44706c3cc6e4077f4ab214923fc4c20a013c
lora.hpp (2 changed lines)

@@ -116,7 +116,7 @@ struct LoraModel : public GGMLRunner {
         return "lora";
     }

-    bool load_from_file(bool filter_tensor = false, int n_threads = 0) {
+    bool load_from_file(bool filter_tensor, int n_threads) {
         LOG_INFO("loading LoRA from '%s'", file_path.c_str());

         if (load_failed) {
model.cpp (26 changed lines)

@@ -1,4 +1,5 @@
 #include <stdarg.h>
+#include <algorithm>
 #include <atomic>
 #include <chrono>
 #include <fstream>
@@ -1995,7 +1996,8 @@ bool ModelLoader::load_tensors(on_new_tensor_cb_t on_new_tensor_cb, int n_thread
     std::atomic<int64_t> copy_to_backend_time_ms(0);
     std::atomic<int64_t> convert_time_ms(0);

-    int num_threads_to_use = n_threads_p > 0 ? n_threads_p : (int)std::thread::hardware_concurrency();
+    int num_threads_to_use = n_threads_p > 0 ? n_threads_p : get_num_physical_cores();
+    LOG_DEBUG("using %d threads for model loading", num_threads_to_use);

     int64_t start_time = ggml_time_ms();
     std::vector<TensorStorage> processed_tensor_storages;
@@ -2045,13 +2047,25 @@ bool ModelLoader::load_tensors(on_new_tensor_cb_t on_new_tensor_cb, int n_thread
         w.join();
     }

-    std::unordered_map<std::string, IndexedStorage> latest_map;
+    std::vector<IndexedStorage> deduplicated;
+    deduplicated.reserve(all_results.size());
+    std::unordered_map<std::string, size_t> name_to_pos;
     for (auto& entry : all_results) {
-        latest_map[entry.ts.name] = entry;
+        auto it = name_to_pos.find(entry.ts.name);
+        if (it == name_to_pos.end()) {
+            name_to_pos.emplace(entry.ts.name, deduplicated.size());
+            deduplicated.push_back(entry);
+        } else if (deduplicated[it->second].index < entry.index) {
+            deduplicated[it->second] = entry;
+        }
     }

-    processed_tensor_storages.reserve(latest_map.size());
-    for (auto& [name, entry] : latest_map) {
+    std::sort(deduplicated.begin(), deduplicated.end(), [](const IndexedStorage& a, const IndexedStorage& b) {
+        return a.index < b.index;
+    });
+
+    processed_tensor_storages.reserve(deduplicated.size());
+    for (auto& entry : deduplicated) {
         processed_tensor_storages.push_back(entry.ts);
     }
 }
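A condensed, self-contained sketch of what this deduplication step does (`Entry` is a simplified stand-in for `IndexedStorage`; this is an illustration, not the upstream code verbatim): for each tensor name, only the occurrence with the highest index survives, i.e. the one read last, and the survivors are then ordered by index.

```cpp
#include <algorithm>
#include <string>
#include <unordered_map>
#include <vector>

// Simplified stand-in for IndexedStorage: a tensor name plus the order in
// which the tensor was encountered while reading the model files.
struct Entry {
    std::string name;
    int index;
};

// Keep only the highest-index entry per name, then order the survivors by
// index. Mirrors the replacement of latest_map with the deduplicated vector.
static std::vector<Entry> dedup_keep_latest(const std::vector<Entry>& all_results) {
    std::vector<Entry> deduplicated;
    deduplicated.reserve(all_results.size());
    std::unordered_map<std::string, size_t> name_to_pos;  // name -> position in deduplicated
    for (const auto& entry : all_results) {
        auto it = name_to_pos.find(entry.name);
        if (it == name_to_pos.end()) {
            name_to_pos.emplace(entry.name, deduplicated.size());
            deduplicated.push_back(entry);
        } else if (deduplicated[it->second].index < entry.index) {
            deduplicated[it->second] = entry;  // the later occurrence wins
        }
    }
    std::sort(deduplicated.begin(), deduplicated.end(),
              [](const Entry& a, const Entry& b) { return a.index < b.index; });
    return deduplicated;
}
```

Compared with the previous `std::unordered_map`-based version, the resulting tensor order is deterministic: it follows the index rather than hash-table iteration order.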
@@ -2447,6 +2461,8 @@ bool ModelLoader::tensor_should_be_converted(const TensorStorage& tensor_storage
         // Pass, do not convert. For MMDiT
     } else if (contains(name, "time_embed.") || contains(name, "label_emb.")) {
         // Pass, do not convert. For Unet
+    } else if (contains(name, "embedding")) {
+        // Pass, do not convert embedding
     } else {
         return true;
     }
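In effect, the conversion filter now also treats any tensor whose name contains "embedding" as not convertible. A minimal illustrative sketch of just the name check (`contains` is assumed to be a plain substring test, and the function name here is hypothetical):

```cpp
#include <string>

// Assumed substring helper, matching how contains() is used above.
static bool contains(const std::string& str, const std::string& substr) {
    return str.find(substr) != std::string::npos;
}

// Hypothetical condensed check: returns false (do not convert) for time/label
// embeddings and, with this change, for any tensor name containing "embedding".
static bool should_convert_by_name(const std::string& name) {
    if (contains(name, "time_embed.") || contains(name, "label_emb.")) {
        return false;  // UNet time/label embeddings stay in their original type
    } else if (contains(name, "embedding")) {
        return false;  // new: embedding tensors are never converted
    }
    return true;
}
```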
pmid.hpp (6 changed lines)

@@ -591,7 +591,7 @@ struct PhotoMakerIDEmbed : public GGMLRunner {
         return "id_embeds";
     }

-    bool load_from_file(bool filter_tensor = false) {
+    bool load_from_file(bool filter_tensor, int n_threads) {
         LOG_INFO("loading PhotoMaker ID Embeds from '%s'", file_path.c_str());

         if (load_failed) {
@@ -623,11 +623,11 @@ struct PhotoMakerIDEmbed : public GGMLRunner {
             return true;
         };

-        model_loader->load_tensors(on_new_tensor_cb);
+        model_loader->load_tensors(on_new_tensor_cb, n_threads);
         alloc_params_buffer();

         dry_run = false;
-        model_loader->load_tensors(on_new_tensor_cb);
+        model_loader->load_tensors(on_new_tensor_cb, n_threads);

         LOG_DEBUG("finished loading PhotoMaker ID Embeds ");
         return true;
@@ -549,7 +549,7 @@ public:
         }
         if (strlen(SAFE_STR(sd_ctx_params->photo_maker_path)) > 0) {
             pmid_lora = std::make_shared<LoraModel>(backend, sd_ctx_params->photo_maker_path, "");
-            if (!pmid_lora->load_from_file(true)) {
+            if (!pmid_lora->load_from_file(true, n_threads)) {
                 LOG_WARN("load photomaker lora tensors from %s failed", sd_ctx_params->photo_maker_path);
                 return false;
             }
@@ -617,14 +617,14 @@ public:
         if (!use_tiny_autoencoder) {
             vae_params_mem_size = first_stage_model->get_params_buffer_size();
         } else {
-            if (!tae_first_stage->load_from_file(taesd_path)) {
+            if (!tae_first_stage->load_from_file(taesd_path, n_threads)) {
                 return false;
             }
             vae_params_mem_size = tae_first_stage->get_params_buffer_size();
         }
         size_t control_net_params_mem_size = 0;
         if (control_net) {
-            if (!control_net->load_from_file(SAFE_STR(sd_ctx_params->control_net_path))) {
+            if (!control_net->load_from_file(SAFE_STR(sd_ctx_params->control_net_path), n_threads)) {
                 return false;
             }
             control_net_params_mem_size = control_net->get_params_buffer_size();
@@ -861,7 +861,7 @@ public:
             return;
         }
         LoraModel lora(backend, file_path, is_high_noise ? "model.high_noise_" : "");
-        if (!lora.load_from_file()) {
+        if (!lora.load_from_file(false, n_threads)) {
             LOG_WARN("load lora tensors from %s failed", file_path.c_str());
             return;
         }
@@ -1510,7 +1510,7 @@ public:

     ggml_tensor* get_first_stage_encoding(ggml_context* work_ctx, ggml_tensor* vae_output) {
         ggml_tensor* latent;
-        if (use_tiny_autoencoder || sd_version_is_qwen_image(version)) {
+        if (use_tiny_autoencoder || sd_version_is_qwen_image(version) || sd_version_is_wan(version)) {
             latent = vae_output;
         } else if (version == VERSION_SD1_PIX2PIX) {
             latent = ggml_view_3d(work_ctx,
tae.hpp (4 changed lines)

@@ -222,7 +222,7 @@ struct TinyAutoEncoder : public GGMLRunner {
         return "taesd";
     }

-    bool load_from_file(const std::string& file_path) {
+    bool load_from_file(const std::string& file_path, int n_threads) {
         LOG_INFO("loading taesd from '%s', decode_only = %s", file_path.c_str(), decode_only ? "true" : "false");
         alloc_params_buffer();
         std::map<std::string, ggml_tensor*> taesd_tensors;
@@ -238,7 +238,7 @@ struct TinyAutoEncoder : public GGMLRunner {
             return false;
         }

-        bool success = model_loader.load_tensors(taesd_tensors, ignore_tensors);
+        bool success = model_loader.load_tensors(taesd_tensors, ignore_tensors, n_threads);

         if (!success) {
             LOG_ERROR("load tae tensors from model loader failed");
@@ -18,7 +18,8 @@ struct UpscalerGGML {
     }

     bool load_from_file(const std::string& esrgan_path,
-                        bool offload_params_to_cpu) {
+                        bool offload_params_to_cpu,
+                        int n_threads) {
         ggml_log_set(ggml_log_callback_default, nullptr);
 #ifdef SD_USE_CUDA
         LOG_DEBUG("Using CUDA backend");
@@ -54,7 +55,7 @@ struct UpscalerGGML {
         if (direct) {
             esrgan_upscaler->enable_conv2d_direct();
         }
-        if (!esrgan_upscaler->load_from_file(esrgan_path)) {
+        if (!esrgan_upscaler->load_from_file(esrgan_path, n_threads)) {
             return false;
         }
         return true;
@@ -124,7 +125,7 @@ upscaler_ctx_t* new_upscaler_ctx(const char* esrgan_path_c_str,
         return NULL;
     }

-    if (!upscaler_ctx->upscaler->load_from_file(esrgan_path, offload_params_to_cpu)) {
+    if (!upscaler_ctx->upscaler->load_from_file(esrgan_path, offload_params_to_cpu, n_threads)) {
         delete upscaler_ctx->upscaler;
         upscaler_ctx->upscaler = NULL;
         free(upscaler_ctx);