Compare commits

...

5 Commits

7 changed files with 425 additions and 200 deletions

View File

@ -485,6 +485,153 @@ jobs:
path: | path: |
sd-${{ env.BRANCH_NAME }}-${{ steps.commit.outputs.short }}-bin-win-rocm-x64.zip sd-${{ env.BRANCH_NAME }}-${{ steps.commit.outputs.short }}-bin-win-rocm-x64.zip
# Linux ROCm (HIP) build: compiles inside the ROCm dev container and, on
# master pushes or manual release runs, packs the binaries together with the
# ROCm runtime libraries into a release zip.
ubuntu-latest-rocm:
  runs-on: ubuntu-latest
  container: rocm/dev-ubuntu-24.04:7.2
  env:
    ROCM_VERSION: "7.2"
    UBUNTU_VERSION: "24.04"
    GPU_TARGETS: "gfx1151;gfx1150;gfx1100;gfx1101;gfx1102;gfx1200;gfx1201"
  steps:
    # git must be present before checkout so submodules can be fetched
    - name: Install git
      run: apt-get update && apt-get install -y git

    - name: Clone
      id: checkout
      uses: actions/checkout@v6
      with:
        submodules: recursive

    - name: Free disk space
      run: |
        # Remove preinstalled SDKs and caches not needed for this job
        # NOTE(review): this job runs inside a container, so these
        # runner-host paths may not exist here - confirm the step helps.
        sudo rm -rf /usr/share/dotnet || true
        sudo rm -rf /usr/local/lib/android || true
        sudo rm -rf /opt/ghc || true
        sudo rm -rf /usr/local/.ghcup || true
        sudo rm -rf /opt/hostedtoolcache || true
        # Remove old package lists and caches
        sudo rm -rf /var/lib/apt/lists/* || true
        sudo apt-get clean

    - name: Dependencies
      id: depends
      run: |
        sudo apt-get update
        sudo apt-get install -y \
          cmake \
          hip-dev \
          hipblas-dev \
          ninja-build \
          rocm-dev \
          zip
        # Clean apt caches to recover disk space
        sudo apt-get clean
        sudo rm -rf /var/lib/apt/lists/* || true

    - name: Setup ROCm Environment
      run: |
        # Add ROCm to PATH for the following steps
        echo "/opt/rocm/bin" >> "$GITHUB_PATH"

        # Succeeds when the file name mentions one of the GPU_TARGETS.
        # Each target is matched on its own: a `|` that comes out of a
        # variable expansion is a literal character in a case pattern,
        # not an alternation separator, so a single combined pattern
        # would match nothing and every file would be deleted.
        is_target_file() {
          for target in $(printf '%s' "$GPU_TARGETS" | tr ';' ' '); do
            case "$1" in
              *"$target"*) return 0 ;;
            esac
          done
          return 1
        }

        # Delete every file in directory $1 that is not for a target arch.
        prune_library_dir() {
          cd "$1"
          for file in *; do
            if ! is_target_file "$file"; then
              sudo rm -f "$file"
            fi
          done
        }

        # Remove library files for architectures we're not building for
        # to save disk space
        echo "Cleaning up unneeded architecture files..."
        prune_library_dir /opt/rocm/lib/rocblas/library
        prune_library_dir /opt/rocm/lib/hipblaslt/library

    - name: Build
      id: cmake_build
      run: |
        mkdir build
        cd build
        cmake .. -G Ninja \
          -DCMAKE_CXX_COMPILER=amdclang++ \
          -DCMAKE_C_COMPILER=amdclang \
          -DCMAKE_BUILD_TYPE=Release \
          -DSD_HIPBLAS=ON \
          -DGPU_TARGETS="${{ env.GPU_TARGETS }}" \
          -DAMDGPU_TARGETS="${{ env.GPU_TARGETS }}" \
          -DCMAKE_BUILD_WITH_INSTALL_RPATH=ON \
          -DCMAKE_POSITION_INDEPENDENT_CODE=ON \
          -DSD_BUILD_SHARED_LIBS=ON
        cmake --build . --config Release

    - name: Get commit hash
      id: commit
      if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
      uses: pr-mpt/actions-commit-hash@v2

    - name: Prepare artifacts
      id: prepare_artifacts
      if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
      run: |
        # Copy licenses
        cp ggml/LICENSE ./build/bin/ggml.txt
        cp LICENSE ./build/bin/stable-diffusion.cpp.txt
        # Create directories for ROCm libraries
        mkdir -p ./build/bin/rocblas/library
        mkdir -p ./build/bin/hipblaslt/library
        # Copy ROCm runtime libraries (use || true to continue if files don't exist)
        cp /opt/rocm/lib/librocsparse.so* ./build/bin/ || true
        cp /opt/rocm/lib/libhsa-runtime64.so* ./build/bin/ || true
        cp /opt/rocm/lib/libamdhip64.so* ./build/bin/ || true
        cp /opt/rocm/lib/libhipblas.so* ./build/bin/ || true
        cp /opt/rocm/lib/libhipblaslt.so* ./build/bin/ || true
        cp /opt/rocm/lib/librocblas.so* ./build/bin/ || true
        # Copy library files (already filtered to target architectures)
        cp /opt/rocm/lib/rocblas/library/* ./build/bin/rocblas/library/ || true
        cp /opt/rocm/lib/hipblaslt/library/* ./build/bin/hipblaslt/library/ || true

    - name: Fetch system info
      id: system-info
      run: |
        echo "CPU_ARCH=$(uname -m)" >> "$GITHUB_OUTPUT"
        echo "OS_NAME=$(lsb_release -s -i)" >> "$GITHUB_OUTPUT"
        echo "OS_VERSION=$(lsb_release -s -r)" >> "$GITHUB_OUTPUT"
        echo "OS_TYPE=$(uname -s)" >> "$GITHUB_OUTPUT"

    - name: Pack artifacts
      id: pack_artifacts
      if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
      run: |
        # Licenses were already copied into build/bin by "Prepare artifacts".
        ZIP_NAME="sd-${{ env.BRANCH_NAME }}-${{ steps.commit.outputs.short }}-bin-${{ steps.system-info.outputs.OS_TYPE }}-Ubuntu-${{ env.UBUNTU_VERSION }}-${{ steps.system-info.outputs.CPU_ARCH }}-rocm.zip"
        # Zip from inside build/bin so binaries stay at the archive root,
        # and recurse (-r) so rocblas/library and hipblaslt/library are
        # included; `zip -j ./build/bin/*` does not descend into them.
        (cd ./build/bin && zip -r "../../$ZIP_NAME" .)

    - name: Upload artifacts
      if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
      uses: actions/upload-artifact@v4
      with:
        name: sd-${{ env.BRANCH_NAME }}-${{ steps.commit.outputs.short }}-bin-${{ steps.system-info.outputs.OS_TYPE }}-Ubuntu-${{ env.UBUNTU_VERSION }}-${{ steps.system-info.outputs.CPU_ARCH }}-rocm.zip
        path: |
          sd-${{ env.BRANCH_NAME }}-${{ steps.commit.outputs.short }}-bin-${{ steps.system-info.outputs.OS_TYPE }}-Ubuntu-${{ env.UBUNTU_VERSION }}-${{ steps.system-info.outputs.CPU_ARCH }}-rocm.zip
release: release:
if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }} if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
@ -493,6 +640,7 @@ jobs:
needs: needs:
- ubuntu-latest-cmake - ubuntu-latest-cmake
- ubuntu-latest-cmake-vulkan - ubuntu-latest-cmake-vulkan
- ubuntu-latest-rocm
- build-and-push-docker-images - build-and-push-docker-images
- macOS-latest-cmake - macOS-latest-cmake
- windows-latest-cmake - windows-latest-cmake

View File

@ -10,9 +10,14 @@ struct SDCondition {
struct ggml_tensor* c_vector = nullptr; // aka y struct ggml_tensor* c_vector = nullptr; // aka y
struct ggml_tensor* c_concat = nullptr; struct ggml_tensor* c_concat = nullptr;
std::vector<struct ggml_tensor*> extra_c_crossattns;
SDCondition() = default; SDCondition() = default;
SDCondition(struct ggml_tensor* c_crossattn, struct ggml_tensor* c_vector, struct ggml_tensor* c_concat) SDCondition(struct ggml_tensor* c_crossattn,
: c_crossattn(c_crossattn), c_vector(c_vector), c_concat(c_concat) {} struct ggml_tensor* c_vector,
struct ggml_tensor* c_concat,
const std::vector<struct ggml_tensor*>& extra_c_crossattns = {})
: c_crossattn(c_crossattn), c_vector(c_vector), c_concat(c_concat), extra_c_crossattns(extra_c_crossattns) {}
}; };
struct ConditionerParams { struct ConditionerParams {
@ -1696,10 +1701,11 @@ struct LLMEmbedder : public Conditioner {
} }
std::tuple<std::vector<int>, std::vector<float>> tokenize(std::string text, std::tuple<std::vector<int>, std::vector<float>> tokenize(std::string text,
std::pair<int, int> attn_range, const std::pair<int, int>& attn_range,
size_t max_length = 0, size_t max_length = 0,
bool padding = false) { bool padding = false) {
std::vector<std::pair<std::string, float>> parsed_attention; std::vector<std::pair<std::string, float>> parsed_attention;
if (attn_range.first >= 0 && attn_range.second > 0) {
parsed_attention.emplace_back(text.substr(0, attn_range.first), 1.f); parsed_attention.emplace_back(text.substr(0, attn_range.first), 1.f);
if (attn_range.second - attn_range.first > 0) { if (attn_range.second - attn_range.first > 0) {
auto new_parsed_attention = parse_prompt_attention(text.substr(attn_range.first, attn_range.second - attn_range.first)); auto new_parsed_attention = parse_prompt_attention(text.substr(attn_range.first, attn_range.second - attn_range.first));
@ -1708,6 +1714,10 @@ struct LLMEmbedder : public Conditioner {
new_parsed_attention.end()); new_parsed_attention.end());
} }
parsed_attention.emplace_back(text.substr(attn_range.second), 1.f); parsed_attention.emplace_back(text.substr(attn_range.second), 1.f);
} else {
parsed_attention.emplace_back(text, 1.f);
}
{ {
std::stringstream ss; std::stringstream ss;
ss << "["; ss << "[";
@ -1738,19 +1748,110 @@ struct LLMEmbedder : public Conditioner {
return {tokens, weights}; return {tokens, weights};
} }
// Tokenize `prompt`, run it through the LLM, apply the per-token prompt
// weights to the resulting hidden states and return a new tensor holding the
// states from token index `prompt_template_encode_start_idx` onwards.
//
// work_ctx                          ggml context used for every tensor created here
// n_threads                         forwarded to llm->compute
// prompt                            full prompt text, template included
// prompt_attn_range                 character range of `prompt` forwarded to tokenize()
// max_length                        when > 0 and the prompt has fewer tokens, the tokens are
//                                   padded to this length and an attention mask is built
// min_length                        when > 0, the returned tensor is zero-padded along the
//                                   token axis up to this many rows
// image_embeds                      forwarded to llm->compute
// out_layers                        forwarded to llm->compute
// prompt_template_encode_start_idx  number of leading token rows dropped from the output
//
// Returns an F32 tensor allocated in `work_ctx`.
ggml_tensor* encode_prompt(ggml_context* work_ctx,
                           int n_threads,
                           const std::string prompt,
                           const std::pair<int, int>& prompt_attn_range,
                           int max_length,
                           int min_length,
                           std::vector<std::pair<int, ggml_tensor*>> image_embeds,
                           const std::set<int>& out_layers,
                           int prompt_template_encode_start_idx) {
    auto tokens_and_weights = tokenize(prompt, prompt_attn_range);
    auto& tokens            = std::get<0>(tokens_and_weights);
    auto& weights           = std::get<1>(tokens_and_weights);
    std::vector<float> mask;

    // Pad only when the prompt is shorter than max_length; the mask marks
    // real tokens with 1 and padded positions with 0.
    if (max_length > 0 && tokens.size() < static_cast<size_t>(max_length)) {
        mask.insert(mask.end(), tokens.size(), 1.f);
        mask.insert(mask.end(), static_cast<size_t>(max_length) - tokens.size(), 0.f);
        tokenizer->pad_tokens(tokens, weights, max_length, true);
    }

    struct ggml_tensor* hidden_states = nullptr;  // [N, n_token, hidden_size]

    auto input_ids = vector_to_ggml_tensor_i32(work_ctx, tokens);

    // Square additive mask: -INFINITY where i0 is a padded position or
    // where i0 > i1, 0 everywhere else. Left null when nothing was padded.
    ggml_tensor* attention_mask = nullptr;
    if (!mask.empty()) {
        attention_mask = ggml_new_tensor_2d(work_ctx, GGML_TYPE_F32, mask.size(), mask.size());
        ggml_ext_tensor_iter(attention_mask, [&](ggml_tensor* attention_mask, int64_t i0, int64_t i1, int64_t i2, int64_t i3) {
            float value = 0.f;
            if (mask[i0] == 0.f || i0 > i1) {
                value = -INFINITY;
            }
            ggml_ext_tensor_set_f32(attention_mask, value, i0, i1, i2, i3);
        });
    }

    llm->compute(n_threads,
                 input_ids,
                 attention_mask,
                 image_embeds,
                 out_layers,
                 &hidden_states,
                 work_ctx);

    {
        // Scale each token row by its prompt weight, then rescale the whole
        // tensor so that its mean matches the mean before weighting.
        auto tensor         = hidden_states;
        float original_mean = ggml_ext_tensor_mean(tensor);
        for (int64_t i2 = 0; i2 < tensor->ne[2]; i2++) {
            for (int64_t i1 = 0; i1 < tensor->ne[1]; i1++) {
                // NOTE(review): assumes weights has one entry per token row - confirm
                const float weight = weights[i1];
                for (int64_t i0 = 0; i0 < tensor->ne[0]; i0++) {
                    float value = ggml_ext_tensor_get_f32(tensor, i0, i1, i2);
                    value *= weight;
                    ggml_ext_tensor_set_f32(tensor, value, i0, i1, i2);
                }
            }
        }
        float new_mean = ggml_ext_tensor_mean(tensor);
        // A zero mean would turn the ratio into inf/NaN and poison every
        // value, so the rescale is skipped in that case.
        if (new_mean != 0.f) {
            ggml_ext_tensor_scale_inplace(tensor, (original_mean / new_mean));
        }
    }

    GGML_ASSERT(hidden_states->ne[1] > prompt_template_encode_start_idx);

    // Number of all-zero rows appended so the output has at least
    // min_length rows after the template prefix is dropped.
    const int64_t kept_len = hidden_states->ne[1] - prompt_template_encode_start_idx;
    int64_t zero_pad_len   = 0;
    if (min_length > 0 && kept_len < min_length) {
        zero_pad_len = min_length - kept_len;
    }

    ggml_tensor* new_hidden_states = ggml_new_tensor_3d(work_ctx,
                                                        GGML_TYPE_F32,
                                                        hidden_states->ne[0],
                                                        kept_len + zero_pad_len,
                                                        hidden_states->ne[2]);

    // Copy rows shifted by the template offset; rows past the end of the
    // source tensor keep the value 0.
    ggml_ext_tensor_iter(new_hidden_states, [&](ggml_tensor* new_hidden_states, int64_t i0, int64_t i1, int64_t i2, int64_t i3) {
        float value = 0.f;
        if (i1 + prompt_template_encode_start_idx < hidden_states->ne[1]) {
            value = ggml_ext_tensor_get_f32(hidden_states, i0, i1 + prompt_template_encode_start_idx, i2, i3);
        }
        ggml_ext_tensor_set_f32(new_hidden_states, value, i0, i1, i2, i3);
    });

    return new_hidden_states;
}
SDCondition get_learned_condition(ggml_context* work_ctx, SDCondition get_learned_condition(ggml_context* work_ctx,
int n_threads, int n_threads,
const ConditionerParams& conditioner_params) override { const ConditionerParams& conditioner_params) override {
std::string prompt; std::string prompt;
std::vector<std::pair<int, ggml_tensor*>> image_embeds;
std::pair<int, int> prompt_attn_range; std::pair<int, int> prompt_attn_range;
std::vector<std::string> extra_prompts;
std::vector<std::pair<int, int>> extra_prompts_attn_range;
std::vector<std::pair<int, ggml_tensor*>> image_embeds;
int prompt_template_encode_start_idx = 34; int prompt_template_encode_start_idx = 34;
int max_length = 0; int max_length = 0; // pad tokens
int min_length = 0; // zero pad hidden_states
std::set<int> out_layers; std::set<int> out_layers;
std::vector<int> tokens;
std::vector<float> weights; int64_t t0 = ggml_time_ms();
std::vector<float> mask;
if (llm->enable_vision && conditioner_params.ref_images.size() > 0) { if (sd_version_is_qwen_image(version)) {
if (llm->enable_vision && !conditioner_params.ref_images.empty()) {
LOG_INFO("QwenImageEditPlusPipeline"); LOG_INFO("QwenImageEditPlusPipeline");
prompt_template_encode_start_idx = 64; prompt_template_encode_start_idx = 64;
int image_embed_idx = 64 + 6; int image_embed_idx = 64 + 6;
@ -1813,8 +1914,20 @@ struct LLMEmbedder : public Conditioner {
prompt_attn_range.second = static_cast<int>(prompt.size()); prompt_attn_range.second = static_cast<int>(prompt.size());
prompt += "<|im_end|>\n<|im_start|>assistant\n"; prompt += "<|im_end|>\n<|im_start|>assistant\n";
} else {
prompt_template_encode_start_idx = 34;
prompt = "<|im_start|>system\nDescribe the image by detailing the color, shape, size, texture, quantity, text, spatial relationships of the objects and background:<|im_end|>\n<|im_start|>user\n";
prompt_attn_range.first = static_cast<int>(prompt.size());
prompt += conditioner_params.text;
prompt_attn_range.second = static_cast<int>(prompt.size());
prompt += "<|im_end|>\n<|im_start|>assistant\n";
}
} else if (version == VERSION_FLUX2) { } else if (version == VERSION_FLUX2) {
prompt_template_encode_start_idx = 0; prompt_template_encode_start_idx = 0;
min_length = 512;
out_layers = {10, 20, 30}; out_layers = {10, 20, 30};
prompt = "[SYSTEM_PROMPT]You are an AI that reasons about image descriptions. You give structured responses focusing on object relationships, object\nattribution and actions without speculation.[/SYSTEM_PROMPT][INST]"; prompt = "[SYSTEM_PROMPT]You are an AI that reasons about image descriptions. You give structured responses focusing on object relationships, object\nattribution and actions without speculation.[/SYSTEM_PROMPT][INST]";
@ -1828,6 +1941,15 @@ struct LLMEmbedder : public Conditioner {
prompt_template_encode_start_idx = 0; prompt_template_encode_start_idx = 0;
out_layers = {35}; // -2 out_layers = {35}; // -2
if (!conditioner_params.ref_images.empty()) {
LOG_INFO("ZImageOmniPipeline");
prompt = "<|im_start|>user\n<|vision_start|>";
for (int i = 0; i < conditioner_params.ref_images.size() - 1; i++) {
extra_prompts.push_back("<|vision_end|><|vision_start|>");
}
extra_prompts.push_back("<|vision_end|>" + conditioner_params.text + "<|im_end|>\n<|im_start|>assistant\n<|vision_start|>");
extra_prompts.push_back("<|vision_end|><|im_end|>");
} else {
prompt = "<|im_start|>user\n"; prompt = "<|im_start|>user\n";
prompt_attn_range.first = static_cast<int>(prompt.size()); prompt_attn_range.first = static_cast<int>(prompt.size());
@ -1835,6 +1957,7 @@ struct LLMEmbedder : public Conditioner {
prompt_attn_range.second = static_cast<int>(prompt.size()); prompt_attn_range.second = static_cast<int>(prompt.size());
prompt += "<|im_end|>\n<|im_start|>assistant\n"; prompt += "<|im_end|>\n<|im_start|>assistant\n";
}
} else if (version == VERSION_FLUX2_KLEIN) { } else if (version == VERSION_FLUX2_KLEIN) {
prompt_template_encode_start_idx = 0; prompt_template_encode_start_idx = 0;
max_length = 512; max_length = 512;
@ -1847,16 +1970,6 @@ struct LLMEmbedder : public Conditioner {
prompt_attn_range.second = static_cast<int>(prompt.size()); prompt_attn_range.second = static_cast<int>(prompt.size());
prompt += "<|im_end|>\n<|im_start|>assistant\n<think>\n\n</think>\n\n"; prompt += "<|im_end|>\n<|im_start|>assistant\n<think>\n\n</think>\n\n";
auto tokens_and_weights = tokenize(prompt, prompt_attn_range, 0, false);
tokens = std::get<0>(tokens_and_weights);
weights = std::get<1>(tokens_and_weights);
mask.insert(mask.end(), tokens.size(), 1.f);
if (tokens.size() < max_length) {
mask.insert(mask.end(), max_length - tokens.size(), 0.f);
tokenizer->pad_tokens(tokens, weights, max_length, true);
}
} else if (version == VERSION_OVIS_IMAGE) { } else if (version == VERSION_OVIS_IMAGE) {
prompt_template_encode_start_idx = 28; prompt_template_encode_start_idx = 28;
max_length = prompt_template_encode_start_idx + 256; max_length = prompt_template_encode_start_idx + 256;
@ -1869,98 +1982,36 @@ struct LLMEmbedder : public Conditioner {
prompt += "<|im_end|>\n<|im_start|>assistant\n<think>\n\n</think>\n\n"; prompt += "<|im_end|>\n<|im_start|>assistant\n<think>\n\n</think>\n\n";
} else { } else {
prompt_template_encode_start_idx = 34; GGML_ABORT("unknown version %d", version);
prompt = "<|im_start|>system\nDescribe the image by detailing the color, shape, size, texture, quantity, text, spatial relationships of the objects and background:<|im_end|>\n<|im_start|>user\n";
prompt_attn_range.first = static_cast<int>(prompt.size());
prompt += conditioner_params.text;
prompt_attn_range.second = static_cast<int>(prompt.size());
prompt += "<|im_end|>\n<|im_start|>assistant\n";
} }
if (tokens.empty()) { auto hidden_states = encode_prompt(work_ctx,
auto tokens_and_weights = tokenize(prompt, prompt_attn_range, max_length, max_length > 0); n_threads,
tokens = std::get<0>(tokens_and_weights); prompt,
weights = std::get<1>(tokens_and_weights); prompt_attn_range,
} max_length,
min_length,
int64_t t0 = ggml_time_ms();
struct ggml_tensor* hidden_states = nullptr; // [N, n_token, 3584]
auto input_ids = vector_to_ggml_tensor_i32(work_ctx, tokens);
ggml_tensor* attention_mask = nullptr;
if (!mask.empty()) {
attention_mask = ggml_new_tensor_2d(work_ctx, GGML_TYPE_F32, mask.size(), mask.size());
ggml_ext_tensor_iter(attention_mask, [&](ggml_tensor* attention_mask, int64_t i0, int64_t i1, int64_t i2, int64_t i3) {
float value = 0.f;
if (mask[i0] == 0.f) {
value = -INFINITY;
} else if (i0 > i1) {
value = -INFINITY;
}
ggml_ext_tensor_set_f32(attention_mask, value, i0, i1, i2, i3);
});
}
llm->compute(n_threads,
input_ids,
attention_mask,
image_embeds, image_embeds,
out_layers, out_layers,
&hidden_states, prompt_template_encode_start_idx);
work_ctx);
{
auto tensor = hidden_states;
float original_mean = ggml_ext_tensor_mean(tensor);
for (int i2 = 0; i2 < tensor->ne[2]; i2++) {
for (int i1 = 0; i1 < tensor->ne[1]; i1++) {
for (int i0 = 0; i0 < tensor->ne[0]; i0++) {
float value = ggml_ext_tensor_get_f32(tensor, i0, i1, i2);
value *= weights[i1];
ggml_ext_tensor_set_f32(tensor, value, i0, i1, i2);
}
}
}
float new_mean = ggml_ext_tensor_mean(tensor);
ggml_ext_tensor_scale_inplace(tensor, (original_mean / new_mean));
}
GGML_ASSERT(hidden_states->ne[1] > prompt_template_encode_start_idx); std::vector<ggml_tensor*> extra_hidden_states_vec;
for (int i = 0; i < extra_prompts.size(); i++) {
int64_t min_length = 0; auto extra_hidden_states = encode_prompt(work_ctx,
if (version == VERSION_FLUX2) { n_threads,
min_length = 512; extra_prompts[i],
extra_prompts_attn_range[i],
max_length,
min_length,
image_embeds,
out_layers,
prompt_template_encode_start_idx);
extra_hidden_states_vec.push_back(extra_hidden_states);
} }
int64_t zero_pad_len = 0;
if (min_length > 0) {
if (hidden_states->ne[1] - prompt_template_encode_start_idx < min_length) {
zero_pad_len = min_length - hidden_states->ne[1] + prompt_template_encode_start_idx;
}
}
ggml_tensor* new_hidden_states = ggml_new_tensor_3d(work_ctx,
GGML_TYPE_F32,
hidden_states->ne[0],
hidden_states->ne[1] - prompt_template_encode_start_idx + zero_pad_len,
hidden_states->ne[2]);
ggml_ext_tensor_iter(new_hidden_states, [&](ggml_tensor* new_hidden_states, int64_t i0, int64_t i1, int64_t i2, int64_t i3) {
float value = 0.f;
if (i1 + prompt_template_encode_start_idx < hidden_states->ne[1]) {
value = ggml_ext_tensor_get_f32(hidden_states, i0, i1 + prompt_template_encode_start_idx, i2, i3);
}
ggml_ext_tensor_set_f32(new_hidden_states, value, i0, i1, i2, i3);
});
// print_ggml_tensor(new_hidden_states);
int64_t t1 = ggml_time_ms(); int64_t t1 = ggml_time_ms();
LOG_DEBUG("computing condition graph completed, taking %" PRId64 " ms", t1 - t0); LOG_DEBUG("computing condition graph completed, taking %" PRId64 " ms", t1 - t0);
return {new_hidden_states, nullptr, nullptr}; return {hidden_states, nullptr, nullptr, extra_hidden_states_vec};
} }
}; };

View File

@ -394,12 +394,15 @@ bool save_results(const SDCliParams& cli_params,
fs::path base_path = out_path; fs::path base_path = out_path;
fs::path ext = out_path.has_extension() ? out_path.extension() : fs::path{}; fs::path ext = out_path.has_extension() ? out_path.extension() : fs::path{};
if (!ext.empty())
base_path.replace_extension();
std::string ext_lower = ext.string(); std::string ext_lower = ext.string();
std::transform(ext_lower.begin(), ext_lower.end(), ext_lower.begin(), ::tolower); std::transform(ext_lower.begin(), ext_lower.end(), ext_lower.begin(), ::tolower);
bool is_jpg = (ext_lower == ".jpg" || ext_lower == ".jpeg" || ext_lower == ".jpe"); bool is_jpg = (ext_lower == ".jpg" || ext_lower == ".jpeg" || ext_lower == ".jpe");
if (!ext.empty()) {
if (is_jpg || ext_lower == ".png") {
base_path.replace_extension();
}
}
int output_begin_idx = cli_params.output_begin_idx; int output_begin_idx = cli_params.output_begin_idx;
if (output_begin_idx < 0) { if (output_begin_idx < 0) {
@ -409,7 +412,7 @@ bool save_results(const SDCliParams& cli_params,
auto write_image = [&](const fs::path& path, int idx) { auto write_image = [&](const fs::path& path, int idx) {
const sd_image_t& img = results[idx]; const sd_image_t& img = results[idx];
if (!img.data) if (!img.data)
return; return false;
std::string params = get_image_params(cli_params, ctx_params, gen_params, gen_params.seed + idx); std::string params = get_image_params(cli_params, ctx_params, gen_params, gen_params.seed + idx);
int ok = 0; int ok = 0;
@ -419,8 +422,11 @@ bool save_results(const SDCliParams& cli_params,
ok = stbi_write_png(path.string().c_str(), img.width, img.height, img.channel, img.data, 0, params.c_str()); ok = stbi_write_png(path.string().c_str(), img.width, img.height, img.channel, img.data, 0, params.c_str());
} }
LOG_INFO("save result image %d to '%s' (%s)", idx, path.string().c_str(), ok ? "success" : "failure"); LOG_INFO("save result image %d to '%s' (%s)", idx, path.string().c_str(), ok ? "success" : "failure");
return ok != 0;
}; };
int sucessful_reults = 0;
if (std::regex_search(cli_params.output_path, format_specifier_regex)) { if (std::regex_search(cli_params.output_path, format_specifier_regex)) {
if (!is_jpg && ext_lower != ".png") if (!is_jpg && ext_lower != ".png")
ext = ".png"; ext = ".png";
@ -429,9 +435,12 @@ bool save_results(const SDCliParams& cli_params,
for (int i = 0; i < num_results; ++i) { for (int i = 0; i < num_results; ++i) {
fs::path img_path = format_frame_idx(pattern.string(), output_begin_idx + i); fs::path img_path = format_frame_idx(pattern.string(), output_begin_idx + i);
write_image(img_path, i); if (write_image(img_path, i)) {
sucessful_reults++;
} }
return true; }
LOG_INFO("%d/%d images saved", sucessful_reults, num_results);
return sucessful_reults != 0;
} }
if (cli_params.mode == VID_GEN && num_results > 1) { if (cli_params.mode == VID_GEN && num_results > 1) {
@ -439,9 +448,13 @@ bool save_results(const SDCliParams& cli_params,
ext = ".avi"; ext = ".avi";
fs::path video_path = base_path; fs::path video_path = base_path;
video_path += ext; video_path += ext;
create_mjpg_avi_from_sd_images(video_path.string().c_str(), results, num_results, gen_params.fps); if (create_mjpg_avi_from_sd_images(video_path.string().c_str(), results, num_results, gen_params.fps) == 0) {
LOG_INFO("save result MJPG AVI video to '%s'", video_path.string().c_str()); LOG_INFO("save result MJPG AVI video to '%s'", video_path.string().c_str());
return true; return true;
} else {
LOG_ERROR("Failed to save result MPG AVI video to '%s'", video_path.string().c_str());
return false;
}
} }
if (!is_jpg && ext_lower != ".png") if (!is_jpg && ext_lower != ".png")
@ -453,10 +466,12 @@ bool save_results(const SDCliParams& cli_params,
img_path += "_" + std::to_string(output_begin_idx + i); img_path += "_" + std::to_string(output_begin_idx + i);
} }
img_path += ext; img_path += ext;
write_image(img_path, i); if (write_image(img_path, i)) {
sucessful_reults++;
} }
}
return true; LOG_INFO("%d/%d images saved", sucessful_reults, num_results);
return sucessful_reults != 0;
} }
int main(int argc, const char* argv[]) { int main(int argc, const char* argv[]) {

View File

@ -786,8 +786,8 @@ int main(int argc, const char** argv) {
std::string negative_prompt = j.value("negative_prompt", ""); std::string negative_prompt = j.value("negative_prompt", "");
int width = j.value("width", 512); int width = j.value("width", 512);
int height = j.value("height", 512); int height = j.value("height", 512);
int steps = j.value("steps", -1); int steps = j.value("steps", default_gen_params.sample_params.sample_steps);
float cfg_scale = j.value("cfg_scale", 7.f); float cfg_scale = j.value("cfg_scale", default_gen_params.sample_params.guidance.txt_cfg);
int64_t seed = j.value("seed", -1); int64_t seed = j.value("seed", -1);
int batch_size = j.value("batch_size", 1); int batch_size = j.value("batch_size", 1);
int clip_skip = j.value("clip_skip", -1); int clip_skip = j.value("clip_skip", -1);
@ -883,8 +883,6 @@ int main(int argc, const char** argv) {
enum scheduler_t scheduler = str_to_scheduler(scheduler_name.c_str()); enum scheduler_t scheduler = str_to_scheduler(scheduler_name.c_str());
// avoid excessive resource usage
SDGenerationParams gen_params = default_gen_params; SDGenerationParams gen_params = default_gen_params;
gen_params.prompt = prompt; gen_params.prompt = prompt;
gen_params.negative_prompt = negative_prompt; gen_params.negative_prompt = negative_prompt;
@ -893,6 +891,7 @@ int main(int argc, const char** argv) {
gen_params.seed = seed; gen_params.seed = seed;
gen_params.sample_params.sample_steps = steps; gen_params.sample_params.sample_steps = steps;
gen_params.batch_count = batch_size; gen_params.batch_count = batch_size;
gen_params.sample_params.guidance.txt_cfg = cfg_scale;
if (clip_skip > 0) { if (clip_skip > 0) {
gen_params.clip_skip = clip_skip; gen_params.clip_skip = clip_skip;

View File

@ -767,7 +767,7 @@ __STATIC_INLINE__ ggml_tensor* ggml_ext_silu_act(ggml_context* ctx, ggml_tensor*
return x; return x;
} }
typedef std::function<void(ggml_tensor*, ggml_tensor*, bool)> on_tile_process; typedef std::function<bool(ggml_tensor*, ggml_tensor*, bool)> on_tile_process;
__STATIC_INLINE__ void sd_tiling_calc_tiles(int& num_tiles_dim, __STATIC_INLINE__ void sd_tiling_calc_tiles(int& num_tiles_dim,
float& tile_overlap_factor_dim, float& tile_overlap_factor_dim,
@ -918,12 +918,15 @@ __STATIC_INLINE__ void sd_tiling_non_square(ggml_tensor* input,
int64_t t1 = ggml_time_ms(); int64_t t1 = ggml_time_ms();
ggml_ext_tensor_split_2d(input, input_tile, x_in, y_in); ggml_ext_tensor_split_2d(input, input_tile, x_in, y_in);
on_processing(input_tile, output_tile, false); if (on_processing(input_tile, output_tile, false)) {
ggml_ext_tensor_merge_2d(output_tile, output, x_out, y_out, overlap_x_out, overlap_y_out, dx, dy); ggml_ext_tensor_merge_2d(output_tile, output, x_out, y_out, overlap_x_out, overlap_y_out, dx, dy);
int64_t t2 = ggml_time_ms(); int64_t t2 = ggml_time_ms();
last_time = (t2 - t1) / 1000.0f; last_time = (t2 - t1) / 1000.0f;
pretty_progress(tile_count, num_tiles, last_time); pretty_progress(tile_count, num_tiles, last_time);
} else {
LOG_ERROR("Failed to process patch %d at (%d, %d)", tile_count, x, y);
}
tile_count++; tile_count++;
} }
last_x = false; last_x = false;

View File

@ -1558,7 +1558,7 @@ public:
if (vae_tiling_params.enabled) { if (vae_tiling_params.enabled) {
// split latent in 32x32 tiles and compute in several steps // split latent in 32x32 tiles and compute in several steps
auto on_tiling = [&](ggml_tensor* in, ggml_tensor* out, bool init) { auto on_tiling = [&](ggml_tensor* in, ggml_tensor* out, bool init) {
first_stage_model->compute(n_threads, in, true, &out, nullptr); return first_stage_model->compute(n_threads, in, true, &out, nullptr);
}; };
silent_tiling(latents, result, get_vae_scale_factor(), 32, 0.5f, on_tiling); silent_tiling(latents, result, get_vae_scale_factor(), 32, 0.5f, on_tiling);
@ -1577,7 +1577,7 @@ public:
if (vae_tiling_params.enabled) { if (vae_tiling_params.enabled) {
// split latent in 64x64 tiles and compute in several steps // split latent in 64x64 tiles and compute in several steps
auto on_tiling = [&](ggml_tensor* in, ggml_tensor* out, bool init) { auto on_tiling = [&](ggml_tensor* in, ggml_tensor* out, bool init) {
tae_first_stage->compute(n_threads, in, true, &out, nullptr); return tae_first_stage->compute(n_threads, in, true, &out, nullptr);
}; };
silent_tiling(latents, result, get_vae_scale_factor(), 64, 0.5f, on_tiling); silent_tiling(latents, result, get_vae_scale_factor(), 64, 0.5f, on_tiling);
} else { } else {
@ -2546,7 +2546,7 @@ public:
LOG_DEBUG("VAE Tile size: %dx%d", tile_size_x, tile_size_y); LOG_DEBUG("VAE Tile size: %dx%d", tile_size_x, tile_size_y);
auto on_tiling = [&](ggml_tensor* in, ggml_tensor* out, bool init) { auto on_tiling = [&](ggml_tensor* in, ggml_tensor* out, bool init) {
first_stage_model->compute(n_threads, in, false, &out, work_ctx); return first_stage_model->compute(n_threads, in, false, &out, work_ctx);
}; };
sd_tiling_non_square(x, result, vae_scale_factor, tile_size_x, tile_size_y, tile_overlap, on_tiling); sd_tiling_non_square(x, result, vae_scale_factor, tile_size_x, tile_size_y, tile_overlap, on_tiling);
} else { } else {
@ -2557,7 +2557,7 @@ public:
if (vae_tiling_params.enabled && !encode_video) { if (vae_tiling_params.enabled && !encode_video) {
// split latent in 32x32 tiles and compute in several steps // split latent in 32x32 tiles and compute in several steps
auto on_tiling = [&](ggml_tensor* in, ggml_tensor* out, bool init) { auto on_tiling = [&](ggml_tensor* in, ggml_tensor* out, bool init) {
tae_first_stage->compute(n_threads, in, false, &out, nullptr); return tae_first_stage->compute(n_threads, in, false, &out, nullptr);
}; };
sd_tiling(x, result, vae_scale_factor, 64, 0.5f, on_tiling); sd_tiling(x, result, vae_scale_factor, 64, 0.5f, on_tiling);
} else { } else {
@ -2675,11 +2675,15 @@ public:
// split latent in 32x32 tiles and compute in several steps // split latent in 32x32 tiles and compute in several steps
auto on_tiling = [&](ggml_tensor* in, ggml_tensor* out, bool init) { auto on_tiling = [&](ggml_tensor* in, ggml_tensor* out, bool init) {
first_stage_model->compute(n_threads, in, true, &out, nullptr); return first_stage_model->compute(n_threads, in, true, &out, nullptr);
}; };
sd_tiling_non_square(x, result, vae_scale_factor, tile_size_x, tile_size_y, tile_overlap, on_tiling); sd_tiling_non_square(x, result, vae_scale_factor, tile_size_x, tile_size_y, tile_overlap, on_tiling);
} else { } else {
first_stage_model->compute(n_threads, x, true, &result, work_ctx); if(!first_stage_model->compute(n_threads, x, true, &result, work_ctx)){
LOG_ERROR("Failed to decode latetnts");
first_stage_model->free_compute_buffer();
return nullptr;
}
} }
first_stage_model->free_compute_buffer(); first_stage_model->free_compute_buffer();
process_vae_output_tensor(result); process_vae_output_tensor(result);
@ -2687,11 +2691,15 @@ public:
if (vae_tiling_params.enabled) { if (vae_tiling_params.enabled) {
// split latent in 64x64 tiles and compute in several steps // split latent in 64x64 tiles and compute in several steps
auto on_tiling = [&](ggml_tensor* in, ggml_tensor* out, bool init) { auto on_tiling = [&](ggml_tensor* in, ggml_tensor* out, bool init) {
tae_first_stage->compute(n_threads, in, true, &out); return tae_first_stage->compute(n_threads, in, true, &out);
}; };
sd_tiling(x, result, vae_scale_factor, 64, 0.5f, on_tiling); sd_tiling(x, result, vae_scale_factor, 64, 0.5f, on_tiling);
} else { } else {
tae_first_stage->compute(n_threads, x, true, &result); if(!tae_first_stage->compute(n_threads, x, true, &result)){
LOG_ERROR("Failed to decode latetnts");
tae_first_stage->free_compute_buffer();
return nullptr;
}
} }
tae_first_stage->free_compute_buffer(); tae_first_stage->free_compute_buffer();
} }
@ -3461,6 +3469,7 @@ sd_image_t* generate_image_internal(sd_ctx_t* sd_ctx,
ggml_free(work_ctx); ggml_free(work_ctx);
return nullptr; return nullptr;
} }
memset(result_images, 0, batch_count * sizeof(sd_image_t));
for (size_t i = 0; i < decoded_images.size(); i++) { for (size_t i = 0; i < decoded_images.size(); i++) {
result_images[i].width = width; result_images[i].width = width;

View File

@ -89,7 +89,7 @@ struct UpscalerGGML {
ggml_tensor* upscaled = ggml_new_tensor_4d(upscale_ctx, GGML_TYPE_F32, output_width, output_height, 3, 1); ggml_tensor* upscaled = ggml_new_tensor_4d(upscale_ctx, GGML_TYPE_F32, output_width, output_height, 3, 1);
auto on_tiling = [&](ggml_tensor* in, ggml_tensor* out, bool init) { auto on_tiling = [&](ggml_tensor* in, ggml_tensor* out, bool init) {
esrgan_upscaler->compute(n_threads, in, &out); return esrgan_upscaler->compute(n_threads, in, &out);
}; };
int64_t t0 = ggml_time_ms(); int64_t t0 = ggml_time_ms();
sd_tiling(input_image_tensor, upscaled, esrgan_upscaler->scale, esrgan_upscaler->tile_size, 0.25f, on_tiling); sd_tiling(input_image_tensor, upscaled, esrgan_upscaler->scale, esrgan_upscaler->tile_size, 0.25f, on_tiling);