mirror of
https://github.com/leejet/stable-diffusion.cpp.git
synced 2026-03-24 10:18:51 +00:00
Compare commits
5 Commits
f0f641a142
...
3296545090
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
3296545090 | ||
|
|
d60fb27560 | ||
|
|
c7ccafbd6f | ||
|
|
aa0b899397 | ||
|
|
5e264372ce |
148
.github/workflows/build.yml
vendored
148
.github/workflows/build.yml
vendored
@ -485,6 +485,153 @@ jobs:
|
|||||||
path: |
|
path: |
|
||||||
sd-${{ env.BRANCH_NAME }}-${{ steps.commit.outputs.short }}-bin-win-rocm-x64.zip
|
sd-${{ env.BRANCH_NAME }}-${{ steps.commit.outputs.short }}-bin-win-rocm-x64.zip
|
||||||
|
|
||||||
|
ubuntu-latest-rocm:
|
||||||
|
runs-on: ubuntu-latest
|
||||||
|
container: rocm/dev-ubuntu-24.04:7.2
|
||||||
|
|
||||||
|
env:
|
||||||
|
ROCM_VERSION: "7.2"
|
||||||
|
UBUNTU_VERSION: "24.04"
|
||||||
|
GPU_TARGETS: "gfx1151;gfx1150;gfx1100;gfx1101;gfx1102;gfx1200;gfx1201"
|
||||||
|
|
||||||
|
steps:
|
||||||
|
- run: apt-get update && apt-get install -y git
|
||||||
|
- name: Clone
|
||||||
|
id: checkout
|
||||||
|
uses: actions/checkout@v6
|
||||||
|
with:
|
||||||
|
submodules: recursive
|
||||||
|
|
||||||
|
- name: Free disk space
|
||||||
|
run: |
|
||||||
|
# Remove preinstalled SDKs and caches not needed for this job
|
||||||
|
sudo rm -rf /usr/share/dotnet || true
|
||||||
|
sudo rm -rf /usr/local/lib/android || true
|
||||||
|
sudo rm -rf /opt/ghc || true
|
||||||
|
sudo rm -rf /usr/local/.ghcup || true
|
||||||
|
sudo rm -rf /opt/hostedtoolcache || true
|
||||||
|
|
||||||
|
# Remove old package lists and caches
|
||||||
|
sudo rm -rf /var/lib/apt/lists/* || true
|
||||||
|
sudo apt clean
|
||||||
|
|
||||||
|
- name: Dependencies
|
||||||
|
id: depends
|
||||||
|
run: |
|
||||||
|
sudo apt-get update
|
||||||
|
sudo apt install -y \
|
||||||
|
cmake \
|
||||||
|
hip-dev \
|
||||||
|
hipblas-dev \
|
||||||
|
ninja-build \
|
||||||
|
rocm-dev \
|
||||||
|
zip
|
||||||
|
# Clean apt caches to recover disk space
|
||||||
|
sudo apt clean
|
||||||
|
sudo rm -rf /var/lib/apt/lists/* || true
|
||||||
|
|
||||||
|
- name: Setup ROCm Environment
|
||||||
|
run: |
|
||||||
|
# Add ROCm to PATH for current session
|
||||||
|
echo "/opt/rocm/bin" >> $GITHUB_PATH
|
||||||
|
|
||||||
|
# Build case pattern from GPU_TARGETS
|
||||||
|
PATTERN=$(printf '%s' "$GPU_TARGETS" | sed 's/;/\*|\*/g')
|
||||||
|
PATTERN="*${PATTERN}*"
|
||||||
|
|
||||||
|
# Remove library files for architectures we're not building for to save disk space
|
||||||
|
echo "Cleaning up unneeded architecture files..."
|
||||||
|
cd /opt/rocm/lib/rocblas/library
|
||||||
|
# Keep only our target architectures
|
||||||
|
for file in *; do
|
||||||
|
case "$file" in
|
||||||
|
$PATTERN)
|
||||||
|
;;
|
||||||
|
*)
|
||||||
|
sudo rm -f "$file" ;;
|
||||||
|
esac;
|
||||||
|
done
|
||||||
|
|
||||||
|
cd /opt/rocm/lib/hipblaslt/library
|
||||||
|
for file in *; do
|
||||||
|
case "$file" in
|
||||||
|
$PATTERN)
|
||||||
|
;;
|
||||||
|
*)
|
||||||
|
sudo rm -f "$file" ;;
|
||||||
|
esac;
|
||||||
|
done
|
||||||
|
|
||||||
|
- name: Build
|
||||||
|
id: cmake_build
|
||||||
|
run: |
|
||||||
|
mkdir build
|
||||||
|
cd build
|
||||||
|
cmake .. -G Ninja \
|
||||||
|
-DCMAKE_CXX_COMPILER=amdclang++ \
|
||||||
|
-DCMAKE_C_COMPILER=amdclang \
|
||||||
|
-DCMAKE_BUILD_TYPE=Release \
|
||||||
|
-DSD_HIPBLAS=ON \
|
||||||
|
-DGPU_TARGETS="${{ env.GPU_TARGETS }}" \
|
||||||
|
-DAMDGPU_TARGETS="${{ env.GPU_TARGETS }}" \
|
||||||
|
-DCMAKE_BUILD_WITH_INSTALL_RPATH=ON \
|
||||||
|
-DCMAKE_POSITION_INDEPENDENT_CODE=ON \
|
||||||
|
-DSD_BUILD_SHARED_LIBS=ON
|
||||||
|
cmake --build . --config Release
|
||||||
|
|
||||||
|
- name: Get commit hash
|
||||||
|
id: commit
|
||||||
|
if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
|
||||||
|
uses: pr-mpt/actions-commit-hash@v2
|
||||||
|
|
||||||
|
- name: Prepare artifacts
|
||||||
|
id: prepare_artifacts
|
||||||
|
if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
|
||||||
|
run: |
|
||||||
|
# Copy licenses
|
||||||
|
cp ggml/LICENSE ./build/bin/ggml.txt
|
||||||
|
cp LICENSE ./build/bin/stable-diffusion.cpp.txt
|
||||||
|
|
||||||
|
# Create directories for ROCm libraries
|
||||||
|
mkdir -p ./build/bin/rocblas/library
|
||||||
|
mkdir -p ./build/bin/hipblaslt/library
|
||||||
|
|
||||||
|
# Copy ROCm runtime libraries (use || true to continue if files don't exist)
|
||||||
|
cp /opt/rocm/lib/librocsparse.so* ./build/bin/ || true
|
||||||
|
cp /opt/rocm/lib/libhsa-runtime64.so* ./build/bin/ || true
|
||||||
|
cp /opt/rocm/lib/libamdhip64.so* ./build/bin/ || true
|
||||||
|
cp /opt/rocm/lib/libhipblas.so* ./build/bin/ || true
|
||||||
|
cp /opt/rocm/lib/libhipblaslt.so* ./build/bin/ || true
|
||||||
|
cp /opt/rocm/lib/librocblas.so* ./build/bin/ || true
|
||||||
|
|
||||||
|
# Copy library files (already filtered to target architectures)
|
||||||
|
cp /opt/rocm/lib/rocblas/library/* ./build/bin/rocblas/library/ || true
|
||||||
|
cp /opt/rocm/lib/hipblaslt/library/* ./build/bin/hipblaslt/library/ || true
|
||||||
|
|
||||||
|
- name: Fetch system info
|
||||||
|
id: system-info
|
||||||
|
run: |
|
||||||
|
echo "CPU_ARCH=`uname -m`" >> "$GITHUB_OUTPUT"
|
||||||
|
echo "OS_NAME=`lsb_release -s -i`" >> "$GITHUB_OUTPUT"
|
||||||
|
echo "OS_VERSION=`lsb_release -s -r`" >> "$GITHUB_OUTPUT"
|
||||||
|
echo "OS_TYPE=`uname -s`" >> "$GITHUB_OUTPUT"
|
||||||
|
|
||||||
|
- name: Pack artifacts
|
||||||
|
id: pack_artifacts
|
||||||
|
if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
|
||||||
|
run: |
|
||||||
|
cp ggml/LICENSE ./build/bin/ggml.txt
|
||||||
|
cp LICENSE ./build/bin/stable-diffusion.cpp.txt
|
||||||
|
zip -j sd-${{ env.BRANCH_NAME }}-${{ steps.commit.outputs.short }}-bin-${{ steps.system-info.outputs.OS_TYPE }}-Ubuntu-${{ env.UBUNTU_VERSION }}-${{ steps.system-info.outputs.CPU_ARCH }}-rocm.zip ./build/bin/*
|
||||||
|
|
||||||
|
- name: Upload artifacts
|
||||||
|
if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
|
||||||
|
uses: actions/upload-artifact@v4
|
||||||
|
with:
|
||||||
|
name: sd-${{ env.BRANCH_NAME }}-${{ steps.commit.outputs.short }}-bin-${{ steps.system-info.outputs.OS_TYPE }}-Ubuntu-${{ env.UBUNTU_VERSION }}-${{ steps.system-info.outputs.CPU_ARCH }}-rocm.zip
|
||||||
|
path: |
|
||||||
|
sd-${{ env.BRANCH_NAME }}-${{ steps.commit.outputs.short }}-bin-${{ steps.system-info.outputs.OS_TYPE }}-Ubuntu-${{ env.UBUNTU_VERSION }}-${{ steps.system-info.outputs.CPU_ARCH }}-rocm.zip
|
||||||
|
|
||||||
release:
|
release:
|
||||||
if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
|
if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
|
||||||
|
|
||||||
@ -493,6 +640,7 @@ jobs:
|
|||||||
needs:
|
needs:
|
||||||
- ubuntu-latest-cmake
|
- ubuntu-latest-cmake
|
||||||
- ubuntu-latest-cmake-vulkan
|
- ubuntu-latest-cmake-vulkan
|
||||||
|
- ubuntu-latest-rocm
|
||||||
- build-and-push-docker-images
|
- build-and-push-docker-images
|
||||||
- macOS-latest-cmake
|
- macOS-latest-cmake
|
||||||
- windows-latest-cmake
|
- windows-latest-cmake
|
||||||
|
|||||||
375
conditioner.hpp
375
conditioner.hpp
@ -10,9 +10,14 @@ struct SDCondition {
|
|||||||
struct ggml_tensor* c_vector = nullptr; // aka y
|
struct ggml_tensor* c_vector = nullptr; // aka y
|
||||||
struct ggml_tensor* c_concat = nullptr;
|
struct ggml_tensor* c_concat = nullptr;
|
||||||
|
|
||||||
|
std::vector<struct ggml_tensor*> extra_c_crossattns;
|
||||||
|
|
||||||
SDCondition() = default;
|
SDCondition() = default;
|
||||||
SDCondition(struct ggml_tensor* c_crossattn, struct ggml_tensor* c_vector, struct ggml_tensor* c_concat)
|
SDCondition(struct ggml_tensor* c_crossattn,
|
||||||
: c_crossattn(c_crossattn), c_vector(c_vector), c_concat(c_concat) {}
|
struct ggml_tensor* c_vector,
|
||||||
|
struct ggml_tensor* c_concat,
|
||||||
|
const std::vector<struct ggml_tensor*>& extra_c_crossattns = {})
|
||||||
|
: c_crossattn(c_crossattn), c_vector(c_vector), c_concat(c_concat), extra_c_crossattns(extra_c_crossattns) {}
|
||||||
};
|
};
|
||||||
|
|
||||||
struct ConditionerParams {
|
struct ConditionerParams {
|
||||||
@ -1696,18 +1701,23 @@ struct LLMEmbedder : public Conditioner {
|
|||||||
}
|
}
|
||||||
|
|
||||||
std::tuple<std::vector<int>, std::vector<float>> tokenize(std::string text,
|
std::tuple<std::vector<int>, std::vector<float>> tokenize(std::string text,
|
||||||
std::pair<int, int> attn_range,
|
const std::pair<int, int>& attn_range,
|
||||||
size_t max_length = 0,
|
size_t max_length = 0,
|
||||||
bool padding = false) {
|
bool padding = false) {
|
||||||
std::vector<std::pair<std::string, float>> parsed_attention;
|
std::vector<std::pair<std::string, float>> parsed_attention;
|
||||||
parsed_attention.emplace_back(text.substr(0, attn_range.first), 1.f);
|
if (attn_range.first >= 0 && attn_range.second > 0) {
|
||||||
if (attn_range.second - attn_range.first > 0) {
|
parsed_attention.emplace_back(text.substr(0, attn_range.first), 1.f);
|
||||||
auto new_parsed_attention = parse_prompt_attention(text.substr(attn_range.first, attn_range.second - attn_range.first));
|
if (attn_range.second - attn_range.first > 0) {
|
||||||
parsed_attention.insert(parsed_attention.end(),
|
auto new_parsed_attention = parse_prompt_attention(text.substr(attn_range.first, attn_range.second - attn_range.first));
|
||||||
new_parsed_attention.begin(),
|
parsed_attention.insert(parsed_attention.end(),
|
||||||
new_parsed_attention.end());
|
new_parsed_attention.begin(),
|
||||||
|
new_parsed_attention.end());
|
||||||
|
}
|
||||||
|
parsed_attention.emplace_back(text.substr(attn_range.second), 1.f);
|
||||||
|
} else {
|
||||||
|
parsed_attention.emplace_back(text, 1.f);
|
||||||
}
|
}
|
||||||
parsed_attention.emplace_back(text.substr(attn_range.second), 1.f);
|
|
||||||
{
|
{
|
||||||
std::stringstream ss;
|
std::stringstream ss;
|
||||||
ss << "[";
|
ss << "[";
|
||||||
@ -1738,156 +1748,27 @@ struct LLMEmbedder : public Conditioner {
|
|||||||
return {tokens, weights};
|
return {tokens, weights};
|
||||||
}
|
}
|
||||||
|
|
||||||
SDCondition get_learned_condition(ggml_context* work_ctx,
|
ggml_tensor* encode_prompt(ggml_context* work_ctx,
|
||||||
int n_threads,
|
int n_threads,
|
||||||
const ConditionerParams& conditioner_params) override {
|
const std::string prompt,
|
||||||
std::string prompt;
|
const std::pair<int, int>& prompt_attn_range,
|
||||||
std::vector<std::pair<int, ggml_tensor*>> image_embeds;
|
int max_length,
|
||||||
std::pair<int, int> prompt_attn_range;
|
int min_length,
|
||||||
int prompt_template_encode_start_idx = 34;
|
std::vector<std::pair<int, ggml_tensor*>> image_embeds,
|
||||||
int max_length = 0;
|
const std::set<int>& out_layers,
|
||||||
std::set<int> out_layers;
|
int prompt_template_encode_start_idx) {
|
||||||
std::vector<int> tokens;
|
auto tokens_and_weights = tokenize(prompt, prompt_attn_range);
|
||||||
std::vector<float> weights;
|
auto& tokens = std::get<0>(tokens_and_weights);
|
||||||
|
auto& weights = std::get<1>(tokens_and_weights);
|
||||||
std::vector<float> mask;
|
std::vector<float> mask;
|
||||||
if (llm->enable_vision && conditioner_params.ref_images.size() > 0) {
|
|
||||||
LOG_INFO("QwenImageEditPlusPipeline");
|
|
||||||
prompt_template_encode_start_idx = 64;
|
|
||||||
int image_embed_idx = 64 + 6;
|
|
||||||
|
|
||||||
int min_pixels = 384 * 384;
|
|
||||||
int max_pixels = 560 * 560;
|
|
||||||
std::string placeholder = "<|image_pad|>";
|
|
||||||
std::string img_prompt;
|
|
||||||
|
|
||||||
for (int i = 0; i < conditioner_params.ref_images.size(); i++) {
|
|
||||||
sd_image_f32_t image = sd_image_t_to_sd_image_f32_t(*conditioner_params.ref_images[i]);
|
|
||||||
double factor = llm->params.vision.patch_size * llm->params.vision.spatial_merge_size;
|
|
||||||
int height = image.height;
|
|
||||||
int width = image.width;
|
|
||||||
int h_bar = static_cast<int>(std::round(height / factor) * factor);
|
|
||||||
int w_bar = static_cast<int>(std::round(width / factor) * factor);
|
|
||||||
|
|
||||||
if (static_cast<double>(h_bar) * w_bar > max_pixels) {
|
|
||||||
double beta = std::sqrt((height * width) / static_cast<double>(max_pixels));
|
|
||||||
h_bar = std::max(static_cast<int>(factor),
|
|
||||||
static_cast<int>(std::floor(height / beta / factor)) * static_cast<int>(factor));
|
|
||||||
w_bar = std::max(static_cast<int>(factor),
|
|
||||||
static_cast<int>(std::floor(width / beta / factor)) * static_cast<int>(factor));
|
|
||||||
} else if (static_cast<double>(h_bar) * w_bar < min_pixels) {
|
|
||||||
double beta = std::sqrt(static_cast<double>(min_pixels) / (height * width));
|
|
||||||
h_bar = static_cast<int>(std::ceil(height * beta / factor)) * static_cast<int>(factor);
|
|
||||||
w_bar = static_cast<int>(std::ceil(width * beta / factor)) * static_cast<int>(factor);
|
|
||||||
}
|
|
||||||
|
|
||||||
LOG_DEBUG("resize conditioner ref image %d from %dx%d to %dx%d", i, image.height, image.width, h_bar, w_bar);
|
|
||||||
|
|
||||||
sd_image_f32_t resized_image = clip_preprocess(image, w_bar, h_bar);
|
|
||||||
free(image.data);
|
|
||||||
image.data = nullptr;
|
|
||||||
|
|
||||||
ggml_tensor* image_tensor = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, resized_image.width, resized_image.height, 3, 1);
|
|
||||||
sd_image_f32_to_ggml_tensor(resized_image, image_tensor, false);
|
|
||||||
free(resized_image.data);
|
|
||||||
resized_image.data = nullptr;
|
|
||||||
|
|
||||||
ggml_tensor* image_embed = nullptr;
|
|
||||||
llm->encode_image(n_threads, image_tensor, &image_embed, work_ctx);
|
|
||||||
image_embeds.emplace_back(image_embed_idx, image_embed);
|
|
||||||
image_embed_idx += 1 + static_cast<int>(image_embed->ne[1]) + 6;
|
|
||||||
|
|
||||||
img_prompt += "Picture " + std::to_string(i + 1) + ": <|vision_start|>"; // [24669, 220, index, 25, 220, 151652]
|
|
||||||
int64_t num_image_tokens = image_embed->ne[1];
|
|
||||||
img_prompt.reserve(num_image_tokens * placeholder.size());
|
|
||||||
for (int j = 0; j < num_image_tokens; j++) {
|
|
||||||
img_prompt += placeholder;
|
|
||||||
}
|
|
||||||
img_prompt += "<|vision_end|>";
|
|
||||||
}
|
|
||||||
|
|
||||||
prompt = "<|im_start|>system\nDescribe the key features of the input image (color, shape, size, texture, objects, background), then explain how the user's text instruction should alter or modify the image. Generate a new image that meets the user's requirements while maintaining consistency with the original input where appropriate.<|im_end|>\n<|im_start|>user\n";
|
|
||||||
prompt += img_prompt;
|
|
||||||
|
|
||||||
prompt_attn_range.first = static_cast<int>(prompt.size());
|
|
||||||
prompt += conditioner_params.text;
|
|
||||||
prompt_attn_range.second = static_cast<int>(prompt.size());
|
|
||||||
|
|
||||||
prompt += "<|im_end|>\n<|im_start|>assistant\n";
|
|
||||||
} else if (version == VERSION_FLUX2) {
|
|
||||||
prompt_template_encode_start_idx = 0;
|
|
||||||
out_layers = {10, 20, 30};
|
|
||||||
|
|
||||||
prompt = "[SYSTEM_PROMPT]You are an AI that reasons about image descriptions. You give structured responses focusing on object relationships, object\nattribution and actions without speculation.[/SYSTEM_PROMPT][INST]";
|
|
||||||
|
|
||||||
prompt_attn_range.first = static_cast<int>(prompt.size());
|
|
||||||
prompt += conditioner_params.text;
|
|
||||||
prompt_attn_range.second = static_cast<int>(prompt.size());
|
|
||||||
|
|
||||||
prompt += "[/INST]";
|
|
||||||
} else if (sd_version_is_z_image(version)) {
|
|
||||||
prompt_template_encode_start_idx = 0;
|
|
||||||
out_layers = {35}; // -2
|
|
||||||
|
|
||||||
prompt = "<|im_start|>user\n";
|
|
||||||
|
|
||||||
prompt_attn_range.first = static_cast<int>(prompt.size());
|
|
||||||
prompt += conditioner_params.text;
|
|
||||||
prompt_attn_range.second = static_cast<int>(prompt.size());
|
|
||||||
|
|
||||||
prompt += "<|im_end|>\n<|im_start|>assistant\n";
|
|
||||||
} else if (version == VERSION_FLUX2_KLEIN) {
|
|
||||||
prompt_template_encode_start_idx = 0;
|
|
||||||
max_length = 512;
|
|
||||||
out_layers = {9, 18, 27};
|
|
||||||
|
|
||||||
prompt = "<|im_start|>user\n";
|
|
||||||
|
|
||||||
prompt_attn_range.first = static_cast<int>(prompt.size());
|
|
||||||
prompt += conditioner_params.text;
|
|
||||||
prompt_attn_range.second = static_cast<int>(prompt.size());
|
|
||||||
|
|
||||||
prompt += "<|im_end|>\n<|im_start|>assistant\n<think>\n\n</think>\n\n";
|
|
||||||
|
|
||||||
auto tokens_and_weights = tokenize(prompt, prompt_attn_range, 0, false);
|
|
||||||
tokens = std::get<0>(tokens_and_weights);
|
|
||||||
weights = std::get<1>(tokens_and_weights);
|
|
||||||
|
|
||||||
|
if (max_length > 0 && tokens.size() < max_length) {
|
||||||
mask.insert(mask.end(), tokens.size(), 1.f);
|
mask.insert(mask.end(), tokens.size(), 1.f);
|
||||||
if (tokens.size() < max_length) {
|
mask.insert(mask.end(), max_length - tokens.size(), 0.f);
|
||||||
mask.insert(mask.end(), max_length - tokens.size(), 0.f);
|
tokenizer->pad_tokens(tokens, weights, max_length, true);
|
||||||
tokenizer->pad_tokens(tokens, weights, max_length, true);
|
|
||||||
}
|
|
||||||
} else if (version == VERSION_OVIS_IMAGE) {
|
|
||||||
prompt_template_encode_start_idx = 28;
|
|
||||||
max_length = prompt_template_encode_start_idx + 256;
|
|
||||||
|
|
||||||
prompt = "<|im_start|>user\nDescribe the image by detailing the color, quantity, text, shape, size, texture, spatial relationships of the objects and background:";
|
|
||||||
|
|
||||||
prompt_attn_range.first = static_cast<int>(prompt.size());
|
|
||||||
prompt += " " + conditioner_params.text;
|
|
||||||
prompt_attn_range.second = static_cast<int>(prompt.size());
|
|
||||||
|
|
||||||
prompt += "<|im_end|>\n<|im_start|>assistant\n<think>\n\n</think>\n\n";
|
|
||||||
} else {
|
|
||||||
prompt_template_encode_start_idx = 34;
|
|
||||||
|
|
||||||
prompt = "<|im_start|>system\nDescribe the image by detailing the color, shape, size, texture, quantity, text, spatial relationships of the objects and background:<|im_end|>\n<|im_start|>user\n";
|
|
||||||
|
|
||||||
prompt_attn_range.first = static_cast<int>(prompt.size());
|
|
||||||
prompt += conditioner_params.text;
|
|
||||||
prompt_attn_range.second = static_cast<int>(prompt.size());
|
|
||||||
|
|
||||||
prompt += "<|im_end|>\n<|im_start|>assistant\n";
|
|
||||||
}
|
}
|
||||||
|
|
||||||
if (tokens.empty()) {
|
struct ggml_tensor* hidden_states = nullptr; // [N, n_token, hidden_size]
|
||||||
auto tokens_and_weights = tokenize(prompt, prompt_attn_range, max_length, max_length > 0);
|
|
||||||
tokens = std::get<0>(tokens_and_weights);
|
|
||||||
weights = std::get<1>(tokens_and_weights);
|
|
||||||
}
|
|
||||||
|
|
||||||
int64_t t0 = ggml_time_ms();
|
|
||||||
struct ggml_tensor* hidden_states = nullptr; // [N, n_token, 3584]
|
|
||||||
|
|
||||||
auto input_ids = vector_to_ggml_tensor_i32(work_ctx, tokens);
|
auto input_ids = vector_to_ggml_tensor_i32(work_ctx, tokens);
|
||||||
|
|
||||||
@ -1930,11 +1811,6 @@ struct LLMEmbedder : public Conditioner {
|
|||||||
|
|
||||||
GGML_ASSERT(hidden_states->ne[1] > prompt_template_encode_start_idx);
|
GGML_ASSERT(hidden_states->ne[1] > prompt_template_encode_start_idx);
|
||||||
|
|
||||||
int64_t min_length = 0;
|
|
||||||
if (version == VERSION_FLUX2) {
|
|
||||||
min_length = 512;
|
|
||||||
}
|
|
||||||
|
|
||||||
int64_t zero_pad_len = 0;
|
int64_t zero_pad_len = 0;
|
||||||
if (min_length > 0) {
|
if (min_length > 0) {
|
||||||
if (hidden_states->ne[1] - prompt_template_encode_start_idx < min_length) {
|
if (hidden_states->ne[1] - prompt_template_encode_start_idx < min_length) {
|
||||||
@ -1956,11 +1832,186 @@ struct LLMEmbedder : public Conditioner {
|
|||||||
ggml_ext_tensor_set_f32(new_hidden_states, value, i0, i1, i2, i3);
|
ggml_ext_tensor_set_f32(new_hidden_states, value, i0, i1, i2, i3);
|
||||||
});
|
});
|
||||||
|
|
||||||
// print_ggml_tensor(new_hidden_states);
|
return new_hidden_states;
|
||||||
|
}
|
||||||
|
|
||||||
|
SDCondition get_learned_condition(ggml_context* work_ctx,
|
||||||
|
int n_threads,
|
||||||
|
const ConditionerParams& conditioner_params) override {
|
||||||
|
std::string prompt;
|
||||||
|
std::pair<int, int> prompt_attn_range;
|
||||||
|
std::vector<std::string> extra_prompts;
|
||||||
|
std::vector<std::pair<int, int>> extra_prompts_attn_range;
|
||||||
|
std::vector<std::pair<int, ggml_tensor*>> image_embeds;
|
||||||
|
int prompt_template_encode_start_idx = 34;
|
||||||
|
int max_length = 0; // pad tokens
|
||||||
|
int min_length = 0; // zero pad hidden_states
|
||||||
|
std::set<int> out_layers;
|
||||||
|
|
||||||
|
int64_t t0 = ggml_time_ms();
|
||||||
|
|
||||||
|
if (sd_version_is_qwen_image(version)) {
|
||||||
|
if (llm->enable_vision && !conditioner_params.ref_images.empty()) {
|
||||||
|
LOG_INFO("QwenImageEditPlusPipeline");
|
||||||
|
prompt_template_encode_start_idx = 64;
|
||||||
|
int image_embed_idx = 64 + 6;
|
||||||
|
|
||||||
|
int min_pixels = 384 * 384;
|
||||||
|
int max_pixels = 560 * 560;
|
||||||
|
std::string placeholder = "<|image_pad|>";
|
||||||
|
std::string img_prompt;
|
||||||
|
|
||||||
|
for (int i = 0; i < conditioner_params.ref_images.size(); i++) {
|
||||||
|
sd_image_f32_t image = sd_image_t_to_sd_image_f32_t(*conditioner_params.ref_images[i]);
|
||||||
|
double factor = llm->params.vision.patch_size * llm->params.vision.spatial_merge_size;
|
||||||
|
int height = image.height;
|
||||||
|
int width = image.width;
|
||||||
|
int h_bar = static_cast<int>(std::round(height / factor) * factor);
|
||||||
|
int w_bar = static_cast<int>(std::round(width / factor) * factor);
|
||||||
|
|
||||||
|
if (static_cast<double>(h_bar) * w_bar > max_pixels) {
|
||||||
|
double beta = std::sqrt((height * width) / static_cast<double>(max_pixels));
|
||||||
|
h_bar = std::max(static_cast<int>(factor),
|
||||||
|
static_cast<int>(std::floor(height / beta / factor)) * static_cast<int>(factor));
|
||||||
|
w_bar = std::max(static_cast<int>(factor),
|
||||||
|
static_cast<int>(std::floor(width / beta / factor)) * static_cast<int>(factor));
|
||||||
|
} else if (static_cast<double>(h_bar) * w_bar < min_pixels) {
|
||||||
|
double beta = std::sqrt(static_cast<double>(min_pixels) / (height * width));
|
||||||
|
h_bar = static_cast<int>(std::ceil(height * beta / factor)) * static_cast<int>(factor);
|
||||||
|
w_bar = static_cast<int>(std::ceil(width * beta / factor)) * static_cast<int>(factor);
|
||||||
|
}
|
||||||
|
|
||||||
|
LOG_DEBUG("resize conditioner ref image %d from %dx%d to %dx%d", i, image.height, image.width, h_bar, w_bar);
|
||||||
|
|
||||||
|
sd_image_f32_t resized_image = clip_preprocess(image, w_bar, h_bar);
|
||||||
|
free(image.data);
|
||||||
|
image.data = nullptr;
|
||||||
|
|
||||||
|
ggml_tensor* image_tensor = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, resized_image.width, resized_image.height, 3, 1);
|
||||||
|
sd_image_f32_to_ggml_tensor(resized_image, image_tensor, false);
|
||||||
|
free(resized_image.data);
|
||||||
|
resized_image.data = nullptr;
|
||||||
|
|
||||||
|
ggml_tensor* image_embed = nullptr;
|
||||||
|
llm->encode_image(n_threads, image_tensor, &image_embed, work_ctx);
|
||||||
|
image_embeds.emplace_back(image_embed_idx, image_embed);
|
||||||
|
image_embed_idx += 1 + static_cast<int>(image_embed->ne[1]) + 6;
|
||||||
|
|
||||||
|
img_prompt += "Picture " + std::to_string(i + 1) + ": <|vision_start|>"; // [24669, 220, index, 25, 220, 151652]
|
||||||
|
int64_t num_image_tokens = image_embed->ne[1];
|
||||||
|
img_prompt.reserve(num_image_tokens * placeholder.size());
|
||||||
|
for (int j = 0; j < num_image_tokens; j++) {
|
||||||
|
img_prompt += placeholder;
|
||||||
|
}
|
||||||
|
img_prompt += "<|vision_end|>";
|
||||||
|
}
|
||||||
|
|
||||||
|
prompt = "<|im_start|>system\nDescribe the key features of the input image (color, shape, size, texture, objects, background), then explain how the user's text instruction should alter or modify the image. Generate a new image that meets the user's requirements while maintaining consistency with the original input where appropriate.<|im_end|>\n<|im_start|>user\n";
|
||||||
|
prompt += img_prompt;
|
||||||
|
|
||||||
|
prompt_attn_range.first = static_cast<int>(prompt.size());
|
||||||
|
prompt += conditioner_params.text;
|
||||||
|
prompt_attn_range.second = static_cast<int>(prompt.size());
|
||||||
|
|
||||||
|
prompt += "<|im_end|>\n<|im_start|>assistant\n";
|
||||||
|
} else {
|
||||||
|
prompt_template_encode_start_idx = 34;
|
||||||
|
|
||||||
|
prompt = "<|im_start|>system\nDescribe the image by detailing the color, shape, size, texture, quantity, text, spatial relationships of the objects and background:<|im_end|>\n<|im_start|>user\n";
|
||||||
|
|
||||||
|
prompt_attn_range.first = static_cast<int>(prompt.size());
|
||||||
|
prompt += conditioner_params.text;
|
||||||
|
prompt_attn_range.second = static_cast<int>(prompt.size());
|
||||||
|
|
||||||
|
prompt += "<|im_end|>\n<|im_start|>assistant\n";
|
||||||
|
}
|
||||||
|
} else if (version == VERSION_FLUX2) {
|
||||||
|
prompt_template_encode_start_idx = 0;
|
||||||
|
min_length = 512;
|
||||||
|
out_layers = {10, 20, 30};
|
||||||
|
|
||||||
|
prompt = "[SYSTEM_PROMPT]You are an AI that reasons about image descriptions. You give structured responses focusing on object relationships, object\nattribution and actions without speculation.[/SYSTEM_PROMPT][INST]";
|
||||||
|
|
||||||
|
prompt_attn_range.first = static_cast<int>(prompt.size());
|
||||||
|
prompt += conditioner_params.text;
|
||||||
|
prompt_attn_range.second = static_cast<int>(prompt.size());
|
||||||
|
|
||||||
|
prompt += "[/INST]";
|
||||||
|
} else if (sd_version_is_z_image(version)) {
|
||||||
|
prompt_template_encode_start_idx = 0;
|
||||||
|
out_layers = {35}; // -2
|
||||||
|
|
||||||
|
if (!conditioner_params.ref_images.empty()) {
|
||||||
|
LOG_INFO("ZImageOmniPipeline");
|
||||||
|
prompt = "<|im_start|>user\n<|vision_start|>";
|
||||||
|
for (int i = 0; i < conditioner_params.ref_images.size() - 1; i++) {
|
||||||
|
extra_prompts.push_back("<|vision_end|><|vision_start|>");
|
||||||
|
}
|
||||||
|
extra_prompts.push_back("<|vision_end|>" + conditioner_params.text + "<|im_end|>\n<|im_start|>assistant\n<|vision_start|>");
|
||||||
|
extra_prompts.push_back("<|vision_end|><|im_end|>");
|
||||||
|
} else {
|
||||||
|
prompt = "<|im_start|>user\n";
|
||||||
|
|
||||||
|
prompt_attn_range.first = static_cast<int>(prompt.size());
|
||||||
|
prompt += conditioner_params.text;
|
||||||
|
prompt_attn_range.second = static_cast<int>(prompt.size());
|
||||||
|
|
||||||
|
prompt += "<|im_end|>\n<|im_start|>assistant\n";
|
||||||
|
}
|
||||||
|
} else if (version == VERSION_FLUX2_KLEIN) {
|
||||||
|
prompt_template_encode_start_idx = 0;
|
||||||
|
max_length = 512;
|
||||||
|
out_layers = {9, 18, 27};
|
||||||
|
|
||||||
|
prompt = "<|im_start|>user\n";
|
||||||
|
|
||||||
|
prompt_attn_range.first = static_cast<int>(prompt.size());
|
||||||
|
prompt += conditioner_params.text;
|
||||||
|
prompt_attn_range.second = static_cast<int>(prompt.size());
|
||||||
|
|
||||||
|
prompt += "<|im_end|>\n<|im_start|>assistant\n<think>\n\n</think>\n\n";
|
||||||
|
} else if (version == VERSION_OVIS_IMAGE) {
|
||||||
|
prompt_template_encode_start_idx = 28;
|
||||||
|
max_length = prompt_template_encode_start_idx + 256;
|
||||||
|
|
||||||
|
prompt = "<|im_start|>user\nDescribe the image by detailing the color, quantity, text, shape, size, texture, spatial relationships of the objects and background:";
|
||||||
|
|
||||||
|
prompt_attn_range.first = static_cast<int>(prompt.size());
|
||||||
|
prompt += " " + conditioner_params.text;
|
||||||
|
prompt_attn_range.second = static_cast<int>(prompt.size());
|
||||||
|
|
||||||
|
prompt += "<|im_end|>\n<|im_start|>assistant\n<think>\n\n</think>\n\n";
|
||||||
|
} else {
|
||||||
|
GGML_ABORT("unknown version %d", version);
|
||||||
|
}
|
||||||
|
|
||||||
|
auto hidden_states = encode_prompt(work_ctx,
|
||||||
|
n_threads,
|
||||||
|
prompt,
|
||||||
|
prompt_attn_range,
|
||||||
|
max_length,
|
||||||
|
min_length,
|
||||||
|
image_embeds,
|
||||||
|
out_layers,
|
||||||
|
prompt_template_encode_start_idx);
|
||||||
|
|
||||||
|
std::vector<ggml_tensor*> extra_hidden_states_vec;
|
||||||
|
for (int i = 0; i < extra_prompts.size(); i++) {
|
||||||
|
auto extra_hidden_states = encode_prompt(work_ctx,
|
||||||
|
n_threads,
|
||||||
|
extra_prompts[i],
|
||||||
|
extra_prompts_attn_range[i],
|
||||||
|
max_length,
|
||||||
|
min_length,
|
||||||
|
image_embeds,
|
||||||
|
out_layers,
|
||||||
|
prompt_template_encode_start_idx);
|
||||||
|
extra_hidden_states_vec.push_back(extra_hidden_states);
|
||||||
|
}
|
||||||
|
|
||||||
int64_t t1 = ggml_time_ms();
|
int64_t t1 = ggml_time_ms();
|
||||||
LOG_DEBUG("computing condition graph completed, taking %" PRId64 " ms", t1 - t0);
|
LOG_DEBUG("computing condition graph completed, taking %" PRId64 " ms", t1 - t0);
|
||||||
return {new_hidden_states, nullptr, nullptr};
|
return {hidden_states, nullptr, nullptr, extra_hidden_states_vec};
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
|
|||||||
@ -394,12 +394,15 @@ bool save_results(const SDCliParams& cli_params,
|
|||||||
|
|
||||||
fs::path base_path = out_path;
|
fs::path base_path = out_path;
|
||||||
fs::path ext = out_path.has_extension() ? out_path.extension() : fs::path{};
|
fs::path ext = out_path.has_extension() ? out_path.extension() : fs::path{};
|
||||||
if (!ext.empty())
|
|
||||||
base_path.replace_extension();
|
|
||||||
|
|
||||||
std::string ext_lower = ext.string();
|
std::string ext_lower = ext.string();
|
||||||
std::transform(ext_lower.begin(), ext_lower.end(), ext_lower.begin(), ::tolower);
|
std::transform(ext_lower.begin(), ext_lower.end(), ext_lower.begin(), ::tolower);
|
||||||
bool is_jpg = (ext_lower == ".jpg" || ext_lower == ".jpeg" || ext_lower == ".jpe");
|
bool is_jpg = (ext_lower == ".jpg" || ext_lower == ".jpeg" || ext_lower == ".jpe");
|
||||||
|
if (!ext.empty()) {
|
||||||
|
if (is_jpg || ext_lower == ".png") {
|
||||||
|
base_path.replace_extension();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
int output_begin_idx = cli_params.output_begin_idx;
|
int output_begin_idx = cli_params.output_begin_idx;
|
||||||
if (output_begin_idx < 0) {
|
if (output_begin_idx < 0) {
|
||||||
@ -409,7 +412,7 @@ bool save_results(const SDCliParams& cli_params,
|
|||||||
auto write_image = [&](const fs::path& path, int idx) {
|
auto write_image = [&](const fs::path& path, int idx) {
|
||||||
const sd_image_t& img = results[idx];
|
const sd_image_t& img = results[idx];
|
||||||
if (!img.data)
|
if (!img.data)
|
||||||
return;
|
return false;
|
||||||
|
|
||||||
std::string params = get_image_params(cli_params, ctx_params, gen_params, gen_params.seed + idx);
|
std::string params = get_image_params(cli_params, ctx_params, gen_params, gen_params.seed + idx);
|
||||||
int ok = 0;
|
int ok = 0;
|
||||||
@ -419,8 +422,11 @@ bool save_results(const SDCliParams& cli_params,
|
|||||||
ok = stbi_write_png(path.string().c_str(), img.width, img.height, img.channel, img.data, 0, params.c_str());
|
ok = stbi_write_png(path.string().c_str(), img.width, img.height, img.channel, img.data, 0, params.c_str());
|
||||||
}
|
}
|
||||||
LOG_INFO("save result image %d to '%s' (%s)", idx, path.string().c_str(), ok ? "success" : "failure");
|
LOG_INFO("save result image %d to '%s' (%s)", idx, path.string().c_str(), ok ? "success" : "failure");
|
||||||
|
return ok != 0;
|
||||||
};
|
};
|
||||||
|
|
||||||
|
int sucessful_reults = 0;
|
||||||
|
|
||||||
if (std::regex_search(cli_params.output_path, format_specifier_regex)) {
|
if (std::regex_search(cli_params.output_path, format_specifier_regex)) {
|
||||||
if (!is_jpg && ext_lower != ".png")
|
if (!is_jpg && ext_lower != ".png")
|
||||||
ext = ".png";
|
ext = ".png";
|
||||||
@ -429,9 +435,12 @@ bool save_results(const SDCliParams& cli_params,
|
|||||||
|
|
||||||
for (int i = 0; i < num_results; ++i) {
|
for (int i = 0; i < num_results; ++i) {
|
||||||
fs::path img_path = format_frame_idx(pattern.string(), output_begin_idx + i);
|
fs::path img_path = format_frame_idx(pattern.string(), output_begin_idx + i);
|
||||||
write_image(img_path, i);
|
if (write_image(img_path, i)) {
|
||||||
|
sucessful_reults++;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
return true;
|
LOG_INFO("%d/%d images saved", sucessful_reults, num_results);
|
||||||
|
return sucessful_reults != 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (cli_params.mode == VID_GEN && num_results > 1) {
|
if (cli_params.mode == VID_GEN && num_results > 1) {
|
||||||
@ -439,9 +448,13 @@ bool save_results(const SDCliParams& cli_params,
|
|||||||
ext = ".avi";
|
ext = ".avi";
|
||||||
fs::path video_path = base_path;
|
fs::path video_path = base_path;
|
||||||
video_path += ext;
|
video_path += ext;
|
||||||
create_mjpg_avi_from_sd_images(video_path.string().c_str(), results, num_results, gen_params.fps);
|
if (create_mjpg_avi_from_sd_images(video_path.string().c_str(), results, num_results, gen_params.fps) == 0) {
|
||||||
LOG_INFO("save result MJPG AVI video to '%s'", video_path.string().c_str());
|
LOG_INFO("save result MJPG AVI video to '%s'", video_path.string().c_str());
|
||||||
return true;
|
return true;
|
||||||
|
} else {
|
||||||
|
LOG_ERROR("Failed to save result MPG AVI video to '%s'", video_path.string().c_str());
|
||||||
|
return false;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
if (!is_jpg && ext_lower != ".png")
|
if (!is_jpg && ext_lower != ".png")
|
||||||
@ -453,10 +466,12 @@ bool save_results(const SDCliParams& cli_params,
|
|||||||
img_path += "_" + std::to_string(output_begin_idx + i);
|
img_path += "_" + std::to_string(output_begin_idx + i);
|
||||||
}
|
}
|
||||||
img_path += ext;
|
img_path += ext;
|
||||||
write_image(img_path, i);
|
if (write_image(img_path, i)) {
|
||||||
|
sucessful_reults++;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
LOG_INFO("%d/%d images saved", sucessful_reults, num_results);
|
||||||
return true;
|
return sucessful_reults != 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
int main(int argc, const char* argv[]) {
|
int main(int argc, const char* argv[]) {
|
||||||
|
|||||||
@ -786,8 +786,8 @@ int main(int argc, const char** argv) {
|
|||||||
std::string negative_prompt = j.value("negative_prompt", "");
|
std::string negative_prompt = j.value("negative_prompt", "");
|
||||||
int width = j.value("width", 512);
|
int width = j.value("width", 512);
|
||||||
int height = j.value("height", 512);
|
int height = j.value("height", 512);
|
||||||
int steps = j.value("steps", -1);
|
int steps = j.value("steps", default_gen_params.sample_params.sample_steps);
|
||||||
float cfg_scale = j.value("cfg_scale", 7.f);
|
float cfg_scale = j.value("cfg_scale", default_gen_params.sample_params.guidance.txt_cfg);
|
||||||
int64_t seed = j.value("seed", -1);
|
int64_t seed = j.value("seed", -1);
|
||||||
int batch_size = j.value("batch_size", 1);
|
int batch_size = j.value("batch_size", 1);
|
||||||
int clip_skip = j.value("clip_skip", -1);
|
int clip_skip = j.value("clip_skip", -1);
|
||||||
@ -883,16 +883,15 @@ int main(int argc, const char** argv) {
|
|||||||
|
|
||||||
enum scheduler_t scheduler = str_to_scheduler(scheduler_name.c_str());
|
enum scheduler_t scheduler = str_to_scheduler(scheduler_name.c_str());
|
||||||
|
|
||||||
// avoid excessive resource usage
|
SDGenerationParams gen_params = default_gen_params;
|
||||||
|
gen_params.prompt = prompt;
|
||||||
SDGenerationParams gen_params = default_gen_params;
|
gen_params.negative_prompt = negative_prompt;
|
||||||
gen_params.prompt = prompt;
|
gen_params.width = width;
|
||||||
gen_params.negative_prompt = negative_prompt;
|
gen_params.height = height;
|
||||||
gen_params.width = width;
|
gen_params.seed = seed;
|
||||||
gen_params.height = height;
|
gen_params.sample_params.sample_steps = steps;
|
||||||
gen_params.seed = seed;
|
gen_params.batch_count = batch_size;
|
||||||
gen_params.sample_params.sample_steps = steps;
|
gen_params.sample_params.guidance.txt_cfg = cfg_scale;
|
||||||
gen_params.batch_count = batch_size;
|
|
||||||
|
|
||||||
if (clip_skip > 0) {
|
if (clip_skip > 0) {
|
||||||
gen_params.clip_skip = clip_skip;
|
gen_params.clip_skip = clip_skip;
|
||||||
|
|||||||
@ -767,7 +767,7 @@ __STATIC_INLINE__ ggml_tensor* ggml_ext_silu_act(ggml_context* ctx, ggml_tensor*
|
|||||||
return x;
|
return x;
|
||||||
}
|
}
|
||||||
|
|
||||||
typedef std::function<void(ggml_tensor*, ggml_tensor*, bool)> on_tile_process;
|
typedef std::function<bool(ggml_tensor*, ggml_tensor*, bool)> on_tile_process;
|
||||||
|
|
||||||
__STATIC_INLINE__ void sd_tiling_calc_tiles(int& num_tiles_dim,
|
__STATIC_INLINE__ void sd_tiling_calc_tiles(int& num_tiles_dim,
|
||||||
float& tile_overlap_factor_dim,
|
float& tile_overlap_factor_dim,
|
||||||
@ -918,12 +918,15 @@ __STATIC_INLINE__ void sd_tiling_non_square(ggml_tensor* input,
|
|||||||
|
|
||||||
int64_t t1 = ggml_time_ms();
|
int64_t t1 = ggml_time_ms();
|
||||||
ggml_ext_tensor_split_2d(input, input_tile, x_in, y_in);
|
ggml_ext_tensor_split_2d(input, input_tile, x_in, y_in);
|
||||||
on_processing(input_tile, output_tile, false);
|
if (on_processing(input_tile, output_tile, false)) {
|
||||||
ggml_ext_tensor_merge_2d(output_tile, output, x_out, y_out, overlap_x_out, overlap_y_out, dx, dy);
|
ggml_ext_tensor_merge_2d(output_tile, output, x_out, y_out, overlap_x_out, overlap_y_out, dx, dy);
|
||||||
|
|
||||||
int64_t t2 = ggml_time_ms();
|
int64_t t2 = ggml_time_ms();
|
||||||
last_time = (t2 - t1) / 1000.0f;
|
last_time = (t2 - t1) / 1000.0f;
|
||||||
pretty_progress(tile_count, num_tiles, last_time);
|
pretty_progress(tile_count, num_tiles, last_time);
|
||||||
|
} else {
|
||||||
|
LOG_ERROR("Failed to process patch %d at (%d, %d)", tile_count, x, y);
|
||||||
|
}
|
||||||
tile_count++;
|
tile_count++;
|
||||||
}
|
}
|
||||||
last_x = false;
|
last_x = false;
|
||||||
|
|||||||
@ -1558,7 +1558,7 @@ public:
|
|||||||
if (vae_tiling_params.enabled) {
|
if (vae_tiling_params.enabled) {
|
||||||
// split latent in 32x32 tiles and compute in several steps
|
// split latent in 32x32 tiles and compute in several steps
|
||||||
auto on_tiling = [&](ggml_tensor* in, ggml_tensor* out, bool init) {
|
auto on_tiling = [&](ggml_tensor* in, ggml_tensor* out, bool init) {
|
||||||
first_stage_model->compute(n_threads, in, true, &out, nullptr);
|
return first_stage_model->compute(n_threads, in, true, &out, nullptr);
|
||||||
};
|
};
|
||||||
silent_tiling(latents, result, get_vae_scale_factor(), 32, 0.5f, on_tiling);
|
silent_tiling(latents, result, get_vae_scale_factor(), 32, 0.5f, on_tiling);
|
||||||
|
|
||||||
@ -1577,7 +1577,7 @@ public:
|
|||||||
if (vae_tiling_params.enabled) {
|
if (vae_tiling_params.enabled) {
|
||||||
// split latent in 64x64 tiles and compute in several steps
|
// split latent in 64x64 tiles and compute in several steps
|
||||||
auto on_tiling = [&](ggml_tensor* in, ggml_tensor* out, bool init) {
|
auto on_tiling = [&](ggml_tensor* in, ggml_tensor* out, bool init) {
|
||||||
tae_first_stage->compute(n_threads, in, true, &out, nullptr);
|
return tae_first_stage->compute(n_threads, in, true, &out, nullptr);
|
||||||
};
|
};
|
||||||
silent_tiling(latents, result, get_vae_scale_factor(), 64, 0.5f, on_tiling);
|
silent_tiling(latents, result, get_vae_scale_factor(), 64, 0.5f, on_tiling);
|
||||||
} else {
|
} else {
|
||||||
@ -2546,7 +2546,7 @@ public:
|
|||||||
LOG_DEBUG("VAE Tile size: %dx%d", tile_size_x, tile_size_y);
|
LOG_DEBUG("VAE Tile size: %dx%d", tile_size_x, tile_size_y);
|
||||||
|
|
||||||
auto on_tiling = [&](ggml_tensor* in, ggml_tensor* out, bool init) {
|
auto on_tiling = [&](ggml_tensor* in, ggml_tensor* out, bool init) {
|
||||||
first_stage_model->compute(n_threads, in, false, &out, work_ctx);
|
return first_stage_model->compute(n_threads, in, false, &out, work_ctx);
|
||||||
};
|
};
|
||||||
sd_tiling_non_square(x, result, vae_scale_factor, tile_size_x, tile_size_y, tile_overlap, on_tiling);
|
sd_tiling_non_square(x, result, vae_scale_factor, tile_size_x, tile_size_y, tile_overlap, on_tiling);
|
||||||
} else {
|
} else {
|
||||||
@ -2557,7 +2557,7 @@ public:
|
|||||||
if (vae_tiling_params.enabled && !encode_video) {
|
if (vae_tiling_params.enabled && !encode_video) {
|
||||||
// split latent in 32x32 tiles and compute in several steps
|
// split latent in 32x32 tiles and compute in several steps
|
||||||
auto on_tiling = [&](ggml_tensor* in, ggml_tensor* out, bool init) {
|
auto on_tiling = [&](ggml_tensor* in, ggml_tensor* out, bool init) {
|
||||||
tae_first_stage->compute(n_threads, in, false, &out, nullptr);
|
return tae_first_stage->compute(n_threads, in, false, &out, nullptr);
|
||||||
};
|
};
|
||||||
sd_tiling(x, result, vae_scale_factor, 64, 0.5f, on_tiling);
|
sd_tiling(x, result, vae_scale_factor, 64, 0.5f, on_tiling);
|
||||||
} else {
|
} else {
|
||||||
@ -2675,11 +2675,15 @@ public:
|
|||||||
|
|
||||||
// split latent in 32x32 tiles and compute in several steps
|
// split latent in 32x32 tiles and compute in several steps
|
||||||
auto on_tiling = [&](ggml_tensor* in, ggml_tensor* out, bool init) {
|
auto on_tiling = [&](ggml_tensor* in, ggml_tensor* out, bool init) {
|
||||||
first_stage_model->compute(n_threads, in, true, &out, nullptr);
|
return first_stage_model->compute(n_threads, in, true, &out, nullptr);
|
||||||
};
|
};
|
||||||
sd_tiling_non_square(x, result, vae_scale_factor, tile_size_x, tile_size_y, tile_overlap, on_tiling);
|
sd_tiling_non_square(x, result, vae_scale_factor, tile_size_x, tile_size_y, tile_overlap, on_tiling);
|
||||||
} else {
|
} else {
|
||||||
first_stage_model->compute(n_threads, x, true, &result, work_ctx);
|
if(!first_stage_model->compute(n_threads, x, true, &result, work_ctx)){
|
||||||
|
LOG_ERROR("Failed to decode latetnts");
|
||||||
|
first_stage_model->free_compute_buffer();
|
||||||
|
return nullptr;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
first_stage_model->free_compute_buffer();
|
first_stage_model->free_compute_buffer();
|
||||||
process_vae_output_tensor(result);
|
process_vae_output_tensor(result);
|
||||||
@ -2687,11 +2691,15 @@ public:
|
|||||||
if (vae_tiling_params.enabled) {
|
if (vae_tiling_params.enabled) {
|
||||||
// split latent in 64x64 tiles and compute in several steps
|
// split latent in 64x64 tiles and compute in several steps
|
||||||
auto on_tiling = [&](ggml_tensor* in, ggml_tensor* out, bool init) {
|
auto on_tiling = [&](ggml_tensor* in, ggml_tensor* out, bool init) {
|
||||||
tae_first_stage->compute(n_threads, in, true, &out);
|
return tae_first_stage->compute(n_threads, in, true, &out);
|
||||||
};
|
};
|
||||||
sd_tiling(x, result, vae_scale_factor, 64, 0.5f, on_tiling);
|
sd_tiling(x, result, vae_scale_factor, 64, 0.5f, on_tiling);
|
||||||
} else {
|
} else {
|
||||||
tae_first_stage->compute(n_threads, x, true, &result);
|
if(!tae_first_stage->compute(n_threads, x, true, &result)){
|
||||||
|
LOG_ERROR("Failed to decode latetnts");
|
||||||
|
tae_first_stage->free_compute_buffer();
|
||||||
|
return nullptr;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
tae_first_stage->free_compute_buffer();
|
tae_first_stage->free_compute_buffer();
|
||||||
}
|
}
|
||||||
@ -3461,6 +3469,7 @@ sd_image_t* generate_image_internal(sd_ctx_t* sd_ctx,
|
|||||||
ggml_free(work_ctx);
|
ggml_free(work_ctx);
|
||||||
return nullptr;
|
return nullptr;
|
||||||
}
|
}
|
||||||
|
memset(result_images, 0, batch_count * sizeof(sd_image_t));
|
||||||
|
|
||||||
for (size_t i = 0; i < decoded_images.size(); i++) {
|
for (size_t i = 0; i < decoded_images.size(); i++) {
|
||||||
result_images[i].width = width;
|
result_images[i].width = width;
|
||||||
|
|||||||
@ -89,7 +89,7 @@ struct UpscalerGGML {
|
|||||||
|
|
||||||
ggml_tensor* upscaled = ggml_new_tensor_4d(upscale_ctx, GGML_TYPE_F32, output_width, output_height, 3, 1);
|
ggml_tensor* upscaled = ggml_new_tensor_4d(upscale_ctx, GGML_TYPE_F32, output_width, output_height, 3, 1);
|
||||||
auto on_tiling = [&](ggml_tensor* in, ggml_tensor* out, bool init) {
|
auto on_tiling = [&](ggml_tensor* in, ggml_tensor* out, bool init) {
|
||||||
esrgan_upscaler->compute(n_threads, in, &out);
|
return esrgan_upscaler->compute(n_threads, in, &out);
|
||||||
};
|
};
|
||||||
int64_t t0 = ggml_time_ms();
|
int64_t t0 = ggml_time_ms();
|
||||||
sd_tiling(input_image_tensor, upscaled, esrgan_upscaler->scale, esrgan_upscaler->tile_size, 0.25f, on_tiling);
|
sd_tiling(input_image_tensor, upscaled, esrgan_upscaler->scale, esrgan_upscaler->tile_size, 0.25f, on_tiling);
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user