stable-diffusion.cpp/src/util.cpp

#include "util.h"
#include <algorithm>
#include <cmath>
#include <codecvt>
#include <cstdarg>
#include <fstream>
#include <locale>
#include <regex>
#include <sstream>
#include <string>
#include <thread>
#include <unordered_set>
#include <vector>
#include "preprocessing.hpp"

#if defined(__APPLE__) && defined(__MACH__)
#include <sys/sysctl.h>
#include <sys/types.h>
#endif

#if !defined(_WIN32)
#include <sys/ioctl.h>
#include <unistd.h>
#endif

#include "ggml-backend.h"
#include "ggml.h"
#include "ggml_extend_backend.hpp"
#include "stable-diffusion.h"

bool ends_with(const std::string& str, const std::string& ending) {
    if (str.length() >= ending.length()) {
        return (str.compare(str.length() - ending.length(), ending.length(), ending) == 0);
    } else {
        return false;
    }
}

bool starts_with(const std::string& str, const std::string& start) {
    if (str.find(start) == 0) {
        return true;
    }
    return false;
}

bool contains(const std::string& str, const std::string& substr) {
    if (str.find(substr) != std::string::npos) {
        return true;
    }
    return false;
}

void replace_all_chars(std::string& str, char target, char replacement) {
    for (size_t i = 0; i < str.length(); ++i) {
        if (str[i] == target) {
            str[i] = replacement;
        }
    }
}

std::string sd_format(const char* fmt, ...) {
    va_list ap;
    va_list ap2;
    va_start(ap, fmt);
    va_copy(ap2, ap);
    int size = vsnprintf(nullptr, 0, fmt, ap);
    std::vector<char> buf(size + 1);
    int size2 = vsnprintf(buf.data(), size + 1, fmt, ap2);
    va_end(ap2);
    va_end(ap);
    return std::string(buf.data(), size);
}

int round_up_to(int value, int base) {
    if (base <= 0) {
        return value;
    }
    if (value % base == 0) {
        return value;
    } else {
        return ((value / base) + 1) * base;
    }
}

#ifdef _WIN32  // code for windows
#define NOMINMAX
#include <windows.h>

bool file_exists(const std::string& filename) {
    DWORD attributes = GetFileAttributesA(filename.c_str());
    return (attributes != INVALID_FILE_ATTRIBUTES && !(attributes & FILE_ATTRIBUTE_DIRECTORY));
}

bool is_directory(const std::string& path) {
    DWORD attributes = GetFileAttributesA(path.c_str());
    return (attributes != INVALID_FILE_ATTRIBUTES && (attributes & FILE_ATTRIBUTE_DIRECTORY));
}

class MmapWrapperImpl : public MmapWrapper {
public:
    MmapWrapperImpl(void* data, size_t size, HANDLE hfile, HANDLE hmapping)
        : MmapWrapper(data, size), hfile_(hfile), hmapping_(hmapping) {}

    ~MmapWrapperImpl() override {
        UnmapViewOfFile(data_);
        CloseHandle(hmapping_);
        CloseHandle(hfile_);
    }

private:
    HANDLE hfile_;
    HANDLE hmapping_;
};

std::unique_ptr<MmapWrapper> MmapWrapper::create(const std::string& filename) {
    void* mapped_data = nullptr;
    size_t file_size  = 0;

    HANDLE file_handle = CreateFileA(
        filename.c_str(),
        GENERIC_READ,
        FILE_SHARE_READ,
        nullptr,
        OPEN_EXISTING,
        FILE_ATTRIBUTE_NORMAL,
        nullptr);

    if (file_handle == INVALID_HANDLE_VALUE) {
        return nullptr;
    }

    LARGE_INTEGER size;
    if (!GetFileSizeEx(file_handle, &size)) {
        CloseHandle(file_handle);
        return nullptr;
    }

    file_size = static_cast<size_t>(size.QuadPart);

    HANDLE mapping_handle = CreateFileMapping(file_handle, nullptr, PAGE_READONLY, 0, 0, nullptr);

    if (mapping_handle == nullptr) {
        CloseHandle(file_handle);
        return nullptr;
    }

    mapped_data = MapViewOfFile(mapping_handle, FILE_MAP_READ, 0, 0, file_size);

    if (mapped_data == nullptr) {
        CloseHandle(mapping_handle);
        CloseHandle(file_handle);
        return nullptr;
    }

    return std::make_unique<MmapWrapperImpl>(mapped_data, file_size, file_handle, mapping_handle);
}

#else  // Unix
#include <dirent.h>
#include <fcntl.h>
#include <sys/mman.h>
#include <sys/stat.h>
#include <unistd.h>

bool file_exists(const std::string& filename) {
    struct stat buffer;
    return (stat(filename.c_str(), &buffer) == 0 && S_ISREG(buffer.st_mode));
}

bool is_directory(const std::string& path) {
    struct stat buffer;
    return (stat(path.c_str(), &buffer) == 0 && S_ISDIR(buffer.st_mode));
}

class MmapWrapperImpl : public MmapWrapper {
public:
    MmapWrapperImpl(void* data, size_t size)
        : MmapWrapper(data, size) {}

    ~MmapWrapperImpl() override {
        munmap(data_, size_);
    }
};

std::unique_ptr<MmapWrapper> MmapWrapper::create(const std::string& filename) {
    int file_descriptor = open(filename.c_str(), O_RDONLY);
    if (file_descriptor == -1) {
        return nullptr;
    }

    int mmap_flags = MAP_PRIVATE;

#ifdef __linux__
    // performance flags used by llama.cpp
    // posix_fadvise(file_descriptor, 0, 0, POSIX_FADV_SEQUENTIAL);
    // mmap_flags |= MAP_POPULATE;
#endif

    struct stat sb;
    if (fstat(file_descriptor, &sb) == -1) {
        close(file_descriptor);
        return nullptr;
    }

    size_t file_size = sb.st_size;

    void* mapped_data = mmap(nullptr, file_size, PROT_READ, mmap_flags, file_descriptor, 0);

    close(file_descriptor);

    if (mapped_data == MAP_FAILED) {
        return nullptr;
    }

#ifdef __linux__
    // performance flags used by llama.cpp
    // posix_madvise(mapped_data, file_size, POSIX_MADV_WILLNEED);
#endif

    return std::make_unique<MmapWrapperImpl>(mapped_data, file_size);
}

#endif

bool MmapWrapper::copy_data(void* buf, size_t n, size_t offset) const {
    if (offset >= size_ || n > (size_ - offset)) {
        return false;
    }
    std::memcpy(buf, data() + offset, n);
    return true;
}

// get_num_physical_cores is copy from
// https://github.com/ggerganov/llama.cpp/blob/master/examples/common.cpp
// LICENSE: https://github.com/ggerganov/llama.cpp/blob/master/LICENSE
int32_t sd_get_num_physical_cores() {
#ifdef __linux__
    // enumerate the set of thread siblings, num entries is num cores
    std::unordered_set<std::string> siblings;
    for (uint32_t cpu = 0; cpu < UINT32_MAX; ++cpu) {
        std::ifstream thread_siblings("/sys/devices/system/cpu" + std::to_string(cpu) + "/topology/thread_siblings");
        if (!thread_siblings.is_open()) {
            break;  // no more cpus
        }
        std::string line;
        if (std::getline(thread_siblings, line)) {
            siblings.insert(line);
        }
    }
    if (siblings.size() > 0) {
        return static_cast<int32_t>(siblings.size());
    }
#elif defined(__APPLE__) && defined(__MACH__)
    int32_t num_physical_cores;
    size_t len = sizeof(num_physical_cores);
    int result = sysctlbyname("hw.perflevel0.physicalcpu", &num_physical_cores, &len, nullptr, 0);
    if (result == 0) {
        return num_physical_cores;
    }
    result = sysctlbyname("hw.physicalcpu", &num_physical_cores, &len, nullptr, 0);
    if (result == 0) {
        return num_physical_cores;
    }
#elif defined(_WIN32)
    // TODO: Implement
#endif
    unsigned int n_threads = std::thread::hardware_concurrency();
    return n_threads > 0 ? (n_threads <= 4 ? n_threads : n_threads / 2) : 4;
}

static sd_progress_cb_t sd_progress_cb = nullptr;
void* sd_progress_cb_data              = nullptr;

static sd_preview_cb_t sd_preview_cb = nullptr;
static void* sd_preview_cb_data      = nullptr;
preview_t sd_preview_mode            = PREVIEW_NONE;
int sd_preview_interval              = 1;
bool sd_preview_denoised             = true;
bool sd_preview_noisy                = false;

std::u32string utf8_to_utf32(const std::string& utf8_str) {
    std::wstring_convert<std::codecvt_utf8<char32_t>, char32_t> converter;
    return converter.from_bytes(utf8_str);
}

std::string utf32_to_utf8(const std::u32string& utf32_str) {
    std::wstring_convert<std::codecvt_utf8<char32_t>, char32_t> converter;
    return converter.to_bytes(utf32_str);
}

std::u32string unicode_value_to_utf32(int unicode_value) {
    std::u32string utf32_string = {static_cast<char32_t>(unicode_value)};
    return utf32_string;
}

static std::string sd_basename(const std::string& path) {
    size_t pos = path.find_last_of('/');
    if (pos != std::string::npos) {
        return path.substr(pos + 1);
    }
    pos = path.find_last_of('\\');
    if (pos != std::string::npos) {
        return path.substr(pos + 1);
    }
    return path;
}

std::string path_join(const std::string& p1, const std::string& p2) {
    if (p1.empty()) {
        return p2;
    }

    if (p2.empty()) {
        return p1;
    }

    if (p1[p1.length() - 1] == '/' || p1[p1.length() - 1] == '\\') {
        return p1 + p2;
    }

    return p1 + "/" + p2;
}

std::vector<std::string> split_string(const std::string& str, char delimiter) {
    std::vector<std::string> result;
    size_t start = 0;
    size_t end   = str.find(delimiter);

    while (end != std::string::npos) {
        result.push_back(str.substr(start, end - start));
        start = end + 1;
        end   = str.find(delimiter, start);
    }

    // Add the last segment after the last delimiter
    result.push_back(str.substr(start));

    return result;
}

static std::string build_progress_bar(int step, int steps) {
    std::string progress = "  |";
    int max_progress     = 50;
    int32_t current      = 0;
    if (steps > 0) {
        current = (int32_t)(step * 1.f * max_progress / steps);
    }
    for (int i = 0; i < 50; i++) {
        if (i > current) {
            progress += " ";
        } else if (i == current && i != max_progress - 1) {
            progress += ">";
        } else {
            progress += "=";
        }
    }
    progress += "|";
    return progress;
}

static void print_progress_line(int step, int steps, const std::string& speed_text) {
    if (step == 0) {
        return;
    }
    std::string progress = build_progress_bar(step, steps);
    const char* lf       = (step == steps ? "\n" : "");
    printf("\r%s %i/%i - %s\033[K%s", progress.c_str(), step, steps, speed_text.c_str(), lf);
    fflush(stdout);  // for linux
}

void pretty_progress(int step, int steps, float time) {
    if (sd_progress_cb) {
        sd_progress_cb(step, steps, time, sd_progress_cb_data);
        return;
    }
    if (step == 0) {
        return;
    }
    const char* unit = "s/it";
    float speed      = time;
    if (speed < 1.0f && speed > 0.f) {
        speed = 1.0f / speed;
        unit  = "it/s";
    }
    print_progress_line(step, steps, sd_format("%.2f%s", speed, unit));
}

void pretty_bytes_progress(int step, int steps, uint64_t bytes_processed, float elapsed_seconds) {
    if (sd_progress_cb) {
        float time = elapsed_seconds / (step + 1e-6f);
        sd_progress_cb(step, steps, time, sd_progress_cb_data);
        return;
    }
    if (step == 0) {
        return;
    }

    double bytes_per_second = 0.0;
    if (elapsed_seconds > 0.0f) {
        bytes_per_second = bytes_processed / (double)elapsed_seconds;
    }

    double speed_mb = bytes_per_second / (1024.0 * 1024.0);
    if (speed_mb >= 1024.0) {
        print_progress_line(step, steps, sd_format("%.2fGB/s", speed_mb / 1024.0));
    } else {
        print_progress_line(step, steps, sd_format("%.2fMB/s", speed_mb));
    }
}

std::string ltrim(const std::string& s) {
    auto it = std::find_if(s.begin(), s.end(), [](int ch) {
        return !std::isspace(ch);
    });
    return std::string(it, s.end());
}

std::string rtrim(const std::string& s) {
    auto it = std::find_if(s.rbegin(), s.rend(), [](int ch) {
        return !std::isspace(ch);
    });
    return std::string(s.begin(), it.base());
}

std::string trim(const std::string& s) {
    return rtrim(ltrim(s));
}

static sd_log_cb_t sd_log_cb = nullptr;
void* sd_log_cb_data         = nullptr;

#define LOG_BUFFER_SIZE 4096

void log_printf(sd_log_level_t level, const char* file, int line, const char* format, ...) {
    va_list args;
    va_start(args, format);

    static char log_buffer[LOG_BUFFER_SIZE + 1];
    int written = snprintf(log_buffer, LOG_BUFFER_SIZE, "%s:%-4d - ", sd_basename(file).c_str(), line);

    if (written >= 0 && written < LOG_BUFFER_SIZE) {
        vsnprintf(log_buffer + written, LOG_BUFFER_SIZE - written, format, args);
    }
    size_t len = strlen(log_buffer);
    if (log_buffer[len - 1] != '\n') {
        strncat(log_buffer, "\n", LOG_BUFFER_SIZE - len);
    }

    if (sd_log_cb) {
        sd_log_cb(level, log_buffer, sd_log_cb_data);
    }

    va_end(args);
}

void sd_set_log_callback(sd_log_cb_t cb, void* data) {
    sd_log_cb      = cb;
    sd_log_cb_data = data;
}
void sd_set_progress_callback(sd_progress_cb_t cb, void* data) {
    sd_progress_cb      = cb;
    sd_progress_cb_data = data;
}
void sd_set_preview_callback(sd_preview_cb_t cb, preview_t mode, int interval, bool denoised, bool noisy, void* data) {
    sd_preview_cb       = cb;
    sd_preview_cb_data  = data;
    sd_preview_mode     = mode;
    sd_preview_interval = interval;
    sd_preview_denoised = denoised;
    sd_preview_noisy    = noisy;
}

sd_preview_cb_t sd_get_preview_callback() {
    return sd_preview_cb;
}
void* sd_get_preview_callback_data() {
    return sd_preview_cb_data;
}

preview_t sd_get_preview_mode() {
    return sd_preview_mode;
}
int sd_get_preview_interval() {
    return sd_preview_interval;
}
bool sd_should_preview_denoised() {
    return sd_preview_denoised;
}
bool sd_should_preview_noisy() {
    return sd_preview_noisy;
}

sd_progress_cb_t sd_get_progress_callback() {
    return sd_progress_cb;
}
void* sd_get_progress_callback_data() {
    return sd_progress_cb_data;
}

sd_image_t tensor_to_sd_image(const sd::Tensor<float>& tensor, int frame_index) {
    const auto& shape = tensor.shape();
    GGML_ASSERT(shape.size() == 4 || shape.size() == 5);
    int width     = static_cast<int>(shape[0]);
    int height    = static_cast<int>(shape[1]);
    int channel   = static_cast<int>(shape[shape.size() == 5 ? 3 : 2]);
    uint8_t* data = (uint8_t*)malloc(static_cast<size_t>(width * height * channel));
    GGML_ASSERT(data != nullptr);
    preprocessing_tensor_frame_to_sd_image(tensor, frame_index, data);
    return {
        static_cast<uint32_t>(width),
        static_cast<uint32_t>(height),
        static_cast<uint32_t>(channel),
        data,
    };
}

sd::Tensor<float> sd_image_to_tensor(sd_image_t image,
                                     int target_width,
                                     int target_height,
                                     bool scale) {
    sd::Tensor<float> tensor = sd::zeros<float>({static_cast<int64_t>(image.width),
                                                 static_cast<int64_t>(image.height),
                                                 static_cast<int64_t>(image.channel),
                                                 1});
    for (uint32_t iw = 0; iw < image.width; ++iw) {
        for (uint32_t ih = 0; ih < image.height; ++ih) {
            for (uint32_t ic = 0; ic < image.channel; ++ic) {
                tensor.index(iw, ih, ic, 0) = sd_image_get_f32(image, iw, ih, ic, scale);
            }
        }
    }
    if (target_width >= 0 && target_height >= 0 &&
        (tensor.shape()[0] != target_width || tensor.shape()[1] != target_height)) {
        tensor = sd::ops::interpolate(tensor,
                                      {target_width,
                                       target_height,
                                       tensor.shape()[2],
                                       tensor.shape()[3]});
    }
    return tensor;
}

// Constants for means and std
float means[3] = {0.48145466f, 0.4578275f, 0.40821073f};
float stds[3]  = {0.26862954f, 0.26130258f, 0.27577711f};

sd::Tensor<float> clip_preprocess(const sd::Tensor<float>& image, int target_width, int target_height) {
    GGML_ASSERT(image.dim() == 4);
    GGML_ASSERT(image.shape()[2] == 3);
    GGML_ASSERT(image.shape()[3] == 1);
    GGML_ASSERT(target_width > 0 && target_height > 0);

    float width_scale  = static_cast<float>(target_width) / static_cast<float>(image.shape()[0]);
    float height_scale = static_cast<float>(target_height) / static_cast<float>(image.shape()[1]);
    float scale        = std::fmax(width_scale, height_scale);

    int64_t resized_width  = static_cast<int64_t>(scale * static_cast<float>(image.shape()[0]));
    int64_t resized_height = static_cast<int64_t>(scale * static_cast<float>(image.shape()[1]));

    sd::Tensor<float> resized = sd::ops::interpolate(
        image,
        {resized_width, resized_height, image.shape()[2], image.shape()[3]});

    int64_t h_offset = std::max<int64_t>((resized_height - target_height) / 2, 0);
    int64_t w_offset = std::max<int64_t>((resized_width - target_width) / 2, 0);

    sd::Tensor<float> cropped({target_width, target_height, image.shape()[2], image.shape()[3]});
    for (int64_t y = 0; y < target_height; ++y) {
        for (int64_t x = 0; x < target_width; ++x) {
            for (int64_t c = 0; c < image.shape()[2]; ++c) {
                cropped.index(x, y, c, 0) = resized.index(x + w_offset, y + h_offset, c, 0);
            }
        }
    }

    sd::Tensor<float> normalized = sd::ops::clamp(cropped, 0.0f, 1.0f);
    sd::Tensor<float> mean({1, 1, 3, 1}, {means[0], means[1], means[2]});
    sd::Tensor<float> std({1, 1, 3, 1}, {stds[0], stds[1], stds[2]});
    return (normalized - mean) / std;
}

// Ref: https://github.com/AUTOMATIC1111/stable-diffusion-webui/blob/cad87bf4e3e0b0a759afa94e933527c3123d59bc/modules/prompt_parser.py#L345
//
// Parses a string with attention tokens and returns a list of pairs: text and its associated weight.
// Accepted tokens are:
//   (abc) - increases attention to abc by a multiplier of 1.1
//   (abc:3.12) - increases attention to abc by a multiplier of 3.12
//   [abc] - decreases attention to abc by a multiplier of 1.1
//   BREAK - separates the prompt into conceptually distinct parts for sequential processing
//   B - internal helper pattern; prevents 'B' in 'BREAK' from being consumed as normal text
//   \( - literal character '('
//   \[ - literal character '['
//   \) - literal character ')'
//   \] - literal character ']'
//   \\ - literal character '\'
//   anything else - just text
//
// >>> parse_prompt_attention('normal text')
// [['normal text', 1.0]]
// >>> parse_prompt_attention('an (important) word')
// [['an ', 1.0], ['important', 1.1], [' word', 1.0]]
// >>> parse_prompt_attention('(unbalanced')
// [['unbalanced', 1.1]]
// >>> parse_prompt_attention('\(literal\]')
// [['(literal]', 1.0]]
// >>> parse_prompt_attention('(unnecessary)(parens)')
// [['unnecessaryparens', 1.1]]
// >>> parse_prompt_attention('a (((house:1.3)) [on] a (hill:0.5), sun, (((sky))).')
// [['a ', 1.0],
//  ['house', 1.5730000000000004],
//  [' ', 1.1],
//  ['on', 1.0],
//  [' a ', 1.1],
//  ['hill', 0.55],
//  [', sun, ', 1.1],
//  ['sky', 1.4641000000000006],
//  ['.', 1.1]]
std::vector<std::pair<std::string, float>> parse_prompt_attention(const std::string& text) {
    std::vector<std::pair<std::string, float>> res;
    std::vector<int> round_brackets;
    std::vector<int> square_brackets;

    float round_bracket_multiplier  = 1.1f;
    float square_bracket_multiplier = 1 / 1.1f;

    std::regex re_attention(R"(\\\(|\\\)|\\\[|\\\]|\\\\|\\|\(|\[|:([+-]?[.\d]+)\)|\)|\]|\bBREAK\b|[^\\()\[\]:B]+|:|\bB)");
    std::regex re_break(R"(\s*\bBREAK\b\s*)");

    auto multiply_range = [&](int start_position, float multiplier) {
        for (int p = start_position; p < res.size(); ++p) {
            res[p].second *= multiplier;
        }
    };

    std::smatch m, m2;
    std::string remaining_text = text;

    while (std::regex_search(remaining_text, m, re_attention)) {
        std::string text   = m[0];
        std::string weight = m[1];

        if (text == "(") {
            round_brackets.push_back((int)res.size());
        } else if (text == "[") {
            square_brackets.push_back((int)res.size());
        } else if (!weight.empty()) {
            if (!round_brackets.empty()) {
                multiply_range(round_brackets.back(), std::stof(weight));
                round_brackets.pop_back();
            }
        } else if (text == ")" && !round_brackets.empty()) {
            multiply_range(round_brackets.back(), round_bracket_multiplier);
            round_brackets.pop_back();
        } else if (text == "]" && !square_brackets.empty()) {
            multiply_range(square_brackets.back(), square_bracket_multiplier);
            square_brackets.pop_back();
        } else if (text == "\\(") {
            res.push_back({text.substr(1), 1.0f});
        } else if (std::regex_search(text, m2, re_break)) {
            res.push_back({"BREAK", -1.0f});
        } else {
            res.push_back({text, 1.0f});
        }

        remaining_text = m.suffix();
    }

    for (int pos : round_brackets) {
        multiply_range(pos, round_bracket_multiplier);
    }

    for (int pos : square_brackets) {
        multiply_range(pos, square_bracket_multiplier);
    }

    if (res.empty()) {
        res.push_back({"", 1.0f});
    }

    int i = 0;
    while (i + 1 < res.size()) {
        if (res[i].second == res[i + 1].second) {
            res[i].first += res[i + 1].first;
            res.erase(res.begin() + i + 1);
        } else {
            ++i;
        }
    }

    return res;
}

// test if the backend is a specific one, e.g. "CUDA", "ROCm", "Vulkan" etc.
bool sd_backend_is(ggml_backend_t backend, const std::string& name) {
    if (!backend) {
        return false;
    }
    ggml_backend_dev_t dev = ggml_backend_get_device(backend);
    if (!dev)
        return false;
    std::string dev_name = ggml_backend_dev_name(dev);
    return dev_name.find(name) != std::string::npos;
}

ggml_backend_t sd_get_default_backend() {
    ggml_backend_load_all_once();
    static std::once_flag once;
    std::call_once(once, []() {
        size_t dev_count = ggml_backend_dev_count();
        if (dev_count == 0) {
            LOG_ERROR("No devices found!");
        } else {
            LOG_DEBUG("Found %zu backend devices:", dev_count);
            for (size_t i = 0; i < dev_count; ++i) {
                auto dev = ggml_backend_dev_get(i);
                LOG_DEBUG("#%zu: %s", i, ggml_backend_dev_name(dev));
            }
        }
    });
    ggml_backend_t backend   = nullptr;
    const char* SD_VK_DEVICE = getenv("SD_VK_DEVICE");
    if (SD_VK_DEVICE != nullptr) {
        std::string sd_vk_device_str = SD_VK_DEVICE;
        try {
            unsigned long long device  = std::stoull(sd_vk_device_str);
            std::string vk_device_name = "Vulkan" + std::to_string(device);
            if (backend_name_exists(vk_device_name)) {
                LOG_INFO("Selecting %s as main device by env var SD_VK_DEVICE", vk_device_name.c_str());
                backend = init_named_backend(vk_device_name);
                if (!backend) {
                    LOG_WARN("Device %s requested by SD_VK_DEVICE failed to init. Falling back to the default device.", vk_device_name.c_str());
                }
            } else {
                LOG_WARN("Device %s requested by SD_VK_DEVICE was not found. Falling back to the default device.", vk_device_name.c_str());
            }
        } catch (const std::invalid_argument&) {
            LOG_WARN("SD_VK_DEVICE environment variable is not a valid integer (%s). Falling back to the default device.", SD_VK_DEVICE);
        } catch (const std::out_of_range&) {
            LOG_WARN("SD_VK_DEVICE environment variable value is out of range for `unsigned long long` type (%s). Falling back to the default device.", SD_VK_DEVICE);
        }
    }

    if (!backend) {
        std::string dev_name = get_default_backend_name();
        backend              = init_named_backend(dev_name);
        if (!backend && !dev_name.empty()) {
            LOG_WARN("device %s failed to init", dev_name.c_str());
        }
    }

    if (!backend) {
        LOG_WARN("loading CPU backend");
        backend = ggml_backend_cpu_init();
    }

    if (ggml_backend_is_cpu(backend)) {
        LOG_DEBUG("Using CPU backend");
    }

    return backend;
}

// namespace is needed to avoid conflicts with ggml_backend_extend.hpp
namespace ggml_cpu {
#include "ggml-cpu.h"
}

const char* sd_get_system_info() {
    using namespace ggml_cpu;
    static char buffer[1024];
    std::stringstream ss;
    ss << "System Info: \n";
    ss << "    SSE3 = " << ggml_cpu_has_sse3() << " | ";
    ss << "    AVX = " << ggml_cpu_has_avx() << " | ";
    ss << "    AVX2 = " << ggml_cpu_has_avx2() << " | ";
    ss << "    AVX512 = " << ggml_cpu_has_avx512() << " | ";
    ss << "    AVX512_VBMI = " << ggml_cpu_has_avx512_vbmi() << " | ";
    ss << "    AVX512_VNNI = " << ggml_cpu_has_avx512_vnni() << " | ";
    ss << "    FMA = " << ggml_cpu_has_fma() << " | ";
    ss << "    NEON = " << ggml_cpu_has_neon() << " | ";
    ss << "    ARM_FMA = " << ggml_cpu_has_arm_fma() << " | ";
    ss << "    F16C = " << ggml_cpu_has_f16c() << " | ";
    ss << "    FP16_VA = " << ggml_cpu_has_fp16_va() << " | ";
    ss << "    WASM_SIMD = " << ggml_cpu_has_wasm_simd() << " | ";
    ss << "    VSX = " << ggml_cpu_has_vsx() << " | ";
    snprintf(buffer, sizeof(buffer), "%s", ss.str().c_str());
    return buffer;
}