// stable-diffusion.cpp/src/preprocessing.hpp

#ifndef __PREPROCESSING_HPP__
#define __PREPROCESSING_HPP__

#include <cmath>
#include <limits>

#include "ggml_extend.hpp"

// Single-precision pi used by the Gaussian kernel and the radian-to-degree
// conversion below. NOTE(review): presumably spelled with a trailing underscore
// to avoid clashing with the non-standard M_PI some <cmath> headers provide.
#define M_PI_ 3.14159265358979323846f

// Computes the flat index of element (i0, i1, i2, i3) in `tensor`'s value
// storage, with i0 as the fastest-varying index. Dimensions the tensor's shape
// does not have are treated as having extent 1. No bounds checking is done.
static inline int64_t preprocessing_offset_4d(const sd::Tensor<float>& tensor, int64_t i0, int64_t i1 = 0, int64_t i2 = 0, int64_t i3 = 0) {
    const auto& dims = tensor.shape();

    // Extent of the given axis, or 1 when the shape is shorter than that.
    const auto extent = [&dims](size_t axis) -> int64_t {
        return dims.size() > axis ? dims[axis] : 1;
    };

    // Horner-style accumulation from the slowest index down to the fastest.
    int64_t offset = i3;
    offset         = offset * extent(2) + i2;
    offset         = offset * extent(1) + i1;
    offset         = offset * extent(0) + i0;
    return offset;
}

// Reads the element at (i0, i1, i2, i3) from `tensor`. The indices are turned
// into a flat position by preprocessing_offset_4d; no bounds checking is done.
static inline float preprocessing_get_4d(const sd::Tensor<float>& tensor, int64_t i0, int64_t i1 = 0, int64_t i2 = 0, int64_t i3 = 0) {
    const int64_t flat = preprocessing_offset_4d(tensor, i0, i1, i2, i3);
    return tensor.values()[static_cast<size_t>(flat)];
}

// Stores `value` at (i0, i1, i2, i3) in `tensor`. The indices are turned into a
// flat position by preprocessing_offset_4d; no bounds checking is done.
static inline void preprocessing_set_4d(sd::Tensor<float>& tensor, float value, int64_t i0, int64_t i1 = 0, int64_t i2 = 0, int64_t i3 = 0) {
    const int64_t flat                            = preprocessing_offset_4d(tensor, i0, i1, i2, i3);
    tensor.values()[static_cast<size_t>(flat)] = value;
}

// Copies an sd_image_t into a new float tensor of shape
// [width, height, channel, 1]. Each sample is obtained through
// sd_image_get_f32(image, x, y, c) and stored at tensor index (x, y, c, 0).
static inline sd::Tensor<float> sd_image_to_preprocessing_tensor(sd_image_t image) {
    const int64_t width    = static_cast<int64_t>(image.width);
    const int64_t height   = static_cast<int64_t>(image.height);
    const int64_t channels = static_cast<int64_t>(image.channel);

    sd::Tensor<float> tensor({width, height, channels, 1});
    for (uint32_t row = 0; row < image.height; ++row) {
        for (uint32_t col = 0; col < image.width; ++col) {
            for (uint32_t ch = 0; ch < image.channel; ++ch) {
                const float sample = sd_image_get_f32(image, col, row, ch);
                preprocessing_set_4d(tensor, sample, col, row, ch, 0);
            }
        }
    }
    return tensor;
}

// Writes a [width, height, channel, 1] float tensor into an 8-bit image buffer.
//
// Each value is clamped to [0, 1], scaled by 255 and rounded. The destination
// is written pixel-interleaved: index (y * width + x) * channel + c.
//
// tensor:     must be 4-D with a last dimension of 1 (asserted).
// image_data: must be non-null (asserted) and large enough for
//             width * height * channel bytes; the size is NOT checked here.
static inline void preprocessing_tensor_to_sd_image(const sd::Tensor<float>& tensor, uint8_t* image_data) {
    GGML_ASSERT(tensor.dim() == 4);
    GGML_ASSERT(tensor.shape()[3] == 1);
    GGML_ASSERT(image_data != nullptr);

    // Keep extents and the destination index in 64 bits: narrowing to int
    // truncates large extents and overflows the index on big images.
    const int64_t width   = tensor.shape()[0];
    const int64_t height  = tensor.shape()[1];
    const int64_t channel = tensor.shape()[2];
    for (int64_t y = 0; y < height; ++y) {
        for (int64_t x = 0; x < width; ++x) {
            for (int64_t c = 0; c < channel; ++c) {
                float value      = preprocessing_get_4d(tensor, x, y, c, 0);
                value            = std::min(1.0f, std::max(0.0f, value));
                const size_t dst = static_cast<size_t>((y * width + x) * channel + c);
                image_data[dst]  = static_cast<uint8_t>(std::round(value * 255.0f));
            }
        }
    }
}

// Builds a [kernel_size, kernel_size, 1, 1] tensor sampled from a 2-D Gaussian
// with a fixed sigma of 1.4, centred on index kernel_size / 2.
// Each tap is exp(-(dx^2 + dy^2) / (2 sigma^2)) / (2 pi sigma^2); the taps are
// not re-normalised to sum to 1.
static inline sd::Tensor<float> gaussian_kernel_tensor(int kernel_size) {
    sd::Tensor<float> kernel({kernel_size, kernel_size, 1, 1});

    const int centre         = kernel_size / 2;
    const float sigma        = 1.4f;
    const float sigma_sq     = std::pow(sigma, 2.0f);
    const float two_sigma_sq = 2.0f * sigma_sq;
    const float scale        = 1.f / (2.0f * M_PI_ * sigma_sq);

    for (int row = 0; row < kernel_size; ++row) {
        const float dy = static_cast<float>(-centre + row);
        for (int col = 0; col < kernel_size; ++col) {
            const float dx     = static_cast<float>(-centre + col);
            const float weight = std::exp(-((dy * dy + dx * dx) / two_sigma_sq)) * scale;
            preprocessing_set_4d(kernel, weight, col, row, 0, 0);
        }
    }
    return kernel;
}

// Applies a single-channel 2-D kernel to every channel of `input` and returns a
// tensor with the same shape as `input`.
//
// For each output pixel (x, y) the kernel tap (kx, ky) is multiplied with the
// input sample at (x + kx - padding, y + ky - padding). The kernel is applied
// as stored (not flipped). Taps that land outside the image are skipped, which
// is equivalent to zero padding.
//
// input:   4-D tensor with last dimension 1 (asserted).
// kernel:  4-D tensor of shape [kw, kh, 1, 1] (asserted).
// padding: offset subtracted from the tap position; use kw / 2 for a centred
//          square kernel.
static inline sd::Tensor<float> convolve_tensor(const sd::Tensor<float>& input, const sd::Tensor<float>& kernel, int padding) {
    GGML_ASSERT(input.dim() == 4);
    GGML_ASSERT(kernel.dim() == 4);
    GGML_ASSERT(input.shape()[3] == 1);
    GGML_ASSERT(kernel.shape()[2] == 1);
    GGML_ASSERT(kernel.shape()[3] == 1);

    const int64_t width    = input.shape()[0];
    const int64_t height   = input.shape()[1];
    const int64_t channels = input.shape()[2];
    const int64_t kernel_w = kernel.shape()[0];
    const int64_t kernel_h = kernel.shape()[1];

    sd::Tensor<float> output(input.shape());
    for (int64_t ch = 0; ch < channels; ++ch) {
        for (int64_t out_y = 0; out_y < height; ++out_y) {
            for (int64_t out_x = 0; out_x < width; ++out_x) {
                float acc = 0.0f;
                for (int64_t ky = 0; ky < kernel_h; ++ky) {
                    const int64_t src_y = out_y + ky - padding;
                    if (src_y >= 0 && src_y < height) {
                        for (int64_t kx = 0; kx < kernel_w; ++kx) {
                            const int64_t src_x = out_x + kx - padding;
                            if (src_x >= 0 && src_x < width) {
                                acc += preprocessing_get_4d(input, src_x, src_y, ch, 0) * preprocessing_get_4d(kernel, kx, ky, 0, 0);
                            }
                        }
                    }
                }
                preprocessing_set_4d(output, acc, out_x, out_y, ch, 0);
            }
        }
    }
    return output;
}

// Converts an image tensor with at least three channels into a single-channel
// luminance tensor of shape [width, height, 1, shape[3]].
// Channels 0, 1, 2 are combined as 0.2989 * c0 + 0.5870 * c1 + 0.1140 * c2;
// further channels are ignored. Only index 0 of the last dimension is read and
// written.
static inline sd::Tensor<float> grayscale_tensor(const sd::Tensor<float>& rgb_img) {
    GGML_ASSERT(rgb_img.dim() == 4);
    GGML_ASSERT(rgb_img.shape()[2] >= 3);

    sd::Tensor<float> grayscale({rgb_img.shape()[0], rgb_img.shape()[1], 1, rgb_img.shape()[3]});

    const int64_t width  = rgb_img.shape()[0];
    const int64_t height = rgb_img.shape()[1];
    for (int64_t row = 0; row < height; ++row) {
        for (int64_t col = 0; col < width; ++col) {
            const float red   = preprocessing_get_4d(rgb_img, col, row, 0, 0);
            const float green = preprocessing_get_4d(rgb_img, col, row, 1, 0);
            const float blue  = preprocessing_get_4d(rgb_img, col, row, 2, 0);
            const float luma  = 0.2989f * red + 0.5870f * green + 0.1140f * blue;
            preprocessing_set_4d(grayscale, luma, col, row, 0, 0);
        }
    }
    return grayscale;
}

// Element-wise sqrt(x^2 + y^2) of two tensors; the shapes are validated by
// sd::tensor_check_same_shape. The result has the shape of `x`.
static inline sd::Tensor<float> tensor_hypot(const sd::Tensor<float>& x, const sd::Tensor<float>& y) {
    sd::tensor_check_same_shape(x, y);
    sd::Tensor<float> out(x.shape());

    const int64_t count = out.numel();
    for (int64_t idx = 0; idx < count; ++idx) {
        const float a = x[idx];
        const float b = y[idx];
        out[idx]      = std::sqrt(a * a + b * b);
    }
    return out;
}

// Element-wise atan2(y[i], x[i]) in radians; the shapes are validated by
// sd::tensor_check_same_shape. Note the argument order: `x` is the first
// parameter of this function but the second argument of std::atan2.
static inline sd::Tensor<float> tensor_arctan2(const sd::Tensor<float>& x, const sd::Tensor<float>& y) {
    sd::tensor_check_same_shape(x, y);
    sd::Tensor<float> out(x.shape());

    const int64_t count = out.numel();
    for (int64_t idx = 0; idx < count; ++idx) {
        const float numerator   = y[idx];
        const float denominator = x[idx];
        out[idx]                = std::atan2(numerator, denominator);
    }
    return out;
}

// Scales `*g` in place by 1 / max(*g), so the largest element becomes 1.
// The tensor is left untouched when it is empty, or when its maximum is zero or
// not finite. `g` must be non-null (asserted).
static inline void normalize_tensor(sd::Tensor<float>* g) {
    GGML_ASSERT(g != nullptr);
    sd::Tensor<float>& tensor = *g;
    if (tensor.empty()) {
        return;
    }

    float peak          = -std::numeric_limits<float>::infinity();
    const int64_t count = tensor.numel();
    for (int64_t idx = 0; idx < count; ++idx) {
        const float candidate = tensor[idx];
        if (peak < candidate) {
            peak = candidate;
        }
    }

    // Dividing by zero or by a non-finite maximum would poison every element.
    if (peak != 0.0f && std::isfinite(peak)) {
        tensor *= (1.0f / peak);
    }
}

// Non-maximum suppression step of Canny edge detection.
//
// For every interior pixel, the gradient magnitude in `G` is kept only if it is
// at least as large as both neighbours along the gradient direction given by
// `D`; otherwise the output is 0. The one-pixel border of the result stays 0.
//
// G: gradient magnitude, indexed (ix, iy, 0, 0).
// D: gradient direction in radians, same shape as G (asserted). As built by
//    preprocess_canny in this file, D = atan2(Gy, Gx), where Gx grows with ix
//    and Gy is positive when intensity grows towards smaller iy.
//
// The direction is folded into [0, 180] degrees and quantised into four bands:
//     0 deg -> neighbours (ix +/- 1, iy)
//    45 deg -> neighbours (ix + 1, iy - 1) and (ix - 1, iy + 1)
//    90 deg -> neighbours (ix, iy +/- 1)
//   135 deg -> neighbours (ix - 1, iy - 1) and (ix + 1, iy + 1)
static inline sd::Tensor<float> non_max_supression(const sd::Tensor<float>& G, const sd::Tensor<float>& D) {
    GGML_ASSERT(G.shape() == D.shape());
    sd::Tensor<float> result = sd::Tensor<float>::zeros(G.shape());

    const int64_t width  = result.shape()[0];
    const int64_t height = result.shape()[1];
    for (int64_t iy = 1; iy < height - 1; ++iy) {
        for (int64_t ix = 1; ix < width - 1; ++ix) {
            float angle = preprocessing_get_4d(D, ix, iy, 0, 0) * 180.0f / M_PI_;
            angle       = angle < 0.0f ? angle + 180.0f : angle;

            // A NaN angle matches no band, so the pixel is compared against 1.0
            // and is suppressed unless it is itself at least 1.0.
            float q = 1.0f;
            float r = 1.0f;

            if (angle >= 22.5f && angle < 67.5f) {
                // ~45 degrees
                q = preprocessing_get_4d(G, ix + 1, iy - 1, 0, 0);
                r = preprocessing_get_4d(G, ix - 1, iy + 1, 0, 0);
            } else if (angle >= 67.5f && angle < 112.5f) {
                // ~90 degrees: gradient along dim 1, compare across rows
                q = preprocessing_get_4d(G, ix, iy + 1, 0, 0);
                r = preprocessing_get_4d(G, ix, iy - 1, 0, 0);
            } else if (angle >= 112.5f && angle < 157.5f) {
                // ~135 degrees
                q = preprocessing_get_4d(G, ix - 1, iy - 1, 0, 0);
                r = preprocessing_get_4d(G, ix + 1, iy + 1, 0, 0);
            } else if (angle < 22.5f || angle >= 157.5f) {
                // ~0 / 180 degrees: gradient along dim 0, compare across columns.
                // Written as an open-ended test so a value that rounds slightly
                // past 180 is still classified.
                q = preprocessing_get_4d(G, ix + 1, iy, 0, 0);
                r = preprocessing_get_4d(G, ix - 1, iy, 0, 0);
            }

            const float cur = preprocessing_get_4d(G, ix, iy, 0, 0);
            preprocessing_set_4d(result, (cur >= q && cur >= r) ? cur : 0.0f, ix, iy, 0, 0);
        }
    }
    return result;
}

// Double-threshold and hysteresis step of Canny edge detection, done in place.
//
// 1. Thresholds are derived from the tensor maximum:
//      ht = max * high_threshold, lt = ht * low_threshold.
// 2. Every element is classified: >= ht becomes `strong`, [lt, ht) becomes
//    `weak`, anything below lt becomes 0.
// 3. Pixels of plane (.., .., 0, 0) outside 3 <= ix <= width - 3 and
//    3 <= iy <= height - 3 are cleared.
// 4. Each interior `weak` pixel is promoted to `strong` if any of its eight
//    neighbours is `strong`, otherwise cleared. The scan is in place, so a
//    promotion is visible to pixels visited later.
//
// img: must be non-null (asserted); an empty tensor is left untouched.
// If the maximum is not positive there is no gradient, so the whole tensor is
// cleared instead of letting ht == 0 mark every pixel as strong.
static inline void threshold_hystersis(sd::Tensor<float>* img, float high_threshold, float low_threshold, float weak, float strong) {
    GGML_ASSERT(img != nullptr);
    if (img->empty()) {
        return;
    }

    const int64_t count = img->numel();
    float max_value     = -std::numeric_limits<float>::infinity();
    for (int64_t i = 0; i < count; ++i) {
        max_value = std::max(max_value, (*img)[i]);
    }

    if (!(max_value > 0.0f)) {
        for (int64_t i = 0; i < count; ++i) {
            (*img)[i] = 0.0f;
        }
        return;
    }

    const float ht = max_value * high_threshold;
    const float lt = ht * low_threshold;
    for (int64_t i = 0; i < count; ++i) {
        const float img_v = (*img)[i];
        if (img_v >= ht) {
            (*img)[i] = strong;
        } else if (img_v >= lt) {
            (*img)[i] = weak;
        } else {
            // Below the low threshold: not an edge. Leaving the raw magnitude
            // here would leak it into the final image.
            (*img)[i] = 0.0f;
        }
    }

    const int64_t width  = img->shape()[0];
    const int64_t height = img->shape()[1];

    for (int64_t iy = 0; iy < height; ++iy) {
        for (int64_t ix = 0; ix < width; ++ix) {
            const bool inside = ix >= 3 && ix <= width - 3 && iy >= 3 && iy <= height - 3;
            if (!inside) {
                preprocessing_set_4d(*img, 0.0f, ix, iy, 0, 0);
            }
        }
    }

    for (int64_t iy = 1; iy < height - 1; ++iy) {
        for (int64_t ix = 1; ix < width - 1; ++ix) {
            if (preprocessing_get_4d(*img, ix, iy, 0, 0) != weak) {
                continue;
            }
            // Full 8-connected neighbourhood.
            bool has_strong_neighbor = false;
            for (int64_t dy = -1; dy <= 1 && !has_strong_neighbor; ++dy) {
                for (int64_t dx = -1; dx <= 1; ++dx) {
                    if (dx == 0 && dy == 0) {
                        continue;
                    }
                    if (preprocessing_get_4d(*img, ix + dx, iy + dy, 0, 0) == strong) {
                        has_strong_neighbor = true;
                        break;
                    }
                }
            }
            preprocessing_set_4d(*img, has_strong_neighbor ? strong : 0.0f, ix, iy, 0, 0);
        }
    }
}

// Runs a Canny-style edge detector on `img` and overwrites img.data with the
// result, writing the same edge value to every channel.
//
// Pipeline: grayscale -> 5x5 Gaussian blur -> 3x3 Sobel gradients ->
// magnitude (normalised to a maximum of 1) and direction -> non-maximum
// suppression -> double threshold with hysteresis.
//
// high_threshold / low_threshold / weak / strong are forwarded unchanged to
// threshold_hystersis. When `inverse` is true each output value v is replaced
// by 1 - v before being written. Always returns true.
//
// NOTE(review): unlike the helpers above, this definition is neither `static`
// nor `inline`; including this header from more than one translation unit
// would produce duplicate definitions — confirm it is included only once.
bool preprocess_canny(sd_image_t img, float high_threshold, float low_threshold, float weak, float strong, bool inverse) {
    // 3x3 Sobel kernels, stored row by row.
    sd::Tensor<float> sobel_x({3, 3, 1, 1}, std::vector<float>{
                                                -1.0f, 0.0f, 1.0f,
                                                -2.0f, 0.0f, 2.0f,
                                                -1.0f, 0.0f, 1.0f});
    sd::Tensor<float> sobel_y({3, 3, 1, 1}, std::vector<float>{
                                                1.0f, 2.0f, 1.0f,
                                                0.0f, 0.0f, 0.0f,
                                                -1.0f, -2.0f, -1.0f});
    sd::Tensor<float> blur_kernel = gaussian_kernel_tensor(5);

    sd::Tensor<float> image    = sd_image_to_preprocessing_tensor(img);
    sd::Tensor<float> smoothed = convolve_tensor(grayscale_tensor(image), blur_kernel, 2);

    sd::Tensor<float> grad_x    = convolve_tensor(smoothed, sobel_x, 1);
    sd::Tensor<float> grad_y    = convolve_tensor(smoothed, sobel_y, 1);
    sd::Tensor<float> magnitude = tensor_hypot(grad_x, grad_y);
    normalize_tensor(&magnitude);
    sd::Tensor<float> direction = tensor_arctan2(grad_x, grad_y);

    sd::Tensor<float> edges = non_max_supression(magnitude, direction);
    threshold_hystersis(&edges, high_threshold, low_threshold, weak, strong);

    // Broadcast the single-channel edge map back over every channel.
    for (uint32_t row = 0; row < img.height; ++row) {
        for (uint32_t col = 0; col < img.width; ++col) {
            const float edge  = preprocessing_get_4d(edges, col, row, 0, 0);
            const float pixel = inverse ? 1.0f - edge : edge;
            for (uint32_t ch = 0; ch < img.channel; ++ch) {
                preprocessing_set_4d(image, pixel, col, row, ch, 0);
            }
        }
    }

    preprocessing_tensor_to_sd_image(image, img.data);
    return true;
}

#endif  // __PREPROCESSING_HPP__