perf: speed up tensor_to_sd_image conversion (#1466)

2026-06-24 07:06:44 +00:00 · 2026-04-30 01:13:56 +08:00 · 2026-04-30 01:13:56 +08:00 · 3d6064b37e
commit 3d6064b37e
parent b8079e253d
2 changed files with 71 additions and 25 deletions
--- a/src/preprocessing.hpp
+++ b/src/preprocessing.hpp
@ -24,6 +24,75 @@ static inline void preprocessing_set_4d(sd::Tensor<float>& tensor, float value,
    tensor.values()[static_cast<size_t>(preprocessing_offset_4d(tensor, i0, i1, i2, i3))] = value;
 }
 // Maps a float sample to an 8-bit value: inputs at or below 0 become 0,
 // inputs at or above 1 become 255, and values in between are scaled by 255
 // and rounded to the nearest integer (by adding 0.5 and truncating).
 // NaN is mapped to 0 instead of reaching the float-to-integer cast, where the
 // conversion would be undefined.
 static inline uint8_t preprocessing_float_to_u8(float value) {
    // Written as a negated comparison on purpose: every comparison with NaN is
    // false, so `!(value > 0)` catches NaN together with the non-positive range.
    if (!(value > 0.0f)) {
        return 0;
    }
    if (value >= 1.0f) {
        return 255;
    }
    // Here 0 < value < 1, so the result of the expression lies within [0.5, 255.5]
    // and the truncating cast stays inside the uint8_t range.
    return static_cast<uint8_t>(value * 255.0f + 0.5f);
 }
 // Writes one frame of `tensor` into `image_data` as interleaved 8-bit samples
 // (all channels of pixel 0, then all channels of pixel 1, ...).
 //
 // The tensor must have 4 or 5 dimensions; shape[0] and shape[1] are read as
 // width and height.
 //   - 4 dims: channel count is shape[2], `frame_index` is checked against shape[3].
 //   - 5 dims: channel count is shape[3], `frame_index` is checked against shape[2];
 //     shape[4] is not consulted and offsets are taken from the start of the buffer.
 //
 // The source is read through tensor.data() as planes of width * height floats,
 // one plane per channel. Each sample goes through preprocessing_float_to_u8.
 // `image_data` must be non-null; width * height * channel bytes are written.
 static inline void preprocessing_tensor_frame_to_sd_image(const sd::Tensor<float>& tensor, int frame_index, uint8_t* image_data) {
    const auto& shape = tensor.shape();
    GGML_ASSERT(shape.size() == 4 || shape.size() == 5);
    GGML_ASSERT(image_data != nullptr);

    const bool rank5      = shape.size() == 5;
    const int cols        = static_cast<int>(shape[0]);
    const int rows        = static_cast<int>(shape[1]);
    const int n_channels  = static_cast<int>(shape[rank5 ? 3 : 2]);
    const size_t plane    = static_cast<size_t>(cols) * static_cast<size_t>(rows);
    const float* base     = tensor.data();

    // The two ranks differ only in where the requested frame starts and how far
    // apart consecutive channel planes of that frame are.
    size_t frame_offset = 0;
    size_t plane_step   = 0;
    if (rank5) {
        GGML_ASSERT(frame_index >= 0 && frame_index < shape[2]);
        // Frames sit between the spatial axes and the channel axis, so the next
        // channel of the same frame is a whole stack of frames away.
        plane_step   = plane * static_cast<size_t>(shape[2]);
        frame_offset = static_cast<size_t>(frame_index) * plane;
    } else {
        GGML_ASSERT(frame_index >= 0 && frame_index < shape[3]);
        // Channels of one frame are adjacent planes; frames follow each other.
        plane_step   = plane;
        frame_offset = static_cast<size_t>(frame_index) * plane * static_cast<size_t>(n_channels);
    }
    const float* first_plane = base + frame_offset;

    // Dedicated three-channel path: three running plane pointers, no inner loop.
    if (n_channels == 3) {
        const float* p0 = first_plane;
        const float* p1 = first_plane + plane_step;
        const float* p2 = first_plane + plane_step * 2;
        uint8_t* out    = image_data;
        for (size_t px = 0; px < plane; ++px, out += 3) {
            out[0] = preprocessing_float_to_u8(p0[px]);
            out[1] = preprocessing_float_to_u8(p1[px]);
            out[2] = preprocessing_float_to_u8(p2[px]);
        }
        return;
    }

    // Generic path for any other channel count.
    for (size_t px = 0; px < plane; ++px) {
        for (int ch = 0; ch < n_channels; ++ch) {
            const size_t dst_index = px * static_cast<size_t>(n_channels) + static_cast<size_t>(ch);
            const size_t src_index = px + plane_step * static_cast<size_t>(ch);
            image_data[dst_index]  = preprocessing_float_to_u8(first_plane[src_index]);
        }
    }
 }
 static inline sd::Tensor<float> sd_image_to_preprocessing_tensor(sd_image_t image) {
    sd::Tensor<float> tensor({static_cast<int64_t>(image.width), static_cast<int64_t>(image.height), static_cast<int64_t>(image.channel), 1});
    for (uint32_t y = 0; y < image.height; ++y) {
@ -39,20 +108,7 @@ static inline sd::Tensor<float> sd_image_to_preprocessing_tensor(sd_image_t imag
 // Converts a 4-dimensional tensor whose last dimension is 1 into interleaved
 // 8-bit image data written to `image_data`.
 // The conversion is delegated to preprocessing_tensor_frame_to_sd_image with
 // frame index 0; that helper also asserts that `image_data` is non-null, so the
 // check is not repeated here and no second per-element pass is run afterwards.
 static inline void preprocessing_tensor_to_sd_image(const sd::Tensor<float>& tensor, uint8_t* image_data) {
    GGML_ASSERT(tensor.dim() == 4);
    GGML_ASSERT(tensor.shape()[3] == 1);
    preprocessing_tensor_frame_to_sd_image(tensor, 0, image_data);
 }
 static inline sd::Tensor<float> gaussian_kernel_tensor(int kernel_size) {
--- a/src/util.cpp
+++ b/src/util.cpp
@ -505,17 +505,7 @@ sd_image_t tensor_to_sd_image(const sd::Tensor<float>& tensor, int frame_index)
    int channel   = static_cast<int>(shape[shape.size() == 5 ? 3 : 2]);
    uint8_t* data = (uint8_t*)malloc(static_cast<size_t>(width * height * channel));
    GGML_ASSERT(data != nullptr);
-
+    preprocessing_tensor_frame_to_sd_image(tensor, frame_index, data);
    for (int iw = 0; iw < width; ++iw) {
        for (int ih = 0; ih < height; ++ih) {
            for (int ic = 0; ic < channel; ++ic) {
                float value                            = shape.size() == 5 ? tensor.index(iw, ih, frame_index, ic, 0)
                                                                           : tensor.index(iw, ih, ic, frame_index);
                value                                  = std::clamp(value, 0.0f, 1.0f);
                data[(ih * width + iw) * channel + ic] = static_cast<uint8_t>(std::round(value * 255.0f));
            }
        }
    }
    return {
        static_cast<uint32_t>(width),
        static_cast<uint32_t>(height),