feat: add webm support (#1391)

feat: inpaint improvements (#1357)
* inpaint: take the max pixel value instead of a single sample * inpaint: masked diffusion for inpainting models with inflated mask * refactor tensor interpolate nearest-like reduction paths and generalize max_pool_2d --------- Co-authored-by: leejet <leejet714@gmail.com>
2026-06-25 07:36:38 +00:00 · 2026-04-06 01:49:28 +08:00 · 2026-04-06 00:44:26 +08:00
14 changed files with 479 additions and 26 deletions
--- a/.github/workflows/build.yml
+++ b/.github/workflows/build.yml
@ -239,6 +239,7 @@ jobs:
        id: build-push
        uses: docker/build-push-action@v6
        with:
+          context: .
          platforms: linux/amd64
          push: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
          file: Dockerfile.${{ matrix.variant }}
--- a/.gitmodules
+++ b/.gitmodules
@ -7,3 +7,6 @@
 [submodule "thirdparty/libwebp"]
 	path = thirdparty/libwebp
 	url = https://github.com/webmproject/libwebp.git
+[submodule "thirdparty/libwebm"]
+	path = thirdparty/libwebm
+	url = https://github.com/webmproject/libwebm.git
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@ -32,6 +32,16 @@ else()
    set(SD_WEBP_DEFAULT ${SD_USE_SYSTEM_WEBP})
 endif()

+# Detect the bundled libwebm checkout: SD_SUBMODULE_WEBM is TRUE only when
+# thirdparty/libwebm/CMakeLists.txt exists in the source tree.
+set(SD_SUBMODULE_WEBM FALSE)
+if(EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/thirdparty/libwebm/CMakeLists.txt")
+    set(SD_SUBMODULE_WEBM TRUE)
+endif()
+# SD_WEBM_DEFAULT is the default value handed to the SD_WEBM option: ON when the
+# submodule is present, otherwise whatever SD_USE_SYSTEM_WEBM holds right now.
+# NOTE: the SD_USE_SYSTEM_WEBM option is declared further down in this file, so
+# at this point it only has a value if it is already in the cache (for example
+# from -DSD_USE_SYSTEM_WEBM=ON) or was defined by a parent project.
+if(SD_SUBMODULE_WEBM)
+    set(SD_WEBM_DEFAULT ON)
+else()
+    set(SD_WEBM_DEFAULT ${SD_USE_SYSTEM_WEBM})
+endif()
+
 #
 # Option list
 #
@ -41,6 +51,8 @@ endif()
 option(SD_BUILD_EXAMPLES             "sd: build examples" ${SD_STANDALONE})
 option(SD_WEBP                       "sd: enable WebP image I/O support" ${SD_WEBP_DEFAULT})
 option(SD_USE_SYSTEM_WEBP            "sd: link against system libwebp" OFF)
+option(SD_WEBM                       "sd: enable WebM video output support" ${SD_WEBM_DEFAULT})
+option(SD_USE_SYSTEM_WEBM            "sd: link against system libwebm" OFF)
 option(SD_CUDA                       "sd: cuda backend" OFF)
 option(SD_HIPBLAS                    "sd: rocm backend" OFF)
 option(SD_METAL                      "sd: metal backend" OFF)
@ -111,7 +123,31 @@ if(SD_WEBP)
            )
        endif()
    endif()
-    add_compile_definitions(SD_USE_WEBP)
+endif()
+
+if(SD_WEBM)
+    if(NOT SD_WEBP)
+        message(FATAL_ERROR "SD_WEBM requires SD_WEBP because WebM output reuses libwebp VP8 encoding.")
+    endif()
+    # Neither the bundled submodule nor a system libwebm was selected: stop the
+    # configure step with instructions. Every string argument is closed on its
+    # own line so the message does not pick up the indentation of this file.
+    if(NOT SD_SUBMODULE_WEBM AND NOT SD_USE_SYSTEM_WEBM)
+        message(FATAL_ERROR "WebM support enabled but no source found.\n"
+          "Either initialize the submodule:\n  git submodule update --init thirdparty/libwebm\n\n"
+          "Or link against system library:\n  cmake (...) -DSD_USE_SYSTEM_WEBM=ON")
+    endif()
+    # System libwebm: locate the headers and the library, then wrap them in an
+    # imported target named `webm`, the same target name the example
+    # executables link against.
+    if(SD_USE_SYSTEM_WEBM)
+        # Any one of the listed headers satisfies the search; PATH_SUFFIXES also
+        # looks inside a `webm/` subdirectory of each include location.
+        # REQUIRED aborts the configure step when nothing is found.
+        find_path(WEBM_INCLUDE_DIR
+            NAMES mkvmuxer/mkvmuxer.h mkvparser/mkvparser.h common/webmids.h
+            PATH_SUFFIXES webm
+            REQUIRED)
+        find_library(WEBM_LIBRARY
+            NAMES webm libwebm
+            REQUIRED)
+
+        # UNKNOWN: the library type (static or shared) is not specified here.
+        add_library(webm UNKNOWN IMPORTED)
+        set_target_properties(webm PROPERTIES
+            IMPORTED_LOCATION "${WEBM_LIBRARY}"
+            INTERFACE_INCLUDE_DIRECTORIES "${WEBM_INCLUDE_DIR}")
+    endif()
 endif()

 set(SD_LIB stable-diffusion)
--- a/docs/build.md
+++ b/docs/build.md
@ -16,15 +16,23 @@ git submodule init
 git submodule update
 ```

-## WebP Support in Examples
+## WebP and WebM Support in Examples

-The example applications (`examples/cli` and `examples/server`) use `libwebp` to support WebP image I/O. This is enabled by default.
+The example applications (`examples/cli` and `examples/server`) use `libwebp` to support WebP image I/O, and `examples/cli` can also use `libwebm` for `.webm` video output. Both are enabled by default when their submodules are checked out. WebM output requires WebP support (configuring with `SD_WEBM=ON` and `SD_WEBP=OFF` is an error), because it currently reuses `libwebp` to encode each frame as VP8 before muxing with `libwebm`.

-If you do not want WebP support, you can disable it at configure time:
+If you do not want WebP/WebM support, you can disable them at configure time:

 ```shell
 mkdir build && cd build
-cmake .. -DSD_WEBP=OFF
+cmake .. -DSD_WEBP=OFF -DSD_WEBM=OFF
+cmake --build . --config Release
+```
+
+If the submodules are not available, you can also link against system packages instead:
+
+```shell
+mkdir build && cd build
+cmake .. -DSD_USE_SYSTEM_WEBP=ON -DSD_USE_SYSTEM_WEBM=ON
 cmake --build . --config Release
 ```

--- a/examples/cli/CMakeLists.txt
+++ b/examples/cli/CMakeLists.txt
@ -9,6 +9,11 @@ add_executable(${TARGET}
 install(TARGETS ${TARGET} RUNTIME)
 target_link_libraries(${TARGET} PRIVATE stable-diffusion zip ${CMAKE_THREAD_LIBS_INIT})
 if(SD_WEBP)
+    target_compile_definitions(${TARGET} PRIVATE SD_USE_WEBP)
    target_link_libraries(${TARGET} PRIVATE webp libwebpmux)
 endif()
+if(SD_WEBM)
+    target_compile_definitions(${TARGET} PRIVATE SD_USE_WEBM)
+    target_link_libraries(${TARGET} PRIVATE webm)
+endif()
 target_compile_features(${TARGET} PUBLIC c_std_11 cxx_std_17)
--- a/examples/cli/README.md
+++ b/examples/cli/README.md
@ -5,8 +5,8 @@ usage: ./bin/sd-cli  [options]

 CLI Options:
  -o, --output <string>       path to write result image to. you can use printf-style %d format specifiers for image sequences (default:
-                              ./output.png) (eg. output_%03d.png). For video generation, single-file outputs support .avi and animated .webp
-  --preview-path <string>     path to write preview image to (default: ./preview.png). Multi-frame previews support .avi and animated .webp
+                              ./output.png) (eg. output_%03d.png). For video generation, single-file outputs support .avi, .webm, and animated .webp
+  --preview-path <string>     path to write preview image to (default: ./preview.png). Multi-frame previews support .avi, .webm, and animated .webp
  --preview-interval <int>    interval in denoising steps between consecutive updates of the image preview file (default is 1, meaning updating at
                              every step)
  --output-begin-idx <int>    starting index for output image sequence, must be non-negative (default 0 if specified %d in output path, 1 otherwise)
--- a/examples/cli/main.cpp
+++ b/examples/cli/main.cpp
@ -58,7 +58,7 @@ struct SDCliParams {
        options.string_options = {
            {"-o",
             "--output",
-             "path to write result image to. you can use printf-style %d format specifiers for image sequences (default: ./output.png) (eg. output_%03d.png)",
+             "path to write result image to. you can use printf-style %d format specifiers for image sequences (default: ./output.png) (eg. output_%03d.png). Single-file video outputs support .avi, .webm, and animated .webp",
             &output_path},
            {"",
             "--image",
@ -70,7 +70,7 @@ struct SDCliParams {
             &metadata_format},
            {"",
             "--preview-path",
-             "path to write preview image to (default: ./preview.png)",
+             "path to write preview image to (default: ./preview.png). Multi-frame previews support .avi, .webm, and animated .webp",
             &preview_path},
        };

@ -396,7 +396,9 @@ bool save_results(const SDCliParams& cli_params,
    if (!ext.empty()) {
        if (output_format == EncodedImageFormat::JPEG ||
            output_format == EncodedImageFormat::PNG ||
-            output_format == EncodedImageFormat::WEBP) {
+            output_format == EncodedImageFormat::WEBP ||
+            ext_lower == ".avi" ||
+            ext_lower == ".webm") {
            base_path.replace_extension();
        }
    }
@ -438,7 +440,7 @@ bool save_results(const SDCliParams& cli_params,
    }

    if (cli_params.mode == VID_GEN && num_results > 1) {
-        if (ext_lower != ".avi" && ext_lower != ".webp")
+        if (ext_lower != ".avi" && ext_lower != ".webp" && ext_lower != ".webm")
            ext = ".avi";
        fs::path video_path = base_path;
        video_path += ext;
--- a/examples/common/media_io.cpp
+++ b/examples/common/media_io.cpp
@ -30,6 +30,11 @@
 #include "webp/mux.h"
 #endif

+#ifdef SD_USE_WEBM
+#include "mkvmuxer/mkvmuxer.h"
+#include "mkvmuxer/mkvwriter.h"
+#endif
+
 namespace fs = std::filesystem;

 namespace {
@ -71,6 +76,13 @@ bool write_binary_file_bytes(const std::string& path, const std::vector<uint8_t>
    return true;
 }

+// Decodes the four bytes data[0..3] as a little-endian unsigned 32-bit value
+// (data[0] is the least significant byte). No bounds checking is done here;
+// the caller must guarantee that four bytes are readable at `data`.
+uint32_t read_u32_le_bytes(const uint8_t* data) {
+    return static_cast<uint32_t>(data[0]) |
+           (static_cast<uint32_t>(data[1]) << 8) |
+           (static_cast<uint32_t>(data[2]) << 16) |
+           (static_cast<uint32_t>(data[3]) << 24);
+}
+
 int stbi_ext_write_png_to_func(stbi_write_func* func,
                               void* context,
                               int x,
@ -289,6 +301,76 @@ bool encode_webp_image_to_vector(const uint8_t* image,
    WebPMuxDelete(mux);
    return ok;
 }
+
+#ifdef SD_USE_WEBM
+// Copies the payload of the first "VP8 " chunk of an encoded WebP buffer into
+// `vp8_frame`.
+//
+// webp_data: complete encoded WebP file contents.
+// vp8_frame: receives the chunk payload on success.
+// Returns true only when a non-empty "VP8 " chunk was found. Returns false when
+// the signature check fails, a chunk claims more bytes than the buffer holds,
+// or no "VP8 " chunk exists; other chunk types are skipped, never extracted.
+bool extract_vp8_frame_from_webp(const std::vector<uint8_t>& webp_data, std::vector<uint8_t>& vp8_frame) {
+    // NOTE(review): presumably is_webp_signature also rejects buffers shorter
+    // than the 12 bytes skipped below - confirm against its definition.
+    if (!is_webp_signature(webp_data.data(), webp_data.size())) {
+        return false;
+    }
+
+    // Chunks are walked starting right after the first 12 bytes. Each chunk is a
+    // 4-byte tag, a 4-byte little-endian payload length, then the payload.
+    size_t offset = 12;
+    while (offset + 8 <= webp_data.size()) {
+        const uint8_t* chunk     = webp_data.data() + offset;
+        const uint32_t chunk_len = read_u32_le_bytes(chunk + 4);
+        const size_t chunk_start = offset + 8;
+        // Odd-length payloads are followed by one padding byte.
+        const size_t padded_len  = static_cast<size_t>(chunk_len) + (chunk_len & 1u);
+
+        // Compare against the bytes that remain instead of computing
+        // chunk_start + chunk_len, which can wrap around where size_t is 32 bits
+        // wide. The loop condition guarantees chunk_start <= webp_data.size().
+        if (chunk_len > webp_data.size() - chunk_start) {
+            return false;
+        }
+
+        if (memcmp(chunk, "VP8 ", 4) == 0) {
+            vp8_frame.assign(webp_data.data() + chunk_start,
+                             webp_data.data() + chunk_start + chunk_len);
+            return !vp8_frame.empty();
+        }
+
+        offset = chunk_start + padded_len;
+    }
+
+    return false;
+}
+
+// Encodes one image as a single VP8 frame by running it through the WebP
+// encoder and extracting the "VP8 " chunk payload from the result.
+//
+// image:     source frame; needs non-null data, non-zero dimensions and 1, 3 or
+//            4 channels.
+// quality:   forwarded unchanged to encode_webp_image_to_vector.
+// vp8_frame: receives the frame bytes on success.
+// Returns false for invalid input, when the encoder fails, or when the encoded
+// output contains no "VP8 " chunk.
+bool encode_sd_image_to_vp8_frame(const sd_image_t& image, int quality, std::vector<uint8_t>& vp8_frame) {
+    if (image.data == nullptr || image.width == 0 || image.height == 0) {
+        return false;
+    }
+
+    const int width         = static_cast<int>(image.width);
+    const int height        = static_cast<int>(image.height);
+    const int input_channel = static_cast<int>(image.channel);
+    // Dimensions that do not fit in an int come out non-positive after the cast.
+    if (width <= 0 || height <= 0) {
+        return false;
+    }
+    if (input_channel != 1 && input_channel != 3 && input_channel != 4) {
+        return false;
+    }
+
+    // The pixel count and every buffer offset are computed in size_t: as int,
+    // `width * height` and `i * 3` / `i * 4` overflow for very large frames.
+    const size_t pixel_count = static_cast<size_t>(width) * static_cast<size_t>(height);
+
+    std::vector<uint8_t> rgb_buffer;
+    const uint8_t* rgb_data = image.data;
+    if (input_channel == 1) {
+        // Single channel: replicate the sample into all three output channels.
+        rgb_buffer.resize(pixel_count * 3);
+        for (size_t i = 0; i < pixel_count; ++i) {
+            rgb_buffer[i * 3 + 0] = image.data[i];
+            rgb_buffer[i * 3 + 1] = image.data[i];
+            rgb_buffer[i * 3 + 2] = image.data[i];
+        }
+        rgb_data = rgb_buffer.data();
+    } else if (input_channel == 4) {
+        // Four channels: keep the first three and drop the fourth.
+        rgb_buffer.resize(pixel_count * 3);
+        for (size_t i = 0; i < pixel_count; ++i) {
+            rgb_buffer[i * 3 + 0] = image.data[i * 4 + 0];
+            rgb_buffer[i * 3 + 1] = image.data[i * 4 + 1];
+            rgb_buffer[i * 3 + 2] = image.data[i * 4 + 2];
+        }
+        rgb_data = rgb_buffer.data();
+    }
+
+    // The encoder is always given tightly packed 3-channel data.
+    std::vector<uint8_t> encoded_webp;
+    if (!encode_webp_image_to_vector(rgb_data, width, height, 3, "", quality, encoded_webp)) {
+        return false;
+    }
+
+    return extract_vp8_frame_from_webp(encoded_webp, vp8_frame);
+}
+#endif
 #endif

 uint8_t* load_image_common(bool from_memory,
@ -861,6 +943,99 @@ cleanup:
 }
 #endif

+#ifdef SD_USE_WEBM
+// Writes `num_images` frames to `filename` as a WebM file with one video track.
+// Every frame is encoded independently via encode_sd_image_to_vp8_frame and
+// added to the muxer as a key frame.
+//
+// filename:   output path; must not be null.
+// images:     array of `num_images` frames that all share the size of images[0].
+// num_images: number of frames; must be positive.
+// fps:        frame rate; must be positive.
+// quality:    forwarded to the frame encoder.
+// Returns 0 on success and -1 on any failure (an error is printed to stderr).
+// A file that was already opened is closed but not removed on failure.
+int create_webm_from_sd_images(const char* filename, sd_image_t* images, int num_images, int fps, int quality) {
+    if (filename == nullptr) {
+        fprintf(stderr, "Error: Output filename is null.\n");
+        return -1;
+    }
+    // Also rejects a null array and negative counts: images[0] is read below,
+    // and a negative count would otherwise produce an empty file reported as
+    // success.
+    if (images == nullptr || num_images <= 0) {
+        fprintf(stderr, "Error: Image array is empty.\n");
+        return -1;
+    }
+    if (fps <= 0) {
+        fprintf(stderr, "Error: FPS must be positive.\n");
+        return -1;
+    }
+
+    const int width = static_cast<int>(images[0].width);
+    const int height = static_cast<int>(images[0].height);
+    if (width <= 0 || height <= 0) {
+        fprintf(stderr, "Error: Invalid frame dimensions.\n");
+        return -1;
+    }
+
+    mkvmuxer::MkvWriter writer;
+    if (!writer.Open(filename)) {
+        fprintf(stderr, "Error: Could not open WebM file for writing.\n");
+        return -1;
+    }
+
+    // The muxing work runs inside a lambda so that `segment` is destroyed before
+    // writer.Close() is called, and so every early return shares that one Close.
+    const int ret = [&]() -> int {
+        mkvmuxer::Segment segment;
+        if (!segment.Init(&writer)) {
+            fprintf(stderr, "Error: Failed to initialize WebM muxer.\n");
+            return -1;
+        }
+
+        segment.set_mode(mkvmuxer::Segment::kFile);
+        segment.OutputCues(true);
+
+        const uint64_t track_number = segment.AddVideoTrack(width, height, 0);
+        if (track_number == 0) {
+            fprintf(stderr, "Error: Failed to add VP8 video track.\n");
+            return -1;
+        }
+        if (!segment.CuesTrack(track_number)) {
+            fprintf(stderr, "Error: Failed to set WebM cues track.\n");
+            return -1;
+        }
+
+        // Track metadata is best effort: a missing track object is not an error.
+        mkvmuxer::VideoTrack* video_track = static_cast<mkvmuxer::VideoTrack*>(segment.GetTrackByNumber(track_number));
+        if (video_track != nullptr) {
+            video_track->set_display_width(static_cast<uint64_t>(width));
+            video_track->set_display_height(static_cast<uint64_t>(height));
+            video_track->set_frame_rate(static_cast<double>(fps));
+        }
+        segment.GetSegmentInfo()->set_writing_app("stable-diffusion.cpp");
+        segment.GetSegmentInfo()->set_muxing_app("stable-diffusion.cpp");
+
+        const uint64_t ns_per_second = 1000000000ull;
+
+        for (int i = 0; i < num_images; ++i) {
+            const sd_image_t& image = images[i];
+            if (static_cast<int>(image.width) != width || static_cast<int>(image.height) != height) {
+                fprintf(stderr, "Error: Frame dimensions do not match.\n");
+                return -1;
+            }
+
+            std::vector<uint8_t> vp8_frame;
+            if (!encode_sd_image_to_vp8_frame(image, quality, vp8_frame)) {
+                fprintf(stderr, "Error: Failed to encode frame %d as VP8.\n", i);
+                return -1;
+            }
+
+            // Each timestamp is derived from the frame index rather than by
+            // summing a rounded per-frame duration, so rounding error does not
+            // accumulate over the length of the clip.
+            const uint64_t timestamp_ns =
+                static_cast<uint64_t>(i) * ns_per_second / static_cast<uint64_t>(fps);
+
+            // Final argument `true`: every frame is flagged as a key frame.
+            if (!segment.AddFrame(vp8_frame.data(),
+                                  static_cast<uint64_t>(vp8_frame.size()),
+                                  track_number,
+                                  timestamp_ns,
+                                  true)) {
+                fprintf(stderr, "Error: Failed to mux frame %d into WebM.\n", i);
+                return -1;
+            }
+        }
+
+        if (!segment.Finalize()) {
+            fprintf(stderr, "Error: Failed to finalize WebM output.\n");
+            return -1;
+        }
+        return 0;
+    }();
+    writer.Close();
+    return ret;
+}
+#endif
+
 int create_video_from_sd_images(const char* filename, sd_image_t* images, int num_images, int fps, int quality) {
    std::string path = filename ? filename : "";
    auto pos         = path.find_last_of('.');
@ -869,6 +1044,12 @@ int create_video_from_sd_images(const char* filename, sd_image_t* images, int nu
        ch = static_cast<char>(tolower(static_cast<unsigned char>(ch)));
    }

+#ifdef SD_USE_WEBM
+    if (ext == ".webm") {
+        return create_webm_from_sd_images(filename, images, num_images, fps, quality);
+    }
+#endif
+
 #ifdef SD_USE_WEBP
    if (ext == ".webp") {
        return create_animated_webp_from_sd_images(filename, images, num_images, fps, quality);
--- a/examples/common/media_io.h
+++ b/examples/common/media_io.h
@ -67,6 +67,14 @@ int create_animated_webp_from_sd_images(const char* filename,
                                        int quality = 90);
 #endif

+#ifdef SD_USE_WEBM
+int create_webm_from_sd_images(const char* filename,
+                               sd_image_t* images,
+                               int num_images,
+                               int fps,
+                               int quality = 90);
+#endif
+
 int create_video_from_sd_images(const char* filename,
                                sd_image_t* images,
                                int num_images,
--- a/examples/server/CMakeLists.txt
+++ b/examples/server/CMakeLists.txt
@ -75,8 +75,13 @@ endif()
 install(TARGETS ${TARGET} RUNTIME)
 target_link_libraries(${TARGET} PRIVATE stable-diffusion ${CMAKE_THREAD_LIBS_INIT})
 if(SD_WEBP)
+    target_compile_definitions(${TARGET} PRIVATE SD_USE_WEBP)
    target_link_libraries(${TARGET} PRIVATE webp libwebpmux)
 endif()
+if(SD_WEBM)
+    target_compile_definitions(${TARGET} PRIVATE SD_USE_WEBM)
+    target_link_libraries(${TARGET} PRIVATE webm)
+endif()

 # due to httplib; it contains a pragma for MSVC, but other things need explicit flags
 if(WIN32 AND NOT MSVC)
--- a/src/stable-diffusion.cpp
+++ b/src/stable-diffusion.cpp
@ -2846,7 +2846,8 @@ static std::optional<ImageGenerationLatents> prepare_image_generation_latents(sd
                                                         {request->width / request->vae_scale_factor,
                                                          request->height / request->vae_scale_factor,
                                                          1,
-                                                          1});
+                                                          1},
+                                                         sd::ops::InterpolateMode::NearestMax);

    sd::Tensor<float> init_latent;
    sd::Tensor<float> control_latent;
@ -2991,8 +2992,12 @@ static std::optional<ImageGenerationLatents> prepare_image_generation_latents(sd
    latents.ref_latents          = std::move(ref_latents);

    if (sd_version_is_inpaint(sd_ctx->sd->version)) {
-        latents.denoise_mask = std::move(latent_mask);
+        latent_mask = sd::ops::max_pool_2d(latent_mask,
+                                           {3, 3},
+                                           {1, 1},
+                                           {1, 1});
    }
+    latents.denoise_mask = std::move(latent_mask);

    return latents;
 }
--- a/src/tensor.hpp
+++ b/src/tensor.hpp
@ -815,6 +815,9 @@ namespace sd {
    namespace ops {
        enum class InterpolateMode {
            Nearest,
+            NearestMax,
+            NearestMin,
+            NearestAvg,
        };

        inline int64_t normalize_slice_bound(int64_t index, int64_t dim_size) {
@ -1012,12 +1015,16 @@ namespace sd {
                                     std::vector<int64_t> output_shape,
                                     InterpolateMode mode = InterpolateMode::Nearest,
                                     bool align_corners   = false) {
-            if (mode != InterpolateMode::Nearest) {
-                tensor_throw_invalid_argument("Only nearest interpolate mode is implemented, got mode=" +
+            const bool is_nearest_like_mode = (mode == InterpolateMode::Nearest ||
+                                               mode == InterpolateMode::NearestMax ||
+                                               mode == InterpolateMode::NearestMin ||
+                                               mode == InterpolateMode::NearestAvg);
+            if (!is_nearest_like_mode) {
+                tensor_throw_invalid_argument("Only nearest-like interpolate modes are implemented, got mode=" +
                                              std::to_string(static_cast<int>(mode)));
            }
            if (align_corners) {
-                tensor_throw_invalid_argument("align_corners is not supported for nearest interpolate: input_shape=" +
+                tensor_throw_invalid_argument("align_corners is not supported for nearest-like interpolate: input_shape=" +
                                              tensor_shape_to_string(input.shape()) + ", output_shape=" +
                                              tensor_shape_to_string(output_shape));
            }
@ -1044,7 +1051,16 @@ namespace sd {
                }
            }

+            bool has_downsampling = false;
+            for (int64_t i = 0; i < input.dim(); ++i) {
+                if (input.shape()[i] > output_shape[i]) {
+                    has_downsampling = true;
+                    break;
+                }
+            }
+
            Tensor<T> output(std::move(output_shape));
+            if (mode == InterpolateMode::Nearest || !has_downsampling) {
                for (int64_t flat = 0; flat < output.numel(); ++flat) {
                    std::vector<int64_t> output_coord = tensor_unravel_index(flat, output.shape());
                    std::vector<int64_t> input_coord(static_cast<size_t>(input.dim()), 0);
@ -1057,18 +1073,101 @@ namespace sd {
                return output;
            }

+            auto init_reduction = [&]() -> T {
+                switch (mode) {
+                    case InterpolateMode::NearestMax:
+                        return std::numeric_limits<T>::lowest();
+                    case InterpolateMode::NearestMin:
+                        return std::numeric_limits<T>::max();
+                    case InterpolateMode::NearestAvg:
+                        return T(0);
+                    case InterpolateMode::Nearest:
+                        return T(0);
+                }
+
+                tensor_throw_invalid_argument("Unsupported interpolate mode: mode=" +
+                                              std::to_string(static_cast<int>(mode)));
+            };
+
+            auto reduce_value = [&](T& acc, const T& sample) {
+                switch (mode) {
+                    case InterpolateMode::NearestMax:
+                        acc = std::max(acc, sample);
+                        break;
+                    case InterpolateMode::NearestMin:
+                        acc = std::min(acc, sample);
+                        break;
+                    case InterpolateMode::NearestAvg:
+                        acc += sample;
+                        break;
+                    case InterpolateMode::Nearest:
+                        break;
+                }
+            };
+
+            // Reduction modes only differ from nearest mode when downsampling.
+            for (int64_t flat_out = 0; flat_out < output.numel(); ++flat_out) {
+                std::vector<int64_t> output_coord = tensor_unravel_index(flat_out, output.shape());
+
+                std::vector<int64_t> input_start(output.dim(), 0);
+                std::vector<int64_t> input_end(output.dim(), 0);
+
+                for (size_t i = 0; i < static_cast<size_t>(output.dim()); ++i) {
+                    const int64_t input_dim  = input.shape()[i];
+                    const int64_t output_dim = output.shape()[i];
+
+                    input_start[i] = std::max(int64_t(0), static_cast<int64_t>(output_coord[i] * input_dim / output_dim));
+                    input_end[i]   = std::min(input_dim, ((output_coord[i] + 1) * input_dim + output_dim - 1) / output_dim);
+                }
+
+                T value                               = init_reduction();
+                bool done_window                      = false;
+                std::vector<int64_t> current_in_coord = input_start;
+
+                while (!done_window) {
+                    reduce_value(value, input.index(current_in_coord));
+
+                    for (int d = static_cast<int>(output.dim()) - 1; d >= 0; --d) {
+                        if (++current_in_coord[d] < input_end[d]) {
+                            break;
+                        }
+                        current_in_coord[d] = input_start[d];
+                        if (d == 0) {
+                            done_window = true;
+                        }
+                    }
+                }
+
+                if (mode == InterpolateMode::NearestAvg) {
+                    int64_t window_size = 1;
+                    for (size_t i = 0; i < static_cast<size_t>(output.dim()); ++i) {
+                        window_size *= (input_end[i] - input_start[i]);
+                    }
+                    value /= static_cast<T>(window_size);
+                }
+
+                output[flat_out] = value;
+            }
+
+            return output;
+        }
+
        template <typename T>
        inline Tensor<T> interpolate(const Tensor<T>& input,
                                     const std::optional<std::vector<int64_t>>& size,
                                     const std::optional<std::vector<double>>& scale_factor,
                                     InterpolateMode mode = InterpolateMode::Nearest,
                                     bool align_corners   = false) {
-            if (mode != InterpolateMode::Nearest) {
-                tensor_throw_invalid_argument("Only nearest interpolate mode is implemented, got mode=" +
+            const bool is_nearest_like_mode = (mode == InterpolateMode::Nearest ||
+                                               mode == InterpolateMode::NearestMax ||
+                                               mode == InterpolateMode::NearestMin ||
+                                               mode == InterpolateMode::NearestAvg);
+            if (!is_nearest_like_mode) {
+                tensor_throw_invalid_argument("Only nearest-like interpolate modes are implemented, got mode=" +
                                              std::to_string(static_cast<int>(mode)));
            }
            if (align_corners) {
-                tensor_throw_invalid_argument("align_corners is not supported for nearest interpolate: input_shape=" +
+                tensor_throw_invalid_argument("align_corners is not supported for nearest-like interpolate: input_shape=" +
                                              tensor_shape_to_string(input.shape()));
            }
            if (size.has_value() == scale_factor.has_value()) {
@ -1128,6 +1227,80 @@ namespace sd {
                               align_corners);
        }

+        // Max-pools dimensions 0 and 1 of `input` (called height and width here);
+        // any further dimensions are carried through unchanged.
+        //
+        // kernel_size, stride, padding: two entries each, for dimension 0 and
+        //     dimension 1. kernel_size and stride must be positive and padding
+        //     non-negative.
+        // Padded positions are skipped rather than read as values; a window that
+        // lies entirely inside the padding produces T(0).
+        // Throws through tensor_throw_invalid_argument when the input has fewer
+        // than two dimensions, a parameter is malformed, or the kernel is larger
+        // than the padded input.
+        template <typename T>
+        inline Tensor<T> max_pool_2d(const Tensor<T>& input,
+                                     std::vector<int64_t> kernel_size,
+                                     std::vector<int64_t> stride,
+                                     std::vector<int64_t> padding) {
+            if (input.dim() < 2) {
+                tensor_throw_invalid_argument("Tensor max_pool_2d requires input_dim >= 2: input_dim=" +
+                                              std::to_string(input.dim()) + ", input_shape=" +
+                                              tensor_shape_to_string(input.shape()));
+            }
+            if (kernel_size.size() != 2 || stride.size() != 2 || padding.size() != 2) {
+                tensor_throw_invalid_argument("Tensor max_pool_2d requires kernel_size, stride, and padding to have length 2");
+            }
+            for (size_t i = 0; i < 2; ++i) {
+                if (kernel_size[i] <= 0) {
+                    tensor_throw_invalid_argument("Tensor max_pool_2d kernel_size must be positive: kernel_size=" +
+                                                  tensor_shape_to_string(kernel_size));
+                }
+                if (stride[i] <= 0) {
+                    tensor_throw_invalid_argument("Tensor max_pool_2d stride must be positive: stride=" +
+                                                  tensor_shape_to_string(stride));
+                }
+                if (padding[i] < 0) {
+                    tensor_throw_invalid_argument("Tensor max_pool_2d padding must be non-negative: padding=" +
+                                                  tensor_shape_to_string(padding));
+                }
+            }
+
+            const int64_t in_height = input.shape()[0];
+            const int64_t in_width  = input.shape()[1];
+
+            // Reject kernels larger than the padded input before dividing. Integer
+            // division truncates toward zero, so with stride > 1 a negative
+            // numerator would still give an output size of 1 and slip past a
+            // check made on the computed size.
+            const int64_t span_height = in_height + 2 * padding[0] - kernel_size[0];
+            const int64_t span_width  = in_width + 2 * padding[1] - kernel_size[1];
+            if (span_height < 0 || span_width < 0) {
+                tensor_throw_invalid_argument("Tensor max_pool_2d kernel_size exceeds padded input: input_shape=" +
+                                              tensor_shape_to_string(input.shape()) + ", kernel_size=" +
+                                              tensor_shape_to_string(kernel_size) + ", padding=" +
+                                              tensor_shape_to_string(padding));
+            }
+
+            // Both spans are non-negative and both strides positive, so each
+            // output extent is at least 1.
+            const int64_t out_height = span_height / stride[0] + 1;
+            const int64_t out_width  = span_width / stride[1] + 1;
+
+            std::vector<int64_t> output_shape = input.shape();
+            output_shape[0]                   = out_height;
+            output_shape[1]                   = out_width;
+
+            Tensor<T> output(std::move(output_shape));
+
+            for (int64_t flat_out = 0; flat_out < output.numel(); ++flat_out) {
+                std::vector<int64_t> output_coord = tensor_unravel_index(flat_out, output.shape());
+                // Starts as a copy so the trailing coordinates stay fixed while
+                // only the two pooled coordinates move across the window.
+                std::vector<int64_t> input_coord  = output_coord;
+
+                const int64_t oh = output_coord[0];
+                const int64_t ow = output_coord[1];
+
+                T max_val            = std::numeric_limits<T>::lowest();
+                bool has_valid_input = false;
+
+                for (int64_t kh = 0; kh < kernel_size[0]; ++kh) {
+                    for (int64_t kw = 0; kw < kernel_size[1]; ++kw) {
+                        const int64_t ih = oh * stride[0] + kh - padding[0];
+                        const int64_t iw = ow * stride[1] + kw - padding[1];
+
+                        // Positions that fall in the padding are skipped.
+                        if (ih >= 0 && ih < in_height && iw >= 0 && iw < in_width) {
+                            input_coord[0]  = ih;
+                            input_coord[1]  = iw;
+                            max_val         = std::max(max_val, input.index(input_coord));
+                            has_valid_input = true;
+                        }
+                    }
+                }
+
+                output[flat_out] = has_valid_input ? max_val : T(0);
+            }
+            return output;
+        }
+
        template <typename T>
        inline Tensor<T> concat(const Tensor<T>& lhs, const Tensor<T>& rhs, size_t dim) {
            if (lhs.dim() != rhs.dim()) {
--- a/thirdparty/CMakeLists.txt
+++ b/thirdparty/CMakeLists.txt
@ -18,3 +18,28 @@ if(SD_WEBP AND NOT SD_USE_SYSTEM_WEBP)

    add_subdirectory(libwebp EXCLUDE_FROM_ALL)
 endif()
+
+# Build the bundled libwebm only when WebM is enabled and no system library was
+# requested.
+if(SD_WEBM AND NOT SD_USE_SYSTEM_WEBM)
+    # NOTE(review): presumably a libwebm build setting that selects the DLL
+    # runtime on MSVC - confirm against libwebm's CMakeLists.txt.
+    if(MSVC)
+        set(MSVC_RUNTIME dll)
+    endif()
+    # Switch off the libwebm extras before entering the subdirectory.
+    # NOTE(review): these are normal variables; whether libwebm's option() calls
+    # honor them depends on the policy settings (CMP0077) active inside libwebm -
+    # confirm.
+    set(ENABLE_WEBMTS OFF)
+    set(ENABLE_WEBMINFO OFF)
+    set(ENABLE_TESTS OFF)
+    set(ENABLE_WEBM_PARSER OFF)
+    set(ENABLE_SAMPLE_PROGRAMS OFF)
+
+    # Snapshot the flags so they can be put back after add_subdirectory.
+    set(SD_LIBWEBM_PARENT_CXX_FLAGS "${CMAKE_CXX_FLAGS}")
+
+    add_subdirectory(libwebm EXCLUDE_FROM_ALL)
+
+    # libwebm mutates the global CMAKE_CXX_FLAGS for non-MSVC compilers to force
+    # C++11. Restore the parent flags so the main project keeps its own C++17
+    # requirements, then pin the libwebm targets to C++17 explicitly.
+    set(CMAKE_CXX_FLAGS "${SD_LIBWEBM_PARENT_CXX_FLAGS}" CACHE STRING "" FORCE)
+    target_compile_features(mkvmuxer PRIVATE cxx_std_17)
+    target_compile_features(mkvparser PRIVATE cxx_std_17)
+    target_compile_features(webm PRIVATE cxx_std_17)
+
+    # Consumers of `webm` include headers relative to the libwebm source root,
+    # for example "mkvmuxer/mkvmuxer.h".
+    target_include_directories(webm INTERFACE ${CMAKE_CURRENT_SOURCE_DIR}/libwebm)
+endif()
--- a/thirdparty/libwebm
+++ b/thirdparty/libwebm
@ -0,0 +1 @@
+Subproject commit 5bf12267eea773a32fcf4949de52b0add158a8d5
				`@ -0,0 +1 @@`
				`Subproject commit 5bf12267eea773a32fcf4949de52b0add158a8d5`