feat(server): implement vid_gen async API and mode-aware capabilities (#1437)

2026-05-08 08:18:51 +00:00 · 2026-04-18 15:06:36 +08:00 · 2026-04-18 15:06:36 +08:00 · 4d626d24b2
commit 4d626d24b2
parent f3f69e2fbe
14 changed files with 1345 additions and 339 deletions
--- a/examples/common/common.cpp
+++ b/examples/common/common.cpp
@ -1589,10 +1589,18 @@ bool SDGenerationParams::from_json_str(
        LOG_ERROR("invalid init_image");
        return false;
    }
+    if (!parse_image_json_field(j, "end_image", 3, width, height, end_image)) {
+        LOG_ERROR("invalid end_image");
+        return false;
+    }
    if (!parse_image_array_json_field(j, "ref_images", 3, width, height, ref_images)) {
        LOG_ERROR("invalid ref_images");
        return false;
    }
+    if (!parse_image_array_json_field(j, "control_frames", 3, width, height, control_frames)) {
+        LOG_ERROR("invalid control_frames");
+        return false;
+    }
    if (!parse_image_json_field(j, "mask_image", 1, width, height, mask_image)) {
        LOG_ERROR("invalid mask_image");
        return false;
--- a/examples/common/media_io.cpp
+++ b/examples/common/media_io.cpp
@ -95,6 +95,57 @@ using WebPMuxPtr         = std::unique_ptr<WebPMux, WebPMuxDeleter>;
 using WebPAnimEncoderPtr = std::unique_ptr<WebPAnimEncoder, WebPAnimEncoderDeleter>;
 #endif

+#ifdef SD_USE_WEBM
+class MemoryMkvWriter : public mkvmuxer::IMkvWriter {  // IMkvWriter implementation that collects the written bytes in a std::vector instead of a file.
+public:
+    mkvmuxer::int32 Write(const void* buf, mkvmuxer::uint32 len) override {  // Copies len bytes from buf to the current position; returns 0 on success, -1 for a null buf with len > 0.
+        if (buf == nullptr && len > 0) {
+            return -1;
+        }
+        const size_t end_pos = position_ + static_cast<size_t>(len);
+        if (end_pos > data_.size()) {  // Grow only when the write runs past the current end; otherwise existing bytes are overwritten in place.
+            data_.resize(end_pos);
+        }
+        if (len > 0) {  // Zero-length writes skip memcpy, so buf may be null in that case.
+            memcpy(data_.data() + position_, buf, len);
+        }
+        position_ = end_pos;
+        return 0;
+    }
+
+    mkvmuxer::int64 Position() const override {  // Returns the current write offset.
+        return static_cast<mkvmuxer::int64>(position_);
+    }
+
+    mkvmuxer::int32 Position(mkvmuxer::int64 position) override {  // Moves the write offset; returns -1 for a negative position, 0 otherwise.
+        if (position < 0) {
+            return -1;
+        }
+        const size_t target = static_cast<size_t>(position);
+        if (target > data_.size()) {  // Seeking past the end extends the buffer (std::vector::resize value-initializes the new bytes).
+            data_.resize(target);
+        }
+        position_ = target;
+        return 0;
+    }
+
+    bool Seekable() const override {  // Always reports the writer as seekable.
+        return true;
+    }
+
+    void ElementStartNotify(mkvmuxer::uint64, mkvmuxer::int64) override {  // No-op: both arguments are ignored.
+    }
+
+    const std::vector<uint8_t>& data() const {  // Read-only view of the bytes written so far; callers that keep the bytes must copy them.
+        return data_;
+    }
+
+private:
+    std::vector<uint8_t> data_;  // Accumulated output bytes.
+    size_t position_ = 0;  // Offset into data_ where the next Write() lands.
+};
+#endif
+
 bool read_binary_file_bytes(const char* path, std::vector<uint8_t>& data) {
    std::ifstream fin(fs::path(path), std::ios::binary);
    if (!fin) {
@ -570,6 +621,32 @@ void write_u16_le(FILE* f, uint16_t val) {
    fwrite(&val, 2, 1, f);
 }

+void write_u32_le(std::vector<uint8_t>& data, uint32_t val) {  // Appends val to data as 4 bytes, least-significant byte first.
+    data.push_back(static_cast<uint8_t>(val & 0xFF));
+    data.push_back(static_cast<uint8_t>((val >> 8) & 0xFF));
+    data.push_back(static_cast<uint8_t>((val >> 16) & 0xFF));
+    data.push_back(static_cast<uint8_t>((val >> 24) & 0xFF));
+}
+
+void write_u16_le(std::vector<uint8_t>& data, uint16_t val) {  // Appends val to data as 2 bytes, least-significant byte first.
+    data.push_back(static_cast<uint8_t>(val & 0xFF));
+    data.push_back(static_cast<uint8_t>((val >> 8) & 0xFF));
+}
+
+// Overwrites the 4 bytes at data[offset] .. data[offset + 3] with val,
+// least-significant byte first. Leaves data untouched when that 4-byte
+// window does not fit entirely inside the buffer.
+void patch_u32_le(std::vector<uint8_t>& data, size_t offset, uint32_t val) {
+    // Compare against size() - 4 instead of computing offset + 4: the sum can
+    // wrap around for offsets near SIZE_MAX and let an out-of-range write through.
+    if (data.size() < 4 || offset > data.size() - 4) {
+        return;
+    }
+    data[offset + 0] = static_cast<uint8_t>(val & 0xFF);
+    data[offset + 1] = static_cast<uint8_t>((val >> 8) & 0xFF);
+    data[offset + 2] = static_cast<uint8_t>((val >> 16) & 0xFF);
+    data[offset + 3] = static_cast<uint8_t>((val >> 24) & 0xFF);
+}
+
+void write_fourcc(std::vector<uint8_t>& data, const char* fourcc) {  // Appends exactly 4 bytes starting at fourcc; the caller must supply at least 4 readable bytes.
+    data.insert(data.end(), fourcc, fourcc + 4);
+}
+
 EncodedImageFormat encoded_image_format_from_path(const std::string& path) {
    std::string ext = fs::path(path).extension().string();
    std::transform(ext.begin(), ext.end(), ext.begin(), ::tolower);
@ -699,95 +776,96 @@ uint8_t* load_image_from_memory(const char* image_bytes,
    return load_image_common(true, image_bytes, len, width, height, expected_width, expected_height, expected_channel);
 }

-int create_mjpg_avi_from_sd_images(const char* filename, sd_image_t* images, int num_images, int fps, int quality) {
+std::vector<uint8_t> create_mjpg_avi_from_sd_images_to_vector(sd_image_t* images, int num_images, int fps, int quality) {
    if (num_images == 0) {
        fprintf(stderr, "Error: Image array is empty.\n");
-        return -1;
+        return {};
    }

-    FilePtr file(fopen(filename, "wb"));
-    if (!file) {
-        perror("Error opening file for writing");
-        return -1;
-    }
-    FILE* f = file.get();
-
    uint32_t width    = images[0].width;
    uint32_t height   = images[0].height;
    uint32_t channels = images[0].channel;
    if (channels != 3 && channels != 4) {
        fprintf(stderr, "Error: Unsupported channel count: %u\n", channels);
-        return -1;
+        return {};
    }

-    fwrite("RIFF", 4, 1, f);
-    long riff_size_pos = ftell(f);
-    write_u32_le(f, 0);
-    fwrite("AVI ", 4, 1, f);
+    // stb_image_write changes JPEG sampling behavior above quality 90.
+    // MJPG AVI playback is more compatible when we keep the encoder on the
+    // <= 90 path.
+    const int mjpg_quality = std::clamp(quality, 1, 90);

-    fwrite("LIST", 4, 1, f);
-    write_u32_le(f, 4 + 8 + 56 + 8 + 4 + 8 + 56 + 8 + 40);
-    fwrite("hdrl", 4, 1, f);
+    std::vector<uint8_t> avi_data;
+    avi_data.reserve(static_cast<size_t>(num_images) * 1024);

-    fwrite("avih", 4, 1, f);
-    write_u32_le(f, 56);
-    write_u32_le(f, 1000000 / fps);
-    write_u32_le(f, 0);
-    write_u32_le(f, 0);
-    write_u32_le(f, 0x110);
-    write_u32_le(f, num_images);
-    write_u32_le(f, 0);
-    write_u32_le(f, 1);
-    write_u32_le(f, width * height * 3);
-    write_u32_le(f, width);
-    write_u32_le(f, height);
-    write_u32_le(f, 0);
-    write_u32_le(f, 0);
-    write_u32_le(f, 0);
-    write_u32_le(f, 0);
+    write_fourcc(avi_data, "RIFF");
+    const size_t riff_size_pos = avi_data.size();
+    write_u32_le(avi_data, 0);
+    write_fourcc(avi_data, "AVI ");

-    fwrite("LIST", 4, 1, f);
-    write_u32_le(f, 4 + 8 + 56 + 8 + 40);
-    fwrite("strl", 4, 1, f);
+    write_fourcc(avi_data, "LIST");
+    write_u32_le(avi_data, 4 + 8 + 56 + 8 + 4 + 8 + 56 + 8 + 40);
+    write_fourcc(avi_data, "hdrl");

-    fwrite("strh", 4, 1, f);
-    write_u32_le(f, 56);
-    fwrite("vids", 4, 1, f);
-    fwrite("MJPG", 4, 1, f);
-    write_u32_le(f, 0);
-    write_u16_le(f, 0);
-    write_u16_le(f, 0);
-    write_u32_le(f, 0);
-    write_u32_le(f, 1);
-    write_u32_le(f, fps);
-    write_u32_le(f, 0);
-    write_u32_le(f, num_images);
-    write_u32_le(f, width * height * 3);
-    write_u32_le(f, (uint32_t)-1);
-    write_u32_le(f, 0);
-    write_u16_le(f, 0);
-    write_u16_le(f, 0);
-    write_u16_le(f, 0);
-    write_u16_le(f, 0);
+    write_fourcc(avi_data, "avih");
+    write_u32_le(avi_data, 56);
+    write_u32_le(avi_data, 1000000 / fps);
+    write_u32_le(avi_data, 0);
+    write_u32_le(avi_data, 0);
+    write_u32_le(avi_data, 0x110);
+    write_u32_le(avi_data, num_images);
+    write_u32_le(avi_data, 0);
+    write_u32_le(avi_data, 1);
+    write_u32_le(avi_data, width * height * 3);
+    write_u32_le(avi_data, width);
+    write_u32_le(avi_data, height);
+    write_u32_le(avi_data, 0);
+    write_u32_le(avi_data, 0);
+    write_u32_le(avi_data, 0);
+    write_u32_le(avi_data, 0);

-    fwrite("strf", 4, 1, f);
-    write_u32_le(f, 40);
-    write_u32_le(f, 40);
-    write_u32_le(f, width);
-    write_u32_le(f, height);
-    write_u16_le(f, 1);
-    write_u16_le(f, 24);
-    fwrite("MJPG", 4, 1, f);
-    write_u32_le(f, width * height * 3);
-    write_u32_le(f, 0);
-    write_u32_le(f, 0);
-    write_u32_le(f, 0);
-    write_u32_le(f, 0);
+    write_fourcc(avi_data, "LIST");
+    write_u32_le(avi_data, 4 + 8 + 56 + 8 + 40);
+    write_fourcc(avi_data, "strl");

-    fwrite("LIST", 4, 1, f);
-    long movi_size_pos = ftell(f);
-    write_u32_le(f, 0);
-    fwrite("movi", 4, 1, f);
+    write_fourcc(avi_data, "strh");
+    write_u32_le(avi_data, 56);
+    write_fourcc(avi_data, "vids");
+    write_fourcc(avi_data, "MJPG");
+    write_u32_le(avi_data, 0);
+    write_u16_le(avi_data, 0);
+    write_u16_le(avi_data, 0);
+    write_u32_le(avi_data, 0);
+    write_u32_le(avi_data, 1);
+    write_u32_le(avi_data, fps);
+    write_u32_le(avi_data, 0);
+    write_u32_le(avi_data, num_images);
+    write_u32_le(avi_data, width * height * 3);
+    write_u32_le(avi_data, static_cast<uint32_t>(-1));
+    write_u32_le(avi_data, 0);
+    write_u16_le(avi_data, 0);
+    write_u16_le(avi_data, 0);
+    write_u16_le(avi_data, 0);
+    write_u16_le(avi_data, 0);
+
+    write_fourcc(avi_data, "strf");
+    write_u32_le(avi_data, 40);
+    write_u32_le(avi_data, 40);
+    write_u32_le(avi_data, width);
+    write_u32_le(avi_data, height);
+    write_u16_le(avi_data, 1);
+    write_u16_le(avi_data, 24);
+    write_fourcc(avi_data, "MJPG");
+    write_u32_le(avi_data, width * height * 3);
+    write_u32_le(avi_data, 0);
+    write_u32_le(avi_data, 0);
+    write_u32_le(avi_data, 0);
+    write_u32_le(avi_data, 0);
+
+    write_fourcc(avi_data, "LIST");
+    const size_t movi_size_pos = avi_data.size();
+    write_u32_le(avi_data, 0);
+    write_fourcc(avi_data, "movi");

    std::vector<avi_index_entry> index(static_cast<size_t>(num_images));
    std::vector<uint8_t> jpeg_data;
@ -801,55 +879,61 @@ int create_mjpg_avi_from_sd_images(const char* filename, sd_image_t* images, int
            buffer->insert(buffer->end(), src, src + size);
        };

-        if (!stbi_write_jpg_to_func(write_to_buf, &jpeg_data, images[i].width, images[i].height, channels, images[i].data, quality)) {
+        if (!stbi_write_jpg_to_func(write_to_buf, &jpeg_data, images[i].width, images[i].height, channels, images[i].data, mjpg_quality)) {
            fprintf(stderr, "Error: Failed to encode JPEG frame.\n");
-            return -1;
+            return {};
        }

-        fwrite("00dc", 4, 1, f);
-        write_u32_le(f, (uint32_t)jpeg_data.size());
-        index[i].offset = ftell(f) - 8;
+        index[i].offset = static_cast<uint32_t>(avi_data.size());
+        write_fourcc(avi_data, "00dc");
+        write_u32_le(avi_data, static_cast<uint32_t>(jpeg_data.size()));
        index[i].size = (uint32_t)jpeg_data.size();
-        fwrite(jpeg_data.data(), 1, jpeg_data.size(), f);
+        avi_data.insert(avi_data.end(), jpeg_data.begin(), jpeg_data.end());

        if (jpeg_data.size() % 2) {
-            fputc(0, f);
+            avi_data.push_back(0);
        }
    }

-    long cur_pos   = ftell(f);
-    long movi_size = cur_pos - movi_size_pos - 4;
-    fseek(f, movi_size_pos, SEEK_SET);
-    write_u32_le(f, movi_size);
-    fseek(f, cur_pos, SEEK_SET);
+    const size_t movi_size = avi_data.size() - movi_size_pos - 4;
+    patch_u32_le(avi_data, movi_size_pos, static_cast<uint32_t>(movi_size));

-    fwrite("idx1", 4, 1, f);
-    write_u32_le(f, num_images * 16);
+    write_fourcc(avi_data, "idx1");
+    write_u32_le(avi_data, num_images * 16);
    for (int i = 0; i < num_images; i++) {
-        fwrite("00dc", 4, 1, f);
-        write_u32_le(f, 0x10);
-        write_u32_le(f, index[i].offset);
-        write_u32_le(f, index[i].size);
+        write_fourcc(avi_data, "00dc");
+        write_u32_le(avi_data, 0x10);
+        write_u32_le(avi_data, index[i].offset);
+        write_u32_le(avi_data, index[i].size);
    }

-    cur_pos        = ftell(f);
-    long file_size = cur_pos - riff_size_pos - 4;
-    fseek(f, riff_size_pos, SEEK_SET);
-    write_u32_le(f, file_size);
-    fseek(f, cur_pos, SEEK_SET);
+    const size_t file_size = avi_data.size() - riff_size_pos - 4;
+    patch_u32_le(avi_data, riff_size_pos, static_cast<uint32_t>(file_size));

+    return avi_data;
+}
+
+int create_mjpg_avi_from_sd_images(const char* filename, sd_image_t* images, int num_images, int fps, int quality) {  // Builds the MJPG AVI in memory, then writes the bytes to filename; returns 0 on success, -1 on failure.
+    std::vector<uint8_t> avi_data = create_mjpg_avi_from_sd_images_to_vector(images, num_images, fps, quality);
+    if (avi_data.empty()) {  // The in-memory encoder signals failure with an empty vector.
+        return -1;
+    }
+    if (!write_binary_file_bytes(filename, avi_data)) {
+        perror("Error opening file for writing");  // NOTE(review): write_binary_file_bytes is not visible here; confirm it leaves errno set and that its failures are open failures.
+        return -1;
+    }
    return 0;
 }

 #ifdef SD_USE_WEBP
-int create_animated_webp_from_sd_images(const char* filename, sd_image_t* images, int num_images, int fps, int quality) {
+std::vector<uint8_t> create_animated_webp_from_sd_images_to_vector(sd_image_t* images, int num_images, int fps, int quality) {
    if (num_images == 0) {
        fprintf(stderr, "Error: Image array is empty.\n");
-        return -1;
+        return {};
    }
    if (fps <= 0) {
        fprintf(stderr, "Error: FPS must be positive.\n");
-        return -1;
+        return {};
    }

    const int width    = static_cast<int>(images[0].width);
@ -857,14 +941,14 @@ int create_animated_webp_from_sd_images(const char* filename, sd_image_t* images
    const int channels = static_cast<int>(images[0].channel);
    if (channels != 1 && channels != 3 && channels != 4) {
        fprintf(stderr, "Error: Unsupported channel count: %d\n", channels);
-        return -1;
+        return {};
    }

    WebPAnimEncoderOptions anim_options;
    WebPConfig config;
    if (!WebPAnimEncoderOptionsInit(&anim_options) || !WebPConfigInit(&config)) {
        fprintf(stderr, "Error: Failed to initialize WebP animation encoder.\n");
-        return -1;
+        return {};
    }

    config.quality      = static_cast<float>(quality);
@ -875,13 +959,13 @@ int create_animated_webp_from_sd_images(const char* filename, sd_image_t* images
    }
    if (!WebPValidateConfig(&config)) {
        fprintf(stderr, "Error: Invalid WebP encoder configuration.\n");
-        return -1;
+        return {};
    }

    WebPAnimEncoderPtr enc(WebPAnimEncoderNew(width, height, &anim_options));
    if (enc == nullptr) {
        fprintf(stderr, "Error: Could not create WebPAnimEncoder object.\n");
-        return -1;
+        return {};
    }

    const int frame_duration_ms = std::max(1, static_cast<int>(std::lround(1000.0 / static_cast<double>(fps))));
@ -891,13 +975,13 @@ int create_animated_webp_from_sd_images(const char* filename, sd_image_t* images
        const sd_image_t& image = images[i];
        if (static_cast<int>(image.width) != width || static_cast<int>(image.height) != height) {
            fprintf(stderr, "Error: Frame dimensions do not match.\n");
-            return -1;
+            return {};
        }

        WebPPictureGuard picture;
        if (!picture.initialized) {
            fprintf(stderr, "Error: Failed to initialize WebPPicture.\n");
-            return -1;
+            return {};
        }
        picture.picture.use_argb = 1;
        picture.picture.width    = width;
@ -921,12 +1005,12 @@ int create_animated_webp_from_sd_images(const char* filename, sd_image_t* images

        if (!picture_ok) {
            fprintf(stderr, "Error: Failed to import frame into WebPPicture.\n");
-            return -1;
+            return {};
        }

        if (!WebPAnimEncoderAdd(enc.get(), &picture.picture, timestamp_ms, &config)) {
            fprintf(stderr, "Error: Failed to add frame to animated WebP: %s\n", WebPAnimEncoderGetError(enc.get()));
-            return -1;
+            return {};
        }

        timestamp_ms += frame_duration_ms;
@ -934,52 +1018,50 @@ int create_animated_webp_from_sd_images(const char* filename, sd_image_t* images

    if (!WebPAnimEncoderAdd(enc.get(), nullptr, timestamp_ms, nullptr)) {
        fprintf(stderr, "Error: Failed to finalize animated WebP frames: %s\n", WebPAnimEncoderGetError(enc.get()));
-        return -1;
+        return {};
    }

    WebPDataGuard webp_data;
    if (!WebPAnimEncoderAssemble(enc.get(), &webp_data.data)) {
        fprintf(stderr, "Error: Failed to assemble animated WebP: %s\n", WebPAnimEncoderGetError(enc.get()));
-        return -1;
+        return {};
    }

-    FilePtr f(fopen(filename, "wb"));
-    if (!f) {
+    return std::vector<uint8_t>(webp_data.data.bytes, webp_data.data.bytes + webp_data.data.size);
+}
+
+int create_animated_webp_from_sd_images(const char* filename, sd_image_t* images, int num_images, int fps, int quality) {  // Builds the animated WebP in memory, then writes the bytes to filename; returns 0 on success, -1 on failure.
+    std::vector<uint8_t> webp_data = create_animated_webp_from_sd_images_to_vector(images, num_images, fps, quality);
+    if (webp_data.empty()) {  // The in-memory encoder signals failure with an empty vector.
+        return -1;
+    }
+    if (!write_binary_file_bytes(filename, webp_data)) {
        perror("Error opening file for writing");
        return -1;
    }
-    if (webp_data.data.size > 0 && fwrite(webp_data.data.bytes, 1, webp_data.data.size, f.get()) != webp_data.data.size) {
-        fprintf(stderr, "Error: Failed to write animated WebP file.\n");
-        return -1;
-    }
-
    return 0;
 }
 #endif

 #ifdef SD_USE_WEBM
-int create_webm_from_sd_images(const char* filename, sd_image_t* images, int num_images, int fps, int quality) {
+std::vector<uint8_t> create_webm_from_sd_images_to_vector(sd_image_t* images, int num_images, int fps, int quality) {
    if (num_images == 0) {
        fprintf(stderr, "Error: Image array is empty.\n");
-        return -1;
+        return {};
    }
    if (fps <= 0) {
        fprintf(stderr, "Error: FPS must be positive.\n");
-        return -1;
+        return {};
    }

    const int width  = static_cast<int>(images[0].width);
    const int height = static_cast<int>(images[0].height);
    if (width <= 0 || height <= 0) {
        fprintf(stderr, "Error: Invalid frame dimensions.\n");
-        return -1;
+        return {};
    }

-    mkvmuxer::MkvWriter writer;
-    if (!writer.Open(filename)) {
-        fprintf(stderr, "Error: Could not open WebM file for writing.\n");
-        return -1;
-    }
+    MemoryMkvWriter writer;

    const int ret = [&]() -> int {
        mkvmuxer::Segment segment;
@ -1045,30 +1127,63 @@ int create_webm_from_sd_images(const char* filename, sd_image_t* images, int num
        }
        return 0;
    }();
-    writer.Close();
-    return ret;
+    if (ret != 0) {
+        return {};
+    }
+    return writer.data();
+}
+
+int create_webm_from_sd_images(const char* filename, sd_image_t* images, int num_images, int fps, int quality) {  // Builds the WebM in memory, then writes the bytes to filename; returns 0 on success, -1 on failure.
+    std::vector<uint8_t> webm_data = create_webm_from_sd_images_to_vector(images, num_images, fps, quality);
+    if (webm_data.empty()) {  // The in-memory encoder signals failure with an empty vector.
+        return -1;
+    }
+    if (!write_binary_file_bytes(filename, webm_data)) {
+        perror("Error opening file for writing");  // NOTE(review): write_binary_file_bytes is not visible here; confirm it leaves errno set for perror.
+        return -1;
+    }
+    return 0;
 }
 #endif

+std::vector<uint8_t> create_video_from_sd_images_to_vector(const std::string& output_format,  // Picks an in-memory encoder from output_format and returns whatever that encoder returns.
+                                                           sd_image_t* images,
+                                                           int num_images,
+                                                           int fps,
+                                                           int quality) {
+    std::string format = output_format;  // Work on a copy so the format name can be normalized.
+    std::transform(format.begin(), format.end(), format.begin(),
+                   [](unsigned char c) { return static_cast<char>(tolower(c)); });  // Lower-case so matching is case-insensitive.
+    if (!format.empty() && format[0] == '.') {  // Accept extension-style input such as ".webm" by dropping the leading dot.
+        format.erase(format.begin());
+    }
+
+#ifdef SD_USE_WEBM
+    if (format == "webm") {
+        return create_webm_from_sd_images_to_vector(images, num_images, fps, quality);
+    }
+#endif
+
+#ifdef SD_USE_WEBP
+    if (format == "webp") {
+        return create_animated_webp_from_sd_images_to_vector(images, num_images, fps, quality);
+    }
+#endif
+
+    return create_mjpg_avi_from_sd_images_to_vector(images, num_images, fps, quality);  // Every other format, including webm/webp when that support is compiled out, falls back to MJPG AVI.
+}
+
 int create_video_from_sd_images(const char* filename, sd_image_t* images, int num_images, int fps, int quality) {
    std::string path                = filename ? filename : "";
    auto pos                        = path.find_last_of('.');
    std::string ext                 = pos == std::string::npos ? "" : path.substr(pos);
-    for (char& ch : ext) {
-        ch = static_cast<char>(tolower(static_cast<unsigned char>(ch)));
+    std::vector<uint8_t> video_data = create_video_from_sd_images_to_vector(ext, images, num_images, fps, quality);  // Encode in memory first; ext (leading '.' included, possibly empty) selects the container format.
+    if (video_data.empty()) {  // An empty vector is the encoders' failure signal.
+        return -1;
    }
-
-#ifdef SD_USE_WEBM
-    if (ext == ".webm") {
-        return create_webm_from_sd_images(filename, images, num_images, fps, quality);
+    if (!write_binary_file_bytes(filename, video_data)) {  // NOTE(review): a null filename yields an empty path above but is still passed through here; confirm write_binary_file_bytes tolerates a null path.
+        perror("Error opening file for writing");
+        return -1;
    }
-#endif
-
-#ifdef SD_USE_WEBP
-    if (ext == ".webp") {
-        return create_animated_webp_from_sd_images(filename, images, num_images, fps, quality);
-    }
-#endif
-
-    return create_mjpg_avi_from_sd_images(filename, images, num_images, fps, quality);
+    return 0;
 }
--- a/examples/common/media_io.h
+++ b/examples/common/media_io.h
@ -58,6 +58,10 @@ int create_mjpg_avi_from_sd_images(const char* filename,
                                   int num_images,
                                   int fps,
                                   int quality = 90);
+std::vector<uint8_t> create_mjpg_avi_from_sd_images_to_vector(sd_image_t* images,  // In-memory variant: returns the MJPG AVI bytes, or an empty vector on failure.
+                                                              int num_images,
+                                                              int fps,
+                                                              int quality = 90);

 #ifdef SD_USE_WEBP
 int create_animated_webp_from_sd_images(const char* filename,
@ -65,6 +69,10 @@ int create_animated_webp_from_sd_images(const char* filename,
                                        int num_images,
                                        int fps,
                                        int quality = 90);
+std::vector<uint8_t> create_animated_webp_from_sd_images_to_vector(sd_image_t* images,  // In-memory variant: returns the animated WebP bytes, or an empty vector on failure.
+                                                                   int num_images,
+                                                                   int fps,
+                                                                   int quality = 90);
 #endif

 #ifdef SD_USE_WEBM
@ -73,6 +81,10 @@ int create_webm_from_sd_images(const char* filename,
                               int num_images,
                               int fps,
                               int quality = 90);
+std::vector<uint8_t> create_webm_from_sd_images_to_vector(sd_image_t* images,  // In-memory variant: returns the WebM bytes, or an empty vector on failure.
+                                                          int num_images,
+                                                          int fps,
+                                                          int quality = 90);
 #endif

 int create_video_from_sd_images(const char* filename,
@ -80,5 +92,10 @@ int create_video_from_sd_images(const char* filename,
                                int num_images,
                                int fps,
                                int quality = 90);
+std::vector<uint8_t> create_video_from_sd_images_to_vector(const std::string& output_format,  // output_format is matched case-insensitively with an optional leading '.'; formats other than webm/webp produce MJPG AVI.
+                                                           sd_image_t* images,
+                                                           int num_images,
+                                                           int fps,
+                                                           int quality = 90);

 #endif  // __MEDIA_IO_H__
--- a/examples/server/api.md
+++ b/examples/server/api.md
@ -9,7 +9,7 @@ The server currently exposes three API families:
 - `sdcpp API` under `/sdcpp/v1/...`

 The `sdcpp API` is the native API surface.
-Its request schema is also the canonical schema for `sd_cpp_extra_args`.
+Its request schema is the same schema used by `sd_cpp_extra_args`.

 Global LoRA rule:

@ -55,8 +55,6 @@ Current endpoints include:
 - `POST /sdcpp/v1/jobs/{id}/cancel`
 - `POST /sdcpp/v1/vid_gen`

-`POST /sdcpp/v1/vid_gen` is currently exposed but returns `501 Not Implemented`.
-
 ## `sd_cpp_extra_args`

 `sd_cpp_extra_args` is an extension mechanism for the compatibility APIs.
@ -79,12 +77,12 @@ Behavior:
 - The JSON block is parsed using the same field rules as the `sdcpp API`.
 - The block is removed from the final prompt before generation.

-Intended use:
+Supported use:

 - extend `OpenAI API` requests with native `stable-diffusion.cpp` controls
 - extend `sdapi` requests with native `stable-diffusion.cpp` controls

-Not intended use:
+Unsupported use:

 - do not use `sd_cpp_extra_args` with `/sdcpp/v1/*`

@ -372,20 +370,25 @@ Field types:

 Returns frontend-friendly capability metadata.

-Typical contents:
+The mode-aware fields are the primary interface. The top-level compatibility fields are deprecated mirrors kept for older clients.

-| Field | Type |
-| --- | --- |
-| `model` | `object` |
-| `defaults` | `object` |
-| `loras` | `array<object>` |
-| `samplers` | `array<string>` |
-| `schedulers` | `array<string>` |
-| `output_formats` | `array<string>` |
-| `limits` | `object` |
-| `features` | `object` |
+Top-level fields:

-Nested fields currently returned:
+| Field | Type | Notes |
+| --- | --- | --- |
+| `model` | `object` | Loaded model metadata |
+| `current_mode` | `string` | The native generation mode mirrored by top-level compatibility fields |
+| `supported_modes` | `array<string>` | Supported native modes such as `img_gen` or `vid_gen` |
+| `defaults` | `object` | Deprecated compatibility mirror of `defaults_by_mode[current_mode]` |
+| `output_formats` | `array<string>` | Deprecated compatibility mirror of `output_formats_by_mode[current_mode]` |
+| `features` | `object` | Deprecated compatibility mirror of `features_by_mode[current_mode]` |
+| `defaults_by_mode` | `object` | Explicit defaults for each supported mode |
+| `output_formats_by_mode` | `object` | Explicit output formats for each supported mode |
+| `features_by_mode` | `object` | Explicit feature flags for each supported mode |
+| `samplers` | `array<string>` | Available sampling methods |
+| `schedulers` | `array<string>` | Available schedulers |
+| `loras` | `array<object>` | Available LoRA entries |
+| `limits` | `object` | Shared queue and size limits |

 `model`

@ -395,50 +398,24 @@ Nested fields currently returned:
 | `model.stem` | `string` |
 | `model.path` | `string` |

-`defaults`
+Compatibility rules:
+
+- `defaults`, `output_formats`, and `features` are deprecated compatibility mirrors
+- those three top-level fields always mirror `current_mode`
+- `supported_modes`, `defaults_by_mode`, `output_formats_by_mode`, and `features_by_mode` are the mode-aware fields
+
+Mode-aware objects:

 | Field | Type |
 | --- | --- |
-| `defaults.prompt` | `string` |
-| `defaults.negative_prompt` | `string` |
-| `defaults.clip_skip` | `integer` |
-| `defaults.width` | `integer` |
-| `defaults.height` | `integer` |
-| `defaults.strength` | `number` |
-| `defaults.seed` | `integer` |
-| `defaults.batch_count` | `integer` |
-| `defaults.auto_resize_ref_image` | `boolean` |
-| `defaults.increase_ref_index` | `boolean` |
-| `defaults.control_strength` | `number` |
-| `defaults.sample_params` | `object` |
-| `defaults.sample_params.scheduler` | `string` |
-| `defaults.sample_params.sample_method` | `string` |
-| `defaults.sample_params.sample_steps` | `integer` |
-| `defaults.sample_params.eta` | `number \| null` |
-| `defaults.sample_params.shifted_timestep` | `integer` |
-| `defaults.sample_params.flow_shift` | `number \| null` |
-| `defaults.sample_params.guidance` | `object` |
-| `defaults.sample_params.guidance.txt_cfg` | `number` |
-| `defaults.sample_params.guidance.img_cfg` | `number \| null` |
-| `defaults.sample_params.guidance.distilled_guidance` | `number` |
-| `defaults.sample_params.guidance.slg` | `object` |
-| `defaults.sample_params.guidance.slg.layers` | `array<integer>` |
-| `defaults.sample_params.guidance.slg.layer_start` | `number` |
-| `defaults.sample_params.guidance.slg.layer_end` | `number` |
-| `defaults.sample_params.guidance.slg.scale` | `number` |
-| `defaults.vae_tiling_params` | `object` |
-| `defaults.vae_tiling_params.enabled` | `boolean` |
-| `defaults.vae_tiling_params.tile_size_x` | `integer` |
-| `defaults.vae_tiling_params.tile_size_y` | `integer` |
-| `defaults.vae_tiling_params.target_overlap` | `number` |
-| `defaults.vae_tiling_params.rel_size_x` | `number` |
-| `defaults.vae_tiling_params.rel_size_y` | `number` |
-| `defaults.cache_mode` | `string` |
-| `defaults.cache_option` | `string` |
-| `defaults.scm_mask` | `string` |
-| `defaults.scm_policy_dynamic` | `boolean` |
-| `defaults.output_format` | `string` |
-| `defaults.output_compression` | `integer` |
+| `defaults_by_mode.img_gen` | `object` |
+| `defaults_by_mode.vid_gen` | `object` |
+| `output_formats_by_mode.img_gen` | `array<string>` |
+| `output_formats_by_mode.vid_gen` | `array<string>` |
+| `features_by_mode.img_gen` | `object` |
+| `features_by_mode.vid_gen` | `object` |
+
+Shared nested fields:

 `loras`

@ -458,19 +435,100 @@ Nested fields currently returned:
 | `limits.max_batch_count` | `integer` |
 | `limits.max_queue_size` | `integer` |

-`features`
+Shared default fields used by both `img_gen` and `vid_gen`:

 | Field | Type |
 | --- | --- |
-| `features.init_image` | `boolean` |
-| `features.mask_image` | `boolean` |
-| `features.control_image` | `boolean` |
-| `features.ref_images` | `boolean` |
-| `features.lora` | `boolean` |
-| `features.vae_tiling` | `boolean` |
-| `features.cache` | `boolean` |
-| `features.cancel_queued` | `boolean` |
-| `features.cancel_generating` | `boolean` |
+| `prompt` | `string` |
+| `negative_prompt` | `string` |
+| `clip_skip` | `integer` |
+| `width` | `integer` |
+| `height` | `integer` |
+| `strength` | `number` |
+| `seed` | `integer` |
+| `sample_params` | `object` |
+| `sample_params.scheduler` | `string` |
+| `sample_params.sample_method` | `string` |
+| `sample_params.sample_steps` | `integer` |
+| `sample_params.eta` | `number \| null` |
+| `sample_params.shifted_timestep` | `integer` |
+| `sample_params.flow_shift` | `number \| null` |
+| `sample_params.guidance.txt_cfg` | `number` |
+| `sample_params.guidance.img_cfg` | `number \| null` |
+| `sample_params.guidance.distilled_guidance` | `number` |
+| `sample_params.guidance.slg.layers` | `array<integer>` |
+| `sample_params.guidance.slg.layer_start` | `number` |
+| `sample_params.guidance.slg.layer_end` | `number` |
+| `sample_params.guidance.slg.scale` | `number` |
+| `vae_tiling_params` | `object` |
+| `vae_tiling_params.enabled` | `boolean` |
+| `vae_tiling_params.tile_size_x` | `integer` |
+| `vae_tiling_params.tile_size_y` | `integer` |
+| `vae_tiling_params.target_overlap` | `number` |
+| `vae_tiling_params.rel_size_x` | `number` |
+| `vae_tiling_params.rel_size_y` | `number` |
+| `cache_mode` | `string` |
+| `cache_option` | `string` |
+| `scm_mask` | `string` |
+| `scm_policy_dynamic` | `boolean` |
+| `output_format` | `string` |
+| `output_compression` | `integer` |
+
+`img_gen`-specific default fields:
+
+| Field | Type |
+| --- | --- |
+| `batch_count` | `integer` |
+| `auto_resize_ref_image` | `boolean` |
+| `increase_ref_index` | `boolean` |
+| `control_strength` | `number` |
+
+`vid_gen`-specific default fields:
+
+| Field | Type |
+| --- | --- |
+| `video_frames` | `integer` |
+| `fps` | `integer` |
+| `moe_boundary` | `number` |
+| `vace_strength` | `number` |
+| `high_noise_sample_params` | `object` |
+| `high_noise_sample_params.scheduler` | `string` |
+| `high_noise_sample_params.sample_method` | `string` |
+| `high_noise_sample_params.sample_steps` | `integer` |
+| `high_noise_sample_params.eta` | `number \| null` |
+| `high_noise_sample_params.shifted_timestep` | `integer` |
+| `high_noise_sample_params.flow_shift` | `number \| null` |
+| `high_noise_sample_params.guidance.txt_cfg` | `number` |
+| `high_noise_sample_params.guidance.img_cfg` | `number \| null` |
+| `high_noise_sample_params.guidance.distilled_guidance` | `number` |
+| `high_noise_sample_params.guidance.slg.layers` | `array<integer>` |
+| `high_noise_sample_params.guidance.slg.layer_start` | `number` |
+| `high_noise_sample_params.guidance.slg.layer_end` | `number` |
+| `high_noise_sample_params.guidance.slg.scale` | `number` |
+
+Fields returned in `features_by_mode.img_gen`:
+
+- `init_image`
+- `mask_image`
+- `control_image`
+- `ref_images`
+- `lora`
+- `vae_tiling`
+- `cache`
+- `cancel_queued`
+- `cancel_generating`
+
+Fields returned in `features_by_mode.vid_gen`:
+
+- `init_image`
+- `end_image`
+- `control_frames`
+- `high_noise_sample_params`
+- `lora`
+- `vae_tiling`
+- `cache`
+- `cancel_queued`
+- `cancel_generating`

 #### `POST /sdcpp/v1/img_gen`

@ -521,9 +579,7 @@ Typical status codes:
 - `409 Conflict`
 - `410 Gone`

-### Canonical Request Schema
-
-The `sdcpp API` request body is the canonical native schema.
+### Request Body

 Example:

@ -612,7 +668,7 @@ Channel expectations:
 If omitted or null:

 - single-image fields map to an empty `sd_image_t`
- array fields map to `nullptr + count = 0`
+- array fields map to an empty C-style array, represented as `pointer = nullptr` and `count = 0`

 ### Field Mapping Summary

@ -686,11 +742,11 @@ HTTP-only output fields:
 | `output_format` | `string` |
 | `output_compression` | `integer` |

-### Optional Field Semantics
+### Optional Field Handling

-Clients should preserve unset semantics for optional sampling fields.
+Optional sampling fields may be omitted.

-If a user has not explicitly provided one of these fields, the client should omit it instead of injecting a guessed fallback:
+When omitted, backend defaults apply to these fields:

 - `sample_params.scheduler`
 - `sample_params.sample_method`
@ -766,29 +822,394 @@ Example cancelled job:
 }
 ```

-### Validation and Retention
+### Submission Errors

-Recommended behavior:
+`POST /sdcpp/v1/img_gen` may return:

- malformed JSON returns `400`
- invalid image payloads return `400`
- invalid parameter structure returns `400`
- queue full returns `429` or `503`
- accepted runtime failures transition the job to `failed`
- unsupported in-progress cancellation may return `409`
+- `202 Accepted` when the job is created
+- `400 Bad Request` for an empty body, unsupported model mode, invalid JSON, or invalid generation parameters
+- `429 Too Many Requests` when the job queue is full
+- `500 Internal Server Error` for unexpected server exceptions during submission

-Recommended retention controls:
+### `vid_gen`

- pending job limit
- completed job TTL
- failed job TTL
+The following section documents the native async contract for video generation.

-### Future `vid_gen`
+#### `POST /sdcpp/v1/vid_gen`

-Future `vid_gen` should reuse the same async job model:
+Submits an async video generation job.

- `POST /sdcpp/v1/vid_gen`
- `GET /sdcpp/v1/jobs/{id}`
- `POST /sdcpp/v1/jobs/{id}/cancel`
+Successful submission returns `202 Accepted`.

-Its request body should mirror `sd_vid_gen_params_t` in the same way that `img_gen` mirrors `sd_img_gen_params_t`.
+Example response:
+
+```json
+{
+  "id": "job_01HTXYZVID",
+  "kind": "vid_gen",
+  "status": "queued",
+  "created": 1775401200,
+  "poll_url": "/sdcpp/v1/jobs/job_01HTXYZVID"
+}
+```
+
+Response fields:
+
+| Field | Type |
+| --- | --- |
+| `id` | `string` |
+| `kind` | `string` |
+| `status` | `string` |
+| `created` | `integer` |
+| `poll_url` | `string` |
+
+### Request Body
+
+Compared with `img_gen`, the `vid_gen` request body differs as follows:
+
+- `vid_gen` is a single video sequence job, so `batch_count` is not part of the request schema
+- `ref_images`, `mask_image`, `control_image`, `control_strength`, and `embed_image_metadata` are not part of the request schema
+- `vid_gen` adds `end_image`, `control_frames`, `high_noise_sample_params`, `video_frames`, `fps`, `moe_boundary`, and `vace_strength`
+
+Example:
+
+```json
+{
+  "prompt": "a cat walking through a rainy alley",
+  "negative_prompt": "",
+  "clip_skip": -1,
+  "width": 832,
+  "height": 480,
+  "strength": 0.75,
+  "seed": -1,
+  "video_frames": 33,
+  "fps": 16,
+  "moe_boundary": 0.875,
+  "vace_strength": 1.0,
+
+  "init_image": null,
+  "end_image": null,
+  "control_frames": [],
+
+  "sample_params": {
+    "scheduler": "discrete",
+    "sample_method": "euler",
+    "sample_steps": 28,
+    "eta": 1.0,
+    "shifted_timestep": 0,
+    "custom_sigmas": [],
+    "flow_shift": 0.0,
+    "guidance": {
+      "txt_cfg": 7.0,
+      "img_cfg": 7.0,
+      "distilled_guidance": 3.5,
+      "slg": {
+        "layers": [7, 8, 9],
+        "layer_start": 0.01,
+        "layer_end": 0.2,
+        "scale": 0.0
+      }
+    }
+  },
+
+  "high_noise_sample_params": {
+    "scheduler": "discrete",
+    "sample_method": "euler",
+    "sample_steps": -1,
+    "eta": 1.0,
+    "shifted_timestep": 0,
+    "flow_shift": 0.0,
+    "guidance": {
+      "txt_cfg": 7.0,
+      "img_cfg": 7.0,
+      "distilled_guidance": 3.5,
+      "slg": {
+        "layers": [7, 8, 9],
+        "layer_start": 0.01,
+        "layer_end": 0.2,
+        "scale": 0.0
+      }
+    }
+  },
+
+  "lora": [],
+
+  "vae_tiling_params": {
+    "enabled": false,
+    "tile_size_x": 0,
+    "tile_size_y": 0,
+    "target_overlap": 0.5,
+    "rel_size_x": 0.0,
+    "rel_size_y": 0.0
+  },
+
+  "cache_mode": "disabled",
+  "cache_option": "",
+  "scm_mask": "",
+  "scm_policy_dynamic": true,
+
+  "output_format": "webm",
+  "output_compression": 100
+}
+```
+
+### LoRA Rules
+
+- The server only accepts explicit LoRA entries from the `lora` field.
+- Prompt-embedded `<lora:...>` tags are intentionally unsupported.
+- `lora[].is_high_noise` controls whether a LoRA applies only to the high-noise stage.
+
+### Image and Frame Encoding Rules
+
+Any image field accepts:
+
+- a raw base64 string, or
+- a data URL such as `data:image/png;base64,...`
+
+Channel expectations:
+
+- `init_image`: 3 channels
+- `end_image`: 3 channels
+- `control_frames[]`: 3 channels
+
+Frame ordering rules:
+
+- `control_frames[]` order is the conditioning frame order
+- `control_frames[]` is preserved in request order
+
+If omitted or null:
+
+- single-image fields map to an empty `sd_image_t`
+- array fields map to an empty C-style array, represented as `pointer = nullptr` and `count = 0`
+
+### Field Mapping Summary
+
+Top-level scalar fields:
+
+| Field | Type |
+| --- | --- |
+| `prompt` | `string` |
+| `negative_prompt` | `string` |
+| `clip_skip` | `integer` |
+| `width` | `integer` |
+| `height` | `integer` |
+| `strength` | `number` |
+| `seed` | `integer` |
+| `video_frames` | `integer` |
+| `fps` | `integer` |
+| `moe_boundary` | `number` |
+| `vace_strength` | `number` |
+
+Image and frame fields:
+
+| Field | Type |
+| --- | --- |
+| `init_image` | `string \| null` |
+| `end_image` | `string \| null` |
+| `control_frames` | `array<string>` |
+
+LoRA fields:
+
+| Field | Type |
+| --- | --- |
+| `lora[].path` | `string` |
+| `lora[].multiplier` | `number` |
+| `lora[].is_high_noise` | `boolean` |
+
+Sampling fields:
+
+| Field | Type |
+| --- | --- |
+| `sample_params.scheduler` | `string` |
+| `sample_params.sample_method` | `string` |
+| `sample_params.sample_steps` | `integer` |
+| `sample_params.eta` | `number` |
+| `sample_params.shifted_timestep` | `integer` |
+| `sample_params.custom_sigmas` | `array<number>` |
+| `sample_params.flow_shift` | `number` |
+| `sample_params.guidance.txt_cfg` | `number` |
+| `sample_params.guidance.img_cfg` | `number` |
+| `sample_params.guidance.distilled_guidance` | `number` |
+| `sample_params.guidance.slg.layers` | `array<integer>` |
+| `sample_params.guidance.slg.layer_start` | `number` |
+| `sample_params.guidance.slg.layer_end` | `number` |
+| `sample_params.guidance.slg.scale` | `number` |
+
+High-noise sampling fields:
+
+| Field | Type |
+| --- | --- |
+| `high_noise_sample_params.scheduler` | `string` |
+| `high_noise_sample_params.sample_method` | `string` |
+| `high_noise_sample_params.sample_steps` | `integer` |
+| `high_noise_sample_params.eta` | `number` |
+| `high_noise_sample_params.shifted_timestep` | `integer` |
+| `high_noise_sample_params.flow_shift` | `number` |
+| `high_noise_sample_params.guidance.txt_cfg` | `number` |
+| `high_noise_sample_params.guidance.img_cfg` | `number` |
+| `high_noise_sample_params.guidance.distilled_guidance` | `number` |
+| `high_noise_sample_params.guidance.slg.layers` | `array<integer>` |
+| `high_noise_sample_params.guidance.slg.layer_start` | `number` |
+| `high_noise_sample_params.guidance.slg.layer_end` | `number` |
+| `high_noise_sample_params.guidance.slg.scale` | `number` |
+
+Other native fields:
+
+| Field | Type |
+| --- | --- |
+| `vae_tiling_params` | `object` |
+| `cache_mode` | `string` |
+| `cache_option` | `string` |
+| `scm_mask` | `string` |
+| `scm_policy_dynamic` | `boolean` |
+
+HTTP-only output fields:
+
+| Field | Type |
+| --- | --- |
+| `output_format` | `string` |
+| `output_compression` | `integer` |
+
+For `vid_gen`, `output_format` and `output_compression` control container encoding.
+`fps` is the frame rate passed to the container encoder and is echoed in the completed job result.
+
+Allowed `output_format` values:
+
+- `webm`
+- `webp`
+- `avi`
+
+Output format behavior:
+
+- `output_format` defaults to `webm`
+- `webp` means animated WebP
+- `avi` means MJPG AVI
+- `webm` requires the server to be built with WebM support; otherwise the request returns `400`
+
+### Result Payload
+
+Completed jobs return one encoded container payload, not a list of per-frame images.
+
+Result fields:
+
+- `result.b64_json` contains the whole encoded container file as base64
+- `result.mime_type` identifies the media type
+- `result.output_format` echoes the selected container format
+- `result.fps` echoes the effective playback FPS
+- `result.frame_count` reports the actual decoded frame count used to build the container
+
+Expected MIME types:
+
+| `output_format` | `mime_type` |
+| --- | --- |
+| `webm` | `video/webm` |
+| `webp` | `image/webp` |
+| `avi` | `video/x-msvideo` |
+
+### Optional Field Handling
+
+Optional sampling fields may be omitted.
+
+When omitted, backend defaults apply to these fields:
+
+- `sample_params.scheduler`
+- `sample_params.sample_method`
+- `sample_params.eta`
+- `sample_params.flow_shift`
+- `sample_params.guidance.img_cfg`
+- `high_noise_sample_params.scheduler`
+- `high_noise_sample_params.sample_method`
+- `high_noise_sample_params.eta`
+- `high_noise_sample_params.flow_shift`
+- `high_noise_sample_params.guidance.img_cfg`
+
+`high_noise_sample_params` may also be omitted entirely.
+
+### Frame Count Semantics
+
+`video_frames` is the requested target length, but the current core video path internally normalizes the effective frame count to the largest `4n + 1` value that does not exceed the requested count.
+
+Examples:
+
+- `video_frames = 33` stays `33`
+- `video_frames = 34` becomes `33`
+- `video_frames = 32` becomes `29`
+
+The completed job payload includes the actual decoded `frame_count`.
+
+### Completion Result
+
+Example completed job:
+
+```json
+{
+  "id": "job_01HTXYZVID",
+  "kind": "vid_gen",
+  "status": "completed",
+  "created": 1775401200,
+  "started": 1775401203,
+  "completed": 1775401215,
+  "queue_position": 0,
+  "result": {
+    "output_format": "webm",
+    "mime_type": "video/webm",
+    "fps": 16,
+    "frame_count": 33,
+    "b64_json": "GkXfo59ChoEBQveBAULygQRC84EIQo..."
+  },
+  "error": null
+}
+```
+
+The encoded `.webm`, animated `.webp`, or `.avi` container is returned inline as base64 in `result.b64_json`.
+
+### Failure Result
+
+Example failed job:
+
+```json
+{
+  "id": "job_01HTXYZVID",
+  "kind": "vid_gen",
+  "status": "failed",
+  "created": 1775401200,
+  "started": 1775401203,
+  "completed": 1775401204,
+  "queue_position": 0,
+  "result": null,
+  "error": {
+    "code": "generation_failed",
+    "message": "generate_video returned no results"
+  }
+}
+```
+
+### Cancelled Result
+
+Example cancelled job:
+
+```json
+{
+  "id": "job_01HTXYZVID",
+  "kind": "vid_gen",
+  "status": "cancelled",
+  "created": 1775401200,
+  "started": null,
+  "completed": 1775401202,
+  "queue_position": 0,
+  "result": null,
+  "error": {
+    "code": "cancelled",
+    "message": "job cancelled by client"
+  }
+}
+```
+
+### Submission Errors
+
+`POST /sdcpp/v1/vid_gen` may return:
+
+- `202 Accepted` when the job is created
+- `400 Bad Request` for an empty body, unsupported model mode, invalid JSON, invalid generation parameters, or an unsupported output format
+- `429 Too Many Requests` when the job queue is full
+- `500 Internal Server Error` for unexpected server exceptions during submission
--- a/examples/server/async_jobs.cpp
+++ b/examples/server/async_jobs.cpp
@ -95,6 +95,10 @@ bool cancel_queued_job(AsyncJobManager& manager, AsyncGenerationJob& job) {
    job.status       = AsyncJobStatus::Cancelled;
    job.completed_at = unix_timestamp_now();
    job.result_images_b64.clear();
+    job.result_media_b64.clear();
+    job.result_media_mime_type.clear();
+    job.result_frame_count = 0;
+    job.result_fps         = 0;
    job.error_code         = "cancelled";
    job.error_message      = "job cancelled by client";
    return true;
@ -122,6 +126,15 @@ json make_async_job_json(const AsyncJobManager& manager, const AsyncGenerationJo
    }

    if (job.status == AsyncJobStatus::Completed) {
+        if (job.kind == AsyncJobKind::VidGen) {
+            result["result"] = {
+                {"output_format", job.vid_gen.output_format},
+                {"mime_type", job.result_media_mime_type},
+                {"fps", job.result_fps},
+                {"frame_count", job.result_frame_count},
+                {"b64_json", job.result_media_b64},
+            };
+        } else {
            json images = json::array();
            for (size_t i = 0; i < job.result_images_b64.size(); ++i) {
                images.push_back({{"index", i}, {"b64_json", job.result_images_b64[i]}});
@ -130,6 +143,7 @@ json make_async_job_json(const AsyncJobManager& manager, const AsyncGenerationJo
                {"output_format", job.img_gen.output_format},
                {"images", images},
            };
+        }
        result["error"] = nullptr;
    } else if (job.status == AsyncJobStatus::Failed ||
               job.status == AsyncJobStatus::Cancelled) {
@ -156,16 +170,15 @@ bool execute_img_gen_job(ServerRuntime& runtime,
    sd_img_gen_params_t params = job.img_gen.to_sd_img_gen_params_t();

    SDImageVec results;
-    int num_results = 0;

    {
        std::lock_guard<std::mutex> lock(*runtime.sd_ctx_mutex);
        sd_image_t* raw_results = generate_image(runtime.sd_ctx, &params);
-        num_results             = params.batch_count;
-        results.adopt(raw_results, num_results);
+        results.adopt(raw_results, params.batch_count);
    }

-    if (results.empty() || num_results <= 0) {
+    const int num_results = results.count();
+    if (num_results <= 0) {
        error_message = "generate_image returned no results";
        return false;
    }
@ -208,6 +221,47 @@ bool execute_img_gen_job(ServerRuntime& runtime,
    return true;
 }

+// Runs one video-generation job and reports the encoded container through the
+// output parameters.
+//
+// Frames are generated while holding the shared context mutex; container
+// encoding and base64 conversion happen after the lock is released.
+//
+// Returns true on success. Returns false and sets `error_message` when
+// generation yields no frames or the container cannot be encoded; the output
+// parameters are left untouched in that case.
+bool execute_vid_gen_job(ServerRuntime& runtime,
+                         AsyncGenerationJob& job,
+                         std::string& output_media_b64,
+                         std::string& output_media_mime_type,
+                         int& output_frame_count,
+                         int& output_fps,
+                         std::string& error_message) {
+    sd_vid_gen_params_t vid_params = job.vid_gen.to_sd_vid_gen_params_t();
+
+    SDImageVec frames;
+    {
+        std::lock_guard<std::mutex> guard(*runtime.sd_ctx_mutex);
+        int generated = 0;
+        // Keep the call and the adopt as separate statements: `generated` is
+        // written through the out-pointer and must be read afterwards.
+        sd_image_t* raw_frames = generate_video(runtime.sd_ctx, &vid_params, &generated);
+        frames.adopt(raw_frames, generated);
+    }
+
+    // Trust the container's own count rather than the raw out-parameter.
+    const int frame_count = frames.count();
+    if (frame_count <= 0) {
+        error_message = "generate_video returned no results";
+        return false;
+    }
+
+    auto& request = job.vid_gen;
+    std::vector<uint8_t> encoded = create_video_from_sd_images_to_vector(request.output_format,
+                                                                         frames.data(),
+                                                                         frame_count,
+                                                                         request.gen_params.fps,
+                                                                         request.output_compression);
+    if (encoded.empty()) {
+        error_message = "failed to encode generated video container";
+        return false;
+    }
+
+    output_media_b64       = base64_encode(encoded);
+    output_media_mime_type = video_mime_type(request.output_format);
+    output_frame_count     = frame_count;
+    output_fps             = request.gen_params.fps;
+    return true;
+}
+
 void async_job_worker(ServerRuntime& runtime) {
    AsyncJobManager& manager = *runtime.async_job_manager;

@ -240,11 +294,23 @@ void async_job_worker(ServerRuntime& runtime) {
        }

        std::vector<std::string> output_images;
+        std::string output_media_b64;
+        std::string output_media_mime_type;
+        int output_frame_count = 0;
+        int output_fps         = 0;
        std::string error_message;
        bool ok = false;

        if (job->kind == AsyncJobKind::ImgGen) {
            ok = execute_img_gen_job(runtime, *job, output_images, error_message);
+        } else if (job->kind == AsyncJobKind::VidGen) {
+            ok = execute_vid_gen_job(runtime,
+                                     *job,
+                                     output_media_b64,
+                                     output_media_mime_type,
+                                     output_frame_count,
+                                     output_fps,
+                                     error_message);
        } else {
            error_message = "unsupported job kind";
        }
@ -260,6 +326,10 @@ void async_job_worker(ServerRuntime& runtime) {
            if (ok) {
                job->status                 = AsyncJobStatus::Completed;
                job->result_images_b64      = std::move(output_images);
+                job->result_media_b64       = std::move(output_media_b64);
+                job->result_media_mime_type = std::move(output_media_mime_type);
+                job->result_frame_count     = output_frame_count;
+                job->result_fps             = output_fps;
                job->error_code.clear();
                job->error_message.clear();
            } else {
@ -267,6 +337,10 @@ void async_job_worker(ServerRuntime& runtime) {
                job->error_code    = "generation_failed";
                job->error_message = error_message.empty() ? "unknown generation error" : error_message;
                job->result_images_b64.clear();
+                job->result_media_b64.clear();
+                job->result_media_mime_type.clear();
+                job->result_frame_count = 0;
+                job->result_fps         = 0;
            }

            purge_expired_jobs(manager);
--- a/examples/server/async_jobs.h
+++ b/examples/server/async_jobs.h
@ -36,7 +36,12 @@ struct AsyncGenerationJob {
    int64_t started_at    = 0;
    int64_t completed_at  = 0;
    ImgGenJobRequest img_gen;
+    VidGenJobRequest vid_gen;
    std::vector<std::string> result_images_b64;
+    std::string result_media_b64;
+    std::string result_media_mime_type;
+    int result_frame_count = 0;
+    int result_fps         = 0;
    std::string error_code;
    std::string error_message;
 };
@ -63,4 +68,11 @@ bool execute_img_gen_job(ServerRuntime& runtime,
                         AsyncGenerationJob& job,
                         std::vector<std::string>& output_images,
                         std::string& error_message);
+bool execute_vid_gen_job(ServerRuntime& runtime,
+                         AsyncGenerationJob& job,
+                         std::string& output_media_b64,
+                         std::string& output_media_mime_type,
+                         int& output_frame_count,
+                         int& output_fps,
+                         std::string& error_message);
 void async_job_worker(ServerRuntime& runtime);
--- a/examples/server/frontend
+++ b/examples/server/frontend
@ -1 +1 @@
-Subproject commit 740475a7a6794dc07fb23e8ec5dc56e7e80aa8c1
+Subproject commit 797ccf80825cc035508ba9b599b2a21953e7f835
--- a/examples/server/routes_openai.cpp
+++ b/examples/server/routes_openai.cpp
@ -253,6 +253,12 @@ void register_openai_api_endpoints(httplib::Server& svr, ServerRuntime& rt) {

    svr.Post("/v1/images/generations", [runtime](const httplib::Request& req, httplib::Response& res) {
        try {
+            if (!runtime_supports_generation_mode(*runtime, IMG_GEN)) {
+                res.status = 400;
+                res.set_content(json({{"error", unsupported_generation_mode_error(IMG_GEN)}}).dump(), "application/json");
+                return;
+            }
+
            ImgGenJobRequest request;
            std::string error_message;
            if (!build_openai_generation_request(req, *runtime, request, error_message)) {
@ -319,6 +325,12 @@ void register_openai_api_endpoints(httplib::Server& svr, ServerRuntime& rt) {

    svr.Post("/v1/images/edits", [runtime](const httplib::Request& req, httplib::Response& res) {
        try {
+            if (!runtime_supports_generation_mode(*runtime, IMG_GEN)) {
+                res.status = 400;
+                res.set_content(json({{"error", unsupported_generation_mode_error(IMG_GEN)}}).dump(), "application/json");
+                return;
+            }
+
            ImgGenJobRequest request;
            std::string error_message;
            if (!build_openai_edit_request(req, *runtime, request, error_message)) {
--- a/examples/server/routes_sdapi.cpp
+++ b/examples/server/routes_sdapi.cpp
@ -246,6 +246,11 @@ void register_sdapi_endpoints(httplib::Server& svr, ServerRuntime& rt) {
                res.set_content(R"({"error":"empty body"})", "application/json");
                return;
            }
+            if (!runtime_supports_generation_mode(*runtime, IMG_GEN)) {
+                res.status = 400;
+                res.set_content(json({{"error", unsupported_generation_mode_error(IMG_GEN)}}).dump(), "application/json");
+                return;
+            }

            json j = json::parse(req.body);
            ImgGenJobRequest request;
--- a/examples/server/routes_sdcpp.cpp
+++ b/examples/server/routes_sdcpp.cpp
@ -75,61 +75,9 @@ static fs::path resolve_display_model_path(const ServerRuntime& runtime) {
    return {};
 }

-static json make_capabilities_json(ServerRuntime& runtime) {
-    refresh_lora_cache(runtime);
-
-    AsyncJobManager& manager  = *runtime.async_job_manager;
-    const auto& defaults      = *runtime.default_gen_params;
-    const auto& sample_params = defaults.sample_params;
+static json make_sample_params_json(const sd_sample_params_t& sample_params, const std::vector<int>& skip_layers) {
    const auto& guidance = sample_params.guidance;
-    const fs::path model_path = resolve_display_model_path(runtime);
-    json samplers             = json::array();
-    json schedulers           = json::array();
-    json output_formats       = json::array({"png", "jpeg"});
-    json available_loras      = json::array();
-
-    for (int i = 0; i < SAMPLE_METHOD_COUNT; ++i) {
-        samplers.push_back(sd_sample_method_name((sample_method_t)i));
-    }
-
-    for (int i = 0; i < SCHEDULER_COUNT; ++i) {
-        schedulers.push_back(sd_scheduler_name((scheduler_t)i));
-    }
-
-#ifdef SD_USE_WEBP
-    output_formats.push_back("webp");
-#endif
-
-    {
-        std::lock_guard<std::mutex> lock(*runtime.lora_mutex);
-        for (const auto& entry : *runtime.lora_cache) {
-            available_loras.push_back({
-                {"name", entry.name},
-                {"path", entry.path},
-            });
-        }
-    }
-
-    json result;
-    result["model"] = {
-        {"name", model_path.filename().u8string()},
-        {"stem", model_path.stem().u8string()},
-        {"path", model_path.u8string()},
-    };
-    result["defaults"] = {
-        {"prompt", defaults.prompt},
-        {"negative_prompt", defaults.negative_prompt},
-        {"clip_skip", defaults.clip_skip},
-        {"width", defaults.width > 0 ? defaults.width : 512},
-        {"height", defaults.height > 0 ? defaults.height : 512},
-        {"strength", defaults.strength},
-        {"seed", defaults.seed},
-        {"batch_count", defaults.batch_count},
-        {"auto_resize_ref_image", defaults.auto_resize_ref_image},
-        {"increase_ref_index", defaults.increase_ref_index},
-        {"control_strength", defaults.control_strength},
-        {"sample_params",
-         {
+    return {
        {"scheduler", capability_scheduler_name(sample_params.scheduler)},
        {"sample_method", capability_sample_method_name(sample_params.sample_method)},
        {"sample_steps", sample_params.sample_steps},
@ -143,33 +91,66 @@ static json make_capabilities_json(ServerRuntime& runtime) {
             {"distilled_guidance", guidance.distilled_guidance},
             {"slg",
              {
-                       {"layers", defaults.skip_layers},
+                  {"layers", skip_layers},
                  {"layer_start", guidance.slg.layer_start},
                  {"layer_end", guidance.slg.layer_end},
                  {"scale", guidance.slg.scale},
              }},
         }},
-         }},
+    };
+}
+
+static json make_img_gen_defaults_json(const SDGenerationParams& defaults, const std::string& output_format) {
+    return {
+        {"prompt", defaults.prompt},
+        {"negative_prompt", defaults.negative_prompt},
+        {"clip_skip", defaults.clip_skip},
+        {"width", defaults.width > 0 ? defaults.width : 512},
+        {"height", defaults.height > 0 ? defaults.height : 512},
+        {"strength", defaults.strength},
+        {"seed", defaults.seed},
+        {"batch_count", defaults.batch_count},
+        {"auto_resize_ref_image", defaults.auto_resize_ref_image},
+        {"increase_ref_index", defaults.increase_ref_index},
+        {"control_strength", defaults.control_strength},
+        {"sample_params", make_sample_params_json(defaults.sample_params, defaults.skip_layers)},
        {"vae_tiling_params", make_vae_tiling_json(defaults.vae_tiling_params)},
        {"cache_mode", defaults.cache_mode},
        {"cache_option", defaults.cache_option},
        {"scm_mask", defaults.scm_mask},
        {"scm_policy_dynamic", defaults.scm_policy_dynamic},
-        {"output_format", "png"},
+        {"output_format", output_format},
        {"output_compression", 100},
    };
-    result["limits"] = {
-        {"min_width", 64},
-        {"max_width", 4096},
-        {"min_height", 64},
-        {"max_height", 4096},
-        {"max_batch_count", 8},
-        {"max_queue_size", manager.max_pending_jobs},
+}
+
+static json make_vid_gen_defaults_json(const SDGenerationParams& defaults, const std::string& output_format) {
+    return {
+        {"prompt", defaults.prompt},
+        {"negative_prompt", defaults.negative_prompt},
+        {"clip_skip", defaults.clip_skip},
+        {"width", defaults.width > 0 ? defaults.width : 512},
+        {"height", defaults.height > 0 ? defaults.height : 512},
+        {"strength", defaults.strength},
+        {"seed", defaults.seed},
+        {"video_frames", defaults.video_frames},
+        {"fps", defaults.fps},
+        {"moe_boundary", defaults.moe_boundary},
+        {"vace_strength", defaults.vace_strength},
+        {"sample_params", make_sample_params_json(defaults.sample_params, defaults.skip_layers)},
+        {"high_noise_sample_params", make_sample_params_json(defaults.high_noise_sample_params, defaults.high_noise_skip_layers)},
+        {"vae_tiling_params", make_vae_tiling_json(defaults.vae_tiling_params)},
+        {"cache_mode", defaults.cache_mode},
+        {"cache_option", defaults.cache_option},
+        {"scm_mask", defaults.scm_mask},
+        {"scm_policy_dynamic", defaults.scm_policy_dynamic},
+        {"output_format", output_format},
+        {"output_compression", 100},
    };
-    result["samplers"]       = samplers;
-    result["schedulers"]     = schedulers;
-    result["output_formats"] = output_formats;
-    result["features"]       = {
+}
+
+static json make_img_gen_features_json() {
+    return {
        {"init_image", true},
        {"mask_image", true},
        {"control_image", true},
@ -180,6 +161,128 @@ static json make_capabilities_json(ServerRuntime& runtime) {
        {"cancel_queued", true},
        {"cancel_generating", false},
    };
+}
+
+// Feature flags reported for the vid_gen mode in the capabilities response.
+static json make_vid_gen_features_json() {
+    json features = json::object();
+
+    // Request inputs accepted by vid_gen.
+    features["init_image"]               = true;
+    features["end_image"]                = true;
+    features["control_frames"]           = true;
+    features["high_noise_sample_params"] = true;
+    features["lora"]                     = true;
+    features["vae_tiling"]               = true;
+    features["cache"]                    = true;
+
+    // Job-control flags.
+    features["cancel_queued"]     = true;
+    features["cancel_generating"] = false;
+    return features;
+}
+
+static json make_capabilities_json(ServerRuntime& runtime) {  // Builds the capabilities JSON: model, modes, defaults, limits, samplers, schedulers, output formats, features, loras.
+    refresh_lora_cache(runtime);
+
+    AsyncJobManager& manager  = *runtime.async_job_manager;
+    const auto& defaults      = *runtime.default_gen_params;
+    const fs::path model_path = resolve_display_model_path(runtime);
+    const bool supports_img   = runtime_supports_generation_mode(runtime, IMG_GEN);
+    const bool supports_vid   = runtime_supports_generation_mode(runtime, VID_GEN);
+    json samplers             = json::array();
+    json schedulers           = json::array();
+    json image_output_formats = supported_img_output_formats();
+    json video_output_formats = supported_vid_output_formats();
+    json available_loras      = json::array();
+    json supported_modes      = json::array();
+
+    for (int i = 0; i < SAMPLE_METHOD_COUNT; ++i) {
+        samplers.push_back(sd_sample_method_name((sample_method_t)i));
+    }
+
+    for (int i = 0; i < SCHEDULER_COUNT; ++i) {
+        schedulers.push_back(sd_scheduler_name((scheduler_t)i));
+    }
+
+    {  // lora_cache is read while holding lora_mutex
+        std::lock_guard<std::mutex> lock(*runtime.lora_mutex);
+        for (const auto& entry : *runtime.lora_cache) {
+            available_loras.push_back({
+                {"name", entry.name},
+                {"path", entry.path},
+            });
+        }
+    }
+
+    if (supports_img) {
+        supported_modes.push_back("img_gen");
+    }
+    if (supports_vid) {
+        supported_modes.push_back("vid_gen");
+    }
+
+    std::string default_img_output_format = "png";  // fallback when the image format list is empty
+    std::string default_vid_output_format = "avi";  // fallback when the video format list is empty
+    if (!image_output_formats.empty()) {
+        default_img_output_format = image_output_formats[0].get<std::string>();  // first listed format becomes the default
+    }
+    if (!video_output_formats.empty()) {
+        default_vid_output_format = video_output_formats[0].get<std::string>();  // first listed format becomes the default
+    }
+
+    json defaults_by_mode       = json::object();  // per-mode sections are filled only for supported modes
+    json output_formats_by_mode = json::object();
+    json features_by_mode       = json::object();
+    if (supports_img) {
+        defaults_by_mode["img_gen"]       = make_img_gen_defaults_json(defaults, default_img_output_format);
+        output_formats_by_mode["img_gen"] = image_output_formats;
+        features_by_mode["img_gen"]       = make_img_gen_features_json();
+    }
+    if (supports_vid) {
+        defaults_by_mode["vid_gen"]       = make_vid_gen_defaults_json(defaults, default_vid_output_format);
+        output_formats_by_mode["vid_gen"] = video_output_formats;
+        features_by_mode["vid_gen"]       = make_vid_gen_features_json();
+    }
+
+    json top_level_defaults       = json::object();  // top-level fields mirror one mode: img_gen if supported, else vid_gen
+    json top_level_output_formats = json::array();
+    json top_level_features       = {  // kept as-is only when neither mode is supported
+              {"cancel_queued", true},
+              {"cancel_generating", false},
+    };
+    std::string current_mode = "";  // stays empty when neither mode is supported
+    if (supports_img) {
+        current_mode             = "img_gen";
+        top_level_defaults       = defaults_by_mode["img_gen"];
+        top_level_output_formats = output_formats_by_mode["img_gen"];
+        top_level_features       = features_by_mode["img_gen"];
+    } else if (supports_vid) {
+        current_mode             = "vid_gen";
+        top_level_defaults       = defaults_by_mode["vid_gen"];
+        top_level_output_formats = output_formats_by_mode["vid_gen"];
+        top_level_features       = features_by_mode["vid_gen"];
+    }
+
+    json result;
+    result["model"] = {
+        {"name", model_path.filename().u8string()},
+        {"stem", model_path.stem().u8string()},
+        {"path", model_path.u8string()},
+    };
+    result["current_mode"]     = current_mode;
+    result["supported_modes"]  = supported_modes;
+    result["defaults"]         = top_level_defaults;
+    result["defaults_by_mode"] = defaults_by_mode;
+    result["limits"]           = {  // same limits object regardless of mode
+                  {"min_width", 64},
+                  {"max_width", 4096},
+                  {"min_height", 64},
+                  {"max_height", 4096},
+                  {"max_batch_count", 8},
+                  {"max_queue_size", manager.max_pending_jobs},
+    };
+    result["samplers"]               = samplers;
+    result["schedulers"]             = schedulers;
+    result["output_formats"]         = top_level_output_formats;
+    result["output_formats_by_mode"] = output_formats_by_mode;
+    result["features"]               = top_level_features;
+    result["features_by_mode"]       = features_by_mode;
     result["loras"]                  = available_loras;
     return result;
  }
@ -211,6 +314,33 @@ static bool parse_img_gen_request(const json& body,
    return true;
 }

+// Builds a VidGenJobRequest from the JSON body of a vid_gen API call.
+//
+// Starts from the runtime's default generation parameters, overlays the fields
+// found in `body`, applies the output container options and finally validates
+// the parameters for VID_GEN.
+//
+// Returns true on success. On failure returns false and stores a reason in
+// `error_message`, which the HTTP handler sends back with a 400 status.
+static bool parse_vid_gen_request(const json& body,
+                                  ServerRuntime& runtime,
+                                  VidGenJobRequest& request,
+                                  std::string& error_message) {
+    request.gen_params = *runtime.default_gen_params;
+
+    refresh_lora_cache(runtime);
+    if (!request.gen_params.from_json_str(body.dump(), [&](const std::string& path) {
+            return get_lora_full_path(runtime, path);
+        })) {
+        error_message = "invalid generation parameters";
+        return false;
+    }
+
+    // Default to the first container this build can encode. A fixed "webm"
+    // default is rejected by assign_output_options() when the server is built
+    // without SD_USE_WEBM, which made requests that omit output_format fail.
+    // The list is never empty because "avi" is always included.
+    std::string output_format = supported_vid_output_formats().front();
+    int output_compression    = 100;
+    try {
+        output_format      = body.value("output_format", output_format);
+        output_compression = body.value("output_compression", output_compression);
+    } catch (const json::type_error&) {
+        // A field of the wrong JSON type is a client error; report it here so
+        // it does not escape to the handler's generic 500 path.
+        error_message = "invalid output_format or output_compression";
+        return false;
+    }
+    if (!assign_output_options(request, output_format, output_compression, error_message)) {
+        return false;
+    }
+    // Intentionally disable prompt-embedded LoRA tag parsing for server APIs.
+    if (!request.gen_params.resolve_and_validate(VID_GEN, "", true)) {
+        error_message = "invalid generation parameters";
+        return false;
+    }
+    return true;
+}
+
 void register_sdcpp_api_endpoints(httplib::Server& svr, ServerRuntime& rt) {
    ServerRuntime* runtime = &rt;

@ -226,6 +356,11 @@ void register_sdcpp_api_endpoints(httplib::Server& svr, ServerRuntime& rt) {
                res.set_content(R"({"error":"empty body"})", "application/json");
                return;
            }
+            if (!runtime_supports_generation_mode(*runtime, IMG_GEN)) {
+                res.status = 400;
+                res.set_content(json({{"error", unsupported_generation_mode_error(IMG_GEN)}}).dump(), "application/json");
+                return;
+            }

            json body = json::parse(req.body);
            ImgGenJobRequest request;
@ -276,9 +411,66 @@ void register_sdcpp_api_endpoints(httplib::Server& svr, ServerRuntime& rt) {
        }
    });

-    svr.Post("/sdcpp/v1/vid_gen", [](const httplib::Request&, httplib::Response& res) {
-        res.status = 501;
-        res.set_content(R"({"error":"vid_gen is reserved and not implemented yet"})", "application/json");
+    // POST /sdcpp/v1/vid_gen: validates the request, enqueues an asynchronous
+    // video generation job and answers 202 with the job id and a poll URL.
+    // Error responses: 400 (empty body, unsupported mode, invalid JSON or
+    // parameters), 429 (queue full), 500 (any other std::exception).
+    svr.Post("/sdcpp/v1/vid_gen", [runtime](const httplib::Request& req, httplib::Response& res) {
+        try {
+            if (req.body.empty()) {
+                res.status = 400;
+                res.set_content(R"({"error":"empty body"})", "application/json");
+                return;
+            }
+            // Reject early when the loaded model cannot produce video.
+            if (!runtime_supports_generation_mode(*runtime, VID_GEN)) {
+                res.status = 400;
+                res.set_content(json({{"error", unsupported_generation_mode_error(VID_GEN)}}).dump(), "application/json");
+                return;
+            }
+
+            // Malformed JSON throws json::parse_error, handled by the first catch below.
+            json body = json::parse(req.body);
+            VidGenJobRequest request;
+            std::string error_message;
+            if (!parse_vid_gen_request(body, *runtime, request, error_message)) {
+                res.status = 400;
+                res.set_content(json({{"error", error_message}}).dump(), "application/json");
+                return;
+            }
+
+            // Build the job record before taking the lock; it only becomes
+            // visible to other threads once it is inserted below.
+            AsyncJobManager& manager                = *runtime->async_job_manager;
+            std::shared_ptr<AsyncGenerationJob> job = std::make_shared<AsyncGenerationJob>();
+            job->kind                               = AsyncJobKind::VidGen;
+            job->status                             = AsyncJobStatus::Queued;
+            job->created_at                         = unix_timestamp_now();
+            job->vid_gen                            = std::move(request);
+
+            {
+                // Purge, capacity check, id assignment and queue insertion all
+                // happen under manager.mutex.
+                std::lock_guard<std::mutex> lock(manager.mutex);
+                purge_expired_jobs(manager);
+                if (count_pending_jobs(manager) >= manager.max_pending_jobs) {
+                    res.status = 429;
+                    res.set_content(R"({"error":"job queue is full"})", "application/json");
+                    return;
+                }
+                job->id               = make_async_job_id(manager);
+                manager.jobs[job->id] = job;
+                manager.queue.push_back(job->id);
+            }
+
+            // Signal one waiter on manager.cv after the lock has been released
+            // (presumably the job worker thread; confirm against the consumer).
+            manager.cv.notify_one();
+
+            // NOTE(review): job->status is read here without holding
+            // manager.mutex. If a consumer can already have picked the job up,
+            // the reported status may no longer be "queued"; confirm.
+            json out;
+            out["id"]       = job->id;
+            out["kind"]     = async_job_kind_name(job->kind);
+            out["status"]   = async_job_status_name(job->status);
+            out["created"]  = job->created_at;
+            out["poll_url"] = "/sdcpp/v1/jobs/" + job->id;
+
+            res.status = 202;
+            res.set_content(out.dump(), "application/json");
+        } catch (const json::parse_error& e) {
+            res.status = 400;
+            res.set_content(json({{"error", "invalid json"}, {"message", e.what()}}).dump(), "application/json");
+        } catch (const std::exception& e) {
+            res.status = 500;
+            res.set_content(json({{"error", "server_error"}, {"message", e.what()}}).dump(), "application/json");
+        }
+    });

    svr.Get(R"(/sdcpp/v1/jobs/([A-Za-z0-9_\-]+))", [runtime](const httplib::Request& req, httplib::Response& res) {
--- a/examples/server/runtime.cpp
+++ b/examples/server/runtime.cpp
@ -45,6 +45,44 @@ std::string normalize_output_format(std::string output_format) {
    return output_format;
 }

+// Lists the image output formats the server accepts: always "png" and "jpeg",
+// followed by "webp" when the build defines SD_USE_WEBP and `allow_webp` is
+// true.
+std::vector<std::string> supported_img_output_formats(bool allow_webp) {
+#ifdef SD_USE_WEBP
+    const bool include_webp = allow_webp;
+#else
+    // Without WebP support the flag has no effect; reference it so the
+    // parameter does not trigger an unused-parameter warning.
+    (void)allow_webp;
+    const bool include_webp = false;
+#endif
+
+    std::vector<std::string> names;
+    names.emplace_back("png");
+    names.emplace_back("jpeg");
+    if (include_webp) {
+        names.emplace_back("webp");
+    }
+    return names;
+}
+
+// Lists the video output containers this build can produce, in order: "webm"
+// (only with SD_USE_WEBM), "webp" (only with SD_USE_WEBP), then "avi", which
+// is always present.
+std::vector<std::string> supported_vid_output_formats() {
+    std::vector<std::string> names;
+#ifdef SD_USE_WEBM
+    names.emplace_back("webm");
+#endif
+#ifdef SD_USE_WEBP
+    names.emplace_back("webp");
+#endif
+    names.emplace_back("avi");
+    return names;
+}
+
+// Formats the error text returned for an unsupported video output format,
+// listing every entry of supported_vid_output_formats() separated by ", ".
+static std::string valid_vid_output_formats_message() {
+    std::string text = "invalid output_format, must be one of [";
+
+    bool first = true;
+    for (const std::string& name : supported_vid_output_formats()) {
+        if (!first) {
+            text += ", ";
+        }
+        text += name;
+        first = false;
+    }
+
+    text += "]";
+    return text;
+}
+
 bool assign_output_options(ImgGenJobRequest& request,
                           std::string output_format,
                           int output_compression,
@ -53,19 +91,88 @@ bool assign_output_options(ImgGenJobRequest& request,
    request.output_format      = normalize_output_format(std::move(output_format));
    request.output_compression = std::clamp(output_compression, 0, 100);

-    const bool valid_format = request.output_format == "png" ||
-                              request.output_format == "jpeg" ||
-                              (allow_webp && request.output_format == "webp");
+    const std::vector<std::string> valid_formats = supported_img_output_formats(allow_webp);
+    const bool valid_format                      = std::find(valid_formats.begin(),
+                                                             valid_formats.end(),
+                                                             request.output_format) != valid_formats.end();
    if (!valid_format) {
-        error_message = allow_webp
-                            ? "invalid output_format, must be one of [png, jpeg, webp]"
-                            : "invalid output_format, must be one of [png, jpeg]";
+        error_message = "invalid output_format, must be one of [";
+        for (size_t i = 0; i < valid_formats.size(); ++i) {
+            if (i > 0) {
+                error_message += ", ";
+            }
+            error_message += valid_formats[i];
+        }
+        error_message += "]";
        return false;
    }

    return true;
 }

+// Validates and stores the output options of a video generation request.
+//
+// `output_format` is passed through normalize_output_format() and
+// `output_compression` is clamped to [0, 100]; both are written to `request`
+// before validation, so they are set even when the function fails.
+//
+// Returns true when the normalized format is listed by
+// supported_vid_output_formats(). Otherwise stores the list of valid formats in
+// `error_message` and returns false.
+bool assign_output_options(VidGenJobRequest& request,
+                           std::string output_format,
+                           int output_compression,
+                           std::string& error_message) {
+    request.output_format      = normalize_output_format(std::move(output_format));
+    request.output_compression = std::clamp(output_compression, 0, 100);
+
+    // Check against the same list that is advertised to clients, so the
+    // accepted formats cannot drift from the advertised ones. This mirrors the
+    // image overload of assign_output_options().
+    const std::vector<std::string> valid_formats = supported_vid_output_formats();
+    const bool valid_format                      = std::find(valid_formats.begin(),
+                                                             valid_formats.end(),
+                                                             request.output_format) != valid_formats.end();
+    if (!valid_format) {
+        error_message = valid_vid_output_formats_message();
+        return false;
+    }
+
+    return true;
+}
+
+// Maps a video output format name to its MIME type: "webm" -> "video/webm",
+// "webp" -> "image/webp"; any other value yields "video/x-msvideo".
+std::string video_mime_type(const std::string& output_format) {
+    const bool is_webm = output_format == "webm";
+    const bool is_webp = output_format == "webp";
+
+    if (!is_webm && !is_webp) {
+        return "video/x-msvideo";
+    }
+    return is_webm ? "video/webm" : "image/webp";
+}
+
+// Reports whether the model loaded in `runtime` can serve `mode`. VID_GEN and
+// IMG_GEN are answered by the corresponding sd_ctx_supports_* query; every
+// other mode is reported as supported.
+bool runtime_supports_generation_mode(const ServerRuntime& runtime, SDMode mode) {
+    switch (mode) {
+        case VID_GEN:
+            return sd_ctx_supports_video_generation(runtime.sd_ctx);
+        case IMG_GEN:
+            return sd_ctx_supports_image_generation(runtime.sd_ctx);
+        default:
+            return true;
+    }
+}
+
+// Returns the error text used when the loaded model cannot serve `mode`.
+// VID_GEN and IMG_GEN get a mode-specific message; any other mode gets a
+// generic one.
+std::string unsupported_generation_mode_error(SDMode mode) {
+    switch (mode) {
+        case VID_GEN:
+            return "loaded model does not support vid_gen";
+        case IMG_GEN:
+            return "loaded model does not support img_gen";
+        default:
+            return "loaded model does not support requested mode";
+    }
+}
+
 ArgOptions SDSvrParams::get_options() {
    ArgOptions options;

--- a/examples/server/runtime.h
+++ b/examples/server/runtime.h
@ -58,13 +58,32 @@ struct ImgGenJobRequest {
    }
 };

+// Parameters of one queued video generation job.
+struct VidGenJobRequest {
+    // Generation parameters of the request.
+    SDGenerationParams gen_params;
+    // Output container name; "webm" unless overwritten.
+    std::string output_format{"webm"};
+    // Output compression setting; 100 unless overwritten.
+    int output_compression{100};
+
+    // Forwards to gen_params to obtain the C API parameter struct.
+    sd_vid_gen_params_t to_sd_vid_gen_params_t() {
+        return gen_params.to_sd_vid_gen_params_t();
+    }
+};
+
 std::string base64_encode(const std::vector<uint8_t>& bytes);
 std::string normalize_output_format(std::string output_format);
+std::vector<std::string> supported_img_output_formats(bool allow_webp = true);
+std::vector<std::string> supported_vid_output_formats();
 bool assign_output_options(ImgGenJobRequest& request,
                           std::string output_format,
                           int output_compression,
                           bool allow_webp,
                           std::string& error_message);
+bool assign_output_options(VidGenJobRequest& request,
+                           std::string output_format,
+                           int output_compression,
+                           std::string& error_message);
+std::string video_mime_type(const std::string& output_format);
+bool runtime_supports_generation_mode(const ServerRuntime& runtime, SDMode mode);
+std::string unsupported_generation_mode_error(SDMode mode);
 void refresh_lora_cache(ServerRuntime& rt);
 std::string get_lora_full_path(ServerRuntime& rt, const std::string& path);
 int64_t unix_timestamp_now();
--- a/include/stable-diffusion.h
+++ b/include/stable-diffusion.h
@ -348,6 +348,8 @@ SD_API void sd_set_progress_callback(sd_progress_cb_t cb, void* data);
 SD_API void sd_set_preview_callback(sd_preview_cb_t cb, enum preview_t mode, int interval, bool denoised, bool noisy, void* data);
 SD_API int32_t sd_get_num_physical_cores();
 SD_API const char* sd_get_system_info();
+SD_API bool sd_ctx_supports_image_generation(const sd_ctx_t* sd_ctx);
+SD_API bool sd_ctx_supports_video_generation(const sd_ctx_t* sd_ctx);

 SD_API const char* sd_type_name(enum sd_type_t type);
 SD_API enum sd_type_t str_to_sd_type(const char* str);
--- a/src/stable-diffusion.cpp
+++ b/src/stable-diffusion.cpp
@ -2390,6 +2390,14 @@ struct sd_ctx_t {
    StableDiffusionGGML* sd = nullptr;
 };

+// True when `version` is VERSION_SVD or is accepted by sd_version_is_wan().
+static bool sd_version_supports_video_generation(SDVersion version) {
+    if (version == VERSION_SVD) {
+        return true;
+    }
+    return sd_version_is_wan(version);
+}
+
+// Image generation is reported for exactly those versions that
+// sd_version_supports_video_generation() rejects.
+static bool sd_version_supports_image_generation(SDVersion version) {
+    const bool is_video_version = sd_version_supports_video_generation(version);
+    return !is_video_version;
+}
+
 sd_ctx_t* new_sd_ctx(const sd_ctx_params_t* sd_ctx_params) {
    sd_ctx_t* sd_ctx = (sd_ctx_t*)malloc(sizeof(sd_ctx_t));
    if (sd_ctx == nullptr) {
@ -2419,6 +2427,20 @@ void free_sd_ctx(sd_ctx_t* sd_ctx) {
    free(sd_ctx);
 }

+// Reports whether the model held by `sd_ctx` supports image generation.
+// Returns false when `sd_ctx` or its `sd` member is null.
+SD_API bool sd_ctx_supports_image_generation(const sd_ctx_t* sd_ctx) {
+    const bool has_model = sd_ctx != nullptr && sd_ctx->sd != nullptr;
+    // Short-circuit keeps the dereference behind the null checks.
+    return has_model && sd_version_supports_image_generation(sd_ctx->sd->version);
+}
+
+// Reports whether the model held by `sd_ctx` supports video generation.
+// Returns false when `sd_ctx` or its `sd` member is null.
+SD_API bool sd_ctx_supports_video_generation(const sd_ctx_t* sd_ctx) {
+    const bool has_model = sd_ctx != nullptr && sd_ctx->sd != nullptr;
+    // Short-circuit keeps the dereference behind the null checks.
+    return has_model && sd_version_supports_video_generation(sd_ctx->sd->version);
+}
+
 enum sample_method_t sd_get_default_sample_method(const sd_ctx_t* sd_ctx) {
    if (sd_ctx != nullptr && sd_ctx->sd != nullptr) {
        if (sd_version_is_dit(sd_ctx->sd->version)) {