feat(server): implement vid_gen async API and mode-aware capabilities (#1437)

This commit is contained in:
leejet 2026-04-18 15:06:36 +08:00 committed by GitHub
parent f3f69e2fbe
commit 4d626d24b2
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
14 changed files with 1345 additions and 339 deletions

View File

@ -1589,10 +1589,18 @@ bool SDGenerationParams::from_json_str(
LOG_ERROR("invalid init_image"); LOG_ERROR("invalid init_image");
return false; return false;
} }
if (!parse_image_json_field(j, "end_image", 3, width, height, end_image)) {
LOG_ERROR("invalid end_image");
return false;
}
if (!parse_image_array_json_field(j, "ref_images", 3, width, height, ref_images)) { if (!parse_image_array_json_field(j, "ref_images", 3, width, height, ref_images)) {
LOG_ERROR("invalid ref_images"); LOG_ERROR("invalid ref_images");
return false; return false;
} }
if (!parse_image_array_json_field(j, "control_frames", 3, width, height, control_frames)) {
LOG_ERROR("invalid control_frames");
return false;
}
if (!parse_image_json_field(j, "mask_image", 1, width, height, mask_image)) { if (!parse_image_json_field(j, "mask_image", 1, width, height, mask_image)) {
LOG_ERROR("invalid mask_image"); LOG_ERROR("invalid mask_image");
return false; return false;

View File

@ -95,6 +95,57 @@ using WebPMuxPtr = std::unique_ptr<WebPMux, WebPMuxDeleter>;
using WebPAnimEncoderPtr = std::unique_ptr<WebPAnimEncoder, WebPAnimEncoderDeleter>; using WebPAnimEncoderPtr = std::unique_ptr<WebPAnimEncoder, WebPAnimEncoderDeleter>;
#endif #endif
#ifdef SD_USE_WEBM
// In-memory IMkvWriter: collects muxer output into a growable byte buffer
// instead of a file, so the finished WebM can be returned as a vector.
class MemoryMkvWriter : public mkvmuxer::IMkvWriter {
public:
    // Writes `len` bytes from `buf` at the current cursor, growing the
    // buffer when writing past the end. Returns 0 on success, -1 when
    // asked to write from a null source.
    mkvmuxer::int32 Write(const void* buf, mkvmuxer::uint32 len) override {
        if (len > 0 && buf == nullptr) {
            return -1;
        }
        const size_t new_end = position_ + static_cast<size_t>(len);
        if (data_.size() < new_end) {
            data_.resize(new_end);
        }
        if (len > 0) {
            memcpy(data_.data() + position_, buf, len);
        }
        position_ = new_end;
        return 0;
    }

    // Current cursor position in the buffer.
    mkvmuxer::int64 Position() const override {
        return static_cast<mkvmuxer::int64>(position_);
    }

    // Seeks to `position`, zero-extending the buffer when seeking past the
    // end (mirrors sparse-file semantics). Returns -1 on negative offsets.
    mkvmuxer::int32 Position(mkvmuxer::int64 position) override {
        if (position < 0) {
            return -1;
        }
        const size_t target = static_cast<size_t>(position);
        if (data_.size() < target) {
            data_.resize(target);
        }
        position_ = target;
        return 0;
    }

    // The muxer may seek backwards to patch sizes, so report seekable.
    bool Seekable() const override {
        return true;
    }

    // No bookkeeping needed for element boundaries in a memory buffer.
    void ElementStartNotify(mkvmuxer::uint64, mkvmuxer::int64) override {
    }

    // Read-only view of the assembled bytes.
    const std::vector<uint8_t>& data() const {
        return data_;
    }

private:
    std::vector<uint8_t> data_;  // accumulated output
    size_t position_ = 0;        // write cursor into data_
};
#endif
bool read_binary_file_bytes(const char* path, std::vector<uint8_t>& data) { bool read_binary_file_bytes(const char* path, std::vector<uint8_t>& data) {
std::ifstream fin(fs::path(path), std::ios::binary); std::ifstream fin(fs::path(path), std::ios::binary);
if (!fin) { if (!fin) {
@ -570,6 +621,32 @@ void write_u16_le(FILE* f, uint16_t val) {
fwrite(&val, 2, 1, f); fwrite(&val, 2, 1, f);
} }
// Appends `val` to `data` as 4 bytes in little-endian order (LSB first).
void write_u32_le(std::vector<uint8_t>& data, uint32_t val) {
    for (int shift = 0; shift < 32; shift += 8) {
        data.push_back(static_cast<uint8_t>((val >> shift) & 0xFF));
    }
}
// Appends `val` to `data` as 2 bytes in little-endian order (LSB first).
void write_u16_le(std::vector<uint8_t>& data, uint16_t val) {
    const uint8_t bytes[2] = {
        static_cast<uint8_t>(val & 0xFF),
        static_cast<uint8_t>((val >> 8) & 0xFF),
    };
    data.insert(data.end(), bytes, bytes + 2);
}
// Overwrites 4 bytes at `offset` with `val` in little-endian order.
// Out-of-range patches are silently ignored so callers can patch size
// placeholders without extra bounds checks.
void patch_u32_le(std::vector<uint8_t>& data, size_t offset, uint32_t val) {
    if (data.size() < offset + 4) {
        return;
    }
    for (size_t i = 0; i < 4; ++i) {
        data[offset + i] = static_cast<uint8_t>((val >> (8 * i)) & 0xFF);
    }
}
// Appends a 4-character RIFF chunk tag (e.g. "RIFF", "avih") to `data`.
// `fourcc` must point to at least 4 characters.
void write_fourcc(std::vector<uint8_t>& data, const char* fourcc) {
    for (int i = 0; i < 4; ++i) {
        data.push_back(static_cast<uint8_t>(fourcc[i]));
    }
}
EncodedImageFormat encoded_image_format_from_path(const std::string& path) { EncodedImageFormat encoded_image_format_from_path(const std::string& path) {
std::string ext = fs::path(path).extension().string(); std::string ext = fs::path(path).extension().string();
std::transform(ext.begin(), ext.end(), ext.begin(), ::tolower); std::transform(ext.begin(), ext.end(), ext.begin(), ::tolower);
@ -699,95 +776,96 @@ uint8_t* load_image_from_memory(const char* image_bytes,
return load_image_common(true, image_bytes, len, width, height, expected_width, expected_height, expected_channel); return load_image_common(true, image_bytes, len, width, height, expected_width, expected_height, expected_channel);
} }
int create_mjpg_avi_from_sd_images(const char* filename, sd_image_t* images, int num_images, int fps, int quality) { std::vector<uint8_t> create_mjpg_avi_from_sd_images_to_vector(sd_image_t* images, int num_images, int fps, int quality) {
if (num_images == 0) { if (num_images == 0) {
fprintf(stderr, "Error: Image array is empty.\n"); fprintf(stderr, "Error: Image array is empty.\n");
return -1; return {};
} }
FilePtr file(fopen(filename, "wb"));
if (!file) {
perror("Error opening file for writing");
return -1;
}
FILE* f = file.get();
uint32_t width = images[0].width; uint32_t width = images[0].width;
uint32_t height = images[0].height; uint32_t height = images[0].height;
uint32_t channels = images[0].channel; uint32_t channels = images[0].channel;
if (channels != 3 && channels != 4) { if (channels != 3 && channels != 4) {
fprintf(stderr, "Error: Unsupported channel count: %u\n", channels); fprintf(stderr, "Error: Unsupported channel count: %u\n", channels);
return -1; return {};
} }
fwrite("RIFF", 4, 1, f); // stb_image_write changes JPEG sampling behavior above quality 90.
long riff_size_pos = ftell(f); // MJPG AVI playback is more compatible when we keep the encoder on the
write_u32_le(f, 0); // <= 90 path.
fwrite("AVI ", 4, 1, f); const int mjpg_quality = std::clamp(quality, 1, 90);
fwrite("LIST", 4, 1, f); std::vector<uint8_t> avi_data;
write_u32_le(f, 4 + 8 + 56 + 8 + 4 + 8 + 56 + 8 + 40); avi_data.reserve(static_cast<size_t>(num_images) * 1024);
fwrite("hdrl", 4, 1, f);
fwrite("avih", 4, 1, f); write_fourcc(avi_data, "RIFF");
write_u32_le(f, 56); const size_t riff_size_pos = avi_data.size();
write_u32_le(f, 1000000 / fps); write_u32_le(avi_data, 0);
write_u32_le(f, 0); write_fourcc(avi_data, "AVI ");
write_u32_le(f, 0);
write_u32_le(f, 0x110);
write_u32_le(f, num_images);
write_u32_le(f, 0);
write_u32_le(f, 1);
write_u32_le(f, width * height * 3);
write_u32_le(f, width);
write_u32_le(f, height);
write_u32_le(f, 0);
write_u32_le(f, 0);
write_u32_le(f, 0);
write_u32_le(f, 0);
fwrite("LIST", 4, 1, f); write_fourcc(avi_data, "LIST");
write_u32_le(f, 4 + 8 + 56 + 8 + 40); write_u32_le(avi_data, 4 + 8 + 56 + 8 + 4 + 8 + 56 + 8 + 40);
fwrite("strl", 4, 1, f); write_fourcc(avi_data, "hdrl");
fwrite("strh", 4, 1, f); write_fourcc(avi_data, "avih");
write_u32_le(f, 56); write_u32_le(avi_data, 56);
fwrite("vids", 4, 1, f); write_u32_le(avi_data, 1000000 / fps);
fwrite("MJPG", 4, 1, f); write_u32_le(avi_data, 0);
write_u32_le(f, 0); write_u32_le(avi_data, 0);
write_u16_le(f, 0); write_u32_le(avi_data, 0x110);
write_u16_le(f, 0); write_u32_le(avi_data, num_images);
write_u32_le(f, 0); write_u32_le(avi_data, 0);
write_u32_le(f, 1); write_u32_le(avi_data, 1);
write_u32_le(f, fps); write_u32_le(avi_data, width * height * 3);
write_u32_le(f, 0); write_u32_le(avi_data, width);
write_u32_le(f, num_images); write_u32_le(avi_data, height);
write_u32_le(f, width * height * 3); write_u32_le(avi_data, 0);
write_u32_le(f, (uint32_t)-1); write_u32_le(avi_data, 0);
write_u32_le(f, 0); write_u32_le(avi_data, 0);
write_u16_le(f, 0); write_u32_le(avi_data, 0);
write_u16_le(f, 0);
write_u16_le(f, 0);
write_u16_le(f, 0);
fwrite("strf", 4, 1, f); write_fourcc(avi_data, "LIST");
write_u32_le(f, 40); write_u32_le(avi_data, 4 + 8 + 56 + 8 + 40);
write_u32_le(f, 40); write_fourcc(avi_data, "strl");
write_u32_le(f, width);
write_u32_le(f, height);
write_u16_le(f, 1);
write_u16_le(f, 24);
fwrite("MJPG", 4, 1, f);
write_u32_le(f, width * height * 3);
write_u32_le(f, 0);
write_u32_le(f, 0);
write_u32_le(f, 0);
write_u32_le(f, 0);
fwrite("LIST", 4, 1, f); write_fourcc(avi_data, "strh");
long movi_size_pos = ftell(f); write_u32_le(avi_data, 56);
write_u32_le(f, 0); write_fourcc(avi_data, "vids");
fwrite("movi", 4, 1, f); write_fourcc(avi_data, "MJPG");
write_u32_le(avi_data, 0);
write_u16_le(avi_data, 0);
write_u16_le(avi_data, 0);
write_u32_le(avi_data, 0);
write_u32_le(avi_data, 1);
write_u32_le(avi_data, fps);
write_u32_le(avi_data, 0);
write_u32_le(avi_data, num_images);
write_u32_le(avi_data, width * height * 3);
write_u32_le(avi_data, static_cast<uint32_t>(-1));
write_u32_le(avi_data, 0);
write_u16_le(avi_data, 0);
write_u16_le(avi_data, 0);
write_u16_le(avi_data, 0);
write_u16_le(avi_data, 0);
write_fourcc(avi_data, "strf");
write_u32_le(avi_data, 40);
write_u32_le(avi_data, 40);
write_u32_le(avi_data, width);
write_u32_le(avi_data, height);
write_u16_le(avi_data, 1);
write_u16_le(avi_data, 24);
write_fourcc(avi_data, "MJPG");
write_u32_le(avi_data, width * height * 3);
write_u32_le(avi_data, 0);
write_u32_le(avi_data, 0);
write_u32_le(avi_data, 0);
write_u32_le(avi_data, 0);
write_fourcc(avi_data, "LIST");
const size_t movi_size_pos = avi_data.size();
write_u32_le(avi_data, 0);
write_fourcc(avi_data, "movi");
std::vector<avi_index_entry> index(static_cast<size_t>(num_images)); std::vector<avi_index_entry> index(static_cast<size_t>(num_images));
std::vector<uint8_t> jpeg_data; std::vector<uint8_t> jpeg_data;
@ -801,55 +879,61 @@ int create_mjpg_avi_from_sd_images(const char* filename, sd_image_t* images, int
buffer->insert(buffer->end(), src, src + size); buffer->insert(buffer->end(), src, src + size);
}; };
if (!stbi_write_jpg_to_func(write_to_buf, &jpeg_data, images[i].width, images[i].height, channels, images[i].data, quality)) { if (!stbi_write_jpg_to_func(write_to_buf, &jpeg_data, images[i].width, images[i].height, channels, images[i].data, mjpg_quality)) {
fprintf(stderr, "Error: Failed to encode JPEG frame.\n"); fprintf(stderr, "Error: Failed to encode JPEG frame.\n");
return -1; return {};
} }
fwrite("00dc", 4, 1, f); index[i].offset = static_cast<uint32_t>(avi_data.size());
write_u32_le(f, (uint32_t)jpeg_data.size()); write_fourcc(avi_data, "00dc");
index[i].offset = ftell(f) - 8; write_u32_le(avi_data, static_cast<uint32_t>(jpeg_data.size()));
index[i].size = (uint32_t)jpeg_data.size(); index[i].size = (uint32_t)jpeg_data.size();
fwrite(jpeg_data.data(), 1, jpeg_data.size(), f); avi_data.insert(avi_data.end(), jpeg_data.begin(), jpeg_data.end());
if (jpeg_data.size() % 2) { if (jpeg_data.size() % 2) {
fputc(0, f); avi_data.push_back(0);
} }
} }
long cur_pos = ftell(f); const size_t movi_size = avi_data.size() - movi_size_pos - 4;
long movi_size = cur_pos - movi_size_pos - 4; patch_u32_le(avi_data, movi_size_pos, static_cast<uint32_t>(movi_size));
fseek(f, movi_size_pos, SEEK_SET);
write_u32_le(f, movi_size);
fseek(f, cur_pos, SEEK_SET);
fwrite("idx1", 4, 1, f); write_fourcc(avi_data, "idx1");
write_u32_le(f, num_images * 16); write_u32_le(avi_data, num_images * 16);
for (int i = 0; i < num_images; i++) { for (int i = 0; i < num_images; i++) {
fwrite("00dc", 4, 1, f); write_fourcc(avi_data, "00dc");
write_u32_le(f, 0x10); write_u32_le(avi_data, 0x10);
write_u32_le(f, index[i].offset); write_u32_le(avi_data, index[i].offset);
write_u32_le(f, index[i].size); write_u32_le(avi_data, index[i].size);
} }
cur_pos = ftell(f); const size_t file_size = avi_data.size() - riff_size_pos - 4;
long file_size = cur_pos - riff_size_pos - 4; patch_u32_le(avi_data, riff_size_pos, static_cast<uint32_t>(file_size));
fseek(f, riff_size_pos, SEEK_SET);
write_u32_le(f, file_size);
fseek(f, cur_pos, SEEK_SET);
return avi_data;
}
int create_mjpg_avi_from_sd_images(const char* filename, sd_image_t* images, int num_images, int fps, int quality) {
std::vector<uint8_t> avi_data = create_mjpg_avi_from_sd_images_to_vector(images, num_images, fps, quality);
if (avi_data.empty()) {
return -1;
}
if (!write_binary_file_bytes(filename, avi_data)) {
perror("Error opening file for writing");
return -1;
}
return 0; return 0;
} }
#ifdef SD_USE_WEBP #ifdef SD_USE_WEBP
int create_animated_webp_from_sd_images(const char* filename, sd_image_t* images, int num_images, int fps, int quality) { std::vector<uint8_t> create_animated_webp_from_sd_images_to_vector(sd_image_t* images, int num_images, int fps, int quality) {
if (num_images == 0) { if (num_images == 0) {
fprintf(stderr, "Error: Image array is empty.\n"); fprintf(stderr, "Error: Image array is empty.\n");
return -1; return {};
} }
if (fps <= 0) { if (fps <= 0) {
fprintf(stderr, "Error: FPS must be positive.\n"); fprintf(stderr, "Error: FPS must be positive.\n");
return -1; return {};
} }
const int width = static_cast<int>(images[0].width); const int width = static_cast<int>(images[0].width);
@ -857,14 +941,14 @@ int create_animated_webp_from_sd_images(const char* filename, sd_image_t* images
const int channels = static_cast<int>(images[0].channel); const int channels = static_cast<int>(images[0].channel);
if (channels != 1 && channels != 3 && channels != 4) { if (channels != 1 && channels != 3 && channels != 4) {
fprintf(stderr, "Error: Unsupported channel count: %d\n", channels); fprintf(stderr, "Error: Unsupported channel count: %d\n", channels);
return -1; return {};
} }
WebPAnimEncoderOptions anim_options; WebPAnimEncoderOptions anim_options;
WebPConfig config; WebPConfig config;
if (!WebPAnimEncoderOptionsInit(&anim_options) || !WebPConfigInit(&config)) { if (!WebPAnimEncoderOptionsInit(&anim_options) || !WebPConfigInit(&config)) {
fprintf(stderr, "Error: Failed to initialize WebP animation encoder.\n"); fprintf(stderr, "Error: Failed to initialize WebP animation encoder.\n");
return -1; return {};
} }
config.quality = static_cast<float>(quality); config.quality = static_cast<float>(quality);
@ -875,13 +959,13 @@ int create_animated_webp_from_sd_images(const char* filename, sd_image_t* images
} }
if (!WebPValidateConfig(&config)) { if (!WebPValidateConfig(&config)) {
fprintf(stderr, "Error: Invalid WebP encoder configuration.\n"); fprintf(stderr, "Error: Invalid WebP encoder configuration.\n");
return -1; return {};
} }
WebPAnimEncoderPtr enc(WebPAnimEncoderNew(width, height, &anim_options)); WebPAnimEncoderPtr enc(WebPAnimEncoderNew(width, height, &anim_options));
if (enc == nullptr) { if (enc == nullptr) {
fprintf(stderr, "Error: Could not create WebPAnimEncoder object.\n"); fprintf(stderr, "Error: Could not create WebPAnimEncoder object.\n");
return -1; return {};
} }
const int frame_duration_ms = std::max(1, static_cast<int>(std::lround(1000.0 / static_cast<double>(fps)))); const int frame_duration_ms = std::max(1, static_cast<int>(std::lround(1000.0 / static_cast<double>(fps))));
@ -891,13 +975,13 @@ int create_animated_webp_from_sd_images(const char* filename, sd_image_t* images
const sd_image_t& image = images[i]; const sd_image_t& image = images[i];
if (static_cast<int>(image.width) != width || static_cast<int>(image.height) != height) { if (static_cast<int>(image.width) != width || static_cast<int>(image.height) != height) {
fprintf(stderr, "Error: Frame dimensions do not match.\n"); fprintf(stderr, "Error: Frame dimensions do not match.\n");
return -1; return {};
} }
WebPPictureGuard picture; WebPPictureGuard picture;
if (!picture.initialized) { if (!picture.initialized) {
fprintf(stderr, "Error: Failed to initialize WebPPicture.\n"); fprintf(stderr, "Error: Failed to initialize WebPPicture.\n");
return -1; return {};
} }
picture.picture.use_argb = 1; picture.picture.use_argb = 1;
picture.picture.width = width; picture.picture.width = width;
@ -921,12 +1005,12 @@ int create_animated_webp_from_sd_images(const char* filename, sd_image_t* images
if (!picture_ok) { if (!picture_ok) {
fprintf(stderr, "Error: Failed to import frame into WebPPicture.\n"); fprintf(stderr, "Error: Failed to import frame into WebPPicture.\n");
return -1; return {};
} }
if (!WebPAnimEncoderAdd(enc.get(), &picture.picture, timestamp_ms, &config)) { if (!WebPAnimEncoderAdd(enc.get(), &picture.picture, timestamp_ms, &config)) {
fprintf(stderr, "Error: Failed to add frame to animated WebP: %s\n", WebPAnimEncoderGetError(enc.get())); fprintf(stderr, "Error: Failed to add frame to animated WebP: %s\n", WebPAnimEncoderGetError(enc.get()));
return -1; return {};
} }
timestamp_ms += frame_duration_ms; timestamp_ms += frame_duration_ms;
@ -934,52 +1018,50 @@ int create_animated_webp_from_sd_images(const char* filename, sd_image_t* images
if (!WebPAnimEncoderAdd(enc.get(), nullptr, timestamp_ms, nullptr)) { if (!WebPAnimEncoderAdd(enc.get(), nullptr, timestamp_ms, nullptr)) {
fprintf(stderr, "Error: Failed to finalize animated WebP frames: %s\n", WebPAnimEncoderGetError(enc.get())); fprintf(stderr, "Error: Failed to finalize animated WebP frames: %s\n", WebPAnimEncoderGetError(enc.get()));
return -1; return {};
} }
WebPDataGuard webp_data; WebPDataGuard webp_data;
if (!WebPAnimEncoderAssemble(enc.get(), &webp_data.data)) { if (!WebPAnimEncoderAssemble(enc.get(), &webp_data.data)) {
fprintf(stderr, "Error: Failed to assemble animated WebP: %s\n", WebPAnimEncoderGetError(enc.get())); fprintf(stderr, "Error: Failed to assemble animated WebP: %s\n", WebPAnimEncoderGetError(enc.get()));
return -1; return {};
} }
FilePtr f(fopen(filename, "wb")); return std::vector<uint8_t>(webp_data.data.bytes, webp_data.data.bytes + webp_data.data.size);
if (!f) { }
int create_animated_webp_from_sd_images(const char* filename, sd_image_t* images, int num_images, int fps, int quality) {
std::vector<uint8_t> webp_data = create_animated_webp_from_sd_images_to_vector(images, num_images, fps, quality);
if (webp_data.empty()) {
return -1;
}
if (!write_binary_file_bytes(filename, webp_data)) {
perror("Error opening file for writing"); perror("Error opening file for writing");
return -1; return -1;
} }
if (webp_data.data.size > 0 && fwrite(webp_data.data.bytes, 1, webp_data.data.size, f.get()) != webp_data.data.size) {
fprintf(stderr, "Error: Failed to write animated WebP file.\n");
return -1;
}
return 0; return 0;
} }
#endif #endif
#ifdef SD_USE_WEBM #ifdef SD_USE_WEBM
int create_webm_from_sd_images(const char* filename, sd_image_t* images, int num_images, int fps, int quality) { std::vector<uint8_t> create_webm_from_sd_images_to_vector(sd_image_t* images, int num_images, int fps, int quality) {
if (num_images == 0) { if (num_images == 0) {
fprintf(stderr, "Error: Image array is empty.\n"); fprintf(stderr, "Error: Image array is empty.\n");
return -1; return {};
} }
if (fps <= 0) { if (fps <= 0) {
fprintf(stderr, "Error: FPS must be positive.\n"); fprintf(stderr, "Error: FPS must be positive.\n");
return -1; return {};
} }
const int width = static_cast<int>(images[0].width); const int width = static_cast<int>(images[0].width);
const int height = static_cast<int>(images[0].height); const int height = static_cast<int>(images[0].height);
if (width <= 0 || height <= 0) { if (width <= 0 || height <= 0) {
fprintf(stderr, "Error: Invalid frame dimensions.\n"); fprintf(stderr, "Error: Invalid frame dimensions.\n");
return -1; return {};
} }
mkvmuxer::MkvWriter writer; MemoryMkvWriter writer;
if (!writer.Open(filename)) {
fprintf(stderr, "Error: Could not open WebM file for writing.\n");
return -1;
}
const int ret = [&]() -> int { const int ret = [&]() -> int {
mkvmuxer::Segment segment; mkvmuxer::Segment segment;
@ -1045,30 +1127,63 @@ int create_webm_from_sd_images(const char* filename, sd_image_t* images, int num
} }
return 0; return 0;
}(); }();
writer.Close(); if (ret != 0) {
return ret; return {};
}
return writer.data();
}
int create_webm_from_sd_images(const char* filename, sd_image_t* images, int num_images, int fps, int quality) {
std::vector<uint8_t> webm_data = create_webm_from_sd_images_to_vector(images, num_images, fps, quality);
if (webm_data.empty()) {
return -1;
}
if (!write_binary_file_bytes(filename, webm_data)) {
perror("Error opening file for writing");
return -1;
}
return 0;
} }
#endif #endif
int create_video_from_sd_images(const char* filename, sd_image_t* images, int num_images, int fps, int quality) { std::vector<uint8_t> create_video_from_sd_images_to_vector(const std::string& output_format,
std::string path = filename ? filename : ""; sd_image_t* images,
auto pos = path.find_last_of('.'); int num_images,
std::string ext = pos == std::string::npos ? "" : path.substr(pos); int fps,
for (char& ch : ext) { int quality) {
ch = static_cast<char>(tolower(static_cast<unsigned char>(ch))); std::string format = output_format;
std::transform(format.begin(), format.end(), format.begin(),
[](unsigned char c) { return static_cast<char>(tolower(c)); });
if (!format.empty() && format[0] == '.') {
format.erase(format.begin());
} }
#ifdef SD_USE_WEBM #ifdef SD_USE_WEBM
if (ext == ".webm") { if (format == "webm") {
return create_webm_from_sd_images(filename, images, num_images, fps, quality); return create_webm_from_sd_images_to_vector(images, num_images, fps, quality);
} }
#endif #endif
#ifdef SD_USE_WEBP #ifdef SD_USE_WEBP
if (ext == ".webp") { if (format == "webp") {
return create_animated_webp_from_sd_images(filename, images, num_images, fps, quality); return create_animated_webp_from_sd_images_to_vector(images, num_images, fps, quality);
} }
#endif #endif
return create_mjpg_avi_from_sd_images(filename, images, num_images, fps, quality); return create_mjpg_avi_from_sd_images_to_vector(images, num_images, fps, quality);
}
int create_video_from_sd_images(const char* filename, sd_image_t* images, int num_images, int fps, int quality) {
std::string path = filename ? filename : "";
auto pos = path.find_last_of('.');
std::string ext = pos == std::string::npos ? "" : path.substr(pos);
std::vector<uint8_t> video_data = create_video_from_sd_images_to_vector(ext, images, num_images, fps, quality);
if (video_data.empty()) {
return -1;
}
if (!write_binary_file_bytes(filename, video_data)) {
perror("Error opening file for writing");
return -1;
}
return 0;
} }

View File

@ -58,6 +58,10 @@ int create_mjpg_avi_from_sd_images(const char* filename,
int num_images, int num_images,
int fps, int fps,
int quality = 90); int quality = 90);
std::vector<uint8_t> create_mjpg_avi_from_sd_images_to_vector(sd_image_t* images,
int num_images,
int fps,
int quality = 90);
#ifdef SD_USE_WEBP #ifdef SD_USE_WEBP
int create_animated_webp_from_sd_images(const char* filename, int create_animated_webp_from_sd_images(const char* filename,
@ -65,6 +69,10 @@ int create_animated_webp_from_sd_images(const char* filename,
int num_images, int num_images,
int fps, int fps,
int quality = 90); int quality = 90);
std::vector<uint8_t> create_animated_webp_from_sd_images_to_vector(sd_image_t* images,
int num_images,
int fps,
int quality = 90);
#endif #endif
#ifdef SD_USE_WEBM #ifdef SD_USE_WEBM
@ -73,6 +81,10 @@ int create_webm_from_sd_images(const char* filename,
int num_images, int num_images,
int fps, int fps,
int quality = 90); int quality = 90);
std::vector<uint8_t> create_webm_from_sd_images_to_vector(sd_image_t* images,
int num_images,
int fps,
int quality = 90);
#endif #endif
int create_video_from_sd_images(const char* filename, int create_video_from_sd_images(const char* filename,
@ -80,5 +92,10 @@ int create_video_from_sd_images(const char* filename,
int num_images, int num_images,
int fps, int fps,
int quality = 90); int quality = 90);
std::vector<uint8_t> create_video_from_sd_images_to_vector(const std::string& output_format,
sd_image_t* images,
int num_images,
int fps,
int quality = 90);
#endif // __MEDIA_IO_H__ #endif // __MEDIA_IO_H__

View File

@ -9,7 +9,7 @@ The server currently exposes three API families:
- `sdcpp API` under `/sdcpp/v1/...` - `sdcpp API` under `/sdcpp/v1/...`
The `sdcpp API` is the native API surface. The `sdcpp API` is the native API surface.
Its request schema is also the canonical schema for `sd_cpp_extra_args`. Its request schema is the same schema used by `sd_cpp_extra_args`.
Global LoRA rule: Global LoRA rule:
@ -55,8 +55,6 @@ Current endpoints include:
- `POST /sdcpp/v1/jobs/{id}/cancel` - `POST /sdcpp/v1/jobs/{id}/cancel`
- `POST /sdcpp/v1/vid_gen` - `POST /sdcpp/v1/vid_gen`
`POST /sdcpp/v1/vid_gen` is currently exposed but returns `501 Not Implemented`.
## `sd_cpp_extra_args` ## `sd_cpp_extra_args`
`sd_cpp_extra_args` is an extension mechanism for the compatibility APIs. `sd_cpp_extra_args` is an extension mechanism for the compatibility APIs.
@ -79,12 +77,12 @@ Behavior:
- The JSON block is parsed using the same field rules as the `sdcpp API`. - The JSON block is parsed using the same field rules as the `sdcpp API`.
- The block is removed from the final prompt before generation. - The block is removed from the final prompt before generation.
Intended use: Supported use:
- extend `OpenAI API` requests with native `stable-diffusion.cpp` controls - extend `OpenAI API` requests with native `stable-diffusion.cpp` controls
- extend `sdapi` requests with native `stable-diffusion.cpp` controls - extend `sdapi` requests with native `stable-diffusion.cpp` controls
Not intended use: Unsupported use:
- do not use `sd_cpp_extra_args` with `/sdcpp/v1/*` - do not use `sd_cpp_extra_args` with `/sdcpp/v1/*`
@ -372,20 +370,25 @@ Field types:
Returns frontend-friendly capability metadata. Returns frontend-friendly capability metadata.
Typical contents: The mode-aware fields are the primary interface. The top-level compatibility fields are deprecated mirrors kept for older clients.
| Field | Type | Top-level fields:
| --- | --- |
| `model` | `object` |
| `defaults` | `object` |
| `loras` | `array<object>` |
| `samplers` | `array<string>` |
| `schedulers` | `array<string>` |
| `output_formats` | `array<string>` |
| `limits` | `object` |
| `features` | `object` |
Nested fields currently returned: | Field | Type | Notes |
| --- | --- | --- |
| `model` | `object` | Loaded model metadata |
| `current_mode` | `string` | The native generation mode mirrored by top-level compatibility fields |
| `supported_modes` | `array<string>` | Supported native modes such as `img_gen` or `vid_gen` |
| `defaults` | `object` | Deprecated compatibility mirror of `defaults_by_mode[current_mode]` |
| `output_formats` | `array<string>` | Deprecated compatibility mirror of `output_formats_by_mode[current_mode]` |
| `features` | `object` | Deprecated compatibility mirror of `features_by_mode[current_mode]` |
| `defaults_by_mode` | `object` | Explicit defaults for each supported mode |
| `output_formats_by_mode` | `object` | Explicit output formats for each supported mode |
| `features_by_mode` | `object` | Explicit feature flags for each supported mode |
| `samplers` | `array<string>` | Available sampling methods |
| `schedulers` | `array<string>` | Available schedulers |
| `loras` | `array<object>` | Available LoRA entries |
| `limits` | `object` | Shared queue and size limits |
`model` `model`
@ -395,50 +398,24 @@ Nested fields currently returned:
| `model.stem` | `string` | | `model.stem` | `string` |
| `model.path` | `string` | | `model.path` | `string` |
`defaults` Compatibility rules:
- `defaults`, `output_formats`, and `features` are deprecated compatibility mirrors
- those three top-level fields always mirror `current_mode`
- `supported_modes`, `defaults_by_mode`, `output_formats_by_mode`, and `features_by_mode` are the mode-aware fields
Mode-aware objects:
| Field | Type | | Field | Type |
| --- | --- | | --- | --- |
| `defaults.prompt` | `string` | | `defaults_by_mode.img_gen` | `object` |
| `defaults.negative_prompt` | `string` | | `defaults_by_mode.vid_gen` | `object` |
| `defaults.clip_skip` | `integer` | | `output_formats_by_mode.img_gen` | `array<string>` |
| `defaults.width` | `integer` | | `output_formats_by_mode.vid_gen` | `array<string>` |
| `defaults.height` | `integer` | | `features_by_mode.img_gen` | `object` |
| `defaults.strength` | `number` | | `features_by_mode.vid_gen` | `object` |
| `defaults.seed` | `integer` |
| `defaults.batch_count` | `integer` | Shared nested fields:
| `defaults.auto_resize_ref_image` | `boolean` |
| `defaults.increase_ref_index` | `boolean` |
| `defaults.control_strength` | `number` |
| `defaults.sample_params` | `object` |
| `defaults.sample_params.scheduler` | `string` |
| `defaults.sample_params.sample_method` | `string` |
| `defaults.sample_params.sample_steps` | `integer` |
| `defaults.sample_params.eta` | `number \| null` |
| `defaults.sample_params.shifted_timestep` | `integer` |
| `defaults.sample_params.flow_shift` | `number \| null` |
| `defaults.sample_params.guidance` | `object` |
| `defaults.sample_params.guidance.txt_cfg` | `number` |
| `defaults.sample_params.guidance.img_cfg` | `number \| null` |
| `defaults.sample_params.guidance.distilled_guidance` | `number` |
| `defaults.sample_params.guidance.slg` | `object` |
| `defaults.sample_params.guidance.slg.layers` | `array<integer>` |
| `defaults.sample_params.guidance.slg.layer_start` | `number` |
| `defaults.sample_params.guidance.slg.layer_end` | `number` |
| `defaults.sample_params.guidance.slg.scale` | `number` |
| `defaults.vae_tiling_params` | `object` |
| `defaults.vae_tiling_params.enabled` | `boolean` |
| `defaults.vae_tiling_params.tile_size_x` | `integer` |
| `defaults.vae_tiling_params.tile_size_y` | `integer` |
| `defaults.vae_tiling_params.target_overlap` | `number` |
| `defaults.vae_tiling_params.rel_size_x` | `number` |
| `defaults.vae_tiling_params.rel_size_y` | `number` |
| `defaults.cache_mode` | `string` |
| `defaults.cache_option` | `string` |
| `defaults.scm_mask` | `string` |
| `defaults.scm_policy_dynamic` | `boolean` |
| `defaults.output_format` | `string` |
| `defaults.output_compression` | `integer` |
`loras` `loras`
@ -458,19 +435,100 @@ Nested fields currently returned:
| `limits.max_batch_count` | `integer` | | `limits.max_batch_count` | `integer` |
| `limits.max_queue_size` | `integer` | | `limits.max_queue_size` | `integer` |
`features` Shared default fields used by both `img_gen` and `vid_gen`:
| Field | Type | | Field | Type |
| --- | --- | | --- | --- |
| `features.init_image` | `boolean` | | `prompt` | `string` |
| `features.mask_image` | `boolean` | | `negative_prompt` | `string` |
| `features.control_image` | `boolean` | | `clip_skip` | `integer` |
| `features.ref_images` | `boolean` | | `width` | `integer` |
| `features.lora` | `boolean` | | `height` | `integer` |
| `features.vae_tiling` | `boolean` | | `strength` | `number` |
| `features.cache` | `boolean` | | `seed` | `integer` |
| `features.cancel_queued` | `boolean` | | `sample_params` | `object` |
| `features.cancel_generating` | `boolean` | | `sample_params.scheduler` | `string` |
| `sample_params.sample_method` | `string` |
| `sample_params.sample_steps` | `integer` |
| `sample_params.eta` | `number \| null` |
| `sample_params.shifted_timestep` | `integer` |
| `sample_params.flow_shift` | `number \| null` |
| `sample_params.guidance.txt_cfg` | `number` |
| `sample_params.guidance.img_cfg` | `number \| null` |
| `sample_params.guidance.distilled_guidance` | `number` |
| `sample_params.guidance.slg.layers` | `array<integer>` |
| `sample_params.guidance.slg.layer_start` | `number` |
| `sample_params.guidance.slg.layer_end` | `number` |
| `sample_params.guidance.slg.scale` | `number` |
| `vae_tiling_params` | `object` |
| `vae_tiling_params.enabled` | `boolean` |
| `vae_tiling_params.tile_size_x` | `integer` |
| `vae_tiling_params.tile_size_y` | `integer` |
| `vae_tiling_params.target_overlap` | `number` |
| `vae_tiling_params.rel_size_x` | `number` |
| `vae_tiling_params.rel_size_y` | `number` |
| `cache_mode` | `string` |
| `cache_option` | `string` |
| `scm_mask` | `string` |
| `scm_policy_dynamic` | `boolean` |
| `output_format` | `string` |
| `output_compression` | `integer` |
`img_gen`-specific default fields:
| Field | Type |
| --- | --- |
| `batch_count` | `integer` |
| `auto_resize_ref_image` | `boolean` |
| `increase_ref_index` | `boolean` |
| `control_strength` | `number` |
`vid_gen`-specific default fields:
| Field | Type |
| --- | --- |
| `video_frames` | `integer` |
| `fps` | `integer` |
| `moe_boundary` | `number` |
| `vace_strength` | `number` |
| `high_noise_sample_params` | `object` |
| `high_noise_sample_params.scheduler` | `string` |
| `high_noise_sample_params.sample_method` | `string` |
| `high_noise_sample_params.sample_steps` | `integer` |
| `high_noise_sample_params.eta` | `number \| null` |
| `high_noise_sample_params.shifted_timestep` | `integer` |
| `high_noise_sample_params.flow_shift` | `number \| null` |
| `high_noise_sample_params.guidance.txt_cfg` | `number` |
| `high_noise_sample_params.guidance.img_cfg` | `number \| null` |
| `high_noise_sample_params.guidance.distilled_guidance` | `number` |
| `high_noise_sample_params.guidance.slg.layers` | `array<integer>` |
| `high_noise_sample_params.guidance.slg.layer_start` | `number` |
| `high_noise_sample_params.guidance.slg.layer_end` | `number` |
| `high_noise_sample_params.guidance.slg.scale` | `number` |
Fields returned in `features_by_mode.img_gen`:
- `init_image`
- `mask_image`
- `control_image`
- `ref_images`
- `lora`
- `vae_tiling`
- `cache`
- `cancel_queued`
- `cancel_generating`
Fields returned in `features_by_mode.vid_gen`:
- `init_image`
- `end_image`
- `control_frames`
- `high_noise_sample_params`
- `lora`
- `vae_tiling`
- `cache`
- `cancel_queued`
- `cancel_generating`
#### `POST /sdcpp/v1/img_gen` #### `POST /sdcpp/v1/img_gen`
@ -521,9 +579,7 @@ Typical status codes:
- `409 Conflict` - `409 Conflict`
- `410 Gone` - `410 Gone`
### Canonical Request Schema ### Request Body
The `sdcpp API` request body is the canonical native schema.
Example: Example:
@ -612,7 +668,7 @@ Channel expectations:
If omitted or null: If omitted or null:
- single-image fields map to an empty `sd_image_t` - single-image fields map to an empty `sd_image_t`
- array fields map to `nullptr + count = 0` - array fields map to an empty C-style array, represented as `pointer = nullptr` and `count = 0`
### Field Mapping Summary ### Field Mapping Summary
@ -686,11 +742,11 @@ HTTP-only output fields:
| `output_format` | `string` | | `output_format` | `string` |
| `output_compression` | `integer` | | `output_compression` | `integer` |
### Optional Field Semantics ### Optional Field Handling
Clients should preserve unset semantics for optional sampling fields. Optional sampling fields may be omitted.
If a user has not explicitly provided one of these fields, the client should omit it instead of injecting a guessed fallback: When omitted, backend defaults apply to these fields:
- `sample_params.scheduler` - `sample_params.scheduler`
- `sample_params.sample_method` - `sample_params.sample_method`
@ -766,29 +822,394 @@ Example cancelled job:
} }
``` ```
### Validation and Retention ### Submission Errors
Recommended behavior: `POST /sdcpp/v1/img_gen` may return:
- malformed JSON returns `400` - `202 Accepted` when the job is created
- invalid image payloads return `400` - `400 Bad Request` for an empty body, unsupported model mode, invalid JSON, or invalid generation parameters
- invalid parameter structure returns `400` - `429 Too Many Requests` when the job queue is full
- queue full returns `429` or `503` - `500 Internal Server Error` for unexpected server exceptions during submission
- accepted runtime failures transition the job to `failed`
- unsupported in-progress cancellation may return `409`
Recommended retention controls: ### `vid_gen`
- pending job limit The following section documents the native async contract for video generation.
- completed job TTL
- failed job TTL
### Future `vid_gen` #### `POST /sdcpp/v1/vid_gen`
Future `vid_gen` should reuse the same async job model: Submits an async video generation job.
- `POST /sdcpp/v1/vid_gen` Successful submission returns `202 Accepted`.
- `GET /sdcpp/v1/jobs/{id}`
- `POST /sdcpp/v1/jobs/{id}/cancel`
Its request body should mirror `sd_vid_gen_params_t` in the same way that `img_gen` mirrors `sd_img_gen_params_t`. Example response:
```json
{
"id": "job_01HTXYZVID",
"kind": "vid_gen",
"status": "queued",
"created": 1775401200,
"poll_url": "/sdcpp/v1/jobs/job_01HTXYZVID"
}
```
Response fields:
| Field | Type |
| --- | --- |
| `id` | `string` |
| `kind` | `string` |
| `status` | `string` |
| `created` | `integer` |
| `poll_url` | `string` |
### Request Body
Compared with `img_gen`, the `vid_gen` request body differs as follows:
- `batch_count` is not part of the request schema, because a `vid_gen` job produces a single video sequence
- `ref_images`, `mask_image`, `control_image`, `control_strength`, and `embed_image_metadata` are not part of the request schema
- `end_image`, `control_frames`, `high_noise_sample_params`, `video_frames`, `fps`, `moe_boundary`, and `vace_strength` are added
Example:
```json
{
"prompt": "a cat walking through a rainy alley",
"negative_prompt": "",
"clip_skip": -1,
"width": 832,
"height": 480,
"strength": 0.75,
"seed": -1,
"video_frames": 33,
"fps": 16,
"moe_boundary": 0.875,
"vace_strength": 1.0,
"init_image": null,
"end_image": null,
"control_frames": [],
"sample_params": {
"scheduler": "discrete",
"sample_method": "euler",
"sample_steps": 28,
"eta": 1.0,
"shifted_timestep": 0,
"custom_sigmas": [],
"flow_shift": 0.0,
"guidance": {
"txt_cfg": 7.0,
"img_cfg": 7.0,
"distilled_guidance": 3.5,
"slg": {
"layers": [7, 8, 9],
"layer_start": 0.01,
"layer_end": 0.2,
"scale": 0.0
}
}
},
"high_noise_sample_params": {
"scheduler": "discrete",
"sample_method": "euler",
"sample_steps": -1,
"eta": 1.0,
"shifted_timestep": 0,
"flow_shift": 0.0,
"guidance": {
"txt_cfg": 7.0,
"img_cfg": 7.0,
"distilled_guidance": 3.5,
"slg": {
"layers": [7, 8, 9],
"layer_start": 0.01,
"layer_end": 0.2,
"scale": 0.0
}
}
},
"lora": [],
"vae_tiling_params": {
"enabled": false,
"tile_size_x": 0,
"tile_size_y": 0,
"target_overlap": 0.5,
"rel_size_x": 0.0,
"rel_size_y": 0.0
},
"cache_mode": "disabled",
"cache_option": "",
"scm_mask": "",
"scm_policy_dynamic": true,
"output_format": "webm",
"output_compression": 100
}
```
### LoRA Rules
- The server only accepts explicit LoRA entries from the `lora` field.
- Prompt-embedded `<lora:...>` tags are intentionally unsupported.
- `lora[].is_high_noise` controls whether a LoRA applies only to the high-noise stage.
### Image and Frame Encoding Rules
Any image field accepts:
- a raw base64 string, or
- a data URL such as `data:image/png;base64,...`
Channel expectations:
- `init_image`: 3 channels
- `end_image`: 3 channels
- `control_frames[]`: 3 channels
Frame ordering rules:
- `control_frames[]` order is the conditioning frame order
- `control_frames[]` is preserved in request order
If omitted or null:
- single-image fields map to an empty `sd_image_t`
- array fields map to an empty C-style array, represented as `pointer = nullptr` and `count = 0`
### Field Mapping Summary
Top-level scalar fields:
| Field | Type |
| --- | --- |
| `prompt` | `string` |
| `negative_prompt` | `string` |
| `clip_skip` | `integer` |
| `width` | `integer` |
| `height` | `integer` |
| `strength` | `number` |
| `seed` | `integer` |
| `video_frames` | `integer` |
| `fps` | `integer` |
| `moe_boundary` | `number` |
| `vace_strength` | `number` |
Image and frame fields:
| Field | Type |
| --- | --- |
| `init_image` | `string \| null` |
| `end_image` | `string \| null` |
| `control_frames` | `array<string>` |
LoRA fields:
| Field | Type |
| --- | --- |
| `lora[].path` | `string` |
| `lora[].multiplier` | `number` |
| `lora[].is_high_noise` | `boolean` |
Sampling fields:
| Field | Type |
| --- | --- |
| `sample_params.scheduler` | `string` |
| `sample_params.sample_method` | `string` |
| `sample_params.sample_steps` | `integer` |
| `sample_params.eta` | `number` |
| `sample_params.shifted_timestep` | `integer` |
| `sample_params.custom_sigmas` | `array<number>` |
| `sample_params.flow_shift` | `number` |
| `sample_params.guidance.txt_cfg` | `number` |
| `sample_params.guidance.img_cfg` | `number` |
| `sample_params.guidance.distilled_guidance` | `number` |
| `sample_params.guidance.slg.layers` | `array<integer>` |
| `sample_params.guidance.slg.layer_start` | `number` |
| `sample_params.guidance.slg.layer_end` | `number` |
| `sample_params.guidance.slg.scale` | `number` |
High-noise sampling fields:
| Field | Type |
| --- | --- |
| `high_noise_sample_params.scheduler` | `string` |
| `high_noise_sample_params.sample_method` | `string` |
| `high_noise_sample_params.sample_steps` | `integer` |
| `high_noise_sample_params.eta` | `number` |
| `high_noise_sample_params.shifted_timestep` | `integer` |
| `high_noise_sample_params.flow_shift` | `number` |
| `high_noise_sample_params.guidance.txt_cfg` | `number` |
| `high_noise_sample_params.guidance.img_cfg` | `number` |
| `high_noise_sample_params.guidance.distilled_guidance` | `number` |
| `high_noise_sample_params.guidance.slg.layers` | `array<integer>` |
| `high_noise_sample_params.guidance.slg.layer_start` | `number` |
| `high_noise_sample_params.guidance.slg.layer_end` | `number` |
| `high_noise_sample_params.guidance.slg.scale` | `number` |
Other native fields:
| Field | Type |
| --- | --- |
| `vae_tiling_params` | `object` |
| `cache_mode` | `string` |
| `cache_option` | `string` |
| `scm_mask` | `string` |
| `scm_policy_dynamic` | `boolean` |
HTTP-only output fields:
| Field | Type |
| --- | --- |
| `output_format` | `string` |
| `output_compression` | `integer` |
For `vid_gen`, `output_format` and `output_compression` control container encoding.
`fps` is request metadata for the generated sequence and is echoed in the completed job result.
Allowed `output_format` values:
- `webm`
- `webp`
- `avi`
Output format behavior:
- `output_format` defaults to `webm`
- `webp` means animated WebP
- `avi` means MJPG AVI
- `webm` requires the server to be built with WebM support; otherwise the request returns `400`
### Result Payload
Completed jobs return one encoded container payload, not a list of per-frame images.
Result fields:
- `result.b64_json` contains the whole encoded container file as base64
- `result.mime_type` identifies the media type
- `result.output_format` echoes the selected container format
- `result.fps` echoes the effective playback FPS
- `result.frame_count` reports the actual decoded frame count used to build the container
Expected MIME types:
| `output_format` | `mime_type` |
| --- | --- |
| `webm` | `video/webm` |
| `webp` | `image/webp` |
| `avi` | `video/x-msvideo` |
### Optional Field Handling
Optional sampling fields may be omitted.
When omitted, backend defaults apply to these fields:
- `sample_params.scheduler`
- `sample_params.sample_method`
- `sample_params.eta`
- `sample_params.flow_shift`
- `sample_params.guidance.img_cfg`
- `high_noise_sample_params.scheduler`
- `high_noise_sample_params.sample_method`
- `high_noise_sample_params.eta`
- `high_noise_sample_params.flow_shift`
- `high_noise_sample_params.guidance.img_cfg`
`high_noise_sample_params` may also be omitted entirely.
### Frame Count Semantics
`video_frames` is the requested target length. The current core video path internally normalizes the effective frame count to the largest value of the form `4n + 1` (1, 5, 9, 13, ...) that does not exceed the requested count.
Examples:
- `video_frames = 33` stays `33`
- `video_frames = 34` becomes `33`
- `video_frames = 32` becomes `29`
The completed job payload includes the actual decoded `frame_count`.
### Completion Result
Example completed job:
```json
{
"id": "job_01HTXYZVID",
"kind": "vid_gen",
"status": "completed",
"created": 1775401200,
"started": 1775401203,
"completed": 1775401215,
"queue_position": 0,
"result": {
"output_format": "webm",
"mime_type": "video/webm",
"fps": 16,
"frame_count": 33,
"b64_json": "GkXfo59ChoEBQveBAULygQRC84EIQo..."
},
"error": null
}
```
The response returns the encoded `.webm`, animated `.webp`, or `.avi` container payload directly.
### Failure Result
Example failed job:
```json
{
"id": "job_01HTXYZVID",
"kind": "vid_gen",
"status": "failed",
"created": 1775401200,
"started": 1775401203,
"completed": 1775401204,
"queue_position": 0,
"result": null,
"error": {
"code": "generation_failed",
"message": "generate_video returned no results"
}
}
```
### Cancelled Result
Example cancelled job:
```json
{
"id": "job_01HTXYZVID",
"kind": "vid_gen",
"status": "cancelled",
"created": 1775401200,
"started": null,
"completed": 1775401202,
"queue_position": 0,
"result": null,
"error": {
"code": "cancelled",
"message": "job cancelled by client"
}
}
```
### Submission Errors
`POST /sdcpp/v1/vid_gen` may return:
- `202 Accepted` when the job is created
- `400 Bad Request` for an empty body, unsupported model mode, invalid JSON, invalid generation parameters, or an unsupported output format
- `429 Too Many Requests` when the job queue is full
- `500 Internal Server Error` for unexpected server exceptions during submission

View File

@ -95,8 +95,12 @@ bool cancel_queued_job(AsyncJobManager& manager, AsyncGenerationJob& job) {
job.status = AsyncJobStatus::Cancelled; job.status = AsyncJobStatus::Cancelled;
job.completed_at = unix_timestamp_now(); job.completed_at = unix_timestamp_now();
job.result_images_b64.clear(); job.result_images_b64.clear();
job.error_code = "cancelled"; job.result_media_b64.clear();
job.error_message = "job cancelled by client"; job.result_media_mime_type.clear();
job.result_frame_count = 0;
job.result_fps = 0;
job.error_code = "cancelled";
job.error_message = "job cancelled by client";
return true; return true;
} }
@ -122,14 +126,24 @@ json make_async_job_json(const AsyncJobManager& manager, const AsyncGenerationJo
} }
if (job.status == AsyncJobStatus::Completed) { if (job.status == AsyncJobStatus::Completed) {
json images = json::array(); if (job.kind == AsyncJobKind::VidGen) {
for (size_t i = 0; i < job.result_images_b64.size(); ++i) { result["result"] = {
images.push_back({{"index", i}, {"b64_json", job.result_images_b64[i]}}); {"output_format", job.vid_gen.output_format},
{"mime_type", job.result_media_mime_type},
{"fps", job.result_fps},
{"frame_count", job.result_frame_count},
{"b64_json", job.result_media_b64},
};
} else {
json images = json::array();
for (size_t i = 0; i < job.result_images_b64.size(); ++i) {
images.push_back({{"index", i}, {"b64_json", job.result_images_b64[i]}});
}
result["result"] = {
{"output_format", job.img_gen.output_format},
{"images", images},
};
} }
result["result"] = {
{"output_format", job.img_gen.output_format},
{"images", images},
};
result["error"] = nullptr; result["error"] = nullptr;
} else if (job.status == AsyncJobStatus::Failed || } else if (job.status == AsyncJobStatus::Failed ||
job.status == AsyncJobStatus::Cancelled) { job.status == AsyncJobStatus::Cancelled) {
@ -156,16 +170,15 @@ bool execute_img_gen_job(ServerRuntime& runtime,
sd_img_gen_params_t params = job.img_gen.to_sd_img_gen_params_t(); sd_img_gen_params_t params = job.img_gen.to_sd_img_gen_params_t();
SDImageVec results; SDImageVec results;
int num_results = 0;
{ {
std::lock_guard<std::mutex> lock(*runtime.sd_ctx_mutex); std::lock_guard<std::mutex> lock(*runtime.sd_ctx_mutex);
sd_image_t* raw_results = generate_image(runtime.sd_ctx, &params); sd_image_t* raw_results = generate_image(runtime.sd_ctx, &params);
num_results = params.batch_count; results.adopt(raw_results, params.batch_count);
results.adopt(raw_results, num_results);
} }
if (results.empty() || num_results <= 0) { const int num_results = results.count();
if (num_results <= 0) {
error_message = "generate_image returned no results"; error_message = "generate_image returned no results";
return false; return false;
} }
@ -208,6 +221,47 @@ bool execute_img_gen_job(ServerRuntime& runtime,
return true; return true;
} }
// Executes a queued vid_gen job on the async worker thread.
//
// On success, fills the output parameters and returns true:
//   output_media_b64       - base64-encoded video container bytes
//   output_media_mime_type - MIME type matching the selected output format
//   output_frame_count     - number of frames actually used to build the container
//   output_fps             - playback FPS taken from the job's generation params
// On failure, sets error_message and returns false; the caller marks the
// job as failed.
bool execute_vid_gen_job(ServerRuntime& runtime,
                         AsyncGenerationJob& job,
                         std::string& output_media_b64,
                         std::string& output_media_mime_type,
                         int& output_frame_count,
                         int& output_fps,
                         std::string& error_message) {
    sd_vid_gen_params_t params = job.vid_gen.to_sd_vid_gen_params_t();
    SDImageVec results;
    int num_results = 0;
    {
        // Hold the context mutex only for the generation call itself;
        // container encoding below runs outside the lock.
        std::lock_guard<std::mutex> lock(*runtime.sd_ctx_mutex);
        sd_image_t* raw_results = generate_video(runtime.sd_ctx, &params, &num_results);
        // adopt() hands the raw C array to the RAII wrapper so the frames
        // are released even on the early-return paths below.
        results.adopt(raw_results, num_results);
    }
    // Re-read the count from the wrapper (authoritative after adopt()).
    num_results = results.count();
    if (num_results <= 0) {
        error_message = "generate_video returned no results";
        return false;
    }
    std::vector<uint8_t> video_bytes = create_video_from_sd_images_to_vector(job.vid_gen.output_format,
                                                                             results.data(),
                                                                             num_results,
                                                                             job.vid_gen.gen_params.fps,
                                                                             job.vid_gen.output_compression);
    // An empty byte vector signals that container encoding failed.
    if (video_bytes.empty()) {
        error_message = "failed to encode generated video container";
        return false;
    }
    output_media_b64 = base64_encode(video_bytes);
    output_media_mime_type = video_mime_type(job.vid_gen.output_format);
    output_frame_count = num_results;
    output_fps = job.vid_gen.gen_params.fps;
    return true;
}
void async_job_worker(ServerRuntime& runtime) { void async_job_worker(ServerRuntime& runtime) {
AsyncJobManager& manager = *runtime.async_job_manager; AsyncJobManager& manager = *runtime.async_job_manager;
@ -240,11 +294,23 @@ void async_job_worker(ServerRuntime& runtime) {
} }
std::vector<std::string> output_images; std::vector<std::string> output_images;
std::string output_media_b64;
std::string output_media_mime_type;
int output_frame_count = 0;
int output_fps = 0;
std::string error_message; std::string error_message;
bool ok = false; bool ok = false;
if (job->kind == AsyncJobKind::ImgGen) { if (job->kind == AsyncJobKind::ImgGen) {
ok = execute_img_gen_job(runtime, *job, output_images, error_message); ok = execute_img_gen_job(runtime, *job, output_images, error_message);
} else if (job->kind == AsyncJobKind::VidGen) {
ok = execute_vid_gen_job(runtime,
*job,
output_media_b64,
output_media_mime_type,
output_frame_count,
output_fps,
error_message);
} else { } else {
error_message = "unsupported job kind"; error_message = "unsupported job kind";
} }
@ -258,8 +324,12 @@ void async_job_worker(ServerRuntime& runtime) {
job->completed_at = unix_timestamp_now(); job->completed_at = unix_timestamp_now();
if (ok) { if (ok) {
job->status = AsyncJobStatus::Completed; job->status = AsyncJobStatus::Completed;
job->result_images_b64 = std::move(output_images); job->result_images_b64 = std::move(output_images);
job->result_media_b64 = std::move(output_media_b64);
job->result_media_mime_type = std::move(output_media_mime_type);
job->result_frame_count = output_frame_count;
job->result_fps = output_fps;
job->error_code.clear(); job->error_code.clear();
job->error_message.clear(); job->error_message.clear();
} else { } else {
@ -267,6 +337,10 @@ void async_job_worker(ServerRuntime& runtime) {
job->error_code = "generation_failed"; job->error_code = "generation_failed";
job->error_message = error_message.empty() ? "unknown generation error" : error_message; job->error_message = error_message.empty() ? "unknown generation error" : error_message;
job->result_images_b64.clear(); job->result_images_b64.clear();
job->result_media_b64.clear();
job->result_media_mime_type.clear();
job->result_frame_count = 0;
job->result_fps = 0;
} }
purge_expired_jobs(manager); purge_expired_jobs(manager);

View File

@ -36,7 +36,12 @@ struct AsyncGenerationJob {
int64_t started_at = 0; int64_t started_at = 0;
int64_t completed_at = 0; int64_t completed_at = 0;
ImgGenJobRequest img_gen; ImgGenJobRequest img_gen;
VidGenJobRequest vid_gen;
std::vector<std::string> result_images_b64; std::vector<std::string> result_images_b64;
std::string result_media_b64;
std::string result_media_mime_type;
int result_frame_count = 0;
int result_fps = 0;
std::string error_code; std::string error_code;
std::string error_message; std::string error_message;
}; };
@ -63,4 +68,11 @@ bool execute_img_gen_job(ServerRuntime& runtime,
AsyncGenerationJob& job, AsyncGenerationJob& job,
std::vector<std::string>& output_images, std::vector<std::string>& output_images,
std::string& error_message); std::string& error_message);
bool execute_vid_gen_job(ServerRuntime& runtime,
AsyncGenerationJob& job,
std::string& output_media_b64,
std::string& output_media_mime_type,
int& output_frame_count,
int& output_fps,
std::string& error_message);
void async_job_worker(ServerRuntime& runtime); void async_job_worker(ServerRuntime& runtime);

@ -1 +1 @@
Subproject commit 740475a7a6794dc07fb23e8ec5dc56e7e80aa8c1 Subproject commit 797ccf80825cc035508ba9b599b2a21953e7f835

View File

@ -253,6 +253,12 @@ void register_openai_api_endpoints(httplib::Server& svr, ServerRuntime& rt) {
svr.Post("/v1/images/generations", [runtime](const httplib::Request& req, httplib::Response& res) { svr.Post("/v1/images/generations", [runtime](const httplib::Request& req, httplib::Response& res) {
try { try {
if (!runtime_supports_generation_mode(*runtime, IMG_GEN)) {
res.status = 400;
res.set_content(json({{"error", unsupported_generation_mode_error(IMG_GEN)}}).dump(), "application/json");
return;
}
ImgGenJobRequest request; ImgGenJobRequest request;
std::string error_message; std::string error_message;
if (!build_openai_generation_request(req, *runtime, request, error_message)) { if (!build_openai_generation_request(req, *runtime, request, error_message)) {
@ -319,6 +325,12 @@ void register_openai_api_endpoints(httplib::Server& svr, ServerRuntime& rt) {
svr.Post("/v1/images/edits", [runtime](const httplib::Request& req, httplib::Response& res) { svr.Post("/v1/images/edits", [runtime](const httplib::Request& req, httplib::Response& res) {
try { try {
if (!runtime_supports_generation_mode(*runtime, IMG_GEN)) {
res.status = 400;
res.set_content(json({{"error", unsupported_generation_mode_error(IMG_GEN)}}).dump(), "application/json");
return;
}
ImgGenJobRequest request; ImgGenJobRequest request;
std::string error_message; std::string error_message;
if (!build_openai_edit_request(req, *runtime, request, error_message)) { if (!build_openai_edit_request(req, *runtime, request, error_message)) {

View File

@ -246,6 +246,11 @@ void register_sdapi_endpoints(httplib::Server& svr, ServerRuntime& rt) {
res.set_content(R"({"error":"empty body"})", "application/json"); res.set_content(R"({"error":"empty body"})", "application/json");
return; return;
} }
if (!runtime_supports_generation_mode(*runtime, IMG_GEN)) {
res.status = 400;
res.set_content(json({{"error", unsupported_generation_mode_error(IMG_GEN)}}).dump(), "application/json");
return;
}
json j = json::parse(req.body); json j = json::parse(req.body);
ImgGenJobRequest request; ImgGenJobRequest request;

View File

@ -75,48 +75,33 @@ static fs::path resolve_display_model_path(const ServerRuntime& runtime) {
return {}; return {};
} }
static json make_capabilities_json(ServerRuntime& runtime) { static json make_sample_params_json(const sd_sample_params_t& sample_params, const std::vector<int>& skip_layers) {
refresh_lora_cache(runtime); const auto& guidance = sample_params.guidance;
return {
AsyncJobManager& manager = *runtime.async_job_manager; {"scheduler", capability_scheduler_name(sample_params.scheduler)},
const auto& defaults = *runtime.default_gen_params; {"sample_method", capability_sample_method_name(sample_params.sample_method)},
const auto& sample_params = defaults.sample_params; {"sample_steps", sample_params.sample_steps},
const auto& guidance = sample_params.guidance; {"eta", finite_number_or_null(sample_params.eta)},
const fs::path model_path = resolve_display_model_path(runtime); {"shifted_timestep", sample_params.shifted_timestep},
json samplers = json::array(); {"flow_shift", finite_number_or_null(sample_params.flow_shift)},
json schedulers = json::array(); {"guidance",
json output_formats = json::array({"png", "jpeg"}); {
json available_loras = json::array(); {"txt_cfg", guidance.txt_cfg},
{"img_cfg", finite_number_or_null(guidance.img_cfg)},
for (int i = 0; i < SAMPLE_METHOD_COUNT; ++i) { {"distilled_guidance", guidance.distilled_guidance},
samplers.push_back(sd_sample_method_name((sample_method_t)i)); {"slg",
} {
{"layers", skip_layers},
for (int i = 0; i < SCHEDULER_COUNT; ++i) { {"layer_start", guidance.slg.layer_start},
schedulers.push_back(sd_scheduler_name((scheduler_t)i)); {"layer_end", guidance.slg.layer_end},
} {"scale", guidance.slg.scale},
}},
#ifdef SD_USE_WEBP }},
output_formats.push_back("webp");
#endif
{
std::lock_guard<std::mutex> lock(*runtime.lora_mutex);
for (const auto& entry : *runtime.lora_cache) {
available_loras.push_back({
{"name", entry.name},
{"path", entry.path},
});
}
}
json result;
result["model"] = {
{"name", model_path.filename().u8string()},
{"stem", model_path.stem().u8string()},
{"path", model_path.u8string()},
}; };
result["defaults"] = { }
static json make_img_gen_defaults_json(const SDGenerationParams& defaults, const std::string& output_format) {
return {
{"prompt", defaults.prompt}, {"prompt", defaults.prompt},
{"negative_prompt", defaults.negative_prompt}, {"negative_prompt", defaults.negative_prompt},
{"clip_skip", defaults.clip_skip}, {"clip_skip", defaults.clip_skip},
@ -128,59 +113,177 @@ static json make_capabilities_json(ServerRuntime& runtime) {
{"auto_resize_ref_image", defaults.auto_resize_ref_image}, {"auto_resize_ref_image", defaults.auto_resize_ref_image},
{"increase_ref_index", defaults.increase_ref_index}, {"increase_ref_index", defaults.increase_ref_index},
{"control_strength", defaults.control_strength}, {"control_strength", defaults.control_strength},
{"sample_params", {"sample_params", make_sample_params_json(defaults.sample_params, defaults.skip_layers)},
{
{"scheduler", capability_scheduler_name(sample_params.scheduler)},
{"sample_method", capability_sample_method_name(sample_params.sample_method)},
{"sample_steps", sample_params.sample_steps},
{"eta", finite_number_or_null(sample_params.eta)},
{"shifted_timestep", sample_params.shifted_timestep},
{"flow_shift", finite_number_or_null(sample_params.flow_shift)},
{"guidance",
{
{"txt_cfg", guidance.txt_cfg},
{"img_cfg", finite_number_or_null(guidance.img_cfg)},
{"distilled_guidance", guidance.distilled_guidance},
{"slg",
{
{"layers", defaults.skip_layers},
{"layer_start", guidance.slg.layer_start},
{"layer_end", guidance.slg.layer_end},
{"scale", guidance.slg.scale},
}},
}},
}},
{"vae_tiling_params", make_vae_tiling_json(defaults.vae_tiling_params)}, {"vae_tiling_params", make_vae_tiling_json(defaults.vae_tiling_params)},
{"cache_mode", defaults.cache_mode}, {"cache_mode", defaults.cache_mode},
{"cache_option", defaults.cache_option}, {"cache_option", defaults.cache_option},
{"scm_mask", defaults.scm_mask}, {"scm_mask", defaults.scm_mask},
{"scm_policy_dynamic", defaults.scm_policy_dynamic}, {"scm_policy_dynamic", defaults.scm_policy_dynamic},
{"output_format", "png"}, {"output_format", output_format},
{"output_compression", 100}, {"output_compression", 100},
}; };
result["limits"] = { }
{"min_width", 64},
{"max_width", 4096}, static json make_vid_gen_defaults_json(const SDGenerationParams& defaults, const std::string& output_format) {
{"min_height", 64}, return {
{"max_height", 4096}, {"prompt", defaults.prompt},
{"max_batch_count", 8}, {"negative_prompt", defaults.negative_prompt},
{"max_queue_size", manager.max_pending_jobs}, {"clip_skip", defaults.clip_skip},
{"width", defaults.width > 0 ? defaults.width : 512},
{"height", defaults.height > 0 ? defaults.height : 512},
{"strength", defaults.strength},
{"seed", defaults.seed},
{"video_frames", defaults.video_frames},
{"fps", defaults.fps},
{"moe_boundary", defaults.moe_boundary},
{"vace_strength", defaults.vace_strength},
{"sample_params", make_sample_params_json(defaults.sample_params, defaults.skip_layers)},
{"high_noise_sample_params", make_sample_params_json(defaults.high_noise_sample_params, defaults.high_noise_skip_layers)},
{"vae_tiling_params", make_vae_tiling_json(defaults.vae_tiling_params)},
{"cache_mode", defaults.cache_mode},
{"cache_option", defaults.cache_option},
{"scm_mask", defaults.scm_mask},
{"scm_policy_dynamic", defaults.scm_policy_dynamic},
{"output_format", output_format},
{"output_compression", 100},
}; };
result["samplers"] = samplers; }
result["schedulers"] = schedulers;
result["output_formats"] = output_formats; static json make_img_gen_features_json() {
result["features"] = { return {
{"init_image", true}, {"init_image", true},
{"mask_image", true}, {"mask_image", true},
{"control_image", true}, {"control_image", true},
{"ref_images", true}, {"ref_images", true},
{"lora", true}, {"lora", true},
{"vae_tiling", true}, {"vae_tiling", true},
{"cache", true}, {"cache", true},
{"cancel_queued", true},
{"cancel_generating", false},
};
}
// Builds the static capability-flag object advertised under
// `features_by_mode.vid_gen` in the capabilities response.
// `cancel_generating` is false: jobs can only be cancelled while queued.
static json make_vid_gen_features_json() {
    return {
        {"init_image", true},
        {"end_image", true},
        {"control_frames", true},
        {"high_noise_sample_params", true},
        {"lora", true},
        {"vae_tiling", true},
        {"cache", true},
        {"cancel_queued", true},
        {"cancel_generating", false},
    };
}
static json make_capabilities_json(ServerRuntime& runtime) {
refresh_lora_cache(runtime);
AsyncJobManager& manager = *runtime.async_job_manager;
const auto& defaults = *runtime.default_gen_params;
const fs::path model_path = resolve_display_model_path(runtime);
const bool supports_img = runtime_supports_generation_mode(runtime, IMG_GEN);
const bool supports_vid = runtime_supports_generation_mode(runtime, VID_GEN);
json samplers = json::array();
json schedulers = json::array();
json image_output_formats = supported_img_output_formats();
json video_output_formats = supported_vid_output_formats();
json available_loras = json::array();
json supported_modes = json::array();
for (int i = 0; i < SAMPLE_METHOD_COUNT; ++i) {
samplers.push_back(sd_sample_method_name((sample_method_t)i));
}
for (int i = 0; i < SCHEDULER_COUNT; ++i) {
schedulers.push_back(sd_scheduler_name((scheduler_t)i));
}
{
std::lock_guard<std::mutex> lock(*runtime.lora_mutex);
for (const auto& entry : *runtime.lora_cache) {
available_loras.push_back({
{"name", entry.name},
{"path", entry.path},
});
}
}
if (supports_img) {
supported_modes.push_back("img_gen");
}
if (supports_vid) {
supported_modes.push_back("vid_gen");
}
std::string default_img_output_format = "png";
std::string default_vid_output_format = "avi";
if (!image_output_formats.empty()) {
default_img_output_format = image_output_formats[0].get<std::string>();
}
if (!video_output_formats.empty()) {
default_vid_output_format = video_output_formats[0].get<std::string>();
}
json defaults_by_mode = json::object();
json output_formats_by_mode = json::object();
json features_by_mode = json::object();
if (supports_img) {
defaults_by_mode["img_gen"] = make_img_gen_defaults_json(defaults, default_img_output_format);
output_formats_by_mode["img_gen"] = image_output_formats;
features_by_mode["img_gen"] = make_img_gen_features_json();
}
if (supports_vid) {
defaults_by_mode["vid_gen"] = make_vid_gen_defaults_json(defaults, default_vid_output_format);
output_formats_by_mode["vid_gen"] = video_output_formats;
features_by_mode["vid_gen"] = make_vid_gen_features_json();
}
json top_level_defaults = json::object();
json top_level_output_formats = json::array();
json top_level_features = {
{"cancel_queued", true}, {"cancel_queued", true},
{"cancel_generating", false}, {"cancel_generating", false},
}; };
result["loras"] = available_loras; std::string current_mode = "";
if (supports_img) {
current_mode = "img_gen";
top_level_defaults = defaults_by_mode["img_gen"];
top_level_output_formats = output_formats_by_mode["img_gen"];
top_level_features = features_by_mode["img_gen"];
} else if (supports_vid) {
current_mode = "vid_gen";
top_level_defaults = defaults_by_mode["vid_gen"];
top_level_output_formats = output_formats_by_mode["vid_gen"];
top_level_features = features_by_mode["vid_gen"];
}
json result;
result["model"] = {
{"name", model_path.filename().u8string()},
{"stem", model_path.stem().u8string()},
{"path", model_path.u8string()},
};
result["current_mode"] = current_mode;
result["supported_modes"] = supported_modes;
result["defaults"] = top_level_defaults;
result["defaults_by_mode"] = defaults_by_mode;
result["limits"] = {
{"min_width", 64},
{"max_width", 4096},
{"min_height", 64},
{"max_height", 4096},
{"max_batch_count", 8},
{"max_queue_size", manager.max_pending_jobs},
};
result["samplers"] = samplers;
result["schedulers"] = schedulers;
result["output_formats"] = top_level_output_formats;
result["output_formats_by_mode"] = output_formats_by_mode;
result["features"] = top_level_features;
result["features_by_mode"] = features_by_mode;
result["loras"] = available_loras;
return result; return result;
} }
@ -211,6 +314,33 @@ static bool parse_img_gen_request(const json& body,
return true; return true;
} }
// Parses a `POST /sdcpp/v1/vid_gen` JSON body into a VidGenJobRequest.
//
// Starts from the server's default generation params, then overlays the
// request body on top of them. Returns true on success; on failure sets
// error_message (reported to the client as a 400) and returns false.
static bool parse_vid_gen_request(const json& body,
                                  ServerRuntime& runtime,
                                  VidGenJobRequest& request,
                                  std::string& error_message) {
    request.gen_params = *runtime.default_gen_params;
    // Refresh the LoRA cache first so `lora[].path` entries in the body
    // can be resolved to full paths by the callback below.
    refresh_lora_cache(runtime);
    if (!request.gen_params.from_json_str(body.dump(), [&](const std::string& path) {
            return get_lora_full_path(runtime, path);
        })) {
        error_message = "invalid generation parameters";
        return false;
    }
    // HTTP-only output fields; `webm` is the documented default container.
    std::string output_format = body.value("output_format", "webm");
    int output_compression = body.value("output_compression", 100);
    if (!assign_output_options(request, output_format, output_compression, error_message)) {
        return false;
    }
    // Intentionally disable prompt-embedded LoRA tag parsing for server APIs.
    if (!request.gen_params.resolve_and_validate(VID_GEN, "", true)) {
        error_message = "invalid generation parameters";
        return false;
    }
    return true;
}
void register_sdcpp_api_endpoints(httplib::Server& svr, ServerRuntime& rt) { void register_sdcpp_api_endpoints(httplib::Server& svr, ServerRuntime& rt) {
ServerRuntime* runtime = &rt; ServerRuntime* runtime = &rt;
@ -226,6 +356,11 @@ void register_sdcpp_api_endpoints(httplib::Server& svr, ServerRuntime& rt) {
res.set_content(R"({"error":"empty body"})", "application/json"); res.set_content(R"({"error":"empty body"})", "application/json");
return; return;
} }
if (!runtime_supports_generation_mode(*runtime, IMG_GEN)) {
res.status = 400;
res.set_content(json({{"error", unsupported_generation_mode_error(IMG_GEN)}}).dump(), "application/json");
return;
}
json body = json::parse(req.body); json body = json::parse(req.body);
ImgGenJobRequest request; ImgGenJobRequest request;
@ -276,9 +411,66 @@ void register_sdcpp_api_endpoints(httplib::Server& svr, ServerRuntime& rt) {
} }
}); });
svr.Post("/sdcpp/v1/vid_gen", [runtime](const httplib::Request& req, httplib::Response& res) {
    try {
        if (req.body.empty()) {
            res.status = 400;
            res.set_content(R"({"error":"empty body"})", "application/json");
            return;
        }
        if (!runtime_supports_generation_mode(*runtime, VID_GEN)) {
            res.status = 400;
            res.set_content(json({{"error", unsupported_generation_mode_error(VID_GEN)}}).dump(), "application/json");
            return;
        }
        json body = json::parse(req.body);
        VidGenJobRequest request;
        std::string error_message;
        if (!parse_vid_gen_request(body, *runtime, request, error_message)) {
            res.status = 400;
            res.set_content(json({{"error", error_message}}).dump(), "application/json");
            return;
        }
        AsyncJobManager& manager = *runtime->async_job_manager;
        std::shared_ptr<AsyncGenerationJob> job = std::make_shared<AsyncGenerationJob>();
        job->kind = AsyncJobKind::VidGen;
        job->status = AsyncJobStatus::Queued;
        job->created_at = unix_timestamp_now();
        job->vid_gen = std::move(request);
        json out;
        {
            std::lock_guard<std::mutex> lock(manager.mutex);
            purge_expired_jobs(manager);
            if (count_pending_jobs(manager) >= manager.max_pending_jobs) {
                res.status = 429;
                res.set_content(R"({"error":"job queue is full"})", "application/json");
                return;
            }
            job->id = make_async_job_id(manager);
            // Serialize the response while still holding the lock: once the
            // job is visible in manager.jobs/queue, a worker thread may start
            // mutating job->status, so reading it afterwards would race.
            out["id"]       = job->id;
            out["kind"]     = async_job_kind_name(job->kind);
            out["status"]   = async_job_status_name(job->status);
            out["created"]  = job->created_at;
            out["poll_url"] = "/sdcpp/v1/jobs/" + job->id;
            manager.jobs[job->id] = job;
            manager.queue.push_back(job->id);
        }
        manager.cv.notify_one();
        res.status = 202;
        res.set_content(out.dump(), "application/json");
    } catch (const json::parse_error& e) {
        res.status = 400;
        res.set_content(json({{"error", "invalid json"}, {"message", e.what()}}).dump(), "application/json");
    } catch (const std::exception& e) {
        res.status = 500;
        res.set_content(json({{"error", "server_error"}, {"message", e.what()}}).dump(), "application/json");
    }
});
svr.Get(R"(/sdcpp/v1/jobs/([A-Za-z0-9_\-]+))", [runtime](const httplib::Request& req, httplib::Response& res) { svr.Get(R"(/sdcpp/v1/jobs/([A-Za-z0-9_\-]+))", [runtime](const httplib::Request& req, httplib::Response& res) {

View File

@ -45,6 +45,44 @@ std::string normalize_output_format(std::string output_format) {
return output_format; return output_format;
} }
// Lists the still-image output formats this build can encode.
// PNG and JPEG are always available; "webp" is appended only when the binary
// was compiled with SD_USE_WEBP and the caller allows it.
std::vector<std::string> supported_img_output_formats(bool allow_webp) {
    std::vector<std::string> result;
    result.emplace_back("png");
    result.emplace_back("jpeg");
#ifdef SD_USE_WEBP
    if (allow_webp) {
        result.emplace_back("webp");
    }
#else
    (void)allow_webp;  // webp support compiled out
#endif
    return result;
}
// Lists the video output formats this build can encode.
// "webm" and animated "webp" depend on compile-time flags; uncompressed
// AVI needs no external encoder and is always available.
std::vector<std::string> supported_vid_output_formats() {
    std::vector<std::string> result;
#ifdef SD_USE_WEBM
    result.emplace_back("webm");
#endif
#ifdef SD_USE_WEBP
    result.emplace_back("webp");
#endif
    result.emplace_back("avi");
    return result;
}
// Builds the "invalid output_format" error text, listing every video format
// compiled into this build, e.g. "… must be one of [webm, webp, avi]".
static std::string valid_vid_output_formats_message() {
    std::string message = "invalid output_format, must be one of [";
    bool first = true;
    for (const std::string& fmt : supported_vid_output_formats()) {
        if (!first) {
            message += ", ";
        }
        message += fmt;
        first = false;
    }
    message += "]";
    return message;
}
bool assign_output_options(ImgGenJobRequest& request, bool assign_output_options(ImgGenJobRequest& request,
std::string output_format, std::string output_format,
int output_compression, int output_compression,
@ -53,19 +91,88 @@ bool assign_output_options(ImgGenJobRequest& request,
request.output_format = normalize_output_format(std::move(output_format)); request.output_format = normalize_output_format(std::move(output_format));
request.output_compression = std::clamp(output_compression, 0, 100); request.output_compression = std::clamp(output_compression, 0, 100);
const bool valid_format = request.output_format == "png" || const std::vector<std::string> valid_formats = supported_img_output_formats(allow_webp);
request.output_format == "jpeg" || const bool valid_format = std::find(valid_formats.begin(),
(allow_webp && request.output_format == "webp"); valid_formats.end(),
request.output_format) != valid_formats.end();
if (!valid_format) { if (!valid_format) {
error_message = allow_webp error_message = "invalid output_format, must be one of [";
? "invalid output_format, must be one of [png, jpeg, webp]" for (size_t i = 0; i < valid_formats.size(); ++i) {
: "invalid output_format, must be one of [png, jpeg]"; if (i > 0) {
error_message += ", ";
}
error_message += valid_formats[i];
}
error_message += "]";
return false; return false;
} }
return true; return true;
} }
// Applies the output format/compression options from a vid_gen request.
// The format is normalized via normalize_output_format(), the compression
// value clamped to [0, 100], and the format checked against the set of
// formats this build was compiled with. On failure, `error_message` is
// filled and false is returned.
bool assign_output_options(VidGenJobRequest& request,
                           std::string output_format,
                           int output_compression,
                           std::string& error_message) {
    request.output_format      = normalize_output_format(std::move(output_format));
    request.output_compression = std::clamp(output_compression, 0, 100);
    // Validate against supported_vid_output_formats() — the single source of
    // truth for the SD_USE_WEBM/SD_USE_WEBP compile-time availability —
    // instead of duplicating the #ifdef ladder here. This keeps the check
    // consistent with the capabilities endpoint and the image overload.
    const std::vector<std::string> valid_formats = supported_vid_output_formats();
    const bool valid_format = std::find(valid_formats.begin(),
                                        valid_formats.end(),
                                        request.output_format) != valid_formats.end();
    if (!valid_format) {
        error_message = valid_vid_output_formats_message();
        return false;
    }
    return true;
}
// Maps a video output format to the media type used for the HTTP response.
// Any format other than "webm"/"webp" (i.e. "avi") falls back to
// "video/x-msvideo".
std::string video_mime_type(const std::string& output_format) {
    if (output_format == "webp") {
        return "image/webp";
    }
    return output_format == "webm" ? "video/webm" : "video/x-msvideo";
}
// Reports whether the currently loaded model can serve the given mode.
// IMG_GEN/VID_GEN are delegated to the sd context capability queries; any
// other mode is assumed to always be available.
bool runtime_supports_generation_mode(const ServerRuntime& runtime, SDMode mode) {
    switch (mode) {
        case VID_GEN:
            return sd_ctx_supports_video_generation(runtime.sd_ctx);
        case IMG_GEN:
            return sd_ctx_supports_image_generation(runtime.sd_ctx);
        default:
            return true;
    }
}
// Produces the human-readable error returned when the loaded model cannot
// serve the requested generation mode.
std::string unsupported_generation_mode_error(SDMode mode) {
    const char* mode_name = "requested mode";
    if (mode == VID_GEN) {
        mode_name = "vid_gen";
    } else if (mode == IMG_GEN) {
        mode_name = "img_gen";
    }
    return std::string("loaded model does not support ") + mode_name;
}
ArgOptions SDSvrParams::get_options() { ArgOptions SDSvrParams::get_options() {
ArgOptions options; ArgOptions options;

View File

@ -58,13 +58,32 @@ struct ImgGenJobRequest {
} }
}; };
// Decoded /sdcpp/v1/vid_gen request: the generation parameters plus the
// options controlling how the resulting video is encoded for the response.
struct VidGenJobRequest {
    // Generation parameters, seeded from the server defaults and overlaid
    // with the client-supplied JSON fields.
    SDGenerationParams gen_params;
    // Target video container/codec; validated by assign_output_options()
    // against the formats this build supports.
    std::string output_format = "webm";
    // Encoder compression/quality setting; clamped to [0, 100] by
    // assign_output_options().
    int output_compression = 100;

    // Converts the stored parameters into the C-API struct consumed by the
    // video generation entry point.
    sd_vid_gen_params_t to_sd_vid_gen_params_t() {
        return gen_params.to_sd_vid_gen_params_t();
    }
};
std::string base64_encode(const std::vector<uint8_t>& bytes); std::string base64_encode(const std::vector<uint8_t>& bytes);
std::string normalize_output_format(std::string output_format); std::string normalize_output_format(std::string output_format);
std::vector<std::string> supported_img_output_formats(bool allow_webp = true);
std::vector<std::string> supported_vid_output_formats();
bool assign_output_options(ImgGenJobRequest& request, bool assign_output_options(ImgGenJobRequest& request,
std::string output_format, std::string output_format,
int output_compression, int output_compression,
bool allow_webp, bool allow_webp,
std::string& error_message); std::string& error_message);
bool assign_output_options(VidGenJobRequest& request,
std::string output_format,
int output_compression,
std::string& error_message);
std::string video_mime_type(const std::string& output_format);
bool runtime_supports_generation_mode(const ServerRuntime& runtime, SDMode mode);
std::string unsupported_generation_mode_error(SDMode mode);
void refresh_lora_cache(ServerRuntime& rt); void refresh_lora_cache(ServerRuntime& rt);
std::string get_lora_full_path(ServerRuntime& rt, const std::string& path); std::string get_lora_full_path(ServerRuntime& rt, const std::string& path);
int64_t unix_timestamp_now(); int64_t unix_timestamp_now();

View File

@ -348,6 +348,8 @@ SD_API void sd_set_progress_callback(sd_progress_cb_t cb, void* data);
SD_API void sd_set_preview_callback(sd_preview_cb_t cb, enum preview_t mode, int interval, bool denoised, bool noisy, void* data); SD_API void sd_set_preview_callback(sd_preview_cb_t cb, enum preview_t mode, int interval, bool denoised, bool noisy, void* data);
SD_API int32_t sd_get_num_physical_cores(); SD_API int32_t sd_get_num_physical_cores();
SD_API const char* sd_get_system_info(); SD_API const char* sd_get_system_info();
SD_API bool sd_ctx_supports_image_generation(const sd_ctx_t* sd_ctx);
SD_API bool sd_ctx_supports_video_generation(const sd_ctx_t* sd_ctx);
SD_API const char* sd_type_name(enum sd_type_t type); SD_API const char* sd_type_name(enum sd_type_t type);
SD_API enum sd_type_t str_to_sd_type(const char* str); SD_API enum sd_type_t str_to_sd_type(const char* str);

View File

@ -2390,6 +2390,14 @@ struct sd_ctx_t {
StableDiffusionGGML* sd = nullptr; StableDiffusionGGML* sd = nullptr;
}; };
// True for model versions that generate video rather than still images:
// Stable Video Diffusion (SVD) and the Wan family.
static bool sd_version_supports_video_generation(SDVersion version) {
    if (version == VERSION_SVD) {
        return true;
    }
    return sd_version_is_wan(version);
}

// Image and video capability are treated as mutually exclusive: every
// non-video model version is assumed to support image generation.
static bool sd_version_supports_image_generation(SDVersion version) {
    const bool is_video_model = sd_version_supports_video_generation(version);
    return !is_video_model;
}
sd_ctx_t* new_sd_ctx(const sd_ctx_params_t* sd_ctx_params) { sd_ctx_t* new_sd_ctx(const sd_ctx_params_t* sd_ctx_params) {
sd_ctx_t* sd_ctx = (sd_ctx_t*)malloc(sizeof(sd_ctx_t)); sd_ctx_t* sd_ctx = (sd_ctx_t*)malloc(sizeof(sd_ctx_t));
if (sd_ctx == nullptr) { if (sd_ctx == nullptr) {
@ -2419,6 +2427,20 @@ void free_sd_ctx(sd_ctx_t* sd_ctx) {
free(sd_ctx); free(sd_ctx);
} }
// Returns true when the loaded model's version can produce still images.
// A null context or a context without a loaded backend counts as
// "not supported".
SD_API bool sd_ctx_supports_image_generation(const sd_ctx_t* sd_ctx) {
    const bool has_model = (sd_ctx != nullptr) && (sd_ctx->sd != nullptr);
    return has_model && sd_version_supports_image_generation(sd_ctx->sd->version);
}
// Returns true when the loaded model's version can produce video.
// A null context or a context without a loaded backend counts as
// "not supported".
SD_API bool sd_ctx_supports_video_generation(const sd_ctx_t* sd_ctx) {
    const bool has_model = (sd_ctx != nullptr) && (sd_ctx->sd != nullptr);
    return has_model && sd_version_supports_video_generation(sd_ctx->sd->version);
}
enum sample_method_t sd_get_default_sample_method(const sd_ctx_t* sd_ctx) { enum sample_method_t sd_get_default_sample_method(const sd_ctx_t* sd_ctx) {
if (sd_ctx != nullptr && sd_ctx->sd != nullptr) { if (sd_ctx != nullptr && sd_ctx->sd != nullptr) {
if (sd_version_is_dit(sd_ctx->sd->version)) { if (sd_version_is_dit(sd_ctx->sd->version)) {