mirror of
https://github.com/leejet/stable-diffusion.cpp.git
synced 2026-05-08 08:18:51 +00:00
feat(server): implement vid_gen async API and mode-aware capabilities (#1437)
This commit is contained in:
parent
f3f69e2fbe
commit
4d626d24b2
@ -1589,10 +1589,18 @@ bool SDGenerationParams::from_json_str(
|
||||
LOG_ERROR("invalid init_image");
|
||||
return false;
|
||||
}
|
||||
if (!parse_image_json_field(j, "end_image", 3, width, height, end_image)) {
|
||||
LOG_ERROR("invalid end_image");
|
||||
return false;
|
||||
}
|
||||
if (!parse_image_array_json_field(j, "ref_images", 3, width, height, ref_images)) {
|
||||
LOG_ERROR("invalid ref_images");
|
||||
return false;
|
||||
}
|
||||
if (!parse_image_array_json_field(j, "control_frames", 3, width, height, control_frames)) {
|
||||
LOG_ERROR("invalid control_frames");
|
||||
return false;
|
||||
}
|
||||
if (!parse_image_json_field(j, "mask_image", 1, width, height, mask_image)) {
|
||||
LOG_ERROR("invalid mask_image");
|
||||
return false;
|
||||
|
||||
@ -95,6 +95,57 @@ using WebPMuxPtr = std::unique_ptr<WebPMux, WebPMuxDeleter>;
|
||||
using WebPAnimEncoderPtr = std::unique_ptr<WebPAnimEncoder, WebPAnimEncoderDeleter>;
|
||||
#endif
|
||||
|
||||
#ifdef SD_USE_WEBM
|
||||
class MemoryMkvWriter : public mkvmuxer::IMkvWriter {
|
||||
public:
|
||||
mkvmuxer::int32 Write(const void* buf, mkvmuxer::uint32 len) override {
|
||||
if (buf == nullptr && len > 0) {
|
||||
return -1;
|
||||
}
|
||||
const size_t end_pos = position_ + static_cast<size_t>(len);
|
||||
if (end_pos > data_.size()) {
|
||||
data_.resize(end_pos);
|
||||
}
|
||||
if (len > 0) {
|
||||
memcpy(data_.data() + position_, buf, len);
|
||||
}
|
||||
position_ = end_pos;
|
||||
return 0;
|
||||
}
|
||||
|
||||
mkvmuxer::int64 Position() const override {
|
||||
return static_cast<mkvmuxer::int64>(position_);
|
||||
}
|
||||
|
||||
mkvmuxer::int32 Position(mkvmuxer::int64 position) override {
|
||||
if (position < 0) {
|
||||
return -1;
|
||||
}
|
||||
const size_t target = static_cast<size_t>(position);
|
||||
if (target > data_.size()) {
|
||||
data_.resize(target);
|
||||
}
|
||||
position_ = target;
|
||||
return 0;
|
||||
}
|
||||
|
||||
bool Seekable() const override {
|
||||
return true;
|
||||
}
|
||||
|
||||
void ElementStartNotify(mkvmuxer::uint64, mkvmuxer::int64) override {
|
||||
}
|
||||
|
||||
const std::vector<uint8_t>& data() const {
|
||||
return data_;
|
||||
}
|
||||
|
||||
private:
|
||||
std::vector<uint8_t> data_;
|
||||
size_t position_ = 0;
|
||||
};
|
||||
#endif
|
||||
|
||||
bool read_binary_file_bytes(const char* path, std::vector<uint8_t>& data) {
|
||||
std::ifstream fin(fs::path(path), std::ios::binary);
|
||||
if (!fin) {
|
||||
@ -570,6 +621,32 @@ void write_u16_le(FILE* f, uint16_t val) {
|
||||
fwrite(&val, 2, 1, f);
|
||||
}
|
||||
|
||||
// Appends `val` to `data` as four bytes in little-endian order
// (least significant byte first).
void write_u32_le(std::vector<uint8_t>& data, uint32_t val) {
    for (int shift = 0; shift < 32; shift += 8) {
        data.push_back(static_cast<uint8_t>((val >> shift) & 0xFF));
    }
}
|
||||
|
||||
// Appends `val` to `data` as two bytes in little-endian order
// (low byte first, then high byte).
void write_u16_le(std::vector<uint8_t>& data, uint16_t val) {
    const uint8_t low_byte  = static_cast<uint8_t>(val & 0xFF);
    const uint8_t high_byte = static_cast<uint8_t>((val >> 8) & 0xFF);
    data.push_back(low_byte);
    data.push_back(high_byte);
}
|
||||
|
||||
// Overwrites the four bytes of `data` starting at `offset` with `val` encoded
// in little-endian order. The buffer is never resized: if the four-byte window
// does not lie entirely inside `data`, the call does nothing.
void patch_u32_le(std::vector<uint8_t>& data, size_t offset, uint32_t val) {
    // Compare against size() - 4 rather than computing offset + 4, which would
    // wrap around for offsets near SIZE_MAX and let an out-of-bounds write through.
    if (data.size() < 4 || offset > data.size() - 4) {
        return;
    }
    data[offset + 0] = static_cast<uint8_t>(val & 0xFF);
    data[offset + 1] = static_cast<uint8_t>((val >> 8) & 0xFF);
    data[offset + 2] = static_cast<uint8_t>((val >> 16) & 0xFF);
    data[offset + 3] = static_cast<uint8_t>((val >> 24) & 0xFF);
}
|
||||
|
||||
// Appends a four-character code to `data`. Exactly four bytes are always
// written so that chunk layout stays intact: characters are taken from
// `fourcc` up to its terminating NUL, and any remaining positions (including
// all four when `fourcc` is null) are filled with spaces. For a regular
// four-character tag the output is the tag itself.
void write_fourcc(std::vector<uint8_t>& data, const char* fourcc) {
    bool exhausted = (fourcc == nullptr);
    for (int i = 0; i < 4; ++i) {
        // Never read past the terminator of a short tag.
        if (!exhausted && fourcc[i] == '\0') {
            exhausted = true;
        }
        data.push_back(exhausted ? static_cast<uint8_t>(' ') : static_cast<uint8_t>(fourcc[i]));
    }
}
|
||||
|
||||
EncodedImageFormat encoded_image_format_from_path(const std::string& path) {
|
||||
std::string ext = fs::path(path).extension().string();
|
||||
std::transform(ext.begin(), ext.end(), ext.begin(), ::tolower);
|
||||
@ -699,95 +776,96 @@ uint8_t* load_image_from_memory(const char* image_bytes,
|
||||
return load_image_common(true, image_bytes, len, width, height, expected_width, expected_height, expected_channel);
|
||||
}
|
||||
|
||||
int create_mjpg_avi_from_sd_images(const char* filename, sd_image_t* images, int num_images, int fps, int quality) {
|
||||
std::vector<uint8_t> create_mjpg_avi_from_sd_images_to_vector(sd_image_t* images, int num_images, int fps, int quality) {
|
||||
if (num_images == 0) {
|
||||
fprintf(stderr, "Error: Image array is empty.\n");
|
||||
return -1;
|
||||
return {};
|
||||
}
|
||||
|
||||
FilePtr file(fopen(filename, "wb"));
|
||||
if (!file) {
|
||||
perror("Error opening file for writing");
|
||||
return -1;
|
||||
}
|
||||
FILE* f = file.get();
|
||||
|
||||
uint32_t width = images[0].width;
|
||||
uint32_t height = images[0].height;
|
||||
uint32_t channels = images[0].channel;
|
||||
if (channels != 3 && channels != 4) {
|
||||
fprintf(stderr, "Error: Unsupported channel count: %u\n", channels);
|
||||
return -1;
|
||||
return {};
|
||||
}
|
||||
|
||||
fwrite("RIFF", 4, 1, f);
|
||||
long riff_size_pos = ftell(f);
|
||||
write_u32_le(f, 0);
|
||||
fwrite("AVI ", 4, 1, f);
|
||||
// stb_image_write changes JPEG sampling behavior above quality 90.
|
||||
// MJPG AVI playback is more compatible when we keep the encoder on the
|
||||
// <= 90 path.
|
||||
const int mjpg_quality = std::clamp(quality, 1, 90);
|
||||
|
||||
fwrite("LIST", 4, 1, f);
|
||||
write_u32_le(f, 4 + 8 + 56 + 8 + 4 + 8 + 56 + 8 + 40);
|
||||
fwrite("hdrl", 4, 1, f);
|
||||
std::vector<uint8_t> avi_data;
|
||||
avi_data.reserve(static_cast<size_t>(num_images) * 1024);
|
||||
|
||||
fwrite("avih", 4, 1, f);
|
||||
write_u32_le(f, 56);
|
||||
write_u32_le(f, 1000000 / fps);
|
||||
write_u32_le(f, 0);
|
||||
write_u32_le(f, 0);
|
||||
write_u32_le(f, 0x110);
|
||||
write_u32_le(f, num_images);
|
||||
write_u32_le(f, 0);
|
||||
write_u32_le(f, 1);
|
||||
write_u32_le(f, width * height * 3);
|
||||
write_u32_le(f, width);
|
||||
write_u32_le(f, height);
|
||||
write_u32_le(f, 0);
|
||||
write_u32_le(f, 0);
|
||||
write_u32_le(f, 0);
|
||||
write_u32_le(f, 0);
|
||||
write_fourcc(avi_data, "RIFF");
|
||||
const size_t riff_size_pos = avi_data.size();
|
||||
write_u32_le(avi_data, 0);
|
||||
write_fourcc(avi_data, "AVI ");
|
||||
|
||||
fwrite("LIST", 4, 1, f);
|
||||
write_u32_le(f, 4 + 8 + 56 + 8 + 40);
|
||||
fwrite("strl", 4, 1, f);
|
||||
write_fourcc(avi_data, "LIST");
|
||||
write_u32_le(avi_data, 4 + 8 + 56 + 8 + 4 + 8 + 56 + 8 + 40);
|
||||
write_fourcc(avi_data, "hdrl");
|
||||
|
||||
fwrite("strh", 4, 1, f);
|
||||
write_u32_le(f, 56);
|
||||
fwrite("vids", 4, 1, f);
|
||||
fwrite("MJPG", 4, 1, f);
|
||||
write_u32_le(f, 0);
|
||||
write_u16_le(f, 0);
|
||||
write_u16_le(f, 0);
|
||||
write_u32_le(f, 0);
|
||||
write_u32_le(f, 1);
|
||||
write_u32_le(f, fps);
|
||||
write_u32_le(f, 0);
|
||||
write_u32_le(f, num_images);
|
||||
write_u32_le(f, width * height * 3);
|
||||
write_u32_le(f, (uint32_t)-1);
|
||||
write_u32_le(f, 0);
|
||||
write_u16_le(f, 0);
|
||||
write_u16_le(f, 0);
|
||||
write_u16_le(f, 0);
|
||||
write_u16_le(f, 0);
|
||||
write_fourcc(avi_data, "avih");
|
||||
write_u32_le(avi_data, 56);
|
||||
write_u32_le(avi_data, 1000000 / fps);
|
||||
write_u32_le(avi_data, 0);
|
||||
write_u32_le(avi_data, 0);
|
||||
write_u32_le(avi_data, 0x110);
|
||||
write_u32_le(avi_data, num_images);
|
||||
write_u32_le(avi_data, 0);
|
||||
write_u32_le(avi_data, 1);
|
||||
write_u32_le(avi_data, width * height * 3);
|
||||
write_u32_le(avi_data, width);
|
||||
write_u32_le(avi_data, height);
|
||||
write_u32_le(avi_data, 0);
|
||||
write_u32_le(avi_data, 0);
|
||||
write_u32_le(avi_data, 0);
|
||||
write_u32_le(avi_data, 0);
|
||||
|
||||
fwrite("strf", 4, 1, f);
|
||||
write_u32_le(f, 40);
|
||||
write_u32_le(f, 40);
|
||||
write_u32_le(f, width);
|
||||
write_u32_le(f, height);
|
||||
write_u16_le(f, 1);
|
||||
write_u16_le(f, 24);
|
||||
fwrite("MJPG", 4, 1, f);
|
||||
write_u32_le(f, width * height * 3);
|
||||
write_u32_le(f, 0);
|
||||
write_u32_le(f, 0);
|
||||
write_u32_le(f, 0);
|
||||
write_u32_le(f, 0);
|
||||
write_fourcc(avi_data, "LIST");
|
||||
write_u32_le(avi_data, 4 + 8 + 56 + 8 + 40);
|
||||
write_fourcc(avi_data, "strl");
|
||||
|
||||
fwrite("LIST", 4, 1, f);
|
||||
long movi_size_pos = ftell(f);
|
||||
write_u32_le(f, 0);
|
||||
fwrite("movi", 4, 1, f);
|
||||
write_fourcc(avi_data, "strh");
|
||||
write_u32_le(avi_data, 56);
|
||||
write_fourcc(avi_data, "vids");
|
||||
write_fourcc(avi_data, "MJPG");
|
||||
write_u32_le(avi_data, 0);
|
||||
write_u16_le(avi_data, 0);
|
||||
write_u16_le(avi_data, 0);
|
||||
write_u32_le(avi_data, 0);
|
||||
write_u32_le(avi_data, 1);
|
||||
write_u32_le(avi_data, fps);
|
||||
write_u32_le(avi_data, 0);
|
||||
write_u32_le(avi_data, num_images);
|
||||
write_u32_le(avi_data, width * height * 3);
|
||||
write_u32_le(avi_data, static_cast<uint32_t>(-1));
|
||||
write_u32_le(avi_data, 0);
|
||||
write_u16_le(avi_data, 0);
|
||||
write_u16_le(avi_data, 0);
|
||||
write_u16_le(avi_data, 0);
|
||||
write_u16_le(avi_data, 0);
|
||||
|
||||
write_fourcc(avi_data, "strf");
|
||||
write_u32_le(avi_data, 40);
|
||||
write_u32_le(avi_data, 40);
|
||||
write_u32_le(avi_data, width);
|
||||
write_u32_le(avi_data, height);
|
||||
write_u16_le(avi_data, 1);
|
||||
write_u16_le(avi_data, 24);
|
||||
write_fourcc(avi_data, "MJPG");
|
||||
write_u32_le(avi_data, width * height * 3);
|
||||
write_u32_le(avi_data, 0);
|
||||
write_u32_le(avi_data, 0);
|
||||
write_u32_le(avi_data, 0);
|
||||
write_u32_le(avi_data, 0);
|
||||
|
||||
write_fourcc(avi_data, "LIST");
|
||||
const size_t movi_size_pos = avi_data.size();
|
||||
write_u32_le(avi_data, 0);
|
||||
write_fourcc(avi_data, "movi");
|
||||
|
||||
std::vector<avi_index_entry> index(static_cast<size_t>(num_images));
|
||||
std::vector<uint8_t> jpeg_data;
|
||||
@ -801,55 +879,61 @@ int create_mjpg_avi_from_sd_images(const char* filename, sd_image_t* images, int
|
||||
buffer->insert(buffer->end(), src, src + size);
|
||||
};
|
||||
|
||||
if (!stbi_write_jpg_to_func(write_to_buf, &jpeg_data, images[i].width, images[i].height, channels, images[i].data, quality)) {
|
||||
if (!stbi_write_jpg_to_func(write_to_buf, &jpeg_data, images[i].width, images[i].height, channels, images[i].data, mjpg_quality)) {
|
||||
fprintf(stderr, "Error: Failed to encode JPEG frame.\n");
|
||||
return -1;
|
||||
return {};
|
||||
}
|
||||
|
||||
fwrite("00dc", 4, 1, f);
|
||||
write_u32_le(f, (uint32_t)jpeg_data.size());
|
||||
index[i].offset = ftell(f) - 8;
|
||||
index[i].offset = static_cast<uint32_t>(avi_data.size());
|
||||
write_fourcc(avi_data, "00dc");
|
||||
write_u32_le(avi_data, static_cast<uint32_t>(jpeg_data.size()));
|
||||
index[i].size = (uint32_t)jpeg_data.size();
|
||||
fwrite(jpeg_data.data(), 1, jpeg_data.size(), f);
|
||||
avi_data.insert(avi_data.end(), jpeg_data.begin(), jpeg_data.end());
|
||||
|
||||
if (jpeg_data.size() % 2) {
|
||||
fputc(0, f);
|
||||
avi_data.push_back(0);
|
||||
}
|
||||
}
|
||||
|
||||
long cur_pos = ftell(f);
|
||||
long movi_size = cur_pos - movi_size_pos - 4;
|
||||
fseek(f, movi_size_pos, SEEK_SET);
|
||||
write_u32_le(f, movi_size);
|
||||
fseek(f, cur_pos, SEEK_SET);
|
||||
const size_t movi_size = avi_data.size() - movi_size_pos - 4;
|
||||
patch_u32_le(avi_data, movi_size_pos, static_cast<uint32_t>(movi_size));
|
||||
|
||||
fwrite("idx1", 4, 1, f);
|
||||
write_u32_le(f, num_images * 16);
|
||||
write_fourcc(avi_data, "idx1");
|
||||
write_u32_le(avi_data, num_images * 16);
|
||||
for (int i = 0; i < num_images; i++) {
|
||||
fwrite("00dc", 4, 1, f);
|
||||
write_u32_le(f, 0x10);
|
||||
write_u32_le(f, index[i].offset);
|
||||
write_u32_le(f, index[i].size);
|
||||
write_fourcc(avi_data, "00dc");
|
||||
write_u32_le(avi_data, 0x10);
|
||||
write_u32_le(avi_data, index[i].offset);
|
||||
write_u32_le(avi_data, index[i].size);
|
||||
}
|
||||
|
||||
cur_pos = ftell(f);
|
||||
long file_size = cur_pos - riff_size_pos - 4;
|
||||
fseek(f, riff_size_pos, SEEK_SET);
|
||||
write_u32_le(f, file_size);
|
||||
fseek(f, cur_pos, SEEK_SET);
|
||||
const size_t file_size = avi_data.size() - riff_size_pos - 4;
|
||||
patch_u32_le(avi_data, riff_size_pos, static_cast<uint32_t>(file_size));
|
||||
|
||||
return avi_data;
|
||||
}
|
||||
|
||||
int create_mjpg_avi_from_sd_images(const char* filename, sd_image_t* images, int num_images, int fps, int quality) {
|
||||
std::vector<uint8_t> avi_data = create_mjpg_avi_from_sd_images_to_vector(images, num_images, fps, quality);
|
||||
if (avi_data.empty()) {
|
||||
return -1;
|
||||
}
|
||||
if (!write_binary_file_bytes(filename, avi_data)) {
|
||||
perror("Error opening file for writing");
|
||||
return -1;
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
#ifdef SD_USE_WEBP
|
||||
int create_animated_webp_from_sd_images(const char* filename, sd_image_t* images, int num_images, int fps, int quality) {
|
||||
std::vector<uint8_t> create_animated_webp_from_sd_images_to_vector(sd_image_t* images, int num_images, int fps, int quality) {
|
||||
if (num_images == 0) {
|
||||
fprintf(stderr, "Error: Image array is empty.\n");
|
||||
return -1;
|
||||
return {};
|
||||
}
|
||||
if (fps <= 0) {
|
||||
fprintf(stderr, "Error: FPS must be positive.\n");
|
||||
return -1;
|
||||
return {};
|
||||
}
|
||||
|
||||
const int width = static_cast<int>(images[0].width);
|
||||
@ -857,14 +941,14 @@ int create_animated_webp_from_sd_images(const char* filename, sd_image_t* images
|
||||
const int channels = static_cast<int>(images[0].channel);
|
||||
if (channels != 1 && channels != 3 && channels != 4) {
|
||||
fprintf(stderr, "Error: Unsupported channel count: %d\n", channels);
|
||||
return -1;
|
||||
return {};
|
||||
}
|
||||
|
||||
WebPAnimEncoderOptions anim_options;
|
||||
WebPConfig config;
|
||||
if (!WebPAnimEncoderOptionsInit(&anim_options) || !WebPConfigInit(&config)) {
|
||||
fprintf(stderr, "Error: Failed to initialize WebP animation encoder.\n");
|
||||
return -1;
|
||||
return {};
|
||||
}
|
||||
|
||||
config.quality = static_cast<float>(quality);
|
||||
@ -875,13 +959,13 @@ int create_animated_webp_from_sd_images(const char* filename, sd_image_t* images
|
||||
}
|
||||
if (!WebPValidateConfig(&config)) {
|
||||
fprintf(stderr, "Error: Invalid WebP encoder configuration.\n");
|
||||
return -1;
|
||||
return {};
|
||||
}
|
||||
|
||||
WebPAnimEncoderPtr enc(WebPAnimEncoderNew(width, height, &anim_options));
|
||||
if (enc == nullptr) {
|
||||
fprintf(stderr, "Error: Could not create WebPAnimEncoder object.\n");
|
||||
return -1;
|
||||
return {};
|
||||
}
|
||||
|
||||
const int frame_duration_ms = std::max(1, static_cast<int>(std::lround(1000.0 / static_cast<double>(fps))));
|
||||
@ -891,13 +975,13 @@ int create_animated_webp_from_sd_images(const char* filename, sd_image_t* images
|
||||
const sd_image_t& image = images[i];
|
||||
if (static_cast<int>(image.width) != width || static_cast<int>(image.height) != height) {
|
||||
fprintf(stderr, "Error: Frame dimensions do not match.\n");
|
||||
return -1;
|
||||
return {};
|
||||
}
|
||||
|
||||
WebPPictureGuard picture;
|
||||
if (!picture.initialized) {
|
||||
fprintf(stderr, "Error: Failed to initialize WebPPicture.\n");
|
||||
return -1;
|
||||
return {};
|
||||
}
|
||||
picture.picture.use_argb = 1;
|
||||
picture.picture.width = width;
|
||||
@ -921,12 +1005,12 @@ int create_animated_webp_from_sd_images(const char* filename, sd_image_t* images
|
||||
|
||||
if (!picture_ok) {
|
||||
fprintf(stderr, "Error: Failed to import frame into WebPPicture.\n");
|
||||
return -1;
|
||||
return {};
|
||||
}
|
||||
|
||||
if (!WebPAnimEncoderAdd(enc.get(), &picture.picture, timestamp_ms, &config)) {
|
||||
fprintf(stderr, "Error: Failed to add frame to animated WebP: %s\n", WebPAnimEncoderGetError(enc.get()));
|
||||
return -1;
|
||||
return {};
|
||||
}
|
||||
|
||||
timestamp_ms += frame_duration_ms;
|
||||
@ -934,52 +1018,50 @@ int create_animated_webp_from_sd_images(const char* filename, sd_image_t* images
|
||||
|
||||
if (!WebPAnimEncoderAdd(enc.get(), nullptr, timestamp_ms, nullptr)) {
|
||||
fprintf(stderr, "Error: Failed to finalize animated WebP frames: %s\n", WebPAnimEncoderGetError(enc.get()));
|
||||
return -1;
|
||||
return {};
|
||||
}
|
||||
|
||||
WebPDataGuard webp_data;
|
||||
if (!WebPAnimEncoderAssemble(enc.get(), &webp_data.data)) {
|
||||
fprintf(stderr, "Error: Failed to assemble animated WebP: %s\n", WebPAnimEncoderGetError(enc.get()));
|
||||
return -1;
|
||||
return {};
|
||||
}
|
||||
|
||||
FilePtr f(fopen(filename, "wb"));
|
||||
if (!f) {
|
||||
return std::vector<uint8_t>(webp_data.data.bytes, webp_data.data.bytes + webp_data.data.size);
|
||||
}
|
||||
|
||||
int create_animated_webp_from_sd_images(const char* filename, sd_image_t* images, int num_images, int fps, int quality) {
|
||||
std::vector<uint8_t> webp_data = create_animated_webp_from_sd_images_to_vector(images, num_images, fps, quality);
|
||||
if (webp_data.empty()) {
|
||||
return -1;
|
||||
}
|
||||
if (!write_binary_file_bytes(filename, webp_data)) {
|
||||
perror("Error opening file for writing");
|
||||
return -1;
|
||||
}
|
||||
if (webp_data.data.size > 0 && fwrite(webp_data.data.bytes, 1, webp_data.data.size, f.get()) != webp_data.data.size) {
|
||||
fprintf(stderr, "Error: Failed to write animated WebP file.\n");
|
||||
return -1;
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
#endif
|
||||
|
||||
#ifdef SD_USE_WEBM
|
||||
int create_webm_from_sd_images(const char* filename, sd_image_t* images, int num_images, int fps, int quality) {
|
||||
std::vector<uint8_t> create_webm_from_sd_images_to_vector(sd_image_t* images, int num_images, int fps, int quality) {
|
||||
if (num_images == 0) {
|
||||
fprintf(stderr, "Error: Image array is empty.\n");
|
||||
return -1;
|
||||
return {};
|
||||
}
|
||||
if (fps <= 0) {
|
||||
fprintf(stderr, "Error: FPS must be positive.\n");
|
||||
return -1;
|
||||
return {};
|
||||
}
|
||||
|
||||
const int width = static_cast<int>(images[0].width);
|
||||
const int height = static_cast<int>(images[0].height);
|
||||
if (width <= 0 || height <= 0) {
|
||||
fprintf(stderr, "Error: Invalid frame dimensions.\n");
|
||||
return -1;
|
||||
return {};
|
||||
}
|
||||
|
||||
mkvmuxer::MkvWriter writer;
|
||||
if (!writer.Open(filename)) {
|
||||
fprintf(stderr, "Error: Could not open WebM file for writing.\n");
|
||||
return -1;
|
||||
}
|
||||
MemoryMkvWriter writer;
|
||||
|
||||
const int ret = [&]() -> int {
|
||||
mkvmuxer::Segment segment;
|
||||
@ -1045,30 +1127,63 @@ int create_webm_from_sd_images(const char* filename, sd_image_t* images, int num
|
||||
}
|
||||
return 0;
|
||||
}();
|
||||
writer.Close();
|
||||
return ret;
|
||||
if (ret != 0) {
|
||||
return {};
|
||||
}
|
||||
return writer.data();
|
||||
}
|
||||
|
||||
int create_webm_from_sd_images(const char* filename, sd_image_t* images, int num_images, int fps, int quality) {
|
||||
std::vector<uint8_t> webm_data = create_webm_from_sd_images_to_vector(images, num_images, fps, quality);
|
||||
if (webm_data.empty()) {
|
||||
return -1;
|
||||
}
|
||||
if (!write_binary_file_bytes(filename, webm_data)) {
|
||||
perror("Error opening file for writing");
|
||||
return -1;
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
#endif
|
||||
|
||||
std::vector<uint8_t> create_video_from_sd_images_to_vector(const std::string& output_format,
|
||||
sd_image_t* images,
|
||||
int num_images,
|
||||
int fps,
|
||||
int quality) {
|
||||
std::string format = output_format;
|
||||
std::transform(format.begin(), format.end(), format.begin(),
|
||||
[](unsigned char c) { return static_cast<char>(tolower(c)); });
|
||||
if (!format.empty() && format[0] == '.') {
|
||||
format.erase(format.begin());
|
||||
}
|
||||
|
||||
#ifdef SD_USE_WEBM
|
||||
if (format == "webm") {
|
||||
return create_webm_from_sd_images_to_vector(images, num_images, fps, quality);
|
||||
}
|
||||
#endif
|
||||
|
||||
#ifdef SD_USE_WEBP
|
||||
if (format == "webp") {
|
||||
return create_animated_webp_from_sd_images_to_vector(images, num_images, fps, quality);
|
||||
}
|
||||
#endif
|
||||
|
||||
return create_mjpg_avi_from_sd_images_to_vector(images, num_images, fps, quality);
|
||||
}
|
||||
|
||||
int create_video_from_sd_images(const char* filename, sd_image_t* images, int num_images, int fps, int quality) {
|
||||
std::string path = filename ? filename : "";
|
||||
auto pos = path.find_last_of('.');
|
||||
std::string ext = pos == std::string::npos ? "" : path.substr(pos);
|
||||
for (char& ch : ext) {
|
||||
ch = static_cast<char>(tolower(static_cast<unsigned char>(ch)));
|
||||
std::vector<uint8_t> video_data = create_video_from_sd_images_to_vector(ext, images, num_images, fps, quality);
|
||||
if (video_data.empty()) {
|
||||
return -1;
|
||||
}
|
||||
|
||||
#ifdef SD_USE_WEBM
|
||||
if (ext == ".webm") {
|
||||
return create_webm_from_sd_images(filename, images, num_images, fps, quality);
|
||||
if (!write_binary_file_bytes(filename, video_data)) {
|
||||
perror("Error opening file for writing");
|
||||
return -1;
|
||||
}
|
||||
#endif
|
||||
|
||||
#ifdef SD_USE_WEBP
|
||||
if (ext == ".webp") {
|
||||
return create_animated_webp_from_sd_images(filename, images, num_images, fps, quality);
|
||||
}
|
||||
#endif
|
||||
|
||||
return create_mjpg_avi_from_sd_images(filename, images, num_images, fps, quality);
|
||||
return 0;
|
||||
}
|
||||
|
||||
@ -58,6 +58,10 @@ int create_mjpg_avi_from_sd_images(const char* filename,
|
||||
int num_images,
|
||||
int fps,
|
||||
int quality = 90);
|
||||
std::vector<uint8_t> create_mjpg_avi_from_sd_images_to_vector(sd_image_t* images,
|
||||
int num_images,
|
||||
int fps,
|
||||
int quality = 90);
|
||||
|
||||
#ifdef SD_USE_WEBP
|
||||
int create_animated_webp_from_sd_images(const char* filename,
|
||||
@ -65,6 +69,10 @@ int create_animated_webp_from_sd_images(const char* filename,
|
||||
int num_images,
|
||||
int fps,
|
||||
int quality = 90);
|
||||
std::vector<uint8_t> create_animated_webp_from_sd_images_to_vector(sd_image_t* images,
|
||||
int num_images,
|
||||
int fps,
|
||||
int quality = 90);
|
||||
#endif
|
||||
|
||||
#ifdef SD_USE_WEBM
|
||||
@ -73,6 +81,10 @@ int create_webm_from_sd_images(const char* filename,
|
||||
int num_images,
|
||||
int fps,
|
||||
int quality = 90);
|
||||
std::vector<uint8_t> create_webm_from_sd_images_to_vector(sd_image_t* images,
|
||||
int num_images,
|
||||
int fps,
|
||||
int quality = 90);
|
||||
#endif
|
||||
|
||||
int create_video_from_sd_images(const char* filename,
|
||||
@ -80,5 +92,10 @@ int create_video_from_sd_images(const char* filename,
|
||||
int num_images,
|
||||
int fps,
|
||||
int quality = 90);
|
||||
std::vector<uint8_t> create_video_from_sd_images_to_vector(const std::string& output_format,
|
||||
sd_image_t* images,
|
||||
int num_images,
|
||||
int fps,
|
||||
int quality = 90);
|
||||
|
||||
#endif // __MEDIA_IO_H__
|
||||
|
||||
@ -9,7 +9,7 @@ The server currently exposes three API families:
|
||||
- `sdcpp API` under `/sdcpp/v1/...`
|
||||
|
||||
The `sdcpp API` is the native API surface.
|
||||
Its request schema is also the canonical schema for `sd_cpp_extra_args`.
|
||||
Its request schema is the same schema used by `sd_cpp_extra_args`.
|
||||
|
||||
Global LoRA rule:
|
||||
|
||||
@ -55,8 +55,6 @@ Current endpoints include:
|
||||
- `POST /sdcpp/v1/jobs/{id}/cancel`
|
||||
- `POST /sdcpp/v1/vid_gen`
|
||||
|
||||
`POST /sdcpp/v1/vid_gen` is currently exposed but returns `501 Not Implemented`.
|
||||
|
||||
## `sd_cpp_extra_args`
|
||||
|
||||
`sd_cpp_extra_args` is an extension mechanism for the compatibility APIs.
|
||||
@ -79,12 +77,12 @@ Behavior:
|
||||
- The JSON block is parsed using the same field rules as the `sdcpp API`.
|
||||
- The block is removed from the final prompt before generation.
|
||||
|
||||
Intended use:
|
||||
Supported use:
|
||||
|
||||
- extend `OpenAI API` requests with native `stable-diffusion.cpp` controls
|
||||
- extend `sdapi` requests with native `stable-diffusion.cpp` controls
|
||||
|
||||
Not intended use:
|
||||
Unsupported use:
|
||||
|
||||
- do not use `sd_cpp_extra_args` with `/sdcpp/v1/*`
|
||||
|
||||
@ -372,20 +370,25 @@ Field types:
|
||||
|
||||
Returns frontend-friendly capability metadata.
|
||||
|
||||
Typical contents:
|
||||
The mode-aware fields are the primary interface. The top-level compatibility fields are deprecated mirrors kept for older clients.
|
||||
|
||||
| Field | Type |
|
||||
| --- | --- |
|
||||
| `model` | `object` |
|
||||
| `defaults` | `object` |
|
||||
| `loras` | `array<object>` |
|
||||
| `samplers` | `array<string>` |
|
||||
| `schedulers` | `array<string>` |
|
||||
| `output_formats` | `array<string>` |
|
||||
| `limits` | `object` |
|
||||
| `features` | `object` |
|
||||
Top-level fields:
|
||||
|
||||
Nested fields currently returned:
|
||||
| Field | Type | Notes |
|
||||
| --- | --- | --- |
|
||||
| `model` | `object` | Loaded model metadata |
|
||||
| `current_mode` | `string` | The native generation mode whose values the deprecated top-level `defaults`, `output_formats`, and `features` fields mirror |
|
||||
| `supported_modes` | `array<string>` | Supported native modes such as `img_gen` or `vid_gen` |
|
||||
| `defaults` | `object` | Deprecated compatibility mirror of `defaults_by_mode[current_mode]` |
|
||||
| `output_formats` | `array<string>` | Deprecated compatibility mirror of `output_formats_by_mode[current_mode]` |
|
||||
| `features` | `object` | Deprecated compatibility mirror of `features_by_mode[current_mode]` |
|
||||
| `defaults_by_mode` | `object` | Explicit defaults for each supported mode |
|
||||
| `output_formats_by_mode` | `object` | Explicit output formats for each supported mode |
|
||||
| `features_by_mode` | `object` | Explicit feature flags for each supported mode |
|
||||
| `samplers` | `array<string>` | Available sampling methods |
|
||||
| `schedulers` | `array<string>` | Available schedulers |
|
||||
| `loras` | `array<object>` | Available LoRA entries |
|
||||
| `limits` | `object` | Shared queue and size limits |
|
||||
|
||||
`model`
|
||||
|
||||
@ -395,50 +398,24 @@ Nested fields currently returned:
|
||||
| `model.stem` | `string` |
|
||||
| `model.path` | `string` |
|
||||
|
||||
`defaults`
|
||||
Compatibility rules:
|
||||
|
||||
- `defaults`, `output_formats`, and `features` are deprecated compatibility mirrors
|
||||
- those three top-level fields always mirror `current_mode`
|
||||
- `supported_modes`, `defaults_by_mode`, `output_formats_by_mode`, and `features_by_mode` are the mode-aware fields
|
||||
|
||||
Mode-aware objects:
|
||||
|
||||
| Field | Type |
|
||||
| --- | --- |
|
||||
| `defaults.prompt` | `string` |
|
||||
| `defaults.negative_prompt` | `string` |
|
||||
| `defaults.clip_skip` | `integer` |
|
||||
| `defaults.width` | `integer` |
|
||||
| `defaults.height` | `integer` |
|
||||
| `defaults.strength` | `number` |
|
||||
| `defaults.seed` | `integer` |
|
||||
| `defaults.batch_count` | `integer` |
|
||||
| `defaults.auto_resize_ref_image` | `boolean` |
|
||||
| `defaults.increase_ref_index` | `boolean` |
|
||||
| `defaults.control_strength` | `number` |
|
||||
| `defaults.sample_params` | `object` |
|
||||
| `defaults.sample_params.scheduler` | `string` |
|
||||
| `defaults.sample_params.sample_method` | `string` |
|
||||
| `defaults.sample_params.sample_steps` | `integer` |
|
||||
| `defaults.sample_params.eta` | `number \| null` |
|
||||
| `defaults.sample_params.shifted_timestep` | `integer` |
|
||||
| `defaults.sample_params.flow_shift` | `number \| null` |
|
||||
| `defaults.sample_params.guidance` | `object` |
|
||||
| `defaults.sample_params.guidance.txt_cfg` | `number` |
|
||||
| `defaults.sample_params.guidance.img_cfg` | `number \| null` |
|
||||
| `defaults.sample_params.guidance.distilled_guidance` | `number` |
|
||||
| `defaults.sample_params.guidance.slg` | `object` |
|
||||
| `defaults.sample_params.guidance.slg.layers` | `array<integer>` |
|
||||
| `defaults.sample_params.guidance.slg.layer_start` | `number` |
|
||||
| `defaults.sample_params.guidance.slg.layer_end` | `number` |
|
||||
| `defaults.sample_params.guidance.slg.scale` | `number` |
|
||||
| `defaults.vae_tiling_params` | `object` |
|
||||
| `defaults.vae_tiling_params.enabled` | `boolean` |
|
||||
| `defaults.vae_tiling_params.tile_size_x` | `integer` |
|
||||
| `defaults.vae_tiling_params.tile_size_y` | `integer` |
|
||||
| `defaults.vae_tiling_params.target_overlap` | `number` |
|
||||
| `defaults.vae_tiling_params.rel_size_x` | `number` |
|
||||
| `defaults.vae_tiling_params.rel_size_y` | `number` |
|
||||
| `defaults.cache_mode` | `string` |
|
||||
| `defaults.cache_option` | `string` |
|
||||
| `defaults.scm_mask` | `string` |
|
||||
| `defaults.scm_policy_dynamic` | `boolean` |
|
||||
| `defaults.output_format` | `string` |
|
||||
| `defaults.output_compression` | `integer` |
|
||||
| `defaults_by_mode.img_gen` | `object` |
|
||||
| `defaults_by_mode.vid_gen` | `object` |
|
||||
| `output_formats_by_mode.img_gen` | `array<string>` |
|
||||
| `output_formats_by_mode.vid_gen` | `array<string>` |
|
||||
| `features_by_mode.img_gen` | `object` |
|
||||
| `features_by_mode.vid_gen` | `object` |
|
||||
|
||||
Shared nested fields:
|
||||
|
||||
`loras`
|
||||
|
||||
@ -458,19 +435,100 @@ Nested fields currently returned:
|
||||
| `limits.max_batch_count` | `integer` |
|
||||
| `limits.max_queue_size` | `integer` |
|
||||
|
||||
`features`
|
||||
Shared default fields used by both `img_gen` and `vid_gen`:
|
||||
|
||||
| Field | Type |
|
||||
| --- | --- |
|
||||
| `features.init_image` | `boolean` |
|
||||
| `features.mask_image` | `boolean` |
|
||||
| `features.control_image` | `boolean` |
|
||||
| `features.ref_images` | `boolean` |
|
||||
| `features.lora` | `boolean` |
|
||||
| `features.vae_tiling` | `boolean` |
|
||||
| `features.cache` | `boolean` |
|
||||
| `features.cancel_queued` | `boolean` |
|
||||
| `features.cancel_generating` | `boolean` |
|
||||
| `prompt` | `string` |
|
||||
| `negative_prompt` | `string` |
|
||||
| `clip_skip` | `integer` |
|
||||
| `width` | `integer` |
|
||||
| `height` | `integer` |
|
||||
| `strength` | `number` |
|
||||
| `seed` | `integer` |
|
||||
| `sample_params` | `object` |
|
||||
| `sample_params.scheduler` | `string` |
|
||||
| `sample_params.sample_method` | `string` |
|
||||
| `sample_params.sample_steps` | `integer` |
|
||||
| `sample_params.eta` | `number \| null` |
|
||||
| `sample_params.shifted_timestep` | `integer` |
|
||||
| `sample_params.flow_shift` | `number \| null` |
|
||||
| `sample_params.guidance.txt_cfg` | `number` |
|
||||
| `sample_params.guidance.img_cfg` | `number \| null` |
|
||||
| `sample_params.guidance.distilled_guidance` | `number` |
|
||||
| `sample_params.guidance.slg.layers` | `array<integer>` |
|
||||
| `sample_params.guidance.slg.layer_start` | `number` |
|
||||
| `sample_params.guidance.slg.layer_end` | `number` |
|
||||
| `sample_params.guidance.slg.scale` | `number` |
|
||||
| `vae_tiling_params` | `object` |
|
||||
| `vae_tiling_params.enabled` | `boolean` |
|
||||
| `vae_tiling_params.tile_size_x` | `integer` |
|
||||
| `vae_tiling_params.tile_size_y` | `integer` |
|
||||
| `vae_tiling_params.target_overlap` | `number` |
|
||||
| `vae_tiling_params.rel_size_x` | `number` |
|
||||
| `vae_tiling_params.rel_size_y` | `number` |
|
||||
| `cache_mode` | `string` |
|
||||
| `cache_option` | `string` |
|
||||
| `scm_mask` | `string` |
|
||||
| `scm_policy_dynamic` | `boolean` |
|
||||
| `output_format` | `string` |
|
||||
| `output_compression` | `integer` |
|
||||
|
||||
`img_gen`-specific default fields:
|
||||
|
||||
| Field | Type |
|
||||
| --- | --- |
|
||||
| `batch_count` | `integer` |
|
||||
| `auto_resize_ref_image` | `boolean` |
|
||||
| `increase_ref_index` | `boolean` |
|
||||
| `control_strength` | `number` |
|
||||
|
||||
`vid_gen`-specific default fields:
|
||||
|
||||
| Field | Type |
|
||||
| --- | --- |
|
||||
| `video_frames` | `integer` |
|
||||
| `fps` | `integer` |
|
||||
| `moe_boundary` | `number` |
|
||||
| `vace_strength` | `number` |
|
||||
| `high_noise_sample_params` | `object` |
|
||||
| `high_noise_sample_params.scheduler` | `string` |
|
||||
| `high_noise_sample_params.sample_method` | `string` |
|
||||
| `high_noise_sample_params.sample_steps` | `integer` |
|
||||
| `high_noise_sample_params.eta` | `number \| null` |
|
||||
| `high_noise_sample_params.shifted_timestep` | `integer` |
|
||||
| `high_noise_sample_params.flow_shift` | `number \| null` |
|
||||
| `high_noise_sample_params.guidance.txt_cfg` | `number` |
|
||||
| `high_noise_sample_params.guidance.img_cfg` | `number \| null` |
|
||||
| `high_noise_sample_params.guidance.distilled_guidance` | `number` |
|
||||
| `high_noise_sample_params.guidance.slg.layers` | `array<integer>` |
|
||||
| `high_noise_sample_params.guidance.slg.layer_start` | `number` |
|
||||
| `high_noise_sample_params.guidance.slg.layer_end` | `number` |
|
||||
| `high_noise_sample_params.guidance.slg.scale` | `number` |
|
||||
|
||||
Fields returned in `features_by_mode.img_gen`:
|
||||
|
||||
- `init_image`
|
||||
- `mask_image`
|
||||
- `control_image`
|
||||
- `ref_images`
|
||||
- `lora`
|
||||
- `vae_tiling`
|
||||
- `cache`
|
||||
- `cancel_queued`
|
||||
- `cancel_generating`
|
||||
|
||||
Fields returned in `features_by_mode.vid_gen`:
|
||||
|
||||
- `init_image`
|
||||
- `end_image`
|
||||
- `control_frames`
|
||||
- `high_noise_sample_params`
|
||||
- `lora`
|
||||
- `vae_tiling`
|
||||
- `cache`
|
||||
- `cancel_queued`
|
||||
- `cancel_generating`
|
||||
|
||||
#### `POST /sdcpp/v1/img_gen`
|
||||
|
||||
@ -521,9 +579,7 @@ Typical status codes:
|
||||
- `409 Conflict`
|
||||
- `410 Gone`
|
||||
|
||||
### Canonical Request Schema
|
||||
|
||||
The `sdcpp API` request body is the canonical native schema.
|
||||
### Request Body
|
||||
|
||||
Example:
|
||||
|
||||
@ -612,7 +668,7 @@ Channel expectations:
|
||||
If omitted or null:
|
||||
|
||||
- single-image fields map to an empty `sd_image_t`
|
||||
- array fields map to `nullptr + count = 0`
|
||||
- array fields map to an empty C-style array, represented as `pointer = nullptr` and `count = 0`
|
||||
|
||||
### Field Mapping Summary
|
||||
|
||||
@ -686,11 +742,11 @@ HTTP-only output fields:
|
||||
| `output_format` | `string` |
|
||||
| `output_compression` | `integer` |
|
||||
|
||||
### Optional Field Semantics
|
||||
### Optional Field Handling
|
||||
|
||||
Clients should preserve unset semantics for optional sampling fields.
|
||||
Optional sampling fields may be omitted.
|
||||
|
||||
If a user has not explicitly provided one of these fields, the client should omit it instead of injecting a guessed fallback:
|
||||
When omitted, backend defaults apply to these fields:
|
||||
|
||||
- `sample_params.scheduler`
|
||||
- `sample_params.sample_method`
|
||||
@ -766,29 +822,394 @@ Example cancelled job:
|
||||
}
|
||||
```
|
||||
|
||||
### Validation and Retention
|
||||
### Submission Errors
|
||||
|
||||
Recommended behavior:
|
||||
`POST /sdcpp/v1/img_gen` may return:
|
||||
|
||||
- malformed JSON returns `400`
|
||||
- invalid image payloads return `400`
|
||||
- invalid parameter structure returns `400`
|
||||
- queue full returns `429` or `503`
|
||||
- accepted runtime failures transition the job to `failed`
|
||||
- unsupported in-progress cancellation may return `409`
|
||||
- `202 Accepted` when the job is created
|
||||
- `400 Bad Request` for an empty body, unsupported model mode, invalid JSON, or invalid generation parameters
|
||||
- `429 Too Many Requests` when the job queue is full
|
||||
- `500 Internal Server Error` for unexpected server exceptions during submission
|
||||
|
||||
Recommended retention controls:
|
||||
### `vid_gen`
|
||||
|
||||
- pending job limit
|
||||
- completed job TTL
|
||||
- failed job TTL
|
||||
The following section documents the native async contract for video generation.
|
||||
|
||||
### Future `vid_gen`
|
||||
#### `POST /sdcpp/v1/vid_gen`
|
||||
|
||||
Future `vid_gen` should reuse the same async job model:
|
||||
Submits an async video generation job.
|
||||
|
||||
- `POST /sdcpp/v1/vid_gen`
|
||||
- `GET /sdcpp/v1/jobs/{id}`
|
||||
- `POST /sdcpp/v1/jobs/{id}/cancel`
|
||||
Successful submission returns `202 Accepted`.
|
||||
|
||||
Its request body should mirror `sd_vid_gen_params_t` in the same way that `img_gen` mirrors `sd_img_gen_params_t`.
|
||||
Example response:
|
||||
|
||||
```json
|
||||
{
|
||||
"id": "job_01HTXYZVID",
|
||||
"kind": "vid_gen",
|
||||
"status": "queued",
|
||||
"created": 1775401200,
|
||||
"poll_url": "/sdcpp/v1/jobs/job_01HTXYZVID"
|
||||
}
|
||||
```
|
||||
|
||||
Response fields:
|
||||
|
||||
| Field | Type |
|
||||
| --- | --- |
|
||||
| `id` | `string` |
|
||||
| `kind` | `string` |
|
||||
| `status` | `string` |
|
||||
| `created` | `integer` |
|
||||
| `poll_url` | `string` |
|
||||
|
||||
### Request Body
|
||||
|
||||
The `vid_gen` request body differs from the `img_gen` request body as follows:
|
||||
|
||||
- `vid_gen` is a single video sequence job, so `batch_count` is not part of the request schema
|
||||
- `ref_images`, `mask_image`, `control_image`, `control_strength`, and `embed_image_metadata` are not part of the request schema
|
||||
- `vid_gen` adds `end_image`, `control_frames`, `high_noise_sample_params`, `video_frames`, `fps`, `moe_boundary`, and `vace_strength`
|
||||
|
||||
Example:
|
||||
|
||||
```json
|
||||
{
|
||||
"prompt": "a cat walking through a rainy alley",
|
||||
"negative_prompt": "",
|
||||
"clip_skip": -1,
|
||||
"width": 832,
|
||||
"height": 480,
|
||||
"strength": 0.75,
|
||||
"seed": -1,
|
||||
"video_frames": 33,
|
||||
"fps": 16,
|
||||
"moe_boundary": 0.875,
|
||||
"vace_strength": 1.0,
|
||||
|
||||
"init_image": null,
|
||||
"end_image": null,
|
||||
"control_frames": [],
|
||||
|
||||
"sample_params": {
|
||||
"scheduler": "discrete",
|
||||
"sample_method": "euler",
|
||||
"sample_steps": 28,
|
||||
"eta": 1.0,
|
||||
"shifted_timestep": 0,
|
||||
"custom_sigmas": [],
|
||||
"flow_shift": 0.0,
|
||||
"guidance": {
|
||||
"txt_cfg": 7.0,
|
||||
"img_cfg": 7.0,
|
||||
"distilled_guidance": 3.5,
|
||||
"slg": {
|
||||
"layers": [7, 8, 9],
|
||||
"layer_start": 0.01,
|
||||
"layer_end": 0.2,
|
||||
"scale": 0.0
|
||||
}
|
||||
}
|
||||
},
|
||||
|
||||
"high_noise_sample_params": {
|
||||
"scheduler": "discrete",
|
||||
"sample_method": "euler",
|
||||
"sample_steps": -1,
|
||||
"eta": 1.0,
|
||||
"shifted_timestep": 0,
|
||||
"flow_shift": 0.0,
|
||||
"guidance": {
|
||||
"txt_cfg": 7.0,
|
||||
"img_cfg": 7.0,
|
||||
"distilled_guidance": 3.5,
|
||||
"slg": {
|
||||
"layers": [7, 8, 9],
|
||||
"layer_start": 0.01,
|
||||
"layer_end": 0.2,
|
||||
"scale": 0.0
|
||||
}
|
||||
}
|
||||
},
|
||||
|
||||
"lora": [],
|
||||
|
||||
"vae_tiling_params": {
|
||||
"enabled": false,
|
||||
"tile_size_x": 0,
|
||||
"tile_size_y": 0,
|
||||
"target_overlap": 0.5,
|
||||
"rel_size_x": 0.0,
|
||||
"rel_size_y": 0.0
|
||||
},
|
||||
|
||||
"cache_mode": "disabled",
|
||||
"cache_option": "",
|
||||
"scm_mask": "",
|
||||
"scm_policy_dynamic": true,
|
||||
|
||||
"output_format": "webm",
|
||||
"output_compression": 100
|
||||
}
|
||||
```
|
||||
|
||||
### LoRA Rules
|
||||
|
||||
- The server only accepts explicit LoRA entries from the `lora` field.
|
||||
- Prompt-embedded `<lora:...>` tags are intentionally unsupported.
|
||||
- `lora[].is_high_noise` controls whether a LoRA applies only to the high-noise stage.
|
||||
|
||||
### Image and Frame Encoding Rules
|
||||
|
||||
Any image field accepts:
|
||||
|
||||
- a raw base64 string, or
|
||||
- a data URL such as `data:image/png;base64,...`
|
||||
|
||||
Channel expectations:
|
||||
|
||||
- `init_image`: 3 channels
|
||||
- `end_image`: 3 channels
|
||||
- `control_frames[]`: 3 channels
|
||||
|
||||
Frame ordering rules:
|
||||
|
||||
- `control_frames[]` order is the conditioning frame order
|
||||
- `control_frames[]` is preserved in request order
|
||||
|
||||
If omitted or null:
|
||||
|
||||
- single-image fields map to an empty `sd_image_t`
|
||||
- array fields map to an empty C-style array, represented as `pointer = nullptr` and `count = 0`
|
||||
|
||||
### Field Mapping Summary
|
||||
|
||||
Top-level scalar fields:
|
||||
|
||||
| Field | Type |
|
||||
| --- | --- |
|
||||
| `prompt` | `string` |
|
||||
| `negative_prompt` | `string` |
|
||||
| `clip_skip` | `integer` |
|
||||
| `width` | `integer` |
|
||||
| `height` | `integer` |
|
||||
| `strength` | `number` |
|
||||
| `seed` | `integer` |
|
||||
| `video_frames` | `integer` |
|
||||
| `fps` | `integer` |
|
||||
| `moe_boundary` | `number` |
|
||||
| `vace_strength` | `number` |
|
||||
|
||||
Image and frame fields:
|
||||
|
||||
| Field | Type |
|
||||
| --- | --- |
|
||||
| `init_image` | `string \| null` |
|
||||
| `end_image` | `string \| null` |
|
||||
| `control_frames` | `array<string>` |
|
||||
|
||||
LoRA fields:
|
||||
|
||||
| Field | Type |
|
||||
| --- | --- |
|
||||
| `lora[].path` | `string` |
|
||||
| `lora[].multiplier` | `number` |
|
||||
| `lora[].is_high_noise` | `boolean` |
|
||||
|
||||
Sampling fields:
|
||||
|
||||
| Field | Type |
|
||||
| --- | --- |
|
||||
| `sample_params.scheduler` | `string` |
|
||||
| `sample_params.sample_method` | `string` |
|
||||
| `sample_params.sample_steps` | `integer` |
|
||||
| `sample_params.eta` | `number` |
|
||||
| `sample_params.shifted_timestep` | `integer` |
|
||||
| `sample_params.custom_sigmas` | `array<number>` |
|
||||
| `sample_params.flow_shift` | `number` |
|
||||
| `sample_params.guidance.txt_cfg` | `number` |
|
||||
| `sample_params.guidance.img_cfg` | `number` |
|
||||
| `sample_params.guidance.distilled_guidance` | `number` |
|
||||
| `sample_params.guidance.slg.layers` | `array<integer>` |
|
||||
| `sample_params.guidance.slg.layer_start` | `number` |
|
||||
| `sample_params.guidance.slg.layer_end` | `number` |
|
||||
| `sample_params.guidance.slg.scale` | `number` |
|
||||
|
||||
High-noise sampling fields:
|
||||
|
||||
| Field | Type |
|
||||
| --- | --- |
|
||||
| `high_noise_sample_params.scheduler` | `string` |
|
||||
| `high_noise_sample_params.sample_method` | `string` |
|
||||
| `high_noise_sample_params.sample_steps` | `integer` |
|
||||
| `high_noise_sample_params.eta` | `number` |
|
||||
| `high_noise_sample_params.shifted_timestep` | `integer` |
|
||||
| `high_noise_sample_params.flow_shift` | `number` |
|
||||
| `high_noise_sample_params.guidance.txt_cfg` | `number` |
|
||||
| `high_noise_sample_params.guidance.img_cfg` | `number` |
|
||||
| `high_noise_sample_params.guidance.distilled_guidance` | `number` |
|
||||
| `high_noise_sample_params.guidance.slg.layers` | `array<integer>` |
|
||||
| `high_noise_sample_params.guidance.slg.layer_start` | `number` |
|
||||
| `high_noise_sample_params.guidance.slg.layer_end` | `number` |
|
||||
| `high_noise_sample_params.guidance.slg.scale` | `number` |
|
||||
|
||||
Other native fields:
|
||||
|
||||
| Field | Type |
|
||||
| --- | --- |
|
||||
| `vae_tiling_params` | `object` |
|
||||
| `cache_mode` | `string` |
|
||||
| `cache_option` | `string` |
|
||||
| `scm_mask` | `string` |
|
||||
| `scm_policy_dynamic` | `boolean` |
|
||||
|
||||
HTTP-only output fields:
|
||||
|
||||
| Field | Type |
|
||||
| --- | --- |
|
||||
| `output_format` | `string` |
|
||||
| `output_compression` | `integer` |
|
||||
|
||||
For `vid_gen`, `output_format` and `output_compression` control container encoding.
|
||||
`fps` sets the playback frame rate used when encoding the output container and is echoed in the completed job result.
|
||||
|
||||
Allowed `output_format` values:
|
||||
|
||||
- `webm`
|
||||
- `webp`
|
||||
- `avi`
|
||||
|
||||
Output format behavior:
|
||||
|
||||
- `output_format` defaults to `webm`
|
||||
- `webp` means animated WebP
|
||||
- `avi` means MJPG AVI
|
||||
- `webm` requires the server to be built with WebM support; otherwise the request returns `400`
|
||||
|
||||
### Result Payload
|
||||
|
||||
Completed jobs return one encoded container payload, not a list of per-frame images.
|
||||
|
||||
Result fields:
|
||||
|
||||
- `result.b64_json` contains the whole encoded container file as base64
|
||||
- `result.mime_type` identifies the media type
|
||||
- `result.output_format` echoes the selected container format
|
||||
- `result.fps` echoes the effective playback FPS
|
||||
- `result.frame_count` reports the actual decoded frame count used to build the container
|
||||
|
||||
Expected MIME types:
|
||||
|
||||
| `output_format` | `mime_type` |
|
||||
| --- | --- |
|
||||
| `webm` | `video/webm` |
|
||||
| `webp` | `image/webp` |
|
||||
| `avi` | `video/x-msvideo` |
|
||||
|
||||
### Optional Field Handling
|
||||
|
||||
Optional sampling fields may be omitted.
|
||||
|
||||
When omitted, backend defaults apply to these fields:
|
||||
|
||||
- `sample_params.scheduler`
|
||||
- `sample_params.sample_method`
|
||||
- `sample_params.eta`
|
||||
- `sample_params.flow_shift`
|
||||
- `sample_params.guidance.img_cfg`
|
||||
- `high_noise_sample_params.scheduler`
|
||||
- `high_noise_sample_params.sample_method`
|
||||
- `high_noise_sample_params.eta`
|
||||
- `high_noise_sample_params.flow_shift`
|
||||
- `high_noise_sample_params.guidance.img_cfg`
|
||||
|
||||
`high_noise_sample_params` may also be omitted entirely.
|
||||
|
||||
### Frame Count Semantics
|
||||
|
||||
`video_frames` is the requested target length, but the current core video path internally normalizes the effective frame count to the largest `4n + 1` value that does not exceed the requested count.
|
||||
|
||||
Examples:
|
||||
|
||||
- `video_frames = 33` stays `33`
|
||||
- `video_frames = 34` becomes `33`
|
||||
- `video_frames = 32` becomes `29`
|
||||
|
||||
The completed job payload includes the actual decoded `frame_count`.
|
||||
|
||||
### Completion Result
|
||||
|
||||
Example completed job:
|
||||
|
||||
```json
|
||||
{
|
||||
"id": "job_01HTXYZVID",
|
||||
"kind": "vid_gen",
|
||||
"status": "completed",
|
||||
"created": 1775401200,
|
||||
"started": 1775401203,
|
||||
"completed": 1775401215,
|
||||
"queue_position": 0,
|
||||
"result": {
|
||||
"output_format": "webm",
|
||||
"mime_type": "video/webm",
|
||||
"fps": 16,
|
||||
"frame_count": 33,
|
||||
"b64_json": "GkXfo59ChoEBQveBAULygQRC84EIQo..."
|
||||
},
|
||||
"error": null
|
||||
}
|
||||
```
|
||||
|
||||
The completed job response embeds the encoded `.webm`, animated `.webp`, or `.avi` container payload directly in `result.b64_json`.
|
||||
|
||||
### Failure Result
|
||||
|
||||
Example failed job:
|
||||
|
||||
```json
|
||||
{
|
||||
"id": "job_01HTXYZVID",
|
||||
"kind": "vid_gen",
|
||||
"status": "failed",
|
||||
"created": 1775401200,
|
||||
"started": 1775401203,
|
||||
"completed": 1775401204,
|
||||
"queue_position": 0,
|
||||
"result": null,
|
||||
"error": {
|
||||
"code": "generation_failed",
|
||||
"message": "generate_video returned no results"
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
### Cancelled Result
|
||||
|
||||
Example cancelled job:
|
||||
|
||||
```json
|
||||
{
|
||||
"id": "job_01HTXYZVID",
|
||||
"kind": "vid_gen",
|
||||
"status": "cancelled",
|
||||
"created": 1775401200,
|
||||
"started": null,
|
||||
"completed": 1775401202,
|
||||
"queue_position": 0,
|
||||
"result": null,
|
||||
"error": {
|
||||
"code": "cancelled",
|
||||
"message": "job cancelled by client"
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
### Submission Errors
|
||||
|
||||
`POST /sdcpp/v1/vid_gen` may return:
|
||||
|
||||
- `202 Accepted` when the job is created
|
||||
- `400 Bad Request` for an empty body, unsupported model mode, invalid JSON, invalid generation parameters, or an unsupported output format
|
||||
- `429 Too Many Requests` when the job queue is full
|
||||
- `500 Internal Server Error` for unexpected server exceptions during submission
|
||||
|
||||
@ -95,6 +95,10 @@ bool cancel_queued_job(AsyncJobManager& manager, AsyncGenerationJob& job) {
|
||||
job.status = AsyncJobStatus::Cancelled;
|
||||
job.completed_at = unix_timestamp_now();
|
||||
job.result_images_b64.clear();
|
||||
job.result_media_b64.clear();
|
||||
job.result_media_mime_type.clear();
|
||||
job.result_frame_count = 0;
|
||||
job.result_fps = 0;
|
||||
job.error_code = "cancelled";
|
||||
job.error_message = "job cancelled by client";
|
||||
return true;
|
||||
@ -122,6 +126,15 @@ json make_async_job_json(const AsyncJobManager& manager, const AsyncGenerationJo
|
||||
}
|
||||
|
||||
if (job.status == AsyncJobStatus::Completed) {
|
||||
if (job.kind == AsyncJobKind::VidGen) {
|
||||
result["result"] = {
|
||||
{"output_format", job.vid_gen.output_format},
|
||||
{"mime_type", job.result_media_mime_type},
|
||||
{"fps", job.result_fps},
|
||||
{"frame_count", job.result_frame_count},
|
||||
{"b64_json", job.result_media_b64},
|
||||
};
|
||||
} else {
|
||||
json images = json::array();
|
||||
for (size_t i = 0; i < job.result_images_b64.size(); ++i) {
|
||||
images.push_back({{"index", i}, {"b64_json", job.result_images_b64[i]}});
|
||||
@ -130,6 +143,7 @@ json make_async_job_json(const AsyncJobManager& manager, const AsyncGenerationJo
|
||||
{"output_format", job.img_gen.output_format},
|
||||
{"images", images},
|
||||
};
|
||||
}
|
||||
result["error"] = nullptr;
|
||||
} else if (job.status == AsyncJobStatus::Failed ||
|
||||
job.status == AsyncJobStatus::Cancelled) {
|
||||
@ -156,16 +170,15 @@ bool execute_img_gen_job(ServerRuntime& runtime,
|
||||
sd_img_gen_params_t params = job.img_gen.to_sd_img_gen_params_t();
|
||||
|
||||
SDImageVec results;
|
||||
int num_results = 0;
|
||||
|
||||
{
|
||||
std::lock_guard<std::mutex> lock(*runtime.sd_ctx_mutex);
|
||||
sd_image_t* raw_results = generate_image(runtime.sd_ctx, &params);
|
||||
num_results = params.batch_count;
|
||||
results.adopt(raw_results, num_results);
|
||||
results.adopt(raw_results, params.batch_count);
|
||||
}
|
||||
|
||||
if (results.empty() || num_results <= 0) {
|
||||
const int num_results = results.count();
|
||||
if (num_results <= 0) {
|
||||
error_message = "generate_image returned no results";
|
||||
return false;
|
||||
}
|
||||
@ -208,6 +221,47 @@ bool execute_img_gen_job(ServerRuntime& runtime,
|
||||
return true;
|
||||
}
|
||||
|
||||
/**
 * Executes one video-generation job and packs the generated frames into a
 * single base64-encoded container payload.
 *
 * @param runtime                 Server runtime; supplies the generation context
 *                                and the mutex guarding it.
 * @param job                     Job whose `vid_gen` request describes the generation
 *                                parameters, output format and compression.
 * @param output_media_b64        Receives the base64 text of the encoded container.
 * @param output_media_mime_type  Receives the MIME type for the chosen output format.
 * @param output_frame_count      Receives the number of frames held after generation.
 * @param output_fps              Receives the FPS taken from the job's generation parameters.
 * @param error_message           Receives a description when the function returns false.
 * @return true when a container was produced; false when generation yielded no
 *         frames or the container could not be encoded. The output parameters
 *         are only written on success.
 */
bool execute_vid_gen_job(ServerRuntime& runtime,
                         AsyncGenerationJob& job,
                         std::string& output_media_b64,
                         std::string& output_media_mime_type,
                         int& output_frame_count,
                         int& output_fps,
                         std::string& error_message) {
    sd_vid_gen_params_t params = job.vid_gen.to_sd_vid_gen_params_t();

    SDImageVec results;
    int num_results = 0;

    {
        // The context mutex is held only for generation and for handing the raw
        // frames to `results`; container encoding below runs outside the lock.
        std::lock_guard<std::mutex> lock(*runtime.sd_ctx_mutex);
        sd_image_t* raw_results = generate_video(runtime.sd_ctx, &params, &num_results);
        results.adopt(raw_results, num_results);
    }

    // Trust the count held by the owning container rather than the raw out-parameter.
    num_results = results.count();
    if (num_results <= 0) {
        error_message = "generate_video returned no results";
        return false;
    }

    std::vector<uint8_t> video_bytes = create_video_from_sd_images_to_vector(job.vid_gen.output_format,
                                                                             results.data(),
                                                                             num_results,
                                                                             job.vid_gen.gen_params.fps,
                                                                             job.vid_gen.output_compression);
    if (video_bytes.empty()) {
        error_message = "failed to encode generated video container";
        return false;
    }

    output_media_b64       = base64_encode(video_bytes);
    output_media_mime_type = video_mime_type(job.vid_gen.output_format);
    output_frame_count     = num_results;
    output_fps             = job.vid_gen.gen_params.fps;
    return true;
}
|
||||
|
||||
void async_job_worker(ServerRuntime& runtime) {
|
||||
AsyncJobManager& manager = *runtime.async_job_manager;
|
||||
|
||||
@ -240,11 +294,23 @@ void async_job_worker(ServerRuntime& runtime) {
|
||||
}
|
||||
|
||||
std::vector<std::string> output_images;
|
||||
std::string output_media_b64;
|
||||
std::string output_media_mime_type;
|
||||
int output_frame_count = 0;
|
||||
int output_fps = 0;
|
||||
std::string error_message;
|
||||
bool ok = false;
|
||||
|
||||
if (job->kind == AsyncJobKind::ImgGen) {
|
||||
ok = execute_img_gen_job(runtime, *job, output_images, error_message);
|
||||
} else if (job->kind == AsyncJobKind::VidGen) {
|
||||
ok = execute_vid_gen_job(runtime,
|
||||
*job,
|
||||
output_media_b64,
|
||||
output_media_mime_type,
|
||||
output_frame_count,
|
||||
output_fps,
|
||||
error_message);
|
||||
} else {
|
||||
error_message = "unsupported job kind";
|
||||
}
|
||||
@ -260,6 +326,10 @@ void async_job_worker(ServerRuntime& runtime) {
|
||||
if (ok) {
|
||||
job->status = AsyncJobStatus::Completed;
|
||||
job->result_images_b64 = std::move(output_images);
|
||||
job->result_media_b64 = std::move(output_media_b64);
|
||||
job->result_media_mime_type = std::move(output_media_mime_type);
|
||||
job->result_frame_count = output_frame_count;
|
||||
job->result_fps = output_fps;
|
||||
job->error_code.clear();
|
||||
job->error_message.clear();
|
||||
} else {
|
||||
@ -267,6 +337,10 @@ void async_job_worker(ServerRuntime& runtime) {
|
||||
job->error_code = "generation_failed";
|
||||
job->error_message = error_message.empty() ? "unknown generation error" : error_message;
|
||||
job->result_images_b64.clear();
|
||||
job->result_media_b64.clear();
|
||||
job->result_media_mime_type.clear();
|
||||
job->result_frame_count = 0;
|
||||
job->result_fps = 0;
|
||||
}
|
||||
|
||||
purge_expired_jobs(manager);
|
||||
|
||||
@ -36,7 +36,12 @@ struct AsyncGenerationJob {
|
||||
int64_t started_at = 0;
|
||||
int64_t completed_at = 0;
|
||||
ImgGenJobRequest img_gen;
|
||||
VidGenJobRequest vid_gen;
|
||||
std::vector<std::string> result_images_b64;
|
||||
std::string result_media_b64;
|
||||
std::string result_media_mime_type;
|
||||
int result_frame_count = 0;
|
||||
int result_fps = 0;
|
||||
std::string error_code;
|
||||
std::string error_message;
|
||||
};
|
||||
@ -63,4 +68,11 @@ bool execute_img_gen_job(ServerRuntime& runtime,
|
||||
AsyncGenerationJob& job,
|
||||
std::vector<std::string>& output_images,
|
||||
std::string& error_message);
|
||||
// Runs one video-generation job. On success returns true and fills the
// base64 container payload, its MIME type, the frame count and the FPS;
// on failure returns false and fills error_message instead.
bool execute_vid_gen_job(ServerRuntime& runtime,
|
||||
AsyncGenerationJob& job,
|
||||
std::string& output_media_b64,
|
||||
std::string& output_media_mime_type,
|
||||
int& output_frame_count,
|
||||
int& output_fps,
|
||||
std::string& error_message);
|
||||
void async_job_worker(ServerRuntime& runtime);
|
||||
|
||||
@ -1 +1 @@
|
||||
Subproject commit 740475a7a6794dc07fb23e8ec5dc56e7e80aa8c1
|
||||
Subproject commit 797ccf80825cc035508ba9b599b2a21953e7f835
|
||||
@ -253,6 +253,12 @@ void register_openai_api_endpoints(httplib::Server& svr, ServerRuntime& rt) {
|
||||
|
||||
svr.Post("/v1/images/generations", [runtime](const httplib::Request& req, httplib::Response& res) {
|
||||
try {
|
||||
if (!runtime_supports_generation_mode(*runtime, IMG_GEN)) {
|
||||
res.status = 400;
|
||||
res.set_content(json({{"error", unsupported_generation_mode_error(IMG_GEN)}}).dump(), "application/json");
|
||||
return;
|
||||
}
|
||||
|
||||
ImgGenJobRequest request;
|
||||
std::string error_message;
|
||||
if (!build_openai_generation_request(req, *runtime, request, error_message)) {
|
||||
@ -319,6 +325,12 @@ void register_openai_api_endpoints(httplib::Server& svr, ServerRuntime& rt) {
|
||||
|
||||
svr.Post("/v1/images/edits", [runtime](const httplib::Request& req, httplib::Response& res) {
|
||||
try {
|
||||
if (!runtime_supports_generation_mode(*runtime, IMG_GEN)) {
|
||||
res.status = 400;
|
||||
res.set_content(json({{"error", unsupported_generation_mode_error(IMG_GEN)}}).dump(), "application/json");
|
||||
return;
|
||||
}
|
||||
|
||||
ImgGenJobRequest request;
|
||||
std::string error_message;
|
||||
if (!build_openai_edit_request(req, *runtime, request, error_message)) {
|
||||
|
||||
@ -246,6 +246,11 @@ void register_sdapi_endpoints(httplib::Server& svr, ServerRuntime& rt) {
|
||||
res.set_content(R"({"error":"empty body"})", "application/json");
|
||||
return;
|
||||
}
|
||||
if (!runtime_supports_generation_mode(*runtime, IMG_GEN)) {
|
||||
res.status = 400;
|
||||
res.set_content(json({{"error", unsupported_generation_mode_error(IMG_GEN)}}).dump(), "application/json");
|
||||
return;
|
||||
}
|
||||
|
||||
json j = json::parse(req.body);
|
||||
ImgGenJobRequest request;
|
||||
|
||||
@ -75,61 +75,9 @@ static fs::path resolve_display_model_path(const ServerRuntime& runtime) {
|
||||
return {};
|
||||
}
|
||||
|
||||
static json make_capabilities_json(ServerRuntime& runtime) {
|
||||
refresh_lora_cache(runtime);
|
||||
|
||||
AsyncJobManager& manager = *runtime.async_job_manager;
|
||||
const auto& defaults = *runtime.default_gen_params;
|
||||
const auto& sample_params = defaults.sample_params;
|
||||
static json make_sample_params_json(const sd_sample_params_t& sample_params, const std::vector<int>& skip_layers) {
|
||||
const auto& guidance = sample_params.guidance;
|
||||
const fs::path model_path = resolve_display_model_path(runtime);
|
||||
json samplers = json::array();
|
||||
json schedulers = json::array();
|
||||
json output_formats = json::array({"png", "jpeg"});
|
||||
json available_loras = json::array();
|
||||
|
||||
for (int i = 0; i < SAMPLE_METHOD_COUNT; ++i) {
|
||||
samplers.push_back(sd_sample_method_name((sample_method_t)i));
|
||||
}
|
||||
|
||||
for (int i = 0; i < SCHEDULER_COUNT; ++i) {
|
||||
schedulers.push_back(sd_scheduler_name((scheduler_t)i));
|
||||
}
|
||||
|
||||
#ifdef SD_USE_WEBP
|
||||
output_formats.push_back("webp");
|
||||
#endif
|
||||
|
||||
{
|
||||
std::lock_guard<std::mutex> lock(*runtime.lora_mutex);
|
||||
for (const auto& entry : *runtime.lora_cache) {
|
||||
available_loras.push_back({
|
||||
{"name", entry.name},
|
||||
{"path", entry.path},
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
json result;
|
||||
result["model"] = {
|
||||
{"name", model_path.filename().u8string()},
|
||||
{"stem", model_path.stem().u8string()},
|
||||
{"path", model_path.u8string()},
|
||||
};
|
||||
result["defaults"] = {
|
||||
{"prompt", defaults.prompt},
|
||||
{"negative_prompt", defaults.negative_prompt},
|
||||
{"clip_skip", defaults.clip_skip},
|
||||
{"width", defaults.width > 0 ? defaults.width : 512},
|
||||
{"height", defaults.height > 0 ? defaults.height : 512},
|
||||
{"strength", defaults.strength},
|
||||
{"seed", defaults.seed},
|
||||
{"batch_count", defaults.batch_count},
|
||||
{"auto_resize_ref_image", defaults.auto_resize_ref_image},
|
||||
{"increase_ref_index", defaults.increase_ref_index},
|
||||
{"control_strength", defaults.control_strength},
|
||||
{"sample_params",
|
||||
{
|
||||
return {
|
||||
{"scheduler", capability_scheduler_name(sample_params.scheduler)},
|
||||
{"sample_method", capability_sample_method_name(sample_params.sample_method)},
|
||||
{"sample_steps", sample_params.sample_steps},
|
||||
@ -143,33 +91,66 @@ static json make_capabilities_json(ServerRuntime& runtime) {
|
||||
{"distilled_guidance", guidance.distilled_guidance},
|
||||
{"slg",
|
||||
{
|
||||
{"layers", defaults.skip_layers},
|
||||
{"layers", skip_layers},
|
||||
{"layer_start", guidance.slg.layer_start},
|
||||
{"layer_end", guidance.slg.layer_end},
|
||||
{"scale", guidance.slg.scale},
|
||||
}},
|
||||
}},
|
||||
}},
|
||||
};
|
||||
}
|
||||
|
||||
static json make_img_gen_defaults_json(const SDGenerationParams& defaults, const std::string& output_format) {
|
||||
return {
|
||||
{"prompt", defaults.prompt},
|
||||
{"negative_prompt", defaults.negative_prompt},
|
||||
{"clip_skip", defaults.clip_skip},
|
||||
{"width", defaults.width > 0 ? defaults.width : 512},
|
||||
{"height", defaults.height > 0 ? defaults.height : 512},
|
||||
{"strength", defaults.strength},
|
||||
{"seed", defaults.seed},
|
||||
{"batch_count", defaults.batch_count},
|
||||
{"auto_resize_ref_image", defaults.auto_resize_ref_image},
|
||||
{"increase_ref_index", defaults.increase_ref_index},
|
||||
{"control_strength", defaults.control_strength},
|
||||
{"sample_params", make_sample_params_json(defaults.sample_params, defaults.skip_layers)},
|
||||
{"vae_tiling_params", make_vae_tiling_json(defaults.vae_tiling_params)},
|
||||
{"cache_mode", defaults.cache_mode},
|
||||
{"cache_option", defaults.cache_option},
|
||||
{"scm_mask", defaults.scm_mask},
|
||||
{"scm_policy_dynamic", defaults.scm_policy_dynamic},
|
||||
{"output_format", "png"},
|
||||
{"output_format", output_format},
|
||||
{"output_compression", 100},
|
||||
};
|
||||
result["limits"] = {
|
||||
{"min_width", 64},
|
||||
{"max_width", 4096},
|
||||
{"min_height", 64},
|
||||
{"max_height", 4096},
|
||||
{"max_batch_count", 8},
|
||||
{"max_queue_size", manager.max_pending_jobs},
|
||||
}
|
||||
|
||||
static json make_vid_gen_defaults_json(const SDGenerationParams& defaults, const std::string& output_format) {
|
||||
return {
|
||||
{"prompt", defaults.prompt},
|
||||
{"negative_prompt", defaults.negative_prompt},
|
||||
{"clip_skip", defaults.clip_skip},
|
||||
{"width", defaults.width > 0 ? defaults.width : 512},
|
||||
{"height", defaults.height > 0 ? defaults.height : 512},
|
||||
{"strength", defaults.strength},
|
||||
{"seed", defaults.seed},
|
||||
{"video_frames", defaults.video_frames},
|
||||
{"fps", defaults.fps},
|
||||
{"moe_boundary", defaults.moe_boundary},
|
||||
{"vace_strength", defaults.vace_strength},
|
||||
{"sample_params", make_sample_params_json(defaults.sample_params, defaults.skip_layers)},
|
||||
{"high_noise_sample_params", make_sample_params_json(defaults.high_noise_sample_params, defaults.high_noise_skip_layers)},
|
||||
{"vae_tiling_params", make_vae_tiling_json(defaults.vae_tiling_params)},
|
||||
{"cache_mode", defaults.cache_mode},
|
||||
{"cache_option", defaults.cache_option},
|
||||
{"scm_mask", defaults.scm_mask},
|
||||
{"scm_policy_dynamic", defaults.scm_policy_dynamic},
|
||||
{"output_format", output_format},
|
||||
{"output_compression", 100},
|
||||
};
|
||||
result["samplers"] = samplers;
|
||||
result["schedulers"] = schedulers;
|
||||
result["output_formats"] = output_formats;
|
||||
result["features"] = {
|
||||
}
|
||||
|
||||
static json make_img_gen_features_json() {
|
||||
return {
|
||||
{"init_image", true},
|
||||
{"mask_image", true},
|
||||
{"control_image", true},
|
||||
@ -180,6 +161,128 @@ static json make_capabilities_json(ServerRuntime& runtime) {
|
||||
{"cancel_queued", true},
|
||||
{"cancel_generating", false},
|
||||
};
|
||||
}
|
||||
|
||||
// Feature flags reported for the vid_gen mode in the capabilities document.
// Every listed input is advertised as available; only cancellation of a job
// that is already generating is reported as unavailable.
static json make_vid_gen_features_json() {
    json features = json::object();

    // Request inputs specific to video generation.
    features["init_image"]               = true;
    features["end_image"]                = true;
    features["control_frames"]           = true;
    features["high_noise_sample_params"] = true;

    // Options shared with the other generation mode.
    features["lora"]       = true;
    features["vae_tiling"] = true;
    features["cache"]      = true;

    // Job-control flags.
    features["cancel_queued"]     = true;
    features["cancel_generating"] = false;

    return features;
}
|
||||
|
||||
// Builds the capabilities document: model identity, the generation modes this
// runtime supports, and per-mode defaults, output formats and feature flags.
//
// The top-level "defaults", "output_formats" and "features" keys mirror the
// img_gen entries when image generation is supported, otherwise the vid_gen
// entries. When neither mode is supported they keep their empty / cancel-only
// placeholders and "current_mode" is an empty string.
static json make_capabilities_json(ServerRuntime& runtime) {
    refresh_lora_cache(runtime);

    AsyncJobManager& job_manager = *runtime.async_job_manager;
    const auto& gen_defaults     = *runtime.default_gen_params;
    const fs::path display_path  = resolve_display_model_path(runtime);
    const bool img_supported     = runtime_supports_generation_mode(runtime, IMG_GEN);
    const bool vid_supported     = runtime_supports_generation_mode(runtime, VID_GEN);

    json img_formats = supported_img_output_formats();
    json vid_formats = supported_vid_output_formats();

    // Names of every sampler and scheduler known to the core library.
    json sampler_names = json::array();
    for (int method = 0; method < SAMPLE_METHOD_COUNT; ++method) {
        sampler_names.push_back(sd_sample_method_name(static_cast<sample_method_t>(method)));
    }

    json scheduler_names = json::array();
    for (int sched = 0; sched < SCHEDULER_COUNT; ++sched) {
        scheduler_names.push_back(sd_scheduler_name(static_cast<scheduler_t>(sched)));
    }

    // Snapshot the LoRA cache while holding its mutex.
    json lora_entries = json::array();
    {
        std::lock_guard<std::mutex> guard(*runtime.lora_mutex);
        for (const auto& lora : *runtime.lora_cache) {
            json item    = json::object();
            item["name"] = lora.name;
            item["path"] = lora.path;
            lora_entries.push_back(item);
        }
    }

    // The first advertised format of each mode is its default; the literals are
    // fallbacks for an empty format list.
    const std::string img_default_format =
        img_formats.empty() ? std::string("png") : img_formats[0].get<std::string>();
    const std::string vid_default_format =
        vid_formats.empty() ? std::string("avi") : vid_formats[0].get<std::string>();

    json supported_modes  = json::array();
    json defaults_by_mode = json::object();
    json formats_by_mode  = json::object();
    json features_by_mode = json::object();

    if (img_supported) {
        supported_modes.push_back("img_gen");
        defaults_by_mode["img_gen"] = make_img_gen_defaults_json(gen_defaults, img_default_format);
        formats_by_mode["img_gen"]  = img_formats;
        features_by_mode["img_gen"] = make_img_gen_features_json();
    }
    if (vid_supported) {
        supported_modes.push_back("vid_gen");
        defaults_by_mode["vid_gen"] = make_vid_gen_defaults_json(gen_defaults, vid_default_format);
        formats_by_mode["vid_gen"]  = vid_formats;
        features_by_mode["vid_gen"] = make_vid_gen_features_json();
    }

    // img_gen takes precedence over vid_gen for the top-level mirror fields.
    const std::string current_mode = img_supported ? "img_gen" : (vid_supported ? "vid_gen" : "");

    json top_defaults = json::object();
    json top_formats  = json::array();
    json top_features = {
        {"cancel_queued", true},
        {"cancel_generating", false},
    };
    if (!current_mode.empty()) {
        top_defaults = defaults_by_mode[current_mode];
        top_formats  = formats_by_mode[current_mode];
        top_features = features_by_mode[current_mode];
    }

    json result;
    result["model"] = {
        {"name", display_path.filename().u8string()},
        {"stem", display_path.stem().u8string()},
        {"path", display_path.u8string()},
    };
    result["current_mode"]     = current_mode;
    result["supported_modes"]  = supported_modes;
    result["defaults"]         = top_defaults;
    result["defaults_by_mode"] = defaults_by_mode;
    result["limits"]           = {
        {"min_width", 64},
        {"max_width", 4096},
        {"min_height", 64},
        {"max_height", 4096},
        {"max_batch_count", 8},
        {"max_queue_size", job_manager.max_pending_jobs},
    };
    result["samplers"]               = sampler_names;
    result["schedulers"]             = scheduler_names;
    result["output_formats"]         = top_formats;
    result["output_formats_by_mode"] = formats_by_mode;
    result["features"]               = top_features;
    result["features_by_mode"]       = features_by_mode;
    result["loras"]                  = lora_entries;
    return result;
}
|
||||
@ -211,6 +314,33 @@ static bool parse_img_gen_request(const json& body,
|
||||
return true;
|
||||
}
|
||||
|
||||
static bool parse_vid_gen_request(const json& body,
|
||||
ServerRuntime& runtime,
|
||||
VidGenJobRequest& request,
|
||||
std::string& error_message) {
|
||||
request.gen_params = *runtime.default_gen_params;
|
||||
|
||||
refresh_lora_cache(runtime);
|
||||
if (!request.gen_params.from_json_str(body.dump(), [&](const std::string& path) {
|
||||
return get_lora_full_path(runtime, path);
|
||||
})) {
|
||||
error_message = "invalid generation parameters";
|
||||
return false;
|
||||
}
|
||||
|
||||
std::string output_format = body.value("output_format", "webm");
|
||||
int output_compression = body.value("output_compression", 100);
|
||||
if (!assign_output_options(request, output_format, output_compression, error_message)) {
|
||||
return false;
|
||||
}
|
||||
// Intentionally disable prompt-embedded LoRA tag parsing for server APIs.
|
||||
if (!request.gen_params.resolve_and_validate(VID_GEN, "", true)) {
|
||||
error_message = "invalid generation parameters";
|
||||
return false;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
void register_sdcpp_api_endpoints(httplib::Server& svr, ServerRuntime& rt) {
|
||||
ServerRuntime* runtime = &rt;
|
||||
|
||||
@ -226,6 +356,11 @@ void register_sdcpp_api_endpoints(httplib::Server& svr, ServerRuntime& rt) {
|
||||
res.set_content(R"({"error":"empty body"})", "application/json");
|
||||
return;
|
||||
}
|
||||
if (!runtime_supports_generation_mode(*runtime, IMG_GEN)) {
|
||||
res.status = 400;
|
||||
res.set_content(json({{"error", unsupported_generation_mode_error(IMG_GEN)}}).dump(), "application/json");
|
||||
return;
|
||||
}
|
||||
|
||||
json body = json::parse(req.body);
|
||||
ImgGenJobRequest request;
|
||||
@ -276,9 +411,66 @@ void register_sdcpp_api_endpoints(httplib::Server& svr, ServerRuntime& rt) {
|
||||
}
|
||||
});
|
||||
|
||||
svr.Post("/sdcpp/v1/vid_gen", [](const httplib::Request&, httplib::Response& res) {
|
||||
res.status = 501;
|
||||
res.set_content(R"({"error":"vid_gen is reserved and not implemented yet"})", "application/json");
|
||||
svr.Post("/sdcpp/v1/vid_gen", [runtime](const httplib::Request& req, httplib::Response& res) {
|
||||
try {
|
||||
if (req.body.empty()) {
|
||||
res.status = 400;
|
||||
res.set_content(R"({"error":"empty body"})", "application/json");
|
||||
return;
|
||||
}
|
||||
if (!runtime_supports_generation_mode(*runtime, VID_GEN)) {
|
||||
res.status = 400;
|
||||
res.set_content(json({{"error", unsupported_generation_mode_error(VID_GEN)}}).dump(), "application/json");
|
||||
return;
|
||||
}
|
||||
|
||||
json body = json::parse(req.body);
|
||||
VidGenJobRequest request;
|
||||
std::string error_message;
|
||||
if (!parse_vid_gen_request(body, *runtime, request, error_message)) {
|
||||
res.status = 400;
|
||||
res.set_content(json({{"error", error_message}}).dump(), "application/json");
|
||||
return;
|
||||
}
|
||||
|
||||
AsyncJobManager& manager = *runtime->async_job_manager;
|
||||
std::shared_ptr<AsyncGenerationJob> job = std::make_shared<AsyncGenerationJob>();
|
||||
job->kind = AsyncJobKind::VidGen;
|
||||
job->status = AsyncJobStatus::Queued;
|
||||
job->created_at = unix_timestamp_now();
|
||||
job->vid_gen = std::move(request);
|
||||
|
||||
{
|
||||
std::lock_guard<std::mutex> lock(manager.mutex);
|
||||
purge_expired_jobs(manager);
|
||||
if (count_pending_jobs(manager) >= manager.max_pending_jobs) {
|
||||
res.status = 429;
|
||||
res.set_content(R"({"error":"job queue is full"})", "application/json");
|
||||
return;
|
||||
}
|
||||
job->id = make_async_job_id(manager);
|
||||
manager.jobs[job->id] = job;
|
||||
manager.queue.push_back(job->id);
|
||||
}
|
||||
|
||||
manager.cv.notify_one();
|
||||
|
||||
json out;
|
||||
out["id"] = job->id;
|
||||
out["kind"] = async_job_kind_name(job->kind);
|
||||
out["status"] = async_job_status_name(job->status);
|
||||
out["created"] = job->created_at;
|
||||
out["poll_url"] = "/sdcpp/v1/jobs/" + job->id;
|
||||
|
||||
res.status = 202;
|
||||
res.set_content(out.dump(), "application/json");
|
||||
} catch (const json::parse_error& e) {
|
||||
res.status = 400;
|
||||
res.set_content(json({{"error", "invalid json"}, {"message", e.what()}}).dump(), "application/json");
|
||||
} catch (const std::exception& e) {
|
||||
res.status = 500;
|
||||
res.set_content(json({{"error", "server_error"}, {"message", e.what()}}).dump(), "application/json");
|
||||
}
|
||||
});
|
||||
|
||||
svr.Get(R"(/sdcpp/v1/jobs/([A-Za-z0-9_\-]+))", [runtime](const httplib::Request& req, httplib::Response& res) {
|
||||
|
||||
@ -45,6 +45,44 @@ std::string normalize_output_format(std::string output_format) {
|
||||
return output_format;
|
||||
}
|
||||
|
||||
/// Lists the image output formats this build accepts.
///
/// "png" and "jpeg" are always present, in that order. "webp" is appended
/// only when the binary was compiled with SD_USE_WEBP and `allow_webp` is true.
std::vector<std::string> supported_img_output_formats(bool allow_webp) {
    std::vector<std::string> result;
    result.emplace_back("png");
    result.emplace_back("jpeg");
#ifndef SD_USE_WEBP
    (void)allow_webp;  // the flag only matters in WebP-enabled builds
#else
    if (allow_webp) {
        result.emplace_back("webp");
    }
#endif
    return result;
}
|
||||
|
||||
/// Lists the video output formats this build accepts, in preference order.
///
/// "webm" and "webp" are included only when SD_USE_WEBM / SD_USE_WEBP are
/// defined; "avi" is always the last entry, so the list is never empty.
std::vector<std::string> supported_vid_output_formats() {
    return {
#ifdef SD_USE_WEBM
        "webm",
#endif
#ifdef SD_USE_WEBP
        "webp",
#endif
        "avi",
    };
}
|
||||
|
||||
static std::string valid_vid_output_formats_message() {
|
||||
const std::vector<std::string> formats = supported_vid_output_formats();
|
||||
|
||||
std::string message = "invalid output_format, must be one of [";
|
||||
for (size_t i = 0; i < formats.size(); ++i) {
|
||||
if (i > 0) {
|
||||
message += ", ";
|
||||
}
|
||||
message += formats[i];
|
||||
}
|
||||
message += "]";
|
||||
return message;
|
||||
}
|
||||
|
||||
bool assign_output_options(ImgGenJobRequest& request,
|
||||
std::string output_format,
|
||||
int output_compression,
|
||||
@ -53,19 +91,88 @@ bool assign_output_options(ImgGenJobRequest& request,
|
||||
request.output_format = normalize_output_format(std::move(output_format));
|
||||
request.output_compression = std::clamp(output_compression, 0, 100);
|
||||
|
||||
const bool valid_format = request.output_format == "png" ||
|
||||
request.output_format == "jpeg" ||
|
||||
(allow_webp && request.output_format == "webp");
|
||||
const std::vector<std::string> valid_formats = supported_img_output_formats(allow_webp);
|
||||
const bool valid_format = std::find(valid_formats.begin(),
|
||||
valid_formats.end(),
|
||||
request.output_format) != valid_formats.end();
|
||||
if (!valid_format) {
|
||||
error_message = allow_webp
|
||||
? "invalid output_format, must be one of [png, jpeg, webp]"
|
||||
: "invalid output_format, must be one of [png, jpeg]";
|
||||
error_message = "invalid output_format, must be one of [";
|
||||
for (size_t i = 0; i < valid_formats.size(); ++i) {
|
||||
if (i > 0) {
|
||||
error_message += ", ";
|
||||
}
|
||||
error_message += valid_formats[i];
|
||||
}
|
||||
error_message += "]";
|
||||
return false;
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
/// Stores the normalized output format and the compression (clamped to
/// [0, 100]) on a video job request, then checks the format against what
/// this build can encode.
///
/// "avi" is always accepted; "webm" and "webp" are accepted only when
/// SD_USE_WEBM / SD_USE_WEBP are defined. On rejection `error_message` is
/// set to the list of valid formats and false is returned; the request
/// fields have already been written at that point.
bool assign_output_options(VidGenJobRequest& request,
                           std::string output_format,
                           int output_compression,
                           std::string& error_message) {
    request.output_compression = std::clamp(output_compression, 0, 100);
    request.output_format      = normalize_output_format(std::move(output_format));

    const std::string& chosen = request.output_format;
    bool accepted = (chosen == "avi");
#ifdef SD_USE_WEBM
    accepted = accepted || chosen == "webm";
#endif
#ifdef SD_USE_WEBP
    accepted = accepted || chosen == "webp";
#endif

    if (!accepted) {
        error_message = valid_vid_output_formats_message();
    }
    return accepted;
}
|
||||
|
||||
/// Maps a video output format name to the MIME type used for it.
///
/// "webm" -> "video/webm", "webp" -> "image/webp"; any other value
/// (including "avi") yields "video/x-msvideo". The comparison is exact.
std::string video_mime_type(const std::string& output_format) {
    struct MimeEntry {
        const char* format;
        const char* mime;
    };
    static constexpr MimeEntry kKnownTypes[] = {
        {"webm", "video/webm"},
        {"webp", "image/webp"},
    };

    for (const MimeEntry& entry : kKnownTypes) {
        if (output_format == entry.format) {
            return entry.mime;
        }
    }
    return "video/x-msvideo";
}
|
||||
|
||||
bool runtime_supports_generation_mode(const ServerRuntime& runtime, SDMode mode) {
|
||||
if (mode == VID_GEN) {
|
||||
return sd_ctx_supports_video_generation(runtime.sd_ctx);
|
||||
}
|
||||
if (mode == IMG_GEN) {
|
||||
return sd_ctx_supports_image_generation(runtime.sd_ctx);
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
/// Returns the client-facing error text for a generation mode the loaded
/// model cannot serve. IMG_GEN and VID_GEN get mode-specific wording; any
/// other mode gets a generic message.
std::string unsupported_generation_mode_error(SDMode mode) {
    switch (mode) {
        case IMG_GEN:
            return "loaded model does not support img_gen";
        case VID_GEN:
            return "loaded model does not support vid_gen";
        default:
            return "loaded model does not support requested mode";
    }
}
|
||||
|
||||
ArgOptions SDSvrParams::get_options() {
|
||||
ArgOptions options;
|
||||
|
||||
|
||||
@ -58,13 +58,32 @@ struct ImgGenJobRequest {
|
||||
}
|
||||
};
|
||||
|
||||
struct VidGenJobRequest {
|
||||
SDGenerationParams gen_params;
|
||||
std::string output_format = "webm";
|
||||
int output_compression = 100;
|
||||
|
||||
sd_vid_gen_params_t to_sd_vid_gen_params_t() {
|
||||
return gen_params.to_sd_vid_gen_params_t();
|
||||
}
|
||||
};
|
||||
|
||||
std::string base64_encode(const std::vector<uint8_t>& bytes);
|
||||
std::string normalize_output_format(std::string output_format);
|
||||
std::vector<std::string> supported_img_output_formats(bool allow_webp = true);
|
||||
std::vector<std::string> supported_vid_output_formats();
|
||||
bool assign_output_options(ImgGenJobRequest& request,
|
||||
std::string output_format,
|
||||
int output_compression,
|
||||
bool allow_webp,
|
||||
std::string& error_message);
|
||||
bool assign_output_options(VidGenJobRequest& request,
|
||||
std::string output_format,
|
||||
int output_compression,
|
||||
std::string& error_message);
|
||||
std::string video_mime_type(const std::string& output_format);
|
||||
bool runtime_supports_generation_mode(const ServerRuntime& runtime, SDMode mode);
|
||||
std::string unsupported_generation_mode_error(SDMode mode);
|
||||
void refresh_lora_cache(ServerRuntime& rt);
|
||||
std::string get_lora_full_path(ServerRuntime& rt, const std::string& path);
|
||||
int64_t unix_timestamp_now();
|
||||
|
||||
@ -348,6 +348,8 @@ SD_API void sd_set_progress_callback(sd_progress_cb_t cb, void* data);
|
||||
SD_API void sd_set_preview_callback(sd_preview_cb_t cb, enum preview_t mode, int interval, bool denoised, bool noisy, void* data);
|
||||
SD_API int32_t sd_get_num_physical_cores();
|
||||
SD_API const char* sd_get_system_info();
|
||||
SD_API bool sd_ctx_supports_image_generation(const sd_ctx_t* sd_ctx);
|
||||
SD_API bool sd_ctx_supports_video_generation(const sd_ctx_t* sd_ctx);
|
||||
|
||||
SD_API const char* sd_type_name(enum sd_type_t type);
|
||||
SD_API enum sd_type_t str_to_sd_type(const char* str);
|
||||
|
||||
@ -2390,6 +2390,14 @@ struct sd_ctx_t {
|
||||
StableDiffusionGGML* sd = nullptr;
|
||||
};
|
||||
|
||||
/// True when `version` is VERSION_SVD or a version for which
/// sd_version_is_wan() returns true.
static bool sd_version_supports_video_generation(SDVersion version) {
    if (version == VERSION_SVD) {
        return true;
    }
    return sd_version_is_wan(version);
}
|
||||
|
||||
/// Image generation is treated as the exact complement of video
/// generation: a version supports one or the other, never both.
static bool sd_version_supports_image_generation(SDVersion version) {
    const bool is_video_model = sd_version_supports_video_generation(version);
    return !is_video_model;
}
|
||||
|
||||
sd_ctx_t* new_sd_ctx(const sd_ctx_params_t* sd_ctx_params) {
|
||||
sd_ctx_t* sd_ctx = (sd_ctx_t*)malloc(sizeof(sd_ctx_t));
|
||||
if (sd_ctx == nullptr) {
|
||||
@ -2419,6 +2427,20 @@ void free_sd_ctx(sd_ctx_t* sd_ctx) {
|
||||
free(sd_ctx);
|
||||
}
|
||||
|
||||
/// Reports whether the model held by `sd_ctx` supports image generation.
/// A null context, or a context whose `sd` member is null, yields false.
SD_API bool sd_ctx_supports_image_generation(const sd_ctx_t* sd_ctx) {
    const bool has_model = sd_ctx != nullptr && sd_ctx->sd != nullptr;
    return has_model && sd_version_supports_image_generation(sd_ctx->sd->version);
}
|
||||
|
||||
/// Reports whether the model held by `sd_ctx` supports video generation.
/// A null context, or a context whose `sd` member is null, yields false.
SD_API bool sd_ctx_supports_video_generation(const sd_ctx_t* sd_ctx) {
    const bool has_model = sd_ctx != nullptr && sd_ctx->sd != nullptr;
    return has_model && sd_version_supports_video_generation(sd_ctx->sd->version);
}
|
||||
|
||||
enum sample_method_t sd_get_default_sample_method(const sd_ctx_t* sd_ctx) {
|
||||
if (sd_ctx != nullptr && sd_ctx->sd != nullptr) {
|
||||
if (sd_version_is_dit(sd_ctx->sd->version)) {
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user