mirror of
https://github.com/leejet/stable-diffusion.cpp.git
synced 2025-12-12 13:28:37 +00:00
feat: add Flux.2 VAE proj matrix for previews (#1017)
This commit is contained in:
parent
96c3e64057
commit
583a02e29e
@ -91,6 +91,41 @@ const float flux_latent_rgb_proj[16][3] = {
|
||||
{-0.111849f, -0.055589f, -0.032361f}};
|
||||
float flux_latent_rgb_bias[3] = {0.024600f, -0.006937f, -0.008089f};
|
||||
|
||||
const float flux2_latent_rgb_proj[32][3] = {
|
||||
{0.000736f, -0.008385f, -0.019710f},
|
||||
{-0.001352f, -0.016392f, 0.020693f},
|
||||
{-0.006376f, 0.002428f, 0.036736f},
|
||||
{0.039384f, 0.074167f, 0.119789f},
|
||||
{0.007464f, -0.005705f, -0.004734f},
|
||||
{-0.004086f, 0.005287f, -0.000409f},
|
||||
{-0.032835f, 0.050802f, -0.028120f},
|
||||
{-0.003158f, -0.000835f, 0.000406f},
|
||||
{-0.112840f, -0.084337f, -0.023083f},
|
||||
{0.001462f, -0.006656f, 0.000549f},
|
||||
{-0.009980f, -0.007480f, 0.009702f},
|
||||
{0.032540f, 0.000214f, -0.061388f},
|
||||
{0.011023f, 0.000694f, 0.007143f},
|
||||
{-0.001468f, -0.006723f, -0.001678f},
|
||||
{-0.005921f, -0.010320f, -0.003907f},
|
||||
{-0.028434f, 0.027584f, 0.018457f},
|
||||
{0.014349f, 0.011523f, 0.000441f},
|
||||
{0.009874f, 0.003081f, 0.001507f},
|
||||
{0.002218f, 0.005712f, 0.001563f},
|
||||
{0.053010f, -0.019844f, 0.008683f},
|
||||
{-0.002507f, 0.005384f, 0.000938f},
|
||||
{-0.002177f, -0.011366f, 0.003559f},
|
||||
{-0.000261f, 0.015121f, -0.003240f},
|
||||
{-0.003944f, -0.002083f, 0.005043f},
|
||||
{-0.009138f, 0.011336f, 0.003781f},
|
||||
{0.011429f, 0.003985f, -0.003855f},
|
||||
{0.010518f, -0.005586f, 0.010131f},
|
||||
{0.007883f, 0.002912f, -0.001473f},
|
||||
{-0.003318f, -0.003160f, 0.003684f},
|
||||
{-0.034560f, -0.008740f, 0.012996f},
|
||||
{0.000166f, 0.001079f, -0.012153f},
|
||||
{0.017772f, 0.000937f, -0.011953f}};
|
||||
float flux2_latent_rgb_bias[3] = {-0.028738f, -0.098463f, -0.107619f};
|
||||
|
||||
// This one was taken straight from
|
||||
// https://github.com/Stability-AI/sd3.5/blob/8565799a3b41eb0c7ba976d18375f0f753f56402/sd3_impls.py#L288-L303
|
||||
// (MiT Licence)
|
||||
@ -128,16 +163,42 @@ const float sd_latent_rgb_proj[4][3] = {
|
||||
{-0.178022f, -0.200862f, -0.678514f}};
|
||||
float sd_latent_rgb_bias[3] = {-0.017478f, -0.055834f, -0.105825f};
|
||||
|
||||
void preview_latent_video(uint8_t* buffer, struct ggml_tensor* latents, const float (*latent_rgb_proj)[3], const float latent_rgb_bias[3], int width, int height, int frames, int dim) {
|
||||
void preview_latent_video(uint8_t* buffer, struct ggml_tensor* latents, const float (*latent_rgb_proj)[3], const float latent_rgb_bias[3], int patch_size) {
|
||||
size_t buffer_head = 0;
|
||||
|
||||
uint32_t latent_width = latents->ne[0];
|
||||
uint32_t latent_height = latents->ne[1];
|
||||
uint32_t dim = latents->ne[ggml_n_dims(latents) - 1];
|
||||
uint32_t frames = 1;
|
||||
if (ggml_n_dims(latents) == 4) {
|
||||
frames = latents->ne[2];
|
||||
}
|
||||
|
||||
uint32_t rgb_width = latent_width * patch_size;
|
||||
uint32_t rgb_height = latent_height * patch_size;
|
||||
|
||||
uint32_t unpatched_dim = dim / (patch_size * patch_size);
|
||||
|
||||
for (int k = 0; k < frames; k++) {
|
||||
for (int j = 0; j < height; j++) {
|
||||
for (int i = 0; i < width; i++) {
|
||||
size_t latent_id = (i * latents->nb[0] + j * latents->nb[1] + k * latents->nb[2]);
|
||||
for (int rgb_x = 0; rgb_x < rgb_width; rgb_x++) {
|
||||
for (int rgb_y = 0; rgb_y < rgb_height; rgb_y++) {
|
||||
int latent_x = rgb_x / patch_size;
|
||||
int latent_y = rgb_y / patch_size;
|
||||
|
||||
int channel_offset = 0;
|
||||
if (patch_size > 1) {
|
||||
channel_offset = ((rgb_y % patch_size) * patch_size + (rgb_x % patch_size));
|
||||
}
|
||||
|
||||
size_t latent_id = (latent_x * latents->nb[0] + latent_y * latents->nb[1] + k * latents->nb[2]);
|
||||
|
||||
// should be incremented by 1 for each pixel
|
||||
size_t pixel_id = k * rgb_width * rgb_height + rgb_y * rgb_width + rgb_x;
|
||||
|
||||
float r = 0, g = 0, b = 0;
|
||||
if (latent_rgb_proj != nullptr) {
|
||||
for (int d = 0; d < dim; d++) {
|
||||
float value = *(float*)((char*)latents->data + latent_id + d * latents->nb[ggml_n_dims(latents) - 1]);
|
||||
for (int d = 0; d < unpatched_dim; d++) {
|
||||
float value = *(float*)((char*)latents->data + latent_id + (d * patch_size * patch_size + channel_offset) * latents->nb[ggml_n_dims(latents) - 1]);
|
||||
r += value * latent_rgb_proj[d][0];
|
||||
g += value * latent_rgb_proj[d][1];
|
||||
b += value * latent_rgb_proj[d][2];
|
||||
@ -164,9 +225,9 @@ void preview_latent_video(uint8_t* buffer, struct ggml_tensor* latents, const fl
|
||||
g = g >= 0 ? g <= 1 ? g : 1 : 0;
|
||||
b = b >= 0 ? b <= 1 ? b : 1 : 0;
|
||||
|
||||
buffer[buffer_head++] = (uint8_t)(r * 255);
|
||||
buffer[buffer_head++] = (uint8_t)(g * 255);
|
||||
buffer[buffer_head++] = (uint8_t)(b * 255);
|
||||
buffer[pixel_id * 3 + 0] = (uint8_t)(r * 255);
|
||||
buffer[pixel_id * 3 + 1] = (uint8_t)(g * 255);
|
||||
buffer[pixel_id * 3 + 2] = (uint8_t)(b * 255);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@ -1326,10 +1326,17 @@ public:
|
||||
uint32_t dim = latents->ne[ggml_n_dims(latents) - 1];
|
||||
|
||||
if (preview_mode == PREVIEW_PROJ) {
|
||||
int64_t patch_sz = 1;
|
||||
const float(*latent_rgb_proj)[channel] = nullptr;
|
||||
float* latent_rgb_bias = nullptr;
|
||||
|
||||
if (dim == 48) {
|
||||
if (dim == 128) {
|
||||
if (sd_version_is_flux2(version)) {
|
||||
latent_rgb_proj = flux2_latent_rgb_proj;
|
||||
latent_rgb_bias = flux2_latent_rgb_bias;
|
||||
patch_sz = 2;
|
||||
}
|
||||
} else if (dim == 48) {
|
||||
if (sd_version_is_wan(version)) {
|
||||
latent_rgb_proj = wan_22_latent_rgb_proj;
|
||||
latent_rgb_bias = wan_22_latent_rgb_bias;
|
||||
@ -1382,12 +1389,15 @@ public:
|
||||
frames = latents->ne[2];
|
||||
}
|
||||
|
||||
uint8_t* data = (uint8_t*)malloc(frames * width * height * channel * sizeof(uint8_t));
|
||||
uint32_t img_width = width * patch_sz;
|
||||
uint32_t img_height = height * patch_sz;
|
||||
|
||||
preview_latent_video(data, latents, latent_rgb_proj, latent_rgb_bias, width, height, frames, dim);
|
||||
uint8_t* data = (uint8_t*)malloc(frames * img_width * img_height * channel * sizeof(uint8_t));
|
||||
|
||||
preview_latent_video(data, latents, latent_rgb_proj, latent_rgb_bias, patch_sz);
|
||||
sd_image_t* images = (sd_image_t*)malloc(frames * sizeof(sd_image_t));
|
||||
for (int i = 0; i < frames; i++) {
|
||||
images[i] = {width, height, channel, data + i * width * height * channel};
|
||||
images[i] = {img_width, img_height, channel, data + i * img_width * img_height * channel};
|
||||
}
|
||||
step_callback(step, frames, images, is_noisy, step_callback_data);
|
||||
free(data);
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user