Compare commits

...

5 Commits

30 changed files with 2322 additions and 1487 deletions

.gitignore vendored

@@ -12,3 +12,4 @@ test/
 output*.png
 models*
 *.log
+preview.png


@@ -111,7 +111,7 @@ struct FrozenCLIPEmbedderWithCustomWords : public Conditioner {
     bool load_embedding(std::string embd_name, std::string embd_path, std::vector<int32_t>& bpe_tokens) {
         // the order matters
         ModelLoader model_loader;
-        if (!model_loader.init_from_file(embd_path)) {
+        if (!model_loader.init_from_file_and_convert_name(embd_path)) {
            LOG_ERROR("embedding '%s' failed", embd_name.c_str());
            return false;
        }
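This `init_from_file` to `init_from_file_and_convert_name` rename recurs across the runners in this comparison (ControlNet, ESRGAN, Flux, MMDiT below). Per the new ModelLoader code later in this diff, the renamed entry point simply wraps the old initializer and then normalizes tensor names in a second pass; a minimal sketch of the equivalent two-step call:

```cpp
// Sketch of what the renamed call does, per model.cpp in this diff:
// parse the file first (names kept as stored on disk), then rewrite
// tensor names to sd.cpp's internal naming scheme.
ModelLoader model_loader;
if (model_loader.init_from_file(embd_path)) {
    model_loader.convert_tensors_name();
}
```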


@@ -442,7 +442,7 @@ struct ControlNet : public GGMLRunner {
        std::set<std::string> ignore_tensors;
        ModelLoader model_loader;
-       if (!model_loader.init_from_file(file_path)) {
+       if (!model_loader.init_from_file_and_convert_name(file_path)) {
            LOG_ERROR("init control net model loader from file failed: '%s'", file_path.c_str());
            return false;
        }


@@ -1,40 +1,66 @@
-# Running distilled models: SSD1B and SD1.x with tiny U-Nets
+# Running distilled models: SSD1B and SDx.x with tiny U-Nets
 ## Preface
-This kind of models have a reduced U-Net part.
-Unlike other SDXL models the U-Net of SSD1B has only one middle block and lesser attention layers in up and down blocks, resulting in relatively smaller files. Running these models saves more than 33% of the time. For more details, refer to Segmind's paper on https://arxiv.org/abs/2401.02677v1 .
-Unlike other SD 1.x models Tiny-UNet models consist of only 6 U-Net blocks, resulting in relatively smaller files (approximately 1 GB). Running these models saves almost 50% of the time. For more details, refer to the paper: https://arxiv.org/pdf/2305.15798.pdf .
+These models feature a reduced U-Net architecture. Unlike standard SDXL models, the SSD-1B U-Net contains only one middle block and fewer attention layers in its up- and down-blocks, resulting in significantly smaller file sizes. Using these models can reduce inference time by more than 33%. For more details, refer to Segmind's paper: https://arxiv.org/abs/2401.02677v1.
+Similarly, SD1.x- and SD2.x-style models with a tiny U-Net consist of only 6 U-Net blocks, leading to very small files and time savings of up to 50%. For more information, see the paper: https://arxiv.org/pdf/2305.15798.pdf.
 ## SSD1B
-Unfortunately not all of this models follow the standard model parameter naming mapping.
-Anyway there are some very useful SSD1B models available online, such as:
+Note that not all of these models follow the standard parameter naming conventions. However, several useful SSD-1B models are available online, such as:
 * https://huggingface.co/segmind/SSD-1B/resolve/main/SSD-1B-A1111.safetensors
 * https://huggingface.co/hassenhamdi/SSD-1B-fp8_e4m3fn/resolve/main/SSD-1B_fp8_e4m3fn.safetensors
-Also there are useful LORAs available:
+Useful LoRAs are also available:
 * https://huggingface.co/seungminh/lora-swarovski-SSD-1B/resolve/main/pytorch_lora_weights.safetensors
 * https://huggingface.co/kylielee505/mylcmlorassd/resolve/main/pytorch_lora_weights.safetensors
-You can use this files **out-of-the-box** - unlike models in next section.
+These files can be used out-of-the-box, unlike the models described in the next section.
-## SD1.x with tiny U-Nets
+## SD1.x, SD2.x with tiny U-Nets
-There are some Tiny SD 1.x models available online, such as:
+These models require conversion before use. You will need a Python script provided by the diffusers team, available on GitHub:
+* https://raw.githubusercontent.com/huggingface/diffusers/refs/heads/main/scripts/convert_diffusers_to_original_stable_diffusion.py
+### SD2.x
+NotaAI provides the following model online:
+* https://huggingface.co/nota-ai/bk-sdm-v2-tiny
+Creating a .safetensors file involves two steps. First, run this short Python script to download the model from Hugging Face:
+```python
+from diffusers import StableDiffusionPipeline
+pipe = StableDiffusionPipeline.from_pretrained("nota-ai/bk-sdm-v2-tiny", cache_dir="./")
+```
+Second, create the .safetensors file by running:
+```bash
+python convert_diffusers_to_original_stable_diffusion.py \
+    --model_path models--nota-ai--bk-sdm-v2-tiny/snapshots/68277af553777858cd47e133f92e4db47321bc74 \
+    --checkpoint_path bk-sdm-v2-tiny.safetensors --half --use_safetensors
+```
+This will generate the file **bk-sdm-v2-tiny.safetensors**, which is now ready for use with sd.cpp.
+### SD1.x
+Several Tiny SD 1.x models are available online, such as:
 * https://huggingface.co/segmind/tiny-sd
 * https://huggingface.co/segmind/portrait-finetuned
 * https://huggingface.co/nota-ai/bk-sdm-tiny
-These models need some conversion, for example because partially tensors are **non contiguous** stored. To create a usable checkpoint file, follow these **easy** steps:
-### Download model from Hugging Face
-Download the model using Python on your computer, for example this way:
+These models also require conversion, partly because some tensors are stored in a non-contiguous manner. To create a usable checkpoint file, follow these simple steps:
+Download and prepare the model using Python:
+##### Download the model using Python on your computer, for example this way:
 ```python
 import torch

@@ -46,35 +72,22 @@ for param in unet.parameters():
 pipe.save_pretrained("segmindtiny-sd", safe_serialization=True)
 ```
-### Convert that to a ckpt file
-To convert the downloaded model to a checkpoint file, you need another Python script. Download the conversion script from here:
-* https://raw.githubusercontent.com/huggingface/diffusers/refs/heads/main/scripts/convert_diffusers_to_original_stable_diffusion.py
-### Run convert script
-Now, run that conversion script:
+##### Run the conversion script:
 ```bash
 python convert_diffusers_to_original_stable_diffusion.py \
     --model_path ./segmindtiny-sd \
     --checkpoint_path ./segmind_tiny-sd.ckpt --half
 ```
-The file **segmind_tiny-sd.ckpt** will be generated and is now ready to use with sd.cpp
-You can follow a similar process for other models mentioned above from Hugging Face.
+The file segmind_tiny-sd.ckpt will be generated and is now ready for use with sd.cpp. You can follow a similar process for the other models mentioned above.
-### Another ckpt file on the net
-There is another model file available online:
+### Another available .ckpt file:
 * https://huggingface.co/ClashSAN/small-sd/resolve/main/tinySDdistilled.ckpt
-If you want to use that, you have to adjust some **non-contiguous tensors** first:
+To use this file, you must first adjust its non-contiguous tensors:
 ```python
 import torch
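The hunk above cuts off after `import torch`, so the adjustment script itself is not shown in this comparison. As a rough, hypothetical sketch (not the author's exact script) of making every tensor in the checkpoint contiguous with standard PyTorch calls:

```python
import torch

# Hypothetical sketch: load the checkpoint, replace any non-contiguous
# tensors with contiguous copies, and save the result.
ckpt = torch.load("tinySDdistilled.ckpt", map_location="cpu")
state_dict = ckpt.get("state_dict", ckpt)  # some checkpoints nest under "state_dict"

for key, value in state_dict.items():
    if isinstance(value, torch.Tensor) and not value.is_contiguous():
        state_dict[key] = value.contiguous()

torch.save(ckpt, "tinySDdistilled-contiguous.ckpt")
```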


@@ -40,7 +40,7 @@ Running PMV2 is now a two-step process:
 ```
 python face_detect.py input_image_dir
 ```
-An ```id_embeds.safetensors``` file will be generated in ```input_images_dir```
+An ```id_embeds.bin``` file will be generated in ```input_images_dir```

 **Note: this step is only needed to run once; the same ```id_embeds``` can be reused**

@@ -48,6 +48,6 @@ An ```id_embeds.safetensors``` file will be generated in ```input_images_dir```
 You can download ```photomaker-v2.safetensors``` from [here](https://huggingface.co/bssrdf/PhotoMakerV2)
-- All the command line parameters from Version 1 remain the same for Version 2
+- All the command line parameters from Version 1 remain the same for Version 2, plus one extra pointing to a valid ```id_embeds``` file: --pm-id-embed-path [path_to_id_embeds.bin]
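For illustration, the two-step flow with the new flag might look like this (the `sd` binary name and the elided v1 generation flags are placeholders):

```bash
# Step 1 (needed only once per input set): detect faces, write id_embeds.bin
python face_detect.py input_images_dir

# Step 2: generate as with PhotoMaker v1, adding the new flag
./sd ... --pm-id-embed-path input_images_dir/id_embeds.bin
```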


@@ -169,7 +169,7 @@ struct ESRGAN : public GGMLRunner {
        LOG_INFO("loading esrgan from '%s'", file_path.c_str());
        ModelLoader model_loader;
-       if (!model_loader.init_from_file(file_path)) {
+       if (!model_loader.init_from_file_and_convert_name(file_path)) {
            LOG_ERROR("init esrgan model loader from file failed: '%s'", file_path.c_str());
            return false;
        }


@@ -32,6 +32,7 @@ Options:
 -o, --output <string>                  path to write result image to (default: ./output.png)
 -p, --prompt <string>                  the prompt to render
 -n, --negative-prompt <string>         the negative prompt (default: "")
+--preview-path <string>                path to write preview image to (default: ./preview.png)
 --upscale-model <string>               path to esrgan model.
 -t, --threads <int>                    number of threads to use during computation (default: -1). If threads <= 0, then threads will be set to the number of
                                        CPU physical cores
@@ -48,6 +49,8 @@ Options:
 --fps <int>                            fps (default: 24)
 --timestep-shift <int>                 shift timestep for NitroFusion models (default: 0). recommended N for NitroSD-Realism around 250 and 500 for
                                        NitroSD-Vibrant
+--preview-interval <int>               interval in denoising steps between consecutive updates of the image preview file (default is 1, meaning updating at
+                                       every step)
 --cfg-scale <float>                    unconditional guidance scale: (default: 7.0)
 --img-cfg-scale <float>                image guidance scale for inpaint or instruct-pix2pix models: (default: same as --cfg-scale)
 --guidance <float>                     distilled guidance scale for models with guidance input (default: 3.5)
@@ -86,6 +89,8 @@ Options:
 --chroma-enable-t5-mask                enable t5 mask for chroma
 --increase-ref-index                   automatically increase the indices of references images based on the order they are listed (starting with 1).
 --disable-auto-resize-ref-image        disable auto resize of ref images
+--taesd-preview-only                   prevents usage of taesd for decoding the final image. (for use with --preview tae)
+--preview-noisy                        enables previewing noisy inputs of the models rather than the denoised outputs
 -M, --mode                             run mode, one of [img_gen, vid_gen, upscale, convert], default: img_gen
 --type                                 weight type (examples: f32, f16, q4_0, q4_1, q5_0, q5_1, q8_0, q2_K, q3_K, q4_K). If not specified, the default is the
                                        type of the weight file
@@ -107,4 +112,5 @@ Options:
 --vae-tile-size                        tile size for vae tiling, format [X]x[Y] (default: 32x32)
 --vae-relative-tile-size               relative tile size for vae tiling, format [X]x[Y], in fraction of image size if < 1, in number of tiles per dim if >=1
                                        (overrides --vae-tile-size)
+--preview                              preview method. must be one of the following [none, proj, tae, vae] (default is none)
 ```
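As a hypothetical example of how the new preview options compose (model and prompt flags elided):

```bash
# Decode a TAESD preview into ./preview.png every 2 denoising steps,
# while keeping the final image decode on the full VAE
./sd ... -p "a lovely cat" \
    --preview tae --preview-interval 2 \
    --preview-path preview.png --taesd-preview-only
```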


@@ -46,6 +46,13 @@ const char* modes_str[] = {
 };
 #define SD_ALL_MODES_STR "img_gen, vid_gen, convert, upscale"

+const char* previews_str[] = {
+    "none",
+    "proj",
+    "tae",
+    "vae",
+};
+
 enum SDMode {
     IMG_GEN,
     VID_GEN,

@@ -135,6 +142,12 @@ struct SDParams {
     sd_tiling_params_t vae_tiling_params = {false, 0, 0, 0.5f, 0.0f, 0.0f};
     bool force_sdxl_vae_conv_scale       = false;

+    preview_t preview_method = PREVIEW_NONE;
+    int preview_interval     = 1;
+    std::string preview_path = "preview.png";
+    bool taesd_preview       = false;
+    bool preview_noisy       = false;
+
     SDParams() {
         sd_sample_params_init(&sample_params);
         sd_sample_params_init(&high_noise_sample_params);

@@ -210,6 +223,8 @@ void print_params(SDParams params) {
     printf("    video_frames: %d\n", params.video_frames);
     printf("    vace_strength: %.2f\n", params.vace_strength);
     printf("    fps: %d\n", params.fps);
+    printf("    preview_mode: %s (%s)\n", previews_str[params.preview_method], params.preview_noisy ? "noisy" : "denoised");
+    printf("    preview_interval: %d\n", params.preview_interval);
     free(sample_params_str);
     free(high_noise_sample_params_str);
 }
@@ -589,6 +604,10 @@ void parse_args(int argc, const char** argv, SDParams& params) {
         "--negative-prompt",
         "the negative prompt (default: \"\")",
         &params.negative_prompt},
+       {"",
+        "--preview-path",
+        "path to write preview image to (default: ./preview.png)",
+        &params.preview_path},
        {"",
         "--upscale-model",
         "path to esrgan model.",

@@ -647,6 +666,10 @@ void parse_args(int argc, const char** argv, SDParams& params) {
         "shift timestep for NitroFusion models (default: 0). "
         "recommended N for NitroSD-Realism around 250 and 500 for NitroSD-Vibrant",
         &params.sample_params.shifted_timestep},
+       {"",
+        "--preview-interval",
+        "interval in denoising steps between consecutive updates of the image preview file (default is 1, meaning updating at every step)",
+        &params.preview_interval},
    };

    options.float_options = {

@@ -801,7 +824,14 @@ void parse_args(int argc, const char** argv, SDParams& params) {
        "--disable-auto-resize-ref-image",
        "disable auto resize of ref images",
        false, &params.auto_resize_ref_image},
-   };
+       {"",
+        "--taesd-preview-only",
+        std::string("prevents usage of taesd for decoding the final image. (for use with --preview ") + previews_str[PREVIEW_TAE] + ")",
+        true, &params.taesd_preview},
+       {"",
+        "--preview-noisy",
+        "enables previewing noisy inputs of the models rather than the denoised outputs",
+        true, &params.preview_noisy}};

    auto on_mode_arg = [&](int argc, const char** argv, int index) {
        if (++index >= argc) {

@@ -1046,6 +1076,26 @@ void parse_args(int argc, const char** argv, SDParams& params) {
        return 1;
    };
+   auto on_preview_arg = [&](int argc, const char** argv, int index) {
+       if (++index >= argc) {
+           return -1;
+       }
+       const char* preview = argv[index];
+       int preview_method  = -1;
+       for (int m = 0; m < PREVIEW_COUNT; m++) {
+           if (!strcmp(preview, previews_str[m])) {
+               preview_method = m;
+           }
+       }
+       if (preview_method == -1) {
+           fprintf(stderr, "error: preview method %s\n", preview);
+           return -1;
+       }
+       params.preview_method = (preview_t)preview_method;
+       return 1;
+   };
+
    options.manual_options = {
        {"-M",
         "--mode",

@@ -1110,6 +1160,10 @@ void parse_args(int argc, const char** argv, SDParams& params) {
         "--vae-relative-tile-size",
         "relative tile size for vae tiling, format [X]x[Y], in fraction of image size if < 1, in number of tiles per dim if >=1 (overrides --vae-tile-size)",
         on_relative_tile_size_arg},
+       {"",
+        "--preview",
+        std::string("preview method. must be one of the following [") + previews_str[0] + ", " + previews_str[1] + ", " + previews_str[2] + ", " + previews_str[3] + "] (default is " + previews_str[PREVIEW_NONE] + ")\n",
+        on_preview_arg},
    };

    if (!parse_options(argc, argv, options)) {

@@ -1452,15 +1506,50 @@ bool load_images_from_dir(const std::string dir,
     return true;
 }
+const char* preview_path;
+float preview_fps;
+
+void step_callback(int step, int frame_count, sd_image_t* image, bool is_noisy) {
+    (void)step;
+    (void)is_noisy;
+    // is_noisy is set to true if the preview corresponds to noisy latents, false if it's denoised latents
+    // unused in this app, it will either be always noisy or always denoised here
+    if (frame_count == 1) {
+        stbi_write_png(preview_path, image->width, image->height, image->channel, image->data, 0);
+    } else {
+        create_mjpg_avi_from_sd_images(preview_path, image, frame_count, preview_fps);
+    }
+}
+
 int main(int argc, const char* argv[]) {
     SDParams params;
     parse_args(argc, argv, params);

+    preview_path = params.preview_path.c_str();
+    if (params.video_frames > 4) {
+        size_t last_dot_pos   = params.preview_path.find_last_of(".");
+        std::string base_path = params.preview_path;
+        std::string file_ext  = "";
+        if (last_dot_pos != std::string::npos) {  // filename has extension
+            base_path = params.preview_path.substr(0, last_dot_pos);
+            file_ext  = params.preview_path.substr(last_dot_pos);
+            std::transform(file_ext.begin(), file_ext.end(), file_ext.begin(), ::tolower);
+        }
+        if (file_ext == ".png") {
+            base_path    = base_path + ".avi";
+            preview_path = base_path.c_str();
+        }
+    }
+    preview_fps = params.fps;
+    if (params.preview_method == PREVIEW_PROJ)
+        preview_fps /= 4.0f;
+
     params.sample_params.guidance.slg.layers                 = params.skip_layers.data();
     params.sample_params.guidance.slg.layer_count            = params.skip_layers.size();
     params.high_noise_sample_params.guidance.slg.layers      = params.high_noise_skip_layers.data();
     params.high_noise_sample_params.guidance.slg.layer_count = params.high_noise_skip_layers.size();

     sd_set_log_callback(sd_log_cb, (void*)&params);
+    sd_set_preview_callback((sd_preview_cb_t)step_callback, params.preview_method, params.preview_interval, !params.preview_noisy, params.preview_noisy);

     if (params.verbose) {
         print_params(params);

@@ -1654,6 +1743,7 @@ int main(int argc, const char* argv[]) {
         params.control_net_cpu,
         params.vae_on_cpu,
         params.diffusion_flash_attn,
+        params.taesd_preview,
         params.diffusion_conv_direct,
         params.vae_conv_direct,
         params.force_sdxl_vae_conv_scale,


@@ -1398,7 +1398,7 @@ namespace Flux {
        ggml_type model_data_type = GGML_TYPE_Q8_0;
        ModelLoader model_loader;
-       if (!model_loader.init_from_file(file_path, "model.diffusion_model.")) {
+       if (!model_loader.init_from_file_and_convert_name(file_path, "model.diffusion_model.")) {
            LOG_ERROR("init model loader from file failed: '%s'", file_path.c_str());
            return;
        }


@@ -875,7 +875,7 @@ __STATIC_INLINE__ void sd_tiling_non_square(ggml_tensor* input,
     ggml_tensor* input_tile  = ggml_new_tensor_4d(tiles_ctx, GGML_TYPE_F32, input_tile_size_x, input_tile_size_y, input->ne[2], input->ne[3]);
     ggml_tensor* output_tile = ggml_new_tensor_4d(tiles_ctx, GGML_TYPE_F32, output_tile_size_x, output_tile_size_y, output->ne[2], output->ne[3]);
     int num_tiles            = num_tiles_x * num_tiles_y;
-    LOG_INFO("processing %i tiles", num_tiles);
+    LOG_DEBUG("processing %i tiles", num_tiles);
     pretty_progress(0, num_tiles, 0.0f);
     int tile_count = 1;
     bool last_y = false, last_x = false;

@@ -1568,8 +1568,10 @@ protected:
     struct ggml_cgraph* get_compute_graph(get_graph_cb_t get_graph) {
        prepare_build_in_tensor_before();
        struct ggml_cgraph* gf = get_graph();
-       auto result = ggml_graph_node(gf, -1);
-       ggml_set_name(result, final_result_name.c_str());
+       if (ggml_graph_n_nodes(gf) > 0) {
+           auto result = ggml_graph_node(gf, -1);
+           ggml_set_name(result, final_result_name.c_str());
+       }
        prepare_build_in_tensor_after(gf);
        return gf;
    }

latent-preview.h Normal file

@@ -0,0 +1,173 @@
#include <cstddef>
#include <cstdint>
#include "ggml.h"
const float wan_21_latent_rgb_proj[16][3] = {
{0.015123f, -0.148418f, 0.479828f},
{0.003652f, -0.010680f, -0.037142f},
{0.212264f, 0.063033f, 0.016779f},
{0.232999f, 0.406476f, 0.220125f},
{-0.051864f, -0.082384f, -0.069396f},
{0.085005f, -0.161492f, 0.010689f},
{-0.245369f, -0.506846f, -0.117010f},
{-0.151145f, 0.017721f, 0.007207f},
{-0.293239f, -0.207936f, -0.421135f},
{-0.187721f, 0.050783f, 0.177649f},
{-0.013067f, 0.265964f, 0.166578f},
{0.028327f, 0.109329f, 0.108642f},
{-0.205343f, 0.043991f, 0.148914f},
{0.014307f, -0.048647f, -0.007219f},
{0.217150f, 0.053074f, 0.319923f},
{0.155357f, 0.083156f, 0.064780f}};
float wan_21_latent_rgb_bias[3] = {-0.270270f, -0.234976f, -0.456853f};
const float wan_22_latent_rgb_proj[48][3] = {
{0.017126f, -0.027230f, -0.019257f},
{-0.113739f, -0.028715f, -0.022885f},
{-0.000106f, 0.021494f, 0.004629f},
{-0.013273f, -0.107137f, -0.033638f},
{-0.000381f, 0.000279f, 0.025877f},
{-0.014216f, -0.003975f, 0.040528f},
{0.001638f, -0.000748f, 0.011022f},
{0.029238f, -0.006697f, 0.035933f},
{0.021641f, -0.015874f, 0.040531f},
{-0.101984f, -0.070160f, -0.028855f},
{0.033207f, -0.021068f, 0.002663f},
{-0.104711f, 0.121673f, 0.102981f},
{0.082647f, -0.004991f, 0.057237f},
{-0.027375f, 0.031581f, 0.006868f},
{-0.045434f, 0.029444f, 0.019287f},
{-0.046572f, -0.012537f, 0.006675f},
{0.074709f, 0.033690f, 0.025289f},
{-0.008251f, -0.002745f, -0.006999f},
{0.012685f, -0.061856f, -0.048658f},
{0.042304f, -0.007039f, 0.000295f},
{-0.007644f, -0.060843f, -0.033142f},
{0.159909f, 0.045628f, 0.367541f},
{0.095171f, 0.086438f, 0.010271f},
{0.006812f, 0.019643f, 0.029637f},
{0.003467f, -0.010705f, 0.014252f},
{-0.099681f, -0.066272f, -0.006243f},
{0.047357f, 0.037040f, 0.000185f},
{-0.041797f, -0.089225f, -0.032257f},
{0.008928f, 0.017028f, 0.018684f},
{-0.042255f, 0.016045f, 0.006849f},
{0.011268f, 0.036462f, 0.037387f},
{0.011553f, -0.016375f, -0.048589f},
{0.046266f, -0.027189f, 0.056979f},
{0.009640f, -0.017576f, 0.030324f},
{-0.045794f, -0.036083f, -0.010616f},
{0.022418f, 0.039783f, -0.032939f},
{-0.052714f, -0.015525f, 0.007438f},
{0.193004f, 0.223541f, 0.264175f},
{-0.059406f, -0.008188f, 0.022867f},
{-0.156742f, -0.263791f, -0.007385f},
{-0.015717f, 0.016570f, 0.033969f},
{0.037969f, 0.109835f, 0.200449f},
{-0.000782f, -0.009566f, -0.008058f},
{0.010709f, 0.052960f, -0.044195f},
{0.017271f, 0.045839f, 0.034569f},
{0.009424f, 0.013088f, -0.001714f},
{-0.024805f, -0.059378f, -0.033756f},
{-0.078293f, 0.029070f, 0.026129f}};
float wan_22_latent_rgb_bias[3] = {0.013160f, -0.096492f, -0.071323f};
const float flux_latent_rgb_proj[16][3] = {
{-0.041168f, 0.019917f, 0.097253f},
{0.028096f, 0.026730f, 0.129576f},
{0.065618f, -0.067950f, -0.014651f},
{-0.012998f, -0.014762f, 0.081251f},
{0.078567f, 0.059296f, -0.024687f},
{-0.015987f, -0.003697f, 0.005012f},
{0.033605f, 0.138999f, 0.068517f},
{-0.024450f, -0.063567f, -0.030101f},
{-0.040194f, -0.016710f, 0.127185f},
{0.112681f, 0.088764f, -0.041940f},
{-0.023498f, 0.093664f, 0.025543f},
{0.082899f, 0.048320f, 0.007491f},
{0.075712f, 0.074139f, 0.081965f},
{-0.143501f, 0.018263f, -0.136138f},
{-0.025767f, -0.082035f, -0.040023f},
{-0.111849f, -0.055589f, -0.032361f}};
float flux_latent_rgb_bias[3] = {0.024600f, -0.006937f, -0.008089f};
// This one was taken straight from
// https://github.com/Stability-AI/sd3.5/blob/8565799a3b41eb0c7ba976d18375f0f753f56402/sd3_impls.py#L288-L303
// (MIT License)
const float sd3_latent_rgb_proj[16][3] = {
{-0.0645f, 0.0177f, 0.1052f},
{0.0028f, 0.0312f, 0.0650f},
{0.1848f, 0.0762f, 0.0360f},
{0.0944f, 0.0360f, 0.0889f},
{0.0897f, 0.0506f, -0.0364f},
{-0.0020f, 0.1203f, 0.0284f},
{0.0855f, 0.0118f, 0.0283f},
{-0.0539f, 0.0658f, 0.1047f},
{-0.0057f, 0.0116f, 0.0700f},
{-0.0412f, 0.0281f, -0.0039f},
{0.1106f, 0.1171f, 0.1220f},
{-0.0248f, 0.0682f, -0.0481f},
{0.0815f, 0.0846f, 0.1207f},
{-0.0120f, -0.0055f, -0.0867f},
{-0.0749f, -0.0634f, -0.0456f},
{-0.1418f, -0.1457f, -0.1259f},
};
float sd3_latent_rgb_bias[3] = {0, 0, 0};
const float sdxl_latent_rgb_proj[4][3] = {
{0.258303f, 0.277640f, 0.329699f},
{-0.299701f, 0.105446f, 0.014194f},
{0.050522f, 0.186163f, -0.143257f},
{-0.211938f, -0.149892f, -0.080036f}};
float sdxl_latent_rgb_bias[3] = {0.144381f, -0.033313f, 0.007061f};
const float sd_latent_rgb_proj[4][3] = {
{0.337366f, 0.216344f, 0.257386f},
{0.165636f, 0.386828f, 0.046994f},
{-0.267803f, 0.237036f, 0.223517f},
{-0.178022f, -0.200862f, -0.678514f}};
float sd_latent_rgb_bias[3] = {-0.017478f, -0.055834f, -0.105825f};
void preview_latent_video(uint8_t* buffer, struct ggml_tensor* latents, const float (*latent_rgb_proj)[3], const float latent_rgb_bias[3], int width, int height, int frames, int dim) {
size_t buffer_head = 0;
for (int k = 0; k < frames; k++) {
for (int j = 0; j < height; j++) {
for (int i = 0; i < width; i++) {
size_t latent_id = (i * latents->nb[0] + j * latents->nb[1] + k * latents->nb[2]);
float r = 0, g = 0, b = 0;
if (latent_rgb_proj != nullptr) {
for (int d = 0; d < dim; d++) {
float value = *(float*)((char*)latents->data + latent_id + d * latents->nb[ggml_n_dims(latents) - 1]);
r += value * latent_rgb_proj[d][0];
g += value * latent_rgb_proj[d][1];
b += value * latent_rgb_proj[d][2];
}
} else {
// interpret first 3 channels as RGB
r = *(float*)((char*)latents->data + latent_id + 0 * latents->nb[ggml_n_dims(latents) - 1]);
g = *(float*)((char*)latents->data + latent_id + 1 * latents->nb[ggml_n_dims(latents) - 1]);
b = *(float*)((char*)latents->data + latent_id + 2 * latents->nb[ggml_n_dims(latents) - 1]);
}
if (latent_rgb_bias != nullptr) {
// bias
r += latent_rgb_bias[0];
g += latent_rgb_bias[1];
b += latent_rgb_bias[2];
}
// change range
r = r * .5f + .5f;
g = g * .5f + .5f;
b = b * .5f + .5f;
// clamp rgb values to [0,1] range
r = r >= 0 ? r <= 1 ? r : 1 : 0;
g = g >= 0 ? g <= 1 ? g : 1 : 0;
b = b >= 0 ? b <= 1 ? b : 1 : 0;
buffer[buffer_head++] = (uint8_t)(r * 255);
buffer[buffer_head++] = (uint8_t)(g * 255);
buffer[buffer_head++] = (uint8_t)(b * 255);
}
}
}
}
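In symbols, `preview_latent_video` maps each latent pixel's channel vector $z \in \mathbb{R}^{d}$ (with $d$ = 4, 16, or 48 depending on the model family) through the per-model projection $P \in \mathbb{R}^{d \times 3}$ and bias $b \in \mathbb{R}^{3}$ tabulated above, then rescales from roughly $[-1, 1]$ into byte range:

```latex
\mathrm{rgb} \;=\; 255 \cdot \operatorname{clamp}_{[0,1]}\!\left(\tfrac{1}{2}\,(P^{\top} z + b) + \tfrac{1}{2}\right)
```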

lora.hpp

File diff suppressed because it is too large


@@ -961,7 +961,7 @@ struct MMDiTRunner : public GGMLRunner {
        mmdit->get_param_tensors(tensors, "model.diffusion_model");
        ModelLoader model_loader;
-       if (!model_loader.init_from_file(file_path)) {
+       if (!model_loader.init_from_file_and_convert_name(file_path)) {
            LOG_ERROR("init model loader from file failed: '%s'", file_path.c_str());
            return;
        }

model.cpp

@@ -25,6 +25,7 @@
 #include "ggml-cpu.h"
 #include "ggml.h"
+#include "name_conversion.h"
 #include "stable-diffusion.h"

 #ifdef SD_USE_METAL
@@ -75,15 +76,6 @@ uint16_t read_short(uint8_t* buffer) {
 /*================================================= Preprocess ==================================================*/

-std::string self_attn_names[] = {
-    "self_attn.q_proj.weight",
-    "self_attn.k_proj.weight",
-    "self_attn.v_proj.weight",
-    "self_attn.q_proj.bias",
-    "self_attn.k_proj.bias",
-    "self_attn.v_proj.bias",
-};
-
 const char* unused_tensors[] = {
     "betas",
     "alphas_cumprod_prev",
@@ -97,9 +89,9 @@ const char* unused_tensors[] = {
     "posterior_mean_coef1",
     "posterior_mean_coef2",
     "cond_stage_model.transformer.text_model.embeddings.position_ids",
+    "cond_stage_model.1.model.text_model.embeddings.position_ids",
     "cond_stage_model.transformer.vision_model.embeddings.position_ids",
     "cond_stage_model.model.logit_scale",
-    "cond_stage_model.model.text_projection",
     "conditioner.embedders.0.transformer.text_model.embeddings.position_ids",
     "conditioner.embedders.0.model.logit_scale",
     "conditioner.embedders.1.model.logit_scale",

@@ -110,6 +102,7 @@ const char* unused_tensors[] = {
     "model_ema.diffusion_model",
     "embedding_manager",
     "denoiser.sigmas",
+    "edm_vpred.sigma_max",
     "text_encoders.t5xxl.transformer.encoder.embed_tokens.weight",  // only used during training
     "text_encoders.qwen2vl.output.weight",
     "text_encoders.qwen2vl.lm_head.",
@@ -124,622 +117,6 @@ bool is_unused_tensor(std::string name) {
     return false;
 }
std::unordered_map<std::string, std::string> open_clip_to_hf_clip_model = {
{"model.ln_final.bias", "transformer.text_model.final_layer_norm.bias"},
{"model.ln_final.weight", "transformer.text_model.final_layer_norm.weight"},
{"model.positional_embedding", "transformer.text_model.embeddings.position_embedding.weight"},
{"model.token_embedding.weight", "transformer.text_model.embeddings.token_embedding.weight"},
{"model.text_projection", "transformer.text_model.text_projection"},
{"model.visual.class_embedding", "transformer.vision_model.embeddings.class_embedding"},
{"model.visual.conv1.weight", "transformer.vision_model.embeddings.patch_embedding.weight"},
{"model.visual.ln_post.bias", "transformer.vision_model.post_layernorm.bias"},
{"model.visual.ln_post.weight", "transformer.vision_model.post_layernorm.weight"},
{"model.visual.ln_pre.bias", "transformer.vision_model.pre_layernorm.bias"},
{"model.visual.ln_pre.weight", "transformer.vision_model.pre_layernorm.weight"},
{"model.visual.positional_embedding", "transformer.vision_model.embeddings.position_embedding.weight"},
{"model.visual.proj", "transformer.visual_projection.weight"},
};
std::unordered_map<std::string, std::string> open_clip_to_hf_clip_resblock = {
{"attn.in_proj_bias", "self_attn.in_proj.bias"},
{"attn.in_proj_weight", "self_attn.in_proj.weight"},
{"attn.out_proj.bias", "self_attn.out_proj.bias"},
{"attn.out_proj.weight", "self_attn.out_proj.weight"},
{"ln_1.bias", "layer_norm1.bias"},
{"ln_1.weight", "layer_norm1.weight"},
{"ln_2.bias", "layer_norm2.bias"},
{"ln_2.weight", "layer_norm2.weight"},
{"mlp.c_fc.bias", "mlp.fc1.bias"},
{"mlp.c_fc.weight", "mlp.fc1.weight"},
{"mlp.c_proj.bias", "mlp.fc2.bias"},
{"mlp.c_proj.weight", "mlp.fc2.weight"},
};
std::unordered_map<std::string, std::string> cond_model_name_map = {
{"transformer.vision_model.pre_layrnorm.weight", "transformer.vision_model.pre_layernorm.weight"},
{"transformer.vision_model.pre_layrnorm.bias", "transformer.vision_model.pre_layernorm.bias"},
};
std::unordered_map<std::string, std::string> vae_decoder_name_map = {
{"first_stage_model.decoder.mid.attn_1.to_k.bias", "first_stage_model.decoder.mid.attn_1.k.bias"},
{"first_stage_model.decoder.mid.attn_1.to_k.weight", "first_stage_model.decoder.mid.attn_1.k.weight"},
{"first_stage_model.decoder.mid.attn_1.to_out.0.bias", "first_stage_model.decoder.mid.attn_1.proj_out.bias"},
{"first_stage_model.decoder.mid.attn_1.to_out.0.weight", "first_stage_model.decoder.mid.attn_1.proj_out.weight"},
{"first_stage_model.decoder.mid.attn_1.to_q.bias", "first_stage_model.decoder.mid.attn_1.q.bias"},
{"first_stage_model.decoder.mid.attn_1.to_q.weight", "first_stage_model.decoder.mid.attn_1.q.weight"},
{"first_stage_model.decoder.mid.attn_1.to_v.bias", "first_stage_model.decoder.mid.attn_1.v.bias"},
{"first_stage_model.decoder.mid.attn_1.to_v.weight", "first_stage_model.decoder.mid.attn_1.v.weight"},
};
std::unordered_map<std::string, std::string> pmid_v2_name_map = {
{"pmid.qformer_perceiver.perceiver_resampler.layers.0.1.1.weight",
"pmid.qformer_perceiver.perceiver_resampler.layers.0.1.1.fc1.weight"},
{"pmid.qformer_perceiver.perceiver_resampler.layers.0.1.3.weight",
"pmid.qformer_perceiver.perceiver_resampler.layers.0.1.1.fc2.weight"},
{"pmid.qformer_perceiver.perceiver_resampler.layers.1.1.1.weight",
"pmid.qformer_perceiver.perceiver_resampler.layers.1.1.1.fc1.weight"},
{"pmid.qformer_perceiver.perceiver_resampler.layers.1.1.3.weight",
"pmid.qformer_perceiver.perceiver_resampler.layers.1.1.1.fc2.weight"},
{"pmid.qformer_perceiver.perceiver_resampler.layers.2.1.1.weight",
"pmid.qformer_perceiver.perceiver_resampler.layers.2.1.1.fc1.weight"},
{"pmid.qformer_perceiver.perceiver_resampler.layers.2.1.3.weight",
"pmid.qformer_perceiver.perceiver_resampler.layers.2.1.1.fc2.weight"},
{"pmid.qformer_perceiver.perceiver_resampler.layers.3.1.1.weight",
"pmid.qformer_perceiver.perceiver_resampler.layers.3.1.1.fc1.weight"},
{"pmid.qformer_perceiver.perceiver_resampler.layers.3.1.3.weight",
"pmid.qformer_perceiver.perceiver_resampler.layers.3.1.1.fc2.weight"},
{"pmid.qformer_perceiver.token_proj.0.bias",
"pmid.qformer_perceiver.token_proj.fc1.bias"},
{"pmid.qformer_perceiver.token_proj.2.bias",
"pmid.qformer_perceiver.token_proj.fc2.bias"},
{"pmid.qformer_perceiver.token_proj.0.weight",
"pmid.qformer_perceiver.token_proj.fc1.weight"},
{"pmid.qformer_perceiver.token_proj.2.weight",
"pmid.qformer_perceiver.token_proj.fc2.weight"},
};
std::unordered_map<std::string, std::string> qwenvl_name_map{
{"token_embd.", "model.embed_tokens."},
{"blk.", "model.layers."},
{"attn_q.", "self_attn.q_proj."},
{"attn_k.", "self_attn.k_proj."},
{"attn_v.", "self_attn.v_proj."},
{"attn_output.", "self_attn.o_proj."},
{"attn_norm.", "input_layernorm."},
{"ffn_down.", "mlp.down_proj."},
{"ffn_gate.", "mlp.gate_proj."},
{"ffn_up.", "mlp.up_proj."},
{"ffn_norm.", "post_attention_layernorm."},
{"output_norm.", "model.norm."},
};
std::unordered_map<std::string, std::string> qwenvl_vision_name_map{
{"mm.", "merger.mlp."},
{"v.post_ln.", "merger.ln_q."},
{"v.patch_embd.weight", "patch_embed.proj.0.weight"},
{"patch_embed.proj.0.weight.1", "patch_embed.proj.1.weight"},
{"v.patch_embd.weight.1", "patch_embed.proj.1.weight"},
{"v.blk.", "blocks."},
{"attn_q.", "attn.q_proj."},
{"attn_k.", "attn.k_proj."},
{"attn_v.", "attn.v_proj."},
{"attn_out.", "attn.proj."},
{"ffn_down.", "mlp.down_proj."},
{"ffn_gate.", "mlp.gate_proj."},
{"ffn_up.", "mlp.up_proj."},
{"ln1.", "norm1."},
{"ln2.", "norm2."},
};
std::string convert_cond_model_name(const std::string& name) {
std::string new_name = name;
std::string prefix;
if (contains(new_name, ".enc.")) {
// llama.cpp naming convention for T5
size_t pos = new_name.find(".enc.");
if (pos != std::string::npos) {
new_name.replace(pos, 5, ".encoder.");
}
pos = new_name.find("blk.");
if (pos != std::string::npos) {
new_name.replace(pos, 4, "block.");
}
pos = new_name.find("output_norm.");
if (pos != std::string::npos) {
new_name.replace(pos, 12, "final_layer_norm.");
}
pos = new_name.find("attn_k.");
if (pos != std::string::npos) {
new_name.replace(pos, 7, "layer.0.SelfAttention.k.");
}
pos = new_name.find("attn_v.");
if (pos != std::string::npos) {
new_name.replace(pos, 7, "layer.0.SelfAttention.v.");
}
pos = new_name.find("attn_o.");
if (pos != std::string::npos) {
new_name.replace(pos, 7, "layer.0.SelfAttention.o.");
}
pos = new_name.find("attn_q.");
if (pos != std::string::npos) {
new_name.replace(pos, 7, "layer.0.SelfAttention.q.");
}
pos = new_name.find("attn_norm.");
if (pos != std::string::npos) {
new_name.replace(pos, 10, "layer.0.layer_norm.");
}
pos = new_name.find("ffn_norm.");
if (pos != std::string::npos) {
new_name.replace(pos, 9, "layer.1.layer_norm.");
}
pos = new_name.find("ffn_up.");
if (pos != std::string::npos) {
new_name.replace(pos, 7, "layer.1.DenseReluDense.wi_1.");
}
pos = new_name.find("ffn_down.");
if (pos != std::string::npos) {
new_name.replace(pos, 9, "layer.1.DenseReluDense.wo.");
}
pos = new_name.find("ffn_gate.");
if (pos != std::string::npos) {
new_name.replace(pos, 9, "layer.1.DenseReluDense.wi_0.");
}
pos = new_name.find("attn_rel_b.");
if (pos != std::string::npos) {
new_name.replace(pos, 11, "layer.0.SelfAttention.relative_attention_bias.");
}
} else if (contains(name, "qwen2vl")) {
if (contains(name, "qwen2vl.visual")) {
for (auto kv : qwenvl_vision_name_map) {
size_t pos = new_name.find(kv.first);
if (pos != std::string::npos) {
new_name.replace(pos, kv.first.size(), kv.second);
}
}
} else {
for (auto kv : qwenvl_name_map) {
size_t pos = new_name.find(kv.first);
if (pos != std::string::npos) {
new_name.replace(pos, kv.first.size(), kv.second);
}
}
}
} else if (name == "text_encoders.t5xxl.transformer.token_embd.weight") {
new_name = "text_encoders.t5xxl.transformer.shared.weight";
}
if (starts_with(new_name, "conditioner.embedders.0.open_clip.")) {
prefix = "cond_stage_model.";
new_name = new_name.substr(strlen("conditioner.embedders.0.open_clip."));
} else if (starts_with(new_name, "conditioner.embedders.0.")) {
prefix = "cond_stage_model.";
new_name = new_name.substr(strlen("conditioner.embedders.0."));
} else if (starts_with(new_name, "conditioner.embedders.1.")) {
prefix = "cond_stage_model.1.";
new_name = new_name.substr(strlen("conditioner.embedders.0."));
} else if (starts_with(new_name, "cond_stage_model.")) {
prefix = "cond_stage_model.";
new_name = new_name.substr(strlen("cond_stage_model."));
} else if (ends_with(new_name, "vision_model.visual_projection.weight")) {
prefix = new_name.substr(0, new_name.size() - strlen("vision_model.visual_projection.weight"));
new_name = prefix + "visual_projection.weight";
return new_name;
} else if (ends_with(new_name, "transformer.text_projection.weight")) {
prefix = new_name.substr(0, new_name.size() - strlen("transformer.text_projection.weight"));
new_name = prefix + "transformer.text_model.text_projection";
return new_name;
} else {
return new_name;
}
if (new_name == "model.text_projection.weight") {
new_name = "transformer.text_model.text_projection";
}
if (open_clip_to_hf_clip_model.find(new_name) != open_clip_to_hf_clip_model.end()) {
new_name = open_clip_to_hf_clip_model[new_name];
}
if (cond_model_name_map.find(new_name) != cond_model_name_map.end()) {
new_name = cond_model_name_map[new_name];
}
std::string open_clip_resblock_prefix = "model.transformer.resblocks.";
std::string hf_clip_resblock_prefix = "transformer.text_model.encoder.layers.";
auto replace_suffix = [&]() {
if (new_name.find(open_clip_resblock_prefix) == 0) {
std::string remain = new_name.substr(open_clip_resblock_prefix.length());
std::string idx = remain.substr(0, remain.find("."));
std::string suffix = remain.substr(idx.length() + 1);
if (open_clip_to_hf_clip_resblock.find(suffix) != open_clip_to_hf_clip_resblock.end()) {
std::string new_suffix = open_clip_to_hf_clip_resblock[suffix];
new_name = hf_clip_resblock_prefix + idx + "." + new_suffix;
}
}
};
replace_suffix();
open_clip_resblock_prefix = "model.visual.transformer.resblocks.";
hf_clip_resblock_prefix = "transformer.vision_model.encoder.layers.";
replace_suffix();
return prefix + new_name;
}
std::string convert_vae_decoder_name(const std::string& name) {
if (vae_decoder_name_map.find(name) != vae_decoder_name_map.end()) {
return vae_decoder_name_map[name];
}
return name;
}
std::string convert_pmid_v2_name(const std::string& name) {
if (pmid_v2_name_map.find(name) != pmid_v2_name_map.end()) {
return pmid_v2_name_map[name];
}
return name;
}
/* If not an SDXL LoRA the "unet" prefix will have already been replaced by this
 * point and "te2" and "te1" don't seem to appear in non-SDXL, only "te_" */
std::string convert_sdxl_lora_name(std::string tensor_name) {
const std::pair<std::string, std::string> sdxl_lora_name_lookup[] = {
{"unet", "model_diffusion_model"},
{"te2", "cond_stage_model_1_transformer"},
{"te1", "cond_stage_model_transformer"},
{"text_encoder_2", "cond_stage_model_1_transformer"},
{"text_encoder", "cond_stage_model_transformer"},
};
for (auto& pair_i : sdxl_lora_name_lookup) {
if (tensor_name.compare(0, pair_i.first.length(), pair_i.first) == 0) {
tensor_name = std::regex_replace(tensor_name, std::regex(pair_i.first), pair_i.second);
break;
}
}
return tensor_name;
}
std::unordered_map<std::string, std::unordered_map<std::string, std::string>> suffix_conversion_underline = {
{
"attentions",
{
{"to_k", "k"},
{"to_q", "q"},
{"to_v", "v"},
{"to_out_0", "proj_out"},
{"group_norm", "norm"},
{"key", "k"},
{"query", "q"},
{"value", "v"},
{"proj_attn", "proj_out"},
},
},
{
"resnets",
{
{"conv1", "in_layers_2"},
{"conv2", "out_layers_3"},
{"norm1", "in_layers_0"},
{"norm2", "out_layers_0"},
{"time_emb_proj", "emb_layers_1"},
{"conv_shortcut", "skip_connection"},
},
},
};
std::unordered_map<std::string, std::unordered_map<std::string, std::string>> suffix_conversion_dot = {
{
"attentions",
{
{"to_k", "k"},
{"to_q", "q"},
{"to_v", "v"},
{"to_out.0", "proj_out"},
{"group_norm", "norm"},
{"key", "k"},
{"query", "q"},
{"value", "v"},
{"proj_attn", "proj_out"},
},
},
{
"resnets",
{
{"conv1", "in_layers.2"},
{"conv2", "out_layers.3"},
{"norm1", "in_layers.0"},
{"norm2", "out_layers.0"},
{"time_emb_proj", "emb_layers.1"},
{"conv_shortcut", "skip_connection"},
},
},
};
std::string convert_diffusers_name_to_compvis(std::string key, char seq) {
std::vector<std::string> m;
auto match = [](std::vector<std::string>& match_list, const std::regex& regex, const std::string& key) {
auto r = std::smatch{};
if (!std::regex_match(key, r, regex)) {
return false;
}
match_list.clear();
for (size_t i = 1; i < r.size(); ++i) {
match_list.push_back(r.str(i));
}
return true;
};
std::unordered_map<std::string, std::unordered_map<std::string, std::string>> suffix_conversion;
if (seq == '_') {
suffix_conversion = suffix_conversion_underline;
} else {
suffix_conversion = suffix_conversion_dot;
}
auto get_converted_suffix = [&suffix_conversion](const std::string& outer_key, const std::string& inner_key) {
auto outer_iter = suffix_conversion.find(outer_key);
if (outer_iter != suffix_conversion.end()) {
auto inner_iter = outer_iter->second.find(inner_key);
if (inner_iter != outer_iter->second.end()) {
return inner_iter->second;
}
}
return inner_key;
};
// convert attn to out
if (ends_with(key, "to_out")) {
key += format("%c0", seq);
}
// unet
if (match(m, std::regex(format("unet%cconv_in(.*)", seq)), key)) {
return format("model%cdiffusion_model%cinput_blocks%c0%c0", seq, seq, seq, seq) + m[0];
}
if (match(m, std::regex(format("unet%cconv%cout(.*)", seq, seq)), key)) {
return format("model%cdiffusion_model%cout%c2", seq, seq, seq) + m[0];
}
if (match(m, std::regex(format("unet%cconv_norm_out(.*)", seq)), key)) {
return format("model%cdiffusion_model%cout%c0", seq, seq, seq) + m[0];
}
if (match(m, std::regex(format("unet%ctime_embedding%clinear_(\\d+)(.*)", seq, seq)), key)) {
return format("model%cdiffusion_model%ctime_embed%c", seq, seq, seq) + std::to_string(std::stoi(m[0]) * 2 - 2) + m[1];
}
if (match(m, std::regex(format("unet%cadd_embedding%clinear_(\\d+)(.*)", seq, seq)), key)) {
return format("model%cdiffusion_model%clabel_emb%c0%c", seq, seq, seq, seq) + std::to_string(std::stoi(m[0]) * 2 - 2) + m[1];
}
if (match(m, std::regex(format("unet%cdown_blocks%c(\\d+)%c(attentions|resnets)%c(\\d+)%c(.+)", seq, seq, seq, seq, seq)), key)) {
std::string suffix = get_converted_suffix(m[1], m[3]);
// LOG_DEBUG("%s %s %s %s", m[0].c_str(), m[1].c_str(), m[2].c_str(), m[3].c_str());
return format("model%cdiffusion_model%cinput_blocks%c", seq, seq, seq) + std::to_string(1 + std::stoi(m[0]) * 3 + std::stoi(m[2])) + seq +
(m[1] == "attentions" ? "1" : "0") + seq + suffix;
}
if (match(m, std::regex(format("unet%cmid_block%c(attentions|resnets)%c(\\d+)%c(.+)", seq, seq, seq, seq)), key)) {
std::string suffix = get_converted_suffix(m[0], m[2]);
return format("model%cdiffusion_model%cmiddle_block%c", seq, seq, seq) + (m[0] == "attentions" ? "1" : std::to_string(std::stoi(m[1]) * 2)) +
seq + suffix;
}
if (match(m, std::regex(format("unet%cup_blocks%c(\\d+)%c(attentions|resnets)%c(\\d+)%c(.+)", seq, seq, seq, seq, seq)), key)) {
std::string suffix = get_converted_suffix(m[1], m[3]);
return format("model%cdiffusion_model%coutput_blocks%c", seq, seq, seq) + std::to_string(std::stoi(m[0]) * 3 + std::stoi(m[2])) + seq +
(m[1] == "attentions" ? "1" : "0") + seq + suffix;
}
if (match(m, std::regex(format("unet%cdown_blocks%c(\\d+)%cdownsamplers%c0%cconv", seq, seq, seq, seq, seq)), key)) {
return format("model%cdiffusion_model%cinput_blocks%c", seq, seq, seq) + std::to_string(3 + std::stoi(m[0]) * 3) + seq + "0" + seq + "op";
}
if (match(m, std::regex(format("unet%cup_blocks%c(\\d+)%cupsamplers%c0%cconv", seq, seq, seq, seq, seq)), key)) {
return format("model%cdiffusion_model%coutput_blocks%c", seq, seq, seq) + std::to_string(2 + std::stoi(m[0]) * 3) + seq +
(std::stoi(m[0]) > 0 ? "2" : "1") + seq + "conv";
}
// clip
if (match(m, std::regex(format("te%ctext_model%cencoder%clayers%c(\\d+)%c(.+)", seq, seq, seq, seq, seq)), key)) {
return format("cond_stage_model%ctransformer%ctext_model%cencoder%clayers%c", seq, seq, seq, seq, seq) + m[0] + seq + m[1];
}
if (match(m, std::regex(format("te%ctext_model(.*)", seq)), key)) {
return format("cond_stage_model%ctransformer%ctext_model", seq, seq) + m[0];
}
// clip-g
if (match(m, std::regex(format("te%c1%ctext_model%cencoder%clayers%c(\\d+)%c(.+)", seq, seq, seq, seq, seq, seq)), key)) {
return format("cond_stage_model%c1%ctransformer%ctext_model%cencoder%clayers%c", seq, seq, seq, seq, seq, seq) + m[0] + seq + m[1];
}
if (match(m, std::regex(format("te%c1%ctext_model(.*)", seq, seq)), key)) {
return format("cond_stage_model%c1%ctransformer%ctext_model", seq, seq, seq) + m[0];
}
if (match(m, std::regex(format("te%c1%ctext_projection", seq, seq)), key)) {
return format("cond_stage_model%c1%ctransformer%ctext_model%ctext_projection", seq, seq, seq, seq);
}
// vae
if (match(m, std::regex(format("vae%c(.*)%cconv_norm_out(.*)", seq, seq)), key)) {
return format("first_stage_model%c%s%cnorm_out%s", seq, m[0].c_str(), seq, m[1].c_str());
}
if (match(m, std::regex(format("vae%c(.*)%cmid_block%c(attentions|resnets)%c(\\d+)%c(.+)", seq, seq, seq, seq, seq)), key)) {
std::string suffix;
std::string block_name;
if (m[1] == "attentions") {
block_name = "attn";
suffix = get_converted_suffix(m[1], m[3]);
} else {
block_name = "block";
suffix = m[3];
}
return format("first_stage_model%c%s%cmid%c%s_%d%c%s",
seq, m[0].c_str(), seq, seq, block_name.c_str(), std::stoi(m[2]) + 1, seq, suffix.c_str());
}
if (match(m, std::regex(format("vae%c(.*)%cup_blocks%c(\\d+)%cresnets%c(\\d+)%c(.+)", seq, seq, seq, seq, seq, seq)), key)) {
std::string suffix = m[3];
if (suffix == "conv_shortcut") {
suffix = "nin_shortcut";
}
return format("first_stage_model%c%s%cup%c%d%cblock%c%s%c%s",
seq, m[0].c_str(), seq, seq, 3 - std::stoi(m[1]), seq, seq, m[2].c_str(), seq, suffix.c_str());
}
if (match(m, std::regex(format("vae%c(.*)%cdown_blocks%c(\\d+)%cdownsamplers%c0%cconv", seq, seq, seq, seq, seq, seq)), key)) {
return format("first_stage_model%c%s%cdown%c%d%cdownsample%cconv",
seq, m[0].c_str(), seq, seq, std::stoi(m[1]), seq, seq);
}
if (match(m, std::regex(format("vae%c(.*)%cdown_blocks%c(\\d+)%cresnets%c(\\d+)%c(.+)", seq, seq, seq, seq, seq, seq)), key)) {
std::string suffix = m[3];
if (suffix == "conv_shortcut") {
suffix = "nin_shortcut";
}
return format("first_stage_model%c%s%cdown%c%d%cblock%c%s%c%s",
seq, m[0].c_str(), seq, seq, std::stoi(m[1]), seq, seq, m[2].c_str(), seq, suffix.c_str());
}
if (match(m, std::regex(format("vae%c(.*)%cup_blocks%c(\\d+)%cupsamplers%c0%cconv", seq, seq, seq, seq, seq, seq)), key)) {
return format("first_stage_model%c%s%cup%c%d%cupsample%cconv",
seq, m[0].c_str(), seq, seq, 3 - std::stoi(m[1]), seq, seq);
}
if (match(m, std::regex(format("vae%c(.*)", seq)), key)) {
return format("first_stage_model%c", seq) + m[0];
}
return key;
}
std::string convert_tensor_name(std::string name) {
if (starts_with(name, "diffusion_model")) {
name = "model." + name;
}
if (starts_with(name, "model.diffusion_model.up_blocks.0.attentions.0.")) {
name.replace(0, sizeof("model.diffusion_model.up_blocks.0.attentions.0.") - 1,
"model.diffusion_model.output_blocks.0.1.");
}
if (starts_with(name, "model.diffusion_model.up_blocks.0.attentions.1.")) {
name.replace(0, sizeof("model.diffusion_model.up_blocks.0.attentions.1.") - 1,
"model.diffusion_model.output_blocks.1.1.");
}
// size_t pos = name.find("lora_A");
// if (pos != std::string::npos) {
// name.replace(pos, strlen("lora_A"), "lora_up");
// }
// pos = name.find("lora_B");
// if (pos != std::string::npos) {
// name.replace(pos, strlen("lora_B"), "lora_down");
// }
std::string new_name = name;
if (starts_with(name, "cond_stage_model.") ||
starts_with(name, "conditioner.embedders.") ||
starts_with(name, "text_encoders.") ||
ends_with(name, ".vision_model.visual_projection.weight") ||
starts_with(name, "qwen2vl")) {
new_name = convert_cond_model_name(name);
} else if (starts_with(name, "first_stage_model.decoder")) {
new_name = convert_vae_decoder_name(name);
} else if (starts_with(name, "pmid.qformer_perceiver")) {
new_name = convert_pmid_v2_name(name);
} else if (starts_with(name, "control_model.")) { // for controlnet pth models
size_t pos = name.find('.');
if (pos != std::string::npos) {
new_name = name.substr(pos + 1);
}
} else if (starts_with(name, "lora_")) { // for lora
size_t pos = name.find('.');
if (pos != std::string::npos) {
std::string name_without_network_parts = name.substr(5, pos - 5);
std::string network_part = name.substr(pos + 1);
// LOG_DEBUG("%s %s", name_without_network_parts.c_str(), network_part.c_str());
std::string new_key = convert_diffusers_name_to_compvis(name_without_network_parts, '_');
/* For dealing with the new SDXL LoRA tensor naming convention */
new_key = convert_sdxl_lora_name(new_key);
if (new_key.empty()) {
new_name = name;
} else {
new_name = "lora." + new_key + "." + network_part;
}
} else {
new_name = name;
}
} else if (ends_with(name, ".diff") || ends_with(name, ".diff_b")) {
new_name = "lora." + name;
} else if (contains(name, "lora_up") || contains(name, "lora_down") ||
contains(name, "lora.up") || contains(name, "lora.down") ||
contains(name, "lora_linear") || ends_with(name, ".alpha")) {
size_t pos = new_name.find(".processor");
if (pos != std::string::npos) {
new_name.replace(pos, strlen(".processor"), "");
}
// if (starts_with(new_name, "transformer.transformer_blocks") || starts_with(new_name, "transformer.single_transformer_blocks")) {
// new_name = "model.diffusion_model." + new_name;
// }
if (ends_with(name, ".alpha")) {
pos = new_name.rfind("alpha");
} else {
pos = new_name.rfind("lora");
}
if (pos != std::string::npos) {
std::string name_without_network_parts = new_name.substr(0, pos - 1);
std::string network_part = new_name.substr(pos);
// LOG_DEBUG("%s %s", name_without_network_parts.c_str(), network_part.c_str());
std::string new_key = convert_diffusers_name_to_compvis(name_without_network_parts, '.');
new_key = convert_sdxl_lora_name(new_key);
replace_all_chars(new_key, '.', '_');
size_t npos = network_part.rfind("_linear_layer");
if (npos != std::string::npos) {
network_part.replace(npos, strlen("_linear_layer"), "");
}
if (starts_with(network_part, "lora.")) {
network_part = "lora_" + network_part.substr(5);
}
if (new_key.size() > 0) {
new_name = "lora." + new_key + "." + network_part;
}
// LOG_DEBUG("new name: %s", new_name.c_str());
}
} else if (starts_with(name, "unet") || starts_with(name, "vae") || starts_with(name, "te")) { // for diffuser
size_t pos = name.find_last_of('.');
if (pos != std::string::npos) {
std::string name_without_network_parts = name.substr(0, pos);
std::string network_part = name.substr(pos + 1);
// LOG_DEBUG("%s %s", name_without_network_parts.c_str(), network_part.c_str());
std::string new_key = convert_diffusers_name_to_compvis(name_without_network_parts, '.');
if (new_key.empty()) {
new_name = name;
} else if (new_key == "cond_stage_model.1.transformer.text_model.text_projection") {
new_name = new_key;
} else {
new_name = new_key + "." + network_part;
}
} else {
new_name = name;
}
} else {
new_name = name;
}
// if (new_name != name) {
// LOG_DEBUG("%s => %s", name.c_str(), new_name.c_str());
// }
return new_name;
}
 float bf16_to_f32(uint16_t bfloat16) {
     uint32_t val_bits = (static_cast<uint32_t>(bfloat16) << 16);
     return *reinterpret_cast<float*>(&val_bits);
@@ -916,9 +293,7 @@ void convert_tensor(void* src,
 /*================================================= ModelLoader ==================================================*/

 void ModelLoader::add_tensor_storage(const TensorStorage& tensor_storage) {
-    TensorStorage copy            = tensor_storage;
-    copy.name                     = convert_tensor_name(copy.name);
-    tensor_storage_map[copy.name] = std::move(copy);
+    tensor_storage_map[tensor_storage.name] = tensor_storage;
 }

 bool is_zip_file(const std::string& file_path) {

@@ -1012,6 +387,31 @@ bool ModelLoader::init_from_file(const std::string& file_path, const std::string
     }
 }
void ModelLoader::convert_tensors_name() {
SDVersion version = (version_ == VERSION_COUNT) ? get_sd_version() : version_;
String2TensorStorage new_map;
for (auto& [_, tensor_storage] : tensor_storage_map) {
auto new_name = convert_tensor_name(tensor_storage.name, version);
// LOG_DEBUG("%s -> %s", tensor_storage.name.c_str(), new_name.c_str());
tensor_storage.name = new_name;
new_map[new_name] = std::move(tensor_storage);
}
tensor_storage_map.swap(new_map);
}
bool ModelLoader::init_from_file_and_convert_name(const std::string& file_path, const std::string& prefix, SDVersion version) {
if (version_ == VERSION_COUNT && version != VERSION_COUNT) {
version_ = version;
}
if (!init_from_file(file_path, prefix)) {
return false;
}
convert_tensors_name();
return true;
}
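Not part of the diff: a minimal caller sketch of the new two-phase API (path and wrapper function are illustrative). init_from_file() only records tensor metadata; convert_tensors_name() then rewrites every recorded name once, so version detection and later lookups operate on canonical names:

// hypothetical caller showing the intended call order (assumes model.h)
bool load_model(const std::string& path) {
    ModelLoader loader;
    if (!loader.init_from_file_and_convert_name(path)) {
        return false;  // metadata read or name conversion failed
    }
    SDVersion version = loader.get_sd_version();  // detection runs on converted names
    return version != VERSION_COUNT;
}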
/*================================================= GGUFModelLoader ==================================================*/
bool ModelLoader::init_from_gguf_file(const std::string& file_path, const std::string& prefix) {
@ -1259,32 +659,6 @@ bool ModelLoader::init_from_diffusers_file(const std::string& file_path, const s
if (!init_from_safetensors_file(unet_path, "unet.")) {
return false;
}
for (auto& [name, tensor_storage] : tensor_storage_map) {
if (name.find("add_embedding") != std::string::npos || name.find("label_emb") != std::string::npos) {
// probably SDXL
LOG_DEBUG("Fixing name for SDXL output blocks.2.2");
String2TensorStorage new_tensor_storage_map;
for (auto& [name, tensor_storage] : tensor_storage_map) {
int len = 34;
auto pos = tensor_storage.name.find("unet.up_blocks.0.upsamplers.0.conv");
if (pos == std::string::npos) {
len = 44;
pos = tensor_storage.name.find("model.diffusion_model.output_blocks.2.1.conv");
}
if (pos != std::string::npos) {
std::string new_name = "model.diffusion_model.output_blocks.2.2.conv" + name.substr(len);
LOG_DEBUG("NEW NAME: %s", new_name.c_str());
tensor_storage.name = new_name;
new_tensor_storage_map[new_name] = tensor_storage;
} else {
new_tensor_storage_map[name] = tensor_storage;
}
}
tensor_storage_map = new_tensor_storage_map;
break;
}
}
if (!init_from_safetensors_file(vae_path, "vae.")) {
LOG_WARN("Couldn't find working VAE in %s", file_path.c_str());
@ -1788,6 +1162,9 @@ SDVersion ModelLoader::get_sd_version() {
if (is_inpaint) {
return VERSION_SD2_INPAINT;
}
if (!has_middle_block_1) {
return VERSION_SD2_TINY_UNET;
}
return VERSION_SD2;
}
return VERSION_COUNT;
@ -1922,7 +1299,7 @@ bool ModelLoader::load_tensors(on_new_tensor_cb_t on_new_tensor_cb, int n_thread
int64_t start_time = ggml_time_ms();
std::vector<TensorStorage> processed_tensor_storages;
for (auto& [name, tensor_storage] : tensor_storage_map) { for (const auto& [name, tensor_storage] : tensor_storage_map) {
if (is_unused_tensor(tensor_storage.name)) {
continue;
}
@ -2391,6 +1768,7 @@ bool convert(const char* input_path, const char* vae_path, const char* output_pa
return false;
}
}
model_loader.convert_tensors_name();
bool success = model_loader.save_to_gguf_file(output_path, (ggml_type)output_type, tensor_type_rules);
return success;
}

model.h (17 changed lines)

@ -15,6 +15,7 @@
#include "ggml.h" #include "ggml.h"
#include "gguf.h" #include "gguf.h"
#include "json.hpp" #include "json.hpp"
#include "ordered_map.hpp"
#include "zip.h" #include "zip.h"
#define SD_MAX_DIMS 5 #define SD_MAX_DIMS 5
@ -26,6 +27,7 @@ enum SDVersion {
VERSION_SD1_TINY_UNET, VERSION_SD1_TINY_UNET,
VERSION_SD2, VERSION_SD2,
VERSION_SD2_INPAINT, VERSION_SD2_INPAINT,
VERSION_SD2_TINY_UNET,
VERSION_SDXL, VERSION_SDXL,
VERSION_SDXL_INPAINT, VERSION_SDXL_INPAINT,
VERSION_SDXL_PIX2PIX, VERSION_SDXL_PIX2PIX,
@ -52,7 +54,7 @@ static inline bool sd_version_is_sd1(SDVersion version) {
} }
static inline bool sd_version_is_sd2(SDVersion version) { static inline bool sd_version_is_sd2(SDVersion version) {
if (version == VERSION_SD2 || version == VERSION_SD2_INPAINT) { if (version == VERSION_SD2 || version == VERSION_SD2_INPAINT || version == VERSION_SD2_TINY_UNET) {
return true; return true;
} }
return false; return false;
@ -107,7 +109,11 @@ static inline bool sd_version_is_qwen_image(SDVersion version) {
} }
static inline bool sd_version_is_inpaint(SDVersion version) { static inline bool sd_version_is_inpaint(SDVersion version) {
if (version == VERSION_SD1_INPAINT || version == VERSION_SD2_INPAINT || version == VERSION_SDXL_INPAINT || version == VERSION_FLUX_FILL || version == VERSION_FLEX_2) { if (version == VERSION_SD1_INPAINT ||
version == VERSION_SD2_INPAINT ||
version == VERSION_SDXL_INPAINT ||
version == VERSION_FLUX_FILL ||
version == VERSION_FLEX_2) {
return true; return true;
} }
return false; return false;
@ -252,10 +258,11 @@ struct TensorStorage {
typedef std::function<bool(const TensorStorage&, ggml_tensor**)> on_new_tensor_cb_t; typedef std::function<bool(const TensorStorage&, ggml_tensor**)> on_new_tensor_cb_t;
typedef std::map<std::string, TensorStorage> String2TensorStorage; typedef OrderedMap<std::string, TensorStorage> String2TensorStorage;
class ModelLoader { class ModelLoader {
protected: protected:
SDVersion version_ = VERSION_COUNT;
std::vector<std::string> file_paths_; std::vector<std::string> file_paths_;
String2TensorStorage tensor_storage_map; String2TensorStorage tensor_storage_map;
@ -275,6 +282,10 @@ protected:
public: public:
bool init_from_file(const std::string& file_path, const std::string& prefix = ""); bool init_from_file(const std::string& file_path, const std::string& prefix = "");
void convert_tensors_name();
bool init_from_file_and_convert_name(const std::string& file_path,
const std::string& prefix = "",
SDVersion version = VERSION_COUNT);
SDVersion get_sd_version(); SDVersion get_sd_version();
std::map<ggml_type, uint32_t> get_wtype_stat(); std::map<ggml_type, uint32_t> get_wtype_stat();
std::map<ggml_type, uint32_t> get_conditioner_wtype_stat(); std::map<ggml_type, uint32_t> get_conditioner_wtype_stat();

name_conversion.cpp (new file, 1028 lines)

File diff suppressed because it is too large

name_conversion.h (new file, 10 lines)

@ -0,0 +1,10 @@
#ifndef __NAME_CONVERSION_H__
#define __NAME_CONVERSION_H__
#include <string>
#include "model.h"
std::string convert_tensor_name(std::string name, SDVersion version);
#endif // __NAME_CONVERSION_H__
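A hedged sketch of how this helper is meant to be called (the tensor name and version below are illustrative only; the actual mapping tables live in name_conversion.cpp):

// maps an external (e.g. diffusers-style) tensor name to the loader's canonical name
std::string canonical = convert_tensor_name("unet.conv_in.weight", VERSION_SDXL);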

ordered_map.hpp (new file, 177 lines)

@ -0,0 +1,177 @@
#ifndef __ORDERED_MAP_HPP__
#define __ORDERED_MAP_HPP__
#include <initializer_list>
#include <iterator>
#include <list>
#include <stdexcept>
#include <string>
#include <unordered_map>
#include <utility>
template <typename Key, typename T>
class OrderedMap {
public:
using key_type = Key;
using mapped_type = T;
using value_type = std::pair<const Key, T>;
using list_type = std::list<value_type>;
using size_type = typename list_type::size_type;
using difference_type = typename list_type::difference_type;
using iterator = typename list_type::iterator;
using const_iterator = typename list_type::const_iterator;
private:
list_type data_;
std::unordered_map<Key, iterator> index_;
public:
// --- constructors ---
OrderedMap() = default;
OrderedMap(std::initializer_list<value_type> init) {
for (const auto& kv : init)
insert(kv);
}
// NOTE: defaulted copies would leave index_ holding iterators into the source's
// list, so rebuild the index on copy. Defaulted moves are fine: std::list
// iterators stay valid across a move.
OrderedMap(const OrderedMap& other) { for (const auto& kv : other.data_) insert(kv); }
OrderedMap(OrderedMap&&) noexcept = default;
OrderedMap& operator=(const OrderedMap& other) {
if (this != &other) { clear(); for (const auto& kv : other.data_) insert(kv); }
return *this;
}
OrderedMap& operator=(OrderedMap&&) noexcept = default;
// --- element access ---
T& at(const Key& key) {
auto it = index_.find(key);
if (it == index_.end())
throw std::out_of_range("OrderedMap::at: key not found");
return it->second->second;
}
const T& at(const Key& key) const {
auto it = index_.find(key);
if (it == index_.end())
throw std::out_of_range("OrderedMap::at: key not found");
return it->second->second;
}
T& operator[](const Key& key) {
auto it = index_.find(key);
if (it == index_.end()) {
data_.emplace_back(key, T{});
auto iter = std::prev(data_.end());
index_[key] = iter;
return iter->second;
}
return it->second->second;
}
// --- iterators ---
iterator begin() noexcept { return data_.begin(); }
const_iterator begin() const noexcept { return data_.begin(); }
const_iterator cbegin() const noexcept { return data_.cbegin(); }
iterator end() noexcept { return data_.end(); }
const_iterator end() const noexcept { return data_.end(); }
const_iterator cend() const noexcept { return data_.cend(); }
// --- capacity ---
bool empty() const noexcept { return data_.empty(); }
size_type size() const noexcept { return data_.size(); }
// --- modifiers ---
void clear() noexcept {
data_.clear();
index_.clear();
}
std::pair<iterator, bool> insert(const value_type& value) {
auto it = index_.find(value.first);
if (it != index_.end()) {
return {it->second, false};
}
data_.push_back(value);
auto iter = std::prev(data_.end());
index_[value.first] = iter;
return {iter, true};
}
std::pair<iterator, bool> insert(value_type&& value) {
auto it = index_.find(value.first);
if (it != index_.end()) {
return {it->second, false};
}
data_.push_back(std::move(value));
auto iter = std::prev(data_.end());
index_[iter->first] = iter;
return {iter, true};
}
void erase(const Key& key) {
auto it = index_.find(key);
if (it != index_.end()) {
data_.erase(it->second);
index_.erase(it);
}
}
iterator erase(iterator pos) {
index_.erase(pos->first);
return data_.erase(pos);
}
// --- lookup ---
size_type count(const Key& key) const {
return index_.count(key);
}
iterator find(const Key& key) {
auto it = index_.find(key);
if (it == index_.end())
return data_.end();
return it->second;
}
const_iterator find(const Key& key) const {
auto it = index_.find(key);
if (it == index_.end())
return data_.end();
return it->second;
}
bool contains(const Key& key) const {
return index_.find(key) != index_.end();
}
// --- comparison ---
bool operator==(const OrderedMap& other) const {
return data_ == other.data_;
}
bool operator!=(const OrderedMap& other) const {
return !(*this == other);
}
template <typename... Args>
std::pair<iterator, bool> emplace(Args&&... args) {
value_type value(std::forward<Args>(args)...);
auto it = index_.find(value.first);
if (it != index_.end()) {
return {it->second, false};
}
data_.push_back(std::move(value));
auto iter = std::prev(data_.end());
index_[iter->first] = iter;
return {iter, true};
}
void swap(OrderedMap& other) noexcept {
data_.swap(other.data_);
index_.swap(other.index_);
}
};
#endif // __ORDERED_MAP_HPP__
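Not part of the diff: a small self-contained sketch of the property this container exists for. Unlike std::map, iteration follows insertion order, which keeps tensor-name processing deterministic in file order:

#include <cstdio>
#include <string>
#include "ordered_map.hpp"

int main() {
    OrderedMap<std::string, int> m;
    m["zeta"] = 1;           // operator[] appends new keys at the end
    m["alpha"] = 2;
    m.insert({"beta", 3});
    m["zeta"] = 4;           // existing key: value updated, position kept
    for (const auto& [key, value] : m) {
        std::printf("%s=%d ", key.c_str(), value);  // prints: zeta=4 alpha=2 beta=3
    }
    return 0;
}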


@ -578,7 +578,7 @@ struct PhotoMakerIDEmbed : public GGMLRunner {
const std::string& file_path = "", const std::string& file_path = "",
const std::string& prefix = "") const std::string& prefix = "")
: file_path(file_path), GGMLRunner(backend, offload_params_to_cpu), model_loader(ml) { : file_path(file_path), GGMLRunner(backend, offload_params_to_cpu), model_loader(ml) {
if (!model_loader->init_from_file(file_path, prefix)) { if (!model_loader->init_from_file_and_convert_name(file_path, prefix)) {
load_failed = true; load_failed = true;
} }
} }


@ -644,7 +644,7 @@ namespace Qwen {
ggml_type model_data_type = GGML_TYPE_Q8_0;
ModelLoader model_loader;
if (!model_loader.init_from_file(file_path, "model.diffusion_model.")) { if (!model_loader.init_from_file_and_convert_name(file_path, "model.diffusion_model.")) {
LOG_ERROR("init model loader from file failed: '%s'", file_path.c_str());
return;
}


@ -1342,7 +1342,7 @@ namespace Qwen {
ggml_type model_data_type = GGML_TYPE_F16;
ModelLoader model_loader;
if (!model_loader.init_from_file(file_path, "qwen2vl.")) { if (!model_loader.init_from_file_and_convert_name(file_path, "qwen2vl.")) {
LOG_ERROR("init model loader from file failed: '%s'", file_path.c_str());
return;
}


@ -16,6 +16,8 @@
#include "tae.hpp" #include "tae.hpp"
#include "vae.hpp" #include "vae.hpp"
#include "latent-preview.h"
const char* model_version_to_str[] = { const char* model_version_to_str[] = {
"SD 1.x", "SD 1.x",
"SD 1.x Inpaint", "SD 1.x Inpaint",
@ -23,6 +25,7 @@ const char* model_version_to_str[] = {
"SD 1.x Tiny UNet", "SD 1.x Tiny UNet",
"SD 2.x", "SD 2.x",
"SD 2.x Inpaint", "SD 2.x Inpaint",
"SD 2.x Tiny UNet",
"SDXL", "SDXL",
"SDXL Inpaint", "SDXL Inpaint",
"SDXL Instruct-Pix2Pix", "SDXL Instruct-Pix2Pix",
@ -73,6 +76,14 @@ void calculate_alphas_cumprod(float* alphas_cumprod,
} }
} }
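// suppress_pp below is a no-op progress callback; silent_tiling() swaps it in
// so that tiled preview decodes don't clobber the sampler's own progress output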
void suppress_pp(int step, int steps, float time, void* data) {
(void)step;
(void)steps;
(void)time;
(void)data;
return;
}
/*=============================================== StableDiffusionGGML ================================================*/
class StableDiffusionGGML {
@ -267,6 +278,8 @@ public:
}
}
model_loader.convert_tensors_name();
version = model_loader.get_sd_version();
if (version == VERSION_COUNT) {
LOG_ERROR("get sd version from file failed: '%s'", SAFE_STR(sd_ctx_params->model_path));
@ -486,7 +499,7 @@ public:
} else if (version == VERSION_CHROMA_RADIANCE) {
first_stage_model = std::make_shared<FakeVAE>(vae_backend,
offload_params_to_cpu);
} else if (!use_tiny_autoencoder) { } else if (!use_tiny_autoencoder || sd_ctx_params->tae_preview_only) {
first_stage_model = std::make_shared<AutoEncoderKL>(vae_backend,
offload_params_to_cpu,
tensor_storage_map,
@ -509,7 +522,8 @@ public:
}
first_stage_model->alloc_params_buffer();
first_stage_model->get_param_tensors(tensors, "first_stage_model");
} else { }
if (use_tiny_autoencoder) {
tae_first_stage = std::make_shared<TinyAutoEncoder>(vae_backend,
offload_params_to_cpu,
tensor_storage_map,
@ -557,13 +571,13 @@ public:
version);
}
if (strlen(SAFE_STR(sd_ctx_params->photo_maker_path)) > 0) {
pmid_lora = std::make_shared<LoraModel>(backend, sd_ctx_params->photo_maker_path, ""); pmid_lora = std::make_shared<LoraModel>(backend, sd_ctx_params->photo_maker_path, "", version);
if (!pmid_lora->load_from_file(true, n_threads)) {
LOG_WARN("load photomaker lora tensors from %s failed", sd_ctx_params->photo_maker_path);
return false;
}
LOG_INFO("loading stacked ID embedding (PHOTOMAKER) model file from '%s'", sd_ctx_params->photo_maker_path);
if (!model_loader.init_from_file(sd_ctx_params->photo_maker_path, "pmid.")) { if (!model_loader.init_from_file_and_convert_name(sd_ctx_params->photo_maker_path, "pmid.")) {
LOG_WARN("loading stacked ID embedding from '%s' failed", sd_ctx_params->photo_maker_path);
} else {
stacked_id = true;
@ -597,7 +611,7 @@ public:
ignore_tensors.insert("first_stage_model."); ignore_tensors.insert("first_stage_model.");
} }
if (stacked_id) { if (stacked_id) {
ignore_tensors.insert("lora."); ignore_tensors.insert("pmid.unet.");
} }
if (vae_decode_only) { if (vae_decode_only) {
@ -625,9 +639,10 @@ public:
unet_params_mem_size += high_noise_diffusion_model->get_params_buffer_size(); unet_params_mem_size += high_noise_diffusion_model->get_params_buffer_size();
} }
size_t vae_params_mem_size = 0; size_t vae_params_mem_size = 0;
if (!use_tiny_autoencoder) { if (!use_tiny_autoencoder || sd_ctx_params->tae_preview_only) {
vae_params_mem_size = first_stage_model->get_params_buffer_size();
} else { }
if (use_tiny_autoencoder) {
if (!tae_first_stage->load_from_file(taesd_path, n_threads)) {
return false;
}
@ -800,6 +815,7 @@ public:
LOG_DEBUG("finished loaded file");
ggml_free(ctx);
use_tiny_autoencoder = use_tiny_autoencoder && !sd_ctx_params->tae_preview_only;
return true;
}
@ -911,7 +927,7 @@ public:
LOG_WARN("can not find %s or %s for lora %s", st_file_path.c_str(), ckpt_file_path.c_str(), lora_name.c_str()); LOG_WARN("can not find %s or %s for lora %s", st_file_path.c_str(), ckpt_file_path.c_str(), lora_name.c_str());
return; return;
} }
LoraModel lora(backend, file_path, is_high_noise ? "model.high_noise_" : ""); LoraModel lora(backend, file_path, is_high_noise ? "model.high_noise_" : "", version);
if (!lora.load_from_file(false, n_threads)) { if (!lora.load_from_file(false, n_threads)) {
LOG_WARN("load lora tensors from %s failed", file_path.c_str()); LOG_WARN("load lora tensors from %s failed", file_path.c_str());
return; return;
@ -1108,6 +1124,156 @@ public:
}
}
void silent_tiling(ggml_tensor* input, ggml_tensor* output, const int scale, const int tile_size, const float tile_overlap_factor, on_tile_process on_processing) {
sd_progress_cb_t cb = sd_get_progress_callback();
void* cbd = sd_get_progress_callback_data();
sd_set_progress_callback((sd_progress_cb_t)suppress_pp, nullptr);
sd_tiling(input, output, scale, tile_size, tile_overlap_factor, on_processing);
sd_set_progress_callback(cb, cbd);
}
void preview_image(ggml_context* work_ctx,
int step,
struct ggml_tensor* latents,
enum SDVersion version,
preview_t preview_mode,
ggml_tensor* result,
std::function<void(int, int, sd_image_t*, bool)> step_callback,
bool is_noisy) {
const uint32_t channel = 3;
uint32_t width = latents->ne[0];
uint32_t height = latents->ne[1];
uint32_t dim = latents->ne[ggml_n_dims(latents) - 1];
if (preview_mode == PREVIEW_PROJ) {
const float(*latent_rgb_proj)[channel] = nullptr;
float* latent_rgb_bias = nullptr;
if (dim == 48) {
if (sd_version_is_wan(version)) {
latent_rgb_proj = wan_22_latent_rgb_proj;
latent_rgb_bias = wan_22_latent_rgb_bias;
} else {
LOG_WARN("No latent to RGB projection known for this model");
// unknown model
return;
}
} else if (dim == 16) {
// 16 channels VAE -> Flux or SD3
if (sd_version_is_sd3(version)) {
latent_rgb_proj = sd3_latent_rgb_proj;
latent_rgb_bias = sd3_latent_rgb_bias;
} else if (sd_version_is_flux(version)) {
latent_rgb_proj = flux_latent_rgb_proj;
latent_rgb_bias = flux_latent_rgb_bias;
} else if (sd_version_is_wan(version) || sd_version_is_qwen_image(version)) {
latent_rgb_proj = wan_21_latent_rgb_proj;
latent_rgb_bias = wan_21_latent_rgb_bias;
} else {
LOG_WARN("No latent to RGB projection known for this model");
// unknown model
return;
}
} else if (dim == 4) {
// 4 channels VAE
if (sd_version_is_sdxl(version)) {
latent_rgb_proj = sdxl_latent_rgb_proj;
latent_rgb_bias = sdxl_latent_rgb_bias;
} else if (sd_version_is_sd1(version) || sd_version_is_sd2(version)) {
latent_rgb_proj = sd_latent_rgb_proj;
latent_rgb_bias = sd_latent_rgb_bias;
} else {
// unknown model
LOG_WARN("No latent to RGB projection known for this model");
return;
}
} else if (dim == 3) {
// Do nothing, assuming already RGB latents
} else {
LOG_WARN("No latent to RGB projection known for this model");
// unknown latent space
return;
}
uint32_t frames = 1;
if (ggml_n_dims(latents) == 4) {
frames = latents->ne[2];
}
uint8_t* data = (uint8_t*)malloc(frames * width * height * channel * sizeof(uint8_t));
preview_latent_video(data, latents, latent_rgb_proj, latent_rgb_bias, width, height, frames, dim);
sd_image_t* images = (sd_image_t*)malloc(frames * sizeof(sd_image_t));
for (int i = 0; i < frames; i++) {
images[i] = {width, height, channel, data + i * width * height * channel};
}
step_callback(step, frames, images, is_noisy);
free(data);
free(images);
} else {
if (preview_mode == PREVIEW_VAE) {
process_latent_out(latents);
if (vae_tiling_params.enabled) {
// split latent in 32x32 tiles and compute in several steps
auto on_tiling = [&](ggml_tensor* in, ggml_tensor* out, bool init) {
first_stage_model->compute(n_threads, in, true, &out, nullptr);
};
silent_tiling(latents, result, get_vae_scale_factor(), 32, 0.5f, on_tiling);
} else {
first_stage_model->compute(n_threads, latents, true, &result, work_ctx);
}
first_stage_model->free_compute_buffer();
process_vae_output_tensor(result);
process_latent_in(latents);
} else if (preview_mode == PREVIEW_TAE) {
if (tae_first_stage == nullptr) {
LOG_WARN("TAE not found for preview");
return;
}
if (vae_tiling_params.enabled) {
// split latent in 64x64 tiles and compute in several steps
auto on_tiling = [&](ggml_tensor* in, ggml_tensor* out, bool init) {
tae_first_stage->compute(n_threads, in, true, &out, nullptr);
};
silent_tiling(latents, result, get_vae_scale_factor(), 64, 0.5f, on_tiling);
} else {
tae_first_stage->compute(n_threads, latents, true, &result, work_ctx);
}
tae_first_stage->free_compute_buffer();
} else {
return;
}
ggml_ext_tensor_clamp_inplace(result, 0.0f, 1.0f);
uint32_t frames = 1;
if (ggml_n_dims(latents) == 4) {
frames = result->ne[2];
}
sd_image_t* images = (sd_image_t*)malloc(frames * sizeof(sd_image_t));
// print_ggml_tensor(result,true);
for (size_t i = 0; i < frames; i++) {
images[i].width = result->ne[0];
images[i].height = result->ne[1];
images[i].channel = 3;
images[i].data = ggml_tensor_to_sd_image(result, i, ggml_n_dims(latents) == 4);
}
step_callback(step, frames, images, is_noisy);
ggml_ext_tensor_scale_inplace(result, 0);
for (int i = 0; i < frames; i++) {
free(images[i].data);
}
free(images);
}
}
ggml_tensor* sample(ggml_context* work_ctx,
std::shared_ptr<DiffusionModel> work_diffusion_model,
bool inverse_noise_scaling,
@ -1183,7 +1349,34 @@ public:
int64_t t0 = ggml_time_us();
struct ggml_tensor* preview_tensor = nullptr;
auto sd_preview_mode = sd_get_preview_mode();
if (sd_preview_mode != PREVIEW_NONE && sd_preview_mode != PREVIEW_PROJ) {
int64_t W = x->ne[0] * get_vae_scale_factor();
int64_t H = x->ne[1] * get_vae_scale_factor();
if (ggml_n_dims(x) == 4) {
// assuming video mode (if batch processing gets implemented this will break)
int T = x->ne[2];
if (sd_version_is_wan(version)) {
T = ((T - 1) * 4) + 1;
}
preview_tensor = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32,
W,
H,
T,
3);
} else {
preview_tensor = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32,
W,
H,
3,
x->ne[3]);
}
}
auto denoise = [&](ggml_tensor* input, float sigma, int step) -> ggml_tensor* {
auto sd_preview_cb = sd_get_preview_callback();
auto sd_preview_mode = sd_get_preview_mode();
if (step == 1 || step == -1) {
pretty_progress(0, (int)steps, 0);
}
@ -1218,6 +1411,11 @@ public:
if (denoise_mask != nullptr && version == VERSION_WAN2_2_TI2V) {
apply_mask(noised_input, init_latent, denoise_mask);
}
if (sd_preview_cb != nullptr && sd_should_preview_noisy()) {
if (step % sd_get_preview_interval() == 0) {
preview_image(work_ctx, step, noised_input, version, sd_preview_mode, preview_tensor, sd_preview_cb, true);
}
}
std::vector<struct ggml_tensor*> controls;
@ -1339,16 +1537,22 @@ public:
vec_denoised[i] = latent_result * c_out + vec_input[i] * c_skip;
}
if (denoise_mask != nullptr) {
apply_mask(denoised, init_latent, denoise_mask);
}
if (sd_preview_cb != nullptr && sd_should_preview_denoised()) {
if (step % sd_get_preview_interval() == 0) {
preview_image(work_ctx, step, denoised, version, sd_preview_mode, preview_tensor, sd_preview_cb, false);
}
}
int64_t t1 = ggml_time_us();
if (step > 0 || step == -(int)steps) {
int showstep = std::abs(step);
pretty_progress(showstep, (int)steps, (t1 - t0) / 1000000.f / showstep);
// LOG_INFO("step %d sampling completed taking %.2fs", step, (t1 - t0) * 1.0f / 1000000);
}
if (denoise_mask != nullptr) {
apply_mask(denoised, init_latent, denoise_mask);
}
return denoised;
};
@ -1854,6 +2058,29 @@ enum prediction_t str_to_prediction(const char* str) {
return PREDICTION_COUNT;
}
const char* preview_to_str[] = {
"none",
"proj",
"tae",
"vae",
};
const char* sd_preview_name(enum preview_t preview) {
if (preview < PREVIEW_COUNT) {
return preview_to_str[preview];
}
return NONE_STR;
}
enum preview_t str_to_preview(const char* str) {
for (int i = 0; i < PREVIEW_COUNT; i++) {
if (!strcmp(str, preview_to_str[i])) {
return (enum preview_t)i;
}
}
return PREVIEW_COUNT;
}
void sd_ctx_params_init(sd_ctx_params_t* sd_ctx_params) {
*sd_ctx_params = {};
sd_ctx_params->vae_decode_only = true;
@ -2199,7 +2426,7 @@ sd_image_t* generate_image_internal(sd_ctx_t* sd_ctx,
}
ggml_ext_tensor_iter(init_img, [&](ggml_tensor* init_img, int64_t i0, int64_t i1, int64_t i2, int64_t i3) {
float value = sd_image_get_f32(processed_id_images[i3], i0, i1, i2); float value = sd_image_get_f32(processed_id_images[i3], i0, i1, i2, false);
ggml_ext_tensor_set_f32(init_img, value, i0, i1, i2, i3);
});
@ -2222,18 +2449,24 @@ sd_image_t* generate_image_internal(sd_ctx_t* sd_ctx,
id_embeds = load_tensor_from_file(work_ctx, pm_params.id_embed_path);
// print_ggml_tensor(id_embeds, true, "id_embeds:");
}
id_cond.c_crossattn = sd_ctx->sd->id_encoder(work_ctx, init_img, id_cond.c_crossattn, id_embeds, class_tokens_mask);
int64_t t1 = ggml_time_ms();
LOG_INFO("Photomaker ID Stacking, taking %" PRId64 " ms", t1 - t0);
if (sd_ctx->sd->free_params_immediately) {
sd_ctx->sd->pmid_model->free_params_buffer();
}
// Encode input prompt without the trigger word for delayed conditioning
prompt_text_only = sd_ctx->sd->cond_stage_model->remove_trigger_from_prompt(work_ctx, prompt);
// printf("%s || %s \n", prompt.c_str(), prompt_text_only.c_str());
prompt = prompt_text_only; //
if (sample_steps < 50) {
LOG_WARN("It's recommended to use >= 50 steps for photo maker!");
}
if (pmv2 && id_embeds == nullptr) {
LOG_WARN("Provided PhotoMaker images, but NO valid ID embeds file for PM v2");
LOG_WARN("Turn off PhotoMaker");
sd_ctx->sd->stacked_id = false;
} else {
id_cond.c_crossattn = sd_ctx->sd->id_encoder(work_ctx, init_img, id_cond.c_crossattn, id_embeds, class_tokens_mask);
int64_t t1 = ggml_time_ms();
LOG_INFO("Photomaker ID Stacking, taking %" PRId64 " ms", t1 - t0);
if (sd_ctx->sd->free_params_immediately) {
sd_ctx->sd->pmid_model->free_params_buffer();
}
// Encode input prompt without the trigger word for delayed conditioning
prompt_text_only = sd_ctx->sd->cond_stage_model->remove_trigger_from_prompt(work_ctx, prompt);
// printf("%s || %s \n", prompt.c_str(), prompt_text_only.c_str());
prompt = prompt_text_only; //
if (sample_steps < 50) {
LOG_WARN("It's recommended to use >= 50 steps for photo maker!");
}
}
} else {
LOG_WARN("Provided PhotoMaker model file, but NO input ID images");


@ -126,6 +126,14 @@ enum sd_log_level_t {
SD_LOG_ERROR
};
enum preview_t {
PREVIEW_NONE,
PREVIEW_PROJ,
PREVIEW_TAE,
PREVIEW_VAE,
PREVIEW_COUNT
};
typedef struct {
bool enabled;
int tile_size_x;
@ -162,6 +170,7 @@ typedef struct {
bool keep_control_net_on_cpu;
bool keep_vae_on_cpu;
bool diffusion_flash_attn;
bool tae_preview_only;
bool diffusion_conv_direct;
bool vae_conv_direct;
bool force_sdxl_vae_conv_scale;
@ -254,9 +263,11 @@ typedef struct sd_ctx_t sd_ctx_t;
typedef void (*sd_log_cb_t)(enum sd_log_level_t level, const char* text, void* data);
typedef void (*sd_progress_cb_t)(int step, int steps, float time, void* data);
typedef void (*sd_preview_cb_t)(int step, int frame_count, sd_image_t* frames, bool is_noisy);
SD_API void sd_set_log_callback(sd_log_cb_t sd_log_cb, void* data);
SD_API void sd_set_progress_callback(sd_progress_cb_t cb, void* data);
SD_API void sd_set_preview_callback(sd_preview_cb_t cb, preview_t mode, int interval, bool denoised, bool noisy);
SD_API int32_t get_num_physical_cores();
SD_API const char* sd_get_system_info();
@ -270,6 +281,8 @@ SD_API const char* sd_schedule_name(enum scheduler_t scheduler);
SD_API enum scheduler_t str_to_schedule(const char* str);
SD_API const char* sd_prediction_name(enum prediction_t prediction);
SD_API enum prediction_t str_to_prediction(const char* str);
SD_API const char* sd_preview_name(enum preview_t preview);
SD_API enum preview_t str_to_preview(const char* str);
SD_API void sd_ctx_params_init(sd_ctx_params_t* sd_ctx_params);
SD_API char* sd_ctx_params_to_str(const sd_ctx_params_t* sd_ctx_params);
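Not part of the diff: a minimal consumer sketch of the new preview API (callback name and body are illustrative). The sd_image_t frames are only valid for the duration of the call, since preview_image() frees them after the callback returns:

void on_preview(int step, int frame_count, sd_image_t* frames, bool is_noisy) {
    if (is_noisy) {
        return;  // we asked for denoised previews only (see flags below)
    }
    for (int i = 0; i < frame_count; i++) {
        // hand frames[i].data (width * height * channel bytes, RGB) to a UI
        // or PNG writer here; the buffer is freed once we return
    }
}

void setup_previews() {
    // decode a preview with the tiny autoencoder every 2nd step, denoised latents only
    sd_set_preview_callback(on_preview, PREVIEW_TAE, 2, true, false);
}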

t5.hpp (2 changed lines)

@ -1004,7 +1004,7 @@ struct T5Embedder {
ggml_type model_data_type = GGML_TYPE_F16;
ModelLoader model_loader;
if (!model_loader.init_from_file(file_path)) { if (!model_loader.init_from_file_and_convert_name(file_path)) {
LOG_ERROR("init model loader from file failed: '%s'", file_path.c_str());
return;
}


@ -222,7 +222,7 @@ struct TinyAutoEncoder : public GGMLRunner {
}
ModelLoader model_loader;
if (!model_loader.init_from_file(file_path)) { if (!model_loader.init_from_file_and_convert_name(file_path)) {
LOG_ERROR("init taesd model loader from file failed: '%s'", file_path.c_str());
return false;
}


@ -180,6 +180,7 @@ protected:
int num_head_channels = -1; // channels // num_heads
int context_dim = 768; // 1024 for VERSION_SD2, 2048 for VERSION_SDXL
bool use_linear_projection = false;
bool tiny_unet = false;
public:
int model_channels = 320;
@ -208,15 +209,17 @@ public:
num_head_channels = 64;
num_heads = -1;
use_linear_projection = true;
} else if (version == VERSION_SD1_TINY_UNET) {
num_res_blocks = 1;
channel_mult = {1, 2, 4};
} }
if (sd_version_is_inpaint(version)) {
in_channels = 9;
} else if (sd_version_is_unet_edit(version)) {
in_channels = 8;
}
if (version == VERSION_SD1_TINY_UNET || version == VERSION_SD2_TINY_UNET) {
num_res_blocks = 1;
channel_mult = {1, 2, 4};
tiny_unet = true;
}
// dims is always 2
// use_temporal_attention is always True for SVD
@ -290,7 +293,7 @@ public:
context_dim));
}
input_block_chans.push_back(ch);
if (version == VERSION_SD1_TINY_UNET) { if (tiny_unet) {
input_block_idx++;
}
}
@ -311,7 +314,7 @@ public:
d_head = num_head_channels;
n_head = ch / d_head;
}
if (version != VERSION_SD1_TINY_UNET) { if (!tiny_unet) {
blocks["middle_block.0"] = std::shared_ptr<GGMLBlock>(get_resblock(ch, time_embed_dim, ch)); blocks["middle_block.0"] = std::shared_ptr<GGMLBlock>(get_resblock(ch, time_embed_dim, ch));
if (version != VERSION_SDXL_SSD1B) { if (version != VERSION_SDXL_SSD1B) {
blocks["middle_block.1"] = std::shared_ptr<GGMLBlock>(get_attention_layer(ch, blocks["middle_block.1"] = std::shared_ptr<GGMLBlock>(get_attention_layer(ch,
@ -358,7 +361,7 @@ public:
}
if (i > 0 && j == num_res_blocks) {
if (version == VERSION_SD1_TINY_UNET) { if (tiny_unet) {
output_block_idx++;
if (output_block_idx == 2) {
up_sample_idx = 1;
@ -495,7 +498,7 @@ public:
}
hs.push_back(h);
}
if (version == VERSION_SD1_TINY_UNET) { if (tiny_unet) {
input_block_idx++;
}
if (i != len_mults - 1) {
@ -512,7 +515,7 @@ public:
// [N, 4*model_channels, h/8, w/8]
// middle_block
if (version != VERSION_SD1_TINY_UNET) { if (!tiny_unet) {
h = resblock_forward("middle_block.0", ctx, h, emb, num_video_frames); // [N, 4*model_channels, h/8, w/8] h = resblock_forward("middle_block.0", ctx, h, emb, num_video_frames); // [N, 4*model_channels, h/8, w/8]
if (version != VERSION_SDXL_SSD1B) { if (version != VERSION_SDXL_SSD1B) {
h = attention_layer_forward("middle_block.1", ctx, h, context, num_video_frames); // [N, 4*model_channels, h/8, w/8] h = attention_layer_forward("middle_block.1", ctx, h, context, num_video_frames); // [N, 4*model_channels, h/8, w/8]
@ -554,7 +557,7 @@ public:
}
if (i > 0 && j == num_res_blocks) {
if (version == VERSION_SD1_TINY_UNET) { if (tiny_unet) {
output_block_idx++;
if (output_block_idx == 2) {
up_sample_idx = 1;


@ -42,7 +42,7 @@ struct UpscalerGGML {
backend = ggml_backend_sycl_init(0);
#endif
ModelLoader model_loader;
if (!model_loader.init_from_file(esrgan_path)) { if (!model_loader.init_from_file_and_convert_name(esrgan_path)) {
LOG_ERROR("init model loader from file failed: '%s'", esrgan_path.c_str());
}
model_loader.set_wtype_override(model_data_type);


@ -185,6 +185,12 @@ int32_t get_num_physical_cores() {
static sd_progress_cb_t sd_progress_cb = nullptr;
void* sd_progress_cb_data = nullptr;
static sd_preview_cb_t sd_preview_cb = nullptr;
preview_t sd_preview_mode = PREVIEW_NONE;
int sd_preview_interval = 1;
bool sd_preview_denoised = true;
bool sd_preview_noisy = false;
std::u32string utf8_to_utf32(const std::string& utf8_str) {
std::wstring_convert<std::codecvt_utf8<char32_t>, char32_t> converter;
return converter.from_bytes(utf8_str);
@ -328,6 +334,37 @@ void sd_set_progress_callback(sd_progress_cb_t cb, void* data) {
sd_progress_cb = cb;
sd_progress_cb_data = data;
}
void sd_set_preview_callback(sd_preview_cb_t cb, preview_t mode = PREVIEW_PROJ, int interval = 1, bool denoised = true, bool noisy = false) {
sd_preview_cb = cb;
sd_preview_mode = mode;
sd_preview_interval = interval;
sd_preview_denoised = denoised;
sd_preview_noisy = noisy;
}
sd_preview_cb_t sd_get_preview_callback() {
return sd_preview_cb;
}
preview_t sd_get_preview_mode() {
return sd_preview_mode;
}
int sd_get_preview_interval() {
return sd_preview_interval;
}
bool sd_should_preview_denoised() {
return sd_preview_denoised;
}
bool sd_should_preview_noisy() {
return sd_preview_noisy;
}
sd_progress_cb_t sd_get_progress_callback() {
return sd_progress_cb;
}
void* sd_get_progress_callback_data() {
return sd_progress_cb_data;
}
const char* sd_get_system_info() {
static char buffer[1024];
std::stringstream ss;

util.h (9 changed lines)

@ -54,6 +54,15 @@ std::string trim(const std::string& s);
std::vector<std::pair<std::string, float>> parse_prompt_attention(const std::string& text);
sd_progress_cb_t sd_get_progress_callback();
void* sd_get_progress_callback_data();
sd_preview_cb_t sd_get_preview_callback();
preview_t sd_get_preview_mode();
int sd_get_preview_interval();
bool sd_should_preview_denoised();
bool sd_should_preview_noisy();
#define LOG_DEBUG(format, ...) log_printf(SD_LOG_DEBUG, __FILE__, __LINE__, format, ##__VA_ARGS__)
#define LOG_INFO(format, ...) log_printf(SD_LOG_INFO, __FILE__, __LINE__, format, ##__VA_ARGS__)
#define LOG_WARN(format, ...) log_printf(SD_LOG_WARN, __FILE__, __LINE__, format, ##__VA_ARGS__)


@ -1271,7 +1271,7 @@ namespace WAN {
vae->get_param_tensors(tensors, "first_stage_model"); vae->get_param_tensors(tensors, "first_stage_model");
ModelLoader model_loader; ModelLoader model_loader;
if (!model_loader.init_from_file(file_path, "vae.")) { if (!model_loader.init_from_file_and_convert_name(file_path, "vae.")) {
LOG_ERROR("init model loader from file failed: '%s'", file_path.c_str()); LOG_ERROR("init model loader from file failed: '%s'", file_path.c_str());
return; return;
} }
@ -2255,7 +2255,7 @@ namespace WAN {
LOG_INFO("loading from '%s'", file_path.c_str()); LOG_INFO("loading from '%s'", file_path.c_str());
ModelLoader model_loader; ModelLoader model_loader;
if (!model_loader.init_from_file(file_path, "model.diffusion_model.")) { if (!model_loader.init_from_file_and_convert_name(file_path, "model.diffusion_model.")) {
LOG_ERROR("init model loader from file failed: '%s'", file_path.c_str()); LOG_ERROR("init model loader from file failed: '%s'", file_path.c_str());
return; return;
} }