From 28ef93c0e172a29b0dcbd131ec8e34ecbd2b98e2 Mon Sep 17 00:00:00 2001 From: leejet Date: Tue, 10 Feb 2026 23:13:35 +0800 Subject: [PATCH] refactor: reorganize the file structure (#1266) --- CMakeLists.txt | 11 +- .../stable-diffusion.h | 0 face_detect.py => script/face_detect.py | 174 +++---- cache_dit.hpp => src/cache_dit.hpp | 0 clip.hpp => src/clip.hpp | 0 common.hpp => src/common.hpp | 0 conditioner.hpp => src/conditioner.hpp | 0 control.hpp => src/control.hpp | 0 denoiser.hpp => src/denoiser.hpp | 0 .../diffusion_model.hpp | 0 easycache.hpp => src/easycache.hpp | 0 esrgan.hpp => src/esrgan.hpp | 0 flux.hpp => src/flux.hpp | 0 ggml_extend.hpp => src/ggml_extend.hpp | 0 gguf_reader.hpp => src/gguf_reader.hpp | 0 gits_noise.inl => src/gits_noise.inl | 0 latent-preview.h => src/latent-preview.h | 468 +++++++++--------- llm.hpp => src/llm.hpp | 0 lora.hpp => src/lora.hpp | 0 ltxv.hpp => src/ltxv.hpp | 0 mmdit.hpp => src/mmdit.hpp | 0 model.cpp => src/model.cpp | 0 model.h => src/model.h | 0 .../name_conversion.cpp | 0 name_conversion.h => src/name_conversion.h | 0 ordered_map.hpp => src/ordered_map.hpp | 0 pmid.hpp => src/pmid.hpp | 0 preprocessing.hpp => src/preprocessing.hpp | 0 qwen_image.hpp => src/qwen_image.hpp | 0 rng.hpp => src/rng.hpp | 0 rng_mt19937.hpp => src/rng_mt19937.hpp | 0 rng_philox.hpp => src/rng_philox.hpp | 0 rope.hpp => src/rope.hpp | 0 .../stable-diffusion.cpp | 0 t5.hpp => src/t5.hpp | 0 tae.hpp => src/tae.hpp | 0 tokenize_util.cpp => src/tokenize_util.cpp | 0 tokenize_util.h => src/tokenize_util.h | 0 ucache.hpp => src/ucache.hpp | 0 unet.hpp => src/unet.hpp | 0 upscaler.cpp => src/upscaler.cpp | 0 util.cpp => src/util.cpp | 0 util.h => src/util.h | 0 vae.hpp => src/vae.hpp | 0 version.cpp => src/version.cpp | 0 vocab.hpp => src/vocab.hpp | 0 vocab_mistral.hpp => src/vocab_mistral.hpp | 0 vocab_qwen.hpp => src/vocab_qwen.hpp | 0 vocab_umt5.hpp => src/vocab_umt5.hpp | 0 wan.hpp => src/wan.hpp | 0 z_image.hpp => src/z_image.hpp | 0 51 files changed, 327 insertions(+), 326 deletions(-) rename stable-diffusion.h => include/stable-diffusion.h (100%) rename face_detect.py => script/face_detect.py (97%) rename cache_dit.hpp => src/cache_dit.hpp (100%) rename clip.hpp => src/clip.hpp (100%) rename common.hpp => src/common.hpp (100%) rename conditioner.hpp => src/conditioner.hpp (100%) rename control.hpp => src/control.hpp (100%) rename denoiser.hpp => src/denoiser.hpp (100%) rename diffusion_model.hpp => src/diffusion_model.hpp (100%) rename easycache.hpp => src/easycache.hpp (100%) rename esrgan.hpp => src/esrgan.hpp (100%) rename flux.hpp => src/flux.hpp (100%) rename ggml_extend.hpp => src/ggml_extend.hpp (100%) rename gguf_reader.hpp => src/gguf_reader.hpp (100%) rename gits_noise.inl => src/gits_noise.inl (100%) rename latent-preview.h => src/latent-preview.h (97%) rename llm.hpp => src/llm.hpp (100%) rename lora.hpp => src/lora.hpp (100%) rename ltxv.hpp => src/ltxv.hpp (100%) rename mmdit.hpp => src/mmdit.hpp (100%) rename model.cpp => src/model.cpp (100%) rename model.h => src/model.h (100%) rename name_conversion.cpp => src/name_conversion.cpp (100%) rename name_conversion.h => src/name_conversion.h (100%) rename ordered_map.hpp => src/ordered_map.hpp (100%) rename pmid.hpp => src/pmid.hpp (100%) rename preprocessing.hpp => src/preprocessing.hpp (100%) rename qwen_image.hpp => src/qwen_image.hpp (100%) rename rng.hpp => src/rng.hpp (100%) rename rng_mt19937.hpp => src/rng_mt19937.hpp (100%) rename rng_philox.hpp => src/rng_philox.hpp (100%) rename rope.hpp => src/rope.hpp (100%) rename stable-diffusion.cpp => src/stable-diffusion.cpp (100%) rename t5.hpp => src/t5.hpp (100%) rename tae.hpp => src/tae.hpp (100%) rename tokenize_util.cpp => src/tokenize_util.cpp (100%) rename tokenize_util.h => src/tokenize_util.h (100%) rename ucache.hpp => src/ucache.hpp (100%) rename unet.hpp => src/unet.hpp (100%) rename upscaler.cpp => src/upscaler.cpp (100%) rename util.cpp => src/util.cpp (100%) rename util.h => src/util.h (100%) rename vae.hpp => src/vae.hpp (100%) rename version.cpp => src/version.cpp (100%) rename vocab.hpp => src/vocab.hpp (100%) rename vocab_mistral.hpp => src/vocab_mistral.hpp (100%) rename vocab_qwen.hpp => src/vocab_qwen.hpp (100%) rename vocab_umt5.hpp => src/vocab_umt5.hpp (100%) rename wan.hpp => src/wan.hpp (100%) rename z_image.hpp => src/z_image.hpp (100%) diff --git a/CMakeLists.txt b/CMakeLists.txt index e731d95..7b77225 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -87,9 +87,9 @@ endif() set(SD_LIB stable-diffusion) file(GLOB SD_LIB_SOURCES - "*.h" - "*.cpp" - "*.hpp" + "src/*.h" + "src/*.cpp" + "src/*.hpp" ) find_program(GIT_EXE NAMES git git.exe NO_CMAKE_FIND_ROOT_PATH) @@ -119,7 +119,7 @@ endif() message(STATUS "stable-diffusion.cpp commit ${SDCPP_BUILD_COMMIT}") set_property( - SOURCE ${CMAKE_CURRENT_SOURCE_DIR}/version.cpp + SOURCE ${CMAKE_CURRENT_SOURCE_DIR}/src/version.cpp APPEND PROPERTY COMPILE_DEFINITIONS SDCPP_BUILD_COMMIT=${SDCPP_BUILD_COMMIT} SDCPP_BUILD_VERSION=${SDCPP_BUILD_VERSION} ) @@ -182,6 +182,7 @@ endif() add_subdirectory(thirdparty) target_link_libraries(${SD_LIB} PUBLIC ggml zip) +target_include_directories(${SD_LIB} PUBLIC . include) target_include_directories(${SD_LIB} PUBLIC . thirdparty) target_compile_features(${SD_LIB} PUBLIC c_std_11 cxx_std_17) @@ -190,7 +191,7 @@ if (SD_BUILD_EXAMPLES) add_subdirectory(examples) endif() -set(SD_PUBLIC_HEADERS stable-diffusion.h) +set(SD_PUBLIC_HEADERS include/stable-diffusion.h) set_target_properties(${SD_LIB} PROPERTIES PUBLIC_HEADER "${SD_PUBLIC_HEADERS}") install(TARGETS ${SD_LIB} LIBRARY PUBLIC_HEADER) diff --git a/stable-diffusion.h b/include/stable-diffusion.h similarity index 100% rename from stable-diffusion.h rename to include/stable-diffusion.h diff --git a/face_detect.py b/script/face_detect.py similarity index 97% rename from face_detect.py rename to script/face_detect.py index 7131af3..e7a3eae 100644 --- a/face_detect.py +++ b/script/face_detect.py @@ -1,88 +1,88 @@ -import os -import sys - -import numpy as np -import torch -from diffusers.utils import load_image -# pip install insightface==0.7.3 -from insightface.app import FaceAnalysis -from insightface.data import get_image as ins_get_image -from safetensors.torch import save_file - -### -# https://github.com/cubiq/ComfyUI_IPAdapter_plus/issues/165#issue-2055829543 -### -class FaceAnalysis2(FaceAnalysis): - # NOTE: allows setting det_size for each detection call. - # the model allows it but the wrapping code from insightface - # doesn't show it, and people end up loading duplicate models - # for different sizes where there is absolutely no need to - def get(self, img, max_num=0, det_size=(640, 640)): - if det_size is not None: - self.det_model.input_size = det_size - - return super().get(img, max_num) - -def analyze_faces(face_analysis: FaceAnalysis, img_data: np.ndarray, det_size=(640, 640)): - # NOTE: try detect faces, if no faces detected, lower det_size until it does - detection_sizes = [None] + [(size, size) for size in range(640, 256, -64)] + [(256, 256)] - - for size in detection_sizes: - faces = face_analysis.get(img_data, det_size=size) - if len(faces) > 0: - return faces - - return [] - -if __name__ == "__main__": - #face_detector = FaceAnalysis2(providers=['CUDAExecutionProvider'], allowed_modules=['detection', 'recognition']) - face_detector = FaceAnalysis2(providers=['CPUExecutionProvider'], allowed_modules=['detection', 'recognition']) - face_detector.prepare(ctx_id=0, det_size=(640, 640)) - #input_folder_name = './scarletthead_woman' - input_folder_name = sys.argv[1] - image_basename_list = os.listdir(input_folder_name) - image_path_list = sorted([os.path.join(input_folder_name, basename) for basename in image_basename_list]) - - input_id_images = [] - for image_path in image_path_list: - input_id_images.append(load_image(image_path)) - - id_embed_list = [] - - for img in input_id_images: - img = np.array(img) - img = img[:, :, ::-1] - faces = analyze_faces(face_detector, img) - if len(faces) > 0: - id_embed_list.append(torch.from_numpy((faces[0]['embedding']))) - - if len(id_embed_list) == 0: - raise ValueError(f"No face detected in input image pool") - - id_embeds = torch.stack(id_embed_list) - - # for r in id_embeds: - # print(r) - # #torch.save(id_embeds, input_folder_name+'/id_embeds.pt'); - # weights = dict() - # weights["id_embeds"] = id_embeds - # save_file(weights, input_folder_name+'/id_embeds.safetensors') - - binary_data = id_embeds.numpy().tobytes() - two = 4 - zero = 0 - one = 1 - tensor_name = "id_embeds" -# Write binary data to a file - with open(input_folder_name+'/id_embeds.bin', "wb") as f: - f.write(two.to_bytes(4, byteorder='little')) - f.write((len(tensor_name)).to_bytes(4, byteorder='little')) - f.write(zero.to_bytes(4, byteorder='little')) - f.write((id_embeds.shape[1]).to_bytes(4, byteorder='little')) - f.write((id_embeds.shape[0]).to_bytes(4, byteorder='little')) - f.write(one.to_bytes(4, byteorder='little')) - f.write(one.to_bytes(4, byteorder='little')) - f.write(tensor_name.encode('ascii')) - f.write(binary_data) - +import os +import sys + +import numpy as np +import torch +from diffusers.utils import load_image +# pip install insightface==0.7.3 +from insightface.app import FaceAnalysis +from insightface.data import get_image as ins_get_image +from safetensors.torch import save_file + +### +# https://github.com/cubiq/ComfyUI_IPAdapter_plus/issues/165#issue-2055829543 +### +class FaceAnalysis2(FaceAnalysis): + # NOTE: allows setting det_size for each detection call. + # the model allows it but the wrapping code from insightface + # doesn't show it, and people end up loading duplicate models + # for different sizes where there is absolutely no need to + def get(self, img, max_num=0, det_size=(640, 640)): + if det_size is not None: + self.det_model.input_size = det_size + + return super().get(img, max_num) + +def analyze_faces(face_analysis: FaceAnalysis, img_data: np.ndarray, det_size=(640, 640)): + # NOTE: try detect faces, if no faces detected, lower det_size until it does + detection_sizes = [None] + [(size, size) for size in range(640, 256, -64)] + [(256, 256)] + + for size in detection_sizes: + faces = face_analysis.get(img_data, det_size=size) + if len(faces) > 0: + return faces + + return [] + +if __name__ == "__main__": + #face_detector = FaceAnalysis2(providers=['CUDAExecutionProvider'], allowed_modules=['detection', 'recognition']) + face_detector = FaceAnalysis2(providers=['CPUExecutionProvider'], allowed_modules=['detection', 'recognition']) + face_detector.prepare(ctx_id=0, det_size=(640, 640)) + #input_folder_name = './scarletthead_woman' + input_folder_name = sys.argv[1] + image_basename_list = os.listdir(input_folder_name) + image_path_list = sorted([os.path.join(input_folder_name, basename) for basename in image_basename_list]) + + input_id_images = [] + for image_path in image_path_list: + input_id_images.append(load_image(image_path)) + + id_embed_list = [] + + for img in input_id_images: + img = np.array(img) + img = img[:, :, ::-1] + faces = analyze_faces(face_detector, img) + if len(faces) > 0: + id_embed_list.append(torch.from_numpy((faces[0]['embedding']))) + + if len(id_embed_list) == 0: + raise ValueError(f"No face detected in input image pool") + + id_embeds = torch.stack(id_embed_list) + + # for r in id_embeds: + # print(r) + # #torch.save(id_embeds, input_folder_name+'/id_embeds.pt'); + # weights = dict() + # weights["id_embeds"] = id_embeds + # save_file(weights, input_folder_name+'/id_embeds.safetensors') + + binary_data = id_embeds.numpy().tobytes() + two = 4 + zero = 0 + one = 1 + tensor_name = "id_embeds" +# Write binary data to a file + with open(input_folder_name+'/id_embeds.bin', "wb") as f: + f.write(two.to_bytes(4, byteorder='little')) + f.write((len(tensor_name)).to_bytes(4, byteorder='little')) + f.write(zero.to_bytes(4, byteorder='little')) + f.write((id_embeds.shape[1]).to_bytes(4, byteorder='little')) + f.write((id_embeds.shape[0]).to_bytes(4, byteorder='little')) + f.write(one.to_bytes(4, byteorder='little')) + f.write(one.to_bytes(4, byteorder='little')) + f.write(tensor_name.encode('ascii')) + f.write(binary_data) + \ No newline at end of file diff --git a/cache_dit.hpp b/src/cache_dit.hpp similarity index 100% rename from cache_dit.hpp rename to src/cache_dit.hpp diff --git a/clip.hpp b/src/clip.hpp similarity index 100% rename from clip.hpp rename to src/clip.hpp diff --git a/common.hpp b/src/common.hpp similarity index 100% rename from common.hpp rename to src/common.hpp diff --git a/conditioner.hpp b/src/conditioner.hpp similarity index 100% rename from conditioner.hpp rename to src/conditioner.hpp diff --git a/control.hpp b/src/control.hpp similarity index 100% rename from control.hpp rename to src/control.hpp diff --git a/denoiser.hpp b/src/denoiser.hpp similarity index 100% rename from denoiser.hpp rename to src/denoiser.hpp diff --git a/diffusion_model.hpp b/src/diffusion_model.hpp similarity index 100% rename from diffusion_model.hpp rename to src/diffusion_model.hpp diff --git a/easycache.hpp b/src/easycache.hpp similarity index 100% rename from easycache.hpp rename to src/easycache.hpp diff --git a/esrgan.hpp b/src/esrgan.hpp similarity index 100% rename from esrgan.hpp rename to src/esrgan.hpp diff --git a/flux.hpp b/src/flux.hpp similarity index 100% rename from flux.hpp rename to src/flux.hpp diff --git a/ggml_extend.hpp b/src/ggml_extend.hpp similarity index 100% rename from ggml_extend.hpp rename to src/ggml_extend.hpp diff --git a/gguf_reader.hpp b/src/gguf_reader.hpp similarity index 100% rename from gguf_reader.hpp rename to src/gguf_reader.hpp diff --git a/gits_noise.inl b/src/gits_noise.inl similarity index 100% rename from gits_noise.inl rename to src/gits_noise.inl diff --git a/latent-preview.h b/src/latent-preview.h similarity index 97% rename from latent-preview.h rename to src/latent-preview.h index 76e1741..85c8e0d 100644 --- a/latent-preview.h +++ b/src/latent-preview.h @@ -1,234 +1,234 @@ -#include -#include -#include "ggml.h" - -const float wan_21_latent_rgb_proj[16][3] = { - {0.015123f, -0.148418f, 0.479828f}, - {0.003652f, -0.010680f, -0.037142f}, - {0.212264f, 0.063033f, 0.016779f}, - {0.232999f, 0.406476f, 0.220125f}, - {-0.051864f, -0.082384f, -0.069396f}, - {0.085005f, -0.161492f, 0.010689f}, - {-0.245369f, -0.506846f, -0.117010f}, - {-0.151145f, 0.017721f, 0.007207f}, - {-0.293239f, -0.207936f, -0.421135f}, - {-0.187721f, 0.050783f, 0.177649f}, - {-0.013067f, 0.265964f, 0.166578f}, - {0.028327f, 0.109329f, 0.108642f}, - {-0.205343f, 0.043991f, 0.148914f}, - {0.014307f, -0.048647f, -0.007219f}, - {0.217150f, 0.053074f, 0.319923f}, - {0.155357f, 0.083156f, 0.064780f}}; -float wan_21_latent_rgb_bias[3] = {-0.270270f, -0.234976f, -0.456853f}; - -const float wan_22_latent_rgb_proj[48][3] = { - {0.017126f, -0.027230f, -0.019257f}, - {-0.113739f, -0.028715f, -0.022885f}, - {-0.000106f, 0.021494f, 0.004629f}, - {-0.013273f, -0.107137f, -0.033638f}, - {-0.000381f, 0.000279f, 0.025877f}, - {-0.014216f, -0.003975f, 0.040528f}, - {0.001638f, -0.000748f, 0.011022f}, - {0.029238f, -0.006697f, 0.035933f}, - {0.021641f, -0.015874f, 0.040531f}, - {-0.101984f, -0.070160f, -0.028855f}, - {0.033207f, -0.021068f, 0.002663f}, - {-0.104711f, 0.121673f, 0.102981f}, - {0.082647f, -0.004991f, 0.057237f}, - {-0.027375f, 0.031581f, 0.006868f}, - {-0.045434f, 0.029444f, 0.019287f}, - {-0.046572f, -0.012537f, 0.006675f}, - {0.074709f, 0.033690f, 0.025289f}, - {-0.008251f, -0.002745f, -0.006999f}, - {0.012685f, -0.061856f, -0.048658f}, - {0.042304f, -0.007039f, 0.000295f}, - {-0.007644f, -0.060843f, -0.033142f}, - {0.159909f, 0.045628f, 0.367541f}, - {0.095171f, 0.086438f, 0.010271f}, - {0.006812f, 0.019643f, 0.029637f}, - {0.003467f, -0.010705f, 0.014252f}, - {-0.099681f, -0.066272f, -0.006243f}, - {0.047357f, 0.037040f, 0.000185f}, - {-0.041797f, -0.089225f, -0.032257f}, - {0.008928f, 0.017028f, 0.018684f}, - {-0.042255f, 0.016045f, 0.006849f}, - {0.011268f, 0.036462f, 0.037387f}, - {0.011553f, -0.016375f, -0.048589f}, - {0.046266f, -0.027189f, 0.056979f}, - {0.009640f, -0.017576f, 0.030324f}, - {-0.045794f, -0.036083f, -0.010616f}, - {0.022418f, 0.039783f, -0.032939f}, - {-0.052714f, -0.015525f, 0.007438f}, - {0.193004f, 0.223541f, 0.264175f}, - {-0.059406f, -0.008188f, 0.022867f}, - {-0.156742f, -0.263791f, -0.007385f}, - {-0.015717f, 0.016570f, 0.033969f}, - {0.037969f, 0.109835f, 0.200449f}, - {-0.000782f, -0.009566f, -0.008058f}, - {0.010709f, 0.052960f, -0.044195f}, - {0.017271f, 0.045839f, 0.034569f}, - {0.009424f, 0.013088f, -0.001714f}, - {-0.024805f, -0.059378f, -0.033756f}, - {-0.078293f, 0.029070f, 0.026129f}}; -float wan_22_latent_rgb_bias[3] = {0.013160f, -0.096492f, -0.071323f}; - -const float flux_latent_rgb_proj[16][3] = { - {-0.041168f, 0.019917f, 0.097253f}, - {0.028096f, 0.026730f, 0.129576f}, - {0.065618f, -0.067950f, -0.014651f}, - {-0.012998f, -0.014762f, 0.081251f}, - {0.078567f, 0.059296f, -0.024687f}, - {-0.015987f, -0.003697f, 0.005012f}, - {0.033605f, 0.138999f, 0.068517f}, - {-0.024450f, -0.063567f, -0.030101f}, - {-0.040194f, -0.016710f, 0.127185f}, - {0.112681f, 0.088764f, -0.041940f}, - {-0.023498f, 0.093664f, 0.025543f}, - {0.082899f, 0.048320f, 0.007491f}, - {0.075712f, 0.074139f, 0.081965f}, - {-0.143501f, 0.018263f, -0.136138f}, - {-0.025767f, -0.082035f, -0.040023f}, - {-0.111849f, -0.055589f, -0.032361f}}; -float flux_latent_rgb_bias[3] = {0.024600f, -0.006937f, -0.008089f}; - -const float flux2_latent_rgb_proj[32][3] = { - {0.000736f, -0.008385f, -0.019710f}, - {-0.001352f, -0.016392f, 0.020693f}, - {-0.006376f, 0.002428f, 0.036736f}, - {0.039384f, 0.074167f, 0.119789f}, - {0.007464f, -0.005705f, -0.004734f}, - {-0.004086f, 0.005287f, -0.000409f}, - {-0.032835f, 0.050802f, -0.028120f}, - {-0.003158f, -0.000835f, 0.000406f}, - {-0.112840f, -0.084337f, -0.023083f}, - {0.001462f, -0.006656f, 0.000549f}, - {-0.009980f, -0.007480f, 0.009702f}, - {0.032540f, 0.000214f, -0.061388f}, - {0.011023f, 0.000694f, 0.007143f}, - {-0.001468f, -0.006723f, -0.001678f}, - {-0.005921f, -0.010320f, -0.003907f}, - {-0.028434f, 0.027584f, 0.018457f}, - {0.014349f, 0.011523f, 0.000441f}, - {0.009874f, 0.003081f, 0.001507f}, - {0.002218f, 0.005712f, 0.001563f}, - {0.053010f, -0.019844f, 0.008683f}, - {-0.002507f, 0.005384f, 0.000938f}, - {-0.002177f, -0.011366f, 0.003559f}, - {-0.000261f, 0.015121f, -0.003240f}, - {-0.003944f, -0.002083f, 0.005043f}, - {-0.009138f, 0.011336f, 0.003781f}, - {0.011429f, 0.003985f, -0.003855f}, - {0.010518f, -0.005586f, 0.010131f}, - {0.007883f, 0.002912f, -0.001473f}, - {-0.003318f, -0.003160f, 0.003684f}, - {-0.034560f, -0.008740f, 0.012996f}, - {0.000166f, 0.001079f, -0.012153f}, - {0.017772f, 0.000937f, -0.011953f}}; -float flux2_latent_rgb_bias[3] = {-0.028738f, -0.098463f, -0.107619f}; - -// This one was taken straight from -// https://github.com/Stability-AI/sd3.5/blob/8565799a3b41eb0c7ba976d18375f0f753f56402/sd3_impls.py#L288-L303 -// (MiT Licence) -const float sd3_latent_rgb_proj[16][3] = { - {-0.0645f, 0.0177f, 0.1052f}, - {0.0028f, 0.0312f, 0.0650f}, - {0.1848f, 0.0762f, 0.0360f}, - {0.0944f, 0.0360f, 0.0889f}, - {0.0897f, 0.0506f, -0.0364f}, - {-0.0020f, 0.1203f, 0.0284f}, - {0.0855f, 0.0118f, 0.0283f}, - {-0.0539f, 0.0658f, 0.1047f}, - {-0.0057f, 0.0116f, 0.0700f}, - {-0.0412f, 0.0281f, -0.0039f}, - {0.1106f, 0.1171f, 0.1220f}, - {-0.0248f, 0.0682f, -0.0481f}, - {0.0815f, 0.0846f, 0.1207f}, - {-0.0120f, -0.0055f, -0.0867f}, - {-0.0749f, -0.0634f, -0.0456f}, - {-0.1418f, -0.1457f, -0.1259f}, -}; -float sd3_latent_rgb_bias[3] = {0, 0, 0}; - -const float sdxl_latent_rgb_proj[4][3] = { - {0.258303f, 0.277640f, 0.329699f}, - {-0.299701f, 0.105446f, 0.014194f}, - {0.050522f, 0.186163f, -0.143257f}, - {-0.211938f, -0.149892f, -0.080036f}}; -float sdxl_latent_rgb_bias[3] = {0.144381f, -0.033313f, 0.007061f}; - -const float sd_latent_rgb_proj[4][3] = { - {0.337366f, 0.216344f, 0.257386f}, - {0.165636f, 0.386828f, 0.046994f}, - {-0.267803f, 0.237036f, 0.223517f}, - {-0.178022f, -0.200862f, -0.678514f}}; -float sd_latent_rgb_bias[3] = {-0.017478f, -0.055834f, -0.105825f}; - -void preview_latent_video(uint8_t* buffer, struct ggml_tensor* latents, const float (*latent_rgb_proj)[3], const float latent_rgb_bias[3], int patch_size) { - size_t buffer_head = 0; - - uint32_t latent_width = static_cast(latents->ne[0]); - uint32_t latent_height = static_cast(latents->ne[1]); - uint32_t dim = static_cast(latents->ne[ggml_n_dims(latents) - 1]); - uint32_t frames = 1; - if (ggml_n_dims(latents) == 4) { - frames = static_cast(latents->ne[2]); - } - - uint32_t rgb_width = latent_width * patch_size; - uint32_t rgb_height = latent_height * patch_size; - - uint32_t unpatched_dim = dim / (patch_size * patch_size); - - for (uint32_t k = 0; k < frames; k++) { - for (uint32_t rgb_x = 0; rgb_x < rgb_width; rgb_x++) { - for (uint32_t rgb_y = 0; rgb_y < rgb_height; rgb_y++) { - int latent_x = rgb_x / patch_size; - int latent_y = rgb_y / patch_size; - - int channel_offset = 0; - if (patch_size > 1) { - channel_offset = ((rgb_y % patch_size) * patch_size + (rgb_x % patch_size)); - } - - size_t latent_id = (latent_x * latents->nb[0] + latent_y * latents->nb[1] + k * latents->nb[2]); - - // should be incremented by 1 for each pixel - size_t pixel_id = k * rgb_width * rgb_height + rgb_y * rgb_width + rgb_x; - - float r = 0, g = 0, b = 0; - if (latent_rgb_proj != nullptr) { - for (uint32_t d = 0; d < unpatched_dim; d++) { - float value = *(float*)((char*)latents->data + latent_id + (d * patch_size * patch_size + channel_offset) * latents->nb[ggml_n_dims(latents) - 1]); - r += value * latent_rgb_proj[d][0]; - g += value * latent_rgb_proj[d][1]; - b += value * latent_rgb_proj[d][2]; - } - } else { - // interpret first 3 channels as RGB - r = *(float*)((char*)latents->data + latent_id + 0 * latents->nb[ggml_n_dims(latents) - 1]); - g = *(float*)((char*)latents->data + latent_id + 1 * latents->nb[ggml_n_dims(latents) - 1]); - b = *(float*)((char*)latents->data + latent_id + 2 * latents->nb[ggml_n_dims(latents) - 1]); - } - if (latent_rgb_bias != nullptr) { - // bias - r += latent_rgb_bias[0]; - g += latent_rgb_bias[1]; - b += latent_rgb_bias[2]; - } - // change range - r = r * .5f + .5f; - g = g * .5f + .5f; - b = b * .5f + .5f; - - // clamp rgb values to [0,1] range - r = r >= 0 ? r <= 1 ? r : 1 : 0; - g = g >= 0 ? g <= 1 ? g : 1 : 0; - b = b >= 0 ? b <= 1 ? b : 1 : 0; - - buffer[pixel_id * 3 + 0] = (uint8_t)(r * 255); - buffer[pixel_id * 3 + 1] = (uint8_t)(g * 255); - buffer[pixel_id * 3 + 2] = (uint8_t)(b * 255); - } - } - } -} +#include +#include +#include "ggml.h" + +const float wan_21_latent_rgb_proj[16][3] = { + {0.015123f, -0.148418f, 0.479828f}, + {0.003652f, -0.010680f, -0.037142f}, + {0.212264f, 0.063033f, 0.016779f}, + {0.232999f, 0.406476f, 0.220125f}, + {-0.051864f, -0.082384f, -0.069396f}, + {0.085005f, -0.161492f, 0.010689f}, + {-0.245369f, -0.506846f, -0.117010f}, + {-0.151145f, 0.017721f, 0.007207f}, + {-0.293239f, -0.207936f, -0.421135f}, + {-0.187721f, 0.050783f, 0.177649f}, + {-0.013067f, 0.265964f, 0.166578f}, + {0.028327f, 0.109329f, 0.108642f}, + {-0.205343f, 0.043991f, 0.148914f}, + {0.014307f, -0.048647f, -0.007219f}, + {0.217150f, 0.053074f, 0.319923f}, + {0.155357f, 0.083156f, 0.064780f}}; +float wan_21_latent_rgb_bias[3] = {-0.270270f, -0.234976f, -0.456853f}; + +const float wan_22_latent_rgb_proj[48][3] = { + {0.017126f, -0.027230f, -0.019257f}, + {-0.113739f, -0.028715f, -0.022885f}, + {-0.000106f, 0.021494f, 0.004629f}, + {-0.013273f, -0.107137f, -0.033638f}, + {-0.000381f, 0.000279f, 0.025877f}, + {-0.014216f, -0.003975f, 0.040528f}, + {0.001638f, -0.000748f, 0.011022f}, + {0.029238f, -0.006697f, 0.035933f}, + {0.021641f, -0.015874f, 0.040531f}, + {-0.101984f, -0.070160f, -0.028855f}, + {0.033207f, -0.021068f, 0.002663f}, + {-0.104711f, 0.121673f, 0.102981f}, + {0.082647f, -0.004991f, 0.057237f}, + {-0.027375f, 0.031581f, 0.006868f}, + {-0.045434f, 0.029444f, 0.019287f}, + {-0.046572f, -0.012537f, 0.006675f}, + {0.074709f, 0.033690f, 0.025289f}, + {-0.008251f, -0.002745f, -0.006999f}, + {0.012685f, -0.061856f, -0.048658f}, + {0.042304f, -0.007039f, 0.000295f}, + {-0.007644f, -0.060843f, -0.033142f}, + {0.159909f, 0.045628f, 0.367541f}, + {0.095171f, 0.086438f, 0.010271f}, + {0.006812f, 0.019643f, 0.029637f}, + {0.003467f, -0.010705f, 0.014252f}, + {-0.099681f, -0.066272f, -0.006243f}, + {0.047357f, 0.037040f, 0.000185f}, + {-0.041797f, -0.089225f, -0.032257f}, + {0.008928f, 0.017028f, 0.018684f}, + {-0.042255f, 0.016045f, 0.006849f}, + {0.011268f, 0.036462f, 0.037387f}, + {0.011553f, -0.016375f, -0.048589f}, + {0.046266f, -0.027189f, 0.056979f}, + {0.009640f, -0.017576f, 0.030324f}, + {-0.045794f, -0.036083f, -0.010616f}, + {0.022418f, 0.039783f, -0.032939f}, + {-0.052714f, -0.015525f, 0.007438f}, + {0.193004f, 0.223541f, 0.264175f}, + {-0.059406f, -0.008188f, 0.022867f}, + {-0.156742f, -0.263791f, -0.007385f}, + {-0.015717f, 0.016570f, 0.033969f}, + {0.037969f, 0.109835f, 0.200449f}, + {-0.000782f, -0.009566f, -0.008058f}, + {0.010709f, 0.052960f, -0.044195f}, + {0.017271f, 0.045839f, 0.034569f}, + {0.009424f, 0.013088f, -0.001714f}, + {-0.024805f, -0.059378f, -0.033756f}, + {-0.078293f, 0.029070f, 0.026129f}}; +float wan_22_latent_rgb_bias[3] = {0.013160f, -0.096492f, -0.071323f}; + +const float flux_latent_rgb_proj[16][3] = { + {-0.041168f, 0.019917f, 0.097253f}, + {0.028096f, 0.026730f, 0.129576f}, + {0.065618f, -0.067950f, -0.014651f}, + {-0.012998f, -0.014762f, 0.081251f}, + {0.078567f, 0.059296f, -0.024687f}, + {-0.015987f, -0.003697f, 0.005012f}, + {0.033605f, 0.138999f, 0.068517f}, + {-0.024450f, -0.063567f, -0.030101f}, + {-0.040194f, -0.016710f, 0.127185f}, + {0.112681f, 0.088764f, -0.041940f}, + {-0.023498f, 0.093664f, 0.025543f}, + {0.082899f, 0.048320f, 0.007491f}, + {0.075712f, 0.074139f, 0.081965f}, + {-0.143501f, 0.018263f, -0.136138f}, + {-0.025767f, -0.082035f, -0.040023f}, + {-0.111849f, -0.055589f, -0.032361f}}; +float flux_latent_rgb_bias[3] = {0.024600f, -0.006937f, -0.008089f}; + +const float flux2_latent_rgb_proj[32][3] = { + {0.000736f, -0.008385f, -0.019710f}, + {-0.001352f, -0.016392f, 0.020693f}, + {-0.006376f, 0.002428f, 0.036736f}, + {0.039384f, 0.074167f, 0.119789f}, + {0.007464f, -0.005705f, -0.004734f}, + {-0.004086f, 0.005287f, -0.000409f}, + {-0.032835f, 0.050802f, -0.028120f}, + {-0.003158f, -0.000835f, 0.000406f}, + {-0.112840f, -0.084337f, -0.023083f}, + {0.001462f, -0.006656f, 0.000549f}, + {-0.009980f, -0.007480f, 0.009702f}, + {0.032540f, 0.000214f, -0.061388f}, + {0.011023f, 0.000694f, 0.007143f}, + {-0.001468f, -0.006723f, -0.001678f}, + {-0.005921f, -0.010320f, -0.003907f}, + {-0.028434f, 0.027584f, 0.018457f}, + {0.014349f, 0.011523f, 0.000441f}, + {0.009874f, 0.003081f, 0.001507f}, + {0.002218f, 0.005712f, 0.001563f}, + {0.053010f, -0.019844f, 0.008683f}, + {-0.002507f, 0.005384f, 0.000938f}, + {-0.002177f, -0.011366f, 0.003559f}, + {-0.000261f, 0.015121f, -0.003240f}, + {-0.003944f, -0.002083f, 0.005043f}, + {-0.009138f, 0.011336f, 0.003781f}, + {0.011429f, 0.003985f, -0.003855f}, + {0.010518f, -0.005586f, 0.010131f}, + {0.007883f, 0.002912f, -0.001473f}, + {-0.003318f, -0.003160f, 0.003684f}, + {-0.034560f, -0.008740f, 0.012996f}, + {0.000166f, 0.001079f, -0.012153f}, + {0.017772f, 0.000937f, -0.011953f}}; +float flux2_latent_rgb_bias[3] = {-0.028738f, -0.098463f, -0.107619f}; + +// This one was taken straight from +// https://github.com/Stability-AI/sd3.5/blob/8565799a3b41eb0c7ba976d18375f0f753f56402/sd3_impls.py#L288-L303 +// (MiT Licence) +const float sd3_latent_rgb_proj[16][3] = { + {-0.0645f, 0.0177f, 0.1052f}, + {0.0028f, 0.0312f, 0.0650f}, + {0.1848f, 0.0762f, 0.0360f}, + {0.0944f, 0.0360f, 0.0889f}, + {0.0897f, 0.0506f, -0.0364f}, + {-0.0020f, 0.1203f, 0.0284f}, + {0.0855f, 0.0118f, 0.0283f}, + {-0.0539f, 0.0658f, 0.1047f}, + {-0.0057f, 0.0116f, 0.0700f}, + {-0.0412f, 0.0281f, -0.0039f}, + {0.1106f, 0.1171f, 0.1220f}, + {-0.0248f, 0.0682f, -0.0481f}, + {0.0815f, 0.0846f, 0.1207f}, + {-0.0120f, -0.0055f, -0.0867f}, + {-0.0749f, -0.0634f, -0.0456f}, + {-0.1418f, -0.1457f, -0.1259f}, +}; +float sd3_latent_rgb_bias[3] = {0, 0, 0}; + +const float sdxl_latent_rgb_proj[4][3] = { + {0.258303f, 0.277640f, 0.329699f}, + {-0.299701f, 0.105446f, 0.014194f}, + {0.050522f, 0.186163f, -0.143257f}, + {-0.211938f, -0.149892f, -0.080036f}}; +float sdxl_latent_rgb_bias[3] = {0.144381f, -0.033313f, 0.007061f}; + +const float sd_latent_rgb_proj[4][3] = { + {0.337366f, 0.216344f, 0.257386f}, + {0.165636f, 0.386828f, 0.046994f}, + {-0.267803f, 0.237036f, 0.223517f}, + {-0.178022f, -0.200862f, -0.678514f}}; +float sd_latent_rgb_bias[3] = {-0.017478f, -0.055834f, -0.105825f}; + +void preview_latent_video(uint8_t* buffer, struct ggml_tensor* latents, const float (*latent_rgb_proj)[3], const float latent_rgb_bias[3], int patch_size) { + size_t buffer_head = 0; + + uint32_t latent_width = static_cast(latents->ne[0]); + uint32_t latent_height = static_cast(latents->ne[1]); + uint32_t dim = static_cast(latents->ne[ggml_n_dims(latents) - 1]); + uint32_t frames = 1; + if (ggml_n_dims(latents) == 4) { + frames = static_cast(latents->ne[2]); + } + + uint32_t rgb_width = latent_width * patch_size; + uint32_t rgb_height = latent_height * patch_size; + + uint32_t unpatched_dim = dim / (patch_size * patch_size); + + for (uint32_t k = 0; k < frames; k++) { + for (uint32_t rgb_x = 0; rgb_x < rgb_width; rgb_x++) { + for (uint32_t rgb_y = 0; rgb_y < rgb_height; rgb_y++) { + int latent_x = rgb_x / patch_size; + int latent_y = rgb_y / patch_size; + + int channel_offset = 0; + if (patch_size > 1) { + channel_offset = ((rgb_y % patch_size) * patch_size + (rgb_x % patch_size)); + } + + size_t latent_id = (latent_x * latents->nb[0] + latent_y * latents->nb[1] + k * latents->nb[2]); + + // should be incremented by 1 for each pixel + size_t pixel_id = k * rgb_width * rgb_height + rgb_y * rgb_width + rgb_x; + + float r = 0, g = 0, b = 0; + if (latent_rgb_proj != nullptr) { + for (uint32_t d = 0; d < unpatched_dim; d++) { + float value = *(float*)((char*)latents->data + latent_id + (d * patch_size * patch_size + channel_offset) * latents->nb[ggml_n_dims(latents) - 1]); + r += value * latent_rgb_proj[d][0]; + g += value * latent_rgb_proj[d][1]; + b += value * latent_rgb_proj[d][2]; + } + } else { + // interpret first 3 channels as RGB + r = *(float*)((char*)latents->data + latent_id + 0 * latents->nb[ggml_n_dims(latents) - 1]); + g = *(float*)((char*)latents->data + latent_id + 1 * latents->nb[ggml_n_dims(latents) - 1]); + b = *(float*)((char*)latents->data + latent_id + 2 * latents->nb[ggml_n_dims(latents) - 1]); + } + if (latent_rgb_bias != nullptr) { + // bias + r += latent_rgb_bias[0]; + g += latent_rgb_bias[1]; + b += latent_rgb_bias[2]; + } + // change range + r = r * .5f + .5f; + g = g * .5f + .5f; + b = b * .5f + .5f; + + // clamp rgb values to [0,1] range + r = r >= 0 ? r <= 1 ? r : 1 : 0; + g = g >= 0 ? g <= 1 ? g : 1 : 0; + b = b >= 0 ? b <= 1 ? b : 1 : 0; + + buffer[pixel_id * 3 + 0] = (uint8_t)(r * 255); + buffer[pixel_id * 3 + 1] = (uint8_t)(g * 255); + buffer[pixel_id * 3 + 2] = (uint8_t)(b * 255); + } + } + } +} diff --git a/llm.hpp b/src/llm.hpp similarity index 100% rename from llm.hpp rename to src/llm.hpp diff --git a/lora.hpp b/src/lora.hpp similarity index 100% rename from lora.hpp rename to src/lora.hpp diff --git a/ltxv.hpp b/src/ltxv.hpp similarity index 100% rename from ltxv.hpp rename to src/ltxv.hpp diff --git a/mmdit.hpp b/src/mmdit.hpp similarity index 100% rename from mmdit.hpp rename to src/mmdit.hpp diff --git a/model.cpp b/src/model.cpp similarity index 100% rename from model.cpp rename to src/model.cpp diff --git a/model.h b/src/model.h similarity index 100% rename from model.h rename to src/model.h diff --git a/name_conversion.cpp b/src/name_conversion.cpp similarity index 100% rename from name_conversion.cpp rename to src/name_conversion.cpp diff --git a/name_conversion.h b/src/name_conversion.h similarity index 100% rename from name_conversion.h rename to src/name_conversion.h diff --git a/ordered_map.hpp b/src/ordered_map.hpp similarity index 100% rename from ordered_map.hpp rename to src/ordered_map.hpp diff --git a/pmid.hpp b/src/pmid.hpp similarity index 100% rename from pmid.hpp rename to src/pmid.hpp diff --git a/preprocessing.hpp b/src/preprocessing.hpp similarity index 100% rename from preprocessing.hpp rename to src/preprocessing.hpp diff --git a/qwen_image.hpp b/src/qwen_image.hpp similarity index 100% rename from qwen_image.hpp rename to src/qwen_image.hpp diff --git a/rng.hpp b/src/rng.hpp similarity index 100% rename from rng.hpp rename to src/rng.hpp diff --git a/rng_mt19937.hpp b/src/rng_mt19937.hpp similarity index 100% rename from rng_mt19937.hpp rename to src/rng_mt19937.hpp diff --git a/rng_philox.hpp b/src/rng_philox.hpp similarity index 100% rename from rng_philox.hpp rename to src/rng_philox.hpp diff --git a/rope.hpp b/src/rope.hpp similarity index 100% rename from rope.hpp rename to src/rope.hpp diff --git a/stable-diffusion.cpp b/src/stable-diffusion.cpp similarity index 100% rename from stable-diffusion.cpp rename to src/stable-diffusion.cpp diff --git a/t5.hpp b/src/t5.hpp similarity index 100% rename from t5.hpp rename to src/t5.hpp diff --git a/tae.hpp b/src/tae.hpp similarity index 100% rename from tae.hpp rename to src/tae.hpp diff --git a/tokenize_util.cpp b/src/tokenize_util.cpp similarity index 100% rename from tokenize_util.cpp rename to src/tokenize_util.cpp diff --git a/tokenize_util.h b/src/tokenize_util.h similarity index 100% rename from tokenize_util.h rename to src/tokenize_util.h diff --git a/ucache.hpp b/src/ucache.hpp similarity index 100% rename from ucache.hpp rename to src/ucache.hpp diff --git a/unet.hpp b/src/unet.hpp similarity index 100% rename from unet.hpp rename to src/unet.hpp diff --git a/upscaler.cpp b/src/upscaler.cpp similarity index 100% rename from upscaler.cpp rename to src/upscaler.cpp diff --git a/util.cpp b/src/util.cpp similarity index 100% rename from util.cpp rename to src/util.cpp diff --git a/util.h b/src/util.h similarity index 100% rename from util.h rename to src/util.h diff --git a/vae.hpp b/src/vae.hpp similarity index 100% rename from vae.hpp rename to src/vae.hpp diff --git a/version.cpp b/src/version.cpp similarity index 100% rename from version.cpp rename to src/version.cpp diff --git a/vocab.hpp b/src/vocab.hpp similarity index 100% rename from vocab.hpp rename to src/vocab.hpp diff --git a/vocab_mistral.hpp b/src/vocab_mistral.hpp similarity index 100% rename from vocab_mistral.hpp rename to src/vocab_mistral.hpp diff --git a/vocab_qwen.hpp b/src/vocab_qwen.hpp similarity index 100% rename from vocab_qwen.hpp rename to src/vocab_qwen.hpp diff --git a/vocab_umt5.hpp b/src/vocab_umt5.hpp similarity index 100% rename from vocab_umt5.hpp rename to src/vocab_umt5.hpp diff --git a/wan.hpp b/src/wan.hpp similarity index 100% rename from wan.hpp rename to src/wan.hpp diff --git a/z_image.hpp b/src/z_image.hpp similarity index 100% rename from z_image.hpp rename to src/z_image.hpp