Mirror of https://github.com/leejet/stable-diffusion.cpp.git (synced 2025-12-13 05:48:56 +00:00)
Compare commits
No commits in common. "d6c87dce5ce22db02d9c41386fc4704f9444df91" and "884e23eeebe4eb4a2fde9fb6ab097e649c4f742d" have entirely different histories.
d6c87dce5c ... 884e23eeeb
README.md

@@ -14,7 +14,6 @@ Inference of Stable Diffusion and Flux in pure C/C++
 - !!!The VAE in SDXL encounters NaN issues under FP16, but unfortunately, the ggml_conv_2d only operates under FP16. Hence, a parameter is needed to specify the VAE that has fixed the FP16 NaN issue. You can find it here: [SDXL VAE FP16 Fix](https://huggingface.co/madebyollin/sdxl-vae-fp16-fix/blob/main/sdxl_vae.safetensors).
 - [Flux-dev/Flux-schnell Support](./docs/flux.md)
 - [FLUX.1-Kontext-dev](./docs/kontext.md)
-- [Chroma](./docs/chroma.md)
 - [SD-Turbo](https://huggingface.co/stabilityai/sd-turbo) and [SDXL-Turbo](https://huggingface.co/stabilityai/sdxl-turbo) support
 - [PhotoMaker](https://github.com/TencentARC/PhotoMaker) support.
 - 16-bit, 32-bit float support
@@ -275,9 +274,6 @@ arguments:
   --control-net-cpu                  keep controlnet in cpu (for low vram)
   --canny                            apply canny preprocessor (edge detection)
   --color                            colors the logging tags according to level
-  --chroma-disable-dit-mask          disable dit mask for chroma
-  --chroma-enable-t5-mask            enable t5 mask for chroma
-  --chroma-t5-mask-pad PAD_SIZE      t5 mask pad size of chroma
   -v, --verbose                      print extra info
 ```

(binary image file removed — 539 KiB; content not shown)
conditioner.hpp (212 lines changed)

@@ -747,7 +747,7 @@ struct SD3CLIPEmbedder : public Conditioner {
 
         clip_l_tokenizer.pad_tokens(clip_l_tokens, clip_l_weights, max_length, padding);
         clip_g_tokenizer.pad_tokens(clip_g_tokens, clip_g_weights, max_length, padding);
-        t5_tokenizer.pad_tokens(t5_tokens, t5_weights, NULL, max_length, padding);
+        t5_tokenizer.pad_tokens(t5_tokens, t5_weights, max_length, padding);
 
         // for (int i = 0; i < clip_l_tokens.size(); i++) {
         //     std::cout << clip_l_tokens[i] << ":" << clip_l_weights[i] << ", ";
@@ -902,7 +902,6 @@ struct SD3CLIPEmbedder : public Conditioner {
 
             t5->compute(n_threads,
                         input_ids,
-                        NULL,
                         &chunk_hidden_states_t5,
                         work_ctx);
             {
@@ -1005,7 +1004,6 @@ struct FluxCLIPEmbedder : public Conditioner {
     T5UniGramTokenizer t5_tokenizer;
     std::shared_ptr<CLIPTextModelRunner> clip_l;
     std::shared_ptr<T5Runner> t5;
-    size_t chunk_len = 256;
 
     FluxCLIPEmbedder(ggml_backend_t backend,
                      std::map<std::string, enum ggml_type>& tensor_types,
@@ -1079,7 +1077,7 @@ struct FluxCLIPEmbedder : public Conditioner {
         }
 
         clip_l_tokenizer.pad_tokens(clip_l_tokens, clip_l_weights, 77, padding);
-        t5_tokenizer.pad_tokens(t5_tokens, t5_weights, NULL, max_length, padding);
+        t5_tokenizer.pad_tokens(t5_tokens, t5_weights, max_length, padding);
 
         // for (int i = 0; i < clip_l_tokens.size(); i++) {
         //     std::cout << clip_l_tokens[i] << ":" << clip_l_weights[i] << ", ";
@@ -1111,6 +1109,7 @@ struct FluxCLIPEmbedder : public Conditioner {
         struct ggml_tensor* pooled = NULL;  // [768,]
         std::vector<float> hidden_states_vec;
 
+        size_t chunk_len   = 256;
         size_t chunk_count = t5_tokens.size() / chunk_len;
         for (int chunk_idx = 0; chunk_idx < chunk_count; chunk_idx++) {
             // clip_l
@@ -1148,7 +1147,6 @@ struct FluxCLIPEmbedder : public Conditioner {
 
            t5->compute(n_threads,
                        input_ids,
-                       NULL,
                        &chunk_hidden_states,
                        work_ctx);
            {
@@ -1198,209 +1196,7 @@ struct FluxCLIPEmbedder : public Conditioner {
                                       int height,
                                       int adm_in_channels        = -1,
                                       bool force_zero_embeddings = false) {
-        auto tokens_and_weights = tokenize(text, chunk_len, true);
-        return get_learned_condition_common(work_ctx, n_threads, tokens_and_weights, clip_skip, force_zero_embeddings);
-    }
-
-    std::tuple<SDCondition, std::vector<bool>> get_learned_condition_with_trigger(ggml_context* work_ctx,
-                                                                                  int n_threads,
-                                                                                  const std::string& text,
-                                                                                  int clip_skip,
-                                                                                  int width,
-                                                                                  int height,
-                                                                                  int num_input_imgs,
-                                                                                  int adm_in_channels        = -1,
-                                                                                  bool force_zero_embeddings = false) {
-        GGML_ASSERT(0 && "Not implemented yet!");
-    }
-
-    std::string remove_trigger_from_prompt(ggml_context* work_ctx,
-                                           const std::string& prompt) {
-        GGML_ASSERT(0 && "Not implemented yet!");
-    }
-};
-
-struct PixArtCLIPEmbedder : public Conditioner {
-    T5UniGramTokenizer t5_tokenizer;
-    std::shared_ptr<T5Runner> t5;
-    size_t chunk_len = 512;
-    bool use_mask    = false;
-    int mask_pad     = 1;
-
-    PixArtCLIPEmbedder(ggml_backend_t backend,
-                       std::map<std::string, enum ggml_type>& tensor_types,
-                       int clip_skip = -1,
-                       bool use_mask = false,
-                       int mask_pad  = 1)
-        : use_mask(use_mask), mask_pad(mask_pad) {
-        t5 = std::make_shared<T5Runner>(backend, tensor_types, "text_encoders.t5xxl.transformer");
-    }
-
-    void set_clip_skip(int clip_skip) {
-    }
-
-    void get_param_tensors(std::map<std::string, struct ggml_tensor*>& tensors) {
-        t5->get_param_tensors(tensors, "text_encoders.t5xxl.transformer");
-    }
-
-    void alloc_params_buffer() {
-        t5->alloc_params_buffer();
-    }
-
-    void free_params_buffer() {
-        t5->free_params_buffer();
-    }
-
-    size_t get_params_buffer_size() {
-        size_t buffer_size = 0;
-
-        buffer_size += t5->get_params_buffer_size();
-
-        return buffer_size;
-    }
-
-    std::tuple<std::vector<int>, std::vector<float>, std::vector<float>> tokenize(std::string text,
-                                                                                  size_t max_length = 0,
-                                                                                  bool padding      = false) {
-        auto parsed_attention = parse_prompt_attention(text);
-
-        {
-            std::stringstream ss;
-            ss << "[";
-            for (const auto& item : parsed_attention) {
-                ss << "['" << item.first << "', " << item.second << "], ";
-            }
-            ss << "]";
-            LOG_DEBUG("parse '%s' to %s", text.c_str(), ss.str().c_str());
-        }
-
-        auto on_new_token_cb = [&](std::string& str, std::vector<int32_t>& bpe_tokens) -> bool {
-            return false;
-        };
-
-        std::vector<int> t5_tokens;
-        std::vector<float> t5_weights;
-        std::vector<float> t5_mask;
-        for (const auto& item : parsed_attention) {
-            const std::string& curr_text = item.first;
-            float curr_weight            = item.second;
-
-            std::vector<int> curr_tokens = t5_tokenizer.Encode(curr_text, true);
-            t5_tokens.insert(t5_tokens.end(), curr_tokens.begin(), curr_tokens.end());
-            t5_weights.insert(t5_weights.end(), curr_tokens.size(), curr_weight);
-        }
-
-        t5_tokenizer.pad_tokens(t5_tokens, t5_weights, &t5_mask, max_length, padding);
-
-        return {t5_tokens, t5_weights, t5_mask};
-    }
-
-    void modify_mask_to_attend_padding(struct ggml_tensor* mask, int max_seq_length, int num_extra_padding = 8) {
-        float* mask_data = (float*)mask->data;
-        int num_pad      = 0;
-        for (int64_t i = 0; i < max_seq_length; i++) {
-            if (num_pad >= num_extra_padding) {
-                break;
-            }
-            if (std::isinf(mask_data[i])) {
-                mask_data[i] = 0;
-                ++num_pad;
-            }
-        }
-        // LOG_DEBUG("PAD: %d", num_pad);
-    }
-
-    SDCondition get_learned_condition_common(ggml_context* work_ctx,
-                                             int n_threads,
-                                             std::tuple<std::vector<int>, std::vector<float>, std::vector<float>> token_and_weights,
-                                             int clip_skip,
-                                             bool force_zero_embeddings = false) {
-        auto& t5_tokens        = std::get<0>(token_and_weights);
-        auto& t5_weights       = std::get<1>(token_and_weights);
-        auto& t5_attn_mask_vec = std::get<2>(token_and_weights);
-
-        int64_t t0                              = ggml_time_ms();
-        struct ggml_tensor* hidden_states       = NULL;  // [N, n_token, 4096]
-        struct ggml_tensor* chunk_hidden_states = NULL;  // [n_token, 4096]
-        struct ggml_tensor* pooled              = NULL;  // [768,]
-        struct ggml_tensor* t5_attn_mask        = vector_to_ggml_tensor(work_ctx, t5_attn_mask_vec);  // [768,]
-
-        std::vector<float> hidden_states_vec;
-
-        size_t chunk_count = t5_tokens.size() / chunk_len;
-
-        for (int chunk_idx = 0; chunk_idx < chunk_count; chunk_idx++) {
-            // t5
-            std::vector<int> chunk_tokens(t5_tokens.begin() + chunk_idx * chunk_len,
-                                          t5_tokens.begin() + (chunk_idx + 1) * chunk_len);
-            std::vector<float> chunk_weights(t5_weights.begin() + chunk_idx * chunk_len,
-                                             t5_weights.begin() + (chunk_idx + 1) * chunk_len);
-            std::vector<float> chunk_mask(t5_attn_mask_vec.begin() + chunk_idx * chunk_len,
-                                          t5_attn_mask_vec.begin() + (chunk_idx + 1) * chunk_len);
-
-            auto input_ids          = vector_to_ggml_tensor_i32(work_ctx, chunk_tokens);
-            auto t5_attn_mask_chunk = use_mask ? vector_to_ggml_tensor(work_ctx, chunk_mask) : NULL;
-
-            t5->compute(n_threads,
-                        input_ids,
-                        t5_attn_mask_chunk,
-                        &chunk_hidden_states,
-                        work_ctx);
-            {
-                auto tensor         = chunk_hidden_states;
-                float original_mean = ggml_tensor_mean(tensor);
-                for (int i2 = 0; i2 < tensor->ne[2]; i2++) {
-                    for (int i1 = 0; i1 < tensor->ne[1]; i1++) {
-                        for (int i0 = 0; i0 < tensor->ne[0]; i0++) {
-                            float value = ggml_tensor_get_f32(tensor, i0, i1, i2);
-                            value *= chunk_weights[i1];
-                            ggml_tensor_set_f32(tensor, value, i0, i1, i2);
-                        }
-                    }
-                }
-                float new_mean = ggml_tensor_mean(tensor);
-                ggml_tensor_scale(tensor, (original_mean / new_mean));
-            }
-
-            int64_t t1 = ggml_time_ms();
-            LOG_DEBUG("computing condition graph completed, taking %" PRId64 " ms", t1 - t0);
-            if (force_zero_embeddings) {
-                float* vec = (float*)chunk_hidden_states->data;
-                for (int i = 0; i < ggml_nelements(chunk_hidden_states); i++) {
-                    vec[i] = 0;
-                }
-            }
-
-            hidden_states_vec.insert(hidden_states_vec.end(),
-                                     (float*)chunk_hidden_states->data,
-                                     ((float*)chunk_hidden_states->data) + ggml_nelements(chunk_hidden_states));
-        }
-
-        if (hidden_states_vec.size() > 0) {
-            hidden_states = vector_to_ggml_tensor(work_ctx, hidden_states_vec);
-            hidden_states = ggml_reshape_2d(work_ctx,
-                                            hidden_states,
-                                            chunk_hidden_states->ne[0],
-                                            ggml_nelements(hidden_states) / chunk_hidden_states->ne[0]);
-        } else {
-            hidden_states = ggml_new_tensor_2d(work_ctx, GGML_TYPE_F32, 4096, 256);
-            ggml_set_f32(hidden_states, 0.f);
-        }
-
-        modify_mask_to_attend_padding(t5_attn_mask, ggml_nelements(t5_attn_mask), mask_pad);
-
-        return SDCondition(hidden_states, t5_attn_mask, NULL);
-    }
-
-    SDCondition get_learned_condition(ggml_context* work_ctx,
-                                      int n_threads,
-                                      const std::string& text,
-                                      int clip_skip,
-                                      int width,
-                                      int height,
-                                      int adm_in_channels        = -1,
-                                      bool force_zero_embeddings = false) {
-        auto tokens_and_weights = tokenize(text, chunk_len, true);
+        auto tokens_and_weights = tokenize(text, 256, true);
         return get_learned_condition_common(work_ctx, n_threads, tokens_and_weights, clip_skip, force_zero_embeddings);
     }
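For context: the weighting loop in `get_learned_condition_common()` above implements prompt-attention emphasis. Each token's hidden state is scaled by its parsed weight, then the whole tensor is rescaled so its mean matches the pre-weighting mean. Below is a minimal standalone sketch of that scheme using plain `std::vector` instead of ggml tensors; the function and variable names are illustrative, not part of the library.

```cpp
#include <cstdio>
#include <vector>

// Scale each token's hidden vector by its prompt weight, then rescale the
// whole block so the overall mean matches the unweighted mean (assumes the
// post-weighting mean is nonzero), mirroring get_learned_condition_common().
void apply_token_weights(std::vector<std::vector<float>>& hidden,  // [n_token][dim]
                         const std::vector<float>& weights) {      // [n_token]
    double mean_before = 0.0, mean_after = 0.0;
    size_t n = 0;
    for (auto& row : hidden)
        for (float v : row) { mean_before += v; n++; }
    mean_before /= n;

    for (size_t t = 0; t < hidden.size(); t++)
        for (float& v : hidden[t]) v *= weights[t];

    for (auto& row : hidden)
        for (float v : row) mean_after += v;
    mean_after /= n;

    float scale = (float)(mean_before / mean_after);  // restore the original mean
    for (auto& row : hidden)
        for (float& v : row) v *= scale;
}

int main() {
    std::vector<std::vector<float>> h = {{1.0f, 2.0f}, {3.0f, 4.0f}};
    apply_token_weights(h, {1.0f, 1.2f});  // emphasize the second token
    printf("%f %f %f %f\n", h[0][0], h[0][1], h[1][0], h[1][1]);
}
```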
denoiser.hpp (72 lines changed)

@@ -1019,7 +1019,7 @@ static void sample_k_diffusion(sample_method_t method,
             // also needed to invert the behavior of CompVisDenoiser
             // (k-diffusion's LMSDiscreteScheduler)
             float beta_start = 0.00085f;
-            float beta_end   = 0.0120f;
+            float beta_end = 0.0120f;
             std::vector<double> alphas_cumprod;
             std::vector<double> compvis_sigmas;
 
@@ -1030,9 +1030,8 @@ static void sample_k_diffusion(sample_method_t method,
                     (i == 0 ? 1.0f : alphas_cumprod[i - 1]) *
                     (1.0f -
                      std::pow(sqrtf(beta_start) +
-                                  (sqrtf(beta_end) - sqrtf(beta_start)) *
-                                      ((float)i / (TIMESTEPS - 1)),
-                              2));
+                               (sqrtf(beta_end) - sqrtf(beta_start)) *
+                               ((float)i / (TIMESTEPS - 1)), 2));
                 compvis_sigmas[i] =
                     std::sqrt((1 - alphas_cumprod[i]) /
                               alphas_cumprod[i]);
@@ -1062,8 +1061,7 @@ static void sample_k_diffusion(sample_method_t method,
             // - pred_prev_sample -> "x_t-1"
             int timestep =
                 roundf(TIMESTEPS -
-                       i * ((float)TIMESTEPS / steps)) -
-                1;
+                       i * ((float)TIMESTEPS / steps)) - 1;
             // 1. get previous step value (=t-1)
             int prev_timestep = timestep - TIMESTEPS / steps;
             // The sigma here is chosen to cause the
@@ -1088,9 +1086,10 @@ static void sample_k_diffusion(sample_method_t method,
                 float* vec_x = (float*)x->data;
                 for (int j = 0; j < ggml_nelements(x); j++) {
                     vec_x[j] *= std::sqrt(sigma * sigma + 1) /
-                                sigma;
+                        sigma;
                 }
-            } else {
+            }
+            else {
                 // For the subsequent steps after the first one,
                 // at this point x = latents or x = sample, and
                 // needs to be prescaled with x <- sample / c_in
@@ -1128,8 +1127,9 @@ static void sample_k_diffusion(sample_method_t method,
                 float alpha_prod_t = alphas_cumprod[timestep];
                 // Note final_alpha_cumprod = alphas_cumprod[0] due to
                 // trailing timestep spacing
-                float alpha_prod_t_prev = prev_timestep >= 0 ? alphas_cumprod[prev_timestep] : alphas_cumprod[0];
-                float beta_prod_t       = 1 - alpha_prod_t;
+                float alpha_prod_t_prev = prev_timestep >= 0 ?
+                    alphas_cumprod[prev_timestep] : alphas_cumprod[0];
+                float beta_prod_t = 1 - alpha_prod_t;
                 // 3. compute predicted original sample from predicted
                 // noise also called "predicted x_0" of formula (12)
                 // from https://arxiv.org/pdf/2010.02502.pdf
@@ -1145,7 +1145,7 @@ static void sample_k_diffusion(sample_method_t method,
                     vec_pred_original_sample[j] =
                         (vec_x[j] / std::sqrt(sigma * sigma + 1) -
                          std::sqrt(beta_prod_t) *
-                             vec_model_output[j]) *
+                         vec_model_output[j]) *
                         (1 / std::sqrt(alpha_prod_t));
                 }
             }
@@ -1159,8 +1159,8 @@ static void sample_k_diffusion(sample_method_t method,
                 // sigma_t = sqrt((1 - alpha_t-1)/(1 - alpha_t)) *
                 // sqrt(1 - alpha_t/alpha_t-1)
                 float beta_prod_t_prev = 1 - alpha_prod_t_prev;
-                float variance         = (beta_prod_t_prev / beta_prod_t) *
-                                         (1 - alpha_prod_t / alpha_prod_t_prev);
+                float variance = (beta_prod_t_prev / beta_prod_t) *
+                    (1 - alpha_prod_t / alpha_prod_t_prev);
                 float std_dev_t = eta * std::sqrt(variance);
                 // 6. compute "direction pointing to x_t" of formula
                 // (12) from https://arxiv.org/pdf/2010.02502.pdf
@@ -1179,8 +1179,8 @@ static void sample_k_diffusion(sample_method_t method,
                             std::pow(std_dev_t, 2)) *
                         vec_model_output[j];
                     vec_x[j] = std::sqrt(alpha_prod_t_prev) *
-                                   vec_pred_original_sample[j] +
-                               pred_sample_direction;
+                        vec_pred_original_sample[j] +
+                        pred_sample_direction;
                 }
             }
             if (eta > 0) {
@@ -1208,7 +1208,7 @@ static void sample_k_diffusion(sample_method_t method,
             // by Semi-Linear Consistency Function with Trajectory
             // Mapping", arXiv:2402.19159 [cs.CV]
             float beta_start = 0.00085f;
-            float beta_end   = 0.0120f;
+            float beta_end = 0.0120f;
             std::vector<double> alphas_cumprod;
             std::vector<double> compvis_sigmas;
 
@@ -1219,9 +1219,8 @@ static void sample_k_diffusion(sample_method_t method,
                     (i == 0 ? 1.0f : alphas_cumprod[i - 1]) *
                     (1.0f -
                      std::pow(sqrtf(beta_start) +
-                                  (sqrtf(beta_end) - sqrtf(beta_start)) *
-                                      ((float)i / (TIMESTEPS - 1)),
-                              2));
+                               (sqrtf(beta_end) - sqrtf(beta_start)) *
+                               ((float)i / (TIMESTEPS - 1)), 2));
                 compvis_sigmas[i] =
                     std::sqrt((1 - alphas_cumprod[i]) /
                               alphas_cumprod[i]);
@@ -1236,10 +1235,13 @@ static void sample_k_diffusion(sample_method_t method,
             for (int i = 0; i < steps; i++) {
                 // Analytic form for TCD timesteps
                 int timestep = TIMESTEPS - 1 -
-                               (TIMESTEPS / original_steps) *
-                                   (int)floor(i * ((float)original_steps / steps));
+                    (TIMESTEPS / original_steps) *
+                    (int)floor(i * ((float)original_steps / steps));
                 // 1. get previous step value
-                int prev_timestep = i >= steps - 1 ? 0 : TIMESTEPS - 1 - (TIMESTEPS / original_steps) * (int)floor((i + 1) * ((float)original_steps / steps));
+                int prev_timestep = i >= steps - 1 ? 0 :
+                    TIMESTEPS - 1 - (TIMESTEPS / original_steps) *
+                    (int)floor((i + 1) *
+                        ((float)original_steps / steps));
                 // Here timestep_s is tau_n' in Algorithm 4. The _s
                 // notation appears to be that from C. Lu,
                 // "DPM-Solver: A Fast ODE Solver for Diffusion
@@ -1256,9 +1258,10 @@ static void sample_k_diffusion(sample_method_t method,
                 float* vec_x = (float*)x->data;
                 for (int j = 0; j < ggml_nelements(x); j++) {
                     vec_x[j] *= std::sqrt(sigma * sigma + 1) /
-                                sigma;
+                        sigma;
                 }
-            } else {
+            }
+            else {
                 float* vec_x = (float*)x->data;
                 for (int j = 0; j < ggml_nelements(x); j++) {
                     vec_x[j] *= std::sqrt(sigma * sigma + 1);
@@ -1291,14 +1294,15 @@ static void sample_k_diffusion(sample_method_t method,
                 // DPM-Solver. In fact, we have alpha_{t_n} =
                 // \sqrt{\hat{alpha_n}}, [...]"
                 float alpha_prod_t = alphas_cumprod[timestep];
-                float beta_prod_t  = 1 - alpha_prod_t;
+                float beta_prod_t = 1 - alpha_prod_t;
                 // Note final_alpha_cumprod = alphas_cumprod[0] since
                 // TCD is always "trailing"
-                float alpha_prod_t_prev = prev_timestep >= 0 ? alphas_cumprod[prev_timestep] : alphas_cumprod[0];
+                float alpha_prod_t_prev = prev_timestep >= 0 ?
+                    alphas_cumprod[prev_timestep] : alphas_cumprod[0];
                 // The subscript _s are the only portion in this
                 // section (2) unique to TCD
                 float alpha_prod_s = alphas_cumprod[timestep_s];
-                float beta_prod_s  = 1 - alpha_prod_s;
+                float beta_prod_s = 1 - alpha_prod_s;
                 // 3. Compute the predicted noised sample x_s based on
                 // the model parameterization
                 //
@@ -1313,7 +1317,7 @@ static void sample_k_diffusion(sample_method_t method,
                     vec_pred_original_sample[j] =
                         (vec_x[j] / std::sqrt(sigma * sigma + 1) -
                          std::sqrt(beta_prod_t) *
-                             vec_model_output[j]) *
+                         vec_model_output[j]) *
                         (1 / std::sqrt(alpha_prod_t));
                 }
             }
@@ -1335,9 +1339,9 @@ static void sample_k_diffusion(sample_method_t method,
                     // pred_epsilon = model_output
                     vec_x[j] =
                         std::sqrt(alpha_prod_s) *
-                            vec_pred_original_sample[j] +
+                        vec_pred_original_sample[j] +
                         std::sqrt(beta_prod_s) *
-                            vec_model_output[j];
+                        vec_model_output[j];
                 }
             }
             // 4. Sample and inject noise z ~ N(0, I) for
@@ -1353,7 +1357,7 @@ static void sample_k_diffusion(sample_method_t method,
                 // In this case, x is still pred_noised_sample,
                 // continue in-place
                 ggml_tensor_set_f32_randn(noise, rng);
-                float* vec_x     = (float*)x->data;
+                float* vec_x = (float*)x->data;
                 float* vec_noise = (float*)noise->data;
                 for (int j = 0; j < ggml_nelements(x); j++) {
                     // Corresponding to (35) in Zheng et
@@ -1362,10 +1366,10 @@ static void sample_k_diffusion(sample_method_t method,
                     vec_x[j] =
                         std::sqrt(alpha_prod_t_prev /
                                   alpha_prod_s) *
-                            vec_x[j] +
+                        vec_x[j] +
                         std::sqrt(1 - alpha_prod_t_prev /
-                                      alpha_prod_s) *
-                            vec_noise[j];
+                            alpha_prod_s) *
+                        vec_noise[j];
                 }
             }
         }
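For reference, both the DDIM and TCD blocks above rebuild the same DDPM quantities that the surrounding comments describe. In the notation of arXiv:2010.02502 (the paper the code comments cite), the loops compute, for i = 0 .. TIMESTEPS−1 with T = TIMESTEPS, the following; this is a reading of the code above, not text from the diff:

```latex
\beta_i = \Big(\sqrt{\beta_{\mathrm{start}}} + \big(\sqrt{\beta_{\mathrm{end}}} - \sqrt{\beta_{\mathrm{start}}}\big)\,\tfrac{i}{T-1}\Big)^{2},
\qquad
\bar{\alpha}_i = \prod_{j \le i} (1 - \beta_j),
\qquad
\sigma_i = \sqrt{\tfrac{1-\bar{\alpha}_i}{\bar{\alpha}_i}},
```

and the "predicted x_0" of formula (12) used by both samplers is

```latex
\hat{x}_0 = \frac{x_t - \sqrt{1-\bar{\alpha}_t}\,\hat{\epsilon}_\theta}{\sqrt{\bar{\alpha}_t}},
\qquad
x_t = \frac{x}{\sqrt{\sigma^2 + 1}} \quad \text{(undoing the CompVis } c_{\mathrm{in}} \text{ scaling)}.
```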
diffusion_model.hpp

@@ -13,7 +13,7 @@ struct DiffusionModel {
                          struct ggml_tensor* c_concat,
                          struct ggml_tensor* y,
                          struct ggml_tensor* guidance,
-                         std::vector<ggml_tensor*> ref_latents   = {},
+                         std::vector<ggml_tensor*> ref_latents = {},
                          int num_video_frames = -1,
                          std::vector<struct ggml_tensor*> controls = {},
                          float control_strength = 0.f,
@@ -69,7 +69,7 @@ struct UNetModel : public DiffusionModel {
                          struct ggml_tensor* c_concat,
                          struct ggml_tensor* y,
                          struct ggml_tensor* guidance,
-                         std::vector<ggml_tensor*> ref_latents   = {},
+                         std::vector<ggml_tensor*> ref_latents = {},
                          int num_video_frames = -1,
                          std::vector<struct ggml_tensor*> controls = {},
                          float control_strength = 0.f,
@@ -120,7 +120,7 @@ struct MMDiTModel : public DiffusionModel {
                          struct ggml_tensor* c_concat,
                          struct ggml_tensor* y,
                          struct ggml_tensor* guidance,
-                         std::vector<ggml_tensor*> ref_latents   = {},
+                         std::vector<ggml_tensor*> ref_latents = {},
                          int num_video_frames = -1,
                          std::vector<struct ggml_tensor*> controls = {},
                          float control_strength = 0.f,
@@ -137,9 +137,8 @@ struct FluxModel : public DiffusionModel {
     FluxModel(ggml_backend_t backend,
               std::map<std::string, enum ggml_type>& tensor_types,
               SDVersion version = VERSION_FLUX,
-              bool flash_attn   = false,
-              bool use_mask     = false)
-        : flux(backend, tensor_types, "model.diffusion_model", version, flash_attn, use_mask) {
+              bool flash_attn = false)
+        : flux(backend, tensor_types, "model.diffusion_model", version, flash_attn) {
     }
 
     void alloc_params_buffer() {
@@ -173,7 +172,7 @@ struct FluxModel : public DiffusionModel {
                          struct ggml_tensor* c_concat,
                          struct ggml_tensor* y,
                          struct ggml_tensor* guidance,
-                         std::vector<ggml_tensor*> ref_latents   = {},
+                         std::vector<ggml_tensor*> ref_latents = {},
                          int num_video_frames = -1,
                          std::vector<struct ggml_tensor*> controls = {},
                          float control_strength = 0.f,
docs/chroma.md (deleted)

@@ -1,33 +0,0 @@
-# How to Use
-
-You can run Chroma using stable-diffusion.cpp with a GPU that has 6GB or even 4GB of VRAM, without needing to offload to RAM.
-
-## Download weights
-
-- Download Chroma
-    - If you don't want to do the conversion yourself, download the preconverted gguf model from [silveroxides/Chroma-GGUF](https://huggingface.co/silveroxides/Chroma-GGUF)
-    - Otherwise, download chroma's safetensors from [lodestones/Chroma](https://huggingface.co/lodestones/Chroma)
-- Download vae from https://huggingface.co/black-forest-labs/FLUX.1-dev/blob/main/ae.safetensors
-- Download t5xxl from https://huggingface.co/comfyanonymous/flux_text_encoders/blob/main/t5xxl_fp16.safetensors
-
-## Convert Chroma weights
-
-You can download the preconverted gguf weights from [silveroxides/Chroma-GGUF](https://huggingface.co/silveroxides/Chroma-GGUF), this way you don't have to do the conversion yourself.
-
-```
-.\bin\Release\sd.exe -M convert -m ..\..\ComfyUI\models\unet\chroma-unlocked-v40.safetensors -o ..\models\chroma-unlocked-v40-q8_0.gguf -v --type q8_0
-```
-
-## Run
-
-### Example
-For example:
-
-```
-.\bin\Release\sd.exe --diffusion-model ..\models\chroma-unlocked-v40-q8_0.gguf --vae ..\models\ae.sft --t5xxl ..\models\t5xxl_fp16.safetensors -p "a lovely cat holding a sign says 'chroma.cpp'" --cfg-scale 4.0 --sampling-method euler -v --chroma-disable-dit-mask
-```
-
-![](<example image — link target lost in extraction>)
examples/cli/main.cpp

@@ -132,10 +132,6 @@ struct SDParams {
     float slg_scale        = 0.f;
     float skip_layer_start = 0.01f;
     float skip_layer_end   = 0.2f;
-
-    bool chroma_use_dit_mask = true;
-    bool chroma_use_t5_mask  = false;
-    int chroma_t5_mask_pad   = 1;
 };
 
 void print_params(SDParams params) {
@@ -189,9 +185,6 @@ void print_params(SDParams params) {
     printf("    batch_count:         %d\n", params.batch_count);
     printf("    vae_tiling:          %s\n", params.vae_tiling ? "true" : "false");
     printf("    upscale_repeats:     %d\n", params.upscale_repeats);
-    printf("    chroma_use_dit_mask: %s\n", params.chroma_use_dit_mask ? "true" : "false");
-    printf("    chroma_use_t5_mask:  %s\n", params.chroma_use_t5_mask ? "true" : "false");
-    printf("    chroma_t5_mask_pad:  %d\n", params.chroma_t5_mask_pad);
 }
 
 void print_usage(int argc, const char* argv[]) {
@@ -259,9 +252,6 @@ void print_usage(int argc, const char* argv[]) {
     printf("  --control-net-cpu                  keep controlnet in cpu (for low vram)\n");
     printf("  --canny                            apply canny preprocessor (edge detection)\n");
     printf("  --color                            colors the logging tags according to level\n");
-    printf("  --chroma-disable-dit-mask          disable dit mask for chroma\n");
-    printf("  --chroma-enable-t5-mask            enable t5 mask for chroma\n");
-    printf("  --chroma-t5-mask-pad PAD_SIZE      t5 mask pad size of chroma\n");
     printf("  -v, --verbose                      print extra info\n");
 }
 
@@ -653,16 +643,6 @@ void parse_args(int argc, const char** argv, SDParams& params) {
                 break;
             }
             params.ref_image_paths.push_back(argv[i]);
-        } else if (arg == "--chroma-disable-dit-mask") {
-            params.chroma_use_dit_mask = false;
-        } else if (arg == "--chroma-use-t5-mask") {
-            params.chroma_use_t5_mask = true;
-        } else if (arg == "--chroma-t5-mask-pad") {
-            if (++i >= argc) {
-                invalid_arg = true;
-                break;
-            }
-            params.chroma_t5_mask_pad = std::stoi(argv[i]);
         } else {
             fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
             print_usage(argc, argv);
@@ -919,7 +899,7 @@ int main(int argc, const char* argv[]) {
             input_image_buffer = resized_image_buffer;
         }
     } else if (params.mode == EDIT) {
-        vae_decode_only   = false;
+        vae_decode_only = false;
         for (auto& path : params.ref_image_paths) {
             int c     = 0;
             int width = 0;
@@ -972,10 +952,7 @@ int main(int argc, const char* argv[]) {
                                     params.clip_on_cpu,
                                     params.control_net_cpu,
                                     params.vae_on_cpu,
-                                    params.diffusion_flash_attn,
-                                    params.chroma_use_dit_mask,
-                                    params.chroma_use_t5_mask,
-                                    params.chroma_t5_mask_pad);
+                                    params.diffusion_flash_attn);
 
     if (sd_ctx == NULL) {
         printf("new_sd_ctx_t failed\n");
@@ -1113,7 +1090,7 @@ int main(int argc, const char* argv[]) {
                                params.skip_layer_start,
                                params.skip_layer_end);
         }
-    } else {  // EDIT
+    } else { // EDIT
         results = edit(sd_ctx,
                        ref_images.data(),
                        ref_images.size(),
@@ -1176,11 +1153,11 @@ int main(int argc, const char* argv[]) {
 
     std::string dummy_name, ext, lc_ext;
     bool is_jpg;
-    size_t last      = params.output_path.find_last_of(".");
+    size_t last = params.output_path.find_last_of(".");
     size_t last_path = std::min(params.output_path.find_last_of("/"),
                                 params.output_path.find_last_of("\\"));
-    if (last != std::string::npos  // filename has extension
-        && (last_path == std::string::npos || last > last_path)) {
+    if (last != std::string::npos // filename has extension
+        && (last_path == std::string::npos || last > last_path)) {
         dummy_name = params.output_path.substr(0, last);
         ext = lc_ext = params.output_path.substr(last);
         std::transform(ext.begin(), ext.end(), lc_ext.begin(), ::tolower);
@@ -1188,7 +1165,7 @@ int main(int argc, const char* argv[]) {
     } else {
         dummy_name = params.output_path;
        ext = lc_ext = "";
-        is_jpg       = false;
+        is_jpg = false;
     }
     // appending ".png" to absent or unknown extension
     if (!is_jpg && lc_ext != ".png") {
@@ -1200,7 +1177,7 @@ int main(int argc, const char* argv[]) {
             continue;
         }
         std::string final_image_path = i > 0 ? dummy_name + "_" + std::to_string(i + 1) + ext : dummy_name + ext;
-        if (is_jpg) {
+        if(is_jpg) {
            stbi_write_jpg(final_image_path.c_str(), results[i].width, results[i].height, results[i].channel,
                           results[i].data, 90, get_image_params(params, params.seed + i).c_str());
            printf("save result JPEG image to '%s'\n", final_image_path.c_str());
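The output-path handling touched in the last three hunks splits the path into a stem and a lowercased extension, treating a dot as an extension separator only if it comes after the last path separator (so `dir.v2/output` gets no extension). A self-contained sketch of the same logic, with a hypothetical helper name not taken from the codebase:

```cpp
#include <algorithm>
#include <cctype>
#include <cstdio>
#include <string>
#include <utility>

// Split "dir.v2/output.PNG" into {"dir.v2/output", ".png"}. std::min works
// here because std::string::npos is the largest size_t value, so a missing
// separator never wins.
static std::pair<std::string, std::string> split_ext(const std::string& path) {
    size_t last      = path.find_last_of(".");
    size_t last_path = std::min(path.find_last_of("/"), path.find_last_of("\\"));
    if (last != std::string::npos && (last_path == std::string::npos || last > last_path)) {
        std::string ext = path.substr(last);
        std::transform(ext.begin(), ext.end(), ext.begin(), ::tolower);
        return {path.substr(0, last), ext};
    }
    return {path, ""};  // no extension (or the only dot is in a directory name)
}

int main() {
    auto [stem, ext] = split_ext("dir.v2/output.PNG");
    printf("'%s' + '%s'\n", stem.c_str(), ext.c_str());  // 'dir.v2/output' + '.png'
}
```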
flux.hpp (358 lines changed)

@@ -117,7 +117,6 @@ namespace Flux {
                                        struct ggml_tensor* k,
                                        struct ggml_tensor* v,
                                        struct ggml_tensor* pe,
-                                       struct ggml_tensor* mask,
                                        bool flash_attn) {
         // q,k,v: [N, L, n_head, d_head]
         // pe: [L, d_head/2, 2, 2]
@@ -125,7 +124,7 @@ namespace Flux {
         q = apply_rope(ctx, q, pe);  // [N*n_head, L, d_head]
         k = apply_rope(ctx, k, pe);  // [N*n_head, L, d_head]
 
-        auto x = ggml_nn_attention_ext(ctx, q, k, v, v->ne[1], mask, false, true, flash_attn);  // [N, L, n_head*d_head]
+        auto x = ggml_nn_attention_ext(ctx, q, k, v, v->ne[1], NULL, false, true, flash_attn);  // [N, L, n_head*d_head]
         return x;
     }
 
@@ -168,13 +167,13 @@ namespace Flux {
             return x;
         }
 
-        struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* x, struct ggml_tensor* pe, struct ggml_tensor* mask) {
+        struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* x, struct ggml_tensor* pe) {
             // x: [N, n_token, dim]
             // pe: [n_token, d_head/2, 2, 2]
             // return [N, n_token, dim]
-            auto qkv = pre_attention(ctx, x);                                         // q,k,v: [N, n_token, n_head, d_head]
-            x        = attention(ctx, qkv[0], qkv[1], qkv[2], pe, mask, flash_attn);  // [N, n_token, dim]
-            x        = post_attention(ctx, x);                                        // [N, n_token, dim]
+            auto qkv = pre_attention(ctx, x);                                   // q,k,v: [N, n_token, n_head, d_head]
+            x        = attention(ctx, qkv[0], qkv[1], qkv[2], pe, flash_attn);  // [N, n_token, dim]
+            x        = post_attention(ctx, x);                                  // [N, n_token, dim]
             return x;
         }
     };
@@ -186,13 +185,6 @@ namespace Flux {
 
         ModulationOut(ggml_tensor* shift = NULL, ggml_tensor* scale = NULL, ggml_tensor* gate = NULL)
             : shift(shift), scale(scale), gate(gate) {}
-
-        ModulationOut(struct ggml_context* ctx, ggml_tensor* vec, int64_t offset) {
-            int64_t stride = vec->nb[1] * vec->ne[1];
-            shift          = ggml_view_2d(ctx, vec, vec->ne[0], vec->ne[1], vec->nb[1], stride * (offset + 0));  // [N, dim]
-            scale          = ggml_view_2d(ctx, vec, vec->ne[0], vec->ne[1], vec->nb[1], stride * (offset + 1));  // [N, dim]
-            gate           = ggml_view_2d(ctx, vec, vec->ne[0], vec->ne[1], vec->nb[1], stride * (offset + 2));  // [N, dim]
-        }
     };
 
     struct Modulation : public GGMLBlock {
@@ -218,12 +210,19 @@ namespace Flux {
             auto m = ggml_reshape_3d(ctx, out, vec->ne[0], multiplier, vec->ne[1]);  // [N, multiplier, dim]
             m      = ggml_cont(ctx, ggml_permute(ctx, m, 0, 2, 1, 3));               // [multiplier, N, dim]
 
-            ModulationOut m_0 = ModulationOut(ctx, m, 0);
+            int64_t offset = m->nb[1] * m->ne[1];
+            auto shift_0 = ggml_view_2d(ctx, m, m->ne[0], m->ne[1], m->nb[1], offset * 0);  // [N, dim]
+            auto scale_0 = ggml_view_2d(ctx, m, m->ne[0], m->ne[1], m->nb[1], offset * 1);  // [N, dim]
+            auto gate_0  = ggml_view_2d(ctx, m, m->ne[0], m->ne[1], m->nb[1], offset * 2);  // [N, dim]
+
             if (is_double) {
-                return {m_0, ModulationOut(ctx, m, 3)};
+                auto shift_1 = ggml_view_2d(ctx, m, m->ne[0], m->ne[1], m->nb[1], offset * 3);  // [N, dim]
+                auto scale_1 = ggml_view_2d(ctx, m, m->ne[0], m->ne[1], m->nb[1], offset * 4);  // [N, dim]
+                auto gate_1  = ggml_view_2d(ctx, m, m->ne[0], m->ne[1], m->nb[1], offset * 5);  // [N, dim]
+                return {ModulationOut(shift_0, scale_0, gate_0), ModulationOut(shift_1, scale_1, gate_1)};
             }
 
-            return {m_0, ModulationOut()};
+            return {ModulationOut(shift_0, scale_0, gate_0), ModulationOut()};
         }
     };
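Both versions of `Modulation::forward` above slice a `[multiplier, N, dim]` tensor into shift/scale/gate views by stepping over whole `[N, dim]` planes; `offset = m->nb[1] * m->ne[1]` is the byte size of one such plane. The same indexing on a flat row-major array, as a sketch (illustrative names, not library code):

```cpp
#include <cstdio>
#include <vector>

// m is laid out as [multiplier][N][dim]; plane k starts at element offset
// k * N * dim, exactly like the ggml_view_2d byte offsets above.
int main() {
    const int multiplier = 6, N = 1, dim = 4;  // double block: 2 x (shift, scale, gate)
    std::vector<float> m(multiplier * N * dim);
    for (size_t i = 0; i < m.size(); i++) m[i] = (float)i;

    const size_t plane   = (size_t)N * dim;       // elements per [N, dim] view
    const float* shift_0 = m.data() + 0 * plane;  // ggml_view_2d(..., offset * 0)
    const float* scale_0 = m.data() + 1 * plane;
    const float* gate_0  = m.data() + 2 * plane;
    const float* shift_1 = m.data() + 3 * plane;  // second ModulationOut when is_double

    printf("%f %f %f %f\n", shift_0[0], scale_0[0], gate_0[0], shift_1[0]);  // 0 4 8 12
}
```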
@@ -243,33 +242,25 @@ namespace Flux {
 
     struct DoubleStreamBlock : public GGMLBlock {
         bool flash_attn;
-        bool prune_mod;
-        int idx = 0;
 
     public:
         DoubleStreamBlock(int64_t hidden_size,
                           int64_t num_heads,
                           float mlp_ratio,
-                          int idx         = 0,
                           bool qkv_bias   = false,
-                          bool flash_attn = false,
-                          bool prune_mod  = false)
-            : idx(idx), flash_attn(flash_attn), prune_mod(prune_mod) {
+                          bool flash_attn = false)
+            : flash_attn(flash_attn) {
             int64_t mlp_hidden_dim = hidden_size * mlp_ratio;
-            if (!prune_mod) {
-                blocks["img_mod"] = std::shared_ptr<GGMLBlock>(new Modulation(hidden_size, true));
-            }
-            blocks["img_norm1"] = std::shared_ptr<GGMLBlock>(new LayerNorm(hidden_size, 1e-6f, false));
-            blocks["img_attn"]  = std::shared_ptr<GGMLBlock>(new SelfAttention(hidden_size, num_heads, qkv_bias, flash_attn));
+            blocks["img_mod"]   = std::shared_ptr<GGMLBlock>(new Modulation(hidden_size, true));
+            blocks["img_norm1"] = std::shared_ptr<GGMLBlock>(new LayerNorm(hidden_size, 1e-6f, false));
+            blocks["img_attn"]  = std::shared_ptr<GGMLBlock>(new SelfAttention(hidden_size, num_heads, qkv_bias, flash_attn));
 
             blocks["img_norm2"] = std::shared_ptr<GGMLBlock>(new LayerNorm(hidden_size, 1e-6f, false));
             blocks["img_mlp.0"] = std::shared_ptr<GGMLBlock>(new Linear(hidden_size, mlp_hidden_dim));
             // img_mlp.1 is nn.GELU(approximate="tanh")
             blocks["img_mlp.2"] = std::shared_ptr<GGMLBlock>(new Linear(mlp_hidden_dim, hidden_size));
 
-            if (!prune_mod) {
-                blocks["txt_mod"] = std::shared_ptr<GGMLBlock>(new Modulation(hidden_size, true));
-            }
+            blocks["txt_mod"]   = std::shared_ptr<GGMLBlock>(new Modulation(hidden_size, true));
             blocks["txt_norm1"] = std::shared_ptr<GGMLBlock>(new LayerNorm(hidden_size, 1e-6f, false));
             blocks["txt_attn"]  = std::shared_ptr<GGMLBlock>(new SelfAttention(hidden_size, num_heads, qkv_bias, flash_attn));
 
@@ -279,34 +270,17 @@ namespace Flux {
             blocks["txt_mlp.2"] = std::shared_ptr<GGMLBlock>(new Linear(mlp_hidden_dim, hidden_size));
         }
 
-        std::vector<ModulationOut> get_distil_img_mod(struct ggml_context* ctx, struct ggml_tensor* vec) {
-            // TODO: not hardcoded?
-            const int single_blocks_count = 38;
-            const int double_blocks_count = 19;
-
-            int64_t offset = 6 * idx + 3 * single_blocks_count;
-            return {ModulationOut(ctx, vec, offset), ModulationOut(ctx, vec, offset + 3)};
-        }
-
-        std::vector<ModulationOut> get_distil_txt_mod(struct ggml_context* ctx, struct ggml_tensor* vec) {
-            // TODO: not hardcoded?
-            const int single_blocks_count = 38;
-            const int double_blocks_count = 19;
-
-            int64_t offset = 6 * idx + 6 * double_blocks_count + 3 * single_blocks_count;
-            return {ModulationOut(ctx, vec, offset), ModulationOut(ctx, vec, offset + 3)};
-        }
-
         std::pair<struct ggml_tensor*, struct ggml_tensor*> forward(struct ggml_context* ctx,
                                                                     struct ggml_tensor* img,
                                                                     struct ggml_tensor* txt,
                                                                     struct ggml_tensor* vec,
-                                                                    struct ggml_tensor* pe,
-                                                                    struct ggml_tensor* mask = NULL) {
+                                                                    struct ggml_tensor* pe) {
             // img: [N, n_img_token, hidden_size]
             // txt: [N, n_txt_token, hidden_size]
             // pe: [n_img_token + n_txt_token, d_head/2, 2, 2]
             // return: ([N, n_img_token, hidden_size], [N, n_txt_token, hidden_size])
 
+            auto img_mod   = std::dynamic_pointer_cast<Modulation>(blocks["img_mod"]);
             auto img_norm1 = std::dynamic_pointer_cast<LayerNorm>(blocks["img_norm1"]);
             auto img_attn  = std::dynamic_pointer_cast<SelfAttention>(blocks["img_attn"]);
 
@@ -314,6 +288,7 @@ namespace Flux {
             auto img_mlp_0 = std::dynamic_pointer_cast<Linear>(blocks["img_mlp.0"]);
             auto img_mlp_2 = std::dynamic_pointer_cast<Linear>(blocks["img_mlp.2"]);
 
+            auto txt_mod   = std::dynamic_pointer_cast<Modulation>(blocks["txt_mod"]);
             auto txt_norm1 = std::dynamic_pointer_cast<LayerNorm>(blocks["txt_norm1"]);
             auto txt_attn  = std::dynamic_pointer_cast<SelfAttention>(blocks["txt_attn"]);
 
@@ -321,22 +296,10 @@ namespace Flux {
             auto txt_mlp_0 = std::dynamic_pointer_cast<Linear>(blocks["txt_mlp.0"]);
             auto txt_mlp_2 = std::dynamic_pointer_cast<Linear>(blocks["txt_mlp.2"]);
 
-            std::vector<ModulationOut> img_mods;
-            if (prune_mod) {
-                img_mods = get_distil_img_mod(ctx, vec);
-            } else {
-                auto img_mod = std::dynamic_pointer_cast<Modulation>(blocks["img_mod"]);
-                img_mods     = img_mod->forward(ctx, vec);
-            }
+            auto img_mods          = img_mod->forward(ctx, vec);
             ModulationOut img_mod1 = img_mods[0];
             ModulationOut img_mod2 = img_mods[1];
-            std::vector<ModulationOut> txt_mods;
-            if (prune_mod) {
-                txt_mods = get_distil_txt_mod(ctx, vec);
-            } else {
-                auto txt_mod = std::dynamic_pointer_cast<Modulation>(blocks["txt_mod"]);
-                txt_mods     = txt_mod->forward(ctx, vec);
-            }
+            auto txt_mods          = txt_mod->forward(ctx, vec);
             ModulationOut txt_mod1 = txt_mods[0];
             ModulationOut txt_mod2 = txt_mods[1];
 
@@ -361,7 +324,7 @@ namespace Flux {
             auto k = ggml_concat(ctx, txt_k, img_k, 2);  // [N, n_txt_token + n_img_token, n_head, d_head]
             auto v = ggml_concat(ctx, txt_v, img_v, 2);  // [N, n_txt_token + n_img_token, n_head, d_head]
 
-            auto attn = attention(ctx, q, k, v, pe, mask, flash_attn);        // [N, n_txt_token + n_img_token, n_head*d_head]
+            auto attn = attention(ctx, q, k, v, pe, flash_attn);              // [N, n_txt_token + n_img_token, n_head*d_head]
             attn      = ggml_cont(ctx, ggml_permute(ctx, attn, 0, 2, 1, 3));  // [n_txt_token + n_img_token, N, hidden_size]
             auto txt_attn_out = ggml_view_3d(ctx,
                                              attn,
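A note on the removed `get_distil_img_mod`/`get_distil_txt_mod` helpers: they index into one big vector of precomputed modulation planes, where each ModulationOut is three `[N, dim]` planes (shift, scale, gate). Using the constants from the removed code (38 single blocks at 3 planes each, 19 double blocks at 6 planes each for img and again for txt, plus 2 planes for the final layer's shift/scale), the total plane count works out as follows; this arithmetic is ours, derived from those offsets:

```latex
\underbrace{3 \times 38}_{\text{single blocks}}
+ \underbrace{6 \times 19}_{\text{double, img}}
+ \underbrace{6 \times 19}_{\text{double, txt}}
+ \underbrace{2}_{\text{final layer}}
= 114 + 114 + 114 + 2 = 344,
```

which matches the `mod_index_length = 344` in the removed Chroma branch of `forward_orig` later in this file.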
@@ -410,18 +373,14 @@ namespace Flux {
         int64_t hidden_size;
         int64_t mlp_hidden_dim;
         bool flash_attn;
-        bool prune_mod;
-        int idx = 0;
 
     public:
         SingleStreamBlock(int64_t hidden_size,
                           int64_t num_heads,
                           float mlp_ratio = 4.0f,
-                          int idx         = 0,
                           float qk_scale  = 0.f,
-                          bool flash_attn = false,
-                          bool prune_mod  = false)
-            : hidden_size(hidden_size), num_heads(num_heads), idx(idx), flash_attn(flash_attn), prune_mod(prune_mod) {
+                          bool flash_attn = false)
+            : hidden_size(hidden_size), num_heads(num_heads), flash_attn(flash_attn) {
             int64_t head_dim = hidden_size / num_heads;
             float scale      = qk_scale;
             if (scale <= 0.f) {
@@ -434,37 +393,26 @@ namespace Flux {
             blocks["norm"]     = std::shared_ptr<GGMLBlock>(new QKNorm(head_dim));
             blocks["pre_norm"] = std::shared_ptr<GGMLBlock>(new LayerNorm(hidden_size, 1e-6f, false));
             // mlp_act is nn.GELU(approximate="tanh")
-            if (!prune_mod) {
-                blocks["modulation"] = std::shared_ptr<GGMLBlock>(new Modulation(hidden_size, false));
-            }
-        }
-
-        ModulationOut get_distil_mod(struct ggml_context* ctx, struct ggml_tensor* vec) {
-            int64_t offset = 3 * idx;
-            return ModulationOut(ctx, vec, offset);
+            blocks["modulation"] = std::shared_ptr<GGMLBlock>(new Modulation(hidden_size, false));
         }
 
         struct ggml_tensor* forward(struct ggml_context* ctx,
                                     struct ggml_tensor* x,
                                     struct ggml_tensor* vec,
-                                    struct ggml_tensor* pe,
-                                    struct ggml_tensor* mask = NULL) {
+                                    struct ggml_tensor* pe) {
             // x: [N, n_token, hidden_size]
             // pe: [n_token, d_head/2, 2, 2]
             // return: [N, n_token, hidden_size]
 
-            auto linear1  = std::dynamic_pointer_cast<Linear>(blocks["linear1"]);
-            auto linear2  = std::dynamic_pointer_cast<Linear>(blocks["linear2"]);
-            auto norm     = std::dynamic_pointer_cast<QKNorm>(blocks["norm"]);
-            auto pre_norm = std::dynamic_pointer_cast<LayerNorm>(blocks["pre_norm"]);
-            ModulationOut mod;
-            if (prune_mod) {
-                mod = get_distil_mod(ctx, vec);
-            } else {
-                auto modulation = std::dynamic_pointer_cast<Modulation>(blocks["modulation"]);
-                mod = modulation->forward(ctx, vec)[0];
-            }
+            auto linear1    = std::dynamic_pointer_cast<Linear>(blocks["linear1"]);
+            auto linear2    = std::dynamic_pointer_cast<Linear>(blocks["linear2"]);
+            auto norm       = std::dynamic_pointer_cast<QKNorm>(blocks["norm"]);
+            auto pre_norm   = std::dynamic_pointer_cast<LayerNorm>(blocks["pre_norm"]);
+            auto modulation = std::dynamic_pointer_cast<Modulation>(blocks["modulation"]);
+
+            auto mods         = modulation->forward(ctx, vec);
+            ModulationOut mod = mods[0];
+
             auto x_mod   = Flux::modulate(ctx, pre_norm->forward(ctx, x), mod.shift, mod.scale);
             auto qkv_mlp = linear1->forward(ctx, x_mod);                             // [N, n_token, hidden_size * 3 + mlp_hidden_dim]
             qkv_mlp      = ggml_cont(ctx, ggml_permute(ctx, qkv_mlp, 2, 0, 1, 3));  // [hidden_size * 3 + mlp_hidden_dim, N, n_token]
@@ -495,7 +443,7 @@ namespace Flux {
             auto v = ggml_reshape_4d(ctx, qkv_vec[2], head_dim, num_heads, qkv_vec[2]->ne[1], qkv_vec[2]->ne[2]);  // [N, n_token, n_head, d_head]
             q      = norm->query_norm(ctx, q);
             k      = norm->key_norm(ctx, k);
-            auto attn = attention(ctx, q, k, v, pe, mask, flash_attn);  // [N, n_token, hidden_size]
+            auto attn = attention(ctx, q, k, v, pe, flash_attn);  // [N, n_token, hidden_size]
 
             auto attn_mlp = ggml_concat(ctx, attn, ggml_gelu_inplace(ctx, mlp), 0);  // [N, n_token, hidden_size + mlp_hidden_dim]
             auto output   = linear2->forward(ctx, attn_mlp);                         // [N, n_token, hidden_size]
@@ -506,28 +454,13 @@ namespace Flux {
     };
 
     struct LastLayer : public GGMLBlock {
-        bool prune_mod;
-
     public:
         LastLayer(int64_t hidden_size,
                   int64_t patch_size,
-                  int64_t out_channels,
-                  bool prune_mod = false)
-            : prune_mod(prune_mod) {
-            blocks["norm_final"] = std::shared_ptr<GGMLBlock>(new LayerNorm(hidden_size, 1e-06f, false));
-            blocks["linear"]     = std::shared_ptr<GGMLBlock>(new Linear(hidden_size, patch_size * patch_size * out_channels));
-            if (!prune_mod) {
-                blocks["adaLN_modulation.1"] = std::shared_ptr<GGMLBlock>(new Linear(hidden_size, 2 * hidden_size));
-            }
-        }
-
-        ModulationOut get_distil_mod(struct ggml_context* ctx, struct ggml_tensor* vec) {
-            int64_t offset = vec->ne[2] - 2;
-            int64_t stride = vec->nb[1] * vec->ne[1];
-            auto shift = ggml_view_2d(ctx, vec, vec->ne[0], vec->ne[1], vec->nb[1], stride * (offset + 0));  // [N, dim]
-            auto scale = ggml_view_2d(ctx, vec, vec->ne[0], vec->ne[1], vec->nb[1], stride * (offset + 1));  // [N, dim]
-            // No gate
-            return ModulationOut(shift, scale, NULL);
+                  int64_t out_channels) {
+            blocks["norm_final"]         = std::shared_ptr<GGMLBlock>(new LayerNorm(hidden_size, 1e-06f, false));
+            blocks["linear"]             = std::shared_ptr<GGMLBlock>(new Linear(hidden_size, patch_size * patch_size * out_channels));
+            blocks["adaLN_modulation.1"] = std::shared_ptr<GGMLBlock>(new Linear(hidden_size, 2 * hidden_size));
         }
 
         struct ggml_tensor* forward(struct ggml_context* ctx,
@@ -536,24 +469,17 @@ namespace Flux {
             // x: [N, n_token, hidden_size]
             // c: [N, hidden_size]
             // return: [N, n_token, patch_size * patch_size * out_channels]
-            auto norm_final = std::dynamic_pointer_cast<LayerNorm>(blocks["norm_final"]);
-            auto linear     = std::dynamic_pointer_cast<Linear>(blocks["linear"]);
-            struct ggml_tensor *shift, *scale;
-            if (prune_mod) {
-                auto mod = get_distil_mod(ctx, c);
-                shift = mod.shift;
-                scale = mod.scale;
-            } else {
-                auto adaLN_modulation_1 = std::dynamic_pointer_cast<Linear>(blocks["adaLN_modulation.1"]);
+            auto norm_final         = std::dynamic_pointer_cast<LayerNorm>(blocks["norm_final"]);
+            auto linear             = std::dynamic_pointer_cast<Linear>(blocks["linear"]);
+            auto adaLN_modulation_1 = std::dynamic_pointer_cast<Linear>(blocks["adaLN_modulation.1"]);
 
-                auto m = adaLN_modulation_1->forward(ctx, ggml_silu(ctx, c));  // [N, 2 * hidden_size]
-                m      = ggml_reshape_3d(ctx, m, c->ne[0], 2, c->ne[1]);       // [N, 2, hidden_size]
-                m      = ggml_cont(ctx, ggml_permute(ctx, m, 0, 2, 1, 3));     // [2, N, hidden_size]
+            auto m = adaLN_modulation_1->forward(ctx, ggml_silu(ctx, c));  // [N, 2 * hidden_size]
+            m      = ggml_reshape_3d(ctx, m, c->ne[0], 2, c->ne[1]);       // [N, 2, hidden_size]
+            m      = ggml_cont(ctx, ggml_permute(ctx, m, 0, 2, 1, 3));     // [2, N, hidden_size]
 
-                int64_t offset = m->nb[1] * m->ne[1];
-                shift = ggml_view_2d(ctx, m, m->ne[0], m->ne[1], m->nb[1], offset * 0);  // [N, hidden_size]
-                scale = ggml_view_2d(ctx, m, m->ne[0], m->ne[1], m->nb[1], offset * 1);  // [N, hidden_size]
-            }
+            int64_t offset = m->nb[1] * m->ne[1];
+            auto shift = ggml_view_2d(ctx, m, m->ne[0], m->ne[1], m->nb[1], offset * 0);  // [N, hidden_size]
+            auto scale = ggml_view_2d(ctx, m, m->ne[0], m->ne[1], m->nb[1], offset * 1);  // [N, hidden_size]
 
             x = Flux::modulate(ctx, norm_final->forward(ctx, x), shift, scale);
             x = linear->forward(ctx, x);
@@ -562,34 +488,6 @@ namespace Flux {
         }
     };
 
-    struct ChromaApproximator : public GGMLBlock {
-        int64_t inner_size = 5120;
-        int64_t n_layers   = 5;
-        ChromaApproximator(int64_t in_channels = 64, int64_t hidden_size = 3072) {
-            blocks["in_proj"] = std::shared_ptr<GGMLBlock>(new Linear(in_channels, inner_size, true));
-            for (int i = 0; i < n_layers; i++) {
-                blocks["norms." + std::to_string(i)]  = std::shared_ptr<GGMLBlock>(new RMSNorm(inner_size));
-                blocks["layers." + std::to_string(i)] = std::shared_ptr<GGMLBlock>(new MLPEmbedder(inner_size, inner_size));
-            }
-            blocks["out_proj"] = std::shared_ptr<GGMLBlock>(new Linear(inner_size, hidden_size, true));
-        }
-
-        struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* x) {
-            auto in_proj  = std::dynamic_pointer_cast<Linear>(blocks["in_proj"]);
-            auto out_proj = std::dynamic_pointer_cast<Linear>(blocks["out_proj"]);
-
-            x = in_proj->forward(ctx, x);
-            for (int i = 0; i < n_layers; i++) {
-                auto norm  = std::dynamic_pointer_cast<RMSNorm>(blocks["norms." + std::to_string(i)]);
-                auto embed = std::dynamic_pointer_cast<MLPEmbedder>(blocks["layers." + std::to_string(i)]);
-                x          = ggml_add_inplace(ctx, x, embed->forward(ctx, norm->forward(ctx, x)));
-            }
-            x = out_proj->forward(ctx, x);
-
-            return x;
-        }
-    };
-
     struct FluxParams {
         int64_t in_channels  = 64;
         int64_t out_channels = 64;
@@ -606,7 +504,6 @@ namespace Flux {
         bool qkv_bias               = true;
         bool guidance_embed         = true;
         bool flash_attn             = true;
-        bool is_chroma              = false;
     };
 
     struct Flux : public GGMLBlock {
@@ -724,7 +621,7 @@ namespace Flux {
             auto txt_ids = gen_txt_ids(bs, context_len);
             auto img_ids = gen_img_ids(h, w, patch_size, bs);
 
-            auto ids               = concat_ids(txt_ids, img_ids, bs);
+            auto ids = concat_ids(txt_ids, img_ids, bs);
             uint64_t curr_h_offset = 0;
             uint64_t curr_w_offset = 0;
             for (ggml_tensor* ref : ref_latents) {
@@ -737,7 +634,7 @@ namespace Flux {
                 }
 
                 auto ref_ids = gen_img_ids(ref->ne[1], ref->ne[0], patch_size, bs, 1, h_offset, w_offset);
-                ids          = concat_ids(ids, ref_ids, bs);
+                ids = concat_ids(ids, ref_ids, bs);
 
                 curr_h_offset = std::max(curr_h_offset, ref->ne[1] + h_offset);
                 curr_w_offset = std::max(curr_w_offset, ref->ne[0] + w_offset);
@@ -783,15 +680,11 @@ namespace Flux {
             : params(params) {
             int64_t pe_dim = params.hidden_size / params.num_heads;
 
-            blocks["img_in"] = std::shared_ptr<GGMLBlock>(new Linear(params.in_channels, params.hidden_size, true));
-            if (params.is_chroma) {
-                blocks["distilled_guidance_layer"] = std::shared_ptr<GGMLBlock>(new ChromaApproximator(params.in_channels, params.hidden_size));
-            } else {
-                blocks["time_in"]   = std::shared_ptr<GGMLBlock>(new MLPEmbedder(256, params.hidden_size));
-                blocks["vector_in"] = std::shared_ptr<GGMLBlock>(new MLPEmbedder(params.vec_in_dim, params.hidden_size));
-                if (params.guidance_embed) {
-                    blocks["guidance_in"] = std::shared_ptr<GGMLBlock>(new MLPEmbedder(256, params.hidden_size));
-                }
-            }
+            blocks["img_in"]    = std::shared_ptr<GGMLBlock>(new Linear(params.in_channels, params.hidden_size, true));
+            blocks["time_in"]   = std::shared_ptr<GGMLBlock>(new MLPEmbedder(256, params.hidden_size));
+            blocks["vector_in"] = std::shared_ptr<GGMLBlock>(new MLPEmbedder(params.vec_in_dim, params.hidden_size));
+            if (params.guidance_embed) {
+                blocks["guidance_in"] = std::shared_ptr<GGMLBlock>(new MLPEmbedder(256, params.hidden_size));
+            }
             blocks["txt_in"] = std::shared_ptr<GGMLBlock>(new Linear(params.context_in_dim, params.hidden_size, true));
 
@@ -799,23 +692,19 @@ namespace Flux {
                 blocks["double_blocks." + std::to_string(i)] = std::shared_ptr<GGMLBlock>(new DoubleStreamBlock(params.hidden_size,
                                                                                                                 params.num_heads,
                                                                                                                 params.mlp_ratio,
-                                                                                                                i,
                                                                                                                 params.qkv_bias,
-                                                                                                                params.flash_attn,
-                                                                                                                params.is_chroma));
+                                                                                                                params.flash_attn));
             }
 
             for (int i = 0; i < params.depth_single_blocks; i++) {
                 blocks["single_blocks." + std::to_string(i)] = std::shared_ptr<GGMLBlock>(new SingleStreamBlock(params.hidden_size,
                                                                                                                 params.num_heads,
                                                                                                                 params.mlp_ratio,
-                                                                                                                i,
                                                                                                                 0.f,
-                                                                                                                params.flash_attn,
-                                                                                                                params.is_chroma));
+                                                                                                                params.flash_attn));
             }
 
-            blocks["final_layer"] = std::shared_ptr<GGMLBlock>(new LastLayer(params.hidden_size, 1, params.out_channels, params.is_chroma));
+            blocks["final_layer"] = std::shared_ptr<GGMLBlock>(new LastLayer(params.hidden_size, 1, params.out_channels));
         }
 
         struct ggml_tensor* patchify(struct ggml_context* ctx,
@@ -872,55 +761,25 @@ namespace Flux {
                                         struct ggml_tensor* y,
                                         struct ggml_tensor* guidance,
                                         struct ggml_tensor* pe,
-                                        struct ggml_tensor* mod_index_arange = NULL,
-                                        std::vector<int> skip_layers         = {}) {
+                                        std::vector<int> skip_layers = {}) {
         auto img_in      = std::dynamic_pointer_cast<Linear>(blocks["img_in"]);
+        auto time_in     = std::dynamic_pointer_cast<MLPEmbedder>(blocks["time_in"]);
+        auto vector_in   = std::dynamic_pointer_cast<MLPEmbedder>(blocks["vector_in"]);
        auto txt_in      = std::dynamic_pointer_cast<Linear>(blocks["txt_in"]);
        auto final_layer = std::dynamic_pointer_cast<LastLayer>(blocks["final_layer"]);
 
-        img = img_in->forward(ctx, img);
-        struct ggml_tensor* vec;
-        struct ggml_tensor* txt_img_mask = NULL;
-        if (params.is_chroma) {
-            int64_t mod_index_length = 344;
-            auto approx              = std::dynamic_pointer_cast<ChromaApproximator>(blocks["distilled_guidance_layer"]);
-            auto distill_timestep    = ggml_nn_timestep_embedding(ctx, timesteps, 16, 10000, 1000.f);
-            auto distill_guidance    = ggml_nn_timestep_embedding(ctx, guidance, 16, 10000, 1000.f);
+        img      = img_in->forward(ctx, img);
+        auto vec = time_in->forward(ctx, ggml_nn_timestep_embedding(ctx, timesteps, 256, 10000, 1000.f));
 
-            // auto mod_index_arange = ggml_arange(ctx, 0, (float)mod_index_length, 1);
-            // ggml_arange not working on a lot of backends, precomputing it on CPU instead
-            GGML_ASSERT(mod_index_arange != NULL);
-            auto modulation_index = ggml_nn_timestep_embedding(ctx, mod_index_arange, 32, 10000, 1000.f);  // [1, 344, 32]
-
-            // Batch broadcast (will it ever be useful)
-            modulation_index = ggml_repeat(ctx, modulation_index, ggml_new_tensor_3d(ctx, GGML_TYPE_F32, modulation_index->ne[0], modulation_index->ne[1], img->ne[2]));  // [N, 344, 32]
-
-            auto timestep_guidance = ggml_concat(ctx, distill_timestep, distill_guidance, 0);  // [N, 1, 32]
-            timestep_guidance      = ggml_repeat(ctx, timestep_guidance, modulation_index);    // [N, 344, 32]
-
-            vec = ggml_concat(ctx, timestep_guidance, modulation_index, 0);  // [N, 344, 64]
-            // Permute for consistency with non-distilled modulation implementation
-            vec = ggml_cont(ctx, ggml_permute(ctx, vec, 0, 2, 1, 3));  // [344, N, 64]
-            vec = approx->forward(ctx, vec);                           // [344, N, hidden_size]
-
-            if (y != NULL) {
-                txt_img_mask = ggml_pad(ctx, y, img->ne[1], 0, 0, 0);
-            }
-        } else {
-            auto time_in   = std::dynamic_pointer_cast<MLPEmbedder>(blocks["time_in"]);
-            auto vector_in = std::dynamic_pointer_cast<MLPEmbedder>(blocks["vector_in"]);
-            vec            = time_in->forward(ctx, ggml_nn_timestep_embedding(ctx, timesteps, 256, 10000, 1000.f));
-            if (params.guidance_embed) {
-                GGML_ASSERT(guidance != NULL);
-                auto guidance_in = std::dynamic_pointer_cast<MLPEmbedder>(blocks["guidance_in"]);
-                // bf16 and fp16 result is different
-                auto g_in = ggml_nn_timestep_embedding(ctx, guidance, 256, 10000, 1000.f);
-                vec       = ggml_add(ctx, vec, guidance_in->forward(ctx, g_in));
-            }
-
-            vec = ggml_add(ctx, vec, vector_in->forward(ctx, y));
+        if (params.guidance_embed) {
+            GGML_ASSERT(guidance != NULL);
+            auto guidance_in = std::dynamic_pointer_cast<MLPEmbedder>(blocks["guidance_in"]);
+            // bf16 and fp16 result is different
+            auto g_in = ggml_nn_timestep_embedding(ctx, guidance, 256, 10000, 1000.f);
+            vec       = ggml_add(ctx, vec, guidance_in->forward(ctx, g_in));
         }
+
+        vec = ggml_add(ctx, vec, vector_in->forward(ctx, y));
        txt = txt_in->forward(ctx, txt);
 
        for (int i = 0; i < params.depth; i++) {
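In the removed Chroma branch of `forward_orig` above, each of the 344 modulation slots receives a 64-dimensional input: a 16-dim timestep embedding concatenated with a 16-dim guidance embedding (broadcast to all slots) plus a 32-dim embedding of the slot index. Reading the dimensions off the code:

```latex
\dim(\mathrm{vec}_{\text{slot}})
= \underbrace{16}_{\text{timestep emb.}}
+ \underbrace{16}_{\text{guidance emb.}}
+ \underbrace{32}_{\text{slot-index emb.}}
= 64,
```

which matches the `in_channels = 64` default of the removed `ChromaApproximator` that maps these `[344, N, 64]` inputs to `[344, N, hidden_size]` modulation planes.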
@ -930,7 +789,7 @@ namespace Flux {
|
||||
|
||||
auto block = std::dynamic_pointer_cast<DoubleStreamBlock>(blocks["double_blocks." + std::to_string(i)]);
|
||||
|
||||
auto img_txt = block->forward(ctx, img, txt, vec, pe, txt_img_mask);
|
||||
auto img_txt = block->forward(ctx, img, txt, vec, pe);
|
||||
img = img_txt.first; // [N, n_img_token, hidden_size]
|
||||
txt = img_txt.second; // [N, n_txt_token, hidden_size]
|
||||
}
|
||||
@ -942,7 +801,7 @@ namespace Flux {
|
||||
}
|
||||
auto block = std::dynamic_pointer_cast<SingleStreamBlock>(blocks["single_blocks." + std::to_string(i)]);
|
||||
|
||||
txt_img = block->forward(ctx, txt_img, vec, pe, txt_img_mask);
|
||||
txt_img = block->forward(ctx, txt_img, vec, pe);
|
||||
}
|
||||
|
||||
txt_img = ggml_cont(ctx, ggml_permute(ctx, txt_img, 0, 2, 1, 3)); // [n_txt_token + n_img_token, N, hidden_size]
|
||||
@ -957,11 +816,13 @@ namespace Flux {
|
||||
img = ggml_cont(ctx, ggml_permute(ctx, img, 0, 2, 1, 3)); // [N, n_img_token, hidden_size]
|
||||
|
||||
img = final_layer->forward(ctx, img, vec); // (N, T, patch_size ** 2 * out_channels)
|
||||
|
||||
return img;
|
||||
}
|
||||
|
||||
struct ggml_tensor* process_img(struct ggml_context* ctx,
|
||||
struct ggml_tensor* x) {
|
||||
|
||||
int64_t W = x->ne[0];
|
||||
int64_t H = x->ne[1];
|
||||
int64_t patch_size = 2;
|
@@ -982,9 +843,8 @@ namespace Flux {
                                struct ggml_tensor* y,
                                struct ggml_tensor* guidance,
                                struct ggml_tensor* pe,
-                               struct ggml_tensor* mod_index_arange = NULL,
                                std::vector<ggml_tensor*> ref_latents = {},
                                std::vector<int> skip_layers = {}) {
        // Forward pass of DiT.
        // x: (N, C, H, W) tensor of spatial inputs (images or latent representations of images)
        // timestep: (N,) tensor of diffusion timesteps
@@ -1004,7 +864,7 @@ namespace Flux {
        int pad_h = (patch_size - H % patch_size) % patch_size;
        int pad_w = (patch_size - W % patch_size) % patch_size;

        auto img = process_img(ctx, x);
        uint64_t img_tokens = img->ne[1];

        if (c_concat != NULL) {
@@ -1012,7 +872,7 @@ namespace Flux {
            ggml_tensor* mask = ggml_view_4d(ctx, c_concat, c_concat->ne[0], c_concat->ne[1], 8 * 8, 1, c_concat->nb[1], c_concat->nb[2], c_concat->nb[3], c_concat->nb[2] * C);

            masked = process_img(ctx, masked);
            mask = process_img(ctx, mask);

            img = ggml_concat(ctx, img, ggml_concat(ctx, masked, mask, 0), 0);
        }
@@ -1024,11 +884,11 @@ namespace Flux {
            }
        }

-       auto out = forward_orig(ctx, img, context, timestep, y, guidance, pe, mod_index_arange, skip_layers); // [N, num_tokens, C * patch_size * patch_size]
+       auto out = forward_orig(ctx, img, context, timestep, y, guidance, pe, skip_layers); // [N, num_tokens, C * patch_size * patch_size]
        if (out->ne[1] > img_tokens) {
            out = ggml_cont(ctx, ggml_permute(ctx, out, 0, 2, 1, 3)); // [num_tokens, N, C * patch_size * patch_size]
            out = ggml_view_3d(ctx, out, out->ne[0], out->ne[1], img_tokens, out->nb[1], out->nb[2], 0);
            out = ggml_cont(ctx, ggml_permute(ctx, out, 0, 2, 1, 3)); // [N, h*w, C * patch_size * patch_size]
        }

        // rearrange(out, "b (h w) (c ph pw) -> b c (h ph) (w pw)", h=h_len, w=w_len, ph=2, pw=2)
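The rearrange comment above is the exact inverse of patchify: tokens go back to pixels. A matching CPU sketch under the same illustrative layout as the patchify sketch earlier:

    #include <vector>

    // (h*w, C*ps*ps) -> (C, H, W), with H = h*ps, W = w*ps.
    std::vector<float> unpatchify(const std::vector<float>& tokens, int C, int h, int w, int ps = 2) {
        int H = h * ps, W = w * ps;
        std::vector<float> img((size_t)C * H * W);
        for (int t = 0; t < h * w; t++)
            for (int c = 0; c < C; c++)
                for (int py = 0; py < ps; py++)
                    for (int px = 0; px < ps; px++) {
                        int y = (t / w) * ps + py;
                        int x = (t % w) * ps + px;
                        img[((size_t)c * H + y) * W + x] =
                            tokens[((size_t)t * C + c) * ps * ps + py * ps + px];
                    }
        return img;
    }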
@@ -1044,18 +904,14 @@ namespace Flux {
    public:
        FluxParams flux_params;
        Flux flux;
-       std::vector<float> pe_vec;
-       std::vector<float> mod_index_arange_vec; // for cache
        SDVersion version;
-       bool use_mask = false;
+       std::vector<float> pe_vec; // for cache

        FluxRunner(ggml_backend_t backend,
                   std::map<std::string, enum ggml_type>& tensor_types = empty_tensor_types,
                   const std::string prefix = "",
                   SDVersion version = VERSION_FLUX,
-                  bool flash_attn = false,
-                  bool use_mask = false)
-           : GGMLRunner(backend), use_mask(use_mask) {
+                  bool flash_attn = false)
+           : GGMLRunner(backend) {
            flux_params.flash_attn = flash_attn;
            flux_params.guidance_embed = false;
            flux_params.depth = 0;
@@ -1071,10 +927,6 @@ namespace Flux {
                // not schnell
                flux_params.guidance_embed = true;
            }
-           if (tensor_name.find("distilled_guidance_layer.in_proj.weight") != std::string::npos) {
-               // Chroma
-               flux_params.is_chroma = true;
-           }
            size_t db = tensor_name.find("double_blocks.");
            if (db != std::string::npos) {
                tensor_name = tensor_name.substr(db); // remove prefix
@@ -1094,9 +946,7 @@ namespace Flux {
            }

            LOG_INFO("Flux blocks: %d double, %d single", flux_params.depth, flux_params.depth_single_blocks);
-           if (flux_params.is_chroma) {
-               LOG_INFO("Using pruned modulation (Chroma)");
-           } else if (!flux_params.guidance_embed) {
+           if (!flux_params.guidance_embed) {
                LOG_INFO("Flux guidance is disabled (Schnell mode)");
            }

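The constructor settles the architecture entirely by probing checkpoint tensor names: `guidance_in.*` marks a guidance-distilled (dev) model, `distilled_guidance_layer.in_proj.weight` marks Chroma, and the block counts come from the highest `double_blocks.N` / `single_blocks.N` index seen. A self-contained sketch of that depth probe (the map stands in for the keys of `model_loader.tensor_storages_types`; not the repo's exact code):

    #include <algorithm>
    #include <cstring>
    #include <map>
    #include <string>

    int count_blocks(const std::map<std::string, int>& tensors, const char* prefix) {
        int depth = 0;
        for (const auto& kv : tensors) {
            size_t p = kv.first.find(prefix);
            if (p == std::string::npos)
                continue;
            // std::stoi parses the leading digits and stops at the following '.'
            int idx = std::stoi(kv.first.substr(p + std::strlen(prefix)));
            depth   = std::max(depth, idx + 1); // block indices are 0-based
        }
        return depth;
    }

    // e.g. count_blocks(tensors, "double_blocks.") -> 19 for Flux-dev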
@@ -1119,33 +969,18 @@ namespace Flux {
                                        struct ggml_tensor* y,
                                        struct ggml_tensor* guidance,
                                        std::vector<ggml_tensor*> ref_latents = {},
-                                       std::vector<int> skip_layers = {}) {
+                                       std::vector<int> skip_layers = std::vector<int>()) {
            GGML_ASSERT(x->ne[3] == 1);
            struct ggml_cgraph* gf = ggml_new_graph_custom(compute_ctx, FLUX_GRAPH_SIZE, false);

-           struct ggml_tensor* mod_index_arange = NULL;
-
            x = to_backend(x);
            context = to_backend(context);
            if (c_concat != NULL) {
                c_concat = to_backend(c_concat);
            }
-           if (flux_params.is_chroma) {
-               guidance = ggml_set_f32(guidance, 0);
-
-               if (!use_mask) {
-                   y = NULL;
-               }
-
-               // ggml_arange is not working on some backends, precompute it
-               mod_index_arange_vec = arange(0, 344);
-               mod_index_arange = ggml_new_tensor_1d(compute_ctx, GGML_TYPE_F32, mod_index_arange_vec.size());
-               set_backend_tensor_data(mod_index_arange, mod_index_arange_vec.data());
-           }
-           y = to_backend(y);
-
+           y = to_backend(y);
            timesteps = to_backend(timesteps);
-           if (flux_params.guidance_embed || flux_params.is_chroma) {
+           if (flux_params.guidance_embed) {
                guidance = to_backend(guidance);
            }
            for (int i = 0; i < ref_latents.size(); i++) {
@@ -1169,7 +1004,6 @@ namespace Flux {
                                   y,
                                   guidance,
                                   pe,
-                                  mod_index_arange,
                                   ref_latents,
                                   skip_layers);

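The removed Chroma path also shows the standard workaround for ops that some backends lack: since `ggml_arange` is not implemented everywhere, the range is materialised on the host and uploaded as input data. The pattern in isolation (a sketch; the cached vector must outlive graph execution, which is why `mod_index_arange_vec` lives on the runner):

    #include <vector>

    // Host-side stand-in for ggml_arange(ctx, start, stop, 1).
    std::vector<float> arange_vec(float start, float stop, float step = 1.f) {
        std::vector<float> v;
        for (float x = start; x < stop; x += step)
            v.push_back(x);
        return v;
    }

    // Then, mirroring the removed lines:
    //   mod_index_arange_vec = arange_vec(0, 344);
    //   mod_index_arange     = ggml_new_tensor_1d(compute_ctx, GGML_TYPE_F32, mod_index_arange_vec.size());
    //   set_backend_tensor_data(mod_index_arange, mod_index_arange_vec.data());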
@@ -1186,9 +1020,9 @@ namespace Flux {
                 struct ggml_tensor* y,
                 struct ggml_tensor* guidance,
                 std::vector<ggml_tensor*> ref_latents = {},
                 struct ggml_tensor** output = NULL,
                 struct ggml_context* output_ctx = NULL,
                 std::vector<int> skip_layers = std::vector<int>()) {
        // x: [N, in_channels, h, w]
        // timesteps: [N, ]
        // context: [N, max_position, hidden_size]

ggml_extend.hpp
@@ -864,18 +864,6 @@ __STATIC_INLINE__ struct ggml_tensor* ggml_nn_attention_ext(struct ggml_context*
        v = ggml_reshape_3d(ctx, v, d_head, L_k, n_head * N); // [N * n_head, L_k, d_head]
        v = ggml_cast(ctx, v, GGML_TYPE_F16);

-       if (mask != nullptr) {
-           mask = ggml_transpose(ctx, mask);
-
-           if (mask->ne[1] < GGML_PAD(q->ne[1], GGML_KQ_MASK_PAD)) {
-               LOG_DEBUG("mask dims %ld, %ld, %ld, %ld\n", mask->ne[0], mask->ne[1], mask->ne[2], mask->ne[3]);
-               LOG_DEBUG("needs padding, padding from %ld to %ld\n", mask->ne[1], GGML_PAD(q->ne[1], GGML_KQ_MASK_PAD));
-               mask = ggml_pad(ctx, mask, 0, GGML_PAD(q->ne[1], GGML_KQ_MASK_PAD) - mask->ne[1], 0, 0);
-           }
-
-           mask = ggml_cast(ctx, mask, GGML_TYPE_F16);
-       }
-
        kqv = ggml_flash_attn_ext(ctx, q, k, v, mask, scale, 0, 0);
        ggml_flash_attn_ext_set_prec(kqv, GGML_PREC_F32);
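ggml's flash-attention kernel expects the mask's second dimension padded up to a multiple of `GGML_KQ_MASK_PAD`, which is what the removed block enforces with `ggml_pad` before casting to F16. The rounding itself is the usual round-up-to-multiple idiom:

    #include <cstdint>

    // Equivalent of GGML_PAD(x, n) for positive n: round x up to a multiple of n.
    static inline int64_t round_up(int64_t x, int64_t n) {
        return ((x + n - 1) / n) * n;
    }
    // e.g. a 77-row mask with a (hypothetical) pad of 32 becomes 96 rows;
    // ggml_pad fills the new rows with zeros.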
@@ -888,7 +876,7 @@ __STATIC_INLINE__ struct ggml_tensor* ggml_nn_attention_ext(struct ggml_context*
        auto kq = ggml_mul_mat(ctx, k, q); // [N * n_head, L_q, L_k]
        kq = ggml_scale_inplace(ctx, kq, scale);
        if (mask) {
-           kq = ggml_add_inplace(ctx, kq, mask);
+           kq = ggml_add(ctx, kq, mask);
        }
        if (diag_mask_inf) {
            kq = ggml_diag_mask_inf_inplace(ctx, kq, 0);

lora.hpp
@@ -291,6 +291,7 @@ struct LoraModel : public GGMLRunner {
            std::string hada_2_down_name = "";
            std::string hada_2_up_name = "";

            hada_1_down_name = fk + ".hada_w1_b";
            hada_1_up_name = fk + ".hada_w1_a";
            hada_1_mid_name = fk + ".hada_t1";
@@ -413,7 +414,7 @@ struct LoraModel : public GGMLRunner {
            }
            lokr_w2 = ggml_merge_lora(compute_ctx, down, up);
        }

        // Technically it might be unused, but I believe it's the expected behavior
        applied_lora_tensors.insert(alpha_name);

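`ggml_merge_lora(compute_ctx, down, up)` folds a low-rank pair back into a dense weight delta; with the conventional scaling the merged update is delta_W = (alpha / rank) * up * down, which is also why `alpha_name` is recorded as applied even on branches that never read it. A plain-matrix sketch of that merge (illustrative helper, not the repo's API):

    #include <vector>

    // delta[o][i] = (alpha / rank) * sum_r up[o][r] * down[r][i]
    std::vector<float> merge_lora(const std::vector<float>& up,   // n_out x rank
                                  const std::vector<float>& down, // rank x n_in
                                  int n_out, int n_in, int rank, float alpha) {
        float scale = alpha / rank;
        std::vector<float> delta((size_t)n_out * n_in, 0.f);
        for (int o = 0; o < n_out; o++)
            for (int r = 0; r < rank; r++)
                for (int i = 0; i < n_in; i++)
                    delta[(size_t)o * n_in + i] +=
                        scale * up[(size_t)o * rank + r] * down[(size_t)r * n_in + i];
        return delta;
    }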
model.h
@@ -12,9 +12,9 @@

#include "ggml-backend.h"
#include "ggml.h"
-#include "gguf.h"
#include "json.hpp"
#include "zip.h"
+#include "gguf.h"

#define SD_MAX_DIMS 5

stable-diffusion.cpp
@@ -48,7 +48,8 @@ const char* sampling_methods_str[] = {
    "iPNDM_v",
    "LCM",
    "DDIM \"trailing\"",
-   "TCD"};
+   "TCD"
+};

/*================================================== Helper Functions ================================================*/

@@ -158,10 +159,7 @@ public:
              bool clip_on_cpu,
              bool control_net_cpu,
              bool vae_on_cpu,
-             bool diffusion_flash_attn,
-             bool chroma_use_dit_mask,
-             bool chroma_use_t5_mask,
-             int chroma_t5_mask_pad) {
+             bool diffusion_flash_attn) {
        use_tiny_autoencoder = taesd_path.size() > 0;
#ifdef SD_USE_CUDA
        LOG_DEBUG("Using CUDA backend");
@@ -336,19 +334,8 @@ public:
            cond_stage_model = std::make_shared<SD3CLIPEmbedder>(clip_backend, model_loader.tensor_storages_types);
            diffusion_model = std::make_shared<MMDiTModel>(backend, model_loader.tensor_storages_types);
        } else if (sd_version_is_flux(version)) {
-           bool is_chroma = false;
-           for (auto pair : model_loader.tensor_storages_types) {
-               if (pair.first.find("distilled_guidance_layer.in_proj.weight") != std::string::npos) {
-                   is_chroma = true;
-                   break;
-               }
-           }
-           if (is_chroma) {
-               cond_stage_model = std::make_shared<PixArtCLIPEmbedder>(clip_backend, model_loader.tensor_storages_types, -1, chroma_use_t5_mask, chroma_t5_mask_pad);
-           } else {
-               cond_stage_model = std::make_shared<FluxCLIPEmbedder>(clip_backend, model_loader.tensor_storages_types);
-           }
-           diffusion_model = std::make_shared<FluxModel>(backend, model_loader.tensor_storages_types, version, diffusion_flash_attn, chroma_use_dit_mask);
+           cond_stage_model = std::make_shared<FluxCLIPEmbedder>(clip_backend, model_loader.tensor_storages_types);
+           diffusion_model = std::make_shared<FluxModel>(backend, model_loader.tensor_storages_types, version, diffusion_flash_attn);
        } else {
            if (id_embeddings_path.find("v2") != std::string::npos) {
                cond_stage_model = std::make_shared<FrozenCLIPEmbedderWithCustomWords>(clip_backend, model_loader.tensor_storages_types, embeddings_path, version, PM_VERSION_2);
@@ -695,7 +682,7 @@ public:
            float curr_multiplier = kv.second;
            lora_state_diff[lora_name] -= curr_multiplier;
        }

        size_t rm = lora_state_diff.size() - lora_state.size();
        if (rm != 0) {
            LOG_INFO("Attempting to apply %lu LoRAs (removing %lu applied LoRAs)", lora_state.size(), rm);
@@ -814,11 +801,11 @@ public:
                    int start_merge_step,
                    SDCondition id_cond,
                    std::vector<ggml_tensor*> ref_latents = {},
                    std::vector<int> skip_layers = {},
                    float slg_scale = 0,
                    float skip_layer_start = 0.01,
                    float skip_layer_end = 0.2,
                    ggml_tensor* noise_mask = nullptr) {
        LOG_DEBUG("Sample");
        struct ggml_init_params params;
        size_t data_size = ggml_row_size(init_latent->type, init_latent->ne[0]);
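The tail parameters of `sample` drive skip-layer guidance: inside a window of the sampling schedule the model is run once more with `skip_layers` bypassed, and the difference steers the result. A hedged sketch of the usual blend (the repo's exact formula may differ; all names here are illustrative):

    #include <cstddef>

    // step_frac in [0, 1]; out_cond from the normal pass, out_skip with layers skipped.
    void apply_slg(float* out_cond, const float* out_skip, size_t n, float step_frac,
                   float slg_scale, float skip_layer_start, float skip_layer_end) {
        if (slg_scale == 0 || step_frac < skip_layer_start || step_frac > skip_layer_end)
            return;
        for (size_t i = 0; i < n; i++)
            out_cond[i] += slg_scale * (out_cond[i] - out_skip[i]);
    }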
@@ -1148,10 +1135,7 @@ sd_ctx_t* new_sd_ctx(const char* model_path_c_str,
                     bool keep_clip_on_cpu,
                     bool keep_control_net_cpu,
                     bool keep_vae_on_cpu,
-                    bool diffusion_flash_attn,
-                    bool chroma_use_dit_mask,
-                    bool chroma_use_t5_mask,
-                    int chroma_t5_mask_pad) {
+                    bool diffusion_flash_attn) {
    sd_ctx_t* sd_ctx = (sd_ctx_t*)malloc(sizeof(sd_ctx_t));
    if (sd_ctx == NULL) {
        return NULL;
@@ -1193,10 +1177,7 @@ sd_ctx_t* new_sd_ctx(const char* model_path_c_str,
                                    keep_clip_on_cpu,
                                    keep_control_net_cpu,
                                    keep_vae_on_cpu,
-                                   diffusion_flash_attn,
-                                   chroma_use_dit_mask,
-                                   chroma_use_t5_mask,
-                                   chroma_t5_mask_pad)) {
+                                   diffusion_flash_attn)) {
        delete sd_ctx->sd;
        sd_ctx->sd = NULL;
        free(sd_ctx);
@@ -1972,6 +1953,7 @@ SD_API sd_image_t* img2vid(sd_ctx_t* sd_ctx,
    return result_images;
}

sd_image_t* edit(sd_ctx_t* sd_ctx,
                 sd_image_t* ref_images,
                 int ref_images_count,
@@ -2060,7 +2042,7 @@ sd_image_t* edit(sd_ctx_t* sd_ctx,
        }
        ref_latents.push_back(latent);
    }

    size_t t1 = ggml_time_ms();
    LOG_INFO("encode_first_stage completed, taking %.2fs", (t1 - t0) * 1.0f / 1000);

stable-diffusion.h
@@ -61,10 +61,10 @@ enum schedule_t {

// same as enum ggml_type
enum sd_type_t {
    SD_TYPE_F32  = 0,
    SD_TYPE_F16  = 1,
    SD_TYPE_Q4_0 = 2,
    SD_TYPE_Q4_1 = 3,
    // SD_TYPE_Q4_2 = 4, support has been removed
    // SD_TYPE_Q4_3 = 5, support has been removed
    SD_TYPE_Q5_0 = 6,
@@ -95,12 +95,12 @@ enum sd_type_t {
    // SD_TYPE_Q4_0_4_4 = 31, support has been removed from gguf files
    // SD_TYPE_Q4_0_4_8 = 32,
    // SD_TYPE_Q4_0_8_8 = 33,
    SD_TYPE_TQ1_0 = 34,
    SD_TYPE_TQ2_0 = 35,
    // SD_TYPE_IQ4_NL_4_4 = 36,
    // SD_TYPE_IQ4_NL_4_8 = 37,
    // SD_TYPE_IQ4_NL_8_8 = 38,
    SD_TYPE_COUNT = 39,
};

SD_API const char* sd_type_name(enum sd_type_t type);
@@ -150,10 +150,7 @@ SD_API sd_ctx_t* new_sd_ctx(const char* model_path,
                            bool keep_clip_on_cpu,
                            bool keep_control_net_cpu,
                            bool keep_vae_on_cpu,
-                           bool diffusion_flash_attn,
-                           bool chroma_use_dit_mask,
-                           bool chroma_use_t5_mask,
-                           int chroma_t5_mask_pad);
+                           bool diffusion_flash_attn);

SD_API void free_sd_ctx(sd_ctx_t* sd_ctx);

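Because `sd_type_t` deliberately mirrors `enum ggml_type` value-for-value (retired slots included, hence the kept comments), converting between the public API type and ggml's is a plain cast; a sketch under that assumption:

    // Valid only while the two enums stay numerically identical.
    static inline enum ggml_type sd_type_to_ggml_type(enum sd_type_t t) {
        return (enum ggml_type)t;
    }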
t5.hpp
@@ -385,7 +385,6 @@ public:

    void pad_tokens(std::vector<int>& tokens,
                    std::vector<float>& weights,
-                   std::vector<float>* attention_mask,
                    size_t max_length = 0,
                    bool padding = false) {
        if (max_length > 0 && padding) {
@@ -398,15 +397,11 @@ public:
        LOG_DEBUG("token length: %llu", length);
        std::vector<int> new_tokens;
        std::vector<float> new_weights;
-       std::vector<float> new_attention_mask;
        int token_idx = 0;
        for (int i = 0; i < length; i++) {
            if (token_idx >= orig_token_num) {
                break;
            }
-           if (attention_mask != nullptr) {
-               new_attention_mask.push_back(0.0);
-           }
            if (i % max_length == max_length - 1) {
                new_tokens.push_back(eos_id_);
                new_weights.push_back(1.0);
@@ -419,24 +414,13 @@ public:

        new_tokens.push_back(eos_id_);
        new_weights.push_back(1.0);
-       if (attention_mask != nullptr) {
-           new_attention_mask.push_back(0.0);
-       }

        tokens = new_tokens;
        weights = new_weights;
-       if (attention_mask != nullptr) {
-           *attention_mask = new_attention_mask;
-       }

        if (padding) {
            int pad_token_id = pad_id_;
            tokens.insert(tokens.end(), length - tokens.size(), pad_token_id);
            weights.insert(weights.end(), length - weights.size(), 1.0);
-           if (attention_mask != nullptr) {
-               // maybe keep some padding tokens unmasked?
-               attention_mask->insert(attention_mask->end(), length - attention_mask->size(), -HUGE_VALF);
-           }
        }
    }
}
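The removed plumbing builds an additive attention mask alongside the tokens: 0.0 for real tokens (EOS included) and -HUGE_VALF for pad positions, so padded logits vanish after softmax. The same construction in isolation:

    #include <cmath>   // HUGE_VALF
    #include <cstddef>
    #include <vector>

    // Additive mask for a sequence with n_real real tokens padded to `total`.
    std::vector<float> build_pad_mask(size_t n_real, size_t total) {
        std::vector<float> mask(total, -HUGE_VALF); // pad: -inf -> softmax weight 0
        for (size_t i = 0; i < n_real && i < total; i++)
            mask[i] = 0.0f;                         // real tokens attend normally
        return mask;
    }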
@@ -595,7 +579,6 @@ public:
        }
        if (past_bias != NULL) {
            if (mask != NULL) {
-               mask = ggml_repeat(ctx, mask, past_bias);
                mask = ggml_add(ctx, mask, past_bias);
            } else {
                mask = past_bias;
@@ -756,17 +739,15 @@ struct T5Runner : public GGMLRunner {

    struct ggml_tensor* forward(struct ggml_context* ctx,
                                struct ggml_tensor* input_ids,
-                               struct ggml_tensor* relative_position_bucket,
-                               struct ggml_tensor* attention_mask = NULL) {
+                               struct ggml_tensor* relative_position_bucket) {
        size_t N = input_ids->ne[1];
        size_t n_token = input_ids->ne[0];

-       auto hidden_states = model.forward(ctx, input_ids, NULL, attention_mask, relative_position_bucket); // [N, n_token, model_dim]
+       auto hidden_states = model.forward(ctx, input_ids, NULL, NULL, relative_position_bucket); // [N, n_token, model_dim]
        return hidden_states;
    }

-   struct ggml_cgraph* build_graph(struct ggml_tensor* input_ids,
-                                   struct ggml_tensor* attention_mask = NULL) {
+   struct ggml_cgraph* build_graph(struct ggml_tensor* input_ids) {
        struct ggml_cgraph* gf = ggml_new_graph(compute_ctx);

        input_ids = to_backend(input_ids);
@@ -786,7 +767,7 @@ struct T5Runner : public GGMLRunner {
                                input_ids->ne[0]);
        set_backend_tensor_data(relative_position_bucket, relative_position_bucket_vec.data());

-       struct ggml_tensor* hidden_states = forward(compute_ctx, input_ids, relative_position_bucket, attention_mask);
+       struct ggml_tensor* hidden_states = forward(compute_ctx, input_ids, relative_position_bucket);

        ggml_build_forward_expand(gf, hidden_states);

@@ -795,11 +776,10 @@ struct T5Runner : public GGMLRunner {

    void compute(const int n_threads,
                 struct ggml_tensor* input_ids,
-                struct ggml_tensor* attention_mask,
                 ggml_tensor** output,
                 ggml_context* output_ctx = NULL) {
        auto get_graph = [&]() -> struct ggml_cgraph* {
-           return build_graph(input_ids, attention_mask);
+           return build_graph(input_ids);
        };
        GGMLRunner::compute(get_graph, n_threads, true, output, output_ctx);
    }
@@ -897,9 +877,9 @@ struct T5Embedder {
        model.alloc_params_buffer();
    }

-   std::tuple<std::vector<int>, std::vector<float>, std::vector<float>> tokenize(std::string text,
-                                                                                 size_t max_length = 0,
-                                                                                 bool padding = false) {
+   std::pair<std::vector<int>, std::vector<float>> tokenize(std::string text,
+                                                            size_t max_length = 0,
+                                                            bool padding = false) {
        auto parsed_attention = parse_prompt_attention(text);

        {
@@ -926,16 +906,14 @@ struct T5Embedder {
        tokens.push_back(EOS_TOKEN_ID);
        weights.push_back(1.0);

-       std::vector<float> attention_mask;
-
-       tokenizer.pad_tokens(tokens, weights, &attention_mask, max_length, padding);
+       tokenizer.pad_tokens(tokens, weights, max_length, padding);

        // for (int i = 0; i < tokens.size(); i++) {
        //     std::cout << tokens[i] << ":" << weights[i] << ", ";
        // }
        // std::cout << std::endl;

-       return {tokens, weights, attention_mask};
+       return {tokens, weights};
    }

    void test() {
@@ -956,8 +934,8 @@ struct T5Embedder {
        // TODO: fix cuda nan
        std::string text("a lovely cat");
        auto tokens_and_weights = tokenize(text, 77, true);
-       std::vector<int>& tokens = std::get<0>(tokens_and_weights);
-       std::vector<float>& weights = std::get<1>(tokens_and_weights);
+       std::vector<int>& tokens = tokens_and_weights.first;
+       std::vector<float>& weights = tokens_and_weights.second;
        for (auto token : tokens) {
            printf("%d ", token);
        }
@@ -966,7 +944,7 @@ struct T5Embedder {
        struct ggml_tensor* out = NULL;

        int t0 = ggml_time_ms();
-       model.compute(8, input_ids, NULL, &out, work_ctx);
+       model.compute(8, input_ids, &out, work_ctx);
        int t1 = ggml_time_ms();

        print_ggml_tensor(out);

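With `tokenize` back to returning a pair, call sites switch from `std::get<N>` to `.first`/`.second`; with C++17 structured bindings either return shape unpacks the same way (`embedder` is an illustrative T5Embedder instance):

    auto [tokens, weights] = embedder.tokenize("a lovely cat", 77, true);
    // and on the tuple-returning version:
    // auto [tokens, weights, attention_mask] = embedder.tokenize("a lovely cat", 77, true);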
util.cpp
@@ -112,7 +112,7 @@ std::vector<std::string> get_files_from_dir(const std::string& dir) {
    sprintf(directoryPath, "%s\\%s\\*", currentDirectory, dir.c_str());

    // Find the first file in the directory
    hFind = FindFirstFile(directoryPath, &findFileData);
    bool isAbsolutePath = false;
    // Check if the directory was found
    if (hFind == INVALID_HANDLE_VALUE) {
@@ -121,7 +121,7 @@ std::vector<std::string> get_files_from_dir(const std::string& dir) {
        char directoryPathAbsolute[MAX_PATH];
        sprintf(directoryPathAbsolute, "%s*", dir.c_str());

        hFind = FindFirstFile(directoryPathAbsolute, &findFileData);
        isAbsolutePath = true;
        if (hFind == INVALID_HANDLE_VALUE) {
            printf("Absolute path was also wrong.\n");