From ded8583a2448931668f2f7a273ca68b44c4c602c Mon Sep 17 00:00:00 2001 From: KenForever1 <2962666398@qq.com> Date: Thu, 18 Jun 2026 23:44:34 +0800 Subject: [PATCH 1/7] Add MiniT2I inference support --- src/conditioning/conditioner.hpp | 95 +++++ src/core/ggml_extend_backend.cpp | 81 ++++- src/model.h | 9 + src/model/diffusion/minit2i.hpp | 573 +++++++++++++++++++++++++++++++ src/model/te/t5.hpp | 79 ++++- src/model/vae/vae.hpp | 2 +- src/model_loader.cpp | 11 + src/stable-diffusion.cpp | 100 +++++- 8 files changed, 931 insertions(+), 19 deletions(-) create mode 100644 src/model/diffusion/minit2i.hpp diff --git a/src/conditioning/conditioner.hpp b/src/conditioning/conditioner.hpp index e8b8ee3da..d63303a82 100644 --- a/src/conditioning/conditioner.hpp +++ b/src/conditioning/conditioner.hpp @@ -1378,6 +1378,101 @@ struct T5CLIPEmbedder : public Conditioner { } }; +struct MiniT2IConditioner : public Conditioner { + T5UniGramTokenizer tokenizer; + std::shared_ptr t5; + size_t prompt_length = 256; + + MiniT2IConditioner(ggml_backend_t backend, + const String2TensorStorage& tensor_storage_map = {}, + std::shared_ptr weight_manager = nullptr) { + bool use_t5 = false; + for (const auto& pair : tensor_storage_map) { + if (pair.first.find("text_encoders.t5xxl") != std::string::npos) { + use_t5 = true; + break; + } + } + if (!use_t5) { + LOG_WARN("IMPORTANT NOTICE: No MiniT2I T5 text encoder provided, cannot process prompts!"); + return; + } + t5 = std::make_shared(backend, tensor_storage_map, "text_encoders.t5xxl.transformer", false, weight_manager); + } + + void get_param_tensors(std::map& tensors) override { + if (t5) { + t5->get_param_tensors(tensors, "text_encoders.t5xxl.transformer"); + } + } + + void set_max_graph_vram_bytes(size_t max_vram_bytes) override { + if (t5) { + t5->set_max_graph_vram_bytes(max_vram_bytes); + } + } + + void set_stream_layers_enabled(bool enabled) override { + if (t5) { + t5->set_stream_layers_enabled(enabled); + } + } + + void 
set_flash_attention_enabled(bool enabled) override { + if (t5) { + t5->set_flash_attention_enabled(enabled); + } + } + + void set_weight_adapter(const std::shared_ptr& adapter) override { + if (t5) { + t5->set_weight_adapter(adapter); + } + } + + void runner_done() override { + if (t5) { + t5->runner_done(); + } + } + + SDCondition get_learned_condition(int n_threads, + const ConditionerParams& conditioner_params) override { + SDCondition result; + if (!t5) { + result.c_crossattn = sd::Tensor::zeros({1024, static_cast(prompt_length)}); + result.c_vector = sd::Tensor::zeros({static_cast(prompt_length)}); + return result; + } + + std::vector tokens = tokenizer.encode(conditioner_params.text); + if (tokens.size() > prompt_length) { + tokens.resize(prompt_length); + } + std::vector mask(tokens.size(), 1.0f); + while (tokens.size() < prompt_length) { + tokens.push_back(tokenizer.PAD_TOKEN_ID); + mask.push_back(0.0f); + } + + sd::Tensor input_ids({static_cast(tokens.size())}, tokens); + std::vector t5_mask(mask.size(), 0.0f); + for (size_t i = 0; i < mask.size(); ++i) { + t5_mask[i] = mask[i] > 0.0f ? 
0.0f : -HUGE_VALF; + } + sd::Tensor hidden_states = t5->compute(n_threads, + input_ids, + sd::Tensor::from_vector(t5_mask), + false, + true, + true); + GGML_ASSERT(!hidden_states.empty()); + result.c_crossattn = std::move(hidden_states); + result.c_vector = sd::Tensor::from_vector(mask); + return result; + } +}; + struct AnimaConditioner : public Conditioner { std::shared_ptr qwen_tokenizer; T5UniGramTokenizer t5_tokenizer; diff --git a/src/core/ggml_extend_backend.cpp b/src/core/ggml_extend_backend.cpp index f3e2cceba..2eb62d3a3 100644 --- a/src/core/ggml_extend_backend.cpp +++ b/src/core/ggml_extend_backend.cpp @@ -110,7 +110,67 @@ static std::string resolve_first_device_by_type(enum ggml_backend_dev_type type) if (dev == nullptr) { return ""; } - return ggml_backend_dev_name(dev); + const char* dev_name = ggml_backend_dev_name(dev); + if (dev_name != nullptr && dev_name[0] != '\0') { + return dev_name; + } + ggml_backend_reg_t reg = ggml_backend_dev_backend_reg(dev); + const char* reg_name = reg != nullptr ? ggml_backend_reg_name(reg) : nullptr; + return reg_name != nullptr ? 
reg_name : ""; +} + +static ggml_backend_dev_t resolve_first_device_by_registry_name(const std::string& name) { + std::string lower = lower_copy(trim_copy(name)); + if (lower == "metal") { + lower = "mtl"; + } + if (lower.empty()) { + return nullptr; + } + + const size_t device_count = ggml_backend_dev_count(); + for (size_t i = 0; i < device_count; ++i) { + ggml_backend_dev_t dev = ggml_backend_dev_get(i); + ggml_backend_reg_t reg = ggml_backend_dev_backend_reg(dev); + if (reg == nullptr) { + continue; + } + const char* reg_name = ggml_backend_reg_name(reg); + if (reg_name != nullptr && lower_copy(reg_name) == lower) { + return dev; + } + } + return nullptr; +} + +static ggml_backend_dev_t resolve_device_by_name(const std::string& name) { + const std::string lower = lower_copy(trim_copy(name)); + if (lower.empty()) { + return nullptr; + } + + const size_t device_count = ggml_backend_dev_count(); + for (size_t i = 0; i < device_count; ++i) { + ggml_backend_dev_t dev = ggml_backend_dev_get(i); + const char* dev_name = ggml_backend_dev_name(dev); + if (dev_name != nullptr && lower_copy(dev_name) == lower) { + return dev; + } + } + return nullptr; +} + +static std::string backend_device_name(ggml_backend_dev_t dev) { + if (dev == nullptr) { + return ""; + } + const char* name = ggml_backend_dev_name(dev); + if (name != nullptr && name[0] != '\0') { + return name; + } + ggml_backend_reg_t reg = ggml_backend_dev_backend_reg(dev); + const char* reg_name = reg != nullptr ? ggml_backend_reg_name(reg) : nullptr; + return reg_name != nullptr ? 
reg_name : ""; } static ggml_backend_buffer_t ggml_backend_tensor_buffer(const struct ggml_tensor* tensor) { @@ -296,6 +356,10 @@ std::string sd_backend_resolve_name(const std::string& name) { return resolve_first_device_by_type(GGML_BACKEND_DEVICE_TYPE_IGPU); } + if (ggml_backend_dev_t dev = resolve_first_device_by_registry_name(requested)) { + return backend_device_name(dev); + } + const size_t device_count = ggml_backend_dev_count(); for (size_t i = 0; i < device_count; ++i) { ggml_backend_dev_t dev = ggml_backend_dev_get(i); @@ -328,7 +392,20 @@ static ggml_backend_t init_named_backend(const std::string& name) { return ggml_backend_init_best(); } + if (ggml_backend_dev_t dev = resolve_device_by_name(name)) { + return ggml_backend_dev_init(dev, nullptr); + } + if (ggml_backend_dev_t dev = resolve_first_device_by_registry_name(name)) { + return ggml_backend_dev_init(dev, nullptr); + } + std::string resolved = sd_backend_resolve_name(name); + if (ggml_backend_dev_t dev = resolve_device_by_name(resolved)) { + return ggml_backend_dev_init(dev, nullptr); + } + if (ggml_backend_dev_t dev = resolve_first_device_by_registry_name(resolved)) { + return ggml_backend_dev_init(dev, nullptr); + } if (resolved.empty()) { return nullptr; } @@ -599,7 +676,7 @@ bool SDBackendManager::validate(std::string* error) const { } return false; } - if (!sd_backend_resolve_name(name).empty()) { + if (!sd_backend_resolve_name(name).empty() || resolve_first_device_by_registry_name(name) != nullptr) { return true; } if (error != nullptr) { diff --git a/src/model.h b/src/model.h index fff050149..75fdbe643 100644 --- a/src/model.h +++ b/src/model.h @@ -46,6 +46,7 @@ enum SDVersion { VERSION_OVIS_IMAGE, VERSION_ERNIE_IMAGE, VERSION_LENS, + VERSION_MINIT2I, VERSION_LONGCAT, VERSION_PID, VERSION_IDEOGRAM4, @@ -174,6 +175,13 @@ static inline bool sd_version_is_lens(SDVersion version) { return false; } +static inline bool sd_version_is_minit2i(SDVersion version) { + if (version == VERSION_MINIT2I) { 
+ return true; + } + return false; +} + static inline bool sd_version_is_pid(SDVersion version) { if (version == VERSION_PID) { return true; @@ -247,6 +255,7 @@ static inline bool sd_version_is_dit(SDVersion version) { sd_version_is_boogu_image(version) || sd_version_is_ernie_image(version) || sd_version_is_lens(version) || + sd_version_is_minit2i(version) || sd_version_is_longcat(version) || sd_version_is_pid(version) || sd_version_is_ideogram4(version) || diff --git a/src/model/diffusion/minit2i.hpp b/src/model/diffusion/minit2i.hpp new file mode 100644 index 000000000..d69f1f4ac --- /dev/null +++ b/src/model/diffusion/minit2i.hpp @@ -0,0 +1,573 @@ +#ifndef __SD_MODEL_DIFFUSION_MINIT2I_HPP__ +#define __SD_MODEL_DIFFUSION_MINIT2I_HPP__ + +#include +#include +#include +#include +#include +#include +#include + +#include "core/ggml_extend.hpp" +#include "model/common/rope.hpp" +#include "model/diffusion/dit.hpp" +#include "model/diffusion/model.hpp" +#include "model_loader.h" + +namespace MiniT2I { + constexpr int MINIT2I_GRAPH_SIZE = 196608; + + struct MiniT2IConfig { + int64_t image_size = 512; + int64_t patch_size = 16; + int64_t in_channels = 3; + int64_t txt_input_size = 1024; + int64_t hidden_size = 768; + int64_t txt_hidden_size = 768; + int64_t cond_vec_size = 768; + int64_t depth_double = 17; + int64_t txt_preamble_depth = 2; + int64_t num_heads = 12; + int64_t head_dim = 64; + float mlp_ratio = 2.6667f; + int64_t pca_channels = 128; + int64_t prompt_length = 256; + int64_t n_T = 100; + float cfg_interval_start = 0.0f; + float cfg_interval_end = 1.0f; + + static MiniT2IConfig detect_from_weights(const String2TensorStorage& tensor_storage_map, const std::string& prefix) { + MiniT2IConfig config; + config.depth_double = 0; + config.txt_preamble_depth = 0; + + for (const auto& [name, tensor_storage] : tensor_storage_map) { + if (!starts_with(name, prefix)) { + continue; + } + if (ends_with(name, "img_embedder.proj1.weight") && tensor_storage.n_dims == 4) { + 
config.patch_size = tensor_storage.ne[0]; + config.in_channels = tensor_storage.ne[2]; + config.pca_channels = tensor_storage.ne[3]; + } else if (ends_with(name, "img_embedder.proj2.weight") && tensor_storage.n_dims == 4) { + config.pca_channels = tensor_storage.ne[2]; + config.hidden_size = tensor_storage.ne[3]; + } else if (ends_with(name, "txt_embedder.weight") && tensor_storage.n_dims == 2) { + config.txt_input_size = tensor_storage.ne[0]; + config.txt_hidden_size = tensor_storage.ne[1]; + } else if (ends_with(name, "pooled_embedder.weight") && tensor_storage.n_dims == 2) { + config.cond_vec_size = tensor_storage.ne[1]; + } else if (ends_with(name, "double_blocks.0.img_qkv.weight") && tensor_storage.n_dims == 2) { + int64_t inner3 = tensor_storage.ne[1]; + int64_t inner = inner3 / 3; + config.hidden_size = tensor_storage.ne[0]; + if (config.hidden_size == 768) { + config.num_heads = 12; + config.head_dim = 64; + } else if (config.hidden_size == 1248) { + config.num_heads = 24; + config.head_dim = 52; + } else if (inner > 0) { + config.head_dim = 64; + config.num_heads = std::max(1, inner / config.head_dim); + } + } else if (ends_with(name, "final_layer.linear.weight") && tensor_storage.n_dims == 2) { + int64_t patch_area = config.patch_size * config.patch_size; + config.hidden_size = tensor_storage.ne[0]; + config.in_channels = patch_area > 0 ? 
tensor_storage.ne[1] / patch_area : config.in_channels; + } else if (ends_with(name, "mask_token") && tensor_storage.n_dims >= 2) { + config.prompt_length = tensor_storage.ne[1]; + } + + size_t pos = name.find("double_blocks."); + if (pos != std::string::npos) { + auto items = split_string(name.substr(pos), '.'); + if (items.size() > 1) { + int64_t idx = atoi(items[1].c_str()); + config.depth_double = std::max(config.depth_double, idx + 1); + } + } + pos = name.find("txt_preamble_blocks."); + if (pos != std::string::npos) { + auto items = split_string(name.substr(pos), '.'); + if (items.size() > 1) { + int64_t idx = atoi(items[1].c_str()); + config.txt_preamble_depth = std::max(config.txt_preamble_depth, idx + 1); + } + } + } + + if (config.depth_double <= 0) { + config.depth_double = config.hidden_size == 1248 ? 23 : 17; + } + if (config.txt_preamble_depth <= 0) { + config.txt_preamble_depth = 2; + } + if (config.head_dim <= 0 || config.num_heads <= 0) { + config.head_dim = config.hidden_size == 1248 ? 
52 : 64; + config.num_heads = config.hidden_size / config.head_dim; + } + LOG_DEBUG("minit2i: hidden_size=%" PRId64 ", txt_hidden_size=%" PRId64 ", heads=%" PRId64 ", head_dim=%" PRId64 ", double_blocks=%" PRId64 ", txt_blocks=%" PRId64 ", patch=%" PRId64 ", in_channels=%" PRId64, + config.hidden_size, + config.txt_hidden_size, + config.num_heads, + config.head_dim, + config.depth_double, + config.txt_preamble_depth, + config.patch_size, + config.in_channels); + return config; + } + }; + + inline std::vector make_2d_sincos_pos_embed(int grid_size, int dim) { + GGML_ASSERT(dim % 4 == 0); + int half_dim = dim / 2; + int quarter = half_dim / 2; + std::vector out(static_cast(grid_size) * grid_size * dim); + std::vector omega(quarter); + for (int i = 0; i < quarter; ++i) { + omega[i] = 1.0f / std::pow(10000.0f, static_cast(i) / static_cast(quarter)); + } + for (int y = 0; y < grid_size; ++y) { + for (int x = 0; x < grid_size; ++x) { + size_t base = static_cast(y * grid_size + x) * dim; + for (int i = 0; i < quarter; ++i) { + float ay = y * omega[i]; + float ax = x * omega[i]; + out[base + i] = std::sin(ax); + out[base + quarter + i] = std::cos(ax); + out[base + half_dim + i] = std::sin(ay); + out[base + half_dim + quarter + i] = std::cos(ay); + } + } + } + return out; + } + + inline std::vector make_text_rope(int length, int head_dim) { + return Rope::flatten(Rope::rope(Rope::linspace(0.f, static_cast(length - 1), length), head_dim, 10000.f)); + } + + inline std::vector make_vision_rope(int side, int head_dim) { + GGML_ASSERT(head_dim % 4 == 0); + int dim = head_dim / 2; + int quarter = dim / 2; + int length = side * side; + std::vector out(static_cast(length) * (head_dim / 2) * 4); + std::vector freqs(quarter); + for (int i = 0; i < quarter; ++i) { + freqs[i] = 1.0f / std::pow(10000.0f, static_cast(2 * i) / static_cast(dim)); + } + for (int y = 0; y < side; ++y) { + for (int x = 0; x < side; ++x) { + int pos = y * side + x; + size_t base = static_cast(pos) * (head_dim 
/ 2) * 4; + for (int i = 0; i < quarter; ++i) { + float ay = y * freqs[i]; + float ax = x * freqs[i]; + float angles[2] = {ay, ax}; + for (int axis = 0; axis < 2; ++axis) { + int j = axis * quarter + i; + out[base + 4 * j] = std::cos(angles[axis]); + out[base + 4 * j + 1] = -std::sin(angles[axis]); + out[base + 4 * j + 2] = std::sin(angles[axis]); + out[base + 4 * j + 3] = std::cos(angles[axis]); + } + } + } + } + return out; + } + + struct SwiGLUMlp : public GGMLBlock { + SwiGLUMlp(int64_t in_features, int64_t hidden_features) { + int64_t hidden_dim = ((hidden_features + 7) / 8) * 8; + blocks["w1"] = std::make_shared(in_features, hidden_dim, false); + blocks["w3"] = std::make_shared(in_features, hidden_dim, false); + blocks["w2"] = std::make_shared(hidden_dim, in_features, false); + } + + ggml_tensor* forward(GGMLRunnerContext* ctx, ggml_tensor* x) { + auto w1 = std::dynamic_pointer_cast(blocks["w1"]); + auto w3 = std::dynamic_pointer_cast(blocks["w3"]); + auto w2 = std::dynamic_pointer_cast(blocks["w2"]); + auto gate = ggml_silu(ctx->ggml_ctx, w1->forward(ctx, x)); + auto up = w3->forward(ctx, x); + return w2->forward(ctx, ggml_mul(ctx->ggml_ctx, gate, up)); + } + }; + + struct BottleneckPatchEmbed : public GGMLBlock { + int64_t patch_size; + + BottleneckPatchEmbed(int64_t patch_size, int64_t in_channels, int64_t pca_channels, int64_t hidden_size) + : patch_size(patch_size) { + blocks["proj1"] = std::make_shared(in_channels, + pca_channels, + std::pair{static_cast(patch_size), static_cast(patch_size)}, + std::pair{static_cast(patch_size), static_cast(patch_size)}, + std::pair{0, 0}, + std::pair{1, 1}, + false); + blocks["proj2"] = std::make_shared(pca_channels, + hidden_size, + std::pair{1, 1}, + std::pair{1, 1}, + std::pair{0, 0}, + std::pair{1, 1}, + true); + } + + ggml_tensor* forward(GGMLRunnerContext* ctx, ggml_tensor* x) { + auto proj1 = std::dynamic_pointer_cast(blocks["proj1"]); + auto proj2 = std::dynamic_pointer_cast(blocks["proj2"]); + x = 
proj1->forward(ctx, x); + x = proj2->forward(ctx, x); + x = ggml_reshape_3d(ctx->ggml_ctx, x, x->ne[0] * x->ne[1], x->ne[2], x->ne[3]); + x = ggml_cont(ctx->ggml_ctx, ggml_permute(ctx->ggml_ctx, x, 1, 0, 2, 3)); + return x; + } + }; + + struct TimestepEmbedder : public GGMLBlock { + int frequency_embedding_size; + + TimestepEmbedder(int64_t hidden_size, int frequency_embedding_size = 256) + : frequency_embedding_size(frequency_embedding_size) { + blocks["mlp.0"] = std::make_shared(frequency_embedding_size, hidden_size, true, true); + blocks["mlp.2"] = std::make_shared(hidden_size, hidden_size, true, true); + } + + ggml_tensor* forward(GGMLRunnerContext* ctx, ggml_tensor* t) { + auto mlp_0 = std::dynamic_pointer_cast(blocks["mlp.0"]); + auto mlp_2 = std::dynamic_pointer_cast(blocks["mlp.2"]); + auto t_emb = ggml_ext_timestep_embedding(ctx->ggml_ctx, t, frequency_embedding_size, 10000, 1.0f); + t_emb = mlp_0->forward(ctx, t_emb); + t_emb = ggml_silu_inplace(ctx->ggml_ctx, t_emb); + return mlp_2->forward(ctx, t_emb); + } + }; + + inline std::vector split_qkv(ggml_context* ctx, ggml_tensor* qkv, int64_t num_heads, int64_t head_dim) { + int64_t N = qkv->ne[2]; + int64_t L = qkv->ne[1]; + auto q = ggml_view_4d(ctx, qkv, head_dim, num_heads, L, N, + qkv->nb[0] * head_dim, qkv->nb[1], qkv->nb[2], 0); + auto k = ggml_view_4d(ctx, qkv, head_dim, num_heads, L, N, + qkv->nb[0] * head_dim, qkv->nb[1], qkv->nb[2], qkv->nb[0] * head_dim * num_heads); + auto v = ggml_view_4d(ctx, qkv, head_dim, num_heads, L, N, + qkv->nb[0] * head_dim, qkv->nb[1], qkv->nb[2], qkv->nb[0] * head_dim * num_heads * 2); + return {q, k, v}; + } + + struct PlainTextTransformerBlock : public GGMLBlock { + int64_t num_heads; + int64_t head_dim; + + PlainTextTransformerBlock(int64_t hidden_size, int64_t num_heads, int64_t head_dim, float mlp_ratio) + : num_heads(num_heads), head_dim(head_dim) { + int64_t inner_dim = num_heads * head_dim; + blocks["norm1"] = std::make_shared(hidden_size, 1e-6f); + 
blocks["norm2"] = std::make_shared(hidden_size, 1e-6f); + blocks["qkv"] = std::make_shared(hidden_size, inner_dim * 3, true); + blocks["attn_proj"] = std::make_shared(inner_dim, hidden_size, true); + blocks["mlp"] = std::make_shared(hidden_size, static_cast(hidden_size * mlp_ratio)); + blocks["q_norm"] = std::make_shared(head_dim, 1e-6f); + blocks["k_norm"] = std::make_shared(head_dim, 1e-6f); + } + + ggml_tensor* forward(GGMLRunnerContext* ctx, ggml_tensor* txt, ggml_tensor* pe) { + auto norm1 = std::dynamic_pointer_cast(blocks["norm1"]); + auto norm2 = std::dynamic_pointer_cast(blocks["norm2"]); + auto qkv_proj = std::dynamic_pointer_cast(blocks["qkv"]); + auto attn_proj = std::dynamic_pointer_cast(blocks["attn_proj"]); + auto mlp = std::dynamic_pointer_cast(blocks["mlp"]); + auto q_norm = std::dynamic_pointer_cast(blocks["q_norm"]); + auto k_norm = std::dynamic_pointer_cast(blocks["k_norm"]); + + auto qkv = split_qkv(ctx->ggml_ctx, qkv_proj->forward(ctx, norm1->forward(ctx, txt)), num_heads, head_dim); + auto q = q_norm->forward(ctx, qkv[0]); + auto k = k_norm->forward(ctx, qkv[1]); + auto v = qkv[2]; + auto out = Rope::attention(ctx, q, k, v, pe, nullptr, 1.0f, false); + txt = ggml_add(ctx->ggml_ctx, txt, attn_proj->forward(ctx, out)); + txt = ggml_add(ctx->ggml_ctx, txt, mlp->forward(ctx, norm2->forward(ctx, txt))); + return txt; + } + }; + + struct DoubleStreamDiTBlock : public GGMLBlock { + int64_t num_heads; + int64_t head_dim; + + DoubleStreamDiTBlock(int64_t hidden_size, int64_t txt_hidden_size, int64_t num_heads, int64_t head_dim, float mlp_ratio) + : num_heads(num_heads), head_dim(head_dim) { + int64_t inner_dim = num_heads * head_dim; + blocks["img_norm1"] = std::make_shared(hidden_size, 1e-6f); + blocks["img_norm2"] = std::make_shared(hidden_size, 1e-6f); + blocks["txt_norm1"] = std::make_shared(txt_hidden_size, 1e-6f); + blocks["txt_norm2"] = std::make_shared(txt_hidden_size, 1e-6f); + blocks["img_qkv"] = std::make_shared(hidden_size, inner_dim * 3, 
true); + blocks["txt_qkv"] = std::make_shared(txt_hidden_size, inner_dim * 3, true); + blocks["q_norm"] = std::make_shared(head_dim, 1e-6f); + blocks["k_norm"] = std::make_shared(head_dim, 1e-6f); + blocks["img_attn_proj"] = std::make_shared(inner_dim, hidden_size, true); + blocks["txt_attn_proj"] = std::make_shared(inner_dim, txt_hidden_size, true); + blocks["img_mlp"] = std::make_shared(hidden_size, static_cast(hidden_size * mlp_ratio)); + blocks["txt_mlp"] = std::make_shared(txt_hidden_size, static_cast(txt_hidden_size * mlp_ratio)); + } + + std::pair forward(GGMLRunnerContext* ctx, + ggml_tensor* img, + ggml_tensor* txt, + ggml_tensor* pe) { + auto img_norm1 = std::dynamic_pointer_cast(blocks["img_norm1"]); + auto img_norm2 = std::dynamic_pointer_cast(blocks["img_norm2"]); + auto txt_norm1 = std::dynamic_pointer_cast(blocks["txt_norm1"]); + auto txt_norm2 = std::dynamic_pointer_cast(blocks["txt_norm2"]); + auto img_qkv_p = std::dynamic_pointer_cast(blocks["img_qkv"]); + auto txt_qkv_p = std::dynamic_pointer_cast(blocks["txt_qkv"]); + auto q_norm = std::dynamic_pointer_cast(blocks["q_norm"]); + auto k_norm = std::dynamic_pointer_cast(blocks["k_norm"]); + auto img_proj = std::dynamic_pointer_cast(blocks["img_attn_proj"]); + auto txt_proj = std::dynamic_pointer_cast(blocks["txt_attn_proj"]); + auto img_mlp = std::dynamic_pointer_cast(blocks["img_mlp"]); + auto txt_mlp = std::dynamic_pointer_cast(blocks["txt_mlp"]); + + int64_t li = img->ne[1]; + int64_t lt = txt->ne[1]; + + auto img_qkv = split_qkv(ctx->ggml_ctx, img_qkv_p->forward(ctx, img_norm1->forward(ctx, img)), num_heads, head_dim); + auto txt_qkv = split_qkv(ctx->ggml_ctx, txt_qkv_p->forward(ctx, txt_norm1->forward(ctx, txt)), num_heads, head_dim); + + auto q = ggml_concat(ctx->ggml_ctx, q_norm->forward(ctx, txt_qkv[0]), q_norm->forward(ctx, img_qkv[0]), 2); + auto k = ggml_concat(ctx->ggml_ctx, k_norm->forward(ctx, txt_qkv[1]), k_norm->forward(ctx, img_qkv[1]), 2); + auto v = ggml_concat(ctx->ggml_ctx, 
txt_qkv[2], img_qkv[2], 2); + + auto out = Rope::attention(ctx, q, k, v, pe, nullptr, 1.0f, false); + auto out_txt = ggml_ext_slice(ctx->ggml_ctx, out, 1, 0, lt); + auto out_img = ggml_ext_slice(ctx->ggml_ctx, out, 1, lt, lt + li); + + img = ggml_add(ctx->ggml_ctx, img, img_proj->forward(ctx, out_img)); + txt = ggml_add(ctx->ggml_ctx, txt, txt_proj->forward(ctx, out_txt)); + img = ggml_add(ctx->ggml_ctx, img, img_mlp->forward(ctx, img_norm2->forward(ctx, img))); + txt = ggml_add(ctx->ggml_ctx, txt, txt_mlp->forward(ctx, txt_norm2->forward(ctx, txt))); + return {img, txt}; + } + }; + + struct FinalLayer : public GGMLBlock { + FinalLayer(int64_t hidden_size, int64_t patch_size, int64_t out_channels) { + blocks["norm_final"] = std::make_shared(hidden_size, 1e-6f); + blocks["linear"] = std::make_shared(hidden_size, patch_size * patch_size * out_channels, true); + } + + ggml_tensor* forward(GGMLRunnerContext* ctx, ggml_tensor* x) { + auto norm_final = std::dynamic_pointer_cast(blocks["norm_final"]); + auto linear = std::dynamic_pointer_cast(blocks["linear"]); + return linear->forward(ctx, norm_final->forward(ctx, x)); + } + }; + + struct MMJiT : public GGMLBlock { + MiniT2IConfig config; + + MMJiT(const MiniT2IConfig& config) + : config(config) { + blocks["img_embedder"] = std::make_shared(config.patch_size, config.in_channels, config.pca_channels, config.hidden_size); + blocks["txt_embedder"] = std::make_shared(config.txt_input_size, config.txt_hidden_size, false); + blocks["t_embedder"] = std::make_shared(config.cond_vec_size); + blocks["pooled_embedder"] = std::make_shared(config.txt_input_size, config.cond_vec_size, false); + for (int64_t i = 0; i < config.txt_preamble_depth; ++i) { + blocks["txt_preamble_blocks." + std::to_string(i)] = std::make_shared(config.txt_hidden_size, config.num_heads, config.head_dim, config.mlp_ratio); + } + for (int64_t i = 0; i < config.depth_double; ++i) { + blocks["double_blocks." 
+ std::to_string(i)] = std::make_shared(config.hidden_size, config.txt_hidden_size, config.num_heads, config.head_dim, config.mlp_ratio); + } + blocks["final_layer"] = std::make_shared(config.hidden_size, config.patch_size, config.in_channels); + } + + void init_params(ggml_context* ctx, const String2TensorStorage& tensor_storage_map = {}, const std::string prefix = "") override { + GGMLBlock::init_params(ctx, tensor_storage_map, prefix); + enum ggml_type wtype = get_type(prefix + "mask_token", tensor_storage_map, GGML_TYPE_F32); + params["mask_token"] = ggml_new_tensor_3d(ctx, wtype, config.txt_input_size, 1, 1); + } + + ggml_tensor* apply_text_mask(GGMLRunnerContext* ctx, ggml_tensor* context, ggml_tensor* mask) { + if (mask == nullptr) { + return context; + } + mask = ggml_reshape_3d(ctx->ggml_ctx, mask, 1, mask->ne[0], mask->ne[1]); + mask = ggml_repeat(ctx->ggml_ctx, mask, context); + auto keep = ggml_mul(ctx->ggml_ctx, context, mask); + auto inv = ggml_sub(ctx->ggml_ctx, ggml_ext_ones_like(ctx->ggml_ctx, mask), mask); + auto mask_token = ggml_repeat(ctx->ggml_ctx, params["mask_token"], context); + return ggml_add(ctx->ggml_ctx, keep, ggml_mul(ctx->ggml_ctx, mask_token, inv)); + } + + ggml_tensor* pool_context(GGMLRunnerContext* ctx, ggml_tensor* context) { + int64_t dim = context->ne[0]; + int64_t len = context->ne[1]; + int64_t N = context->ne[2]; + auto x = ggml_cont(ctx->ggml_ctx, ggml_permute(ctx->ggml_ctx, context, 1, 0, 2, 3)); + x = ggml_reshape_3d(ctx->ggml_ctx, x, len, dim, N); + x = ggml_mean(ctx->ggml_ctx, x); + x = ggml_reshape_2d(ctx->ggml_ctx, x, dim, N); + return x; + } + + ggml_tensor* forward(GGMLRunnerContext* ctx, + ggml_tensor* img, + ggml_tensor* t, + ggml_tensor* context, + ggml_tensor* mask, + ggml_tensor* pos_embed, + ggml_tensor* txt_pe, + ggml_tensor* joint_pe) { + auto img_embedder = std::dynamic_pointer_cast(blocks["img_embedder"]); + auto txt_embedder = std::dynamic_pointer_cast(blocks["txt_embedder"]); + auto t_embedder = 
std::dynamic_pointer_cast(blocks["t_embedder"]); + auto pooled_embedder = std::dynamic_pointer_cast(blocks["pooled_embedder"]); + auto final_layer = std::dynamic_pointer_cast(blocks["final_layer"]); + + int64_t W = img->ne[0]; + int64_t H = img->ne[1]; + int64_t hp = H / config.patch_size; + int64_t wp = W / config.patch_size; + + context = apply_text_mask(ctx, context, mask); + auto x = img_embedder->forward(ctx, img); + x = ggml_add(ctx->ggml_ctx, x, pos_embed); + + auto t_vec = t_embedder->forward(ctx, t); + auto pooled_text = pool_context(ctx, context); + auto vec = ggml_add(ctx->ggml_ctx, t_vec, pooled_embedder->forward(ctx, pooled_text)); + SD_UNUSED(vec); + + auto txt = txt_embedder->forward(ctx, context); + for (int64_t i = 0; i < config.txt_preamble_depth; ++i) { + auto block = std::dynamic_pointer_cast(blocks["txt_preamble_blocks." + std::to_string(i)]); + txt = block->forward(ctx, txt, txt_pe); + sd::ggml_graph_cut::mark_graph_cut(txt, "minit2i.txt_preamble_blocks." + std::to_string(i), "txt"); + } + for (int64_t i = 0; i < config.depth_double; ++i) { + auto block = std::dynamic_pointer_cast(blocks["double_blocks." + std::to_string(i)]); + auto out = block->forward(ctx, x, txt, joint_pe); + x = out.first; + txt = out.second; + sd::ggml_graph_cut::mark_graph_cut(x, "minit2i.double_blocks." + std::to_string(i), "x"); + sd::ggml_graph_cut::mark_graph_cut(txt, "minit2i.double_blocks." 
+ std::to_string(i), "txt"); + } + auto combined = ggml_concat(ctx->ggml_ctx, txt, x, 1); + auto out = final_layer->forward(ctx, combined); + auto img_out = ggml_ext_slice(ctx->ggml_ctx, out, 1, txt->ne[1], txt->ne[1] + x->ne[1]); + return DiT::unpatchify(ctx->ggml_ctx, img_out, hp, wp, static_cast(config.patch_size), static_cast(config.patch_size), false); + } + }; + + inline std::string resolve_prefix(const String2TensorStorage& tensor_storage_map, const std::string& requested) { + if (!requested.empty() && tensor_storage_map.find(requested + ".img_embedder.proj1.weight") != tensor_storage_map.end()) { + return requested; + } + static const std::vector candidates = { + "model.net", + "model.diffusion_model.net", + "model.diffusion_model.model.net", + }; + for (const auto& candidate : candidates) { + if (tensor_storage_map.find(candidate + ".img_embedder.proj1.weight") != tensor_storage_map.end()) { + return candidate; + } + } + return requested.empty() ? "model.net" : requested; + } + + struct MiniT2IRunner : public DiffusionModelRunner { + MiniT2IConfig config; + MMJiT model; + std::vector pos_embed_vec; + std::vector txt_pe_vec; + std::vector joint_pe_vec; + + MiniT2IRunner(ggml_backend_t backend, + const String2TensorStorage& tensor_storage_map = {}, + const std::string prefix = "", + std::shared_ptr weight_manager = nullptr) + : DiffusionModelRunner(backend, resolve_prefix(tensor_storage_map, prefix), weight_manager), + config(MiniT2IConfig::detect_from_weights(tensor_storage_map, this->prefix)), + model(config) { + model.init(params_ctx, tensor_storage_map, this->prefix); + } + + std::string get_desc() override { + return "MiniT2I"; + } + + void get_param_tensors(std::map& tensors, const std::string& prefix) override { + model.get_param_tensors(tensors, prefix); + } + + ggml_cgraph* build_graph(const sd::Tensor& x_tensor, + const sd::Tensor& timesteps_tensor, + const sd::Tensor& context_tensor, + const sd::Tensor& mask_tensor) { + ggml_cgraph* gf = 
new_graph_custom(MINIT2I_GRAPH_SIZE); + ggml_tensor* x = make_input(x_tensor); + ggml_tensor* timesteps = make_input(timesteps_tensor); + ggml_tensor* context = make_input(context_tensor); + ggml_tensor* mask = make_input(mask_tensor); + + int64_t W = x->ne[0]; + int64_t H = x->ne[1]; + int64_t img_side = H / config.patch_size; + int64_t txt_len = context->ne[1]; + + pos_embed_vec = make_2d_sincos_pos_embed(static_cast(img_side), static_cast(config.hidden_size)); + auto pos_embed = ggml_new_tensor_3d(compute_ctx, GGML_TYPE_F32, config.hidden_size, img_side * img_side, 1); + set_backend_tensor_data(pos_embed, pos_embed_vec.data()); + + txt_pe_vec = make_text_rope(static_cast(txt_len), static_cast(config.head_dim)); + auto txt_pe = ggml_new_tensor_4d(compute_ctx, GGML_TYPE_F32, 2, 2, config.head_dim / 2, txt_len); + set_backend_tensor_data(txt_pe, txt_pe_vec.data()); + + auto img_pe_vec = make_vision_rope(static_cast(img_side), static_cast(config.head_dim)); + joint_pe_vec = txt_pe_vec; + joint_pe_vec.insert(joint_pe_vec.end(), img_pe_vec.begin(), img_pe_vec.end()); + auto joint_pe = ggml_new_tensor_4d(compute_ctx, GGML_TYPE_F32, 2, 2, config.head_dim / 2, txt_len + img_side * img_side); + set_backend_tensor_data(joint_pe, joint_pe_vec.data()); + + auto runner_ctx = get_context(); + auto out = model.forward(&runner_ctx, x, timesteps, context, mask, pos_embed, txt_pe, joint_pe); + ggml_build_forward_expand(gf, out); + return gf; + } + + sd::Tensor compute(int n_threads, + const sd::Tensor& x, + const sd::Tensor& timesteps, + const sd::Tensor& context, + const sd::Tensor& mask) { + auto get_graph = [&]() -> ggml_cgraph* { + return build_graph(x, timesteps, context, mask); + }; + return restore_trailing_singleton_dims(GGMLRunner::compute(get_graph, n_threads, false, false, false), x.dim()); + } + + sd::Tensor compute(int n_threads, + const DiffusionParams& diffusion_params) override { + GGML_ASSERT(diffusion_params.x != nullptr); + GGML_ASSERT(diffusion_params.timesteps 
!= nullptr); + GGML_ASSERT(diffusion_params.context != nullptr); + GGML_ASSERT(diffusion_params.y != nullptr); + return compute(n_threads, + *diffusion_params.x, + *diffusion_params.timesteps, + *diffusion_params.context, + *diffusion_params.y); + } + }; +} // namespace MiniT2I + +#endif // __SD_MODEL_DIFFUSION_MINIT2I_HPP__ diff --git a/src/model/te/t5.hpp b/src/model/te/t5.hpp index 7a92ec577..c7cfef2df 100644 --- a/src/model/te/t5.hpp +++ b/src/model/te/t5.hpp @@ -23,19 +23,72 @@ struct T5Config { int64_t vocab_size = 32128; bool relative_attention = true; - static T5Config detect_from_weights(const String2TensorStorage& tensor_storage_map, - const std::string& prefix, - bool is_umt5 = false) { - (void)tensor_storage_map; - (void)prefix; - T5Config config; - if (is_umt5) { - config.vocab_size = 256384; - config.relative_attention = false; - } - return config; - } -}; + static T5Config detect_from_weights(const String2TensorStorage& tensor_storage_map, + const std::string& prefix, + bool is_umt5 = false) { + T5Config config; + if (is_umt5) { + config.vocab_size = 256384; + config.relative_attention = false; + } + auto find_tensor = [&](const std::string& suffix) -> const TensorStorage* { + auto it = tensor_storage_map.find(prefix + "." + suffix); + if (it != tensor_storage_map.end()) { + return &it->second; + } + it = tensor_storage_map.find(prefix + suffix); + if (it != tensor_storage_map.end()) { + return &it->second; + } + return nullptr; + }; + + if (const TensorStorage* shared = find_tensor("shared.weight")) { + if (shared->n_dims == 2) { + config.vocab_size = shared->ne[1]; + config.model_dim = shared->ne[0]; + } + } + if (const TensorStorage* q = find_tensor("encoder.block.0.layer.0.SelfAttention.q.weight")) { + if (q->n_dims == 2) { + config.model_dim = q->ne[0]; + int64_t inner_dim = q->ne[1]; + // Flan-T5/T5 uses d_kv=64 for common sizes. 
+ if (inner_dim % 64 == 0) { + config.num_heads = inner_dim / 64; + } + } + } + if (const TensorStorage* wi = find_tensor("encoder.block.0.layer.1.DenseReluDense.wi_0.weight")) { + if (wi->n_dims == 2) { + config.model_dim = wi->ne[0]; + config.ff_dim = wi->ne[1]; + } + } + int64_t detected_layers = 0; + for (const auto& [name, _] : tensor_storage_map) { + std::string base = prefix; + if (!base.empty() && base.back() != '.') { + base += "."; + } + std::string layer_prefix = base + "encoder.block."; + if (!starts_with(name, layer_prefix)) { + continue; + } + size_t pos = layer_prefix.size(); + size_t dot = name.find('.', pos); + if (dot == std::string::npos) { + continue; + } + int64_t layer = atoi(name.substr(pos, dot - pos).c_str()); + detected_layers = std::max(detected_layers, layer + 1); + } + if (detected_layers > 0) { + config.num_layers = detected_layers; + } + return config; + } +}; class T5LayerNorm : public UnaryBlock { protected: diff --git a/src/model/vae/vae.hpp b/src/model/vae/vae.hpp index 34a0d9663..8b8c46ded 100644 --- a/src/model/vae/vae.hpp +++ b/src/model/vae/vae.hpp @@ -78,7 +78,7 @@ struct VAE : public GGMLRunner { scale_factor = 16; } else if (sd_version_uses_flux2_vae(version)) { scale_factor = 16; - } else if (version == VERSION_CHROMA_RADIANCE || version == VERSION_HIDREAM_O1) { + } else if (version == VERSION_CHROMA_RADIANCE || version == VERSION_HIDREAM_O1 || sd_version_is_minit2i(version)) { scale_factor = 1; } return scale_factor; diff --git a/src/model_loader.cpp b/src/model_loader.cpp index c239e22d2..9c702897e 100644 --- a/src/model_loader.cpp +++ b/src/model_loader.cpp @@ -470,6 +470,17 @@ SDVersion ModelLoader::get_sd_version() { tensor_storage_map.find("model.diffusion_model.transformer_blocks.0.img_mlp.w1.weight") != tensor_storage_map.end()) { return VERSION_LENS; } + if ((tensor_storage_map.find("model.net.img_embedder.proj1.weight") != tensor_storage_map.end() && + 
tensor_storage_map.find("model.net.double_blocks.0.img_qkv.weight") != tensor_storage_map.end() && + tensor_storage_map.find("model.net.txt_embedder.weight") != tensor_storage_map.end()) || + (tensor_storage_map.find("model.diffusion_model.net.img_embedder.proj1.weight") != tensor_storage_map.end() && + tensor_storage_map.find("model.diffusion_model.net.double_blocks.0.img_qkv.weight") != tensor_storage_map.end() && + tensor_storage_map.find("model.diffusion_model.net.txt_embedder.weight") != tensor_storage_map.end()) || + (tensor_storage_map.find("model.diffusion_model.model.net.img_embedder.proj1.weight") != tensor_storage_map.end() && + tensor_storage_map.find("model.diffusion_model.model.net.double_blocks.0.img_qkv.weight") != tensor_storage_map.end() && + tensor_storage_map.find("model.diffusion_model.model.net.txt_embedder.weight") != tensor_storage_map.end())) { + return VERSION_MINIT2I; + } if (tensor_storage.name.find("model.diffusion_model.transformer_blocks.0.img_mod.1.weight") != std::string::npos) { return VERSION_QWEN_IMAGE; } diff --git a/src/stable-diffusion.cpp b/src/stable-diffusion.cpp index a1623252c..2ae2e9651 100644 --- a/src/stable-diffusion.cpp +++ b/src/stable-diffusion.cpp @@ -29,6 +29,7 @@ #include "model/diffusion/krea2.hpp" #include "model/diffusion/lens.hpp" #include "model/diffusion/ltxv.hpp" +#include "model/diffusion/minit2i.hpp" #include "model/diffusion/mmdit.hpp" #include "model/diffusion/model.hpp" #include "model/diffusion/pid.hpp" @@ -93,6 +94,7 @@ const char* model_version_to_str[] = { "Ovis Image", "Ernie Image", "Lens", + "MiniT2I", "Longcat-Image", "PiD", "Ideogram 4", @@ -785,6 +787,14 @@ class StableDiffusionGGML { tensor_storage_map, "model", model_manager); + } else if (sd_version_is_minit2i(version)) { + cond_stage_model = std::make_shared(backend_for(SDBackendModule::TE), + tensor_storage_map, + model_manager); + diffusion_model = std::make_shared(backend_for(SDBackendModule::DIFFUSION), + tensor_storage_map, + "", + 
model_manager); } else if (sd_version_is_anima(version)) { cond_stage_model = std::make_shared(backend_for(SDBackendModule::TE), tensor_storage_map, @@ -958,7 +968,7 @@ class StableDiffusionGGML { } }; - if (version == VERSION_CHROMA_RADIANCE || version == VERSION_HIDREAM_O1) { + if (version == VERSION_CHROMA_RADIANCE || version == VERSION_HIDREAM_O1 || sd_version_is_minit2i(version)) { LOG_INFO("using FakeVAE"); first_stage_model = std::make_shared(version, backend_for(SDBackendModule::VAE), @@ -2032,11 +2042,93 @@ class StableDiffusionGGML { } int64_t last_progress_us = ggml_time_us(); + SamplePreviewContext preview = prepare_sample_preview_context(); + + if (sd_version_is_minit2i(version)) { + if (noise.empty()) { + LOG_ERROR("MiniT2I sampling requires initial noise"); + return {}; + } + if (cond.c_crossattn.empty() || cond.c_vector.empty()) { + LOG_ERROR("MiniT2I requires T5 hidden states and prompt mask"); + return {}; + } + size_t minit2i_steps = steps > 0 ? steps : 100; + sd::Tensor x_t = noise * 2.0f; + sd::Tensor denoised = x_t; + sd::Tensor uncond_mask = sd::Tensor::zeros_like(cond.c_vector); + + auto run_minit2i = [&](const sd::Tensor& x, + float t_value, + const sd::Tensor& mask) -> sd::Tensor { + int64_t batch = x.dim() >= 4 ? 
x.shape()[3] : 1; + if (batch <= 0) { + LOG_ERROR("MiniT2I got invalid input shape for sampling"); + return {}; + } + LOG_DEBUG("MiniT2I sampling input shape: dim=%" PRId64 ", batch=%" PRId64, + x.dim(), + batch); + std::vector t_vec(static_cast(batch), t_value); + const int64_t t_vec_size = static_cast(t_vec.size()); + sd::Tensor timesteps_tensor({t_vec_size}, std::move(t_vec)); + DiffusionParams diffusion_params; + diffusion_params.x = &x; + diffusion_params.timesteps = &timesteps_tensor; + diffusion_params.context = &cond.c_crossattn; + diffusion_params.y = &mask; + auto out = work_diffusion_model->compute(n_threads, diffusion_params); + if (out.empty()) { + LOG_ERROR("MiniT2I diffusion model compute failed"); + return {}; + } + return out; + }; + + pretty_progress(0, static_cast(minit2i_steps), 0); + last_progress_us = ggml_time_us(); + for (size_t i = 0; i < minit2i_steps; ++i) { + if (get_cancel_flag() == SD_CANCEL_ALL) { + LOG_DEBUG("cancelling generation"); + return {}; + } + float t_cur = static_cast(i) / static_cast(minit2i_steps); + float t_next = static_cast(i + 1) / static_cast(minit2i_steps); + + if (sd_should_preview_noisy() && preview.callback != nullptr) { + preview_image(static_cast(i + 1), x_t, version, preview.mode, preview.callback, preview.data, true); + } + + auto cond_x0 = run_minit2i(x_t, t_cur, cond.c_vector); + if (cond_x0.empty()) { + return {}; + } + auto uncond_x0 = run_minit2i(x_t, t_cur, uncond_mask); + if (uncond_x0.empty()) { + return {}; + } + float denom = std::max(1.0f - t_cur, 0.001f); + auto cond_v = (cond_x0 - x_t) / denom; + auto uncond_v = (uncond_x0 - x_t) / denom; + auto v = uncond_v + (cond_v - uncond_v) * cfg_scale; + x_t += v * (t_next - t_cur); + denoised = x_t; + + if (sd_should_preview_denoised() && preview.callback != nullptr) { + preview_image(static_cast(i + 1), denoised, version, preview.mode, preview.callback, preview.data, false); + } + report_sample_progress(static_cast(i + 1), minit2i_steps, &last_progress_us); +
} + if (work_diffusion_model) { + work_diffusion_model->free_compute_buffer(); + } + return denoised; + } + sd::Tensor x_t = !noise.empty() ? denoiser->noise_scaling(sigmas[0], noise, init_latent) : init_latent; sd::Tensor denoised = x_t; - SamplePreviewContext preview = prepare_sample_preview_context(); auto denoise = [&](const sd::Tensor& x, float sigma, int step) -> sd::guidance::GuiderOutput { if (get_cancel_flag() == SD_CANCEL_ALL) { @@ -2335,6 +2427,8 @@ class StableDiffusionGGML { latent_channel = 3; } else if (version == VERSION_CHROMA_RADIANCE) { latent_channel = 3; + } else if (sd_version_is_minit2i(version)) { + latent_channel = 3; } else if (sd_version_is_pid(version)) { latent_channel = 3; } else if (sd_version_is_sefi_image(version)) { @@ -2416,7 +2510,7 @@ class StableDiffusionGGML { } sd::Tensor decode_first_stage(const sd::Tensor& x, bool decode_video = false) { - if (sd_version_is_pid(version)) { + if (sd_version_is_pid(version) || sd_version_is_minit2i(version)) { return sd::ops::clamp((x + 1.f) * 0.5f, 0.0f, 1.0f); } auto latents = first_stage_model->diffusion_to_vae_latents(x); From 9153c16a5545c0cad500d9a94a2148e3ac453576 Mon Sep 17 00:00:00 2001 From: KenForever1 <2962666398@qq.com> Date: Fri, 19 Jun 2026 15:41:20 +0800 Subject: [PATCH 2/7] Optimize MiniT2I position cache Cache MiniT2I positional embeddings and text/vision RoPE tensors in a runner-level backend buffer. This avoids regenerating and uploading the same step-invariant constants for every denoise graph while preserving model batch semantics. 
--- src/model/diffusion/minit2i.hpp | 98 +++++++++++++++++++++++++++------ 1 file changed, 80 insertions(+), 18 deletions(-) diff --git a/src/model/diffusion/minit2i.hpp b/src/model/diffusion/minit2i.hpp index d69f1f4ac..ea1675273 100644 --- a/src/model/diffusion/minit2i.hpp +++ b/src/model/diffusion/minit2i.hpp @@ -487,9 +487,15 @@ namespace MiniT2I { struct MiniT2IRunner : public DiffusionModelRunner { MiniT2IConfig config; MMJiT model; - std::vector pos_embed_vec; - std::vector txt_pe_vec; - std::vector joint_pe_vec; + ggml_context* position_cache_ctx = nullptr; + ggml_backend_buffer_t position_cache_buffer = nullptr; + ggml_tensor* cached_pos_embed = nullptr; + ggml_tensor* cached_txt_pe = nullptr; + ggml_tensor* cached_joint_pe = nullptr; + int64_t cached_img_side = -1; + int64_t cached_txt_len = -1; + int64_t cached_hidden_size = -1; + int64_t cached_head_dim = -1; MiniT2IRunner(ggml_backend_t backend, const String2TensorStorage& tensor_storage_map = {}, @@ -501,6 +507,10 @@ namespace MiniT2I { model.init(params_ctx, tensor_storage_map, this->prefix); } + ~MiniT2IRunner() override { + free_position_cache(); + } + std::string get_desc() override { return "MiniT2I"; } @@ -509,6 +519,71 @@ namespace MiniT2I { model.get_param_tensors(tensors, prefix); } + void free_position_cache() { + if (position_cache_buffer != nullptr) { + ggml_backend_buffer_free(position_cache_buffer); + position_cache_buffer = nullptr; + } + if (position_cache_ctx != nullptr) { + ggml_free(position_cache_ctx); + position_cache_ctx = nullptr; + } + cached_pos_embed = nullptr; + cached_txt_pe = nullptr; + cached_joint_pe = nullptr; + cached_img_side = -1; + cached_txt_len = -1; + cached_hidden_size = -1; + cached_head_dim = -1; + } + + void ensure_position_cache(int64_t img_side, int64_t txt_len) { + if (cached_img_side == img_side && + cached_txt_len == txt_len && + cached_hidden_size == config.hidden_size && + cached_head_dim == config.head_dim && + cached_pos_embed != nullptr && + 
cached_txt_pe != nullptr && + cached_joint_pe != nullptr) { + return; + } + + free_position_cache(); + + auto pos_embed_vec = make_2d_sincos_pos_embed(static_cast(img_side), static_cast(config.hidden_size)); + auto txt_pe_vec = make_text_rope(static_cast(txt_len), static_cast(config.head_dim)); + auto img_pe_vec = make_vision_rope(static_cast(img_side), static_cast(config.head_dim)); + auto joint_pe_vec = txt_pe_vec; + joint_pe_vec.insert(joint_pe_vec.end(), img_pe_vec.begin(), img_pe_vec.end()); + + ggml_init_params params; + params.mem_size = static_cast(3 * ggml_tensor_overhead()); + params.mem_buffer = nullptr; + params.no_alloc = true; + position_cache_ctx = ggml_init(params); + GGML_ASSERT(position_cache_ctx != nullptr); + + cached_pos_embed = ggml_new_tensor_3d(position_cache_ctx, GGML_TYPE_F32, config.hidden_size, img_side * img_side, 1); + ggml_set_name(cached_pos_embed, "minit2i.pos_embed"); + cached_txt_pe = ggml_new_tensor_4d(position_cache_ctx, GGML_TYPE_F32, 2, 2, config.head_dim / 2, txt_len); + ggml_set_name(cached_txt_pe, "minit2i.txt_pe"); + cached_joint_pe = ggml_new_tensor_4d(position_cache_ctx, GGML_TYPE_F32, 2, 2, config.head_dim / 2, txt_len + img_side * img_side); + ggml_set_name(cached_joint_pe, "minit2i.joint_pe"); + + position_cache_buffer = ggml_backend_alloc_ctx_tensors(position_cache_ctx, runtime_backend); + GGML_ASSERT(position_cache_buffer != nullptr); + ggml_backend_buffer_set_usage(position_cache_buffer, GGML_BACKEND_BUFFER_USAGE_WEIGHTS); + ggml_backend_tensor_set(cached_pos_embed, pos_embed_vec.data(), 0, ggml_nbytes(cached_pos_embed)); + ggml_backend_tensor_set(cached_txt_pe, txt_pe_vec.data(), 0, ggml_nbytes(cached_txt_pe)); + ggml_backend_tensor_set(cached_joint_pe, joint_pe_vec.data(), 0, ggml_nbytes(cached_joint_pe)); + ggml_backend_synchronize(runtime_backend); + + cached_img_side = img_side; + cached_txt_len = txt_len; + cached_hidden_size = config.hidden_size; + cached_head_dim = config.head_dim; + } + ggml_cgraph* 
build_graph(const sd::Tensor& x_tensor, const sd::Tensor& timesteps_tensor, const sd::Tensor& context_tensor, @@ -523,23 +598,10 @@ namespace MiniT2I { int64_t H = x->ne[1]; int64_t img_side = H / config.patch_size; int64_t txt_len = context->ne[1]; - - pos_embed_vec = make_2d_sincos_pos_embed(static_cast(img_side), static_cast(config.hidden_size)); - auto pos_embed = ggml_new_tensor_3d(compute_ctx, GGML_TYPE_F32, config.hidden_size, img_side * img_side, 1); - set_backend_tensor_data(pos_embed, pos_embed_vec.data()); - - txt_pe_vec = make_text_rope(static_cast(txt_len), static_cast(config.head_dim)); - auto txt_pe = ggml_new_tensor_4d(compute_ctx, GGML_TYPE_F32, 2, 2, config.head_dim / 2, txt_len); - set_backend_tensor_data(txt_pe, txt_pe_vec.data()); - - auto img_pe_vec = make_vision_rope(static_cast(img_side), static_cast(config.head_dim)); - joint_pe_vec = txt_pe_vec; - joint_pe_vec.insert(joint_pe_vec.end(), img_pe_vec.begin(), img_pe_vec.end()); - auto joint_pe = ggml_new_tensor_4d(compute_ctx, GGML_TYPE_F32, 2, 2, config.head_dim / 2, txt_len + img_side * img_side); - set_backend_tensor_data(joint_pe, joint_pe_vec.data()); + ensure_position_cache(img_side, txt_len); auto runner_ctx = get_context(); - auto out = model.forward(&runner_ctx, x, timesteps, context, mask, pos_embed, txt_pe, joint_pe); + auto out = model.forward(&runner_ctx, x, timesteps, context, mask, cached_pos_embed, cached_txt_pe, cached_joint_pe); ggml_build_forward_expand(gf, out); return gf; } From 09221cd996cddf9a3a1ea6f3303ca431194d661c Mon Sep 17 00:00:00 2001 From: KenForever1 <2962666398@qq.com> Date: Fri, 19 Jun 2026 16:29:32 +0800 Subject: [PATCH 3/7] Remove unused MiniT2I conditioning branch Drop the unused timestep and pooled-text vec path from MiniT2I graph construction. The Python reference currently passes this vec through unused block/final-layer parameters, and local validation produced identical output hashes before and after the cleanup. 
--- src/model/diffusion/minit2i.hpp | 18 +++++------------- 1 file changed, 5 insertions(+), 13 deletions(-) diff --git a/src/model/diffusion/minit2i.hpp b/src/model/diffusion/minit2i.hpp index ea1675273..e3c9dd538 100644 --- a/src/model/diffusion/minit2i.hpp +++ b/src/model/diffusion/minit2i.hpp @@ -420,17 +420,14 @@ namespace MiniT2I { ggml_tensor* forward(GGMLRunnerContext* ctx, ggml_tensor* img, - ggml_tensor* t, ggml_tensor* context, ggml_tensor* mask, ggml_tensor* pos_embed, ggml_tensor* txt_pe, ggml_tensor* joint_pe) { - auto img_embedder = std::dynamic_pointer_cast(blocks["img_embedder"]); - auto txt_embedder = std::dynamic_pointer_cast(blocks["txt_embedder"]); - auto t_embedder = std::dynamic_pointer_cast(blocks["t_embedder"]); - auto pooled_embedder = std::dynamic_pointer_cast(blocks["pooled_embedder"]); - auto final_layer = std::dynamic_pointer_cast(blocks["final_layer"]); + auto img_embedder = std::dynamic_pointer_cast(blocks["img_embedder"]); + auto txt_embedder = std::dynamic_pointer_cast(blocks["txt_embedder"]); + auto final_layer = std::dynamic_pointer_cast(blocks["final_layer"]); int64_t W = img->ne[0]; int64_t H = img->ne[1]; @@ -441,11 +438,6 @@ namespace MiniT2I { auto x = img_embedder->forward(ctx, img); x = ggml_add(ctx->ggml_ctx, x, pos_embed); - auto t_vec = t_embedder->forward(ctx, t); - auto pooled_text = pool_context(ctx, context); - auto vec = ggml_add(ctx->ggml_ctx, t_vec, pooled_embedder->forward(ctx, pooled_text)); - SD_UNUSED(vec); - auto txt = txt_embedder->forward(ctx, context); for (int64_t i = 0; i < config.txt_preamble_depth; ++i) { auto block = std::dynamic_pointer_cast(blocks["txt_preamble_blocks." 
+ std::to_string(i)]); @@ -590,9 +582,9 @@ namespace MiniT2I { const sd::Tensor& mask_tensor) { ggml_cgraph* gf = new_graph_custom(MINIT2I_GRAPH_SIZE); ggml_tensor* x = make_input(x_tensor); - ggml_tensor* timesteps = make_input(timesteps_tensor); ggml_tensor* context = make_input(context_tensor); ggml_tensor* mask = make_input(mask_tensor); + SD_UNUSED(timesteps_tensor); int64_t W = x->ne[0]; int64_t H = x->ne[1]; @@ -601,7 +593,7 @@ namespace MiniT2I { ensure_position_cache(img_side, txt_len); auto runner_ctx = get_context(); - auto out = model.forward(&runner_ctx, x, timesteps, context, mask, cached_pos_embed, cached_txt_pe, cached_joint_pe); + auto out = model.forward(&runner_ctx, x, context, mask, cached_pos_embed, cached_txt_pe, cached_joint_pe); ggml_build_forward_expand(gf, out); return gf; } From 1fc4ed3a1508f25632ee26b8e45f9fc4062c7582 Mon Sep 17 00:00:00 2001 From: KenForever1 <2962666398@qq.com> Date: Wed, 1 Jul 2026 19:56:27 +0800 Subject: [PATCH 4/7] Address MiniT2I PR review feedback - Simplify model version detection to a single representative weight check - Remove resolve_prefix; use fixed prefix with --diffusion-model - Add docs/minit2i.md and README entry --- README.md | 1 + docs/minit2i.md | 48 +++++++++++++++++++++++++++++++++ src/model/diffusion/minit2i.hpp | 19 +------------ src/model_loader.cpp | 10 +------ src/stable-diffusion.cpp | 2 +- 5 files changed, 52 insertions(+), 28 deletions(-) create mode 100644 docs/minit2i.md diff --git a/README.md b/README.md index 6b0e5ef0f..d7127bb18 100644 --- a/README.md +++ b/README.md @@ -48,6 +48,7 @@ API and command-line option may change frequently.*** - [PiD](./docs/pid.md) - [LongCat Image](./docs/longcat_image.md) - [Z-Image](./docs/z_image.md) + - [MiniT2I](./docs/minit2i.md) - [Ovis-Image](./docs/ovis_image.md) - [Anima](./docs/anima.md) - [ERNIE-Image](./docs/ernie_image.md) diff --git a/docs/minit2i.md b/docs/minit2i.md new file mode 100644 index 000000000..78c854dfa --- /dev/null +++ 
b/docs/minit2i.md @@ -0,0 +1,48 @@ +# How to Use + +MiniT2I uses a MiniT2I diffusion transformer and `google/flan-t5-large` as the text encoder. + +## Download weights + +- Download MiniT2I diffusion model + - safetensors: https://huggingface.co/MiniT2I/minit2i-b-16/tree/main/transformer (`diffusion_pytorch_model.safetensors`) +- Download flan-t5-large text encoder + - safetensors: https://huggingface.co/google/flan-t5-large/tree/main (`model.safetensors`) + +## Examples + +### Mac Metal + +``` +./bin/sd-cli \ + --backend metal \ + --diffusion-model ../models/minit2i/diffusion_pytorch_model.safetensors \ + --t5xxl ../models/flan-t5-large/model.safetensors \ + --prompt "a cat" \ + --steps 100 \ + --cfg-scale 6 \ + --width 512 \ + --height 512 \ + --seed 42 \ + --sampling-method euler \ + --rng cpu \ + --output minit2i_metal.png \ + --threads 8 +``` + +### CUDA with diffusion flash attention + +``` +./bin/sd-cli \ + --diffusion-model ../models/minit2i/diffusion_pytorch_model.safetensors \ + --t5xxl ../models/flan-t5-large/model.safetensors \ + --prompt "a cat" \ + --steps 100 \ + --cfg-scale 6 \ + --width 512 \ + --height 512 \ + --seed 42 \ + --sampling-method euler \ + --diffusion-fa \ + --output minit2i_cuda.png +``` diff --git a/src/model/diffusion/minit2i.hpp b/src/model/diffusion/minit2i.hpp index e3c9dd538..f4698a76f 100644 --- a/src/model/diffusion/minit2i.hpp +++ b/src/model/diffusion/minit2i.hpp @@ -459,23 +459,6 @@ namespace MiniT2I { } }; - inline std::string resolve_prefix(const String2TensorStorage& tensor_storage_map, const std::string& requested) { - if (!requested.empty() && tensor_storage_map.find(requested + ".img_embedder.proj1.weight") != tensor_storage_map.end()) { - return requested; - } - static const std::vector candidates = { - "model.net", - "model.diffusion_model.net", - "model.diffusion_model.model.net", - }; - for (const auto& candidate : candidates) { - if (tensor_storage_map.find(candidate + ".img_embedder.proj1.weight") != 
tensor_storage_map.end()) { - return candidate; - } - } - return requested.empty() ? "model.net" : requested; - } - struct MiniT2IRunner : public DiffusionModelRunner { MiniT2IConfig config; MMJiT model; @@ -493,7 +476,7 @@ namespace MiniT2I { const String2TensorStorage& tensor_storage_map = {}, const std::string prefix = "", std::shared_ptr weight_manager = nullptr) - : DiffusionModelRunner(backend, resolve_prefix(tensor_storage_map, prefix), weight_manager), + : DiffusionModelRunner(backend, prefix, weight_manager), config(MiniT2IConfig::detect_from_weights(tensor_storage_map, this->prefix)), model(config) { model.init(params_ctx, tensor_storage_map, this->prefix); diff --git a/src/model_loader.cpp b/src/model_loader.cpp index 9c702897e..b41187d0c 100644 --- a/src/model_loader.cpp +++ b/src/model_loader.cpp @@ -470,15 +470,7 @@ SDVersion ModelLoader::get_sd_version() { tensor_storage_map.find("model.diffusion_model.transformer_blocks.0.img_mlp.w1.weight") != tensor_storage_map.end()) { return VERSION_LENS; } - if ((tensor_storage_map.find("model.net.img_embedder.proj1.weight") != tensor_storage_map.end() && - tensor_storage_map.find("model.net.double_blocks.0.img_qkv.weight") != tensor_storage_map.end() && - tensor_storage_map.find("model.net.txt_embedder.weight") != tensor_storage_map.end()) || - (tensor_storage_map.find("model.diffusion_model.net.img_embedder.proj1.weight") != tensor_storage_map.end() && - tensor_storage_map.find("model.diffusion_model.net.double_blocks.0.img_qkv.weight") != tensor_storage_map.end() && - tensor_storage_map.find("model.diffusion_model.net.txt_embedder.weight") != tensor_storage_map.end()) || - (tensor_storage_map.find("model.diffusion_model.model.net.img_embedder.proj1.weight") != tensor_storage_map.end() && - tensor_storage_map.find("model.diffusion_model.model.net.double_blocks.0.img_qkv.weight") != tensor_storage_map.end() && - tensor_storage_map.find("model.diffusion_model.model.net.txt_embedder.weight") != 
tensor_storage_map.end())) { + if (tensor_storage.name.find("net.img_embedder.proj1.weight") != std::string::npos) { return VERSION_MINIT2I; } if (tensor_storage.name.find("model.diffusion_model.transformer_blocks.0.img_mod.1.weight") != std::string::npos) { diff --git a/src/stable-diffusion.cpp b/src/stable-diffusion.cpp index 2ae2e9651..c7f58de8d 100644 --- a/src/stable-diffusion.cpp +++ b/src/stable-diffusion.cpp @@ -793,7 +793,7 @@ class StableDiffusionGGML { model_manager); diffusion_model = std::make_shared(backend_for(SDBackendModule::DIFFUSION), tensor_storage_map, - "", + "model.diffusion_model.model.net", model_manager); } else if (sd_version_is_anima(version)) { cond_stage_model = std::make_shared(backend_for(SDBackendModule::TE), From 49c98b986454318a95b53b023f927f3e122ce216 Mon Sep 17 00:00:00 2001 From: KenForever1 <2962666398@qq.com> Date: Wed, 1 Jul 2026 20:51:32 +0800 Subject: [PATCH 5/7] Use generic sampling flow for MiniT2I Replace the standalone MiniT2I sampling branch with the shared sample_k_diffusion path: - Add MiniT2IFlowDenoiser (sigma = 1 - t, x0-prediction scalings) so the generic Euler update reproduces the reference linear-flow step - Pass the prompt mask via MiniT2IDiffusionExtra and derive the unconditional signal from a zeroed mask, letting the generic CFG guider handle classifier-free guidance - Add MINIT2I_FLOW_PRED prediction type and select the denoiser for it Output matches the previous dedicated branch (max abs pixel diff 2/255). 
--- include/stable-diffusion.h | 1 + src/model/diffusion/minit2i.hpp | 5 +- src/model/diffusion/model.hpp | 7 ++- src/runtime/denoiser.hpp | 62 +++++++++++++++++++++ src/stable-diffusion.cpp | 97 ++++++--------------------------- 5 files changed, 88 insertions(+), 84 deletions(-) diff --git a/include/stable-diffusion.h b/include/stable-diffusion.h index d5bc8a40a..c750498ba 100644 --- a/include/stable-diffusion.h +++ b/include/stable-diffusion.h @@ -84,6 +84,7 @@ enum prediction_t { FLOW_PRED, FLUX_FLOW_PRED, SEFI_FLOW_PRED, + MINIT2I_FLOW_PRED, PREDICTION_COUNT }; diff --git a/src/model/diffusion/minit2i.hpp b/src/model/diffusion/minit2i.hpp index f4698a76f..827fede88 100644 --- a/src/model/diffusion/minit2i.hpp +++ b/src/model/diffusion/minit2i.hpp @@ -597,12 +597,13 @@ namespace MiniT2I { GGML_ASSERT(diffusion_params.x != nullptr); GGML_ASSERT(diffusion_params.timesteps != nullptr); GGML_ASSERT(diffusion_params.context != nullptr); - GGML_ASSERT(diffusion_params.y != nullptr); + const auto* extra = diffusion_extra_as(diffusion_params); + GGML_ASSERT(extra->mask != nullptr); return compute(n_threads, *diffusion_params.x, *diffusion_params.timesteps, *diffusion_params.context, - *diffusion_params.y); + *extra->mask); } }; } // namespace MiniT2I diff --git a/src/model/diffusion/model.hpp b/src/model/diffusion/model.hpp index 2e143fe4c..cd44e3b50 100644 --- a/src/model/diffusion/model.hpp +++ b/src/model/diffusion/model.hpp @@ -52,6 +52,10 @@ struct LTXAVDiffusionExtra { const sd::Tensor* video_positions = nullptr; }; +struct MiniT2IDiffusionExtra { + const sd::Tensor* mask = nullptr; +}; + using DiffusionExtraParams = std::variant; + LTXAVDiffusionExtra, + MiniT2IDiffusionExtra>; struct DiffusionParams { const sd::Tensor* x = nullptr; diff --git a/src/runtime/denoiser.hpp b/src/runtime/denoiser.hpp index ed1cd4d93..812eebe61 100644 --- a/src/runtime/denoiser.hpp +++ b/src/runtime/denoiser.hpp @@ -1338,6 +1338,68 @@ struct SefiFlowDenoiser : public FluxFlowDenoiser 
{ } }; +// MiniT2I predicts x0 directly and integrates a linear flow ODE: +// x_{t+dt} = x_t + (x0 - x_t)/(1 - t) * dt, t in [0, 1), x0 = start = noise * 2. +// Mapping sigma = 1 - t makes the generic Euler update +// x += (x - denoised)/sigma * (sigma_next - sigma) +// exactly reproduce that step when denoised == x0. To make the generic +// `denoised = pred * c_out + x * c_skip` yield x0 from the model's raw x0 +// prediction we use c_skip = 0, c_out = 1, c_in = 1. Sigmas run linearly 1 -> 0. +struct MiniT2IFlowDenoiser : public Denoiser { + float sigma_min() override { + return 0.0f; + } + + float sigma_max() override { + return 1.0f; + } + + float sigma_to_t(float sigma) override { + return 1.0f - sigma; + } + + float t_to_sigma(float t) override { + return 1.0f - t; + } + + std::vector get_scalings(float sigma) override { + SD_UNUSED(sigma); + float c_skip = 0.0f; + float c_out = 1.0f; + float c_in = 1.0f; + return {c_skip, c_out, c_in}; + } + + sd::Tensor noise_scaling(float sigma, + const sd::Tensor& noise, + const sd::Tensor& latent) override { + SD_UNUSED(sigma); + SD_UNUSED(latent); + // Sampling starts from x0_init = noise * 2 (see MiniT2I reference). + return noise * 2.0f; + } + + sd::Tensor inverse_noise_scaling(float sigma, const sd::Tensor& latent) override { + SD_UNUSED(sigma); + return latent; + } + + std::vector get_sigmas(uint32_t n, int image_seq_len, scheduler_t scheduler_type, SDVersion version, const char* extra_sample_args = nullptr) override { + SD_UNUSED(image_seq_len); + SD_UNUSED(scheduler_type); + SD_UNUSED(version); + SD_UNUSED(extra_sample_args); + // Uniform t schedule 0 -> 1 => sigma 1 -> 0, matching the reference loop. 
+ std::vector sigmas; + sigmas.reserve(n + 1); + for (uint32_t i = 0; i < n; ++i) { + sigmas.push_back(1.0f - static_cast(i) / static_cast(n)); + } + sigmas.push_back(0.0f); + return sigmas; + } +}; + typedef std::function&, float, int)> denoise_cb_t; static std::pair get_ancestral_step(float sigma_from, diff --git a/src/stable-diffusion.cpp b/src/stable-diffusion.cpp index c7f58de8d..26c04a1b5 100644 --- a/src/stable-diffusion.cpp +++ b/src/stable-diffusion.cpp @@ -1309,6 +1309,8 @@ class StableDiffusionGGML { } } else if (sd_version_is_sefi_image(version)) { pred_type = SEFI_FLOW_PRED; + } else if (sd_version_is_minit2i(version)) { + pred_type = MINIT2I_FLOW_PRED; } else { pred_type = EPS_PRED; } @@ -1346,6 +1348,11 @@ class StableDiffusionGGML { denoiser = std::make_shared(); break; } + case MINIT2I_FLOW_PRED: { + LOG_INFO("running in MiniT2I FLOW mode"); + denoiser = std::make_shared(); + break; + } default: { LOG_ERROR("Unknown predition type %i", pred_type); return false; @@ -2044,87 +2051,6 @@ class StableDiffusionGGML { int64_t last_progress_us = ggml_time_us(); SamplePreviewContext preview = prepare_sample_preview_context(); - if (sd_version_is_minit2i(version)) { - if (noise.empty()) { - LOG_ERROR("MiniT2I sampling requires initial noise"); - return {}; - } - if (cond.c_crossattn.empty() || cond.c_vector.empty()) { - LOG_ERROR("MiniT2I requires T5 hidden states and prompt mask"); - return {}; - } - size_t minit2i_steps = steps > 0 ? steps : 100; - sd::Tensor x_t = noise * 2.0f; - sd::Tensor denoised = x_t; - sd::Tensor uncond_mask = sd::Tensor::zeros_like(cond.c_vector); - - auto run_minit2i = [&](const sd::Tensor& x, - float t_value, - const sd::Tensor& mask) -> sd::Tensor { - int64_t batch = x.dim() >= 4 ? 
x.shape()[3] : 1; - if (batch <= 0) { - LOG_ERROR("MiniT2I got invalid input shape for sampling"); - return {}; - } - LOG_DEBUG("MiniT2I sampling input shape: dim=%" PRId64 ", batch=%" PRId64, - x.dim(), - batch); - std::vector t_vec(static_cast(batch), t_value); - const int64_t t_vec_size = static_cast(t_vec.size()); - sd::Tensor timesteps_tensor({t_vec_size}, std::move(t_vec)); - DiffusionParams diffusion_params; - diffusion_params.x = &x; - diffusion_params.timesteps = &timesteps_tensor; - diffusion_params.context = &cond.c_crossattn; - diffusion_params.y = &mask; - auto out = work_diffusion_model->compute(n_threads, diffusion_params); - if (out.empty()) { - LOG_ERROR("MiniT2I diffusion model compute failed"); - return {}; - } - return out; - }; - - pretty_progress(0, static_cast(minit2i_steps), 0); - last_progress_us = ggml_time_us(); - for (size_t i = 0; i < minit2i_steps; ++i) { - if (get_cancel_flag() == SD_CANCEL_ALL) { - LOG_DEBUG("cancelling generation"); - return {}; - } - float t_cur = static_cast(i) / static_cast(minit2i_steps); - float t_next = static_cast(i + 1) / static_cast(minit2i_steps); - - if (sd_should_preview_noisy() && preview.callback != nullptr) { - preview_image(static_cast(i + 1), x_t, version, preview.mode, preview.callback, preview.data, true); - } - - auto cond_x0 = run_minit2i(x_t, t_cur, cond.c_vector); - if (cond_x0.empty()) { - return {}; - } - auto uncond_x0 = run_minit2i(x_t, t_cur, uncond_mask); - if (uncond_x0.empty()) { - return {}; - } - float denom = std::max(1.0f - t_cur, 0.001f); - auto cond_v = (cond_x0 - x_t) / denom; - auto uncond_v = (uncond_x0 - x_t) / denom; - auto v = uncond_v + (cond_v - uncond_v) * cfg_scale; - x_t += v * (t_next - t_cur); - denoised = x_t; - - if (sd_should_preview_denoised() && preview.callback != nullptr) { - preview_image(static_cast(i + 1), denoised, version, preview.mode, preview.callback, preview.data, false); - } - report_sample_progress(static_cast(i + 1), minit2i_steps, &last_progress_us); -
} - if (work_diffusion_model) { - work_diffusion_model->free_compute_buffer(); - } - return denoised; - } - sd::Tensor x_t = !noise.empty() ? denoiser->noise_scaling(sigmas[0], noise, init_latent) : init_latent; @@ -2247,6 +2173,9 @@ class StableDiffusionGGML { audio_length, frame_rate, video_positions.empty() ? nullptr : &video_positions}; + } else if (sd_version_is_minit2i(version)) { + diffusion_params.extra = MiniT2IDiffusionExtra{ + condition.c_vector.empty() ? nullptr : &condition.c_vector}; } else { diffusion_params.extra = std::monostate{}; } @@ -2685,6 +2614,7 @@ const char* prediction_to_str[] = { "sd3_flow", "flux_flow", "sefi_flow", + "minit2i_flow", }; const char* sd_prediction_name(enum prediction_t prediction) { @@ -4318,6 +4248,11 @@ static std::optional prepare_image_generation_embeds(sd_c if (request->use_uncond || request->use_high_noise_uncond) { if (sd_version_is_ideogram4(sd_ctx->sd->version)) { uncond.c_vector = sd::Tensor::from_vector({1.0f}); + } else if (sd_version_is_minit2i(sd_ctx->sd->version)) { + // MiniT2I derives the unconditional signal from the same T5 hidden + // states with a zeroed prompt mask, so no extra text encode is needed. 
+ uncond.c_crossattn = cond.c_crossattn; + uncond.c_vector = sd::Tensor::zeros_like(cond.c_vector); } else { bool zero_out_masked = false; if (sd_version_is_sdxl(sd_ctx->sd->version) && From c47b8e531eebe98a4168872e6b623f42ee673a18 Mon Sep 17 00:00:00 2001 From: leejet Date: Wed, 1 Jul 2026 23:35:50 +0800 Subject: [PATCH 6/7] fix url --- docs/minit2i.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/minit2i.md b/docs/minit2i.md index 78c854dfa..7e120b38c 100644 --- a/docs/minit2i.md +++ b/docs/minit2i.md @@ -5,7 +5,7 @@ MiniT2I uses a MiniT2I diffusion transformer and `google/flan-t5-large` as the t ## Download weights - Download MiniT2I diffusion model - - safetensors: https://huggingface.co/MiniT2I/minit2i-b-16/tree/main/transformer (`diffusion_pytorch_model.safetensors`) + - safetensors: https://huggingface.co/MiniT2I/MiniT2I/tree/main/minit2i-b-16/transformer (`diffusion_pytorch_model.safetensors`) - Download flan-t5-large text encoder - safetensors: https://huggingface.co/google/flan-t5-large/tree/main (`model.safetensors`) From 059df64d9d2bf0cf09b04b64042d7fa916c48c88 Mon Sep 17 00:00:00 2001 From: leejet Date: Thu, 2 Jul 2026 00:32:18 +0800 Subject: [PATCH 7/7] format code --- src/model/diffusion/minit2i.hpp | 138 ++++++++++++++++---------------- src/model/te/t5.hpp | 132 +++++++++++++++--------------- src/stable-diffusion.cpp | 14 ++-- 3 files changed, 142 insertions(+), 142 deletions(-) diff --git a/src/model/diffusion/minit2i.hpp b/src/model/diffusion/minit2i.hpp index 827fede88..284661054 100644 --- a/src/model/diffusion/minit2i.hpp +++ b/src/model/diffusion/minit2i.hpp @@ -47,8 +47,8 @@ namespace MiniT2I { continue; } if (ends_with(name, "img_embedder.proj1.weight") && tensor_storage.n_dims == 4) { - config.patch_size = tensor_storage.ne[0]; - config.in_channels = tensor_storage.ne[2]; + config.patch_size = tensor_storage.ne[0]; + config.in_channels = tensor_storage.ne[2]; config.pca_channels = tensor_storage.ne[3]; } else if 
(ends_with(name, "img_embedder.proj2.weight") && tensor_storage.n_dims == 4) { config.pca_channels = tensor_storage.ne[2]; @@ -59,8 +59,8 @@ namespace MiniT2I { } else if (ends_with(name, "pooled_embedder.weight") && tensor_storage.n_dims == 2) { config.cond_vec_size = tensor_storage.ne[1]; } else if (ends_with(name, "double_blocks.0.img_qkv.weight") && tensor_storage.n_dims == 2) { - int64_t inner3 = tensor_storage.ne[1]; - int64_t inner = inner3 / 3; + int64_t inner3 = tensor_storage.ne[1]; + int64_t inner = inner3 / 3; config.hidden_size = tensor_storage.ne[0]; if (config.hidden_size == 768) { config.num_heads = 12; @@ -73,9 +73,9 @@ namespace MiniT2I { config.num_heads = std::max(1, inner / config.head_dim); } } else if (ends_with(name, "final_layer.linear.weight") && tensor_storage.n_dims == 2) { - int64_t patch_area = config.patch_size * config.patch_size; - config.hidden_size = tensor_storage.ne[0]; - config.in_channels = patch_area > 0 ? tensor_storage.ne[1] / patch_area : config.in_channels; + int64_t patch_area = config.patch_size * config.patch_size; + config.hidden_size = tensor_storage.ne[0]; + config.in_channels = patch_area > 0 ? 
tensor_storage.ne[1] / patch_area : config.in_channels; } else if (ends_with(name, "mask_token") && tensor_storage.n_dims >= 2) { config.prompt_length = tensor_storage.ne[1]; } @@ -92,8 +92,8 @@ namespace MiniT2I { if (pos != std::string::npos) { auto items = split_string(name.substr(pos), '.'); if (items.size() > 1) { - int64_t idx = atoi(items[1].c_str()); - config.txt_preamble_depth = std::max(config.txt_preamble_depth, idx + 1); + int64_t idx = atoi(items[1].c_str()); + config.txt_preamble_depth = std::max(config.txt_preamble_depth, idx + 1); } } } @@ -134,8 +134,8 @@ namespace MiniT2I { for (int x = 0; x < grid_size; ++x) { size_t base = static_cast(y * grid_size + x) * dim; for (int i = 0; i < quarter; ++i) { - float ay = y * omega[i]; - float ax = x * omega[i]; + float ay = y * omega[i]; + float ax = x * omega[i]; out[base + i] = std::sin(ax); out[base + quarter + i] = std::cos(ax); out[base + half_dim + i] = std::sin(ay); @@ -152,9 +152,9 @@ namespace MiniT2I { inline std::vector make_vision_rope(int side, int head_dim) { GGML_ASSERT(head_dim % 4 == 0); - int dim = head_dim / 2; - int quarter = dim / 2; - int length = side * side; + int dim = head_dim / 2; + int quarter = dim / 2; + int length = side * side; std::vector out(static_cast(length) * (head_dim / 2) * 4); std::vector freqs(quarter); for (int i = 0; i < quarter; ++i) { @@ -165,15 +165,15 @@ namespace MiniT2I { int pos = y * side + x; size_t base = static_cast(pos) * (head_dim / 2) * 4; for (int i = 0; i < quarter; ++i) { - float ay = y * freqs[i]; - float ax = x * freqs[i]; + float ay = y * freqs[i]; + float ax = x * freqs[i]; float angles[2] = {ay, ax}; for (int axis = 0; axis < 2; ++axis) { - int j = axis * quarter + i; - out[base + 4 * j] = std::cos(angles[axis]); - out[base + 4 * j + 1] = -std::sin(angles[axis]); - out[base + 4 * j + 2] = std::sin(angles[axis]); - out[base + 4 * j + 3] = std::cos(angles[axis]); + int j = axis * quarter + i; + out[base + 4 * j] = std::cos(angles[axis]); + 
out[base + 4 * j + 1] = -std::sin(angles[axis]); + out[base + 4 * j + 2] = std::sin(angles[axis]); + out[base + 4 * j + 3] = std::cos(angles[axis]); } } } @@ -184,15 +184,15 @@ namespace MiniT2I { struct SwiGLUMlp : public GGMLBlock { SwiGLUMlp(int64_t in_features, int64_t hidden_features) { int64_t hidden_dim = ((hidden_features + 7) / 8) * 8; - blocks["w1"] = std::make_shared(in_features, hidden_dim, false); - blocks["w3"] = std::make_shared(in_features, hidden_dim, false); - blocks["w2"] = std::make_shared(hidden_dim, in_features, false); + blocks["w1"] = std::make_shared(in_features, hidden_dim, false); + blocks["w3"] = std::make_shared(in_features, hidden_dim, false); + blocks["w2"] = std::make_shared(hidden_dim, in_features, false); } ggml_tensor* forward(GGMLRunnerContext* ctx, ggml_tensor* x) { - auto w1 = std::dynamic_pointer_cast(blocks["w1"]); - auto w3 = std::dynamic_pointer_cast(blocks["w3"]); - auto w2 = std::dynamic_pointer_cast(blocks["w2"]); + auto w1 = std::dynamic_pointer_cast(blocks["w1"]); + auto w3 = std::dynamic_pointer_cast(blocks["w3"]); + auto w2 = std::dynamic_pointer_cast(blocks["w2"]); auto gate = ggml_silu(ctx->ggml_ctx, w1->forward(ctx, x)); auto up = w3->forward(ctx, x); return w2->forward(ctx, ggml_mul(ctx->ggml_ctx, gate, up)); @@ -205,28 +205,28 @@ namespace MiniT2I { BottleneckPatchEmbed(int64_t patch_size, int64_t in_channels, int64_t pca_channels, int64_t hidden_size) : patch_size(patch_size) { blocks["proj1"] = std::make_shared(in_channels, - pca_channels, - std::pair{static_cast(patch_size), static_cast(patch_size)}, - std::pair{static_cast(patch_size), static_cast(patch_size)}, - std::pair{0, 0}, - std::pair{1, 1}, - false); + pca_channels, + std::pair{static_cast(patch_size), static_cast(patch_size)}, + std::pair{static_cast(patch_size), static_cast(patch_size)}, + std::pair{0, 0}, + std::pair{1, 1}, + false); blocks["proj2"] = std::make_shared(pca_channels, - hidden_size, - std::pair{1, 1}, - std::pair{1, 1}, - 
std::pair{0, 0}, - std::pair{1, 1}, - true); + hidden_size, + std::pair{1, 1}, + std::pair{1, 1}, + std::pair{0, 0}, + std::pair{1, 1}, + true); } ggml_tensor* forward(GGMLRunnerContext* ctx, ggml_tensor* x) { auto proj1 = std::dynamic_pointer_cast(blocks["proj1"]); auto proj2 = std::dynamic_pointer_cast(blocks["proj2"]); - x = proj1->forward(ctx, x); - x = proj2->forward(ctx, x); - x = ggml_reshape_3d(ctx->ggml_ctx, x, x->ne[0] * x->ne[1], x->ne[2], x->ne[3]); - x = ggml_cont(ctx->ggml_ctx, ggml_permute(ctx->ggml_ctx, x, 1, 0, 2, 3)); + x = proj1->forward(ctx, x); + x = proj2->forward(ctx, x); + x = ggml_reshape_3d(ctx->ggml_ctx, x, x->ne[0] * x->ne[1], x->ne[2], x->ne[3]); + x = ggml_cont(ctx->ggml_ctx, ggml_permute(ctx->ggml_ctx, x, 1, 0, 2, 3)); return x; } }; @@ -253,12 +253,12 @@ namespace MiniT2I { inline std::vector split_qkv(ggml_context* ctx, ggml_tensor* qkv, int64_t num_heads, int64_t head_dim) { int64_t N = qkv->ne[2]; int64_t L = qkv->ne[1]; - auto q = ggml_view_4d(ctx, qkv, head_dim, num_heads, L, N, - qkv->nb[0] * head_dim, qkv->nb[1], qkv->nb[2], 0); - auto k = ggml_view_4d(ctx, qkv, head_dim, num_heads, L, N, - qkv->nb[0] * head_dim, qkv->nb[1], qkv->nb[2], qkv->nb[0] * head_dim * num_heads); - auto v = ggml_view_4d(ctx, qkv, head_dim, num_heads, L, N, - qkv->nb[0] * head_dim, qkv->nb[1], qkv->nb[2], qkv->nb[0] * head_dim * num_heads * 2); + auto q = ggml_view_4d(ctx, qkv, head_dim, num_heads, L, N, + qkv->nb[0] * head_dim, qkv->nb[1], qkv->nb[2], 0); + auto k = ggml_view_4d(ctx, qkv, head_dim, num_heads, L, N, + qkv->nb[0] * head_dim, qkv->nb[1], qkv->nb[2], qkv->nb[0] * head_dim * num_heads); + auto v = ggml_view_4d(ctx, qkv, head_dim, num_heads, L, N, + qkv->nb[0] * head_dim, qkv->nb[1], qkv->nb[2], qkv->nb[0] * head_dim * num_heads * 2); return {q, k, v}; } @@ -268,7 +268,7 @@ namespace MiniT2I { PlainTextTransformerBlock(int64_t hidden_size, int64_t num_heads, int64_t head_dim, float mlp_ratio) : num_heads(num_heads), head_dim(head_dim) { - 
int64_t inner_dim = num_heads * head_dim; + int64_t inner_dim = num_heads * head_dim; blocks["norm1"] = std::make_shared(hidden_size, 1e-6f); blocks["norm2"] = std::make_shared(hidden_size, 1e-6f); blocks["qkv"] = std::make_shared(hidden_size, inner_dim * 3, true); @@ -304,7 +304,7 @@ namespace MiniT2I { DoubleStreamDiTBlock(int64_t hidden_size, int64_t txt_hidden_size, int64_t num_heads, int64_t head_dim, float mlp_ratio) : num_heads(num_heads), head_dim(head_dim) { - int64_t inner_dim = num_heads * head_dim; + int64_t inner_dim = num_heads * head_dim; blocks["img_norm1"] = std::make_shared(hidden_size, 1e-6f); blocks["img_norm2"] = std::make_shared(hidden_size, 1e-6f); blocks["txt_norm1"] = std::make_shared(txt_hidden_size, 1e-6f); @@ -346,7 +346,7 @@ namespace MiniT2I { auto k = ggml_concat(ctx->ggml_ctx, k_norm->forward(ctx, txt_qkv[1]), k_norm->forward(ctx, img_qkv[1]), 2); auto v = ggml_concat(ctx->ggml_ctx, txt_qkv[2], img_qkv[2], 2); - auto out = Rope::attention(ctx, q, k, v, pe, nullptr, 1.0f, false); + auto out = Rope::attention(ctx, q, k, v, pe, nullptr, 1.0f, false); auto out_txt = ggml_ext_slice(ctx->ggml_ctx, out, 1, 0, lt); auto out_img = ggml_ext_slice(ctx->ggml_ctx, out, 1, lt, lt + li); @@ -399,10 +399,10 @@ namespace MiniT2I { if (mask == nullptr) { return context; } - mask = ggml_reshape_3d(ctx->ggml_ctx, mask, 1, mask->ne[0], mask->ne[1]); - mask = ggml_repeat(ctx->ggml_ctx, mask, context); - auto keep = ggml_mul(ctx->ggml_ctx, context, mask); - auto inv = ggml_sub(ctx->ggml_ctx, ggml_ext_ones_like(ctx->ggml_ctx, mask), mask); + mask = ggml_reshape_3d(ctx->ggml_ctx, mask, 1, mask->ne[0], mask->ne[1]); + mask = ggml_repeat(ctx->ggml_ctx, mask, context); + auto keep = ggml_mul(ctx->ggml_ctx, context, mask); + auto inv = ggml_sub(ctx->ggml_ctx, ggml_ext_ones_like(ctx->ggml_ctx, mask), mask); auto mask_token = ggml_repeat(ctx->ggml_ctx, params["mask_token"], context); return ggml_add(ctx->ggml_ctx, keep, ggml_mul(ctx->ggml_ctx, mask_token, inv)); } 
@@ -411,10 +411,10 @@ namespace MiniT2I { int64_t dim = context->ne[0]; int64_t len = context->ne[1]; int64_t N = context->ne[2]; - auto x = ggml_cont(ctx->ggml_ctx, ggml_permute(ctx->ggml_ctx, context, 1, 0, 2, 3)); - x = ggml_reshape_3d(ctx->ggml_ctx, x, len, dim, N); - x = ggml_mean(ctx->ggml_ctx, x); - x = ggml_reshape_2d(ctx->ggml_ctx, x, dim, N); + auto x = ggml_cont(ctx->ggml_ctx, ggml_permute(ctx->ggml_ctx, context, 1, 0, 2, 3)); + x = ggml_reshape_3d(ctx->ggml_ctx, x, len, dim, N); + x = ggml_mean(ctx->ggml_ctx, x); + x = ggml_reshape_2d(ctx->ggml_ctx, x, dim, N); return x; } @@ -527,14 +527,14 @@ namespace MiniT2I { auto pos_embed_vec = make_2d_sincos_pos_embed(static_cast(img_side), static_cast(config.hidden_size)); auto txt_pe_vec = make_text_rope(static_cast(txt_len), static_cast(config.head_dim)); - auto img_pe_vec = make_vision_rope(static_cast(img_side), static_cast(config.head_dim)); - auto joint_pe_vec = txt_pe_vec; + auto img_pe_vec = make_vision_rope(static_cast(img_side), static_cast(config.head_dim)); + auto joint_pe_vec = txt_pe_vec; joint_pe_vec.insert(joint_pe_vec.end(), img_pe_vec.begin(), img_pe_vec.end()); ggml_init_params params; - params.mem_size = static_cast(3 * ggml_tensor_overhead()); - params.mem_buffer = nullptr; - params.no_alloc = true; + params.mem_size = static_cast(3 * ggml_tensor_overhead()); + params.mem_buffer = nullptr; + params.no_alloc = true; position_cache_ctx = ggml_init(params); GGML_ASSERT(position_cache_ctx != nullptr); @@ -563,10 +563,10 @@ namespace MiniT2I { const sd::Tensor& timesteps_tensor, const sd::Tensor& context_tensor, const sd::Tensor& mask_tensor) { - ggml_cgraph* gf = new_graph_custom(MINIT2I_GRAPH_SIZE); - ggml_tensor* x = make_input(x_tensor); - ggml_tensor* context = make_input(context_tensor); - ggml_tensor* mask = make_input(mask_tensor); + ggml_cgraph* gf = new_graph_custom(MINIT2I_GRAPH_SIZE); + ggml_tensor* x = make_input(x_tensor); + ggml_tensor* context = make_input(context_tensor); + 
ggml_tensor* mask = make_input(mask_tensor); SD_UNUSED(timesteps_tensor); int64_t W = x->ne[0]; diff --git a/src/model/te/t5.hpp b/src/model/te/t5.hpp index c7cfef2df..6d2326f94 100644 --- a/src/model/te/t5.hpp +++ b/src/model/te/t5.hpp @@ -23,72 +23,72 @@ struct T5Config { int64_t vocab_size = 32128; bool relative_attention = true; - static T5Config detect_from_weights(const String2TensorStorage& tensor_storage_map, - const std::string& prefix, - bool is_umt5 = false) { - T5Config config; - if (is_umt5) { - config.vocab_size = 256384; - config.relative_attention = false; - } - auto find_tensor = [&](const std::string& suffix) -> const TensorStorage* { - auto it = tensor_storage_map.find(prefix + "." + suffix); - if (it != tensor_storage_map.end()) { - return &it->second; - } - it = tensor_storage_map.find(prefix + suffix); - if (it != tensor_storage_map.end()) { - return &it->second; - } - return nullptr; - }; - - if (const TensorStorage* shared = find_tensor("shared.weight")) { - if (shared->n_dims == 2) { - config.vocab_size = shared->ne[1]; - config.model_dim = shared->ne[0]; - } - } - if (const TensorStorage* q = find_tensor("encoder.block.0.layer.0.SelfAttention.q.weight")) { - if (q->n_dims == 2) { - config.model_dim = q->ne[0]; - int64_t inner_dim = q->ne[1]; - // Flan-T5/T5 uses d_kv=64 for common sizes. 
- if (inner_dim % 64 == 0) { - config.num_heads = inner_dim / 64; - } - } - } - if (const TensorStorage* wi = find_tensor("encoder.block.0.layer.1.DenseReluDense.wi_0.weight")) { - if (wi->n_dims == 2) { - config.model_dim = wi->ne[0]; - config.ff_dim = wi->ne[1]; - } - } - int64_t detected_layers = 0; - for (const auto& [name, _] : tensor_storage_map) { - std::string base = prefix; - if (!base.empty() && base.back() != '.') { - base += "."; - } - std::string layer_prefix = base + "encoder.block."; - if (!starts_with(name, layer_prefix)) { - continue; - } - size_t pos = layer_prefix.size(); - size_t dot = name.find('.', pos); - if (dot == std::string::npos) { - continue; - } - int64_t layer = atoi(name.substr(pos, dot - pos).c_str()); - detected_layers = std::max(detected_layers, layer + 1); - } - if (detected_layers > 0) { - config.num_layers = detected_layers; - } - return config; - } -}; + static T5Config detect_from_weights(const String2TensorStorage& tensor_storage_map, + const std::string& prefix, + bool is_umt5 = false) { + T5Config config; + if (is_umt5) { + config.vocab_size = 256384; + config.relative_attention = false; + } + auto find_tensor = [&](const std::string& suffix) -> const TensorStorage* { + auto it = tensor_storage_map.find(prefix + "." + suffix); + if (it != tensor_storage_map.end()) { + return &it->second; + } + it = tensor_storage_map.find(prefix + suffix); + if (it != tensor_storage_map.end()) { + return &it->second; + } + return nullptr; + }; + + if (const TensorStorage* shared = find_tensor("shared.weight")) { + if (shared->n_dims == 2) { + config.vocab_size = shared->ne[1]; + config.model_dim = shared->ne[0]; + } + } + if (const TensorStorage* q = find_tensor("encoder.block.0.layer.0.SelfAttention.q.weight")) { + if (q->n_dims == 2) { + config.model_dim = q->ne[0]; + int64_t inner_dim = q->ne[1]; + // Flan-T5/T5 uses d_kv=64 for common sizes. 
+ if (inner_dim % 64 == 0) { + config.num_heads = inner_dim / 64; + } + } + } + if (const TensorStorage* wi = find_tensor("encoder.block.0.layer.1.DenseReluDense.wi_0.weight")) { + if (wi->n_dims == 2) { + config.model_dim = wi->ne[0]; + config.ff_dim = wi->ne[1]; + } + } + int64_t detected_layers = 0; + for (const auto& [name, _] : tensor_storage_map) { + std::string base = prefix; + if (!base.empty() && base.back() != '.') { + base += "."; + } + std::string layer_prefix = base + "encoder.block."; + if (!starts_with(name, layer_prefix)) { + continue; + } + size_t pos = layer_prefix.size(); + size_t dot = name.find('.', pos); + if (dot == std::string::npos) { + continue; + } + int64_t layer = atoi(name.substr(pos, dot - pos).c_str()); + detected_layers = std::max(detected_layers, layer + 1); + } + if (detected_layers > 0) { + config.num_layers = detected_layers; + } + return config; + } +}; class T5LayerNorm : public UnaryBlock { protected: diff --git a/src/stable-diffusion.cpp b/src/stable-diffusion.cpp index 26c04a1b5..edf7bf78e 100644 --- a/src/stable-diffusion.cpp +++ b/src/stable-diffusion.cpp @@ -792,9 +792,9 @@ class StableDiffusionGGML { tensor_storage_map, model_manager); diffusion_model = std::make_shared(backend_for(SDBackendModule::DIFFUSION), - tensor_storage_map, - "model.diffusion_model.model.net", - model_manager); + tensor_storage_map, + "model.diffusion_model.model.net", + model_manager); } else if (sd_version_is_anima(version)) { cond_stage_model = std::make_shared(backend_for(SDBackendModule::TE), tensor_storage_map, @@ -2051,10 +2051,10 @@ class StableDiffusionGGML { int64_t last_progress_us = ggml_time_us(); SamplePreviewContext preview = prepare_sample_preview_context(); - sd::Tensor x_t = !noise.empty() - ? denoiser->noise_scaling(sigmas[0], noise, init_latent) - : init_latent; - sd::Tensor denoised = x_t; + sd::Tensor x_t = !noise.empty() + ? 
denoiser->noise_scaling(sigmas[0], noise, init_latent) + : init_latent; + sd::Tensor denoised = x_t; auto denoise = [&](const sd::Tensor& x, float sigma, int step) -> sd::guidance::GuiderOutput { if (get_cancel_flag() == SD_CANCEL_ALL) {