From ded8583a2448931668f2f7a273ca68b44c4c602c Mon Sep 17 00:00:00 2001
From: KenForever1 <2962666398@qq.com>
Date: Thu, 18 Jun 2026 23:44:34 +0800
Subject: [PATCH 1/7] Add MiniT2I inference support

---
 src/conditioning/conditioner.hpp |  95 +++++
 src/core/ggml_extend_backend.cpp |  81 ++++-
 src/model.h                      |   9 +
 src/model/diffusion/minit2i.hpp  | 573 +++++++++++++++++++++++++++++++
 src/model/te/t5.hpp              |  79 ++++-
 src/model/vae/vae.hpp            |   2 +-
 src/model_loader.cpp             |  11 +
 src/stable-diffusion.cpp         | 100 +++++-
 8 files changed, 931 insertions(+), 19 deletions(-)
 create mode 100644 src/model/diffusion/minit2i.hpp
diff --git a/src/conditioning/conditioner.hpp b/src/conditioning/conditioner.hpp
index e8b8ee3da..d63303a82 100644
--- a/src/conditioning/conditioner.hpp
+++ b/src/conditioning/conditioner.hpp
@@ -1378,6 +1378,101 @@ struct T5CLIPEmbedder : public Conditioner {
     }
 };
 
+struct MiniT2IConditioner : public Conditioner {  // Conditioner backed by an optional T5Runner text encoder.
+    T5UniGramTokenizer tokenizer;
+    std::shared_ptr<T5Runner> t5;  // Stays null when the constructor finds no "text_encoders.t5xxl" tensor names.
+    size_t prompt_length = 256;    // Prompts are truncated or padded to exactly this many tokens.
+
+    MiniT2IConditioner(ggml_backend_t backend,
+                       const String2TensorStorage& tensor_storage_map      = {},
+                       std::shared_ptr<RunnerWeightManager> weight_manager = nullptr) {
+        bool use_t5 = false;  // Becomes true as soon as one tensor name contains "text_encoders.t5xxl".
+        for (const auto& pair : tensor_storage_map) {
+            if (pair.first.find("text_encoders.t5xxl") != std::string::npos) {
+                use_t5 = true;
+                break;
+            }
+        }
+        if (!use_t5) {
+            LOG_WARN("IMPORTANT NOTICE: No MiniT2I T5 text encoder provided, cannot process prompts!");
+            return;  // t5 is left null; every method below checks for that before delegating.
+        }
+        t5 = std::make_shared<T5Runner>(backend, tensor_storage_map, "text_encoders.t5xxl.transformer", false, weight_manager);
+    }
+
+    void get_param_tensors(std::map<std::string, ggml_tensor*>& tensors) override {  // Forwards to t5 under the same prefix used at construction.
+        if (t5) {
+            t5->get_param_tensors(tensors, "text_encoders.t5xxl.transformer");
+        }
+    }
+
+    void set_max_graph_vram_bytes(size_t max_vram_bytes) override {  // No-op without a T5 runner.
+        if (t5) {
+            t5->set_max_graph_vram_bytes(max_vram_bytes);
+        }
+    }
+
+    void set_stream_layers_enabled(bool enabled) override {  // No-op without a T5 runner.
+        if (t5) {
+            t5->set_stream_layers_enabled(enabled);
+        }
+    }
+
+    void set_flash_attention_enabled(bool enabled) override {  // No-op without a T5 runner.
+        if (t5) {
+            t5->set_flash_attention_enabled(enabled);
+        }
+    }
+
+    void set_weight_adapter(const std::shared_ptr<WeightAdapter>& adapter) override {  // No-op without a T5 runner.
+        if (t5) {
+            t5->set_weight_adapter(adapter);
+        }
+    }
+
+    void runner_done() override {  // No-op without a T5 runner.
+        if (t5) {
+            t5->runner_done();
+        }
+    }
+
+    SDCondition get_learned_condition(int n_threads,  // Tokenizes conditioner_params.text, pads/truncates it, and encodes it with t5.
+                                      const ConditionerParams& conditioner_params) override {
+        SDCondition result;
+        if (!t5) {  // No encoder available: return all-zero placeholders instead of encoding the prompt.
+            result.c_crossattn = sd::Tensor<float>::zeros({1024, static_cast<int64_t>(prompt_length)});
+            result.c_vector    = sd::Tensor<float>::zeros({static_cast<int64_t>(prompt_length)});
+            return result;
+        }
+
+        std::vector<int> tokens = tokenizer.encode(conditioner_params.text);
+        if (tokens.size() > prompt_length) {
+            tokens.resize(prompt_length);  // Over-long prompts lose their trailing tokens.
+        }
+        std::vector<float> mask(tokens.size(), 1.0f);  // 1.0 marks a token that came from the prompt.
+        while (tokens.size() < prompt_length) {
+            tokens.push_back(tokenizer.PAD_TOKEN_ID);
+            mask.push_back(0.0f);  // 0.0 marks padding.
+        }
+
+        sd::Tensor<int32_t> input_ids({static_cast<int64_t>(tokens.size())}, tokens);
+        std::vector<float> t5_mask(mask.size(), 0.0f);  // Encoder-side form of the mask: 0 for prompt tokens, -inf for padding.
+        for (size_t i = 0; i < mask.size(); ++i) {
+            t5_mask[i] = mask[i] > 0.0f ? 0.0f : -HUGE_VALF;
+        }
+        sd::Tensor<float> hidden_states = t5->compute(n_threads,
+                                                      input_ids,
+                                                      sd::Tensor<float>::from_vector(t5_mask),
+                                                      false,  // NOTE(review): the three trailing flags are defined by T5Runner::compute, which is not visible here.
+                                                      true,
+                                                      true);
+        GGML_ASSERT(!hidden_states.empty());
+        result.c_crossattn = std::move(hidden_states);
+        result.c_vector    = sd::Tensor<float>::from_vector(mask);  // c_vector carries the 0/1 token mask, one entry per prompt position.
+        return result;
+    }
+};
+
 struct AnimaConditioner : public Conditioner {
     std::shared_ptr<BPETokenizer> qwen_tokenizer;
     T5UniGramTokenizer t5_tokenizer;
diff --git a/src/core/ggml_extend_backend.cpp b/src/core/ggml_extend_backend.cpp
index f3e2cceba..2eb62d3a3 100644
--- a/src/core/ggml_extend_backend.cpp
+++ b/src/core/ggml_extend_backend.cpp
@@ -110,7 +110,67 @@ static std::string resolve_first_device_by_type(enum ggml_backend_dev_type type)
     if (dev == nullptr) {
         return "";
     }
-    return ggml_backend_dev_name(dev);
+    const char* dev_name = ggml_backend_dev_name(dev);
+    if (dev_name != nullptr && dev_name[0] != '\0') {
+        return dev_name;
+    }
+    ggml_backend_reg_t reg = ggml_backend_dev_backend_reg(dev);
+    const char* reg_name   = reg != nullptr ? ggml_backend_reg_name(reg) : nullptr;
+    return reg_name != nullptr ? reg_name : "";
+}
+
+// First device whose backend registry name equals `name` (trimmed, case-insensitive; "metal" is searched as "mtl"), else nullptr.
+static ggml_backend_dev_t resolve_first_device_by_registry_name(const std::string& name) {
+    std::string wanted = lower_copy(trim_copy(name));
+    if (wanted.empty()) {
+        return nullptr;
+    }
+    if (wanted == "metal") {
+        wanted = "mtl";
+    }
+
+    for (size_t idx = 0, total = ggml_backend_dev_count(); idx < total; ++idx) {
+        ggml_backend_dev_t candidate = ggml_backend_dev_get(idx);
+        ggml_backend_reg_t registry  = ggml_backend_dev_backend_reg(candidate);
+        const char* registry_name    = registry != nullptr ? ggml_backend_reg_name(registry) : nullptr;
+        if (registry_name == nullptr) {
+            continue;
+        }
+        if (lower_copy(registry_name) == wanted) {
+            return candidate;
+        }
+    }
+    return nullptr;
+}
+
+// First device whose own name equals `name` after trimming, compared case-insensitively; nullptr when none matches.
+static ggml_backend_dev_t resolve_device_by_name(const std::string& name) {
+    const std::string wanted = lower_copy(trim_copy(name));
+    if (wanted.empty()) {
+        return nullptr;
+    }
+    for (size_t idx = 0, total = ggml_backend_dev_count(); idx < total; ++idx) {
+        ggml_backend_dev_t candidate = ggml_backend_dev_get(idx);
+        const char* candidate_name   = ggml_backend_dev_name(candidate);
+        const bool is_match          = candidate_name != nullptr && lower_copy(candidate_name) == wanted;
+        if (is_match) {
+            return candidate;
+        }
+    }
+    return nullptr;
+}
+
+// Label for `dev`: its device name when non-empty, otherwise its registry name, otherwise "".
+static std::string backend_device_name(ggml_backend_dev_t dev) {
+    if (dev == nullptr) {
+        return "";
+    }
+    const char* label = ggml_backend_dev_name(dev);
+    if (label == nullptr || label[0] == '\0') {
+        ggml_backend_reg_t registry = ggml_backend_dev_backend_reg(dev);
+        label                       = registry != nullptr ? ggml_backend_reg_name(registry) : nullptr;
+    }
+    return label != nullptr ? label : "";
 }
 
 static ggml_backend_buffer_t ggml_backend_tensor_buffer(const struct ggml_tensor* tensor) {
@@ -296,6 +356,10 @@ std::string sd_backend_resolve_name(const std::string& name) {
         return resolve_first_device_by_type(GGML_BACKEND_DEVICE_TYPE_IGPU);
     }
 
+    if (ggml_backend_dev_t dev = resolve_first_device_by_registry_name(requested)) {
+        return backend_device_name(dev);
+    }
+
     const size_t device_count = ggml_backend_dev_count();
     for (size_t i = 0; i < device_count; ++i) {
         ggml_backend_dev_t dev = ggml_backend_dev_get(i);
@@ -328,7 +392,20 @@ static ggml_backend_t init_named_backend(const std::string& name) {
         return ggml_backend_init_best();
     }
 
+    if (ggml_backend_dev_t dev = resolve_device_by_name(name)) {
+        return ggml_backend_dev_init(dev, nullptr);
+    }
+    if (ggml_backend_dev_t dev = resolve_first_device_by_registry_name(name)) {
+        return ggml_backend_dev_init(dev, nullptr);
+    }
+
     std::string resolved = sd_backend_resolve_name(name);
+    if (ggml_backend_dev_t dev = resolve_device_by_name(resolved)) {
+        return ggml_backend_dev_init(dev, nullptr);
+    }
+    if (ggml_backend_dev_t dev = resolve_first_device_by_registry_name(resolved)) {
+        return ggml_backend_dev_init(dev, nullptr);
+    }
     if (resolved.empty()) {
         return nullptr;
     }
@@ -599,7 +676,7 @@ bool SDBackendManager::validate(std::string* error) const {
             }
             return false;
         }
-        if (!sd_backend_resolve_name(name).empty()) {
+        if (!sd_backend_resolve_name(name).empty() || resolve_first_device_by_registry_name(name) != nullptr) {
             return true;
         }
         if (error != nullptr) {
diff --git a/src/model.h b/src/model.h
index fff050149..75fdbe643 100644
--- a/src/model.h
+++ b/src/model.h
@@ -46,6 +46,7 @@ enum SDVersion {
     VERSION_OVIS_IMAGE,
     VERSION_ERNIE_IMAGE,
     VERSION_LENS,
+    VERSION_MINIT2I,
     VERSION_LONGCAT,
     VERSION_PID,
     VERSION_IDEOGRAM4,
@@ -174,6 +175,13 @@ static inline bool sd_version_is_lens(SDVersion version) {
     return false;
 }
 
+// Reports whether `version` is the MiniT2I model version.
+// Only VERSION_MINIT2I qualifies.
+static inline bool sd_version_is_minit2i(SDVersion version) {
+    const bool is_minit2i = (version == VERSION_MINIT2I);
+    return is_minit2i;
+}
+
 static inline bool sd_version_is_pid(SDVersion version) {
     if (version == VERSION_PID) {
         return true;
@@ -247,6 +255,7 @@ static inline bool sd_version_is_dit(SDVersion version) {
         sd_version_is_boogu_image(version) ||
         sd_version_is_ernie_image(version) ||
         sd_version_is_lens(version) ||
+        sd_version_is_minit2i(version) ||
         sd_version_is_longcat(version) ||
         sd_version_is_pid(version) ||
         sd_version_is_ideogram4(version) ||
diff --git a/src/model/diffusion/minit2i.hpp b/src/model/diffusion/minit2i.hpp
new file mode 100644
index 000000000..d69f1f4ac
--- /dev/null
+++ b/src/model/diffusion/minit2i.hpp
@@ -0,0 +1,573 @@
+#ifndef __SD_MODEL_DIFFUSION_MINIT2I_HPP__
+#define __SD_MODEL_DIFFUSION_MINIT2I_HPP__
+
+#include <algorithm>
+#include <cmath>
+#include <cstdint>
+#include <cstdlib>
+#include <memory>
+#include <string>
+#include <vector>
+
+#include "core/ggml_extend.hpp"
+#include "model/common/rope.hpp"
+#include "model/diffusion/dit.hpp"
+#include "model/diffusion/model.hpp"
+#include "model_loader.h"
+
+namespace MiniT2I {
+    constexpr int MINIT2I_GRAPH_SIZE = 196608;
+
+    struct MiniT2IConfig {  // Model hyper-parameters; the initializers below are the defaults detect_from_weights() starts from.
+        int64_t image_size         = 512;
+        int64_t patch_size         = 16;
+        int64_t in_channels        = 3;
+        int64_t txt_input_size     = 1024;
+        int64_t hidden_size        = 768;
+        int64_t txt_hidden_size    = 768;
+        int64_t cond_vec_size      = 768;
+        int64_t depth_double       = 17;
+        int64_t txt_preamble_depth = 2;
+        int64_t num_heads          = 12;
+        int64_t head_dim           = 64;
+        float mlp_ratio            = 2.6667f;
+        int64_t pca_channels       = 128;
+        int64_t prompt_length      = 256;
+        int64_t n_T                = 100;
+        float cfg_interval_start   = 0.0f;
+        float cfg_interval_end     = 1.0f;
+
+        static MiniT2IConfig detect_from_weights(const String2TensorStorage& tensor_storage_map, const std::string& prefix) {  // Overrides defaults from the shapes/names of tensors whose name starts with `prefix`.
+            MiniT2IConfig config;
+            config.depth_double       = 0;  // Both depths are reset so they can be rebuilt as "highest block index + 1" below.
+            config.txt_preamble_depth = 0;
+
+            for (const auto& [name, tensor_storage] : tensor_storage_map) {
+                if (!starts_with(name, prefix)) {
+                    continue;  // Tensor belongs to another component.
+                }
+                if (ends_with(name, "img_embedder.proj1.weight") && tensor_storage.n_dims == 4) {
+                    config.patch_size  = tensor_storage.ne[0];
+                    config.in_channels = tensor_storage.ne[2];
+                    config.pca_channels = tensor_storage.ne[3];
+                } else if (ends_with(name, "img_embedder.proj2.weight") && tensor_storage.n_dims == 4) {
+                    config.pca_channels = tensor_storage.ne[2];
+                    config.hidden_size  = tensor_storage.ne[3];
+                } else if (ends_with(name, "txt_embedder.weight") && tensor_storage.n_dims == 2) {
+                    config.txt_input_size  = tensor_storage.ne[0];
+                    config.txt_hidden_size = tensor_storage.ne[1];
+                } else if (ends_with(name, "pooled_embedder.weight") && tensor_storage.n_dims == 2) {
+                    config.cond_vec_size = tensor_storage.ne[1];
+                } else if (ends_with(name, "double_blocks.0.img_qkv.weight") && tensor_storage.n_dims == 2) {
+                    int64_t inner3 = tensor_storage.ne[1];
+                    int64_t inner  = inner3 / 3;  // ne[1] holds q, k and v together, so one third is the per-projection width.
+                    config.hidden_size = tensor_storage.ne[0];
+                    if (config.hidden_size == 768) {  // Head layout is keyed on hidden_size for the two known sizes.
+                        config.num_heads = 12;
+                        config.head_dim  = 64;
+                    } else if (config.hidden_size == 1248) {
+                        config.num_heads = 24;
+                        config.head_dim  = 52;
+                    } else if (inner > 0) {  // Any other size: assume 64-wide heads, with at least one head.
+                        config.head_dim  = 64;
+                        config.num_heads = std::max<int64_t>(1, inner / config.head_dim);
+                    }
+                } else if (ends_with(name, "final_layer.linear.weight") && tensor_storage.n_dims == 2) {
+                    int64_t patch_area  = config.patch_size * config.patch_size;  // NOTE(review): uses patch_size as known at this point of the iteration; proj1 may not have been visited yet.
+                    config.hidden_size  = tensor_storage.ne[0];
+                    config.in_channels  = patch_area > 0 ? tensor_storage.ne[1] / patch_area : config.in_channels;
+                } else if (ends_with(name, "mask_token") && tensor_storage.n_dims >= 2) {
+                    config.prompt_length = tensor_storage.ne[1];  // NOTE(review): MMJiT::init_params allocates mask_token as {txt_input_size, 1, 1}; with that layout ne[1] is 1 — confirm this is the intended source.
+                }
+
+                size_t pos = name.find("double_blocks.");
+                if (pos != std::string::npos) {
+                    auto items = split_string(name.substr(pos), '.');
+                    if (items.size() > 1) {
+                        int64_t idx         = atoi(items[1].c_str());  // items[1] is the text right after "double_blocks.".
+                        config.depth_double = std::max<int64_t>(config.depth_double, idx + 1);
+                    }
+                }
+                pos = name.find("txt_preamble_blocks.");
+                if (pos != std::string::npos) {
+                    auto items = split_string(name.substr(pos), '.');
+                    if (items.size() > 1) {
+                        int64_t idx                 = atoi(items[1].c_str());  // Same parsing as for double_blocks.
+                        config.txt_preamble_depth   = std::max<int64_t>(config.txt_preamble_depth, idx + 1);
+                    }
+                }
+            }
+
+            if (config.depth_double <= 0) {  // No double_blocks tensors were seen: fall back by hidden_size.
+                config.depth_double = config.hidden_size == 1248 ? 23 : 17;
+            }
+            if (config.txt_preamble_depth <= 0) {  // No txt_preamble_blocks tensors were seen.
+                config.txt_preamble_depth = 2;
+            }
+            if (config.head_dim <= 0 || config.num_heads <= 0) {  // Safety net; the assignments above only ever store positive values.
+                config.head_dim  = config.hidden_size == 1248 ? 52 : 64;
+                config.num_heads = config.hidden_size / config.head_dim;
+            }
+            LOG_DEBUG("minit2i: hidden_size=%" PRId64 ", txt_hidden_size=%" PRId64 ", heads=%" PRId64 ", head_dim=%" PRId64 ", double_blocks=%" PRId64 ", txt_blocks=%" PRId64 ", patch=%" PRId64 ", in_channels=%" PRId64,
+                      config.hidden_size,
+                      config.txt_hidden_size,
+                      config.num_heads,
+                      config.head_dim,
+                      config.depth_double,
+                      config.txt_preamble_depth,
+                      config.patch_size,
+                      config.in_channels);
+            return config;
+        }
+    };
+
+    inline std::vector<float> make_2d_sincos_pos_embed(int grid_size, int dim) {  // Row-major (grid_size * grid_size, dim) table; each cell is [sin x, cos x, sin y, cos y] blocks.
+        GGML_ASSERT(dim % 4 == 0);
+        const int per_axis = dim / 2;       // Floats devoted to each of the two axes.
+        const int n_freqs  = per_axis / 2;  // Each axis splits again into a sin half and a cos half.
+        std::vector<float> inv_freq(n_freqs);
+        for (int k = 0; k < n_freqs; ++k) {
+            inv_freq[k] = 1.0f / std::pow(10000.0f, static_cast<float>(k) / static_cast<float>(n_freqs));
+        }
+        std::vector<float> table(static_cast<size_t>(grid_size) * grid_size * dim);
+        for (int row = 0; row < grid_size; ++row) {
+            for (int col = 0; col < grid_size; ++col) {
+                float* cell = table.data() + static_cast<size_t>(row * grid_size + col) * dim;
+                for (int k = 0; k < n_freqs; ++k) {
+                    const float angle_x = col * inv_freq[k];
+                    const float angle_y = row * inv_freq[k];
+                    cell[k]                      = std::sin(angle_x);
+                    cell[n_freqs + k]            = std::cos(angle_x);
+                    cell[per_axis + k]           = std::sin(angle_y);
+                    cell[per_axis + n_freqs + k] = std::cos(angle_y);
+                }
+            }
+        }
+        return table;
+    }
+
+    inline std::vector<float> make_text_rope(int length, int head_dim) {  // Flattened Rope::rope table over `length` positions taken from linspace(0, length - 1), base 10000.
+        return Rope::flatten(Rope::rope(Rope::linspace(0.f, static_cast<float>(length - 1), length), head_dim, 10000.f));
+    }
+
+    inline std::vector<float> make_vision_rope(int side, int head_dim) {  // Builds per-position 2x2 rotation entries for a side x side grid.
+        GGML_ASSERT(head_dim % 4 == 0);
+        int dim      = head_dim / 2;  // Number of 2x2 rotation entries stored per position.
+        int quarter  = dim / 2;       // Entries per axis: the first `quarter` use y, the next `quarter` use x.
+        int length   = side * side;
+        std::vector<float> out(static_cast<size_t>(length) * (head_dim / 2) * 4);  // 4 floats per rotation entry.
+        std::vector<float> freqs(quarter);
+        for (int i = 0; i < quarter; ++i) {
+            freqs[i] = 1.0f / std::pow(10000.0f, static_cast<float>(2 * i) / static_cast<float>(dim));
+        }
+        for (int y = 0; y < side; ++y) {
+            for (int x = 0; x < side; ++x) {
+                int pos     = y * side + x;  // Row-major position index.
+                size_t base = static_cast<size_t>(pos) * (head_dim / 2) * 4;
+                for (int i = 0; i < quarter; ++i) {
+                    float ay = y * freqs[i];
+                    float ax = x * freqs[i];
+                    float angles[2] = {ay, ax};  // axis 0 -> y angle, axis 1 -> x angle.
+                    for (int axis = 0; axis < 2; ++axis) {
+                        int j                   = axis * quarter + i;
+                        out[base + 4 * j]       = std::cos(angles[axis]);   // Entry layout: [cos, -sin, sin, cos].
+                        out[base + 4 * j + 1]   = -std::sin(angles[axis]);
+                        out[base + 4 * j + 2]   = std::sin(angles[axis]);
+                        out[base + 4 * j + 3]   = std::cos(angles[axis]);
+                    }
+                }
+            }
+        }
+        return out;
+    }
+
+    struct SwiGLUMlp : public GGMLBlock {  // Computes w2(silu(w1(x)) * w3(x)).
+        SwiGLUMlp(int64_t in_features, int64_t hidden_features) {
+            const int64_t padded_hidden = ((hidden_features + 7) / 8) * 8;  // Next multiple of 8 at or above hidden_features.
+            blocks["w1"] = std::make_shared<Linear>(in_features, padded_hidden, false);
+            blocks["w3"] = std::make_shared<Linear>(in_features, padded_hidden, false);
+            blocks["w2"] = std::make_shared<Linear>(padded_hidden, in_features, false);
+        }
+
+        ggml_tensor* forward(GGMLRunnerContext* ctx, ggml_tensor* x) {
+            auto gate_proj = std::dynamic_pointer_cast<Linear>(blocks["w1"]);
+            auto gate      = ggml_silu(ctx->ggml_ctx, gate_proj->forward(ctx, x));
+            auto up_proj   = std::dynamic_pointer_cast<Linear>(blocks["w3"]);
+            auto up        = up_proj->forward(ctx, x);
+            auto down_proj = std::dynamic_pointer_cast<Linear>(blocks["w2"]);
+            return down_proj->forward(ctx, ggml_mul(ctx->ggml_ctx, gate, up));
+        }
+    };
+
+    // Patch embedding built from two Conv2d blocks, "proj1" followed by "proj2".
+    // proj1 receives {patch_size, patch_size} as its first two pair arguments, proj2 receives {1, 1}.
+    struct BottleneckPatchEmbed : public GGMLBlock {
+        int64_t patch_size;
+
+        BottleneckPatchEmbed(int64_t patch_size, int64_t in_channels, int64_t pca_channels, int64_t hidden_size)
+            : patch_size(patch_size) {
+            using IntPair = std::pair<int, int>;
+            const int side = static_cast<int>(patch_size);
+            const IntPair patch_extent{side, side};
+            const IntPair ones{1, 1};
+            const IntPair zeros{0, 0};
+
+            // NOTE(review): pair arguments are presumably (kernel, stride, padding, dilation); confirm against Conv2d.
+            blocks["proj1"] = std::make_shared<Conv2d>(in_channels, pca_channels, patch_extent, patch_extent, zeros, ones, false);
+            blocks["proj2"] = std::make_shared<Conv2d>(pca_channels, hidden_size, ones, ones, zeros, ones, true);
+        }
+
+        // Applies proj1 then proj2, merges ne[0] and ne[1] into one axis, then swaps the first two axes.
+        ggml_tensor* forward(GGMLRunnerContext* ctx, ggml_tensor* x) {
+            auto reduce = std::dynamic_pointer_cast<Conv2d>(blocks["proj1"]);
+            auto expand = std::dynamic_pointer_cast<Conv2d>(blocks["proj2"]);
+
+            auto feat = expand->forward(ctx, reduce->forward(ctx, x));
+            // Collapse the two leading axes into a single axis.
+            auto flat = ggml_reshape_3d(ctx->ggml_ctx, feat, feat->ne[0] * feat->ne[1], feat->ne[2], feat->ne[3]);
+            // Swap axes 0 and 1, then make the permuted view contiguous.
+            auto swapped = ggml_permute(ctx->ggml_ctx, flat, 1, 0, 2, 3);
+            return ggml_cont(ctx->ggml_ctx, swapped);
+        }
+    };
+
+    // Pipeline: ggml_ext_timestep_embedding(t) -> Linear "mlp.0" -> SiLU (in place) -> Linear "mlp.2".
+    struct TimestepEmbedder : public GGMLBlock {
+        int frequency_embedding_size;
+
+        TimestepEmbedder(int64_t hidden_size, int frequency_embedding_size = 256)
+            : frequency_embedding_size(frequency_embedding_size) {
+            blocks["mlp.0"] = std::make_shared<Linear>(frequency_embedding_size, hidden_size, true, true);
+            blocks["mlp.2"] = std::make_shared<Linear>(hidden_size, hidden_size, true, true);
+        }
+
+        ggml_tensor* forward(GGMLRunnerContext* ctx, ggml_tensor* t) {
+            auto fc_in  = std::dynamic_pointer_cast<Linear>(blocks["mlp.0"]);
+            auto fc_out = std::dynamic_pointer_cast<Linear>(blocks["mlp.2"]);
+            auto freqs  = ggml_ext_timestep_embedding(ctx->ggml_ctx, t, frequency_embedding_size, 10000, 1.0f);
+            auto hidden = ggml_silu_inplace(ctx->ggml_ctx, fc_in->forward(ctx, freqs));
+            return fc_out->forward(ctx, hidden);
+        }
+    };
+
+    // Returns {q, k, v} as 4-D views (head_dim, num_heads, ne[1], ne[2]) into `qkv`, whose ne[0] packs q, then k, then v.
+    inline std::vector<ggml_tensor*> split_qkv(ggml_context* ctx, ggml_tensor* qkv, int64_t num_heads, int64_t head_dim) {
+        const int64_t seq_len    = qkv->ne[1];
+        const int64_t batch      = qkv->ne[2];
+        const size_t head_stride = qkv->nb[0] * head_dim;     // Bytes from one head to the next.
+        const size_t part_stride = head_stride * num_heads;   // Bytes from q to k, and from k to v.
+        auto view_at = [&](size_t offset) {
+            return ggml_view_4d(ctx, qkv, head_dim, num_heads, seq_len, batch, head_stride, qkv->nb[1], qkv->nb[2], offset);
+        };
+        return {view_at(0), view_at(part_stride), view_at(part_stride * 2)};
+    }
+
+    struct PlainTextTransformerBlock : public GGMLBlock {  // Text-only block: normed self-attention plus SwiGLU MLP, each added back as a residual.
+        int64_t num_heads;
+        int64_t head_dim;
+
+        PlainTextTransformerBlock(int64_t hidden_size, int64_t num_heads, int64_t head_dim, float mlp_ratio)
+            : num_heads(num_heads), head_dim(head_dim) {
+            int64_t inner_dim = num_heads * head_dim;  // Attention width; not required to equal hidden_size.
+            blocks["norm1"]     = std::make_shared<RMSNorm>(hidden_size, 1e-6f);
+            blocks["norm2"]     = std::make_shared<RMSNorm>(hidden_size, 1e-6f);
+            blocks["qkv"]       = std::make_shared<Linear>(hidden_size, inner_dim * 3, true);
+            blocks["attn_proj"] = std::make_shared<Linear>(inner_dim, hidden_size, true);
+            blocks["mlp"]       = std::make_shared<SwiGLUMlp>(hidden_size, static_cast<int64_t>(hidden_size * mlp_ratio));
+            blocks["q_norm"]    = std::make_shared<RMSNorm>(head_dim, 1e-6f);  // Per-head norms operate on head_dim.
+            blocks["k_norm"]    = std::make_shared<RMSNorm>(head_dim, 1e-6f);
+        }
+
+        ggml_tensor* forward(GGMLRunnerContext* ctx, ggml_tensor* txt, ggml_tensor* pe) {  // Returns the updated txt tensor.
+            auto norm1     = std::dynamic_pointer_cast<RMSNorm>(blocks["norm1"]);
+            auto norm2     = std::dynamic_pointer_cast<RMSNorm>(blocks["norm2"]);
+            auto qkv_proj  = std::dynamic_pointer_cast<Linear>(blocks["qkv"]);
+            auto attn_proj = std::dynamic_pointer_cast<Linear>(blocks["attn_proj"]);
+            auto mlp       = std::dynamic_pointer_cast<SwiGLUMlp>(blocks["mlp"]);
+            auto q_norm    = std::dynamic_pointer_cast<RMSNorm>(blocks["q_norm"]);
+            auto k_norm    = std::dynamic_pointer_cast<RMSNorm>(blocks["k_norm"]);
+
+            auto qkv = split_qkv(ctx->ggml_ctx, qkv_proj->forward(ctx, norm1->forward(ctx, txt)), num_heads, head_dim);  // qkv[0..2] = q, k, v views.
+            auto q   = q_norm->forward(ctx, qkv[0]);
+            auto k   = k_norm->forward(ctx, qkv[1]);
+            auto v   = qkv[2];  // v is used without normalization.
+            auto out = Rope::attention(ctx, q, k, v, pe, nullptr, 1.0f, false);  // NOTE(review): the nullptr / 1.0f / false arguments are defined by Rope::attention, not visible here.
+            txt      = ggml_add(ctx->ggml_ctx, txt, attn_proj->forward(ctx, out));                // Residual around attention.
+            txt      = ggml_add(ctx->ggml_ctx, txt, mlp->forward(ctx, norm2->forward(ctx, txt)));  // Residual around the MLP.
+            return txt;
+        }
+    };
+
+    struct DoubleStreamDiTBlock : public GGMLBlock {  // Image and text streams keep separate weights but attend jointly over the concatenated sequence.
+        int64_t num_heads;
+        int64_t head_dim;
+
+        DoubleStreamDiTBlock(int64_t hidden_size, int64_t txt_hidden_size, int64_t num_heads, int64_t head_dim, float mlp_ratio)
+            : num_heads(num_heads), head_dim(head_dim) {
+            int64_t inner_dim = num_heads * head_dim;  // Shared attention width for both streams.
+            blocks["img_norm1"]     = std::make_shared<RMSNorm>(hidden_size, 1e-6f);
+            blocks["img_norm2"]     = std::make_shared<RMSNorm>(hidden_size, 1e-6f);
+            blocks["txt_norm1"]     = std::make_shared<RMSNorm>(txt_hidden_size, 1e-6f);
+            blocks["txt_norm2"]     = std::make_shared<RMSNorm>(txt_hidden_size, 1e-6f);
+            blocks["img_qkv"]       = std::make_shared<Linear>(hidden_size, inner_dim * 3, true);
+            blocks["txt_qkv"]       = std::make_shared<Linear>(txt_hidden_size, inner_dim * 3, true);
+            blocks["q_norm"]        = std::make_shared<RMSNorm>(head_dim, 1e-6f);  // One q_norm / k_norm pair serves both streams.
+            blocks["k_norm"]        = std::make_shared<RMSNorm>(head_dim, 1e-6f);
+            blocks["img_attn_proj"] = std::make_shared<Linear>(inner_dim, hidden_size, true);
+            blocks["txt_attn_proj"] = std::make_shared<Linear>(inner_dim, txt_hidden_size, true);
+            blocks["img_mlp"]       = std::make_shared<SwiGLUMlp>(hidden_size, static_cast<int64_t>(hidden_size * mlp_ratio));
+            blocks["txt_mlp"]       = std::make_shared<SwiGLUMlp>(txt_hidden_size, static_cast<int64_t>(txt_hidden_size * mlp_ratio));
+        }
+
+        std::pair<ggml_tensor*, ggml_tensor*> forward(GGMLRunnerContext* ctx,  // Returns the updated {img, txt} pair.
+                                                      ggml_tensor* img,
+                                                      ggml_tensor* txt,
+                                                      ggml_tensor* pe) {
+            auto img_norm1 = std::dynamic_pointer_cast<RMSNorm>(blocks["img_norm1"]);
+            auto img_norm2 = std::dynamic_pointer_cast<RMSNorm>(blocks["img_norm2"]);
+            auto txt_norm1 = std::dynamic_pointer_cast<RMSNorm>(blocks["txt_norm1"]);
+            auto txt_norm2 = std::dynamic_pointer_cast<RMSNorm>(blocks["txt_norm2"]);
+            auto img_qkv_p = std::dynamic_pointer_cast<Linear>(blocks["img_qkv"]);
+            auto txt_qkv_p = std::dynamic_pointer_cast<Linear>(blocks["txt_qkv"]);
+            auto q_norm    = std::dynamic_pointer_cast<RMSNorm>(blocks["q_norm"]);
+            auto k_norm    = std::dynamic_pointer_cast<RMSNorm>(blocks["k_norm"]);
+            auto img_proj  = std::dynamic_pointer_cast<Linear>(blocks["img_attn_proj"]);
+            auto txt_proj  = std::dynamic_pointer_cast<Linear>(blocks["txt_attn_proj"]);
+            auto img_mlp   = std::dynamic_pointer_cast<SwiGLUMlp>(blocks["img_mlp"]);
+            auto txt_mlp   = std::dynamic_pointer_cast<SwiGLUMlp>(blocks["txt_mlp"]);
+
+            int64_t li = img->ne[1];  // Image sequence length.
+            int64_t lt = txt->ne[1];  // Text sequence length.
+
+            auto img_qkv = split_qkv(ctx->ggml_ctx, img_qkv_p->forward(ctx, img_norm1->forward(ctx, img)), num_heads, head_dim);
+            auto txt_qkv = split_qkv(ctx->ggml_ctx, txt_qkv_p->forward(ctx, txt_norm1->forward(ctx, txt)), num_heads, head_dim);
+
+            auto q = ggml_concat(ctx->ggml_ctx, q_norm->forward(ctx, txt_qkv[0]), q_norm->forward(ctx, img_qkv[0]), 2);  // Text first, then image, along axis 2.
+            auto k = ggml_concat(ctx->ggml_ctx, k_norm->forward(ctx, txt_qkv[1]), k_norm->forward(ctx, img_qkv[1]), 2);
+            auto v = ggml_concat(ctx->ggml_ctx, txt_qkv[2], img_qkv[2], 2);  // v is concatenated without normalization.
+
+            auto out = Rope::attention(ctx, q, k, v, pe, nullptr, 1.0f, false);  // NOTE(review): the nullptr / 1.0f / false arguments are defined by Rope::attention, not visible here.
+            auto out_txt = ggml_ext_slice(ctx->ggml_ctx, out, 1, 0, lt);         // First lt positions on axis 1 belong to text.
+            auto out_img = ggml_ext_slice(ctx->ggml_ctx, out, 1, lt, lt + li);   // The following li positions belong to the image.
+
+            img = ggml_add(ctx->ggml_ctx, img, img_proj->forward(ctx, out_img));  // Attention residuals, one per stream.
+            txt = ggml_add(ctx->ggml_ctx, txt, txt_proj->forward(ctx, out_txt));
+            img = ggml_add(ctx->ggml_ctx, img, img_mlp->forward(ctx, img_norm2->forward(ctx, img)));  // MLP residuals, one per stream.
+            txt = ggml_add(ctx->ggml_ctx, txt, txt_mlp->forward(ctx, txt_norm2->forward(ctx, txt)));
+            return {img, txt};
+        }
+    };
+
+    // Output head: RMSNorm "norm_final", then Linear "linear" producing patch_size * patch_size * out_channels features.
+    struct FinalLayer : public GGMLBlock {
+        FinalLayer(int64_t hidden_size, int64_t patch_size, int64_t out_channels) {
+            blocks["norm_final"] = std::make_shared<RMSNorm>(hidden_size, 1e-6f);
+            blocks["linear"]     = std::make_shared<Linear>(hidden_size, patch_size * patch_size * out_channels, true);
+        }
+
+        ggml_tensor* forward(GGMLRunnerContext* ctx, ggml_tensor* x) {
+            auto normed = std::dynamic_pointer_cast<RMSNorm>(blocks["norm_final"])->forward(ctx, x);
+            return std::dynamic_pointer_cast<Linear>(blocks["linear"])->forward(ctx, normed);
+        }
+    };
+
+    struct MMJiT : public GGMLBlock {  // MiniT2I backbone: text preamble blocks, double-stream blocks, final projection
+        MiniT2IConfig config;
+        // Registers every sub-block under the name that forward() later uses to look it up.
+        MMJiT(const MiniT2IConfig& config)
+            : config(config) {
+            blocks["img_embedder"]    = std::make_shared<BottleneckPatchEmbed>(config.patch_size, config.in_channels, config.pca_channels, config.hidden_size);
+            blocks["txt_embedder"]    = std::make_shared<Linear>(config.txt_input_size, config.txt_hidden_size, false);
+            blocks["t_embedder"]      = std::make_shared<TimestepEmbedder>(config.cond_vec_size);  // NOTE(review): forward never consumes this block's output
+            blocks["pooled_embedder"] = std::make_shared<Linear>(config.txt_input_size, config.cond_vec_size, false);  // NOTE(review): same as t_embedder
+            for (int64_t i = 0; i < config.txt_preamble_depth; ++i) {
+                blocks["txt_preamble_blocks." + std::to_string(i)] = std::make_shared<PlainTextTransformerBlock>(config.txt_hidden_size, config.num_heads, config.head_dim, config.mlp_ratio);
+            }
+            for (int64_t i = 0; i < config.depth_double; ++i) {
+                blocks["double_blocks." + std::to_string(i)] = std::make_shared<DoubleStreamDiTBlock>(config.hidden_size, config.txt_hidden_size, config.num_heads, config.head_dim, config.mlp_ratio);
+            }
+            blocks["final_layer"] = std::make_shared<FinalLayer>(config.hidden_size, config.patch_size, config.in_channels);
+        }
+        // Adds the "mask_token" parameter (shape [txt_input_size, 1, 1]) on top of the sub-block parameters.
+        void init_params(ggml_context* ctx, const String2TensorStorage& tensor_storage_map = {}, const std::string prefix = "") override {
+            GGMLBlock::init_params(ctx, tensor_storage_map, prefix);
+            enum ggml_type wtype = get_type(prefix + "mask_token", tensor_storage_map, GGML_TYPE_F32);
+            params["mask_token"] = ggml_new_tensor_3d(ctx, wtype, config.txt_input_size, 1, 1);
+        }
+        // Returns context * mask + mask_token * (1 - mask); a null mask returns context unchanged.
+        ggml_tensor* apply_text_mask(GGMLRunnerContext* ctx, ggml_tensor* context, ggml_tensor* mask) {
+            if (mask == nullptr) {
+                return context;
+            }
+            mask = ggml_reshape_3d(ctx->ggml_ctx, mask, 1, mask->ne[0], mask->ne[1]);  // [ne0, ne1] -> [1, ne0, ne1]
+            mask = ggml_repeat(ctx->ggml_ctx, mask, context);  // expand to the full shape of context
+            auto keep = ggml_mul(ctx->ggml_ctx, context, mask);
+            auto inv  = ggml_sub(ctx->ggml_ctx, ggml_ext_ones_like(ctx->ggml_ctx, mask), mask);
+            auto mask_token = ggml_repeat(ctx->ggml_ctx, params["mask_token"], context);
+            return ggml_add(ctx->ggml_ctx, keep, ggml_mul(ctx->ggml_ctx, mask_token, inv));
+        }
+        // Averages context over its ne[1] axis and returns a tensor of shape [ne[0], ne[2]].
+        ggml_tensor* pool_context(GGMLRunnerContext* ctx, ggml_tensor* context) {
+            int64_t dim = context->ne[0];
+            int64_t len = context->ne[1];
+            int64_t N   = context->ne[2];
+            auto x = ggml_cont(ctx->ggml_ctx, ggml_permute(ctx->ggml_ctx, context, 1, 0, 2, 3));  // swap the first two axes
+            x      = ggml_reshape_3d(ctx->ggml_ctx, x, len, dim, N);
+            x      = ggml_mean(ctx->ggml_ctx, x);  // ggml_mean reduces ne[0], which now holds the len axis
+            x      = ggml_reshape_2d(ctx->ggml_ctx, x, dim, N);
+            return x;
+        }
+
+        ggml_tensor* forward(GGMLRunnerContext* ctx,
+                             ggml_tensor* img,
+                             ggml_tensor* t,
+                             ggml_tensor* context,
+                             ggml_tensor* mask,
+                             ggml_tensor* pos_embed,
+                             ggml_tensor* txt_pe,
+                             ggml_tensor* joint_pe) {
+            auto img_embedder    = std::dynamic_pointer_cast<BottleneckPatchEmbed>(blocks["img_embedder"]);
+            auto txt_embedder    = std::dynamic_pointer_cast<Linear>(blocks["txt_embedder"]);
+            auto t_embedder      = std::dynamic_pointer_cast<TimestepEmbedder>(blocks["t_embedder"]);
+            auto pooled_embedder = std::dynamic_pointer_cast<Linear>(blocks["pooled_embedder"]);
+            auto final_layer     = std::dynamic_pointer_cast<FinalLayer>(blocks["final_layer"]);
+
+            int64_t W  = img->ne[0];
+            int64_t H  = img->ne[1];
+            int64_t hp = H / config.patch_size;  // patch-grid height
+            int64_t wp = W / config.patch_size;  // patch-grid width
+
+            context = apply_text_mask(ctx, context, mask);  // blend in mask_token before any text embedding
+            auto x  = img_embedder->forward(ctx, img);
+            x       = ggml_add(ctx->ggml_ctx, x, pos_embed);
+
+            auto t_vec       = t_embedder->forward(ctx, t);
+            auto pooled_text = pool_context(ctx, context);
+            auto vec         = ggml_add(ctx->ggml_ctx, t_vec, pooled_embedder->forward(ctx, pooled_text));
+            SD_UNUSED(vec);
+
+            auto txt = txt_embedder->forward(ctx, context);
+            for (int64_t i = 0; i < config.txt_preamble_depth; ++i) {
+                auto block = std::dynamic_pointer_cast<PlainTextTransformerBlock>(blocks["txt_preamble_blocks." + std::to_string(i)]);
+                txt        = block->forward(ctx, txt, txt_pe);  // text-only blocks run before the two streams interact
+                sd::ggml_graph_cut::mark_graph_cut(txt, "minit2i.txt_preamble_blocks." + std::to_string(i), "txt");
+            }
+            for (int64_t i = 0; i < config.depth_double; ++i) {
+                auto block = std::dynamic_pointer_cast<DoubleStreamDiTBlock>(blocks["double_blocks." + std::to_string(i)]);
+                auto out   = block->forward(ctx, x, txt, joint_pe);  // each block returns the updated (image, text) pair
+                x          = out.first;
+                txt        = out.second;
+                sd::ggml_graph_cut::mark_graph_cut(x, "minit2i.double_blocks." + std::to_string(i), "x");
+                sd::ggml_graph_cut::mark_graph_cut(txt, "minit2i.double_blocks." + std::to_string(i), "txt");
+            }
+            auto combined = ggml_concat(ctx->ggml_ctx, txt, x, 1);  // text tokens first, image tokens second along ne[1]
+            auto out      = final_layer->forward(ctx, combined);
+            auto img_out  = ggml_ext_slice(ctx->ggml_ctx, out, 1, txt->ne[1], txt->ne[1] + x->ne[1]);  // keep only the image-token range
+            return DiT::unpatchify(ctx->ggml_ctx, img_out, hp, wp, static_cast<int>(config.patch_size), static_cast<int>(config.patch_size), false);
+        }
+    };
+
+    // Picks the tensor-name prefix under which the MiniT2I weights are stored, preferring the requested one.
+    inline std::string resolve_prefix(const String2TensorStorage& tensor_storage_map, const std::string& requested) {
+        auto has_weights = [&tensor_storage_map](const std::string& root) {
+            return tensor_storage_map.find(root + ".img_embedder.proj1.weight") != tensor_storage_map.end();
+        };
+        if (!requested.empty() && has_weights(requested)) {
+            return requested;
+        }
+        static const std::vector<std::string> known_roots = {"model.net", "model.diffusion_model.net", "model.diffusion_model.model.net"};
+        for (const auto& root : known_roots) {
+            if (has_weights(root)) {
+                return root;
+            }
+        }
+        return requested.empty() ? "model.net" : requested;
+    }
+
+    struct MiniT2IRunner : public DiffusionModelRunner {
+        MiniT2IConfig config;
+        MMJiT model;
+        std::vector<float> pos_embed_vec;
+        std::vector<float> txt_pe_vec;
+        std::vector<float> joint_pe_vec;
+
+        MiniT2IRunner(ggml_backend_t backend,
+                      const String2TensorStorage& tensor_storage_map      = {},
+                      const std::string prefix                            = "",  // passed through resolve_prefix before use
+                      std::shared_ptr<RunnerWeightManager> weight_manager = nullptr)
+            : DiffusionModelRunner(backend, resolve_prefix(tensor_storage_map, prefix), weight_manager),
+              config(MiniT2IConfig::detect_from_weights(tensor_storage_map, this->prefix)),  // config is read from the weights
+              model(config) {
+            model.init(params_ctx, tensor_storage_map, this->prefix);
+        }
+
+        std::string get_desc() override {
+            return "MiniT2I";
+        }
+        // Forwards to the model so that its parameter tensors are collected into tensors.
+        void get_param_tensors(std::map<std::string, ggml_tensor*>& tensors, const std::string& prefix) override {
+            model.get_param_tensors(tensors, prefix);
+        }
+
+        ggml_cgraph* build_graph(const sd::Tensor<float>& x_tensor,
+                                 const sd::Tensor<float>& timesteps_tensor,
+                                 const sd::Tensor<float>& context_tensor,
+                                 const sd::Tensor<float>& mask_tensor) {  // builds the graph for one model evaluation
+            ggml_cgraph* gf        = new_graph_custom(MINIT2I_GRAPH_SIZE);
+            ggml_tensor* x         = make_input(x_tensor);
+            ggml_tensor* timesteps = make_input(timesteps_tensor);
+            ggml_tensor* context   = make_input(context_tensor);
+            ggml_tensor* mask      = make_input(mask_tensor);
+
+            int64_t W        = x->ne[0];  // NOTE(review): W is unused; the patch grid below is derived from H only — confirm inputs are square
+            int64_t H        = x->ne[1];
+            int64_t img_side = H / config.patch_size;
+            int64_t txt_len  = context->ne[1];
+
+            pos_embed_vec = make_2d_sincos_pos_embed(static_cast<int>(img_side), static_cast<int>(config.hidden_size));
+            auto pos_embed = ggml_new_tensor_3d(compute_ctx, GGML_TYPE_F32, config.hidden_size, img_side * img_side, 1);
+            set_backend_tensor_data(pos_embed, pos_embed_vec.data());
+
+            txt_pe_vec = make_text_rope(static_cast<int>(txt_len), static_cast<int>(config.head_dim));
+            auto txt_pe = ggml_new_tensor_4d(compute_ctx, GGML_TYPE_F32, 2, 2, config.head_dim / 2, txt_len);
+            set_backend_tensor_data(txt_pe, txt_pe_vec.data());
+
+            auto img_pe_vec = make_vision_rope(static_cast<int>(img_side), static_cast<int>(config.head_dim));
+            joint_pe_vec    = txt_pe_vec;
+            joint_pe_vec.insert(joint_pe_vec.end(), img_pe_vec.begin(), img_pe_vec.end());
+            auto joint_pe = ggml_new_tensor_4d(compute_ctx, GGML_TYPE_F32, 2, 2, config.head_dim / 2, txt_len + img_side * img_side);
+            set_backend_tensor_data(joint_pe, joint_pe_vec.data());
+
+            auto runner_ctx = get_context();
+            auto out        = model.forward(&runner_ctx, x, timesteps, context, mask, pos_embed, txt_pe, joint_pe);
+            ggml_build_forward_expand(gf, out);
+            return gf;
+        }
+        // Builds the graph lazily through get_graph, runs it, and restores the result to x.dim() dimensions.
+        sd::Tensor<float> compute(int n_threads,
+                                  const sd::Tensor<float>& x,
+                                  const sd::Tensor<float>& timesteps,
+                                  const sd::Tensor<float>& context,
+                                  const sd::Tensor<float>& mask) {
+            auto get_graph = [&]() -> ggml_cgraph* {
+                return build_graph(x, timesteps, context, mask);
+            };
+            return restore_trailing_singleton_dims(GGMLRunner::compute<float>(get_graph, n_threads, false, false, false), x.dim());
+        }
+        // DiffusionModelRunner entry point: asserts all four inputs are set, then unpacks them.
+        sd::Tensor<float> compute(int n_threads,
+                                  const DiffusionParams& diffusion_params) override {
+            GGML_ASSERT(diffusion_params.x != nullptr);
+            GGML_ASSERT(diffusion_params.timesteps != nullptr);
+            GGML_ASSERT(diffusion_params.context != nullptr);
+            GGML_ASSERT(diffusion_params.y != nullptr);  // y is forwarded as the text mask
+            return compute(n_threads,
+                           *diffusion_params.x,
+                           *diffusion_params.timesteps,
+                           *diffusion_params.context,
+                           *diffusion_params.y);
+        }
+    };
+}  // namespace MiniT2I
+
+#endif  // __SD_MODEL_DIFFUSION_MINIT2I_HPP__
diff --git a/src/model/te/t5.hpp b/src/model/te/t5.hpp
index 7a92ec577..c7cfef2df 100644
--- a/src/model/te/t5.hpp
+++ b/src/model/te/t5.hpp
@@ -23,19 +23,72 @@ struct T5Config {
     int64_t vocab_size      = 32128;
     bool relative_attention = true;
 
-    static T5Config detect_from_weights(const String2TensorStorage& tensor_storage_map,
-                                        const std::string& prefix,
-                                        bool is_umt5 = false) {
-        (void)tensor_storage_map;
-        (void)prefix;
-        T5Config config;
-        if (is_umt5) {
-            config.vocab_size         = 256384;
-            config.relative_attention = false;
-        }
-        return config;
-    }
-};
+    // Infers the encoder configuration from checkpoint tensor shapes; values with no matching tensor keep their defaults.
+    static T5Config detect_from_weights(const String2TensorStorage& tensor_storage_map,
+                                        const std::string& prefix,
+                                        bool is_umt5 = false) {
+        T5Config config;
+        if (is_umt5) {
+            config.vocab_size         = 256384;
+            config.relative_attention = false;
+        }
+        auto find_tensor = [&](const std::string& suffix) -> const TensorStorage* {
+            for (const std::string& key : {prefix + "." + suffix, prefix + suffix}) {  // dotted key first, raw concatenation second
+                auto it = tensor_storage_map.find(key);
+                if (it != tensor_storage_map.end()) {
+                    return &it->second;
+                }
+            }
+            return nullptr;
+        };
+        if (const TensorStorage* shared = find_tensor("shared.weight")) {
+            if (shared->n_dims == 2) {
+                config.vocab_size = shared->ne[1];
+                config.model_dim  = shared->ne[0];
+            }
+        }
+        if (const TensorStorage* q = find_tensor("encoder.block.0.layer.0.SelfAttention.q.weight")) {
+            if (q->n_dims == 2) {
+                config.model_dim = q->ne[0];
+                int64_t inner_dim = q->ne[1];
+                // Flan-T5/T5 uses d_kv=64 for common sizes.
+                if (inner_dim % 64 == 0) {
+                    config.num_heads = inner_dim / 64;
+                }
+            }
+        }
+        if (const TensorStorage* wi = find_tensor("encoder.block.0.layer.1.DenseReluDense.wi_0.weight")) {
+            if (wi->n_dims == 2) {
+                config.model_dim = wi->ne[0];  // the last matching probe decides model_dim
+                config.ff_dim    = wi->ne[1];
+            }
+        }
+        std::string layer_prefix = prefix;  // loop-invariant, so it is built once instead of once per tensor
+        if (!layer_prefix.empty() && layer_prefix.back() != '.') {
+            layer_prefix += ".";
+        }
+        layer_prefix += "encoder.block.";
+        int64_t detected_layers = 0;
+        for (const auto& entry : tensor_storage_map) {
+            const std::string& name = entry.first;
+            if (!starts_with(name, layer_prefix)) {
+                continue;
+            }
+            const size_t pos    = layer_prefix.size();
+            const size_t dot    = name.find('.', pos);
+            const size_t digits = dot == std::string::npos ? 0 : dot - pos;
+            // Only a short, purely numeric block index counts, so stray keys cannot register as layer 0 or overflow.
+            if (digits == 0 || digits > 9 || name.find_first_not_of("0123456789", pos) != dot) {
+                continue;
+            }
+            detected_layers = std::max<int64_t>(detected_layers, strtoll(name.c_str() + pos, nullptr, 10) + 1);
+        }
+        if (detected_layers > 0) {
+            config.num_layers = detected_layers;
+        }
+        return config;
+    }
+};
 
 class T5LayerNorm : public UnaryBlock {
 protected:
diff --git a/src/model/vae/vae.hpp b/src/model/vae/vae.hpp
index 34a0d9663..8b8c46ded 100644
--- a/src/model/vae/vae.hpp
+++ b/src/model/vae/vae.hpp
@@ -78,7 +78,7 @@ struct VAE : public GGMLRunner {
             scale_factor = 16;
         } else if (sd_version_uses_flux2_vae(version)) {
             scale_factor = 16;
-        } else if (version == VERSION_CHROMA_RADIANCE || version == VERSION_HIDREAM_O1) {
+        } else if (version == VERSION_CHROMA_RADIANCE || version == VERSION_HIDREAM_O1 || sd_version_is_minit2i(version)) {
             scale_factor = 1;
         }
         return scale_factor;
diff --git a/src/model_loader.cpp b/src/model_loader.cpp
index c239e22d2..9c702897e 100644
--- a/src/model_loader.cpp
+++ b/src/model_loader.cpp
@@ -470,6 +470,17 @@ SDVersion ModelLoader::get_sd_version() {
             tensor_storage_map.find("model.diffusion_model.transformer_blocks.0.img_mlp.w1.weight") != tensor_storage_map.end()) {
             return VERSION_LENS;
         }
+        if ((tensor_storage_map.find("model.net.img_embedder.proj1.weight") != tensor_storage_map.end() &&
+             tensor_storage_map.find("model.net.double_blocks.0.img_qkv.weight") != tensor_storage_map.end() &&
+             tensor_storage_map.find("model.net.txt_embedder.weight") != tensor_storage_map.end()) ||
+            (tensor_storage_map.find("model.diffusion_model.net.img_embedder.proj1.weight") != tensor_storage_map.end() &&
+             tensor_storage_map.find("model.diffusion_model.net.double_blocks.0.img_qkv.weight") != tensor_storage_map.end() &&
+             tensor_storage_map.find("model.diffusion_model.net.txt_embedder.weight") != tensor_storage_map.end()) ||
+            (tensor_storage_map.find("model.diffusion_model.model.net.img_embedder.proj1.weight") != tensor_storage_map.end() &&
+             tensor_storage_map.find("model.diffusion_model.model.net.double_blocks.0.img_qkv.weight") != tensor_storage_map.end() &&
+             tensor_storage_map.find("model.diffusion_model.model.net.txt_embedder.weight") != tensor_storage_map.end())) {
+            return VERSION_MINIT2I;
+        }
         if (tensor_storage.name.find("model.diffusion_model.transformer_blocks.0.img_mod.1.weight") != std::string::npos) {
             return VERSION_QWEN_IMAGE;
         }
diff --git a/src/stable-diffusion.cpp b/src/stable-diffusion.cpp
index a1623252c..2ae2e9651 100644
--- a/src/stable-diffusion.cpp
+++ b/src/stable-diffusion.cpp
@@ -29,6 +29,7 @@
 #include "model/diffusion/krea2.hpp"
 #include "model/diffusion/lens.hpp"
 #include "model/diffusion/ltxv.hpp"
+#include "model/diffusion/minit2i.hpp"
 #include "model/diffusion/mmdit.hpp"
 #include "model/diffusion/model.hpp"
 #include "model/diffusion/pid.hpp"
@@ -93,6 +94,7 @@ const char* model_version_to_str[] = {
     "Ovis Image",
     "Ernie Image",
     "Lens",
+    "MiniT2I",
     "Longcat-Image",
     "PiD",
     "Ideogram 4",
@@ -785,6 +787,14 @@ class StableDiffusionGGML {
                                                                                tensor_storage_map,
                                                                                "model",
                                                                                model_manager);
+            } else if (sd_version_is_minit2i(version)) {
+                cond_stage_model = std::make_shared<MiniT2IConditioner>(backend_for(SDBackendModule::TE),
+                                                                        tensor_storage_map,
+                                                                        model_manager);
+                diffusion_model  = std::make_shared<MiniT2I::MiniT2IRunner>(backend_for(SDBackendModule::DIFFUSION),
+                                                                            tensor_storage_map,
+                                                                            "",
+                                                                            model_manager);
             } else if (sd_version_is_anima(version)) {
                 cond_stage_model = std::make_shared<AnimaConditioner>(backend_for(SDBackendModule::TE),
                                                                       tensor_storage_map,
@@ -958,7 +968,7 @@ class StableDiffusionGGML {
                 }
             };
 
-            if (version == VERSION_CHROMA_RADIANCE || version == VERSION_HIDREAM_O1) {
+            if (version == VERSION_CHROMA_RADIANCE || version == VERSION_HIDREAM_O1 || sd_version_is_minit2i(version)) {
                 LOG_INFO("using FakeVAE");
                 first_stage_model = std::make_shared<FakeVAE>(version,
                                                               backend_for(SDBackendModule::VAE),
@@ -2032,11 +2042,93 @@ class StableDiffusionGGML {
         }
 
         int64_t last_progress_us     = ggml_time_us();
+        SamplePreviewContext preview = prepare_sample_preview_context();
+
+        if (sd_version_is_minit2i(version)) {  // MiniT2I uses its own fixed-step sampler and returns before the generic path
+            if (noise.empty()) {
+                LOG_ERROR("MiniT2I sampling requires initial noise");
+                return {};
+            }
+            if (cond.c_crossattn.empty() || cond.c_vector.empty()) {
+                LOG_ERROR("MiniT2I requires T5 hidden states and prompt mask");
+                return {};
+            }
+            size_t minit2i_steps = steps > 0 ? steps : 100;  // falls back to 100 steps when none are requested
+            sd::Tensor<float> x_t = noise * 2.0f;  // NOTE(review): the 2.0 scale is not explained here — presumably from the reference sampler, confirm
+            sd::Tensor<float> denoised = x_t;
+            sd::Tensor<float> uncond_mask = sd::Tensor<float>::zeros_like(cond.c_vector);  // all-zero mask for the unconditional branch
+            // Evaluates the model once at time t_value with the given prompt mask; returns an empty tensor on failure.
+            auto run_minit2i = [&](const sd::Tensor<float>& x,
+                                   float t_value,
+                                   const sd::Tensor<float>& mask) -> sd::Tensor<float> {
+                int64_t batch = x.dim() >= 4 ? x.shape()[3] : 1;  // batch is the 4th dimension when present
+                if (batch <= 0) {
+                    LOG_ERROR("MiniT2I got invalid input shape for sampling");
+                    return {};
+                }
+                LOG_DEBUG("MiniT2I sampling input shape: dim=%" PRId64 ", batch=%" PRId64,
+                          x.dim(),
+                          batch);
+                std::vector<float> t_vec(static_cast<size_t>(batch), t_value);  // one identical timestep per batch entry
+                const int64_t t_vec_size = static_cast<int64_t>(t_vec.size());
+                sd::Tensor<float> timesteps_tensor({t_vec_size}, std::move(t_vec));
+                DiffusionParams diffusion_params;
+                diffusion_params.x         = &x;
+                diffusion_params.timesteps = &timesteps_tensor;
+                diffusion_params.context   = &cond.c_crossattn;
+                diffusion_params.y         = &mask;  // the prompt mask travels in the y slot
+                auto out = work_diffusion_model->compute(n_threads, diffusion_params);
+                if (out.empty()) {
+                    LOG_ERROR("MiniT2I diffusion model compute failed");
+                    return {};
+                }
+                return out;
+            };
+            // Fixed-step integration from t = 0 to t = 1 in minit2i_steps equal steps.
+            pretty_progress(0, static_cast<int>(minit2i_steps), 0);
+            last_progress_us = ggml_time_us();
+            for (size_t i = 0; i < minit2i_steps; ++i) {
+                if (get_cancel_flag() == SD_CANCEL_ALL) {
+                    LOG_DEBUG("cancelling generation");
+                    return {};
+                }
+                float t_cur  = static_cast<float>(i) / static_cast<float>(minit2i_steps);
+                float t_next = static_cast<float>(i + 1) / static_cast<float>(minit2i_steps);
+
+                if (sd_should_preview_noisy() && preview.callback != nullptr) {
+                    preview_image(static_cast<int>(i + 1), x_t, version, preview.mode, preview.callback, preview.data, true);
+                }
+                // Two model evaluations per step: the prompt mask for the conditional branch, the zero mask for the other.
+                auto cond_x0 = run_minit2i(x_t, t_cur, cond.c_vector);
+                if (cond_x0.empty()) {
+                    return {};
+                }
+                auto uncond_x0 = run_minit2i(x_t, t_cur, uncond_mask);
+                if (uncond_x0.empty()) {
+                    return {};
+                }
+                float denom = std::max(1.0f - t_cur, 0.001f);  // keeps the division finite as t_cur approaches 1
+                auto cond_v = (cond_x0 - x_t) / denom;  // model output is treated as an x0 estimate and turned into a velocity
+                auto uncond_v = (uncond_x0 - x_t) / denom;
+                auto v = uncond_v + (cond_v - uncond_v) * cfg_scale;  // NOTE(review): equals cond_v when cfg_scale == 1, yet the unconditional pass still runs
+                x_t += v * (t_next - t_cur);  // explicit Euler step
+                denoised = x_t;
+
+                if (sd_should_preview_denoised() && preview.callback != nullptr) {
+                    preview_image(static_cast<int>(i + 1), denoised, version, preview.mode, preview.callback, preview.data, false);
+                }
+                report_sample_progress(static_cast<int>(i + 1), minit2i_steps, &last_progress_us);
+            }
+            if (work_diffusion_model) {  // NOTE(review): already dereferenced in run_minit2i, so this check is redundant; the early returns above skip this cleanup
+                work_diffusion_model->free_compute_buffer();
+            }
+            return denoised;
+        }
+
         sd::Tensor<float> x_t        = !noise.empty()
                                            ? denoiser->noise_scaling(sigmas[0], noise, init_latent)
                                            : init_latent;
         sd::Tensor<float> denoised   = x_t;
-        SamplePreviewContext preview = prepare_sample_preview_context();
 
         auto denoise = [&](const sd::Tensor<float>& x, float sigma, int step) -> sd::guidance::GuiderOutput {
             if (get_cancel_flag() == SD_CANCEL_ALL) {
@@ -2335,6 +2427,8 @@ class StableDiffusionGGML {
                 latent_channel = 3;
             } else if (version == VERSION_CHROMA_RADIANCE) {
                 latent_channel = 3;
+            } else if (sd_version_is_minit2i(version)) {
+                latent_channel = 3;
             } else if (sd_version_is_pid(version)) {
                 latent_channel = 3;
             } else if (sd_version_is_sefi_image(version)) {
@@ -2416,7 +2510,7 @@ class StableDiffusionGGML {
     }
 
     sd::Tensor<float> decode_first_stage(const sd::Tensor<float>& x, bool decode_video = false) {
-        if (sd_version_is_pid(version)) {
+        if (sd_version_is_pid(version) || sd_version_is_minit2i(version)) {
             return sd::ops::clamp((x + 1.f) * 0.5f, 0.0f, 1.0f);
         }
         auto latents = first_stage_model->diffusion_to_vae_latents(x);

From 9153c16a5545c0cad500d9a94a2148e3ac453576 Mon Sep 17 00:00:00 2001
From: KenForever1 <2962666398@qq.com>
Date: Fri, 19 Jun 2026 15:41:20 +0800
Subject: [PATCH 2/7] Optimize MiniT2I position cache

Cache MiniT2I positional embeddings and text/vision RoPE tensors in a runner-level backend buffer. This avoids regenerating and uploading the same step-invariant constants for every denoise graph while preserving model batch semantics.
---
 src/model/diffusion/minit2i.hpp | 98 +++++++++++++++++++++++++++------
 1 file changed, 80 insertions(+), 18 deletions(-)

diff --git a/src/model/diffusion/minit2i.hpp b/src/model/diffusion/minit2i.hpp
index d69f1f4ac..ea1675273 100644
--- a/src/model/diffusion/minit2i.hpp
+++ b/src/model/diffusion/minit2i.hpp
@@ -487,9 +487,15 @@ namespace MiniT2I {
     struct MiniT2IRunner : public DiffusionModelRunner {
         MiniT2IConfig config;
         MMJiT model;
-        std::vector<float> pos_embed_vec;
-        std::vector<float> txt_pe_vec;
-        std::vector<float> joint_pe_vec;
+        ggml_context* position_cache_ctx            = nullptr;
+        ggml_backend_buffer_t position_cache_buffer = nullptr;
+        ggml_tensor* cached_pos_embed               = nullptr;
+        ggml_tensor* cached_txt_pe                  = nullptr;
+        ggml_tensor* cached_joint_pe                = nullptr;
+        int64_t cached_img_side                     = -1;
+        int64_t cached_txt_len                      = -1;
+        int64_t cached_hidden_size                  = -1;
+        int64_t cached_head_dim                     = -1;
 
         MiniT2IRunner(ggml_backend_t backend,
                       const String2TensorStorage& tensor_storage_map      = {},
@@ -501,6 +507,10 @@ namespace MiniT2I {
             model.init(params_ctx, tensor_storage_map, this->prefix);
         }
 
+        ~MiniT2IRunner() override {  // releases the position-cache buffer and context held by this runner
+            free_position_cache();
+        }
+
         std::string get_desc() override {
             return "MiniT2I";
         }
@@ -509,6 +519,71 @@ namespace MiniT2I {
             model.get_param_tensors(tensors, prefix);
         }
 
+        // Releases the cached positional tensors; safe to call when nothing is cached.
+        void free_position_cache() {
+            if (position_cache_buffer != nullptr) {
+                ggml_backend_buffer_free(position_cache_buffer);
+            }
+            if (position_cache_ctx != nullptr) {
+                ggml_free(position_cache_ctx);
+            }
+            position_cache_buffer = nullptr;
+            position_cache_ctx    = nullptr;
+            // Reset the tensors and the cache key so the next ensure_position_cache call rebuilds everything.
+            cached_pos_embed = cached_txt_pe = cached_joint_pe = nullptr;
+            cached_img_side    = -1;
+            cached_txt_len     = -1;
+            cached_hidden_size = -1;
+            cached_head_dim    = -1;
+        }
+
+        void ensure_position_cache(int64_t img_side, int64_t txt_len) {  // rebuilds the cached tensors unless the key below still matches
+            if (cached_img_side == img_side &&
+                cached_txt_len == txt_len &&
+                cached_hidden_size == config.hidden_size &&
+                cached_head_dim == config.head_dim &&
+                cached_pos_embed != nullptr &&
+                cached_txt_pe != nullptr &&
+                cached_joint_pe != nullptr) {
+                return;
+            }
+
+            free_position_cache();
+            // Build the host-side tables first; joint_pe is the text RoPE table followed by the vision RoPE table.
+            auto pos_embed_vec = make_2d_sincos_pos_embed(static_cast<int>(img_side), static_cast<int>(config.hidden_size));
+            auto txt_pe_vec    = make_text_rope(static_cast<int>(txt_len), static_cast<int>(config.head_dim));
+            auto img_pe_vec = make_vision_rope(static_cast<int>(img_side), static_cast<int>(config.head_dim));
+            auto joint_pe_vec = txt_pe_vec;
+            joint_pe_vec.insert(joint_pe_vec.end(), img_pe_vec.begin(), img_pe_vec.end());
+            // Metadata-only context (no_alloc = true) sized for exactly the three tensor headers.
+            ggml_init_params params;
+            params.mem_size   = static_cast<size_t>(3 * ggml_tensor_overhead());
+            params.mem_buffer = nullptr;
+            params.no_alloc   = true;
+            position_cache_ctx = ggml_init(params);
+            GGML_ASSERT(position_cache_ctx != nullptr);
+
+            cached_pos_embed = ggml_new_tensor_3d(position_cache_ctx, GGML_TYPE_F32, config.hidden_size, img_side * img_side, 1);
+            ggml_set_name(cached_pos_embed, "minit2i.pos_embed");
+            cached_txt_pe = ggml_new_tensor_4d(position_cache_ctx, GGML_TYPE_F32, 2, 2, config.head_dim / 2, txt_len);
+            ggml_set_name(cached_txt_pe, "minit2i.txt_pe");
+            cached_joint_pe = ggml_new_tensor_4d(position_cache_ctx, GGML_TYPE_F32, 2, 2, config.head_dim / 2, txt_len + img_side * img_side);
+            ggml_set_name(cached_joint_pe, "minit2i.joint_pe");
+            // Allocate all three tensors in one buffer on runtime_backend, then upload the tables.
+            position_cache_buffer = ggml_backend_alloc_ctx_tensors(position_cache_ctx, runtime_backend);
+            GGML_ASSERT(position_cache_buffer != nullptr);
+            ggml_backend_buffer_set_usage(position_cache_buffer, GGML_BACKEND_BUFFER_USAGE_WEIGHTS);
+            ggml_backend_tensor_set(cached_pos_embed, pos_embed_vec.data(), 0, ggml_nbytes(cached_pos_embed));
+            ggml_backend_tensor_set(cached_txt_pe, txt_pe_vec.data(), 0, ggml_nbytes(cached_txt_pe));
+            ggml_backend_tensor_set(cached_joint_pe, joint_pe_vec.data(), 0, ggml_nbytes(cached_joint_pe));
+            ggml_backend_synchronize(runtime_backend);  // the host vectors are locals, so finish the uploads before returning
+            // Record the key this cache was built for.
+            cached_img_side    = img_side;
+            cached_txt_len     = txt_len;
+            cached_hidden_size = config.hidden_size;
+            cached_head_dim    = config.head_dim;
+        }
+
         ggml_cgraph* build_graph(const sd::Tensor<float>& x_tensor,
                                  const sd::Tensor<float>& timesteps_tensor,
                                  const sd::Tensor<float>& context_tensor,
@@ -523,23 +598,10 @@ namespace MiniT2I {
             int64_t H        = x->ne[1];
             int64_t img_side = H / config.patch_size;
             int64_t txt_len  = context->ne[1];
-
-            pos_embed_vec = make_2d_sincos_pos_embed(static_cast<int>(img_side), static_cast<int>(config.hidden_size));
-            auto pos_embed = ggml_new_tensor_3d(compute_ctx, GGML_TYPE_F32, config.hidden_size, img_side * img_side, 1);
-            set_backend_tensor_data(pos_embed, pos_embed_vec.data());
-
-            txt_pe_vec = make_text_rope(static_cast<int>(txt_len), static_cast<int>(config.head_dim));
-            auto txt_pe = ggml_new_tensor_4d(compute_ctx, GGML_TYPE_F32, 2, 2, config.head_dim / 2, txt_len);
-            set_backend_tensor_data(txt_pe, txt_pe_vec.data());
-
-            auto img_pe_vec = make_vision_rope(static_cast<int>(img_side), static_cast<int>(config.head_dim));
-            joint_pe_vec    = txt_pe_vec;
-            joint_pe_vec.insert(joint_pe_vec.end(), img_pe_vec.begin(), img_pe_vec.end());
-            auto joint_pe = ggml_new_tensor_4d(compute_ctx, GGML_TYPE_F32, 2, 2, config.head_dim / 2, txt_len + img_side * img_side);
-            set_backend_tensor_data(joint_pe, joint_pe_vec.data());
+            ensure_position_cache(img_side, txt_len);
 
             auto runner_ctx = get_context();
-            auto out        = model.forward(&runner_ctx, x, timesteps, context, mask, pos_embed, txt_pe, joint_pe);
+            auto out        = model.forward(&runner_ctx, x, timesteps, context, mask, cached_pos_embed, cached_txt_pe, cached_joint_pe);
             ggml_build_forward_expand(gf, out);
             return gf;
         }

From 09221cd996cddf9a3a1ea6f3303ca431194d661c Mon Sep 17 00:00:00 2001
From: KenForever1 <2962666398@qq.com>
Date: Fri, 19 Jun 2026 16:29:32 +0800
Subject: [PATCH 3/7] Remove unused MiniT2I conditioning branch

Drop the unused timestep and pooled-text vec path from MiniT2I graph construction. The Python reference currently passes this vec through unused block/final-layer parameters, and local validation produced identical output hashes before and after the cleanup.
---
 src/model/diffusion/minit2i.hpp | 18 +++++-------------
 1 file changed, 5 insertions(+), 13 deletions(-)

diff --git a/src/model/diffusion/minit2i.hpp b/src/model/diffusion/minit2i.hpp
index ea1675273..e3c9dd538 100644
--- a/src/model/diffusion/minit2i.hpp
+++ b/src/model/diffusion/minit2i.hpp
@@ -420,17 +420,14 @@ namespace MiniT2I {
 
         ggml_tensor* forward(GGMLRunnerContext* ctx,
                              ggml_tensor* img,
-                             ggml_tensor* t,
                              ggml_tensor* context,
                              ggml_tensor* mask,
                              ggml_tensor* pos_embed,
                              ggml_tensor* txt_pe,
                              ggml_tensor* joint_pe) {
-            auto img_embedder    = std::dynamic_pointer_cast<BottleneckPatchEmbed>(blocks["img_embedder"]);
-            auto txt_embedder    = std::dynamic_pointer_cast<Linear>(blocks["txt_embedder"]);
-            auto t_embedder      = std::dynamic_pointer_cast<TimestepEmbedder>(blocks["t_embedder"]);
-            auto pooled_embedder = std::dynamic_pointer_cast<Linear>(blocks["pooled_embedder"]);
-            auto final_layer     = std::dynamic_pointer_cast<FinalLayer>(blocks["final_layer"]);
+            auto img_embedder = std::dynamic_pointer_cast<BottleneckPatchEmbed>(blocks["img_embedder"]);
+            auto txt_embedder = std::dynamic_pointer_cast<Linear>(blocks["txt_embedder"]);
+            auto final_layer  = std::dynamic_pointer_cast<FinalLayer>(blocks["final_layer"]);
 
             int64_t W  = img->ne[0];
             int64_t H  = img->ne[1];
@@ -441,11 +438,6 @@ namespace MiniT2I {
             auto x  = img_embedder->forward(ctx, img);
             x       = ggml_add(ctx->ggml_ctx, x, pos_embed);
 
-            auto t_vec       = t_embedder->forward(ctx, t);
-            auto pooled_text = pool_context(ctx, context);
-            auto vec         = ggml_add(ctx->ggml_ctx, t_vec, pooled_embedder->forward(ctx, pooled_text));
-            SD_UNUSED(vec);
-
             auto txt = txt_embedder->forward(ctx, context);
             for (int64_t i = 0; i < config.txt_preamble_depth; ++i) {
                 auto block = std::dynamic_pointer_cast<PlainTextTransformerBlock>(blocks["txt_preamble_blocks." + std::to_string(i)]);
@@ -590,9 +582,9 @@ namespace MiniT2I {
                                  const sd::Tensor<float>& mask_tensor) {
             ggml_cgraph* gf        = new_graph_custom(MINIT2I_GRAPH_SIZE);
             ggml_tensor* x         = make_input(x_tensor);
-            ggml_tensor* timesteps = make_input(timesteps_tensor);
             ggml_tensor* context   = make_input(context_tensor);
             ggml_tensor* mask      = make_input(mask_tensor);
+            SD_UNUSED(timesteps_tensor);
 
             int64_t W        = x->ne[0];
             int64_t H        = x->ne[1];
@@ -601,7 +593,7 @@ namespace MiniT2I {
             ensure_position_cache(img_side, txt_len);
 
             auto runner_ctx = get_context();
-            auto out        = model.forward(&runner_ctx, x, timesteps, context, mask, cached_pos_embed, cached_txt_pe, cached_joint_pe);
+            auto out        = model.forward(&runner_ctx, x, context, mask, cached_pos_embed, cached_txt_pe, cached_joint_pe);
             ggml_build_forward_expand(gf, out);
             return gf;
         }

From 1fc4ed3a1508f25632ee26b8e45f9fc4062c7582 Mon Sep 17 00:00:00 2001
From: KenForever1 <2962666398@qq.com>
Date: Wed, 1 Jul 2026 19:56:27 +0800
Subject: [PATCH 4/7] Address MiniT2I PR review feedback

- Simplify model version detection to a single representative weight check
- Remove resolve_prefix; use fixed prefix with --diffusion-model
- Add docs/minit2i.md and README entry
---
 README.md                       |  1 +
 docs/minit2i.md                 | 48 +++++++++++++++++++++++++++++++++
 src/model/diffusion/minit2i.hpp | 19 +------------
 src/model_loader.cpp            | 10 +------
 src/stable-diffusion.cpp        |  2 +-
 5 files changed, 52 insertions(+), 28 deletions(-)
 create mode 100644 docs/minit2i.md

diff --git a/README.md b/README.md
index 6b0e5ef0f..d7127bb18 100644
--- a/README.md
+++ b/README.md
@@ -48,6 +48,7 @@ API and command-line option may change frequently.***
     - [PiD](./docs/pid.md)
     - [LongCat Image](./docs/longcat_image.md)
     - [Z-Image](./docs/z_image.md)
+    - [MiniT2I](./docs/minit2i.md)
     - [Ovis-Image](./docs/ovis_image.md)
     - [Anima](./docs/anima.md)
     - [ERNIE-Image](./docs/ernie_image.md)
diff --git a/docs/minit2i.md b/docs/minit2i.md
new file mode 100644
index 000000000..78c854dfa
--- /dev/null
+++ b/docs/minit2i.md
@@ -0,0 +1,48 @@
+# How to Use
+
+MiniT2I uses a MiniT2I diffusion transformer and `google/flan-t5-large` as the text encoder.
+
+## Download weights
+
+- Download MiniT2I diffusion model
+    - safetensors: https://huggingface.co/MiniT2I/minit2i-b-16/tree/main/transformer (`diffusion_pytorch_model.safetensors`)
+- Download flan-t5-large text encoder
+    - safetensors: https://huggingface.co/google/flan-t5-large/tree/main (`model.safetensors`)
+
+## Examples
+
+### Mac Metal
+
+```
+./bin/sd-cli \
+  --backend metal \
+  --diffusion-model ../models/minit2i/diffusion_pytorch_model.safetensors \
+  --t5xxl ../models/flan-t5-large/model.safetensors \
+  --prompt "a cat" \
+  --steps 100 \
+  --cfg-scale 6 \
+  --width 512 \
+  --height 512 \
+  --seed 42 \
+  --sampling-method euler \
+  --rng cpu \
+  --output minit2i_metal.png \
+  --threads 8
+```
+
+### CUDA with diffusion flash attention
+
+```
+./bin/sd-cli \
+  --diffusion-model ../models/minit2i/diffusion_pytorch_model.safetensors \
+  --t5xxl ../models/flan-t5-large/model.safetensors \
+  --prompt "a cat" \
+  --steps 100 \
+  --cfg-scale 6 \
+  --width 512 \
+  --height 512 \
+  --seed 42 \
+  --sampling-method euler \
+  --diffusion-fa \
+  --output minit2i_cuda.png
+```
diff --git a/src/model/diffusion/minit2i.hpp b/src/model/diffusion/minit2i.hpp
index e3c9dd538..f4698a76f 100644
--- a/src/model/diffusion/minit2i.hpp
+++ b/src/model/diffusion/minit2i.hpp
@@ -459,23 +459,6 @@ namespace MiniT2I {
         }
     };
 
-    inline std::string resolve_prefix(const String2TensorStorage& tensor_storage_map, const std::string& requested) {
-        if (!requested.empty() && tensor_storage_map.find(requested + ".img_embedder.proj1.weight") != tensor_storage_map.end()) {
-            return requested;
-        }
-        static const std::vector<std::string> candidates = {
-            "model.net",
-            "model.diffusion_model.net",
-            "model.diffusion_model.model.net",
-        };
-        for (const auto& candidate : candidates) {
-            if (tensor_storage_map.find(candidate + ".img_embedder.proj1.weight") != tensor_storage_map.end()) {
-                return candidate;
-            }
-        }
-        return requested.empty() ? "model.net" : requested;
-    }
-
     struct MiniT2IRunner : public DiffusionModelRunner {
         MiniT2IConfig config;
         MMJiT model;
@@ -493,7 +476,7 @@ namespace MiniT2I {
                       const String2TensorStorage& tensor_storage_map      = {},
                       const std::string prefix                            = "",
                       std::shared_ptr<RunnerWeightManager> weight_manager = nullptr)
-            : DiffusionModelRunner(backend, resolve_prefix(tensor_storage_map, prefix), weight_manager),
+            : DiffusionModelRunner(backend, prefix, weight_manager),
               config(MiniT2IConfig::detect_from_weights(tensor_storage_map, this->prefix)),
               model(config) {
             model.init(params_ctx, tensor_storage_map, this->prefix);
diff --git a/src/model_loader.cpp b/src/model_loader.cpp
index 9c702897e..b41187d0c 100644
--- a/src/model_loader.cpp
+++ b/src/model_loader.cpp
@@ -470,15 +470,7 @@ SDVersion ModelLoader::get_sd_version() {
             tensor_storage_map.find("model.diffusion_model.transformer_blocks.0.img_mlp.w1.weight") != tensor_storage_map.end()) {
             return VERSION_LENS;
         }
-        if ((tensor_storage_map.find("model.net.img_embedder.proj1.weight") != tensor_storage_map.end() &&
-             tensor_storage_map.find("model.net.double_blocks.0.img_qkv.weight") != tensor_storage_map.end() &&
-             tensor_storage_map.find("model.net.txt_embedder.weight") != tensor_storage_map.end()) ||
-            (tensor_storage_map.find("model.diffusion_model.net.img_embedder.proj1.weight") != tensor_storage_map.end() &&
-             tensor_storage_map.find("model.diffusion_model.net.double_blocks.0.img_qkv.weight") != tensor_storage_map.end() &&
-             tensor_storage_map.find("model.diffusion_model.net.txt_embedder.weight") != tensor_storage_map.end()) ||
-            (tensor_storage_map.find("model.diffusion_model.model.net.img_embedder.proj1.weight") != tensor_storage_map.end() &&
-             tensor_storage_map.find("model.diffusion_model.model.net.double_blocks.0.img_qkv.weight") != tensor_storage_map.end() &&
-             tensor_storage_map.find("model.diffusion_model.model.net.txt_embedder.weight") != tensor_storage_map.end())) {
+        if (tensor_storage.name.find("net.img_embedder.proj1.weight") != std::string::npos) {
             return VERSION_MINIT2I;
         }
         if (tensor_storage.name.find("model.diffusion_model.transformer_blocks.0.img_mod.1.weight") != std::string::npos) {
diff --git a/src/stable-diffusion.cpp b/src/stable-diffusion.cpp
index 2ae2e9651..c7f58de8d 100644
--- a/src/stable-diffusion.cpp
+++ b/src/stable-diffusion.cpp
@@ -793,7 +793,7 @@ class StableDiffusionGGML {
                                                                         model_manager);
                 diffusion_model  = std::make_shared<MiniT2I::MiniT2IRunner>(backend_for(SDBackendModule::DIFFUSION),
                                                                             tensor_storage_map,
-                                                                            "",
+                                                                            "model.diffusion_model.model.net",
                                                                             model_manager);
             } else if (sd_version_is_anima(version)) {
                 cond_stage_model = std::make_shared<AnimaConditioner>(backend_for(SDBackendModule::TE),

From 49c98b986454318a95b53b023f927f3e122ce216 Mon Sep 17 00:00:00 2001
From: KenForever1 <2962666398@qq.com>
Date: Wed, 1 Jul 2026 20:51:32 +0800
Subject: [PATCH 5/7] Use generic sampling flow for MiniT2I

Replace the standalone MiniT2I sampling branch with the shared
sample_k_diffusion path:
- Add MiniT2IFlowDenoiser (sigma = 1 - t, x0-prediction scalings) so the
  generic Euler update reproduces the reference linear-flow step
- Pass the prompt mask via MiniT2IDiffusionExtra and derive the
  unconditional signal from a zeroed mask, letting the generic CFG guider
  handle classifier-free guidance
- Add MINIT2I_FLOW_PRED prediction type and select the denoiser for it

Output matches the previous dedicated branch (max abs pixel diff 2/255).
---
 include/stable-diffusion.h      |  1 +
 src/model/diffusion/minit2i.hpp |  5 +-
 src/model/diffusion/model.hpp   |  7 ++-
 src/runtime/denoiser.hpp        | 62 +++++++++++++++++++++
 src/stable-diffusion.cpp        | 97 ++++++---------------------------
 5 files changed, 88 insertions(+), 84 deletions(-)

diff --git a/include/stable-diffusion.h b/include/stable-diffusion.h
index d5bc8a40a..c750498ba 100644
--- a/include/stable-diffusion.h
+++ b/include/stable-diffusion.h
@@ -84,6 +84,7 @@ enum prediction_t {
     FLOW_PRED,
     FLUX_FLOW_PRED,
     SEFI_FLOW_PRED,
+    MINIT2I_FLOW_PRED,
     PREDICTION_COUNT
 };
 
diff --git a/src/model/diffusion/minit2i.hpp b/src/model/diffusion/minit2i.hpp
index f4698a76f..827fede88 100644
--- a/src/model/diffusion/minit2i.hpp
+++ b/src/model/diffusion/minit2i.hpp
@@ -597,12 +597,13 @@ namespace MiniT2I {
             GGML_ASSERT(diffusion_params.x != nullptr);
             GGML_ASSERT(diffusion_params.timesteps != nullptr);
             GGML_ASSERT(diffusion_params.context != nullptr);
-            GGML_ASSERT(diffusion_params.y != nullptr);
+            const auto* extra = diffusion_extra_as<MiniT2IDiffusionExtra>(diffusion_params);
+            GGML_ASSERT(extra->mask != nullptr);
             return compute(n_threads,
                            *diffusion_params.x,
                            *diffusion_params.timesteps,
                            *diffusion_params.context,
-                           *diffusion_params.y);
+                           *extra->mask);
         }
     };
 }  // namespace MiniT2I
diff --git a/src/model/diffusion/model.hpp b/src/model/diffusion/model.hpp
index 2e143fe4c..cd44e3b50 100644
--- a/src/model/diffusion/model.hpp
+++ b/src/model/diffusion/model.hpp
@@ -52,6 +52,10 @@ struct LTXAVDiffusionExtra {
     const sd::Tensor<float>* video_positions = nullptr;
 };
 
+struct MiniT2IDiffusionExtra {  // MiniT2I-specific alternative of DiffusionExtraParams
+    const sd::Tensor<float>* mask = nullptr;  // prompt mask (non-owning); the MiniT2I runner asserts it is non-null
+};
+
 using DiffusionExtraParams = std::variant<std::monostate,
                                           UNetDiffusionExtra,
                                           SkipLayerDiffusionExtra,
@@ -59,7 +63,8 @@ using DiffusionExtraParams = std::variant<std::monostate,
                                           AnimaDiffusionExtra,
                                           WanDiffusionExtra,
                                           HiDreamO1DiffusionExtra,
-                                          LTXAVDiffusionExtra>;
+                                          LTXAVDiffusionExtra,
+                                          MiniT2IDiffusionExtra>;
 
 struct DiffusionParams {
     const sd::Tensor<float>* x                        = nullptr;
diff --git a/src/runtime/denoiser.hpp b/src/runtime/denoiser.hpp
index ed1cd4d93..812eebe61 100644
--- a/src/runtime/denoiser.hpp
+++ b/src/runtime/denoiser.hpp
@@ -1338,6 +1338,68 @@ struct SefiFlowDenoiser : public FluxFlowDenoiser {
     }
 };
 
+// Denoiser for MiniT2I. The model predicts the clean sample x0 directly and the
+// reference sampler integrates the linear flow ODE
+//   x_{t+dt} = x_t + (x0_pred - x_t)/(1 - t) * dt,  t in [0, 1),  x_{t=0} = noise * 2.
+// With sigma = 1 - t, the generic Euler update
+//   x += (x - denoised)/sigma * (sigma_next - sigma)
+// reproduces that step when denoised == x0_pred, so get_scalings() returns
+// c_skip = 0, c_out = 1, c_in = 1 (denoised = pred * c_out + x * c_skip). Sigmas run linearly 1 -> 0.
+struct MiniT2IFlowDenoiser : public Denoiser {
+    float sigma_min() override {
+        return 0.0f;  // sigma at t = 1; also the last value emitted by get_sigmas()
+    }
+
+    float sigma_max() override {
+        return 1.0f;  // sigma at t = 0; the first value emitted by get_sigmas() when n > 0
+    }
+
+    float sigma_to_t(float sigma) override {
+        return 1.0f - sigma;  // inverse of t_to_sigma()
+    }
+
+    float t_to_sigma(float t) override {
+        return 1.0f - t;  // sigma = 1 - t
+    }
+
+    std::vector<float> get_scalings(float sigma) override {
+        SD_UNUSED(sigma);  // the scalings are the same constants for every sigma
+        float c_skip = 0.0f;
+        float c_out  = 1.0f;
+        float c_in   = 1.0f;
+        return {c_skip, c_out, c_in};  // returned in the order c_skip, c_out, c_in
+    }
+
+    sd::Tensor<float> noise_scaling(float sigma,
+                                    const sd::Tensor<float>& noise,
+                                    const sd::Tensor<float>& latent) override {
+        SD_UNUSED(sigma);
+        SD_UNUSED(latent);
+        // The initial sample is noise * 2 (see MiniT2I reference); sigma and latent do not affect it.
+        return noise * 2.0f;
+    }
+
+    sd::Tensor<float> inverse_noise_scaling(float sigma, const sd::Tensor<float>& latent) override {
+        SD_UNUSED(sigma);
+        return latent;  // identity: the latent is returned unchanged
+    }
+
+    std::vector<float> get_sigmas(uint32_t n, int image_seq_len, scheduler_t scheduler_type, SDVersion version, const char* extra_sample_args = nullptr) override {
+        SD_UNUSED(image_seq_len);
+        SD_UNUSED(scheduler_type);
+        SD_UNUSED(version);
+        SD_UNUSED(extra_sample_args);
+        // Returns n + 1 values: sigma_i = 1 - i/n for i in [0, n), then 0 (uniform t from 0 to 1, as in the reference loop); n == 0 yields {0}.
+        std::vector<float> sigmas;
+        sigmas.reserve(n + 1);
+        for (uint32_t i = 0; i < n; ++i) {
+            sigmas.push_back(1.0f - static_cast<float>(i) / static_cast<float>(n));
+        }
+        sigmas.push_back(0.0f);  // terminal sigma, appended separately from the loop
+        return sigmas;
+    }
+};
+
 typedef std::function<sd::guidance::GuiderOutput(const sd::Tensor<float>&, float, int)> denoise_cb_t;
 
 static std::pair<float, float> get_ancestral_step(float sigma_from,
diff --git a/src/stable-diffusion.cpp b/src/stable-diffusion.cpp
index c7f58de8d..26c04a1b5 100644
--- a/src/stable-diffusion.cpp
+++ b/src/stable-diffusion.cpp
@@ -1309,6 +1309,8 @@ class StableDiffusionGGML {
                     }
                 } else if (sd_version_is_sefi_image(version)) {
                     pred_type = SEFI_FLOW_PRED;
+                } else if (sd_version_is_minit2i(version)) {
+                    pred_type = MINIT2I_FLOW_PRED;
                 } else {
                     pred_type = EPS_PRED;
                 }
@@ -1346,6 +1348,11 @@ class StableDiffusionGGML {
                     denoiser = std::make_shared<SefiFlowDenoiser>();
                     break;
                 }
+                case MINIT2I_FLOW_PRED: {
+                    LOG_INFO("running in MiniT2I FLOW mode");
+                    denoiser = std::make_shared<MiniT2IFlowDenoiser>();
+                    break;
+                }
                 default: {
                     LOG_ERROR("Unknown predition type %i", pred_type);
                     return false;
@@ -2044,87 +2051,6 @@ class StableDiffusionGGML {
         int64_t last_progress_us     = ggml_time_us();
         SamplePreviewContext preview = prepare_sample_preview_context();
 
-        if (sd_version_is_minit2i(version)) {
-            if (noise.empty()) {
-                LOG_ERROR("MiniT2I sampling requires initial noise");
-                return {};
-            }
-            if (cond.c_crossattn.empty() || cond.c_vector.empty()) {
-                LOG_ERROR("MiniT2I requires T5 hidden states and prompt mask");
-                return {};
-            }
-            size_t minit2i_steps = steps > 0 ? steps : 100;
-            sd::Tensor<float> x_t = noise * 2.0f;
-            sd::Tensor<float> denoised = x_t;
-            sd::Tensor<float> uncond_mask = sd::Tensor<float>::zeros_like(cond.c_vector);
-
-            auto run_minit2i = [&](const sd::Tensor<float>& x,
-                                   float t_value,
-                                   const sd::Tensor<float>& mask) -> sd::Tensor<float> {
-                int64_t batch = x.dim() >= 4 ? x.shape()[3] : 1;
-                if (batch <= 0) {
-                    LOG_ERROR("MiniT2I got invalid input shape for sampling");
-                    return {};
-                }
-                LOG_DEBUG("MiniT2I sampling input shape: dim=%" PRId64 ", batch=%" PRId64,
-                          x.dim(),
-                          batch);
-                std::vector<float> t_vec(static_cast<size_t>(batch), t_value);
-                const int64_t t_vec_size = static_cast<int64_t>(t_vec.size());
-                sd::Tensor<float> timesteps_tensor({t_vec_size}, std::move(t_vec));
-                DiffusionParams diffusion_params;
-                diffusion_params.x         = &x;
-                diffusion_params.timesteps = &timesteps_tensor;
-                diffusion_params.context   = &cond.c_crossattn;
-                diffusion_params.y         = &mask;
-                auto out = work_diffusion_model->compute(n_threads, diffusion_params);
-                if (out.empty()) {
-                    LOG_ERROR("MiniT2I diffusion model compute failed");
-                    return {};
-                }
-                return out;
-            };
-
-            pretty_progress(0, static_cast<int>(minit2i_steps), 0);
-            last_progress_us = ggml_time_us();
-            for (size_t i = 0; i < minit2i_steps; ++i) {
-                if (get_cancel_flag() == SD_CANCEL_ALL) {
-                    LOG_DEBUG("cancelling generation");
-                    return {};
-                }
-                float t_cur  = static_cast<float>(i) / static_cast<float>(minit2i_steps);
-                float t_next = static_cast<float>(i + 1) / static_cast<float>(minit2i_steps);
-
-                if (sd_should_preview_noisy() && preview.callback != nullptr) {
-                    preview_image(static_cast<int>(i + 1), x_t, version, preview.mode, preview.callback, preview.data, true);
-                }
-
-                auto cond_x0 = run_minit2i(x_t, t_cur, cond.c_vector);
-                if (cond_x0.empty()) {
-                    return {};
-                }
-                auto uncond_x0 = run_minit2i(x_t, t_cur, uncond_mask);
-                if (uncond_x0.empty()) {
-                    return {};
-                }
-                float denom = std::max(1.0f - t_cur, 0.001f);
-                auto cond_v = (cond_x0 - x_t) / denom;
-                auto uncond_v = (uncond_x0 - x_t) / denom;
-                auto v = uncond_v + (cond_v - uncond_v) * cfg_scale;
-                x_t += v * (t_next - t_cur);
-                denoised = x_t;
-
-                if (sd_should_preview_denoised() && preview.callback != nullptr) {
-                    preview_image(static_cast<int>(i + 1), denoised, version, preview.mode, preview.callback, preview.data, false);
-                }
-                report_sample_progress(static_cast<int>(i + 1), minit2i_steps, &last_progress_us);
-            }
-            if (work_diffusion_model) {
-                work_diffusion_model->free_compute_buffer();
-            }
-            return denoised;
-        }
-
         sd::Tensor<float> x_t        = !noise.empty()
                                            ? denoiser->noise_scaling(sigmas[0], noise, init_latent)
                                            : init_latent;
@@ -2247,6 +2173,9 @@ class StableDiffusionGGML {
                         audio_length,
                         frame_rate,
                         video_positions.empty() ? nullptr : &video_positions};
+                } else if (sd_version_is_minit2i(version)) {
+                    diffusion_params.extra = MiniT2IDiffusionExtra{
+                        condition.c_vector.empty() ? nullptr : &condition.c_vector};
                 } else {
                     diffusion_params.extra = std::monostate{};
                 }
@@ -2685,6 +2614,7 @@ const char* prediction_to_str[] = {
     "sd3_flow",
     "flux_flow",
     "sefi_flow",
+    "minit2i_flow",
 };
 
 const char* sd_prediction_name(enum prediction_t prediction) {
@@ -4318,6 +4248,11 @@ static std::optional<ImageGenerationEmbeds> prepare_image_generation_embeds(sd_c
     if (request->use_uncond || request->use_high_noise_uncond) {
         if (sd_version_is_ideogram4(sd_ctx->sd->version)) {
             uncond.c_vector = sd::Tensor<float>::from_vector({1.0f});
+        } else if (sd_version_is_minit2i(sd_ctx->sd->version)) {
+            // MiniT2I derives the unconditional signal from the same T5 hidden
+            // states with a zeroed prompt mask, so no extra text encode is needed.
+            uncond.c_crossattn = cond.c_crossattn;
+            uncond.c_vector    = sd::Tensor<float>::zeros_like(cond.c_vector);
         } else {
             bool zero_out_masked = false;
             if (sd_version_is_sdxl(sd_ctx->sd->version) &&

From c47b8e531eebe98a4168872e6b623f42ee673a18 Mon Sep 17 00:00:00 2001
From: leejet <leejet714@gmail.com>
Date: Wed, 1 Jul 2026 23:35:50 +0800
Subject: [PATCH 6/7] fix url

---
 docs/minit2i.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/minit2i.md b/docs/minit2i.md
index 78c854dfa..7e120b38c 100644
--- a/docs/minit2i.md
+++ b/docs/minit2i.md
@@ -5,7 +5,7 @@ MiniT2I uses a MiniT2I diffusion transformer and `google/flan-t5-large` as the t
 ## Download weights
 
 - Download MiniT2I diffusion model
-    - safetensors: https://huggingface.co/MiniT2I/minit2i-b-16/tree/main/transformer (`diffusion_pytorch_model.safetensors`)
+    - safetensors: https://huggingface.co/MiniT2I/MiniT2I/tree/main/minit2i-b-16/transformer (`diffusion_pytorch_model.safetensors`)
 - Download flan-t5-large text encoder
     - safetensors: https://huggingface.co/google/flan-t5-large/tree/main (`model.safetensors`)
 

From 059df64d9d2bf0cf09b04b64042d7fa916c48c88 Mon Sep 17 00:00:00 2001
From: leejet <leejet714@gmail.com>
Date: Thu, 2 Jul 2026 00:32:18 +0800
Subject: [PATCH 7/7] format code

---
 src/model/diffusion/minit2i.hpp | 138 ++++++++++++++++----------------
 src/model/te/t5.hpp             | 132 +++++++++++++++---------------
 src/stable-diffusion.cpp        |  14 ++--
 3 files changed, 142 insertions(+), 142 deletions(-)

diff --git a/src/model/diffusion/minit2i.hpp b/src/model/diffusion/minit2i.hpp
index 827fede88..284661054 100644
--- a/src/model/diffusion/minit2i.hpp
+++ b/src/model/diffusion/minit2i.hpp
@@ -47,8 +47,8 @@ namespace MiniT2I {
                     continue;
                 }
                 if (ends_with(name, "img_embedder.proj1.weight") && tensor_storage.n_dims == 4) {
-                    config.patch_size  = tensor_storage.ne[0];
-                    config.in_channels = tensor_storage.ne[2];
+                    config.patch_size   = tensor_storage.ne[0];
+                    config.in_channels  = tensor_storage.ne[2];
                     config.pca_channels = tensor_storage.ne[3];
                 } else if (ends_with(name, "img_embedder.proj2.weight") && tensor_storage.n_dims == 4) {
                     config.pca_channels = tensor_storage.ne[2];
@@ -59,8 +59,8 @@ namespace MiniT2I {
                 } else if (ends_with(name, "pooled_embedder.weight") && tensor_storage.n_dims == 2) {
                     config.cond_vec_size = tensor_storage.ne[1];
                 } else if (ends_with(name, "double_blocks.0.img_qkv.weight") && tensor_storage.n_dims == 2) {
-                    int64_t inner3 = tensor_storage.ne[1];
-                    int64_t inner  = inner3 / 3;
+                    int64_t inner3     = tensor_storage.ne[1];
+                    int64_t inner      = inner3 / 3;
                     config.hidden_size = tensor_storage.ne[0];
                     if (config.hidden_size == 768) {
                         config.num_heads = 12;
@@ -73,9 +73,9 @@ namespace MiniT2I {
                         config.num_heads = std::max<int64_t>(1, inner / config.head_dim);
                     }
                 } else if (ends_with(name, "final_layer.linear.weight") && tensor_storage.n_dims == 2) {
-                    int64_t patch_area  = config.patch_size * config.patch_size;
-                    config.hidden_size  = tensor_storage.ne[0];
-                    config.in_channels  = patch_area > 0 ? tensor_storage.ne[1] / patch_area : config.in_channels;
+                    int64_t patch_area = config.patch_size * config.patch_size;
+                    config.hidden_size = tensor_storage.ne[0];
+                    config.in_channels = patch_area > 0 ? tensor_storage.ne[1] / patch_area : config.in_channels;
                 } else if (ends_with(name, "mask_token") && tensor_storage.n_dims >= 2) {
                     config.prompt_length = tensor_storage.ne[1];
                 }
@@ -92,8 +92,8 @@ namespace MiniT2I {
                 if (pos != std::string::npos) {
                     auto items = split_string(name.substr(pos), '.');
                     if (items.size() > 1) {
-                        int64_t idx                 = atoi(items[1].c_str());
-                        config.txt_preamble_depth   = std::max<int64_t>(config.txt_preamble_depth, idx + 1);
+                        int64_t idx               = atoi(items[1].c_str());
+                        config.txt_preamble_depth = std::max<int64_t>(config.txt_preamble_depth, idx + 1);
                     }
                 }
             }
@@ -134,8 +134,8 @@ namespace MiniT2I {
             for (int x = 0; x < grid_size; ++x) {
                 size_t base = static_cast<size_t>(y * grid_size + x) * dim;
                 for (int i = 0; i < quarter; ++i) {
-                    float ay = y * omega[i];
-                    float ax = x * omega[i];
+                    float ay                           = y * omega[i];
+                    float ax                           = x * omega[i];
                     out[base + i]                      = std::sin(ax);
                     out[base + quarter + i]            = std::cos(ax);
                     out[base + half_dim + i]           = std::sin(ay);
@@ -152,9 +152,9 @@ namespace MiniT2I {
 
     inline std::vector<float> make_vision_rope(int side, int head_dim) {
         GGML_ASSERT(head_dim % 4 == 0);
-        int dim      = head_dim / 2;
-        int quarter  = dim / 2;
-        int length   = side * side;
+        int dim     = head_dim / 2;
+        int quarter = dim / 2;
+        int length  = side * side;
         std::vector<float> out(static_cast<size_t>(length) * (head_dim / 2) * 4);
         std::vector<float> freqs(quarter);
         for (int i = 0; i < quarter; ++i) {
@@ -165,15 +165,15 @@ namespace MiniT2I {
                 int pos     = y * side + x;
                 size_t base = static_cast<size_t>(pos) * (head_dim / 2) * 4;
                 for (int i = 0; i < quarter; ++i) {
-                    float ay = y * freqs[i];
-                    float ax = x * freqs[i];
+                    float ay        = y * freqs[i];
+                    float ax        = x * freqs[i];
                     float angles[2] = {ay, ax};
                     for (int axis = 0; axis < 2; ++axis) {
-                        int j                   = axis * quarter + i;
-                        out[base + 4 * j]       = std::cos(angles[axis]);
-                        out[base + 4 * j + 1]   = -std::sin(angles[axis]);
-                        out[base + 4 * j + 2]   = std::sin(angles[axis]);
-                        out[base + 4 * j + 3]   = std::cos(angles[axis]);
+                        int j                 = axis * quarter + i;
+                        out[base + 4 * j]     = std::cos(angles[axis]);
+                        out[base + 4 * j + 1] = -std::sin(angles[axis]);
+                        out[base + 4 * j + 2] = std::sin(angles[axis]);
+                        out[base + 4 * j + 3] = std::cos(angles[axis]);
                     }
                 }
             }
@@ -184,15 +184,15 @@ namespace MiniT2I {
     struct SwiGLUMlp : public GGMLBlock {
         SwiGLUMlp(int64_t in_features, int64_t hidden_features) {
             int64_t hidden_dim = ((hidden_features + 7) / 8) * 8;
-            blocks["w1"] = std::make_shared<Linear>(in_features, hidden_dim, false);
-            blocks["w3"] = std::make_shared<Linear>(in_features, hidden_dim, false);
-            blocks["w2"] = std::make_shared<Linear>(hidden_dim, in_features, false);
+            blocks["w1"]       = std::make_shared<Linear>(in_features, hidden_dim, false);
+            blocks["w3"]       = std::make_shared<Linear>(in_features, hidden_dim, false);
+            blocks["w2"]       = std::make_shared<Linear>(hidden_dim, in_features, false);
         }
 
         ggml_tensor* forward(GGMLRunnerContext* ctx, ggml_tensor* x) {
-            auto w1 = std::dynamic_pointer_cast<Linear>(blocks["w1"]);
-            auto w3 = std::dynamic_pointer_cast<Linear>(blocks["w3"]);
-            auto w2 = std::dynamic_pointer_cast<Linear>(blocks["w2"]);
+            auto w1   = std::dynamic_pointer_cast<Linear>(blocks["w1"]);
+            auto w3   = std::dynamic_pointer_cast<Linear>(blocks["w3"]);
+            auto w2   = std::dynamic_pointer_cast<Linear>(blocks["w2"]);
             auto gate = ggml_silu(ctx->ggml_ctx, w1->forward(ctx, x));
             auto up   = w3->forward(ctx, x);
             return w2->forward(ctx, ggml_mul(ctx->ggml_ctx, gate, up));
@@ -205,28 +205,28 @@ namespace MiniT2I {
         BottleneckPatchEmbed(int64_t patch_size, int64_t in_channels, int64_t pca_channels, int64_t hidden_size)
             : patch_size(patch_size) {
             blocks["proj1"] = std::make_shared<Conv2d>(in_channels,
-                                                        pca_channels,
-                                                        std::pair<int, int>{static_cast<int>(patch_size), static_cast<int>(patch_size)},
-                                                        std::pair<int, int>{static_cast<int>(patch_size), static_cast<int>(patch_size)},
-                                                        std::pair<int, int>{0, 0},
-                                                        std::pair<int, int>{1, 1},
-                                                        false);
+                                                       pca_channels,
+                                                       std::pair<int, int>{static_cast<int>(patch_size), static_cast<int>(patch_size)},
+                                                       std::pair<int, int>{static_cast<int>(patch_size), static_cast<int>(patch_size)},
+                                                       std::pair<int, int>{0, 0},
+                                                       std::pair<int, int>{1, 1},
+                                                       false);
             blocks["proj2"] = std::make_shared<Conv2d>(pca_channels,
-                                                        hidden_size,
-                                                        std::pair<int, int>{1, 1},
-                                                        std::pair<int, int>{1, 1},
-                                                        std::pair<int, int>{0, 0},
-                                                        std::pair<int, int>{1, 1},
-                                                        true);
+                                                       hidden_size,
+                                                       std::pair<int, int>{1, 1},
+                                                       std::pair<int, int>{1, 1},
+                                                       std::pair<int, int>{0, 0},
+                                                       std::pair<int, int>{1, 1},
+                                                       true);
         }
 
         ggml_tensor* forward(GGMLRunnerContext* ctx, ggml_tensor* x) {
             auto proj1 = std::dynamic_pointer_cast<Conv2d>(blocks["proj1"]);
             auto proj2 = std::dynamic_pointer_cast<Conv2d>(blocks["proj2"]);
-            x = proj1->forward(ctx, x);
-            x = proj2->forward(ctx, x);
-            x = ggml_reshape_3d(ctx->ggml_ctx, x, x->ne[0] * x->ne[1], x->ne[2], x->ne[3]);
-            x = ggml_cont(ctx->ggml_ctx, ggml_permute(ctx->ggml_ctx, x, 1, 0, 2, 3));
+            x          = proj1->forward(ctx, x);
+            x          = proj2->forward(ctx, x);
+            x          = ggml_reshape_3d(ctx->ggml_ctx, x, x->ne[0] * x->ne[1], x->ne[2], x->ne[3]);
+            x          = ggml_cont(ctx->ggml_ctx, ggml_permute(ctx->ggml_ctx, x, 1, 0, 2, 3));
             return x;
         }
     };
@@ -253,12 +253,12 @@ namespace MiniT2I {
     inline std::vector<ggml_tensor*> split_qkv(ggml_context* ctx, ggml_tensor* qkv, int64_t num_heads, int64_t head_dim) {
         int64_t N = qkv->ne[2];
         int64_t L = qkv->ne[1];
-        auto q = ggml_view_4d(ctx, qkv, head_dim, num_heads, L, N,
-                              qkv->nb[0] * head_dim, qkv->nb[1], qkv->nb[2], 0);
-        auto k = ggml_view_4d(ctx, qkv, head_dim, num_heads, L, N,
-                              qkv->nb[0] * head_dim, qkv->nb[1], qkv->nb[2], qkv->nb[0] * head_dim * num_heads);
-        auto v = ggml_view_4d(ctx, qkv, head_dim, num_heads, L, N,
-                              qkv->nb[0] * head_dim, qkv->nb[1], qkv->nb[2], qkv->nb[0] * head_dim * num_heads * 2);
+        auto q    = ggml_view_4d(ctx, qkv, head_dim, num_heads, L, N,
+                                 qkv->nb[0] * head_dim, qkv->nb[1], qkv->nb[2], 0);
+        auto k    = ggml_view_4d(ctx, qkv, head_dim, num_heads, L, N,
+                                 qkv->nb[0] * head_dim, qkv->nb[1], qkv->nb[2], qkv->nb[0] * head_dim * num_heads);
+        auto v    = ggml_view_4d(ctx, qkv, head_dim, num_heads, L, N,
+                                 qkv->nb[0] * head_dim, qkv->nb[1], qkv->nb[2], qkv->nb[0] * head_dim * num_heads * 2);
         return {q, k, v};
     }
 
@@ -268,7 +268,7 @@ namespace MiniT2I {
 
         PlainTextTransformerBlock(int64_t hidden_size, int64_t num_heads, int64_t head_dim, float mlp_ratio)
             : num_heads(num_heads), head_dim(head_dim) {
-            int64_t inner_dim = num_heads * head_dim;
+            int64_t inner_dim   = num_heads * head_dim;
             blocks["norm1"]     = std::make_shared<RMSNorm>(hidden_size, 1e-6f);
             blocks["norm2"]     = std::make_shared<RMSNorm>(hidden_size, 1e-6f);
             blocks["qkv"]       = std::make_shared<Linear>(hidden_size, inner_dim * 3, true);
@@ -304,7 +304,7 @@ namespace MiniT2I {
 
         DoubleStreamDiTBlock(int64_t hidden_size, int64_t txt_hidden_size, int64_t num_heads, int64_t head_dim, float mlp_ratio)
             : num_heads(num_heads), head_dim(head_dim) {
-            int64_t inner_dim = num_heads * head_dim;
+            int64_t inner_dim       = num_heads * head_dim;
             blocks["img_norm1"]     = std::make_shared<RMSNorm>(hidden_size, 1e-6f);
             blocks["img_norm2"]     = std::make_shared<RMSNorm>(hidden_size, 1e-6f);
             blocks["txt_norm1"]     = std::make_shared<RMSNorm>(txt_hidden_size, 1e-6f);
@@ -346,7 +346,7 @@ namespace MiniT2I {
             auto k = ggml_concat(ctx->ggml_ctx, k_norm->forward(ctx, txt_qkv[1]), k_norm->forward(ctx, img_qkv[1]), 2);
             auto v = ggml_concat(ctx->ggml_ctx, txt_qkv[2], img_qkv[2], 2);
 
-            auto out = Rope::attention(ctx, q, k, v, pe, nullptr, 1.0f, false);
+            auto out     = Rope::attention(ctx, q, k, v, pe, nullptr, 1.0f, false);
             auto out_txt = ggml_ext_slice(ctx->ggml_ctx, out, 1, 0, lt);
             auto out_img = ggml_ext_slice(ctx->ggml_ctx, out, 1, lt, lt + li);
 
@@ -399,10 +399,10 @@ namespace MiniT2I {
             if (mask == nullptr) {
                 return context;
             }
-            mask = ggml_reshape_3d(ctx->ggml_ctx, mask, 1, mask->ne[0], mask->ne[1]);
-            mask = ggml_repeat(ctx->ggml_ctx, mask, context);
-            auto keep = ggml_mul(ctx->ggml_ctx, context, mask);
-            auto inv  = ggml_sub(ctx->ggml_ctx, ggml_ext_ones_like(ctx->ggml_ctx, mask), mask);
+            mask            = ggml_reshape_3d(ctx->ggml_ctx, mask, 1, mask->ne[0], mask->ne[1]);
+            mask            = ggml_repeat(ctx->ggml_ctx, mask, context);
+            auto keep       = ggml_mul(ctx->ggml_ctx, context, mask);
+            auto inv        = ggml_sub(ctx->ggml_ctx, ggml_ext_ones_like(ctx->ggml_ctx, mask), mask);
             auto mask_token = ggml_repeat(ctx->ggml_ctx, params["mask_token"], context);
             return ggml_add(ctx->ggml_ctx, keep, ggml_mul(ctx->ggml_ctx, mask_token, inv));
         }
@@ -411,10 +411,10 @@ namespace MiniT2I {
             int64_t dim = context->ne[0];
             int64_t len = context->ne[1];
             int64_t N   = context->ne[2];
-            auto x = ggml_cont(ctx->ggml_ctx, ggml_permute(ctx->ggml_ctx, context, 1, 0, 2, 3));
-            x      = ggml_reshape_3d(ctx->ggml_ctx, x, len, dim, N);
-            x      = ggml_mean(ctx->ggml_ctx, x);
-            x      = ggml_reshape_2d(ctx->ggml_ctx, x, dim, N);
+            auto x      = ggml_cont(ctx->ggml_ctx, ggml_permute(ctx->ggml_ctx, context, 1, 0, 2, 3));
+            x           = ggml_reshape_3d(ctx->ggml_ctx, x, len, dim, N);
+            x           = ggml_mean(ctx->ggml_ctx, x);
+            x           = ggml_reshape_2d(ctx->ggml_ctx, x, dim, N);
             return x;
         }
 
@@ -527,14 +527,14 @@ namespace MiniT2I {
 
             auto pos_embed_vec = make_2d_sincos_pos_embed(static_cast<int>(img_side), static_cast<int>(config.hidden_size));
             auto txt_pe_vec    = make_text_rope(static_cast<int>(txt_len), static_cast<int>(config.head_dim));
-            auto img_pe_vec = make_vision_rope(static_cast<int>(img_side), static_cast<int>(config.head_dim));
-            auto joint_pe_vec = txt_pe_vec;
+            auto img_pe_vec    = make_vision_rope(static_cast<int>(img_side), static_cast<int>(config.head_dim));
+            auto joint_pe_vec  = txt_pe_vec;
             joint_pe_vec.insert(joint_pe_vec.end(), img_pe_vec.begin(), img_pe_vec.end());
 
             ggml_init_params params;
-            params.mem_size   = static_cast<size_t>(3 * ggml_tensor_overhead());
-            params.mem_buffer = nullptr;
-            params.no_alloc   = true;
+            params.mem_size    = static_cast<size_t>(3 * ggml_tensor_overhead());
+            params.mem_buffer  = nullptr;
+            params.no_alloc    = true;
             position_cache_ctx = ggml_init(params);
             GGML_ASSERT(position_cache_ctx != nullptr);
 
@@ -563,10 +563,10 @@ namespace MiniT2I {
                                  const sd::Tensor<float>& timesteps_tensor,
                                  const sd::Tensor<float>& context_tensor,
                                  const sd::Tensor<float>& mask_tensor) {
-            ggml_cgraph* gf        = new_graph_custom(MINIT2I_GRAPH_SIZE);
-            ggml_tensor* x         = make_input(x_tensor);
-            ggml_tensor* context   = make_input(context_tensor);
-            ggml_tensor* mask      = make_input(mask_tensor);
+            ggml_cgraph* gf      = new_graph_custom(MINIT2I_GRAPH_SIZE);
+            ggml_tensor* x       = make_input(x_tensor);
+            ggml_tensor* context = make_input(context_tensor);
+            ggml_tensor* mask    = make_input(mask_tensor);
             SD_UNUSED(timesteps_tensor);
 
             int64_t W        = x->ne[0];
diff --git a/src/model/te/t5.hpp b/src/model/te/t5.hpp
index c7cfef2df..6d2326f94 100644
--- a/src/model/te/t5.hpp
+++ b/src/model/te/t5.hpp
@@ -23,72 +23,72 @@ struct T5Config {
     int64_t vocab_size      = 32128;
     bool relative_attention = true;
 
-    static T5Config detect_from_weights(const String2TensorStorage& tensor_storage_map,
-                                        const std::string& prefix,
-                                        bool is_umt5 = false) {
-        T5Config config;
-        if (is_umt5) {
-            config.vocab_size         = 256384;
-            config.relative_attention = false;
-        }
-        auto find_tensor = [&](const std::string& suffix) -> const TensorStorage* {
-            auto it = tensor_storage_map.find(prefix + "." + suffix);
-            if (it != tensor_storage_map.end()) {
-                return &it->second;
-            }
-            it = tensor_storage_map.find(prefix + suffix);
-            if (it != tensor_storage_map.end()) {
-                return &it->second;
-            }
-            return nullptr;
-        };
-
-        if (const TensorStorage* shared = find_tensor("shared.weight")) {
-            if (shared->n_dims == 2) {
-                config.vocab_size = shared->ne[1];
-                config.model_dim  = shared->ne[0];
-            }
-        }
-        if (const TensorStorage* q = find_tensor("encoder.block.0.layer.0.SelfAttention.q.weight")) {
-            if (q->n_dims == 2) {
-                config.model_dim = q->ne[0];
-                int64_t inner_dim = q->ne[1];
-                // Flan-T5/T5 uses d_kv=64 for common sizes.
-                if (inner_dim % 64 == 0) {
-                    config.num_heads = inner_dim / 64;
-                }
-            }
-        }
-        if (const TensorStorage* wi = find_tensor("encoder.block.0.layer.1.DenseReluDense.wi_0.weight")) {
-            if (wi->n_dims == 2) {
-                config.model_dim = wi->ne[0];
-                config.ff_dim    = wi->ne[1];
-            }
-        }
-        int64_t detected_layers = 0;
-        for (const auto& [name, _] : tensor_storage_map) {
-            std::string base = prefix;
-            if (!base.empty() && base.back() != '.') {
-                base += ".";
-            }
-            std::string layer_prefix = base + "encoder.block.";
-            if (!starts_with(name, layer_prefix)) {
-                continue;
-            }
-            size_t pos = layer_prefix.size();
-            size_t dot = name.find('.', pos);
-            if (dot == std::string::npos) {
-                continue;
-            }
-            int64_t layer = atoi(name.substr(pos, dot - pos).c_str());
-            detected_layers = std::max(detected_layers, layer + 1);
-        }
-        if (detected_layers > 0) {
-            config.num_layers = detected_layers;
-        }
-        return config;
-    }
-};
+    // Derives a T5Config from tensor shapes/names stored under `prefix`; fields whose tensors are missing keep their defaults (or the is_umt5 overrides).
+    static T5Config detect_from_weights(const String2TensorStorage& tensor_storage_map,
+                                        const std::string& prefix,
+                                        bool is_umt5 = false) {
+        T5Config config;
+        if (is_umt5) {
+            config.vocab_size         = 256384;
+            config.relative_attention = false;
+        }
+        // Tries "<prefix>.<suffix>" first, then "<prefix><suffix>"; yields nullptr when neither key exists.
+        auto find_tensor = [&](const std::string& suffix) -> const TensorStorage* {
+            auto it = tensor_storage_map.find(prefix + "." + suffix);
+            if (it != tensor_storage_map.end()) {
+                return &it->second;
+            }
+            it = tensor_storage_map.find(prefix + suffix);
+            if (it != tensor_storage_map.end()) {
+                return &it->second;
+            }
+            return nullptr;
+        };
+        if (const TensorStorage* shared = find_tensor("shared.weight"); shared != nullptr && shared->n_dims == 2) {
+            config.vocab_size = shared->ne[1];
+            config.model_dim  = shared->ne[0];
+        }
+        if (const TensorStorage* q = find_tensor("encoder.block.0.layer.0.SelfAttention.q.weight"); q != nullptr && q->n_dims == 2) {
+            config.model_dim  = q->ne[0];
+            int64_t inner_dim = q->ne[1];
+            // Flan-T5/T5 uses d_kv=64 for common sizes.
+            if (inner_dim % 64 == 0) {
+                config.num_heads = inner_dim / 64;
+            }
+        }
+        if (const TensorStorage* wi = find_tensor("encoder.block.0.layer.1.DenseReluDense.wi_0.weight"); wi != nullptr && wi->n_dims == 2) {
+            config.model_dim = wi->ne[0];
+            config.ff_dim    = wi->ne[1];
+        }
+        // Layer count is the largest "encoder.block.<N>." index plus one; the prefix is built once, not per tensor.
+        const bool needs_dot           = !prefix.empty() && prefix.back() != '.';
+        const std::string layer_prefix = prefix + (needs_dot ? "." : "") + "encoder.block.";
+        const size_t pos               = layer_prefix.size();
+        int64_t detected_layers        = 0;
+        for (const auto& [name, _] : tensor_storage_map) {
+            if (!starts_with(name, layer_prefix)) {
+                continue;
+            }
+            const size_t dot = name.find('.', pos);
+            // Require a non-empty, all-digit index (atoi() mapped malformed segments to layer 0); at most 9 digits cannot overflow.
+            if (dot == std::string::npos || dot == pos || dot - pos > 9) {
+                continue;
+            }
+            int64_t layer = 0;
+            size_t i      = pos;
+            for (; i < dot && name[i] >= '0' && name[i] <= '9'; ++i) {
+                layer = layer * 10 + (name[i] - '0');
+            }
+            if (i == dot) {
+                detected_layers = std::max(detected_layers, layer + 1);
+            }
+        }
+        if (detected_layers > 0) {
+            config.num_layers = detected_layers;
+        }
+        return config;
+    }
+};
 
 class T5LayerNorm : public UnaryBlock {
 protected:
diff --git a/src/stable-diffusion.cpp b/src/stable-diffusion.cpp
index 26c04a1b5..edf7bf78e 100644
--- a/src/stable-diffusion.cpp
+++ b/src/stable-diffusion.cpp
@@ -792,9 +792,9 @@ class StableDiffusionGGML {
                                                                         tensor_storage_map,
                                                                         model_manager);
                 diffusion_model  = std::make_shared<MiniT2I::MiniT2IRunner>(backend_for(SDBackendModule::DIFFUSION),
-                                                                            tensor_storage_map,
-                                                                            "model.diffusion_model.model.net",
-                                                                            model_manager);
+                                                                           tensor_storage_map,
+                                                                           "model.diffusion_model.model.net",
+                                                                           model_manager);
             } else if (sd_version_is_anima(version)) {
                 cond_stage_model = std::make_shared<AnimaConditioner>(backend_for(SDBackendModule::TE),
                                                                       tensor_storage_map,
@@ -2051,10 +2051,10 @@ class StableDiffusionGGML {
         int64_t last_progress_us     = ggml_time_us();
         SamplePreviewContext preview = prepare_sample_preview_context();
 
-        sd::Tensor<float> x_t        = !noise.empty()
-                                           ? denoiser->noise_scaling(sigmas[0], noise, init_latent)
-                                           : init_latent;
-        sd::Tensor<float> denoised   = x_t;
+        sd::Tensor<float> x_t      = !noise.empty()
+                                         ? denoiser->noise_scaling(sigmas[0], noise, init_latent)
+                                         : init_latent;
+        sd::Tensor<float> denoised = x_t;
 
         auto denoise = [&](const sd::Tensor<float>& x, float sigma, int step) -> sd::guidance::GuiderOutput {
             if (get_cancel_flag() == SD_CANCEL_ALL) {