From d3c1811dd7430f8d22c4885c36e9872226c63111 Mon Sep 17 00:00:00 2001
From: leejet
Date: Tue, 30 Jun 2026 22:14:27 +0800
Subject: [PATCH] chore: strip UTF-8 BOMs and add cleanup script

---
 script/remove_utf8_bom.py                  | 265 +++++++++++++++++++++
 src/conditioning/conditioner.hpp           |   2 +-
 src/model/diffusion/control.hpp            |   2 +-
 src/model/diffusion/hidream_o1.hpp         |   2 +-
 src/model/diffusion/model.hpp              |   2 +-
 src/model/te/clip.hpp                      |   2 +-
 src/model/te/llm.hpp                       |   2 +-
 src/model/te/t5.hpp                        |   2 +-
 src/model/upscaler/esrgan.hpp              |   2 +-
 src/model/upscaler/ltx_latent_upscaler.hpp |   2 +-
 src/model/vae/ltx_audio_vae.hpp            |   2 +-
 src/model/vae/vae.hpp                      |   2 +-
 12 files changed, 276 insertions(+), 11 deletions(-)
 create mode 100644 script/remove_utf8_bom.py

diff --git a/script/remove_utf8_bom.py b/script/remove_utf8_bom.py
new file mode 100644
index 000000000..ccb07b0dc
--- /dev/null
+++ b/script/remove_utf8_bom.py
@@ -0,0 +1,265 @@
+#!/usr/bin/env python3
+"""Remove UTF-8 BOMs from files under a directory.
+
+By default this scans the current working directory recursively and skips
+repository areas that should not be touched by ordinary maintenance scripts.
+Only files whose first three bytes are the UTF-8 BOM are rewritten.
+"""
+
+import argparse
+import os
+import shutil
+import sys
+import tempfile
+from pathlib import Path
+
+
+UTF8_BOM = b"\xef\xbb\xbf"
+
+DEFAULT_EXCLUDED_DIR_NAMES = {
+    ".git",
+    ".hg",
+    ".svn",
+    ".mypy_cache",
+    ".pytest_cache",
+    "__pycache__",
+    "test",
+}
+
+DEFAULT_EXCLUDED_DIR_PREFIXES = {
+    "build",
+}
+
+DEFAULT_EXCLUDED_REL_DIRS = {
+    "examples/server/frontend",
+    "ggml",
+    "models",
+    "src/vocab",
+    "thirdparty",
+}
+
+
+def rel_posix(path: Path, root: Path) -> str:
+    """Return path relative to root as a POSIX string (path itself if outside)."""
+    try:
+        return path.relative_to(root).as_posix()
+    except ValueError:
+        return path.as_posix()
+
+
+def should_skip_dir(
+    path: Path,
+    root: Path,
+    excluded_rel_dirs: set[str],
+    excluded_names: set[str],
+    excluded_prefixes: set[str],
+) -> bool:
+    """Return True if the directory matches an excluded name, path or prefix."""
+    rel = rel_posix(path, root)
+    return (
+        path.name in excluded_names
+        or rel in excluded_rel_dirs
+        or any(path.name.startswith(prefix) for prefix in excluded_prefixes)
+    )
+
+
+def iter_files(
+    root: Path,
+    recursive: bool,
+    excluded_rel_dirs: set[str],
+    excluded_names: set[str],
+    excluded_prefixes: set[str],
+    follow_symlinks: bool,
+):
+    """Yield the regular files under root that are candidates for scanning.
+
+    Excluded directories are pruned from the walk. Symlinked files are only
+    yielded when follow_symlinks is true.
+    """
+    if recursive:
+        for dirpath, dirnames, filenames in os.walk(root, followlinks=follow_symlinks):
+            current_dir = Path(dirpath)
+            dirnames[:] = [
+                name
+                for name in dirnames
+                if not should_skip_dir(
+                    current_dir / name,
+                    root,
+                    excluded_rel_dirs,
+                    excluded_names,
+                    excluded_prefixes,
+                )
+            ]
+            for filename in filenames:
+                path = current_dir / filename
+                if path.is_symlink() and not follow_symlinks:
+                    continue
+                # os.walk also lists FIFOs, sockets and device nodes; opening
+                # a FIFO would block forever, so only yield regular files.
+                if not path.is_file():
+                    continue
+                yield path
+    else:
+        for path in root.iterdir():
+            if path.is_file() and (follow_symlinks or not path.is_symlink()):
+                yield path
+
+
+def has_utf8_bom(path: Path) -> bool:
+    """Return True if the file starts with the UTF-8 BOM."""
+    with path.open("rb") as f:
+        return f.read(len(UTF8_BOM)) == UTF8_BOM
+
+
+def strip_utf8_bom(path: Path) -> None:
+    """Rewrite path without its leading UTF-8 BOM; do nothing if it has none.
+
+    The content after the BOM is copied to a temporary file in the same
+    directory, which then replaces the original via os.replace, so a failed
+    copy never leaves a truncated file behind.
+    """
+    # Rewrite the link target rather than replacing a symlink with a regular
+    # file (symlinks are only passed in when --follow-symlinks is given).
+    target = path.resolve()
+    tmp_path = None
+    try:
+        with target.open("rb") as src:
+            if src.read(len(UTF8_BOM)) != UTF8_BOM:
+                return
+
+            fd, tmp_name = tempfile.mkstemp(
+                prefix=f".{target.name}.",
+                suffix=".tmp",
+                dir=str(target.parent),
+            )
+            tmp_path = Path(tmp_name)
+            with os.fdopen(fd, "wb") as dst:
+                shutil.copyfileobj(src, dst, length=1024 * 1024)
+
+        # Close the source before the swap: on Windows os.replace fails while
+        # the destination is still open.
+        shutil.copystat(target, tmp_path, follow_symlinks=False)
+        os.replace(tmp_path, target)
+        tmp_path = None
+    finally:
+        if tmp_path is not None:
+            try:
+                tmp_path.unlink()
+            except FileNotFoundError:
+                pass
+
+
+def parse_args() -> argparse.Namespace:
+    """Parse the command-line arguments."""
+    parser = argparse.ArgumentParser(
+        description="Scan files and convert UTF-8 BOM files to UTF-8 without BOM.",
+    )
+    parser.add_argument(
+        "root",
+        nargs="?",
+        default=".",
+        help="Directory to scan. Defaults to the current directory.",
+    )
+    parser.add_argument(
+        "-n",
+        "--dry-run",
+        action="store_true",
+        help="Only list files that would be converted.",
+    )
+    parser.add_argument(
+        "--no-recursive",
+        action="store_true",
+        help="Only scan files directly under root.",
+    )
+    parser.add_argument(
+        "--include-repo-excluded",
+        action="store_true",
+        help="Do not skip default repository excluded directories.",
+    )
+    parser.add_argument(
+        "--exclude-dir",
+        action="append",
+        default=[],
+        metavar="DIR",
+        help="Additional directory name or root-relative path to skip. Can be used multiple times.",
+    )
+    parser.add_argument(
+        "--follow-symlinks",
+        action="store_true",
+        help="Follow symlinked directories and files.",
+    )
+    parser.add_argument(
+        "-q",
+        "--quiet",
+        action="store_true",
+        help="Only print the final summary.",
+    )
+    return parser.parse_args()
+
+
+def main() -> int:
+    """Run the scan and return the process exit code.
+
+    Returns 0 on success, 1 if any file could not be processed and 2 if
+    root is not a directory.
+    """
+    args = parse_args()
+    root = Path(args.root).resolve()
+
+    if not root.is_dir():
+        print(f"error: not a directory: {root}", file=sys.stderr)
+        return 2
+
+    excluded_names = set()
+    excluded_rel_dirs = set()
+    excluded_prefixes = set()
+    if not args.include_repo_excluded:
+        excluded_names.update(DEFAULT_EXCLUDED_DIR_NAMES)
+        excluded_rel_dirs.update(DEFAULT_EXCLUDED_REL_DIRS)
+        excluded_prefixes.update(DEFAULT_EXCLUDED_DIR_PREFIXES)
+
+    for item in args.exclude_dir:
+        normalized = Path(item).as_posix().strip("/")
+        if "/" in normalized:
+            excluded_rel_dirs.add(normalized)
+        else:
+            excluded_names.add(normalized)
+
+    scanned = 0
+    converted = 0
+    errors = 0
+
+    for path in iter_files(
+        root=root,
+        recursive=not args.no_recursive,
+        excluded_rel_dirs=excluded_rel_dirs,
+        excluded_names=excluded_names,
+        excluded_prefixes=excluded_prefixes,
+        follow_symlinks=args.follow_symlinks,
+    ):
+        scanned += 1
+        rel = rel_posix(path, root)
+        try:
+            if not has_utf8_bom(path):
+                continue
+            if not args.dry_run:
+                strip_utf8_bom(path)
+        except OSError as exc:
+            errors += 1
+            print(f"error: {rel}: {exc}", file=sys.stderr)
+            continue
+
+        # Count only after the rewrite succeeded, so a failed file is not
+        # reported as both converted and an error.
+        converted += 1
+        if not args.quiet:
+            verb = "would convert" if args.dry_run else "converted"
+            print(f"{verb}: {rel}")
+
+    action = "would convert" if args.dry_run else "converted"
+    print(f"scanned {scanned} file(s), {action} {converted}, errors {errors}")
+    return 1 if errors else 0
+
+
+if __name__ == "__main__":
+    raise SystemExit(main())
diff --git a/src/conditioning/conditioner.hpp b/src/conditioning/conditioner.hpp
index f6a686fa6..e8b8ee3da 100644
--- a/src/conditioning/conditioner.hpp
+++ 
b/src/conditioning/conditioner.hpp @@ -1,4 +1,4 @@ -#ifndef __SD_CONDITIONING_CONDITIONER_HPP__ +#ifndef __SD_CONDITIONING_CONDITIONER_HPP__ #define __SD_CONDITIONING_CONDITIONER_HPP__ #include diff --git a/src/model/diffusion/control.hpp b/src/model/diffusion/control.hpp index 57e3616f2..eeb8f5109 100644 --- a/src/model/diffusion/control.hpp +++ b/src/model/diffusion/control.hpp @@ -1,4 +1,4 @@ -#ifndef __SD_MODEL_DIFFUSION_CONTROL_HPP__ +#ifndef __SD_MODEL_DIFFUSION_CONTROL_HPP__ #define __SD_MODEL_DIFFUSION_CONTROL_HPP__ #include "model/common/block.hpp" diff --git a/src/model/diffusion/hidream_o1.hpp b/src/model/diffusion/hidream_o1.hpp index 559f61bcf..9d3df0391 100644 --- a/src/model/diffusion/hidream_o1.hpp +++ b/src/model/diffusion/hidream_o1.hpp @@ -1,4 +1,4 @@ -#ifndef __SD_MODEL_DIFFUSION_HIDREAM_O1_HPP__ +#ifndef __SD_MODEL_DIFFUSION_HIDREAM_O1_HPP__ #define __SD_MODEL_DIFFUSION_HIDREAM_O1_HPP__ #include diff --git a/src/model/diffusion/model.hpp b/src/model/diffusion/model.hpp index 67f0fee02..2e143fe4c 100644 --- a/src/model/diffusion/model.hpp +++ b/src/model/diffusion/model.hpp @@ -1,4 +1,4 @@ -#ifndef __SD_MODEL_DIFFUSION_MODEL_HPP__ +#ifndef __SD_MODEL_DIFFUSION_MODEL_HPP__ #define __SD_MODEL_DIFFUSION_MODEL_HPP__ #include diff --git a/src/model/te/clip.hpp b/src/model/te/clip.hpp index 6dc8a947b..2fde3de79 100644 --- a/src/model/te/clip.hpp +++ b/src/model/te/clip.hpp @@ -1,4 +1,4 @@ -#ifndef __SD_MODEL_TE_CLIP_HPP__ +#ifndef __SD_MODEL_TE_CLIP_HPP__ #define __SD_MODEL_TE_CLIP_HPP__ #include "core/ggml_extend.hpp" diff --git a/src/model/te/llm.hpp b/src/model/te/llm.hpp index 427baa250..3c5e9bb54 100644 --- a/src/model/te/llm.hpp +++ b/src/model/te/llm.hpp @@ -1,4 +1,4 @@ -#ifndef __SD_MODEL_TE_LLM_HPP__ +#ifndef __SD_MODEL_TE_LLM_HPP__ #define __SD_MODEL_TE_LLM_HPP__ #include diff --git a/src/model/te/t5.hpp b/src/model/te/t5.hpp index 23da08222..7a92ec577 100644 --- a/src/model/te/t5.hpp +++ b/src/model/te/t5.hpp @@ -1,4 +1,4 @@ -#ifndef 
__SD_MODEL_TE_T5_HPP__ +#ifndef __SD_MODEL_TE_T5_HPP__ #define __SD_MODEL_TE_T5_HPP__ #include diff --git a/src/model/upscaler/esrgan.hpp b/src/model/upscaler/esrgan.hpp index 4afbab07d..21c977124 100644 --- a/src/model/upscaler/esrgan.hpp +++ b/src/model/upscaler/esrgan.hpp @@ -1,4 +1,4 @@ -#ifndef __SD_MODEL_UPSCALER_ESRGAN_HPP__ +#ifndef __SD_MODEL_UPSCALER_ESRGAN_HPP__ #define __SD_MODEL_UPSCALER_ESRGAN_HPP__ #include diff --git a/src/model/upscaler/ltx_latent_upscaler.hpp b/src/model/upscaler/ltx_latent_upscaler.hpp index 5343ad036..b70e16136 100644 --- a/src/model/upscaler/ltx_latent_upscaler.hpp +++ b/src/model/upscaler/ltx_latent_upscaler.hpp @@ -1,4 +1,4 @@ -#ifndef __SD_MODEL_UPSCALER_LTX_LATENT_UPSCALER_HPP__ +#ifndef __SD_MODEL_UPSCALER_LTX_LATENT_UPSCALER_HPP__ #define __SD_MODEL_UPSCALER_LTX_LATENT_UPSCALER_HPP__ #include diff --git a/src/model/vae/ltx_audio_vae.hpp b/src/model/vae/ltx_audio_vae.hpp index 997c57a5b..2f8c03d10 100644 --- a/src/model/vae/ltx_audio_vae.hpp +++ b/src/model/vae/ltx_audio_vae.hpp @@ -1,4 +1,4 @@ -#ifndef __SD_MODEL_VAE_LTX_AUDIO_VAE_HPP__ +#ifndef __SD_MODEL_VAE_LTX_AUDIO_VAE_HPP__ #define __SD_MODEL_VAE_LTX_AUDIO_VAE_HPP__ #include diff --git a/src/model/vae/vae.hpp b/src/model/vae/vae.hpp index af091bb57..34a0d9663 100644 --- a/src/model/vae/vae.hpp +++ b/src/model/vae/vae.hpp @@ -1,4 +1,4 @@ -#ifndef __SD_MODEL_VAE_VAE_HPP__ +#ifndef __SD_MODEL_VAE_VAE_HPP__ #define __SD_MODEL_VAE_VAE_HPP__ #include "core/tensor_ggml.hpp"