|
| 1 | +"""Client-attested build provenance capture (AGX1-418). |
| 2 | +
|
| 3 | +The single producer of source identity for agent builds: git coordinates plus a |
| 4 | +deterministic content hash of the build context. Every build path (CLI, sgpctl, |
| 5 | +CI) imports this so capture logic and the ``working_tree_hash`` definition live |
| 6 | +in exactly one place. Capture is best-effort — a missing/odd git state degrades |
| 7 | +to nulls and never raises into a build. |
| 8 | +""" |
| 9 | + |
| 10 | +from __future__ import annotations |
| 11 | + |
| 12 | +import os |
| 13 | +import stat |
| 14 | +import hashlib |
| 15 | +import subprocess |
| 16 | +from typing import Optional |
| 17 | +from pathlib import Path |
| 18 | +from datetime import datetime, timezone |
| 19 | +from dataclasses import dataclass |
| 20 | + |
| 21 | +from agentex.lib.utils.logging import make_logger |
| 22 | + |
| 23 | +logger = make_logger(__name__) |
| 24 | + |
# Timeout, in seconds, applied to each git subprocess this module runs.
_GIT_TIMEOUT_S = 5
# Bytes read per chunk when streaming a file through SHA-256 (1 << 20 = 1 MiB).
_HASH_CHUNK_BYTES = 1 << 20
| 27 | + |
| 28 | + |
@dataclass(frozen=True)
class BuildProvenance:
    """Immutable record of where a build's source came from.

    Every field defaults to ``None`` so partial captures are representable.

    A build is anchored by exactly one identity. A **clean committed tree** is
    identified by ``commit`` and leaves ``working_tree_hash`` as ``None``. A
    dirty tree or a non-git context — neither of which a commit can address —
    carries a ``working_tree_hash`` instead. A non-null hash therefore means
    "no clean commit to point to", and ``is_clean_commit`` is the gate that
    ``--require-clean`` checks.
    """

    repo: Optional[str] = None
    commit: Optional[str] = None
    ref: Optional[str] = None
    subpath: Optional[str] = None
    working_tree_hash: Optional[str] = None
    author_name: Optional[str] = None
    author_email: Optional[str] = None
    build_timestamp: Optional[str] = None

    @property
    def is_clean_commit(self) -> bool:
        """True when a commit is known and no content hash was recorded."""
        if self.commit is None:
            return False
        return self.working_tree_hash is None

    def source_fields(self) -> dict[str, str]:
        """Return the ``source_*`` form fields for the cloud-build upload.

        Fields whose value is ``None`` are left out of the mapping.
        """
        pairs = (
            ("source_repo", self.repo),
            ("source_commit", self.commit),
            ("source_ref", self.ref),
            ("source_subpath", self.subpath),
            ("working_tree_hash", self.working_tree_hash),
        )
        return {name: value for name, value in pairs if value is not None}

    def build_info(self) -> dict[str, str]:
        """Return the ``build-info.json`` payload (runtime ``registration_metadata``).

        The overlapping keys (``commit_hash`` / ``branch_name`` / ``author_*`` /
        ``build_timestamp``) match the server's ``DeploymentHistory`` type, which
        is populated from ``registration_metadata``; the remaining keys are the
        provenance-specific coordinates. Fields whose value is ``None`` are
        left out of the mapping.
        """
        pairs = (
            ("repo", self.repo),
            ("commit_hash", self.commit),
            ("branch_name", self.ref),
            ("subpath", self.subpath),
            ("working_tree_hash", self.working_tree_hash),
            ("author_name", self.author_name),
            ("author_email", self.author_email),
            ("build_timestamp", self.build_timestamp),
        )
        return {name: value for name, value in pairs if value is not None}
| 83 | + |
| 84 | + |
def _git(repo_root: Path, *args: str) -> Optional[str]:
    """Run a git command under ``repo_root``; return stripped stdout or None.

    Best-effort by design: a failure to launch ``git``, a timeout, a non-zero
    exit status, and empty output all collapse to ``None``. Callers therefore
    cannot distinguish "succeeded with no output" from "failed".

    Args:
        repo_root: Directory passed to ``git -C``.
        *args: The git subcommand and its arguments.

    Returns:
        Stripped stdout, or ``None`` when the command failed or printed nothing.
    """
    try:
        proc = subprocess.run(
            ("git", "-C", str(repo_root), *args),
            capture_output=True,
            text=True,
            # Strict decoding would raise UnicodeDecodeError (a ValueError, not
            # caught below) on undecodable output; substitute U+FFFD instead so
            # odd bytes in git output cannot raise into a build.
            errors="replace",
            timeout=_GIT_TIMEOUT_S,
            check=False,
        )
    except (OSError, subprocess.SubprocessError):
        return None
    if proc.returncode != 0:
        return None
    return proc.stdout.strip() or None
| 100 | + |
| 101 | + |
def normalize_remote(url: Optional[str]) -> Optional[str]:
    """Canonicalize a git remote to ``host/path`` — credentials and scheme stripped.

    ``git@github.com:org/repo.git`` and ``https://x:tok@github.com/org/repo.git``
    both normalize to ``github.com/org/repo``. Host is lowercased; path casing is
    preserved (repo paths can be case-significant).

    Args:
        url: Raw remote URL, or ``None``/empty.

    Returns:
        The ``host/path`` form, or ``None`` when the input is empty or nothing
        is left after normalization.
    """
    if not url:
        return None
    candidate = url.strip()
    # scp-like syntax: git@host:org/repo(.git) — no scheme, host/path split on ':'
    if "://" not in candidate and ":" in candidate and "/" not in candidate.split(":", 1)[0]:
        candidate = candidate.split("@", 1)[-1].replace(":", "/", 1)
    else:
        if "://" in candidate:
            candidate = candidate.split("://", 1)[1]
        # Userinfo can only live in the authority (the part before the first
        # '/'). Strip it there, up to the LAST '@', so an '@' inside a password
        # is removed in full and an '@' inside the path is left alone.
        authority, slash, rest = candidate.partition("/")
        candidate = f"{authority.rpartition('@')[2]}{slash}{rest}"
    # Trim slashes before looking for the suffix so "repo.git/" loses ".git" too.
    candidate = candidate.strip("/")
    if candidate.endswith(".git"):
        candidate = candidate[: -len(".git")].rstrip("/")
    if not candidate:
        return None
    host, slash, path = candidate.partition("/")
    return f"{host.lower()}{slash}{path}"
| 126 | + |
| 127 | + |
def _sha256_file(path: Path) -> str:
    """Return the hex SHA-256 digest of the file at ``path``.

    The file is streamed in ``_HASH_CHUNK_BYTES``-sized reads rather than
    loaded whole.
    """
    hasher = hashlib.sha256()
    with open(path, "rb") as stream:
        # iter() with a sentinel stops at the first empty read, i.e. at EOF.
        for block in iter(lambda: stream.read(_HASH_CHUNK_BYTES), b""):
            hasher.update(block)
    return hasher.hexdigest()
| 134 | + |
| 135 | + |
def iter_context_files(root: Path) -> list[Path]:
    """List the files and symlinks under ``root``, ordered by POSIX relpath.

    This is the canonical, order-stable enumeration shared by the content hash
    and the archive packer, so the two can never drift on which files they
    cover. Directories themselves are not listed; a symlink is listed as
    itself, whatever it points at.
    """
    keyed = [
        (entry.relative_to(root).as_posix(), entry)
        for entry in root.rglob("*")
        if entry.is_symlink() or entry.is_file()
    ]
    # Sort on the relpath string only, so ordering never depends on Path comparison.
    keyed.sort(key=lambda pair: pair[0])
    return [entry for _, entry in keyed]
| 146 | + |
| 147 | + |
def working_tree_hash(root: Path) -> str:
    """Deterministic content hash of the build context at ``root``.

    sha256 over the sorted ``(relpath, normalized mode, content digest)`` of every
    file — the build *inputs*, not the tarball (tar/gzip framing is
    non-deterministic and would defeat dedupe). Mode is normalized to the
    executable bit; symlinks hash their target string, not the resolved content.

    Paths and symlink targets are encoded as UTF-8 with ``surrogateescape``.
    This yields the same bytes as strict UTF-8 for every name that strict
    encoding accepts, and additionally lets names that Python decoded with
    ``surrogateescape`` (undecodable filename bytes) be hashed instead of
    raising ``UnicodeEncodeError``.

    Args:
        root: Directory whose files are enumerated via ``iter_context_files``.

    Returns:
        Hex SHA-256 digest over the per-file records.

    Raises:
        OSError: If a listed file cannot be stat'ed, read, or have its link read.
    """
    lines: list[str] = []
    for path in iter_context_files(root):
        relpath = path.relative_to(root).as_posix()
        if path.is_symlink():
            mode = "120000"
            target = os.readlink(path)
            content_digest = hashlib.sha256(target.encode("utf-8", "surrogateescape")).hexdigest()
        else:
            # Only the owner-executable bit is kept; other permission bits are ignored.
            executable = bool(path.stat().st_mode & stat.S_IXUSR)
            mode = "100755" if executable else "100644"
            content_digest = _sha256_file(path)
        # NUL separates fields within a record; newline separates records.
        lines.append(f"{relpath}\x00{mode}\x00{content_digest}")
    return hashlib.sha256("\n".join(lines).encode("utf-8", "surrogateescape")).hexdigest()
| 168 | + |
| 169 | + |
def _tree_is_dirty(repo_root: Path) -> bool:
    """Return True unless ``git status`` positively reports a clean tree.

    Fails closed: if git cannot be launched, times out, or exits non-zero, the
    tree is reported as dirty, so an unverifiable state is never mistaken for a
    clean commit.
    """
    try:
        proc = subprocess.run(
            ("git", "-C", str(repo_root), "status", "--porcelain"),
            capture_output=True,
            timeout=_GIT_TIMEOUT_S,
            check=False,
        )
    except (OSError, subprocess.SubprocessError):
        return True
    # Output is kept as bytes: only its emptiness matters, so nothing is decoded.
    return proc.returncode != 0 or bool(proc.stdout.strip())


def capture_build_provenance(
    repo_path: Path, context_root: Path, content_root: Optional[Path] = None
) -> BuildProvenance:
    """Capture source identity for a build of ``context_root``.

    ``repo_path`` is where git is interrogated and ``subpath`` is ``context_root``
    relative to the repo root (which agent, in a monorepo). ``content_root`` is
    the directory hashed — the *staged*, post-``.dockerignore`` tree that actually
    ships; it defaults to ``context_root`` when there is no separate staging dir.
    The content hash is computed unless a clean commit identifies the build (so:
    for a dirty tree or a non-git context, but not for a clean committed tree).
    A tree whose cleanliness cannot be determined (``git status`` failed or
    timed out) is treated as dirty and hashed.

    Args:
        repo_path: Directory inside the git work tree to interrogate.
        context_root: Build context; used to derive ``subpath``.
        content_root: Directory to hash; defaults to ``context_root``.

    Returns:
        A ``BuildProvenance`` whose git-derived fields are ``None`` where git
        could not supply them.
    """
    timestamp = datetime.now(timezone.utc).isoformat()
    hash_root = content_root if content_root is not None else context_root
    repo_root = _git(repo_path, "rev-parse", "--show-toplevel")
    if repo_root is None:
        # No git at all — the content hash is the only identity available.
        logger.info("build-provenance: %s is not a git work tree; hashing context", repo_path)
        return BuildProvenance(
            working_tree_hash=working_tree_hash(hash_root),
            build_timestamp=timestamp,
        )

    repo_root_path = Path(repo_root)
    commit = _git(repo_root_path, "rev-parse", "HEAD")
    # symbolic-ref fails on a detached HEAD (→ None); fall back to an exact tag.
    ref = _git(repo_root_path, "symbolic-ref", "--short", "HEAD") or _git(
        repo_root_path, "describe", "--tags", "--exact-match"
    )
    remote = normalize_remote(_git(repo_root_path, "remote", "get-url", "origin"))
    author_name = _git(repo_root_path, "log", "-1", "--format=%an")
    author_email = _git(repo_root_path, "log", "-1", "--format=%ae")

    subpath: Optional[str] = None
    try:
        relative = context_root.resolve().relative_to(repo_root_path.resolve()).as_posix()
        subpath = relative if relative != "." else None
    except (ValueError, OSError, RuntimeError):
        # ValueError: context is outside the repo. OSError/RuntimeError: the
        # path could not be resolved. Either way the subpath is just unknown.
        subpath = None

    # Hash unless a clean commit identifies the build: dirty tree, or an unborn
    # HEAD with no commit yet, both fall back to the content hash. `_git` cannot
    # be used for the dirty check because it returns None both for "no changes"
    # and for "command failed", which would make a failed status look clean.
    dirty = _tree_is_dirty(repo_root_path)
    tree_hash = working_tree_hash(hash_root) if (dirty or commit is None) else None

    return BuildProvenance(
        repo=remote,
        commit=commit,
        ref=ref,
        subpath=subpath,
        working_tree_hash=tree_hash,
        author_name=author_name,
        author_email=author_email,
        build_timestamp=timestamp,
    )
0 commit comments