Skip to content

Commit b1655e6

Browse files
feat(lib): capture client-attested build provenance
Add agentex.lib.utils.build_provenance — the single producer of source identity for agent builds (git coordinates + a deterministic content hash of the build context). prepare_cloud_build_context now writes build-info.json into the staged context (populates runtime registration_metadata with no server change) and exposes provenance on CloudBuildContext so the upload can send source_* fields. Archive member order is now deterministic via a sorted enumeration shared with the hash. The hash is computed only when there is no clean commit to identify the build (dirty tree or non-git context). First of three surfaces for AGX1-418 (Phase 1, client-attested); the SGP build-record columns and the sgpctl/Gitea uploaders follow. Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
1 parent 936bac6 commit b1655e6

5 files changed

Lines changed: 513 additions & 6 deletions

File tree

src/agentex/lib/cli/handlers/agent_handlers.py

Lines changed: 23 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
from __future__ import annotations
22

3+
import json
34
from typing import NamedTuple
45
from pathlib import Path
56

@@ -8,12 +9,15 @@
89

910
from agentex.lib.cli.debug import DebugConfig
1011
from agentex.lib.utils.logging import make_logger
12+
from agentex.lib.utils.build_provenance import BuildProvenance, capture_build_provenance
1113
from agentex.lib.cli.handlers.run_handlers import RunError, run_agent as _run_agent
1214
from agentex.lib.sdk.config.agent_manifest import BuildContextManager, load_agent_manifest, build_context_manager
1315

1416
logger = make_logger(__name__)
1517
console = Console()
1618

19+
_BUILD_INFO_FILENAME = "build-info.json"
20+
1721

1822
class DockerBuildError(Exception):
1923
"""An error occurred during docker build"""
@@ -28,6 +32,7 @@ class CloudBuildContext(NamedTuple):
2832
tag: str
2933
image_name: str
3034
build_context_size_kb: float
35+
provenance: BuildProvenance
3136

3237

3338
def build_agent(
@@ -261,8 +266,24 @@ def prepare_cloud_build_context(
261266
logger.info("Preparing build context...")
262267

263268
with build_context_manager(agent_manifest, build_context_root) as build_context:
269+
staged_root = Path(build_context.path)
270+
# Capture source identity over the staged (post-.dockerignore) tree — the
271+
# exact bytes that ship — then write build-info.json into it so it lands
272+
# in the image for runtime registration. Capture runs before the write so
273+
# the content hash never includes build-info.json itself.
274+
provenance = capture_build_provenance(
275+
repo_path=build_context_root,
276+
context_root=build_context_root,
277+
content_root=staged_root,
278+
)
279+
(staged_root / _BUILD_INFO_FILENAME).write_text(json.dumps(provenance.build_info(), indent=2, sort_keys=True))
280+
logger.info(
281+
f"Build provenance: commit={provenance.commit} ref={provenance.ref} "
282+
f"clean_commit={provenance.is_clean_commit}"
283+
)
284+
264285
# Compress the prepared context using the static zipped method
265-
with BuildContextManager.zipped(root_path=build_context.path) as archive_buffer:
286+
with BuildContextManager.zipped(root_path=staged_root) as archive_buffer:
266287
archive_bytes = archive_buffer.read()
267288

268289
build_context_size_kb = len(archive_bytes) / 1024
@@ -275,4 +296,5 @@ def prepare_cloud_build_context(
275296
tag=tag,
276297
image_name=image_name,
277298
build_context_size_kb=build_context_size_kb,
299+
provenance=provenance,
278300
)

src/agentex/lib/sdk/config/agent_manifest.py

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,7 @@
2424
from agentex.lib.utils.io import load_yaml_file
2525
from agentex.lib.utils.logging import make_logger
2626
from agentex.config.agent_manifest import AgentManifest # noqa: F401
27+
from agentex.lib.utils.build_provenance import iter_context_files
2728

2829
logger = make_logger(__name__)
2930

@@ -189,12 +190,11 @@ def zipped(root_path: Path | None = None) -> Iterator[IO[bytes]]:
189190

190191
tar_buffer = io.BytesIO()
191192

193+
# Sorted, relpath-stable enumeration (shared with the content hash) so the
194+
# archive's member order is deterministic across machines.
192195
with tarfile.open(fileobj=tar_buffer, mode="w:gz") as tar_file:
193-
for path in Path(root_path).rglob(
194-
"*"
195-
): # Recursively add files to the tar.gz
196-
if path.is_file(): # Ensure that we're only adding files
197-
tar_file.add(path, arcname=path.relative_to(root_path))
196+
for path in iter_context_files(Path(root_path)):
197+
tar_file.add(path, arcname=path.relative_to(root_path))
198198

199199
tar_buffer.seek(0) # Reset the buffer position to the beginning
200200
yield tar_buffer
Lines changed: 224 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,224 @@
1+
"""Client-attested build provenance capture (AGX1-418).
2+
3+
The single producer of source identity for agent builds: git coordinates plus a
4+
deterministic content hash of the build context. Every build path (CLI, sgpctl,
5+
CI) imports this so capture logic and the ``working_tree_hash`` definition live
6+
in exactly one place. Capture is best-effort — a missing/odd git state degrades
7+
to nulls and never raises into a build.
8+
"""
9+
10+
from __future__ import annotations
11+
12+
import os
13+
import stat
14+
import hashlib
15+
import subprocess
16+
from typing import Optional
17+
from pathlib import Path
18+
from datetime import datetime, timezone
19+
from dataclasses import dataclass
20+
21+
from agentex.lib.utils.logging import make_logger
22+
23+
logger = make_logger(__name__)
24+
25+
_GIT_TIMEOUT_S = 5
26+
_HASH_CHUNK_BYTES = 1 << 20
27+
28+
29+
@dataclass(frozen=True)
class BuildProvenance:
    """Immutable record of where one build's source came from.

    Every field defaults to ``None`` so a partially known origin can still be
    represented. The intended reading is that a build is anchored by exactly
    one identity: ``commit`` alone for a clean committed tree, or
    ``working_tree_hash`` when no clean commit can address the content (a dirty
    tree or a non-git context). ``is_clean_commit`` reports which case applies.
    """

    # Git coordinates.
    repo: Optional[str] = None
    commit: Optional[str] = None
    ref: Optional[str] = None
    subpath: Optional[str] = None
    # Content hash; set only when no clean commit identifies the build.
    working_tree_hash: Optional[str] = None
    # Author and capture-time metadata.
    author_name: Optional[str] = None
    author_email: Optional[str] = None
    build_timestamp: Optional[str] = None

    @property
    def is_clean_commit(self) -> bool:
        """True when a commit is known and no working-tree hash was recorded."""
        if self.commit is None:
            return False
        return self.working_tree_hash is None

    def source_fields(self) -> dict[str, str]:
        """Return the ``source_*`` upload form fields, leaving out unset values."""
        candidates = (
            ("source_repo", self.repo),
            ("source_commit", self.commit),
            ("source_ref", self.ref),
            ("source_subpath", self.subpath),
            ("working_tree_hash", self.working_tree_hash),
        )
        return {name: value for name, value in candidates if value is not None}

    def build_info(self) -> dict[str, str]:
        """Return the ``build-info.json`` payload, leaving out unset values.

        The commit is exposed as ``commit_hash`` and the ref as ``branch_name``;
        the remaining keys carry the field names unchanged.
        """
        candidates = (
            ("repo", self.repo),
            ("commit_hash", self.commit),
            ("branch_name", self.ref),
            ("subpath", self.subpath),
            ("working_tree_hash", self.working_tree_hash),
            ("author_name", self.author_name),
            ("author_email", self.author_email),
            ("build_timestamp", self.build_timestamp),
        )
        return {name: value for name, value in candidates if value is not None}
83+
84+
85+
def _git(repo_root: Path, *args: str) -> Optional[str]:
    """Run ``git -C <repo_root> <args...>`` and return its stripped stdout.

    Best-effort by design: the result is ``None`` when git cannot be started,
    the call exceeds ``_GIT_TIMEOUT_S`` seconds, git exits non-zero, or stdout
    is empty after stripping. Callers therefore cannot tell "failed" apart from
    "succeeded with no output" through this helper alone.
    """
    try:
        proc = subprocess.run(
            ("git", "-C", str(repo_root), *args),
            capture_output=True,
            # Decode explicitly rather than with the locale codec: under strict
            # locale decoding, undecodable bytes (e.g. a UTF-8 author name on a
            # cp1252 console) raise UnicodeDecodeError, which is a ValueError and
            # would escape the handler below. Replacement keeps this best-effort.
            encoding="utf-8",
            errors="replace",
            timeout=_GIT_TIMEOUT_S,
            check=False,
        )
    except (OSError, subprocess.SubprocessError):
        # Covers a missing git binary as well as TimeoutExpired.
        return None
    if proc.returncode != 0:
        return None
    return proc.stdout.strip() or None
100+
101+
102+
def normalize_remote(url: Optional[str]) -> Optional[str]:
    """Canonicalize a git remote to ``host/path`` with scheme and credentials removed.

    ``git@github.com:org/repo.git`` and ``https://x:tok@github.com/org/repo.git``
    both normalize to ``github.com/org/repo``. The host is lowercased; the path
    keeps its casing. A trailing ``.git`` and surrounding slashes are dropped.

    Userinfo is stripped from the authority component only, split at its *last*
    ``@``. A password that itself contains ``@`` therefore cannot leak into the
    result, and an ``@`` inside the path (``repo@v1``) is left untouched.

    Returns ``None`` for ``None``, empty, or whitespace-only input, and when
    nothing remains after normalization.
    """
    if not url:
        return None
    candidate = url.strip()
    if "://" in candidate:
        # URL form: discard the scheme, then split authority from path.
        authority, _, path = candidate.split("://", 1)[1].partition("/")
    else:
        head, colon, tail = candidate.partition(":")
        if colon and "/" not in head:
            # scp-like syntax ``[user@]host:path`` — the first ':' separates
            # host from path and there is no scheme.
            authority, path = head, tail
        else:
            authority, _, path = candidate.partition("/")

    # Everything up to the last '@' of the authority is userinfo (credentials).
    host = authority.rpartition("@")[2]
    location = f"{host}/{path}".strip("/")
    if location.endswith(".git"):
        location = location[: -len(".git")].strip("/")
    if not location:
        return None
    host, slash, path = location.partition("/")
    return f"{host.lower()}{slash}{path}"
126+
127+
128+
def _sha256_file(path: Path) -> str:
    """Return the hex SHA-256 digest of the file at ``path``.

    The file is read in pieces of at most ``_HASH_CHUNK_BYTES`` bytes, so it is
    never loaded into memory whole.
    """
    hasher = hashlib.sha256()
    with open(path, "rb") as stream:
        # A binary read returns b"" at end of file; that is the stop sentinel.
        for block in iter(lambda: stream.read(_HASH_CHUNK_BYTES), b""):
            hasher.update(block)
    return hasher.hexdigest()
134+
135+
136+
def iter_context_files(root: Path) -> list[Path]:
    """List the files and symlinks below ``root``, ordered by POSIX relative path.

    Directories themselves are not returned. The ordering key is each entry's
    path relative to ``root`` rendered with forward slashes, so the result does
    not depend on the order in which the filesystem yields entries.
    """
    keyed: list[tuple[str, Path]] = []
    for entry in root.rglob("*"):
        # Check is_symlink first so that dangling links are kept as well.
        if entry.is_symlink() or entry.is_file():
            keyed.append((entry.relative_to(root).as_posix(), entry))
    keyed.sort(key=lambda pair: pair[0])
    return [entry for _, entry in keyed]
146+
147+
148+
def working_tree_hash(root: Path) -> str:
    """Return a deterministic SHA-256 content hash of the tree at ``root``.

    One record is built per entry from ``iter_context_files(root)``:
    ``relpath NUL mode NUL content-digest``. The records are joined with
    newlines and hashed. Only the inputs are hashed, never an archive of them.

    * Regular files use mode ``100755`` when the owner-executable bit is set and
      ``100644`` otherwise; the digest is the SHA-256 of the file content.
    * Symlinks use mode ``120000``; the digest covers the link *target string*,
      not whatever the link resolves to.

    Paths and link targets are encoded as UTF-8 with ``surrogateescape``. For
    every name that strict UTF-8 could encode, the bytes (and so the hash) are
    unchanged. Names carrying surrogate-escaped bytes (non-UTF-8 file names as
    decoded by the OS layer) no longer raise ``UnicodeEncodeError``.
    """
    records: list[str] = []
    for path in iter_context_files(root):
        relpath = path.relative_to(root).as_posix()
        if path.is_symlink():
            mode = "120000"
            target = os.readlink(path).encode("utf-8", "surrogateescape")
            content_digest = hashlib.sha256(target).hexdigest()
        else:
            # Only the owner-executable bit is kept; other permission bits are
            # deliberately ignored so they do not perturb the hash.
            executable = bool(path.stat().st_mode & stat.S_IXUSR)
            mode = "100755" if executable else "100644"
            content_digest = _sha256_file(path)
        records.append(f"{relpath}\x00{mode}\x00{content_digest}")
    manifest = "\n".join(records).encode("utf-8", "surrogateescape")
    return hashlib.sha256(manifest).hexdigest()
168+
169+
170+
def capture_build_provenance(
    repo_path: Path, context_root: Path, content_root: Optional[Path] = None
) -> BuildProvenance:
    """Capture source identity for a build of ``context_root``.

    ``repo_path`` is where git is interrogated. ``subpath`` is ``context_root``
    relative to the repository top level (``None`` when they coincide or when
    ``context_root`` lies outside the repository). ``content_root`` is the
    directory that gets hashed and defaults to ``context_root``.

    The content hash is computed whenever a clean commit cannot be established:

    * ``repo_path`` is not inside a git work tree (or git is unavailable);
    * there is no ``HEAD`` commit yet;
    * ``git status`` reports any change; or
    * ``git status`` itself fails or times out. The tree is then treated as
      dirty, so an unknown state is never attested as a clean commit.

    Returns a ``BuildProvenance``; git fields that could not be read are ``None``.
    """
    timestamp = datetime.now(timezone.utc).isoformat()
    hash_root = content_root if content_root is not None else context_root
    repo_root = _git(repo_path, "rev-parse", "--show-toplevel")
    if repo_root is None:
        # No git at all — the content hash is the only identity available.
        logger.info("build-provenance: %s is not a git work tree; hashing context", repo_path)
        return BuildProvenance(
            working_tree_hash=working_tree_hash(hash_root),
            build_timestamp=timestamp,
        )

    repo_root_path = Path(repo_root)
    commit = _git(repo_root_path, "rev-parse", "HEAD")
    # symbolic-ref fails on a detached HEAD (→ None); fall back to an exact tag.
    ref = _git(repo_root_path, "symbolic-ref", "--short", "HEAD") or _git(
        repo_root_path, "describe", "--tags", "--exact-match"
    )
    remote = normalize_remote(_git(repo_root_path, "remote", "get-url", "origin"))
    author_name = _git(repo_root_path, "log", "-1", "--format=%an")
    author_email = _git(repo_root_path, "log", "-1", "--format=%ae")

    subpath: Optional[str] = None
    try:
        relative = context_root.resolve().relative_to(repo_root_path.resolve()).as_posix()
        subpath = relative if relative != "." else None
    except ValueError:
        # context_root is not located under the repository top level.
        subpath = None

    # ``_git`` returns None both for "command failed" and for "no output", so a
    # bare ``status --porcelain`` cannot separate a clean tree from a failed or
    # timed-out call. With ``--branch`` a successful run always prints a
    # "## ..." header line, which leaves None meaning failure only.
    status = _git(repo_root_path, "status", "--porcelain", "--branch")
    if status is None:
        logger.warning(
            "build-provenance: could not read git status for %s; treating tree as dirty",
            repo_root_path,
        )
        dirty = True
    else:
        # Any line other than the branch header is a changed or untracked path.
        dirty = any(not line.startswith("##") for line in status.splitlines())

    # Hash unless a clean commit identifies the build: a dirty tree and an
    # unborn HEAD with no commit yet both fall back to the content hash.
    tree_hash = working_tree_hash(hash_root) if (dirty or commit is None) else None

    return BuildProvenance(
        repo=remote,
        commit=commit,
        ref=ref,
        subpath=subpath,
        working_tree_hash=tree_hash,
        author_name=author_name,
        author_email=author_email,
        build_timestamp=timestamp,
    )

tests/lib/cli/test_agent_handlers.py

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,9 @@
22

33
from __future__ import annotations
44

5+
import io
56
import os
7+
import json
68
import tarfile
79
import tempfile
810
from pathlib import Path
@@ -145,6 +147,23 @@ def test_prepare_cloud_build_context_returns_cloud_build_context(
145147
assert len(result.archive_bytes) > 0
146148
assert result.build_context_size_kb > 0
147149

150+
def test_prepare_cloud_build_context_writes_build_info(self, temp_agent_dir: Path):
    """The archive carries a build-info.json equal to the returned provenance payload."""
    context = prepare_cloud_build_context(manifest_path=str(temp_agent_dir / "manifest.yaml"))
    captured = context.provenance

    # The fixture directory is expected to be outside any git work tree, so the
    # content hash, not a commit, identifies the build.
    assert captured.commit is None
    assert captured.working_tree_hash is not None

    with tarfile.open(fileobj=io.BytesIO(context.archive_bytes), mode="r:gz") as tar:
        matches = (name for name in tar.getnames() if name.endswith("build-info.json"))
        handle = tar.extractfile(next(matches))
        assert handle is not None
        payload = json.loads(handle.read())
    assert payload == captured.build_info()
166+
148167
def test_prepare_cloud_build_context_with_tag_override(self, temp_agent_dir: Path):
149168
"""Test that tag parameter overrides manifest tag."""
150169
manifest_path = str(temp_agent_dir / "manifest.yaml")

0 commit comments

Comments
 (0)