diff --git a/.agents/skills/debug-openshell-cluster/SKILL.md b/.agents/skills/debug-openshell-cluster/SKILL.md index 68ecc7749..56e329450 100644 --- a/.agents/skills/debug-openshell-cluster/SKILL.md +++ b/.agents/skills/debug-openshell-cluster/SKILL.md @@ -268,6 +268,33 @@ kubectl -n openshell get configmap openshell-config -o jsonpath='{.data.gateway\ kubectl -n get sandbox -o jsonpath='{.spec.template.spec.serviceAccountName}{"\n"}' ``` +If `supervisor_topology = "sidecar"` is rendered, sandbox pods should have an +`openshell-network-init` init container running `--mode=network-init`, an +`agent` container running `openshell-sandbox --mode=process`, and an +`openshell-supervisor-network` container running `--mode=network`. The init +container owns nftables setup and should be the only sidecar topology container +with `NET_ADMIN`. It also needs `CHOWN`/`FOWNER` to hand shared emptyDir state +to `sidecar_proxy_uid`. The long-running network sidecar runs as +`sidecar_proxy_uid` with primary GID `0` so it can read the root-owned, +group-readable projected service-account token. In sidecar topology the +`openshell-sa-token` projected volume should render `defaultMode: 288` (`0440`); +if the proxy logs `failed to read K8s SA token`, verify this token mode and the +network sidecar security context. The process container should also publish the +workload entrypoint PID to `OPENSHELL_ENTRYPOINT_PID_FILE` +(`/run/openshell-sidecar/entrypoint.pid` by default), and the network sidecar +should read it for binary-scoped policy decisions; if allowed network rules are +all denied, inspect that file and the network sidecar logs. 
+Inspect all three when sandbox registration or egress enforcement fails: + +```bash +kubectl -n openshell get configmap openshell-config -o jsonpath='{.data.gateway\.toml}' | grep supervisor_topology +kubectl -n <sandbox-namespace> get pod <sandbox-pod> -o jsonpath='{range .spec.initContainers[*]}{.name}{" "}{.command}{"\n"}{end}' +kubectl -n <sandbox-namespace> get pod <sandbox-pod> -o jsonpath='{range .spec.containers[*]}{.name}{" "}{.command}{"\n"}{end}' +kubectl -n <sandbox-namespace> logs <sandbox-pod> -c openshell-network-init --tail=200 +kubectl -n <sandbox-namespace> logs <sandbox-pod> -c openshell-supervisor-network --tail=200 +kubectl -n <sandbox-namespace> logs <sandbox-pod> -c agent --tail=200 +``` + ### Step 6: Check VM-Backed Gateways Use the VM driver logs and host diagnostics available in the user's environment. Verify: diff --git a/.agents/skills/helm-dev-environment/SKILL.md b/.agents/skills/helm-dev-environment/SKILL.md index bffa4e2e8..7d6ad7cd5 100644 --- a/.agents/skills/helm-dev-environment/SKILL.md +++ b/.agents/skills/helm-dev-environment/SKILL.md @@ -60,9 +60,17 @@ mise run helm:skaffold:dev mise run helm:skaffold:run ``` +**Supervisor sidecar topology** (build once and leave running): +```bash +mise run helm:skaffold:run:sidecar +``` + Both commands build the `gateway` and `supervisor` images and deploy the OpenShell Helm -chart. The `pkiInitJob` hook (a pre-install Job that runs `openshell-gateway generate-certs`) -generates mTLS secrets on first install. Envoy Gateway opt-in; see the Optional Add-ons section below. +chart. The sidecar profile renders an `openshell-network-init` init container for +nftables setup and a non-root `openshell-supervisor-network` runtime sidecar for +proxying. The `pkiInitJob` hook (a pre-install Job that runs `openshell-gateway +generate-certs`) generates mTLS secrets on first install. Envoy Gateway is opt-in; +see the Optional Add-ons section below. The gateway Service uses ClusterIP. Access is via Envoy Gateway (port `8080`) or `kubectl port-forward`. 
@@ -126,6 +134,12 @@ openshell sandbox list --gateway-endpoint https://localhost:8090 mise run helm:skaffold:delete ``` +For a sidecar-profile deployment: + +```bash +mise run helm:skaffold:delete:sidecar +``` + ### Delete the cluster entirely ```bash @@ -250,6 +264,7 @@ for dependencies still declared in `Chart.yaml`. | `deploy/helm/openshell/ci/values-gateway.yaml` | Envoy Gateway GRPCRoute + Gateway overlay | | `deploy/helm/openshell/ci/values-high-availability.yaml` | HA test overlay (`replicaCount: 2` with external PostgreSQL Secret) | | `deploy/helm/openshell/ci/values-keycloak.yaml` | Keycloak OIDC overlay | +| `deploy/helm/openshell/ci/values-sidecar.yaml` | Supervisor sidecar topology overlay for Kubernetes e2e/dev | | `deploy/helm/openshell/ci/values-spire.yaml` | SPIFFE/SPIRE provider token grant overlay | | `deploy/helm/openshell/ci/values-spire-stack.yaml` | SPIRE hardened chart values for local dev | | `deploy/helm/openshell/ci/values-tls-disabled.yaml` | Lint-only: TLS + auth disabled (reverse-proxy edge termination) | diff --git a/Cargo.lock b/Cargo.lock index 672fd71ab..d8c81b467 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -3663,6 +3663,7 @@ dependencies = [ "kube-runtime", "miette", "openshell-core", + "openshell-policy", "prost", "prost-types", "serde", @@ -3808,6 +3809,7 @@ dependencies = [ "clap", "futures", "miette", + "nix", "openshell-core", "openshell-ocsf", "openshell-policy", @@ -3964,6 +3966,7 @@ dependencies = [ "nix", "openshell-core", "openshell-ocsf", + "openshell-policy", "rand_core 0.6.4", "russh", "rustix 1.1.4", diff --git a/architecture/build.md b/architecture/build.md index 6cd7b15d2..54693a486 100644 --- a/architecture/build.md +++ b/architecture/build.md @@ -91,10 +91,11 @@ Runtime layout: as a release artifact. Linux GNU VM driver binaries must not reference `GLIBC_*` symbols newer than `GLIBC_2.28`; release workflows verify this before publishing artifacts. 
-- **Supervisor**: `scratch` base, static musl binary at `/openshell-sandbox`. - Static linkage is required because the image is mounted/extracted into - sandbox environments (Docker extraction, Podman image volumes, Kubernetes - init-container copy-self) and cannot rely on a dynamic loader. +- **Supervisor**: Alpine base with `nftables`, static musl binary at + `/openshell-sandbox`. Static linkage keeps the binary usable when the image + is mounted/extracted into sandbox environments (Docker extraction, Podman + image volumes, Kubernetes init-container copy-self), while `nftables` supports + Kubernetes supervisor sidecar egress enforcement. Gateway image builds bake the corresponding supervisor image tag into the gateway binary so Docker sandboxes do not depend on `:latest` by default. diff --git a/architecture/compute-runtimes.md b/architecture/compute-runtimes.md index f122fda5d..ac239bfb8 100644 --- a/architecture/compute-runtimes.md +++ b/architecture/compute-runtimes.md @@ -81,7 +81,7 @@ The supervisor must be available inside each sandbox workload: |---|---| | Docker | Bind-mounted local supervisor binary, or a binary extracted from the configured supervisor image. | | Podman | Read-only OCI image volume containing the supervisor binary. | -| Kubernetes | Sandbox pod image or pod template configuration. | +| Kubernetes | Supervisor image side-loaded into the sandbox pod by image volume or init container. | | VM | Embedded in the guest rootfs bundle. | | Extension | Defined by the out-of-tree driver. | @@ -89,6 +89,20 @@ Driver-controlled environment variables must override sandbox image or template values for sandbox ID, sandbox name, gateway endpoint, relay socket path, TLS paths, and command metadata. +Kubernetes can run the supervisor in the default combined topology or in a +sidecar topology. Combined mode keeps network and process supervision in the +agent container. 
Sidecar mode runs network enforcement, the proxy, and gateway +loopback forwarding in a dedicated sidecar, while the agent container runs only +the process-supervision leaf and launches the user workload after the sidecar +signals readiness. In sidecar mode, an init container performs the privileged +pod-network nftables setup with `NET_ADMIN` and hands shared state ownership to +the configured proxy UID; the long-running network sidecar runs as that UID and +does not keep `NET_ADMIN`. The agent container runs as the resolved sandbox +UID/GID with no added Linux capabilities. Sidecar mode preserves gateway session +and SSH behavior, but treats the process leaf as network-only: Landlock +filesystem policy, process privilege dropping, and process/binary identity +checks are not applied there. + ## Images The gateway image and Helm chart are built from this repository. Sandbox images diff --git a/crates/openshell-core/src/grpc_client.rs b/crates/openshell-core/src/grpc_client.rs index 96158a1d1..4f2477c25 100644 --- a/crates/openshell-core/src/grpc_client.rs +++ b/crates/openshell-core/src/grpc_client.rs @@ -167,9 +167,14 @@ async fn build_plain_channel(endpoint: &str) -> Result { .into_diagnostic() .wrap_err_with(|| format!("failed to read client key from {key_path}"))?; - let tls_config = ClientTlsConfig::new() + let mut tls_config = ClientTlsConfig::new() .ca_certificate(Certificate::from_pem(ca_pem)) .identity(Identity::from_pem(cert_pem, key_pem)); + if let Ok(server_name) = std::env::var(sandbox_env::GATEWAY_TLS_SERVER_NAME) + && !server_name.is_empty() + { + tls_config = tls_config.domain_name(server_name); + } ep = ep .tls_config(tls_config) diff --git a/crates/openshell-core/src/sandbox_env.rs b/crates/openshell-core/src/sandbox_env.rs index b457a4a8e..ae3a21787 100644 --- a/crates/openshell-core/src/sandbox_env.rs +++ b/crates/openshell-core/src/sandbox_env.rs @@ -29,6 +29,47 @@ pub const SANDBOX_COMMAND: &str = "OPENSHELL_SANDBOX_COMMAND"; /// 
Deployment-controlled telemetry toggle propagated to the sandbox supervisor. pub const TELEMETRY_ENABLED: &str = "OPENSHELL_TELEMETRY_ENABLED"; +/// Supervisor pod/runtime topology. Kubernetes sidecar mode sets this to +/// `"sidecar"`; the default combined supervisor path omits it. +pub const SUPERVISOR_TOPOLOGY: &str = "OPENSHELL_SUPERVISOR_TOPOLOGY"; + +/// Network enforcement backend selected by the compute driver. +pub const NETWORK_ENFORCEMENT_MODE: &str = "OPENSHELL_NETWORK_ENFORCEMENT_MODE"; + +/// Process enforcement mode selected by the compute driver. +/// +/// The default when unset is `"full"`, where the process supervisor enforces +/// filesystem/process policy before spawning workloads. Kubernetes sidecar +/// topology sets this to `"network-only"` so the process wrapper can run as +/// the sandbox UID without Linux capabilities while preserving SSH/session +/// behavior. +pub const PROCESS_ENFORCEMENT_MODE: &str = "OPENSHELL_PROCESS_ENFORCEMENT_MODE"; + +/// Whether network policy evaluation must bind requests to the peer binary. +/// +/// The default when unset is `"required"`. Kubernetes sidecar experiments may +/// set this to `"relaxed"` to enforce endpoint and L7 policy without per-binary +/// `/proc` identity binding. +pub const NETWORK_BINARY_IDENTITY: &str = "OPENSHELL_NETWORK_BINARY_IDENTITY"; + +/// File written by the network supervisor when sidecar networking is ready. +pub const SUPERVISOR_READY_FILE: &str = "OPENSHELL_SUPERVISOR_READY_FILE"; + +/// File written by the process supervisor with the workload entrypoint PID and +/// read by the network sidecar for process/binary-bound network policy checks. +pub const ENTRYPOINT_PID_FILE: &str = "OPENSHELL_ENTRYPOINT_PID_FILE"; + +/// Loopback address where the network sidecar forwards gateway gRPC traffic. 
+pub const GATEWAY_FORWARD_ADDR: &str = "OPENSHELL_GATEWAY_FORWARD_ADDR"; + +/// Optional TLS server name used when the process supervisor reaches the +/// gateway through a loopback TCP forward. +pub const GATEWAY_TLS_SERVER_NAME: &str = "OPENSHELL_GATEWAY_TLS_SERVER_NAME"; + +/// Directory where the network supervisor writes the proxy CA files consumed +/// by workload child processes. +pub const PROXY_TLS_DIR: &str = "OPENSHELL_PROXY_TLS_DIR"; + /// Path to the CA certificate for mTLS communication with the gateway. pub const TLS_CA: &str = "OPENSHELL_TLS_CA"; @@ -71,3 +112,18 @@ pub const K8S_SA_TOKEN_FILE: &str = "OPENSHELL_K8S_SA_TOKEN_FILE"; /// exchanges without using SPIFFE for gateway authentication. pub const PROVIDER_SPIFFE_WORKLOAD_API_SOCKET: &str = "OPENSHELL_PROVIDER_SPIFFE_WORKLOAD_API_SOCKET"; + +/// Resolved sandbox UID used to override `run_as_user` when the policy +/// specifies a numeric value instead of the hardcoded "sandbox" user name. +/// +/// Set by compute drivers (Kubernetes, Docker, VM) from resolved config or +/// cluster autodetection. The supervisor reads this at startup and uses it +/// directly with `setuid()` / `chown()` without requiring an `/etc/passwd` +/// entry in the sandbox image. +pub const SANDBOX_UID: &str = "OPENSHELL_SANDBOX_UID"; + +/// Resolved sandbox GID paired with [`SANDBOX_UID`]. +/// +/// Used alongside UID for PVC init container `chown` operations and when the +/// supervisor drops privileges to a group other than the UID's primary group. 
+pub const SANDBOX_GID: &str = "OPENSHELL_SANDBOX_GID"; diff --git a/crates/openshell-driver-kubernetes/Cargo.toml b/crates/openshell-driver-kubernetes/Cargo.toml index 07fa91015..2c02f864a 100644 --- a/crates/openshell-driver-kubernetes/Cargo.toml +++ b/crates/openshell-driver-kubernetes/Cargo.toml @@ -16,6 +16,7 @@ path = "src/main.rs" [dependencies] openshell-core = { path = "../openshell-core", default-features = false } +openshell-policy = { path = "../openshell-policy" } tokio = { workspace = true } tonic = { workspace = true, features = ["transport"] } diff --git a/crates/openshell-driver-kubernetes/README.md b/crates/openshell-driver-kubernetes/README.md index 831e4edf2..bfc71d9c5 100644 --- a/crates/openshell-driver-kubernetes/README.md +++ b/crates/openshell-driver-kubernetes/README.md @@ -53,9 +53,27 @@ pods do not need direct external ingress for SSH. ## Container Security Context -The driver grants the sandbox agent container the Linux capabilities the -supervisor needs for namespace setup and policy enforcement. It can also request -a Kubernetes AppArmor profile through `app_armor_profile`. +The default `combined` supervisor topology grants the sandbox agent container +the Linux capabilities the supervisor needs for namespace setup and process, +filesystem, and network policy enforcement. + +The `sidecar` supervisor topology moves pod-level network setup into a root init +container and runs the long-lived network sidecar as a non-root UID with no +added Linux capabilities. The agent container also runs as the resolved sandbox +UID/GID with `allowPrivilegeEscalation: false` and `capabilities.drop: ["ALL"]`. +In this mode OpenShell preserves gateway session and SSH behavior, but the +process supervisor runs in network-only mode and does not apply Landlock +filesystem policy, process privilege dropping, or process/binary identity +checks. Network endpoint and L7 policy remain enforced by the network sidecar. 
+Sidecar mode uses the pod `fsGroup` to make the projected service-account token +and sandbox client TLS secret group-readable so the non-root process supervisor +can authenticate to the gateway. Treat the agent container as trusted with +respect to those in-pod gateway credentials until a narrower credential handoff +exists. + +The driver can request a Kubernetes AppArmor profile through +`app_armor_profile`. Supported values are `Unconfined`, `RuntimeDefault`, and `Localhost/`. An empty or unset value omits diff --git a/crates/openshell-driver-kubernetes/src/config.rs b/crates/openshell-driver-kubernetes/src/config.rs index 4c1153b08..68dcc915a 100644 --- a/crates/openshell-driver-kubernetes/src/config.rs +++ b/crates/openshell-driver-kubernetes/src/config.rs @@ -15,6 +15,9 @@ pub const DEFAULT_SANDBOX_SERVICE_ACCOUNT_NAME: &str = "default"; /// Default storage size for the workspace PVC. pub const DEFAULT_WORKSPACE_STORAGE_SIZE: &str = "2Gi"; +/// Default UID for the long-running Kubernetes network supervisor sidecar. +pub const DEFAULT_SIDECAR_PROXY_UID: u32 = 1337; + /// How the supervisor binary is delivered into sandbox pods. #[derive(Debug, Clone, Copy, PartialEq, Eq, Default, Serialize, Deserialize)] #[serde(rename_all = "kebab-case")] @@ -52,6 +55,41 @@ impl FromStr for SupervisorSideloadMethod { } } +/// How the supervisor is arranged inside Kubernetes sandbox pods. +#[derive(Debug, Clone, Copy, PartialEq, Eq, Default, Serialize, Deserialize)] +#[serde(rename_all = "kebab-case")] +pub enum SupervisorTopology { + /// Run networking and process supervision in the agent container. + #[default] + Combined, + /// Run network supervision in a dedicated non-root sidecar and process + /// supervision as an unprivileged wrapper in the agent container. 
+ Sidecar, +} + +impl std::fmt::Display for SupervisorTopology { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + match self { + Self::Combined => f.write_str("combined"), + Self::Sidecar => f.write_str("sidecar"), + } + } +} + +impl FromStr for SupervisorTopology { + type Err = String; + + fn from_str(s: &str) -> Result { + match s { + "combined" => Ok(Self::Combined), + "sidecar" => Ok(Self::Sidecar), + other => Err(format!( + "unknown supervisor topology '{other}'; expected 'combined' or 'sidecar'" + )), + } + } +} + /// Kubernetes `AppArmor` profile requested for the sandbox agent container. #[derive(Debug, Clone, PartialEq, Eq)] pub enum AppArmorProfile { @@ -176,6 +214,14 @@ pub struct KubernetesComputeConfig { pub supervisor_image_pull_policy: String, /// How the supervisor binary is delivered into sandbox pods. pub supervisor_sideload_method: SupervisorSideloadMethod, + /// Supervisor pod topology. `combined` preserves the existing single + /// root supervisor container path; `sidecar` moves pod-level network + /// enforcement into a dedicated sidecar container. + pub supervisor_topology: SupervisorTopology, + /// UID used by the long-running network sidecar in `sidecar` topology. + /// The network init container installs nftables rules that exempt this + /// UID, so it must not match the sandbox workload UID. + pub sidecar_proxy_uid: u32, pub grpc_endpoint: String, pub ssh_socket_path: String, pub client_tls_secret_name: String, @@ -211,6 +257,16 @@ pub struct KubernetesComputeConfig { deserialize_with = "deserialize_provider_spiffe_workload_api_socket_path" )] pub provider_spiffe_workload_api_socket_path: String, + /// UID used for the supervisor container `securityContext.runAsUser` and + /// PVC init container ownership operations. When empty, the driver + /// auto-detects from `OpenShift` SCC annotations on the target namespace; + /// if those are also absent, falls back to `1000`. 
+ #[serde(default, skip_serializing_if = "Option::is_none")] + pub sandbox_uid: Option, + /// GID used alongside `sandbox_uid` for PVC init container operations. + /// When empty and `sandbox_uid` is set, defaults to the resolved UID. + #[serde(default, skip_serializing_if = "Option::is_none")] + pub sandbox_gid: Option, } /// Lower bound enforced by kubelet for projected SA tokens. @@ -221,6 +277,18 @@ pub const MIN_SA_TOKEN_TTL_SECS: i64 = 600; /// pod start). pub const MAX_SA_TOKEN_TTL_SECS: i64 = 86_400; +/// Default sandbox UID used when neither config nor `OpenShift` SCC annotations +/// provide a resolved value. +pub(crate) const DEFAULT_SANDBOX_UID: u32 = 1000; + +/// The annotation key for the `OpenShift` `ServiceAccount` UID range. +/// Format: `/` (e.g. `1000000000/10000`). +pub const ANNOTATION_SCC_UID_RANGE: &str = "openshift.io/sa.scc.uid-range"; + +/// The annotation key for the `OpenShift` `ServiceAccount` supplemental groups. +/// Format: `/` (e.g. `1000000000/10000`). +pub const ANNOTATION_SCC_SUPPLEMENTAL_GROUPS: &str = "openshift.io/sa.scc.supplemental-groups"; + impl Default for KubernetesComputeConfig { fn default() -> Self { Self { @@ -236,6 +304,8 @@ impl Default for KubernetesComputeConfig { supervisor_image: DEFAULT_SUPERVISOR_IMAGE.to_string(), supervisor_image_pull_policy: String::new(), supervisor_sideload_method: SupervisorSideloadMethod::default(), + supervisor_topology: SupervisorTopology::default(), + sidecar_proxy_uid: DEFAULT_SIDECAR_PROXY_UID, grpc_endpoint: String::new(), ssh_socket_path: "/run/openshell/ssh.sock".to_string(), client_tls_secret_name: String::new(), @@ -246,6 +316,8 @@ impl Default for KubernetesComputeConfig { default_runtime_class_name: String::new(), sa_token_ttl_secs: 3600, provider_spiffe_workload_api_socket_path: String::new(), + sandbox_uid: None, + sandbox_gid: None, } } } @@ -277,6 +349,73 @@ impl KubernetesComputeConfig { &self.provider_spiffe_workload_api_socket_path, ) } + + pub fn 
validate_sidecar_proxy_uid(&self) -> Result<(), String> {
+        // Reject proxy UIDs below `openshell_policy::MIN_SANDBOX_UID`.
+        if self.sidecar_proxy_uid < openshell_policy::MIN_SANDBOX_UID {
+            return Err(format!(
+                "sidecar_proxy_uid must be at least {}",
+                openshell_policy::MIN_SANDBOX_UID
+            ));
+        }
+        Ok(())
+    }
+
+    /// Resolve the sandbox UID.
+    ///
+    /// Resolution order:
+    /// 1. Configured `sandbox_uid` (explicit override)
+    /// 2. Start of the `OpenShift` SCC `sa.scc.uid-range` annotation, read
+    ///    from the optional `namespace_annotations` map
+    /// 3. Fallback default: `DEFAULT_SANDBOX_UID`
+    pub fn resolve_sandbox_uid(
+        &self,
+        namespace_annotations: Option<&std::collections::BTreeMap<String, String>>,
+    ) -> u32 {
+        if let Some(uid) = self.sandbox_uid {
+            return uid;
+        }
+        // Try OpenShift SCC annotation.
+        if let Some(anns) = namespace_annotations
+            && let Some(range) = anns.get(ANNOTATION_SCC_UID_RANGE)
+            && let Some(uid) = Self::from_open_shift_uid_range(range)
+        {
+            return uid;
+        }
+        DEFAULT_SANDBOX_UID
+    }
+
+    /// Resolve the sandbox GID that accompanies `resolved_uid`.
+    ///
+    /// Resolution order:
+    /// 1. Configured `sandbox_gid`
+    /// 2. Configured `sandbox_uid`
+    /// 3. Start of the `OpenShift` SCC `sa.scc.supplemental-groups`
+    ///    annotation, read from the optional `namespace_annotations` map
+    /// 4. `resolved_uid`
+    pub fn resolve_sandbox_gid(
+        &self,
+        resolved_uid: u32,
+        namespace_annotations: Option<&std::collections::BTreeMap<String, String>>,
+    ) -> u32 {
+        // Explicit configuration always wins over cluster-provided hints.
+        if let Some(gid) = self.sandbox_gid.or(self.sandbox_uid) {
+            return gid;
+        }
+        // A missing or unparsable annotation falls through to the UID.
+        namespace_annotations
+            .and_then(|anns| anns.get(ANNOTATION_SCC_SUPPLEMENTAL_GROUPS))
+            .and_then(|range| Self::from_open_shift_supplemental_groups(range))
+            .unwrap_or(resolved_uid)
+    }
+
+    /// Parse `OpenShift` SCC `sa.scc.uid-range` annotation.
+    ///
+    /// Format: `<start>/<length>` (e.g. `1000000000/10000`).
+    ///
+    /// Returns the range start, or `None` when the value has no `/`, the
+    /// start is not a valid `u32`, or it is below
+    /// `openshell_policy::MIN_SANDBOX_UID`.
+    pub fn from_open_shift_uid_range(annotation: &str) -> Option<u32> {
+        let (start, _) = annotation.split_once('/')?;
+        start
+            .trim()
+            .parse::<u32>()
+            .ok()
+            .filter(|&uid| uid >= openshell_policy::MIN_SANDBOX_UID)
+    }
+
+    /// Parse `OpenShift` SCC `sa.scc.supplemental-groups` annotation.
+ pub fn from_open_shift_supplemental_groups(annotation: &str) -> Option { + let (start, _) = annotation.split_once('/')?; + start + .trim() + .parse::() + .ok() + .filter(|&gid| gid >= openshell_policy::MIN_SANDBOX_UID) + } } fn validate_provider_spiffe_workload_api_socket_path_value( @@ -314,6 +453,7 @@ fn validate_provider_spiffe_workload_api_socket_path_value( #[cfg(test)] mod tests { use super::*; + use std::collections::BTreeMap as HashMap; #[test] fn default_workspace_storage_size_is_2gi() { @@ -324,6 +464,56 @@ mod tests { ); } + #[test] + fn default_supervisor_topology_is_combined() { + let cfg = KubernetesComputeConfig::default(); + assert_eq!(cfg.supervisor_topology, SupervisorTopology::Combined); + } + + #[test] + fn default_sidecar_proxy_uid_is_dedicated_non_root_uid() { + let cfg = KubernetesComputeConfig::default(); + assert_eq!(cfg.sidecar_proxy_uid, DEFAULT_SIDECAR_PROXY_UID); + } + + #[test] + fn serde_override_supervisor_topology_sidecar() { + let json = serde_json::json!({ + "supervisor_topology": "sidecar" + }); + let cfg: KubernetesComputeConfig = serde_json::from_value(json).unwrap(); + assert_eq!(cfg.supervisor_topology, SupervisorTopology::Sidecar); + } + + #[test] + fn serde_override_sidecar_proxy_uid() { + let json = serde_json::json!({ + "sidecar_proxy_uid": 2000 + }); + let cfg: KubernetesComputeConfig = serde_json::from_value(json).unwrap(); + assert_eq!(cfg.sidecar_proxy_uid, 2000); + cfg.validate_sidecar_proxy_uid().unwrap(); + } + + #[test] + fn validate_sidecar_proxy_uid_rejects_privileged_uid() { + let cfg = KubernetesComputeConfig { + sidecar_proxy_uid: 999, + ..KubernetesComputeConfig::default() + }; + let err = cfg.validate_sidecar_proxy_uid().unwrap_err(); + assert!(err.contains("sidecar_proxy_uid")); + } + + #[test] + fn serde_rejects_invalid_supervisor_topology() { + let json = serde_json::json!({ + "supervisor_topology": "daemonset" + }); + let err = serde_json::from_value::(json).unwrap_err(); + 
assert!(err.to_string().contains("unknown variant")); + } + #[test] fn default_service_account_name_is_default() { let cfg = KubernetesComputeConfig::default(); @@ -459,4 +649,128 @@ mod tests { let cfg: KubernetesComputeConfig = serde_json::from_value(json).unwrap(); assert_eq!(cfg.image_pull_secrets, ["regcred", "backup-regcred"]); } + + #[test] + fn default_sandbox_uid_and_gid_are_none() { + let cfg = KubernetesComputeConfig::default(); + assert_eq!(cfg.sandbox_uid, None); + assert_eq!(cfg.sandbox_gid, None); + } + + #[test] + fn serde_override_sandbox_uid() { + let json = serde_json::json!({ + "sandbox_uid": 1500 + }); + let cfg: KubernetesComputeConfig = serde_json::from_value(json).unwrap(); + assert_eq!(cfg.sandbox_uid, Some(1500)); + } + + #[test] + fn serde_override_sandbox_gid() { + let json = serde_json::json!({ + "sandbox_gid": 2000 + }); + let cfg: KubernetesComputeConfig = serde_json::from_value(json).unwrap(); + assert_eq!(cfg.sandbox_gid, Some(2000)); + } + + #[test] + fn parse_openshift_uid_range() { + assert_eq!( + KubernetesComputeConfig::from_open_shift_uid_range("1000000000/10000"), + Some(1_000_000_000) + ); + assert_eq!( + KubernetesComputeConfig::from_open_shift_uid_range("1000/50000"), + Some(1000) + ); + } + + #[test] + fn parse_openshift_uid_range_rejects_below_min() { + // 999 is below MIN_SANDBOX_UID (1000) — should be rejected. + assert_eq!( + KubernetesComputeConfig::from_open_shift_uid_range("999/50000"), + None + ); + } + + #[test] + fn parse_openshift_supplemental_groups() { + assert_eq!( + KubernetesComputeConfig::from_open_shift_supplemental_groups("1000/50000"), + Some(1000) + ); + } + + #[test] + fn resolve_sandbox_uid_prefers_config() { + let cfg = KubernetesComputeConfig { + sandbox_uid: Some(5000), + ..KubernetesComputeConfig::default() + }; + // Config value should win even when annotations are present. 
+ let mut anns: HashMap = HashMap::new(); + anns.insert( + ANNOTATION_SCC_UID_RANGE.to_string(), + "1000000000/10000".to_string(), + ); + assert_eq!(cfg.resolve_sandbox_uid(Some(&anns)), 5000); + } + + #[test] + fn resolve_sandbox_uid_falls_back_to_openshift_annotation() { + let cfg = KubernetesComputeConfig::default(); + let mut anns: HashMap = HashMap::new(); + anns.insert( + ANNOTATION_SCC_UID_RANGE.to_string(), + "1000000000/10000".to_string(), + ); + assert_eq!(cfg.resolve_sandbox_uid(Some(&anns)), 1_000_000_000); + } + + #[test] + fn resolve_sandbox_uid_falls_back_to_default() { + let cfg = KubernetesComputeConfig::default(); + // No config, no annotations. + assert_eq!(cfg.resolve_sandbox_uid(None), DEFAULT_SANDBOX_UID); + // Empty annotations map. + let anns: HashMap = HashMap::new(); + assert_eq!(cfg.resolve_sandbox_uid(Some(&anns)), DEFAULT_SANDBOX_UID); + } + + #[test] + fn resolve_sandbox_gid_prefers_config() { + let cfg = KubernetesComputeConfig { + sandbox_uid: Some(5000), + sandbox_gid: Some(6000), + ..KubernetesComputeConfig::default() + }; + assert_eq!( + cfg.resolve_sandbox_gid(cfg.resolve_sandbox_uid(None), None), + 6000 + ); + } + + #[test] + fn resolve_sandbox_gid_falls_back_to_uid() { + let cfg = KubernetesComputeConfig { + sandbox_uid: Some(5000), + ..KubernetesComputeConfig::default() + }; + // sandbox_gid is None, should fall back to sandbox_uid. + assert_eq!( + cfg.resolve_sandbox_gid(cfg.resolve_sandbox_uid(None), None), + 5000 + ); + } + + #[test] + fn resolve_sandbox_gid_falls_back_to_resolved_uid() { + let cfg = KubernetesComputeConfig::default(); + // Both are None, should use the resolved UID. 
+ let uid = cfg.resolve_sandbox_uid(None); + assert_eq!(cfg.resolve_sandbox_gid(uid, None), uid); + } } diff --git a/crates/openshell-driver-kubernetes/src/driver.rs b/crates/openshell-driver-kubernetes/src/driver.rs index 909568302..991a01654 100644 --- a/crates/openshell-driver-kubernetes/src/driver.rs +++ b/crates/openshell-driver-kubernetes/src/driver.rs @@ -3,12 +3,14 @@ //! Kubernetes compute driver. +use super::AppArmorProfile; use crate::config::{ - AppArmorProfile, DEFAULT_SANDBOX_SERVICE_ACCOUNT_NAME, DEFAULT_WORKSPACE_STORAGE_SIZE, - KubernetesComputeConfig, SupervisorSideloadMethod, + DEFAULT_SANDBOX_SERVICE_ACCOUNT_NAME, DEFAULT_SANDBOX_UID, DEFAULT_SIDECAR_PROXY_UID, + DEFAULT_WORKSPACE_STORAGE_SIZE, KubernetesComputeConfig, SupervisorSideloadMethod, + SupervisorTopology, }; use futures::{Stream, StreamExt, TryStreamExt}; -use k8s_openapi::api::core::v1::{Event as KubeEventObj, Node}; +use k8s_openapi::api::core::v1::{Event as KubeEventObj, Namespace, Node}; use kube::api::{Api, ApiResource, DeleteParams, ListParams, PostParams}; use kube::core::gvk::GroupVersionKind; use kube::core::{DynamicObject, ObjectMeta}; @@ -217,6 +219,9 @@ impl KubernetesComputeDriver { config .validate_provider_spiffe_workload_api_socket_path() .map_err(KubernetesDriverError::Precondition)?; + config + .validate_sidecar_proxy_uid() + .map_err(KubernetesDriverError::Precondition)?; let base_config = match kube::Config::incluster() { Ok(c) => c, Err(_) => kube::Config::infer() @@ -330,6 +335,47 @@ impl KubernetesComputeDriver { )) } + /// Resolve sandbox UID/GID from config or `OpenShift` SCC namespace annotations. + /// + /// Returns `(uid, gid, ns_annotations_map)`: + /// - If `sandbox_uid` is set in config, returns that (with fallback GID) + /// - Otherwise fetches the target namespace and checks for + /// `openshift.io/sa.scc.uid-range` / `openshift.io/sa.scc.supplemental-groups` + /// annotations. 
+    /// - If neither applies, falls back to UID `1000` with GID `sandbox_gid` or `1000`.
+    async fn resolve_sandbox_identity(&self) -> (u32, u32, BTreeMap<String, String>) {
+        // Explicit config takes priority — skip namespace lookup entirely.
+        if self.config.sandbox_uid.is_some() {
+            let uid = self.config.resolve_sandbox_uid(None);
+            let gid = self.config.resolve_sandbox_gid(uid, None);
+            return (uid, gid, BTreeMap::new());
+        }
+
+        // Try to read namespace annotations for OpenShift SCC. `Namespace` is a
+        // cluster-scoped resource, so the handle comes from `Api::all` and the
+        // target namespace is fetched by name.
+        let ns_api: Api<Namespace> = Api::all(self.client.clone());
+        if let Ok(Ok(ns)) =
+            tokio::time::timeout(KUBE_API_TIMEOUT, ns_api.get(self.config.namespace.as_str())).await
+        {
+            let anns = ns.metadata.annotations.unwrap_or_default();
+            let uid = self.config.resolve_sandbox_uid(Some(&anns));
+            // A configured `sandbox_gid` wins even when `sandbox_uid` is unset;
+            // otherwise use the SCC supplemental-groups start, then the UID.
+            let gid = self.config.sandbox_gid.unwrap_or_else(|| {
+                anns.get(crate::config::ANNOTATION_SCC_SUPPLEMENTAL_GROUPS)
+                    .and_then(|r| KubernetesComputeConfig::from_open_shift_supplemental_groups(r))
+                    .unwrap_or(uid)
+            });
+            (uid, gid, anns)
+        } else {
+            // Namespace fetch failed or timed out; fall back to defaults.
+            let uid = DEFAULT_SANDBOX_UID;
+            let gid = self.config.sandbox_gid.unwrap_or(uid);
+            (uid, gid, BTreeMap::new())
+        }
+    }
+
     async fn has_gpu_capacity(&self) -> Result {
         let nodes: Api = Api::all(self.client.clone());
         let node_list = nodes.list(&ListParams::default()).await?;
@@ -471,11 +517,21 @@ impl KubernetesComputeDriver {
             .supported_agent_sandbox_api(self.client.clone())
             .await
             .map_err(KubernetesDriverError::Message)?;
+
+        // Resolve sandbox UID/GID from config or OpenShift SCC namespace annotations.
+ let (resolved_user_id, resolved_group_id, ns_annotations) = + self.resolve_sandbox_identity().await; + let mut obj = DynamicObject::new(name, &agent_sandbox_api.resource); obj.metadata = ObjectMeta { name: Some(name.to_string()), namespace: Some(self.config.namespace.clone()), labels: Some(sandbox_labels(sandbox)), + annotations: if ns_annotations.is_empty() { + None + } else { + Some(ns_annotations) + }, ..Default::default() }; let params = SandboxPodParams { @@ -485,6 +541,8 @@ impl KubernetesComputeDriver { supervisor_image: &self.config.supervisor_image, supervisor_image_pull_policy: &self.config.supervisor_image_pull_policy, supervisor_sideload_method: self.config.supervisor_sideload_method, + supervisor_topology: self.config.supervisor_topology, + sidecar_proxy_uid: self.config.sidecar_proxy_uid, service_account_name: &self.config.service_account_name, sandbox_id: &sandbox.id, sandbox_name: &sandbox.name, @@ -501,7 +559,11 @@ impl KubernetesComputeDriver { provider_spiffe_workload_api_socket_path: &self .config .provider_spiffe_workload_api_socket_path, + sandbox_uid: resolved_user_id, + sandbox_gid: resolved_group_id, }; + validate_sidecar_proxy_identity(¶ms)?; + obj.data = sandbox_to_k8s_spec(sandbox.spec.as_ref(), ¶ms); match tokio::time::timeout( KUBE_API_TIMEOUT, @@ -932,6 +994,31 @@ const SUPERVISOR_VOLUME_NAME: &str = "openshell-supervisor-bin"; /// Name of the init container that installs the supervisor binary. const SUPERVISOR_INIT_CONTAINER_NAME: &str = "openshell-supervisor-install"; +/// Name of the init container that prepares pod-level sidecar networking. +const SUPERVISOR_NETWORK_INIT_CONTAINER_NAME: &str = "openshell-network-init"; + +/// Container name for the network-only supervisor sidecar. +const SUPERVISOR_NETWORK_SIDECAR_NAME: &str = "openshell-supervisor-network"; + +/// Shared volume used by the network sidecar to signal readiness to the +/// process-only supervisor in the agent container. 
+const SIDECAR_STATE_VOLUME_NAME: &str = "openshell-sidecar-state"; +const SIDECAR_STATE_MOUNT_PATH: &str = "/run/openshell-sidecar"; +const SIDECAR_READY_FILE: &str = "/run/openshell-sidecar/supervisor.ready"; +const SIDECAR_ENTRYPOINT_PID_FILE: &str = "/run/openshell-sidecar/entrypoint.pid"; +const SIDECAR_SSH_SOCKET_FILE: &str = "/run/openshell-sidecar/ssh.sock"; + +/// Shared TLS work directory. The network sidecar writes the proxy CA bundle +/// here, while the agent container consumes it after the readiness file exists. +const SIDECAR_TLS_VOLUME_NAME: &str = "openshell-supervisor-tls"; +const SIDECAR_TLS_MOUNT_PATH: &str = "/etc/openshell-tls/proxy"; +const SIDECAR_CLIENT_TLS_MOUNT_PATH: &str = "/etc/openshell-tls/proxy/client"; + +/// Loopback listener owned by the network sidecar. The process-only supervisor +/// connects here for gateway gRPC, and the sidecar forwards bytes to the real +/// gateway endpoint using its own network privileges. +const SIDECAR_GATEWAY_FORWARD_ADDR: &str = "127.0.0.1:18080"; + /// Build the emptyDir volume that holds the supervisor binary. /// /// The init container writes the binary here; the agent container reads it. @@ -1006,28 +1093,12 @@ fn supervisor_init_container( spec } -/// Apply supervisor side-load transforms to an already-built pod template JSON. -/// -/// Depending on the sideload method: -/// - **`ImageVolume`**: mounts the supervisor OCI image directly as a read-only -/// volume (no init container needed, requires K8s >= v1.33). -/// - **`InitContainer`**: injects an emptyDir volume and an init container that -/// copies the supervisor binary from the supervisor image into that volume. -/// -/// In both cases, the agent container gets a command override to run the -/// side-loaded binary and `runAsUser: 0` so it can create network namespaces, -/// set up the proxy, and configure Landlock/seccomp. 
-fn apply_supervisor_sideload( - pod_template: &mut serde_json::Value, +fn apply_supervisor_binary_source( + spec: &mut serde_json::Map, supervisor_image: &str, supervisor_image_pull_policy: &str, method: SupervisorSideloadMethod, ) { - let Some(spec) = pod_template.get_mut("spec").and_then(|v| v.as_object_mut()) else { - return; - }; - - // 1. Add the volume (image source or emptyDir depending on method) let volumes = spec .entry("volumes") .or_insert_with(|| serde_json::json!([])) @@ -1046,7 +1117,6 @@ fn apply_supervisor_sideload( } } - // 2. Add the init container only for the init-container method if method == SupervisorSideloadMethod::InitContainer { let init_containers = spec .entry("initContainers") @@ -1059,8 +1129,35 @@ fn apply_supervisor_sideload( )); } } +} + +/// Apply supervisor side-load transforms to an already-built pod template JSON. +/// +/// Depending on the sideload method: +/// - **`ImageVolume`**: mounts the supervisor OCI image directly as a read-only +/// volume (no init container needed, requires K8s >= v1.33). +/// - **`InitContainer`**: injects an emptyDir volume and an init container that +/// copies the supervisor binary from the supervisor image into that volume. +/// +/// In both cases, the agent container gets a command override to run the +/// side-loaded binary as root so it can create network namespaces, set up the +/// proxy, and configure Landlock/seccomp. +#[allow(clippy::similar_names)] +fn apply_supervisor_sideload( + pod_template: &mut serde_json::Value, + supervisor_image: &str, + supervisor_image_pull_policy: &str, + method: SupervisorSideloadMethod, + sandbox_uid: u32, + sandbox_gid: u32, +) { + let Some(spec) = pod_template.get_mut("spec").and_then(|v| v.as_object_mut()) else { + return; + }; + + apply_supervisor_binary_source(spec, supervisor_image, supervisor_image_pull_policy, method); - // 3. 
Find the agent container and add volume mount + command override + // Find the agent container and add volume mount + command override let Some(containers) = spec.get_mut("containers").and_then(|v| v.as_array_mut()) else { return; }; @@ -1101,7 +1198,423 @@ fn apply_supervisor_sideload( if let Some(volume_mounts) = volume_mounts { volume_mounts.push(supervisor_volume_mount()); } + + // Inject resolved sandbox UID/GID as environment variables so the + // supervisor can use them directly without /etc/passwd lookups. + let env = container + .entry("env") + .or_insert_with(|| serde_json::json!([])) + .as_array_mut(); + if let Some(env) = env { + env.push(serde_json::json!({ + "name": openshell_core::sandbox_env::SANDBOX_UID.to_string(), + "value": sandbox_uid.to_string(), + })); + env.push(serde_json::json!({ + "name": openshell_core::sandbox_env::SANDBOX_GID.to_string(), + "value": sandbox_gid.to_string(), + })); + } + } +} + +fn sidecar_state_volume_mount() -> serde_json::Value { + serde_json::json!({ + "name": SIDECAR_STATE_VOLUME_NAME, + "mountPath": SIDECAR_STATE_MOUNT_PATH, + }) +} + +fn sidecar_tls_volume_mount() -> serde_json::Value { + serde_json::json!({ + "name": SIDECAR_TLS_VOLUME_NAME, + "mountPath": SIDECAR_TLS_MOUNT_PATH, + }) +} + +fn sidecar_process_gateway_endpoint(grpc_endpoint: &str) -> String { + if grpc_endpoint.is_empty() { + String::new() + } else if grpc_endpoint.starts_with("https://") { + format!("https://{SIDECAR_GATEWAY_FORWARD_ADDR}") + } else { + format!("http://{SIDECAR_GATEWAY_FORWARD_ADDR}") + } +} + +fn gateway_tls_server_name(grpc_endpoint: &str) -> Option { + let rest = grpc_endpoint.strip_prefix("https://")?; + let authority = rest.split('/').next().unwrap_or(rest); + if authority.is_empty() { + return None; + } + if let Some(bracketed) = authority.strip_prefix('[') { + return bracketed.split(']').next().map(str::to_string); + } + authority + .split(':') + .next() + .filter(|host| !host.is_empty()) + .map(str::to_string) +} + +fn 
copy_log_level_env( + env: &mut Vec, + template_environment: &std::collections::HashMap, + spec_environment: &std::collections::HashMap, +) { + if let Some(value) = spec_environment + .get(openshell_core::sandbox_env::LOG_LEVEL) + .or_else(|| template_environment.get(openshell_core::sandbox_env::LOG_LEVEL)) + { + upsert_env(env, openshell_core::sandbox_env::LOG_LEVEL, value); + } +} + +fn supervisor_sidecar_env( + template_environment: &std::collections::HashMap, + spec_environment: &std::collections::HashMap, + params: &SandboxPodParams<'_>, +) -> Vec { + let mut env = Vec::new(); + apply_required_env( + &mut env, + params.sandbox_id, + params.sandbox_name, + params.grpc_endpoint, + "", + !params.client_tls_secret_name.is_empty(), + provider_spiffe_socket_path(params), + ); + if !params.client_tls_secret_name.is_empty() { + upsert_env( + &mut env, + openshell_core::sandbox_env::TLS_CA, + &format!("{SIDECAR_CLIENT_TLS_MOUNT_PATH}/ca.crt"), + ); + upsert_env( + &mut env, + openshell_core::sandbox_env::TLS_CERT, + &format!("{SIDECAR_CLIENT_TLS_MOUNT_PATH}/tls.crt"), + ); + upsert_env( + &mut env, + openshell_core::sandbox_env::TLS_KEY, + &format!("{SIDECAR_CLIENT_TLS_MOUNT_PATH}/tls.key"), + ); + } + copy_log_level_env(&mut env, template_environment, spec_environment); + upsert_env( + &mut env, + openshell_core::sandbox_env::SUPERVISOR_TOPOLOGY, + "sidecar", + ); + upsert_env( + &mut env, + openshell_core::sandbox_env::NETWORK_ENFORCEMENT_MODE, + "sidecar-nftables", + ); + upsert_env( + &mut env, + openshell_core::sandbox_env::NETWORK_BINARY_IDENTITY, + "relaxed", + ); + upsert_env( + &mut env, + openshell_core::sandbox_env::SUPERVISOR_READY_FILE, + SIDECAR_READY_FILE, + ); + upsert_env( + &mut env, + openshell_core::sandbox_env::ENTRYPOINT_PID_FILE, + SIDECAR_ENTRYPOINT_PID_FILE, + ); + upsert_env( + &mut env, + openshell_core::sandbox_env::GATEWAY_FORWARD_ADDR, + SIDECAR_GATEWAY_FORWARD_ADDR, + ); + upsert_env( + &mut env, + 
openshell_core::sandbox_env::PROXY_TLS_DIR, + SIDECAR_TLS_MOUNT_PATH, + ); + env +} + +fn supervisor_sidecar_container( + template_environment: &std::collections::HashMap, + spec_environment: &std::collections::HashMap, + params: &SandboxPodParams<'_>, +) -> serde_json::Value { + let mut container = serde_json::json!({ + "name": SUPERVISOR_NETWORK_SIDECAR_NAME, + "image": params.supervisor_image, + "command": [ + SUPERVISOR_IMAGE_BINARY_PATH, + "--mode=network", + ], + "env": supervisor_sidecar_env(template_environment, spec_environment, params), + "securityContext": { + "runAsUser": params.sidecar_proxy_uid, + "runAsGroup": params.sandbox_gid, + "runAsNonRoot": true, + "allowPrivilegeEscalation": false, + "capabilities": { + "drop": ["ALL"] + } + }, + "volumeMounts": [ + sidecar_state_volume_mount(), + sidecar_tls_volume_mount(), + { + "name": "openshell-sa-token", + "mountPath": "/var/run/secrets/openshell", + "readOnly": true + } + ] + }); + if !params.supervisor_image_pull_policy.is_empty() { + container["imagePullPolicy"] = serde_json::json!(params.supervisor_image_pull_policy); + } + if params.provider_spiffe_enabled { + container["volumeMounts"] + .as_array_mut() + .expect("volumeMounts is an array") + .push(serde_json::json!({ + "name": SPIFFE_WORKLOAD_API_VOLUME_NAME, + "mountPath": spiffe_socket_mount_path(params.provider_spiffe_workload_api_socket_path), + "readOnly": true, + })); } + if let Some(profile) = params.app_armor_profile { + container["securityContext"]["appArmorProfile"] = app_armor_profile_to_k8s(profile); + } + container +} + +fn supervisor_network_init_container(params: &SandboxPodParams<'_>) -> serde_json::Value { + let mut container = serde_json::json!({ + "name": SUPERVISOR_NETWORK_INIT_CONTAINER_NAME, + "image": params.supervisor_image, + "command": [ + SUPERVISOR_IMAGE_BINARY_PATH, + "--mode=network-init", + "--sidecar-proxy-uid", + params.sidecar_proxy_uid.to_string(), + "--sidecar-proxy-gid", + params.sandbox_gid.to_string(), + 
"--sidecar-state-dir", + SIDECAR_STATE_MOUNT_PATH, + "--sidecar-tls-dir", + SIDECAR_TLS_MOUNT_PATH, + ], + "securityContext": { + "runAsUser": 0, + "allowPrivilegeEscalation": false, + "capabilities": { + "drop": ["ALL"], + "add": ["NET_ADMIN", "CHOWN", "FOWNER"] + } + }, + "volumeMounts": [ + sidecar_state_volume_mount(), + sidecar_tls_volume_mount(), + ] + }); + if !params.supervisor_image_pull_policy.is_empty() { + container["imagePullPolicy"] = serde_json::json!(params.supervisor_image_pull_policy); + } + if !params.client_tls_secret_name.is_empty() { + container["volumeMounts"] + .as_array_mut() + .expect("volumeMounts is an array") + .push(serde_json::json!({ + "name": "openshell-client-tls", + "mountPath": "/etc/openshell-tls/client", + "readOnly": true + })); + } + if let Some(profile) = params.app_armor_profile { + container["securityContext"]["appArmorProfile"] = app_armor_profile_to_k8s(profile); + } + container +} + +fn apply_supervisor_sidecar_topology( + pod_template: &mut serde_json::Value, + template_environment: &std::collections::HashMap, + spec_environment: &std::collections::HashMap, + params: &SandboxPodParams<'_>, +) { + let Some(spec) = pod_template.get_mut("spec").and_then(|v| v.as_object_mut()) else { + return; + }; + + let pod_security_context = spec + .entry("securityContext") + .or_insert_with(|| serde_json::json!({})); + if let Some(sc) = pod_security_context.as_object_mut() { + sc.insert("fsGroup".to_string(), serde_json::json!(params.sandbox_gid)); + } + + apply_supervisor_binary_source( + spec, + params.supervisor_image, + params.supervisor_image_pull_policy, + params.supervisor_sideload_method, + ); + + let volumes = spec + .entry("volumes") + .or_insert_with(|| serde_json::json!([])) + .as_array_mut(); + if let Some(volumes) = volumes { + volumes.push(serde_json::json!({ + "name": SIDECAR_STATE_VOLUME_NAME, + "emptyDir": {} + })); + volumes.push(serde_json::json!({ + "name": SIDECAR_TLS_VOLUME_NAME, + "emptyDir": {} + })); + } + + 
let init_containers = spec + .entry("initContainers") + .or_insert_with(|| serde_json::json!([])) + .as_array_mut(); + if let Some(init_containers) = init_containers { + init_containers.push(supervisor_network_init_container(params)); + } + + let Some(containers) = spec.get_mut("containers").and_then(|v| v.as_array_mut()) else { + return; + }; + + let target_index = containers + .iter() + .position(|c| c.get("name").and_then(|v| v.as_str()) == Some("agent")) + .unwrap_or(0); + + if let Some(container) = containers + .get_mut(target_index) + .and_then(|v| v.as_object_mut()) + { + container.insert( + "command".to_string(), + serde_json::json!([ + format!("{}/openshell-sandbox", SUPERVISOR_MOUNT_PATH), + "--mode=process" + ]), + ); + + let security_context = container + .entry("securityContext") + .or_insert_with(|| serde_json::json!({})); + if let Some(sc) = security_context.as_object_mut() { + sc.insert( + "runAsUser".to_string(), + serde_json::json!(params.sandbox_uid), + ); + sc.insert( + "runAsGroup".to_string(), + serde_json::json!(params.sandbox_gid), + ); + sc.insert("runAsNonRoot".to_string(), serde_json::json!(true)); + sc.insert( + "allowPrivilegeEscalation".to_string(), + serde_json::json!(false), + ); + sc.insert( + "capabilities".to_string(), + serde_json::json!({ + "drop": ["ALL"] + }), + ); + } + + let volume_mounts = container + .entry("volumeMounts") + .or_insert_with(|| serde_json::json!([])) + .as_array_mut(); + if let Some(volume_mounts) = volume_mounts { + volume_mounts.push(supervisor_volume_mount()); + volume_mounts.push(sidecar_state_volume_mount()); + volume_mounts.push(sidecar_tls_volume_mount()); + } + + let env = container + .entry("env") + .or_insert_with(|| serde_json::json!([])) + .as_array_mut(); + if let Some(env) = env { + let process_endpoint = sidecar_process_gateway_endpoint(params.grpc_endpoint); + upsert_env( + env, + openshell_core::sandbox_env::ENDPOINT, + &process_endpoint, + ); + if let Some(server_name) = 
gateway_tls_server_name(params.grpc_endpoint) { + upsert_env( + env, + openshell_core::sandbox_env::GATEWAY_TLS_SERVER_NAME, + &server_name, + ); + } + upsert_env( + env, + openshell_core::sandbox_env::SUPERVISOR_TOPOLOGY, + "sidecar", + ); + upsert_env( + env, + openshell_core::sandbox_env::NETWORK_ENFORCEMENT_MODE, + "sidecar-nftables", + ); + upsert_env( + env, + openshell_core::sandbox_env::PROCESS_ENFORCEMENT_MODE, + "network-only", + ); + upsert_env( + env, + openshell_core::sandbox_env::SSH_SOCKET_PATH, + SIDECAR_SSH_SOCKET_FILE, + ); + upsert_env( + env, + openshell_core::sandbox_env::SUPERVISOR_READY_FILE, + SIDECAR_READY_FILE, + ); + upsert_env( + env, + openshell_core::sandbox_env::ENTRYPOINT_PID_FILE, + SIDECAR_ENTRYPOINT_PID_FILE, + ); + upsert_env( + env, + openshell_core::sandbox_env::PROXY_TLS_DIR, + SIDECAR_TLS_MOUNT_PATH, + ); + upsert_env( + env, + openshell_core::sandbox_env::SANDBOX_UID, + &params.sandbox_uid.to_string(), + ); + upsert_env( + env, + openshell_core::sandbox_env::SANDBOX_GID, + &params.sandbox_gid.to_string(), + ); + } + } + + containers.push(supervisor_sidecar_container( + template_environment, + spec_environment, + params, + )); } /// Apply workspace persistence transforms to an already-built pod template. @@ -1119,10 +1632,13 @@ fn apply_supervisor_sideload( /// The init container mounts the PVC at a temporary path so it can still see /// the image's `/sandbox` directory. It checks for a sentinel file and skips /// the copy if the PVC was already initialised. 
+#[allow(clippy::similar_names)] fn apply_workspace_persistence( pod_template: &mut serde_json::Value, image: &str, image_pull_policy: &str, + sandbox_uid: u32, + sandbox_gid: u32, ) { let Some(spec) = pod_template.get_mut("spec").and_then(|v| v.as_object_mut()) else { return; @@ -1168,6 +1684,10 @@ fn apply_workspace_persistence( // self-referential symlinks under `/sandbox/.uv`, and GNU cp can // fail while seeding the PVC even though preserving the symlink as-is // is valid. 
`tar` copies the tree without dereferencing those links. + // Archive only the contents, not the `/sandbox` directory entry + // itself, so extraction never tries to chmod the PVC mount root. + // Extract without restoring owner, mode, or timestamps so the + // non-root init container can seed kubelet-owned PVCs. // // The inner `[ -d ... ]` guard handles custom images that don't have // a /sandbox directory — the copy is skipped but the sentinel is @@ -1175,7 +1695,12 @@ fn apply_workspace_persistence( let copy_cmd = format!( "if [ ! -f {WORKSPACE_INIT_MOUNT_PATH}/{WORKSPACE_SENTINEL} ]; then \ if [ -d {WORKSPACE_MOUNT_PATH} ]; then \ - tar -C {WORKSPACE_MOUNT_PATH} -cf - . | tar -C {WORKSPACE_INIT_MOUNT_PATH} -xpf -; \ + tmp=$(mktemp) && rm -f \"$tmp\" && \ + (cd {WORKSPACE_MOUNT_PATH} && find . -mindepth 1 -maxdepth 1 -exec tar -cf \"$tmp\" {{}} +) && \ + if [ -f \"$tmp\" ]; then \ + tar -C {WORKSPACE_INIT_MOUNT_PATH} --no-same-owner --no-same-permissions --touch -xf \"$tmp\" && \ + rm -f \"$tmp\"; \ + fi; \ fi && \ touch {WORKSPACE_INIT_MOUNT_PATH}/{WORKSPACE_SENTINEL}; \ fi" @@ -1185,7 +1710,11 @@ fn apply_workspace_persistence( "name": WORKSPACE_INIT_CONTAINER_NAME, "image": image, "command": ["sh", "-c", copy_cmd], - "securityContext": { "runAsUser": 0 }, + "securityContext": { + "runAsUser": sandbox_uid, + "runAsGroup": sandbox_gid, + "fsGroup": sandbox_gid, + }, "volumeMounts": [{ "name": WORKSPACE_VOLUME_NAME, "mountPath": WORKSPACE_INIT_MOUNT_PATH @@ -1231,6 +1760,8 @@ struct SandboxPodParams<'a> { supervisor_image: &'a str, supervisor_image_pull_policy: &'a str, supervisor_sideload_method: SupervisorSideloadMethod, + supervisor_topology: SupervisorTopology, + sidecar_proxy_uid: u32, service_account_name: &'a str, sandbox_id: &'a str, sandbox_name: &'a str, @@ -1247,6 +1778,10 @@ struct SandboxPodParams<'a> { sa_token_ttl_secs: i64, provider_spiffe_enabled: bool, provider_spiffe_workload_api_socket_path: &'a str, + /// Resolved sandbox UID for supervisor 
`runAsUser` and env var. + sandbox_uid: u32, + /// Resolved sandbox GID for PVC init container operations. + sandbox_gid: u32, } impl Default for SandboxPodParams<'_> { @@ -1258,6 +1793,8 @@ impl Default for SandboxPodParams<'_> { supervisor_image: "", supervisor_image_pull_policy: "", supervisor_sideload_method: SupervisorSideloadMethod::default(), + supervisor_topology: SupervisorTopology::default(), + sidecar_proxy_uid: DEFAULT_SIDECAR_PROXY_UID, service_account_name: DEFAULT_SANDBOX_SERVICE_ACCOUNT_NAME, sandbox_id: "", sandbox_name: "", @@ -1272,10 +1809,26 @@ impl Default for SandboxPodParams<'_> { sa_token_ttl_secs: 3600, provider_spiffe_enabled: false, provider_spiffe_workload_api_socket_path: "", + sandbox_uid: DEFAULT_SANDBOX_UID, + sandbox_gid: DEFAULT_SANDBOX_UID, } } } +fn validate_sidecar_proxy_identity( + params: &SandboxPodParams<'_>, +) -> Result<(), KubernetesDriverError> { + if params.supervisor_topology == SupervisorTopology::Sidecar + && params.sidecar_proxy_uid == params.sandbox_uid + { + return Err(KubernetesDriverError::Precondition(format!( + "sidecar_proxy_uid ({}) must not match sandbox_uid ({}) in sidecar topology", + params.sidecar_proxy_uid, params.sandbox_uid + ))); + } + Ok(()) +} + fn spec_pod_env(spec: Option<&SandboxSpec>) -> std::collections::HashMap { let mut env = spec.map_or_else(Default::default, |s| s.environment.clone()); if let Some(s) = spec.filter(|s| !s.log_level.is_empty()) { @@ -1586,13 +2139,22 @@ fn sandbox_template_to_k8s_with_gpu_requirements( serde_json::Value::Array(vec![serde_json::Value::Object(container)]), ); - // Add TLS secret volume. Mode 0400 (owner-read) prevents the - // unprivileged sandbox user from reading the mTLS private key. + // Add TLS secret volume. Combined mode uses mode 0400 because the + // supervisor starts as root and drops privileges before running workload + // children. 
Sidecar mode keeps the process supervisor non-root, so it uses + // pod fsGroup + 0440 to preserve gateway session and SSH control behavior. let mut volumes: Vec = Vec::new(); if !params.client_tls_secret_name.is_empty() { + let client_tls_default_mode = match params.supervisor_topology { + SupervisorTopology::Combined => 0o400, + SupervisorTopology::Sidecar => 0o440, + }; volumes.push(serde_json::json!({ "name": "openshell-client-tls", - "secret": { "secretName": params.client_tls_secret_name, "defaultMode": 256 } + "secret": { + "secretName": params.client_tls_secret_name, + "defaultMode": client_tls_default_mode + } })); } if params.provider_spiffe_enabled { @@ -1607,7 +2169,12 @@ fn sandbox_template_to_k8s_with_gpu_requirements( // Projected ServiceAccountToken volume — kubelet writes a short-lived // audience-bound JWT into /var/run/secrets/openshell/token and rotates // it automatically. The supervisor exchanges this for a gateway-minted - // JWT via `IssueSandboxToken` once at startup. + // JWT via `IssueSandboxToken` once at startup. In sidecar topology both + // supervisor containers run with the sandbox GID and need group-read access. 
+ let sa_token_default_mode = match params.supervisor_topology { + SupervisorTopology::Combined => 0o400, + SupervisorTopology::Sidecar => 0o440, + }; volumes.push(serde_json::json!({ "name": "openshell-sa-token", "projected": { @@ -1618,7 +2185,7 @@ fn sandbox_template_to_k8s_with_gpu_requirements( "path": "token" } }], - "defaultMode": 256 + "defaultMode": sa_token_default_mode } })); spec.insert("volumes".to_string(), serde_json::Value::Array(volumes)); @@ -1642,18 +2209,38 @@ fn sandbox_template_to_k8s_with_gpu_requirements( let mut result = serde_json::Value::Object(template_value); - apply_supervisor_sideload( - &mut result, - params.supervisor_image, - params.supervisor_image_pull_policy, - params.supervisor_sideload_method, - ); + match params.supervisor_topology { + SupervisorTopology::Combined => { + apply_supervisor_sideload( + &mut result, + params.supervisor_image, + params.supervisor_image_pull_policy, + params.supervisor_sideload_method, + params.sandbox_uid, + params.sandbox_gid, + ); + } + SupervisorTopology::Sidecar => { + apply_supervisor_sidecar_topology( + &mut result, + &template.environment, + spec_environment, + params, + ); + } + } // Inject workspace persistence (init container + PVC volume mount) so // that /sandbox data survives pod rescheduling. Skipped when the user // provides custom volumeClaimTemplates to avoid conflicts. if inject_workspace { - apply_workspace_persistence(&mut result, image, params.image_pull_policy); + apply_workspace_persistence( + &mut result, + image, + params.image_pull_policy, + params.sandbox_uid, + params.sandbox_gid, + ); } result @@ -2134,6 +2721,15 @@ mod tests { assert!(!should_try_next_sandbox_api_version(&err)); } + fn rendered_env<'a>(container: &'a serde_json::Value, name: &str) -> Option<&'a str> { + container["env"] + .as_array()? + .iter() + .find(|item| item.get("name").and_then(|value| value.as_str()) == Some(name))? + .get("value")? 
+ .as_str() + } + #[test] fn driver_config_rejects_invalid_shape() { let template = SandboxTemplate { @@ -2271,6 +2867,8 @@ mod tests { "custom-image:latest", "IfNotPresent", SupervisorSideloadMethod::InitContainer, + 1500, // sandbox_uid + 1500, // sandbox_gid ); let sc = &pod_template["spec"]["containers"][0]["securityContext"]; @@ -2300,6 +2898,8 @@ mod tests { "supervisor-image:latest", "IfNotPresent", SupervisorSideloadMethod::InitContainer, + 1000, // sandbox_uid + 1000, // sandbox_gid ); let sc = &pod_template["spec"]["containers"][0]["securityContext"]; @@ -2325,6 +2925,8 @@ mod tests { "supervisor-image:latest", "IfNotPresent", SupervisorSideloadMethod::InitContainer, + 1000, // sandbox_uid + 1000, // sandbox_gid ); // Volume should be an emptyDir @@ -2399,6 +3001,8 @@ mod tests { "supervisor-image:latest", "IfNotPresent", SupervisorSideloadMethod::ImageVolume, + 1000, // sandbox_uid + 1000, // sandbox_gid ); let volumes = pod_template["spec"]["volumes"] @@ -2453,6 +3057,8 @@ mod tests { "supervisor-image:latest", "", SupervisorSideloadMethod::ImageVolume, + 1000, // sandbox_uid + 1000, // sandbox_gid ); let volume = &pod_template["spec"]["volumes"][0]; @@ -2463,6 +3069,257 @@ mod tests { ); } + #[test] + fn sidecar_topology_renders_process_agent_and_network_sidecar() { + let params = SandboxPodParams { + supervisor_topology: SupervisorTopology::Sidecar, + supervisor_sideload_method: SupervisorSideloadMethod::InitContainer, + supervisor_image: "supervisor-image:latest", + supervisor_image_pull_policy: "IfNotPresent", + grpc_endpoint: "https://openshell-gateway.openshell.svc:8080", + client_tls_secret_name: "openshell-client-tls", + sidecar_proxy_uid: 2200, + sandbox_uid: 1500, + sandbox_gid: 1500, + ..SandboxPodParams::default() + }; + let pod_template = sandbox_template_to_k8s( + &SandboxTemplate { + image: "agent-image:latest".to_string(), + ..SandboxTemplate::default() + }, + false, + &std::collections::HashMap::new(), + false, + &params, + ); + + assert!( 
+ pod_template["spec"]["shareProcessNamespace"].is_null(), + "sidecar mode no longer needs a shared process namespace when binary identity is relaxed" + ); + assert_eq!(pod_template["spec"]["securityContext"]["fsGroup"], 1500); + let containers = pod_template["spec"]["containers"].as_array().unwrap(); + assert_eq!(containers.len(), 2); + + let agent = containers + .iter() + .find(|container| container["name"] == "agent") + .unwrap(); + assert_eq!( + agent["command"], + serde_json::json!([ + format!("{SUPERVISOR_MOUNT_PATH}/openshell-sandbox"), + "--mode=process" + ]) + ); + assert_eq!(agent["securityContext"]["runAsUser"], 1500); + assert_eq!(agent["securityContext"]["runAsGroup"], 1500); + assert_eq!(agent["securityContext"]["runAsNonRoot"], true); + assert_eq!( + agent["securityContext"]["capabilities"], + serde_json::json!({ + "drop": ["ALL"] + }) + ); + assert_eq!( + rendered_env(agent, openshell_core::sandbox_env::ENDPOINT), + Some("https://127.0.0.1:18080") + ); + assert_eq!( + rendered_env(agent, openshell_core::sandbox_env::GATEWAY_TLS_SERVER_NAME), + Some("openshell-gateway.openshell.svc") + ); + assert_eq!( + rendered_env(agent, openshell_core::sandbox_env::PROCESS_ENFORCEMENT_MODE), + Some("network-only") + ); + assert_eq!( + rendered_env(agent, openshell_core::sandbox_env::SSH_SOCKET_PATH), + Some(SIDECAR_SSH_SOCKET_FILE) + ); + assert_eq!( + rendered_env(agent, openshell_core::sandbox_env::SUPERVISOR_READY_FILE), + Some(SIDECAR_READY_FILE) + ); + assert_eq!( + rendered_env(agent, openshell_core::sandbox_env::ENTRYPOINT_PID_FILE), + Some(SIDECAR_ENTRYPOINT_PID_FILE) + ); + assert_eq!( + rendered_env(agent, openshell_core::sandbox_env::PROXY_TLS_DIR), + Some(SIDECAR_TLS_MOUNT_PATH) + ); + assert_eq!( + rendered_env(agent, openshell_core::sandbox_env::SANDBOX_UID), + Some("1500") + ); + + let sidecar = containers + .iter() + .find(|container| container["name"] == SUPERVISOR_NETWORK_SIDECAR_NAME) + .unwrap(); + assert_eq!(sidecar["image"], 
"supervisor-image:latest"); + assert_eq!(sidecar["imagePullPolicy"], "IfNotPresent"); + assert_eq!( + sidecar["command"], + serde_json::json!([SUPERVISOR_IMAGE_BINARY_PATH, "--mode=network"]) + ); + assert_eq!(sidecar["securityContext"]["runAsUser"], 2200); + assert_eq!(sidecar["securityContext"]["runAsGroup"], 1500); + assert_eq!(sidecar["securityContext"]["runAsNonRoot"], true); + assert_eq!( + sidecar["securityContext"]["capabilities"], + serde_json::json!({ + "drop": ["ALL"] + }) + ); + assert_eq!( + rendered_env(sidecar, openshell_core::sandbox_env::ENDPOINT), + Some("https://openshell-gateway.openshell.svc:8080") + ); + assert_eq!( + rendered_env(sidecar, openshell_core::sandbox_env::GATEWAY_FORWARD_ADDR), + Some(SIDECAR_GATEWAY_FORWARD_ADDR) + ); + assert_eq!( + rendered_env( + sidecar, + openshell_core::sandbox_env::NETWORK_BINARY_IDENTITY + ), + Some("relaxed") + ); + assert_eq!( + rendered_env(sidecar, openshell_core::sandbox_env::ENTRYPOINT_PID_FILE), + Some(SIDECAR_ENTRYPOINT_PID_FILE) + ); + assert_eq!( + rendered_env(sidecar, openshell_core::sandbox_env::PROXY_TLS_DIR), + Some(SIDECAR_TLS_MOUNT_PATH) + ); + assert_eq!( + rendered_env(sidecar, openshell_core::sandbox_env::TLS_CA), + Some("/etc/openshell-tls/proxy/client/ca.crt") + ); + let sidecar_mounts = sidecar["volumeMounts"].as_array().unwrap(); + assert!( + !sidecar_mounts + .iter() + .any(|mount| mount["name"] == "openshell-client-tls"), + "runtime sidecar should use the init-copied TLS files, not the root-owned Secret mount" + ); + let volumes = pod_template["spec"]["volumes"].as_array().unwrap(); + let sa_token = volumes + .iter() + .find(|volume| volume["name"] == "openshell-sa-token") + .unwrap(); + assert_eq!(sa_token["projected"]["defaultMode"], 0o440); + let client_tls = volumes + .iter() + .find(|volume| volume["name"] == "openshell-client-tls") + .unwrap(); + assert_eq!(client_tls["secret"]["defaultMode"], 0o440); + + let init_containers = 
pod_template["spec"]["initContainers"].as_array().unwrap(); + let network_init = init_containers + .iter() + .find(|container| container["name"] == SUPERVISOR_NETWORK_INIT_CONTAINER_NAME) + .unwrap(); + assert_eq!(network_init["image"], "supervisor-image:latest"); + assert_eq!(network_init["imagePullPolicy"], "IfNotPresent"); + assert_eq!( + network_init["command"], + serde_json::json!([ + SUPERVISOR_IMAGE_BINARY_PATH, + "--mode=network-init", + "--sidecar-proxy-uid", + "2200", + "--sidecar-proxy-gid", + "1500", + "--sidecar-state-dir", + SIDECAR_STATE_MOUNT_PATH, + "--sidecar-tls-dir", + SIDECAR_TLS_MOUNT_PATH + ]) + ); + assert_eq!( + network_init["securityContext"]["capabilities"], + serde_json::json!({ + "drop": ["ALL"], + "add": ["NET_ADMIN", "CHOWN", "FOWNER"] + }) + ); + let network_init_mounts = network_init["volumeMounts"].as_array().unwrap(); + assert!(network_init_mounts.iter().any(|mount| { + mount["name"] == "openshell-client-tls" + && mount["mountPath"] == "/etc/openshell-tls/client" + })); + } + + #[test] + fn sidecar_topology_adds_shared_state_and_tls_volumes() { + let params = SandboxPodParams { + supervisor_topology: SupervisorTopology::Sidecar, + supervisor_sideload_method: SupervisorSideloadMethod::ImageVolume, + supervisor_image: "supervisor-image:latest", + grpc_endpoint: "http://openshell-gateway.openshell.svc:8080", + ..SandboxPodParams::default() + }; + let pod_template = sandbox_template_to_k8s( + &SandboxTemplate::default(), + false, + &std::collections::HashMap::new(), + false, + &params, + ); + + let volumes = pod_template["spec"]["volumes"].as_array().unwrap(); + assert!( + volumes + .iter() + .any(|volume| volume["name"] == SIDECAR_STATE_VOLUME_NAME) + ); + assert!( + volumes + .iter() + .any(|volume| volume["name"] == SIDECAR_TLS_VOLUME_NAME) + ); + assert!(volumes.iter().any(|volume| { + volume["name"] == SUPERVISOR_VOLUME_NAME && volume["image"].is_object() + })); + + let containers = 
pod_template["spec"]["containers"].as_array().unwrap(); + for container_name in ["agent", SUPERVISOR_NETWORK_SIDECAR_NAME] { + let container = containers + .iter() + .find(|container| container["name"] == container_name) + .unwrap(); + let mounts = container["volumeMounts"].as_array().unwrap(); + assert!(mounts.iter().any(|mount| { + mount["name"] == SIDECAR_STATE_VOLUME_NAME + && mount["mountPath"] == SIDECAR_STATE_MOUNT_PATH + })); + assert!(mounts.iter().any(|mount| { + mount["name"] == SIDECAR_TLS_VOLUME_NAME + && mount["mountPath"] == SIDECAR_TLS_MOUNT_PATH + })); + } + } + + #[test] + fn sidecar_topology_rejects_proxy_uid_matching_sandbox_uid() { + let params = SandboxPodParams { + supervisor_topology: SupervisorTopology::Sidecar, + sidecar_proxy_uid: 1500, + sandbox_uid: 1500, + ..SandboxPodParams::default() + }; + + let err = validate_sidecar_proxy_identity(&params).unwrap_err(); + assert!(matches!(err, KubernetesDriverError::Precondition(_))); + assert!(err.to_string().contains("sidecar_proxy_uid")); + } + /// Regression test: TLS mount path must match env var paths. 
/// The volume is mounted at a specific path and the env vars must point to /// files within that same path, otherwise the sandbox will fail to start @@ -2945,6 +3802,8 @@ mod tests { &mut pod_template, "openshell/sandbox:latest", "IfNotPresent", + 1000, // sandbox_uid + 1000, // sandbox_gid ); // Init container @@ -2955,7 +3814,8 @@ mod tests { assert_eq!(init_containers[0]["name"], WORKSPACE_INIT_CONTAINER_NAME); assert_eq!(init_containers[0]["image"], "openshell/sandbox:latest"); assert_eq!(init_containers[0]["imagePullPolicy"], "IfNotPresent"); - assert_eq!(init_containers[0]["securityContext"]["runAsUser"], 0); + // init container runs as the resolved sandbox UID (not root) + assert_eq!(init_containers[0]["securityContext"]["runAsUser"], 1000); // Init container mounts PVC at temp path, not /sandbox let init_mounts = init_containers[0]["volumeMounts"] @@ -2998,7 +3858,13 @@ mod tests { } }); - apply_workspace_persistence(&mut pod_template, "my-custom-image:v2", "IfNotPresent"); + apply_workspace_persistence( + &mut pod_template, + "my-custom-image:v2", + "IfNotPresent", + 1000, + 1000, + ); let init_image = pod_template["spec"]["initContainers"][0]["image"] .as_str() @@ -3020,7 +3886,7 @@ mod tests { } }); - apply_workspace_persistence(&mut pod_template, "img:latest", "Always"); + apply_workspace_persistence(&mut pod_template, "img:latest", "Always", 1000, 1000); let cmd = pod_template["spec"]["initContainers"][0]["command"] .as_array() @@ -3034,6 +3900,16 @@ mod tests { script.contains("tar -C"), "init script must seed image contents with a tar stream" ); + assert!( + script.contains("find . 
-mindepth 1 -maxdepth 1"), + "init script must archive sandbox contents without the mount root entry" + ); + assert!( + script.contains("--no-same-owner") + && script.contains("--no-same-permissions") + && script.contains("--touch"), + "init script must avoid restoring metadata onto the PVC root" + ); } #[test] diff --git a/crates/openshell-driver-kubernetes/src/lib.rs b/crates/openshell-driver-kubernetes/src/lib.rs index 22b0a8703..309f7e270 100644 --- a/crates/openshell-driver-kubernetes/src/lib.rs +++ b/crates/openshell-driver-kubernetes/src/lib.rs @@ -6,8 +6,9 @@ pub mod driver; pub mod grpc; pub use config::{ - AppArmorProfile, DEFAULT_SANDBOX_SERVICE_ACCOUNT_NAME, DEFAULT_WORKSPACE_STORAGE_SIZE, - KubernetesComputeConfig, SupervisorSideloadMethod, + AppArmorProfile, DEFAULT_SANDBOX_SERVICE_ACCOUNT_NAME, DEFAULT_SIDECAR_PROXY_UID, + DEFAULT_WORKSPACE_STORAGE_SIZE, KubernetesComputeConfig, SupervisorSideloadMethod, + SupervisorTopology, }; pub use driver::{KubernetesComputeDriver, KubernetesDriverError}; pub use grpc::ComputeDriverService; diff --git a/crates/openshell-driver-kubernetes/src/main.rs b/crates/openshell-driver-kubernetes/src/main.rs index f7eeeba42..3e95a8215 100644 --- a/crates/openshell-driver-kubernetes/src/main.rs +++ b/crates/openshell-driver-kubernetes/src/main.rs @@ -11,7 +11,8 @@ use openshell_core::VERSION; use openshell_core::proto::compute::v1::compute_driver_server::ComputeDriverServer; use openshell_driver_kubernetes::{ AppArmorProfile, ComputeDriverService, DEFAULT_SANDBOX_SERVICE_ACCOUNT_NAME, - KubernetesComputeConfig, KubernetesComputeDriver, SupervisorSideloadMethod, + DEFAULT_SIDECAR_PROXY_UID, KubernetesComputeConfig, KubernetesComputeDriver, + SupervisorSideloadMethod, SupervisorTopology, }; #[derive(Parser, Debug)] @@ -80,6 +81,16 @@ struct Args { )] supervisor_sideload_method: SupervisorSideloadMethod, + #[arg( + long, + env = "OPENSHELL_SUPERVISOR_TOPOLOGY", + default_value = "combined" + )] + supervisor_topology: 
SupervisorTopology, + + #[arg(long, env = "OPENSHELL_SIDECAR_PROXY_UID", default_value_t = DEFAULT_SIDECAR_PROXY_UID)] + sidecar_proxy_uid: u32, + #[arg(long, env = "OPENSHELL_ENABLE_USER_NAMESPACES")] enable_user_namespaces: bool, @@ -117,6 +128,8 @@ async fn main() -> Result<()> { .unwrap_or_else(|| openshell_core::config::DEFAULT_SUPERVISOR_IMAGE.to_string()), supervisor_image_pull_policy: args.supervisor_image_pull_policy.unwrap_or_default(), supervisor_sideload_method: args.supervisor_sideload_method, + supervisor_topology: args.supervisor_topology, + sidecar_proxy_uid: args.sidecar_proxy_uid, grpc_endpoint: args.grpc_endpoint.unwrap_or_default(), ssh_socket_path: args.sandbox_ssh_socket_path, client_tls_secret_name: args.client_tls_secret_name.unwrap_or_default(), @@ -135,6 +148,8 @@ async fn main() -> Result<()> { provider_spiffe_workload_api_socket_path: args .provider_spiffe_workload_api_socket_path .unwrap_or_default(), + sandbox_uid: None, + sandbox_gid: None, }) .await .into_diagnostic()?; diff --git a/crates/openshell-driver-podman/README.md b/crates/openshell-driver-podman/README.md index c484bf8a4..05e45b073 100644 --- a/crates/openshell-driver-podman/README.md +++ b/crates/openshell-driver-podman/README.md @@ -125,8 +125,8 @@ sequenceDiagram C->>C: entrypoint: /opt/openshell/bin/openshell-sandbox ``` -The supervisor image from `deploy/docker/Dockerfile.supervisor` copies the static -`openshell-sandbox` binary to `/openshell-sandbox`. +The supervisor image from `deploy/docker/Dockerfile.supervisor` provides the +static `openshell-sandbox` binary at `/openshell-sandbox`. Mounting that image at `/opt/openshell/bin` makes the binary available as `/opt/openshell/bin/openshell-sandbox`. 
diff --git a/crates/openshell-driver-podman/src/container.rs b/crates/openshell-driver-podman/src/container.rs index 16814784a..de498f118 100644 --- a/crates/openshell-driver-podman/src/container.rs +++ b/crates/openshell-driver-podman/src/container.rs @@ -839,9 +839,8 @@ pub fn build_container_spec_with_token_and_gpu_devices( // Side-load the supervisor binary from a standalone OCI image. // Podman resolves image_volumes at the libpod layer, mounting the // image's filesystem at the destination path without starting a - // container from it. The supervisor image is FROM scratch with just - // the binary at /openshell-sandbox, so it appears at - // /opt/openshell/bin/openshell-sandbox. + // container from it. The supervisor image exposes the binary at + // /openshell-sandbox, so it appears at /opt/openshell/bin/openshell-sandbox. image_volumes, hostname: format!("sandbox-{}", sandbox.name), // Override the image's ENTRYPOINT so the supervisor binary runs diff --git a/crates/openshell-driver-vm/src/driver.rs b/crates/openshell-driver-vm/src/driver.rs index d5d9565e7..643d31834 100644 --- a/crates/openshell-driver-vm/src/driver.rs +++ b/crates/openshell-driver-vm/src/driver.rs @@ -207,7 +207,7 @@ enum GuestImagePayloadSource { LocalDocker { rootfs_archive: PathBuf }, } -#[derive(Debug, Clone)] +#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)] pub struct VmDriverConfig { pub openshell_endpoint: String, pub state_dir: PathBuf, @@ -225,8 +225,19 @@ pub struct VmDriverConfig { pub gpu_enabled: bool, pub gpu_mem_mib: u32, pub gpu_vcpus: u8, + /// Resolved sandbox UID for rootfs `/etc/passwd` entry. + /// When empty, defaults to 10001 (the legacy hardcoded value). + #[serde(default, skip_serializing_if = "Option::is_none")] + pub sandbox_uid: Option, + /// Resolved sandbox GID for rootfs `/etc/passwd` and `/etc/group` entries. + /// When empty, defaults to the resolved UID. 
+ #[serde(default, skip_serializing_if = "Option::is_none")] + pub sandbox_gid: Option, } +/// Default sandbox UID used by the VM driver when no config value is set. +pub const DEFAULT_SANDBOX_UID: u32 = 10001; + impl Default for VmDriverConfig { fn default() -> Self { Self { @@ -246,11 +257,23 @@ impl Default for VmDriverConfig { gpu_enabled: false, gpu_mem_mib: 8192, gpu_vcpus: 4, + sandbox_uid: None, + sandbox_gid: None, } } } impl VmDriverConfig { + /// Resolve the sandbox UID, falling back to `DEFAULT_SANDBOX_UID`. + pub fn resolve_sandbox_uid(&self) -> u32 { + self.sandbox_uid.unwrap_or(DEFAULT_SANDBOX_UID) + } + + /// Resolve the sandbox GID, falling back to the resolved UID. + pub fn resolve_sandbox_gid(&self, resolved_uid: u32) -> u32 { + self.sandbox_gid.unwrap_or(resolved_uid) + } + fn requires_tls_materials(&self) -> bool { self.openshell_endpoint.starts_with("https://") } @@ -2545,14 +2568,19 @@ impl VmDriver { let image_identity_owned = image_identity.to_string(); let exported_rootfs_for_build = exported_rootfs.clone(); let prepared_rootfs_for_build = prepared_rootfs.clone(); + let sandbox_user_id = self.config.resolve_sandbox_uid(); + let sandbox_group_id = self.config.resolve_sandbox_gid(sandbox_user_id); self.publish_vm_progress( sandbox_id, "PreparingRootfs", - format!("Preparing VM rootfs for local image \"{image_ref}\""), + format!( + "Preparing VM rootfs for local image \"{image_ref}\" (sandbox uid={sandbox_user_id})" + ), HashMap::from([ ("image_ref".to_string(), image_ref.to_string()), ("image_source".to_string(), "local_docker".to_string()), ("image_identity".to_string(), image_identity.to_string()), + ("sandbox_uid".to_string(), sandbox_user_id.to_string()), ]), ); let prepare_result = tokio::task::spawn_blocking(move || { @@ -2560,6 +2588,8 @@ impl VmDriver { prepare_sandbox_rootfs_from_image_root( &prepared_rootfs_for_build, &image_identity_owned, + sandbox_user_id, + sandbox_group_id, ) .map_err(|err| { format!("vm sandbox image 
'{image_ref_owned}' is not base-compatible: {err}") @@ -2678,20 +2708,27 @@ impl VmDriver { let image_ref_owned = image_ref.to_string(); let image_identity_owned = image_identity.to_string(); let prepared_rootfs_for_build = prepared_rootfs.clone(); + let sandbox_user_id = self.config.resolve_sandbox_uid(); + let sandbox_group_id = self.config.resolve_sandbox_gid(sandbox_user_id); self.publish_vm_progress( sandbox_id, "PreparingRootfs", - format!("Preparing VM rootfs for image \"{image_ref}\""), + format!( + "Preparing VM rootfs for image \"{image_ref}\" (sandbox uid={sandbox_user_id})" + ), HashMap::from([ ("image_ref".to_string(), image_ref.to_string()), ("image_source".to_string(), "registry".to_string()), ("image_identity".to_string(), image_identity.to_string()), + ("sandbox_uid".to_string(), sandbox_user_id.to_string()), ]), ); let prepare_result = tokio::task::spawn_blocking(move || { prepare_sandbox_rootfs_from_image_root( &prepared_rootfs_for_build, &image_identity_owned, + sandbox_user_id, + sandbox_group_id, ) .map_err(|err| { format!("vm sandbox image '{image_ref_owned}' is not base-compatible: {err}") diff --git a/crates/openshell-driver-vm/src/main.rs b/crates/openshell-driver-vm/src/main.rs index 57db7b64b..17718f952 100644 --- a/crates/openshell-driver-vm/src/main.rs +++ b/crates/openshell-driver-vm/src/main.rs @@ -214,6 +214,8 @@ async fn main() -> Result<()> { gpu_enabled: args.gpu, gpu_mem_mib: args.gpu_mem_mib, gpu_vcpus: args.gpu_vcpus, + sandbox_uid: None, + sandbox_gid: None, }) .await .map_err(|err| miette::miette!("{err}"))?; diff --git a/crates/openshell-driver-vm/src/rootfs.rs b/crates/openshell-driver-vm/src/rootfs.rs index d59e7b4b9..a2499d806 100644 --- a/crates/openshell-driver-vm/src/rootfs.rs +++ b/crates/openshell-driver-vm/src/rootfs.rs @@ -29,8 +29,10 @@ pub const fn sandbox_guest_init_path() -> &'static str { pub fn prepare_sandbox_rootfs_from_image_root( rootfs: &Path, image_identity: &str, + sandbox_user_id: u32, + 
sandbox_group_id: u32, ) -> Result<(), String> { - prepare_sandbox_rootfs(rootfs)?; + prepare_sandbox_rootfs(rootfs, sandbox_user_id, sandbox_group_id)?; validate_sandbox_rootfs(rootfs)?; fs::write( rootfs.join(ROOTFS_VARIANT_MARKER), @@ -348,7 +350,11 @@ fn append_symlink_to_archive( .map_err(|e| format!("append symlink {}: {e}", source_path.display())) } -fn prepare_sandbox_rootfs(rootfs: &Path) -> Result<(), String> { +fn prepare_sandbox_rootfs( + rootfs: &Path, + sandbox_user_id: u32, + sandbox_group_id: u32, +) -> Result<(), String> { for relative in ["opt/openshell/.initialized", "opt/openshell/.rootfs-type"] { remove_rootfs_path(rootfs, relative)?; } @@ -377,7 +383,7 @@ fn prepare_sandbox_rootfs(rootfs: &Path) -> Result<(), String> { fs::create_dir_all(&opt_dir).map_err(|e| format!("create {}: {e}", opt_dir.display()))?; fs::write(opt_dir.join(".rootfs-type"), "sandbox\n") .map_err(|e| format!("write sandbox rootfs marker: {e}"))?; - ensure_sandbox_guest_user(rootfs)?; + ensure_sandbox_guest_user(rootfs, sandbox_user_id, sandbox_group_id)?; create_sandbox_mountpoint(&rootfs.join("sandbox"))?; create_sandbox_mountpoint(&rootfs.join("image-cache"))?; create_sandbox_mountpoint(&rootfs.join("lower"))?; @@ -752,16 +758,17 @@ fn temporary_injection_path(image_path: &Path) -> PathBuf { )) } -fn ensure_sandbox_guest_user(rootfs: &Path) -> Result<(), String> { - const SANDBOX_UID: u32 = 10001; - const SANDBOX_GID: u32 = 10001; - +fn ensure_sandbox_guest_user( + rootfs: &Path, + sandbox_user_id: u32, + sandbox_group_id: u32, +) -> Result<(), String> { let etc_dir = rootfs.join("etc"); fs::create_dir_all(&etc_dir).map_err(|e| format!("create {}: {e}", etc_dir.display()))?; ensure_line_in_file( &etc_dir.join("group"), - &format!("sandbox:x:{SANDBOX_GID}:"), + &format!("sandbox:x:{sandbox_group_id}:"), |line| line.starts_with("sandbox:"), )?; ensure_line_in_file(&etc_dir.join("gshadow"), "sandbox:!::", |line| { @@ -769,7 +776,9 @@ fn ensure_sandbox_guest_user(rootfs: 
&Path) -> Result<(), String> { })?; ensure_line_in_file( &etc_dir.join("passwd"), - &format!("sandbox:x:{SANDBOX_UID}:{SANDBOX_GID}:OpenShell Sandbox:/sandbox:/bin/bash"), + &format!( + "sandbox:x:{sandbox_user_id}:{sandbox_group_id}:OpenShell Sandbox:/sandbox:/bin/bash" + ), |line| line.starts_with("sandbox:"), )?; ensure_line_in_file( @@ -936,7 +945,9 @@ mod tests { fs::write(rootfs.join("bin/sed"), b"sed").expect("write sed"); fs::write(rootfs.join("sbin/ip"), b"ip").expect("write ip"); - prepare_sandbox_rootfs(&rootfs).expect("prepare sandbox rootfs"); + // Use a non-standard UID so the test doesn't collide with the default. + let uid = 20001; + prepare_sandbox_rootfs(&rootfs, uid, uid).expect("prepare sandbox rootfs"); validate_sandbox_rootfs(&rootfs).expect("validate sandbox rootfs"); assert!(rootfs.join("srv/openshell-vm-sandbox-init.sh").is_file()); @@ -955,12 +966,14 @@ mod tests { assert!( fs::read_to_string(rootfs.join("etc/passwd")) .expect("read passwd") - .contains("sandbox:x:10001:10001:OpenShell Sandbox:/sandbox:/bin/bash") + .contains(&format!( + "sandbox:x:{uid}:{uid}:OpenShell Sandbox:/sandbox:/bin/bash" + )) ); assert!( fs::read_to_string(rootfs.join("etc/group")) .expect("read group") - .contains("sandbox:x:10001:") + .contains(&format!("sandbox:x:{uid}:")) ); assert_eq!( fs::read_to_string(rootfs.join("etc/hosts")).expect("read hosts"), @@ -980,7 +993,7 @@ mod tests { fs::create_dir_all(rootfs.join("sandbox")).expect("create sandbox workdir"); fs::write(rootfs.join("sandbox/app.py"), "print('hello')\n").expect("write app"); - prepare_sandbox_rootfs(&rootfs).expect("prepare sandbox rootfs"); + prepare_sandbox_rootfs(&rootfs, 10001, 10001).expect("prepare sandbox rootfs"); assert!(rootfs.join("sandbox").is_dir()); assert_eq!( diff --git a/crates/openshell-policy/src/lib.rs b/crates/openshell-policy/src/lib.rs index e046954a2..4fc83de92 100644 --- a/crates/openshell-policy/src/lib.rs +++ b/crates/openshell-policy/src/lib.rs @@ -552,6 +552,41 @@ 
fn from_proto(policy: &SandboxPolicy) -> PolicyFile { } } +// --------------------------------------------------------------------------- +// Sandbox UID/GID constants +// --------------------------------------------------------------------------- + +/// Minimum accepted UID for sandbox process identity. +/// UIDs below this are reserved for system users and are rejected. +pub const MIN_SANDBOX_UID: u32 = 1000; + +/// Maximum accepted UID for sandbox process identity. +/// UIDs above this exceed typical OS limits and are rejected. +pub const MAX_SANDBOX_UID: u32 = 2_000_000_000; + +/// The literal string value accepted as a valid sandbox user/group name. +const SANDBOX_NAME: &str = "sandbox"; + +/// Validate whether a process identity field value is acceptable. +/// +/// Accepts either the literal `"sandbox"` or a numeric UID/GID parsed as +/// `u32` within the range `[MIN_SANDBOX_UID, MAX_SANDBOX_UID]`. +/// +/// Rejects: +/// - The empty string (callers should use `ensure_sandbox_process_identity` +/// to fill defaults before validation) +/// - UID 0 or values below `MIN_SANDBOX_UID` +/// - Values above `MAX_SANDBOX_UID` +/// - Non-numeric strings other than `"sandbox"` (e.g. 
`"root"`, `"nobody"`) +/// - Numeric strings containing anything but ASCII digits, such as the +/// sign-prefixed `"+1000"` +pub fn is_valid_sandbox_identity(value: &str) -> bool { + if value == SANDBOX_NAME { + return true; + } + // `str::parse::<u32>` accepts a leading `+`, so require plain ASCII digits + // first; the empty string still falls through and fails to parse below. + if !value.bytes().all(|b| b.is_ascii_digit()) { + return false; + } + value + .parse::<u32>() + .is_ok_and(|uid| (MIN_SANDBOX_UID..=MAX_SANDBOX_UID).contains(&uid)) +} + // --------------------------------------------------------------------------- // Public API // --------------------------------------------------------------------------- @@ -725,7 +760,10 @@ impl fmt::Display for PolicyViolation { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { match self { Self::InvalidProcessIdentity { field, value } => { - write!(f, "{field} must be 'sandbox', got '{value}'") + write!( + f, + "{field} must be 'sandbox' or a numeric UID/GID in range [{MIN_SANDBOX_UID}, {MAX_SANDBOX_UID}], got '{value}'" + ) } Self::PathTraversal { path } => { write!(f, "path contains '..' traversal component: {path}") @@ -803,17 +841,18 @@ pub fn validate_sandbox_policy( ) -> std::result::Result<(), Vec> { let mut violations = Vec::new(); - // Check process identity — must be "sandbox". + // Check process identity — must be "sandbox" or a numeric UID/GID + // within the acceptable sandbox range. // `ensure_sandbox_process_identity` should be called before this to - // fill in defaults; anything other than "sandbox" is rejected. + // fill in defaults; any invalid value is rejected. 
if let Some(ref process) = policy.process { - if process.run_as_user != "sandbox" { + if !is_valid_sandbox_identity(&process.run_as_user) { violations.push(PolicyViolation::InvalidProcessIdentity { field: "run_as_user", value: process.run_as_user.clone(), }); } - if process.run_as_group != "sandbox" { + if !is_valid_sandbox_identity(&process.run_as_group) { violations.push(PolicyViolation::InvalidProcessIdentity { field: "run_as_group", value: process.run_as_group.clone(), @@ -1637,6 +1676,180 @@ network_policies: assert!(s.contains("sandbox")); } + // ---- is_valid_sandbox_identity tests ---- + + #[test] + fn valid_identity_accepts_sandbox() { + assert!(is_valid_sandbox_identity("sandbox")); + } + + #[test] + fn valid_identity_accepts_numeric_uid_in_range() { + assert!(is_valid_sandbox_identity("1000")); + assert!(is_valid_sandbox_identity("50000")); + assert!(is_valid_sandbox_identity("1000660000")); + } + + #[test] + fn valid_identity_accepts_boundary_uids() { + assert!(is_valid_sandbox_identity(&MIN_SANDBOX_UID.to_string())); + assert!(is_valid_sandbox_identity(&MAX_SANDBOX_UID.to_string())); + } + + #[test] + fn valid_identity_rejects_zero() { + assert!(!is_valid_sandbox_identity("0")); + } + + #[test] + fn valid_identity_rejects_system_uids_below_min() { + assert!(!is_valid_sandbox_identity("999")); + assert!(!is_valid_sandbox_identity("100")); + assert!(!is_valid_sandbox_identity("1")); + } + + #[test] + fn valid_identity_rejects_uid_above_max() { + assert!(!is_valid_sandbox_identity( + &MAX_SANDBOX_UID.saturating_add(1).to_string() + )); + } + + #[test] + fn valid_identity_rejects_non_numeric_names() { + assert!(!is_valid_sandbox_identity("root")); + assert!(!is_valid_sandbox_identity("nobody")); + assert!(!is_valid_sandbox_identity("user")); + } + + #[test] + fn valid_identity_rejects_empty_string() { + assert!(!is_valid_sandbox_identity("")); + } + + // ---- Policy validation with numeric UIDs ---- + + #[test] + fn validate_accepts_numeric_uid_in_range() 
{ + let policy = SandboxPolicy { + version: 1, + process: Some(ProcessPolicy { + run_as_user: "1000".into(), + run_as_group: "5000".into(), + }), + filesystem: None, + landlock: None, + network_policies: HashMap::new(), + }; + assert!(validate_sandbox_policy(&policy).is_ok()); + } + + #[test] + fn validate_accepts_boundary_uids() { + let policy = SandboxPolicy { + version: 1, + process: Some(ProcessPolicy { + run_as_user: MIN_SANDBOX_UID.to_string(), + run_as_group: MAX_SANDBOX_UID.to_string(), + }), + filesystem: None, + landlock: None, + network_policies: HashMap::new(), + }; + assert!(validate_sandbox_policy(&policy).is_ok()); + } + + #[test] + fn validate_rejects_uid_out_of_range_low() { + let mut policy = restrictive_default_policy(); + policy.process = Some(ProcessPolicy { + run_as_user: "500".into(), + run_as_group: "sandbox".into(), + }); + let violations = validate_sandbox_policy(&policy).unwrap_err(); + assert!(violations.iter().any(|v| matches!( + v, + PolicyViolation::InvalidProcessIdentity { + field: "run_as_user", + .. + } + ))); + } + + #[test] + fn validate_rejects_uid_out_of_range_high() { + let mut policy = restrictive_default_policy(); + policy.process = Some(ProcessPolicy { + run_as_user: (MAX_SANDBOX_UID + 1).to_string(), + run_as_group: "sandbox".into(), + }); + let violations = validate_sandbox_policy(&policy).unwrap_err(); + assert!(violations.iter().any(|v| matches!( + v, + PolicyViolation::InvalidProcessIdentity { + field: "run_as_user", + .. + } + ))); + } + + #[test] + fn validate_rejects_root_string() { + let mut policy = restrictive_default_policy(); + policy.process = Some(ProcessPolicy { + run_as_user: "root".into(), + run_as_group: "sandbox".into(), + }); + let violations = validate_sandbox_policy(&policy).unwrap_err(); + assert!(violations.iter().any(|v| matches!( + v, + PolicyViolation::InvalidProcessIdentity { + field: "run_as_user", + .. 
+ } + ))); + } + + #[test] + fn validate_rejects_nobody_string() { + let mut policy = restrictive_default_policy(); + policy.process = Some(ProcessPolicy { + run_as_user: "nobody".into(), + run_as_group: "nogroup".into(), + }); + let violations = validate_sandbox_policy(&policy).unwrap_err(); + assert_eq!(violations.len(), 2); + } + + #[test] + fn validate_accepts_mixed_sandbox_name_and_uid() { + // run_as_user as "sandbox" name, run_as_group as numeric UID + let policy = SandboxPolicy { + version: 1, + process: Some(ProcessPolicy { + run_as_user: "sandbox".into(), + run_as_group: "1000".into(), + }), + filesystem: None, + landlock: None, + network_policies: HashMap::new(), + }; + assert!(validate_sandbox_policy(&policy).is_ok()); + } + + #[test] + fn policy_violation_display_includes_range() { + let v = PolicyViolation::InvalidProcessIdentity { + field: "run_as_user", + value: "root".into(), + }; + let s = format!("{v}"); + assert!(s.contains("sandbox")); + assert!(s.contains(&MIN_SANDBOX_UID.to_string())); + assert!(s.contains(&MAX_SANDBOX_UID.to_string())); + assert!(s.contains("root")); + } + // ---- Multi-port and host wildcard tests ---- #[test] diff --git a/crates/openshell-sandbox/Cargo.toml b/crates/openshell-sandbox/Cargo.toml index 086dbe02c..a5d344910 100644 --- a/crates/openshell-sandbox/Cargo.toml +++ b/crates/openshell-sandbox/Cargo.toml @@ -33,6 +33,9 @@ clap = { workspace = true } # Error handling miette = { workspace = true } +# Unix ownership for Kubernetes sidecar init setup +nix = { workspace = true } + # TLS crypto provider install (main.rs) rustls = { workspace = true } diff --git a/crates/openshell-sandbox/src/lib.rs b/crates/openshell-sandbox/src/lib.rs index d5967d1f3..9f88ed11d 100644 --- a/crates/openshell-sandbox/src/lib.rs +++ b/crates/openshell-sandbox/src/lib.rs @@ -13,10 +13,10 @@ mod mechanistic_mapper; #[cfg_attr(not(target_os = "linux"), allow(dead_code))] mod metadata_server; -use miette::Result; +use miette::{IntoDiagnostic, 
Result}; use std::future::Future; use std::sync::Arc; -use std::sync::atomic::AtomicU32; +use std::sync::atomic::{AtomicU32, Ordering}; use std::time::Duration; use tracing::{debug, info, warn}; @@ -64,12 +64,22 @@ use openshell_core::denial::DenialEvent; use openshell_core::policy::{NetworkMode, NetworkPolicy, ProxyPolicy, SandboxPolicy}; use openshell_core::provider_credentials::ProviderCredentialState; use openshell_supervisor_network::opa::OpaEngine; +use openshell_supervisor_process::process::ProcessEnforcementMode; pub use openshell_supervisor_process::process::{ProcessHandle, ProcessStatus}; use openshell_supervisor_process::skills; +use tokio::io::copy_bidirectional; +use tokio::net::{TcpListener, TcpStream}; use tokio::sync::mpsc::UnboundedSender; #[cfg(target_os = "linux")] use tokio::time::timeout; +const SIDECAR_NETWORK_ENFORCEMENT_MODE: &str = "sidecar-nftables"; +const SIDECAR_TLS_DIR: &str = "/etc/openshell-tls/proxy"; +const SIDECAR_CA_CERT: &str = "openshell-ca.pem"; +const SIDECAR_CA_BUNDLE: &str = "ca-bundle.pem"; +const SIDECAR_PROCESS_PROXY_ADDR: &str = "127.0.0.1:3128"; +const SIDECAR_READY_TIMEOUT_SECS: u64 = 120; + /// Run a command in the sandbox. /// /// # Errors @@ -125,6 +135,16 @@ pub async fn run_sandbox( } } + let sidecar_network_enforcement = sidecar_network_enforcement_enabled(); + let process_enforcement_mode = process_enforcement_mode(); + let sidecar_ready_file = supervisor_ready_file(); + if process_enabled + && !network_enabled + && let Some(path) = sidecar_ready_file.as_deref() + { + wait_for_supervisor_ready(path).await?; + } + // Load policy and initialize OPA engine let openshell_endpoint_for_proxy = openshell_endpoint.clone(); let sandbox_name_for_agg = sandbox.clone(); @@ -218,6 +238,12 @@ pub async fn run_sandbox( // Shared PID: set after process spawn so the proxy can look up // the entrypoint process's /proc/net/tcp for identity binding. 
let entrypoint_pid = Arc::new(AtomicU32::new(0)); + if network_enabled + && !process_enabled + && let Some(path) = entrypoint_pid_file() + { + spawn_entrypoint_pid_file_watcher(path, entrypoint_pid.clone()); + } // Create the workload's network namespace. It is shared infrastructure: // the proxy binds to its host-side veth IP, the bypass monitor reads @@ -225,7 +251,7 @@ pub async fn run_sandbox( // it via setns(). The RAII handle lives in this frame for the duration // of the sandbox. #[cfg(target_os = "linux")] - let netns = if network_enabled { + let netns = if network_enabled && !sidecar_network_enforcement { openshell_supervisor_process::netns::create_netns_for_proxy(&policy)? } else { None @@ -295,6 +321,34 @@ pub async fn run_sandbox( None }; + let _gateway_forward = if network_enabled && sidecar_network_enforcement { + let endpoint = openshell_endpoint_for_proxy.as_deref().ok_or_else(|| { + miette::miette!("sidecar network enforcement requires an OpenShell gateway endpoint") + })?; + Some(start_gateway_forward_from_env(endpoint).await?) + } else { + None + }; + + #[cfg(target_os = "linux")] + if network_enabled && sidecar_network_enforcement { + if !matches!(policy.network.mode, NetworkMode::Proxy) { + return Err(miette::miette!( + "sidecar network enforcement requires proxy network mode" + )); + } + if let Some(path) = sidecar_ready_file.as_deref() { + write_supervisor_ready(path)?; + } + } + + #[cfg(not(target_os = "linux"))] + if network_enabled && sidecar_network_enforcement { + return Err(miette::miette!( + "sidecar network enforcement is only supported on Linux" + )); + } + // Spawn the denial-aggregator flush task. The aggregator drains denial // events from the proxy + bypass monitor, batches them, and ships // summaries to the gateway via `SubmitPolicyAnalysis`. 
@@ -445,8 +499,17 @@ pub async fn run_sandbox( } } + let process_policy = process_policy_for_topology(&policy, sidecar_network_enforcement)?; + let exit_code = if process_enabled { - let ca_file_paths = networking.as_ref().and_then(|n| n.ca_file_paths.clone()); + let ca_file_paths = networking + .as_ref() + .and_then(|n| n.ca_file_paths.clone()) + .or_else(|| { + sidecar_network_enforcement + .then(sidecar_ca_file_paths) + .flatten() + }); openshell_supervisor_process::run::run_process( program, @@ -457,7 +520,8 @@ pub async fn run_sandbox( sandbox_id.as_deref(), openshell_endpoint.as_deref(), ssh_socket_path, - &policy, + &process_policy, + process_enforcement_mode, entrypoint_pid, provider_credentials, provider_env, @@ -518,6 +582,205 @@ async fn wait_for_shutdown_signal() { } } +fn sidecar_network_enforcement_enabled() -> bool { + std::env::var(openshell_core::sandbox_env::NETWORK_ENFORCEMENT_MODE) + .is_ok_and(|value| value == SIDECAR_NETWORK_ENFORCEMENT_MODE) +} + +fn process_enforcement_mode() -> ProcessEnforcementMode { + match std::env::var(openshell_core::sandbox_env::PROCESS_ENFORCEMENT_MODE) + .unwrap_or_else(|_| "full".to_string()) + .as_str() + { + "network-only" => ProcessEnforcementMode::NetworkOnly, + _ => ProcessEnforcementMode::Full, + } +} + +fn supervisor_ready_file() -> Option { + std::env::var(openshell_core::sandbox_env::SUPERVISOR_READY_FILE) + .ok() + .filter(|value| !value.is_empty()) +} + +fn entrypoint_pid_file() -> Option { + std::env::var(openshell_core::sandbox_env::ENTRYPOINT_PID_FILE) + .ok() + .filter(|value| !value.is_empty()) +} + +fn spawn_entrypoint_pid_file_watcher(path: String, entrypoint_pid: Arc) { + tokio::spawn(async move { + let pid_path = std::path::PathBuf::from(&path); + loop { + match std::fs::read_to_string(&pid_path) { + Ok(contents) => match contents.trim().parse::() { + Ok(pid) if pid > 0 => { + entrypoint_pid.store(pid, Ordering::Release); + info!(path, pid, "Loaded sidecar workload entrypoint PID"); + return; + 
} + Ok(_) | Err(_) => { + debug!(path, contents = %contents.trim(), "Ignoring invalid entrypoint PID file contents"); + } + }, + Err(err) if err.kind() == std::io::ErrorKind::NotFound => {} + Err(err) => { + debug!(path, error = %err, "Failed to read entrypoint PID file"); + } + } + tokio::time::sleep(Duration::from_millis(100)).await; + } + }); +} + +async fn wait_for_supervisor_ready(path: &str) -> Result<()> { + let ready_path = std::path::Path::new(path); + let deadline = tokio::time::Instant::now() + Duration::from_secs(SIDECAR_READY_TIMEOUT_SECS); + loop { + if ready_path.exists() { + info!(path, "Network supervisor sidecar is ready"); + return Ok(()); + } + if tokio::time::Instant::now() >= deadline { + return Err(miette::miette!( + "timed out waiting for network supervisor sidecar readiness file {path}" + )); + } + tokio::time::sleep(Duration::from_millis(250)).await; + } +} + +#[cfg(target_os = "linux")] +fn write_supervisor_ready(path: &str) -> Result<()> { + let ready_path = std::path::Path::new(path); + if let Some(parent) = ready_path.parent() { + std::fs::create_dir_all(parent).into_diagnostic()?; + } + std::fs::write(ready_path, b"ready\n").into_diagnostic()?; + info!(path, "Network supervisor sidecar readiness file written"); + Ok(()) +} + +fn sidecar_ca_file_paths() -> Option<(std::path::PathBuf, std::path::PathBuf)> { + let tls_dir = std::env::var(openshell_core::sandbox_env::PROXY_TLS_DIR) + .unwrap_or_else(|_| SIDECAR_TLS_DIR.to_string()); + let cert = std::path::Path::new(&tls_dir).join(SIDECAR_CA_CERT); + let bundle = std::path::Path::new(&tls_dir).join(SIDECAR_CA_BUNDLE); + (cert.exists() && bundle.exists()).then_some((cert, bundle)) +} + +fn process_policy_for_topology( + policy: &SandboxPolicy, + sidecar_network_enforcement: bool, +) -> Result { + let mut process_policy = policy.clone(); + if sidecar_network_enforcement && matches!(process_policy.network.mode, NetworkMode::Proxy) { + let proxy = process_policy + .network + .proxy + 
.get_or_insert(ProxyPolicy { http_addr: None }); + if proxy.http_addr.is_none() { + proxy.http_addr = Some(SIDECAR_PROCESS_PROXY_ADDR.parse().into_diagnostic()?); + } + } + Ok(process_policy) +} + +struct GatewayForwardHandle { + task: tokio::task::JoinHandle<()>, +} + +impl Drop for GatewayForwardHandle { + fn drop(&mut self) { + self.task.abort(); + } +} + +async fn start_gateway_forward_from_env(endpoint: &str) -> Result { + let listen_addr = + std::env::var(openshell_core::sandbox_env::GATEWAY_FORWARD_ADDR).map_err(|_| { + miette::miette!( + "{} is required for sidecar gateway forwarding", + openshell_core::sandbox_env::GATEWAY_FORWARD_ADDR + ) + })?; + start_gateway_forward(&listen_addr, endpoint).await +} + +/// Bind `listen_addr` and relay every accepted TCP connection to the host and +/// port derived from `endpoint`. The relay task is aborted when the returned +/// handle is dropped. +async fn start_gateway_forward(listen_addr: &str, endpoint: &str) -> Result { + let upstream = gateway_tcp_addr(endpoint)?; + let listener = TcpListener::bind(listen_addr).await.into_diagnostic()?; + info!( + listen_addr, + upstream, "Gateway loopback TCP forward started for sidecar topology" + ); + + let task = tokio::spawn(async move { + loop { + let (mut inbound, peer) = match listener.accept().await { + Ok(accepted) => accepted, + Err(e) => { + warn!(error = %e, "Gateway forward accept failed"); + // A persistent accept error (for example fd exhaustion) + // would otherwise be retried immediately; pause briefly so + // the loop does not spin and flood the log. 
+ tokio::time::sleep(Duration::from_millis(100)).await; + continue; + } + }; + let upstream = upstream.clone(); + tokio::spawn(async move { + let mut outbound = match TcpStream::connect(&upstream).await { + Ok(stream) => stream, + Err(e) => { + warn!(peer = %peer, upstream, error = %e, "Gateway forward connect failed"); + return; + } + }; + if let Err(e) = copy_bidirectional(&mut inbound, &mut outbound).await { + debug!(peer = %peer, error = %e, "Gateway forward connection closed with error"); + } + }); + } + }); + + Ok(GatewayForwardHandle { task }) +} + +fn gateway_tcp_addr(endpoint: &str) -> Result { + let (scheme, rest) = endpoint + .split_once("://") + .ok_or_else(|| miette::miette!("gateway endpoint must include a URL scheme"))?; + let default_port = match scheme { + "http" => 80, + "https" => 443, + other => 
{ + return Err(miette::miette!( + "unsupported gateway endpoint scheme '{other}' for sidecar forwarding" + )); + } + }; + let authority = rest.split('/').next().unwrap_or(rest); + if authority.is_empty() { + return Err(miette::miette!("gateway endpoint is missing a host")); + } + if authority.starts_with('[') { + let closing = authority + .find(']') + .ok_or_else(|| miette::miette!("invalid bracketed IPv6 gateway endpoint"))?; + let host = &authority[..=closing]; + let port = authority[closing + 1..] + .strip_prefix(':') + .and_then(|value| value.parse::().ok()) + .unwrap_or(default_port); + return Ok(format!("{host}:{port}")); + } + let (host, port) = match authority.rsplit_once(':') { + Some((host, port)) if !host.is_empty() => { + (host, port.parse::().unwrap_or(default_port)) + } + _ => (authority, default_port), + }; + Ok(format!("{host}:{port}")) +} + /// Flush aggregated denial summaries to the gateway via `SubmitPolicyAnalysis`. async fn flush_proposals_to_gateway( endpoint: &str, @@ -1927,8 +2190,24 @@ fn format_setting_value(es: &openshell_core::proto::EffectiveSetting) -> String )] mod tests { use super::*; + use openshell_core::policy::{ + FilesystemPolicy, LandlockPolicy, NetworkMode, NetworkPolicy, ProcessPolicy, ProxyPolicy, + }; use std::sync::atomic::{AtomicBool, Ordering}; + fn proxy_policy(http_addr: Option) -> SandboxPolicy { + SandboxPolicy { + version: 1, + filesystem: FilesystemPolicy::default(), + network: NetworkPolicy { + mode: NetworkMode::Proxy, + proxy: Some(ProxyPolicy { http_addr }), + }, + landlock: LandlockPolicy::default(), + process: ProcessPolicy::default(), + } + } + fn effective_bool(value: bool) -> openshell_core::proto::EffectiveSetting { openshell_core::proto::EffectiveSetting { value: Some(openshell_core::proto::SettingValue { @@ -1940,6 +2219,73 @@ mod tests { } } + #[test] + fn sidecar_process_policy_sets_loopback_proxy_addr() { + let policy = proxy_policy(None); + + let process_policy = 
process_policy_for_topology(&policy, true).unwrap(); + + let http_addr = process_policy + .network + .proxy + .and_then(|proxy| proxy.http_addr) + .expect("sidecar process policy should set proxy address"); + assert_eq!(http_addr.to_string(), SIDECAR_PROCESS_PROXY_ADDR); + assert!( + policy + .network + .proxy + .as_ref() + .expect("original policy should keep proxy config") + .http_addr + .is_none(), + "process policy normalization must not mutate the network policy" + ); + } + + #[test] + fn non_sidecar_process_policy_preserves_proxy_addr() { + let policy = proxy_policy(None); + + let process_policy = process_policy_for_topology(&policy, false).unwrap(); + + assert!( + process_policy + .network + .proxy + .and_then(|proxy| proxy.http_addr) + .is_none() + ); + } + + #[test] + fn gateway_tcp_addr_uses_explicit_port() { + assert_eq!( + gateway_tcp_addr("https://openshell-gateway.openshell.svc:8080").unwrap(), + "openshell-gateway.openshell.svc:8080" + ); + } + + #[test] + fn gateway_tcp_addr_uses_scheme_default_port() { + assert_eq!( + gateway_tcp_addr("https://openshell-gateway.openshell.svc").unwrap(), + "openshell-gateway.openshell.svc:443" + ); + assert_eq!( + gateway_tcp_addr("http://openshell-gateway.openshell.svc").unwrap(), + "openshell-gateway.openshell.svc:80" + ); + } + + #[test] + fn gateway_tcp_addr_preserves_ipv6_brackets() { + assert_eq!( + gateway_tcp_addr("https://[fd00::1]:8443").unwrap(), + "[fd00::1]:8443" + ); + } + #[test] fn apply_ocsf_json_setting_enables_from_initial_settings_snapshot() { let enabled = AtomicBool::new(false); diff --git a/crates/openshell-sandbox/src/main.rs b/crates/openshell-sandbox/src/main.rs index 91b145c2e..c5e4bf4ea 100644 --- a/crates/openshell-sandbox/src/main.rs +++ b/crates/openshell-sandbox/src/main.rs @@ -35,15 +35,26 @@ const DEBUG_RPC_SUBCOMMAND: &str = "debug-rpc"; /// Default `--mode` value: run both supervisor leaves in a single binary. 
const DEFAULT_MODE: &str = "network,process"; +const SIDECAR_STATE_DIR: &str = "/run/openshell-sidecar"; +const SIDECAR_TLS_DIR: &str = "/etc/openshell-tls/proxy"; +#[cfg(target_os = "linux")] +const CLIENT_TLS_DIR: &str = "/etc/openshell-tls/client"; +#[cfg(target_os = "linux")] +const SIDECAR_CLIENT_TLS_SUBDIR: &str = "client"; +#[cfg(target_os = "linux")] +const CLIENT_TLS_FILES: [&str; 3] = ["ca.crt", "tls.crt", "tls.key"]; /// Which supervisor leaves are enabled in this process. /// /// Parsed from a comma-separated `--mode` value, e.g. `network`, -/// `process`, or `network,process`. At least one must be set. +/// `process`, or `network,process`. `network-init` is a one-shot setup mode +/// used by the Kubernetes sidecar topology and cannot be combined with other +/// mode components. At least one must be set. #[derive(Clone, Copy, Debug)] struct Mode { network: bool, process: bool, + network_init: bool, } impl std::str::FromStr for Mode { @@ -53,20 +64,27 @@ impl std::str::FromStr for Mode { let mut mode = Self { network: false, process: false, + network_init: false, }; for part in s.split(',').map(str::trim).filter(|p| !p.is_empty()) { match part { "network" => mode.network = true, "process" => mode.process = true, + "network-init" => mode.network_init = true, other => { return Err(format!( - "unknown mode component '{other}' (expected 'network' and/or 'process')" + "unknown mode component '{other}' (expected 'network', 'process', or 'network-init')" )); } } } - if !mode.network && !mode.process { - return Err("--mode must enable at least one of: network, process".into()); + if mode.network_init && (mode.network || mode.process) { + return Err("--mode=network-init cannot be combined with other components".into()); + } + if !mode.network && !mode.process && !mode.network_init { + return Err( + "--mode must enable at least one of: network, process, network-init".into(), + ); } Ok(mode) } @@ -149,9 +167,28 @@ struct Args { /// "network" and/or "process". 
Defaults to both (single-binary /// topology). Use --mode=network for a network-only sidecar, or /// --mode=process for a process-only supervisor when network - /// enforcement runs in another pod. + /// enforcement runs in another pod. Use --mode=network-init only in + /// the Kubernetes init container that prepares sidecar nftables. #[arg(long, default_value = DEFAULT_MODE)] mode: Mode, + + /// UID that the long-running Kubernetes network sidecar will run as. + /// `--mode=network-init` installs nftables rules that exempt this UID. + #[arg(long, env = "OPENSHELL_SIDECAR_PROXY_UID", default_value_t = 1337)] + sidecar_proxy_uid: u32, + + /// GID assigned to shared sidecar state directories. Defaults to + /// `--sidecar-proxy-uid` when omitted. + #[arg(long, env = "OPENSHELL_SIDECAR_PROXY_GID")] + sidecar_proxy_gid: Option, + + /// Shared state directory between the network init container and sidecar. + #[arg(long, env = "OPENSHELL_SIDECAR_STATE_DIR", default_value = SIDECAR_STATE_DIR)] + sidecar_state_dir: String, + + /// Shared TLS work directory between the network init container and sidecar. 
+ #[arg(long, env = "OPENSHELL_PROXY_TLS_DIR", default_value = SIDECAR_TLS_DIR)] + sidecar_tls_dir: String, } /// Copy the running executable to `dest`, creating parent directories as @@ -194,6 +231,136 @@ fn copy_self(dest: &str) -> Result<()> { Ok(()) } +#[cfg(target_os = "linux")] +fn prepare_sidecar_directory(path: &Path, uid: u32, gid: u32, mode: u32) -> Result<()> { + use miette::Context as _; + use nix::unistd::{Gid, Uid, chown}; + use std::os::unix::fs::PermissionsExt; + + std::fs::create_dir_all(path) + .into_diagnostic() + .wrap_err_with(|| format!("failed to create sidecar directory {}", path.display()))?; + let mut perms = std::fs::metadata(path).into_diagnostic()?.permissions(); + perms.set_mode(mode); + std::fs::set_permissions(path, perms) + .into_diagnostic() + .wrap_err_with(|| format!("failed to chmod sidecar directory {}", path.display()))?; + chown(path, Some(Uid::from_raw(uid)), Some(Gid::from_raw(gid))) + .into_diagnostic() + .wrap_err_with(|| { + format!( + "failed to chown sidecar directory {} to {uid}:{gid}", + path.display() + ) + })?; + Ok(()) +} + +#[cfg(target_os = "linux")] +fn copy_sidecar_client_tls_if_present( + source_dir: &Path, + sidecar_tls_dir: &Path, + uid: u32, + gid: u32, +) -> Result<()> { + use miette::Context as _; + use nix::unistd::{Gid, Uid, chown}; + use std::os::unix::fs::PermissionsExt; + + if !source_dir.exists() { + return Ok(()); + } + + let dest_dir = sidecar_tls_dir.join(SIDECAR_CLIENT_TLS_SUBDIR); + prepare_sidecar_directory(&dest_dir, uid, gid, 0o750)?; + for file_name in CLIENT_TLS_FILES { + let source = source_dir.join(file_name); + if !source.exists() { + return Err(miette::miette!( + "client TLS source file is missing: {}", + source.display() + )); + } + let dest = dest_dir.join(file_name); + std::fs::copy(&source, &dest) + .into_diagnostic() + .wrap_err_with(|| { + format!( + "failed to copy client TLS file {} to {}", + source.display(), + dest.display() + ) + })?; + let mut perms = 
std::fs::metadata(&dest).into_diagnostic()?.permissions(); + perms.set_mode(0o400); + std::fs::set_permissions(&dest, perms) + .into_diagnostic() + .wrap_err_with(|| { + format!("failed to chmod copied client TLS file {}", dest.display()) + })?; + chown(&dest, Some(Uid::from_raw(uid)), Some(Gid::from_raw(gid))) + .into_diagnostic() + .wrap_err_with(|| { + format!( + "failed to chown copied client TLS file {} to {uid}:{gid}", + dest.display() + ) + })?; + } + + Ok(()) +} + +#[cfg(target_os = "linux")] +fn run_network_init( + sidecar_proxy_uid: u32, + sidecar_proxy_gid: u32, + sidecar_state_dir: &str, + sidecar_tls_dir: &str, +) -> Result<()> { + if sidecar_proxy_uid < openshell_policy::MIN_SANDBOX_UID { + return Err(miette::miette!( + "--sidecar-proxy-uid must be at least {}", + openshell_policy::MIN_SANDBOX_UID + )); + } + if sidecar_proxy_gid < openshell_policy::MIN_SANDBOX_UID { + return Err(miette::miette!( + "--sidecar-proxy-gid must be at least {}", + openshell_policy::MIN_SANDBOX_UID + )); + } + + let sidecar_state_dir = Path::new(sidecar_state_dir); + let sidecar_tls_dir = Path::new(sidecar_tls_dir); + prepare_sidecar_directory( + sidecar_state_dir, + sidecar_proxy_uid, + sidecar_proxy_gid, + 0o775, + )?; + prepare_sidecar_directory(sidecar_tls_dir, sidecar_proxy_uid, sidecar_proxy_gid, 0o755)?; + copy_sidecar_client_tls_if_present( + Path::new(CLIENT_TLS_DIR), + sidecar_tls_dir, + sidecar_proxy_uid, + sidecar_proxy_gid, + )?; + openshell_supervisor_process::netns::install_sidecar_bypass_rules(sidecar_proxy_uid) +} + +#[cfg(not(target_os = "linux"))] +fn run_network_init( + _sidecar_proxy_uid: u32, + _sidecar_proxy_gid: u32, + _sidecar_state_dir: &str, + _sidecar_tls_dir: &str, +) -> Result<()> { + Err(miette::miette!( + "--mode=network-init is only supported on Linux" + )) +} + fn main() -> Result<()> { // Handle `copy-self ` before clap so it works without any of the // sandbox flags. 
Kubernetes init containers invoke this path to seed an @@ -222,6 +389,16 @@ fn main() -> Result<()> { let args = Args::parse(); + if args.mode.network_init { + let sidecar_proxy_gid = args.sidecar_proxy_gid.unwrap_or(args.sidecar_proxy_uid); + return run_network_init( + args.sidecar_proxy_uid, + sidecar_proxy_gid, + &args.sidecar_state_dir, + &args.sidecar_tls_dir, + ); + } + // Try to open a rolling log file; fall back to stderr-only logging if it fails // (e.g., /var/log is not writable in custom workload images). // Rotates daily, keeps the 3 most recent files to bound disk usage. @@ -421,4 +598,24 @@ mod tests { let final_path = dest_dir.join("openshell-sandbox"); assert!(final_path.exists(), "binary should land inside dest dir"); } + + #[test] + fn mode_parses_network_init_standalone() { + let mode = "network-init".parse::().unwrap(); + assert!(mode.network_init); + assert!(!mode.network); + assert!(!mode.process); + } + + #[test] + fn mode_rejects_combined_network_init() { + let err = "network-init,network".parse::().unwrap_err(); + assert!(err.contains("cannot be combined")); + } + + #[test] + fn mode_rejects_empty_value() { + let err = "".parse::().unwrap_err(); + assert!(err.contains("at least one")); + } } diff --git a/crates/openshell-supervisor-network/data/sandbox-policy.rego b/crates/openshell-supervisor-network/data/sandbox-policy.rego index afcd28863..bd4c24aba 100644 --- a/crates/openshell-supervisor-network/data/sandbox-policy.rego +++ b/crates/openshell-supervisor-network/data/sandbox-policy.rego @@ -19,6 +19,10 @@ allow_network if { network_policy_for_request } +binary_identity_required if { + object.get(object.get(data, "runtime", {}), "require_binary_identity", true) +} + # --- Deny reasons (specific diagnostics for debugging policy denials) --- deny_reason := "missing input.network" if { @@ -131,6 +135,12 @@ endpoint_allowed(policy, network) if { endpoint.ports[_] == network.port } +# Binary matching can be relaxed by trusted runtime 
configuration. In that +# mode, network policies are endpoint/L7 scoped and ignore policy.binaries. +binary_allowed(_, _) if { + not binary_identity_required +} + # Binary matching: exact path. # SHA256 integrity is enforced in Rust via trust-on-first-use (TOFU) cache, # not in Rego. The proxy computes and caches binary hashes at runtime. @@ -161,6 +171,10 @@ binary_allowed(policy, exec) if { glob.match(b.path, ["/"], p) } +user_declared_binary_allowed(_, _) if { + not binary_identity_required +} + user_declared_binary_allowed(policy, exec) if { some b b := policy.binaries[_] diff --git a/crates/openshell-supervisor-network/src/identity.rs b/crates/openshell-supervisor-network/src/identity.rs index fce568f41..5e89c3503 100644 --- a/crates/openshell-supervisor-network/src/identity.rs +++ b/crates/openshell-supervisor-network/src/identity.rs @@ -100,23 +100,34 @@ impl BinaryIdentityCache { /// Returns `Ok(hash)` if it matches, `Err` if the hash changed (binary tampered). #[cfg_attr(not(target_os = "linux"), allow(dead_code))] pub fn verify_or_cache(&self, path: &Path) -> Result { - self.verify_or_cache_with_hasher(path, procfs::file_sha256) + self.verify_or_cache_with_paths(path, path, procfs::file_sha256) } - fn verify_or_cache_with_hasher(&self, path: &Path, mut hash_file: F) -> Result + #[cfg(target_os = "linux")] + pub fn verify_or_cache_process_exe(&self, display_path: &Path, pid: u32) -> Result { + let proc_exe = PathBuf::from(format!("/proc/{pid}/exe")); + self.verify_or_cache_with_paths(display_path, &proc_exe, procfs::file_sha256) + } + + fn verify_or_cache_with_paths( + &self, + cache_path: &Path, + access_path: &Path, + mut hash_file: F, + ) -> Result where F: FnMut(&Path) -> Result, { let start = std::time::Instant::now(); - let metadata = std::fs::metadata(path) - .map_err(|error| miette::miette!("Failed to stat {}: {error}", path.display()))?; + let metadata = std::fs::metadata(access_path) + .map_err(|error| miette::miette!("Failed to stat {}: 
{error}", cache_path.display()))?; let fingerprint = FileFingerprint::from_metadata(&metadata); let cached = self .hashes .lock() .map_err(|_| miette::miette!("Binary identity cache lock poisoned"))? - .get(path) + .get(cache_path) .cloned(); if let Some(cached_binary) = &cached @@ -125,7 +136,7 @@ impl BinaryIdentityCache { debug!( " verify_or_cache: {}ms CACHE HIT path={}", start.elapsed().as_millis(), - path.display() + cache_path.display() ); return Ok(cached_binary.hash.clone()); } @@ -133,29 +144,29 @@ impl BinaryIdentityCache { debug!( " verify_or_cache: CACHE MISS size={} path={}", metadata.len(), - path.display() + cache_path.display() ); - let current_hash = hash_file(path)?; + let current_hash = hash_file(access_path)?; let mut hashes = self .hashes .lock() .map_err(|_| miette::miette!("Binary identity cache lock poisoned"))?; - if let Some(existing) = hashes.get(path) + if let Some(existing) = hashes.get(cache_path) && existing.hash != current_hash { return Err(miette::miette!( "Binary integrity violation: {} hash changed (cached: {}, current: {})", - path.display(), + cache_path.display(), existing.hash, current_hash )); } hashes.insert( - path.to_path_buf(), + cache_path.to_path_buf(), CachedBinary { hash: current_hash.clone(), fingerprint, @@ -165,7 +176,7 @@ impl BinaryIdentityCache { debug!( " verify_or_cache TOTAL (cold): {}ms path={}", start.elapsed().as_millis(), - path.display() + cache_path.display() ); Ok(current_hash) @@ -212,13 +223,13 @@ mod tests { let mut hash_calls = 0; let hash1 = cache - .verify_or_cache_with_hasher(tmp.path(), |path| { + .verify_or_cache_with_paths(tmp.path(), tmp.path(), |path| { hash_calls += 1; procfs::file_sha256(path) }) .unwrap(); let hash2 = cache - .verify_or_cache_with_hasher(tmp.path(), |path| { + .verify_or_cache_with_paths(tmp.path(), tmp.path(), |path| { hash_calls += 1; procfs::file_sha256(path) }) @@ -238,7 +249,7 @@ mod tests { let mut hash_calls = 0; let hash1 = cache - 
.verify_or_cache_with_hasher(tmp.path(), |path| { + .verify_or_cache_with_paths(tmp.path(), tmp.path(), |path| { hash_calls += 1; procfs::file_sha256(path) }) @@ -254,7 +265,7 @@ mod tests { .unwrap(); let hash2 = cache - .verify_or_cache_with_hasher(tmp.path(), |path| { + .verify_or_cache_with_paths(tmp.path(), tmp.path(), |path| { hash_calls += 1; procfs::file_sha256(path) }) @@ -275,7 +286,7 @@ mod tests { let mut hash_calls = 0; cache - .verify_or_cache_with_hasher(&path, |path| { + .verify_or_cache_with_paths(&path, &path, |path| { hash_calls += 1; procfs::file_sha256(path) }) @@ -292,7 +303,7 @@ mod tests { .set_modified(original_mtime) .unwrap(); - let result = cache.verify_or_cache_with_hasher(&path, |path| { + let result = cache.verify_or_cache_with_paths(&path, &path, |path| { hash_calls += 1; procfs::file_sha256(path) }); @@ -301,6 +312,28 @@ mod tests { assert_eq!(hash_calls, 2); } + #[test] + fn display_path_can_differ_from_access_path() { + let mut tmp = tempfile::NamedTempFile::new().unwrap(); + tmp.write_all(b"binary content").unwrap(); + tmp.flush().unwrap(); + let display_path = Path::new("/usr/bin/python3"); + + let cache = BinaryIdentityCache::new(); + let hash = cache + .verify_or_cache_with_paths(display_path, tmp.path(), procfs::file_sha256) + .unwrap(); + + assert!(!hash.is_empty()); + assert!( + cache + .hashes + .lock() + .unwrap() + .contains_key(Path::new("/usr/bin/python3")) + ); + } + #[test] fn hash_mismatch_returns_error() { let dir = tempfile::tempdir().unwrap(); diff --git a/crates/openshell-supervisor-network/src/opa.rs b/crates/openshell-supervisor-network/src/opa.rs index 0e97f5857..56b0f7754 100644 --- a/crates/openshell-supervisor-network/src/opa.rs +++ b/crates/openshell-supervisor-network/src/opa.rs @@ -17,6 +17,7 @@ use std::sync::{ Arc, Mutex, atomic::{AtomicU64, Ordering}, }; +use tracing::info; /// Baked-in rego rules for OPA policy evaluation. 
/// These rules define the network access decision logic and static config @@ -54,6 +55,49 @@ pub struct NetworkInput { pub cmdline_paths: Vec, } +pub(crate) fn network_binary_identity_required() -> bool { + std::env::var(openshell_core::sandbox_env::NETWORK_BINARY_IDENTITY).map_or(true, |value| { + !matches!( + value.as_str(), + "relaxed" | "disabled" | "endpoint-only" | "false" | "0" + ) + }) +} + +fn inject_runtime_policy_data(data: &mut serde_json::Value, require_binary_identity: bool) { + let Some(obj) = data.as_object_mut() else { + return; + }; + obj.insert( + "runtime".to_string(), + serde_json::json!({ + "require_binary_identity": require_binary_identity, + }), + ); +} + +fn emit_binary_identity_mode(require_binary_identity: bool, source: &str) { + info!( + require_binary_identity, + source, "Configured OPA runtime binary identity mode" + ); + openshell_ocsf::ocsf_emit!( + openshell_ocsf::ConfigStateChangeBuilder::new(openshell_ocsf::ctx::ctx()) + .severity(openshell_ocsf::SeverityId::Informational) + .status(openshell_ocsf::StatusId::Success) + .state(openshell_ocsf::StateId::Enabled, "configured") + .unmapped( + "require_binary_identity", + serde_json::json!(require_binary_identity) + ) + .unmapped("source", serde_json::json!(source)) + .message(format!( + "OPA runtime binary identity mode configured [source:{source} require_binary_identity:{require_binary_identity}]" + )) + .build() + ); +} + /// Sandbox configuration extracted from OPA data at startup. 
pub struct SandboxConfig { pub filesystem: FilesystemPolicy, @@ -145,7 +189,9 @@ impl OpaEngine { engine .add_policy_from_file(policy_path) .map_err(|e| miette::miette!("{e}"))?; - let data_json = preprocess_yaml_data(&yaml_str)?; + let require_binary_identity = network_binary_identity_required(); + emit_binary_identity_mode(require_binary_identity, "files"); + let data_json = preprocess_yaml_data(&yaml_str, require_binary_identity)?; engine .add_data_json(&data_json) .map_err(|e| miette::miette!("{e}"))?; @@ -159,11 +205,24 @@ impl OpaEngine { /// /// Preprocesses the YAML data to expand access presets and validate L7 config. pub fn from_strings(policy: &str, data_yaml: &str) -> Result { + Self::from_strings_with_binary_identity_required( + policy, + data_yaml, + network_binary_identity_required(), + ) + } + + pub(crate) fn from_strings_with_binary_identity_required( + policy: &str, + data_yaml: &str, + require_binary_identity: bool, + ) -> Result { let mut engine = regorus::Engine::new(); engine .add_policy("policy.rego".into(), policy.into()) .map_err(|e| miette::miette!("{e}"))?; - let data_json = preprocess_yaml_data(data_yaml)?; + emit_binary_identity_mode(require_binary_identity, "strings"); + let data_json = preprocess_yaml_data(data_yaml, require_binary_identity)?; engine .add_data_json(&data_json) .map_err(|e| miette::miette!("{e}"))?; @@ -192,11 +251,25 @@ impl OpaEngine { /// gap between user-specified symlink paths (e.g., `/usr/bin/python3`) and /// kernel-resolved canonical paths (e.g., `/usr/bin/python3.11`). 
pub fn from_proto_with_pid(proto: &ProtoSandboxPolicy, entrypoint_pid: u32) -> Result { + Self::from_proto_with_pid_and_binary_identity_required( + proto, + entrypoint_pid, + network_binary_identity_required(), + ) + } + + fn from_proto_with_pid_and_binary_identity_required( + proto: &ProtoSandboxPolicy, + entrypoint_pid: u32, + require_binary_identity: bool, + ) -> Result { + emit_binary_identity_mode(require_binary_identity, "proto"); let data_json_str = proto_to_opa_data_json(proto, entrypoint_pid); // Parse back to Value for preprocessing, then re-serialize let mut data: serde_json::Value = serde_json::from_str(&data_json_str) .map_err(|e| miette::miette!("internal: failed to parse proto JSON: {e}"))?; + inject_runtime_policy_data(&mut data, require_binary_identity); // Validate BEFORE expanding presets let (errors, warnings) = crate::l7::validate_l7_policies(&data); @@ -717,9 +790,10 @@ fn parse_process_policy(val: ®orus::Value) -> ProcessPolicy { } /// Preprocess YAML policy data: parse, normalize, validate, expand access presets, return JSON. -fn preprocess_yaml_data(yaml_str: &str) -> Result { +fn preprocess_yaml_data(yaml_str: &str, require_binary_identity: bool) -> Result { let mut data: serde_json::Value = serde_yml::from_str(yaml_str) .map_err(|e| miette::miette!("failed to parse YAML data: {e}"))?; + inject_runtime_policy_data(&mut data, require_binary_identity); // Normalize port → ports for all endpoints so Rego always sees "ports" array. 
normalize_endpoint_ports(&mut data); @@ -2031,6 +2105,87 @@ process: assert!(eval_l7(&engine, &input)); } + #[test] + fn l7_get_allowed_by_rules_when_binary_identity_relaxed() { + let engine = + OpaEngine::from_strings_with_binary_identity_required(TEST_POLICY, L7_TEST_DATA, false) + .expect("Failed to load relaxed L7 test data"); + let mut input = l7_input("api.example.com", 8080, "GET", "/repos/myorg/foo"); + input["exec"]["path"] = "".into(); + assert!(eval_l7(&engine, &input)); + } + + #[test] + fn relaxed_binary_identity_preserves_matched_policy_and_l7_for_proto() { + let mut network_policies = std::collections::HashMap::new(); + network_policies.insert( + "test_l7".to_string(), + NetworkPolicyRule { + name: "test_l7".to_string(), + endpoints: vec![NetworkEndpoint { + host: "host.k3d.internal".to_string(), + port: 56123, + protocol: "rest".to_string(), + enforcement: "enforce".to_string(), + rules: vec![L7Rule { + allow: Some(L7Allow { + method: "GET".to_string(), + path: "/allowed".to_string(), + command: String::new(), + query: std::collections::HashMap::new(), + operation_type: String::new(), + operation_name: String::new(), + fields: Vec::new(), + }), + }], + allowed_ips: vec!["192.168.0.0/16".to_string()], + ..Default::default() + }], + binaries: vec![NetworkBinary { + path: "/usr/bin/curl".to_string(), + ..Default::default() + }], + }, + ); + let proto = ProtoSandboxPolicy { + version: 1, + filesystem: Some(ProtoFs { + include_workdir: true, + read_only: vec![], + read_write: vec![], + }), + landlock: Some(openshell_core::proto::LandlockPolicy { + compatibility: "best_effort".to_string(), + }), + process: Some(ProtoProc { + run_as_user: "sandbox".to_string(), + run_as_group: "sandbox".to_string(), + }), + network_policies, + }; + let engine = OpaEngine::from_proto_with_pid_and_binary_identity_required(&proto, 0, false) + .expect("engine from relaxed proto"); + let network_input = NetworkInput { + host: "host.k3d.internal".into(), + port: 56123, + 
binary_path: PathBuf::new(), + binary_sha256: String::new(), + ancestors: vec![], + cmdline_paths: vec![], + }; + let action = engine.evaluate_network_action(&network_input).unwrap(); + assert_eq!( + action, + NetworkAction::Allow { + matched_policy: Some("test_l7".to_string()) + } + ); + + let mut input = l7_input("host.k3d.internal", 56123, "GET", "/allowed"); + input["exec"]["path"] = "".into(); + assert!(eval_l7(&engine, &input)); + } + #[test] fn l7_post_allowed_by_rules() { let engine = l7_engine(); @@ -3444,6 +3599,46 @@ process: ); } + #[test] + fn relaxed_binary_identity_allows_declared_endpoint_without_binary_match() { + let engine = OpaEngine::from_strings_with_binary_identity_required( + TEST_POLICY, + INFERENCE_TEST_DATA, + false, + ) + .expect("Failed to load relaxed binary identity test data"); + let input = NetworkInput { + host: "api.anthropic.com".into(), + port: 443, + binary_path: PathBuf::from("/tmp/unlisted-agent"), + binary_sha256: "unused".into(), + ancestors: vec![], + cmdline_paths: vec![], + }; + + let action = engine.evaluate_network_action(&input).unwrap(); + assert_eq!( + action, + NetworkAction::Allow { + matched_policy: Some("claude_code".to_string()) + }, + ); + assert!( + engine.query_exact_declared_endpoint_host(&input).unwrap(), + "relaxed identity should preserve exact declared endpoint handling" + ); + + let undeclared = NetworkInput { + host: "api.openai.com".into(), + ..input + }; + let action = engine.evaluate_network_action(&undeclared).unwrap(); + assert!( + matches!(action, NetworkAction::Deny { .. 
}), + "relaxed identity must not allow undeclared endpoints" + ); + } + #[test] fn unknown_endpoint_returns_deny() { let engine = inference_engine(); diff --git a/crates/openshell-supervisor-network/src/proxy.rs b/crates/openshell-supervisor-network/src/proxy.rs index bba3c3919..88a03b52c 100644 --- a/crates/openshell-supervisor-network/src/proxy.rs +++ b/crates/openshell-supervisor-network/src/proxy.rs @@ -42,6 +42,8 @@ const TUNNEL_PROTOCOL_PEEK_POLL: std::time::Duration = std::time::Duration::from const TUNNEL_PROTOCOL_PEEK_POLL: std::time::Duration = std::time::Duration::from_millis(1); const INFERENCE_LOCAL_HOST: &str = "inference.local"; const INFERENCE_LOCAL_PORT: u16 = 443; +#[cfg(target_os = "linux")] +const SIDECAR_SUPERVISOR_TOPOLOGY: &str = "sidecar"; /// Hostnames injected by compute drivers as `/etc/hosts` aliases for the host /// machine. Traffic to these names is eligible for the trusted-gateway SSRF @@ -1404,7 +1406,7 @@ fn resolve_owner_identity( })?; let bin_hash = identity_cache - .verify_or_cache(&bin_path) + .verify_or_cache_process_exe(&bin_path, owner_pid) .map_err(|e| IdentityError { reason: format!("binary integrity check failed: {e}"), binary: Some(bin_path.clone()), @@ -1412,11 +1414,15 @@ fn resolve_owner_identity( ancestors: vec![], })?; - let ancestors = crate::procfs::collect_ancestor_binaries(owner_pid, entrypoint_pid); + let ancestor_identities = collect_ancestor_identities(owner_pid, entrypoint_pid); + let ancestors: Vec = ancestor_identities + .iter() + .map(|(_, path)| path.clone()) + .collect(); - for ancestor in &ancestors { + for (ancestor_pid, ancestor) in &ancestor_identities { identity_cache - .verify_or_cache(ancestor) + .verify_or_cache_process_exe(ancestor, *ancestor_pid) .map_err(|e| IdentityError { reason: format!( "ancestor integrity check failed for {}: {e}", @@ -1441,6 +1447,31 @@ fn resolve_owner_identity( }) } +#[cfg(target_os = "linux")] +fn collect_ancestor_identities(pid: u32, stop_pid: u32) -> Vec<(u32, 
PathBuf)> { + const MAX_DEPTH: usize = 64; + let mut ancestors = Vec::new(); + let mut current = pid; + + for _ in 0..MAX_DEPTH { + let ppid = match crate::procfs::read_ppid(current) { + Some(p) if p > 0 && p != current => p, + _ => break, + }; + + if let Ok(path) = crate::procfs::binary_path(ppid.cast_signed()) { + ancestors.push((ppid, path)); + } + + if ppid == stop_pid || ppid == 1 { + break; + } + current = ppid; + } + + ancestors +} + /// Resolve the identity of the process owning a TCP peer connection. /// /// Walks `/proc//net/tcp` to find the socket inode, locates @@ -1551,8 +1582,17 @@ fn evaluate_opa_tcp( } }; - let pid = entrypoint_pid.load(Ordering::Acquire); - if pid == 0 { + if !crate::opa::network_binary_identity_required() { + let result = evaluate_endpoint_only_opa(engine, host, port); + debug!( + "evaluate_opa_tcp endpoint-only: host={host} port={port} action={:?}", + result.action + ); + return result; + } + + let entrypoint_pid = entrypoint_pid.load(Ordering::Acquire); + let Some(proc_net_anchor_pid) = proc_net_anchor_pid(entrypoint_pid) else { return deny( "entrypoint process not yet spawned".into(), None, @@ -1560,12 +1600,12 @@ fn evaluate_opa_tcp( vec![], vec![], ); - } + }; let total_start = std::time::Instant::now(); let peer_port = peer_addr.port(); - let identity = match resolve_process_identity(pid, peer_port, identity_cache) { + let identity = match resolve_process_identity(proc_net_anchor_pid, peer_port, identity_cache) { Ok(id) => id, Err(err) => { return deny( @@ -1619,6 +1659,52 @@ fn evaluate_opa_tcp( result } +#[cfg(target_os = "linux")] +fn proc_net_anchor_pid(entrypoint_pid: u32) -> Option { + if entrypoint_pid != 0 { + return Some(entrypoint_pid); + } + sidecar_topology_enabled().then(std::process::id) +} + +#[cfg(target_os = "linux")] +fn sidecar_topology_enabled() -> bool { + std::env::var(openshell_core::sandbox_env::SUPERVISOR_TOPOLOGY) + .is_ok_and(|value| value == SIDECAR_SUPERVISOR_TOPOLOGY) +} + +fn 
evaluate_endpoint_only_opa(engine: &OpaEngine, host: &str, port: u16) -> ConnectDecision { + let input = crate::opa::NetworkInput { + host: host.to_string(), + port, + binary_path: PathBuf::new(), + binary_sha256: String::new(), + ancestors: vec![], + cmdline_paths: vec![], + }; + + match engine.evaluate_network_action_with_generation(&input) { + Ok((action, generation)) => ConnectDecision { + action, + generation, + binary: None, + binary_pid: None, + ancestors: vec![], + cmdline_paths: vec![], + }, + Err(e) => ConnectDecision { + action: NetworkAction::Deny { + reason: format!("policy evaluation error: {e}"), + }, + generation: engine.current_generation(), + binary: None, + binary_pid: None, + ancestors: vec![], + cmdline_paths: vec![], + }, + } +} + /// Non-Linux stub: OPA identity binding requires /proc. #[cfg(not(target_os = "linux"))] fn evaluate_opa_tcp( @@ -1626,9 +1712,13 @@ fn evaluate_opa_tcp( engine: &OpaEngine, _identity_cache: &BinaryIdentityCache, _entrypoint_pid: &AtomicU32, - _host: &str, - _port: u16, + host: &str, + port: u16, ) -> ConnectDecision { + if !crate::opa::network_binary_identity_required() { + return evaluate_endpoint_only_opa(engine, host, port); + } + ConnectDecision { action: NetworkAction::Deny { reason: "identity binding unavailable on this platform".into(), @@ -2130,14 +2220,24 @@ fn query_l7_route_snapshot( }; match engine.query_endpoint_configs_with_generation(&input) { - Ok((vals, generation)) => Some(L7RouteSnapshot { - configs: vals + Ok((vals, generation)) => { + let configs: Vec<_> = vals .into_iter() .filter_map(|val| crate::l7::parse_l7_config(&val)) .map(|config| L7ConfigSnapshot { config }) - .collect(), - generation, - }), + .collect(); + debug!( + host, + port, + generation, + config_count = configs.len(), + "Forward proxy L7 route lookup complete" + ); + Some(L7RouteSnapshot { + configs, + generation, + }) + } Err(e) => { let event = NetworkActivityBuilder::new(openshell_ocsf::ctx::ctx()) 
.activity(ActivityId::Fail) @@ -3317,10 +3417,29 @@ async fn handle_forward_proxy( } }; let policy_str = matched_policy.as_deref().unwrap_or("-"); + debug!( + host = %host_lc, + port, + binary = %binary_str, + binary_pid = %pid_str, + matched_policy = %policy_str, + decision_generation = decision.generation, + current_generation = opa_engine.current_generation(), + action = ?decision.action, + "Forward proxy L4 policy decision" + ); let sandbox_entrypoint_pid = entrypoint_pid.load(Ordering::Acquire); let forward_generation_guard = match opa_engine.generation_guard(decision.generation) { Ok(guard) => guard, Err(e) => { + warn!( + host = %host_lc, + port, + decision_generation = decision.generation, + current_generation = opa_engine.current_generation(), + error = %e, + "Forward proxy rejected request because policy generation changed after L4 decision" + ); emit_l7_tunnel_close_after_policy_change(&host_lc, port, e); emit_activity_simple(activity_tx, true, "policy_stale"); respond( @@ -3381,6 +3500,15 @@ async fn handle_forward_proxy( && !route.configs.is_empty() { if route.generation != forward_generation_guard.captured_generation() { + warn!( + host = %host_lc, + port, + decision_generation = decision.generation, + guard_generation = forward_generation_guard.captured_generation(), + route_generation = route.generation, + current_generation = opa_engine.current_generation(), + "Forward proxy rejected request because L7 route lookup used a different policy generation" + ); emit_l7_tunnel_close_after_policy_change( &host_lc, port, @@ -3406,6 +3534,14 @@ async fn handle_forward_proxy( let tunnel_engine = match opa_engine.clone_engine_for_tunnel(route.generation) { Ok(engine) => engine, Err(e) => { + warn!( + host = %host_lc, + port, + route_generation = route.generation, + current_generation = opa_engine.current_generation(), + error = %e, + "Forward proxy rejected request because L7 tunnel engine could not be cloned" + ); 
emit_l7_tunnel_close_after_policy_change(&host_lc, port, e); emit_activity_simple(activity_tx, true, "policy_stale"); respond( @@ -4010,6 +4146,14 @@ async fn handle_forward_proxy( }; if let Err(e) = forward_generation_guard.ensure_current() { + warn!( + host = %host_lc, + port, + captured_generation = forward_generation_guard.captured_generation(), + current_generation = forward_generation_guard.current_generation(), + error = %e, + "Forward proxy rejected request because policy changed before upstream connect" + ); emit_l7_tunnel_close_after_policy_change(&host_lc, port, e); emit_activity_simple(activity_tx, true, "policy_stale"); respond( @@ -4148,6 +4292,14 @@ async fn handle_forward_proxy( }; if let Err(e) = forward_generation_guard.ensure_current() { + warn!( + host = %host_lc, + port, + captured_generation = forward_generation_guard.captured_generation(), + current_generation = forward_generation_guard.current_generation(), + error = %e, + "Forward proxy rejected request because policy changed before relay" + ); emit_l7_tunnel_close_after_policy_change(&host_lc, port, e); respond( client, @@ -4284,6 +4436,46 @@ mod tests { use tokio::io::{AsyncRead, AsyncReadExt, AsyncWriteExt}; use tokio::net::{TcpListener, TcpStream}; + #[test] + fn endpoint_only_opa_allows_declared_endpoint_without_process_identity() { + let policy = include_str!("../data/sandbox-policy.rego"); + let data = r#" +version: 1 +network_policies: + test_l7: + name: test_l7 + endpoints: + - host: host.k3d.internal + port: 56123 + protocol: rest + enforcement: enforce + rules: + - allow: + method: GET + path: /allowed + binaries: + - path: /usr/bin/curl +"#; + let engine = OpaEngine::from_strings_with_binary_identity_required(policy, data, false) + .expect("relaxed engine"); + + let decision = evaluate_endpoint_only_opa(&engine, "host.k3d.internal", 56123); + assert_eq!( + decision.action, + NetworkAction::Allow { + matched_policy: Some("test_l7".to_string()), + } + ); + 
assert!(decision.binary.is_none()); + assert!(decision.ancestors.is_empty()); + + let denied = evaluate_endpoint_only_opa(&engine, "api.example.com", 443); + assert!( + matches!(denied.action, NetworkAction::Deny { .. }), + "endpoint-only mode must still deny undeclared endpoints" + ); + } + fn websocket_l7_config( protocol: crate::l7::L7Protocol, websocket_credential_rewrite: bool, diff --git a/crates/openshell-supervisor-network/src/run.rs b/crates/openshell-supervisor-network/src/run.rs index 9553e0673..8e17758bd 100644 --- a/crates/openshell-supervisor-network/src/run.rs +++ b/crates/openshell-supervisor-network/src/run.rs @@ -201,7 +201,9 @@ pub async fn run_networking( let (tls_state, ca_file_paths) = if matches!(policy.network.mode, NetworkMode::Proxy) { match SandboxCa::generate() { Ok(ca) => { - let tls_dir = std::path::Path::new("/etc/openshell-tls"); + let tls_dir = std::env::var(openshell_core::sandbox_env::PROXY_TLS_DIR) + .unwrap_or_else(|_| "/etc/openshell-tls".to_string()); + let tls_dir = std::path::Path::new(&tls_dir); let system_ca_bundle = read_system_ca_bundle(); match write_ca_files(&ca, tls_dir, &system_ca_bundle) { Ok(paths) => { diff --git a/crates/openshell-supervisor-process/Cargo.toml b/crates/openshell-supervisor-process/Cargo.toml index b2dad859e..7752cc8af 100644 --- a/crates/openshell-supervisor-process/Cargo.toml +++ b/crates/openshell-supervisor-process/Cargo.toml @@ -13,6 +13,7 @@ rust-version.workspace = true [dependencies] openshell-core = { path = "../openshell-core" } openshell-ocsf = { path = "../openshell-ocsf" } +openshell-policy = { path = "../openshell-policy" } anyhow = { workspace = true } base64 = { workspace = true } diff --git a/crates/openshell-supervisor-process/src/netns/mod.rs b/crates/openshell-supervisor-process/src/netns/mod.rs index cc7b1d84c..3aabe32ea 100644 --- a/crates/openshell-supervisor-process/src/netns/mod.rs +++ b/crates/openshell-supervisor-process/src/netns/mod.rs @@ -467,6 +467,27 @@ pub fn 
create_netns_for_proxy( } } +/// Install pod-network bypass enforcement for Kubernetes sidecar topology. +/// +/// This runs in the current network namespace, not in a per-workload netns. +/// The rules allow loopback and the sidecar proxy UID, then reject direct +/// TCP/UDP egress from other UIDs so traffic must use the sidecar's local +/// proxy. +/// +/// `proxy_uid` is forwarded unchanged to +/// `nft_ruleset::generate_sidecar_bypass_ruleset`, together with the fixed log +/// prefix `openshell:sidecar-bypass:`; the resulting ruleset text is then +/// loaded through `run_nft_current_namespace`. +/// +/// # Errors +/// +/// Returns an error when `nft` is unavailable or the ruleset cannot be loaded. +pub fn install_sidecar_bypass_rules(proxy_uid: u32) -> Result<()> { + let nft_cmd = find_nft().ok_or_else(|| { + miette::miette!( + "trusted nft helper not found; sidecar network enforcement requires nftables" + ) + })?; + let log_prefix = Some("openshell:sidecar-bypass:"); + let ruleset = nft_ruleset::generate_sidecar_bypass_ruleset(proxy_uid, log_prefix); + run_nft_current_namespace(&nft_cmd, &ruleset) +} + /// Run an `ip` command on the host. fn run_ip(args: &[&str]) -> Result<()> { let ip_path = find_trusted_binary("ip", IP_SEARCH_PATHS)?; @@ -490,6 +511,39 @@ fn run_ip(args: &[&str]) -> Result<()> { Ok(()) } +/// Load `ruleset` by running `nft_cmd -f <file>` directly, i.e. in whatever +/// network namespace this process is already in (no `nsenter` wrapper is used). +/// +/// The ruleset text is written to a temporary file (prefix +/// `openshell-sidecar-nft-`, suffix `.conf`) and that file's path is passed to +/// `nft_cmd` as the argument of `-f`. +/// +/// # Errors +/// +/// Returns an error when the temporary file cannot be created or written, when +/// `nft_cmd` cannot be spawned, or when it exits unsuccessfully; in the last +/// case the trimmed stderr of the command is included in the message. +fn run_nft_current_namespace(nft_cmd: &str, ruleset: &str) -> Result<()> { + use std::io::Write; + let mut tmp = tempfile::Builder::new() + .prefix("openshell-sidecar-nft-") + .suffix(".conf") + .tempfile() + .into_diagnostic()?; + tmp.write_all(ruleset.as_bytes()).into_diagnostic()?; + // Lossy UTF-8 conversion of the temp path; the same string is used for the + // debug log below and for the `-f` argument. + let ruleset_path = tmp.path().to_string_lossy().to_string(); + + debug!( + command = %format!("{nft_cmd} -f {ruleset_path}"), + "Loading nftables sidecar ruleset" + ); + + let output = Command::new(nft_cmd) + .args(["-f", &ruleset_path]) + .output() + .into_diagnostic()?; + + // `output()` has returned, so the command no longer needs the file; release + // the temp handle before inspecting the exit status. + // NOTE(review): presumably the `tempfile` handle removes the file on drop - + // confirm against the crate documentation. + drop(tmp); + + if !output.status.success() { + let stderr = String::from_utf8_lossy(&output.stderr); + return Err(miette::miette!( + "sidecar nft ruleset load failed: {}", + stderr.trim() + )); + } + + Ok(()) +} + /// Run an `ip` command inside a network namespace via `nsenter --net=`.
/// /// We use `nsenter` instead of `ip netns exec` because `ip netns exec` diff --git a/crates/openshell-supervisor-process/src/netns/nft_ruleset.rs b/crates/openshell-supervisor-process/src/netns/nft_ruleset.rs index ba7aeb936..d7ec5132e 100644 --- a/crates/openshell-supervisor-process/src/netns/nft_ruleset.rs +++ b/crates/openshell-supervisor-process/src/netns/nft_ruleset.rs @@ -53,6 +53,46 @@ pub fn generate_bypass_ruleset(host_ip: &str, proxy_port: u16, log_prefix: Optio ) } +/// Generate a pod-network ruleset for Kubernetes sidecar enforcement. +/// +/// The network sidecar and the process supervisor share a pod network +/// namespace. The sidecar runs as `proxy_uid` and owns external egress; +/// sandbox traffic must use loopback services hosted by that sidecar +/// (gateway forward and HTTP CONNECT proxy). +/// +/// The returned text declares one `inet` table, `openshell_sidecar_bypass`, +/// with a single `output` chain (`policy accept`). Rules are emitted in this +/// order: accept traffic leaving via `lo`, accept established/related +/// connections, accept packets whose socket UID is `proxy_uid`, then reject +/// IPv4/IPv6 TCP followed by IPv4/IPv6 UDP with port-unreachable. +/// +/// When `log_prefix` is `Some`, a rate-limited (`5/second burst 10 packets`) +/// log rule carrying that prefix is placed immediately before the TCP rejects +/// (matching `tcp flags syn`) and a second one immediately before the UDP +/// rejects; with `None` no log rules are emitted. +/// +/// NOTE(review): the chain policy is `accept` and only TCP and UDP are +/// rejected, so other protocols are not filtered by this ruleset - confirm +/// that this is intended. +pub fn generate_sidecar_bypass_ruleset(proxy_uid: u32, log_prefix: Option<&str>) -> String { + // Optional log rules. Each fragment starts with "\n" because it is appended + // to the end of the preceding rule line inside the template below, and is + // the empty string when no prefix was supplied. + let log_tcp = log_prefix + .map(|p| { + format!( + "\n tcp flags syn limit rate 5/second burst 10 packets log prefix \"{p}\" flags skuid" + ) + }) + .unwrap_or_default(); + let log_udp = log_prefix + .map(|p| { + format!( + "\n meta l4proto udp limit rate 5/second burst 10 packets log prefix \"{p}\" flags skuid" + ) + }) + .unwrap_or_default(); + + // Doubled braces are literal braces in the nft syntax; single-brace names + // are substituted by `format!`. + format!( + r#"table inet openshell_sidecar_bypass {{ + chain output {{ + type filter hook output priority 0; policy accept; + + oifname "lo" accept + ct state established,related accept + meta skuid {proxy_uid} accept{log_tcp} + meta nfproto ipv4 meta l4proto tcp reject with icmp type port-unreachable + meta nfproto ipv6 meta l4proto tcp reject with icmpv6 type port-unreachable{log_udp} + meta nfproto ipv4 meta l4proto udp reject with icmp type port-unreachable + meta nfproto ipv6 meta l4proto udp reject with icmpv6 type port-unreachable + }} +}} +"# + ) +} + #[cfg(test)] mod tests { use super::*; @@ -145,4 +185,27 @@ mod tests { "UDP log rule must come before UDP reject rule" ); } + + #[test] + fn
sidecar_ruleset_allows_supervisor_uid_and_loopback() { + let ruleset = generate_sidecar_bypass_ruleset(1337, None); + assert!(ruleset.contains("table inet openshell_sidecar_bypass")); + assert!(ruleset.contains("oifname \"lo\" accept")); + assert!(ruleset.contains("meta skuid 1337 accept")); + } + + #[test] + fn sidecar_ruleset_rejects_tcp_and_udp_egress() { + let ruleset = generate_sidecar_bypass_ruleset(0, Some("openshell:sidecar:test:")); + assert!(ruleset.contains("meta nfproto ipv4 meta l4proto tcp reject")); + assert!(ruleset.contains("meta nfproto ipv6 meta l4proto tcp reject")); + assert!(ruleset.contains("meta nfproto ipv4 meta l4proto udp reject")); + assert!(ruleset.contains("meta nfproto ipv6 meta l4proto udp reject")); + assert_eq!( + ruleset + .matches("log prefix \"openshell:sidecar:test:\"") + .count(), + 2 + ); + } } diff --git a/crates/openshell-supervisor-process/src/process.rs b/crates/openshell-supervisor-process/src/process.rs index 9f9fe1822..358334fec 100644 --- a/crates/openshell-supervisor-process/src/process.rs +++ b/crates/openshell-supervisor-process/src/process.rs @@ -11,7 +11,7 @@ use crate::netns::NetworkNamespace; use crate::sandbox; use miette::{IntoDiagnostic, Result}; use nix::sys::signal::{self, Signal}; -use nix::unistd::{Group, Pid, User}; +use nix::unistd::{Gid, Group, Pid, Uid, User}; use openshell_core::policy::{NetworkMode, SandboxPolicy}; use std::collections::HashMap; use std::ffi::CString; @@ -28,10 +28,32 @@ use std::sync::OnceLock; use tokio::process::{Child, Command}; use tracing::debug; +/// Process/filesystem enforcement performed by the process supervisor. +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub enum ProcessEnforcementMode { + /// Preserve the existing supervisor behavior: prepare filesystem policy, + /// drop privileges, and apply Landlock/seccomp to workload processes. + Full, + /// Preserve process launch and SSH/session behavior, but skip controls + /// that require root or extra Linux capabilities. 
Kubernetes sidecar mode + /// uses this when network policy is enforced by the network sidecar. + NetworkOnly, +} + +impl ProcessEnforcementMode { + #[must_use] + pub const fn enforces_process_controls(self) -> bool { + matches!(self, Self::Full) + } +} + const SUPERVISOR_ONLY_ENV_VARS: &[&str] = &[ openshell_core::sandbox_env::SANDBOX_TOKEN, openshell_core::sandbox_env::SANDBOX_TOKEN_FILE, openshell_core::sandbox_env::K8S_SA_TOKEN_FILE, + openshell_core::sandbox_env::TLS_CA, + openshell_core::sandbox_env::TLS_CERT, + openshell_core::sandbox_env::TLS_KEY, openshell_core::sandbox_env::PROVIDER_SPIFFE_WORKLOAD_API_SOCKET, ]; @@ -403,6 +425,7 @@ impl ProcessHandle { workdir: Option<&str>, interactive: bool, policy: &SandboxPolicy, + enforcement_mode: ProcessEnforcementMode, netns: Option<&NetworkNamespace>, ca_paths: Option<&(PathBuf, PathBuf)>, provider_env: &HashMap, @@ -413,6 +436,7 @@ impl ProcessHandle { workdir, interactive, policy, + enforcement_mode, netns.and_then(NetworkNamespace::ns_fd), ca_paths, provider_env, @@ -425,12 +449,14 @@ impl ProcessHandle { /// /// Returns an error if the process fails to start. #[cfg(not(target_os = "linux"))] + #[allow(clippy::too_many_arguments)] pub fn spawn( program: &str, args: &[String], workdir: Option<&str>, interactive: bool, policy: &SandboxPolicy, + enforcement_mode: ProcessEnforcementMode, ca_paths: Option<&(PathBuf, PathBuf)>, provider_env: &HashMap, ) -> Result { @@ -440,6 +466,7 @@ impl ProcessHandle { workdir, interactive, policy, + enforcement_mode, ca_paths, provider_env, ) @@ -453,6 +480,7 @@ impl ProcessHandle { workdir: Option<&str>, interactive: bool, policy: &SandboxPolicy, + enforcement_mode: ProcessEnforcementMode, netns_fd: Option, ca_paths: Option<&(PathBuf, PathBuf)>, provider_env: &HashMap, @@ -512,18 +540,30 @@ impl ProcessHandle { // process where the tracing subscriber is functional. The child's // pre_exec context cannot reliably emit structured logs. 
#[cfg(target_os = "linux")] - sandbox::linux::log_sandbox_readiness(policy, workdir); + if enforcement_mode.enforces_process_controls() { + sandbox::linux::log_sandbox_readiness(policy, workdir); + } // Phase 1 (as root): Prepare Landlock ruleset by opening PathFds. // This MUST happen before drop_privileges() so that root-only paths // (e.g. mode 700 directories) can be opened. See issue #803. #[cfg(target_os = "linux")] - let prepared_sandbox = sandbox::linux::prepare(policy, workdir) - .map_err(|err| miette::miette!("Failed to prepare sandbox: {err}"))?; + let prepared_sandbox = if enforcement_mode.enforces_process_controls() { + Some( + sandbox::linux::prepare(policy, workdir) + .map_err(|err| miette::miette!("Failed to prepare sandbox: {err}"))?, + ) + } else { + None + }; #[cfg(target_os = "linux")] - let supervisor_identity_mount = supervisor_identity_mount_from_env().map_err(|err| { - miette::miette!("Failed to prepare supervisor identity isolation: {err}") - })?; + let supervisor_identity_mount = if enforcement_mode.enforces_process_controls() { + supervisor_identity_mount_from_env().map_err(|err| { + miette::miette!("Failed to prepare supervisor identity isolation: {err}") + })? + } else { + None + }; // Set up process group for signal handling (non-interactive mode only). // In interactive mode, we inherit the parent's process group to maintain @@ -535,7 +575,7 @@ impl ProcessHandle { // Wrap in Option so we can .take() it out of the FnMut closure. // pre_exec is only called once (after fork, before exec). #[cfg(target_os = "linux")] - let mut prepared_sandbox = Some(prepared_sandbox); + let mut prepared_sandbox = prepared_sandbox; #[allow(unsafe_code)] unsafe { cmd.pre_exec(move || { @@ -560,8 +600,10 @@ impl ProcessHandle { // Drop privileges. initgroups/setgid/setuid need access to // /etc/group and /etc/passwd which would be blocked if // Landlock were already enforced. 
- drop_privileges(&policy) - .map_err(|err| std::io::Error::other(err.to_string()))?; + if enforcement_mode.enforces_process_controls() { + drop_privileges(&policy) + .map_err(|err| std::io::Error::other(err.to_string()))?; + } harden_child_process().map_err(|err| std::io::Error::other(err.to_string()))?; @@ -589,12 +631,14 @@ impl ProcessHandle { } #[cfg(not(target_os = "linux"))] + #[allow(clippy::too_many_arguments)] fn spawn_impl( program: &str, args: &[String], workdir: Option<&str>, interactive: bool, policy: &SandboxPolicy, + enforcement_mode: ProcessEnforcementMode, ca_paths: Option<&(PathBuf, PathBuf)>, provider_env: &HashMap, ) -> Result { @@ -657,13 +701,17 @@ impl ProcessHandle { // Drop privileges before applying sandbox restrictions. // initgroups/setgid/setuid need access to /etc/group and /etc/passwd // which may be blocked by Landlock. - drop_privileges(&policy) - .map_err(|err| std::io::Error::other(err.to_string()))?; + if enforcement_mode.enforces_process_controls() { + drop_privileges(&policy) + .map_err(|err| std::io::Error::other(err.to_string()))?; + } harden_child_process().map_err(|err| std::io::Error::other(err.to_string()))?; - sandbox::apply(&policy, workdir.as_deref()) - .map_err(|err| std::io::Error::other(err.to_string()))?; + if enforcement_mode.enforces_process_controls() { + sandbox::apply(&policy, workdir.as_deref()) + .map_err(|err| std::io::Error::other(err.to_string()))?; + } Ok(()) }); @@ -748,17 +796,36 @@ impl Drop for ProcessHandle { } } -/// Validate that the `sandbox` user exists in this image. +/// Validate that the configured sandbox identity exists in this image. +/// +/// When the identity is the literal `"sandbox"`, verifies the user exists +/// in `/etc/passwd` (all sandbox images ship with one). /// -/// All sandbox images must include a `sandbox` user for privilege dropping. -/// This check runs at supervisor startup (inside the container) where we can -/// inspect `/etc/passwd`. 
If the user is missing, the sandbox fails fast -/// with a clear error instead of silently running child processes as root. +/// When the identity is a numeric UID, skips the passwd lookup entirely — +/// the kernel will use the resolved UID regardless of whether an entry +/// exists in `/etc/passwd`. Logs an OCSF event confirming numeric UID usage. +/// Non-numeric, non-"sandbox" values are rejected. #[cfg(unix)] pub fn validate_sandbox_user(policy: &SandboxPolicy) -> Result<()> { - let user_name = policy.process.run_as_user.as_deref().unwrap_or("sandbox"); + let identity = policy.process.run_as_user.as_deref().unwrap_or("sandbox"); + + // Numeric UID — no passwd entry required; kernel resolves directly. + if openshell_policy::is_valid_sandbox_identity(identity) && identity.parse::().is_ok() { + openshell_ocsf::ocsf_emit!( + openshell_ocsf::ConfigStateChangeBuilder::new(openshell_ocsf::ctx::ctx()) + .severity(openshell_ocsf::SeverityId::Informational) + .status(openshell_ocsf::StatusId::Success) + .state(openshell_ocsf::StateId::Enabled, "validated") + .message(format!( + "Accepted numeric UID {identity} (no passwd entry required)" + )) + .build() + ); + return Ok(()); + } - if user_name.is_empty() || user_name == "sandbox" { + // "sandbox" name — must exist in /etc/passwd. + if identity == "sandbox" { match User::from_name("sandbox") { Ok(Some(_)) => { openshell_ocsf::ocsf_emit!( @@ -780,11 +847,36 @@ pub fn validate_sandbox_user(policy: &SandboxPolicy) -> Result<()> { return Err(miette::miette!("failed to look up 'sandbox' user: {e}")); } } + } else if !identity.is_empty() { + // Non-numeric, non-sandbox string — attempt passwd lookup. + // This catches cases where someone accidentally put "root" or similar. 
+ match User::from_name(identity) { + Ok(Some(_)) => { + tracing::warn!( + identity, + "non-sandbox user accepted via passwd entry; \ + consider using a numeric UID for UID-injected images" + ); + } + Ok(None) => { + return Err(miette::miette!( + "unrecognized sandbox identity '{identity}'; \ + expected 'sandbox' or a numeric UID in range [{MIN_SANDBOX_UID}, {MAX_SANDBOX_UID}]" + )); + } + Err(e) => { + return Err(miette::miette!( + "failed to look up identity '{identity}': {e}" + )); + } + } } Ok(()) } +pub use openshell_policy::{MAX_SANDBOX_UID, MIN_SANDBOX_UID}; + /// Prepare a `read_write` path for the sandboxed process. /// /// Returns `true` when the path was created by the supervisor and therefore @@ -823,9 +915,13 @@ fn prepare_read_write_path(path: &Path) -> Result { /// Creates `read_write` directories if they don't exist and sets ownership /// on newly-created paths to the configured sandbox user/group. This runs as /// the supervisor (root) before forking the child process. +/// +/// Accepts both name-based identities (resolved via `/etc/passwd`) and numeric +/// UIDs/GIDs (passed directly to `chown` without a passwd lookup). #[cfg(unix)] pub fn prepare_filesystem(policy: &SandboxPolicy) -> Result<()> { use nix::unistd::chown; + use nix::unistd::{Gid, Uid}; let user_name = match policy.process.run_as_user.as_deref() { Some(name) if !name.is_empty() => Some(name), @@ -841,27 +937,22 @@ pub fn prepare_filesystem(policy: &SandboxPolicy) -> Result<()> { return Ok(()); } - // Resolve user and group - let uid = if let Some(name) = user_name { - Some( - User::from_name(name) - .into_diagnostic()? - .ok_or_else(|| miette::miette!("Sandbox user not found: {name}"))? - .uid, - ) - } else { - None + // Resolve UID: numeric values are passed directly; names resolve via passwd. 
+ let uid = match user_name { + Some(name) if name.parse::().is_ok() => { + Some(Uid::from_raw(name.parse().into_diagnostic()?)) + } + Some(name) => User::from_name(name).into_diagnostic()?.map(|u| u.uid), + _ => None, }; - let gid = if let Some(name) = group_name { - Some( - Group::from_name(name) - .into_diagnostic()? - .ok_or_else(|| miette::miette!("Sandbox group not found: {name}"))? - .gid, - ) - } else { - None + // Resolve GID: numeric values are passed directly; names resolve via group. + let gid = match group_name { + Some(name) if name.parse::().is_ok() => { + Some(Gid::from_raw(name.parse().into_diagnostic()?)) + } + Some(name) => Group::from_name(name).into_diagnostic()?.map(|g| g.gid), + _ => None, }; // Create missing read_write paths and only chown the ones we created. @@ -914,27 +1005,59 @@ pub fn drop_privileges(policy: &SandboxPolicy) -> Result<()> { return Ok(()); } - let user = if let Some(name) = user_name { - User::from_name(name) - .into_diagnostic()? - .ok_or_else(|| miette::miette!("Sandbox user not found: {name}"))? - } else { - User::from_uid(nix::unistd::geteuid()) - .into_diagnostic()? - .ok_or_else(|| miette::miette!("Failed to resolve current user"))? + // Resolve UID: numeric values are used directly; names resolve via passwd. + let target_uid = match user_name { + Some(name) if name.parse::().is_ok() => Uid::from_raw(name.parse().into_diagnostic()?), + Some(name) => { + User::from_name(name) + .into_diagnostic()? + .ok_or_else(|| miette::miette!("Sandbox user not found: {name}"))? + .uid + } + None => nix::unistd::geteuid(), }; - let group = if let Some(name) = group_name { - Group::from_name(name) + // Resolve group: if a numeric GID is configured use it directly. + // Otherwise try name resolution, then fall back to current user's primary group. 
+ let target_gid = match group_name { + Some(name) if name.parse::().is_ok() => Gid::from_raw(name.parse().into_diagnostic()?), + Some(name) => { + Group::from_name(name) + .into_diagnostic()? + .ok_or_else(|| miette::miette!("Sandbox group not found: {name}"))? + .gid + } + None => match target_uid.as_raw() { + 0 => nix::unistd::getegid(), + _ => Group::from_gid( + User::from_uid(target_uid) + .into_diagnostic()? + .ok_or_else(|| miette::miette!("Failed to resolve user from UID {target_uid}"))? + .gid, + ) .into_diagnostic()? - .ok_or_else(|| miette::miette!("Sandbox group not found: {name}"))? + .map_or_else(nix::unistd::getegid, |g| g.gid), + }, + }; + + // Resolve the user record for initgroups (if name-based) or skip it (numeric UID). + let user = if user_name.is_some() { + Some( + User::from_uid(target_uid) + .into_diagnostic()? + .ok_or_else(|| { + miette::miette!("Failed to resolve user record for UID {target_uid}") + })?, + ) } else { - Group::from_gid(user.gid) - .into_diagnostic()? - .ok_or_else(|| miette::miette!("Failed to resolve user primary group"))? + None }; - if user_name.is_some() { + // Set supplementary groups only when we have a name-based identity. + // Numeric UIDs may not have a passwd entry, so initgroups would fail. 
+ if let Some(ref user) = user + && target_uid != nix::unistd::geteuid() + { let user_cstr = CString::new(user.name.clone()).map_err(|_| miette::miette!("Invalid user name"))?; #[cfg(any( @@ -953,31 +1076,35 @@ pub fn drop_privileges(policy: &SandboxPolicy) -> Result<()> { target_os = "redox" )))] { - nix::unistd::initgroups(user_cstr.as_c_str(), group.gid).into_diagnostic()?; + nix::unistd::initgroups(user_cstr.as_c_str(), target_gid).into_diagnostic()?; } } - nix::unistd::setgid(group.gid).into_diagnostic()?; + if target_gid != nix::unistd::getegid() { + nix::unistd::setgid(target_gid).into_diagnostic()?; + } // Verify effective GID actually changed (defense-in-depth, CWE-250 / CERT POS37-C) let effective_gid = nix::unistd::getegid(); - if effective_gid != group.gid { + if effective_gid != target_gid { return Err(miette::miette!( "Privilege drop verification failed: expected effective GID {}, got {}", - group.gid, + target_gid, effective_gid )); } - if user_name.is_some() { - nix::unistd::setuid(user.uid).into_diagnostic()?; + if let Some(_user) = user { + if target_uid != nix::unistd::geteuid() { + nix::unistd::setuid(target_uid).into_diagnostic()?; + } // Verify effective UID actually changed (defense-in-depth, CWE-250 / CERT POS37-C) let effective_uid = nix::unistd::geteuid(); - if effective_uid != user.uid { + if effective_uid != target_uid { return Err(miette::miette!( "Privilege drop verification failed: expected effective UID {}, got {}", - user.uid, + target_uid, effective_uid )); } @@ -985,11 +1112,11 @@ pub fn drop_privileges(policy: &SandboxPolicy) -> Result<()> { // Verify root cannot be re-acquired (CERT POS37-C hardening). // If we dropped from root, setuid(0) must fail; success means privileges // were not fully relinquished. 
- if nix::unistd::setuid(nix::unistd::Uid::from_raw(0)).is_ok() && user.uid.as_raw() != 0 { + if nix::unistd::setuid(Uid::from_raw(0)).is_ok() && target_uid.as_raw() != 0 { return Err(miette::miette!( "Privilege drop verification failed: process can still re-acquire root (UID 0) \ after switching to UID {}", - user.uid + target_uid )); } } @@ -1403,7 +1530,7 @@ mod tests { let current_user = User::from_uid(nix::unistd::geteuid()) .unwrap() .expect("current user entry"); - let restricted_group = Group::from_gid(nix::unistd::Gid::from_raw(0)) + let restricted_group = Group::from_gid(Gid::from_raw(0)) .unwrap() .expect("gid 0 group entry"); if restricted_group.gid == nix::unistd::getegid() { @@ -1538,4 +1665,54 @@ mod tests { Some(PathBuf::from("/run/spire")) ); } + + // ---- Numeric UID tests (Phase 2) ---- + + #[test] + fn drop_privileges_accepts_numeric_uid() { + // When running as non-root, a numeric UID/GID that matches the + // current process should succeed without any passwd lookup. + if nix::unistd::geteuid().is_root() { + return; + } + + let uid_raw = nix::unistd::geteuid().as_raw(); + let gid_raw = nix::unistd::getegid().as_raw(); + + let policy = policy_with_process(ProcessPolicy { + run_as_user: Some(uid_raw.to_string()), + run_as_group: Some(gid_raw.to_string()), + }); + + assert!( + drop_privileges(&policy).is_ok(), + "should accept current process UID/GID as numeric strings" + ); + } + + #[test] + fn drop_privileges_numeric_uid_skips_initgroups() { + // When running as non-root with a numeric user but group matches, + // initgroups should not be called (guard: target_uid != geteuid()). + if nix::unistd::geteuid().is_root() { + return; + } + + let current_uid = nix::unistd::geteuid().as_raw(); + + // Use a different group name that exists (the current one). 
+ let current_group = Group::from_gid(nix::unistd::getegid()) + .expect("should resolve current group") + .expect("current group should exist"); + + let policy = policy_with_process(ProcessPolicy { + run_as_user: Some(current_uid.to_string()), // numeric UID, no passwd entry needed + run_as_group: Some(current_group.name), // name-based group + }); + + assert!( + drop_privileges(&policy).is_ok(), + "should accept numeric UID with name-based group (initgroups guarded)" + ); + } } diff --git a/crates/openshell-supervisor-process/src/run.rs b/crates/openshell-supervisor-process/src/run.rs index 5a5c203a2..c9cd63be9 100644 --- a/crates/openshell-supervisor-process/src/run.rs +++ b/crates/openshell-supervisor-process/src/run.rs @@ -33,7 +33,7 @@ use openshell_core::denial::DenialEvent; #[cfg(target_os = "linux")] use crate::managed_children; -use crate::process::ProcessHandle; +use crate::process::{ProcessEnforcementMode, ProcessHandle}; fn ocsf_ctx() -> &'static openshell_ocsf::SandboxContext { openshell_ocsf::ctx::ctx() @@ -57,6 +57,7 @@ pub async fn run_process( openshell_endpoint: Option<&str>, ssh_socket_path: Option, policy: &SandboxPolicy, + enforcement_mode: ProcessEnforcementMode, entrypoint_pid: Arc, provider_credentials: ProviderCredentialState, provider_env: std::collections::HashMap, @@ -71,13 +72,17 @@ pub async fn run_process( // must include a "sandbox" user for privilege dropping; failing fast here // beats silently running children as root. #[cfg(unix)] - crate::process::validate_sandbox_user(policy)?; + if enforcement_mode.enforces_process_controls() { + crate::process::validate_sandbox_user(policy)?; + } // Create read_write directories and chown newly-created ones to the // sandbox user/group. Runs as the supervisor (root) before the child // is forked so the workload sees writable paths it owns. 
#[cfg(unix)] - crate::process::prepare_filesystem(policy)?; + if enforcement_mode.enforces_process_controls() { + crate::process::prepare_filesystem(policy)?; + } // Eagerly fetch initial settings and install the agent skill if the // proposals flag is on at startup, rather than waiting for the policy @@ -198,31 +203,10 @@ pub async fn run_process( // their env so cooperative tools (curl, npm, Node) route through the // CONNECT proxy. Linux uses the netns host_ip; on other targets fall back // to the policy-declared http_addr directly. - let ssh_proxy_url = if matches!(policy.network.mode, NetworkMode::Proxy) { - #[cfg(target_os = "linux")] - { - netns.map(|ns| { - let port = policy - .network - .proxy - .as_ref() - .and_then(|p| p.http_addr) - .map_or(3128, |addr| addr.port()); - format!("http://{}:{port}", ns.host_ip()) - }) - } - #[cfg(not(target_os = "linux"))] - { - policy - .network - .proxy - .as_ref() - .and_then(|p| p.http_addr) - .map(|addr| format!("http://{addr}")) - } - } else { - None - }; + #[cfg(target_os = "linux")] + let ssh_proxy_url = ssh_proxy_url_for_policy(policy, netns.map(NetworkNamespace::host_ip)); + #[cfg(not(target_os = "linux"))] + let ssh_proxy_url = ssh_proxy_url_for_policy(policy, None); let ssh_socket_path: Option = ssh_socket_path.map(std::path::PathBuf::from); if let Some(listen_path) = ssh_socket_path.clone() { @@ -251,6 +235,7 @@ pub async fn run_process( ca_paths, provider_credentials_clone, user_env_clone, + enforcement_mode, ) .await { @@ -317,6 +302,7 @@ pub async fn run_process( workdir, interactive, policy, + enforcement_mode, netns, ca_file_paths.as_ref(), &provider_env, @@ -329,12 +315,16 @@ pub async fn run_process( workdir, interactive, policy, + enforcement_mode, ca_file_paths.as_ref(), &provider_env, )?; // Store the entrypoint PID so the proxy can resolve TCP peer identity entrypoint_pid.store(handle.pid(), Ordering::Release); + if let Some(path) = entrypoint_pid_file() { + write_entrypoint_pid_file(&path, 
handle.pid())?; + } ocsf_emit!( ProcessActivityBuilder::new(ocsf_ctx()) .activity(ActivityId::Open) @@ -387,6 +377,42 @@ pub async fn run_process( Ok(status.code()) } +/// Return the entrypoint PID file path configured through the environment +/// variable named by `openshell_core::sandbox_env::ENTRYPOINT_PID_FILE`. +/// +/// Yields `None` when the variable is unset, cannot be read as a string, or is +/// empty. +fn entrypoint_pid_file() -> Option { + std::env::var(openshell_core::sandbox_env::ENTRYPOINT_PID_FILE) + .ok() + .filter(|value| !value.is_empty()) +} + +/// Write `pid` followed by a newline to `path`, creating any missing parent +/// directories first, and log the publication at info level. +/// +/// # Errors +/// +/// Returns an error when the parent directory cannot be created or the file +/// cannot be written. +fn write_entrypoint_pid_file(path: &str, pid: u32) -> Result<()> { + let pid_path = std::path::Path::new(path); + if let Some(parent) = pid_path.parent() { + std::fs::create_dir_all(parent).into_diagnostic()?; + } + std::fs::write(pid_path, format!("{pid}\n")).into_diagnostic()?; + info!( + path, + pid, "Published workload entrypoint PID for network sidecar" + ); + Ok(()) +} + +/// Build the HTTP proxy URL to inject into SSH session environments. +/// +/// Returns `None` unless the policy's network mode is `NetworkMode::Proxy`. +/// When `netns_proxy_host` is given, the URL uses that host with the port of +/// the policy's `http_addr`, defaulting to `3128` when the policy declares no +/// `http_addr` (including when it has no `proxy` section at all). Without a +/// netns host, the policy's `http_addr` is used verbatim, and `None` is +/// returned when none is declared. +fn ssh_proxy_url_for_policy( + policy: &SandboxPolicy, + netns_proxy_host: Option, +) -> Option { + if !matches!(policy.network.mode, NetworkMode::Proxy) { + return None; + } + + // Resolve the optional policy address without bailing out when the `proxy` + // section is absent: the inline code this helper replaced still produced + // `http://<netns host>:3128` in that case, so an early `?` on + // `policy.network.proxy` would silently drop the proxy URL. + let policy_addr = policy + .network + .proxy + .as_ref() + .and_then(|proxy| proxy.http_addr); + if let Some(host) = netns_proxy_host { + let port = policy_addr.map_or(3128, |addr| addr.port()); + return Some(format!("http://{host}:{port}")); + } + + policy_addr.map(|addr| format!("http://{addr}")) +} + /// Eagerly fetch initial settings and install the agent-driven policy /// proposal skill if the flag is on at startup.
/// @@ -443,3 +469,53 @@ async fn install_initial_agent_skill(sandbox_id: Option<&str>, openshell_endpoin ); } } + +#[cfg(test)] +mod tests { + use super::*; + use openshell_core::policy::{ + FilesystemPolicy, LandlockPolicy, NetworkMode, NetworkPolicy, ProcessPolicy, ProxyPolicy, + }; + + fn policy(mode: NetworkMode, http_addr: Option) -> SandboxPolicy { + SandboxPolicy { + version: 1, + filesystem: FilesystemPolicy::default(), + network: NetworkPolicy { + mode, + proxy: http_addr.map(|http_addr| ProxyPolicy { + http_addr: Some(http_addr), + }), + }, + landlock: LandlockPolicy::default(), + process: ProcessPolicy::default(), + } + } + + #[test] + fn ssh_proxy_url_uses_policy_addr_without_netns() { + let policy = policy(NetworkMode::Proxy, Some(([127, 0, 0, 1], 3128).into())); + + assert_eq!( + ssh_proxy_url_for_policy(&policy, None).as_deref(), + Some("http://127.0.0.1:3128") + ); + } + + #[test] + fn ssh_proxy_url_prefers_netns_host_with_policy_port() { + let policy = policy(NetworkMode::Proxy, Some(([127, 0, 0, 1], 8080).into())); + + assert_eq!( + ssh_proxy_url_for_policy(&policy, Some([10, 200, 0, 1].into())).as_deref(), + Some("http://10.200.0.1:8080") + ); + } + + #[test] + fn ssh_proxy_url_skips_non_proxy_mode() { + let policy = policy(NetworkMode::Allow, Some(([127, 0, 0, 1], 3128).into())); + + assert_eq!(ssh_proxy_url_for_policy(&policy, None), None); + } +} diff --git a/crates/openshell-supervisor-process/src/ssh.rs b/crates/openshell-supervisor-process/src/ssh.rs index 955ec780c..c55a6d877 100644 --- a/crates/openshell-supervisor-process/src/ssh.rs +++ b/crates/openshell-supervisor-process/src/ssh.rs @@ -6,7 +6,7 @@ use crate::child_env; #[cfg(target_os = "linux")] use crate::managed_children; -use crate::process::{drop_privileges, is_supervisor_only_env_var}; +use crate::process::{ProcessEnforcementMode, drop_privileges, is_supervisor_only_env_var}; use crate::sandbox; use miette::{IntoDiagnostic, Result}; use nix::pty::{Winsize, openpty}; @@ -42,6 
+42,7 @@ type SshServerInit = ( fn ssh_server_init( listen_path: &Path, ca_file_paths: &Option<(PathBuf, PathBuf)>, + enforcement_mode: ProcessEnforcementMode, ) -> Result { let mut rng = OsRng; let host_key = PrivateKey::random(&mut rng, Algorithm::Ed25519).into_diagnostic()?; @@ -55,13 +56,16 @@ fn ssh_server_init( let config = Arc::new(config); let ca_paths = ca_file_paths.as_ref().map(|p| Arc::new(p.clone())); - // Ensure the parent directory exists and is root-owned with 0700 - // permissions. The sandbox entrypoint runs as an unprivileged user; it - // must not be able to enter this directory and connect to the socket. + // In full enforcement mode the supervisor starts as root and can isolate + // the SSH socket in a root-only directory before spawning unprivileged + // children. In network-only sidecar mode the process supervisor itself + // runs as the sandbox UID, so the driver points the socket at a writable + // sidecar state volume and accepts that Unix permissions no longer isolate + // same-UID child processes from the socket. if let Some(parent) = listen_path.parent() { std::fs::create_dir_all(parent).into_diagnostic()?; #[cfg(unix)] - { + if enforcement_mode.enforces_process_controls() { use std::os::unix::fs::PermissionsExt; let perms = std::fs::Permissions::from_mode(0o700); std::fs::set_permissions(parent, perms).into_diagnostic()?; @@ -108,21 +112,23 @@ pub async fn run_ssh_server( ca_file_paths: Option<(PathBuf, PathBuf)>, provider_credentials: ProviderCredentialState, user_environment: HashMap, + enforcement_mode: ProcessEnforcementMode, ) -> Result<()> { - let (listener, config, ca_paths) = match ssh_server_init(&listen_path, &ca_file_paths) { - Ok(v) => { - // Signal that the SSH server has bound the socket and is ready to - // accept connections. The parent task awaits this before spawning - // the entrypoint process, ensuring exec requests won't race - // against server startup. 
- let _ = ready_tx.send(Ok(())); - v - } - Err(err) => { - let _ = ready_tx.send(Err(err)); - return Ok(()); - } - }; + let (listener, config, ca_paths) = + match ssh_server_init(&listen_path, &ca_file_paths, enforcement_mode) { + Ok(v) => { + // Signal that the SSH server has bound the socket and is ready to + // accept connections. The parent task awaits this before spawning + // the entrypoint process, ensuring exec requests won't race + // against server startup. + let _ = ready_tx.send(Ok(())); + v + } + Err(err) => { + let _ = ready_tx.send(Err(err)); + return Ok(()); + } + }; loop { let (stream, _peer) = listener.accept().await.into_diagnostic()?; @@ -145,6 +151,7 @@ pub async fn run_ssh_server( ca_paths, provider_credentials, user_environment, + enforcement_mode, ) .await { @@ -172,6 +179,7 @@ async fn handle_connection( ca_file_paths: Option>, provider_credentials: ProviderCredentialState, user_environment: HashMap, + enforcement_mode: ProcessEnforcementMode, ) -> Result<()> { // Access is gated by the Unix-socket filesystem permissions (root-only), // not by an application-level preface. 
The supervisor bridges the @@ -195,6 +203,7 @@ async fn handle_connection( ca_file_paths, provider_credentials, user_environment, + enforcement_mode, ); russh::server::run_stream(config, stream, handler) .await @@ -223,6 +232,7 @@ struct SshHandler { ca_file_paths: Option>, provider_credentials: ProviderCredentialState, user_environment: HashMap, + enforcement_mode: ProcessEnforcementMode, channels: HashMap, } @@ -236,6 +246,7 @@ impl SshHandler { ca_file_paths: Option>, provider_credentials: ProviderCredentialState, user_environment: HashMap, + enforcement_mode: ProcessEnforcementMode, ) -> Self { Self { policy, @@ -245,6 +256,7 @@ impl SshHandler { ca_file_paths, provider_credentials, user_environment, + enforcement_mode, channels: HashMap::new(), } } @@ -468,6 +480,7 @@ impl russh::server::Handler for SshHandler { self.ca_file_paths.clone(), &self.provider_credentials.child_env_with_gcp_resolved(), &self.user_environment, + self.enforcement_mode, )?; let state = self.channels.get_mut(&channel).ok_or_else(|| { anyhow::anyhow!("subsystem_request on unknown channel {channel:?}") @@ -564,6 +577,7 @@ impl SshHandler { self.ca_file_paths.clone(), &provider_env, &self.user_environment, + self.enforcement_mode, )?; state.pty_master = Some(pty_master); state.input_sender = Some(input_sender); @@ -582,6 +596,7 @@ impl SshHandler { self.ca_file_paths.clone(), &provider_env, &self.user_environment, + self.enforcement_mode, )?; state.input_sender = Some(input_sender); } @@ -661,12 +676,20 @@ impl Default for PtyRequest { /// Derive the session USER and HOME from the policy's `run_as_user`. /// -/// Falls back to `("sandbox", "/sandbox")` when the policy has no explicit user, -/// preserving backward compatibility with images that use the default layout. +/// For name-based identities, looks up the home directory via `/etc/passwd` +/// (or defaults to `/home/{user}`). 
+/// +/// For numeric UIDs, there is no passwd entry — falls back to +/// `("{uid}", "/sandbox")` so the agent session still has a meaningful +/// USER identifier. fn session_user_and_home(policy: &SandboxPolicy) -> (String, String) { match policy.process.run_as_user.as_deref() { Some(user) if !user.is_empty() => { - // Look up the user's home directory from /etc/passwd. + // Numeric UID — no passwd entry expected; use default HOME. + if user.parse::().is_ok() { + return (user.to_string(), "/sandbox".to_string()); + } + // Name-based identity — look up home from /etc/passwd. let home = nix::unistd::User::from_name(user) .ok() .flatten() @@ -740,6 +763,7 @@ fn spawn_pty_shell( ca_file_paths: Option>, provider_env: &HashMap, user_environment: &HashMap, + enforcement_mode: ProcessEnforcementMode, ) -> anyhow::Result<(std::fs::File, mpsc::Sender>)> { let winsize = Winsize { ws_row: to_u16(pty.row_height.max(1)), @@ -798,12 +822,20 @@ fn spawn_pty_shell( // Probe Landlock availability from the parent process where tracing works. #[cfg(target_os = "linux")] - sandbox::linux::log_sandbox_readiness(policy, workdir.as_deref()); + if enforcement_mode.enforces_process_controls() { + sandbox::linux::log_sandbox_readiness(policy, workdir.as_deref()); + } // Phase 1 (as root): Prepare Landlock ruleset before drop_privileges. 
#[cfg(target_os = "linux")] - let prepared_sandbox = sandbox::linux::prepare(policy, workdir.as_deref()) - .map_err(|err| anyhow::anyhow!("Failed to prepare sandbox: {err}"))?; + let prepared_sandbox = if enforcement_mode.enforces_process_controls() { + Some( + sandbox::linux::prepare(policy, workdir.as_deref()) + .map_err(|err| anyhow::anyhow!("Failed to prepare sandbox: {err}"))?, + ) + } else { + None + }; #[cfg(unix)] { @@ -813,6 +845,7 @@ fn spawn_pty_shell( workdir.clone(), slave_fd, netns_fd, + enforcement_mode, #[cfg(target_os = "linux")] prepared_sandbox, )?; @@ -905,6 +938,7 @@ fn spawn_pipe_exec( ca_file_paths: Option>, provider_env: &HashMap, user_environment: &HashMap, + enforcement_mode: ProcessEnforcementMode, ) -> anyhow::Result>> { let mut cmd = command.map_or_else( || { @@ -947,12 +981,20 @@ fn spawn_pipe_exec( // Probe Landlock availability from the parent process where tracing works. #[cfg(target_os = "linux")] - sandbox::linux::log_sandbox_readiness(policy, workdir.as_deref()); + if enforcement_mode.enforces_process_controls() { + sandbox::linux::log_sandbox_readiness(policy, workdir.as_deref()); + } // Phase 1 (as root): Prepare Landlock ruleset before drop_privileges. 
#[cfg(target_os = "linux")] - let prepared_sandbox = sandbox::linux::prepare(policy, workdir.as_deref()) - .map_err(|err| anyhow::anyhow!("Failed to prepare sandbox: {err}"))?; + let prepared_sandbox = if enforcement_mode.enforces_process_controls() { + Some( + sandbox::linux::prepare(policy, workdir.as_deref()) + .map_err(|err| anyhow::anyhow!("Failed to prepare sandbox: {err}"))?, + ) + } else { + None + }; #[cfg(unix)] { @@ -961,6 +1003,7 @@ fn spawn_pipe_exec( policy.clone(), workdir.clone(), netns_fd, + enforcement_mode, #[cfg(target_os = "linux")] prepared_sandbox, )?; @@ -1060,7 +1103,9 @@ fn spawn_pipe_exec( mod unsafe_pty { #[cfg(not(target_os = "linux"))] use super::sandbox; - use super::{Command, RawFd, SandboxPolicy, Winsize, drop_privileges, setsid}; + use super::{ + Command, ProcessEnforcementMode, RawFd, SandboxPolicy, Winsize, drop_privileges, setsid, + }; #[cfg(unix)] use std::os::unix::process::CommandExt; @@ -1099,17 +1144,21 @@ mod unsafe_pty { _workdir: Option, slave_fd: RawFd, netns_fd: Option, - #[cfg(target_os = "linux")] prepared: crate::sandbox::linux::PreparedSandbox, + enforcement_mode: ProcessEnforcementMode, + #[cfg(target_os = "linux")] prepared: Option, ) -> anyhow::Result<()> { // Wrap in Option so we can .take() it out of the FnMut closure. // pre_exec is only called once (after fork, before exec). #[cfg(target_os = "linux")] - let mut prepared = Some(prepared); + let mut prepared = prepared; #[cfg(target_os = "linux")] - let supervisor_identity_mount = crate::process::supervisor_identity_mount_from_env() - .map_err(|err| { + let supervisor_identity_mount = if enforcement_mode.enforces_process_controls() { + crate::process::supervisor_identity_mount_from_env().map_err(|err| { anyhow::anyhow!("failed to prepare supervisor identity isolation: {err}") - })?; + })? 
+ } else { + None + }; unsafe { cmd.pre_exec(move || { setsid().map_err(|err| std::io::Error::other(err.to_string()))?; @@ -1118,6 +1167,7 @@ mod unsafe_pty { enter_netns_and_sandbox( netns_fd, &policy, + enforcement_mode, #[cfg(target_os = "linux")] supervisor_identity_mount, #[cfg(target_os = "linux")] @@ -1144,20 +1194,25 @@ mod unsafe_pty { policy: SandboxPolicy, _workdir: Option, netns_fd: Option, - #[cfg(target_os = "linux")] prepared: crate::sandbox::linux::PreparedSandbox, + enforcement_mode: ProcessEnforcementMode, + #[cfg(target_os = "linux")] prepared: Option, ) -> anyhow::Result<()> { #[cfg(target_os = "linux")] - let mut prepared = Some(prepared); + let mut prepared = prepared; #[cfg(target_os = "linux")] - let supervisor_identity_mount = crate::process::supervisor_identity_mount_from_env() - .map_err(|err| { + let supervisor_identity_mount = if enforcement_mode.enforces_process_controls() { + crate::process::supervisor_identity_mount_from_env().map_err(|err| { anyhow::anyhow!("failed to prepare supervisor identity isolation: {err}") - })?; + })? + } else { + None + }; unsafe { cmd.pre_exec(move || { enter_netns_and_sandbox( netns_fd, &policy, + enforcement_mode, #[cfg(target_os = "linux")] supervisor_identity_mount, #[cfg(target_os = "linux")] @@ -1171,6 +1226,7 @@ mod unsafe_pty { fn enter_netns_and_sandbox( netns_fd: Option, policy: &SandboxPolicy, + enforcement_mode: ProcessEnforcementMode, #[cfg(target_os = "linux")] supervisor_identity_mount: Option< &crate::process::SupervisorIdentityMountNamespace, >, @@ -1199,7 +1255,9 @@ mod unsafe_pty { // Drop privileges. initgroups/setgid/setuid need /etc/group and // /etc/passwd which would be blocked if Landlock were already enforced. 
- drop_privileges(policy).map_err(|err| std::io::Error::other(err.to_string()))?; + if enforcement_mode.enforces_process_controls() { + drop_privileges(policy).map_err(|err| std::io::Error::other(err.to_string()))?; + } crate::process::harden_child_process() .map_err(|err| std::io::Error::other(err.to_string()))?; @@ -1212,7 +1270,9 @@ mod unsafe_pty { } #[cfg(not(target_os = "linux"))] - sandbox::apply(policy, None).map_err(|err| std::io::Error::other(err.to_string()))?; + if enforcement_mode.enforces_process_controls() { + sandbox::apply(policy, None).map_err(|err| std::io::Error::other(err.to_string()))?; + } Ok(()) } @@ -1527,6 +1587,112 @@ mod tests { assert_eq!(rx_b.recv().unwrap(), b"still-alive"); } + // ----------------------------------------------------------------------- + // session_user_and_home tests (Phase 2: numeric UID support) + // ----------------------------------------------------------------------- + + #[test] + fn session_user_and_home_returns_numeric_uid_as_user() { + use openshell_core::policy::{ + FilesystemPolicy, LandlockPolicy, NetworkPolicy, ProcessPolicy, + }; + let policy = SandboxPolicy { + version: 1, + filesystem: FilesystemPolicy::default(), + network: NetworkPolicy::default(), + landlock: LandlockPolicy::default(), + process: ProcessPolicy { + run_as_user: Some("1000".into()), + run_as_group: None, + }, + }; + let (user, home) = session_user_and_home(&policy); + assert_eq!(user, "1000"); + // Numeric UID has no passwd entry — defaults to /sandbox. 
+ assert_eq!(home, "/sandbox"); + } + + #[test] + fn session_user_and_home_returns_name_from_passwd() { + use openshell_core::policy::{ + FilesystemPolicy, LandlockPolicy, NetworkPolicy, ProcessPolicy, + }; + let policy = SandboxPolicy { + version: 1, + filesystem: FilesystemPolicy::default(), + network: NetworkPolicy::default(), + landlock: LandlockPolicy::default(), + process: ProcessPolicy { + run_as_user: Some("sandbox".into()), + run_as_group: None, + }, + }; + let (user, home) = session_user_and_home(&policy); + assert_eq!(user, "sandbox"); + // Name-based — should resolve via passwd (or /home/{user}). + assert!(!home.is_empty()); + } + + #[test] + fn session_user_and_home_defaults_to_sandbox_when_empty() { + use openshell_core::policy::{ + FilesystemPolicy, LandlockPolicy, NetworkPolicy, ProcessPolicy, + }; + let policy = SandboxPolicy { + version: 1, + filesystem: FilesystemPolicy::default(), + network: NetworkPolicy::default(), + landlock: LandlockPolicy::default(), + process: ProcessPolicy { + run_as_user: Some(String::new()), + run_as_group: None, + }, + }; + let (user, home) = session_user_and_home(&policy); + assert_eq!(user, "sandbox"); + assert_eq!(home, "/sandbox"); + } + + #[test] + fn session_user_and_home_defaults_to_sandbox_when_none() { + use openshell_core::policy::{ + FilesystemPolicy, LandlockPolicy, NetworkPolicy, ProcessPolicy, + }; + let policy = SandboxPolicy { + version: 1, + filesystem: FilesystemPolicy::default(), + network: NetworkPolicy::default(), + landlock: LandlockPolicy::default(), + process: ProcessPolicy { + run_as_user: None, + run_as_group: None, + }, + }; + let (user, home) = session_user_and_home(&policy); + assert_eq!(user, "sandbox"); + assert_eq!(home, "/sandbox"); + } + + #[test] + fn session_user_and_home_handles_large_numeric_uid() { + use openshell_core::policy::{ + FilesystemPolicy, LandlockPolicy, NetworkPolicy, ProcessPolicy, + }; + let policy = SandboxPolicy { + version: 1, + filesystem: 
FilesystemPolicy::default(), + network: NetworkPolicy::default(), + landlock: LandlockPolicy::default(), + process: ProcessPolicy { + run_as_user: Some("1000660000".into()), + run_as_group: None, + }, + }; + let (user, home) = session_user_and_home(&policy); + assert_eq!(user, "1000660000"); + assert_eq!(home, "/sandbox"); + } + /// `install_pre_exec_no_pty` runs drop_privileges and succeeds when the /// current user/group is already the configured one (no actual uid change). /// @@ -1567,21 +1733,24 @@ mod tests { policy, None, None, // no netns fd + ProcessEnforcementMode::Full, #[cfg(target_os = "linux")] - sandbox::linux::prepare( - &SandboxPolicy { - version: 0, - filesystem: FilesystemPolicy::default(), - network: NetworkPolicy::default(), - landlock: LandlockPolicy::default(), - process: ProcessPolicy { - run_as_user: None, - run_as_group: None, + Some( + sandbox::linux::prepare( + &SandboxPolicy { + version: 0, + filesystem: FilesystemPolicy::default(), + network: NetworkPolicy::default(), + landlock: LandlockPolicy::default(), + process: ProcessPolicy { + run_as_user: None, + run_as_group: None, + }, }, - }, - None, - ) - .expect("prepare should succeed in test environment"), + None, + ) + .expect("prepare should succeed in test environment"), + ), ) .expect("install pre_exec should succeed"); diff --git a/deploy/docker/Dockerfile.supervisor b/deploy/docker/Dockerfile.supervisor index c84cc70e9..5bb32d7f5 100644 --- a/deploy/docker/Dockerfile.supervisor +++ b/deploy/docker/Dockerfile.supervisor @@ -5,10 +5,10 @@ # Supervisor image build. # -# The final image is `scratch`: it only carries the static `openshell-sandbox` -# binary used by Docker extraction, Podman image volumes, and the Kubernetes -# init container copy-self path. A static musl binary lets the image stay -# `scratch` while still being executable as an init container. 
+# The final image carries the static `openshell-sandbox` binary used by Docker +# extraction, Podman image volumes, and the Kubernetes init container copy-self +# path. It also includes nftables so the sidecar-topology network init +# container can install pod-namespace egress enforcement rules. # # The Rust binary is built natively before this image build runs and staged at: # deploy/docker/.build/prebuilt-binaries//openshell-sandbox @@ -19,17 +19,16 @@ # target) and uploads it as an artifact, which is downloaded into the same # staging directory before the image build job runs. -FROM scratch AS supervisor +FROM alpine:3.22 AS supervisor ARG TARGETARCH -# --chmod=0550 drops world-execute and survives the actions/upload-artifact -# + download-artifact roundtrip (which strips exec perms). Ownership is left -# at root (0:0) deliberately: the Podman driver mounts this image as a -# read-only image volume into the sandbox container and drops DAC_OVERRIDE, -# so the container's UID 0 must own the binary to read+exec it. Mode 0550 -# (r-xr-x---) is the security win; the chown to a non-root UID was breaking -# Podman without buying anything since the container is always UID 0. -COPY --chmod=0550 deploy/docker/.build/prebuilt-binaries/${TARGETARCH}/openshell-sandbox /openshell-sandbox +RUN apk add --no-cache nftables + +# --chmod=0555 restores execute bits after the actions/upload-artifact + +# download-artifact roundtrip strips them. Ownership stays root (0:0) for +# Podman image-volume mounts, while world-execute lets the Kubernetes +# network sidecar run this binary as the dedicated non-root proxy UID. 
+COPY --chmod=0555 deploy/docker/.build/prebuilt-binaries/${TARGETARCH}/openshell-sandbox /openshell-sandbox ENTRYPOINT ["/openshell-sandbox"] diff --git a/deploy/helm/openshell/README.md b/deploy/helm/openshell/README.md index e6d539592..610837bdf 100644 --- a/deploy/helm/openshell/README.md +++ b/deploy/helm/openshell/README.md @@ -236,7 +236,9 @@ add `ci/values-spire.yaml` to the OpenShell release values files. | supervisor.image.pullPolicy | string | `""` | Supervisor image pull policy. Defaults to the gateway image pull policy when empty. | | supervisor.image.repository | string | `"ghcr.io/nvidia/openshell/supervisor"` | Supervisor image repository. | | supervisor.image.tag | string | `""` | Supervisor image tag. Defaults to the chart appVersion when empty. | +| supervisor.sidecarProxyUid | int | `1337` | UID for the long-running network sidecar in sidecar topology. The network init container installs nftables rules that exempt this UID. | | supervisor.sideloadMethod | string | `""` | How the supervisor binary is delivered into sandbox pods. Empty (default) = auto-detect from cluster version: K8s >= v1.35 -> "image-volume" (ImageVolume enabled by default; GA in v1.36) K8s < v1.35 -> "init-container" (copies via init container + emptyDir) On K8s v1.33-v1.34 with the ImageVolume feature gate manually enabled, set this to "image-volume" explicitly. | +| supervisor.topology | string | `"combined"` | Supervisor pod topology for Kubernetes sandboxes. "combined" runs the current single supervisor container in the agent pod. "sidecar" runs network enforcement in a dedicated sidecar and the process supervisor as a low-capability wrapper in the agent container. | | tolerations | list | `[]` | Tolerations for the gateway pod. | | workload.allowMultiReplicaStatefulSet | bool | `false` | Allow replicaCount > 1 while rendering a StatefulSet. 
Prefer workload.kind=deployment for external database-backed multi-replica gateways; this override exists for operators who explicitly require StatefulSet identity or storage semantics. | | workload.kind | string | `"statefulset"` | Gateway workload controller kind. Use `statefulset` for the default SQLite database, or `deployment` when server.externalDbSecret points at an external database. | diff --git a/deploy/helm/openshell/ci/values-sidecar.yaml b/deploy/helm/openshell/ci/values-sidecar.yaml new file mode 100644 index 000000000..dac9e810f --- /dev/null +++ b/deploy/helm/openshell/ci/values-sidecar.yaml @@ -0,0 +1,13 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 + +# CI/dev overlay for exercising the Kubernetes supervisor sidecar topology. +# +# Merge after values.yaml and ci/values-skaffold.yaml: +# helm install ... -f values.yaml -f ci/values-skaffold.yaml -f ci/values-sidecar.yaml +# +# Or set: +# OPENSHELL_E2E_KUBE_EXTRA_VALUES=deploy/helm/openshell/ci/values-sidecar.yaml +# before running `mise run e2e:kubernetes`. 
+supervisor: + topology: sidecar diff --git a/deploy/helm/openshell/skaffold.yaml b/deploy/helm/openshell/skaffold.yaml index 37a21fbac..19bd662f9 100644 --- a/deploy/helm/openshell/skaffold.yaml +++ b/deploy/helm/openshell/skaffold.yaml @@ -120,6 +120,8 @@ deploy: # To enable SPIFFE/SPIRE provider token grants (requires the # spire-crds and spire releases above): #- ci/values-spire.yaml + # To exercise the Kubernetes supervisor sidecar topology: + #- ci/values-sidecar.yaml # To test multi-replica external PostgreSQL behavior: #- ci/values-high-availability.yaml setValueTemplates: @@ -127,3 +129,9 @@ deploy: image.tag: '{{.IMAGE_TAG_openshell_gateway}}' supervisor.image.repository: '{{.IMAGE_REPO_openshell_supervisor}}' supervisor.image.tag: '{{.IMAGE_TAG_openshell_supervisor}}' +profiles: + - name: sidecar + patches: + - op: add + path: /deploy/helm/releases/0/valuesFiles/- + value: ci/values-sidecar.yaml diff --git a/deploy/helm/openshell/templates/gateway-config.yaml b/deploy/helm/openshell/templates/gateway-config.yaml index 7037be88f..755e8f524 100644 --- a/deploy/helm/openshell/templates/gateway-config.yaml +++ b/deploy/helm/openshell/templates/gateway-config.yaml @@ -113,6 +113,8 @@ data: grpc_endpoint = {{ include "openshell.grpcEndpoint" . | quote }} service_account_name = {{ include "openshell.sandboxServiceAccountName" . | quote }} supervisor_sideload_method = {{ include "openshell.supervisorSideloadMethod" . 
| quote }} + supervisor_topology = {{ .Values.supervisor.topology | default "combined" | quote }} + sidecar_proxy_uid = {{ .Values.supervisor.sidecarProxyUid | default 1337 }} sa_token_ttl_secs = {{ .Values.server.sandboxJwt.k8sSaTokenTtlSecs | default 3600 }} {{- if .Values.server.providerTokenGrants.spiffe.enabled }} provider_spiffe_workload_api_socket_path = {{ .Values.server.providerTokenGrants.spiffe.workloadApiSocketPath | quote }} diff --git a/deploy/helm/openshell/tests/gateway_config_test.yaml b/deploy/helm/openshell/tests/gateway_config_test.yaml index c2708a20f..ff7523863 100644 --- a/deploy/helm/openshell/tests/gateway_config_test.yaml +++ b/deploy/helm/openshell/tests/gateway_config_test.yaml @@ -83,6 +83,24 @@ tests: path: data["gateway.toml"] pattern: '(?ms)\[openshell\.drivers\.kubernetes\].*?service_account_name\s*=\s*"openshell-sandbox"' + - it: renders supervisor topology under [openshell.drivers.kubernetes] + template: templates/gateway-config.yaml + set: + supervisor.topology: sidecar + asserts: + - matchRegex: + path: data["gateway.toml"] + pattern: '(?ms)\[openshell\.drivers\.kubernetes\].*?supervisor_topology\s*=\s*"sidecar"' + + - it: renders sidecar proxy uid under [openshell.drivers.kubernetes] + template: templates/gateway-config.yaml + set: + supervisor.sidecarProxyUid: 2200 + asserts: + - matchRegex: + path: data["gateway.toml"] + pattern: '(?ms)\[openshell\.drivers\.kubernetes\].*?sidecar_proxy_uid\s*=\s*2200' + - it: renders sandbox image pull secrets under [openshell.drivers.kubernetes] template: templates/gateway-config.yaml set: diff --git a/deploy/helm/openshell/values.yaml b/deploy/helm/openshell/values.yaml index d7ff8b257..7a3723fe3 100644 --- a/deploy/helm/openshell/values.yaml +++ b/deploy/helm/openshell/values.yaml @@ -44,6 +44,14 @@ supervisor: # On K8s v1.33-v1.34 with the ImageVolume feature gate manually enabled, # set this to "image-volume" explicitly. 
sideloadMethod: "" + # -- Supervisor pod topology for Kubernetes sandboxes. + # "combined" runs the current single supervisor container in the agent pod. + # "sidecar" runs network enforcement in a dedicated sidecar and the process + # supervisor as a low-capability wrapper in the agent container. + topology: "combined" + # -- UID for the long-running network sidecar in sidecar topology. The + # network init container installs nftables rules that exempt this UID. + sidecarProxyUid: 1337 # -- Image pull secrets attached to gateway and helper pods. imagePullSecrets: [] diff --git a/docs/kubernetes/access-control.mdx b/docs/kubernetes/access-control.mdx index 8824b6de1..5409a4b11 100644 --- a/docs/kubernetes/access-control.mdx +++ b/docs/kubernetes/access-control.mdx @@ -5,7 +5,7 @@ title: "Access Control" sidebar-title: "Access Control" description: "Configure OIDC user authentication or reverse-proxy auth termination for a Kubernetes-deployed OpenShell gateway." keywords: "Generative AI, Cybersecurity, Kubernetes, Authentication, mTLS, OIDC, Keycloak, Entra ID, Okta, Gateway Auth" -position: 4 +position: 5 --- The OpenShell gateway supports two access-control models for human callers on Kubernetes: diff --git a/docs/kubernetes/ingress.mdx b/docs/kubernetes/ingress.mdx index a47637073..a572004fb 100644 --- a/docs/kubernetes/ingress.mdx +++ b/docs/kubernetes/ingress.mdx @@ -5,7 +5,7 @@ title: "Ingress" sidebar-title: "Ingress" description: "Expose the OpenShell gateway externally using the Kubernetes Gateway API and a GRPCRoute." keywords: "Generative AI, Cybersecurity, Kubernetes, Gateway API, Envoy Gateway, GRPCRoute, Ingress, External Access" -position: 3 +position: 4 --- By default, the OpenShell gateway is only reachable inside the cluster. To let CLI clients connect without a `kubectl port-forward`, expose the gateway through an ingress. 
diff --git a/docs/kubernetes/managing-certificates.mdx b/docs/kubernetes/managing-certificates.mdx index a445f77e8..da8c87eaf 100644 --- a/docs/kubernetes/managing-certificates.mdx +++ b/docs/kubernetes/managing-certificates.mdx @@ -5,7 +5,7 @@ title: "Managing Certificates" sidebar-title: "Managing Certificates" description: "Configure the OpenShell Helm chart to use cert-manager for mTLS certificate issuance and automatic renewal." keywords: "Generative AI, Cybersecurity, Kubernetes, cert-manager, PKI, TLS, mTLS, Certificates" -position: 2 +position: 3 --- The OpenShell gateway uses mTLS certificates for transport between the gateway and sandbox supervisors. These certificates are not Kubernetes user authentication; configure OIDC or a trusted access proxy for user access. The Helm chart supports two ways to provision and manage the certificate bundle: diff --git a/docs/kubernetes/openshift.mdx b/docs/kubernetes/openshift.mdx index b8313bdfe..caf799b51 100644 --- a/docs/kubernetes/openshift.mdx +++ b/docs/kubernetes/openshift.mdx @@ -5,7 +5,7 @@ title: "OpenShift" sidebar-title: "OpenShift" description: "Install the OpenShell Helm chart on OpenShift, including the SCC binding and chart overrides required by OpenShift's Security Context Constraints." keywords: "Generative AI, Cybersecurity, Kubernetes, OpenShift, SCC, Security Context Constraints, Helm, Gateway, Installation" -position: 5 +position: 6 --- diff --git a/docs/kubernetes/setup.mdx b/docs/kubernetes/setup.mdx index 5ab786519..d3355bfd1 100644 --- a/docs/kubernetes/setup.mdx +++ b/docs/kubernetes/setup.mdx @@ -160,6 +160,8 @@ The most commonly changed values are: | `server.enableLoopbackServiceHttp` | Enable local plaintext HTTP for loopback sandbox service URLs. Defaults to `true`. | | `pkiInitJob.serverDnsNames` / `certManager.serverDnsNames` | Additional gateway server DNS SANs. Wildcard SANs also enable sandbox service URLs under that domain. 
| | `supervisor.sideloadMethod` | How the supervisor binary is delivered into sandbox pods. Leave empty to auto-detect based on cluster version: clusters running Kubernetes 1.35 or later use `image-volume` (ImageVolume GA in 1.36); older clusters use `init-container`. Set explicitly to `image-volume` on Kubernetes 1.33 or 1.34 with the ImageVolume feature gate enabled, or to `init-container` to force the legacy path on any version. | +| `supervisor.topology` | Sandbox pod topology. Leave as `combined` for the original full-enforcement path, or set to `sidecar` when the agent container should run non-root without added Linux capabilities. Refer to [Topology](/kubernetes/topology). | +| `supervisor.sidecarProxyUid` | Non-root UID for the long-running network sidecar when `supervisor.topology=sidecar`. The UID must not match the sandbox UID. | Use a values file for repeatable deployments: @@ -243,6 +245,7 @@ The gateway exposes `/healthz` for process liveness and `/readyz` for dependency ## Next Steps +- To choose between combined and sidecar sandbox pods, refer to [Topology](/kubernetes/topology). - To enable automatic certificate rotation with cert-manager, refer to [Managing Certificates](/kubernetes/managing-certificates). - To expose the gateway externally without port-forwarding, refer to [Ingress](/kubernetes/ingress). - To configure OIDC or reverse-proxy authentication, refer to [Access Control](/kubernetes/access-control). diff --git a/docs/kubernetes/topology.mdx b/docs/kubernetes/topology.mdx new file mode 100644 index 000000000..467566f42 --- /dev/null +++ b/docs/kubernetes/topology.mdx @@ -0,0 +1,136 @@ +--- +# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +title: "Kubernetes Sandbox Topology" +sidebar-title: "Topology" +description: "Choose between combined and sidecar supervisor topology for Kubernetes sandbox pods." 
+keywords: "Generative AI, Cybersecurity, Kubernetes, Sandboxing, Sidecar, Network Policy, RuntimeClass" +position: 2 +--- + +Kubernetes sandbox pods can run the OpenShell supervisor in `combined` or +`sidecar` topology. Choose the topology based on which controls you need inside +the pod and how much privilege your cluster allows on the agent container. + +## Choose a Topology + +The default `combined` topology preserves the full OpenShell enforcement model. +Use `sidecar` only when you accept network-focused enforcement in exchange for a +lower-privilege agent container. + +| Topology | Use when | Main tradeoff | +|---|---|---| +| `combined` | You need OpenShell network, filesystem, and process controls in the sandbox workload. | The agent container carries the Linux capabilities the supervisor needs. | +| `sidecar` | You need the agent container to run as non-root without added Linux capabilities, and network policy is the primary control. | Filesystem policy, process privilege dropping, and process/binary identity checks are not applied by the process supervisor. | + +## Combined Topology + +Combined topology is the original Kubernetes mode and remains the default. The +agent container starts the OpenShell supervisor, and the supervisor launches the +workload after applying sandbox setup. + +Combined topology keeps these controls in one supervisor path: + +- Network endpoint and L7 policy enforcement. +- Filesystem policy enforcement. +- Process and binary identity checks. +- Privilege drop into the sandbox user. +- Gateway relay, SSH sessions, exec, and file sync. + +Because the supervisor performs network namespace setup and process/filesystem +controls from the agent container, Kubernetes grants that container elevated +Linux capabilities. Use this mode when you need the complete OpenShell sandbox +contract and your cluster policy permits those capabilities. 
+ +## Sidecar Topology + +Sidecar topology splits the supervisor into a network sidecar and a +low-privilege process supervisor in the agent container. + +The pod contains these OpenShell-managed pieces: + +| Component | Runs as | Purpose | +|---|---|---| +| Network init container | Root with setup capabilities | Installs pod-level nftables rules and prepares shared sidecar state. | +| Network sidecar | `supervisor.sidecarProxyUid` | Runs the proxy, enforces network policy, writes proxy TLS material, and forwards gateway traffic on loopback. | +| Agent container | Resolved sandbox UID/GID | Runs the process supervisor and launches the user workload. | + +In this topology, the agent container runs with `runAsNonRoot: true`, +`allowPrivilegeEscalation: false`, and `capabilities.drop: ["ALL"]`. The +long-running network sidecar also drops all Linux capabilities. The root init +container keeps the setup capabilities needed to configure pod networking. + +Sidecar mode preserves gateway session behavior, including SSH connectivity, +because the process supervisor still owns the session relay. The network sidecar +handles outbound enforcement and forwards the process supervisor's gateway +traffic to the real gateway endpoint. + + +Sidecar mode runs the process supervisor in network-only mode. OpenShell still +enforces network endpoint and L7 policy through the sidecar, but the process +supervisor does not apply Landlock filesystem policy, process privilege +dropping, or process/binary identity checks. + + +## Credential Exposure + +Sidecar topology uses pod `fsGroup` and group-readable projected credentials so +the non-root process supervisor can authenticate to the gateway. This includes +the projected ServiceAccount token used for sandbox token bootstrap and the +sandbox client TLS secret. + +Treat the agent container as trusted with respect to those in-pod gateway +credentials. 
Use `combined` topology when that credential exposure is not +acceptable for your deployment. + +## RuntimeClass Isolation + +Sidecar topology pairs well with runtime classes such as gVisor or Kata +Containers when the cluster supports them. A sandboxed runtime strengthens the +container boundary while OpenShell focuses on network policy enforcement from +the sidecar. + +Runtime classes do not re-enable the OpenShell filesystem and process controls +that sidecar mode relaxes. Use them as an additional workload boundary, not as a +replacement for the combined topology's full supervisor controls. + +You can set a default runtime class in the Kubernetes driver configuration or +override it per sandbox with driver config: + +```shell +openshell sandbox create \ + --driver-config-json '{"kubernetes":{"pod":{"runtime_class_name":"kata-containers"}}}' \ + -- claude +``` + +## Enable Sidecar Mode + +Set `supervisor.topology=sidecar` in the Helm chart values: + +```yaml +supervisor: + topology: sidecar + sidecarProxyUid: 1337 +``` + +`sidecarProxyUid` must be a non-root UID and must not match the sandbox UID. +The network init container exempts this UID from proxy redirection so the +sidecar can reach the gateway. + +For direct gateway TOML configuration, set the equivalent Kubernetes driver +fields: + +```toml +[openshell.drivers.kubernetes] +supervisor_topology = "sidecar" +sidecar_proxy_uid = 1337 +``` + +Leave `supervisor.topology` unset, or set it to `combined`, to keep the original +single-container supervisor path. + +## Next Steps + +- To install OpenShell on Kubernetes, refer to [Setup](/kubernetes/setup). +- To configure gateway authentication, refer to [Access Control](/kubernetes/access-control). +- To review the driver fields, refer to [Gateway Configuration File](/reference/gateway-config). 
diff --git a/docs/reference/gateway-config.mdx b/docs/reference/gateway-config.mdx index 2aaa6e7b0..7da846474 100644 --- a/docs/reference/gateway-config.mdx +++ b/docs/reference/gateway-config.mdx @@ -177,6 +177,15 @@ supervisor_image_pull_policy = "IfNotPresent" # Use the image volume on Kubernetes >= 1.35 (GA in 1.36); switch to "init-container" # on older clusters or where the ImageVolume feature gate is off. supervisor_sideload_method = "image-volume" +# "combined" runs the existing single supervisor container with full process, +# filesystem, and network enforcement in the agent container. "sidecar" moves +# pod-level network enforcement and gateway forwarding into a network sidecar; +# the agent container runs non-root with no added Linux capabilities and keeps +# SSH/session behavior, but filesystem and process enforcement are not applied. +supervisor_topology = "combined" +# UID used by the long-running network sidecar. In sidecar topology the +# network init container installs nftables rules that exempt this UID. +sidecar_proxy_uid = 1337 grpc_endpoint = "https://openshell-gateway.agents.svc:8080" ssh_socket_path = "/run/openshell/ssh.sock" client_tls_secret_name = "openshell-client-tls" @@ -194,6 +203,12 @@ sa_token_ttl_secs = 3600 # shared roots such as /run, /var, /tmp, and /etc are rejected. # Supervisor-to-gateway auth still uses gateway JWTs. provider_spiffe_workload_api_socket_path = "/spiffe-workload-api/spire-agent.sock" +# Explicit sandbox UID/GID for the supervisor container securityContext and +# PVC init container. When unset, the driver auto-detects from OpenShift SCC +# namespace annotations (openshift.io/sa.scc.uid-range) if present, falling +# back to 1000 on non-OpenShift clusters.
+# sandbox_uid = 1500 +# sandbox_gid = 1500 ``` ### Docker @@ -306,6 +321,9 @@ overlay_disk_mib = 4096 guest_tls_ca = "/var/lib/openshell/guest-tls/ca.pem" guest_tls_cert = "/var/lib/openshell/guest-tls/client.pem" guest_tls_key = "/var/lib/openshell/guest-tls/client-key.pem" +# Resolved sandbox UID/GID for the rootfs /etc/passwd entry. +# Defaults to 10001 when unset; matching GID is used if sandbox_gid is empty. +# sandbox_uid = 20001 ``` ### Extension Driver diff --git a/docs/reference/sandbox-compute-drivers.mdx b/docs/reference/sandbox-compute-drivers.mdx index 341d9e9f4..5151008e8 100644 --- a/docs/reference/sandbox-compute-drivers.mdx +++ b/docs/reference/sandbox-compute-drivers.mdx @@ -304,10 +304,27 @@ For maintainer-level implementation details, refer to the [Kubernetes driver REA | `supervisor_image` | `supervisor.image.repository` / `supervisor.image.tag` | Set the supervisor image that provides the `openshell-sandbox` binary. | | `supervisor_image_pull_policy` | `supervisor.image.pullPolicy` | Set the Kubernetes image pull policy for the supervisor image. | | `supervisor_sideload_method` | `supervisor.sideloadMethod` | How the supervisor binary is delivered into sandbox pods. Leave empty to auto-detect from cluster version. Set to `image-volume` to mount the supervisor OCI image directly as a volume (requires Kubernetes 1.33+ with the ImageVolume feature gate; GA in 1.36), or `init-container` to copy it through an init container on older clusters. | +| `supervisor_topology` | `supervisor.topology` | Set `combined` for the default single supervisor path, or `sidecar` to move pod-level network enforcement and gateway forwarding into a dedicated sidecar. | +| `sidecar_proxy_uid` | `supervisor.sidecarProxyUid` | UID used by the long-running network sidecar in `sidecar` topology. The network init container exempts this UID from proxy redirection. | | `app_armor_profile` | `server.appArmorProfile` | Set the sandbox agent container's AppArmor profile. 
Helm defaults this to `Unconfined` so AppArmor-enabled nodes do not block supervisor network namespace setup. Set the Helm value to an empty string to omit the field, or use `RuntimeDefault` or `Localhost/<profile-name>` for operator-managed profiles. | | `workspace_default_storage_size` | `server.workspaceDefaultStorageSize` | Set the default workspace PVC size for new sandboxes. | | `sa_token_ttl_secs` | `server.sandboxJwt.k8sSaTokenTtlSecs` | Set the projected ServiceAccount token TTL used for the bootstrap token exchange. | +In `combined` topology, the agent container carries the Linux capabilities +needed by the supervisor for network namespace setup, Landlock filesystem +policy, process privilege changes, and network policy enforcement. In `sidecar` +topology, the agent container runs as the resolved sandbox UID/GID with no added +Linux capabilities. A root init container performs the nftables setup, and the +long-running sidecar runs non-root with no added Linux capabilities. Sidecar +mode keeps gateway session and SSH behavior, but the process supervisor runs in +network-only mode: filesystem policy, process privilege dropping, and +process/binary identity checks are not applied by the process container. + +Sidecar mode uses pod `fsGroup` so the non-root process supervisor can read the +projected ServiceAccount token and sandbox client TLS secret required for +gateway authentication. Treat the workload container as trusted with respect to +those in-pod gateway credentials. + The Kubernetes driver creates namespaced `agents.x-k8s.io` `Sandbox` resources from the Kubernetes SIG Apps [agent-sandbox](https://github.com/kubernetes-sigs/agent-sandbox) project. It detects the served Sandbox API at runtime, caches the selected API version for the gateway process, and uses `v1beta1` when available before falling back to `v1alpha1`, so supported Agent Sandbox installations work without version-specific operator configuration.
The Agent Sandbox controller turns those resources into sandbox pods and related storage. If Agent Sandbox is upgraded in place, restart the OpenShell gateway after the controller and CRD rollout completes so the gateway can detect the served API versions again. @@ -315,3 +332,30 @@ If Agent Sandbox is upgraded in place, restart the OpenShell gateway after the c `Sandbox.spec.volumeClaimTemplates` is immutable after creation. To change storage configuration, delete the sandbox and create a new one with the updated spec. + +## Sandbox User Identity + +OpenShell accepts both the hardcoded username `"sandbox"` and numeric UIDs in `[1000, 2_000_000_000]` for the supervisor's process identity (the policy's `run_as_user` field). The driver resolves the UID at sandbox creation time and passes it to the supervisor via environment variables. + +### Kubernetes / OpenShift + +The Kubernetes driver auto-detects the sandbox UID from OpenShift SCC namespace annotations: + +- `openshift.io/sa.scc.uid-range` (format: `<start>/<length>`, e.g. `1000000000/10000`) provides the UID. +- `openshift.io/sa.scc.supplemental-groups` provides the GID when present; otherwise the resolved UID is used as the GID. +- On non-OpenShift clusters, or when annotations are absent, the driver falls back to `1000`. + +You can override autodetection with explicit `sandbox_uid` / `sandbox_gid` config in `[openshell.drivers.kubernetes]`. When set, the driver skips namespace annotation lookup entirely. + +The resolved UID/GID appear in: + +- Supervisor container environment variables (`OPENSHELL_SANDBOX_UID`, `OPENSHELL_SANDBOX_GID`) for direct kernel-level privilege dropping without `/etc/passwd` lookups. +- PVC init container `securityContext.runAsUser/runAsGroup/fsGroup` for workspace ownership operations. + +### VM Driver + +The VM driver injects the sandbox UID into the rootfs guest's `/etc/passwd`, `/etc/group`, and `/etc/gshadow` during rootfs preparation.
Default UID is `10001`; configure `sandbox_uid` in `[openshell.drivers.vm]` to use a different value. + +### Custom Images + +Custom sandbox images no longer need a baked-in `"sandbox"` user. If your image requires a passwd entry for tools like `sudo` or `ssh`, add one manually (e.g. `RUN useradd -m -u 1500 deploy`). The supervisor resolves the numeric UID directly via `setuid()` without needing `/etc/passwd`. diff --git a/e2e/with-kube-gateway.sh b/e2e/with-kube-gateway.sh index 47b8730dc..8ce1989da 100755 --- a/e2e/with-kube-gateway.sh +++ b/e2e/with-kube-gateway.sh @@ -393,6 +393,46 @@ require_cmd() { fi } +configure_fixture_container_engine() { + local selected_engine="" + + if [ -n "${CONTAINER_ENGINE:-}" ]; then + selected_engine="$(printf '%s' "${CONTAINER_ENGINE}" | tr '[:upper:]' '[:lower:]')" + case "${selected_engine}" in + docker|podman) + export CONTAINER_ENGINE="${selected_engine}" + return + ;; + *) + echo "ERROR: CONTAINER_ENGINE=${CONTAINER_ENGINE} is invalid; expected docker or podman" >&2 + exit 2 + ;; + esac + fi + + case "${KUBE_CONTEXT}" in + k3d-*) + selected_engine="docker" + ;; + kind-*) + case "$(printf '%s' "${KIND_EXPERIMENTAL_PROVIDER:-}" | tr '[:upper:]' '[:lower:]')" in + podman) + selected_engine="podman" + ;; + *) + selected_engine="docker" + ;; + esac + ;; + *) + return + ;; + esac + + export CONTAINER_ENGINE="${selected_engine}" + echo "Using ${CONTAINER_ENGINE} for Kubernetes e2e host-side fixture containers." 
+} + require_cmd helm require_cmd kubectl require_cmd curl @@ -423,6 +463,8 @@ else KUBE_CONTEXT="k3d-${CLUSTER_NAME}" fi +configure_fixture_container_engine + if [ -z "${OPENSHELL_E2E_KUBE_BUILD_IMAGES+x}" ]; then if [ "${CLUSTER_CREATED_BY_US}" = "1" ]; then OPENSHELL_E2E_KUBE_BUILD_IMAGES=1 diff --git a/examples/bring-your-own-container/Dockerfile b/examples/bring-your-own-container/Dockerfile index 61f283970..fc65bd695 100644 --- a/examples/bring-your-own-container/Dockerfile +++ b/examples/bring-your-own-container/Dockerfile @@ -14,15 +14,19 @@ RUN apt-get update && apt-get install -y --no-install-recommends \ curl iproute2 nftables \ && rm -rf /var/lib/apt/lists/* -# Create the sandbox user for non-root execution. -# Use a high UID range to avoid conflicts with host users when running without -# user namespace remapping (UID in container = UID on host). -RUN groupadd -g 1000660000 sandbox && \ - useradd -m -u 1000660000 -g sandbox sandbox +# The sandbox user is injected at runtime by the compute driver. +# Kubernetes: resolved from OpenShift SCC namespace annotations or explicit +# sandbox_uid config. VM: resolves to 10001 by default, configurable in +# gateway TOML. +# +# Images no longer need a baked-in "sandbox" user — numeric UIDs are accepted +# and the driver passes them directly to setuid()/chown() at sandbox start. +# If your image requires a passwd entry for tools like ssh or sudo, add one +# manually (e.g. RUN useradd -m -u 1500 deploy). -RUN install -d -o sandbox -g sandbox /sandbox +RUN install -d /sandbox WORKDIR /sandbox -COPY --chown=sandbox:sandbox app.py . +COPY app.py . 
EXPOSE 8080 diff --git a/tasks/helm.toml b/tasks/helm.toml index f25dadb09..433f04f32 100644 --- a/tasks/helm.toml +++ b/tasks/helm.toml @@ -55,16 +55,31 @@ description = "Run skaffold dev for deploy/helm/openshell (iterative deploy)" dir = "deploy/helm/openshell" run = "skaffold dev" +["helm:skaffold:dev:sidecar"] +description = "Run skaffold dev with the Kubernetes supervisor sidecar topology" +dir = "deploy/helm/openshell" +run = "skaffold dev -p sidecar" + ["helm:skaffold:run"] description = "Run skaffold run for deploy/helm/openshell (one-shot deploy)" dir = "deploy/helm/openshell" run = "skaffold run" +["helm:skaffold:run:sidecar"] +description = "Run skaffold run with the Kubernetes supervisor sidecar topology" +dir = "deploy/helm/openshell" +run = "skaffold run -p sidecar" + ["helm:skaffold:delete"] description = "Run skaffold delete for deploy/helm/openshell" dir = "deploy/helm/openshell" run = "skaffold delete" +["helm:skaffold:delete:sidecar"] +description = "Run skaffold delete for the Kubernetes supervisor sidecar topology" +dir = "deploy/helm/openshell" +run = "skaffold delete -p sidecar" + ["helm:skaffold:diagnose"] description = "Run skaffold diagnose for deploy/helm/openshell" dir = "deploy/helm/openshell" diff --git a/tasks/test.toml b/tasks/test.toml index 1d0f97856..20b243964 100644 --- a/tasks/test.toml +++ b/tasks/test.toml @@ -106,6 +106,11 @@ run = [ "AGENT_SANDBOX_VERSION=v0.4.6 e2e/rust/e2e-kubernetes.sh", ] +["e2e:kubernetes:sidecar"] +description = "Run Kubernetes e2e with the supervisor sidecar topology overlay" +env = { OPENSHELL_E2E_KUBE_EXTRA_VALUES = "deploy/helm/openshell/ci/values-sidecar.yaml" } +run = "e2e/rust/e2e-kubernetes.sh" + ["e2e:kubernetes:db"] description = "Run Kubernetes e2e with all database backend scenarios (SQLite and external PostgreSQL with existingSecret)" env = { OPENSHELL_E2E_KUBE_DB_SCENARIOS = "1" }