|
//! API bits for the Secure Computing facility in the Linux kernel, which allows //! processes to restrict access to the system call API. //! //! Seccomp started life with a single "strict" mode, which only allowed calls //! to read(2), write(2), _exit(2) and sigreturn(2). It turns out that this //! isn't that useful for general-purpose applications, and so a mode that //! utilizes user-supplied filters was added. //! //! Seccomp filters are classic BPF programs. Conceptually, a seccomp program //! is attached to the kernel and is executed on each syscall. The "packet" //! being validated is the `data` structure, and the verdict is an action that //! the kernel performs on the calling process. The actions are variations on a //! "pass" or "fail" result, where a pass allows the syscall to continue and a //! fail blocks the syscall and returns some sort of error value. See the full //! list of actions under ::RET for more information. Finally, only word-sized, //! absolute loads (`ld [k]`) are supported to read from the `data` structure. //! //! There are some issues with the filter API that have traditionally made //! writing them a pain: //! //! 1. Each CPU architecture supported by Linux has its own unique ABI and //! syscall API. It is not guaranteed that the syscall numbers and arguments //! are the same across architectures, or that they're even implemented. Thus, //! filters cannot be assumed to be portable without consulting documentation //! like syscalls(2) and testing on target hardware. This also requires //! checking the value of `data.arch` to make sure that a filter was compiled //! for the correct architecture. //! 2. Many syscalls take an `unsigned long` or `size_t` argument, the size of //! which is dependent on the ABI. Since BPF programs execute in a 32-bit //! machine, validation of 64-bit arguments necessitates two load-and-compare //! instructions for the upper and lower words. //! 3. A further wrinkle to the above is endianness. 
Unlike network packets, //! syscall data shares the endianness of the target machine. A filter //! compiled on a little-endian machine will not work on a big-endian one, //! and vice-versa. For example: Checking the upper 32-bits of `data.arg1` //! requires a load at `@offsetOf(data, "arg1") + 4` on little-endian systems //! and `@offsetOf(data, "arg1")` on big-endian systems. Endian-portable //! filters require adjusting these offsets at compile time, similar to how //! e.g. OpenSSH does[1]. //! 4. Syscalls with userspace implementations via the vDSO cannot be traced or //! filtered. The vDSO can be disabled or just ignored, which must be taken //! into account when writing filters. //! 5. Software libraries - especially dynamically loaded ones - tend to use //! more of the syscall API over time, thus filters must evolve with them. //! Static filters can result in reduced or even broken functionality when //! calling newer code from these libraries. This is known to happen with //! critical libraries like glibc[2]. //! //! Some of these issues can be mitigated with help from Zig and the standard //! library. Since the target CPU is known at compile time, the proper syscall //! numbers are mixed into the `os` namespace under `std.os.SYS` (see the code //! for `arch_bits` in `os/linux.zig`). Referencing an unimplemented syscall //! would be a compile error. Endian offsets can also be defined in a similar //! manner to the OpenSSH example: //! //! ```zig //! const offset = if (native_endian == .little) struct { //! pub const low = 0; //! pub const high = @sizeOf(u32); //! } else struct { //! pub const low = @sizeOf(u32); //! pub const high = 0; //! }; //! ``` //! //! Unfortunately, there is no easy solution for issue 5. The most reliable //! strategy is to keep testing; test newer Zig versions, different libcs, //! different distros, and design your filter to accommodate all of them. //! Alternatively, you could inject a filter at runtime. Since filters are //! 
preserved across execve(2), a filter could be set up before executing your //! program, without your program having any knowledge of this happening. This //! is the method used by systemd[3] and Cloudflare's sandbox library[4]. //! //! [1]: https://github.com/openssh/openssh-portable/blob/master/sandbox-seccomp-filter.c#L81 //! [2]: https://sourceware.org/legacy-ml/libc-alpha/2017-11/msg00246.html //! [3]: https://www.freedesktop.org/software/systemd/man/systemd.exec.html#SystemCallFilter= //! [4]: https://github.com/cloudflare/sandbox //! //! See Also //! - seccomp(2), seccomp_unotify(2) //! - https://www.kernel.org/doc/html/latest/userspace-api/seccomp_filter.html const IOCTL = @import("ioctl.zig"); |
MODESeccomp not in use. |
/// Seccomp modes used with the prctl(2) form `prctl(PR_SET_SECCOMP, mode)`.
pub const MODE = struct {
    /// Seccomp is not in use.
    pub const DISABLED = 0;

    /// The hard-coded filter is in effect.
    pub const STRICT = 1;

    /// A user-supplied filter is in effect.
    pub const FILTER = 2;
};
SET_MODE_STRICTAction values for seccomp BPF programs. The lower 16-bits are for optional return data. The upper 16-bits are ordered from least permissive values to most. |
// Operation codes for the seccomp(2) form `seccomp(operation, flags, args)`.
// The four values are consecutive, starting at zero.

/// Operation 0 of seccomp(2).
pub const SET_MODE_STRICT = 0;

/// Operation 1 of seccomp(2). Its `flags` argument takes `FILTER_FLAG` bits.
pub const SET_MODE_FILTER = 1;

/// Operation 2 of seccomp(2).
pub const GET_ACTION_AVAIL = 2;

/// Operation 3 of seccomp(2). Used together with the `notif_sizes` structure.
pub const GET_NOTIF_SIZES = 3;
FILTER_FLAGReturn an errno. |
/// Bitflags for the SET_MODE_FILTER operation.
///
/// Each constant occupies a single bit, so flags may be OR-ed together.
/// The per-flag summaries below paraphrase seccomp(2); consult that page for
/// the authoritative semantics and the kernel version that introduced each
/// flag.
pub const FILTER_FLAG = struct {
    /// Bit 0. Per seccomp(2): synchronize all threads of the calling process
    /// to the same filter tree.
    pub const TSYNC = 1 << 0;

    /// Bit 1. Per seccomp(2): log all filter return actions except ALLOW.
    pub const LOG = 1 << 1;

    /// Bit 2. Per seccomp(2): disable Speculative Store Bypass mitigation.
    pub const SPEC_ALLOW = 1 << 2;

    /// Bit 3. Per seccomp(2): return a new user-space notification file
    /// descriptor after the filter is installed.
    pub const NEW_LISTENER = 1 << 3;

    /// Bit 4. Per seccomp(2): when thread synchronization fails, report ESRCH
    /// instead of the ID of the offending thread.
    pub const TSYNC_ESRCH = 1 << 4;

    /// Bit 5. Per seccomp(2) (Linux 5.19 and later): once a notification has
    /// been received by the supervisor, the notifying process only responds
    /// to fatal signals while waiting for the reply.
    pub const WAIT_KILLABLE_RECV = 1 << 5;
};
RETSee seccomp_unotify(2). |
/// Verdicts a seccomp BPF program can return.
///
/// A verdict is a 32-bit value: the upper 16 bits select the action and the
/// lower 16 bits carry optional return data. Action values are ordered from
/// least permissive to most permissive.
pub const RET = struct {
    /// Kill the whole process.
    pub const KILL_PROCESS = 0x80000000;

    /// Kill only the calling thread.
    pub const KILL_THREAD = 0x00000000;

    /// Alias of `KILL_THREAD`.
    pub const KILL = KILL_THREAD;

    /// Disallow the syscall and force a SIGSYS.
    pub const TRAP = 0x00030000;

    /// Return an errno to the caller.
    pub const ERRNO = 0x00050000;

    /// Forward the syscall to a userspace supervisor, which makes the decision.
    pub const USER_NOTIF = 0x7fc00000;

    /// Pass the syscall to a tracer, or disallow it.
    pub const TRACE = 0x7ff00000;

    /// Allow the syscall after logging it.
    pub const LOG = 0x7ffc0000;

    /// Allow the syscall.
    pub const ALLOW = 0x7fff0000;

    // Masks that split a return value into its sections.

    /// Selects the complete upper 16 bits of a return value.
    pub const ACTION_FULL = 0xffff0000;

    /// Selects the upper 16 bits of a return value except the topmost bit.
    pub const ACTION = 0x7fff0000;

    /// Selects the lower 16 bits (the optional return data).
    pub const DATA = 0x0000ffff;
};
IOCTL_NOTIFThe return value for a spoofed syscall. |
/// ioctl(2) request codes for the seccomp user-notification mechanism.
///
/// Every code is produced by the `IOCTL` helpers from the same character
/// (`'!'`), a sequence number, and the type of the payload passed with the
/// request. See seccomp_unotify(2) for how each request is used.
pub const IOCTL_NOTIF = struct {
    // Character shared by all four request codes.
    const magic = '!';

    /// Request 0; payload is a `notif`.
    pub const RECV = IOCTL.IOWR(magic, 0, notif);

    /// Request 1; payload is a `notif_resp`.
    pub const SEND = IOCTL.IOWR(magic, 1, notif_resp);

    /// Request 2; payload is a `u64`.
    pub const ID_VALID = IOCTL.IOW(magic, 2, u64);

    /// Request 3; payload is a `notif_addfd`.
    pub const ADDFD = IOCTL.IOW(magic, 3, notif_addfd);
};
USER_NOTIF_FLAG_CONTINUE |
/// Flag bit (bit 0) telling the kernel that the supervisor allows the syscall
/// to continue.
pub const USER_NOTIF_FLAG_CONTINUE = 1 << 0;
ADDFD_FLAG |
/// Single-bit flags whose meaning is described in seccomp_unotify(2).
pub const ADDFD_FLAG = struct {
    /// Bit 0.
    pub const SETFD = 1 << 0;

    /// Bit 1.
    pub const SEND = 1 << 1;
};
data |
/// Description of a single system call, laid out with the C ABI.
pub const data = extern struct {
    /// System call number.
    nr: c_int,

    /// CPU architecture / system call convention in effect.
    /// Holds one of the values defined in `std.os.linux.AUDIT`.
    arch: u32,

    instruction_pointer: u64,

    // The six 64-bit argument slots, in order.
    arg0: u64,
    arg1: u64,
    arg2: u64,
    arg3: u64,
    arg4: u64,
    arg5: u64,
};
notif_sizes |
/// Structure sizes reported for the ::GET_NOTIF_SIZES command, which lets a
/// caller check whether the kernel's structures have changed.
pub const notif_sizes = extern struct {
    /// Size of the `notif` structure.
    notif: u16,

    /// Size of the `notif_resp` structure.
    notif_resp: u16,

    /// Size of the `data` structure.
    data: u16,
};
notif |
/// A notification record, laid out with the C ABI. It is the payload type of
/// the `IOCTL_NOTIF.RECV` request.
pub const notif = extern struct {
    /// Notification cookie; unique for each filter.
    id: u64,

    /// ID of the thread that triggered the notification.
    pid: u32,

    /// Event information bitmask. Currently set to zero.
    flags: u32,

    /// The current system call data.
    data: data,
};
notif_resp |
/// Decision payload sent by the supervisor process to the kernel.
pub const notif_resp = extern struct {
    /// The filter cookie.
    id: u64,

    /// Return value to report for a spoofed syscall.
    val: i64,

    /// Zero for a spoofed success, or a negative error number for a spoofed
    /// failure.
    @"error": i32,

    /// Decision bitmask: USER_NOTIF_FLAG_CONTINUE to allow the syscall, or
    /// zero to spoof the return values.
    flags: u32,
};
notif_addfd |
/// Payload type of the `IOCTL_NOTIF.ADDFD` request, laid out with the C ABI.
/// See seccomp_unotify(2) for the meaning of each field.
pub const notif_addfd = extern struct {
    id: u64,
    // NOTE(review): presumably takes `ADDFD_FLAG` bits; confirm against seccomp_unotify(2).
    flags: u32,
    srcfd: u32,
    newfd: u32,
    newfd_flags: u32,
};
Generated by zstd-live on 2025-08-13 02:35:12 UTC. |