mirror of https://github.com/mkerrisk/man-pages
seccomp.2: New page documenting seccomp(2)
Combines documentation from prctl, in-kernel seccomp_filter.txt and dropper.c, along with details specific to the new system call. Signed-off-by: Kees Cook <keescook@chromium.org> Signed-off-by: Michael Kerrisk <mtk.manpages@gmail.com>
This commit is contained in:
parent
3924b70d3a
commit
e9519f4f28
|
@ -0,0 +1,400 @@
|
||||||
|
.\" Copyright (C) 2014 Kees Cook <keescook@chromium.org>
|
||||||
|
.\" and Copyright (C) 2012 Will Drewry <wad@chromium.org>
|
||||||
|
.\" and Copyright (C) 2008 Michael Kerrisk <mtk.manpages@gmail.com>
|
||||||
|
.\"
|
||||||
|
.\" %%%LICENSE_START(VERBATIM)
|
||||||
|
.\" Permission is granted to make and distribute verbatim copies of this
|
||||||
|
.\" manual provided the copyright notice and this permission notice are
|
||||||
|
.\" preserved on all copies.
|
||||||
|
.\"
|
||||||
|
.\" Permission is granted to copy and distribute modified versions of this
|
||||||
|
.\" manual under the conditions for verbatim copying, provided that the
|
||||||
|
.\" entire resulting derived work is distributed under the terms of a
|
||||||
|
.\" permission notice identical to this one.
|
||||||
|
.\"
|
||||||
|
.\" Since the Linux kernel and libraries are constantly changing, this
|
||||||
|
.\" manual page may be incorrect or out-of-date. The author(s) assume no
|
||||||
|
.\" responsibility for errors or omissions, or for damages resulting from
|
||||||
|
.\" the use of the information contained herein. The author(s) may not
|
||||||
|
.\" have taken the same level of care in the production of this manual,
|
||||||
|
.\" which is licensed free of charge, as they might when working
|
||||||
|
.\" professionally.
|
||||||
|
.\"
|
||||||
|
.\" Formatted or processed versions of this manual, if unaccompanied by
|
||||||
|
.\" the source, must acknowledge the copyright and authors of this work.
|
||||||
|
.\" %%%LICENSE_END
|
||||||
|
.\"
|
||||||
|
.TH SECCOMP 2 2014-06-23 "Linux" "Linux Programmer's Manual"
|
||||||
|
.SH NAME
|
||||||
|
seccomp \-
|
||||||
|
operate on Secure Computing state of the process
|
||||||
|
.SH SYNOPSIS
|
||||||
|
.nf
|
||||||
|
.B #include <linux/seccomp.h>
|
||||||
|
.B #include <linux/filter.h>
|
||||||
|
.B #include <linux/audit.h>
|
||||||
|
.B #include <linux/signal.h>
|
||||||
|
.B #include <sys/ptrace.h>
|
||||||
|
|
||||||
|
.BI "int seccomp(unsigned int " operation ", unsigned int " flags ,
|
||||||
|
.BI " void *" args );
|
||||||
|
.fi
|
||||||
|
.SH DESCRIPTION
|
||||||
|
The
|
||||||
|
.BR seccomp ()
|
||||||
|
system call operates on the Secure Computing (seccomp) state of the
|
||||||
|
current process.
|
||||||
|
|
||||||
|
Currently, Linux supports the following
|
||||||
|
.IR operation
|
||||||
|
values:
|
||||||
|
.TP
|
||||||
|
.BR SECCOMP_SET_MODE_STRICT
|
||||||
|
The only system calls that the calling thread is permitted to make are
|
||||||
|
.BR read (2),
|
||||||
|
.BR write (2),
|
||||||
|
.BR _exit (2),
|
||||||
|
and
|
||||||
|
.BR sigreturn (2).
|
||||||
|
Other system calls result in the delivery of a
|
||||||
|
.BR SIGKILL
|
||||||
|
signal. Strict secure computing mode is useful for number-crunching
|
||||||
|
applications that may need to execute untrusted byte code, perhaps
|
||||||
|
obtained by reading from a pipe or socket.
|
||||||
|
|
||||||
|
This operation is available only if the kernel is configured with
|
||||||
|
.BR CONFIG_SECCOMP
|
||||||
|
enabled.
|
||||||
|
|
||||||
|
The value of
|
||||||
|
.IR flags
|
||||||
|
must be 0, and
|
||||||
|
.IR args
|
||||||
|
must be NULL.
|
||||||
|
|
||||||
|
This operation is functionally identical to calling
|
||||||
|
.IR "prctl(PR_SET_SECCOMP,\ SECCOMP_MODE_STRICT)" .
|
||||||
|
.TP
|
||||||
|
.BR SECCOMP_SET_MODE_FILTER
|
||||||
|
The system calls allowed are defined by a pointer to a Berkeley Packet
|
||||||
|
Filter (BPF) passed via
|
||||||
|
.IR args .
|
||||||
|
This argument is a pointer to
|
||||||
|
.IR "struct\ sock_fprog" ;
|
||||||
|
it can be designed to filter arbitrary system calls and system call
|
||||||
|
arguments. If the filter is invalid, the call will fail, returning
|
||||||
|
.BR EINVAL
|
||||||
|
in
|
||||||
|
.IR errno .
|
||||||
|
|
||||||
|
If
|
||||||
|
.BR fork (2),
|
||||||
|
.BR clone (2),
|
||||||
|
or
|
||||||
|
.BR execve (2)
|
||||||
|
are allowed by the filter, any child processes will be constrained to
|
||||||
|
the same filters and system calls as the parent.
|
||||||
|
|
||||||
|
Prior to using this operation, the process must call
|
||||||
|
.IR "prctl(PR_SET_NO_NEW_PRIVS,\ 1)"
|
||||||
|
or run with
|
||||||
|
.BR CAP_SYS_ADMIN
|
||||||
|
privileges in its namespace. If neither of these is true, the call will fail
|
||||||
|
and return
|
||||||
|
.BR EACCES
|
||||||
|
in
|
||||||
|
.IR errno .
|
||||||
|
This requirement ensures that filter programs cannot be applied to child
|
||||||
|
processes with greater privileges than the process that installed them.
|
||||||
|
|
||||||
|
Additionally, if
|
||||||
|
.BR prctl (2)
|
||||||
|
or
|
||||||
|
.BR seccomp (2)
|
||||||
|
is allowed by the attached filter, additional filters may be layered on
|
||||||
|
which will increase evaluation time, but allow for further reduction of
|
||||||
|
the attack surface during execution of a process.
|
||||||
|
|
||||||
|
This operation is available only if the kernel is configured with
|
||||||
|
.BR CONFIG_SECCOMP_FILTER
|
||||||
|
enabled.
|
||||||
|
|
||||||
|
When
|
||||||
|
.IR flags
|
||||||
|
are 0, this operation is functionally identical to calling
|
||||||
|
.IR "prctl(PR_SET_SECCOMP,\ SECCOMP_MODE_FILTER,\ args)" .
|
||||||
|
|
||||||
|
The recognized
|
||||||
|
.IR flags
|
||||||
|
are:
|
||||||
|
.RS
|
||||||
|
.TP
|
||||||
|
.BR SECCOMP_FILTER_FLAG_TSYNC
|
||||||
|
When adding a new filter, synchronize all other threads of the current
|
||||||
|
process to the same seccomp filter tree. If any thread cannot do this,
|
||||||
|
the call will not attach the new seccomp filter, and will fail returning
|
||||||
|
the first thread ID found that cannot synchronize. Synchronization will
|
||||||
|
fail if another thread is in
|
||||||
|
.BR SECCOMP_MODE_STRICT
|
||||||
|
or if it has attached new seccomp filters to itself, diverging from the
|
||||||
|
calling thread's filter tree.
|
||||||
|
.RE
|
||||||
|
.SH FILTERS
|
||||||
|
When adding filters via
|
||||||
|
.BR SECCOMP_SET_MODE_FILTER ,
|
||||||
|
.IR args
|
||||||
|
points to a filter program:
|
||||||
|
|
||||||
|
.in +4n
|
||||||
|
.nf
|
||||||
|
struct sock_fprog {
|
||||||
|
unsigned short len; /* Number of BPF instructions */
|
||||||
|
struct sock_filter *filter;
|
||||||
|
};
|
||||||
|
.fi
|
||||||
|
.in
|
||||||
|
|
||||||
|
Each program must contain one or more BPF instructions:
|
||||||
|
|
||||||
|
.in +4n
|
||||||
|
.nf
|
||||||
|
struct sock_filter { /* Filter block */
|
||||||
|
__u16 code; /* Actual filter code */
|
||||||
|
__u8 jt; /* Jump true */
|
||||||
|
__u8 jf; /* Jump false */
|
||||||
|
__u32 k; /* Generic multiuse field */
|
||||||
|
};
|
||||||
|
.fi
|
||||||
|
.in
|
||||||
|
|
||||||
|
When executing the instructions, the BPF program executes over the
|
||||||
|
syscall information made available via:
|
||||||
|
|
||||||
|
.in +4n
|
||||||
|
.nf
|
||||||
|
struct seccomp_data {
|
||||||
|
int nr; /* system call number */
|
||||||
|
__u32 arch; /* AUDIT_ARCH_* value */
|
||||||
|
__u64 instruction_pointer; /* CPU instruction pointer */
|
||||||
|
__u64 args[6]; /* up to 6 system call arguments */
|
||||||
|
};
|
||||||
|
.fi
|
||||||
|
.in
|
||||||
|
|
||||||
|
A seccomp filter may return any of the following values. If multiple
|
||||||
|
filters exist, the return value for the evaluation of a given system
|
||||||
|
call will always use the highest-precedence value. (For example,
|
||||||
|
.BR SECCOMP_RET_KILL
|
||||||
|
will always take precedence.)
|
||||||
|
|
||||||
|
In precedence order, they are:
|
||||||
|
.TP
|
||||||
|
.BR SECCOMP_RET_KILL
|
||||||
|
Results in the task exiting immediately without executing the
|
||||||
|
system call. The exit status of the task (status & 0x7f) will
|
||||||
|
be
|
||||||
|
.BR SIGSYS ,
|
||||||
|
not
|
||||||
|
.BR SIGKILL .
|
||||||
|
.TP
|
||||||
|
.BR SECCOMP_RET_TRAP
|
||||||
|
Results in the kernel sending a
|
||||||
|
.BR SIGSYS
|
||||||
|
signal to the triggering task without executing the system call.
|
||||||
|
.IR siginfo\->si_call_addr
|
||||||
|
will show the address of the system call instruction, and
|
||||||
|
.IR siginfo\->si_syscall
|
||||||
|
and
|
||||||
|
.IR siginfo\->si_arch
|
||||||
|
will indicate which syscall was attempted. The program counter will be
|
||||||
|
as though the syscall happened (i.e. it will not point to the syscall
|
||||||
|
instruction). The return value register will contain an arch\-dependent
|
||||||
|
value; if resuming execution, set it to something sensible.
|
||||||
|
(The architecture dependency is because replacing it with
|
||||||
|
.BR ENOSYS
|
||||||
|
could overwrite some useful information.)
|
||||||
|
|
||||||
|
The
|
||||||
|
.BR SECCOMP_RET_DATA
|
||||||
|
portion of the return value will be passed as
|
||||||
|
.IR si_errno .
|
||||||
|
|
||||||
|
.BR SIGSYS
|
||||||
|
triggered by seccomp will have a
|
||||||
|
.IR si_code
|
||||||
|
of
|
||||||
|
.BR SYS_SECCOMP .
|
||||||
|
.TP
|
||||||
|
.BR SECCOMP_RET_ERRNO
|
||||||
|
Results in the lower 16 bits of the return value being passed
|
||||||
|
to userland as the
|
||||||
|
.IR errno
|
||||||
|
without executing the system call.
|
||||||
|
.TP
|
||||||
|
.BR SECCOMP_RET_TRACE
|
||||||
|
When returned, this value will cause the kernel to attempt to
|
||||||
|
notify a ptrace()-based tracer prior to executing the system
|
||||||
|
call. If there is no tracer present,
|
||||||
|
.BR ENOSYS
|
||||||
|
is returned to userland and the system call is not executed.
|
||||||
|
|
||||||
|
A tracer will be notified if it requests
|
||||||
|
.BR PTRACE_O_TRACESECCOMP
|
||||||
|
using
|
||||||
|
.IR ptrace(PTRACE_SETOPTIONS) .
|
||||||
|
The tracer will be notified of a
|
||||||
|
.BR PTRACE_EVENT_SECCOMP
|
||||||
|
and the
|
||||||
|
.BR SECCOMP_RET_DATA
|
||||||
|
portion of the BPF program return value will be available to the tracer
|
||||||
|
via
|
||||||
|
.BR PTRACE_GETEVENTMSG .
|
||||||
|
|
||||||
|
The tracer can skip the system call by changing the syscall number
|
||||||
|
to \-1. Alternatively, the tracer can change the system call
|
||||||
|
requested by changing the system call to a valid syscall number. If
|
||||||
|
the tracer asks to skip the system call, then the system call will
|
||||||
|
appear to return the value that the tracer puts in the return value
|
||||||
|
register.
|
||||||
|
|
||||||
|
The seccomp check will not be run again after the tracer is
|
||||||
|
notified. (This means that seccomp-based sandboxes MUST NOT
|
||||||
|
allow use of ptrace, even of other sandboxed processes, without
|
||||||
|
extreme care; ptracers can use this mechanism to escape.)
|
||||||
|
.TP
|
||||||
|
.BR SECCOMP_RET_ALLOW
|
||||||
|
Results in the system call being executed.
|
||||||
|
|
||||||
|
If multiple filters exist, the return value for the evaluation of a
|
||||||
|
given system call will always use the highest-precedence value.
|
||||||
|
|
||||||
|
Precedence is only determined using the
|
||||||
|
.BR SECCOMP_RET_ACTION
|
||||||
|
mask. When multiple filters return values of the same precedence,
|
||||||
|
only the
|
||||||
|
.BR SECCOMP_RET_DATA
|
||||||
|
from the most recently installed filter will be returned.
|
||||||
|
.SH RETURN VALUE
|
||||||
|
On success,
|
||||||
|
.BR seccomp ()
|
||||||
|
returns 0.
|
||||||
|
On error, if
|
||||||
|
.BR SECCOMP_FILTER_FLAG_TSYNC
|
||||||
|
was used, the return value is the thread ID that caused the
|
||||||
|
synchronization failure. On other errors, \-1 is returned, and
|
||||||
|
.IR errno
|
||||||
|
is set to indicate the cause of the error.
|
||||||
|
.SH ERRORS
|
||||||
|
.BR seccomp ()
|
||||||
|
can fail for the following reasons:
|
||||||
|
.TP
|
||||||
|
.BR EACCES
|
||||||
|
The caller did not have the
|
||||||
|
.BR CAP_SYS_ADMIN
|
||||||
|
capability, or had not set
|
||||||
|
.IR no_new_privs
|
||||||
|
before using
|
||||||
|
.BR SECCOMP_SET_MODE_FILTER .
|
||||||
|
.TP
|
||||||
|
.BR EFAULT
|
||||||
|
.IR args
|
||||||
|
was required to be a valid address.
|
||||||
|
.TP
|
||||||
|
.BR EINVAL
|
||||||
|
.IR operation
|
||||||
|
is unknown; or
|
||||||
|
.IR flags
|
||||||
|
are invalid for the given
|
||||||
|
.IR operation .
|
||||||
|
.TP
|
||||||
|
.BR ESRCH
|
||||||
|
Another thread caused a failure during thread sync, but its ID could not
|
||||||
|
be determined.
|
||||||
|
.SH VERSIONS
|
||||||
|
This system call first appeared in Linux 3.16.
|
||||||
|
.\" FIXME Add glibc version
|
||||||
|
.SH CONFORMING TO
|
||||||
|
This system call is a nonstandard Linux extension.
|
||||||
|
.SH NOTES
|
||||||
|
.BR seccomp ()
|
||||||
|
provides a superset of the functionality provided by
|
||||||
|
.IR PR_SET_SECCOMP
|
||||||
|
of
|
||||||
|
.BR prctl (2).
|
||||||
|
(Which does not support
|
||||||
|
.IR flags .)
|
||||||
|
.SH EXAMPLE
|
||||||
|
.nf
|
||||||
|
#include <errno.h>
#include <stddef.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <linux/audit.h>
#include <linux/filter.h>
#include <linux/seccomp.h>
#include <sys/prctl.h>
#include <sys/syscall.h>
|
||||||
|
|
||||||
|
/*
 * Install a seccomp filter that refuses a single system call.
 *
 * syscall_nr: number of the system call to refuse
 * arch:       AUDIT_ARCH_* value the filter is written for
 * error:      errno value handed back for the refused call; only the
 *             bits covered by SECCOMP_RET_DATA are kept
 *
 * The filter returns SECCOMP_RET_KILL when the architecture does not
 * match, SECCOMP_RET_ERRNO for the selected system call, and
 * SECCOMP_RET_ALLOW for every other system call.
 *
 * Returns EXIT_SUCCESS when the filter was attached. On failure, a
 * message is printed with perror() and EXIT_FAILURE is returned.
 */
static int install_filter(int syscall_nr, int arch, int error)
{
    struct sock_filter filter[] = {
        /* [0] Load architecture. */
        BPF_STMT(BPF_LD+BPF_W+BPF_ABS,
                 (offsetof(struct seccomp_data, arch))),
        /* [1] Jump forward 4 instructions (to [6]) on architecture mismatch. */
        BPF_JUMP(BPF_JMP+BPF_JEQ+BPF_K, arch, 0, 4),
        /* [2] Load syscall number. */
        BPF_STMT(BPF_LD+BPF_W+BPF_ABS,
                 (offsetof(struct seccomp_data, nr))),
        /* [3] Jump forward 1 instruction (to [5]) on syscall mismatch. */
        BPF_JUMP(BPF_JMP+BPF_JEQ+BPF_K, syscall_nr, 0, 1),
        /* [4] Matching arch and syscall: return specific errno. */
        BPF_STMT(BPF_RET+BPF_K,
                 SECCOMP_RET_ERRNO|(error & SECCOMP_RET_DATA)),
        /* [5] Destination of syscall mismatch: allow other syscalls. */
        BPF_STMT(BPF_RET+BPF_K, SECCOMP_RET_ALLOW),
        /* [6] Destination of arch mismatch: kill the task. */
        BPF_STMT(BPF_RET+BPF_K, SECCOMP_RET_KILL),
    };
    struct sock_fprog prog = {
        .len = (unsigned short)(sizeof(filter)/sizeof(filter[0])),
        .filter = filter,
    };

    /*
     * Go through syscall(2): none of the headers included by this
     * program declares a seccomp() wrapper function.
     */
    if (syscall(SYS_seccomp, SECCOMP_SET_MODE_FILTER, 0, &prog)) {
        perror("seccomp");
        return EXIT_FAILURE;
    }
    return EXIT_SUCCESS;
}
|
||||||
|
|
||||||
|
/*
 * refuse <syscall_nr> <arch> <errno> <prog> [<args>]
 *
 * Sets no_new_privs, installs a filter built from the first three
 * arguments, then executes <prog> with the remaining arguments.
 * Control only comes back from execv() when it fails, so every path
 * that returns from main() reports EXIT_FAILURE.
 */
int main(int argc, char **argv)
{
    int nr, arch, err;

    if (argc < 5) {
        fprintf(stderr, "Usage:\\n"
                "refuse <syscall_nr> <arch> <errno> <prog> [<args>]\\n"
                "Hint: AUDIT_ARCH_I386: 0x%X\\n"
                " AUDIT_ARCH_X86_64: 0x%X\\n"
                "\\n", AUDIT_ARCH_I386, AUDIT_ARCH_X86_64);
        return EXIT_FAILURE;
    }

    /* no_new_privs is set before the filter is installed. */
    if (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0) != 0) {
        perror("prctl");
        return EXIT_FAILURE;
    }

    /* Base 0 lets strtol() accept decimal, octal and hexadecimal input. */
    nr = strtol(argv[1], NULL, 0);
    arch = strtol(argv[2], NULL, 0);
    err = strtol(argv[3], NULL, 0);

    if (install_filter(nr, arch, err) != 0)
        return EXIT_FAILURE;

    /* argv[4] is both the path to execute and the new argv[0]. */
    execv(argv[4], &argv[4]);
    perror("execv");
    return EXIT_FAILURE;
}
|
||||||
|
.fi
|
||||||
|
.SH SEE ALSO
|
||||||
|
.ad l
|
||||||
|
.nh
|
||||||
|
.BR prctl (2),
|
||||||
|
.BR ptrace (2),
|
||||||
|
.BR signal (7),
|
||||||
|
.BR socket (7)
|
||||||
|
.ad
|
Loading…
Reference in New Issue