2014-09-25 22:47:46 +00:00
|
|
|
.\" Copyright (C) 2014 Kees Cook <keescook@chromium.org>
|
|
|
|
.\" and Copyright (C) 2012 Will Drewry <wad@chromium.org>
|
2014-11-08 06:31:35 +00:00
|
|
|
.\" and Copyright (C) 2008, 2014 Michael Kerrisk <mtk.manpages@gmail.com>
|
2014-09-25 22:47:46 +00:00
|
|
|
.\"
|
|
|
|
.\" %%%LICENSE_START(VERBATIM)
|
|
|
|
.\" Permission is granted to make and distribute verbatim copies of this
|
|
|
|
.\" manual provided the copyright notice and this permission notice are
|
|
|
|
.\" preserved on all copies.
|
|
|
|
.\"
|
|
|
|
.\" Permission is granted to copy and distribute modified versions of this
|
|
|
|
.\" manual under the conditions for verbatim copying, provided that the
|
|
|
|
.\" entire resulting derived work is distributed under the terms of a
|
|
|
|
.\" permission notice identical to this one.
|
|
|
|
.\"
|
|
|
|
.\" Since the Linux kernel and libraries are constantly changing, this
|
|
|
|
.\" manual page may be incorrect or out-of-date. The author(s) assume no
|
|
|
|
.\" responsibility for errors or omissions, or for damages resulting from
|
|
|
|
.\" the use of the information contained herein. The author(s) may not
|
|
|
|
.\" have taken the same level of care in the production of this manual,
|
|
|
|
.\" which is licensed free of charge, as they might when working
|
|
|
|
.\" professionally.
|
|
|
|
.\"
|
|
|
|
.\" Formatted or processed versions of this manual, if unaccompanied by
|
|
|
|
.\" the source, must acknowledge the copyright and authors of this work.
|
|
|
|
.\" %%%LICENSE_END
|
|
|
|
.\"
|
memusage.1, memusagestat.1, pldd.1, accept.2, adjtimex.2, arch_prctl.2, bdflush.2, bpf.2, close.2, epoll_ctl.2, epoll_wait.2, execve.2, execveat.2, fanotify_init.2, fanotify_mark.2, fcntl.2, fsync.2, get_kernel_syms.2, getdomainname.2, getgroups.2, gethostname.2, getrandom.2, getrlimit.2, getrusage.2, getsid.2, getunwind.2, io_getevents.2, ioctl_fat.2, kexec_load.2, killpg.2, listxattr.2, lseek.2, madvise.2, memfd_create.2, mknod.2, mlock.2, modify_ldt.2, msgctl.2, msgget.2, msgop.2, readlink.2, readv.2, reboot.2, recvmmsg.2, rename.2, request_key.2, restart_syscall.2, sched_setaffinity.2, sched_setattr.2, sched_setparam.2, seccomp.2, select_tut.2, semctl.2, semget.2, semop.2, set_thread_area.2, seteuid.2, setgid.2, setpgid.2, setresuid.2, setreuid.2, setsid.2, setuid.2, shmctl.2, shmget.2, shmop.2, sigaction.2, sigprocmask.2, stat.2, symlink.2, syscall.2, sysctl.2, unlink.2, bindresvport.3, byteorder.3, dlopen.3, endian.3, error.3, ffs.3, fmemopen.3, getcwd.3, getlogin.3, getnetent.3, getprotoent.3, getservent.3, getumask.3, getutent.3, glob.3, isalpha.3, lio_listio.3, login.3, mbsinit.3, mbstowcs.3, mbtowc.3, mkstemp.3, nextup.3, ntp_gettime.3, posix_fallocate.3, posix_spawn.3, pthread_join.3, pthread_rwlockattr_setkind_np.3, random.3, rcmd.3, realpath.3, resolver.3, setjmp.3, setnetgrent.3, sigvec.3, strerror.3, strverscmp.3, system.3, toupper.3, towlower.3, towupper.3, wcstombs.3, wordexp.3, cciss.4, loop.4, mouse.4, random.4, core.5, group.5, hosts.5, resolv.conf.5, ascii.7, environ.7, epoll.7, glob.7, ip.7, mq_overview.7, packet.7, pipe.7, raw.7, sched.7, signal.7, socket.7, symlink.7, ld.so.8, sln.8: tstamp
Signed-off-by: Michael Kerrisk <mtk.manpages@gmail.com>
2016-10-08 10:53:47 +00:00
|
|
|
.TH SECCOMP 2 2016-10-08 "Linux" "Linux Programmer's Manual"
|
2014-09-25 22:47:46 +00:00
|
|
|
.SH NAME
|
2014-11-02 07:31:15 +00:00
|
|
|
seccomp \- operate on Secure Computing state of the process
|
2014-09-25 22:47:46 +00:00
|
|
|
.SH SYNOPSIS
|
|
|
|
.nf
|
|
|
|
.B #include <linux/seccomp.h>
|
|
|
|
.B #include <linux/filter.h>
|
|
|
|
.B #include <linux/audit.h>
|
|
|
|
.B #include <linux/signal.h>
|
|
|
|
.B #include <sys/ptrace.h>
|
2014-12-27 11:24:19 +00:00
|
|
|
.\" Kees Cook noted: Anything that uses SECCOMP_RET_TRACE returns will
|
|
|
|
.\" need <sys/ptrace.h>
|
2014-09-25 22:47:46 +00:00
|
|
|
|
2014-11-02 07:31:15 +00:00
|
|
|
.BI "int seccomp(unsigned int " operation ", unsigned int " flags \
|
|
|
|
", void *" args );
|
2014-09-25 22:47:46 +00:00
|
|
|
.fi
|
|
|
|
.SH DESCRIPTION
|
|
|
|
The
|
|
|
|
.BR seccomp ()
|
|
|
|
system call operates on the Secure Computing (seccomp) state of the
|
2014-11-02 07:31:15 +00:00
|
|
|
calling process.
|
2014-09-25 22:47:46 +00:00
|
|
|
|
|
|
|
Currently, Linux supports the following
|
|
|
|
.IR operation
|
|
|
|
values:
|
|
|
|
.TP
|
|
|
|
.BR SECCOMP_SET_MODE_STRICT
|
2014-12-27 11:24:19 +00:00
|
|
|
The only system calls that the calling thread is permitted to make are
|
2014-09-25 22:47:46 +00:00
|
|
|
.BR read (2),
|
|
|
|
.BR write (2),
|
2015-06-30 11:39:39 +00:00
|
|
|
.BR _exit (2)
|
|
|
|
(but not
|
|
|
|
.BR exit_group (2)),
|
2014-09-25 22:47:46 +00:00
|
|
|
and
|
|
|
|
.BR sigreturn (2).
|
|
|
|
Other system calls result in the delivery of a
|
|
|
|
.BR SIGKILL
|
2014-12-27 11:24:19 +00:00
|
|
|
signal.
|
2014-11-08 10:48:49 +00:00
|
|
|
Strict secure computing mode is useful for number-crunching
|
2014-09-25 22:47:46 +00:00
|
|
|
applications that may need to execute untrusted byte code, perhaps
|
|
|
|
obtained by reading from a pipe or socket.
|
|
|
|
|
2015-03-12 13:07:01 +00:00
|
|
|
Note that although the calling thread can no longer call
|
|
|
|
.BR sigprocmask (2),
|
|
|
|
it can use
|
|
|
|
.BR sigreturn (2)
|
|
|
|
to block all signals apart from
|
|
|
|
.BR SIGKILL
|
|
|
|
and
|
|
|
|
.BR SIGSTOP .
|
2015-03-22 19:18:02 +00:00
|
|
|
This means that
|
2015-03-12 13:07:01 +00:00
|
|
|
.BR alarm (2)
|
2015-03-22 19:18:02 +00:00
|
|
|
(for example) is not sufficient for restricting the process's execution time.
|
|
|
|
Instead, to reliably terminate the process,
|
|
|
|
.BR SIGKILL
|
|
|
|
must be used.
|
|
|
|
This can be done by using
|
2015-03-12 13:07:01 +00:00
|
|
|
.BR timer_create (2)
|
|
|
|
with
|
|
|
|
.BR SIGEV_SIGNAL
|
|
|
|
and
|
2015-03-22 19:18:02 +00:00
|
|
|
.IR sigev_signo
|
2015-03-12 13:07:01 +00:00
|
|
|
set to
|
2015-03-22 19:18:02 +00:00
|
|
|
.BR SIGKILL ,
|
|
|
|
or by using
|
2015-03-12 13:07:01 +00:00
|
|
|
.BR setrlimit (2)
|
|
|
|
to set the hard limit for
|
|
|
|
.BR RLIMIT_CPU .
|
|
|
|
|
2014-09-25 22:47:46 +00:00
|
|
|
This operation is available only if the kernel is configured with
|
|
|
|
.BR CONFIG_SECCOMP
|
|
|
|
enabled.
|
|
|
|
|
|
|
|
The value of
|
|
|
|
.IR flags
|
|
|
|
must be 0, and
|
|
|
|
.IR args
|
|
|
|
must be NULL.
|
|
|
|
|
2014-11-02 07:31:15 +00:00
|
|
|
This operation is functionally identical to the call:
|
|
|
|
|
|
|
|
prctl(PR_SET_SECCOMP, SECCOMP_MODE_STRICT);
|
2014-09-25 22:47:46 +00:00
|
|
|
.TP
|
|
|
|
.BR SECCOMP_SET_MODE_FILTER
|
|
|
|
The system calls allowed are defined by a pointer to a Berkeley Packet
|
|
|
|
Filter (BPF) passed via
|
|
|
|
.IR args .
|
2014-12-27 11:24:19 +00:00
|
|
|
This argument is a pointer to a
|
2014-09-25 22:47:46 +00:00
|
|
|
.IR "struct\ sock_fprog" ;
|
|
|
|
it can be designed to filter arbitrary system calls and system call
|
2014-11-02 07:31:15 +00:00
|
|
|
arguments.
|
2014-11-08 10:48:49 +00:00
|
|
|
If the filter is invalid,
|
|
|
|
.BR seccomp ()
|
|
|
|
fails, returning
|
2014-12-27 11:24:19 +00:00
|
|
|
.BR EINVAL
|
2014-09-25 22:47:46 +00:00
|
|
|
in
|
|
|
|
.IR errno .
|
|
|
|
|
|
|
|
If
|
2014-11-02 07:34:47 +00:00
|
|
|
.BR fork (2)
|
2014-09-25 22:47:46 +00:00
|
|
|
or
|
2014-11-02 07:34:47 +00:00
|
|
|
.BR clone (2)
|
|
|
|
is allowed by the filter, any child processes will be constrained to
|
2014-12-27 11:24:19 +00:00
|
|
|
the same system call filters as the parent.
|
2014-11-02 07:34:47 +00:00
|
|
|
If
|
|
|
|
.BR execve (2)
|
2014-12-27 11:24:19 +00:00
|
|
|
is allowed,
|
|
|
|
the existing filters will be preserved across a call to
|
2014-11-02 07:34:47 +00:00
|
|
|
.BR execve (2).
|
2014-09-25 22:47:46 +00:00
|
|
|
|
2014-11-02 08:40:24 +00:00
|
|
|
In order to use the
|
|
|
|
.BR SECCOMP_SET_MODE_FILTER
|
|
|
|
operation, either the caller must have the
|
2014-09-25 22:47:46 +00:00
|
|
|
.BR CAP_SYS_ADMIN
|
2016-06-29 15:44:55 +00:00
|
|
|
capability in its user namespace, or the thread must already have the
|
2014-12-27 11:24:19 +00:00
|
|
|
.I no_new_privs
|
|
|
|
bit set.
|
|
|
|
If that bit was not already set by an ancestor of this thread,
|
|
|
|
the thread must make the following call:
|
2014-11-02 08:40:24 +00:00
|
|
|
|
|
|
|
prctl(PR_SET_NO_NEW_PRIVS, 1);
|
|
|
|
|
|
|
|
Otherwise, the
|
|
|
|
.BR SECCOMP_SET_MODE_FILTER
|
|
|
|
operation will fail and return
|
2014-09-25 22:47:46 +00:00
|
|
|
.BR EACCES
|
|
|
|
in
|
|
|
|
.IR errno .
|
2014-12-26 07:01:13 +00:00
|
|
|
This requirement ensures that an unprivileged process cannot apply
|
|
|
|
a malicious filter and then invoke a set-user-ID or
|
|
|
|
other privileged program using
|
|
|
|
.BR execve (2),
|
2014-12-27 11:24:19 +00:00
|
|
|
thus potentially compromising that program.
|
|
|
|
(Such a malicious filter might, for example, cause an attempt to use
|
|
|
|
.BR setuid (2)
|
|
|
|
to set the caller's user IDs to non-zero values to instead
|
|
|
|
return 0 without actually making the system call.
|
|
|
|
Thus, the program might be tricked into retaining superuser privileges
|
|
|
|
in circumstances where it is possible to influence it to do
|
|
|
|
dangerous things because it did not actually drop privileges.)
|
2014-09-25 22:47:46 +00:00
|
|
|
|
2014-11-02 08:43:23 +00:00
|
|
|
If
|
2014-09-25 22:47:46 +00:00
|
|
|
.BR prctl (2)
|
|
|
|
or
|
arch_prctl.2, execveat.2, fanotify_mark.2, fcntl.2, fork.2, madvise.2, mknod.2, mmap.2, modify_ldt.2, mount.2, open.2, prctl.2, ptrace.2, restart_syscall.2, seccomp.2, semop.2, set_thread_area.2, symlink.2, umount.2, unlink.2, error.3, getnetent.3, getprotoent.3, getservent.3, getutent.3, glob.3, login.3, setjmp.3, setnetgrent.3, wordexp.3, epoll.7: Remove section number from page self reference
Fix places where pages refer to the function that they describe
and include a section number in that reference. Such references
cause some HTML-rendering tools to create self-references in the
page.
Signed-off-by: Michael Kerrisk <mtk.manpages@gmail.com>
2016-08-07 16:40:35 +00:00
|
|
|
.BR seccomp ()
|
2014-11-02 08:43:23 +00:00
|
|
|
is allowed by the attached filter, further filters may be added.
|
2014-11-02 15:31:31 +00:00
|
|
|
This will increase evaluation time, but allows for further reduction of
|
2014-12-27 11:24:19 +00:00
|
|
|
the attack surface during execution of a thread.
|
2014-09-25 22:47:46 +00:00
|
|
|
|
2014-11-02 07:31:15 +00:00
|
|
|
The
|
|
|
|
.BR SECCOMP_SET_MODE_FILTER
|
|
|
|
operation is available only if the kernel is configured with
|
2014-09-25 22:47:46 +00:00
|
|
|
.BR CONFIG_SECCOMP_FILTER
|
|
|
|
enabled.
|
|
|
|
|
|
|
|
When
|
|
|
|
.IR flags
|
2014-11-02 07:31:15 +00:00
|
|
|
is 0, this operation is functionally identical to the call:
|
|
|
|
|
|
|
|
prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, args);
|
2014-09-25 22:47:46 +00:00
|
|
|
|
|
|
|
The recognized
|
|
|
|
.IR flags
|
|
|
|
are:
|
|
|
|
.RS
|
|
|
|
.TP
|
|
|
|
.BR SECCOMP_FILTER_FLAG_TSYNC
|
2014-11-08 10:48:49 +00:00
|
|
|
When adding a new filter, synchronize all other threads of the calling
|
2014-11-02 07:31:15 +00:00
|
|
|
process to the same seccomp filter tree.
|
2014-12-26 07:01:13 +00:00
|
|
|
A "filter tree" is the ordered list of filters attached to a thread.
|
|
|
|
(Attaching identical filters in separate
|
|
|
|
.BR seccomp ()
|
|
|
|
calls results in different filters from this perspective.)
|
|
|
|
|
|
|
|
If any thread cannot synchronize to the same filter tree,
|
2014-11-02 07:31:15 +00:00
|
|
|
the call will not attach the new seccomp filter,
|
|
|
|
and will fail, returning the first thread ID found that cannot synchronize.
|
2014-12-27 11:24:19 +00:00
|
|
|
Synchronization will fail if another thread in the same process is in
|
2014-09-25 22:47:46 +00:00
|
|
|
.BR SECCOMP_MODE_STRICT
|
2014-11-02 07:31:15 +00:00
|
|
|
or if it has attached new seccomp filters to itself,
|
|
|
|
diverging from the calling thread's filter tree.
|
2014-09-25 22:47:46 +00:00
|
|
|
.RE
|
2014-12-30 11:35:21 +00:00
|
|
|
.SS Filters
|
2014-09-25 22:47:46 +00:00
|
|
|
When adding filters via
|
|
|
|
.BR SECCOMP_SET_MODE_FILTER ,
|
|
|
|
.IR args
|
|
|
|
points to a filter program:
|
|
|
|
|
|
|
|
.in +4n
|
|
|
|
.nf
|
|
|
|
struct sock_fprog {
|
2014-12-30 07:54:36 +00:00
|
|
|
unsigned short len; /* Number of BPF instructions */
|
|
|
|
struct sock_filter *filter; /* Pointer to array of
|
|
|
|
BPF instructions */
|
2014-09-25 22:47:46 +00:00
|
|
|
};
|
|
|
|
.fi
|
|
|
|
.in
|
|
|
|
|
|
|
|
Each program must contain one or more BPF instructions:
|
|
|
|
|
|
|
|
.in +4n
|
|
|
|
.nf
|
2014-12-30 07:54:36 +00:00
|
|
|
struct sock_filter { /* Filter block */
|
|
|
|
__u16 code; /* Actual filter code */
|
|
|
|
__u8 jt; /* Jump true */
|
|
|
|
__u8 jf; /* Jump false */
|
|
|
|
__u32 k; /* Generic multiuse field */
|
2014-09-25 22:47:46 +00:00
|
|
|
};
|
|
|
|
.fi
|
|
|
|
.in
|
|
|
|
|
2014-12-30 10:54:29 +00:00
|
|
|
When executing the instructions, the BPF program operates on the
|
|
|
|
system call information made available (i.e., use the
|
|
|
|
.BR BPF_ABS
|
2015-04-06 15:26:41 +00:00
|
|
|
addressing mode) as a (read-only)
|
|
|
|
.\" Quoting Kees Cook:
|
|
|
|
.\" If BPF even allows changing the data, it's not copied back to
|
|
|
|
.\" the syscall when it runs. Anything wanting to do things like
|
|
|
|
.\" that would need to use ptrace to catch the call and directly
|
|
|
|
.\" modify the registers before continuing with the call.
|
|
|
|
buffer of the following form:
|
2014-09-25 22:47:46 +00:00
|
|
|
|
|
|
|
.in +4n
|
|
|
|
.nf
|
|
|
|
struct seccomp_data {
|
2014-12-30 09:06:11 +00:00
|
|
|
int nr; /* System call number */
|
2014-12-30 09:05:42 +00:00
|
|
|
__u32 arch; /* AUDIT_ARCH_* value
|
|
|
|
(see <linux/audit.h>) */
|
2014-09-25 22:47:46 +00:00
|
|
|
__u64 instruction_pointer; /* CPU instruction pointer */
|
2014-12-30 07:54:36 +00:00
|
|
|
__u64 args[6]; /* Up to 6 system call arguments */
|
2014-09-25 22:47:46 +00:00
|
|
|
};
|
|
|
|
.fi
|
|
|
|
.in
|
|
|
|
|
2015-09-05 06:31:14 +00:00
|
|
|
Because numbering of system calls varies between architectures and
|
2015-06-30 11:28:10 +00:00
|
|
|
some architectures (e.g., x86-64) allow user-space code to use
|
2015-03-24 18:38:33 +00:00
|
|
|
the calling conventions of multiple architectures, it is usually
|
|
|
|
necessary to verify the value of the
|
|
|
|
.IR arch
|
|
|
|
field.
|
|
|
|
|
|
|
|
It is strongly recommended to use a whitelisting approach whenever
|
|
|
|
possible because such an approach is more robust and simple.
|
|
|
|
A blacklist will have to be updated whenever a potentially
|
2015-03-29 16:00:46 +00:00
|
|
|
dangerous system call is added (or a dangerous flag or option if those
|
2015-03-24 18:38:33 +00:00
|
|
|
are blacklisted), and it is often possible to alter the
|
|
|
|
representation of a value without altering its meaning, leading to
|
|
|
|
a blacklist bypass.
|
|
|
|
|
|
|
|
The
|
|
|
|
.IR arch
|
2015-03-29 16:00:46 +00:00
|
|
|
field is not unique for all calling conventions.
|
2015-06-30 11:28:10 +00:00
|
|
|
The x86-64 ABI and the x32 ABI both use
|
2015-03-24 18:38:33 +00:00
|
|
|
.BR AUDIT_ARCH_X86_64
|
|
|
|
as
|
|
|
|
.IR arch ,
|
2015-03-29 16:00:46 +00:00
|
|
|
and they run on the same processors.
|
|
|
|
Instead, the mask
|
2015-03-24 18:38:33 +00:00
|
|
|
.BR __X32_SYSCALL_BIT
|
|
|
|
is used on the system call number to tell the two ABIs apart.
|
2015-07-23 14:15:22 +00:00
|
|
|
.\" As noted by Dave Drysdale in a note at the end of
|
2015-06-30 13:35:19 +00:00
|
|
|
.\" https://lwn.net/Articles/604515/
|
|
|
|
.\" One additional detail to point out for the x32 ABI case:
|
|
|
|
.\" the syscall number gets a high bit set (__X32_SYSCALL_BIT),
|
|
|
|
.\" to mark it as an x32 call.
|
|
|
|
.\"
|
|
|
|
.\" If x32 support is included in the kernel, then __SYSCALL_MASK
|
|
|
|
.\" will have a value that is not all-ones, and this will trigger
|
|
|
|
.\" an extra instruction in system_call to mask off the extra bit,
|
2015-07-23 14:15:22 +00:00
|
|
|
.\" so that the syscall table indexing still works.
|
2015-06-30 13:35:19 +00:00
|
|
|
|
2015-03-24 18:38:33 +00:00
|
|
|
This means that in order to create a seccomp-based
|
2015-06-30 11:28:10 +00:00
|
|
|
blacklist for system calls performed through the x86-64 ABI,
|
2015-03-24 18:38:33 +00:00
|
|
|
it is necessary to not only check that
|
|
|
|
.IR arch
|
|
|
|
equals
|
|
|
|
.BR AUDIT_ARCH_X86_64 ,
|
2015-06-30 11:29:23 +00:00
|
|
|
but also to explicitly reject all system calls that contain
|
2015-03-24 18:38:33 +00:00
|
|
|
.BR __X32_SYSCALL_BIT
|
|
|
|
in
|
|
|
|
.IR nr .
|
|
|
|
|
2015-09-05 06:43:31 +00:00
|
|
|
The
|
|
|
|
.I instruction_pointer
|
|
|
|
field provides the address of the machine-language instruction that
|
|
|
|
performed the system call.
|
|
|
|
This might be useful in conjunction with the use of
|
|
|
|
.I /proc/[pid]/maps
|
|
|
|
to perform checks based on which region (mapping) of the program
|
|
|
|
made the system call.
|
|
|
|
(Probably, it is wise to lock down the
|
|
|
|
.BR mmap (2)
|
|
|
|
and
|
|
|
|
.BR mprotect (2)
|
|
|
|
system calls to prevent the program from subverting such checks.)
|
|
|
|
|
2015-03-24 18:38:33 +00:00
|
|
|
When checking values from
|
|
|
|
.IR args
|
|
|
|
against a blacklist, keep in mind that arguments are often
|
2015-03-29 16:00:46 +00:00
|
|
|
silently truncated before being processed, but after the seccomp check.
|
|
|
|
For example, this happens if the i386 ABI is used on an
|
2015-06-30 13:35:19 +00:00
|
|
|
x86-64 kernel: although the kernel will normally not look beyond
|
2015-03-24 18:38:33 +00:00
|
|
|
the 32 lowest bits of the arguments, the values of the full
|
2015-03-29 16:00:46 +00:00
|
|
|
64-bit registers will be present in the seccomp data.
|
2015-06-30 11:28:10 +00:00
|
|
|
A less surprising example is that if the x86-64 ABI is used to perform
|
2015-03-29 16:00:46 +00:00
|
|
|
a system call that takes an argument of type
|
|
|
|
.IR int ,
|
|
|
|
the more-significant half of the argument register is ignored by
|
|
|
|
the system call, but visible in the seccomp data.
|
2015-03-24 18:38:33 +00:00
|
|
|
|
2014-12-27 11:24:19 +00:00
|
|
|
A seccomp filter returns a 32-bit value consisting of two parts:
|
|
|
|
the most significant 16 bits
|
|
|
|
(corresponding to the mask defined by the constant
|
|
|
|
.BR SECCOMP_RET_ACTION )
|
|
|
|
contain one of the "action" values listed below;
|
|
|
|
the least significant 16 bits (defined by the constant
|
|
|
|
.BR SECCOMP_RET_DATA )
|
|
|
|
are "data" to be associated with this return value.
|
|
|
|
|
2015-09-05 06:50:59 +00:00
|
|
|
If multiple filters exist, they are \fIall\fP executed,
|
|
|
|
in reverse order of their addition to the filter tree\(emthat is,
|
|
|
|
the most recently installed filter is executed first.
|
2015-09-05 07:00:02 +00:00
|
|
|
(Note that all filters will be called
|
|
|
|
even if one of the earlier filters returns
|
|
|
|
.BR SECCOMP_RET_KILL .
|
|
|
|
This is done to simplify the kernel code and to provide a
|
|
|
|
tiny speed-up in the execution of sets of filters by
|
|
|
|
avoiding a check for this uncommon case.)
|
|
|
|
.\" From an Aug 2015 conversation with Kees Cook where I asked why *all*
|
|
|
|
.\" filters even if one of the early filters returns SECCOMP_RET_KILL:
|
|
|
|
.\"
|
|
|
|
.\" It's just because it would be an optimization that would only speed up
|
|
|
|
.\" the RET_KILL case, but it's the uncommon one and the one that doesn't
|
|
|
|
.\" benefit meaningfully from such a change (you need to kill the process
|
|
|
|
.\" really quickly?). We would speed up killing a program at the (albeit
|
|
|
|
.\" tiny) expense to all other filtered programs. Best to keep the filter
|
|
|
|
.\" execution logic clear, simple, and as fast as possible for all
|
|
|
|
.\" filters.
|
2014-12-27 11:24:19 +00:00
|
|
|
The return value for the evaluation of a given system call is the first-seen
|
|
|
|
.BR SECCOMP_RET_ACTION
|
|
|
|
value of highest precedence (along with its accompanying data)
|
|
|
|
returned by execution of all of the filters.
|
2014-09-25 22:47:46 +00:00
|
|
|
|
2014-12-27 11:24:19 +00:00
|
|
|
In decreasing order of precedence,
|
2014-11-02 13:56:10 +00:00
|
|
|
the values that may be returned by a seccomp filter are:
|
2014-09-25 22:47:46 +00:00
|
|
|
.TP
|
|
|
|
.BR SECCOMP_RET_KILL
|
2014-12-27 13:54:49 +00:00
|
|
|
This value results in the process exiting immediately
|
|
|
|
without executing the system call.
|
2014-12-27 11:24:19 +00:00
|
|
|
The process terminates as though killed by a
|
2014-11-02 13:59:31 +00:00
|
|
|
.B SIGSYS
|
|
|
|
signal
|
|
|
|
.RI ( not
|
|
|
|
.BR SIGKILL ).
|
2014-09-25 22:47:46 +00:00
|
|
|
.TP
|
|
|
|
.BR SECCOMP_RET_TRAP
|
2014-12-27 13:54:49 +00:00
|
|
|
This value results in the kernel sending a
|
2014-09-25 22:47:46 +00:00
|
|
|
.BR SIGSYS
|
2014-12-27 11:24:19 +00:00
|
|
|
signal to the triggering process without executing the system call.
|
|
|
|
Various fields will be set in the
|
|
|
|
.I siginfo_t
|
2014-12-30 07:59:58 +00:00
|
|
|
structure (see
|
|
|
|
.BR sigaction (2))
|
|
|
|
associated with the signal:
|
2014-12-30 08:38:57 +00:00
|
|
|
.RS
|
|
|
|
.IP * 3
|
|
|
|
.I si_signo
|
|
|
|
will contain
|
|
|
|
.BR SIGSYS .
|
|
|
|
.IP *
|
2014-12-30 07:59:58 +00:00
|
|
|
.IR si_call_addr
|
2014-12-30 08:38:57 +00:00
|
|
|
will show the address of the system call instruction.
|
|
|
|
.IP *
|
2014-12-30 07:59:58 +00:00
|
|
|
.IR si_syscall
|
2014-09-25 22:47:46 +00:00
|
|
|
and
|
2014-12-30 07:59:58 +00:00
|
|
|
.IR si_arch
|
2014-11-08 10:48:49 +00:00
|
|
|
will indicate which system call was attempted.
|
2014-12-30 08:38:57 +00:00
|
|
|
.IP *
|
|
|
|
.I si_code
|
|
|
|
will contain
|
|
|
|
.BR SYS_SECCOMP .
|
|
|
|
.IP *
|
|
|
|
.I si_errno
|
|
|
|
will contain the
|
|
|
|
.BR SECCOMP_RET_DATA
|
|
|
|
portion of the filter return value.
|
|
|
|
.RE
|
|
|
|
.IP
|
2014-11-02 07:31:15 +00:00
|
|
|
The program counter will be as though the system call happened
|
2014-11-08 10:48:49 +00:00
|
|
|
(i.e., it will not point to the system call instruction).
|
2014-11-02 07:31:15 +00:00
|
|
|
The return value register will contain an architecture\-dependent value;
|
2014-12-30 20:25:02 +00:00
|
|
|
if resuming execution, set it to something appropriate for the system call.
|
2015-01-07 12:20:02 +00:00
|
|
|
(The architecture dependency is because replacing it with
|
2014-09-25 22:47:46 +00:00
|
|
|
.BR ENOSYS
|
|
|
|
could overwrite some useful information.)
|
|
|
|
.TP
|
|
|
|
.BR SECCOMP_RET_ERRNO
|
2014-12-27 13:54:49 +00:00
|
|
|
This value results in the
|
2014-12-27 11:24:19 +00:00
|
|
|
.B SECCOMP_RET_DATA
|
|
|
|
portion of the filter's return value being passed to user space as the
|
2014-09-25 22:47:46 +00:00
|
|
|
.IR errno
|
2014-12-27 11:24:19 +00:00
|
|
|
value without executing the system call.
|
2014-09-25 22:47:46 +00:00
|
|
|
.TP
|
|
|
|
.BR SECCOMP_RET_TRACE
|
2014-11-08 10:48:49 +00:00
|
|
|
When returned, this value will cause the kernel to attempt to notify a
|
|
|
|
.BR ptrace (2)-based
|
|
|
|
tracer prior to executing the system call.
|
2014-11-02 07:31:15 +00:00
|
|
|
If there is no tracer present,
|
2014-11-02 14:04:25 +00:00
|
|
|
the system call is not executed and returns a failure status with
|
|
|
|
.I errno
|
|
|
|
set to
|
|
|
|
.BR ENOSYS .
|
2014-09-25 22:47:46 +00:00
|
|
|
|
|
|
|
A tracer will be notified if it requests
|
|
|
|
.BR PTRACE_O_TRACESECCOMP
|
|
|
|
using
|
|
|
|
.IR ptrace(PTRACE_SETOPTIONS) .
|
|
|
|
The tracer will be notified of a
|
|
|
|
.BR PTRACE_EVENT_SECCOMP
|
|
|
|
and the
|
|
|
|
.BR SECCOMP_RET_DATA
|
2014-12-27 11:24:19 +00:00
|
|
|
portion of the filter's return value will be available to the tracer via
|
2014-09-25 22:47:46 +00:00
|
|
|
.BR PTRACE_GETEVENTMSG .
|
|
|
|
|
2014-11-02 07:31:15 +00:00
|
|
|
The tracer can skip the system call by changing the system call number
|
|
|
|
to \-1.
|
|
|
|
Alternatively, the tracer can change the system call
|
2014-11-08 10:48:49 +00:00
|
|
|
requested by changing the system call to a valid system call number.
|
2014-11-02 07:31:15 +00:00
|
|
|
If the tracer asks to skip the system call, then the system call will
|
|
|
|
appear to return the value that the tracer puts in the return value register.
|
2014-09-25 22:47:46 +00:00
|
|
|
|
2016-11-05 20:40:36 +00:00
|
|
|
.\" This was changed in ce6526e8afa4.
|
|
|
|
.\" A related hole, using PTRACE_SYSCALL instead of SECCOMP_RET_TRACE, was
|
|
|
|
.\" changed in arch-specific commits, e.g. 93e35efb8de4 for X86 and
|
|
|
|
.\" 0f3912fd934c for ARM.
|
|
|
|
Before kernel 4.8, the seccomp check will not be run again after the tracer is
|
|
|
|
notified.
|
|
|
|
(This means that, on older kernels, seccomp-based sandboxes
|
2014-11-02 07:31:15 +00:00
|
|
|
.B "must not"
|
2014-11-08 10:48:49 +00:00
|
|
|
allow use of
|
|
|
|
.BR ptrace (2)\(emeven
|
|
|
|
of other
|
2014-11-02 07:31:15 +00:00
|
|
|
sandboxed processes\(emwithout extreme care;
|
2014-12-30 20:25:02 +00:00
|
|
|
ptracers can use this mechanism to escape from the seccomp sandbox.)
|
2014-09-25 22:47:46 +00:00
|
|
|
.TP
|
|
|
|
.BR SECCOMP_RET_ALLOW
|
2014-12-27 13:54:49 +00:00
|
|
|
This value results in the system call being executed.
|
2014-09-25 22:47:46 +00:00
|
|
|
.SH RETURN VALUE
|
|
|
|
On success,
|
|
|
|
.BR seccomp ()
|
|
|
|
returns 0.
|
|
|
|
On error, if
|
|
|
|
.BR SECCOMP_FILTER_FLAG_TSYNC
|
2014-11-02 07:31:15 +00:00
|
|
|
was used,
|
2014-12-27 13:54:21 +00:00
|
|
|
the return value is the ID of the thread
|
|
|
|
that caused the synchronization failure.
|
2014-12-27 13:54:49 +00:00
|
|
|
(This ID is a kernel thread ID of the type returned by
|
|
|
|
.BR clone (2)
|
|
|
|
and
|
2014-12-30 08:51:04 +00:00
|
|
|
.BR gettid (2).)
|
2014-11-02 07:31:15 +00:00
|
|
|
On other errors, \-1 is returned, and
|
2014-09-25 22:47:46 +00:00
|
|
|
.IR errno
|
|
|
|
is set to indicate the cause of the error.
|
|
|
|
.SH ERRORS
|
|
|
|
.BR seccomp ()
|
|
|
|
can fail for the following reasons:
|
|
|
|
.TP
|
|
|
|
.BR EACCES
|
2014-11-02 07:31:15 +00:00
|
|
|
The caller did not have the
|
2014-09-25 22:47:46 +00:00
|
|
|
.BR CAP_SYS_ADMIN
|
2016-06-29 15:44:55 +00:00
|
|
|
capability in its user namespace, or had not set
|
2014-09-25 22:47:46 +00:00
|
|
|
.IR no_new_privs
|
|
|
|
before using
|
|
|
|
.BR SECCOMP_SET_MODE_FILTER .
|
|
|
|
.TP
|
|
|
|
.BR EFAULT
|
|
|
|
.IR args
|
2014-12-30 08:51:58 +00:00
|
|
|
was not a valid address.
|
2014-09-25 22:47:46 +00:00
|
|
|
.TP
|
|
|
|
.BR EINVAL
|
|
|
|
.IR operation
|
|
|
|
is unknown; or
|
|
|
|
.IR flags
|
|
|
|
are invalid for the given
|
2014-12-30 11:03:40 +00:00
|
|
|
.IR operation .
|
2014-12-30 07:30:36 +00:00
|
|
|
.TP
|
|
|
|
.BR EINVAL
|
2014-12-30 11:05:48 +00:00
|
|
|
.I operation
|
|
|
|
included
|
|
|
|
.BR BPF_ABS ,
|
|
|
|
but the specified offset was not aligned to a 32-bit boundary or exceeded
|
|
|
|
.IR "sizeof(struct\ seccomp_data)" .
|
|
|
|
.TP
|
|
|
|
.BR EINVAL
|
2014-12-30 07:30:36 +00:00
|
|
|
.\" See kernel/seccomp.c::seccomp_may_assign_mode() in 3.18 sources
|
|
|
|
A secure computing mode has already been set, and
|
|
|
|
.I operation
|
|
|
|
differs from the existing setting.
|
|
|
|
.TP
|
|
|
|
.BR EINVAL
|
|
|
|
.\" See stub kernel/seccomp.c::seccomp_set_mode_filter() in 3.18 sources
|
|
|
|
.I operation
|
|
|
|
specified
|
|
|
|
.BR SECCOMP_SET_MODE_FILTER ,
|
|
|
|
but the kernel was not built with
|
|
|
|
.B CONFIG_SECCOMP_FILTER
|
|
|
|
enabled.
|
|
|
|
.TP
|
|
|
|
.BR EINVAL
|
|
|
|
.I operation
|
|
|
|
specified
|
|
|
|
.BR SECCOMP_SET_MODE_FILTER ,
|
|
|
|
but the filter program pointed to by
|
|
|
|
.I args
|
|
|
|
was not valid or the length of the filter program was zero or exceeded
|
|
|
|
.B BPF_MAXINSNS
|
|
|
|
(4096) instructions.
|
|
|
|
.TP
|
|
|
|
.BR ENOMEM
|
|
|
|
Out of memory.
|
|
|
|
.TP
|
|
|
|
.BR ENOMEM
|
|
|
|
.\" ENOMEM in kernel/seccomp.c::seccomp_attach_filter() in 3.18 sources
|
|
|
|
The total length of all filter programs attached
|
|
|
|
to the calling thread would exceed
|
|
|
|
.B MAX_INSNS_PER_PATH
|
|
|
|
(32768) instructions.
|
|
|
|
Note that for the purposes of calculating this limit,
|
2014-12-30 07:50:53 +00:00
|
|
|
each already existing filter program incurs an
|
|
|
|
overhead penalty of 4 instructions.
|
2014-09-25 22:47:46 +00:00
|
|
|
.TP
|
|
|
|
.BR ESRCH
|
|
|
|
Another thread caused a failure during thread sync, but its ID could not
|
|
|
|
be determined.
|
|
|
|
.SH VERSIONS
|
2014-11-08 10:48:49 +00:00
|
|
|
The
|
2015-01-16 06:55:21 +00:00
|
|
|
.BR seccomp ()
|
2014-11-08 10:48:49 +00:00
|
|
|
system call first appeared in Linux 3.17.
|
2014-12-27 11:24:19 +00:00
|
|
|
.\" FIXME . Add glibc version
|
2014-09-25 22:47:46 +00:00
|
|
|
.SH CONFORMING TO
|
2014-11-08 10:48:49 +00:00
|
|
|
The
|
2015-01-16 06:55:21 +00:00
|
|
|
.BR seccomp ()
|
2014-11-08 10:48:49 +00:00
|
|
|
system call is a nonstandard Linux extension.
|
2014-09-25 22:47:46 +00:00
|
|
|
.SH NOTES
|
2015-03-10 18:02:00 +00:00
|
|
|
Rather than hand-coding seccomp filters as shown in the example below,
|
|
|
|
you may prefer to employ the
|
|
|
|
.I libseccomp
|
|
|
|
library, which provides a front-end for generating seccomp filters.
|
|
|
|
|
2015-01-07 10:30:56 +00:00
|
|
|
The
|
|
|
|
.IR Seccomp
|
|
|
|
field of the
|
|
|
|
.IR /proc/[pid]/status
|
|
|
|
file provides a method of viewing the seccomp mode of a process; see
|
|
|
|
.BR proc (5).
|
|
|
|
|
2014-09-25 22:47:46 +00:00
|
|
|
.BR seccomp ()
|
2014-11-02 07:31:15 +00:00
|
|
|
provides a superset of the functionality provided by the
|
|
|
|
.BR prctl (2)
|
2014-11-02 16:11:15 +00:00
|
|
|
.BR PR_SET_SECCOMP
|
2014-11-02 07:31:15 +00:00
|
|
|
operation (which does not support
|
|
|
|
.IR flags ).
|
2016-11-08 12:40:52 +00:00
|
|
|
|
|
|
|
Since Linux 4.4, the
|
|
|
|
.BR ptrace (2)
|
|
|
|
.B PTRACE_SECCOMP_GET_FILTER
|
|
|
|
operation can be used to dump a process's seccomp filters.
|
|
|
|
.\"
|
2014-12-30 11:35:39 +00:00
|
|
|
.SS Seccomp-specific BPF details
|
|
|
|
Note the following BPF details specific to seccomp filters:
|
|
|
|
.IP * 3
|
|
|
|
The
|
|
|
|
.B BPF_H
|
|
|
|
and
|
|
|
|
.B BPF_B
|
|
|
|
size modifiers are not supported: all operations must load and store
|
|
|
|
(4-byte) words
|
|
|
|
.RB ( BPF_W ).
|
|
|
|
.IP *
|
|
|
|
To access the contents of the
|
|
|
|
.I seccomp_data
|
|
|
|
buffer, use the
|
|
|
|
.B BPF_ABS
|
|
|
|
addressing mode modifier.
|
|
|
|
.IP *
|
|
|
|
The
|
|
|
|
.B BPF_LEN
|
|
|
|
addressing mode modifier yields an immediate mode operand
|
|
|
|
whose value is the size of the
|
|
|
|
.IR seccomp_data
|
|
|
|
buffer.
|
2014-09-25 22:47:46 +00:00
|
|
|
.SH EXAMPLE
|
2014-11-08 06:31:35 +00:00
|
|
|
The program below accepts four or more arguments.
|
|
|
|
The first three arguments are a system call number,
|
|
|
|
a numeric architecture identifier, and an error number.
|
|
|
|
The program uses these values to construct a BPF filter
|
|
|
|
that is used at run time to perform the following checks:
|
|
|
|
.IP [1] 4
|
|
|
|
If the program is not running on the specified architecture,
|
|
|
|
the BPF filter causes system calls to fail with the error
|
|
|
|
.BR ENOSYS .
|
|
|
|
.IP [2]
|
|
|
|
If the program attempts to execute the system call with the specified number,
|
|
|
|
the BPF filter causes the system call to fail, with
|
|
|
|
.I errno
|
|
|
|
being set to the specified error number.
|
|
|
|
.PP
|
|
|
|
The remaining command-line arguments specify
|
|
|
|
the pathname and additional arguments of a program
|
|
|
|
that the example program should attempt to execute using
|
2015-03-10 09:47:22 +00:00
|
|
|
.BR execv (3)
|
2014-11-08 06:31:35 +00:00
|
|
|
(a library function that employs the
|
|
|
|
.BR execve (2)
|
|
|
|
system call).
|
|
|
|
Some example runs of the program are shown below.
|
|
|
|
|
|
|
|
First, we display the architecture that we are running on (x86-64)
|
|
|
|
and then construct a shell function that looks up system call
|
|
|
|
numbers on this architecture:
|
|
|
|
|
|
|
|
.nf
|
|
|
|
.in +4n
|
|
|
|
$ \fBuname -m\fP
|
|
|
|
x86_64
|
|
|
|
$ \fBsyscall_nr() {
|
|
|
|
cat /usr/src/linux/arch/x86/syscalls/syscall_64.tbl | \\
|
2014-12-30 20:25:02 +00:00
|
|
|
awk '$2 != "x32" && $3 == "'$1'" { print $1 }'
|
2014-11-08 06:31:35 +00:00
|
|
|
}\fP
|
|
|
|
.in
|
|
|
|
.fi
|
|
|
|
|
|
|
|
When the BPF filter rejects a system call (case [2] above),
|
|
|
|
it causes the system call to fail with the error number
|
|
|
|
specified on the command line.
|
|
|
|
In the experiments shown here, we'll use error number 99:
|
|
|
|
|
|
|
|
.nf
|
|
|
|
.in +4n
|
|
|
|
$ \fBerrno 99\fP
|
|
|
|
EADDRNOTAVAIL 99 Cannot assign requested address
|
|
|
|
.in
|
|
|
|
.fi
|
|
|
|
|
|
|
|
In the following example, we attempt to run the command
|
|
|
|
.BR whoami (1),
|
|
|
|
but the BPF filter rejects the
|
|
|
|
.BR execve (2)
|
|
|
|
system call, so that the command is not even executed:
|
|
|
|
|
|
|
|
.nf
|
|
|
|
.in +4n
|
|
|
|
$ \fBsyscall_nr execve\fP
|
2014-12-30 20:25:02 +00:00
|
|
|
59
|
2014-12-27 11:24:19 +00:00
|
|
|
$ \fB./a.out\fP
|
|
|
|
Usage: ./a.out <syscall_nr> <arch> <errno> <prog> [<args>]
|
|
|
|
Hint for <arch>: AUDIT_ARCH_I386: 0x40000003
|
|
|
|
AUDIT_ARCH_X86_64: 0xC000003E
|
2014-11-08 06:31:35 +00:00
|
|
|
$ \fB./a.out 59 0xC000003E 99 /bin/whoami\fP
|
|
|
|
execv: Cannot assign requested address
|
|
|
|
.in
|
|
|
|
.fi
|
|
|
|
|
|
|
|
In the next example, the BPF filter rejects the
|
|
|
|
.BR write (2)
|
|
|
|
system call, so that, although it is successfully started, the
|
|
|
|
.BR whoami (1)
|
|
|
|
command is not able to write output:
|
|
|
|
|
|
|
|
.nf
|
|
|
|
.in +4n
|
|
|
|
$ \fBsyscall_nr write\fP
|
|
|
|
1
|
|
|
|
$ \fB./a.out 1 0xC000003E 99 /bin/whoami\fP
|
|
|
|
.in
|
|
|
|
.fi
|
|
|
|
|
|
|
|
In the final example,
|
|
|
|
the BPF filter rejects a system call that is not used by the
|
|
|
|
.BR whoami (1)
|
|
|
|
command, so it is able to successfully execute and produce output:
|
|
|
|
|
|
|
|
.nf
|
|
|
|
.in +4n
|
|
|
|
$ \fBsyscall_nr preadv\fP
|
|
|
|
295
|
|
|
|
$ \fB./a.out 295 0xC000003E 99 /bin/whoami\fP
|
|
|
|
cecilia
|
|
|
|
.in
|
|
|
|
.fi
|
|
|
|
.SS Program source
|
2014-09-25 22:47:46 +00:00
|
|
|
.nf
|
|
|
|
#include <errno.h>
|
|
|
|
#include <stddef.h>
|
|
|
|
#include <stdio.h>
|
|
|
|
#include <stdlib.h>
|
|
|
|
#include <unistd.h>
|
|
|
|
#include <linux/audit.h>
|
|
|
|
#include <linux/filter.h>
|
|
|
|
#include <linux/seccomp.h>
|
|
|
|
#include <sys/prctl.h>
|
|
|
|
|
2015-03-24 18:38:33 +00:00
|
|
|
#define X32_SYSCALL_BIT 0x40000000
|
|
|
|
|
2014-11-02 07:53:40 +00:00
|
|
|
static int
|
2014-12-30 11:53:23 +00:00
|
|
|
install_filter(int syscall_nr, int t_arch, int f_errno)
|
2014-09-25 22:47:46 +00:00
|
|
|
{
|
2015-03-24 18:38:33 +00:00
|
|
|
unsigned int upper_nr_limit = 0xffffffff;
|
2015-06-30 11:28:42 +00:00
|
|
|
|
|
|
|
/* Assume that AUDIT_ARCH_X86_64 means the normal x86-64 ABI */
|
2015-03-24 18:38:33 +00:00
|
|
|
if (t_arch == AUDIT_ARCH_X86_64)
|
|
|
|
upper_nr_limit = X32_SYSCALL_BIT - 1;
|
|
|
|
|
2014-09-25 22:47:46 +00:00
|
|
|
struct sock_filter filter[] = {
|
2014-12-30 11:54:01 +00:00
|
|
|
/* [0] Load architecture from 'seccomp_data' buffer into
|
2014-12-30 11:52:18 +00:00
|
|
|
accumulator */
|
2014-12-30 20:25:02 +00:00
|
|
|
BPF_STMT(BPF_LD | BPF_W | BPF_ABS,
|
2014-09-25 22:47:46 +00:00
|
|
|
(offsetof(struct seccomp_data, arch))),
|
2014-11-02 07:53:40 +00:00
|
|
|
|
2015-03-24 18:38:33 +00:00
|
|
|
/* [1] Jump forward 5 instructions if architecture does not
|
2014-12-30 11:54:01 +00:00
|
|
|
match 't_arch' */
|
2015-03-24 18:38:33 +00:00
|
|
|
BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, t_arch, 0, 5),
|
2014-11-02 07:53:40 +00:00
|
|
|
|
2014-12-30 11:54:01 +00:00
|
|
|
/* [2] Load system call number from 'seccomp_data' buffer into
|
2014-12-30 11:52:18 +00:00
|
|
|
accumulator */
|
2014-12-30 20:25:02 +00:00
|
|
|
BPF_STMT(BPF_LD | BPF_W | BPF_ABS,
|
2014-09-25 22:47:46 +00:00
|
|
|
(offsetof(struct seccomp_data, nr))),
|
2014-11-02 07:53:40 +00:00
|
|
|
|
2015-06-30 11:28:10 +00:00
|
|
|
/* [3] Check ABI - only needed for x86-64 in blacklist use
|
2015-04-07 13:05:14 +00:00
|
|
|
cases. Use JGT instead of checking against the bit
|
|
|
|
mask to avoid having to reload the syscall number. */
|
2015-03-24 18:38:33 +00:00
|
|
|
BPF_JUMP(BPF_JMP | BPF_JGT | BPF_K, upper_nr_limit, 3, 0),
|
|
|
|
|
|
|
|
/* [4] Jump forward 1 instruction if system call number
|
2014-12-30 11:54:01 +00:00
|
|
|
does not match 'syscall_nr' */
|
2014-12-30 20:25:02 +00:00
|
|
|
BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, syscall_nr, 0, 1),
|
2014-11-02 07:53:40 +00:00
|
|
|
|
2015-03-24 18:38:33 +00:00
|
|
|
/* [5] Matching architecture and system call: don't execute
|
2014-12-30 11:53:23 +00:00
|
|
|
the system call, and return 'f_errno' in 'errno' */
|
2014-12-30 20:25:02 +00:00
|
|
|
BPF_STMT(BPF_RET | BPF_K,
|
2014-12-30 11:53:23 +00:00
|
|
|
SECCOMP_RET_ERRNO | (f_errno & SECCOMP_RET_DATA)),
|
2014-11-02 07:53:40 +00:00
|
|
|
|
2015-03-24 18:38:33 +00:00
|
|
|
/* [6] Destination of system call number mismatch: allow other
|
2014-11-08 10:48:49 +00:00
|
|
|
system calls */
|
2014-12-30 20:25:02 +00:00
|
|
|
BPF_STMT(BPF_RET | BPF_K, SECCOMP_RET_ALLOW),
|
2014-11-02 07:53:40 +00:00
|
|
|
|
2015-03-24 18:38:33 +00:00
|
|
|
/* [7] Destination of architecture or ABI mismatch: kill process */
|
2014-12-30 20:25:02 +00:00
|
|
|
BPF_STMT(BPF_RET | BPF_K, SECCOMP_RET_KILL),
|
2014-09-25 22:47:46 +00:00
|
|
|
};
|
2014-11-02 07:53:40 +00:00
|
|
|
|
2014-09-25 22:47:46 +00:00
|
|
|
struct sock_fprog prog = {
|
2014-11-02 07:53:40 +00:00
|
|
|
.len = (unsigned short) (sizeof(filter) / sizeof(filter[0])),
|
2014-09-25 22:47:46 +00:00
|
|
|
.filter = filter,
|
|
|
|
};
|
2014-11-02 07:53:40 +00:00
|
|
|
|
2014-09-25 22:47:46 +00:00
|
|
|
if (seccomp(SECCOMP_SET_MODE_FILTER, 0, &prog)) {
|
|
|
|
perror("seccomp");
|
2014-11-02 07:53:40 +00:00
|
|
|
return 1;
|
2014-09-25 22:47:46 +00:00
|
|
|
}
|
2014-11-02 07:53:40 +00:00
|
|
|
|
|
|
|
return 0;
|
2014-09-25 22:47:46 +00:00
|
|
|
}
|
|
|
|
|
2014-11-02 07:53:40 +00:00
|
|
|
int
|
|
|
|
main(int argc, char **argv)
|
2014-09-25 22:47:46 +00:00
|
|
|
{
|
|
|
|
if (argc < 5) {
|
2014-12-27 11:24:19 +00:00
|
|
|
fprintf(stderr, "Usage: "
|
|
|
|
"%s <syscall_nr> <arch> <errno> <prog> [<args>]\\n"
|
|
|
|
"Hint for <arch>: AUDIT_ARCH_I386: 0x%X\\n"
|
|
|
|
" AUDIT_ARCH_X86_64: 0x%X\\n"
|
|
|
|
"\\n", argv[0], AUDIT_ARCH_I386, AUDIT_ARCH_X86_64);
|
2014-11-02 07:53:40 +00:00
|
|
|
exit(EXIT_FAILURE);
|
2014-09-25 22:47:46 +00:00
|
|
|
}
|
2014-11-02 07:53:40 +00:00
|
|
|
|
2014-09-25 22:47:46 +00:00
|
|
|
if (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0)) {
|
|
|
|
perror("prctl");
|
2014-11-02 07:53:40 +00:00
|
|
|
exit(EXIT_FAILURE);
|
2014-09-25 22:47:46 +00:00
|
|
|
}
|
2014-11-02 07:53:40 +00:00
|
|
|
|
2014-09-25 22:47:46 +00:00
|
|
|
if (install_filter(strtol(argv[1], NULL, 0),
|
|
|
|
strtol(argv[2], NULL, 0),
|
|
|
|
strtol(argv[3], NULL, 0)))
|
2014-11-02 07:53:40 +00:00
|
|
|
exit(EXIT_FAILURE);
|
|
|
|
|
2014-09-25 22:47:46 +00:00
|
|
|
execv(argv[4], &argv[4]);
|
|
|
|
perror("execv");
|
2014-11-02 07:53:40 +00:00
|
|
|
exit(EXIT_FAILURE);
|
2014-09-25 22:47:46 +00:00
|
|
|
}
|
|
|
|
.fi
|
|
|
|
.SH SEE ALSO
|
2015-07-23 12:42:25 +00:00
|
|
|
.BR bpf (2),
|
2014-09-25 22:47:46 +00:00
|
|
|
.BR prctl (2),
|
|
|
|
.BR ptrace (2),
|
2015-01-18 06:26:17 +00:00
|
|
|
.BR sigaction (2),
|
2015-09-05 06:43:31 +00:00
|
|
|
.BR proc (5),
|
2014-09-25 22:47:46 +00:00
|
|
|
.BR signal (7),
|
|
|
|
.BR socket (7)
|
2014-11-02 15:48:07 +00:00
|
|
|
.sp
|
2015-08-08 08:32:18 +00:00
|
|
|
Various pages from the
|
|
|
|
.I libseccomp
|
|
|
|
library, including:
|
2015-08-08 11:44:20 +00:00
|
|
|
.BR scmp_sys_resolver (1),
|
2015-08-08 08:32:18 +00:00
|
|
|
.BR seccomp_init (3),
|
|
|
|
.BR seccomp_load (3),
|
|
|
|
.BR seccomp_rule_add (3),
|
|
|
|
and
|
|
|
|
.BR seccomp_export_bpf (3).
|
|
|
|
.sp
|
2014-12-30 09:48:48 +00:00
|
|
|
The kernel source files
|
|
|
|
.IR Documentation/networking/filter.txt
|
|
|
|
and
|
|
|
|
.IR Documentation/prctl/seccomp_filter.txt .
|
2014-12-30 09:47:36 +00:00
|
|
|
.sp
|
|
|
|
McCanne, S. and Jacobson, V. (1992)
|
|
|
|
.IR "The BSD Packet Filter: A New Architecture for User-level Packet Capture" ,
|
|
|
|
Proceedings of the USENIX Winter 1993 Conference
|
|
|
|
.UR http://www.tcpdump.org/papers/bpf-usenix93.pdf
|
|
|
|
.UE
|