2014-09-25 22:47:46 +00:00
|
|
|
.\" Copyright (C) 2014 Kees Cook <keescook@chromium.org>
|
|
|
|
.\" and Copyright (C) 2012 Will Drewry <wad@chromium.org>
|
2014-11-08 06:31:35 +00:00
|
|
|
.\" and Copyright (C) 2008, 2014 Michael Kerrisk <mtk.manpages@gmail.com>
|
2014-09-25 22:47:46 +00:00
|
|
|
.\"
|
|
|
|
.\" %%%LICENSE_START(VERBATIM)
|
|
|
|
.\" Permission is granted to make and distribute verbatim copies of this
|
|
|
|
.\" manual provided the copyright notice and this permission notice are
|
|
|
|
.\" preserved on all copies.
|
|
|
|
.\"
|
|
|
|
.\" Permission is granted to copy and distribute modified versions of this
|
|
|
|
.\" manual under the conditions for verbatim copying, provided that the
|
|
|
|
.\" entire resulting derived work is distributed under the terms of a
|
|
|
|
.\" permission notice identical to this one.
|
|
|
|
.\"
|
|
|
|
.\" Since the Linux kernel and libraries are constantly changing, this
|
|
|
|
.\" manual page may be incorrect or out-of-date. The author(s) assume no
|
|
|
|
.\" responsibility for errors or omissions, or for damages resulting from
|
|
|
|
.\" the use of the information contained herein. The author(s) may not
|
|
|
|
.\" have taken the same level of care in the production of this manual,
|
|
|
|
.\" which is licensed free of charge, as they might when working
|
|
|
|
.\" professionally.
|
|
|
|
.\"
|
|
|
|
.\" Formatted or processed versions of this manual, if unaccompanied by
|
|
|
|
.\" the source, must acknowledge the copyright and authors of this work.
|
|
|
|
.\" %%%LICENSE_END
|
|
|
|
.\"
|
|
|
|
.TH SECCOMP 2 2014-06-23 "Linux" "Linux Programmer's Manual"
|
|
|
|
.SH NAME
|
2014-11-02 07:31:15 +00:00
|
|
|
seccomp \- operate on Secure Computing state of the process
|
2014-09-25 22:47:46 +00:00
|
|
|
.SH SYNOPSIS
|
|
|
|
.nf
|
|
|
|
.B #include <linux/seccomp.h>
|
|
|
|
.B #include <linux/filter.h>
|
|
|
|
.B #include <linux/audit.h>
|
|
|
|
.B #include <linux/signal.h>
|
2014-11-08 10:22:10 +00:00
|
|
|
.\" FIXME Is sys/ptrace.h really required? It is not used in
|
|
|
|
.\" the example program below.
|
2014-09-25 22:47:46 +00:00
|
|
|
.B #include <sys/ptrace.h>
|
|
|
|
|
2014-11-02 07:31:15 +00:00
|
|
|
.BI "int seccomp(unsigned int " operation ", unsigned int " flags \
|
|
|
|
", void *" args );
|
2014-09-25 22:47:46 +00:00
|
|
|
.fi
|
|
|
|
.SH DESCRIPTION
|
|
|
|
The
|
|
|
|
.BR seccomp ()
|
|
|
|
system call operates on the Secure Computing (seccomp) state of the
|
2014-11-02 07:31:15 +00:00
|
|
|
calling process.
|
2014-11-02 15:33:41 +00:00
|
|
|
.\" FIXME: This page variously uses the terms "process", "thread" and "task".
|
|
|
|
.\" Probably only one of these (not "task"!) should be used in all
|
|
|
|
.\" cases. I suspect it should be "thread".
|
2014-09-25 22:47:46 +00:00
|
|
|
|
|
|
|
Currently, Linux supports the following
|
|
|
|
.IR operation
|
|
|
|
values:
|
|
|
|
.TP
|
|
|
|
.BR SECCOMP_SET_MODE_STRICT
|
2014-11-02 07:31:15 +00:00
|
|
|
The only system calls that the thread is permitted to make are
|
2014-09-25 22:47:46 +00:00
|
|
|
.BR read (2),
|
|
|
|
.BR write (2),
|
|
|
|
.BR _exit (2),
|
|
|
|
and
|
|
|
|
.BR sigreturn (2).
|
|
|
|
Other system calls result in the delivery of a
|
|
|
|
.BR SIGKILL
|
2014-11-08 10:48:49 +00:00
|
|
|
signal.
|
|
|
|
Strict secure computing mode is useful for number-crunching
|
2014-09-25 22:47:46 +00:00
|
|
|
applications that may need to execute untrusted byte code, perhaps
|
|
|
|
obtained by reading from a pipe or socket.
|
|
|
|
|
|
|
|
This operation is available only if the kernel is configured with
|
|
|
|
.BR CONFIG_SECCOMP
|
|
|
|
enabled.
|
|
|
|
|
|
|
|
The value of
|
|
|
|
.IR flags
|
|
|
|
must be 0, and
|
|
|
|
.IR args
|
|
|
|
must be NULL.
|
|
|
|
|
2014-11-02 07:31:15 +00:00
|
|
|
This operation is functionally identical to the call:
|
|
|
|
|
|
|
|
prctl(PR_SET_SECCOMP, SECCOMP_MODE_STRICT);
|
2014-09-25 22:47:46 +00:00
|
|
|
.TP
|
|
|
|
.BR SECCOMP_SET_MODE_FILTER
|
|
|
|
The system calls allowed are defined by a pointer to a Berkeley Packet
|
|
|
|
Filter (BPF) passed via
|
|
|
|
.IR args .
|
2014-11-08 10:48:49 +00:00
|
|
|
This argument is a pointer to a
|
2014-09-25 22:47:46 +00:00
|
|
|
.IR "struct\ sock_fprog" ;
|
|
|
|
it can be designed to filter arbitrary system calls and system call
|
2014-11-02 07:31:15 +00:00
|
|
|
arguments.
|
2014-11-08 10:48:49 +00:00
|
|
|
If the filter is invalid,
|
|
|
|
.BR seccomp ()
|
|
|
|
fails, returning
|
2014-09-25 22:47:46 +00:00
|
|
|
.BR EACCES
|
|
|
|
in
|
|
|
|
.IR errno .
|
|
|
|
|
2014-11-02 12:45:02 +00:00
|
|
|
.\" FIXME I (mtk) reworded the following paragraph substantially.
|
|
|
|
.\" Please check it.
|
2014-09-25 22:47:46 +00:00
|
|
|
If
|
2014-11-02 07:34:47 +00:00
|
|
|
.BR fork (2)
|
2014-09-25 22:47:46 +00:00
|
|
|
or
|
2014-11-02 07:34:47 +00:00
|
|
|
.BR clone (2)
|
|
|
|
is allowed by the filter, any child processes will be constrained to
|
2014-09-25 22:47:46 +00:00
|
|
|
the same filters and system calls as the parent.
|
2014-11-02 07:34:47 +00:00
|
|
|
If
|
|
|
|
.BR execve (2)
|
|
|
|
is allowed by the filter,
|
|
|
|
the filters and constraints on permitted system calls are preserved across an
|
|
|
|
.BR execve (2).
|
2014-09-25 22:47:46 +00:00
|
|
|
|
2014-11-02 12:45:02 +00:00
|
|
|
.\" FIXME I (mtk) reworded the following paragraph substantially.
|
|
|
|
.\" Please check it.
|
2014-11-02 08:40:24 +00:00
|
|
|
In order to use the
|
|
|
|
.BR SECCOMP_SET_MODE_FILTER
|
|
|
|
operation, either the caller must have the
|
2014-09-25 22:47:46 +00:00
|
|
|
.BR CAP_SYS_ADMIN
|
2014-11-02 08:40:24 +00:00
|
|
|
capability or the call must be preceded by the call:
|
|
|
|
|
|
|
|
prctl(PR_SET_NO_NEW_PRIVS, 1);
|
|
|
|
|
|
|
|
Otherwise, the
|
|
|
|
.BR SECCOMP_SET_MODE_FILTER
|
|
|
|
operation will fail and return
|
2014-09-25 22:47:46 +00:00
|
|
|
.BR EACCES
|
|
|
|
in
|
|
|
|
.IR errno .
|
|
|
|
This requirement ensures that filter programs cannot be applied to child
|
2014-11-02 15:36:00 +00:00
|
|
|
.\" FIXME What does "installed" in the following line mean?
|
2014-09-25 22:47:46 +00:00
|
|
|
processes with greater privileges than the process that installed them.
|
|
|
|
|
2014-11-02 08:43:23 +00:00
|
|
|
If
|
2014-09-25 22:47:46 +00:00
|
|
|
.BR prctl (2)
|
|
|
|
or
|
|
|
|
.BR seccomp (2)
|
2014-11-02 08:43:23 +00:00
|
|
|
is allowed by the attached filter, further filters may be added.
|
2014-11-02 15:31:31 +00:00
|
|
|
This will increase evaluation time, but allows for further reduction of
|
2014-09-25 22:47:46 +00:00
|
|
|
the attack surface during execution of a process.
|
|
|
|
|
2014-11-02 07:31:15 +00:00
|
|
|
The
|
|
|
|
.BR SECCOMP_SET_MODE_FILTER
|
|
|
|
operation is available only if the kernel is configured with
|
2014-09-25 22:47:46 +00:00
|
|
|
.BR CONFIG_SECCOMP_FILTER
|
|
|
|
enabled.
|
|
|
|
|
|
|
|
When
|
|
|
|
.IR flags
|
2014-11-02 07:31:15 +00:00
|
|
|
is 0, this operation is functionally identical to the call:
|
|
|
|
|
|
|
|
prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, args);
|
2014-09-25 22:47:46 +00:00
|
|
|
|
|
|
|
The recognized
|
|
|
|
.IR flags
|
|
|
|
are:
|
|
|
|
.RS
|
|
|
|
.TP
|
|
|
|
.BR SECCOMP_FILTER_FLAG_TSYNC
|
2014-11-08 10:48:49 +00:00
|
|
|
When adding a new filter, synchronize all other threads of the calling
|
2014-11-02 07:31:15 +00:00
|
|
|
process to the same seccomp filter tree.
|
2014-11-02 08:49:47 +00:00
|
|
|
.\" FIXME Nowhere in this page is the term "filter tree" defined.
|
2014-11-08 06:32:05 +00:00
|
|
|
.\" There should be a definition somewhere.
|
2014-11-02 08:49:47 +00:00
|
|
|
.\" Is it: "the set of filters attached to a thread"?
|
2014-11-02 07:31:15 +00:00
|
|
|
If any thread cannot do this,
|
|
|
|
the call will not attach the new seccomp filter,
|
|
|
|
and will fail, returning the first thread ID found that cannot synchronize.
|
|
|
|
Synchronization will fail if another thread is in
|
2014-09-25 22:47:46 +00:00
|
|
|
.BR SECCOMP_MODE_STRICT
|
2014-11-02 07:31:15 +00:00
|
|
|
or if it has attached new seccomp filters to itself,
|
|
|
|
diverging from the calling thread's filter tree.
|
2014-09-25 22:47:46 +00:00
|
|
|
.RE
|
|
|
|
.SH FILTERS
|
|
|
|
When adding filters via
|
|
|
|
.BR SECCOMP_SET_MODE_FILTER ,
|
|
|
|
.IR args
|
|
|
|
points to a filter program:
|
|
|
|
|
|
|
|
.in +4n
|
|
|
|
.nf
|
|
|
|
struct sock_fprog {
|
|
|
|
unsigned short len; /* Number of BPF instructions */
|
|
|
|
struct sock_filter *filter;
|
|
|
|
};
|
|
|
|
.fi
|
|
|
|
.in
|
|
|
|
|
|
|
|
Each program must contain one or more BPF instructions:
|
|
|
|
|
|
|
|
.in +4n
|
|
|
|
.nf
|
|
|
|
struct sock_filter { /* Filter block */
|
|
|
|
__u16 code; /* Actual filter code */
|
|
|
|
__u8 jt; /* Jump true */
|
|
|
|
__u8 jf; /* Jump false */
|
|
|
|
__u32 k; /* Generic multiuse field */
|
|
|
|
};
|
|
|
|
.fi
|
|
|
|
.in
|
|
|
|
|
|
|
|
When executing the instructions, the BPF program executes over the
|
2014-11-08 10:48:49 +00:00
|
|
|
system call information made available via:
|
2014-09-25 22:47:46 +00:00
|
|
|
|
|
|
|
.in +4n
|
|
|
|
.nf
|
|
|
|
struct seccomp_data {
|
|
|
|
int nr; /* system call number */
|
|
|
|
__u32 arch; /* AUDIT_ARCH_* value */
|
|
|
|
__u64 instruction_pointer; /* CPU instruction pointer */
|
|
|
|
__u64 args[6]; /* up to 6 system call arguments */
|
|
|
|
};
|
|
|
|
.fi
|
|
|
|
.in
|
|
|
|
|
2014-11-08 10:46:12 +00:00
|
|
|
.\" FIXME I find the next piece a little hard to understand, so,
|
|
|
|
.\" some questions:
|
|
|
|
.\" * If there are multiple filters, in what order are they executed?
|
|
|
|
.\" (The man page should probably detail the answer to this question.)
|
|
|
|
.\" * If there are multiple filters, are they all always executed?
|
|
|
|
.\" I assume not, but the notion that
|
|
|
|
.\" "the return value for the evaluation of a given system call
|
|
|
|
.\" will always use the value with the highest precedence"
|
|
|
|
.\" implies that even if one filter generates (say)
|
|
|
|
.\" SECCOMP_RET_ERRNO, then further filters may still be executed,
|
|
|
|
.\" including one that generates (say) the "higher priority"
|
|
|
|
.\" SECCOMP_RET_KILL condition.
|
|
|
|
.\" Can you clarify the above?
|
2014-11-02 13:50:39 +00:00
|
|
|
A seccomp filter returns one of the values listed below.
|
2014-11-02 07:31:15 +00:00
|
|
|
If multiple filters exist,
|
2014-11-08 10:46:12 +00:00
|
|
|
the return value for the evaluation of a given system call
|
|
|
|
will always use the value with the highest precedence.
|
2014-11-02 07:31:15 +00:00
|
|
|
(For example,
|
2014-09-25 22:47:46 +00:00
|
|
|
.BR SECCOMP_RET_KILL
|
|
|
|
will always take precedence.)
|
|
|
|
|
2014-11-02 13:56:10 +00:00
|
|
|
In decreasing order of precedence,
|
|
|
|
the values that may be returned by a seccomp filter are:
|
2014-09-25 22:47:46 +00:00
|
|
|
.TP
|
|
|
|
.BR SECCOMP_RET_KILL
|
2014-11-02 13:56:10 +00:00
|
|
|
Results in the task exiting immediately without executing the system call.
|
2014-11-02 13:59:31 +00:00
|
|
|
The task terminates as though killed by a
|
|
|
|
.B SIGSYS
|
|
|
|
signal
|
|
|
|
.RI ( not
|
|
|
|
.BR SIGKILL ).
|
2014-09-25 22:47:46 +00:00
|
|
|
.TP
|
|
|
|
.BR SECCOMP_RET_TRAP
|
|
|
|
Results in the kernel sending a
|
|
|
|
.BR SIGSYS
|
|
|
|
signal to the triggering task without executing the system call.
|
|
|
|
.IR siginfo\->si_call_addr
|
|
|
|
will show the address of the system call instruction, and
|
|
|
|
.IR siginfo\->si_syscall
|
|
|
|
and
|
|
|
|
.IR siginfo\->si_arch
|
2014-11-08 10:48:49 +00:00
|
|
|
will indicate which system call was attempted.
|
2014-11-02 07:31:15 +00:00
|
|
|
The program counter will be as though the system call happened
|
2014-11-08 10:48:49 +00:00
|
|
|
(i.e., it will not point to the system call instruction).
|
2014-11-02 07:31:15 +00:00
|
|
|
The return value register will contain an architecture\-dependent value;
|
|
|
|
if resuming execution, set it to something sensible.
|
2014-09-25 22:47:46 +00:00
|
|
|
(The architecture dependency is because replacing it with
|
|
|
|
.BR ENOSYS
|
|
|
|
could overwrite some useful information.)
|
|
|
|
|
2014-11-08 10:56:00 +00:00
|
|
|
.\" FIXME The following sentence is the first time that SECCOMP_RET_DATA
|
|
|
|
.\" is mentioned. SECCOMP_RET_DATA needs to be described in this
|
|
|
|
.\" man page.
|
2014-09-25 22:47:46 +00:00
|
|
|
The
|
|
|
|
.BR SECCOMP_RET_DATA
|
|
|
|
portion of the return value will be passed as
|
|
|
|
.IR si_errno .
|
|
|
|
|
|
|
|
.BR SIGSYS
|
2014-11-08 10:48:49 +00:00
|
|
|
triggered by seccomp will have the value
|
2014-11-02 14:03:08 +00:00
|
|
|
.BR SYS_SECCOMP
|
|
|
|
in the
|
2014-09-25 22:47:46 +00:00
|
|
|
.IR si_code
|
2014-11-02 14:03:08 +00:00
|
|
|
field.
|
2014-09-25 22:47:46 +00:00
|
|
|
.TP
|
|
|
|
.BR SECCOMP_RET_ERRNO
|
2014-11-08 10:52:39 +00:00
|
|
|
.\" FIXME What does "the return value" refer to in the next sentence?
|
|
|
|
.\" It is not obvious to me.
|
2014-09-25 22:47:46 +00:00
|
|
|
Results in the lower 16 bits of the return value being passed
|
2014-11-08 10:48:49 +00:00
|
|
|
to user space as the
|
2014-09-25 22:47:46 +00:00
|
|
|
.IR errno
|
|
|
|
without executing the system call.
|
|
|
|
.TP
|
|
|
|
.BR SECCOMP_RET_TRACE
|
2014-11-08 10:48:49 +00:00
|
|
|
When returned, this value will cause the kernel to attempt to notify a
|
|
|
|
.BR ptrace (2)-based
|
|
|
|
tracer prior to executing the system call.
|
2014-11-02 14:04:25 +00:00
|
|
|
.\" FIXME I (mtk) reworded the following sentence substantially.
|
|
|
|
.\" Please check it.
|
2014-11-02 07:31:15 +00:00
|
|
|
If there is no tracer present,
|
2014-11-02 14:04:25 +00:00
|
|
|
the system call is not executed and returns a failure status with
|
|
|
|
.I errno
|
|
|
|
set to
|
|
|
|
.BR ENOSYS .
|
2014-09-25 22:47:46 +00:00
|
|
|
|
|
|
|
A tracer will be notified if it requests
|
|
|
|
.BR PTRACE_O_TRACESECCOMP
|
|
|
|
using
|
|
|
|
.IR ptrace(PTRACE_SETOPTIONS) .
|
|
|
|
The tracer will be notified of a
|
|
|
|
.BR PTRACE_EVENT_SECCOMP
|
|
|
|
and the
|
|
|
|
.BR SECCOMP_RET_DATA
|
|
|
|
portion of the BPF program return value will be available to the tracer
|
|
|
|
via
|
|
|
|
.BR PTRACE_GETEVENTMSG .
|
|
|
|
|
2014-11-02 07:31:15 +00:00
|
|
|
The tracer can skip the system call by changing the system call number
|
|
|
|
to \-1.
|
|
|
|
Alternatively, the tracer can change the system call
|
2014-11-08 10:48:49 +00:00
|
|
|
requested by changing the system call to a valid system call number.
|
2014-11-02 07:31:15 +00:00
|
|
|
If the tracer asks to skip the system call, then the system call will
|
|
|
|
appear to return the value that the tracer puts in the return value register.
|
2014-09-25 22:47:46 +00:00
|
|
|
|
2014-11-02 07:31:15 +00:00
|
|
|
The seccomp check will not be run again after the tracer is notified.
|
|
|
|
(This means that seccomp-based sandboxes
|
|
|
|
.B "must not"
|
2014-11-08 10:48:49 +00:00
|
|
|
allow use of
|
|
|
|
.BR ptrace (2)\(emeven
|
|
|
|
of other
|
2014-11-02 07:31:15 +00:00
|
|
|
sandboxed processes\(emwithout extreme care;
|
|
|
|
ptracers can use this mechanism to escape.)
|
2014-09-25 22:47:46 +00:00
|
|
|
.TP
|
|
|
|
.BR SECCOMP_RET_ALLOW
|
|
|
|
Results in the system call being executed.
|
2014-11-08 10:48:49 +00:00
|
|
|
.PP
|
2014-09-25 22:47:46 +00:00
|
|
|
If multiple filters exist, the return value for the evaluation of a
|
|
|
|
given system call will always use the value with the highest precedence.
|
|
|
|
|
2014-11-08 10:48:49 +00:00
|
|
|
.\" FIXME The following sentence is the first time that SECCOMP_RET_ACTION
|
|
|
|
.\" is mentioned. SECCOMP_RET_ACTION needs to be described in this
|
|
|
|
.\" man page.
|
|
|
|
Precedence is determined using only the
|
2014-09-25 22:47:46 +00:00
|
|
|
.BR SECCOMP_RET_ACTION
|
2014-11-08 10:48:49 +00:00
|
|
|
mask.
|
|
|
|
When multiple filters return values of the same precedence,
|
2014-09-25 22:47:46 +00:00
|
|
|
only the
|
|
|
|
.BR SECCOMP_RET_DATA
|
|
|
|
from the most recently installed filter will be returned.
|
|
|
|
.SH RETURN VALUE
|
|
|
|
On success,
|
|
|
|
.BR seccomp ()
|
|
|
|
returns 0.
|
|
|
|
On error, if
|
|
|
|
.BR SECCOMP_FILTER_FLAG_TSYNC
|
2014-11-02 07:31:15 +00:00
|
|
|
was used,
|
|
|
|
the return value is the thread ID that caused the synchronization failure.
|
|
|
|
On other errors, \-1 is returned, and
|
2014-09-25 22:47:46 +00:00
|
|
|
.IR errno
|
|
|
|
is set to indicate the cause of the error.
|
|
|
|
.SH ERRORS
|
|
|
|
.BR seccomp ()
|
|
|
|
can fail for the following reasons:
|
|
|
|
.TP
|
|
|
|
.BR EACCES
|
2014-11-02 07:31:15 +00:00
|
|
|
The caller did not have the
|
2014-09-25 22:47:46 +00:00
|
|
|
.BR CAP_SYS_ADMIN
|
|
|
|
capability, or had not set
|
|
|
|
.IR no_new_privs
|
|
|
|
before using
|
|
|
|
.BR SECCOMP_SET_MODE_FILTER .
|
|
|
|
.TP
|
|
|
|
.BR EFAULT
|
|
|
|
.IR args
|
|
|
|
was required to be a valid address.
|
|
|
|
.TP
|
|
|
|
.BR EINVAL
|
|
|
|
.IR operation
|
|
|
|
is unknown; or
|
|
|
|
.IR flags
|
|
|
|
are invalid for the given
|
|
|
|
.IR operation .
|
|
|
|
.TP
|
|
|
|
.BR ESRCH
|
|
|
|
Another thread caused a failure during thread sync, but its ID could not
|
|
|
|
be determined.
|
|
|
|
.SH VERSIONS
|
2014-11-08 10:48:49 +00:00
|
|
|
The
|
|
|
|
.BR seccomp ()
|
|
|
|
system call first appeared in Linux 3.17.
|
2014-09-25 22:47:46 +00:00
|
|
|
.\" FIXME Add glibc version
|
|
|
|
.SH CONFORMING TO
|
2014-11-08 10:48:49 +00:00
|
|
|
The
|
|
|
|
.BR seccomp ()
|
|
|
|
system call is a nonstandard Linux extension.
|
2014-09-25 22:47:46 +00:00
|
|
|
.SH NOTES
|
|
|
|
.BR seccomp ()
|
2014-11-02 07:31:15 +00:00
|
|
|
provides a superset of the functionality provided by the
|
|
|
|
.BR prctl (2)
|
2014-11-02 16:11:15 +00:00
|
|
|
.BR PR_SET_SECCOMP
|
2014-11-02 07:31:15 +00:00
|
|
|
operation (which does not support
|
|
|
|
.IR flags ).
|
2014-09-25 22:47:46 +00:00
|
|
|
.SH EXAMPLE
|
2014-11-08 06:31:35 +00:00
|
|
|
.\" FIXME Please carefully review the following new piece that
|
|
|
|
.\" demonstrates the use of your example program.
|
|
|
|
The program below accepts four or more arguments.
|
|
|
|
The first three arguments are a system call number,
|
|
|
|
a numeric architecture identifier, and an error number.
|
|
|
|
The program uses these values to construct a BPF filter
|
|
|
|
that is used at run time to perform the following checks:
|
|
|
|
.IP [1] 4
|
|
|
|
If the program is not running on the specified architecture,
|
|
|
|
the BPF filter causes the process to be killed
|
|
|
|
.RB ( SECCOMP_RET_KILL ).
|
|
|
|
.IP [2]
|
|
|
|
If the program attempts to execute the system call with the specified number,
|
|
|
|
the BPF filter causes the system call to fail, with
|
|
|
|
.I errno
|
|
|
|
being set to the specified error number.
|
|
|
|
.PP
|
|
|
|
The remaining command-line arguments specify
|
|
|
|
the pathname and additional arguments of a program
|
|
|
|
that the example program should attempt to execute using
|
|
|
|
.BR execv (3)
|
|
|
|
(a library function that employs the
|
|
|
|
.BR execve (2)
|
|
|
|
system call).
|
|
|
|
Some example runs of the program are shown below.
|
|
|
|
|
|
|
|
First, we display the architecture that we are running on (x86-64)
|
|
|
|
and then construct a shell function that looks up system call
|
|
|
|
numbers on this architecture:
|
|
|
|
|
|
|
|
.nf
|
|
|
|
.in +4n
|
|
|
|
$ \fBuname -m\fP
|
|
|
|
x86_64
|
|
|
|
$ \fBsyscall_nr() {
|
|
|
|
cat /usr/src/linux/arch/x86/syscalls/syscall_64.tbl | \\
|
|
|
|
awk '$2 != "x32" && $3 == "'$1'" { print $1 }'
|
|
|
|
}\fP
|
|
|
|
.in
|
|
|
|
.fi
|
|
|
|
|
|
|
|
When the BPF filter rejects a system call (case [2] above),
|
|
|
|
it causes the system call to fail with the error number
|
|
|
|
specified on the command line.
|
|
|
|
In the experiments shown here, we'll use error number 99:
|
|
|
|
|
|
|
|
.nf
|
|
|
|
.in +4n
|
|
|
|
$ \fBerrno 99\fP
|
|
|
|
EADDRNOTAVAIL 99 Cannot assign requested address
|
|
|
|
.in
|
|
|
|
.fi
|
|
|
|
|
|
|
|
In the following example, we attempt to run the command
|
|
|
|
.BR whoami (1),
|
|
|
|
but the BPF filter rejects the
|
|
|
|
.BR execve (2)
|
|
|
|
system call, so that the command is not even executed:
|
|
|
|
|
|
|
|
.nf
|
|
|
|
.in +4n
|
|
|
|
$ \fBsyscall_nr execve\fP
|
|
|
|
59
|
|
|
|
$ \fB./a.out 59 0xC000003E 99 /bin/whoami\fP
|
|
|
|
execv: Cannot assign requested address
|
|
|
|
.in
|
|
|
|
.fi
|
|
|
|
|
|
|
|
In the next example, the BPF filter rejects the
|
|
|
|
.BR write (2)
|
|
|
|
system call, so that, although it is successfully started, the
|
|
|
|
.BR whoami (1)
|
|
|
|
command is not able to write output:
|
|
|
|
|
|
|
|
.nf
|
|
|
|
.in +4n
|
|
|
|
$ \fBsyscall_nr write\fP
|
|
|
|
1
|
|
|
|
$ \fB./a.out 1 0xC000003E 99 /bin/whoami\fP
|
|
|
|
.in
|
|
|
|
.fi
|
|
|
|
|
|
|
|
In the final example,
|
|
|
|
the BPF filter rejects a system call that is not used by the
|
|
|
|
.BR whoami (1)
|
|
|
|
command, so it is able to successfully execute and produce output:
|
|
|
|
|
|
|
|
.nf
|
|
|
|
.in +4n
|
|
|
|
$ \fBsyscall_nr preadv\fP
|
|
|
|
295
|
|
|
|
$ \fB./a.out 295 0xC000003E 99 /bin/whoami\fP
|
|
|
|
cecilia
|
|
|
|
.in
|
|
|
|
.fi
|
|
|
|
.SS Program source
|
|
|
|
.fi
|
2014-09-25 22:47:46 +00:00
|
|
|
.nf
|
|
|
|
#include <errno.h>
|
|
|
|
#include <stddef.h>
|
|
|
|
#include <stdio.h>
|
|
|
|
#include <stdlib.h>
|
|
|
|
#include <unistd.h>
|
|
|
|
#include <linux/audit.h>
|
|
|
|
#include <linux/filter.h>
|
|
|
|
#include <linux/seccomp.h>
|
|
|
|
#include <sys/prctl.h>
|
|
|
|
|
2014-11-02 07:53:40 +00:00
|
|
|
static int
|
|
|
|
install_filter(int syscall, int arch, int error)
|
2014-09-25 22:47:46 +00:00
|
|
|
{
|
|
|
|
struct sock_filter filter[] = {
|
2014-11-08 05:58:55 +00:00
|
|
|
/* [0] Load architecture */
|
2014-11-02 07:53:40 +00:00
|
|
|
BPF_STMT(BPF_LD + BPF_W + BPF_ABS,
|
2014-09-25 22:47:46 +00:00
|
|
|
(offsetof(struct seccomp_data, arch))),
|
2014-11-02 07:53:40 +00:00
|
|
|
|
2014-11-08 05:58:55 +00:00
|
|
|
/* [1] Jump forward 4 instructions on architecture mismatch */
|
2014-11-02 07:53:40 +00:00
|
|
|
BPF_JUMP(BPF_JMP + BPF_JEQ + BPF_K, arch, 0, 4),
|
|
|
|
|
2014-11-08 10:48:49 +00:00
|
|
|
/* [2] Load system call number */
|
2014-11-02 07:53:40 +00:00
|
|
|
BPF_STMT(BPF_LD + BPF_W + BPF_ABS,
|
2014-09-25 22:47:46 +00:00
|
|
|
(offsetof(struct seccomp_data, nr))),
|
2014-11-02 07:53:40 +00:00
|
|
|
|
2014-11-08 10:48:49 +00:00
|
|
|
/* [3] Jump forward 1 instruction on system call number
|
|
|
|
mismatch */
|
2014-11-02 07:53:40 +00:00
|
|
|
BPF_JUMP(BPF_JMP + BPF_JEQ + BPF_K, syscall, 0, 1),
|
|
|
|
|
2014-11-08 10:48:49 +00:00
|
|
|
/* [4] Matching architecture and system call: return
|
|
|
|
specific errno */
|
2014-11-02 07:53:40 +00:00
|
|
|
BPF_STMT(BPF_RET + BPF_K,
|
|
|
|
SECCOMP_RET_ERRNO | (error & SECCOMP_RET_DATA)),
|
|
|
|
|
2014-11-08 10:48:49 +00:00
|
|
|
/* [5] Destination of system call number mismatch: allow other
|
|
|
|
system calls */
|
2014-11-02 07:53:40 +00:00
|
|
|
BPF_STMT(BPF_RET + BPF_K, SECCOMP_RET_ALLOW),
|
|
|
|
|
2014-11-08 05:58:55 +00:00
|
|
|
/* [6] Destination of architecture mismatch: kill process */
|
2014-11-02 07:53:40 +00:00
|
|
|
BPF_STMT(BPF_RET + BPF_K, SECCOMP_RET_KILL),
|
2014-09-25 22:47:46 +00:00
|
|
|
};
|
2014-11-02 07:53:40 +00:00
|
|
|
|
2014-09-25 22:47:46 +00:00
|
|
|
struct sock_fprog prog = {
|
2014-11-02 07:53:40 +00:00
|
|
|
.len = (unsigned short) (sizeof(filter) / sizeof(filter[0])),
|
2014-09-25 22:47:46 +00:00
|
|
|
.filter = filter,
|
|
|
|
};
|
2014-11-02 07:53:40 +00:00
|
|
|
|
2014-09-25 22:47:46 +00:00
|
|
|
if (seccomp(SECCOMP_SET_MODE_FILTER, 0, &prog)) {
|
|
|
|
perror("seccomp");
|
2014-11-02 07:53:40 +00:00
|
|
|
return 1;
|
2014-09-25 22:47:46 +00:00
|
|
|
}
|
2014-11-02 07:53:40 +00:00
|
|
|
|
|
|
|
return 0;
|
2014-09-25 22:47:46 +00:00
|
|
|
}
|
|
|
|
|
2014-11-02 07:53:40 +00:00
|
|
|
int
|
|
|
|
main(int argc, char **argv)
|
2014-09-25 22:47:46 +00:00
|
|
|
{
|
|
|
|
if (argc < 5) {
|
|
|
|
fprintf(stderr, "Usage:\\n"
|
|
|
|
"refuse <syscall_nr> <arch> <errno> <prog> [<args>]\\n"
|
|
|
|
"Hint: AUDIT_ARCH_I386: 0x%X\\n"
|
|
|
|
" AUDIT_ARCH_X86_64: 0x%X\\n"
|
|
|
|
"\\n", AUDIT_ARCH_I386, AUDIT_ARCH_X86_64);
|
2014-11-02 07:53:40 +00:00
|
|
|
exit(EXIT_FAILURE);
|
2014-09-25 22:47:46 +00:00
|
|
|
}
|
2014-11-02 07:53:40 +00:00
|
|
|
|
2014-09-25 22:47:46 +00:00
|
|
|
if (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0)) {
|
|
|
|
perror("prctl");
|
2014-11-02 07:53:40 +00:00
|
|
|
exit(EXIT_FAILURE);
|
2014-09-25 22:47:46 +00:00
|
|
|
}
|
2014-11-02 07:53:40 +00:00
|
|
|
|
2014-09-25 22:47:46 +00:00
|
|
|
if (install_filter(strtol(argv[1], NULL, 0),
|
|
|
|
strtol(argv[2], NULL, 0),
|
|
|
|
strtol(argv[3], NULL, 0)))
|
2014-11-02 07:53:40 +00:00
|
|
|
exit(EXIT_FAILURE);
|
|
|
|
|
2014-09-25 22:47:46 +00:00
|
|
|
execv(argv[4], &argv[4]);
|
|
|
|
perror("execv");
|
2014-11-02 07:53:40 +00:00
|
|
|
exit(EXIT_FAILURE);
|
2014-09-25 22:47:46 +00:00
|
|
|
}
|
|
|
|
.fi
|
|
|
|
.SH SEE ALSO
|
|
|
|
.BR prctl (2),
|
|
|
|
.BR ptrace (2),
|
|
|
|
.BR signal (7),
|
|
|
|
.BR socket (7)
|
2014-11-02 15:48:07 +00:00
|
|
|
.sp
|
2014-11-08 05:21:10 +00:00
|
|
|
.\" FIXME: Is the following the best source of info on the BPF language?
|
2014-11-02 15:48:07 +00:00
|
|
|
The kernel source file
|
|
|
|
.IR Documentation/networking/filter.txt .
|