2020-09-28 20:13:12 +00:00
|
|
|
.\" Copyright (C) 2020 Michael Kerrisk <mtk.manpages@gmail.com>
|
|
|
|
.\"
|
|
|
|
.\" %%%LICENSE_START(VERBATIM)
|
|
|
|
.\" Permission is granted to make and distribute verbatim copies of this
|
|
|
|
.\" manual provided the copyright notice and this permission notice are
|
|
|
|
.\" preserved on all copies.
|
|
|
|
.\"
|
|
|
|
.\" Permission is granted to copy and distribute modified versions of this
|
|
|
|
.\" manual under the conditions for verbatim copying, provided that the
|
|
|
|
.\" entire resulting derived work is distributed under the terms of a
|
|
|
|
.\" permission notice identical to this one.
|
|
|
|
.\"
|
|
|
|
.\" Since the Linux kernel and libraries are constantly changing, this
|
|
|
|
.\" manual page may be incorrect or out-of-date. The author(s) assume no
|
|
|
|
.\" responsibility for errors or omissions, or for damages resulting from
|
|
|
|
.\" the use of the information contained herein. The author(s) may not
|
|
|
|
.\" have taken the same level of care in the production of this manual,
|
|
|
|
.\" which is licensed free of charge, as they might when working
|
|
|
|
.\" professionally.
|
|
|
|
.\"
|
|
|
|
.\" Formatted or processed versions of this manual, if unaccompanied by
|
|
|
|
.\" the source, must acknowledge the copyright and authors of this work.
|
|
|
|
.\" %%%LICENSE_END
|
|
|
|
.\"
|
|
|
|
.TH SECCOMP_UNOTIFY 2 2020-10-01 "Linux" "Linux Programmer's Manual"
|
|
|
|
.SH NAME
|
|
|
|
seccomp_unotify \- Seccomp user-space notification mechanism
|
|
|
|
.SH SYNOPSIS
|
|
|
|
.nf
|
|
|
|
.B #include <linux/seccomp.h>
|
|
|
|
.B #include <linux/filter.h>
|
|
|
|
.B #include <linux/audit.h>
|
|
|
|
.PP
|
|
|
|
.BI "int seccomp(unsigned int " operation ", unsigned int " flags \
|
|
|
|
", void *" args );
|
2020-10-01 09:33:16 +00:00
|
|
|
.PP
|
|
|
|
.B #include <sys/ioctl.h>
|
|
|
|
.PP
|
|
|
|
.BI "int ioctl(int " fd ", SECCOMP_IOCTL_NOTIF_RECV,"
|
|
|
|
.BI " struct seccomp_notif *" req );
|
|
|
|
.BI "int ioctl(int " fd ", SECCOMP_IOCTL_NOTIF_SEND,"
|
|
|
|
.BI " struct seccomp_notif_resp *" resp );
|
|
|
|
.BI "int ioctl(int " fd ", SECCOMP_IOCTL_NOTIF_ID_VALID, __u64 *" id );
|
2020-09-28 20:13:12 +00:00
|
|
|
.fi
|
|
|
|
.SH DESCRIPTION
|
|
|
|
This page describes the user-space notification mechanism provided by the
|
|
|
|
Secure Computing (seccomp) facility.
|
|
|
|
As well as the use of the
|
|
|
|
.B SECCOMP_FILTER_FLAG_NEW_LISTENER
|
|
|
|
flag, the
|
|
|
|
.BR SECCOMP_RET_USER_NOTIF
|
|
|
|
action value, and the
|
|
|
|
.B SECCOMP_GET_NOTIF_SIZES
|
|
|
|
operation described in
|
|
|
|
.BR seccomp (2),
|
|
|
|
this mechanism involves the use of a number of related
|
|
|
|
.BR ioctl (2)
|
|
|
|
operations (described below).
|
|
|
|
.\"
|
|
|
|
.SS Overview
|
|
|
|
In conventional usage of a seccomp filter,
|
2020-10-14 16:30:34 +00:00
|
|
|
the decision about how to treat a system call is made by the filter itself.
|
|
|
|
By contrast, the user-space notification mechanism allows
|
|
|
|
the seccomp filter to delegate
|
|
|
|
the handling of the system call to another user-space process.
|
2020-10-15 10:27:33 +00:00
|
|
|
Note that this mechanism is explicitly
|
|
|
|
.B not
|
|
|
|
intended as a method of implementing security policy; see NOTES.
|
2020-09-28 20:13:12 +00:00
|
|
|
.PP
|
|
|
|
In the discussion that follows,
|
2020-10-14 16:30:34 +00:00
|
|
|
the thread(s) on which the seccomp filter is installed is (are)
|
|
|
|
referred to as the
|
2020-09-28 20:13:12 +00:00
|
|
|
.IR target ,
|
|
|
|
and the process that is notified by the user-space notification
|
|
|
|
mechanism is referred to as the
|
|
|
|
.IR supervisor .
|
2020-10-14 16:30:34 +00:00
|
|
|
.PP
|
|
|
|
A suitably privileged supervisor can use the user-space notification
|
|
|
|
mechanism to perform actions on behalf of the target.
|
|
|
|
The advantage of the user-space notification mechanism is that
|
|
|
|
the supervisor will
|
|
|
|
usually be able to retrieve information about the target and the
|
|
|
|
performed system call that the seccomp filter itself cannot.
|
|
|
|
(A seccomp filter is limited in the information it can obtain and
|
|
|
|
the actions that it can perform because it
|
|
|
|
is running on a virtual machine inside the kernel.)
|
|
|
|
.PP
|
|
|
|
An overview of the steps performed by the target and the supervisor
|
2020-10-01 09:33:16 +00:00
|
|
|
is as follows:
|
2020-09-28 20:13:12 +00:00
|
|
|
.\"-------------------------------------
|
|
|
|
.IP 1. 3
|
2020-10-01 09:33:16 +00:00
|
|
|
The target establishes a seccomp filter in the usual manner,
|
2020-09-28 20:13:12 +00:00
|
|
|
but with two differences:
|
|
|
|
.RS
|
|
|
|
.IP \(bu 2
|
|
|
|
The
|
|
|
|
.BR seccomp (2)
|
|
|
|
.I flags
|
|
|
|
argument includes the flag
|
|
|
|
.BR SECCOMP_FILTER_FLAG_NEW_LISTENER .
|
2020-10-26 09:11:09 +00:00
|
|
|
Consequently, the return value of the (successful)
|
2020-09-28 20:13:12 +00:00
|
|
|
.BR seccomp (2)
|
|
|
|
call is a new "listening"
|
|
|
|
file descriptor that can be used to receive notifications.
|
2020-10-15 11:33:27 +00:00
|
|
|
Only one "listening" seccomp filter can be installed for a thread.
|
|
|
|
.\" FIXME
|
|
|
|
.\" Is the last sentence above correct?
|
2020-10-26 09:11:09 +00:00
|
|
|
.\"
|
|
|
|
.\" Kees Cook (25 Oct 2020) notes:
|
|
|
|
.\"
|
|
|
|
.\" I like this limitation, but I expect that it'll need to change in the
|
|
|
|
.\" future. Even with LSMs, we see the need for arbitrary stacking, and the
|
|
|
|
.\" idea of there being only 1 supervisor will eventually break down. Right
|
|
|
|
.\" now there is only 1 because only container managers are using this
|
|
|
|
.\" feature. But if some daemon starts using it to isolate some thread,
|
|
|
|
.\" suddenly it might break if a container manager is trying to listen to it
|
|
|
|
.\" too, etc. I expect it won't be needed soon, but I do think it'll change.
|
|
|
|
.\"
|
2020-09-28 20:13:12 +00:00
|
|
|
.IP \(bu
|
|
|
|
In cases where it is appropriate, the seccomp filter returns the action value
|
|
|
|
.BR SECCOMP_RET_USER_NOTIF .
|
|
|
|
This return value will trigger a notification event.
|
|
|
|
.RE
|
|
|
|
.\"-------------------------------------
|
|
|
|
.IP 2.
|
2020-10-01 09:33:16 +00:00
|
|
|
In order that the supervisor can obtain notifications
|
2020-09-28 20:13:12 +00:00
|
|
|
using the listening file descriptor,
|
|
|
|
(a duplicate of) that file descriptor must be passed from
|
2020-10-01 09:33:16 +00:00
|
|
|
the target to the supervisor.
|
2020-09-28 20:13:12 +00:00
|
|
|
One way in which this could be done is by passing the file descriptor
|
2020-10-01 09:33:16 +00:00
|
|
|
over a UNIX domain socket connection between the target and the supervisor
|
|
|
|
(using the
|
2020-09-28 20:13:12 +00:00
|
|
|
.BR SCM_RIGHTS
|
|
|
|
ancillary message type described in
|
|
|
|
.BR unix (7)).
|
2020-10-01 09:33:16 +00:00
|
|
|
.\" Jann Horn:
|
|
|
|
.\" Instead of using unix domain sockets to send the fd to the
|
|
|
|
.\" parent, I think you could also use clone3() with
|
|
|
|
.\" flags==CLONE_FILES|SIGCHLD, dup2() the seccomp fd to an fd
|
|
|
|
.\" that was reserved in the parent, call unshare(CLONE_FILES)
|
|
|
|
.\" in the child after setting up the seccomp fd, and wake
|
|
|
|
.\" up the parent with something like pthread_cond_signal()?
|
|
|
|
.\" I'm not sure whether that'd look better or worse in the
|
|
|
|
.\" end though, so maybe just ignore this comment.
|
2020-09-28 20:13:12 +00:00
|
|
|
.\"-------------------------------------
|
|
|
|
.IP 3.
|
2020-10-01 09:33:16 +00:00
|
|
|
The supervisor will receive notification events
|
2020-09-28 20:13:12 +00:00
|
|
|
on the listening file descriptor.
|
|
|
|
These events are returned as structures of type
|
|
|
|
.IR seccomp_notif .
|
|
|
|
Because this structure and its size may evolve over kernel versions,
|
|
|
|
the supervisor must first determine the size of this structure
|
|
|
|
using the
|
|
|
|
.BR seccomp (2)
|
|
|
|
.B SECCOMP_GET_NOTIF_SIZES
|
|
|
|
operation, which returns a structure of type
|
|
|
|
.IR seccomp_notif_sizes .
|
|
|
|
The supervisor allocates a buffer of size
|
|
|
|
.I seccomp_notif_sizes.seccomp_notif
|
|
|
|
bytes to receive notification events.
|
|
|
|
In addition, the supervisor allocates another buffer of size
|
|
|
|
.I seccomp_notif_sizes.seccomp_notif_resp
|
|
|
|
bytes for the response (a
|
|
|
|
.I struct seccomp_notif_resp
|
|
|
|
structure)
|
2020-10-01 09:33:16 +00:00
|
|
|
that it will provide to the kernel (and thus the target).
|
2020-09-28 20:13:12 +00:00
|
|
|
.\"-------------------------------------
|
|
|
|
.IP 4.
|
2020-10-01 09:33:16 +00:00
|
|
|
The target then performs its workload,
|
2020-09-28 20:13:12 +00:00
|
|
|
which includes system calls that will be controlled by the seccomp filter.
|
|
|
|
Whenever one of these system calls causes the filter to return the
|
|
|
|
.B SECCOMP_RET_USER_NOTIF
|
|
|
|
action value, the kernel does
|
|
|
|
.I not
|
2020-10-14 16:30:34 +00:00
|
|
|
(yet) execute the system call;
|
2020-10-01 09:33:16 +00:00
|
|
|
instead, execution of the target is temporarily blocked inside
|
2020-10-14 16:30:34 +00:00
|
|
|
the kernel (in a sleep state that is interruptible by signals)
|
|
|
|
and a notification event is generated on the listening file descriptor.
|
2020-09-28 20:13:12 +00:00
|
|
|
.\"-------------------------------------
|
|
|
|
.IP 5.
|
2020-10-01 09:33:16 +00:00
|
|
|
The supervisor can now repeatedly monitor the
|
2020-09-28 20:13:12 +00:00
|
|
|
listening file descriptor for
|
|
|
|
.BR SECCOMP_RET_USER_NOTIF -triggered
|
|
|
|
events.
|
|
|
|
To do this, the supervisor uses the
|
|
|
|
.B SECCOMP_IOCTL_NOTIF_RECV
|
|
|
|
.BR ioctl (2)
|
|
|
|
operation to read information about a notification event;
|
|
|
|
this operation blocks until an event is available.
|
|
|
|
The operation returns a
|
|
|
|
.I seccomp_notif
|
|
|
|
structure containing information about the system call
|
2020-10-01 09:33:16 +00:00
|
|
|
that is being attempted by the target.
|
2020-09-28 20:13:12 +00:00
|
|
|
.\"-------------------------------------
|
|
|
|
.IP 6.
|
|
|
|
The
|
|
|
|
.I seccomp_notif
|
|
|
|
structure returned by the
|
|
|
|
.B SECCOMP_IOCTL_NOTIF_RECV
|
|
|
|
operation includes the same information (a
|
|
|
|
.I seccomp_data
|
|
|
|
structure) that was passed to the seccomp filter.
|
|
|
|
This information allows the supervisor to discover the system call number and
|
2020-10-01 09:33:16 +00:00
|
|
|
the arguments for the target's system call.
|
|
|
|
In addition, the notification event contains the ID of the thread
|
2020-10-26 09:11:09 +00:00
|
|
|
that triggered the notification and a unique cookie value that
|
|
|
|
is used in subsequent
|
|
|
|
.B SECCOMP_IOCTL_NOTIF_ID_VALID
|
|
|
|
and
|
|
|
|
.B SECCOMP_IOCTL_NOTIF_SEND
|
|
|
|
operations.
|
2020-09-28 20:13:12 +00:00
|
|
|
.IP
|
|
|
|
The information in the notification can be used to discover the
|
2020-10-01 09:33:16 +00:00
|
|
|
values of pointer arguments for the target's system call.
|
2020-09-28 20:13:12 +00:00
|
|
|
(This is something that can't be done from within a seccomp filter.)
|
2020-09-30 20:24:59 +00:00
|
|
|
One way in which the supervisor can do this is to open the corresponding
|
2020-10-01 09:33:16 +00:00
|
|
|
.I /proc/[tid]/mem
|
|
|
|
file (see
|
|
|
|
.BR proc (5))
|
|
|
|
and read bytes from the location that corresponds to one of
|
2020-09-30 20:24:59 +00:00
|
|
|
the pointer arguments whose value is supplied in the notification event.
|
2020-09-28 20:13:12 +00:00
|
|
|
.\" Tycho Andersen mentioned that there are alternatives to /proc/PID/mem,
|
|
|
|
.\" such as ptrace() and /proc/PID/map_files
|
|
|
|
(The supervisor must be careful to avoid
|
|
|
|
a race condition that can occur when doing this;
|
|
|
|
see the description of the
|
|
|
|
.BR SECCOMP_IOCTL_NOTIF_ID_VALID
|
|
|
|
.BR ioctl (2)
|
|
|
|
operation below.)
|
|
|
|
In addition,
|
|
|
|
the supervisor can access other system information that is visible
|
|
|
|
in user space but which is not accessible from a seccomp filter.
|
|
|
|
.\"-------------------------------------
|
|
|
|
.IP 7.
|
|
|
|
Having obtained information as per the previous step,
|
|
|
|
the supervisor may then choose to perform an action in response
|
2020-10-01 09:33:16 +00:00
|
|
|
to the target's system call
|
2020-09-28 20:13:12 +00:00
|
|
|
(which, as noted above, is not executed when the seccomp filter returns the
|
|
|
|
.B SECCOMP_RET_USER_NOTIF
|
|
|
|
action value).
|
|
|
|
.IP
|
|
|
|
One example use case here relates to containers.
|
2020-10-01 09:33:16 +00:00
|
|
|
The target may be located inside a container where
|
2020-09-28 20:13:12 +00:00
|
|
|
it does not have sufficient capabilities to mount a filesystem
|
|
|
|
in the container's mount namespace.
|
|
|
|
However, the supervisor may be a more privileged process that
|
2020-10-01 09:33:16 +00:00
|
|
|
does have sufficient capabilities to perform the mount operation.
|
2020-09-28 20:13:12 +00:00
|
|
|
.\"-------------------------------------
|
|
|
|
.IP 8.
|
|
|
|
The supervisor then sends a response to the notification.
|
|
|
|
The information in this response is used by the kernel to construct
|
2020-10-01 09:33:16 +00:00
|
|
|
a return value for the target's system call and provide
|
2020-09-28 20:13:12 +00:00
|
|
|
a value that will be assigned to the
|
|
|
|
.I errno
|
2020-10-01 09:33:16 +00:00
|
|
|
variable of the target.
|
2020-09-28 20:13:12 +00:00
|
|
|
.IP
|
|
|
|
The response is sent using the
|
2020-10-04 05:21:54 +00:00
|
|
|
.B SECCOMP_IOCTL_NOTIF_SEND
|
2020-09-28 20:13:12 +00:00
|
|
|
.BR ioctl (2)
|
|
|
|
operation, which is used to transmit a
|
|
|
|
.I seccomp_notif_resp
|
|
|
|
structure to the kernel.
|
|
|
|
This structure must include the cookie value that the supervisor
|
|
|
|
obtained in the
|
|
|
|
.I seccomp_notif
|
|
|
|
structure returned by the
|
|
|
|
.B SECCOMP_IOCTL_NOTIF_RECV
|
|
|
|
operation;
|
|
|
|
the cookie allows the kernel to associate the response with the target.
|
2020-09-28 20:13:12 +00:00
|
|
|
.\"-------------------------------------
|
|
|
|
.IP 9.
|
|
|
|
Once the notification has been sent,
|
2020-10-01 09:33:16 +00:00
|
|
|
the system call in the target thread unblocks,
|
2020-09-28 20:13:12 +00:00
|
|
|
returning the information that was provided by the supervisor
|
|
|
|
in the notification response.
|
|
|
|
.\"-------------------------------------
|
|
|
|
.PP
|
|
|
|
As a variation on the last two steps,
|
|
|
|
the supervisor can send a response that tells the kernel that it
|
2020-10-01 09:33:16 +00:00
|
|
|
should execute the target thread's system call; see the discussion of
|
2020-09-28 20:13:12 +00:00
|
|
|
.BR SECCOMP_USER_NOTIF_FLAG_CONTINUE ,
|
|
|
|
below.
|
|
|
|
.\"
|
|
|
|
.SS ioctl(2) operations
|
|
|
|
The following
|
|
|
|
.BR ioctl (2)
|
|
|
|
operations are provided to support seccomp user-space notification.
|
|
|
|
For each of these operations, the first (file descriptor) argument of
|
|
|
|
.BR ioctl (2)
|
|
|
|
is the listening file descriptor returned by a call to
|
|
|
|
.BR seccomp (2)
|
|
|
|
with the
|
|
|
|
.BR SECCOMP_FILTER_FLAG_NEW_LISTENER
|
|
|
|
flag.
|
|
|
|
.TP
|
|
|
|
.BR SECCOMP_IOCTL_NOTIF_RECV " (since Linux 5.0)"
|
|
|
|
This operation is used to obtain a user-space
|
|
|
|
notification event.
|
|
|
|
If no such event is currently pending,
|
|
|
|
the operation blocks until an event occurs.
|
|
|
|
The third
|
|
|
|
.BR ioctl (2)
|
|
|
|
argument is a pointer to a structure of the following form
|
|
|
|
which contains information about the event.
|
|
|
|
This structure must be zeroed out before the call.
|
|
|
|
.IP
|
|
|
|
.in +4n
|
|
|
|
.EX
|
|
|
|
struct seccomp_notif {
|
|
|
|
__u64 id; /* Cookie */
|
2020-10-01 09:33:16 +00:00
|
|
|
__u32 pid; /* TID of target thread */
|
2020-09-28 20:13:12 +00:00
|
|
|
__u32 flags; /* Currently unused (0) */
|
|
|
|
struct seccomp_data data; /* See seccomp(2) */
|
|
|
|
};
|
|
|
|
.EE
|
|
|
|
.in
|
|
|
|
.IP
|
|
|
|
The fields in this structure are as follows:
|
|
|
|
.RS
|
|
|
|
.TP
|
|
|
|
.I id
|
|
|
|
This is a cookie for the notification.
|
|
|
|
Each such cookie is guaranteed to be unique for the corresponding
|
|
|
|
seccomp filter.
|
|
|
|
.RS
|
|
|
|
.IP \(bu 2
|
|
|
|
It can be used with the
|
|
|
|
.B SECCOMP_IOCTL_NOTIF_ID_VALID
|
|
|
|
.BR ioctl (2)
|
2020-10-01 09:33:16 +00:00
|
|
|
operation to verify that the target is still alive.
|
2020-09-28 20:13:12 +00:00
|
|
|
.IP \(bu
|
|
|
|
When returning a notification response to the kernel,
|
|
|
|
the supervisor must include the cookie value in the
|
|
|
|
.IR seccomp_notif_resp
|
|
|
|
structure that is specified as the argument of the
|
|
|
|
.BR SECCOMP_IOCTL_NOTIF_SEND
|
|
|
|
operation.
|
|
|
|
.RE
|
|
|
|
.TP
|
|
|
|
.I pid
|
2020-10-01 09:33:16 +00:00
|
|
|
This is the thread ID of the target thread that triggered
|
2020-09-28 20:13:12 +00:00
|
|
|
the notification event.
|
|
|
|
.TP
|
|
|
|
.I flags
|
|
|
|
This is a bit mask of flags providing further information on the event.
|
|
|
|
In the current implementation, this field is always zero.
|
|
|
|
.TP
|
|
|
|
.I data
|
|
|
|
This is a
|
|
|
|
.I seccomp_data
|
|
|
|
structure containing information about the system call that
|
|
|
|
triggered the notification.
|
|
|
|
This is the same structure that is passed to the seccomp filter.
|
|
|
|
See
|
|
|
|
.BR seccomp (2)
|
|
|
|
for details of this structure.
|
|
|
|
.RE
|
|
|
|
.IP
|
|
|
|
On success, this operation returns 0; on failure, \-1 is returned, and
|
|
|
|
.I errno
|
|
|
|
is set to indicate the cause of the error.
|
|
|
|
This operation can fail with the following errors:
|
|
|
|
.RS
|
|
|
|
.TP
|
|
|
|
.BR EINVAL " (since Linux 5.5)"
|
|
|
|
.\" commit 2882d53c9c6f3b8311d225062522f03772cf0179
|
|
|
|
The
|
|
|
|
.I seccomp_notif
|
|
|
|
structure that was passed to the call contained nonzero fields.
|
|
|
|
.TP
|
|
|
|
.B ENOENT
|
2020-10-01 09:33:16 +00:00
|
|
|
The target thread was killed by a signal as the notification information
|
|
|
|
was being generated,
|
2020-10-04 05:21:54 +00:00
|
|
|
or the target's (blocked) system call was interrupted by a signal handler.
|
2020-09-28 20:13:12 +00:00
|
|
|
.RE
|
|
|
|
.\" FIXME
|
|
|
|
.\" From my experiments,
|
|
|
|
.\" it appears that if a SECCOMP_IOCTL_NOTIF_RECV is done after
|
2020-10-01 09:33:16 +00:00
|
|
|
.\" the target thread terminates, then the ioctl() simply
|
2020-09-28 20:13:12 +00:00
|
|
|
.\" blocks (rather than returning an error to indicate that the
|
2020-10-01 09:33:16 +00:00
|
|
|
.\" target no longer exists).
|
2020-09-28 20:13:12 +00:00
|
|
|
.\"
|
|
|
|
.\" I found that surprising, and it required some contortions in
|
|
|
|
.\" the example program. It was not possible to code my SIGCHLD
|
|
|
|
.\" handler (which reaps the zombie when the worker/target
|
2020-10-01 09:33:16 +00:00
|
|
|
.\" terminates) to simply set a flag checked in the main
|
2020-09-28 20:13:12 +00:00
|
|
|
.\" handleNotifications() loop, since this created an
|
|
|
|
.\" unavoidable race where the child might terminate just after
|
|
|
|
.\" I had checked the flag, but before I blocked (forever!) in the
|
|
|
|
.\" SECCOMP_IOCTL_NOTIF_RECV operation. Instead, I had to code
|
|
|
|
.\" the signal handler to simply call _exit(2) in order to
|
|
|
|
.\" terminate the parent process (the supervisor).
|
|
|
|
.\"
|
|
|
|
.\" Is this expected behavior? It seems to me rather
|
|
|
|
.\" desirable that SECCOMP_IOCTL_NOTIF_RECV should give an error
|
2020-10-01 09:33:16 +00:00
|
|
|
.\" if the target has terminated.
|
2020-09-30 20:32:46 +00:00
|
|
|
.\"
|
2020-10-25 14:02:54 +00:00
|
|
|
.\" Jann posted a patch to rectify this, but there was no response
|
|
|
|
.\" (Lore link: https://bit.ly/3jvUBxk) to his question about fixing
|
|
|
|
.\" this issue. (I've tried building with the patch, but encountered
|
|
|
|
.\" an issue with the target process entering D state after a signal.)
|
|
|
|
.\"
|
2020-09-30 20:32:46 +00:00
|
|
|
.\" For now, this behavior is documented in BUGS.
|
2020-10-26 09:11:09 +00:00
|
|
|
.\"
|
|
|
|
.\" Kees Cook commented: Let's change [this] ASAP!
|
2020-09-28 20:13:12 +00:00
|
|
|
.TP
|
|
|
|
.BR SECCOMP_IOCTL_NOTIF_ID_VALID " (since Linux 5.0)"
|
|
|
|
This operation can be used to check that a notification ID
|
|
|
|
returned by an earlier
|
|
|
|
.B SECCOMP_IOCTL_NOTIF_RECV
|
2020-10-26 09:11:09 +00:00
|
|
|
operation is still valid
|
|
|
|
(i.e., that the target still exists and its system call
|
|
|
|
is still blocked waiting for a response).
|
2020-09-28 20:13:12 +00:00
|
|
|
.IP
|
|
|
|
The third
|
|
|
|
.BR ioctl (2)
|
|
|
|
argument is a pointer to the cookie
|
|
|
|
.RI ( id )
|
|
|
|
returned by the
|
|
|
|
.B SECCOMP_IOCTL_NOTIF_RECV
|
|
|
|
operation.
|
|
|
|
.IP
|
|
|
|
This operation is necessary to avoid race conditions that can occur when the
|
|
|
|
.I pid
|
|
|
|
returned by the
|
|
|
|
.B SECCOMP_IOCTL_NOTIF_RECV
|
|
|
|
operation terminates, and that process ID is reused by another process.
|
|
|
|
An example of this kind of race is the following:
|
|
|
|
.RS
|
|
|
|
.IP 1. 3
|
|
|
|
A notification is generated on the listening file descriptor.
|
|
|
|
The returned
|
|
|
|
.I seccomp_notif
|
2020-10-01 09:33:16 +00:00
|
|
|
contains the TID of the target thread (in the
|
|
|
|
.I pid
|
2020-10-04 05:21:54 +00:00
|
|
|
field of the structure).
|
2020-09-28 20:13:12 +00:00
|
|
|
.IP 2.
|
2020-10-01 09:33:16 +00:00
|
|
|
The target terminates.
|
2020-09-28 20:13:12 +00:00
|
|
|
.IP 3.
|
2020-10-01 09:33:16 +00:00
|
|
|
Another thread or process is created on the system that by chance reuses the
|
|
|
|
TID that was freed when the target terminated.
|
2020-09-28 20:13:12 +00:00
|
|
|
.IP 4.
|
|
|
|
The supervisor
|
|
|
|
.BR open (2)s
|
|
|
|
the
|
2020-10-01 09:33:16 +00:00
|
|
|
.IR /proc/[tid]/mem
|
|
|
|
file for the TID obtained in step 1, with the intention of (say)
|
2020-10-04 05:21:54 +00:00
|
|
|
inspecting the memory location(s) that contain the argument(s) of
|
2020-09-28 20:13:12 +00:00
|
|
|
the system call that triggered the notification in step 1.
|
|
|
|
.RE
|
|
|
|
.IP
|
|
|
|
In the above scenario, the risk is that the supervisor may try
|
|
|
|
to access the memory of a process other than the target.
|
2020-10-01 09:33:16 +00:00
|
|
|
This race can be avoided by following the call to
|
|
|
|
.BR open (2)
|
|
|
|
with a
|
2020-09-28 20:13:12 +00:00
|
|
|
.B SECCOMP_IOCTL_NOTIF_ID_VALID
|
|
|
|
operation to verify that the process that generated the notification
|
|
|
|
is still alive.
|
2020-10-01 09:33:16 +00:00
|
|
|
(Note that if the target terminates after the latter step,
|
|
|
|
a subsequent
|
2020-09-28 20:13:12 +00:00
|
|
|
.BR read (2)
|
2020-10-16 07:29:10 +00:00
|
|
|
from the file descriptor may return 0, indicating end of file.)
|
2020-10-01 09:33:16 +00:00
|
|
|
.\" Jann Horn:
|
|
|
|
.\" the PID can be reused, but the /proc/$pid directory is
|
|
|
|
.\" internally not associated with the numeric PID, but,
|
|
|
|
.\" conceptually speaking, with a specific incarnation of the
|
|
|
|
.\" PID, or something like that. (Actually, it is associated
|
|
|
|
.\" with the "struct pid", which is not reused, instead of the
|
|
|
|
.\" numeric PID.
|
2020-09-28 20:13:12 +00:00
|
|
|
.IP
|
|
|
|
On success (i.e., the notification ID is still valid),
|
2020-10-01 09:33:16 +00:00
|
|
|
this operation returns 0.
|
2020-09-28 20:13:12 +00:00
|
|
|
On failure (i.e., the notification ID is no longer valid),
|
|
|
|
\-1 is returned, and
|
|
|
|
.I errno
|
|
|
|
is set to
|
|
|
|
.BR ENOENT .
|
|
|
|
.TP
|
|
|
|
.BR SECCOMP_IOCTL_NOTIF_SEND " (since Linux 5.0)"
|
|
|
|
This operation is used to send a notification response back to the kernel.
|
|
|
|
The third
|
|
|
|
.BR ioctl (2)
|
|
|
|
argument is a pointer to a structure of the following form:
|
|
|
|
.IP
|
|
|
|
.in +4n
|
|
|
|
.EX
|
|
|
|
struct seccomp_notif_resp {
|
|
|
|
__u64 id; /* Cookie value */
|
|
|
|
__s64 val; /* Success return value */
|
|
|
|
__s32 error; /* 0 (success) or negative
|
|
|
|
error number */
|
|
|
|
__u32 flags; /* See below */
|
|
|
|
};
|
|
|
|
.EE
|
|
|
|
.in
|
|
|
|
.IP
|
|
|
|
The fields of this structure are as follows:
|
|
|
|
.RS
|
|
|
|
.TP
|
|
|
|
.I id
|
|
|
|
This is the cookie value that was obtained using the
|
|
|
|
.B SECCOMP_IOCTL_NOTIF_RECV
|
|
|
|
operation.
|
|
|
|
This cookie value allows the kernel to correctly associate this response
|
|
|
|
with the system call that triggered the user-space notification.
|
|
|
|
.TP
|
|
|
|
.I val
|
|
|
|
This is the value that will be used for a spoofed
|
2020-10-01 09:33:16 +00:00
|
|
|
success return for the target's system call; see below.
|
2020-09-28 20:13:12 +00:00
|
|
|
.TP
|
|
|
|
.I error
|
|
|
|
This is the value that will be used as the error number
|
|
|
|
.RI ( errno )
|
2020-10-01 09:33:16 +00:00
|
|
|
for a spoofed error return for the target's system call; see below.
|
2020-09-28 20:13:12 +00:00
|
|
|
.TP
|
|
|
|
.I flags
|
2020-10-04 05:21:54 +00:00
|
|
|
This is a bit mask that includes zero or more of the following flags:
|
2020-09-28 20:13:12 +00:00
|
|
|
.RS
|
|
|
|
.TP
|
|
|
|
.BR SECCOMP_USER_NOTIF_FLAG_CONTINUE " (since Linux 5.5)"
|
2020-10-01 09:33:16 +00:00
|
|
|
Tell the kernel to execute the target's system call.
|
2020-09-28 20:13:12 +00:00
|
|
|
.\" commit fb3c5386b382d4097476ce9647260fc89b34afdb
|
|
|
|
.RE
|
|
|
|
.RE
|
|
|
|
.IP
|
|
|
|
Two kinds of response are possible:
|
|
|
|
.RS
|
|
|
|
.IP \(bu 2
|
|
|
|
A response to the kernel telling it to execute the
|
2020-10-01 09:33:16 +00:00
|
|
|
target's system call.
|
2020-09-28 20:13:12 +00:00
|
|
|
In this case, the
|
|
|
|
.I flags
|
|
|
|
field includes
|
|
|
|
.B SECCOMP_USER_NOTIF_FLAG_CONTINUE
|
|
|
|
and the
|
|
|
|
.I error
|
|
|
|
and
|
|
|
|
.I val
|
|
|
|
fields must be zero.
|
|
|
|
.IP
|
|
|
|
This kind of response can be useful in cases where the supervisor needs
|
|
|
|
to do deeper analysis of the target's system call than is possible
|
|
|
|
from a seccomp filter (e.g., examining the values of pointer arguments),
|
2020-10-01 09:33:16 +00:00
|
|
|
and, having decided that the system call does not require emulation
|
|
|
|
by the supervisor, the supervisor wants the system call to
|
|
|
|
be executed normally in the target.
|
2020-10-15 10:27:33 +00:00
|
|
|
.IP
|
|
|
|
The
|
|
|
|
.B SECCOMP_USER_NOTIF_FLAG_CONTINUE
|
|
|
|
flag should be used with caution; see NOTES.
|
2020-09-28 20:13:12 +00:00
|
|
|
.IP \(bu
|
2020-10-01 09:33:16 +00:00
|
|
|
A spoofed return value for the target's system call.
|
|
|
|
In this case, the kernel does not execute the target's system call,
|
2020-09-28 20:13:12 +00:00
|
|
|
instead causing the system call to return a spoofed value as specified by
|
|
|
|
fields of the
|
|
|
|
.I seccomp_notif_resp
|
|
|
|
structure.
|
|
|
|
The supervisor should set the fields of this structure as follows:
|
|
|
|
.RS
|
|
|
|
.IP + 3
|
|
|
|
.I flags
|
|
|
|
does not contain
|
|
|
|
.BR SECCOMP_USER_NOTIF_FLAG_CONTINUE .
|
|
|
|
.IP +
|
|
|
|
.I error
|
|
|
|
is set either to 0 for a spoofed "success" return or to a negative
|
|
|
|
error number for a spoofed "failure" return.
|
2020-10-01 09:33:16 +00:00
|
|
|
In the former case, the kernel causes the target's system call
|
2020-09-28 20:13:12 +00:00
|
|
|
to return the value specified in the
|
|
|
|
.I val
|
|
|
|
field.
|
2020-10-01 09:33:16 +00:00
|
|
|
In the latter case, the kernel causes the target's system call
|
2020-09-28 20:13:12 +00:00
|
|
|
to return \-1, and
|
|
|
|
.I errno
|
|
|
|
is assigned the negated
|
|
|
|
.I error
|
|
|
|
value.
|
|
|
|
.IP +
|
|
|
|
.I val
|
|
|
|
is set to a value that will be used as the return value for a spoofed
|
2020-10-01 09:33:16 +00:00
|
|
|
"success" return for the target's system call.
|
2020-09-28 20:13:12 +00:00
|
|
|
The value in this field is ignored if the
|
|
|
|
.I error
|
|
|
|
field contains a nonzero value.
|
2020-10-26 09:11:09 +00:00
|
|
|
.\" FIXME
|
|
|
|
.\" Kees Cook suggested:
|
|
|
|
.\"
|
|
|
|
.\" Strictly speaking, this is architecture specific, but
|
|
|
|
.\" all architectures do it this way. Should seccomp enforce
|
|
|
|
.\" val == 0 when err != 0 ?
|
2020-09-28 20:13:12 +00:00
|
|
|
.RE
|
|
|
|
.RE
|
|
|
|
.IP
|
|
|
|
On success, this operation returns 0; on failure, \-1 is returned, and
|
|
|
|
.I errno
|
|
|
|
is set to indicate the cause of the error.
|
|
|
|
This operation can fail with the following errors:
|
|
|
|
.RS
|
|
|
|
.TP
|
|
|
|
.B EINPROGRESS
|
|
|
|
A response to this notification has already been sent.
|
|
|
|
.TP
|
|
|
|
.B EINVAL
|
|
|
|
An invalid value was specified in the
|
|
|
|
.IR flags " field."
|
|
|
|
.TP
|
|
|
|
.B EINVAL
|
|
|
|
The
|
|
|
|
.I flags
|
|
|
|
field contained
|
|
|
|
.BR SECCOMP_USER_NOTIF_FLAG_CONTINUE ,
|
|
|
|
and the
|
|
|
|
.I error
|
|
|
|
or
|
|
|
|
.I val
|
|
|
|
field was not zero.
|
|
|
|
.TP
|
|
|
|
.B ENOENT
|
2020-10-01 09:33:16 +00:00
|
|
|
The blocked system call in the target
|
|
|
|
has been interrupted by a signal handler
|
|
|
|
or the target has terminated.
|
|
|
|
.\" Jann Horn notes:
|
|
|
|
.\" you could also get this [ENOENT] if a response has already
|
|
|
|
.\" been sent, instead of EINPROGRESS - the only difference is
|
|
|
|
.\" whether the target thread has picked up the response yet
|
2020-09-28 20:13:12 +00:00
|
|
|
.RE
|
|
|
|
.SH NOTES
|
2020-10-28 12:14:08 +00:00
|
|
|
One example use case for the user-space notification
|
|
|
|
mechanism is to allow a container manager
|
|
|
|
(a process which is typically running with more privilege than
|
|
|
|
the processes inside the container)
|
|
|
|
to mount block devices or create device nodes for the container.
|
|
|
|
The mount use case provides an example of where the
|
|
|
|
.BR SECCOMP_USER_NOTIF_FLAG_CONTINUE
|
|
|
|
flag is useful.
|
|
|
|
Upon receiving a notification for the
|
|
|
|
.BR mount (2)
|
|
|
|
system call, the container manager (the "supervisor") can distinguish
|
|
|
|
a request to mount a block filesystem
|
|
|
|
(which would not be possible for a "target" process inside the container)
|
|
|
|
and mount that filesystem.
|
|
|
|
If, on the other hand, the container manager detects that the operation
|
|
|
|
could be performed by the process inside the container
|
|
|
|
(e.g., a mount of a
|
|
|
|
.BR tmpfs (5)
|
|
|
|
filesystem), it can notify the kernel that the target process's
|
|
|
|
.BR mount (2)
|
|
|
|
system call can continue.
|
|
|
|
.\"
|
2020-10-14 05:28:40 +00:00
|
|
|
.SS select()/poll()/epoll semantics
|
2020-09-28 20:13:12 +00:00
|
|
|
The file descriptor returned when
|
|
|
|
.BR seccomp (2)
|
|
|
|
is employed with the
|
|
|
|
.B SECCOMP_FILTER_FLAG_NEW_LISTENER
|
|
|
|
flag can be monitored using
|
|
|
|
.BR poll (2),
|
|
|
|
.BR epoll (7),
|
|
|
|
and
|
|
|
|
.BR select (2).
|
2020-10-14 05:28:40 +00:00
|
|
|
These interfaces indicate that the file descriptor is ready as follows:
|
|
|
|
.IP \(bu 2
|
2020-09-28 20:13:12 +00:00
|
|
|
When a notification is pending,
|
|
|
|
these interfaces indicate that the file descriptor is readable.
|
2020-10-01 09:33:16 +00:00
|
|
|
Following such an indication, a subsequent
|
|
|
|
.B SECCOMP_IOCTL_NOTIF_RECV
|
|
|
|
.BR ioctl (2)
|
|
|
|
will not block, returning either information about a notification
|
|
|
|
or else failing with the error
|
|
|
|
.B ENOENT
|
|
|
|
if the target has been killed by a signal or its system call
|
|
|
|
has been interrupted by a signal handler.
|
2020-10-14 05:28:40 +00:00
|
|
|
.IP \(bu
|
|
|
|
After the notification has been received (i.e., by the
|
|
|
|
.B SECCOMP_IOCTL_NOTIF_RECV
|
|
|
|
.BR ioctl (2)
|
|
|
|
operation), these interfaces indicate that the file descriptor is writable,
|
|
|
|
meaning that a notification response can be sent using the
|
|
|
|
.B SECCOMP_IOCTL_NOTIF_SEND
|
|
|
|
.BR ioctl (2)
|
|
|
|
operation.
|
2020-10-15 08:14:09 +00:00
|
|
|
.IP \(bu
|
|
|
|
After the last thread using the filter has terminated and been reaped using
|
|
|
|
.BR waitpid (2)
|
|
|
|
(or similar),
|
|
|
|
the file descriptor indicates an end-of-file condition (readable in
|
|
|
|
.BR select (2);
|
|
|
|
.BR POLLHUP / EPOLLHUP
|
|
|
|
in
|
|
|
|
.BR poll (2)/
|
|
|
|
.BR epoll_wait (2)).
|
2020-10-15 10:27:33 +00:00
|
|
|
.SS Design goals; use of SECCOMP_USER_NOTIF_FLAG_CONTINUE
|
|
|
|
The intent of the user-space notification feature is
|
|
|
|
to allow system calls to be performed on behalf of the target.
|
|
|
|
The target's system call should either be handled by the supervisor or
|
|
|
|
allowed to continue normally in the kernel (where standard security
|
|
|
|
policies will be applied).
|
|
|
|
.PP
|
|
|
|
.BR "Note well" :
|
|
|
|
this mechanism must not be used to make security policy decisions
|
|
|
|
about the system call,
|
|
|
|
which would be inherently race-prone for reasons described next.
|
|
|
|
.PP
|
|
|
|
The
|
|
|
|
.B SECCOMP_USER_NOTIF_FLAG_CONTINUE
|
|
|
|
flag must be used with caution.
|
|
|
|
If set by the supervisor, the target's system call will continue.
|
|
|
|
However, there is a time-of-check, time-of-use race here,
|
|
|
|
since an attacker could exploit the interval of time where the target is
|
|
|
|
blocked waiting on the "continue" response to do things such as
|
|
|
|
rewriting the system call arguments.
|
|
|
|
.PP
|
|
|
|
Note furthermore that a user-space notifier can be bypassed if
|
|
|
|
the existing filters allow the use of
|
|
|
|
.BR seccomp (2)
|
|
|
|
or
|
|
|
|
.BR prctl (2)
|
|
|
|
to install a filter that returns an action value with a higher precedence than
|
|
|
|
.B SECCOMP_RET_USER_NOTIF
|
|
|
|
(see
|
|
|
|
.BR seccomp (2)).
|
|
|
|
.PP
|
|
|
|
It should thus be absolutely clear that the
|
|
|
|
seccomp user-space notification mechanism
|
|
|
|
.B cannot
|
|
|
|
be used to implement a security policy!
|
|
|
|
It should only ever be used in scenarios where a more privileged process
|
|
|
|
supervises the system calls of a less privileged target to
|
|
|
|
get around kernel-enforced security restrictions when
|
|
|
|
the supervisor deems this safe.
|
|
|
|
In other words,
|
|
|
|
in order to continue a system call, the supervisor should be sure that
|
|
|
|
another security mechanism or the kernel itself will sufficiently block
|
|
|
|
the system call if its arguments are rewritten to something unsafe.
|
seccomp_unotify.2: Describe the interaction with SA_RESTART signal handlers
And, as noted by Jann Horn, note how the user-space notification
mechanism causes a small breakage in the user-space API with
respect to nonrestartable system calls.
====
From the email discussion with Jann Horn
> >> So, I partially demonstrated what you describe here, for two example
> >> system calls (epoll_wait() and pause()). But I could not exactly
> >> demonstrate things as I understand you to be describing them. (So,
> >> I'm not sure whether I have not understood you correctly, or
> >> if things are not exactly as you describe them.)
> >>
> >> Here's a scenario (A) that I tested:
> >>
> >> 1. Target installs seccomp filters for a blocking syscall
> >> (epoll_wait() or pause(), both of which should never restart,
> >> regardless of SA_RESTART)
> >> 2. Target installs SIGINT handler with SA_RESTART
> >> 3. Supervisor is sleeping (i.e., is not blocked in
> >> SECCOMP_IOCTL_NOTIF_RECV operation).
> >> 4. Target makes a blocking system call (epoll_wait() or pause()).
> >> 5. SIGINT gets delivered to target; handler gets called;
> >> ***and syscall gets restarted by the kernel***
> >>
> >> That last should never happen, of course, and is a result of the
> >> combination of both the user-notify filter and the SA_RESTART flag.
> >> If one or other is not present, then the system call is not
> >> restarted.
> >>
> >> So, as you note below, the UAPI gets broken a little.
> >>
> >> However, from your description above I had understood that
> >> something like the following scenario (B) could occur:
> >>
> >> 1. Target installs seccomp filters for a blocking syscall
> >> (epoll_wait() or pause(), both of which should never restart,
> >> regardless of SA_RESTART)
> >> 2. Target installs SIGINT handler with SA_RESTART
> >> 3. Supervisor performs SECCOMP_IOCTL_NOTIF_RECV operation (which
> >> blocks).
> >> 4. Target makes a blocking system call (epoll_wait() or pause()).
> >> 5. Supervisor gets seccomp user-space notification (i.e.,
> >> SECCOMP_IOCTL_NOTIF_RECV ioctl() returns
> >> 6. SIGINT gets delivered to target; handler gets called;
> >> and syscall gets restarted by the kernel
> >> 7. Supervisor performs another SECCOMP_IOCTL_NOTIF_RECV operation
> >> which gets another notification for the restarted system call.
> >>
> >> However, I don't observe such behavior. In step 6, the syscall
> >> does not get restarted by the kernel, but instead returns -1/EINTR.
> >> Perhaps I have misconstructed my experiment in the second case, or
> >> perhaps I've misunderstood what you meant, or is it possibly the
> >> case that things are not quite as you said?
>
> Thanks for the code, Jann (including the demo of the CLONE_FILES
> technique to pass the notification FD to the supervisor).
>
> But I think your code just demonstrates what I described in
> scenario A. So, it seems that I both understood what you
> meant (because my code demonstrates the same thing) and
> also misunderstood what you said (because I thought you
> were meaning something more like scenario B).
Ahh, sorry, I should've read your mail more carefully. Indeed, that
testcase only shows scenario A. But the following shows scenario B...
[Below, two pieces of code from Jann, with a lot of
cosmetic changes by mtk.]
====
[And from a follow-up in the same email thread:]
> If userspace relies on non-restarting behavior, it should be using
> something like epoll_pwait(). And that stuff only unblocks signals
> after we've already past the seccomp checks on entry.
Thanks for elaborating that detail, since as soon as you talked
about "enlarging a preexisting race" above, I immediately wondered about
sigsuspend(), pselect(), etc.
(Mind you, I still wonder about the effect on system calls that
are normally nonrestartable because they have timeouts. My
understanding is that the kernel doesn't restart those system
calls because it's impossible for the kernel to restart the call
with the right timeout value. I wonder what happens when those
system calls are restarted in the scenario we're discussing.)
Anyway, returning to your point... So, to be clear (and to
quickly remind myself in case I one day reread this thread),
there is not a problem with sigsuspend(), pselect(), ppoll(),
and epoll_pwait() since:
* Before the syscall, signals are blocked in the target.
* Inside the syscall, signals are still blocked at the time
the check is made for seccomp filters.
* If a seccomp user-space notification event kicks in, the target
is put to sleep with the signals still blocked.
* The signal will only get delivered after the supervisor either
triggers a spoofed success/failure return in the target or the
supervisor sends a CONTINUE response to the kernel telling it
to execute the target's system call. Either way, there won't be
any restarting of the target's system call (and the supervisor
thus won't see multiple notifications).
====
Scenario A
$ ./seccomp_unotify_restart_scen_A
C: installed seccomp: fd 3
C: woke 1 waiters
P: child installed seccomp fd 3
C: About to call pause(): Success
P: going to send SIGUSR1...
C: sigusr1_handler handler invoked
P: about to terminate
C: got pdeath signal on parent termination
C: about to terminate
/* Modified version of code from Jann Horn */
#define _GNU_SOURCE
#include <stdio.h>
#include <signal.h>
#include <err.h>
#include <errno.h>
#include <unistd.h>
#include <stdlib.h>
#include <sched.h>
#include <stddef.h>
#include <limits.h>
#include <sys/mman.h>
#include <sys/syscall.h>
#include <sys/prctl.h>
#include <linux/seccomp.h>
#include <linux/filter.h>
#include <linux/futex.h>
/*
 * State shared between parent and child.  main() points this at a
 * MAP_SHARED | MAP_ANONYMOUS mapping created before the clone() call.
 */
struct {
/* Initialized to -1 by main(); the child later stores the seccomp
   listener FD number here for the parent to read */
int seccomp_fd;
} *shared;
/*
 * Handler for SIGUSR1: announce that the handler ran.
 *
 * The message is emitted with write(2) instead of printf(3), because stdio
 * functions are not async-signal-safe (see signal-safety(7)).  errno is
 * saved and restored so that the interrupted code never observes a value
 * changed by the handler.  The bytes written are the same as before.
 *
 * None of the arguments are used.
 */
static void
sigusr1_handler(int sig, siginfo_t *info, void *uctx)
{
    static const char msg[] = "C: sigusr1_handler handler invoked\n";
    int saved_errno = errno;

    (void) sig;
    (void) info;
    (void) uctx;

    /* Best-effort diagnostic: there is no sensible recovery if it fails */
    if (write(STDOUT_FILENO, msg, sizeof(msg) - 1) == -1) {
        /* ignored */
    }

    errno = saved_errno;
}
/*
 * Handler for SIGUSR2, which main() requests as the parent-death signal
 * (PR_SET_PDEATHSIG): report the event and terminate the child.
 *
 * Only async-signal-safe functions are used (see signal-safety(7)):
 * write(2) replaces printf(3), and _exit(2) replaces exit(3).  Skipping
 * the stdio flush done by exit() loses nothing here, because main() makes
 * stdout unbuffered.  The bytes written are the same as before.
 *
 * None of the arguments are used.  This function does not return.
 */
static void
sigusr2_handler(int sig, siginfo_t *info, void *uctx)
{
    static const char msg[] =
        "C: got pdeath signal on parent termination\n"
        "C: about to terminate\n";

    (void) sig;
    (void) info;
    (void) uctx;

    /* Best-effort diagnostic: we terminate regardless of the outcome */
    if (write(STDOUT_FILENO, msg, sizeof(msg) - 1) == -1) {
        /* ignored */
    }

    _exit(0);
}
/*
 * Scenario A demonstration.
 *
 * The child installs a seccomp filter that returns SECCOMP_RET_USER_NOTIF
 * for pause(), publishes the listener FD number to the parent through
 * shared memory, installs an SA_RESTART handler for SIGUSR1, and then
 * calls pause().
 *
 * The parent never performs SECCOMP_IOCTL_NOTIF_RECV on the listener FD:
 * it only prints the FD number, sleeps, sends SIGUSR1 to the child,
 * sleeps again, and exits.
 */
int
main(void)
{
/* Make stdout unbuffered */
setbuf(stdout, NULL);
/* Allocate memory that will be shared by parent and child */
shared = mmap(NULL, 0x1000, PROT_READ | PROT_WRITE,
MAP_ANONYMOUS | MAP_SHARED, -1, 0);
if (shared == MAP_FAILED)
err(1, "mmap");
/* -1 means "listener FD not yet published"; the parent's FUTEX_WAIT
   below uses the same value as its expected value */
shared->seccomp_fd = -1;
/* glibc's clone() wrapper doesn't support fork()-style usage */
/* Child process and parent share file descriptor table */
pid_t child = syscall(__NR_clone, CLONE_FILES | SIGCHLD,
NULL, NULL, NULL, 0);
if (child == -1)
err(1, "clone");
/* CHILD */
if (child == 0) {
/* don't outlive the parent */
/* NOTE(review): the prctl() return values in this block are not checked */
prctl(PR_SET_PDEATHSIG, SIGUSR2);
/* Presumably covers the case where the parent had already terminated
   (child reparented to PID 1) before PR_SET_PDEATHSIG was set */
if (getppid() == 1)
exit(0);
/* Install seccomp filter */
prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
/* Filter: load the system call number; if it equals __NR_pause, fall
   through to SECCOMP_RET_USER_NOTIF, otherwise skip one instruction
   and return SECCOMP_RET_ALLOW */
struct sock_filter insns[] = {
BPF_STMT(BPF_LD | BPF_W | BPF_ABS,
offsetof(struct seccomp_data, nr)),
BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, __NR_pause, 0, 1),
BPF_STMT(BPF_RET | BPF_K, SECCOMP_RET_USER_NOTIF),
BPF_STMT(BPF_RET | BPF_K, SECCOMP_RET_ALLOW)
};
struct sock_fprog prog = {
.len = sizeof(insns) / sizeof(insns[0]),
.filter = insns
};
/* With SECCOMP_FILTER_FLAG_NEW_LISTENER, the value returned on success
   is treated below as the listener file descriptor */
int seccomp_ret = syscall(__NR_seccomp, SECCOMP_SET_MODE_FILTER,
SECCOMP_FILTER_FLAG_NEW_LISTENER, &prog);
if (seccomp_ret < 0)
err(1, "install");
printf("C: installed seccomp: fd %d\n", seccomp_ret);
/* Place the notifier FD number into the shared memory */
__atomic_store(&shared->seccomp_fd, &seccomp_ret,
__ATOMIC_RELEASE);
/* Wake the parent */
int futex_ret =
syscall(__NR_futex, &shared->seccomp_fd, FUTEX_WAKE,
INT_MAX, NULL, NULL, 0);
printf("C: woke %d waiters\n", futex_ret);
/* Establish SA_RESTART handler for SIGUSR1 */
struct sigaction act = {
.sa_sigaction = sigusr1_handler,
.sa_flags = SA_RESTART | SA_SIGINFO
};
if (sigaction(SIGUSR1, &act, NULL))
err(1, "sigaction");
/* Handler for the parent-death signal requested above */
/* NOTE(review): sa_sigaction is set but SA_SIGINFO is not among the
   flags; the handler ignores its extra arguments, but confirm whether
   SA_SIGINFO was intended */
struct sigaction act2 = {
.sa_sigaction = sigusr2_handler,
.sa_flags = 0
};
if (sigaction(SIGUSR2, &act2, NULL))
err(1, "sigaction");
/* Make a blocking system call */
/* NOTE(review): perror() is used for an informational message, so the
   text for the current errno value is appended to it */
perror("C: About to call pause()");
pause();
/* Reached only if pause() returns */
perror("C: pause returned");
exit(0);
}
/* PARENT */
/* Wait for futex wake-up from child */
/* Expected value is -1, the initial value of seccomp_fd; an EAGAIN
   failure is tolerated (presumably: the child had already stored the
   FD number, so there is nothing to wait for) */
int futex_ret = syscall(__NR_futex, &shared->seccomp_fd, FUTEX_WAIT,
-1, NULL, NULL, 0);
if (futex_ret == -1 && errno != EAGAIN)
err(1, "futex wait");
/* Get notification FD from the child */
int fd = __atomic_load_n(&shared->seccomp_fd, __ATOMIC_ACQUIRE);
printf("\tP: child installed seccomp fd %d\n", fd);
/* In this scenario the parent does nothing further with the listener
   FD; it just sleeps before signaling the child */
sleep(1);
printf("\tP: going to send SIGUSR1...\n");
/* NOTE(review): the kill() return value is not checked */
kill(child, SIGUSR1);
sleep(1);
printf("\tP: about to terminate\n");
exit(0);
}
====
Scenario B
$ ./seccomp_unotify_restart_scen_B
C: installed seccomp: fd 3
C: woke 1 waiters
C: About to call pause()
P: child installed seccomp fd 3
P: about to SECCOMP_IOCTL_NOTIF_RECV
P: got notif: id=17773741941218455591 pid=25052 nr=34
P: about to send SIGUSR1 to child...
P: about to SECCOMP_IOCTL_NOTIF_RECV
C: sigusr1_handler handler invoked
P: got notif: id=17773741941218455592 pid=25052 nr=34
P: about to send SIGUSR1 to child...
P: about to SECCOMP_IOCTL_NOTIF_RECV
C: sigusr1_handler handler invoked
P: got notif: id=17773741941218455593 pid=25052 nr=34
P: about to send SIGUSR1 to child...
P: about to SECCOMP_IOCTL_NOTIF_RECV
C: sigusr1_handler handler invoked
P: got notif: id=17773741941218455594 pid=25052 nr=34
P: about to send SIGUSR1 to child...
C: sigusr1_handler handler invoked
C: got pdeath signal on parent termination
C: about to terminate
/* Modified version of code from Jann Horn */
#define _GNU_SOURCE
#include <stdio.h>
#include <signal.h>
#include <err.h>
#include <errno.h>
#include <unistd.h>
#include <stdlib.h>
#include <sched.h>
#include <stddef.h>
#include <string.h>
#include <limits.h>
#include <inttypes.h>
#include <sys/mman.h>
#include <sys/syscall.h>
#include <sys/ioctl.h>
#include <sys/prctl.h>
#include <linux/seccomp.h>
#include <linux/filter.h>
#include <linux/futex.h>
/*
 * State shared between parent and child.  main() points this at a
 * MAP_SHARED | MAP_ANONYMOUS mapping created before the clone() call.
 */
struct {
/* Initialized to -1 by main(); the child later stores the seccomp
   listener FD number here for the parent to read */
int seccomp_fd;
} *shared;
/*
 * Handler for SIGUSR1: announce that the handler ran.
 *
 * The message is emitted with write(2) instead of printf(3), because stdio
 * functions are not async-signal-safe (see signal-safety(7)).  errno is
 * saved and restored so that the interrupted code never observes a value
 * changed by the handler.  The bytes written are the same as before.
 *
 * None of the arguments are used.
 */
static void
sigusr1_handler(int sig, siginfo_t *info, void *uctx)
{
    static const char msg[] = "C: sigusr1_handler handler invoked\n";
    int saved_errno = errno;

    (void) sig;
    (void) info;
    (void) uctx;

    /* Best-effort diagnostic: there is no sensible recovery if it fails */
    if (write(STDOUT_FILENO, msg, sizeof(msg) - 1) == -1) {
        /* ignored */
    }

    errno = saved_errno;
}
/*
 * Handler for SIGUSR2, which main() requests as the parent-death signal
 * (PR_SET_PDEATHSIG): report the event and terminate the child.
 *
 * Only async-signal-safe functions are used (see signal-safety(7)):
 * write(2) replaces printf(3), and _exit(2) replaces exit(3).  Skipping
 * the stdio flush done by exit() loses nothing here, because main() makes
 * stdout unbuffered.  The bytes written are the same as before.
 *
 * None of the arguments are used.  This function does not return.
 */
static void
sigusr2_handler(int sig, siginfo_t *info, void *uctx)
{
    static const char msg[] =
        "C: got pdeath signal on parent termination\n"
        "C: about to terminate\n";

    (void) sig;
    (void) info;
    (void) uctx;

    /* Best-effort diagnostic: we terminate regardless of the outcome */
    if (write(STDOUT_FILENO, msg, sizeof(msg) - 1) == -1) {
        /* ignored */
    }

    _exit(0);
}
/* Return the larger of two sizes (either one if they are equal). */
static size_t
max_size(size_t a, size_t b)
{
    if (b < a)
        return a;

    return b;
}
/*
 * Scenario B demonstration.
 *
 * The child installs a seccomp filter that returns SECCOMP_RET_USER_NOTIF
 * for pause(), publishes the listener FD number to the parent through
 * shared memory, installs an SA_RESTART handler for SIGUSR1, and then
 * calls pause().
 *
 * The parent performs SECCOMP_IOCTL_NOTIF_RECV on the listener FD four
 * times; after each received notification it sends SIGUSR1 to the child.
 * It never sends a SECCOMP_IOCTL_NOTIF_SEND response.
 */
int
main(void)
{
/* Make stdout unbuffered */
setbuf(stdout, NULL);
/* Allocate memory that will be shared by parent and child */
shared = mmap(NULL, 0x1000, PROT_READ | PROT_WRITE,
MAP_ANONYMOUS | MAP_SHARED, -1, 0);
if (shared == MAP_FAILED)
err(1, "mmap");
/* -1 means "listener FD not yet published"; the parent's FUTEX_WAIT
   below uses the same value as its expected value */
shared->seccomp_fd = -1;
/* glibc's clone() wrapper doesn't support fork()-style usage */
/* Child process and parent share file descriptor table */
pid_t child = syscall(__NR_clone, CLONE_FILES | SIGCHLD,
NULL, NULL, NULL, 0);
if (child == -1)
err(1, "clone");
/* CHILD */
if (child == 0) {
/* don't outlive the parent */
/* NOTE(review): the prctl() return values in this block are not checked */
prctl(PR_SET_PDEATHSIG, SIGUSR2);
/* Presumably covers the case where the parent had already terminated
   (child reparented to PID 1) before PR_SET_PDEATHSIG was set */
if (getppid() == 1)
exit(0);
/* Install seccomp filter */
prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
/* Filter: load the system call number; if it equals __NR_pause, fall
   through to SECCOMP_RET_USER_NOTIF, otherwise skip one instruction
   and return SECCOMP_RET_ALLOW */
struct sock_filter insns[] = {
BPF_STMT(BPF_LD | BPF_W | BPF_ABS,
offsetof(struct seccomp_data, nr)),
BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, __NR_pause, 0, 1),
BPF_STMT(BPF_RET | BPF_K, SECCOMP_RET_USER_NOTIF),
BPF_STMT(BPF_RET | BPF_K, SECCOMP_RET_ALLOW)
};
struct sock_fprog prog = {
.len = sizeof(insns) / sizeof(insns[0]),
.filter = insns
};
/* With SECCOMP_FILTER_FLAG_NEW_LISTENER, the value returned on success
   is treated below as the listener file descriptor */
int seccomp_ret = syscall(__NR_seccomp, SECCOMP_SET_MODE_FILTER,
SECCOMP_FILTER_FLAG_NEW_LISTENER, &prog);
if (seccomp_ret < 0)
err(1, "install");
printf("C: installed seccomp: fd %d\n", seccomp_ret);
/* Place the notifier FD number into the shared memory */
__atomic_store(&shared->seccomp_fd, &seccomp_ret,
__ATOMIC_RELEASE);
/* Wake the parent */
int futex_ret =
syscall(__NR_futex, &shared->seccomp_fd, FUTEX_WAKE,
INT_MAX, NULL, NULL, 0);
printf("C: woke %d waiters\n", futex_ret);
/* Establish SA_RESTART handler for SIGUSR1 */
struct sigaction act = {
.sa_sigaction = sigusr1_handler,
.sa_flags = SA_RESTART | SA_SIGINFO
};
if (sigaction(SIGUSR1, &act, NULL))
err(1, "sigaction");
/* Handler for the parent-death signal requested above */
/* NOTE(review): sa_sigaction is set but SA_SIGINFO is not among the
   flags; the handler ignores its extra arguments, but confirm whether
   SA_SIGINFO was intended */
struct sigaction act2 = {
.sa_sigaction = sigusr2_handler,
.sa_flags = 0
};
if (sigaction(SIGUSR2, &act2, NULL))
err(1, "sigaction");
/* Make a blocking system call */
printf("C: About to call pause()\n");
pause();
/* Reached only if pause() returns */
perror("C: pause returned");
exit(0);
}
/* PARENT */
/* Wait for futex wake-up from child */
/* Expected value is -1, the initial value of seccomp_fd; an EAGAIN
   failure is tolerated (presumably: the child had already stored the
   FD number, so there is nothing to wait for) */
int futex_ret = syscall(__NR_futex, &shared->seccomp_fd, FUTEX_WAIT,
-1, NULL, NULL, 0);
if (futex_ret == -1 && errno != EAGAIN)
err(1, "futex wait");
/* Get notification FD from the child */
int fd = __atomic_load_n(&shared->seccomp_fd, __ATOMIC_ACQUIRE);
printf("\tP: child installed seccomp fd %d\n", fd);
/* Discover seccomp buffer sizes and allocate notification buffer */
struct seccomp_notif_sizes sizes;
if (syscall(__NR_seccomp, SECCOMP_GET_NOTIF_SIZES, 0, &sizes))
err(1, "notif_sizes");
/* Allocate the larger of the compile-time structure size and the size
   reported by the kernel */
struct seccomp_notif *notif =
malloc(max_size(sizeof(struct seccomp_notif),
sizes.seccomp_notif));
if (!notif)
err(1, "malloc");
/* Four rounds of: receive a notification, then signal the child.
   No SECCOMP_IOCTL_NOTIF_SEND response is sent for any notification. */
for (int i = 0; i < 4; i++) {
printf("\tP: about to SECCOMP_IOCTL_NOTIF_RECV\n");
/* Zero the buffer before each receive */
memset(notif, '\0', sizes.seccomp_notif);
if (ioctl(fd, SECCOMP_IOCTL_NOTIF_RECV, notif))
err(1, "notif_recv");
/* NOTE(review): %llu assumes notif->id is unsigned long long -- TODO
   confirm for the target architecture */
printf("\tP: got notif: id=%llu pid=%u nr=%d\n",
notif->id, notif->pid, notif->data.nr);
sleep(1);
printf("\tP: about to send SIGUSR1 to child...\n");
/* NOTE(review): the kill() return value is not checked */
kill(child, SIGUSR1);
}
sleep(1);
exit(0);
}
====
Reported-by: Jann Horn <jannh@google.com>
Signed-off-by: Michael Kerrisk <mtk.manpages@gmail.com>
2020-10-24 12:29:11 +00:00
|
|
|
.\"
|
|
|
|
.SS Interaction with SA_RESTART signal handlers
|
|
|
|
Consider the following scenario:
|
|
|
|
.IP \(bu 2
|
|
|
|
The target process has used
|
|
|
|
.BR sigaction (2)
|
|
|
|
to install a signal handler with the
|
|
|
|
.B SA_RESTART
|
|
|
|
flag.
|
|
|
|
.IP \(bu
|
|
|
|
The target has made a system call that triggered a seccomp
|
|
|
|
user-space notification and the target is currently blocked
|
|
|
|
until the supervisor sends a notification response.
|
|
|
|
.IP \(bu
|
|
|
|
A signal is delivered to the target and the signal handler is executed.
|
|
|
|
.IP \(bu
|
|
|
|
When (if) the supervisor attempts to send a notification response, the
|
|
|
|
.B SECCOMP_IOCTL_NOTIF_SEND
|
|
|
|
.BR ioctl (2)
|
|
|
|
operation will fail with the
|
|
|
|
.BR ENOENT
|
|
|
|
error.
|
|
|
|
.PP
|
|
|
|
In this scenario, the kernel will restart the target's system call.
|
|
|
|
Consequently, the supervisor will receive another user-space notification.
|
|
|
|
Thus, depending on how many times the blocked system call
|
|
|
|
is interrupted by a signal handler,
|
|
|
|
the supervisor may receive multiple notifications for
|
2020-10-26 09:11:09 +00:00
|
|
|
the same instance of a system call in the target.
|
seccomp_unotify.2: Describe the interaction with SA_RESTART signal handlers
And, as noted by Jann Horn, note how the user-space notification
mechanism causes a small breakage in the user-space API with
respect to nonrestartable system calls.
====
From the email discussion with Jann Horn
> >> So, I partially demonstrated what you describe here, for two example
> >> system calls (epoll_wait() and pause()). But I could not exactly
> >> demonstrate things as I understand you to be describing them. (So,
> >> I'm not sure whether I have not understood you correctly, or
> >> if things are not exactly as you describe them.)
> >>
> >> Here's a scenario (A) that I tested:
> >>
> >> 1. Target installs seccomp filters for a blocking syscall
> >> (epoll_wait() or pause(), both of which should never restart,
> >> regardless of SA_RESTART)
> >> 2. Target installs SIGINT handler with SA_RESTART
> >> 3. Supervisor is sleeping (i.e., is not blocked in
> >> SECCOMP_IOCTL_NOTIF_RECV operation).
> >> 4. Target makes a blocking system call (epoll_wait() or pause()).
> >> 5. SIGINT gets delivered to target; handler gets called;
> >> ***and syscall gets restarted by the kernel***
> >>
> >> That last should never happen, of course, and is a result of the
> >> combination of both the user-notify filter and the SA_RESTART flag.
> >> If one or other is not present, then the system call is not
> >> restarted.
> >>
> >> So, as you note below, the UAPI gets broken a little.
> >>
> >> However, from your description above I had understood that
> >> something like the following scenario (B) could occur:
> >>
> >> 1. Target installs seccomp filters for a blocking syscall
> >> (epoll_wait() or pause(), both of which should never restart,
> >> regardless of SA_RESTART)
> >> 2. Target installs SIGINT handler with SA_RESTART
> >> 3. Supervisor performs SECCOMP_IOCTL_NOTIF_RECV operation (which
> >> blocks).
> >> 4. Target makes a blocking system call (epoll_wait() or pause()).
> >> 5. Supervisor gets seccomp user-space notification (i.e.,
> >> SECCOMP_IOCTL_NOTIF_RECV ioctl() returns
> >> 6. SIGINT gets delivered to target; handler gets called;
> >> and syscall gets restarted by the kernel
> >> 7. Supervisor performs another SECCOMP_IOCTL_NOTIF_RECV operation
> >> which gets another notification for the restarted system call.
> >>
> >> However, I don't observe such behavior. In step 6, the syscall
> >> does not get restarted by the kernel, but instead returns -1/EINTR.
> >> Perhaps I have misconstructed my experiment in the second case, or
> >> perhaps I've misunderstood what you meant, or is it possibly the
> >> case that things are not quite as you said?
>
> Thanks for the code, Jann (including the demo of the CLONE_FILES
> technique to pass the notification FD to the supervisor).
>
> But I think your code just demonstrates what I described in
> scenario A. So, it seems that I both understood what you
> meant (because my code demonstrates the same thing) and
> also misunderstood what you said (because I thought you
> were meaning something more like scenario B).
Ahh, sorry, I should've read your mail more carefully. Indeed, that
testcase only shows scenario A. But the following shows scenario B...
[Below, two pieces of code from Jann, with a lot of
cosmetic changes by mtk.]
====
[And from a follow-up in the same email thread:]
> If userspace relies on non-restarting behavior, it should be using
> something like epoll_pwait(). And that stuff only unblocks signals
> after we've already past the seccomp checks on entry.
Thanks for elaborating that detail, since as soon as you talked
about "enlarging a preexisting race" above, I immediately wondered about
sigsuspend(), pselect(), etc.
(Mind you, I still wonder about the effect on system calls that
are normally nonrestartable because they have timeouts. My
understanding is that the kernel doesn't restart those system
calls because it's impossible for the kernel to restart the call
with the right timeout value. I wonder what happens when those
system calls are restarted in the scenario we're discussing.)
Anyway, returning to your point... So, to be clear (and to
quickly remind myself in case I one day reread this thread),
there is not a problem with sigsuspend(), pselect(), ppoll(),
and epoll_pwait() since:
* Before the syscall, signals are blocked in the target.
* Inside the syscall, signals are still blocked at the time
the check is made for seccomp filters.
* If a seccomp user-space notification event kicks in, the target
is put to sleep with the signals still blocked.
* The signal will only get delivered after the supervisor either
triggers a spoofed success/failure return in the target or the
supervisor sends a CONTINUE response to the kernel telling it
to execute the target's system call. Either way, there won't be
any restarting of the target's system call (and the supervisor
thus won't see multiple notifications).
====
Scenario A
$ ./seccomp_unotify_restart_scen_A
C: installed seccomp: fd 3
C: woke 1 waiters
P: child installed seccomp fd 3
C: About to call pause(): Success
P: going to send SIGUSR1...
C: sigusr1_handler handler invoked
P: about to terminate
C: got pdeath signal on parent termination
C: about to terminate
/* Modified version of code from Jann Horn */
#define _GNU_SOURCE
#include <stdio.h>
#include <signal.h>
#include <err.h>
#include <errno.h>
#include <unistd.h>
#include <stdlib.h>
#include <sched.h>
#include <stddef.h>
#include <limits.h>
#include <sys/mman.h>
#include <sys/syscall.h>
#include <sys/prctl.h>
#include <linux/seccomp.h>
#include <linux/filter.h>
#include <linux/futex.h>
/*
 * State shared between parent and child.  main() points this at a
 * MAP_SHARED | MAP_ANONYMOUS mapping created before the clone() call.
 */
struct {
/* Initialized to -1 by main(); the child later stores the seccomp
   listener FD number here for the parent to read */
int seccomp_fd;
} *shared;
/*
 * Handler for SIGUSR1: announce that the handler ran.
 *
 * The message is emitted with write(2) instead of printf(3), because stdio
 * functions are not async-signal-safe (see signal-safety(7)).  errno is
 * saved and restored so that the interrupted code never observes a value
 * changed by the handler.  The bytes written are the same as before.
 *
 * None of the arguments are used.
 */
static void
sigusr1_handler(int sig, siginfo_t *info, void *uctx)
{
    static const char msg[] = "C: sigusr1_handler handler invoked\n";
    int saved_errno = errno;

    (void) sig;
    (void) info;
    (void) uctx;

    /* Best-effort diagnostic: there is no sensible recovery if it fails */
    if (write(STDOUT_FILENO, msg, sizeof(msg) - 1) == -1) {
        /* ignored */
    }

    errno = saved_errno;
}
/*
 * Handler for SIGUSR2, which main() requests as the parent-death signal
 * (PR_SET_PDEATHSIG): report the event and terminate the child.
 *
 * Only async-signal-safe functions are used (see signal-safety(7)):
 * write(2) replaces printf(3), and _exit(2) replaces exit(3).  Skipping
 * the stdio flush done by exit() loses nothing here, because main() makes
 * stdout unbuffered.  The bytes written are the same as before.
 *
 * None of the arguments are used.  This function does not return.
 */
static void
sigusr2_handler(int sig, siginfo_t *info, void *uctx)
{
    static const char msg[] =
        "C: got pdeath signal on parent termination\n"
        "C: about to terminate\n";

    (void) sig;
    (void) info;
    (void) uctx;

    /* Best-effort diagnostic: we terminate regardless of the outcome */
    if (write(STDOUT_FILENO, msg, sizeof(msg) - 1) == -1) {
        /* ignored */
    }

    _exit(0);
}
/*
 * Scenario A demonstration.
 *
 * The child installs a seccomp filter that returns SECCOMP_RET_USER_NOTIF
 * for pause(), publishes the listener FD number to the parent through
 * shared memory, installs an SA_RESTART handler for SIGUSR1, and then
 * calls pause().
 *
 * The parent never performs SECCOMP_IOCTL_NOTIF_RECV on the listener FD:
 * it only prints the FD number, sleeps, sends SIGUSR1 to the child,
 * sleeps again, and exits.
 */
int
main(void)
{
/* Make stdout unbuffered */
setbuf(stdout, NULL);
/* Allocate memory that will be shared by parent and child */
shared = mmap(NULL, 0x1000, PROT_READ | PROT_WRITE,
MAP_ANONYMOUS | MAP_SHARED, -1, 0);
if (shared == MAP_FAILED)
err(1, "mmap");
/* -1 means "listener FD not yet published"; the parent's FUTEX_WAIT
   below uses the same value as its expected value */
shared->seccomp_fd = -1;
/* glibc's clone() wrapper doesn't support fork()-style usage */
/* Child process and parent share file descriptor table */
pid_t child = syscall(__NR_clone, CLONE_FILES | SIGCHLD,
NULL, NULL, NULL, 0);
if (child == -1)
err(1, "clone");
/* CHILD */
if (child == 0) {
/* don't outlive the parent */
/* NOTE(review): the prctl() return values in this block are not checked */
prctl(PR_SET_PDEATHSIG, SIGUSR2);
/* Presumably covers the case where the parent had already terminated
   (child reparented to PID 1) before PR_SET_PDEATHSIG was set */
if (getppid() == 1)
exit(0);
/* Install seccomp filter */
prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
/* Filter: load the system call number; if it equals __NR_pause, fall
   through to SECCOMP_RET_USER_NOTIF, otherwise skip one instruction
   and return SECCOMP_RET_ALLOW */
struct sock_filter insns[] = {
BPF_STMT(BPF_LD | BPF_W | BPF_ABS,
offsetof(struct seccomp_data, nr)),
BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, __NR_pause, 0, 1),
BPF_STMT(BPF_RET | BPF_K, SECCOMP_RET_USER_NOTIF),
BPF_STMT(BPF_RET | BPF_K, SECCOMP_RET_ALLOW)
};
struct sock_fprog prog = {
.len = sizeof(insns) / sizeof(insns[0]),
.filter = insns
};
/* With SECCOMP_FILTER_FLAG_NEW_LISTENER, the value returned on success
   is treated below as the listener file descriptor */
int seccomp_ret = syscall(__NR_seccomp, SECCOMP_SET_MODE_FILTER,
SECCOMP_FILTER_FLAG_NEW_LISTENER, &prog);
if (seccomp_ret < 0)
err(1, "install");
printf("C: installed seccomp: fd %d\n", seccomp_ret);
/* Place the notifier FD number into the shared memory */
__atomic_store(&shared->seccomp_fd, &seccomp_ret,
__ATOMIC_RELEASE);
/* Wake the parent */
int futex_ret =
syscall(__NR_futex, &shared->seccomp_fd, FUTEX_WAKE,
INT_MAX, NULL, NULL, 0);
printf("C: woke %d waiters\n", futex_ret);
/* Establish SA_RESTART handler for SIGUSR1 */
struct sigaction act = {
.sa_sigaction = sigusr1_handler,
.sa_flags = SA_RESTART | SA_SIGINFO
};
if (sigaction(SIGUSR1, &act, NULL))
err(1, "sigaction");
/* Handler for the parent-death signal requested above */
/* NOTE(review): sa_sigaction is set but SA_SIGINFO is not among the
   flags; the handler ignores its extra arguments, but confirm whether
   SA_SIGINFO was intended */
struct sigaction act2 = {
.sa_sigaction = sigusr2_handler,
.sa_flags = 0
};
if (sigaction(SIGUSR2, &act2, NULL))
err(1, "sigaction");
/* Make a blocking system call */
/* NOTE(review): perror() is used for an informational message, so the
   text for the current errno value is appended to it */
perror("C: About to call pause()");
pause();
/* Reached only if pause() returns */
perror("C: pause returned");
exit(0);
}
/* PARENT */
/* Wait for futex wake-up from child */
/* Expected value is -1, the initial value of seccomp_fd; an EAGAIN
   failure is tolerated (presumably: the child had already stored the
   FD number, so there is nothing to wait for) */
int futex_ret = syscall(__NR_futex, &shared->seccomp_fd, FUTEX_WAIT,
-1, NULL, NULL, 0);
if (futex_ret == -1 && errno != EAGAIN)
err(1, "futex wait");
/* Get notification FD from the child */
int fd = __atomic_load_n(&shared->seccomp_fd, __ATOMIC_ACQUIRE);
printf("\tP: child installed seccomp fd %d\n", fd);
/* In this scenario the parent does nothing further with the listener
   FD; it just sleeps before signaling the child */
sleep(1);
printf("\tP: going to send SIGUSR1...\n");
/* NOTE(review): the kill() return value is not checked */
kill(child, SIGUSR1);
sleep(1);
printf("\tP: about to terminate\n");
exit(0);
}
====
Scenario B
$ ./seccomp_unotify_restart_scen_B
C: installed seccomp: fd 3
C: woke 1 waiters
C: About to call pause()
P: child installed seccomp fd 3
P: about to SECCOMP_IOCTL_NOTIF_RECV
P: got notif: id=17773741941218455591 pid=25052 nr=34
P: about to send SIGUSR1 to child...
P: about to SECCOMP_IOCTL_NOTIF_RECV
C: sigusr1_handler handler invoked
P: got notif: id=17773741941218455592 pid=25052 nr=34
P: about to send SIGUSR1 to child...
P: about to SECCOMP_IOCTL_NOTIF_RECV
C: sigusr1_handler handler invoked
P: got notif: id=17773741941218455593 pid=25052 nr=34
P: about to send SIGUSR1 to child...
P: about to SECCOMP_IOCTL_NOTIF_RECV
C: sigusr1_handler handler invoked
P: got notif: id=17773741941218455594 pid=25052 nr=34
P: about to send SIGUSR1 to child...
C: sigusr1_handler handler invoked
C: got pdeath signal on parent termination
C: about to terminate
/* Modified version of code from Jann Horn */
#define _GNU_SOURCE
#include <stdio.h>
#include <signal.h>
#include <err.h>
#include <errno.h>
#include <unistd.h>
#include <stdlib.h>
#include <sched.h>
#include <stddef.h>
#include <string.h>
#include <limits.h>
#include <inttypes.h>
#include <sys/mman.h>
#include <sys/syscall.h>
#include <sys/ioctl.h>
#include <sys/prctl.h>
#include <linux/seccomp.h>
#include <linux/filter.h>
#include <linux/futex.h>
/*
 * State shared between parent and child.  main() points this at a
 * MAP_SHARED | MAP_ANONYMOUS mapping created before the clone() call.
 */
struct {
/* Initialized to -1 by main(); the child later stores the seccomp
   listener FD number here for the parent to read */
int seccomp_fd;
} *shared;
/*
 * Handler for SIGUSR1: announce that the handler ran.
 *
 * The message is emitted with write(2) instead of printf(3), because stdio
 * functions are not async-signal-safe (see signal-safety(7)).  errno is
 * saved and restored so that the interrupted code never observes a value
 * changed by the handler.  The bytes written are the same as before.
 *
 * None of the arguments are used.
 */
static void
sigusr1_handler(int sig, siginfo_t *info, void *uctx)
{
    static const char msg[] = "C: sigusr1_handler handler invoked\n";
    int saved_errno = errno;

    (void) sig;
    (void) info;
    (void) uctx;

    /* Best-effort diagnostic: there is no sensible recovery if it fails */
    if (write(STDOUT_FILENO, msg, sizeof(msg) - 1) == -1) {
        /* ignored */
    }

    errno = saved_errno;
}
/*
 * Handler for SIGUSR2, which main() requests as the parent-death signal
 * (PR_SET_PDEATHSIG): report the event and terminate the child.
 *
 * Only async-signal-safe functions are used (see signal-safety(7)):
 * write(2) replaces printf(3), and _exit(2) replaces exit(3).  Skipping
 * the stdio flush done by exit() loses nothing here, because main() makes
 * stdout unbuffered.  The bytes written are the same as before.
 *
 * None of the arguments are used.  This function does not return.
 */
static void
sigusr2_handler(int sig, siginfo_t *info, void *uctx)
{
    static const char msg[] =
        "C: got pdeath signal on parent termination\n"
        "C: about to terminate\n";

    (void) sig;
    (void) info;
    (void) uctx;

    /* Best-effort diagnostic: we terminate regardless of the outcome */
    if (write(STDOUT_FILENO, msg, sizeof(msg) - 1) == -1) {
        /* ignored */
    }

    _exit(0);
}
/* Return the larger of two sizes (either one if they are equal). */
static size_t
max_size(size_t a, size_t b)
{
    if (b < a)
        return a;

    return b;
}
/*
 * Scenario B demonstration.
 *
 * The child installs a seccomp filter that returns SECCOMP_RET_USER_NOTIF
 * for pause(), publishes the listener FD number to the parent through
 * shared memory, installs an SA_RESTART handler for SIGUSR1, and then
 * calls pause().
 *
 * The parent performs SECCOMP_IOCTL_NOTIF_RECV on the listener FD four
 * times; after each received notification it sends SIGUSR1 to the child.
 * It never sends a SECCOMP_IOCTL_NOTIF_SEND response.
 */
int
main(void)
{
/* Make stdout unbuffered */
setbuf(stdout, NULL);
/* Allocate memory that will be shared by parent and child */
shared = mmap(NULL, 0x1000, PROT_READ | PROT_WRITE,
MAP_ANONYMOUS | MAP_SHARED, -1, 0);
if (shared == MAP_FAILED)
err(1, "mmap");
/* -1 means "listener FD not yet published"; the parent's FUTEX_WAIT
   below uses the same value as its expected value */
shared->seccomp_fd = -1;
/* glibc's clone() wrapper doesn't support fork()-style usage */
/* Child process and parent share file descriptor table */
pid_t child = syscall(__NR_clone, CLONE_FILES | SIGCHLD,
NULL, NULL, NULL, 0);
if (child == -1)
err(1, "clone");
/* CHILD */
if (child == 0) {
/* don't outlive the parent */
/* NOTE(review): the prctl() return values in this block are not checked */
prctl(PR_SET_PDEATHSIG, SIGUSR2);
/* Presumably covers the case where the parent had already terminated
   (child reparented to PID 1) before PR_SET_PDEATHSIG was set */
if (getppid() == 1)
exit(0);
/* Install seccomp filter */
prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
/* Filter: load the system call number; if it equals __NR_pause, fall
   through to SECCOMP_RET_USER_NOTIF, otherwise skip one instruction
   and return SECCOMP_RET_ALLOW */
struct sock_filter insns[] = {
BPF_STMT(BPF_LD | BPF_W | BPF_ABS,
offsetof(struct seccomp_data, nr)),
BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, __NR_pause, 0, 1),
BPF_STMT(BPF_RET | BPF_K, SECCOMP_RET_USER_NOTIF),
BPF_STMT(BPF_RET | BPF_K, SECCOMP_RET_ALLOW)
};
struct sock_fprog prog = {
.len = sizeof(insns) / sizeof(insns[0]),
.filter = insns
};
/* With SECCOMP_FILTER_FLAG_NEW_LISTENER, the value returned on success
   is treated below as the listener file descriptor */
int seccomp_ret = syscall(__NR_seccomp, SECCOMP_SET_MODE_FILTER,
SECCOMP_FILTER_FLAG_NEW_LISTENER, &prog);
if (seccomp_ret < 0)
err(1, "install");
printf("C: installed seccomp: fd %d\n", seccomp_ret);
/* Place the notifier FD number into the shared memory */
__atomic_store(&shared->seccomp_fd, &seccomp_ret,
__ATOMIC_RELEASE);
/* Wake the parent */
int futex_ret =
syscall(__NR_futex, &shared->seccomp_fd, FUTEX_WAKE,
INT_MAX, NULL, NULL, 0);
printf("C: woke %d waiters\n", futex_ret);
/* Establish SA_RESTART handler for SIGUSR1 */
struct sigaction act = {
.sa_sigaction = sigusr1_handler,
.sa_flags = SA_RESTART | SA_SIGINFO
};
if (sigaction(SIGUSR1, &act, NULL))
err(1, "sigaction");
/* Handler for the parent-death signal requested above */
/* NOTE(review): sa_sigaction is set but SA_SIGINFO is not among the
   flags; the handler ignores its extra arguments, but confirm whether
   SA_SIGINFO was intended */
struct sigaction act2 = {
.sa_sigaction = sigusr2_handler,
.sa_flags = 0
};
if (sigaction(SIGUSR2, &act2, NULL))
err(1, "sigaction");
/* Make a blocking system call */
printf("C: About to call pause()\n");
pause();
/* Reached only if pause() returns */
perror("C: pause returned");
exit(0);
}
/* PARENT */
/* Wait for futex wake-up from child */
/* Expected value is -1, the initial value of seccomp_fd; an EAGAIN
   failure is tolerated (presumably: the child had already stored the
   FD number, so there is nothing to wait for) */
int futex_ret = syscall(__NR_futex, &shared->seccomp_fd, FUTEX_WAIT,
-1, NULL, NULL, 0);
if (futex_ret == -1 && errno != EAGAIN)
err(1, "futex wait");
/* Get notification FD from the child */
int fd = __atomic_load_n(&shared->seccomp_fd, __ATOMIC_ACQUIRE);
printf("\tP: child installed seccomp fd %d\n", fd);
/* Discover seccomp buffer sizes and allocate notification buffer */
struct seccomp_notif_sizes sizes;
if (syscall(__NR_seccomp, SECCOMP_GET_NOTIF_SIZES, 0, &sizes))
err(1, "notif_sizes");
/* Allocate the larger of the compile-time structure size and the size
   reported by the kernel */
struct seccomp_notif *notif =
malloc(max_size(sizeof(struct seccomp_notif),
sizes.seccomp_notif));
if (!notif)
err(1, "malloc");
/* Four rounds of: receive a notification, then signal the child.
   No SECCOMP_IOCTL_NOTIF_SEND response is sent for any notification. */
for (int i = 0; i < 4; i++) {
printf("\tP: about to SECCOMP_IOCTL_NOTIF_RECV\n");
/* Zero the buffer before each receive */
memset(notif, '\0', sizes.seccomp_notif);
if (ioctl(fd, SECCOMP_IOCTL_NOTIF_RECV, notif))
err(1, "notif_recv");
/* NOTE(review): %llu assumes notif->id is unsigned long long -- TODO
   confirm for the target architecture */
printf("\tP: got notif: id=%llu pid=%u nr=%d\n",
notif->id, notif->pid, notif->data.nr);
sleep(1);
printf("\tP: about to send SIGUSR1 to child...\n");
/* NOTE(review): the kill() return value is not checked */
kill(child, SIGUSR1);
}
sleep(1);
exit(0);
}
====
Reported-by: Jann Horn <jannh@google.com>
Signed-off-by: Michael Kerrisk <mtk.manpages@gmail.com>
2020-10-24 12:29:11 +00:00
|
|
|
.PP
|
|
|
|
One oddity is that system call restarting as described in this scenario
|
|
|
|
will occur even for the blocking system calls listed in
|
|
|
|
.BR signal (7)
|
|
|
|
that would
|
|
|
|
.B never
|
|
|
|
normally be restarted by the
|
|
|
|
.BR SA_RESTART
|
|
|
|
flag.
|
2020-10-26 09:11:09 +00:00
|
|
|
.\" FIXME
|
|
|
|
.\" About the above, Kees Cook commented:
|
|
|
|
.\"
|
|
|
|
.\" Does this need fixing? I imagine the correct behavior for this case
|
|
|
|
.\" would be a response to _SEND of EINPROGRESS and the target would see
|
|
|
|
.\" EINTR normally?
|
|
|
|
.\"
|
|
|
|
.\" I mean, it's not like seccomp doesn't already expose weirdness with
|
|
|
|
.\" syscall restarts. Not even arm64 compat agrees[3] with arm32 in this
|
|
|
|
.\" regard. :(
|
2020-10-26 09:45:24 +00:00
|
|
|
.
|
|
|
|
.\" FIXME
|
|
|
|
.\" Michael Kerrisk:
|
|
|
|
.\" I wonder about the effect of this oddity for system calls that
|
|
|
|
.\" are normally nonrestartable because they have timeouts. My
|
|
|
|
.\" understanding is that the kernel doesn't restart those system
|
|
|
|
.\" calls because it's impossible for the kernel to restart the call
|
|
|
|
.\" with the right timeout value. I wonder what happens when those
|
|
|
|
.\" system calls are restarted in the scenario we're discussing.)
|
2020-09-30 20:32:46 +00:00
|
|
|
.SH BUGS
|
|
|
|
If a
|
|
|
|
.BR SECCOMP_IOCTL_NOTIF_RECV
|
|
|
|
.BR ioctl (2)
|
2020-10-01 09:33:16 +00:00
|
|
|
operation
|
|
|
|
.\" or a poll/epoll/select
|
|
|
|
is performed after the target terminates, then the
|
2020-09-30 20:32:46 +00:00
|
|
|
.BR ioctl (2)
|
|
|
|
call simply blocks (rather than returning an error to indicate that the
|
2020-10-01 09:33:16 +00:00
|
|
|
target no longer exists).
|
2020-10-26 09:11:09 +00:00
|
|
|
.\" FIXME
|
|
|
|
.\" Comment from Kees Cook:
|
|
|
|
.\"
|
|
|
|
.\" I want this fixed. It caused me no end of pain when building the
|
|
|
|
.\" selftests, and ended up spawning my implementing a global test timeout
|
|
|
|
.\" in kselftest. :P Before the usage counter refactor, there was no sane
|
|
|
|
.\" way to deal with this, but now I think we're close.
|
|
|
|
.\"
|
2020-09-28 20:13:12 +00:00
|
|
|
.SH EXAMPLES
|
|
|
|
The (somewhat contrived) program shown below demonstrates the use of
|
|
|
|
the interfaces described in this page.
|
|
|
|
The program creates a child process that serves as the "target" process.
|
|
|
|
The child process installs a seccomp filter that returns the
|
|
|
|
.B SECCOMP_RET_USER_NOTIF
|
|
|
|
action value if a call is made to
|
|
|
|
.BR mkdir (2).
|
|
|
|
The child process then calls
|
|
|
|
.BR mkdir (2)
|
|
|
|
once for each of the supplied command-line arguments,
|
|
|
|
and reports the result returned by the call.
|
|
|
|
After processing all arguments, the child process terminates.
|
|
|
|
.PP
|
|
|
|
The parent process acts as the supervisor, listening for the notifications
|
|
|
|
that are generated when the target process calls
|
|
|
|
.BR mkdir (2).
|
|
|
|
When such a notification occurs,
|
|
|
|
the supervisor examines the memory of the target process (using
|
|
|
|
.IR /proc/[pid]/mem )
|
|
|
|
to discover the pathname argument that was supplied to the
|
|
|
|
.BR mkdir (2)
|
|
|
|
call, and performs one of the following actions:
|
|
|
|
.IP \(bu 2
|
|
|
|
If the pathname begins with the prefix "/tmp/",
|
|
|
|
then the supervisor attempts to create the specified directory,
|
|
|
|
and then spoofs a return for the target process based on the return
|
|
|
|
value of the supervisor's
|
|
|
|
.BR mkdir (2)
|
|
|
|
call.
|
|
|
|
If that call succeeds,
|
|
|
|
the spoofed success return value is the length of the pathname.
|
|
|
|
.IP \(bu
|
|
|
|
If the pathname begins with "./" (i.e., it is a relative pathname),
|
|
|
|
the supervisor sends a
|
|
|
|
.B SECCOMP_USER_NOTIF_FLAG_CONTINUE
|
|
|
|
response to the kernel to say that the kernel should execute
|
|
|
|
the target process's
|
|
|
|
.BR mkdir (2)
|
|
|
|
call.
|
|
|
|
.IP \(bu
|
|
|
|
If the pathname begins with some other prefix,
|
|
|
|
the supervisor spoofs an error return for the target process,
|
|
|
|
so that the target process's
|
|
|
|
.BR mkdir (2)
|
|
|
|
call appears to fail with the error
|
|
|
|
.BR EOPNOTSUPP
|
|
|
|
("Operation not supported").
|
|
|
|
Additionally, if the specified pathname is exactly "/bye",
|
|
|
|
then the supervisor terminates.
|
|
|
|
.PP
|
2020-10-01 09:33:16 +00:00
|
|
|
This program can be used to demonstrate various aspects of the
|
2020-09-28 20:13:12 +00:00
|
|
|
behavior of the seccomp user-space notification mechanism.
|
|
|
|
To aid such demonstrations,
|
|
|
|
the program logs various messages to show the operation
|
|
|
|
of the target process (lines prefixed "T:") and the supervisor
|
|
|
|
(indented lines prefixed "S:").
|
|
|
|
.PP
|
|
|
|
In the following example, the target attempts to create the directory
|
|
|
|
.IR /tmp/x .
|
|
|
|
Upon receiving the notification, the supervisor creates the directory on the
|
|
|
|
target's behalf,
|
|
|
|
and spoofs a success return to be received by the target process's
|
|
|
|
.BR mkdir (2)
|
|
|
|
call.
|
|
|
|
.PP
|
|
|
|
.in +4n
|
|
|
|
.EX
|
|
|
|
$ \fB./seccomp_unotify /tmp/x\fP
|
|
|
|
T: PID = 23168
|
|
|
|
|
|
|
|
T: about to mkdir("/tmp/x")
|
|
|
|
S: got notification (ID 0x17445c4a0f4e0e3c) for PID 23168
|
|
|
|
S: executing: mkdir("/tmp/x", 0700)
|
|
|
|
S: success! spoofed return = 6
|
|
|
|
S: sending response (flags = 0; val = 6; error = 0)
|
|
|
|
T: SUCCESS: mkdir(2) returned 6
|
|
|
|
|
|
|
|
T: terminating
|
|
|
|
S: target has terminated; bye
|
|
|
|
.EE
|
|
|
|
.in
|
|
|
|
.PP
|
|
|
|
In the above output, note that the spoofed return value seen by the target
|
|
|
|
process is 6 (the length of the pathname
|
|
|
|
.IR /tmp/x ),
|
|
|
|
whereas a normal
|
|
|
|
.BR mkdir (2)
|
|
|
|
call returns 0 on success.
|
|
|
|
.PP
|
|
|
|
In the next example, the target attempts to create a directory using the
|
|
|
|
relative pathname
|
|
|
|
.IR ./sub .
|
|
|
|
Since this pathname starts with "./",
|
|
|
|
the supervisor sends a
|
|
|
|
.B SECCOMP_USER_NOTIF_FLAG_CONTINUE
|
|
|
|
response to the kernel,
|
|
|
|
and the kernel then (successfully) executes the target process's
|
|
|
|
.BR mkdir (2)
|
|
|
|
call.
|
|
|
|
.PP
|
|
|
|
.in +4n
|
|
|
|
.EX
|
|
|
|
$ \fB./seccomp_unotify ./sub\fP
|
|
|
|
T: PID = 23204
|
|
|
|
|
|
|
|
T: about to mkdir("./sub")
|
|
|
|
S: got notification (ID 0xddb16abe25b4c12) for PID 23204
|
|
|
|
S: target can execute system call
|
|
|
|
S: sending response (flags = 0x1; val = 0; error = 0)
|
|
|
|
T: SUCCESS: mkdir(2) returned 0
|
|
|
|
|
|
|
|
T: terminating
|
|
|
|
S: target has terminated; bye
|
|
|
|
.EE
|
|
|
|
.in
|
|
|
|
.PP
|
|
|
|
If the target process attempts to create a directory with
|
|
|
|
a pathname that doesn't start with "./" and doesn't begin with the prefix
|
|
|
|
"/tmp/", then the supervisor spoofs an error return
|
|
|
|
.RB ( EOPNOTSUPP ,
|
|
|
|
"Operation not supported")
|
|
|
|
for the target's
|
|
|
|
.BR mkdir (2)
|
|
|
|
call (which is not executed):
|
|
|
|
.PP
|
|
|
|
.in +4n
|
|
|
|
.EX
|
|
|
|
$ \fB./seccomp_unotify /xxx\fP
|
|
|
|
T: PID = 23178
|
|
|
|
|
|
|
|
T: about to mkdir("/xxx")
|
|
|
|
S: got notification (ID 0xe7dc095d1c524e80) for PID 23178
|
|
|
|
S: spoofing error response (Operation not supported)
|
|
|
|
S: sending response (flags = 0; val = 0; error = \-95)
|
|
|
|
T: ERROR: mkdir(2): Operation not supported
|
|
|
|
|
|
|
|
T: terminating
|
|
|
|
S: target has terminated; bye
|
|
|
|
.EE
|
|
|
|
.in
|
|
|
|
.PP
|
|
|
|
In the next example,
|
|
|
|
the target process attempts to create a directory with the pathname
|
|
|
|
.IR /tmp/nosuchdir/b .
|
|
|
|
Upon receiving the notification,
|
|
|
|
the supervisor attempts to create that directory, but the
|
|
|
|
.BR mkdir (2)
|
|
|
|
call fails because the directory
|
|
|
|
.I /tmp/nosuchdir
|
|
|
|
does not exist.
|
|
|
|
Consequently, the supervisor spoofs an error return that passes the error
|
|
|
|
that it received back to the target process's
|
|
|
|
.BR mkdir (2)
|
|
|
|
call.
|
|
|
|
.PP
|
|
|
|
.in +4n
|
|
|
|
.EX
|
|
|
|
$ \fB./seccomp_unotify /tmp/nosuchdir/b\fP
|
|
|
|
T: PID = 23199
|
|
|
|
|
|
|
|
T: about to mkdir("/tmp/nosuchdir/b")
|
|
|
|
S: got notification (ID 0x8744454293506046) for PID 23199
|
|
|
|
S: executing: mkdir("/tmp/nosuchdir/b", 0700)
|
|
|
|
S: failure! (errno = 2; No such file or directory)
|
|
|
|
S: sending response (flags = 0; val = 0; error = \-2)
|
|
|
|
T: ERROR: mkdir(2): No such file or directory
|
|
|
|
|
|
|
|
T: terminating
|
|
|
|
S: target has terminated; bye
|
|
|
|
.EE
|
|
|
|
.in
|
|
|
|
.PP
|
|
|
|
If the supervisor receives a notification and sees that the
|
|
|
|
argument of the target's
|
|
|
|
.BR mkdir (2)
|
|
|
|
is the string "/bye", then (as well as spoofing an
|
|
|
|
.B EOPNOTSUPP
|
|
|
|
error), the supervisor terminates.
|
|
|
|
If the target process subsequently executes another
|
|
|
|
.BR mkdir (2)
|
|
|
|
that triggers its seccomp filter to return the
|
|
|
|
.B SECCOMP_RET_USER_NOTIF
|
|
|
|
action value, then the kernel causes the target process's system call to
|
|
|
|
fail with the error
|
|
|
|
.B ENOSYS
|
|
|
|
("Function not implemented").
|
|
|
|
This is demonstrated by the following example:
|
|
|
|
.PP
|
|
|
|
.in +4n
|
|
|
|
.EX
|
|
|
|
$ \fB./seccomp_unotify /bye /tmp/y\fP
|
|
|
|
T: PID = 23185
|
|
|
|
|
|
|
|
T: about to mkdir("/bye")
|
|
|
|
S: got notification (ID 0xa81236b1d2f7b0f4) for PID 23185
|
|
|
|
S: spoofing error response (Operation not supported)
|
|
|
|
S: sending response (flags = 0; val = 0; error = \-95)
|
|
|
|
S: terminating **********
|
|
|
|
T: ERROR: mkdir(2): Operation not supported
|
|
|
|
|
|
|
|
T: about to mkdir("/tmp/y")
|
|
|
|
T: ERROR: mkdir(2): Function not implemented
|
|
|
|
|
|
|
|
T: terminating
|
|
|
|
.EE
|
|
|
|
.in
|
|
|
|
.\"
|
|
|
|
.SS Program source
|
|
|
|
.EX
|
|
|
|
#define _GNU_SOURCE
|
|
|
|
#include <errno.h>
|
|
|
|
#include <fcntl.h>
|
|
|
|
#include <limits.h>
|
|
|
|
#include <linux/audit.h>
|
|
|
|
#include <linux/filter.h>
|
|
|
|
#include <linux/seccomp.h>
|
|
|
|
#include <signal.h>
|
|
|
|
#include <stdbool.h>
|
|
|
|
#include <stddef.h>
|
|
|
|
#include <stdint.h>
|
|
|
|
#include <stdio.h>
|
|
|
|
#include <stdlib.h>
|
|
|
|
#include <sys/socket.h>
|
|
|
|
#include <sys/ioctl.h>
|
|
|
|
#include <sys/prctl.h>
|
|
|
|
#include <sys/stat.h>
|
|
|
|
#include <sys/types.h>
|
|
|
|
#include <sys/un.h>
|
|
|
|
#include <sys/syscall.h>
|
|
|
|
#include <unistd.h>
|
|
|
|
|
|
|
|
#define errExit(msg) do { perror(msg); exit(EXIT_FAILURE); \e
|
|
|
|
} while (0)
|
|
|
|
|
|
|
|
/* Send the file descriptor \(aqfd\(aq over the connected UNIX domain socket
|
|
|
|
\(aqsockfd\(aq. Returns 0 on success, or \-1 on error. */
|
|
|
|
|
|
|
|
static int
|
|
|
|
sendfd(int sockfd, int fd)
|
|
|
|
{
|
|
|
|
struct msghdr msgh;
|
|
|
|
struct iovec iov;
|
|
|
|
int data;
|
|
|
|
struct cmsghdr *cmsgp;
|
|
|
|
|
|
|
|
/* Allocate a char array of suitable size to hold the ancillary data.
|
|
|
|
However, since this buffer is in reality a \(aqstruct cmsghdr\(aq, use a
|
2020-10-01 09:33:16 +00:00
|
|
|
union to ensure that it is suitably aligned. */
|
2020-09-28 20:13:12 +00:00
|
|
|
union {
|
|
|
|
char buf[CMSG_SPACE(sizeof(int))];
|
|
|
|
/* Space large enough to hold an \(aqint\(aq */
|
|
|
|
struct cmsghdr align;
|
|
|
|
} controlMsg;
|
|
|
|
|
|
|
|
/* The \(aqmsg_name\(aq field can be used to specify the address of the
|
|
|
|
destination socket when sending a datagram. However, we do not
|
|
|
|
need to use this field because \(aqsockfd\(aq is a connected socket. */
|
|
|
|
|
|
|
|
msgh.msg_name = NULL;
|
|
|
|
msgh.msg_namelen = 0;
|
|
|
|
|
|
|
|
/* On Linux, we must transmit at least one byte of real data in
|
|
|
|
order to send ancillary data. We transmit an arbitrary integer
|
|
|
|
whose value is ignored by recvfd(). */
|
|
|
|
|
|
|
|
msgh.msg_iov = &iov;
|
|
|
|
msgh.msg_iovlen = 1;
|
|
|
|
iov.iov_base = &data;
|
|
|
|
iov.iov_len = sizeof(int);
|
|
|
|
data = 12345;
|
|
|
|
|
|
|
|
/* Set \(aqmsghdr\(aq fields that describe ancillary data */
|
|
|
|
|
|
|
|
msgh.msg_control = controlMsg.buf;
|
|
|
|
msgh.msg_controllen = sizeof(controlMsg.buf);
|
|
|
|
|
|
|
|
/* Set up ancillary data describing file descriptor to send */
|
|
|
|
|
|
|
|
cmsgp = CMSG_FIRSTHDR(&msgh);
|
|
|
|
cmsgp\->cmsg_level = SOL_SOCKET;
|
|
|
|
cmsgp\->cmsg_type = SCM_RIGHTS;
|
|
|
|
cmsgp\->cmsg_len = CMSG_LEN(sizeof(int));
|
|
|
|
memcpy(CMSG_DATA(cmsgp), &fd, sizeof(int));
|
|
|
|
|
|
|
|
/* Send real plus ancillary data */
|
|
|
|
|
|
|
|
if (sendmsg(sockfd, &msgh, 0) == \-1)
|
|
|
|
return \-1;
|
|
|
|
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
/* Receive a file descriptor on a connected UNIX domain socket. Returns
|
|
|
|
the received file descriptor on success, or \-1 on error. */
|
|
|
|
|
|
|
|
static int
|
|
|
|
recvfd(int sockfd)
|
|
|
|
{
|
|
|
|
struct msghdr msgh;
|
|
|
|
struct iovec iov;
|
|
|
|
int data, fd;
|
|
|
|
ssize_t nr;
|
|
|
|
|
|
|
|
/* Allocate a char buffer for the ancillary data. See the comments
|
|
|
|
in sendfd() */
|
|
|
|
union {
|
|
|
|
char buf[CMSG_SPACE(sizeof(int))];
|
|
|
|
struct cmsghdr align;
|
|
|
|
} controlMsg;
|
|
|
|
struct cmsghdr *cmsgp;
|
|
|
|
|
|
|
|
/* The \(aqmsg_name\(aq field can be used to obtain the address of the
|
|
|
|
sending socket. However, we do not need this information. */
|
|
|
|
|
|
|
|
msgh.msg_name = NULL;
|
|
|
|
msgh.msg_namelen = 0;
|
|
|
|
|
|
|
|
/* Specify buffer for receiving real data */
|
|
|
|
|
|
|
|
msgh.msg_iov = &iov;
|
|
|
|
msgh.msg_iovlen = 1;
|
|
|
|
iov.iov_base = &data; /* Real data is an \(aqint\(aq */
|
|
|
|
iov.iov_len = sizeof(int);
|
|
|
|
|
|
|
|
/* Set \(aqmsghdr\(aq fields that describe ancillary data */
|
|
|
|
|
|
|
|
msgh.msg_control = controlMsg.buf;
|
|
|
|
msgh.msg_controllen = sizeof(controlMsg.buf);
|
|
|
|
|
|
|
|
/* Receive real plus ancillary data; real data is ignored */
|
|
|
|
|
|
|
|
nr = recvmsg(sockfd, &msgh, 0);
|
|
|
|
if (nr == \-1)
|
|
|
|
return \-1;
|
|
|
|
|
|
|
|
cmsgp = CMSG_FIRSTHDR(&msgh);
|
|
|
|
|
|
|
|
/* Check the validity of the \(aqcmsghdr\(aq */
|
|
|
|
|
|
|
|
if (cmsgp == NULL ||
|
|
|
|
cmsgp\->cmsg_len != CMSG_LEN(sizeof(int)) ||
|
|
|
|
cmsgp\->cmsg_level != SOL_SOCKET ||
|
|
|
|
cmsgp\->cmsg_type != SCM_RIGHTS) {
|
|
|
|
errno = EINVAL;
|
|
|
|
return \-1;
|
|
|
|
}
|
|
|
|
|
|
|
|
/* Return the received file descriptor to our caller */
|
|
|
|
|
|
|
|
memcpy(&fd, CMSG_DATA(cmsgp), sizeof(int));
|
|
|
|
return fd;
|
|
|
|
}
|
|
|
|
|
|
|
|
static void
|
|
|
|
sigchldHandler(int sig)
|
|
|
|
{
|
2020-10-26 09:11:09 +00:00
|
|
|
char msg[] = "\etS: target has terminated; bye\en";
|
2020-09-28 20:13:12 +00:00
|
|
|
|
2020-10-26 09:11:09 +00:00
|
|
|
write(STDOUT_FILENO, msg, sizeof(msg) \- 1);
|
2020-09-28 20:13:12 +00:00
|
|
|
_exit(EXIT_SUCCESS);
|
|
|
|
}
|
|
|
|
|
|
|
|
static int
|
|
|
|
seccomp(unsigned int operation, unsigned int flags, void *args)
|
|
|
|
{
|
|
|
|
return syscall(__NR_seccomp, operation, flags, args);
|
|
|
|
}
|
|
|
|
|
|
|
|
/* The following is the x86\-64\-specific BPF boilerplate code for checking
|
|
|
|
that the BPF program is running on the right architecture + ABI. At
|
|
|
|
completion of these instructions, the accumulator contains the system
|
|
|
|
call number. */
|
|
|
|
|
|
|
|
/* For the x32 ABI, all system call numbers have bit 30 set */
|
|
|
|
|
|
|
|
#define X32_SYSCALL_BIT 0x40000000
|
|
|
|
|
|
|
|
#define X86_64_CHECK_ARCH_AND_LOAD_SYSCALL_NR \e
|
|
|
|
BPF_STMT(BPF_LD | BPF_W | BPF_ABS, \e
|
|
|
|
(offsetof(struct seccomp_data, arch))), \e
|
|
|
|
BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, AUDIT_ARCH_X86_64, 0, 2), \e
|
|
|
|
BPF_STMT(BPF_LD | BPF_W | BPF_ABS, \e
|
|
|
|
(offsetof(struct seccomp_data, nr))), \e
|
|
|
|
BPF_JUMP(BPF_JMP | BPF_JGE | BPF_K, X32_SYSCALL_BIT, 0, 1), \e
|
|
|
|
BPF_STMT(BPF_RET | BPF_K, SECCOMP_RET_KILL_PROCESS)
|
|
|
|
|
|
|
|
/* installNotifyFilter() installs a seccomp filter that generates
|
|
|
|
user\-space notifications (SECCOMP_RET_USER_NOTIF) when the process
|
|
|
|
calls mkdir(2); the filter allows all other system calls.
|
|
|
|
|
|
|
|
The function return value is a file descriptor from which the
|
|
|
|
user\-space notifications can be fetched. */
|
|
|
|
|
|
|
|
static int
|
|
|
|
installNotifyFilter(void)
|
|
|
|
{
|
|
|
|
struct sock_filter filter[] = {
|
|
|
|
X86_64_CHECK_ARCH_AND_LOAD_SYSCALL_NR,
|
|
|
|
|
|
|
|
/* mkdir() triggers notification to user\-space supervisor */
|
|
|
|
|
|
|
|
BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, __NR_mkdir, 0, 1),
|
|
|
|
BPF_STMT(BPF_RET + BPF_K, SECCOMP_RET_USER_NOTIF),
|
|
|
|
|
|
|
|
/* Every other system call is allowed */
|
|
|
|
|
|
|
|
BPF_STMT(BPF_RET | BPF_K, SECCOMP_RET_ALLOW),
|
|
|
|
};
|
|
|
|
|
|
|
|
struct sock_fprog prog = {
|
|
|
|
.len = sizeof(filter) / sizeof(filter[0]),
|
|
|
|
.filter = filter,
|
|
|
|
};
|
|
|
|
|
|
|
|
/* Install the filter with the SECCOMP_FILTER_FLAG_NEW_LISTENER flag;
|
|
|
|
as a result, seccomp() returns a notification file descriptor. */
|
|
|
|
|
|
|
|
int notifyFd = seccomp(SECCOMP_SET_MODE_FILTER,
|
|
|
|
SECCOMP_FILTER_FLAG_NEW_LISTENER, &prog);
|
|
|
|
if (notifyFd == \-1)
|
|
|
|
errExit("seccomp\-install\-notify\-filter");
|
|
|
|
|
|
|
|
return notifyFd;
|
|
|
|
}
|
|
|
|
|
|
|
|
/* Close a pair of sockets created by socketpair() */
|
|
|
|
|
|
|
|
static void
|
|
|
|
closeSocketPair(int sockPair[2])
|
|
|
|
{
|
|
|
|
if (close(sockPair[0]) == \-1)
|
|
|
|
errExit("closeSocketPair\-close\-0");
|
|
|
|
if (close(sockPair[1]) == \-1)
|
|
|
|
errExit("closeSocketPair\-close\-1");
|
|
|
|
}
|
|
|
|
|
|
|
|
/* Implementation of the target process; create a child process that:
|
|
|
|
|
|
|
|
(1) installs a seccomp filter with the
|
|
|
|
SECCOMP_FILTER_FLAG_NEW_LISTENER flag;
|
|
|
|
(2) writes the seccomp notification file descriptor returned from
|
|
|
|
the previous step onto the UNIX domain socket, \(aqsockPair[0]\(aq;
|
|
|
|
(3) calls mkdir(2) for each element of \(aqargv\(aq.
|
|
|
|
|
|
|
|
The function return value in the parent is the PID of the child
|
|
|
|
process; the child does not return from this function. */
|
|
|
|
|
|
|
|
static pid_t
|
|
|
|
targetProcess(int sockPair[2], char *argv[])
|
|
|
|
{
|
|
|
|
pid_t targetPid = fork();
|
|
|
|
if (targetPid == \-1)
|
|
|
|
errExit("fork");
|
|
|
|
|
|
|
|
if (targetPid > 0) /* In parent, return PID of child */
|
|
|
|
return targetPid;
|
|
|
|
|
|
|
|
/* Child falls through to here */
|
|
|
|
|
|
|
|
printf("T: PID = %ld\en", (long) getpid());
|
|
|
|
|
|
|
|
/* Install seccomp filter(s) */
|
|
|
|
|
|
|
|
if (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0))
|
|
|
|
errExit("prctl");
|
|
|
|
|
|
|
|
int notifyFd = installNotifyFilter();
|
|
|
|
|
|
|
|
/* Pass the notification file descriptor to the supervisor process over
|
|
|
|
a UNIX domain socket */
|
|
|
|
|
|
|
|
if (sendfd(sockPair[0], notifyFd) == \-1)
|
|
|
|
errExit("sendfd");
|
|
|
|
|
|
|
|
/* Notification and socket FDs are no longer needed in target */
|
|
|
|
|
|
|
|
if (close(notifyFd) == \-1)
|
|
|
|
errExit("close\-target\-notify\-fd");
|
|
|
|
|
|
|
|
closeSocketPair(sockPair);
|
|
|
|
|
|
|
|
/* Perform a mkdir() call for each of the command\-line arguments */
|
|
|
|
|
|
|
|
for (char **ap = argv; *ap != NULL; ap++) {
|
|
|
|
printf("\enT: about to mkdir(\e"%s\e")\en", *ap);
|
|
|
|
|
|
|
|
int s = mkdir(*ap, 0700);
|
|
|
|
if (s == \-1)
|
|
|
|
perror("T: ERROR: mkdir(2)");
|
|
|
|
else
|
|
|
|
printf("T: SUCCESS: mkdir(2) returned %d\en", s);
|
|
|
|
}
|
|
|
|
|
|
|
|
printf("\enT: terminating\en");
|
|
|
|
exit(EXIT_SUCCESS);
|
|
|
|
}
|
|
|
|
|
|
|
|
/* Check that the notification ID provided by a SECCOMP_IOCTL_NOTIF_RECV
|
2020-10-29 11:19:16 +00:00
|
|
|
operation is still valid. It will no longer be valid if the target
|
|
|
|
process has terminated or is no longer blocked in the system call that
|
|
|
|
generated the notification (because it was interrupted by a signal).
|
|
|
|
|
|
|
|
This operation can be used when doing such things as accessing
|
|
|
|
/proc/PID files in the target process in order to avoid TOCTOU race
|
|
|
|
conditions where the PID that is returned by SECCOMP_IOCTL_NOTIF_RECV
|
|
|
|
terminates and is reused by another process. */
|
2020-09-28 20:13:12 +00:00
|
|
|
|
2020-10-29 16:15:50 +00:00
|
|
|
static bool
|
|
|
|
notificationIdIsValid(int notifyFd, uint64_t id)
|
2020-09-28 20:13:12 +00:00
|
|
|
{
|
2020-10-29 16:15:50 +00:00
|
|
|
if (ioctl(notifyFd, SECCOMP_IOCTL_NOTIF_ID_VALID, &id) == \-1) {
|
|
|
|
perror("\etS: notification ID check failed!!!\en");
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
|
|
|
|
return true;
|
2020-09-28 20:13:12 +00:00
|
|
|
}
|
|
|
|
|
2020-10-29 09:46:10 +00:00
|
|
|
/* Access the memory of the target process in order to fetch the
|
|
|
|
pathname referred to by the system call argument \(aqargNum\(aq in
|
|
|
|
\(aqreq\->data.args[]\(aq. The pathname is returned in \(aqpath\(aq,
|
|
|
|
a buffer of \(aqlen\(aq bytes allocated by the caller.
|
|
|
|
|
|
|
|
Returns true if the fetched pathname is correctly formed
|
2020-10-29 16:15:50 +00:00
|
|
|
(i.e., has a terminating null byte) and the notification ID
|
|
|
|
is still valid, and false otherwise. */
|
2020-09-28 20:13:12 +00:00
|
|
|
|
2020-10-18 20:11:54 +00:00
|
|
|
static bool
|
2020-09-28 20:13:12 +00:00
|
|
|
getTargetPathname(struct seccomp_notif *req, int notifyFd,
|
2020-10-29 09:46:10 +00:00
|
|
|
int argNum, char *path, size_t len)
|
2020-09-28 20:13:12 +00:00
|
|
|
{
|
|
|
|
char procMemPath[PATH_MAX];
|
2020-10-18 20:11:54 +00:00
|
|
|
|
2020-09-28 20:13:12 +00:00
|
|
|
snprintf(procMemPath, sizeof(procMemPath), "/proc/%d/mem", req\->pid);
|
|
|
|
|
2020-10-29 16:15:50 +00:00
|
|
|
int procMemFd = open(procMemPath, O_RDONLY | O_CLOEXEC);
|
2020-09-28 20:13:12 +00:00
|
|
|
if (procMemFd == \-1)
|
2020-10-29 16:15:50 +00:00
|
|
|
errExit("Supervisor: open");
|
2020-09-28 20:13:12 +00:00
|
|
|
|
2020-10-29 11:19:16 +00:00
|
|
|
/* Check that the process whose info we are accessing is still alive
|
|
|
|
and blocked in the system call that caused the notification.
|
2020-10-29 16:15:50 +00:00
|
|
|
If the SECCOMP_IOCTL_NOTIF_ID_VALID operation (performed in
|
|
|
|
notificationIdIsValid()) succeeded, we know that the /proc/PID/mem
|
|
|
|
file descriptor that we opened corresponded to the process for
|
|
|
|
which we received a notification. If that process subsequently
|
|
|
|
terminates, then read() on that file descriptor will return
|
|
|
|
0 (EOF). */
|
2020-09-28 20:13:12 +00:00
|
|
|
|
2020-10-29 16:15:50 +00:00
|
|
|
if (!notificationIdIsValid(notifyFd, req\->id))
|
|
|
|
return false;
|
2020-09-28 20:13:12 +00:00
|
|
|
|
2020-10-29 09:46:10 +00:00
|
|
|
/* Read bytes at the location containing the pathname argument */
|
2020-09-28 20:13:12 +00:00
|
|
|
|
2020-10-29 09:46:10 +00:00
|
|
|
ssize_t nread = pread(procMemFd, path, len, req\->data.args[argNum]);
|
2020-10-16 15:08:24 +00:00
|
|
|
if (nread == \-1)
|
2020-10-29 16:15:50 +00:00
|
|
|
errExit("Supervisor: pread");
|
2020-09-28 20:13:12 +00:00
|
|
|
|
2020-10-16 15:08:24 +00:00
|
|
|
if (nread == 0) {
|
2020-10-01 09:33:16 +00:00
|
|
|
fprintf(stderr, "\etS: pread() of /proc/PID/mem "
|
2020-09-28 20:13:12 +00:00
|
|
|
"returned 0 (EOF)\en");
|
|
|
|
exit(EXIT_FAILURE);
|
|
|
|
}
|
|
|
|
|
2020-10-24 08:46:28 +00:00
|
|
|
if (close(procMemFd) == \-1)
|
2020-10-29 16:15:50 +00:00
|
|
|
errExit("Supervisor: close\-/proc/PID/mem");
|
2020-10-24 08:46:28 +00:00
|
|
|
|
2020-10-16 09:02:08 +00:00
|
|
|
/* We have no guarantees about what was in the memory of the target
|
2020-10-29 16:15:50 +00:00
|
|
|
process. (The memory may have been modified by another thread, or
|
|
|
|
even by an external attacking process.) We therefore treat the
|
|
|
|
buffer returned by pread() as untrusted input. The buffer should
|
|
|
|
be terminated by a null byte; if not, then we will trigger an
|
|
|
|
error for the target process. */
|
2020-10-16 09:02:08 +00:00
|
|
|
|
2020-10-26 09:11:09 +00:00
|
|
|
if (strnlen(path, nread) < nread)
|
|
|
|
return true;
|
2020-10-18 20:11:54 +00:00
|
|
|
|
2020-10-24 08:46:28 +00:00
|
|
|
return false;
|
2020-09-28 20:13:12 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
/* Handle notifications that arrive via the SECCOMP_RET_USER_NOTIF file
|
|
|
|
descriptor, \(aqnotifyFd\(aq. */
|
|
|
|
|
|
|
|
static void
|
|
|
|
handleNotifications(int notifyFd)
|
|
|
|
{
|
|
|
|
struct seccomp_notif_sizes sizes;
|
|
|
|
char path[PATH_MAX];
|
|
|
|
|
|
|
|
/* Discover the sizes of the structures that are used to receive
|
|
|
|
notifications and send notification responses, and allocate
|
|
|
|
buffers of those sizes. */
|
|
|
|
|
|
|
|
if (seccomp(SECCOMP_GET_NOTIF_SIZES, 0, &sizes) == \-1)
|
|
|
|
errExit("\etS: seccomp\-SECCOMP_GET_NOTIF_SIZES");
|
|
|
|
|
|
|
|
struct seccomp_notif *req = malloc(sizes.seccomp_notif);
|
|
|
|
if (req == NULL)
|
|
|
|
errExit("\etS: malloc");
|
|
|
|
|
seccomp_unotify.2: EXAMPLE: Improve allocation of response buffer
From a conversation with Jann Horn:
[[
>>>> struct seccomp_notif_resp *resp = malloc(sizes.seccomp_notif_resp);
>>>
>>> This should probably do something like max(sizes.seccomp_notif_resp,
>>> sizeof(struct seccomp_notif_resp)) in case the program was built
>>> against new UAPI headers that make struct seccomp_notif_resp big, but
>>> is running under an old kernel where that struct is still smaller?
>>
>> I'm confused. Why? I mean, if the running kernel says that it expects
>> a buffer of a certain size, and we allocate a buffer of that size,
>> what's the problem?
>
> Because in userspace, we cast the result of malloc() to a "struct
> seccomp_notif_resp *". If the kernel tells us that it expects a size
> smaller than sizeof(struct seccomp_notif_resp), then we end up with a
> pointer to a struct that consists partly of allocated memory, partly
> of out-of-bounds memory, which is generally a bad idea - I'm not sure
> whether the C standard permits that. And if userspace then e.g.
> decides to access some member of that struct that is beyond what the
> kernel thinks is the struct size, we get actual OOB memory accesses.
Got it. (But gosh, this seems like a fragile API mess.)
I added the following to the code:
/* When allocating the response buffer, we must allow for the fact
that the user-space binary may have been built with user-space
headers where 'struct seccomp_notif_resp' is bigger than the
response buffer expected by the (older) kernel. Therefore, we
allocate a buffer that is the maximum of the two sizes. This
ensures that if the supervisor places bytes into the response
structure that are past the response size that the kernel expects,
then the supervisor is not touching an invalid memory location. */
size_t resp_size = sizes.seccomp_notif_resp;
if (sizeof(struct seccomp_notif_resp) > resp_size)
resp_size = sizeof(struct seccomp_notif_resp);
struct seccomp_notif_resp *resp = malloc(resp_size);
]]
Reported-by: Jann Horn <jannh@google.com>
Signed-off-by: Michael Kerrisk <mtk.manpages@gmail.com>
2020-10-16 09:24:25 +00:00
|
|
|
/* When allocating the response buffer, we must allow for the fact
|
|
|
|
that the user\-space binary may have been built with user\-space
|
|
|
|
headers where \(aqstruct seccomp_notif_resp\(aq is bigger than the
|
|
|
|
response buffer expected by the (older) kernel. Therefore, we
|
|
|
|
allocate a buffer that is the maximum of the two sizes. This
|
|
|
|
ensures that if the supervisor places bytes into the response
|
|
|
|
structure that are past the response size that the kernel expects,
|
|
|
|
then the supervisor is not touching an invalid memory location. */
|
|
|
|
|
|
|
|
size_t resp_size = sizes.seccomp_notif_resp;
|
|
|
|
if (sizeof(struct seccomp_notif_resp) > resp_size)
|
|
|
|
resp_size = sizeof(struct seccomp_notif_resp);
|
|
|
|
|
|
|
|
struct seccomp_notif_resp *resp = malloc(resp_size);
|
2020-09-28 20:13:12 +00:00
|
|
|
if (resp == NULL)
|
|
|
|
errExit("\etS: malloc");
|
|
|
|
|
|
|
|
/* Loop handling notifications */
|
|
|
|
|
|
|
|
for (;;) {
|
|
|
|
/* Wait for next notification, returning info in \(aq*req\(aq */
|
|
|
|
|
|
|
|
memset(req, 0, sizes.seccomp_notif);
|
|
|
|
if (ioctl(notifyFd, SECCOMP_IOCTL_NOTIF_RECV, req) == \-1) {
|
|
|
|
if (errno == EINTR)
|
|
|
|
continue;
|
2020-10-26 09:11:09 +00:00
|
|
|
errExit("\etS: ioctl\-SECCOMP_IOCTL_NOTIF_RECV");
|
2020-09-28 20:13:12 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
printf("\etS: got notification (ID %#llx) for PID %d\en",
|
|
|
|
req\->id, req\->pid);
|
|
|
|
|
|
|
|
/* The only system call that can generate a notification event
|
|
|
|
is mkdir(2). Nevertheless, we check that the notified system
|
|
|
|
call is indeed mkdir() as a kind of future\-proofing of this
|
|
|
|
code in case the seccomp filter is later modified to
|
|
|
|
generate notifications for other system calls. */
|
|
|
|
|
|
|
|
if (req\->data.nr != __NR_mkdir) {
|
|
|
|
printf("\etS: notification contained unexpected "
|
|
|
|
"system call number; bye!!!\en");
|
|
|
|
exit(EXIT_FAILURE);
|
|
|
|
}
|
|
|
|
|
2020-10-29 09:46:10 +00:00
|
|
|
bool pathOK = getTargetPathname(req, notifyFd, 0, path,
|
2020-10-18 20:11:54 +00:00
|
|
|
sizeof(path));
|
2020-09-28 20:13:12 +00:00
|
|
|
|
|
|
|
/* Prepopulate some fields of the response */
|
|
|
|
|
|
|
|
resp\->id = req\->id; /* Response includes notification ID */
|
|
|
|
resp\->flags = 0;
|
|
|
|
resp\->val = 0;
|
|
|
|
|
2020-10-29 16:15:50 +00:00
|
|
|
/* If getTargetPathname() failed, trigger an EINVAL error
|
|
|
|
response (sending this response may yield an error if the
|
|
|
|
failure occurred because the notification ID was no longer
|
|
|
|
valid); if the directory is in /tmp, then create it on behalf
|
|
|
|
of the target; if the pathname starts with \(aq./\(aq, tell the
|
|
|
|
kernel to let the target process execute the mkdir();
|
|
|
|
otherwise, give an error for a directory pathname in any other
|
|
|
|
location. */
|
2020-10-18 20:11:54 +00:00
|
|
|
|
|
|
|
if (!pathOK) {
|
|
|
|
resp\->error = \-EINVAL;
|
|
|
|
printf("\etS: spoofing error for invalid pathname (%s)\en",
|
|
|
|
strerror(\-resp\->error));
|
|
|
|
} else if (strncmp(path, "/tmp/", strlen("/tmp/")) == 0) {
|
2020-09-28 20:13:12 +00:00
|
|
|
printf("\etS: executing: mkdir(\e"%s\e", %#llo)\en",
|
|
|
|
path, req\->data.args[1]);
|
|
|
|
|
|
|
|
if (mkdir(path, req\->data.args[1]) == 0) {
|
|
|
|
resp\->error = 0; /* "Success" */
|
|
|
|
resp\->val = strlen(path); /* Used as return value of
|
|
|
|
mkdir() in target */
|
|
|
|
printf("\etS: success! spoofed return = %lld\en",
|
|
|
|
resp\->val);
|
|
|
|
} else {
|
|
|
|
|
|
|
|
/* If mkdir() failed in the supervisor, pass the error
|
|
|
|
back to the target */
|
|
|
|
|
|
|
|
resp\->error = \-errno;
|
|
|
|
printf("\etS: failure! (errno = %d; %s)\en", errno,
|
|
|
|
strerror(errno));
|
|
|
|
}
|
2020-10-01 09:33:16 +00:00
|
|
|
} else if (strncmp(path, "./", strlen("./")) == 0) {
|
2020-09-28 20:13:12 +00:00
|
|
|
resp\->error = resp\->val = 0;
|
|
|
|
resp\->flags = SECCOMP_USER_NOTIF_FLAG_CONTINUE;
|
|
|
|
printf("\etS: target can execute system call\en");
|
|
|
|
} else {
|
|
|
|
resp\->error = \-EOPNOTSUPP;
|
|
|
|
printf("\etS: spoofing error response (%s)\en",
|
|
|
|
strerror(\-resp\->error));
|
|
|
|
}
|
|
|
|
|
|
|
|
/* Send a response to the notification */
|
|
|
|
|
|
|
|
printf("\etS: sending response "
|
|
|
|
"(flags = %#x; val = %lld; error = %d)\en",
|
|
|
|
resp\->flags, resp\->val, resp\->error);
|
|
|
|
|
|
|
|
if (ioctl(notifyFd, SECCOMP_IOCTL_NOTIF_SEND, resp) == \-1) {
|
|
|
|
if (errno == ENOENT)
|
|
|
|
printf("\etS: response failed with ENOENT; "
|
|
|
|
"perhaps target process\(aqs syscall was "
|
2020-10-01 09:33:16 +00:00
|
|
|
"interrupted by a signal?\en");
|
2020-09-28 20:13:12 +00:00
|
|
|
else
|
|
|
|
perror("ioctl\-SECCOMP_IOCTL_NOTIF_SEND");
|
|
|
|
}
|
|
|
|
|
|
|
|
/* If the pathname is just "/bye", then the supervisor
|
|
|
|
terminates. This allows us to see what happens if the
|
|
|
|
target process makes further calls to mkdir(2). */
|
|
|
|
|
|
|
|
if (strcmp(path, "/bye") == 0) {
|
|
|
|
printf("\etS: terminating **********\en");
|
|
|
|
exit(EXIT_FAILURE);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
/* Implementation of the supervisor process:
|
|
|
|
|
|
|
|
(1) obtains the notification file descriptor from \(aqsockPair[1]\(aq
|
|
|
|
(2) handles notifications that arrive on that file descriptor. */
|
|
|
|
|
|
|
|
static void
|
|
|
|
supervisor(int sockPair[2])
|
|
|
|
{
|
|
|
|
int notifyFd = recvfd(sockPair[1]);
|
|
|
|
if (notifyFd == \-1)
|
|
|
|
errExit("recvfd");
|
|
|
|
|
|
|
|
closeSocketPair(sockPair); /* We no longer need the socket pair */
|
|
|
|
|
|
|
|
handleNotifications(notifyFd);
|
|
|
|
}
|
|
|
|
|
|
|
|
int
|
|
|
|
main(int argc, char *argv[])
|
|
|
|
{
|
|
|
|
int sockPair[2];
|
|
|
|
|
|
|
|
setbuf(stdout, NULL);
|
|
|
|
|
|
|
|
if (argc < 2) {
|
|
|
|
fprintf(stderr, "At least one pathname argument is required\en");
|
|
|
|
exit(EXIT_FAILURE);
|
|
|
|
}
|
|
|
|
|
|
|
|
/* Create a UNIX domain socket that is used to pass the seccomp
|
|
|
|
notification file descriptor from the target process to the
|
|
|
|
supervisor process. */
|
|
|
|
|
|
|
|
if (socketpair(AF_UNIX, SOCK_STREAM, 0, sockPair) == \-1)
|
|
|
|
errExit("socketpair");
|
|
|
|
|
|
|
|
/* Create a child process\-\-the "target"\-\-that installs seccomp
|
|
|
|
filtering. The target process writes the seccomp notification
|
|
|
|
file descriptor onto \(aqsockPair[0]\(aq and then calls mkdir(2) for
|
|
|
|
each directory in the command\-line arguments. */
|
|
|
|
|
|
|
|
(void) targetProcess(sockPair, &argv[optind]);
|
|
|
|
|
|
|
|
/* Catch SIGCHLD when the target terminates, so that the
|
|
|
|
supervisor can also terminate. */
|
|
|
|
|
|
|
|
struct sigaction sa;
|
|
|
|
sa.sa_handler = sigchldHandler;
|
|
|
|
sa.sa_flags = 0;
|
|
|
|
sigemptyset(&sa.sa_mask);
|
|
|
|
if (sigaction(SIGCHLD, &sa, NULL) == \-1)
|
|
|
|
errExit("sigaction");
|
|
|
|
|
|
|
|
supervisor(sockPair);
|
|
|
|
|
|
|
|
exit(EXIT_SUCCESS);
|
|
|
|
}
|
|
|
|
.EE
|
|
|
|
.SH SEE ALSO
|
|
|
|
.BR ioctl (2),
|
2020-10-28 18:18:56 +00:00
|
|
|
.BR pidfd_open (2),
|
|
|
|
.BR pidfd_getfd (2),
|
2020-09-28 20:13:12 +00:00
|
|
|
.BR seccomp (2)
|
2020-09-30 20:24:59 +00:00
|
|
|
.PP
|
|
|
|
A further example program can be found in the kernel source file
|
|
|
|
.IR samples/seccomp/user-trap.c .
|