mirror of https://github.com/mkerrisk/man-pages
563 lines
17 KiB
Groff
563 lines
17 KiB
Groff
.\" Copyright (c) 2016, IBM Corporation.
|
|
.\" Written by Mike Rapoport <rppt@linux.vnet.ibm.com>
|
|
.\" and Copyright (C) 2017 Michael Kerrisk <mtk.manpages@gmail.com>
|
|
.\"
|
|
.\" %%%LICENSE_START(VERBATIM)
|
|
.\" Permission is granted to make and distribute verbatim copies of this
|
|
.\" manual provided the copyright notice and this permission notice are
|
|
.\" preserved on all copies.
|
|
.\"
|
|
.\" Permission is granted to copy and distribute modified versions of this
|
|
.\" manual under the conditions for verbatim copying, provided that the
|
|
.\" entire resulting derived work is distributed under the terms of a
|
|
.\" permission notice identical to this one.
|
|
.\"
|
|
.\" Since the Linux kernel and libraries are constantly changing, this
|
|
.\" manual page may be incorrect or out-of-date. The author(s) assume no
|
|
.\" responsibility for errors or omissions, or for damages resulting from
|
|
.\" the use of the information contained herein. The author(s) may not
|
|
.\" have taken the same level of care in the production of this manual,
|
|
.\" which is licensed free of charge, as they might when working
|
|
.\" professionally.
|
|
.\"
|
|
.\" Formatted or processed versions of this manual, if unaccompanied by
|
|
.\" the source, must acknowledge the copyright and authors of this work.
|
|
.\" %%%LICENSE_END
|
|
.\"
|
|
.TH USERFAULTFD 2 2016-12-12 "Linux" "Linux Programmer's Manual"
|
|
.SH NAME
|
|
userfaultfd \- create a file descriptor for handling page faults in user space
|
|
.SH SYNOPSIS
|
|
.nf
|
|
.B #include <sys/types.h>
|
|
.B #include <linux/userfaultfd.h>
|
|
.sp
|
|
.BI "int userfaultfd(int " flags );
|
|
.fi
|
|
.PP
|
|
.IR Note :
|
|
There is no glibc wrapper for this system call; see NOTES.
|
|
.SH DESCRIPTION
|
|
.BR userfaultfd ()
|
|
creates a new userfaultfd object that can be used for delegation of page-fault
|
|
handling to a user-space application,
|
|
and returns a file descriptor that refers to the new object.
|
|
The new userfaultfd object is configured using
|
|
.BR ioctl (2).
|
|
|
|
Once the userfaultfd object is configured, the application can use
|
|
.BR read (2)
|
|
to receive userfaultfd notifications.
|
|
The reads from userfaultfd may be blocking or non-blocking,
|
|
depending on the value of
|
|
.I flags
|
|
used for the creation of the userfaultfd or subsequent calls to
|
|
.BR fcntl (2).
|
|
|
|
The following values may be bitwise ORed in
|
|
.IR flags
|
|
to change the behavior of
|
|
.BR userfaultfd ():
|
|
.TP
|
|
.BR O_CLOEXEC
|
|
Enable the close-on-exec flag for the new userfaultfd file descriptor.
|
|
See the description of the
|
|
.B O_CLOEXEC
|
|
flag in
|
|
.BR open (2).
|
|
.TP
|
|
.BR O_NONBLOCK
|
|
Enables non-blocking operation for the userfaultfd object.
|
|
See the description of the
|
|
.BR O_NONBLOCK
|
|
flag in
|
|
.BR open (2).
|
|
.PP
|
|
When the last file descriptor referring to a userfaultfd object is closed,
|
|
all memory ranges that were registered with the object are unregistered
|
|
and unread page-fault events are flushed.
|
|
.\"
|
|
.SS Usage
|
|
The userfaultfd mechanism is designed to allow a thread in a multithreaded
|
|
program to perform user-space paging for the other threads in the process.
|
|
When a page fault occurs for one of the regions registered
|
|
to the userfaultfd object,
|
|
the faulting thread is put to sleep and
|
|
an event is generated that can be read via the userfaultfd file descriptor.
|
|
The fault-handling thread reads events from this file descriptor and services
|
|
them using the operations described in
|
|
.BR ioctl_userfaultfd (2).
|
|
When servicing the page fault events,
|
|
the fault-handling thread can trigger a wake-up for the sleeping thread.
|
|
.\"
|
|
.SS Userfaultfd operation
|
|
After the userfaultfd object is created with
|
|
.BR userfaultfd (),
|
|
the application must enable it using the
|
|
.B UFFDIO_API
|
|
.BR ioctl (2)
|
|
operation.
|
|
This operation allows a handshake between the kernel and user space
|
|
to determine the API version and supported features.
|
|
This operation must be performed before any of the other
|
|
.BR ioctl (2)
|
|
operations described below (or those operations fail with the
|
|
.BR EINVAL
|
|
error).
|
|
|
|
After a successful
|
|
.B UFFDIO_API
|
|
operation,
|
|
the application then registers memory address ranges using the
|
|
.B UFFDIO_REGISTER
|
|
.BR ioctl (2)
|
|
operation.
|
|
After successful completion of a
|
|
.B UFFDIO_REGISTER
|
|
operation,
|
|
a page fault occurring in the requested memory range, and satisfying
|
|
the mode defined at the registration time, will be forwarded by the kernel to
|
|
the user-space application.
|
|
The application can then use the
|
|
.B UFFDIO_COPY
|
|
or
|
|
.B UFFDIO_ZERO
|
|
.BR ioctl (2)
|
|
operations to resolve the page fault.
|
|
|
|
Details of the various
|
|
.BR ioctl (2)
|
|
operations can be found in
|
|
.BR ioctl_userfaultfd (2).
|
|
|
|
Currently, userfaultfd can be used only with anonymous private memory
|
|
mappings.
|
|
.\"
|
|
.SS Reading from the userfaultfd structure
|
|
Each
|
|
.BR read (2)
|
|
from the userfaultfd file descriptor returns one or more
|
|
.I uffd_msg
|
|
structures, each of which describes a page-fault event:
|
|
|
|
.nf
|
|
.in +4n
|
|
struct uffd_msg {
|
|
__u8 event; /* Type of event */
|
|
...
|
|
union {
|
|
struct {
|
|
__u64 flags; /* Flags describing fault */
|
|
__u64 address; /* Faulting address */
|
|
} pagefault;
|
|
...
|
|
} arg;
|
|
|
|
/* Padding fields omitted */
|
|
} __packed;
|
|
.in
|
|
.fi
|
|
|
|
If multiple events are available and the supplied buffer is large enough,
|
|
.BR read (2)
|
|
returns as many events as will fit in the supplied buffer.
|
|
If the buffer supplied to
|
|
.BR read (2)
|
|
is smaller than the size of the
|
|
.I uffd_msg
|
|
structure, the
|
|
.BR read (2)
|
|
fails with the error
|
|
.BR EINVAL .
|
|
|
|
The fields set in the
|
|
.I uffd_msg
|
|
structure are as follows:
|
|
.TP
|
|
.I event
|
|
The type of event.
|
|
Currently, only one value can appear in this field:
|
|
.BR UFFD_EVENT_PAGEFAULT ,
|
|
which indicates a page-fault event.
|
|
.TP
|
|
.I address
|
|
The address that triggered the page fault.
|
|
.TP
|
|
.I flags
|
|
A bit mask of flags that describe the event.
|
|
For
|
|
.BR UFFD_EVENT_PAGEFAULT ,
|
|
the following flag may appear:
|
|
.RS
|
|
.TP
|
|
.B UFFD_PAGEFAULT_FLAG_WRITE
|
|
If the address is in a range that was registered with the
|
|
.B UFFDIO_REGISTER_MODE_MISSING
|
|
flag (see
|
|
.BR ioctl_userfaultfd (2))
|
|
and this flag is set, this is a write fault;
|
|
otherwise it is a read fault.
|
|
.\"
|
|
.\" UFFD_PAGEFAULT_FLAG_WP is not yet supported.
|
|
.RE
|
|
.PP
|
|
A
|
|
.BR read (2)
|
|
on a userfaultfd file descriptor can fail with the following errors:
|
|
.TP
|
|
.B EINVAL
|
|
The userfaultfd object has not yet been enabled using the
|
|
.BR UFFDIO_API
|
|
.BR ioctl (2)
|
|
operation.
|
|
.PP
|
|
If the
|
|
.B O_NONBLOCK
|
|
flag is enabled in the associated open file description,
|
|
the userfaultfd file descriptor can be monitored with
|
|
.BR poll (2),
|
|
.BR select (2),
|
|
and
|
|
.BR epoll (7).
|
|
When events are available, the file descriptor is indicated as readable.
|
|
If the
|
|
.B O_NONBLOCK
|
|
flag is not enabled, then
|
|
.BR poll (2)
|
|
(always) indicates the file as having a
|
|
.BR POLLERR
|
|
condition, and
|
|
.BR select (2)
|
|
indicates the file descriptor as both readable and writable.
|
|
.\" FIXME What is the reason for this seemingly odd behavior with respect
|
|
.\" to the O_NONBLOCK flag? (see userfaultfd_poll() in fs/userfaultfd.c).
|
|
.\" Something needs to be said about this.
|
|
.SH RETURN VALUE
|
|
On success,
|
|
.BR userfaultfd ()
|
|
returns a new file descriptor that refers to the userfaultfd object.
|
|
On error, \-1 is returned, and
|
|
.I errno
|
|
is set appropriately.
|
|
.SH ERRORS
|
|
.TP
|
|
.B EINVAL
|
|
An unsupported value was specified in
|
|
.IR flags .
|
|
.TP
|
|
.BR EMFILE
|
|
The per-process limit on the number of open file descriptors has been
|
|
reached.
|
|
.TP
|
|
.B ENFILE
|
|
The system-wide limit on the total number of open files has been
|
|
reached.
|
|
.TP
|
|
.B ENOMEM
|
|
Insufficient kernel memory was available.
|
|
.SH VERSIONS
|
|
The
|
|
.BR userfaultfd ()
|
|
system call first appeared in Linux 4.3.
|
|
.SH CONFORMING TO
|
|
.BR userfaultfd ()
|
|
is Linux-specific and should not be used in programs intended to be
|
|
portable.
|
|
.SH NOTES
|
|
Glibc does not provide a wrapper for this system call; call it using
|
|
.BR syscall (2).
|
|
|
|
The userfaultfd mechanism can be used as an alternative to
|
|
traditional user-space paging techniques based on the use of the
|
|
.BR SIGSEGV
|
|
signal and
|
|
.BR mmap (2).
|
|
It can also be used to implement lazy restore
|
|
for checkpoint/restore mechanisms,
|
|
as well as post-copy migration to allow (nearly) uninterrupted execution
|
|
when transferring virtual machines from one host to another.
|
|
.SH EXAMPLE
|
|
The program below demonstrates the use of the userfaultfd mechanism.
|
|
The program creates two threads, one of which acts as the
|
|
page-fault handler for the process, for the pages in a demand-page zero
|
|
region created using
|
|
.BR mmap (2).
|
|
|
|
The program takes one command-line argument,
|
|
which is the number of pages that will be created in a mapping
|
|
whose page faults will be handled via userfaultfd.
|
|
After creating a userfaultfd object,
|
|
the program then creates an anonymous private mapping of the specified size
|
|
and registers the address range of that mapping using the
|
|
.B UFFDIO_REGISTER
|
|
.BR ioctl (2)
|
|
operation.
|
|
The program then creates a second thread that will perform the
|
|
task of handling page faults.
|
|
|
|
The main thread then walks through the pages of the mapping fetching
|
|
bytes from successive pages.
|
|
Because the pages have not yet been accessed,
|
|
the first access of a byte in each page will trigger a page-fault event
|
|
on the userfaultfd file descriptor.
|
|
|
|
Each of the page-fault events is handled by the second thread,
|
|
which sits in a loop processing input from the userfaultfd file descriptor.
|
|
In each loop iteration, the second thread first calls
|
|
.BR poll (2)
|
|
to check the state of the file descriptor,
|
|
and then reads an event from the file descriptor.
|
|
All such events should be
|
|
.B UFFD_EVENT_PAGEFAULT
|
|
events,
|
|
which the thread handles by copying a page of data into
|
|
the faulting region using the
|
|
.B UFFDIO_COPY
|
|
.BR ioctl (2)
|
|
operation.
|
|
|
|
The following is an example of what we see when running the program:
|
|
|
|
.nf
|
|
.in +4n
|
|
$ \fB./userfaultfd_demo 3\fP
|
|
Address returned by mmap() = 0x7fd30106c000
|
|
|
|
fault_handler_thread():
|
|
poll() returns: nready = 1; POLLIN = 1; POLLERR = 0
|
|
UFFD_EVENT_PAGEFAULT event: flags = 0; address = 7fd30106c00f
|
|
(uffdio_copy.copy returned 4096)
|
|
Read address 0x7fd30106c00f in main(): A
|
|
Read address 0x7fd30106c40f in main(): A
|
|
Read address 0x7fd30106c80f in main(): A
|
|
Read address 0x7fd30106cc0f in main(): A
|
|
|
|
fault_handler_thread():
|
|
poll() returns: nready = 1; POLLIN = 1; POLLERR = 0
|
|
UFFD_EVENT_PAGEFAULT event: flags = 0; address = 7fd30106d00f
|
|
(uffdio_copy.copy returned 4096)
|
|
Read address 0x7fd30106d00f in main(): B
|
|
Read address 0x7fd30106d40f in main(): B
|
|
Read address 0x7fd30106d80f in main(): B
|
|
Read address 0x7fd30106dc0f in main(): B
|
|
|
|
fault_handler_thread():
|
|
poll() returns: nready = 1; POLLIN = 1; POLLERR = 0
|
|
UFFD_EVENT_PAGEFAULT event: flags = 0; address = 7fd30106e00f
|
|
(uffdio_copy.copy returned 4096)
|
|
Read address 0x7fd30106e00f in main(): C
|
|
Read address 0x7fd30106e40f in main(): C
|
|
Read address 0x7fd30106e80f in main(): C
|
|
Read address 0x7fd30106ec0f in main(): C
|
|
.in
|
|
.fi
|
|
.SS Program source
|
|
\&
|
|
.nf
|
|
/* userfaultfd_demo.c
|
|
|
|
Licensed under the GNU General Public License version 2 or later.
|
|
*/
|
|
#define _GNU_SOURCE
|
|
#include <sys/types.h>
|
|
#include <stdio.h>
|
|
#include <linux/userfaultfd.h>
|
|
#include <pthread.h>
|
|
#include <errno.h>
|
|
#include <unistd.h>
|
|
#include <stdlib.h>
|
|
#include <fcntl.h>
|
|
#include <signal.h>
|
|
#include <poll.h>
|
|
#include <string.h>
|
|
#include <sys/mman.h>
|
|
#include <sys/syscall.h>
|
|
#include <sys/ioctl.h>
|
|
#include <poll.h>
|
|
|
|
#define errExit(msg) do { perror(msg); exit(EXIT_FAILURE); \\
|
|
} while (0)
|
|
|
|
static int page_size;
|
|
|
|
static void *
|
|
fault_handler_thread(void *arg)
|
|
{
|
|
static struct uffd_msg msg; /* Data read from userfaultfd */
|
|
static int fault_cnt = 0; /* Number of faults so far handled */
|
|
long uffd; /* userfaultfd file descriptor */
|
|
static char *page = NULL;
|
|
struct uffdio_copy uffdio_copy;
|
|
ssize_t nread;
|
|
|
|
uffd = (long) arg;
|
|
|
|
/* Create a page that will be copied into the faulting region */
|
|
|
|
if (page == NULL) {
|
|
page = mmap(NULL, page_size, PROT_READ | PROT_WRITE,
|
|
MAP_PRIVATE | MAP_ANONYMOUS, \-1, 0);
|
|
if (page == MAP_FAILED)
|
|
errExit("mmap");
|
|
}
|
|
|
|
/* Loop, handling incoming events on the userfaultfd
|
|
file descriptor */
|
|
|
|
for (;;) {
|
|
|
|
/* See what poll() tells us about the userfaultfd */
|
|
|
|
struct pollfd pollfd;
|
|
int nready;
|
|
pollfd.fd = uffd;
|
|
pollfd.events = POLLIN;
|
|
nready = poll(&pollfd, 1, \-1);
|
|
if (nready == \-1)
|
|
errExit("poll");
|
|
|
|
printf("\\nfault_handler_thread():\\n");
|
|
printf(" poll() returns: nready = %d; "
|
|
"POLLIN = %d; POLLERR = %d\\n", nready,
|
|
(pollfd.revents & POLLIN) != 0,
|
|
(pollfd.revents & POLLERR) != 0);
|
|
|
|
/* Read an event from the userfaultfd */
|
|
|
|
nread = read(uffd, &msg, sizeof(msg));
|
|
if (nread == 0) {
|
|
printf("EOF on userfaultfd!\\n");
|
|
exit(EXIT_FAILURE);
|
|
}
|
|
|
|
if (nread == \-1)
|
|
errExit("read");
|
|
|
|
/* We expect only one kind of event; verify that assumption */
|
|
|
|
if (msg.event != UFFD_EVENT_PAGEFAULT) {
|
|
fprintf(stderr, "Unexpected event on userfaultfd\\n");
|
|
exit(EXIT_FAILURE);
|
|
}
|
|
|
|
/* Display info about the page\-fault event */
|
|
|
|
printf(" UFFD_EVENT_PAGEFAULT event: ");
|
|
printf("flags = %llx; ", msg.arg.pagefault.flags);
|
|
printf("address = %llx\\n", msg.arg.pagefault.address);
|
|
|
|
/* Copy the page pointed to by \(aqpage\(aq into the faulting
|
|
region. Vary the contents that are copied in, so that it
|
|
is more obvious that each fault is handled separately. */
|
|
|
|
memset(page, \(aqA\(aq + fault_cnt % 20, page_size);
|
|
fault_cnt++;
|
|
|
|
uffdio_copy.src = (unsigned long) page;
|
|
|
|
/* We need to handle page faults in units of pages(!).
|
|
So, round faulting address down to page boundary */
|
|
|
|
uffdio_copy.dst = (unsigned long) msg.arg.pagefault.address &
|
|
~(page_size \- 1);
|
|
uffdio_copy.len = page_size;
|
|
uffdio_copy.mode = 0;
|
|
uffdio_copy.copy = 0;
|
|
if (ioctl(uffd, UFFDIO_COPY, &uffdio_copy) == \-1)
|
|
errExit("ioctl\-UFFDIO_COPY");
|
|
|
|
printf(" (uffdio_copy.copy returned %lld)\\n",
|
|
uffdio_copy.copy);
|
|
}
|
|
}
|
|
|
|
int
|
|
main(int argc, char *argv[])
|
|
{
|
|
long uffd; /* userfaultfd file descriptor */
|
|
char *addr; /* Start of region handled by userfaultfd */
|
|
unsigned long len; /* Length of region handled by userfaultfd */
|
|
pthread_t thr; /* ID of thread that handles page faults */
|
|
struct uffdio_api uffdio_api;
|
|
struct uffdio_register uffdio_register;
|
|
int s;
|
|
|
|
if (argc != 2) {
|
|
fprintf(stderr, "Usage: %s num\-pages\\n", argv[0]);
|
|
exit(EXIT_FAILURE);
|
|
}
|
|
|
|
page_size = sysconf(_SC_PAGE_SIZE);
|
|
len = strtoul(argv[1], NULL, 0) * page_size;
|
|
|
|
/* Create and enable userfaultfd object */
|
|
|
|
uffd = syscall(__NR_userfaultfd, O_CLOEXEC | O_NONBLOCK);
|
|
if (uffd == \-1)
|
|
errExit("userfaultfd");
|
|
|
|
uffdio_api.api = UFFD_API;
|
|
uffdio_api.features = 0;
|
|
if (ioctl(uffd, UFFDIO_API, &uffdio_api) == \-1)
|
|
errExit("ioctl\-UFFDIO_API");
|
|
|
|
/* Create a private anonymous mapping. The memory will be
|
|
demand\-zero paged\-\-that is, not yet allocated. When we
|
|
actually touch the memory, it will be allocated via
|
|
the userfaultfd. */
|
|
|
|
addr = mmap(NULL, len, PROT_READ | PROT_WRITE,
|
|
MAP_PRIVATE | MAP_ANONYMOUS, \-1, 0);
|
|
if (addr == MAP_FAILED)
|
|
errExit("mmap");
|
|
|
|
printf("Address returned by mmap() = %p\\n", addr);
|
|
|
|
/* Register the memory range of the mapping we just created for
|
|
handling by the userfaultfd object. In mode, we request to track
|
|
missing pages (i.e., pages that have not yet been faulted in). */
|
|
|
|
uffdio_register.range.start = (unsigned long) addr;
|
|
uffdio_register.range.len = len;
|
|
uffdio_register.mode = UFFDIO_REGISTER_MODE_MISSING;
|
|
if (ioctl(uffd, UFFDIO_REGISTER, &uffdio_register) == \-1)
|
|
errExit("ioctl\-UFFDIO_REGISTER");
|
|
|
|
/* Create a thread that will process the userfaultfd events */
|
|
|
|
s = pthread_create(&thr, NULL, fault_handler_thread, (void *) uffd);
|
|
if (s != 0) {
|
|
errno = s;
|
|
errExit("pthread_create");
|
|
}
|
|
|
|
/* Main thread now touches memory in the mapping, touching
|
|
locations 1024 bytes apart. This will trigger userfaultfd
|
|
events for all pages in the region. */
|
|
|
|
int l;
|
|
l = 0xf; /* Ensure that faulting address is not on a page
|
|
boundary, in order to test that we correctly
|
|
handle that case in fault_handler_thread() */
|
|
while (l < len) {
|
|
char c = addr[l];
|
|
printf("Read address %p in main(): ", addr + l);
|
|
printf("%c\\n", c);
|
|
l += 1024;
|
|
usleep(100000); /* Slow things down a little */
|
|
}
|
|
|
|
exit(EXIT_SUCCESS);
|
|
}
|
|
.fi
|
|
.SH SEE ALSO
|
|
.BR fcntl (2),
|
|
.BR ioctl (2),
|
|
.BR ioctl_userfaultfd (2),
|
|
.BR madvise (2),
|
|
.BR mmap (2)
|
|
|
|
.IR Documentation/vm/userfaultfd.txt
|
|
in the Linux kernel source tree
|
|
|