From efeece04658ec59427f56cfa5c5bb7925fd38462 Mon Sep 17 00:00:00 2001 From: Michael Kerrisk Date: Wed, 16 Aug 2017 09:30:51 +0200 Subject: [PATCH] _syscall.2, bpf.2, cacheflush.2, capget.2, chdir.2, chmod.2, chroot.2, clock_getres.2, clock_nanosleep.2, clone.2, close.2, connect.2, copy_file_range.2, create_module.2, delete_module.2, dup.2, epoll_create.2, epoll_ctl.2, epoll_wait.2, eventfd.2, execve.2, execveat.2, fallocate.2, flock.2, fork.2, fsync.2, futex.2, futimesat.2, get_kernel_syms.2, get_mempolicy.2, get_robust_list.2, getcpu.2, getdents.2, getdomainname.2, getgid.2, getgroups.2, gethostname.2, getitimer.2, getpagesize.2, getpeername.2, getpriority.2, getrandom.2, getresuid.2, getrlimit.2, getrusage.2, getsid.2, getsockname.2, getsockopt.2, gettid.2, gettimeofday.2, getuid.2, getunwind.2, init_module.2, inotify_add_watch.2, inotify_init.2, inotify_rm_watch.2, intro.2, io_cancel.2, io_destroy.2, io_getevents.2, io_setup.2, io_submit.2, ioctl_fat.2, ioctl_ficlonerange.2, ioctl_fideduperange.2, ioctl_tty.2, ioctl_userfaultfd.2, ioperm.2, iopl.2, ioprio_set.2, kcmp.2, kexec_load.2, keyctl.2, kill.2, link.2, listen.2, listxattr.2, llseek.2, lookup_dcookie.2, lseek.2, madvise.2, mbind.2, membarrier.2, memfd_create.2, migrate_pages.2, mincore.2, mkdir.2, mknod.2, mlock.2, mmap.2, mmap2.2, modify_ldt.2, move_pages.2, mprotect.2, mq_getsetattr.2, mremap.2, msgctl.2, msgget.2, msgop.2, msync.2, nanosleep.2, nfsservctl.2, nice.2, open_by_handle_at.2, outb.2, perf_event_open.2, perfmonctl.2, personality.2, pivot_root.2, pkey_alloc.2, poll.2, posix_fadvise.2, prctl.2, pread.2, process_vm_readv.2, ptrace.2, query_module.2, quotactl.2, read.2, readahead.2, readdir.2, readv.2, reboot.2, recv.2, recvmmsg.2, remap_file_pages.2, rename.2, request_key.2, restart_syscall.2, rt_sigqueueinfo.2, s390_pci_mmio_write.2, s390_runtime_instr.2, sched_get_priority_max.2, sched_rr_get_interval.2, sched_setaffinity.2, sched_setattr.2, sched_setparam.2, sched_setscheduler.2, 
sched_yield.2, seccomp.2, select.2, select_tut.2, semctl.2, semget.2, semop.2, send.2, sendfile.2, sendmmsg.2, set_mempolicy.2, set_thread_area.2, set_tid_address.2, seteuid.2, setfsgid.2, setfsuid.2, setgid.2, setns.2, setpgid.2, setresuid.2, setreuid.2, setsid.2, setuid.2, sgetmask.2, shmctl.2, shmget.2, shmop.2, sigaction.2, sigaltstack.2, sigpending.2, sigprocmask.2, sigreturn.2, sigsuspend.2, sigwaitinfo.2, socket.2, socketcall.2, socketpair.2, splice.2, spu_create.2, spu_run.2, stat.2, statfs.2, statx.2, subpage_prot.2, swapon.2, symlink.2, sync.2, sync_file_range.2, syscalls.2, sysctl.2, sysinfo.2, syslog.2, tee.2, time.2, timer_create.2, timer_getoverrun.2, timer_settime.2, timerfd_create.2, times.2, tkill.2, truncate.2, umask.2, umount.2, unimplemented.2, unlink.2, unshare.2, uselib.2, userfaultfd.2, utime.2, utimensat.2, vfork.2, vmsplice.2, wait.2, wait4.2, write.2: Formatting fix: replace blank lines with .PP/.IP Blank lines shouldn't generally appear in *roff source (other than in code examples), since they create large vertical spaces between text blocks. 
Signed-off-by: Michael Kerrisk --- man2/_syscall.2 | 6 +- man2/bpf.2 | 4 +- man2/cacheflush.2 | 2 +- man2/capget.2 | 12 +- man2/chdir.2 | 2 +- man2/chmod.2 | 18 +- man2/chroot.2 | 12 +- man2/clock_getres.2 | 4 +- man2/clock_nanosleep.2 | 24 +-- man2/clone.2 | 140 +++++++------- man2/close.2 | 14 +- man2/connect.2 | 6 +- man2/copy_file_range.2 | 8 +- man2/create_module.2 | 2 +- man2/delete_module.2 | 12 +- man2/dup.2 | 14 +- man2/epoll_create.2 | 4 +- man2/epoll_ctl.2 | 12 +- man2/epoll_wait.2 | 2 +- man2/eventfd.2 | 16 +- man2/execve.2 | 78 ++++---- man2/execveat.2 | 18 +- man2/fallocate.2 | 34 ++-- man2/flock.2 | 14 +- man2/fork.2 | 4 +- man2/fsync.2 | 10 +- man2/futex.2 | 152 +++++++-------- man2/futimesat.2 | 10 +- man2/get_kernel_syms.2 | 2 +- man2/get_mempolicy.2 | 18 +- man2/get_robust_list.2 | 10 +- man2/getcpu.2 | 8 +- man2/getdents.2 | 10 +- man2/getdomainname.2 | 6 +- man2/getgid.2 | 2 +- man2/getgroups.2 | 8 +- man2/gethostname.2 | 6 +- man2/getitimer.2 | 34 ++-- man2/getpagesize.2 | 4 +- man2/getpeername.2 | 4 +- man2/getpriority.2 | 20 +- man2/getrandom.2 | 18 +- man2/getresuid.2 | 2 +- man2/getrlimit.2 | 50 ++--- man2/getrusage.2 | 10 +- man2/getsid.2 | 2 +- man2/getsockname.2 | 2 +- man2/getsockopt.2 | 12 +- man2/gettid.2 | 4 +- man2/gettimeofday.2 | 8 +- man2/getuid.2 | 4 +- man2/getunwind.2 | 12 +- man2/init_module.2 | 26 +-- man2/inotify_add_watch.2 | 4 +- man2/inotify_init.2 | 4 +- man2/inotify_rm_watch.2 | 2 +- man2/intro.2 | 8 +- man2/io_cancel.2 | 2 +- man2/io_destroy.2 | 2 +- man2/io_getevents.2 | 12 +- man2/io_setup.2 | 2 +- man2/io_submit.2 | 2 +- man2/ioctl_fat.2 | 12 +- man2/ioctl_ficlonerange.2 | 4 +- man2/ioctl_fideduperange.2 | 18 +- man2/ioctl_tty.2 | 24 +-- man2/ioctl_userfaultfd.2 | 50 ++--- man2/ioperm.2 | 10 +- man2/iopl.2 | 12 +- man2/ioprio_set.2 | 16 +- man2/kcmp.2 | 14 +- man2/kexec_load.2 | 10 +- man2/keyctl.2 | 342 +++++++++++++++++----------------- man2/kill.2 | 2 +- man2/link.2 | 24 +-- man2/listen.2 | 8 +- 
man2/listxattr.2 | 6 +- man2/llseek.2 | 2 +- man2/lookup_dcookie.2 | 4 +- man2/lseek.2 | 18 +- man2/madvise.2 | 28 +-- man2/mbind.2 | 28 +-- man2/membarrier.2 | 28 +-- man2/memfd_create.2 | 30 +-- man2/migrate_pages.2 | 14 +- man2/mincore.2 | 4 +- man2/mkdir.2 | 14 +- man2/mknod.2 | 24 +-- man2/mlock.2 | 44 ++--- man2/mmap.2 | 34 ++-- man2/mmap2.2 | 6 +- man2/modify_ldt.2 | 2 +- man2/move_pages.2 | 18 +- man2/mprotect.2 | 18 +- man2/mq_getsetattr.2 | 2 +- man2/mremap.2 | 12 +- man2/msgctl.2 | 6 +- man2/msgget.2 | 2 +- man2/msgop.2 | 18 +- man2/msync.2 | 6 +- man2/nanosleep.2 | 12 +- man2/nfsservctl.2 | 2 +- man2/nice.2 | 10 +- man2/open_by_handle_at.2 | 46 ++--- man2/outb.2 | 4 +- man2/perf_event_open.2 | 238 +++++++++++------------ man2/perfmonctl.2 | 12 +- man2/personality.2 | 4 +- man2/pivot_root.2 | 16 +- man2/pkey_alloc.2 | 8 +- man2/poll.2 | 28 +-- man2/posix_fadvise.2 | 22 +-- man2/prctl.2 | 86 ++++----- man2/pread.2 | 6 +- man2/process_vm_readv.2 | 28 +-- man2/ptrace.2 | 110 +++++------ man2/query_module.2 | 6 +- man2/quotactl.2 | 32 ++-- man2/read.2 | 16 +- man2/readahead.2 | 2 +- man2/readdir.2 | 2 +- man2/readv.2 | 26 +-- man2/reboot.2 | 2 +- man2/recv.2 | 32 ++-- man2/recvmmsg.2 | 18 +- man2/remap_file_pages.2 | 12 +- man2/rename.2 | 46 ++--- man2/request_key.2 | 28 +-- man2/restart_syscall.2 | 6 +- man2/rt_sigqueueinfo.2 | 10 +- man2/s390_pci_mmio_write.2 | 2 +- man2/s390_runtime_instr.2 | 4 +- man2/sched_get_priority_max.2 | 8 +- man2/sched_rr_get_interval.2 | 8 +- man2/sched_setaffinity.2 | 32 ++-- man2/sched_setattr.2 | 14 +- man2/sched_setparam.2 | 8 +- man2/sched_setscheduler.2 | 18 +- man2/sched_yield.2 | 4 +- man2/seccomp.2 | 94 +++++----- man2/select.2 | 44 ++--- man2/select_tut.2 | 8 +- man2/semctl.2 | 12 +- man2/semget.2 | 4 +- man2/semop.2 | 12 +- man2/send.2 | 16 +- man2/sendfile.2 | 28 +-- man2/sendmmsg.2 | 18 +- man2/set_mempolicy.2 | 16 +- man2/set_thread_area.2 | 8 +- man2/set_tid_address.2 | 4 +- man2/seteuid.2 | 8 +- man2/setfsgid.2 | 
6 +- man2/setfsuid.2 | 6 +- man2/setgid.2 | 2 +- man2/setns.2 | 22 +-- man2/setpgid.2 | 36 ++-- man2/setresuid.2 | 14 +- man2/setreuid.2 | 18 +- man2/setsid.2 | 10 +- man2/setuid.2 | 4 +- man2/sgetmask.2 | 12 +- man2/shmctl.2 | 10 +- man2/shmget.2 | 34 ++-- man2/shmop.2 | 10 +- man2/sigaction.2 | 20 +- man2/sigaltstack.2 | 6 +- man2/sigpending.2 | 6 +- man2/sigprocmask.2 | 24 +-- man2/sigreturn.2 | 10 +- man2/sigsuspend.2 | 4 +- man2/sigwaitinfo.2 | 16 +- man2/socket.2 | 10 +- man2/socketcall.2 | 2 +- man2/socketpair.2 | 8 +- man2/splice.2 | 8 +- man2/spu_create.2 | 12 +- man2/spu_run.2 | 12 +- man2/stat.2 | 32 ++-- man2/statfs.2 | 14 +- man2/statx.2 | 6 +- man2/subpage_prot.2 | 8 +- man2/swapon.2 | 4 +- man2/symlink.2 | 20 +- man2/sync.2 | 8 +- man2/sync_file_range.2 | 8 +- man2/syscalls.2 | 12 +- man2/sysctl.2 | 4 +- man2/sysinfo.2 | 8 +- man2/syslog.2 | 10 +- man2/tee.2 | 6 +- man2/time.2 | 8 +- man2/timer_create.2 | 14 +- man2/timer_getoverrun.2 | 6 +- man2/timer_settime.2 | 16 +- man2/timerfd_create.2 | 22 +-- man2/times.2 | 4 +- man2/tkill.2 | 6 +- man2/truncate.2 | 6 +- man2/umask.2 | 18 +- man2/umount.2 | 8 +- man2/unimplemented.2 | 4 +- man2/unlink.2 | 16 +- man2/unshare.2 | 12 +- man2/uselib.2 | 4 +- man2/userfaultfd.2 | 48 ++--- man2/utime.2 | 16 +- man2/utimensat.2 | 26 +-- man2/vfork.2 | 12 +- man2/vmsplice.2 | 6 +- man2/wait.2 | 24 +-- man2/wait4.2 | 4 +- man2/write.2 | 24 +-- 213 files changed, 1937 insertions(+), 1937 deletions(-) diff --git a/man2/_syscall.2 b/man2/_syscall.2 index 542589dc9..2a1427828 100644 --- a/man2/_syscall.2 +++ b/man2/_syscall.2 @@ -94,13 +94,13 @@ instead. on those architectures, .BR syscall (2) was always required.) - +.PP The _syscall() macros .I "do not" produce a prototype. You may have to create one, especially for C++ users. - +.PP System calls are not required to return only positive or negative error codes. You need to read the source to be sure how it will return errors. @@ -121,7 +121,7 @@ when is negative. 
For the error codes, see .BR errno (3). - +.PP When defining a system call, the argument types .I must be diff --git a/man2/bpf.2 b/man2/bpf.2 index dc6e4e491..bb7c1e6fd 100644 --- a/man2/bpf.2 +++ b/man2/bpf.2 @@ -146,7 +146,7 @@ The .I size argument is the size of the union pointed to by .IR attr . - +.PP The value provided in .IR cmd is one of the following: @@ -919,7 +919,7 @@ to a perf event file descriptor, .IR event_fd , that was created by a previous call to .BR perf_event_open (2): - +.PP .in +4n .nf ioctl(event_fd, PERF_EVENT_IOC_SET_BPF, prog_fd); diff --git a/man2/cacheflush.2 b/man2/cacheflush.2 index 7e26d219b..b9ba2283e 100644 --- a/man2/cacheflush.2 +++ b/man2/cacheflush.2 @@ -93,7 +93,7 @@ and .I nbytes arguments, making this function fairly expensive. Therefore, the whole cache is always flushed. - +.PP This function always behaves as if .BR BCACHE has been passed for the diff --git a/man2/capget.2 b/man2/capget.2 index 9200f08ce..e5dd2fe6f 100644 --- a/man2/capget.2 +++ b/man2/capget.2 @@ -101,7 +101,7 @@ To define the structures for passing to the system call, you have to use the and .I struct __user_cap_data_struct names because the typedefs are only pointers. - +.PP Kernels prior to 2.6.25 prefer 32-bit capabilities with version .BR _LINUX_CAPABILITY_VERSION_1 . @@ -110,19 +110,19 @@ Linux 2.6.25 added 64-bit capability sets, with version There was, however, an API glitch, and Linux 2.6.26 added .BR _LINUX_CAPABILITY_VERSION_3 to fix the problem. - +.PP Note that 64-bit capabilities use .IR datap [0] and .IR datap [1], whereas 32-bit capabilities use only .IR datap [0]. - +.PP On kernels that support file capabilities (VFS capability support), these system calls behave slightly differently. This support was added as an option in Linux 2.6.24, and became fixed (nonoptional) in Linux 2.6.33. 
- +.PP For .BR capget () calls, one can probe the capabilities of any process by specifying its @@ -167,7 +167,7 @@ caller and .BR init (1); or a value less than \-1, in which case the change is applied to all members of the process group whose ID is \-\fIpid\fP. - +.PP For details on the data, see .BR capabilities (7). .SH RETURN VALUE @@ -175,7 +175,7 @@ On success, zero is returned. On error, \-1 is returned, and .I errno is set appropriately. - +.PP The calls will fail with the error .BR EINVAL , and set the diff --git a/man2/chdir.2 b/man2/chdir.2 index 1e624eacd..f0e9bd3c5 100644 --- a/man2/chdir.2 +++ b/man2/chdir.2 @@ -129,7 +129,7 @@ POSIX.1-2001, POSIX.1-2008, SVr4, 4.4BSD. .SH NOTES The current working directory is the starting point for interpreting relative pathnames (those not starting with \(aq/\(aq). - +.PP A child process created via .BR fork (2) inherits its parent's current working directory. diff --git a/man2/chmod.2 b/man2/chmod.2 index fd31ee14e..30290d583 100644 --- a/man2/chmod.2 +++ b/man2/chmod.2 @@ -164,7 +164,7 @@ The effective UID of the calling process must match the owner of the file, or the process must be privileged (Linux: it must have the .B CAP_FOWNER capability). - +.PP If the calling process is not privileged (Linux: does not have the .B CAP_FSETID capability), and the group of the file does not match @@ -173,7 +173,7 @@ supplementary group IDs, the .B S_ISGID bit will be turned off, but this will not cause an error to be returned. - +.PP As a security measure, depending on the filesystem, the set-user-ID and set-group-ID execution bits may be turned off if a file is written. @@ -185,7 +185,7 @@ which may have a special meaning. For the sticky bit, and for set-user-ID and set-group-ID bits on directories, see .BR inode (7). - +.PP On NFS filesystems, restricting the permissions will immediately influence already open files, because the access control is done on the server, but open files are maintained by the client. 
@@ -199,7 +199,7 @@ The system call operates in exactly the same way as .BR chmod (), except for the differences described here. - +.PP If the pathname given in .I pathname is relative, then it is interpreted relative to the directory @@ -209,7 +209,7 @@ referred to by the file descriptor the calling process, as is done by .BR chmod () for a relative pathname). - +.PP If .I pathname is relative and @@ -221,13 +221,13 @@ then is interpreted relative to the current working directory of the calling process (like .BR chmod ()). - +.PP If .I pathname is absolute, then .I dirfd is ignored. - +.PP .I flags can either be 0, or include the following flag: .TP @@ -250,7 +250,7 @@ is set appropriately. .SH ERRORS Depending on the filesystem, errors other than those listed below can be returned. - +.PP The more general errors for .BR chmod () are listed below: @@ -350,7 +350,7 @@ library support was added to glibc in version 2.4. .BR chmod (), .BR fchmod (): 4.4BSD, SVr4, POSIX.1-2001i, POSIX.1-2008. - +.PP .BR fchmodat (): POSIX.1-2008. .SH NOTES diff --git a/man2/chroot.2 b/man2/chroot.2 index b1d16f4b7..503335b4c 100644 --- a/man2/chroot.2 +++ b/man2/chroot.2 @@ -65,12 +65,12 @@ changes the root directory of the calling process to that specified in .IR path . This directory will be used for pathnames beginning with \fI/\fP. The root directory is inherited by all children of the calling process. - +.PP Only a privileged process (Linux: one with the .B CAP_SYS_CHROOT capability in its user namespace) may call .BR chroot (). - +.PP This call changes an ingredient in the pathname resolution process and does nothing else. In particular, it is not intended to be used @@ -87,7 +87,7 @@ The easiest way to do that is to .BR chdir (2) to the to-be-moved directory, wait for it to be moved out, then open a path like ../../../etc/passwd. 
- +.PP .\" This is how the "slightly trickier variation" works: .\" https://github.com/QubesOS/qubes-secpack/blob/master/QSBs/qsb-014-2015.txt#L142 A slightly @@ -98,7 +98,7 @@ If a daemon allows a "chroot directory" to be specified, that usually means that if you want to prevent remote users from accessing files outside the chroot directory, you must ensure that folders are never moved out of it. - +.PP This call does not change the current working directory, so that after the call \(aq\fI.\fP\(aq can be outside the tree rooted at \(aq\fI/\fP\(aq. @@ -108,7 +108,7 @@ by doing: mkdir foo; chroot foo; cd .. .fi - +.PP This call does not close open file descriptors, and such file descriptors may allow access to files outside the chroot tree. .SH RETURN VALUE @@ -166,7 +166,7 @@ A child process created via inherits its parent's root directory. The root directory is left unchanged by .BR execve (2). - +.PP FreeBSD has a stronger .BR jail () system call. diff --git a/man2/clock_getres.2 b/man2/clock_getres.2 index 836b21e31..fadfd9ab3 100644 --- a/man2/clock_getres.2 +++ b/man2/clock_getres.2 @@ -224,7 +224,7 @@ T{ .BR clock_settime () T} Thread safety MT-Safe .TE - +.sp 1 .SH CONFORMING TO POSIX.1-2001, POSIX.1-2008, SUSv2. .SH AVAILABILITY @@ -289,7 +289,7 @@ Glibc contains no provisions to deal with these offsets (unlike the Linux Kernel). Typically these offsets are small and therefore the effects may be negligible in most cases. - +.PP Since glibc 2.4, the wrapper functions for the system calls described in this page avoid the abovementioned problems by employing the kernel implementation of diff --git a/man2/clock_nanosleep.2 b/man2/clock_nanosleep.2 index 637065ee3..ac877849d 100644 --- a/man2/clock_nanosleep.2 +++ b/man2/clock_nanosleep.2 @@ -58,7 +58,7 @@ It differs in allowing the caller to select the clock against which the sleep interval is to be measured, and in allowing the sleep interval to be specified as either an absolute or a relative value. 
- +.PP The time values passed to and returned by this call are specified using .I timespec structures, defined as follows: @@ -71,7 +71,7 @@ struct timespec { }; .fi .in - +.PP The .I clock_id argument specifies the clock against which the sleep interval @@ -102,7 +102,7 @@ and .BR pthread_getcpuclockid (3) can also be passed in .IR clock_id . - +.PP If .I flags is 0, then the value specified in @@ -110,7 +110,7 @@ is 0, then the value specified in is interpreted as an interval relative to the current value of the clock specified by .IR clock_id . - +.PP If .I flags is @@ -125,7 +125,7 @@ is less than or equal to the current value of the clock, then .BR clock_nanosleep () returns immediately without suspending the calling thread. - +.PP .BR clock_nanosleep () suspends the execution of the calling thread until either at least the time specified by @@ -133,7 +133,7 @@ until either at least the time specified by has elapsed, or a signal is delivered that causes a signal handler to be called or that terminates the process. - +.PP If the call is interrupted by a signal handler, .BR clock_nanosleep () fails with the error @@ -195,7 +195,7 @@ is not an exact multiple of the granularity underlying clock (see then the interval will be rounded up to the next multiple. Furthermore, after the sleep completes, there may still be a delay before the CPU becomes free to once again execute the calling thread. - +.PP Using an absolute timer is useful for preventing timer drift problems of the type described in .BR nanosleep (2). @@ -210,14 +210,14 @@ and then call with the .B TIMER_ABSTIME flag. - +.PP .BR clock_nanosleep () is never restarted after being interrupted by a signal handler, regardless of the use of the .BR sigaction (2) .B SA_RESTART flag. - +.PP The .I remain argument is unused, and unnecessary, when @@ -227,11 +227,11 @@ is (An absolute sleep can be restarted using the same .I request argument.) 
- +.PP POSIX.1 specifies that .BR clock_nanosleep () has no effect on signals dispositions or the signal mask. - +.PP POSIX.1 specifies that after changing the value of the .B CLOCK_REALTIME clock via @@ -243,7 +243,7 @@ will wake up; if the new clock value falls past the end of the sleep interval, then the .BR clock_nanosleep () call will return immediately. - +.PP POSIX.1 specifies that changing the value of the .B CLOCK_REALTIME diff --git a/man2/clone.2 b/man2/clone.2 index 2fec123ea..56e55a66b 100644 --- a/man2/clone.2 +++ b/man2/clone.2 @@ -60,14 +60,14 @@ clone, __clone2 \- create a child process .BR clone () creates a new process, in a manner similar to .BR fork (2). - +.PP This page describes both the glibc .BR clone () wrapper function and the underlying system call on which it is based. The main text describes the wrapper function; the differences for the raw system call are described toward the end of this page. - +.PP Unlike .BR fork (2), .BR clone () @@ -79,12 +79,12 @@ page, "calling process" normally corresponds to "parent process". But see the description of .B CLONE_PARENT below.) - +.PP One use of .BR clone () is to implement threads: multiple threads of control in a program that run concurrently in a shared memory space. - +.PP When the child process is created with .BR clone (), it executes the function @@ -104,7 +104,7 @@ The argument is passed to the .I fn function. - +.PP When the .IR fn ( arg ) function application returns, the child process terminates. @@ -114,7 +114,7 @@ is the exit code for the child process. The child process may also terminate explicitly by calling .BR exit (2) or after receiving a fatal signal. - +.PP The .I child_stack argument specifies the location of the stack used by the child process. @@ -130,7 +130,7 @@ Stacks grow downward on all processors that run Linux .I child_stack usually points to the topmost address of the memory space set up for the child stack. 
- +.PP The low byte of .I flags contains the number of the @@ -146,7 +146,7 @@ options when waiting for the child with .BR wait (2). If no signal is specified, then the parent process is not signaled when the child terminates. - +.PP .I flags may also be bitwise-or'ed with zero or more of the following constants, in order to specify what is shared between the calling process @@ -185,7 +185,7 @@ operation), the other process is also affected. If a process sharing a file descriptor table calls .BR execve (2), its file descriptor table is duplicated (unshared). - +.IP If .B CLONE_FILES is not set, the child process inherits a copy of all file descriptors @@ -215,7 +215,7 @@ or .BR umask (2) performed by the calling process or the child process also affects the other process. - +.IP If .B CLONE_FS is not set, the child process works on a copy of the filesystem @@ -236,7 +236,7 @@ the calling process. If this flag is not set, then (as with .BR fork (2)) the new process has its own I/O context. - +.IP .\" The following based on text from Jens Axboe The I/O context is the I/O scope of the disk scheduler (i.e., what the I/O scheduler uses to model scheduling of a process's I/O). @@ -253,7 +253,7 @@ for instance), they should employ .BR CLONE_IO to get better I/O performance. .\" with CFQ and AS. - +.IP If the kernel is not configured with the .B CONFIG_BLOCK option, this flag is a no-op. @@ -264,10 +264,10 @@ If this flag is not set, then (as with .BR fork (2)) the process is created in the same cgroup namespaces as the calling process. This flag is intended for the implementation of containers. - +.IP For further information on cgroup namespaces, see .BR cgroup_namespaces (7). - +.IP Only a privileged process .RB ( CAP_SYS_ADMIN ) can employ @@ -283,7 +283,7 @@ If this flag is not set, then (as with the process is created in the same IPC namespace as the calling process. This flag is intended for the implementation of containers. 
- +.IP An IPC namespace provides an isolated view of System\ V IPC objects (see .BR svipc (7)) and (since Linux 2.6.30) @@ -295,29 +295,29 @@ POSIX message queues The common characteristic of these IPC mechanisms is that IPC objects are identified by mechanisms other than filesystem pathnames. - +.IP Objects created in an IPC namespace are visible to all other processes that are members of that namespace, but are not visible to processes in other IPC namespaces. - +.IP When an IPC namespace is destroyed (i.e., when the last process that is a member of the namespace terminates), all IPC objects in the namespace are automatically destroyed. - +.IP Only a privileged process .RB ( CAP_SYS_ADMIN ) can employ .BR CLONE_NEWIPC . This flag can't be specified in conjunction with .BR CLONE_SYSVSEM . - +.IP For further information on IPC namespaces, see .BR namespaces (7). .TP .BR CLONE_NEWNET " (since Linux 2.6.24)" (The implementation of this flag was completed only by about kernel version 2.6.29.) - +.IP If .B CLONE_NEWNET is set, then create the process in a new network namespace. @@ -326,7 +326,7 @@ If this flag is not set, then (as with the process is created in the same network namespace as the calling process. This flag is intended for the implementation of containers. - +.IP A network namespace provides an isolated view of the networking stack (network device interfaces, IPv4 and IPv6 protocol stacks, IP routing tables, firewall rules, the @@ -341,14 +341,14 @@ A virtual network device ("veth") pair provides a pipe-like abstraction that can be used to create tunnels between network namespaces, and can be used to create a bridge to a physical network device in another namespace. - +.IP When a network namespace is freed (i.e., when the last process in the namespace terminates), its physical network devices are moved back to the initial network namespace (not to the parent of the process). For further information on network namespaces, see .BR namespaces (7). 
- +.IP Only a privileged process .RB ( CAP_SYS_ADMIN ) can employ @@ -363,7 +363,7 @@ If .B CLONE_NEWNS is not set, the child lives in the same mount namespace as the parent. - +.IP Only a privileged process .RB ( CAP_SYS_ADMIN ) can employ @@ -376,7 +376,7 @@ and in the same .BR clone () call. - +.IP For further information on mount namespaces, see .BR namespaces (7) and @@ -398,12 +398,12 @@ If this flag is not set, then (as with the process is created in the same PID namespace as the calling process. This flag is intended for the implementation of containers. - +.IP For further information on PID namespaces, see .BR namespaces (7) and .BR pid_namespaces (7). - +.IP Only a privileged process .RB ( CAP_SYS_ADMIN ) can employ @@ -422,19 +422,19 @@ the current semantics were merged in Linux 3.5, and the final pieces to make the user namespaces completely usable were merged in Linux 3.8.) - +.IP If .B CLONE_NEWUSER is set, then create the process in a new user namespace. If this flag is not set, then (as with .BR fork (2)) the process is created in the same user namespace as the calling process. - +.IP For further information on user namespaces, see .BR namespaces (7) and .BR user_namespaces (7) - +.IP Before Linux 3.8, use of .BR CLONE_NEWUSER required that the caller have three capabilities: @@ -445,7 +445,7 @@ and .\" Before Linux 2.6.29, it appears that only CAP_SYS_ADMIN was needed Starting with Linux 3.8, no privileges are needed to create a user namespace. - +.IP This flag can't be specified in conjunction with .BR CLONE_THREAD or @@ -459,7 +459,7 @@ For security reasons, .BR CLONE_NEWUSER cannot be specified in conjunction with .BR CLONE_FS . - +.IP For further information on user namespaces, see .BR user_namespaces (7). .TP @@ -474,7 +474,7 @@ If this flag is not set, then (as with the process is created in the same UTS namespace as the calling process. This flag is intended for the implementation of containers. 
- +.IP A UTS namespace is the set of identifiers returned by .BR uname (2); among these, the domain name and the hostname can be modified by @@ -485,12 +485,12 @@ respectively. Changes made to the identifiers in a UTS namespace are visible to all other processes in the same namespace, but are not visible to processes in other UTS namespaces. - +.IP Only a privileged process .RB ( CAP_SYS_ADMIN ) can employ .BR CLONE_NEWUTS . - +.IP For further information on UTS namespaces, see .BR namespaces (7). .TP @@ -500,13 +500,13 @@ If is set, then the parent of the new child (as returned by .BR getppid (2)) will be the same as that of the calling process. - +.IP If .B CLONE_PARENT is not set, then (as with .BR fork (2)) the child's parent is the calling process. - +.IP Note that it is the parent process, as returned by .BR getppid (2), which is signaled when the child terminates, so that @@ -548,7 +548,7 @@ then trace the child also (see .BR CLONE_SETTLS " (since Linux 2.5.32)" The TLS (Thread Local Storage) descriptor is set to .I newtls. - +.IP The interpretation of .I newtls and the resulting effect is architecture dependent. @@ -581,7 +581,7 @@ signals. So, one of them may block or unblock some signals using .BR sigprocmask (2) without affecting the other process. - +.IP If .B CLONE_SIGHAND is not set, the child process inherits a copy of the signal handlers @@ -592,7 +592,7 @@ Calls to .BR sigaction (2) performed later by one of the processes have no effect on the other process. - +.IP Since Linux 2.6.0-test6, .I flags must also include @@ -609,7 +609,7 @@ is set, then the child is initially stopped (as though it was sent a signal), and must be resumed by sending it a .B SIGCONT signal. - +.IP This flag was .I deprecated from Linux 2.6.25 onward, @@ -648,7 +648,7 @@ To make the remainder of the discussion of .B CLONE_THREAD more readable, the term "thread" is used to refer to the processes within a thread group. 
- +.IP Thread groups were a feature added in Linux 2.4 to support the POSIX threads notion of a set of threads that share a single PID. Internally, this shared PID is the so-called @@ -656,7 +656,7 @@ thread group identifier (TGID) for the thread group. Since Linux 2.4, calls to .BR getpid (2) return the TGID of the caller. - +.IP The threads within a group can be distinguished by their (system-wide) unique thread IDs (TID). A new thread's TID is available as the function result @@ -665,7 +665,7 @@ returned to the caller of and a thread can obtain its own TID using .BR gettid (2). - +.IP When a call is made to .BR clone () without specifying @@ -675,7 +675,7 @@ whose TGID is the same as the thread's TID. This thread is the .I leader of the new thread group. - +.IP A new thread created with .B CLONE_THREAD has the same parent process as the caller of @@ -697,23 +697,23 @@ using .BR wait (2). (The thread is said to be .IR detached .) - +.IP After all of the threads in a thread group terminate the parent process of the thread group is sent a .B SIGCHLD (or other termination) signal. - +.IP If any of the threads in a thread group performs an .BR execve (2), then all threads other than the thread group leader are terminated, and the new program is executed in the thread group leader. - +.IP If one of the threads in a thread group creates a child using .BR fork (2), then any thread in the group can .BR wait (2) for that child. - +.IP Since Linux 2.5.35, .I flags must also include @@ -726,17 +726,17 @@ is specified also requires .BR CLONE_VM to be included). - +.IP Signals may be sent to a thread group as a whole (i.e., a TGID) using .BR kill (2), or to a specific thread (i.e., TID) using .BR tgkill (2). - +.IP Signal dispositions and actions are process-wide: if an unhandled signal is delivered to a thread, then it will affect (terminate, stop, continue, be ignored in) all members of the thread group. 
- +.IP Each thread has its own signal mask, as set by .BR sigprocmask (2), but signals can be pending either: for the whole process @@ -749,7 +749,7 @@ A call to .BR sigpending (2) returns a signal set that is the union of the signals pending for the whole process and the signals that are pending for the calling thread. - +.IP If .BR kill (2) is used to send a signal to a thread group, @@ -780,7 +780,7 @@ or .BR _exit (2) (as with .BR vfork (2)). - +.IP If .B CLONE_VFORK is not set, then both the calling process and the child are schedulable @@ -799,7 +799,7 @@ Moreover, any memory mapping or unmapping performed with or .BR munmap (2) by the child or calling process also affects the other process. - +.IP If .B CLONE_VM is not set, the child process runs in a separate copy of the memory @@ -824,10 +824,10 @@ arguments of the wrapper function are omitted. Furthermore, the argument order changes. In addition, there are variations across architectures. - +.PP The raw system call interface on x86-64 and some other architectures (including sh, tile, and alpha) is roughly: - +.PP .in +4 .nf .BI "long clone(unsigned long " flags ", void *" child_stack , @@ -835,13 +835,13 @@ The raw system call interface on x86-64 and some other architectures .BI " unsigned long " newtls ); .fi .in - +.PP On x86-32, and several other common architectures (including score, ARM, ARM 64, PA-RISC, arc, Power PC, xtensa, and MIPS), .\" CONFIG_CLONE_BACKWARDS the order of the last two arguments is reversed: - +.PP .in +4 .nf .BI "long clone(unsigned long " flags ", void *" child_stack , @@ -849,11 +849,11 @@ the order of the last two arguments is reversed: .BI " int *" ctid ); .fi .in - +.PP On the cris and s390 architectures, .\" CONFIG_CLONE_BACKWARDS2 the order of the first two arguments is reversed: - +.PP .in +4 .nf .BI "long clone(void *" child_stack ", unsigned long " flags , @@ -861,11 +861,11 @@ the order of the first two arguments is reversed: .BI " unsigned long " newtls ); .fi .in - 
+.PP On the microblaze architecture, .\" CONFIG_CLONE_BACKWARDS3 an additional argument is supplied: - +.PP .in +4 .nf .BI "long clone(unsigned long " flags ", void *" child_stack , @@ -874,7 +874,7 @@ an additional argument is supplied: .BI " unsigned long " newtls ); .fi .in - +.PP Another difference for the raw system call is that the .I child_stack argument may be zero, in which case copy-on-write semantics ensure that the @@ -1076,7 +1076,7 @@ and the call would cause the limit on the number of nested user namespaces to be exceeded. See .BR user_namespaces (7). - +.IP From Linux 3.11 to Linux 4.8, the error diagnosed in this case was .BR EUSERS . .TP @@ -1152,13 +1152,13 @@ The system call can be used to test whether two processes share various resources such as a file descriptor table, System V semaphore undo operations, or a virtual address space. - - +.PP +.PP Handlers registered using .BR pthread_atfork (3) are not executed during a call to .BR clone (). - +.PP In the Linux 2.4.x series, .B CLONE_THREAD generally does not make the parent of the new thread the same @@ -1168,7 +1168,7 @@ However, for kernel versions 2.4.7 to 2.4.18 the flag implied the .B CLONE_PARENT flag (as in Linux 2.6.0 and later). - +.PP For a while there was .B CLONE_DETACHED (introduced in 2.5.32): @@ -1177,7 +1177,7 @@ In Linux 2.6.2, the need to give this flag together with .B CLONE_THREAD disappeared. This flag is still defined, but has no effect. - +.PP On i386, .BR clone () should not be called through vsyscall, but directly through diff --git a/man2/close.2 b/man2/close.2 index f57eb370a..751ec322b 100644 --- a/man2/close.2 +++ b/man2/close.2 @@ -132,13 +132,13 @@ Failing to check the return value when closing a file may lead to .I silent loss of data. This can especially be observed with NFS and with disk quota. 
- +.PP Note, however, that a failure return should be used only for diagnostic purposes (i.e., a warning to the application that there may still be I/O pending or there may have been failed I/O) or remedial purposes (e.g., writing the file once more or creating a backup). - +.PP Retrying the .BR close () after a failure return is the wrong thing to do, @@ -159,7 +159,7 @@ the steps that may return an error, .\" filp_close() such as flushing data to the filesystem or device, occur only later in the close operation. - +.PP Many other implementations similarly always close the file descriptor .\" FreeBSD documents this explicitly. From the look of the source code .\" SVR4, ancient SunOS, later Solaris, and AIX all do this. @@ -172,19 +172,19 @@ POSIX.1 is currently silent on this point, but there are plans to mandate this behavior in the next major release .\" Issue 8 of the standard - +.PP A careful programmer who wants to know about I/O errors may precede .BR close () with a call to .BR fsync (2). - +.PP The .B EINTR error is a somewhat special case. Regarding the .B EINTR error, POSIX.1-2013 says: - +.PP .RS If .BR close () @@ -196,7 +196,7 @@ and the state of .I fildes is unspecified. .RE - +.PP This permits the behavior that occurs on Linux and many other implementations, where, as with other errors that may be reported by diff --git a/man2/connect.2 b/man2/connect.2 index 62e20df72..a77ea27c8 100644 --- a/man2/connect.2 +++ b/man2/connect.2 @@ -94,7 +94,7 @@ is determined by the address space of the socket see .BR socket (2) for further details. - +.PP If the socket .I sockfd is of type @@ -259,12 +259,12 @@ POSIX.1 does not require the inclusion of and this header file is not required on Linux. However, some historical (BSD) implementations required this header file, and portable applications are probably wise to include it. - +.PP For background on the .I socklen_t type, see .BR accept (2). 
- +.PP If .BR connect () fails, consider the state of the socket as unspecified. diff --git a/man2/copy_file_range.2 b/man2/copy_file_range.2 index f3cbbe251..ff41cfc1d 100644 --- a/man2/copy_file_range.2 +++ b/man2/copy_file_range.2 @@ -47,7 +47,7 @@ bytes of data from file descriptor to file descriptor .IR fd_out , overwriting any data that exists within the requested range of the target file. - +.PP The following semantics apply for .IR off_in , and similar statements apply to @@ -74,7 +74,7 @@ is not changed, but .I off_in is adjusted appropriately. .PP - +.PP The .I flags argument is provided to allow for future extensions @@ -84,7 +84,7 @@ Upon successful completion, .BR copy_file_range () will return the number of bytes copied between files. This could be less than the length originally requested. - +.PP On error, .BR copy_file_range () returns \-1 and @@ -143,7 +143,7 @@ in a loop, and using the and .BR SEEK_HOLE operations to find the locations of data segments. - +.PP .BR copy_file_range () gives filesystems an opportunity to implement "copy acceleration" techniques, such as the use of reflinks (i.e., two or more i-nodes that share diff --git a/man2/create_module.2 b/man2/create_module.2 index 8c3851b12..7ffea6751 100644 --- a/man2/create_module.2 +++ b/man2/create_module.2 @@ -22,7 +22,7 @@ No declaration of this system call is provided in glibc headers; see NOTES. .SH DESCRIPTION .IR Note : This system call is present only in kernels before Linux 2.6. - +.PP .BR create_module () attempts to create a loadable module entry and reserve the kernel memory that will be needed to hold the module. diff --git a/man2/delete_module.2 b/man2/delete_module.2 index 87855d22a..0484d329e 100644 --- a/man2/delete_module.2 +++ b/man2/delete_module.2 @@ -46,7 +46,7 @@ The argument is used to modify the behavior of the system call, as described below. This system call requires privilege. - +.PP Module removal is attempted according to the following rules: .IP 1. 
4 If there are other loaded modules that depend on @@ -67,7 +67,7 @@ flag is always specified, and the flag may additionally be specified. .\" O_TRUNC == KMOD_REMOVE_FORCE in kmod library .\" O_NONBLOCK == KMOD_REMOVE_NOWAIT in kmod library - +.IP The various combinations for .I flags have the following effect: @@ -183,7 +183,7 @@ it is (before glibc 2.23) sufficient to manually declare the interface in your code; alternatively, you can invoke the system call using .BR syscall (2). - +.PP The uninterruptible sleep that may occur if .BR O_NONBLOCK is omitted from @@ -195,13 +195,13 @@ As at Linux 3.7, specifying is optional, but in future kernels it is likely to become mandatory. .SS Linux 2.4 and earlier In Linux 2.4 and earlier, the system call took only one argument: - +.PP .BI " int delete_module(const char *" name ); - +.PP If .I name is NULL, all unused modules marked auto-clean are removed. - +.PP Some further details of differences in the behavior of .BR delete_module () in Linux 2.4 and earlier are diff --git a/man2/dup.2 b/man2/dup.2 index 77e312982..3d2ad394b 100644 --- a/man2/dup.2 +++ b/man2/dup.2 @@ -56,7 +56,7 @@ The system call creates a copy of the file descriptor .IR oldfd , using the lowest-numbered unused file descriptor for the new descriptor. - +.PP After a successful return, the old and new file descriptors may be used interchangeably. They refer to the same open file description (see @@ -65,7 +65,7 @@ and thus share file offset and file status flags; for example, if the file offset is modified by using .BR lseek (2) on one of the file descriptors, the offset is also changed for the other. - +.PP The two file descriptors do not share file descriptor flags (the close-on-exec flag). The close-on-exec flag @@ -85,7 +85,7 @@ it uses the file descriptor number specified in If the file descriptor .IR newfd was previously open, it is silently closed before being reused. 
- +.PP The steps of closing and reusing the file descriptor .IR newfd are performed @@ -101,7 +101,7 @@ might be reused between the two steps. Such reuse could happen because the main program is interrupted by a signal handler that allocates a file descriptor, or because a parallel thread allocates a file descriptor. - +.PP Note the following points: .IP * 3 If @@ -210,7 +210,7 @@ version 2.9. .BR dup (), .BR dup2 (): POSIX.1-2001, POSIX.1-2008, SVr4, 4.3BSD. - +.PP .BR dup3 () is Linux-specific. .\" SVr4 documents additional @@ -230,7 +230,7 @@ also sometimes returns .B EINVAL like .BR F_DUPFD . - +.PP If .I newfd was open, any errors that would have been reported at @@ -246,7 +246,7 @@ before calling .BR dup2 (), because of the race condition described above. Instead, code something like the following could be used: - +.PP .nf /* Obtain a duplicate of 'newfd' that can subsequently be used to check for close() errors; an EBADF error diff --git a/man2/epoll_create.2 b/man2/epoll_create.2 index 3bab199d3..f612347aa 100644 --- a/man2/epoll_create.2 +++ b/man2/epoll_create.2 @@ -39,7 +39,7 @@ instance. Since Linux 2.6.8, the .I size argument is ignored, but must be greater than zero; see NOTES below. - +.PP .BR epoll_create () returns a file descriptor referring to the new epoll instance. This file descriptor is used for all the subsequent calls to the @@ -112,7 +112,7 @@ There was insufficient memory to create the kernel object. .BR epoll_create () was added to the kernel in version 2.6. Library support is provided in glibc starting with version 2.3.2. - +.PP .\" To be precise: kernel 2.5.44. .\" The interface should be finalized by Linux kernel 2.5.66. .BR epoll_create1 () diff --git a/man2/epoll_ctl.2 b/man2/epoll_ctl.2 index de63aec53..49c80486a 100644 --- a/man2/epoll_ctl.2 +++ b/man2/epoll_ctl.2 @@ -35,7 +35,7 @@ It requests that the operation .I op be performed for the target file descriptor, .IR fd . 
- +.PP Valid values for the .I op argument are: @@ -92,7 +92,7 @@ struct epoll_event { }; .fi .in - +.PP The .I events member is a bit mask composed by ORing together zero or more of @@ -134,7 +134,7 @@ Hang up happened on the associated file descriptor. .BR epoll_wait (2) will always wait for this event; it is not necessary to set it in .IR events . - +.IP Note that when reading from a channel such as a pipe or a stream socket, this event merely indicates that the peer closed its end of the channel. Subsequent reads from the channel will return 0 (end of file) @@ -206,7 +206,7 @@ The default in this scenario (when is not set) is for all epoll file descriptors to receive an event. .BR EPOLLEXCLUSIVE is thus useful for avoiding thundering herd problems in certain scenarios. - +.IP If the same file descriptor is in multiple epoll instances, some with the .BR EPOLLEXCLUSIVE @@ -215,7 +215,7 @@ instances that did not specify .BR EPOLLEXCLUSIVE , and at least one of the epoll instances that did specify .BR EPOLLEXCLUSIVE . - +.IP The following values may be specified in conjunction with .BR EPOLLEXCLUSIVE : .BR EPOLLIN , @@ -398,7 +398,7 @@ when using Applications that need to be portable to kernels before 2.6.9 should specify a non-null pointer in .IR event . - +.PP If .B EPOLLWAKEUP is specified in diff --git a/man2/epoll_wait.2 b/man2/epoll_wait.2 index ece2b2aef..e98b85f22 100644 --- a/man2/epoll_wait.2 +++ b/man2/epoll_wait.2 @@ -100,7 +100,7 @@ struct epoll_event { }; .fi .in - +.PP The .I data field of each returned structure contains the same data as was specified diff --git a/man2/eventfd.2 b/man2/eventfd.2 index d9d9443cb..e6b4ca27b 100644 --- a/man2/eventfd.2 +++ b/man2/eventfd.2 @@ -36,7 +36,7 @@ The object contains an unsigned 64-bit integer counter that is maintained by the kernel. This counter is initialized with the value specified in the argument .IR initval . 
- +.PP The following values may be bitwise ORed in .IR flags to change the behavior of @@ -67,7 +67,7 @@ See below. In Linux up to version 2.6.26, the .I flags argument is unused, and must be specified as zero. - +.PP As its return value, .BR eventfd () returns a new file descriptor that can be used to refer to the @@ -275,7 +275,7 @@ T{ .BR eventfd () T} Thread safety MT-Safe .TE - +.sp 1 .SH CONFORMING TO .BR eventfd () and @@ -289,13 +289,13 @@ The kernel overhead of an eventfd file descriptor is much lower than that of a pipe, and only one file descriptor is required (versus the two required for a pipe). - +.PP When used in the kernel, an eventfd file descriptor can provide a bridge from kernel to user space, allowing, for example, functionalities like KAIO (kernel AIO) .\" or eventually syslets/threadlets to signal to a file descriptor that some operation is complete. - +.PP A key point about an eventfd file descriptor is that it can be monitored just like any other file descriptor using .BR select (2), @@ -312,7 +312,7 @@ interface, these mechanisms could not be multiplexed via .BR poll (2), or .BR epoll (7).) - +.PP The current value of an eventfd counter can be viewed via the entry for the corresponding file descriptor in the process's .IR /proc/[pid]/fdinfo @@ -348,7 +348,7 @@ int eventfd_read(int fd, eventfd_t *value); int eventfd_write(int fd, eventfd_t value); .fi .in - +.PP The functions perform the read and write operations on an eventfd file descriptor, returning 0 if the correct number of bytes was transferred, @@ -362,7 +362,7 @@ the child writes each of the integers supplied in the program's command-line arguments to the eventfd file descriptor. When the parent has finished sleeping, it reads from the eventfd file descriptor. 
- +.PP The following shell session shows a sample run of the program: .in +4n .nf diff --git a/man2/execve.2 b/man2/execve.2 index a0428efd8..a2d4fa5ff 100644 --- a/man2/execve.2 +++ b/man2/execve.2 @@ -48,15 +48,15 @@ execve \- execute program executes the program pointed to by \fIfilename\fP. \fIfilename\fP must be either a binary executable, or a script starting with a line of the form: - +.PP .in +4n .nf \fB#!\fP \fIinterpreter \fP[optional-arg] .fi .in - +.PP For details of the latter case, see "Interpreter scripts" below. - +.PP \fIargv\fP is an array of argument strings passed to the new program. By convention, the first of these strings (i.e., .IR argv[0] ) @@ -65,31 +65,31 @@ should contain the filename associated with the file being executed. \fBkey=value\fP, which are passed as environment to the new program. The \fIargv\fP and \fIenvp\fP arrays must each include a null pointer at the end of the array. - +.PP The argument vector and environment can be accessed by the called program's main function, when it is defined as: - +.PP .in +4n .nf int main(int argc, char *argv[], char *envp[]) .fi .in - +.PP Note, however, that the use of a third argument to the main function is not specified in POSIX.1; according to POSIX.1, the environment should be accessed via the external variable .BR environ (7). - +.PP .BR execve () does not return on success, and the text, initialized data, uninitialized data (bss), and stack of the calling process are overwritten according to the contents of the newly loaded program. - +.PP If the current program is being ptraced, a \fBSIGTRAP\fP signal is sent to it after a successful .BR execve (). - +.PP If the set-user-ID bit is set on the program file pointed to by \fIfilename\fP, then the effective user ID of the calling process is changed @@ -97,7 +97,7 @@ to that of the owner of the program file. 
Similarly, when the set-group-ID bit of the program file is set the effective group ID of the calling process is set to the group of the program file. - +.PP The aforementioned transformations of the effective IDs are .I not performed (i.e., the set-user-ID and set-group-ID bits are ignored) @@ -126,11 +126,11 @@ The effective user ID of the process is copied to the saved set-user-ID; similarly, the effective group ID is copied to the saved set-group-ID. This copying takes place after any effective ID changes that occur because of the set-user-ID and set-group-ID mode bits. - +.PP The process's real UID and real GID, as well its supplementary group IDs, are unchanged by a call to .BR execve (). - +.PP If the executable is an a.out dynamically linked binary executable containing shared-library stubs, the Linux dynamic linker @@ -138,7 +138,7 @@ shared-library stubs, the Linux dynamic linker is called at the start of execution to bring needed shared objects into memory and link the executable with them. - +.PP If the executable is a dynamically linked ELF executable, the interpreter named in the PT_INTERP segment is used to load the needed shared objects. @@ -146,7 +146,7 @@ This interpreter is typically .I /lib/ld-linux.so.2 for binaries linked with glibc (see .BR ld-linux.so (8)). - +.PP All process attributes are preserved during an .BR execve (), except the following: @@ -294,13 +294,13 @@ closed across an .SS Interpreter scripts An interpreter script is a text file that has execute permission enabled and whose first line is of the form: - +.PP .in +4n .nf \fB#!\fP \fIinterpreter \fP[optional-arg] .fi .in - +.PP The .I interpreter must be a valid pathname for an executable file. @@ -311,13 +311,13 @@ argument of specifies an interpreter script, then .I interpreter will be invoked with the following arguments: - +.PP .in +4n .nf \fIinterpreter\fP [optional-arg] \fIfilename\fP arg... .fi .in - +.PP where .I arg... 
is the series of words pointed to by the @@ -326,12 +326,12 @@ argument of .BR execve (), starting at .IR argv [1]. - +.PP For portable use, .I optional-arg should either be absent, or be specified as a single word (i.e., it should not contain white space); see NOTES below. - +.PP Since Linux 2.6.28, .\" commit bf2a9a39639b8b51377905397a5005f444e9a892 the kernel permits the interpreter of a script to itself be a script. @@ -351,14 +351,14 @@ constant (either defined in .I <limits.h> or available at run time using the call .IR "sysconf(_SC_ARG_MAX)" ). - +.PP On Linux prior to kernel 2.6.23, the memory used to store the environment and argument strings was limited to 32 pages (defined by the kernel constant .BR MAX_ARG_PAGES ). On architectures with a 4-kB page size, this yields a maximum size of 128 kB. - +.PP On kernel 2.6.23 and later, most architectures support a size limit derived from the soft .B RLIMIT_STACK @@ -529,7 +529,7 @@ POSIX does not document the #! behavior, but it exists .SH NOTES Set-user-ID and set-group-ID processes can not be .BR ptrace (2)d. - +.PP The result of mounting a filesystem .I nosuid varies across Linux kernel versions: @@ -540,7 +540,7 @@ give the user powers she did not have already (and return some will just ignore the set-user-ID and set-group-ID bits and .BR exec () successfully. - +.PP On Linux, .I argv and @@ -562,7 +562,7 @@ case the same as Linux. .\" Bug filed 30 Apr 2007: http://bugzilla.kernel.org/show_bug.cgi?id=8408 .\" Bug rejected (because fix would constitute an ABI change). .\" - +.PP POSIX.1 says that values returned by .BR sysconf (3) should be invariant over the lifetime of a process. @@ -573,7 +573,7 @@ resource limit changes, then the value reported by will also change, to reflect the fact that the limit on space for holding command-line arguments and environment variables has changed. - +.PP In most cases where .BR execve () fails, control returns to the original executable image, @@ -591,7 +591,7 @@ signal.
.SS Interpreter scripts A maximum line length of 127 characters is allowed for the first line in an interpreter script. - +.PP The semantics of the .I optional-arg argument of an interpreter script vary across implementations. @@ -610,7 +610,7 @@ an interpreter script can have multiple arguments, and white spaces in .I optional-arg are used to delimit the arguments. - +.PP Linux ignores the set-user-ID and set-group-ID bits on scripts. .\" .\" .SH BUGS @@ -627,7 +627,7 @@ A more detailed explanation of the error that can occur (since Linux 3.1) when calling .BR execve () is as follows. - +.PP The .BR EAGAIN error can occur when a @@ -649,7 +649,7 @@ call to fail. .\" commit 909cc4ae86f3380152a18e2a3c44523893ee11c4 the resource limit was not imposed on processes that changed their user IDs.) - +.PP Since Linux 3.1, the scenario just described no longer causes the .BR set*uid () call to fail, @@ -680,7 +680,7 @@ common privileged daemon workflow\(emnamely, .BR set*uid () + .BR execve (). - +.PP If the resource limit was not still exceeded at the time of the .BR execve () call @@ -719,7 +719,7 @@ Since UNIX\ V7, both are NULL. .SH EXAMPLE The following program is designed to be execed by the second program below. It just echoes its command-line arguments, one per line. - +.PP .in +4n .nf /* myecho.c */ @@ -739,7 +739,7 @@ main(int argc, char *argv[]) } .fi .in - +.PP This program can be used to exec the program named in its command-line argument: .in +4n @@ -770,9 +770,9 @@ main(int argc, char *argv[]) } .fi .in - +.PP We can use the second program to exec the first as follows: - +.PP .in +4n .nf .RB "$" " cc myecho.c \-o myecho" @@ -783,13 +783,13 @@ argv[1]: hello argv[2]: world .fi .in - +.PP We can also use these programs to demonstrate the use of a script interpreter. 
To do this we create a script whose "interpreter" is our .I myecho program: - +.PP .in +4n .nf .RB "$" " cat > script" @@ -798,9 +798,9 @@ program: .RB "$" " chmod +x script" .fi .in - +.PP We can then use our program to exec the script: - +.PP .in +4n .nf .RB "$" " ./execve ./script" diff --git a/man2/execveat.2 b/man2/execveat.2 index 4512c0e79..ec8f45e93 100644 --- a/man2/execveat.2 +++ b/man2/execveat.2 @@ -45,7 +45,7 @@ and It operates in exactly the same way as .BR execve (2), except for the differences described in this manual page. - +.PP If the pathname given in .I pathname is relative, then it is interpreted relative to the directory @@ -55,7 +55,7 @@ referred to by the file descriptor the calling process, as is done by .BR execve (2) for a relative pathname). - +.PP If .I pathname is relative and @@ -67,13 +67,13 @@ then is interpreted relative to the current working directory of the calling process (like .BR execve (2)). - +.PP If .I pathname is absolute, then .I dirfd is ignored. - +.PP If .I pathname is an empty string and the @@ -83,7 +83,7 @@ flag is specified, then the file descriptor specifies the file to be executed (i.e., .IR dirfd refers to an executable file, rather than a directory). - +.PP The .I flags argument is a bit mask that can include zero or more of the following flags: @@ -176,7 +176,7 @@ system call is also needed to allow to be implemented on systems that do not have the .I /proc filesystem mounted. - +.PP When asked to execute a script file, the .IR argv[0] that is passed to the script interpreter is a string of the form @@ -199,7 +199,7 @@ in this case, .IR P is the value given in .IR pathname . 
- +.PP For the same reasons described in .BR fexecve (3), the natural idiom when using @@ -212,9 +212,9 @@ The .B ENOENT error described above means that it is not possible to set the close-on-exec flag on the file descriptor given to a call of the form: - +.PP execveat(fd, "", argv, envp, AT_EMPTY_PATH); - +.PP However, the inability to set the close-on-exec flag means that a file descriptor referring to the script leaks through to the script itself. As well as wasting a file descriptor, diff --git a/man2/fallocate.2 b/man2/fallocate.2 index 38d0f7961..05cb63642 100644 --- a/man2/fallocate.2 +++ b/man2/fallocate.2 @@ -24,7 +24,7 @@ This is a nonportable, Linux-specific system call. For the portable, POSIX.1-specified method of ensuring that space is allocated for a file, see .BR posix_fallocate (3). - +.PP .BR fallocate () allows the caller to directly manipulate the allocated disk space for the file referred to by @@ -34,7 +34,7 @@ for the byte range starting at and continuing for .I len bytes. - +.PP The .I mode argument determines the operation to be performed on the given range. @@ -62,13 +62,13 @@ This default behavior closely resembles the behavior of the .BR posix_fallocate (3) library function, and is intended as a method of optimally implementing that function. - +.PP After a successful call, subsequent writes into the range specified by .IR offset and .IR len are guaranteed not to fail because of lack of disk space. - +.PP If the .B FALLOC_FL_KEEP_SIZE flag is specified in @@ -79,7 +79,7 @@ but the file size will not be changed even if is greater than the file size. Preallocating zeroed blocks beyond the end of the file in this manner is useful for optimizing append workloads. - +.PP If the .B FALLOC_FL_UNSHARE flag is specified in @@ -108,7 +108,7 @@ Within the specified range, partial filesystem blocks are zeroed, and whole filesystem blocks are removed from the file. After a successful call, subsequent reads from this range will return zeroes. 
- +.PP The .BR FALLOC_FL_PUNCH_HOLE flag must be ORed with @@ -119,7 +119,7 @@ in other words, even when punching off the end of the file, the file size (as reported by .BR stat (2)) does not change. - +.PP Not all filesystems support .BR FALLOC_FL_PUNCH_HOLE ; if a filesystem doesn't support the operation, an error is returned. @@ -154,7 +154,7 @@ will be appended at the location and the file will be .I len bytes smaller. - +.PP A filesystem may place limitations on the granularity of the operation, in order to ensure efficient implementation. Typically, @@ -168,7 +168,7 @@ If a filesystem has such a requirement, will fail with the error .BR EINVAL if this requirement is violated. - +.PP If the region specified by .I offset plus @@ -177,12 +177,12 @@ reaches or passes the end of file, an error is returned; instead, use .BR ftruncate (2) to truncate a file. - +.PP No other flags may be specified in .IR mode in conjunction with .BR FALLOC_FL_COLLAPSE_RANGE . - +.PP As at Linux 3.15, .B FALLOC_FL_COLLAPSE_RANGE is supported by @@ -206,13 +206,13 @@ Within the specified range, blocks are preallocated for the regions that span the holes in the file. After a successful call, subsequent reads from this range will return zeroes. - +.PP Zeroing is done within the filesystem preferably by converting the range into unwritten extents. This approach means that the specified range will not be physically zeroed out on the device (except for partial blocks at the either end of the range), and I/O is (otherwise) required only to update metadata. - +.PP If the .B FALLOC_FL_KEEP_SIZE flag is additionally specified in @@ -224,7 +224,7 @@ is greater than the file size. This behavior is the same as when preallocating space with .B FALLOC_FL_KEEP_SIZE specified. - +.PP Not all filesystems support .BR FALLOC_FL_ZERO_RANGE ; if a filesystem doesn't support the operation, an error is returned. @@ -261,7 +261,7 @@ bytes. 
Inserting a hole inside a file increases the file size by .I len bytes. - +.PP This mode has the same limitations as .BR FALLOC_FL_COLLAPSE_RANGE regarding the granularity of the operation. @@ -275,12 +275,12 @@ is equal to or greater than the end of file, an error is returned. For such operations (i.e., inserting a hole at the end of file), .BR ftruncate (2) should be used. - +.PP No other flags may be specified in .IR mode in conjunction with .BR FALLOC_FL_INSERT_RANGE . - +.PP .B FALLOC_FL_INSERT_RANGE requires filesystem support. Filesystems that support this operation include diff --git a/man2/flock.2 b/man2/flock.2 index 993dc945d..345fcec86 100644 --- a/man2/flock.2 +++ b/man2/flock.2 @@ -68,9 +68,9 @@ To make a nonblocking request, include .B LOCK_NB (by ORing) with any of the above operations. - +.PP A single file may not simultaneously have both shared and exclusive locks. - +.PP Locks created by .BR flock () are associated with an open file description (see @@ -85,7 +85,7 @@ Furthermore, the lock is released either by an explicit .B LOCK_UN operation on any of these duplicate file descriptors, or when all such file descriptors have been closed. - +.PP If a process uses .BR open (2) (or similar) to obtain more than one file descriptor for the same file, @@ -94,19 +94,19 @@ these file descriptors are treated independently by An attempt to lock the file using one of these file descriptors may be denied by a lock that the calling process has already placed via another file descriptor. - +.PP A process may hold only one type of lock (shared or exclusive) on a file. Subsequent .BR flock () calls on an already locked file will convert an existing lock to the new lock mode. - +.PP Locks created by .BR flock () are preserved across an .BR execve (2). - +.PP A shared or exclusive lock can be placed on a file regardless of the mode in which the file was opened. .SH RETURN VALUE @@ -241,7 +241,7 @@ and occurs on many other implementations.) 
.BR open (2), .BR lockf (3), .BR lslocks (8) - +.PP .I Documentation/filesystems/locks.txt in the Linux kernel source tree .RI ( Documentation/locks.txt diff --git a/man2/fork.2 b/man2/fork.2 index 9aefa9a7d..b5af58ca0 100644 --- a/man2/fork.2 +++ b/man2/fork.2 @@ -52,7 +52,7 @@ process. The calling process is referred to as the .I parent process. - +.PP The child process and the parent process run in separate memory spaces. At the time of .BR fork () @@ -62,7 +62,7 @@ Memory writes, file mappings and unmappings .RB ( munmap (2)) performed by one of the processes do not affect the other. - +.PP The child process is an exact duplicate of the parent process except for the following points: .IP * 3 diff --git a/man2/fsync.2 b/man2/fsync.2 index 928e962fb..8b4bc0019 100644 --- a/man2/fsync.2 +++ b/man2/fsync.2 @@ -72,7 +72,7 @@ This includes writing through or flushing a disk cache if present. The call blocks until the device reports that the transfer has completed. It also flushes metadata information associated with the file (see .BR inode (7)). - +.PP Calling .BR fsync () does not necessarily ensure @@ -80,7 +80,7 @@ that the entry in the directory containing the file has also reached disk. For that an explicit .BR fsync () on a file descriptor for the directory is also needed. - +.PP .BR fdatasync () is similar to .BR fsync (), @@ -101,7 +101,7 @@ On the other hand, a change to the file size as made by say .BR ftruncate (2)), would require a metadata flush. - +.PP The aim of .BR fdatasync () is to reduce disk activity for applications that do not @@ -145,13 +145,13 @@ On some UNIX systems (but not Linux), must be a .I writable file descriptor. - +.PP In Linux 2.2 and earlier, .BR fdatasync () is equivalent to .BR fsync (), and so has no performance advantage. 
- +.PP The .BR fsync () implementations in older kernels and lesser used filesystems diff --git a/man2/futex.2 b/man2/futex.2 index 48586b3d5..78957aa63 100644 --- a/man2/futex.2 +++ b/man2/futex.2 @@ -54,7 +54,7 @@ Other .BR futex () operations can be used to wake any processes or threads waiting for a particular condition. - +.PP A futex is a 32-bit value\(emreferred to below as a .IR "futex word" \(emwhose address is supplied to the @@ -73,7 +73,7 @@ virtual addresses in different processes, but these addresses all refer to the same location in physical memory.) In a multithreaded program, it is sufficient to place the futex word in a global variable shared by all threads. - +.PP When executing a futex operation that requests to block a thread, the kernel will block only if the futex word has the value that the calling thread supplied (as one of the arguments of the @@ -106,7 +106,7 @@ blocking via a futex is an atomic compare-and-block operation. .\" the reference in the following sentence .\" See NOTES for a detailed specification of .\" the synchronization semantics. - +.PP One use of futexes is for implementing locks. The state of the lock (i.e., acquired or not acquired) can be represented as an atomically accessed flag in shared memory. @@ -133,10 +133,10 @@ operation that wakes threads blocked on the lock flag used as a futex word See .BR futex (7) for more detail on how to use futexes. - +.PP Besides the basic wait and wake-up futex functionality, there are further futex operations aimed at supporting more complex use cases. - +.PP Note that no explicit initialization or destruction is necessary to use futexes; the kernel maintains a futex @@ -157,7 +157,7 @@ argument; .IR val is a value whose meaning and purpose depends on .IR futex_op . - +.PP The remaining arguments .RI ( timeout , .IR uaddr2 , @@ -165,7 +165,7 @@ and .IR val3 ) are required only for certain of the futex operations described below. 
Where one of these arguments is not required, it is ignored. - +.PP For several blocking operations, the .I timeout argument is a pointer to a @@ -183,12 +183,12 @@ then to and in the remainder of this page, this argument is referred to as .I val2 when interpreted in this fashion. - +.PP Where it is required, the .IR uaddr2 argument is a pointer to a second futex word that is employed by the operation. - +.PP The interpretation of the final integer argument, .IR val3 , depends on the operation. @@ -216,7 +216,7 @@ This allows the kernel to make some additional performance optimizations. .\" I.e., It allows the kernel choose the fast path for validating .\" the user-space address and avoids expensive VMA lookups, .\" taking reference counts on file backing store, and so on. - +.IP As a convenience, .IR defines a set of constants with the suffix @@ -242,13 +242,13 @@ and .\" commit 337f13046ff03717a9e99675284a817527440a49 .BR FUTEX_WAIT operations. - +.IP If this option is set, the kernel measures the .I timeout against the .BR CLOCK_REALTIME clock. - +.IP If this option is not set, the kernel measures the .I timeout against the @@ -287,7 +287,7 @@ If the futex value does not match .IR val , then the call fails immediately with the error .BR EAGAIN . - +.IP The purpose of the comparison with the expected value is to prevent lost wake-ups. If another thread changed the value of the futex word after the @@ -298,7 +298,7 @@ operation (or similar wake-up) after the value change and before this .BR FUTEX_WAIT operation, then the calling thread will observe the value change and will not start to sleep. - +.IP If the .I timeout is not NULL, the structure it points to specifies a @@ -316,7 +316,7 @@ in If .I timeout is NULL, the call blocks indefinitely. - +.IP .IR Note : for .BR FUTEX_WAIT , @@ -335,7 +335,7 @@ with .IR val3 specified as .BR FUTEX_BITSET_MATCH_ANY . 
- +.IP The arguments .I uaddr2 and @@ -372,7 +372,7 @@ is specified as either 1 (wake up a single waiter) or No guarantee is provided about which waiters are awoken (e.g., a waiter with a higher scheduling priority is not guaranteed to be awoken in preference to a waiter with a lower priority). - +.IP The arguments .IR timeout , .IR uaddr2 , @@ -407,21 +407,21 @@ on the futex word, the file descriptor indicates as being readable with .BR poll (2), and .BR epoll (7) - +.IP The file descriptor can be used to obtain asynchronous notifications: if .I val is nonzero, then, when another process or thread executes a .BR FUTEX_WAKE , the caller will receive the signal number that was passed in .IR val . - +.IP The arguments .IR timeout , .I uaddr2 and .I val3 are ignored. - +.IP Because it was inherently racy, .B FUTEX_FD has been removed @@ -466,7 +466,7 @@ The argument specifies an upper limit on the number of waiters that are requeued to the futex at .IR uaddr2 . - +.IP .\" FIXME(Torvald) Is the following correct? Or is just the decision .\" which threads to wake or requeue part of the atomic operation? The load from @@ -482,7 +482,7 @@ ordered with respect to other operations on the same futex word. .\" source and target futex. No other waiter can enqueue itself .\" for waiting and no other waiter can dequeue itself because of .\" a timeout or signal. - +.IP Typical values to specify for .I val are 0 or 1. @@ -500,7 +500,7 @@ is typically either 1 or .BR FUTEX_CMP_REQUEUE operation equivalent to .BR FUTEX_WAIT .) - +.IP The .B FUTEX_CMP_REQUEUE operation was added as a replacement for the earlier @@ -517,7 +517,7 @@ conditions, which allows race conditions to be avoided in certain use cases. .\" To: Darren Hart .\" CC: libc-alpha@sourceware.org, ... .\" Subject: Re: Add futex wrapper to glibc? - +.IP Both .BR FUTEX_REQUEUE and @@ -529,7 +529,7 @@ another futex. 
Consider the following scenario, where multiple waiter threads are waiting on B, a wait queue implemented using a futex: - +.IP .in +4n .nf lock(A) @@ -541,7 +541,7 @@ while (!check_value(V)) { unlock(A); .fi .in - +.IP If a waker thread used .BR FUTEX_WAKE , then all waiters waiting on B would be woken up, @@ -573,13 +573,13 @@ of the wait queue associated with the condition variable. .BR FUTEX_WAKE_OP allows such cases to be implemented without leading to high rates of contention and context switching. - +.IP The .BR FUTEX_WAKE_OP operation is equivalent to executing the following code atomically and totally ordered with respect to other futex operations on any of the two supplied futex words: - +.IP .in +4n .nf int oldval = *(int *) uaddr2; @@ -589,7 +589,7 @@ if (oldval \fIcmp\fP \fIcmparg\fP) futex(uaddr2, FUTEX_WAKE, val2, 0, 0, 0); .fi .in - +.IP In other words, .BR FUTEX_WAKE_OP does the following: @@ -621,7 +621,7 @@ The operation and comparison that are to be performed are encoded in the bits of the argument .IR val3 . Pictorially, the encoding is: - +.IP .in +8n .nf +---+---+-----------+-----------+ @@ -630,9 +630,9 @@ Pictorially, the encoding is: 4 4 12 12 <== # of bits .fi .in - +.IP Expressed in code, the encoding is: - +.IP .in +4n .nf #define FUTEX_OP(op, oparg, cmp, cmparg) \\ @@ -642,7 +642,7 @@ Expressed in code, the encoding is: (cmparg & 0xfff)) .fi .in - +.IP In the above, .I op and @@ -653,11 +653,11 @@ The and .I cmparg components are literal numeric values, except as noted below. 
- +.IP The .I op component has one of the following values: - +.IP .in +4n .nf FUTEX_OP_SET 0 /* uaddr2 = oparg; */ @@ -667,23 +667,23 @@ FUTEX_OP_ANDN 3 /* uaddr2 &= ~oparg; */ FUTEX_OP_XOR 4 /* uaddr2 ^= oparg; */ .fi .in - +.IP In addition, bit-wise ORing the following value into .I op causes .IR "(1\ <<\ oparg)" to be used as the operand: - +.IP .in +4n .nf FUTEX_OP_ARG_SHIFT 8 /* Use (1 << oparg) as operand */ .fi .in - +.IP The .I cmp field is one of the following: - +.IP .in +4n .nf FUTEX_OP_CMP_EQ 0 /* if (oldval == cmparg) wake */ @@ -694,7 +694,7 @@ FUTEX_OP_CMP_GT 4 /* if (oldval > cmparg) wake */ FUTEX_OP_CMP_GE 5 /* if (oldval >= cmparg) wake */ .fi .in - +.IP The return value of .BR FUTEX_WAKE_OP is the sum of the number of waiters woken on the futex @@ -717,7 +717,7 @@ is stored in the kernel-internal state of the waiter. See the description of .BR FUTEX_WAKE_BITSET for further details. - +.IP If .I timeout is not NULL, the structure it points to specifies @@ -725,8 +725,8 @@ an absolute timeout for the wait operation. If .I timeout is NULL, the operation can block indefinitely. - - +.IP +.IP The .I uaddr2 argument is ignored. @@ -751,7 +751,7 @@ state of the waiter (the "wait" bit mask that is set using .BR FUTEX_WAIT_BITSET ). All of the waiters for which the result of the AND is nonzero are woken up; the remaining waiters are left sleeping. - +.IP The effect of .BR FUTEX_WAIT_BITSET and @@ -778,7 +778,7 @@ including those that are not interested in being woken up .\" obtain the absolute timeout functionality that is useful .\" for efficiently implementing Pthreads APIs (which use absolute .\" timeouts); FUTEX_WAIT provides only relative timeouts. - +.IP The constant .BR FUTEX_BITSET_MATCH_ANY , which corresponds to all 32 bits set in the bit mask, can be used as the @@ -807,7 +807,7 @@ with specified as .BR FUTEX_BITSET_MATCH_ANY ; that is, wake up any waiter(s). 
- +.IP The .I uaddr2 and @@ -826,7 +826,7 @@ while tasks at an intermediate priority continuously preempt the low-priority task from the CPU. Consequently, the low-priority task makes no progress toward releasing the lock, and the high-priority task remains blocked. - +.PP Priority inheritance is a mechanism for dealing with the priority-inversion problem. With this mechanism, when a high-priority task becomes blocked @@ -843,7 +843,7 @@ held by another intermediate-priority task then both of those tasks (or more generally, all of the tasks in a lock chain) have their priorities raised to be the same as the high-priority task. - +.PP From a user-space perspective, what makes a futex PI-aware is a policy agreement (described below) between user space and the kernel about the value of the futex word, @@ -859,7 +859,7 @@ for the implementation of very specific IPC mechanisms.) .\" talk about a PI aware pthread_mutex, than a PI aware futex, since .\" there is a lot of policy and scaffolding that has to be built up .\" around it to use it properly (this is what a PI pthread_mutex is). - +.PP .\" mtk: The following text is drawn from the Hart/Guniguntala paper .\" (listed in SEE ALSO), but I have reworded some pieces .\" significantly. @@ -880,7 +880,7 @@ If the lock is owned and there are threads contending for the lock, then the .B FUTEX_WAITERS bit shall be set in the futex word's value; in other words, this value is: - +.IP FUTEX_WAITERS | TID .IP (Note that it is invalid for a PI futex word to have no owner and @@ -897,7 +897,7 @@ Acquiring a lock simply consists of using compare-and-swap to atomically set the futex word's value to the caller's TID if its previous value was 0. Releasing a lock requires using compare-and-swap to set the futex word's value to 0 if the previous value was the expected TID.
- +.PP If a futex is already acquired (i.e., has a nonzero value), waiters must employ the .B FUTEX_LOCK_PI @@ -908,7 +908,7 @@ bit is set in the futex value; in this case, the lock owner must employ the .B FUTEX_UNLOCK_PI operation to release the lock. - +.PP In the cases where callers are forced into the kernel (i.e., required to perform a .BR futex () @@ -918,7 +918,7 @@ a kernel locking mechanism which implements the required priority-inheritance semantics. After the RT-mutex is acquired, the futex value is updated accordingly, before the calling thread returns to user space. - +.PP It is important to note .\" tglx (July 2015): .\" If there are multiple waiters on a pi futex then a wake pi operation @@ -935,7 +935,7 @@ up in an invalid state, such as having an owner but the value being 0, or having waiters but not having the .B FUTEX_WAITERS bit set.) - +.PP If a futex has an associated RT-mutex in the kernel (i.e., there are blocked waiters) and the owner of the futex/RT-mutex dies unexpectedly, @@ -954,7 +954,7 @@ the dead owner. .\" mechanism. In that case the futex value will be set to .\" FUTEX_OWNER_DIED. The robust futex mechanism is also available for non .\" PI futexes. - +.PP PI futexes are operated on by specifying one of the values listed below in .IR futex_op . Note that the PI futex operations must be used as paired operations @@ -996,7 +996,7 @@ This operation is used after an attempt to acquire the lock via an atomic user-mode instruction failed because the futex word has a nonzero value\(emspecifically, because it contained the (PID-namespace-specific) TID of the lock owner. - +.IP The operation checks the value of the futex word at the address .IR uaddr . If the value is 0, then the kernel tries to atomically set @@ -1079,7 +1079,7 @@ This inheritance follows the lock chain in the case of nested locking .\" (i.e., task 1 blocks on lock A, held by task 2, .\" while task 2 blocks on lock B, held by task 3) and performs deadlock detection. 
- +.IP The .I timeout argument provides a timeout for the lock attempt. @@ -1108,7 +1108,7 @@ clock. If .I timeout is NULL, the operation will block indefinitely. - +.IP The .IR uaddr2 , .IR val , @@ -1125,7 +1125,7 @@ This operation tries to acquire the lock at .IR uaddr . It is invoked when a user-space atomic acquire did not succeed because the futex word was not 0. - +.IP Because the kernel has access to more state information than user space, acquisition of the lock might succeed if performed by the kernel in cases where the futex word @@ -1149,7 +1149,7 @@ but the kernel can fix this up and acquire the futex. .\" Darren Hart (Oct 2015): .\" The trylock in the kernel has more state, so it can independently .\" verify the flags that userspace must trust implicitly. - +.IP The .IR uaddr2 , .IR val , @@ -1168,11 +1168,11 @@ This operation wakes the top priority waiter that is waiting in on the futex address provided by the .I uaddr argument. - +.IP This is called when the user-space value at .I uaddr cannot be changed atomically from a TID (of the owner) to 0. - +.IP The .IR uaddr2 , .IR val , @@ -1196,7 +1196,7 @@ from a non-PI source futex .RI ( uaddr ) to a PI target futex .RI ( uaddr2 ). - +.IP As with .BR FUTEX_CMP_REQUEUE , this operation wakes up a maximum of @@ -1212,7 +1212,7 @@ The remaining waiters are removed from the wait queue of the source futex at .I uaddr and added to the wait queue of the target futex at .IR uaddr2 . - +.IP The .I val2 .\" val2 is the cap on the number of requeued waiters. @@ -1246,7 +1246,7 @@ The wait operation on .I uaddr is the same as for .BR FUTEX_WAIT . - +.IP The waiter can be removed from the wait on .I uaddr without requeueing on @@ -1258,7 +1258,7 @@ In this case, the .BR FUTEX_WAIT_REQUEUE_PI operation fails with the error .BR EAGAIN . - +.IP If .I timeout is not NULL, the structure it points to specifies @@ -1266,11 +1266,11 @@ an absolute timeout for the wait operation. 
If .I timeout is NULL, the operation can block indefinitely. - +.IP The .I val3 argument is ignored. - +.IP The .BR FUTEX_WAIT_REQUEUE_PI and @@ -1323,7 +1323,7 @@ was invoked via all operations return \-1 and set .I errno to indicate the cause of the error. - +.PP The return value on success depends on the operation, as described in the following list: .TP @@ -1414,7 +1414,7 @@ The value pointed to by was not equal to the expected value .I val at the time of the call. - +.IP .BR Note : on Linux, the symbolic names .B EAGAIN @@ -1688,7 +1688,7 @@ and the timeout expired before the operation completed. .PP Futexes were first made available in a stable kernel release with Linux 2.6.0. - +.PP Initial futex support was merged in Linux 2.5.7 but with different semantics from what was described above. A four-argument system call with the semantics @@ -1700,7 +1700,7 @@ This system call is Linux-specific. .SH NOTES Glibc does not provide a wrapper for this system call; call it using .BR syscall (2). - +.PP Several higher-level programming abstractions are implemented via futexes, including POSIX semaphores and various POSIX threads synchronization mechanisms @@ -1722,7 +1722,7 @@ The two processes each write messages to the terminal and employ a synchronization protocol that ensures that they alternate in writing messages. Upon running this program we see output such as the following: - +.PP .in +4n .nf $ \fB./futex_demo\fP @@ -1912,17 +1912,17 @@ Franke, H., Russell, R., and Kirwood, M., 2002. .br .UR http://kernel.org\:/doc\:/ols\:/2002\:/ols2002\-pages\-479\-495.pdf .UE - +.PP Hart, D., 2009. \fIA futex overview and update\fP, .UR http://lwn.net/Articles/360699/ .UE - +.PP Hart, D. and Guniguntala, D., 2009. \fIRequeue-PI: Making Glibc Condvars PI-Aware\fP (from proceedings of the 2009 Real-Time Linux Workshop), .UR http://lwn.net/images/conf/rtlws11/papers/proc/p10.pdf .UE - +.PP Drepper, U., 2011. 
\fIFutexes Are Tricky\fP, .UR http://www.akkadia.org/drepper/futex.pdf .UE diff --git a/man2/futimesat.2 b/man2/futimesat.2 index 0e8d29915..fc0deab6d 100644 --- a/man2/futimesat.2 +++ b/man2/futimesat.2 @@ -47,13 +47,13 @@ This system call is obsolete. Use .BR utimensat (2) instead. - +.PP The .BR futimesat () system call operates in exactly the same way as .BR utimes (2), except for the differences described in this manual page. - +.PP If the pathname given in .I pathname is relative, then it is interpreted relative to the directory @@ -63,7 +63,7 @@ referred to by the file descriptor the calling process, as is done by .BR utimes (2) for a relative pathname). - +.PP If .I pathname is relative and @@ -75,7 +75,7 @@ then is interpreted relative to the current working directory of the calling process (like .BR utimes (2)). - +.PP If .I pathname is absolute, then @@ -114,7 +114,7 @@ This system call is nonstandard. It was implemented from a specification that was proposed for POSIX.1, but that specification was replaced by the one for .BR utimensat (2). - +.PP A similar system call exists on Solaris. .SH NOTES .SS Glibc notes diff --git a/man2/get_kernel_syms.2 b/man2/get_kernel_syms.2 index a7648b045..fdfbf46e6 100644 --- a/man2/get_kernel_syms.2 +++ b/man2/get_kernel_syms.2 @@ -22,7 +22,7 @@ No declaration of this system call is provided in glibc headers; see NOTES. .SH DESCRIPTION .BR Note : This system call is present only in kernels before Linux 2.6. - +.PP If .I table is NULL, diff --git a/man2/get_mempolicy.2 b/man2/get_mempolicy.2 index 111660b34..522d65520 100644 --- a/man2/get_mempolicy.2 +++ b/man2/get_mempolicy.2 @@ -42,12 +42,12 @@ Link with \fI\-lnuma\fP. retrieves the NUMA policy of the calling thread or of a memory address, depending on the setting of .IR flags . - +.PP A NUMA machine has different memory controllers with different distances to specific CPUs. The memory policy defines from which node memory is allocated for the thread. 
- +.PP If .I flags is specified as 0, @@ -69,7 +69,7 @@ When is 0, .I addr must be specified as NULL. - +.PP If .I flags specifies @@ -91,7 +91,7 @@ with either .B MPOL_F_ADDR or .BR MPOL_F_NODE . - +.PP If .I flags specifies @@ -105,7 +105,7 @@ or one of the helper functions described in .BR numa (3) has been used to establish a policy for the memory range containing .IR addr . - +.PP If the .I mode argument is not NULL, then @@ -126,7 +126,7 @@ The value specified by .I maxnode is always rounded to a multiple of .IR "sizeof(unsigned\ long)*8" . - +.PP If .I flags specifies both @@ -143,7 +143,7 @@ If no page has yet been allocated for the specified address, will allocate a page as if the thread had performed a read (load) access to that address, and return the ID of the node where that page was allocated. - +.PP If .I flags specifies @@ -168,9 +168,9 @@ call with the flag for read accesses, and in memory ranges mapped with the .B MAP_SHARED flag for all accesses. - +.PP Other flag values are reserved. - +.PP For an overview of the possible policies see .BR set_mempolicy (2). .SH RETURN VALUE diff --git a/man2/get_robust_list.2 b/man2/get_robust_list.2 index 43f3ee4eb..c83f2ff2f 100644 --- a/man2/get_robust_list.2 +++ b/man2/get_robust_list.2 @@ -47,7 +47,7 @@ The robust futex implementation needs to maintain per-thread lists of the robust futexes which are to be unlocked when the thread exits. These lists are managed in user space; the kernel is notified about only the location of the head of the list. - +.PP The .BR get_robust_list () system call returns the head of the robust futex list of the thread @@ -63,14 +63,14 @@ The size of the object pointed to by .I **head_ptr is stored in .IR len_ptr . - +.PP Permission to employ .BR get_robust_list () is governed by a ptrace access mode .B PTRACE_MODE_READ_REALCREDS check; see .BR ptrace (2). 
- +.PP The .BR set_robust_list () system call requests the kernel to record the head of the list of @@ -126,14 +126,14 @@ These system calls are not needed by normal applications. No support for them is provided in glibc. In the unlikely event that you want to call them directly, use .BR syscall (2). - +.PP A thread can have only one robust futex list; therefore applications that wish to use this functionality should use the robust mutexes provided by glibc. .SH SEE ALSO .BR futex (2) .\" .BR pthread_mutexattr_setrobust_np (3) - +.PP .IR Documentation/robust-futexes.txt and .IR Documentation/robust-futex-ABI.txt diff --git a/man2/getcpu.2 b/man2/getcpu.2 index eceb895d7..e6bebb199 100644 --- a/man2/getcpu.2 +++ b/man2/getcpu.2 @@ -39,11 +39,11 @@ When either or .I node is NULL nothing is written to the respective pointer. - +.PP The third argument to this system call is nowadays unused, and should be specified as NULL unless portability to Linux 2.6.23 or earlier is required (see NOTES). - +.PP The information placed in .I cpu is guaranteed to be current only at the time of the call: @@ -79,13 +79,13 @@ The intention of .BR getcpu () is to allow programs to make optimizations with per-CPU data or for NUMA optimization. - +.PP Glibc does not provide a wrapper for this system call; call it using .BR syscall (2); or use .BR sched_getcpu (3) instead. - +.PP The .I tcache argument is unused since Linux 2.6.24. diff --git a/man2/getdents.2 b/man2/getdents.2 index 6c0956daa..f27d06ad8 100644 --- a/man2/getdents.2 +++ b/man2/getdents.2 @@ -93,7 +93,7 @@ is the size of this entire .IR linux_dirent . .I d_name is a null-terminated filename. - +.PP .I d_type is a byte at the end of the structure that indicates the file type. It contains one of the following values (defined in @@ -157,14 +157,14 @@ In addition, supports an explicit .I d_type field. 
- +.PP The .BR getdents64 () system call is like .BR getdents (), except that its second argument is a pointer to a buffer containing structures of the following type: - +.PP .nf .in +4n struct linux_dirent64 { @@ -213,7 +213,7 @@ structure yourself. However, you probably want to use .BR readdir (3) instead. - +.PP These calls supersede .BR readdir (2). .SH EXAMPLE @@ -223,7 +223,7 @@ The program below demonstrates the use of .BR getdents (). The following output shows an example of what we see when running this program on an ext2 directory: - +.PP .in +4n .nf .RB "$" " ./a.out /testfs/" diff --git a/man2/getdomainname.2 b/man2/getdomainname.2 index 9309b38c6..d6daf0344 100644 --- a/man2/getdomainname.2 +++ b/man2/getdomainname.2 @@ -57,7 +57,7 @@ Feature Test Macro Requirements for glibc (see .SH DESCRIPTION These functions are used to access or to change the NIS domain name of the host system. - +.PP .BR setdomainname () sets the domain name to the value given in the character array .IR name . @@ -68,7 +68,7 @@ argument specifies the number of bytes in (Thus, .I name does not require a terminating null byte.) - +.PP .BR getdomainname () returns the null-terminated domain name in the character array .IR name , @@ -121,7 +121,7 @@ POSIX does not specify these calls. Since Linux 1.0, the limit on the length of a domain name, including the terminating null byte, is 64 bytes. In older kernels, it was 8 bytes. - +.PP On most Linux architectures (including x86), there is no .BR getdomainname () diff --git a/man2/getgid.2 b/man2/getgid.2 index 26f49ad63..0d9a802ea 100644 --- a/man2/getgid.2 +++ b/man2/getgid.2 @@ -36,7 +36,7 @@ getgid, getegid \- get group identity .SH DESCRIPTION .BR getgid () returns the real group ID of the calling process. - +.PP .BR getegid () returns the effective group ID of the calling process. 
.SH ERRORS diff --git a/man2/getgroups.2 b/man2/getgroups.2 index bd89eec2c..e6900adf6 100644 --- a/man2/getgroups.2 +++ b/man2/getgroups.2 @@ -71,7 +71,7 @@ is included in the returned list. (Thus, an application should also call .BR getegid (2) and add or remove the resulting value.) - +.PP If .I size is zero, @@ -100,7 +100,7 @@ returns the number of supplementary group IDs. On error, \-1 is returned, and .I errno is set appropriately. - +.PP On success, .BR setgroups () returns 0. @@ -166,7 +166,7 @@ is defined in The set of supplementary group IDs is inherited from the parent process, and preserved across an .BR execve (2). - +.PP The maximum number of supplementary group IDs can be found at run time using .BR sysconf (3): .nf @@ -181,7 +181,7 @@ cannot be larger than one more than this value. Since Linux 2.6.4, the maximum number of supplementary group IDs is also exposed via the Linux-specific read-only file, .IR /proc/sys/kernel/ngroups_max . - +.PP The original Linux .BR getgroups () system call supported only 16-bit group IDs. diff --git a/man2/gethostname.2 b/man2/gethostname.2 index 79e8cc354..9a30a1359 100644 --- a/man2/gethostname.2 +++ b/man2/gethostname.2 @@ -69,7 +69,7 @@ _BSD_SOURCE || _XOPEN_SOURCE\ >=\ 500 .SH DESCRIPTION These system calls are used to access or to change the hostname of the current processor. - +.PP .BR sethostname () sets the hostname to the value given in the character array .IR name . @@ -80,7 +80,7 @@ argument specifies the number of bytes in (Thus, .I name does not require a terminating null byte.) - +.PP .BR gethostname () returns the null-terminated hostname in the character array .IR name , @@ -167,7 +167,7 @@ set to .BR ENAMETOOLONG ; in this case, a terminating null byte is not included in the returned .IR name . 
- +.PP Versions of glibc before 2.2 .\" At least glibc 2.0 and 2.1, older versions not checked handle the case where the length of the diff --git a/man2/getitimer.2 b/man2/getitimer.2 index 08f6d0d34..1d7d6ca63 100644 --- a/man2/getitimer.2 +++ b/man2/getitimer.2 @@ -29,7 +29,7 @@ and (optionally) at regular intervals after that. When a timer expires, a signal is generated for the calling process, and the timer is reset to the specified interval (if the interval is nonzero). - +.PP Three types of timers\(emspecified via the .IR which argument\(emare provided, @@ -56,14 +56,14 @@ CPU time consumed by the process. At each expiration, a .B SIGPROF signal is generated. - +.IP In conjunction with .BR ITIMER_VIRTUAL , this timer can be used to profile user and system CPU time consumed by the process. .LP A process has only one of each of the three types of timers. - +.PP Timer values are defined by the following structures: .PD 0 .in +4n @@ -88,7 +88,7 @@ places the current value of the timer specified by .IR which in the buffer pointed to by .IR curr_value . - +.PP The .IR it_value substructure is populated with the amount of time remaining until @@ -99,7 +99,7 @@ when the timer expires. If both fields of .IR it_value are zero, then this timer is currently disarmed (inactive). - +.PP The .IR it_interval substructure is populated with the timer interval. @@ -119,7 +119,7 @@ is non-NULL, the buffer it points to is used to return the previous value of the timer (i.e., the same information that is returned by .BR getitimer ()). - +.PP If either field in .IR new_value.it_value is nonzero, @@ -127,7 +127,7 @@ then the timer is armed to initially expire at the specified time. If both fields in .IR new_value.it_value are zero, then the timer is disarmed. 
- +.PP The .IR new_value.it_interval field specifies the new interval for the timer; @@ -177,13 +177,13 @@ on the system timer resolution and on the system load; see If the timer expires while the process is active (always true for .BR ITIMER_VIRTUAL ), the signal will be delivered immediately when generated. - +.PP A child created via .BR fork (2) does not inherit its parent's interval timers. Interval timers are preserved across an .BR execve (2). - +.PP POSIX.1 leaves the interaction between .BR setitimer () @@ -193,16 +193,16 @@ and the three interfaces and .BR usleep (3) unspecified. - +.PP The standards are silent on the meaning of the call: - +.PP setitimer(which, NULL, &old_value); - +.PP Many systems (Solaris, the BSDs, and perhaps others) treat this as equivalent to: - +.PP getitimer(which, &old_value); - +.PP In Linux, this is treated as being equivalent to a call in which the .I new_value fields are zero; that is, the timer is disabled. @@ -217,7 +217,7 @@ Under very heavy loading, an timer may expire before the signal from a previous expiration has been delivered. The second signal in such an event will be lost. - +.PP On Linux kernels before 2.6.16, timer values are represented in jiffies. If a request is made to set a timer with a value whose jiffies representation exceeds @@ -232,14 +232,14 @@ approximately 99.42 days. Since Linux 2.6.16, the kernel uses a different internal representation for times, and this ceiling is removed. - +.PP On certain systems (including i386), Linux kernels before version 2.6.12 have a bug which will produce premature timer expirations of up to one jiffy under some circumstances. This bug is fixed in kernel 2.6.12. .\" 4 Jul 2005: It looks like this bug may remain in 2.4.x.
.\" http://lkml.org/lkml/2005/7/1/165 - +.PP POSIX.1-2001 says that .BR setitimer () should fail if a diff --git a/man2/getpagesize.2 b/man2/getpagesize.2 index f3d223eb9..bc5932989 100644 --- a/man2/getpagesize.2 +++ b/man2/getpagesize.2 @@ -84,12 +84,12 @@ instead of long sz = sysconf(_SC_PAGESIZE); .fi .in - +.PP (Most systems allow the synonym .B _SC_PAGE_SIZE for .BR _SC_PAGESIZE .) - +.PP Whether .BR getpagesize () is present as a Linux system call depends on the architecture. diff --git a/man2/getpeername.2 b/man2/getpeername.2 index 84a3c109f..9eceecbb4 100644 --- a/man2/getpeername.2 +++ b/man2/getpeername.2 @@ -60,7 +60,7 @@ by .IR addr . On return it contains the actual size of the name returned (in bytes). The name is truncated if the buffer provided is too small. - +.PP The returned address is truncated if the buffer provided is too small; in this case, .I addrlen @@ -107,7 +107,7 @@ For background on the .I socklen_t type, see .BR accept (2). - +.PP For stream sockets, once a .BR connect (2) has been performed, either socket can call diff --git a/man2/getpriority.2 b/man2/getpriority.2 index aa108679f..dfcdd6625 100644 --- a/man2/getpriority.2 +++ b/man2/getpriority.2 @@ -67,7 +67,7 @@ call. The process attribute dealt with by these system calls is the same attribute (also known as the "nice" value) that is dealt with by .BR nice (2). - +.PP The value .I which is one of @@ -90,7 +90,7 @@ A zero value for .I who denotes (respectively) the calling process, the process group of the calling process, or the real user ID of the calling process. - +.PP The .I prio argument is a value in the range \-20 to 19 (but see NOTES below). @@ -99,7 +99,7 @@ Attempts to set a priority outside this range are silently clamped to the range. The default priority is 0; lower values give a process a higher scheduling priority. 
- +.PP The .BR getpriority () call returns the highest priority (lowest numerical value) @@ -108,7 +108,7 @@ The .BR setpriority () call sets the priorities of all of the specified processes to the specified value. - +.PP Traditionally, only a privileged process could lower the nice value (i.e., set a higher priority). However, since Linux 2.6.12, an unprivileged process can decrease @@ -132,7 +132,7 @@ to clear the external variable prior to the call, then check it afterward to determine if \-1 is an error or a legitimate value. - +.PP .BR setpriority () returns 0 on success. On error, it returns \-1 and sets @@ -179,19 +179,19 @@ SVr4, 4.4BSD (these interfaces first appeared in 4.2BSD). .SH NOTES For further details on the nice value, see .BR sched (7). - +.PP .IR Note : the addition of the "autogroup" feature in Linux 2.6.38 means that the nice value no longer has its traditional effect in many circumstances. For details, see .BR sched (7). - +.PP A child created by .BR fork (2) inherits its parent's nice value. The nice value is preserved across .BR execve (2). - +.PP The details on the condition for .B EPERM depend on the system. @@ -206,7 +206,7 @@ the real or effective user ID of the process \fIwho\fP. All BSD-like systems (SunOS 4.1.3, Ultrix 4.2, 4.3BSD, FreeBSD 4.3, OpenBSD-2.5, ...) behave in the same manner as Linux 2.6.12 and later. - +.PP Including .I <sys/time.h> is not required these days, but increases portability. @@ -247,6 +247,6 @@ which may be made standards conformant in the future. .BR fork (2), .BR capabilities (7), .BR sched (7) - +.PP .I Documentation/scheduler/sched-nice-design.txt in the Linux kernel source tree (since Linux 2.6.23) diff --git a/man2/getrandom.2 b/man2/getrandom.2 index 45de28123..f691af457 100644 --- a/man2/getrandom.2 +++ b/man2/getrandom.2 @@ -41,7 +41,7 @@ with up to random bytes. These bytes can be used to seed user-space random number generators or for cryptographic purposes.
- +.PP By default, .BR getrandom () draws entropy from the @@ -52,7 +52,7 @@ device). This behavior can be changed via the .I flags argument. - +.PP If the .I urandom source has been initialized, @@ -62,7 +62,7 @@ No such guarantees apply for larger buffer sizes. For example, if the call is interrupted by a signal handler, it may return a partially filled buffer, or fail with the error .BR EINTR . - +.PP If the .I urandom source has not yet been initialized, then @@ -71,7 +71,7 @@ will block, unless .B GRND_NONBLOCK is specified in .IR flags . - +.PP The .I flags argument is a bit mask that can contain zero or more of the following values @@ -174,7 +174,7 @@ This system call is Linux-specific. For an overview and comparison of the various interfaces that can be used to obtain randomness, see .BR random (7). - +.PP Unlike .IR /dev/random and @@ -232,7 +232,7 @@ will block until some random bytes become available (unless the .BR GRND_NONBLOCK flag was specified). - +.PP The behavior when a call to .BR getrandom () that is blocked while reading from the @@ -257,19 +257,19 @@ then will not fail with .BR EINTR . Instead, it will return all of the bytes that have been requested. - +.PP When reading from the .IR random source, blocking requests of any size can be interrupted by a signal handler (the call fails with the error .BR EINTR ). - +.PP Using .BR getrandom () to read small buffers (<=\ 256 bytes) from the .I urandom source is the preferred mode of usage. - +.PP The special treatment of small values of .I buflen was designed for compatibility with diff --git a/man2/getresuid.2 b/man2/getresuid.2 index 6100d75fd..605263adb 100644 --- a/man2/getresuid.2 +++ b/man2/getresuid.2 @@ -59,7 +59,7 @@ One of the arguments specified an address outside the calling program's address space. .SH VERSIONS These system calls appeared on Linux starting with kernel 2.1.44. 
- +.PP The prototypes are given by glibc since version 2.3.2, provided .B _GNU_SOURCE diff --git a/man2/getrlimit.2 b/man2/getrlimit.2 index 654589309..eeec24c78 100644 --- a/man2/getrlimit.2 +++ b/man2/getrlimit.2 @@ -237,7 +237,7 @@ and .BR MAP_LOCKED ; a process can lock bytes up to this limit in each of these two categories. - +.IP In Linux kernels before 2.6.9, this limit controlled the amount of memory that could be locked by a privileged process. Since Linux 2.6.9, no limits are placed on the amount of memory @@ -281,7 +281,7 @@ and the and .I posix_msg_tree_node structures are kernel-internal structures. - +.IP The "overhead" addend in the formula accounts for overhead bytes required by the implementation and ensures that the user cannot @@ -320,7 +320,7 @@ to exceed this limit yield the error (Historically, this limit was named .B RLIMIT_OFILE on BSD.) - +.IP Since Linux 4.5, this limit also defines the maximum number of file descriptors that an unprivileged process (one without the @@ -365,7 +365,7 @@ this process using .BR sched_setscheduler (2) and .BR sched_setparam (2). - +.IP For further details on real-time scheduling policies, see .BR sched (7) .TP @@ -380,7 +380,7 @@ the count of its consumed CPU time is reset to zero. The CPU time count is not reset if the process continues trying to use the CPU but is preempted, its time slice expires, or it calls .BR sched_yield (2). - +.IP Upon reaching the soft limit, the process is sent a .B SIGXCPU signal. @@ -391,10 +391,10 @@ will be generated once each second until the hard limit is reached, at which point the process is sent a .B SIGKILL signal. - +.IP The intended use of this limit is to stop a runaway real-time process from locking up the system. - +.IP For further details on real-time scheduling policies, see .BR sched (7) .TP @@ -419,7 +419,7 @@ Upon reaching this limit, a signal is generated. To handle this signal, a process must employ an alternate signal stack .RB ( sigaltstack (2)). 
- +.IP Since Linux 2.6.23, this limit also determines the amount of space used for the process's command-line arguments and environment variables; for details, see @@ -444,14 +444,14 @@ system call combines and extends the functionality of and .BR getrlimit (). It can be used to both set and get the resource limits of an arbitrary process. - +.PP The .I resource argument has the same meaning as for .BR setrlimit () and .BR getrlimit (). - +.PP If the .IR new_limit argument is a not NULL, then the @@ -469,7 +469,7 @@ in the .I rlimit structure pointed to by .IR old_limit . - +.PP The .I pid argument specifies the ID of the process on which the call is to operate. @@ -553,7 +553,7 @@ T{ .BR prlimit () T} Thread safety MT-Safe .TE - +.sp 1 .SH CONFORMING TO .BR getrlimit (), .BR setrlimit (): @@ -561,7 +561,7 @@ POSIX.1-2001, POSIX.1-2008, SVr4, 4.3BSD. .br .BR prlimit (): Linux-specific. - +.PP .B RLIMIT_MEMLOCK and .B RLIMIT_NPROC @@ -583,12 +583,12 @@ A child process created via inherits its parent's resource limits. Resource limits are preserved across .BR execve (2). - +.PP Lowering the soft limit for a resource below the process's current consumption of that resource will succeed (but will prevent the process from further increasing its consumption of the resource). - +.PP One can set the resource limits of the shell using the built-in .IR ulimit command @@ -597,12 +597,12 @@ in .BR csh (1)). The shell's resource limits are inherited by the processes that it creates to execute commands. - +.PP Since Linux 2.6.24, the resource limits of any process can be inspected via .IR /proc/[pid]/limits ; see .BR proc (5). - +.PP Ancient systems provided a .BR vlimit () function with a similar purpose to @@ -620,7 +620,7 @@ wrapper functions no longer invoke the corresponding system calls, but instead employ .BR prlimit (), for the reasons described in BUGS. 
- +.PP The name of the glibc wrapper function is .BR prlimit (); the underlying system call is @@ -634,7 +634,7 @@ signals delivered when a process encountered the soft and hard .B RLIMIT_CPU limits were delivered one (CPU) second later than they should have been. This was fixed in kernel 2.6.8. - +.PP In 2.6.x kernels before 2.6.17, a .B RLIMIT_CPU limit of 0 is wrongly treated as "no limit" (like @@ -642,12 +642,12 @@ limit of 0 is wrongly treated as "no limit" (like Since Linux 2.6.17, setting a limit of 0 does have an effect, but is actually treated as a limit of 1 second. .\" see http://marc.theaimsgroup.com/?l=linux-kernel&m=114008066530167&w=2 - +.PP A kernel bug means that .\" See https://lwn.net/Articles/145008/ .B RLIMIT_RTPRIO does not work in kernel 2.6.12; the problem is fixed in kernel 2.6.13. - +.PP In kernel 2.6.12, there was an off-by-one mismatch between the priority ranges returned by .BR getpriority (2) @@ -658,7 +658,7 @@ was calculated as .IR "19\ \-\ rlim_cur" . This was fixed in kernel 2.6.13. .\" see http://marc.theaimsgroup.com/?l=linux-kernel&m=112256338703880&w=2 - +.PP Since Linux 2.6.12, .\" The relevant patch, sent to LKML, seems to be .\" http://thread.gmane.org/gmane.linux.kernel/273462 @@ -685,7 +685,7 @@ portable applications should avoid relying on this Linux-specific behavior. The Linux-specific .BR RLIMIT_RTTIME limit exhibits the same behavior when the soft limit is encountered. - +.PP Kernels before 2.4.22 did not diagnose the error .B EINVAL for @@ -726,7 +726,7 @@ represent file offsets\(emthat is, as wide as a 64-bit .BR off_t (assuming a program compiled with .IR _FILE_OFFSET_BITS=64 ). - +.PP To work around this kernel limitation, if a program tried to set a resource limit to a value larger than can be represented in a 32-bit @@ -736,7 +736,7 @@ then the glibc wrapper function silently converted the limit value to .BR RLIM_INFINITY . In other words, the requested resource limit setting was silently ignored. 
- +.PP This problem was addressed in Linux 2.6.36 with two principal changes: .IP * 3 the addition of a new kernel representation of resource limits that diff --git a/man2/getrusage.2 b/man2/getrusage.2 index 67588c573..2a839855e 100644 --- a/man2/getrusage.2 +++ b/man2/getrusage.2 @@ -211,7 +211,7 @@ T{ .BR getrusage () T} Thread safety MT-Safe .TE - +.sp 1 .SH CONFORMING TO POSIX.1-2001, POSIX.1-2008, SVr4, 4.3BSD. POSIX.1 specifies @@ -220,13 +220,13 @@ but specifies only the fields .I ru_utime and .IR ru_stime . - +.PP .B RUSAGE_THREAD is Linux-specific. .SH NOTES Resource usage metrics are preserved across an .BR execve (2). - +.PP Including .I <sys/time.h> is not required these days, but increases portability. @@ -249,7 +249,7 @@ This nonconformance is rectified in Linux 2.6.9 and later. .LP The structure definition shown at the start of this page was taken from 4.3BSD Reno. - +.PP Ancient systems provided a .BR vtimes () function with a similar purpose to @@ -258,7 +258,7 @@ For backward compatibility, glibc also provides .BR vtimes (). All new applications should be written using .BR getrusage (). - +.PP See also the description of .IR /proc/[pid]/stat in diff --git a/man2/getsid.2 b/man2/getsid.2 index ab6bdb350..f05fc0131 100644 --- a/man2/getsid.2 +++ b/man2/getsid.2 @@ -85,7 +85,7 @@ POSIX.1-2001, POSIX.1-2008, SVr4. .SH NOTES Linux does not return .BR EPERM . - +.PP See .BR credentials (7) for a description of sessions and session IDs. diff --git a/man2/getsockname.2 b/man2/getsockname.2 index b71883531..3b18315e0 100644 --- a/man2/getsockname.2 +++ b/man2/getsockname.2 @@ -59,7 +59,7 @@ argument should be initialized to indicate the amount of space (in bytes) pointed to by .IR addr . On return it contains the actual size of the socket address.
- +.PP The returned address is truncated if the buffer provided is too small; in this case, .I addrlen diff --git a/man2/getsockopt.2 b/man2/getsockopt.2 index 12b83aab3..4a647dad3 100644 --- a/man2/getsockopt.2 +++ b/man2/getsockopt.2 @@ -64,7 +64,7 @@ manipulate options for the socket referred to by the file descriptor Options may exist at multiple protocol levels; they are always present at the uppermost socket level. - +.PP When manipulating socket options, the level at which the option resides and the name of the option must be specified. To manipulate options at the sockets API level, @@ -83,7 +83,7 @@ should be set to the protocol number of .BR TCP ; see .BR getprotoent (3). - +.PP The arguments .I optval and @@ -105,7 +105,7 @@ the value returned. If no option value is to be supplied or returned, .I optval may be NULL. - +.PP .I Optname and any specified options are passed uninterpreted to the appropriate protocol module for interpretation. @@ -115,7 +115,7 @@ contains definitions for socket level options, described below. Options at other protocol levels vary in format and name; consult the appropriate entries in section 4 of the manual. - +.PP Most socket-level options utilize an .I int argument for @@ -133,7 +133,7 @@ On success, zero is returned for the standard options. On error, \-1 is returned, and .I errno is set appropriately. - +.PP Netfilter allows the programmer to define custom socket options with associated handlers; for such options, the return value on success is the value returned by the handler. @@ -185,7 +185,7 @@ POSIX.1 does not require the inclusion of and this header file is not required on Linux. However, some historical (BSD) implementations required this header file, and portable applications are probably wise to include it. 
- +.PP For background on the .I socklen_t type, see diff --git a/man2/gettid.2 b/man2/gettid.2 index ae873ecd2..90db74a61 100644 --- a/man2/gettid.2 +++ b/man2/gettid.2 @@ -64,11 +64,11 @@ Glibc does not provide a wrapper for this system call; call it using .BR syscall (2). .\" FIXME . See http://sourceware.org/bugzilla/show_bug.cgi?id=6399 .\" "gettid() should have a wrapper" - +.PP The thread ID returned by this call is not the same thing as a POSIX thread ID (i.e., the opaque value returned by .BR pthread_self (3)). - +.PP In a new thread group created by a .BR clone (2) call that does not specify the diff --git a/man2/gettimeofday.2 b/man2/gettimeofday.2 index 48e3a1900..6763860a6 100644 --- a/man2/gettimeofday.2 +++ b/man2/gettimeofday.2 @@ -119,7 +119,7 @@ structure is obsolete; the .I tz argument should normally be specified as NULL. (See NOTES below.) - +.PP Under Linux, there are some peculiar "warp clock" semantics associated with the .BR settimeofday () @@ -182,12 +182,12 @@ affected by discontinuous jumps in the system time (e.g., if the system administrator manually changes the system time). If you need a monotonically increasing clock, see .BR clock_gettime (2). - +.PP Macros for operating on .I timeval structures are described in .BR timeradd (3). - +.PP Traditionally, the fields of .I struct timeval were of type @@ -218,7 +218,7 @@ or .\" Each and every occurrence of this field in the kernel source .\" (other than the declaration) is a bug. Thus, the following is purely of historical interest. - +.PP On old systems, the field .I tz_dsttime contains a symbolic constant (values are given below) diff --git a/man2/getuid.2 b/man2/getuid.2 index 888af404f..583fae5e0 100644 --- a/man2/getuid.2 +++ b/man2/getuid.2 @@ -37,7 +37,7 @@ getuid, geteuid \- get user identity .SH DESCRIPTION .BR getuid () returns the real user ID of the calling process. - +.PP .BR geteuid () returns the effective user ID of the calling process. 
.SH ERRORS @@ -54,7 +54,7 @@ UNIX\ V7 introduced separate calls .BR getuid () and .BR geteuid (). - +.PP The original Linux .BR getuid () and diff --git a/man2/getunwind.2 b/man2/getunwind.2 index fda06f3f4..5d8d58a51 100644 --- a/man2/getunwind.2 +++ b/man2/getunwind.2 @@ -39,7 +39,7 @@ getunwind \- copy the unwind data to caller's buffer There is no glibc wrapper for this system call; see NOTES. .SH DESCRIPTION .I Note: this function is obsolete. - +.PP The IA-64-specific .BR getunwind () @@ -49,7 +49,7 @@ unwind data into the buffer pointed to by and returns the size of the unwind data; this data describes the gate page (kernel code that is mapped into user space). - +.PP The size of the buffer .I buf is specified in @@ -61,17 +61,17 @@ is greater than or equal to the size of the unwind data and is not NULL; otherwise, no data is copied, and the call succeeds, returning the size that would be needed to store the unwind data. - +.PP The first part of the unwind data contains an unwind table. The rest contains the associated unwind information, in no particular order. The unwind table contains entries of the following form: - +.PP .nf u64 start; (64-bit address of start of function) u64 end; (64-bit address of end of function) u64 info; (BUF-relative offset to unwind info) .fi - +.PP An entry whose .I start value is zero indicates the end of the table. @@ -100,7 +100,7 @@ and is available only on the IA-64 architecture. This system call has been deprecated. The modern way to obtain the kernel's unwind data is via the .BR vdso (7). - +.PP Glibc does not provide a wrapper for this system call; in the unlikely event that you want to call it, use .BR syscall (2). diff --git a/man2/init_module.2 b/man2/init_module.2 index 17778725e..15ede83bd 100644 --- a/man2/init_module.2 +++ b/man2/init_module.2 @@ -51,7 +51,7 @@ and then runs the module's .I init function. This system call requires privilege. 
- +.PP The .I module_image argument points to a buffer containing the binary image @@ -59,7 +59,7 @@ to be loaded; .I len specifies the size of that buffer. The module image should be a valid ELF image, built for the running kernel. - +.PP The .I param_values argument is a string containing space-delimited specifications of the @@ -70,12 +70,12 @@ and The kernel parses this string and initializes the specified parameters. Each of the parameter specifications has the form: - +.PP .RI " " name [\c .BI = value\c .RB [ ,\c .IR value ...]] - +.PP The parameter .I name is one of those defined within the module using @@ -108,7 +108,7 @@ The .I param_values argument is as for .BR init_module (). - +.PP The .I flags argument modifies the operation of @@ -140,7 +140,7 @@ for the function named by the symbol. In this case, the kernel version number within the "vermagic" string is ignored, as the symbol version hashes are assumed to be sufficiently reliable. - +.PP Using the .B MODULE_INIT_IGNORE_VERMAGIC flag indicates that the "vermagic" string is to be ignored, and the @@ -272,17 +272,17 @@ it is (before glibc 2.23) sufficient to manually declare the interface in your code; alternatively, you can invoke the system call using .BR syscall (2). - +.PP Glibc does not provide a wrapper for .BR finit_module (); call it using .BR syscall (2). - +.PP Information about currently loaded modules can be found in .IR /proc/modules and in the file trees under the per-module subdirectories under .IR /sys/module . - +.PP See the Linux kernel source file .I include/linux/module.h for some useful background information. @@ -291,11 +291,11 @@ for some useful background information. 
In Linux 2.4 and earlier, the .BR init_module () system call was rather different: - +.PP .B " #include <linux/module.h>" - +.PP .BI " int init_module(const char *" name ", struct module *" image ); - +.PP (User-space applications can detect which version of .BR init_module () is available by calling @@ -303,7 +303,7 @@ is available by calling the latter call fails with the error .BR ENOSYS on Linux 2.6 and later.) - +.PP The older version of the system call loads the relocated module image pointed to by .I image diff --git a/man2/inotify_add_watch.2 b/man2/inotify_add_watch.2 index e91f3b67d..82968438d 100644 --- a/man2/inotify_add_watch.2 +++ b/man2/inotify_add_watch.2 @@ -51,7 +51,7 @@ See .BR inotify (7) for a description of the bits that can be set in .IR mask . - +.PP A successful call to .BR inotify_add_watch () returns a unique watch descriptor for this inotify instance, @@ -63,7 +63,7 @@ then the watch descriptor is newly allocated. If the filesystem object was already being watched (perhaps via a different link to the same object), then the descriptor for the existing watch is returned. - +.PP The watch descriptor is returned by later .BR read (2)s from the inotify file descriptor. diff --git a/man2/inotify_init.2 b/man2/inotify_init.2 index 1a3f363bb..7fd47eb5a 100644 --- a/man2/inotify_init.2 +++ b/man2/inotify_init.2 @@ -39,11 +39,11 @@ inotify_init, inotify_init1 \- initialize an inotify instance .SH DESCRIPTION For an overview of the inotify API, see .BR inotify (7). - +.PP .BR inotify_init () initializes a new inotify instance and returns a file descriptor associated with a new inotify event queue. - +.PP If .I flags is 0, then diff --git a/man2/inotify_rm_watch.2 b/man2/inotify_rm_watch.2 index 1db6d1d70..e965353c8 100644 --- a/man2/inotify_rm_watch.2 +++ b/man2/inotify_rm_watch.2 @@ -39,7 +39,7 @@ removes the watch associated with the watch descriptor .I wd from the inotify instance associated with the file descriptor .IR fd .
- +.PP Removing a watch causes an .B IN_IGNORED event to be generated for this watch descriptor. diff --git a/man2/intro.2 b/man2/intro.2 index d0d18dc67..ea9a84c6c 100644 --- a/man2/intro.2 +++ b/man2/intro.2 @@ -39,7 +39,7 @@ wrapper functions which perform the steps required the system call. Thus, making a system call looks the same as invoking a normal library function. - +.PP In many cases, the C library wrapper function does nothing more than: .IP * 3 copying arguments and the unique system call number to the @@ -62,7 +62,7 @@ try to note the details of both the (usually GNU) C library API interface and the raw system call. Most commonly, the main DESCRIPTION will focus on the C library interface, and differences for the system call are covered in the NOTES section. - +.PP For a list of the Linux system calls, see .BR syscalls (2). .SH RETURN VALUE @@ -74,12 +74,12 @@ system call returns a negative value, the wrapper copies the absolute value into the .I errno variable, and returns \-1 as the return value of the wrapper. - +.PP The value returned by a successful system call depends on the call. Many system calls return 0 on success, but some can return nonzero values from a successful call. The details are described in the individual manual pages. - +.PP In some cases, the programmer must define a feature test macro in order to obtain the declaration of a system call from the header file specified diff --git a/man2/io_cancel.2 b/man2/io_cancel.2 index 57ea86fea..aa2007c21 100644 --- a/man2/io_cancel.2 +++ b/man2/io_cancel.2 @@ -70,7 +70,7 @@ But instead, you probably want to use the wrapper function provided by .\" http://git.fedorahosted.org/git/?p=libaio.git .IR libaio . 
- +.PP Note that the .I libaio wrapper function uses a different type diff --git a/man2/io_destroy.2 b/man2/io_destroy.2 index b0f875a42..6b4679685 100644 --- a/man2/io_destroy.2 +++ b/man2/io_destroy.2 @@ -59,7 +59,7 @@ But instead, you probably want to use the wrapper function provided by .\" http://git.fedorahosted.org/git/?p=libaio.git .IR libaio . - +.PP Note that the .I libaio wrapper function uses a different type diff --git a/man2/io_getevents.2 b/man2/io_getevents.2 index 2993d2123..82a428e9d 100644 --- a/man2/io_getevents.2 +++ b/man2/io_getevents.2 @@ -27,10 +27,10 @@ system call attempts to read at least \fImin_nr\fP events and up to \fInr\fP events from the completion queue of the AIO context specified by \fIctx_id\fP. - +.PP The \fItimeout\fP argument specifies the amount of time to wait for events, and is specified as a relative timeout in a structure of the following form: - +.PP .in +4n .nf struct timespec { @@ -39,10 +39,10 @@ struct timespec { }; .fi .in - +.PP The specified time will be rounded up to the system clock granularity and is guaranteed not to expire early. - +.PP Specifying .I timeout as NULL means block indefinitely until at least @@ -60,7 +60,7 @@ expired. It may also be a nonzero value less than .IR min_nr , if the call was interrupted by a signal handler. - +.PP For the failure return, see NOTES. .SH ERRORS .TP @@ -96,7 +96,7 @@ But instead, you probably want to use the wrapper function provided by .\" http://git.fedorahosted.org/git/?p=libaio.git .IR libaio . - +.PP Note that the .I libaio wrapper function uses a different type diff --git a/man2/io_setup.2 b/man2/io_setup.2 index 984fdc4d2..66514f879 100644 --- a/man2/io_setup.2 +++ b/man2/io_setup.2 @@ -72,7 +72,7 @@ But instead, you probably want to use the wrapper function provided by .\" http://git.fedorahosted.org/git/?p=libaio.git .IR libaio . 
- +.PP Note that the .I libaio wrapper function uses a different type diff --git a/man2/io_submit.2 b/man2/io_submit.2 index e17f7ffa5..36465b64b 100644 --- a/man2/io_submit.2 +++ b/man2/io_submit.2 @@ -74,7 +74,7 @@ But instead, you probably want to use the wrapper function provided by .\" http://git.fedorahosted.org/git/?p=libaio.git .IR libaio . - +.PP Note that the .I libaio wrapper function uses a different type diff --git a/man2/ioctl_fat.2 b/man2/ioctl_fat.2 index b07a2fe64..e4713da37 100644 --- a/man2/ioctl_fat.2 +++ b/man2/ioctl_fat.2 @@ -142,7 +142,7 @@ repeatedly. The .I entry argument is a two-element array of the following structures: - +.PP .in +4n .nf struct __fat_dirent { @@ -229,14 +229,14 @@ For further error values, see and .B VFAT_IOCTL_READDIR_SHORT first appeared in Linux 2.0. - +.PP .BR FAT_IOCTL_GET_ATTRIBUTES and .BR FAT_IOCTL_SET_ATTRIBUTES first appeared .\" just before we got Git history in Linux 2.6.12. - +.PP .B FAT_IOCTL_GET_VOLUME_ID was introduced in version 3.11 .\" commit 6e5b93ee55d401f1619092fb675b57c28c9ed7ec @@ -254,7 +254,7 @@ the program reads and displays the attribute again. .PP The following was recorded when applying the program for the file .IR /mnt/user/foo : - +.PP .in +4n .nf # ./toggle_fat_archive_flag /mnt/user/foo @@ -355,7 +355,7 @@ to display the volume ID of a FAT filesystem. The following output was recorded when applying the program for directory .IR /mnt/user : - +.PP .in +4n .nf $ ./display_fat_volume_id /mnt/user @@ -418,7 +418,7 @@ to list a directory. .PP The following was recorded when applying the program to the directory .IR /mnt/user : - +.PP .in +4n .nf $ ./fat_dir /mnt/user diff --git a/man2/ioctl_ficlonerange.2 b/man2/ioctl_ficlonerange.2 index 5d2d55056..1804ef9a2 100644 --- a/man2/ioctl_ficlonerange.2 +++ b/man2/ioctl_ficlonerange.2 @@ -47,7 +47,7 @@ If a file write should occur to a shared region, the filesystem must ensure that the changes remain private to the file being written. 
This behavior is commonly referred to as "copy on write". - +.PP This ioctl reflinks up to .IR src_length bytes from file descriptor @@ -78,7 +78,7 @@ struct file_clone_range { .in Clones are atomic with regards to concurrent writes, so no locks need to be taken to obtain a consistent cloned copy. - +.PP The .B FICLONE ioctl clones entire files. diff --git a/man2/ioctl_fideduperange.2 b/man2/ioctl_fideduperange.2 index 87eb60b37..934f72c78 100644 --- a/man2/ioctl_fideduperange.2 +++ b/man2/ioctl_fideduperange.2 @@ -47,7 +47,7 @@ If a file write should occur to a shared region, the filesystem must ensure that the changes remain private to the file being written. This behavior is commonly referred to as "copy on write". - +.PP This ioctl performs the "compare and share if identical" operation on up to .IR src_length bytes from file descriptor @@ -68,20 +68,20 @@ struct file_dedupe_range { }; .fi .in - +.PP Deduplication is atomic with regards to concurrent writes, so no locks need to be taken to obtain a consistent deduplicated copy. - +.PP The fields .IR reserved1 " and " reserved2 must be zero. - +.PP Destinations for the deduplication operation are conveyed in the array at the end of the structure. The number of destinations is given in .IR dest_count ", and the destination information is conveyed in the following form: - +.PP .in +4n .nf struct file_dedupe_range_info { @@ -94,7 +94,7 @@ struct file_dedupe_range_info { .fi .in - +.PP Each deduplication operation targets .IR src_length bytes in file descriptor @@ -125,7 +125,7 @@ is mapped into and the previous contents in .IR dest_fd are freed. - +.PP Upon successful completion of this ioctl, the number of bytes successfully deduplicated is returned in .IR bytes_deduped @@ -143,7 +143,7 @@ code is set to for success, a negative error code in case of error, or .B FILE_DEDUPE_RANGE_DIFFERS if the data did not match. 
- +.PP .SH RETURN VALUE On error, \-1 is returned, and .I errno @@ -208,7 +208,7 @@ Because a copy-on-write operation requires the allocation of new storage, the .BR fallocate (2) operation may unshare shared blocks to guarantee that subsequent writes will not fail because of lack of disk space. - +.PP Some filesystems may limit the amount of data that can be deduplicated in a single call. .SH SEE ALSO diff --git a/man2/ioctl_tty.2 b/man2/ioctl_tty.2 index 8d6511d4d..144b1b040 100644 --- a/man2/ioctl_tty.2 +++ b/man2/ioctl_tty.2 @@ -98,7 +98,7 @@ Window sizes are kept in the kernel, but not used by the kernel (except in the case of virtual consoles, where the kernel will update the window size when the size of the virtual console changes, for example, by loading a new font). - +.PP The following constants and structure are defined in .IR <sys/ioctl.h> . .TP @@ -109,7 +109,7 @@ Get window size. Set window size. .LP The struct used by these ioctls is defined as - +.PP .in +4n .nf struct winsize { @@ -120,7 +120,7 @@ struct winsize { }; .fi .in - +.PP When the window size changes, a .B SIGWINCH signal is sent to the @@ -141,7 +141,7 @@ returns without doing anything. When .I arg is nonzero, nobody knows what will happen. - +.IP (SVr4, UnixWare, Solaris, Linux treat .I "tcsendbreak(fd,arg)" with nonzero @@ -244,7 +244,7 @@ controlling terminal already. For this case, .I arg should be specified as zero. - +.IP If this terminal is already the controlling terminal of a different session group, then the ioctl fails with .BR EPERM , @@ -334,7 +334,7 @@ If the first byte is not .B TIOCPKT_DATA (0), it is an OR of one or more of the following bits: - +.IP .nf TIOCPKT_FLUSHREAD The read queue for the terminal is flushed. TIOCPKT_FLUSHWRITE The write queue for the terminal is flushed. @@ -343,7 +343,7 @@ TIOCPKT_START Output to the terminal is restarted. TIOCPKT_DOSTOP The start and stop characters are \fB^S\fP/\fB^Q\fP. TIOCPKT_NOSTOP The start and stop characters are not \fB^S\fP/\fB^Q\fP.
.fi - +.IP While this mode is in use, the presence of control status information to be read from the master side may be detected by a @@ -353,7 +353,7 @@ for exceptional conditions or a for the .I POLLPRI event. - +.IP This mode is used by .BR rlogin (1) and @@ -395,7 +395,7 @@ pseudoterminal slave device. This operation can be performed regardless of whether the pathname of the slave device is accessible through the calling process's mount namespaces. - +.IP Security-conscious programs interacting with namespaces may wish to use this operation rather than .BR open (2) @@ -424,7 +424,7 @@ Clear the indicated modem bits. Set the indicated modem bits. .LP The following bits are used by the above ioctls: - +.PP .nf TIOCM_LE DSR (data set ready/line enable) TIOCM_DTR DTR (data terminal ready) @@ -459,7 +459,7 @@ The counts are written to the .I serial_icounter_struct structure pointed to by .IR argp . - +.IP Note: both 1->0 and 0->1 transitions are counted, except for RI, where only 0->1 transitions are counted. .SS Marking a line as local @@ -547,7 +547,7 @@ Inappropriate Insufficient permission. .SH EXAMPLE Check the condition of DTR on the serial port. - +.PP .nf #include #include diff --git a/man2/ioctl_userfaultfd.2 b/man2/ioctl_userfaultfd.2 index f878c6112..f742dda4b 100644 --- a/man2/ioctl_userfaultfd.2 +++ b/man2/ioctl_userfaultfd.2 @@ -55,7 +55,7 @@ is one of the commands listed below, and .I argp is a pointer to a data structure that is specific to .IR cmd . - +.PP The various .BR ioctl (2) operations are described below. @@ -78,7 +78,7 @@ events. .SS UFFDIO_API (Since Linux 4.3.) Enable operation of the userfaultfd and perform API handshake. - +.PP The .I argp argument is a pointer to a @@ -98,7 +98,7 @@ struct uffdio_api { The .I api field denotes the API version requested by the application. 
- +.PP The kernel verifies that it can support the requested API version, and sets the .I features @@ -107,7 +107,7 @@ and fields to bit masks representing all the available features and the generic .BR ioctl (2) operations available. - +.PP For Linux kernel versions before 4.11, the .I features field must be initialized to zero before the call to @@ -116,7 +116,7 @@ and zero (i.e., no feature bits) is placed in the .I features field by the kernel upon return from .BR ioctl (2). - +.PP Starting from Linux 4.11, the .I features field can be used to ask whether particular features are supported @@ -124,7 +124,7 @@ and explicitly enable userfaultfd features that are disabled by default. The kernel always reports all the available features in the .I features field. - +.PP To enable userfaultfd features the application should set a bit corresponding to each feature it wants to enable in the .I features @@ -135,7 +135,7 @@ Otherwise it will zero out the returned structure and return .BR EINVAL . .\" FIXME add more details about feature negotiation and enablement - +.PP Since Linux 4.11, the following feature bits may be set: .TP .B UFFD_FEATURE_EVENT_FORK @@ -196,7 +196,7 @@ with the flag set, .BR memfd_create (2), and so on. - +.IP The returned .I ioctls field can contain the following bits: @@ -255,15 +255,15 @@ by the current kernel version. (Since Linux 4.3.) Register a memory address range with the userfaultfd object. The pages in the range must be "compatible". - +.PP Up to Linux kernel 4.11, only private anonymous ranges are compatible for registering with .BR UFFDIO_REGISTER . - +.PP Since Linux 4.11, hugetlbfs and shared memory ranges are also compatible with .BR UFFDIO_REGISTER . 
- +.PP The .I argp argument is a pointer to a @@ -285,7 +285,7 @@ struct uffdio_register { .fi .in - +.PP The .I range field defines a memory range starting at @@ -293,7 +293,7 @@ field defines a memory range starting at and continuing for .I len bytes that should be handled by the userfaultfd. - +.PP The .I mode field defines the mode of operation desired for this memory region. @@ -316,7 +316,7 @@ bit-mask field to indicate which operations are available for the specified range. This returned bit mask is as for .BR UFFDIO_API . - +.PP This .BR ioctl (2) operation returns 0 on success. @@ -364,12 +364,12 @@ There as an incompatible mapping in the specified address range. Unregister a memory address range from userfaultfd. The pages in the range must be "compatible" (see the description of .BR UFFDIO_REGISTER .) - +.PP The address range to unregister is specified in the .IR uffdio_range structure pointed to by .IR argp . - +.PP This .BR ioctl (2) operation returns 0 on success. @@ -406,7 +406,7 @@ fields of the .I uffdio_copy structure pointed to by .IR argp : - +.PP .in +4n .nf struct uffdio_copy { @@ -448,7 +448,7 @@ field is output-only; it is not read by the .B UFFDIO_COPY operation. - +.PP This .BR ioctl (2) operation returns 0 on success. @@ -505,14 +505,14 @@ operation. .SS UFFDIO_ZEROPAGE (Since Linux 4.3.) Zero out a memory range registered with userfaultfd. - +.PP The requested range is specified by the .I range field of the .I uffdio_zeropage structure pointed to by .IR argp : - +.PP .in +4n .nf struct uffdio_zeropage { @@ -552,7 +552,7 @@ field is output-only; it is not read by the .B UFFDIO_ZERO operation. - +.PP This .BR ioctl (2) operation returns 0 on success. @@ -593,7 +593,7 @@ operation. (Since Linux 4.3.) Wake up the thread waiting for page-fault resolution on a specified memory address range. 
- +.PP The .B UFFDIO_WAKE operation is used in conjunction with @@ -613,13 +613,13 @@ and .BR UFFDIO_ZEROPAGE operations in a batch and then explicitly wake up the faulting thread using .BR UFFDIO_WAKE . - +.PP The .I argp argument is a pointer to a .I uffdio_range structure (shown above) that specifies the address range. - +.PP This .BR ioctl (2) operation returns 0 on success. @@ -675,6 +675,6 @@ operation that actually enables the desired features. .BR ioctl (2), .BR mmap (2), .BR userfaultfd (2) - +.PP .IR Documentation/vm/userfaultfd.txt in the Linux kernel source tree diff --git a/man2/ioperm.2 b/man2/ioperm.2 index 7e69b563a..c82890896 100644 --- a/man2/ioperm.2 +++ b/man2/ioperm.2 @@ -53,7 +53,7 @@ If .I turn_on is nonzero, the calling thread must be privileged .RB ( CAP_SYS_RAWIO ). - +.PP Before Linux 2.6.8, only the first 0x3ff I/O ports could be specified in this manner. For more ports, the @@ -62,7 +62,7 @@ system call had to be used (with a .I level argument of 3). Since Linux 2.6.8, 65,536 I/O ports can be specified. - +.PP Permissions are inherited by the child created by .BR fork (2) (but see NOTES). @@ -70,7 +70,7 @@ Permissions are preserved across .BR execve (2); this is useful for giving port access permissions to unprivileged programs. - +.PP This call is mostly for the i386 architecture. On many other architectures it does not exist or will always return an error. @@ -104,11 +104,11 @@ intended to be portable. The .I /proc/ioports file shows the I/O ports that are currently allocated on the system. - +.PP Before Linux 2.4, permissions were not inherited by a child created by .BR fork (2). - +.PP Glibc has an .BR ioperm () prototype both in diff --git a/man2/iopl.2 b/man2/iopl.2 index b90efb63c..d2f4804fa 100644 --- a/man2/iopl.2 +++ b/man2/iopl.2 @@ -42,25 +42,25 @@ iopl \- change I/O privilege level changes the I/O privilege level of the calling process, as specified by the two least significant bits in .IR level . 
- +.PP This call is necessary to allow 8514-compatible X servers to run under Linux. Since these X servers require access to all 65536 I/O ports, the .BR ioperm (2) call is not sufficient. - +.PP In addition to granting unrestricted I/O port access, running at a higher I/O privilege level also allows the process to disable interrupts. This will probably crash the system, and is not recommended. - +.PP Permissions are not inherited by the child process created by .BR fork (2) and are not preserved across .BR execve (2) (but see NOTES). - +.PP The I/O privilege level for a normal process is 0. - +.PP This call is mostly for the i386 architecture. On many other architectures it does not exist or will always return an error. @@ -98,7 +98,7 @@ Glibc2 has a prototype both in and in .IR <sys/perm.h> . Avoid the latter, it is available on i386 only. - +.PP Prior to Linux 3.7, on some architectures (such as i386), permissions .I were diff --git a/man2/ioprio_set.2 b/man2/ioprio_set.2 index 211f8f869..67dd4999a 100644 --- a/man2/ioprio_set.2 +++ b/man2/ioprio_set.2 @@ -39,7 +39,7 @@ and .BR ioprio_set () system calls respectively get and set the I/O scheduling class and priority of one or more threads. - +.PP The .I which and @@ -95,7 +95,7 @@ is the lowest) or if it belongs to the same priority class as the other process but has a higher priority level (a lower priority number means a higher priority level). - +.PP The .I ioprio argument given to @@ -141,7 +141,7 @@ information on scheduling classes and priorities, as well as the meaning of specifying .I ioprio as 0. - +.PP I/O priorities are supported for reads and for synchronous .RB ( O_DIRECT , .BR O_SYNC ) @@ -201,7 +201,7 @@ These system calls are Linux-specific. .SH NOTES Glibc does not provide a wrapper for these system calls; call them using .BR syscall (2). - +.PP Two or more processes or threads can share an I/O context.
This will be the case when .BR clone (2) @@ -220,12 +220,12 @@ is the one that is returned by .BR gettid (2) or .BR clone (2). - +.PP These system calls have an effect only when used in conjunction with an I/O scheduler that supports I/O priorities. As at kernel 2.6.17 the only such scheduler is the Completely Fair Queuing (CFQ) I/O scheduler. - +.PP If no I/O scheduler has been set for a thread, then by default the I/O priority will follow the CPU nice value .RB ( setpriority (2)). @@ -242,7 +242,7 @@ as 0 can be used to reset to the default I/O scheduling behavior. I/O schedulers are selected on a per-device basis via the special file .IR /sys/block/<device>/queue/scheduler . - +.PP One can view the current I/O scheduler via the .I /sys filesystem. @@ -365,6 +365,6 @@ Suitable definitions can be found in .BR open (2), .BR capabilities (7), .BR cgroups (7) - +.PP .I Documentation/block/ioprio.txt in the Linux kernel source tree diff --git a/man2/kcmp.2 b/man2/kcmp.2 index 22ff0a2eb..6e48444ba 100644 --- a/man2/kcmp.2 +++ b/man2/kcmp.2 @@ -47,7 +47,7 @@ and .I pid2 share a kernel resource such as virtual memory, file descriptors, and so on. - +.PP Permission to employ .BR kcmp () is governed by ptrace access mode @@ -58,7 +58,7 @@ and .IR pid2 ; see .BR ptrace (2). - +.PP The .I type argument specifies which resource is to be compared in the two processes. @@ -210,7 +210,7 @@ The return value of a successful call to is simply the result of arithmetic comparison of kernel pointers (when the kernel compares resources, it uses their memory addresses). - +.PP The easiest way to explain is to consider an example. Suppose that .I v1 @@ -242,7 +242,7 @@ but ordering information is unavailable. On error, \-1 is returned, and .I errno is set appropriately. - +.PP .BR kcmp () was designed to return values suitable for sorting. This is particularly handy if one needs to compare @@ -304,7 +304,7 @@ is Linux-specific and should not be used in programs intended to be portable.
.SH NOTES Glibc does not provide a wrapper for this system call; call it using .BR syscall (2). - +.PP This system call is available only if the kernel was configured with .BR CONFIG_CHECKPOINT_RESTORE . The main use of the system call is for the @@ -313,7 +313,7 @@ The alternative to this system call would have been to expose suitable process information via the .BR proc (5) filesystem; this was deemed to be unsuitable for security reasons. - +.PP See .BR clone (2) for some background information on the shared resources @@ -326,7 +326,7 @@ the same open file description. The program tests different cases for the file descriptor pairs, as described in the program output. An example run of the program is as follows: - +.PP .nf .in +4n $ \fB./a.out\fP diff --git a/man2/kexec_load.2 b/man2/kexec_load.2 index 198afba93..467caa1e4 100644 --- a/man2/kexec_load.2 +++ b/man2/kexec_load.2 @@ -102,7 +102,7 @@ or one of the following architecture constants and .BR KEXEC_ARCH_MIPS_LE . The architecture must be executable on the CPU of the system. - +.PP The .I entry argument is the physical entry address in the kernel image. @@ -178,13 +178,13 @@ and is moved to the final destination at kexec reboot time (e.g., when the command is executed with the .I \-e option). - +.PP In case of kexec on panic (i.e., the .BR KEXEC_ON_CRASH flag is set), the segment data is loaded to reserved memory at the time of the call, and, after a crash, the kexec mechanism simply passes control to that kernel. - +.PP The .BR kexec_load () system call is available only if the kernel was configured with @@ -209,7 +209,7 @@ The .IR cmdline_len argument specifies size of the buffer. The last byte in the buffer must be a null byte (\(aq\\0\(aq). - +.PP The .IR flags argument is a bit mask which modifies the behavior of the call. 
@@ -343,7 +343,7 @@ Call them using .BR reboot (2), .BR syscall (2), .BR kexec (8) - +.PP The kernel source files .IR Documentation/kdump/kdump.txt and diff --git a/man2/keyctl.2 b/man2/keyctl.2 index f4d056f16..a0bd612e8 100644 --- a/man2/keyctl.2 +++ b/man2/keyctl.2 @@ -49,7 +49,7 @@ No glibc wrapper is provided for this system call; see NOTES. .SH DESCRIPTION .BR keyctl () allows user-space programs to perform key manipulation. - +.PP The operation performed by .BR keyctl () is determined by the value of the @@ -61,14 +61,14 @@ library (provided by the .I keyutils package) into individual functions (noted below) to permit the compiler to check types. - +.PP The permitted values for .I operation are: .TP .BR KEYCTL_GET_KEYRING_ID " (since Linux 2.6.10)" Map a special key ID to a real key ID for this process. - +.IP This operation looks up the special key whose ID is provided in .I arg2 (cast to @@ -146,23 +146,23 @@ is created and its real key ID returned as the function result. .\" be created. Otherwise, the operation fails with the error .BR ENOKEY . - +.IP If a valid key ID is specified in .IR arg2 , and the key exists, then this operation simply returns the key ID. If the key does not exist, the call fails with error .BR ENOKEY . - +.IP The caller must have .I search permission on a keyring in order for it to be found. - +.IP The arguments .IR arg4 and .IR arg5 are ignored. - +.IP This operation is exposed by .I libkeyutils via the function @@ -173,14 +173,14 @@ Replace the session keyring this process subscribes to with a new session keyring. .\" This may be useful in conjunction with some sort of .\" session management framework that is employed by the application. - +.IP If .I arg2 is NULL, an anonymous keyring with the description "_ses" is created and the process is subscribed to that keyring as its session keyring, displacing the previous session keyring. - +.IP Otherwise, .I arg2 (cast to @@ -209,7 +209,7 @@ The arguments and .IR arg5 are ignored. 
- +.IP This operation is exposed by .I libkeyutils via the function @@ -217,7 +217,7 @@ via the function .TP .BR KEYCTL_UPDATE " (since Linux 2.6.10)" Update a key's data payload. - +.IP The .I arg2 argument (cast to @@ -232,19 +232,19 @@ points to the new payload and (cast to .IR size_t ) contains the new payload size in bytes. - +.IP The caller must have .I write permission on the key specified and the key type must support updating. - +.IP A negatively instantiated key (see the description of .BR KEYCTL_REJECT ) can be positively instantiated with this operation. - +.IP The .I arg5 argument is ignored. - +.IP This operation is exposed by .I libkeyutils via the function @@ -260,7 +260,7 @@ it will no longer be findable, and will be unavailable for further operations. Further attempts to use the key will fail with the error .BR EKEYREVOKED . - +.IP The caller must have .IR write or @@ -269,14 +269,14 @@ permission on the key. .\" Keys with the KEY_FLAG_KEEP bit set cause an EPERM .\" error for KEYCTL_REVOKE. Does this need to be documented? .\" David Howells: No significance for user space. - +.IP The arguments .IR arg3 , .IR arg4 , and .IR arg5 are ignored. - +.IP This operation is exposed by .I libkeyutils via the function @@ -284,7 +284,7 @@ via the function .TP .BR KEYCTL_CHOWN " (since Linux 2.6.10)" Change the ownership (user and group ID) of a key. - +.IP The .I arg2 argument (cast to @@ -300,26 +300,26 @@ The argument (cast to .IR gid_t ) contains the new group ID (or \-1 in case the group ID shouldn't be changed). - +.IP The key must grant the caller .I setattr permission. - +.IP For the UID to be changed, or for the GID to be changed to a group the caller is not a member of, the caller must have the .B CAP_SYS_ADMIN capability (see .BR capabilities (7)). - +.IP If the UID is to be changed, the new user must have sufficient quota to accept the key. The quota deduction will be removed from the old user to the new user should the UID be changed. 
- +.IP The .I arg5 argument is ignored. - +.IP This operation is exposed by .I libkeyutils via the function @@ -334,12 +334,12 @@ to the permissions provided in the .I arg3 argument (cast to .IR key_perm_t ). - +.IP If the caller doesn't have the .B CAP_SYS_ADMIN capability, it can change permissions only for the keys it owns. (More precisely: the caller's filesystem UID must match the UID of the key.) - +.IP The key must grant .I setattr permission to the caller @@ -347,7 +347,7 @@ permission to the caller of the caller's capabilities. .\" FIXME Above, is it really intended that a privileged process can't .\" override the lack of the 'setattr' permission? - +.IP The permissions in .IR arg3 specify masks of available operations @@ -395,7 +395,7 @@ or category, then it will not receive permissions granted in the .IR other category. - +.IP The .I possessor category grants permissions that are cumulative with the grants from the @@ -404,7 +404,7 @@ category grants permissions that are cumulative with the grants from the or .IR other category. - +.IP Each permission mask is eight bits in size, with only six bits currently used. The available permissions are: @@ -412,11 +412,11 @@ The available permissions are: .TP .IR view This permission allows reading attributes of a key. - +.IP This permission is required for the .BR KEYCTL_DESCRIBE operation. - +.IP The permission bits for each category are .BR KEY_POS_VIEW , .BR KEY_USR_VIEW , @@ -426,11 +426,11 @@ and .TP .IR read This permission allows reading a key's payload. - +.IP This permission is required for the .BR KEYCTL_READ operation. - +.IP The permission bits for each category are .BR KEY_POS_READ , .BR KEY_USR_READ , @@ -441,7 +441,7 @@ and .IR write This permission allows update or instantiation of a key's payload. 
For a keyring, it allows keys to be linked and unlinked from the keyring, - +.IP This permission is required for the .BR KEYCTL_UPDATE , .BR KEYCTL_REVOKE , @@ -450,7 +450,7 @@ This permission is required for the and .BR KEYCTL_UNLINK operations. - +.IP The permission bits for each category are .BR KEY_POS_WRITE , .BR KEY_USR_WRITE , @@ -463,7 +463,7 @@ This permission allows keyrings to be searched and keys to be found. Searches can recurse only into nested keyrings that have .I search permission set. - +.IP This permission is required for the .BR KEYCTL_GET_KEYRING_ID , .BR KEYCTL_JOIN_SESSION_KEYRING , @@ -471,7 +471,7 @@ This permission is required for the and .BR KEYCTL_INVALIDATE operations. - +.IP The permission bits for each category are .BR KEY_POS_SEARCH , .BR KEY_USR_SEARCH , @@ -481,13 +481,13 @@ and .TP .IR link This permission allows a key or keyring to be linked to. - +.IP This permission is required for the .BR KEYCTL_LINK and .BR KEYCTL_SESSION_TO_PARENT operations. - +.IP The permission bits for each category are .BR KEY_POS_LINK , .BR KEY_USR_LINK , @@ -497,14 +497,14 @@ and .TP .IR setattr " (since Linux 2.6.15)." This permission allows a key's UID, GID, and permissions mask to be changed. - +.IP This permission is required for the .BR KEYCTL_REVOKE , .BR KEYCTL_CHOWN , and .BR KEYCTL_SETPERM operations. - +.IP The permission bits for each category are .BR KEY_POS_SETATTR , .BR KEY_USR_SETATTR , @@ -520,11 +520,11 @@ all of the permission bits in each of the user categories: .BR KEY_GRP_ALL , and .BR KEY_OTH_ALL . - +.IP The .IR arg4 " and " arg5 arguments are ignored. - +.IP This operation is exposed by .I libkeyutils via the function @@ -532,7 +532,7 @@ via the function .TP .BR KEYCTL_DESCRIBE " (since Linux 2.6.10)" Obtain a string describing the attributes of a specified key. 
- +.IP The ID of the key to be described is specified in .I arg2 (cast to @@ -545,18 +545,18 @@ The descriptive string is returned in the buffer pointed to by (cast to .IR size_t ) specifies the size of that buffer in bytes. - +.IP The key must grant the caller .I view permission. - +.IP The returned string is null-terminated and contains the following information about the key: - +.IP .in +4n .IR type ; uid ; gid ; perm ; description .in - +.IP In the above, .IR type and @@ -569,9 +569,9 @@ are decimal strings, and .I perm is a hexadecimal permissions mask. The descriptive string is written with the following format: - +.IP %s;%d;%d;%08x;%s - +.IP .BR "Note: the intention is that the descriptive string should" .BR "be extensible in future kernel versions". In particular, the @@ -586,7 +586,7 @@ it should be parsed by working backwards from the end of the string to find the last semicolon. This allows future semicolon-delimited fields to be inserted in the descriptive string in the future. - +.IP Writing to the buffer is attempted only when .IR arg3 is non-NULL and the specified buffer size @@ -598,11 +598,11 @@ is large enough to accept the descriptive string In order to determine whether the buffer size was too small, check to see if the return value of the operation is greater than .IR arg4 . - +.IP The .I arg5 argument is ignored. - +.IP This operation is exposed by .I libkeyutils via the function @@ -610,7 +610,7 @@ via the function .TP .B KEYCTL_CLEAR Clear the contents of (i.e., unlink all keys from) a keyring. - +.IP The ID of the key (which must be of keyring type) .\" or the error ENOTDIR results @@ -622,18 +622,18 @@ is provided in .\" This function can also be used to clear special kernel keyrings if they .\" are appropriately marked if the user has CAP_SYS_ADMIN capability. The .\" DNS resolver cache keyring is an example of this. - +.IP The caller must have .I write permission on the keyring. 
- +.IP The arguments .IR arg3 , .IR arg4 , and .IR arg5 are ignored. - +.IP This operation is exposed by .I libkeyutils via the function @@ -641,7 +641,7 @@ via the function .TP .BR KEYCTL_LINK " (since Linux 2.6.10)" Create a link from a keyring to a key. - +.IP The key to be linked is specified in .IR arg2 (cast to @@ -650,10 +650,10 @@ the keyring is specified in .IR arg3 (cast to .IR key_serial_t ). - +.IP If a key with the same type and description is already linked in the keyring, then that key is displaced from the keyring. - +.IP Before creating the link, the kernel checks the nesting of the keyrings and returns appropriate errors if the link would produce a cycle @@ -662,19 +662,19 @@ or if the nesting of keyrings would be too deep .BR KEYRING_SEARCH_MAX_DEPTH , defined with the value 6, and is necessary to prevent overflows on the kernel stack when recursively searching keyrings). - +.IP The caller must have .I link permission on the key being added and .I write permission on the keyring. - +.IP The arguments .IR arg4 and .IR arg5 are ignored. - +.IP This operation is exposed by .I libkeyutils via the function @@ -682,7 +682,7 @@ via the function .TP .BR KEYCTL_UNLINK " (since Linux 2.6.10)" Unlink a key from a keyring. - +.IP The ID of the key to be unlinked is specified in .I arg2 (cast to @@ -691,22 +691,22 @@ the ID of the keyring from which it is to be unlinked is specified in .I arg3 (cast to .IR key_serial_t ). - +.IP If the key is not currently linked into the keyring, an error results. - +.IP The caller must have .I write permission on the keyring from which the key is being removed. - +.IP If the last link to a key is removed, then that key will be scheduled for destruction. - +.IP The arguments .IR arg4 and .IR arg5 are ignored. 
- +.IP This operation is exposed by .I libkeyutils via the function @@ -715,14 +715,14 @@ via the function .BR KEYCTL_SEARCH " (since Linux 2.6.10)" Search for a key in a keyring tree, returning its ID and optionally linking it to a specified keyring. - +.IP The tree to be searched is specified by passing the ID of the head keyring in .IR arg2 (cast to .IR key_serial_t ). The search is performed breadth-first and recursively. - +.IP The .I arg3 and @@ -740,7 +740,7 @@ including the terminating null byte), and contains the description of the key (a null-terminated character string up to 4096 bytes in size, including the terminating null byte). - +.IP The source keyring must grant .I search permission to the caller. @@ -750,9 +750,9 @@ permission will be searched. Only keys with for which the caller has .I search permission can be found. - +.IP If the key is found, its ID is returned as the function result. - +.IP If the key is found and .I arg5 (cast to @@ -766,7 +766,7 @@ If the destination keyring specified in already contains a link to a key that has the same type and description, then that link will be displaced by a link to the key found by this operation. - +.IP Instead of valid existing keyring IDs, the source .RI ( arg2 ) and destination @@ -781,7 +781,7 @@ via the function .TP .BR KEYCTL_READ " (since Linux 2.6.10)" Read the payload data of a key. - +.IP The ID of the key whose payload is to be read is specified in .I arg2 (cast to @@ -790,7 +790,7 @@ This can be the ID of an existing key, or any of the special key IDs listed for .BR KEYCTL_GET_KEYRING_ID . .\" including KEY_SPEC_REQKEY_AUTH_KEY - +.IP The payload is placed in the buffer pointed by .I arg3 (cast to @@ -799,7 +799,7 @@ the size of that buffer must be specified in .I arg4 (cast to .IR size_t ). - +.IP The returned data will be processed for presentation according to the key type. For example, a keyring will return an array of @@ -811,7 +811,7 @@ key type will return its data as is. 
If a key type does not implement this function, the operation fails with the error .BR EOPNOTSUPP . - +.IP If .I arg3 is not NULL, @@ -822,18 +822,18 @@ To determine whether the buffer was of sufficient size, check to see that the return value is less than or equal to the value supplied in .IR arg4 . - +.IP The key must either grant the caller .I read permission, or grant the caller .I search permission when searched for from the process keyrings (i.e., the key is possessed). - +.IP The .I arg5 argument is ignored. - +.IP This operation is exposed by .I libkeyutils via the function @@ -841,12 +841,12 @@ via the function .TP .BR KEYCTL_INSTANTIATE " (since Linux 2.6.10)" (Positively) instantiate an uninstantiated key with a specified payload. - +.IP The ID of the key to be instantiated is provided in .I arg2 (cast to .IR key_serial_t ). - +.IP The key payload is specified in the buffer pointed to by .I arg3 (cast to @@ -855,13 +855,13 @@ the size of that buffer is specified in .I arg4 (cast to .IR size_t ). - +.IP The payload may be a NULL pointer and the buffer size may be 0 if this is supported by the key type (e.g., it is a keyring). - +.IP The operation may be fail if the payload data is in the wrong format or is otherwise invalid. - +.IP If .I arg5 (cast to @@ -870,7 +870,7 @@ is nonzero, then, subject to the same constraints and rules as .BR KEYCTL_LINK , the instantiated key is linked into the keyring whose ID specified in .IR arg5 . - +.IP The caller must have the appropriate authorization key, and once the uninstantiated key has been instantiated, the authorization key is revoked. @@ -880,7 +880,7 @@ program. See .BR request_key (2) for an explanation of uninstantiated keys and key instantiation. - +.IP This operation is exposed by .I libkeyutils via the function @@ -888,15 +888,15 @@ via the function .TP .BR KEYCTL_NEGATE " (since Linux 2.6.10)" Negatively instantiate an uninstantiated key. 
- +.IP This operation is equivalent to the call: - +.IP keyctl(KEYCTL_REJECT, arg2, arg3, ENOKEY, arg4); - +.IP The .I arg5 argument is ignored. - +.IP This operation is exposed by .I libkeyutils via the function @@ -915,7 +915,7 @@ Setting the default keyring also has an effect when requesting a key from user space; see .BR request_key (2) for details. - +.IP The .I arg2 argument (cast to @@ -976,19 +976,19 @@ Use the requestor keyring. .IP All other values are invalid. .\" (including the still-unsupported KEY_REQKEY_DEFL_GROUP_KEYRING) - +.IP The arguments .IR arg3 , .IR arg4 , and .IR arg5 are ignored. - +.IP The setting controlled by this operation is inherited by the child of .BR fork (2) and preserved across .BR execve (2). - +.IP This operation is exposed by .I libkeyutils via the function @@ -996,7 +996,7 @@ via the function .TP .BR KEYCTL_SET_TIMEOUT " (since Linux 2.6.16)" Set a timeout on a key. - +.IP The ID of the key is specified in .I arg2 (cast to @@ -1007,34 +1007,34 @@ is specified in (cast to .IR "unsigned int" ). The timeout is measured against the realtime clock. - +.IP Specifying the timeout value as 0 clears any existing timeout on the key. - +.IP The .I /proc/keys file displays the remaining time until each key will expire. (This is the only method of discovering the timeout on a key.) - +.IP The caller must either have the .I setattr permission on the key or hold an instantiation authorization token for the key (see .BR request_key (2)). - +.IP The key and any links to the key will be automatically garbage collected after the timeout expires. Subsequent attempts to access the key will then fail with the error .BR EKEYEXPIRED . - +.IP This operation cannot be used to set timeouts on revoked, expired, or negatively instantiated keys. - +.IP The arguments .IR arg4 and .IR arg5 are ignored. 
- +.IP This operation is exposed by .I libkeyutils via the function @@ -1043,14 +1043,14 @@ via the function .BR KEYCTL_ASSUME_AUTHORITY " (since Linux 2.6.16)" Assume (or divest) the authority for the calling thread to instantiate a key. - +.IP The .I arg2 argument (cast to .IR key_serial_t ) specifies either a nonzero key ID to assume authority, or the value 0 to divest authority. - +.IP If .I arg2 is nonzero, then it specifies the ID of an uninstantiated key for which @@ -1063,7 +1063,7 @@ or .BR KEYCTL_NEGATE . Once the key has been instantiated, the thread is automatically divested of authority to instantiate the key. - +.IP Authority over a key can be assumed only if the calling thread has present in its keyrings the authorization key that is associated with the specified key. @@ -1077,19 +1077,19 @@ for an explanation of how this operation is used.) The caller must have .I search permission on the authorization key. - +.IP If the specified key has a matching authorization key, then the ID of that key is returned. The authorization key can be read .RB ( KEYCTL_READ ) to obtain the callout information passed to .BR request_key (2). - +.IP If the ID given in .I arg2 is 0, then the currently assumed authority is cleared (divested), and the value 0 is returned. - +.IP The .BR KEYCTL_ASSUME_AUTHORITY mechanism allows a program such as @@ -1101,14 +1101,14 @@ For further information, see .BR request_key (2) and the kernel source file .IR Documentation/security/keys-request-key.txt . - +.IP The arguments .IR arg3 , .IR arg4 , and .IR arg5 are ignored. - +.IP This operation is exposed by .I libkeyutils via the function @@ -1117,7 +1117,7 @@ via the function .BR KEYCTL_GET_SECURITY " (since Linux 2.6.26)" .\" commit 70a5bb72b55e82fbfbf1e22cae6975fac58a1e2d Get the LSM (Linux Security Module) security label of the specified key. 
- +.IP The ID of the key whose security label is to be fetched is specified in .I arg2 (cast to @@ -1131,7 +1131,7 @@ the size of the buffer must be provided in .I arg4 (cast to .IR size_t ). - +.IP If .I arg3 is specified as NULL or the buffer size specified in @@ -1140,24 +1140,24 @@ is too small, the full size of the security label string (including the terminating null byte) is returned as the function result, and nothing is copied to the buffer. - +.IP The caller must have .I view permission on the specified key. - +.IP The returned security label string will be rendered in a form appropriate to the LSM in force. For example, with SELinux, it may look like: - +.IP unconfined_u:unconfined_r:unconfined_t:s0-s0:c0.c1023 - +.IP If no LSM is currently in force, then an empty string is placed in the buffer. - +.IP The .I arg5 argument is ignored. - +.IP This operation is exposed by .I libkeyutils via the functions @@ -1174,10 +1174,10 @@ subscribes with the session keyring of the calling process. .\" What is the use case for KEYCTL_SESSION_TO_PARENT? .\" David Howells: the Process Authentication Groups people requested this, .\" but then didn't use it; maybe there are no users. - +.IP The keyring will be replaced in the parent process at the point where the parent next transitions from kernel space to user space. - +.IP The keyring must exist and must grant the caller .I link permission. @@ -1187,7 +1187,7 @@ and must not be set-user-ID or set-group-ID. The UID of the parent process's existing session keyring (f it has one), as well as the UID of the caller's session keyring much match the caller's effective UID. - +.IP The fact that it is the parent process that is affected by this operation allows a program such as the shell to start a child process that uses this operation to change the shell's session keyring. @@ -1195,7 +1195,7 @@ uses this operation to change the shell's session keyring. .BR keyctl (1) .B new_session command does.) 
- +.IP The arguments .IR arg2 , .IR arg3 , @@ -1203,7 +1203,7 @@ The arguments and .IR arg5 are ignored. - +.IP This operation is exposed by .I libkeyutils via the function @@ -1216,7 +1216,7 @@ on the key. This operation provides a superset of the functionality of the earlier .BR KEYCTL_NEGATE operation. - +.IP The ID of the key that is to be negatively instantiated is specified in .I arg2 (cast to @@ -1236,7 +1236,7 @@ typically, this is one of .BR EKEYREVOKED , or .BR EKEYEXPIRED . - +.IP If .I arg5 (cast to @@ -1246,14 +1246,14 @@ is nonzero, then, subject to the same constraints and rules as the negatively instantiated key is linked into the keyring whose ID is specified in .IR arg5 . - +.IP The caller must have the appropriate authorization key. In other words, this operation is available only from a .BR request-key (8)-style program. See .BR request_key (2). - +.IP The caller must have the appropriate authorization key, and once the uninstantiated key has been instantiated, the authorization key is revoked. @@ -1263,7 +1263,7 @@ program. See .BR request_key (2) for an explanation of uninstantiated keys and key instantiation. - +.IP This operation is exposed by .I libkeyutils via the function @@ -1273,13 +1273,13 @@ via the function .\" commit ee009e4a0d4555ed522a631bae9896399674f063 Instantiate an uninstantiated key with a payload specified via a vector of buffers. - +.IP This operation is the same as .BR KEYCTL_INSTANTIATE , but the payload data is specified as an array of .IR iovec structures: - +.IP .in +4n .nf struct iovec { @@ -1288,7 +1288,7 @@ struct iovec { }; .fi .in - +.IP The pointer to the payload vector is specified in .IR arg3 (cast as @@ -1297,7 +1297,7 @@ The number of items in the vector is specified in .IR arg4 (cast as .IR "unsigned int" ). - +.IP The .I arg2 (key ID) @@ -1306,7 +1306,7 @@ and (keyring ID) are interpreted as for .BR KEYCTL_INSTANTIATE . 
- +.IP This operation is exposed by .I libkeyutils via the function @@ -1315,18 +1315,18 @@ via the function .BR KEYCTL_INVALIDATE " (since Linux 3.5)" .\" commit fd75815f727f157a05f4c96b5294a4617c0557da Mark a key as invalid. - +.IP The ID of the key to be invalidated is specified in .I arg2 (cast to .IR key_serial_t ). - +.IP To invalidate a key, the caller must have .I search permission on the key. .\" CAP_SYS_ADMIN is permitted to invalidate certain special keys - +.IP This operation marks the key as invalid and schedules immediate garbage collection. The garbage collector removes the invalidated key from all keyrings and @@ -1334,20 +1334,20 @@ deletes the key when its reference count reaches zero. After this operation, the key will be ignored by all searches, even if it is not yet deleted. - +.IP Keys that are marked invalid become invisible to normal key operations immediately, though they are still visible in .I /proc/keys (marked with an 'i' flag) until they are actually removed. - +.IP The arguments .IR arg3 , .IR arg4 , and .IR arg5 are ignored. - +.IP This operation is exposed by .I libkeyutils via the function @@ -1358,7 +1358,7 @@ via the function Get the persistent keyring .RB ( persistent-keyring (7)) for a specified user and link it to a specified keyring. - +.IP The user ID is specified in .I arg2 (cast to @@ -1368,44 +1368,44 @@ The ID of the destination keyring is specified in .I arg3 (cast to .IR key_serial_t ). - +.IP The caller must have the .BR CAP_SETUID capability in its user namespace in order to fetch the persistent keyring for a user ID that does not match either the real or effective user ID of the caller. - +.IP If the call is successful, a link to the persistent keyring is added to the keyring whose ID was specified in .IR arg3 . - +.IP The caller must have .I write permission on the keyring. - +.IP The persistent keyring will be created by the kernel if it does not yet exist. 
- +.IP Each time the .B KEYCTL_GET_PERSISTENT operation is performed, the persistent keyring will have its expiration timeout reset to the value in: - +.IP /proc/sys/kernel/keys/persistent_keyring_expiry - +.IP Should the timeout be reached, the persistent keyring will be removed and everything it pins can then be garbage collected. - +.IP Persistent keyrings were added to Linux in kernel version 3.13. - +.IP The arguments .IR arg4 and .IR arg5 are ignored. - +.IP This operation is exposed by .I libkeyutils via the function @@ -1414,7 +1414,7 @@ via the function .BR KEYCTL_DH_COMPUTE " (since Linux 4.7)" .\" commit ddbb41148724367394d0880c516bfaeed127b52e Compute a Diffie-Hellman shared secret or public key. - +.IP The .I arg2 argument is a pointer to a set of parameters containing @@ -1422,7 +1422,7 @@ serial numbers for three .IR """user""" keys used in the Diffie-Hellman calculation, packaged in a structure of the following form: - +.IP .nf .in +4n struct keyctl_dh_params { @@ -1433,18 +1433,18 @@ struct keyctl_dh_params { }; .in .fi - +.IP Each of the three keys specified in this structure must grant the caller .I read permission. The payloads of these keys are used to calculate the Diffie-Hellman result as: - +.IP base ^ private mod prime - +.IP If the base is the shared generator, the result is the local public key. If the base is the remote public key, the result is the shared secret. - +.IP The .I arg3 argument (cast to @@ -1454,7 +1454,7 @@ The size of that buffer is specified in .I arg4 (cast to .IR size_t ). - +.IP The buffer must be large enough to accommodate the output data, otherwise an error is returned. If @@ -1463,20 +1463,20 @@ is specified zero, in which case the buffer is not used and the operation returns the minimum required buffer size (i.e., the length of the prime). - +.IP Diffie-Hellman computations can be performed in user space, but require a multiple-precision integer (MPI) library. 
Moving the implementation into the kernel gives access to the kernel MPI implementation, and allows access to secure or acceleration hardware. - +.IP Adding support for DH computation to the .BR keyctl() system call was considered a good fit due to the DH algorithm's use for deriving shared keys; it also allows the type of the key to determine which DH implementation (software or hardware) is appropriate. - +.IP The .I arg5 argument is reserved and must be 0. @@ -1684,7 +1684,7 @@ program provided by the package. For informational purposes, the program records various information in a log file. - +.PP As described in .BR request_key (2), the @@ -1694,7 +1694,7 @@ describe a key that is to be instantiated. The example program fetches and logs these arguments. The program assumes authority to instantiate the requested key, and then instantiates that key. - +.PP The following shell session demonstrates the use of this program. In the session, we compile the program and then use it to temporarily replace the standard @@ -1707,7 +1707,7 @@ While our example program is installed, we use the example program shown in .BR request_key (2) to request a key. - +.PP .nf .in +4n $ \fBcc \-o key_instantiate key_instantiate.c \-lkeyutils\fP @@ -1718,10 +1718,10 @@ Key ID is 20d035bf $ \fBsudo mv /sbin/request\-key.backup /sbin/request\-key\fP .in .fi - +.PP Looking at the log file created by this program, we can see the command-line arguments supplied to our example program: - +.PP .nf .in +4n $ \fBcat /tmp/key_instantiate.log \fP @@ -1743,7 +1743,7 @@ Destination keyring: 256e6a6 Auth key description: .request_key_auth;1000;1000;0b010000;20d035bf .in .fi - +.PP The last few lines of the above output show that the example program was able to fetch: .IP * 3 @@ -1778,7 +1778,7 @@ we can also see the newly created key with the name .IR mykey and ID .IR 20d035bf . 
- +.PP .nf .in +4n $ \fBcat /proc/keys | egrep \(aqmykey|256e6a6\(aq\fP @@ -1991,7 +1991,7 @@ main(int argc, char *argv[]) .BR user\-namespaces (7), .BR user\-session\-keyring (7), .BR request\-key (8) - +.PP The kernel source files .IR Documentation/security/keys.txt and diff --git a/man2/kill.2 b/man2/kill.2 index f7ecad1dc..580fa9b69 100644 --- a/man2/kill.2 +++ b/man2/kill.2 @@ -85,7 +85,7 @@ If \fIsig\fP is 0, then no signal is sent, but existence and permission checks are still performed; this can be used to check for the existence of a process ID or process group ID that the caller is permitted to signal. - +.PP For a process to have permission to send a signal, it must either be privileged (under Linux: have the .B CAP_KILL diff --git a/man2/link.2 b/man2/link.2 index 5dde32413..7d5e6d34e 100644 --- a/man2/link.2 +++ b/man2/link.2 @@ -66,13 +66,13 @@ _ATFILE_SOURCE .SH DESCRIPTION .BR link () creates a new link (also known as a hard link) to an existing file. - +.PP If .I newpath exists, it will .I not be overwritten. - +.PP This new name may be used exactly as the old one for any operation; both names refer to the same file (and so have the same permissions and ownership) and it is impossible to tell which name was the @@ -83,7 +83,7 @@ The system call operates in exactly the same way as .BR link (), except for the differences described here. - +.PP If the pathname given in .I oldpath is relative, then it is interpreted relative to the directory @@ -93,7 +93,7 @@ referred to by the file descriptor the calling process, as is done by .BR link () for a relative pathname). - +.PP If .I oldpath is relative and @@ -105,13 +105,13 @@ then is interpreted relative to the current working directory of the calling process (like .BR link ()). - +.PP If .I oldpath is absolute, then .I olddirfd is ignored. 
- +.PP The interpretation of .I newpath is as for @@ -119,7 +119,7 @@ is as for except that a relative pathname is interpreted relative to the directory referred to by the file descriptor .IR newdirfd . - +.PP The following values can be bitwise ORed in .IR flags : .TP @@ -168,7 +168,7 @@ If procfs is mounted, this can be used as an alternative to .BR AT_EMPTY_PATH , like this: - +.IP .nf .in +4n linkat(AT_FDCWD, "/proc/self/fd/", newdirfd, @@ -309,9 +309,9 @@ capability. An attempt was made to link to the .I /proc/self/fd/NN file corresponding to a file descriptor created with - +.IP open(path, O_TMPFILE | O_EXCL, mode); - +.IP See .BR open (2). .TP @@ -354,7 +354,7 @@ SVr4, 4.3BSD, POSIX.1-2001 (but see NOTES), POSIX.1-2008. .\" SVr4 documents additional ENOLINK and .\" EMULTIHOP error conditions; POSIX.1 does not document ELOOP. .\" X/OPEN does not document EFAULT, ENOMEM or EIO. - +.PP .BR linkat (): POSIX.1-2008. .SH NOTES @@ -364,7 +364,7 @@ cannot span filesystems. Use .BR symlink (2) if this is required. - +.PP POSIX.1-2001 says that .BR link () should dereference diff --git a/man2/listen.2 b/man2/listen.2 index 678cd1487..9e45160ae 100644 --- a/man2/listen.2 +++ b/man2/listen.2 @@ -60,14 +60,14 @@ marks the socket referred to by as a passive socket, that is, as a socket that will be used to accept incoming connection requests using .BR accept (2). - +.PP The .I sockfd argument is a file descriptor that refers to a socket of type .B SOCK_STREAM or .BR SOCK_SEQPACKET . - +.PP The .I backlog argument defines the maximum length @@ -146,7 +146,7 @@ POSIX.1 does not require the inclusion of and this header file is not required on Linux. However, some historical (BSD) implementations required this header file, and portable applications are probably wise to include it. - +.PP The behavior of the .I backlog argument on TCP sockets changed with Linux 2.2. @@ -162,7 +162,7 @@ length and this setting is ignored. See .BR tcp (7) for more information. 
- +.PP If the .I backlog argument is greater than the value in diff --git a/man2/listxattr.2 b/man2/listxattr.2 index fb7eb3f95..2a37c5e58 100644 --- a/man2/listxattr.2 +++ b/man2/listxattr.2 @@ -88,7 +88,7 @@ A single extended attribute is a null-terminated string. The name includes a namespace prefix; there may be several, disjoint namespaces associated with an individual inode. - +.PP If .I size is specified as zero, these calls return the current size of the @@ -182,7 +182,7 @@ and .BR getxattr (2). For the file whose pathname is provided as a command-line argument, it lists all extended file attributes and their values. - +.PP To keep the code simple, the program assumes that attribute keys and values are constant during the execution of the program. A production program should expect and handle changes during @@ -199,7 +199,7 @@ with a larger buffer each time it fails with the error Calls to .BR getxattr (2) could be handled similarly. - +.PP The following output was recorded by first creating a file, setting some extended file attributes, and then listing the attributes with the example program. diff --git a/man2/llseek.2 b/man2/llseek.2 index da18e85f0..f53b4465a 100644 --- a/man2/llseek.2 +++ b/man2/llseek.2 @@ -59,7 +59,7 @@ or respectively. It returns the resulting file position in the argument .IR result . - +.PP This system call exists on various 32-bit platforms to support seeking to large file offsets. .SH RETURN VALUE diff --git a/man2/lookup_dcookie.2 b/man2/lookup_dcookie.2 index 0059fdc0f..fa33856e5 100644 --- a/man2/lookup_dcookie.2 +++ b/man2/lookup_dcookie.2 @@ -35,7 +35,7 @@ Look up the full path of the directory entry specified by the value The cookie is an opaque identifier uniquely identifying a particular directory entry. The buffer given is filled in with the full path of the directory entry. 
- +.PP For .BR lookup_dcookie () to return successfully, @@ -84,7 +84,7 @@ is a special-purpose system call, currently used only by the .BR oprofile (1) profiler. It relies on a kernel driver to register cookies for directory entries. - +.PP The path returned may be suffixed by the string " (deleted)" if the directory entry has been removed. .SH SEE ALSO diff --git a/man2/lseek.2 b/man2/lseek.2 index 208990acf..e33a9949e 100644 --- a/man2/lseek.2 +++ b/man2/lseek.2 @@ -121,13 +121,13 @@ In both of the above cases, fails if .I offset points past the end of the file. - +.PP These operations allow applications to map holes in a sparsely allocated file. This can be useful for applications such as file backup tools, which can save space when creating backups and preserve holes, if they have a mechanism for discovering holes. - +.PP For the purposes of these operations, a hole is a sequence of zeros that (normally) has not been allocated in the underlying file storage. However, a filesystem is not obliged to report holes, @@ -150,7 +150,7 @@ it can be considered to consist of data that is a sequence of zeros). .\" https://lkml.org/lkml/2011/4/22/79 .\" http://lwn.net/Articles/440255/ .\" http://blogs.oracle.com/bonwick/entry/seek_hole_and_seek_data - +.PP The .BR _GNU_SOURCE feature test macro must be defined in order to obtain the definitions of @@ -159,7 +159,7 @@ and .BR SEEK_HOLE from .IR <unistd.h> . - +.PP The .BR SEEK_HOLE and @@ -223,7 +223,7 @@ The resulting file offset cannot be represented in an is associated with a pipe, socket, or FIFO. .SH CONFORMING TO POSIX.1-2001, POSIX.1-2008, SVr4, 4.3BSD. - +.PP .BR SEEK_DATA and .BR SEEK_HOLE @@ -236,7 +236,7 @@ See .BR open (2) for a discussion of the relationship between file descriptors, open file descriptions, and files. - +.PP If the .B O_APPEND file status flag is set on the open file description, @@ -245,15 +245,15 @@ then a .I always moves the file offset to the end of the file, regardless of the use of .BR lseek (). 
- +.PP The .I off_t data type is a signed integer data type specified by POSIX.1. - +.PP Some devices are incapable of seeking and POSIX does not specify which devices must support .BR lseek (). - +.PP On Linux, using .BR lseek () on a terminal device fails with the error diff --git a/man2/madvise.2 b/man2/madvise.2 index 5ac9aca3c..d5db4ab4b 100644 --- a/man2/madvise.2 +++ b/man2/madvise.2 @@ -67,7 +67,7 @@ and with size bytes In most cases, the goal of such advice is to improve system or application performance. - +.PP Initially, the system call supported a set of "conventional" .I advice values, which are also available on several other implementations. @@ -125,7 +125,7 @@ Expect access in the near future. Do not expect access in the near future. (For the time being, the application is finished with the given range, so the kernel can free resources associated with it.) - +.IP After a successful .B MADV_DONTNEED operation, @@ -136,14 +136,14 @@ up-to-date contents of the underlying mapped file (for shared file mappings, shared anonymous mappings, and shmem-based techniques such as System V shared memory segments) or zero-fill-on-demand pages for anonymous private mappings. - +.IP Note that, when applied to shared mappings, .BR MADV_DONTNEED might not lead to immediate freeing of the pages in the range. The kernel is free to delay freeing the pages until an appropriate moment. The resident set size (RSS) of the calling process will be immediately reduced however. - +.IP .B MADV_DONTNEED cannot be applied to locked pages, Huge TLB pages, or .BR VM_PFNMAP @@ -181,12 +181,12 @@ bytes containing zero. .\" bufferpool (shared memory segments) - without writing back to .\" disk/swap space. This feature is also useful for supporting .\" hot-plug memory on UML. - +.IP The specified address range must be mapped shared and writable. This flag cannot be applied to locked pages, Huge TLB pages, or .BR VM_PFNMAP pages. 
- +.IP In the initial implementation, only .BR tmpfs (5) is supported @@ -255,7 +255,7 @@ processes. This operation may result in the calling process receiving a .B SIGBUS and the page being unmapped. - +.IP This feature is intended for testing of memory error-handling code; it is available only if the kernel was configured with .BR CONFIG_MEMORY_FAILURE . @@ -273,14 +273,14 @@ These are replaced by a single write-protected page (which is automatically copied if a process later wants to update the content of the page). KSM merges only private anonymous pages (see .BR mmap (2)). - +.IP The KSM feature is intended for applications that generate many instances of the same data (e.g., virtualization systems such as KVM). It can consume a lot of processing power; use with care. See the Linux kernel source file .I Documentation/vm/ksm.txt for more details. - +.IP The .BR MADV_MERGEABLE and @@ -312,7 +312,7 @@ The effect of the .B MADV_SOFT_OFFLINE operation is invisible to (i.e., does not change the semantics of) the calling process. - +.IP This feature is intended for testing of memory error-handling code; it is available only if the kernel was configured with .BR CONFIG_MEMORY_FAILURE . @@ -332,7 +332,7 @@ to replace them with huge pages. The kernel will also allocate huge pages directly when the region is naturally aligned to the huge page size (see .BR posix_memalign (2)). - +.IP This feature is primarily aimed at applications that use large mappings of data and access large regions of that memory at a time (e.g., virtualization systems such as QEMU). @@ -341,7 +341,7 @@ It can very easily waste memory (e.g., a 2MB mapping that only ever accesses See the Linux kernel source file .I Documentation/vm/transhuge.txt for more details. - +.IP The .BR MADV_HUGEPAGE and @@ -397,7 +397,7 @@ If there is no subsequent write, the kernel can free the pages at any time. 
Once pages in the range have been freed, the caller will see zero-fill-on-demand pages upon subsequent page references. - +.IP The .B MADV_FREE operation @@ -496,7 +496,7 @@ Other implementations typically implement at least the flags listed above under .IR "Conventional advice flags" , albeit with some variation in semantics. - +.PP POSIX.1-2001 describes .BR posix_madvise (3) with constants diff --git a/man2/mbind.2 b/man2/mbind.2 index d3d726ceb..fa2d99553 100644 --- a/man2/mbind.2 +++ b/man2/mbind.2 @@ -55,7 +55,7 @@ and continuing for .I len bytes. The memory policy defines from which node memory is allocated. - +.PP If the memory range specified by the .IR addr " and " len arguments includes an "anonymous" region of memory\(emthat is @@ -77,7 +77,7 @@ an initial read access will allocate pages according to the memory policy of the thread that causes the page to be allocated. This may not be the thread that called .BR mbind (). - +.PP The specified policy will be ignored for any .B MAP_SHARED mappings in the specified memory range. @@ -85,7 +85,7 @@ Rather the pages will be allocated according to the memory policy of the thread that caused the page to be allocated. Again, this may not be the thread that called .BR mbind (). - +.PP If the specified memory range includes a shared memory region created using the .BR shmget (2) @@ -102,7 +102,7 @@ the huge pages will be allocated according to the policy specified only if the page allocation is caused by the process that calls .BR mbind () for that region. - +.PP By default, .BR mbind () has an effect only for new allocations; if the pages inside @@ -113,7 +113,7 @@ This default behavior may be overridden by the and .B MPOL_MF_MOVE_ALL flags described below. - +.PP The .I mode argument must specify one of @@ -130,7 +130,7 @@ require the caller to specify the node or nodes to which the mode applies, via the .I nodemask argument. 
- +.PP The .I mode argument may also include an optional @@ -182,7 +182,7 @@ allowed by the thread's current cpuset context .B MPOL_F_STATIC_NODES mode flag is specified), and contains memory. - +.PP The .I mode argument must include one of the following values: @@ -296,7 +296,7 @@ if the existing pages in the memory range don't follow the policy. .\" --Lee Schermerhorn .\" In 2.6.16 or later the kernel will also try to move pages .\" to the requested node with this flag. - +.PP If .B MPOL_MF_MOVE is specified in @@ -309,7 +309,7 @@ If is also specified, then the call will fail with the error .B EIO if some pages could not be moved. - +.PP If .B MPOL_MF_MOVE_ALL is passed in @@ -427,12 +427,12 @@ This system call is Linux-specific. .SH NOTES For information on library support, see .BR numa (7). - +.PP NUMA policy is not supported on a memory-mapped file range that was mapped with the .B MAP_SHARED flag. - +.PP The .B MPOL_DEFAULT mode can have different effects for @@ -466,14 +466,14 @@ with an empty set of nodes. This method will work for .BR set_mempolicy (2), as well. - +.PP Support for huge page policy was added with 2.6.16. For interleave policy to be effective on huge page mappings the policied memory needs to be tens of megabytes or larger. - +.PP .B MPOL_MF_STRICT is ignored on huge page mappings. - +.PP .B MPOL_MF_MOVE and .B MPOL_MF_MOVE_ALL diff --git a/man2/membarrier.2 b/man2/membarrier.2 index 1a800a4f1..dfa48ca45 100644 --- a/man2/membarrier.2 +++ b/man2/membarrier.2 @@ -39,12 +39,12 @@ effectively is .I not as simple as replacing memory barriers with this system call, but requires understanding of the details below. - +.PP Use of memory barriers needs to be done taking into account that a memory barrier always needs to be either matched with its memory barrier counterparts, or that the architecture's memory model doesn't require the matching barriers. 
- +.PP There are cases where one side of the matching barriers (which we will refer to as "fast side") is executed much more often than the other (which we will refer to as "slow side"). @@ -53,18 +53,18 @@ This is a prime target for the use of The key idea is to replace, for these matching barriers, the fast-side memory barriers by simple compiler barriers, for example: - +.PP asm volatile ("" : : : "memory") - +.PP and replace the slow-side memory barriers by calls to .BR membarrier (). - +.PP This will add overhead to the slow side, and remove overhead from the fast side, thus resulting in an overall performance increase as long as the slow side is infrequent enough that the overhead of the .BR membarrier () calls does not outweigh the performance gain on the fast side. - +.PP The .I cmd argument is one of the following: @@ -95,7 +95,7 @@ argument is currently unused and must be specified as 0. All memory accesses performed in program order from each targeted thread are guaranteed to be ordered with respect to .BR membarrier (). - +.PP If we use the semantic .I barrier() to represent a compiler barrier forcing memory @@ -109,7 +109,7 @@ each pairing of and .IR smp_mb() . The pair ordering is detailed as (O: ordered, X: not ordered): - +.PP barrier() smp_mb() membarrier() barrier() X X O smp_mb() X O O @@ -124,7 +124,7 @@ On error, \-1 is returned, and .I errno is set appropriately. - +.PP For a given command, with .I flags set to 0, this system call is @@ -171,10 +171,10 @@ matching barriers on other cores. For instance, a load fence can order loads prior to and following that fence with respect to stores ordered by store fences. - +.PP Program order is the order in which instructions are ordered in the program assembly code. 
- +.PP Examples where .BR membarrier () can be useful include implementations @@ -184,7 +184,7 @@ Assuming a multithreaded application where "fast_path()" is executed very frequently, and where "slow_path()" is executed infrequently, the following code (x86) can be transformed using .BR membarrier (): - +.PP .in +4n .nf #include <stdlib.h> @@ -230,11 +230,11 @@ main(int argc, char **argv) } .fi .in - +.PP The code above transformed to use .BR membarrier () becomes: - +.PP .in +4n .nf #define _GNU_SOURCE diff --git a/man2/memfd_create.2 b/man2/memfd_create.2 index 6fe3ac759..fa17466d2 100644 --- a/man2/memfd_create.2 +++ b/man2/memfd_create.2 @@ -51,14 +51,14 @@ memory allocations such as those allocated using with the .BR MAP_ANONYMOUS flag. - +.PP The initial size of the file is set to 0. Following the call, the file size should be set using .BR ftruncate (2). (Alternatively, the file may be populated by calls to .BR write (2) or similar.) - +.PP The name supplied in .I name is used as a filename and will be displayed @@ -69,7 +69,7 @@ The displayed name is always prefixed with and serves only for debugging purposes. Names do not affect the behavior of the file descriptor, and as such multiple files can have the same name without any side effects. - +.PP The following values may be bitwise ORed in .IR flags to change the behavior of @@ -104,7 +104,7 @@ meaning that no other seals can be set on the file. Unused bits in .I flags must be 0. - +.PP As its return value, .BR memfd_create () returns a new file descriptor that can be used to refer to the file. @@ -113,7 +113,7 @@ This file descriptor is opened for both reading and writing and .B O_LARGEFILE is set for the file descriptor. - +.PP With respect to .BR fork (2) and @@ -166,7 +166,7 @@ system call is Linux-specific. .SH NOTES Glibc does not provide a wrapper for this system call; call it using .BR syscall (2). 
- +.PP .\" See also http://lwn.net/Articles/593918/ .\" and http://lwn.net/Articles/594919/ and http://lwn.net/Articles/591108/ The @@ -179,7 +179,7 @@ The primary purpose of is to create files and associated file descriptors that are used with the file-sealing APIs provided by .BR fcntl (2). - +.PP The .BR memfd_create () system call also has uses without file sealing @@ -211,13 +211,13 @@ location in the shared memory region. (Dealing with this possibility necessitates the use of a handler for the .BR SIGBUS signal.) - +.PP Dealing with untrusted peers imposes extra complexity on code that employs shared memory. Memory sealing enables that extra complexity to be eliminated, by allowing a process to operate secure in the knowledge that its peer can't modify the shared memory in an undesired fashion. - +.PP An example of the usage of the sealing mechanism is as follows: .IP 1. 3 The first process creates a @@ -297,7 +297,7 @@ seal has not yet been applied). Below are shown two example programs that demonstrate the use of .BR memfd_create () and the file sealing API. - +.PP The first program, .IR t_memfd_create.c , creates a @@ -312,18 +312,18 @@ The first argument is the name to associate with the file, the second argument is the size to be set for the file, and the optional third argument is a string of characters that specify seals to be set on file. - +.PP The second program, .IR t_get_seals.c , can be used to open an existing file that was created via .BR memfd_create () and inspect the set of seals that have been applied to that file. - +.PP The following shell session demonstrates the use of these programs. First we create a .BR tmpfs (5) file and set some seals on it: - +.PP .in +4n .nf $ \fB./t_memfd_create my_memfd_file 4096 sw &\fP @@ -331,7 +331,7 @@ $ \fB./t_memfd_create my_memfd_file 4096 sw &\fP PID: 11775; fd: 3; /proc/11775/fd/3 .fi .in - +.PP At this point, the .I t_memfd_create program continues to run in the background. 
@@ -347,7 +347,7 @@ Using that pathname, we inspect the content of the symbolic link, and use our .I t_get_seals program to view the seals that have been placed on the file: - +.PP .in +4n .nf $ \fBreadlink /proc/11775/fd/3\fP diff --git a/man2/migrate_pages.2 b/man2/migrate_pages.2 index 1aea17c3f..9498fdddd 100644 --- a/man2/migrate_pages.2 +++ b/man2/migrate_pages.2 @@ -44,7 +44,7 @@ the kernel maintains the relative topology relationship inside .I old_nodes during the migration to .IR new_nodes . - +.PP The .I old_nodes and @@ -66,7 +66,7 @@ as in .BR mbind (2), but different from .BR select (2)). - +.PP The .I pid argument is the ID of the process whose pages are to be moved. @@ -80,7 +80,7 @@ If is 0, then .BR migrate_pages () moves pages of the calling process. - +.PP Pages shared with another process will be moved only if the initiating process has the .B CAP_SYS_NICE @@ -142,7 +142,7 @@ This system call is Linux-specific. .SH NOTES For information on library support, see .BR numa (7). - +.PP Use .BR get_mempolicy (2) with the @@ -151,7 +151,7 @@ flag to obtain the set of nodes that are allowed by the calling process's cpuset. Note that this information is subject to change at any time by manual or automatic reconfiguration of the cpuset. - +.PP Use of .BR migrate_pages () may result in pages whose location @@ -163,7 +163,7 @@ and/or the specified process (see That is, memory policy does not constrain the destination nodes used by .BR migrate_pages (). - +.PP The .I <numaif.h> header is not included with glibc, but requires installing @@ -179,6 +179,6 @@ or a similar package. .BR numa (7), .BR migratepages (8), .BR numastat (8) - +.PP .IR Documentation/vm/page_migration in the Linux kernel source tree diff --git a/man2/mincore.2 b/man2/mincore.2 index 834cef3be..f3208b565 100644 --- a/man2/mincore.2 +++ b/man2/mincore.2 @@ -62,7 +62,7 @@ starting at the address and continuing for .I length bytes. - +.PP The .I addr argument must be a multiple of the system page size. 
@@ -76,7 +76,7 @@ One may obtain the page size .RB ( PAGE_SIZE ) using .IR sysconf(_SC_PAGESIZE) . - +.PP The .I vec argument must point to an array containing at least diff --git a/man2/mkdir.2 b/man2/mkdir.2 index 592a6070f..a2b04044f 100644 --- a/man2/mkdir.2 +++ b/man2/mkdir.2 @@ -48,7 +48,7 @@ _ATFILE_SOURCE .BR mkdir () attempts to create a directory named .IR pathname . - +.PP The argument .I mode specifies the mode for the new directory (see @@ -62,7 +62,7 @@ Whether other .I mode bits are honored for the created directory depends on the operating system. For Linux, see NOTES below. - +.PP The newly created directory will be owned by the effective user ID of the process. If the directory containing the file has the set-group-ID @@ -72,7 +72,7 @@ or, synonymously .IR "mount -o grpid" ), the new directory will inherit the group ownership from its parent; otherwise it will be owned by the effective group ID of the process. - +.PP If the parent directory has the set-group-ID bit set, then so will the newly created directory. .\" @@ -83,7 +83,7 @@ The system call operates in exactly the same way as .BR mkdir (), except for the differences described here. - +.PP If the pathname given in .I pathname is relative, then it is interpreted relative to the directory @@ -93,7 +93,7 @@ referred to by the file descriptor the calling process, as is done by .BR mkdir () for a relative pathname). - +.PP If .I pathname is relative and @@ -105,7 +105,7 @@ then is interpreted relative to the current working directory of the calling process (like .BR mkdir ()). - +.PP If .I pathname is absolute, then @@ -209,7 +209,7 @@ library support was added to glibc in version 2.4. .BR mkdir (): SVr4, BSD, POSIX.1-2001, POSIX.1-2008. .\" SVr4 documents additional EIO, EMULTIHOP - +.PP .BR mkdirat (): POSIX.1-2008. 
.SH NOTES diff --git a/man2/mknod.2 b/man2/mknod.2 index e3708cb34..475115083 100644 --- a/man2/mknod.2 +++ b/man2/mknod.2 @@ -55,7 +55,7 @@ with attributes specified by .I mode and .IR dev . - +.PP The .I mode argument specifies both the file mode to use and the type of node @@ -63,13 +63,13 @@ to be created. It should be a combination (using bitwise OR) of one of the file types listed below and zero or more of the file mode bits listed in .BR inode (7). - +.PP The file mode is modified by the process's .I umask in the usual way: in the absence of a default ACL, the permissions of the created node are .RI ( mode " & ~" umask ). - +.PP The file type must be one of .BR S_IFREG , .BR S_IFCHR , @@ -83,7 +83,7 @@ special file, block special file, FIFO (named pipe), or UNIX domain socket, respectively. (Zero file type is equivalent to type .BR S_IFREG .) - +.PP If the file type is .B S_IFCHR or @@ -96,13 +96,13 @@ special file may be useful to build the value for .IR dev ); otherwise it is ignored. - +.PP If .I pathname already exists, or is a symbolic link, this call fails with an .B EEXIST error. - +.PP The newly created node will be owned by the effective user ID of the process. If the directory containing the node has the set-group-ID @@ -117,7 +117,7 @@ The system call operates in exactly the same way as .BR mknod (), except for the differences described here. - +.PP If the pathname given in .I pathname is relative, then it is interpreted relative to the directory @@ -127,7 +127,7 @@ referred to by the file descriptor the calling process, as is done by .BR mknod () for a relative pathname). - +.PP If .I pathname is relative and @@ -139,7 +139,7 @@ then is interpreted relative to the current working directory of the calling process (like .BR mknod ()). - +.PP If .I pathname is absolute, then @@ -251,7 +251,7 @@ SVr4, 4.4BSD, POSIX.1-2001 (but see below), POSIX.1-2008. 
.\" The Linux version differs from the SVr4 version in that it .\" does not require root permission to create pipes, also in that no .\" EMULTIHOP, ENOLINK, or EINTR error is documented. - +.PP .BR mknodat (): POSIX.1-2008. .SH NOTES @@ -272,14 +272,14 @@ However, nowadays one should never use for this purpose; one should use .BR mkfifo (3), a function especially defined for this purpose. - +.PP Under Linux, .BR mknod () cannot be used to create directories. One should make directories with .BR mkdir (2). .\" and one should make UNIX domain sockets with socket(2) and bind(2). - +.PP There are many infelicities in the protocol underlying NFS. Some of these affect .BR mknod () diff --git a/man2/mlock.2 b/man2/mlock.2 index 7f11b30e4..3800a2217 100644 --- a/man2/mlock.2 +++ b/man2/mlock.2 @@ -45,7 +45,7 @@ and lock part or all of the calling process's virtual address space into RAM, preventing that memory from being paged to the swap area. - +.PP .BR munlock () and .BR munlockall () @@ -53,7 +53,7 @@ perform the converse operation, unlocking part or all of the calling process's virtual address space, so that pages in the specified virtual address range may once more to be swapped out if required by the kernel memory manager. - +.PP Memory locking and unlocking are performed in units of whole pages. .SS mlock(), mlock2(), and munlock() .BR mlock () @@ -65,7 +65,7 @@ bytes. All pages that contain a part of the specified address range are guaranteed to be resident in RAM when the call returns successfully; the pages are guaranteed to stay in RAM until later unlocked. - +.PP .BR mlock2 () .\" commit a8ca5d0ecbdde5cc3d7accacbd69968b0c98764e .\" commit de60f5f10c58d4f34b68622442c0e04180367f3f @@ -79,7 +79,7 @@ However, the state of the pages contained in that range after the call returns successfully will depend on the value in the .I flags argument. 
- +.PP The .I flags argument can be either 0 or the following constant: @@ -88,19 +88,19 @@ argument can be either 0 or the following constant: Lock pages that are currently resident and mark the entire range to have pages locked when they are populated by the page fault. .PP - +.PP If .I flags is 0, .BR mlock2 () behaves exactly the same as .BR mlock (). - +.PP Note: currently, there is not a glibc wrapper for .BR mlock2 (), so it will need to be invoked using .BR syscall (2). - +.PP .BR munlock () unlocks pages in the address range starting at .I addr @@ -119,7 +119,7 @@ memory, and memory-mapped files. All mapped pages are guaranteed to be resident in RAM when the call returns successfully; the pages are guaranteed to stay in RAM until later unlocked. - +.PP The .I flags argument is constructed as the bitwise OR of one or more of the @@ -175,7 +175,7 @@ In the same circumstances, stack growth may likewise fail: the kernel will deny stack expansion and deliver a .B SIGSEGV signal to the process. - +.PP .BR munlockall () unlocks all pages mapped into the address space of the calling process. @@ -275,7 +275,7 @@ For is available since Linux 4.4. .SH CONFORMING TO POSIX.1-2001, POSIX.1-2008, SVr4. - +.PP mlock2 () is Linux specific. .SH AVAILABILITY @@ -290,7 +290,7 @@ can be determined from the constant .B PAGESIZE (if defined) in \fI<limits.h>\fP or by calling .IR sysconf(_SC_PAGESIZE) . - +.PP On POSIX systems on which .BR mlockall () and @@ -321,7 +321,7 @@ software has erased the secrets in RAM and terminated. (But be aware that the suspend mode on laptops and some desktop computers will save a copy of the system's RAM to disk, regardless of memory locks.) - +.PP Real-time processes that are using .BR mlockall () to prevent delays on page faults should reserve enough @@ -334,7 +334,7 @@ This way, enough pages will be mapped for the stack and can be locked into RAM. The dummy writes ensure that not even copy-on-write page faults can occur in the critical section. 
- +.PP Memory locks are not inherited by a child created via .BR fork (2) and are automatically removed (unlocked) during an @@ -349,7 +349,7 @@ settings are not inherited by a child created via .BR fork (2) and are cleared during an .BR execve (2). - +.PP Note that .BR fork (2) will prepare the address space for a copy-on-write operation. @@ -363,11 +363,11 @@ or .BR mlock () operation\(emnot even from a thread which runs at a low priority within a process which also has a thread running at elevated priority. - +.PP The memory lock on an address range is automatically removed if the address range is unmapped via .BR munmap (2). - +.PP Memory locks do not stack, that is, pages which have been locked several times by calls to .BR mlock (), @@ -381,7 +381,7 @@ for the corresponding range or by Pages which are mapped to several locations or by several processes stay locked into RAM as long as they are locked at least at one location or by at least one process. - +.PP If a call to .BR mlockall () which uses the @@ -390,7 +390,7 @@ flag is followed by another call that does not specify this flag, the changes made by the .B MCL_FUTURE call will be lost. - +.PP The .BR mlock2 () .B MLOCK_ONFAULT @@ -417,7 +417,7 @@ and allows an implementation to require that .I addr is page aligned, so portable applications should ensure this. - +.PP The .I VmLck field of the Linux-specific @@ -438,7 +438,7 @@ a process must be privileged in order to lock memory and the .B RLIMIT_MEMLOCK soft resource limit defines a limit on how much memory the process may lock. - +.PP Since Linux 2.6.9, no limits are placed on the amount of memory that a privileged process can lock and the .B RLIMIT_MEMLOCK @@ -467,7 +467,7 @@ would fail on requests that should have succeeded. 
This bug was fixed .\" commit 0cf2f6f6dc605e587d2c1120f295934c77e810e8 in Linux 4.9 - +.PP In the 2.4 series Linux kernels up to and including 2.4.17, a bug caused the .BR mlockall () @@ -475,7 +475,7 @@ a bug caused the flag to be inherited across a .BR fork (2). This was rectified in kernel 2.4.18. - +.PP Since kernel 2.6.9, if a privileged process calls .I mlockall(MCL_FUTURE) and later drops privileges (loses the diff --git a/man2/mmap.2 b/man2/mmap.2 index 9ff2a5859..c762d3be2 100644 --- a/man2/mmap.2 +++ b/man2/mmap.2 @@ -60,7 +60,7 @@ The starting address for the new mapping is specified in The .I length argument specifies the length of the mapping. - +.PP If .I addr is NULL, @@ -74,7 +74,7 @@ on Linux, the mapping will be created at a nearby page boundary. .\" Before Linux 2.6.24, the address was rounded up to the next page .\" boundary; since 2.6.24, it is rounded down! The address of the new mapping is returned as the result of the call. - +.PP The contents of a file mapping (as opposed to an anonymous mapping; see .B MAP_ANONYMOUS below), are initialized using @@ -135,7 +135,7 @@ It is unspecified whether changes made to the file after the call are visible in the mapped region. .LP Both of these flags are described in POSIX.1-2001 and POSIX.1-2008. - +.PP In addition, zero or more of the following values can be ORed in .IR flags : .TP @@ -251,7 +251,7 @@ Used in conjunction with .B MAP_HUGETLB to select alternative hugetlb page sizes (respectively, 2 MB and 1 GB) on systems that support multiple hugetlb page sizes. - +.IP More generally, the desired huge page size can be configured by encoding the base-2 logarithm of the desired page size in the six bits at the offset .BR MAP_HUGE_SHIFT . @@ -261,14 +261,14 @@ the default huge page size can be discovered vie the field exposed by .IR /proc/meminfo .) 
Thus, the above two constants are defined as: - +.IP .nf .in +4n #define MAP_HUGE_2MB (21 << MAP_HUGE_SHIFT) #define MAP_HUGE_1GB (30 << MAP_HUGE_SHIFT) .in .fi - +.IP The range of huge page sizes that are supported by the system can be discovered by listing the subdirectories in .IR /sys/kernel/mm/hugepages . @@ -411,7 +411,7 @@ On error, the value is returned, and .I errno is set to indicate the cause of the error. - +.PP On success, .BR munmap () returns 0. @@ -577,7 +577,7 @@ or not. Portable programs should always set .B PROT_EXEC if they intend to execute code in the new mapping. - +.PP The portable way to create a mapping is to specify .I addr as 0 (NULL), and omit @@ -592,7 +592,7 @@ If the flag is specified, and .I addr is 0 (NULL), then the mapped address will be 0 (NULL). - +.PP Certain .I flags constants are defined only if suitable feature test macros are defined @@ -625,7 +625,7 @@ The relevant flags are: .BR MAP_POPULATE , and .BR MAP_STACK . - +.PP An application can determine which pages of a mapping are currently resident in the buffer/page cache using .BR mincore (2). @@ -662,7 +662,7 @@ and .BR munmap () differ somewhat from the requirements for mappings that use the native system page size. - +.PP For .BR mmap (), .I offset @@ -670,7 +670,7 @@ must be a multiple of the underlying huge page size. The system automatically aligns .I length to be a multiple of the underlying huge page size. - +.PP For .BR munmap (), .I addr @@ -698,14 +698,14 @@ On Linux, there are no guarantees like those suggested above under .BR MAP_NORESERVE . By default, any process can be killed at any moment when the system runs out of memory. - +.PP In kernels before 2.6.7, the .B MAP_POPULATE flag has effect only if .I prot is specified as .BR PROT_NONE . - +.PP SUSv3 specifies that .BR mmap () should fail if @@ -720,7 +720,7 @@ Since kernel 2.6.12, fails with the error .B EINVAL for this case. 
- +.PP POSIX specifies that the system shall always zero fill any partial page at the end of the object and that system will never write any modification of the @@ -837,14 +837,14 @@ main(int argc, char *argv[]) .BR userfaultfd (2), .BR shm_open (3), .BR shm_overview (7) - +.PP The descriptions of the following files in .BR proc (5): .IR /proc/[pid]/maps , .IR /proc/[pid]/map_files , and .IR /proc/[pid]/smaps . - +.PP B.O. Gallmeister, POSIX.4, O'Reilly, pp. 128-129 and 389-391. .\" .\" Repeat after me: private read-only mappings are 100% equivalent to diff --git a/man2/mmap2.2 b/man2/mmap2.2 index 7cd94a01b..7e99d781c 100644 --- a/man2/mmap2.2 +++ b/man2/mmap2.2 @@ -40,7 +40,7 @@ mmap2 \- map files or devices into memory This is probably not the system call that you are interested in; instead, see .BR mmap (2), which describes the glibc wrapper function that invokes this system call. - +.PP The .BR mmap2 () system call provides the same interface as @@ -83,9 +83,9 @@ the glibc wrapper function invokes this system call rather than the .BR mmap (2) system call. - +.PP This system call does not exist on x86-64. - +.PP On ia64, the unit for .I offset is actually the system page size, rather than 4096 bytes. diff --git a/man2/modify_ldt.2 b/man2/modify_ldt.2 index 858ddfa9b..256d83c2d 100644 --- a/man2/modify_ldt.2 +++ b/man2/modify_ldt.2 @@ -70,7 +70,7 @@ structure and .I bytecount must equal the size of this structure. - +.PP The .I user_desc structure is defined in \fI\fP as: diff --git a/man2/move_pages.2 b/man2/move_pages.2 index ca0fbaed7..dc3892a52 100644 --- a/man2/move_pages.2 +++ b/man2/move_pages.2 @@ -42,7 +42,7 @@ The result of the move is reflected in The .I flags indicate constraints on the pages to be moved. - +.PP .I pid is the ID of the process in which pages are to be moved. To move pages in another process, @@ -55,7 +55,7 @@ If is 0, then .BR move_pages () moves pages of the calling process. - +.PP .I count is the number of pages to move. 
It defines the size of the three arrays @@ -63,7 +63,7 @@ It defines the size of the three arrays .IR nodes , and .IR status . - +.PP .I pages is an array of pointers to the pages that should be moved. These are pointers that should be aligned to page boundaries. @@ -71,7 +71,7 @@ These are pointers that should be aligned to page boundaries. .\" not aligned to page boundaries Addresses are specified as seen by the process specified by .IR pid . - +.PP .I nodes is an array of integers that specify the desired location for each page. Each element in the array is a node number. @@ -84,13 +84,13 @@ where each page currently resides, in the array. Obtaining the status of each page may be necessary to determine pages that need to be moved. - +.PP .I status is an array of integers that return the status of each page. The array contains valid values only if .BR move_pages () did not return an error. - +.PP .I flags specify what types of pages to move. .B MPOL_MF_MOVE @@ -198,7 +198,7 @@ This system call is Linux-specific. .SH NOTES For information on library support, see .BR numa (7). - +.PP Use .BR get_mempolicy (2) with the @@ -209,7 +209,7 @@ flag to obtain the set of nodes that are allowed by the current cpuset. Note that this information is subject to change at any time by manual or automatic reconfiguration of the cpuset. - +.PP Use of this function may result in pages whose location (node) violates the memory policy established for the specified addresses (See @@ -219,7 +219,7 @@ and/or the specified process (See That is, memory policy does not constrain the destination nodes used by .BR move_pages (). - +.PP The .I header is not included with glibc, but requires installing diff --git a/man2/mprotect.2 b/man2/mprotect.2 index 6358b2b27..9a64965d2 100644 --- a/man2/mprotect.2 +++ b/man2/mprotect.2 @@ -47,7 +47,7 @@ containing any part of the address range in the interval [\fIaddr\fP,\ \fIaddr\fP+\fIlen\fP\-1]. .I addr must be aligned to a page boundary. 
- +.PP If the calling process tries to access memory in a manner that violates the protections, then the kernel generates a .B SIGSEGV @@ -218,7 +218,7 @@ POSIX says that the behavior of is unspecified if it is applied to a region of memory that was not obtained via .BR mmap (2). - +.PP .BR pkey_mprotect () is a nonportable Linux extension. .SH NOTES @@ -228,7 +228,7 @@ on any address in a process's address space (except for the kernel vsyscall area). In particular, it can be used to change existing code mappings to be writable. - +.PP Whether .B PROT_EXEC has any effect different from @@ -242,12 +242,12 @@ specifying .B PROT_READ will implicitly add .BR PROT_EXEC. - +.PP On some hardware architectures (e.g., i386), .B PROT_WRITE implies .BR PROT_READ . - +.PP POSIX.1 says that an implementation may permit access other than that specified in .IR prot , @@ -256,7 +256,7 @@ but at a minimum can allow write access only if has been set, and must not allow any access if .B PROT_NONE has been set. - +.PP Applications should be careful when mixing use of .BR mprotect () and @@ -269,7 +269,7 @@ set to .B PROT_EXEC a pkey is may be allocated and set on the memory implicitly by the kernel, but only when the pkey was 0 previously. - +.PP On systems that do not support protection keys in hardware, .BR pkey_mprotect () may still be used, but @@ -287,10 +287,10 @@ The program below demonstrates the use of The program allocates four pages of memory, makes the third of these pages read-only, and then executes a loop that walks upward through the allocated region modifying bytes. - +.PP An example of what we might see when running the program is the following: - +.PP .in +4n .nf .RB "$" " ./a.out" diff --git a/man2/mq_getsetattr.2 b/man2/mq_getsetattr.2 index e4676f6e6..316fcca07 100644 --- a/man2/mq_getsetattr.2 +++ b/man2/mq_getsetattr.2 @@ -39,7 +39,7 @@ mq_getsetattr \- get/set message queue attributes There is no glibc wrapper for this system call; see NOTES. 
.SH DESCRIPTION Do not use this system call. - +.PP This is the low-level system call used to implement .BR mq_getattr (3) and diff --git a/man2/mremap.2 b/man2/mremap.2 index b7ced563d..82cca0570 100644 --- a/man2/mremap.2 +++ b/man2/mremap.2 @@ -44,7 +44,7 @@ mremap \- remap a virtual memory address expands (or shrinks) an existing memory mapping, potentially moving it at the same time (controlled by the \fIflags\fP argument and the available virtual address space). - +.PP \fIold_address\fP is the old address of the virtual memory block that you want to expand (or shrink). Note that \fIold_address\fP has to be page @@ -58,7 +58,7 @@ An optional fifth argument, may be provided; see the description of .B MREMAP_FIXED below. - +.PP In Linux the memory is divided into pages. A user process has (one or) several linear virtual memory segments. @@ -70,7 +70,7 @@ a segmentation violation if the memory is accessed incorrectly (e.g., writing to a read-only segment). Accessing virtual memory outside of the segments will also cause a segmentation violation. - +.PP .BR mremap () uses the Linux page table scheme. .BR mremap () @@ -78,7 +78,7 @@ changes the mapping between virtual addresses and memory pages. This can be used to implement a very efficient .BR realloc (3). - +.PP The \fIflags\fP bit-mask argument may be 0, or include the following flag: .TP .B MREMAP_MAYMOVE @@ -196,7 +196,7 @@ and the prototype for did not allow for the .I new_address argument. - +.PP If .BR mremap () is used to move or expand an area locked with @@ -216,7 +216,7 @@ if the area cannot be populated. .BR sbrk (2), .BR malloc (3), .BR realloc (3) - +.PP Your favorite text book on operating systems for more information on paged memory (e.g., \fIModern Operating Systems\fP by Andrew S. 
Tanenbaum, diff --git a/man2/msgctl.2 b/man2/msgctl.2 index 0056fd158..620c01453 100644 --- a/man2/msgctl.2 +++ b/man2/msgctl.2 @@ -247,7 +247,7 @@ A successful .B MSG_STAT operation returns the identifier of the queue whose index was given in .IR msqid . - +.PP On error, \-1 is returned with .I errno indicating the error. @@ -338,7 +338,7 @@ Applications intended to be portable to such old systems may need to include these header files. .\" Like Linux, the FreeBSD man pages still document .\" the inclusion of these header files. - +.PP The .BR IPC_INFO , .B MSG_STAT @@ -350,7 +350,7 @@ program to provide information on allocated resources. In the future these may modified or moved to a .I /proc filesystem interface. - +.PP Various fields in the \fIstruct msqid_ds\fP were typed as .I short diff --git a/man2/msgget.2 b/man2/msgget.2 index edaed1457..469afe0e4 100644 --- a/man2/msgget.2 +++ b/man2/msgget.2 @@ -194,7 +194,7 @@ Applications intended to be portable to such old systems may need to include these header files. .\" Like Linux, the FreeBSD man pages still document .\" the inclusion of these header files. - +.PP .B IPC_PRIVATE isn't a flag field but a .I key_t diff --git a/man2/msgop.2 b/man2/msgop.2 index a8be94605..8c760d231 100644 --- a/man2/msgop.2 +++ b/man2/msgop.2 @@ -138,7 +138,7 @@ is specified in .IR msgflg , then the call instead fails with the error .BR EAGAIN . - +.PP A blocked .BR msgsnd () call may also fail if: @@ -262,7 +262,7 @@ Nondestructively fetch a copy of the message at the ordinal position in the queue specified by .I msgtyp (messages are considered to be numbered starting at 0). - +.IP This flag must be specified in conjunction with .BR IPC_NOWAIT , with the result that, if there is no message available at the given position, @@ -276,7 +276,7 @@ and .BR MSG_EXCEPT may not both be specified in .IR msgflg . 
- +.IP The .BR MSG_COPY flag was added for the implementation of @@ -473,7 +473,7 @@ and this kernel was configured without .BR CONFIG_CHECKPOINT_RESTORE . .SH CONFORMING TO POSIX.1-2001, POSIX.1-2008, SVr4. - +.PP The .B MSG_EXCEPT and @@ -496,14 +496,14 @@ Applications intended to be portable to such old systems may need to include these header files. .\" Like Linux, the FreeBSD man pages still document .\" the inclusion of these header files. - +.PP The .I msgp argument is declared as \fIstruct msgbuf\ *\fP in glibc 2.0 and 2.1. It is declared as \fIvoid\ *\fP in glibc 2.2 and later, as required by SUSv2 and SUSv3. - +.PP The following limits on message queue resources affect the .BR msgsnd () call: @@ -554,7 +554,7 @@ of whether that message was at the ordinal position This bug is fixed .\" commit 4f87dac386cc43d5525da7a939d4b4e7edbea22c in Linux 3.14. - +.PP Specifying both .B MSG_COPY and @@ -575,11 +575,11 @@ The program below demonstrates the use of .BR msgsnd () and .BR msgrcv (). - +.PP The example program is first run with the \fB\-s\fP option to send a message and then run again with the \fB\-r\fP option to receive a message. - +.PP The following shell session shows a sample run of the program: .in +4n .nf diff --git a/man2/msync.2 b/man2/msync.2 index 98f04cb02..4d16ee493 100644 --- a/man2/msync.2 +++ b/man2/msync.2 @@ -45,7 +45,7 @@ corresponds to the memory area starting at and having length .I length is updated. - +.PP The .I flags argument should specify exactly one of @@ -98,7 +98,7 @@ are set in The indicated memory (or part of it) was not mapped. .SH CONFORMING TO POSIX.1-2001, POSIX.1-2008. - +.PP This call was introduced in Linux 1.3.21, and then used .B EFAULT instead of @@ -149,5 +149,5 @@ in .IR flags . .SH SEE ALSO .BR mmap (2) - +.PP B.O. Gallmeister, POSIX.4, O'Reilly, pp. 128-129 and 389-391. 
diff --git a/man2/nanosleep.2 b/man2/nanosleep.2 index bd59867a8..e716ceec9 100644 --- a/man2/nanosleep.2 +++ b/man2/nanosleep.2 @@ -54,7 +54,7 @@ until either at least the time specified in has elapsed, or the delivery of a signal that triggers the invocation of a handler in the calling thread or that terminates the process. - +.PP If the call is interrupted by a signal handler, .BR nanosleep () returns \-1, sets @@ -71,7 +71,7 @@ The value of can then be used to call .BR nanosleep () again and complete the specified pause (but see NOTES). - +.PP The structure .I timespec is used to specify intervals of time with nanosecond precision. @@ -87,7 +87,7 @@ struct timespec { .in .PP The value of the nanoseconds field must be in the range 0 to 999999999. - +.PP Compared to .BR sleep (3) and @@ -139,7 +139,7 @@ is not an exact multiple of the granularity underlying clock (see then the interval will be rounded up to the next multiple. Furthermore, after the sleep completes, there may still be a delay before the CPU becomes free to once again execute the calling thread. - +.PP The fact that .BR nanosleep () sleeps for a relative interval can be problematic if the call @@ -149,7 +149,7 @@ will lead to drift in the time when the sleep finally completes. This problem can be avoided by using .BR clock_nanosleep (2) with an absolute time value. - +.PP POSIX.1 specifies that .BR nanosleep () should measure time against the @@ -212,7 +212,7 @@ To avoid such problems, use with the .BR TIMER_ABSTIME flag to sleep to an absolute deadline. - +.PP In Linux 2.4, if .BR nanosleep () is stopped by a signal (e.g., diff --git a/man2/nfsservctl.2 b/man2/nfsservctl.2 index 32cd63f9e..7dcd4d383 100644 --- a/man2/nfsservctl.2 +++ b/man2/nfsservctl.2 @@ -19,7 +19,7 @@ It has been replaced by a set of files in the .I nfsd filesystem; see .BR nfsd (7). - +.PP .nf /* * These are the commands understood by nfsctl(). 
diff --git a/man2/nice.2 b/man2/nice.2 index 24422b9e8..dc7d85537 100644 --- a/man2/nice.2 +++ b/man2/nice.2 @@ -51,10 +51,10 @@ adds .I inc to the nice value for the calling thread. (A higher nice value means a low priority.) - +.PP The range of the nice value is +19 (low priority) to \-20 (high priority). Attempts to set a nice value outside the range are clamped to the range. - +.PP Traditionally, only a privileged process could lower the nice value (i.e., set a higher priority). However, since Linux 2.6.12, an unprivileged process can decrease @@ -68,7 +68,7 @@ On success, the new nice value is returned (but see NOTES below). On error, \-1 is returned, and .I errno is set appropriately. - +.PP A successful call can legitimately return \-1. To detect an error, set .I errno @@ -99,7 +99,7 @@ However, the raw system call and (g)libc .SH NOTES For further details on the nice value, see .BR sched (7). - +.PP .IR Note : the addition of the "autogroup" feature in Linux 2.6.38 means that the nice value no longer has its traditional effect in many circumstances. @@ -114,7 +114,7 @@ However, the raw Linux system call returns 0 on success. Likewise, the .BR nice () wrapper function provided in glibc 2.2.3 and earlier returns 0 on success. - +.PP Since glibc 2.2.4, the .BR nice () wrapper function provided by glibc provides conformance to POSIX.1 by calling diff --git a/man2/open_by_handle_at.2 b/man2/open_by_handle_at.2 index 3b326bba4..e807155b6 100644 --- a/man2/open_by_handle_at.2 +++ b/man2/open_by_handle_at.2 @@ -68,7 +68,7 @@ arguments. The file handle is returned via the argument .IR handle , which is a pointer to a structure of the following form: - +.PP .in +4n .nf struct file_handle { @@ -96,7 +96,7 @@ Upon successful return, the .IR handle_bytes field is updated to contain the number of bytes actually written to .IR f_handle . 
- +.PP The caller can discover the required size for the .I file_handle structure by making a call in which @@ -109,7 +109,7 @@ and is set to indicate the required size; the caller can then use this information to allocate a structure of the correct size (see EXAMPLE below). - +.PP Other than the use of the .IR handle_bytes field, the caller should treat the @@ -120,7 +120,7 @@ and .IR f_handle fields are needed only by a subsequent call to .BR open_by_handle_at (). - +.PP The .I flags argument is a bit mask constructed by ORing together zero or more of @@ -128,7 +128,7 @@ argument is a bit mask constructed by ORing together zero or more of and .BR AT_SYMLINK_FOLLOW , described below. - +.PP Together, the .I pathname and @@ -193,7 +193,7 @@ Opening the pathname in the fifth field of that record yields a file descriptor for the mount point; that file descriptor can be used in a subsequent call to .BR open_by_handle_at (). - +.PP By default, .BR name_to_handle_at () does not dereference @@ -213,7 +213,7 @@ system call opens the file referred to by .IR handle , a file handle returned by a previous call to .BR name_to_handle_at (). - +.PP The .IR mount_fd argument is a file descriptor for any object (file, directory, etc.) @@ -223,7 +223,7 @@ should be interpreted. The special value .B AT_FDCWD can be specified, meaning the current working directory of the caller. - +.PP The .I flags argument @@ -236,7 +236,7 @@ refers to a symbolic link, the caller must specify the flag, and the symbolic link is not dereferenced; the .B O_NOFOLLOW flag, if specified, is ignored. - +.PP The caller must have the .B CAP_DAC_READ_SEARCH capability to invoke @@ -248,7 +248,7 @@ returns 0, and .BR open_by_handle_at () returns a nonnegative file descriptor. - +.PP In the event of an error, both system calls return \-1 and set .I errno to indicate the cause of the error. @@ -259,7 +259,7 @@ and can fail for the same errors as .BR openat (2). 
In addition, they can fail with the errors noted below. - +.PP .BR name_to_handle_at () can fail with the following errors: .TP @@ -350,7 +350,7 @@ These system calls first appeared in Linux 2.6.39. Library support is provided in glibc since version 2.14. .SH CONFORMING TO These system calls are nonstandard Linux extensions. - +.PP FreeBSD has a broadly similar pair of system calls in the form of .BR getfh () and @@ -360,20 +360,20 @@ A file handle can be generated in one process using .BR name_to_handle_at () and later used in a different process that calls .BR open_by_handle_at (). - +.PP Some filesystem don't support the translation of pathnames to file handles, for example, .IR /proc , .IR /sys , and various network filesystems. - +.PP A file handle may become invalid ("stale") if a file is deleted, or for other filesystem-specific reasons. Invalid handles are notified by an .B ESTALE error from .BR open_by_handle_at (). - +.PP These system calls are designed for use by user-space file servers. For example, a user-space NFS server might generate a file handle and pass it to an NFS client. @@ -383,7 +383,7 @@ it could pass the handle back to the server. .\" "Open by handle" - Jonathan Corbet, 2010-02-23 This sort of functionality allows a user-space file server to operate in a stateless fashion with respect to the files it serves. - +.PP If .I pathname refers to a symbolic link and @@ -419,7 +419,7 @@ However, an application can use the information in the .I mountinfo record that corresponds to the mount ID to derive a persistent identifier. - +.PP For example, one can use the device name in the fifth field of the .I mountinfo record to search for the corresponding device UUID via the symbolic links in @@ -447,7 +447,7 @@ uses to obtain the file handle and mount ID for the file specified in its command-line argument; the handle and mount ID are written to standard output. 
- +.PP The second program .RI ( t_open_by_handle_at.c ) reads a mount ID and file handle from standard input. @@ -467,9 +467,9 @@ to find a record whose mount ID matches the mount ID read from standard input, and the mount directory specified in that record is opened. (These programs do not deal with the fact that mount IDs are not persistent.) - +.PP The following shell session demonstrates the use of these two programs: - +.PP .in +4n .nf $ \fBecho 'Can you please think about it?' > cecilia.txt\fP @@ -481,7 +481,7 @@ Read 31 bytes $ \fBrm cecilia.txt\fP .fi .in - +.PP Now we delete and (quickly) re-create the file so that it has the same content and (by chance) the same inode. Nevertheless, @@ -490,7 +490,7 @@ Nevertheless, .\" counter that gets incremented in this case. recognizes that the original file referred to by the file handle no longer exists. - +.PP .in +4n .nf $ \fBstat \-\-printf="%i\\n" cecilia.txt\fP # Display inode number @@ -722,7 +722,7 @@ main(int argc, char *argv[]) .BR blkid (8), .BR findfs (8), .BR mount (8) - +.PP The .I libblkid and diff --git a/man2/outb.2 b/man2/outb.2 index 0bf1caa3e..b8aa80ea0 100644 --- a/man2/outb.2 +++ b/man2/outb.2 @@ -70,12 +70,12 @@ but can be used from user space. .\" , given the following information .\" in addition to that given in .\" .BR outb (9). - +.PP You must compile with \fB\-O\fP or \fB\-O2\fP or similar. The functions are defined as inline macros, and will not be substituted in without optimization enabled, causing unresolved references at link time. - +.PP You use .BR ioperm (2) or alternatively diff --git a/man2/perf_event_open.2 b/man2/perf_event_open.2 index 16b412007..3bce742c6 100644 --- a/man2/perf_event_open.2 +++ b/man2/perf_event_open.2 @@ -113,7 +113,7 @@ is governed by a ptrace access mode .B PTRACE_MODE_READ_REALCREDS check; see .BR ptrace (2). - +.PP The .I group_fd argument allows event groups to be created. 
@@ -199,7 +199,7 @@ The .I perf_event_attr structure provides detailed configuration information for the event being created. - +.PP .in +4n .nf struct perf_event_attr { @@ -276,7 +276,7 @@ struct perf_event_attr { }; .fi .in - +.PP The fields of the .I perf_event_attr structure are described in more detail below: @@ -346,7 +346,7 @@ Set this using .I sizeof(struct perf_event_attr) to allow the kernel to see the struct size at the time of compilation. - +.IP The related define .B PERF_ATTR_SIZE_VER0 is set to 64; this was the size of the first published struct. @@ -387,7 +387,7 @@ The fields are also taken into account in cases where 64 bits is not enough to fully specify the event. The encoding of these fields are event dependent. - +.IP There are various ways to set the .I config field that are dependent on the value of the previously @@ -398,7 +398,7 @@ What follows are various possible settings for .I config separated out by .IR type . - +.IP If .I type is @@ -517,7 +517,7 @@ must be associated with an active event. This dummy event allows gathering such records without requiring a counting event. .RE - +.PP .RS If .I type @@ -530,7 +530,7 @@ can be obtained from under debugfs .I tracing/events/*/*/id if ftrace is enabled in the kernel. .RE - +.PP .RS If .I type @@ -602,7 +602,7 @@ to measure accesses to measure misses .RE .RE - +.PP If .I type is @@ -618,7 +618,7 @@ The libpfm4 library can be used to translate from the name in the architectural manuals to the raw hex value .BR perf_event_open () expects in this field. - +.PP If .I type is @@ -640,7 +640,7 @@ in the mmap buffer. The .I sample_type field controls what data is recorded on each overflow. - +.IP .I sample_freq can be used if you wish to use frequency rather than period. In this case, you set the @@ -708,7 +708,7 @@ Usually returned by tracepoint events. This provides a record of recent branches, as provided by CPU branch sampling hardware (such as Intel Last Branch Record). 
Not all hardware supports this feature. - +.IP See the .I branch_sample_type field for how to filter which branches are reported. @@ -743,7 +743,7 @@ Places the value in a fixed position in the record, either at the beginning (for sample events) or at the end (if a non-sample event). - +.IP This was necessary because a sample stream may have records from various different event sources with different .I sample_type @@ -755,7 +755,7 @@ but the format could not be found without knowing what event the sample belonged to (causing a circular dependency). - +.IP The .B PERF_SAMPLE_IDENTIFIER setting makes the event stream always parsable @@ -770,7 +770,7 @@ values in records. .\" commit fdfbbd07e91f8fe387140776f3fd94605f0c89e5 Records reasons for transactional memory abort events (for example, from Intel TSX transactional memory support). - +.IP The .I precise_ip setting must be greater than 0 and a transactional memory abort @@ -836,7 +836,7 @@ If disabled, the event can later be enabled by .BR prctl (2), or .IR enable_on_exec . - +.IP When creating an event group, typically the group leader is initialized with .I disabled @@ -856,7 +856,7 @@ tasks as well as the task specified. This applies only to new children, not to any existing children at the time the counter is created (nor to any new children of existing children). - +.IP Inherit does not work for some combinations of .IR read_format values, such as @@ -883,7 +883,7 @@ it should be the only group using the CPU's counters. In the future this may allow monitoring programs to support PMU features that need to run alone so that they do not disrupt other hardware counters. - +.IP Note that many unexpected situations may prevent events with the .I exclusive bit set from ever running. @@ -982,7 +982,7 @@ Smaller skid is better and allows more accurate reporting of which events correspond to which instructions, but hardware is often limited with how small this can be. 
- +.IP The possible values of this field are the following: .RS .IP 0 3 @@ -1022,7 +1022,7 @@ additionally be included in if the corresponding .I sample_type is selected. - +.IP If .B PERF_SAMPLE_IDENTIFIER is specified, then an additional ID value is included @@ -1030,9 +1030,9 @@ as the last value to ease parsing the record stream. This may lead to the .I id value appearing twice. - +.IP The layout is described by this pseudo-structure: - +.IP .in +4n .nf struct sample_id { @@ -1137,7 +1137,7 @@ happen before an overflow notification happens. Which one is used is selected by the .I watermark bit flag. - +.IP .I wakeup_events counts only .B PERF_RECORD_SAMPLE @@ -1147,7 +1147,7 @@ To receive overflow notification for all types choose watermark and set .I wakeup_watermark to 1. - +.IP Prior to Linux 3.0, setting .\" commit f506b3dc0ec454a16d40cab9ee5d75435b39dc50 .I wakeup_events @@ -1229,7 +1229,7 @@ If .B PERF_SAMPLE_BRANCH_STACK is enabled, then this specifies what branches to include in the branch record. - +.IP The first part of the value is the privilege level, which is a combination of one of the values listed below. If the user does not set privilege level explicitly, the kernel @@ -1348,18 +1348,18 @@ The values that are there are specified by the field in the .I attr structure at open time. - +.PP If you attempt to read into a buffer that is not big enough to hold the data, the error .B ENOSPC results. - +.PP Here is the layout of the data returned by a read: .IP * 2 If .B PERF_FORMAT_GROUP was specified to allow reading all events in a group at once: - +.IP .in +4n .nf struct read_format { @@ -1379,7 +1379,7 @@ If was .I not specified: - +.IP .in +4n .nf struct read_format { @@ -1429,18 +1429,18 @@ mmap tracking) are logged into a ring-buffer. This ring-buffer is created and accessed through .BR mmap (2). 
- +.PP The mmap size should be 1+2^n pages, where the first page is a metadata page .RI ( "struct perf_event_mmap_page" ) that contains various bits of information such as where the ring-buffer head is. - +.PP Before kernel 2.6.39, there is a bug that means you must allocate an mmap ring buffer when sampling even if you do not plan to access it. - +.PP The structure of the first metadata mmap page is as follows: - +.PP .in +4n .nf struct perf_event_mmap_page { @@ -1478,7 +1478,7 @@ struct perf_event_mmap_page { } .fi .in - +.PP The following list describes the fields in the .I perf_event_mmap_page structure in more detail: @@ -1519,7 +1519,7 @@ impossible to know if or .I cap_usr_rdpmc were actually set. - +.IP Starting with Linux 3.12, these are renamed to .\" commit fa7315871046b9a4c48627905691dbde57e51033 .I cap_bit0 @@ -1537,7 +1537,7 @@ the properly separated and .I cap_user_rdpmc bits. - +.IP If not-set, it indicates an older kernel where .I cap_usr_time and @@ -1550,7 +1550,7 @@ be used with caution. If the hardware supports user-space read of performance counters without syscall (this is the "rdpmc" instruction on x86), then the following code can be used to do a read: - +.IP .in +4n .nf u32 seq, time_mult, time_shift, idx, width; @@ -1601,7 +1601,7 @@ If this field provides the bit-width of the value read using the rdpmc or equivalent instruction. 
This can be used to sign extend the result like: - +.IP .in +4n .nf pmc <<= 64 \- pmc_width; @@ -1611,7 +1611,7 @@ count += pmc; .in .TP .IR time_shift ", " time_mult ", " time_offset - +.IP If .IR cap_usr_time , these fields can be used to compute the time @@ -1627,7 +1627,7 @@ delta since delta = time_offset + quot * time_mult + ((rem * time_mult) >> time_shift); .fi - +.IP Where .IR time_offset , .IR time_mult , @@ -1650,22 +1650,22 @@ enabled and possible running (if idx), improving the scaling: .TP .IR time_zero " (since Linux 3.12)" .\" commit fa7315871046b9a4c48627905691dbde57e51033 - +.IP If .I cap_usr_time_zero is set, then the hardware clock (the TSC timestamp counter on x86) can be calculated from the .IR time_zero ", " time_mult ", and " time_shift " values:" - +.IP .nf time = timestamp - time_zero; quot = time / time_mult; rem = time % time_mult; cyc = (quot << time_shift) + (rem << time_shift) / time_mult; .fi - +.IP And vice versa: - +.IP .nf quot = cyc >> time_shift; rem = cyc & (((u64)1 << time_shift) - 1); @@ -1678,7 +1678,7 @@ This points to the head of the data section. The value continuously increases, it does not wrap. The value needs to be manually wrapped by the size of the mmap buffer before accessing the samples. - +.IP On SMP-capable platforms, after reading the .I data_head value, @@ -1708,7 +1708,7 @@ The AUX region allows mmaping a separate sample buffer for high-bandwidth data streams (separate from the main perf sample buffer). An example of a high-bandwidth stream is instruction tracing support, as is found in newer Intel processors. - +.IP To set up an AUX area, first .I aux_offset needs to be set with an offset greater than @@ -1726,7 +1726,7 @@ resource limit (see and also as part of the .I perf_event_mlock_kb allowance. - +.IP By default, the AUX buffer will be truncated if it will not fit in the available space in the ring buffer. If the AUX buffer is mapped as a read only buffer, then it will @@ -1735,7 +1735,7 @@ by new. 
In overwrite mode, it might not be possible to infer where the new data began, and it is the consumer's job to disable measurement while reading to avoid possible data races. - +.IP The .IR aux_head " and " aux_tail ring buffer pointers have the same behavior and ordering @@ -1743,7 +1743,7 @@ rules as the previous described .IR data_head " and " data_tail . .PP The following 2^n ring-buffer pages have the layout described below. - +.PP If .I perf_event_attr.sample_id_all is set, then all event types will @@ -1757,9 +1757,9 @@ fields, that is, at the end of the payload. This allows a newer perf.data file to be supported by older perf tools, with the new optional fields being ignored. - +.PP The mmap values start with a header: - +.PP .in +4n .nf struct perf_event_header { @@ -1769,7 +1769,7 @@ struct perf_event_header { }; .fi .in - +.PP Below, we describe the .I perf_event_header fields in more detail. @@ -1783,7 +1783,7 @@ This indicates the size of the record. The .I misc field contains additional information about the sample. - +.IP The CPU mode can be determined from this value by masking with .B PERF_RECORD_MISC_CPUMODE_MASK and looking for one of the following (note these are not @@ -1810,7 +1810,7 @@ Sample happened in the guest kernel. .\" commit 39447b386c846bbf1c56f6403c5282837486200f Sample happened in guest user code. .RE - +.PP .RS Since the following three statuses are generated by different record types, they alias to the same bit: @@ -1839,7 +1839,7 @@ record is generated, this bit indicates that the context switch is away from the current process (instead of into the current process). .RE - +.PP .RS In addition, the following bits can be set: .TP @@ -1880,7 +1880,7 @@ The MMAP events record the mappings so that we can correlate user-space IPs to code. They have the following structure: - +.IP .in +4n .nf struct { @@ -1913,7 +1913,7 @@ is a string describing the backing of the allocated memory. 
.TP .B PERF_RECORD_LOST This record indicates when events are lost. - +.IP .in +4n .nf struct { @@ -1935,7 +1935,7 @@ is the number of events that were lost. .TP .B PERF_RECORD_COMM This record indicates a change in the process name. - +.IP .in +4n .nf struct { @@ -1961,7 +1961,7 @@ is a string containing the new name of the process. .TP .B PERF_RECORD_EXIT This record indicates a process exit event. - +.IP .in +4n .nf struct { @@ -1976,7 +1976,7 @@ struct { .TP .BR PERF_RECORD_THROTTLE ", " PERF_RECORD_UNTHROTTLE This record indicates a throttle/unthrottle event. - +.IP .in +4n .nf struct { @@ -1991,7 +1991,7 @@ struct { .TP .B PERF_RECORD_FORK This record indicates a fork event. - +.IP .in +4n .nf struct { @@ -2006,7 +2006,7 @@ struct { .TP .B PERF_RECORD_READ This record indicates a read event. - +.IP .in +4n .nf struct { @@ -2020,7 +2020,7 @@ struct { .TP .B PERF_RECORD_SAMPLE This record indicates a sample. - +.IP .in +4n .nf struct { @@ -2152,7 +2152,7 @@ If is enabled, then a 32-bit value indicating size is included followed by an array of 8-bit values of length size. The values are padded with 0 to have 64-bit alignment. - +.IP This RAW record data is opaque with respect to the ABI. The ABI doesn't make any promises with respect to the stability of its content, it may vary depending @@ -2195,7 +2195,7 @@ previous branch stack update. .P The entries are from most to least recent, so the first entry has the most recent branch. - +.PP Support for .IR mispred , .IR predicted , @@ -2203,7 +2203,7 @@ and .IR cycles is optional; if not supported, those values will be 0. - +.PP The type of branches recorded is specified by the .I branch_sample_type field. @@ -2213,13 +2213,13 @@ field. If .B PERF_SAMPLE_REGS_USER is enabled, then the user CPU registers are recorded. - +.IP The .I abi field is one of .BR PERF_SAMPLE_REGS_ABI_NONE ", " PERF_SAMPLE_REGS_ABI_32 " or " .BR PERF_SAMPLE_REGS_ABI_64 . 
- +.IP The .I regs field is an array of the CPU registers that were specified by @@ -2268,7 +2268,7 @@ the following fields: .TP 4 .I mem_op Type of opcode, a bitwise combination of: - +.IP .PD 0 .RS .TP 24 @@ -2293,7 +2293,7 @@ Executable code Memory hierarchy level hit or miss, a bitwise combination of the following, shifted left by .BR PERF_MEM_LVL_SHIFT : - +.IP .PD 0 .RS .TP 24 @@ -2344,7 +2344,7 @@ Uncached memory .I mem_snoop Snoop mode, a bitwise combination of the following, shifted left by .BR PERF_MEM_SNOOP_SHIFT : - +.IP .PD 0 .RS .TP 24 @@ -2368,7 +2368,7 @@ Snoop hit modified .I mem_lock Lock instruction, a bitwise combination of the following, shifted left by .BR PERF_MEM_LOCK_SHIFT : - +.IP .PD 0 .RS .TP 24 @@ -2384,7 +2384,7 @@ Locked transaction TLB access hit or miss, a bitwise combination of the following, shifted left by .BR PERF_MEM_TLB_SHIFT : - +.IP .PD 0 .RS .TP 24 @@ -2417,7 +2417,7 @@ If the .B PERF_SAMPLE_TRANSACTION flag is set, then a 64-bit field is recorded describing the sources of any transactional memory aborts. - +.IP The field is a bitwise combination of the following values: .RS .TP @@ -2456,7 +2456,7 @@ and masking with the value If .B PERF_SAMPLE_REGS_INTR is enabled, then the user CPU registers are recorded. - +.IP The .I abi field is one of @@ -2464,7 +2464,7 @@ field is one of .BR PERF_SAMPLE_REGS_ABI_32 , or .BR PERF_SAMPLE_REGS_ABI_64 . - +.IP The .I regs field is an array of the CPU registers that were specified by @@ -2484,7 +2484,7 @@ The format is similar to that of the .B PERF_RECORD_MMAP record, but includes extra values that allow uniquely identifying shared mappings. - +.IP .in +4n .nf struct { @@ -2547,7 +2547,7 @@ is a string describing the backing of the allocated memory. \" commit 68db7e98c3a6ebe7284b6cf14906ed7c55f3f7f0 This record reports that new data is available in the separate AUX buffer region. 
- +.IP .in +4n .nf struct { @@ -2585,7 +2585,7 @@ if set, then the data returned has overwritten previous data. This record indicates which process has initiated an instruction trace event, allowing tools to properly correlate the instruction addresses in the AUX buffer with the proper executable. - +.IP .in +4n .nf struct { @@ -2607,7 +2607,7 @@ thread ID of the thread starting an instruction trace. \" f38b0dbb491a6987e198aa6b428db8692a6480f8 When using hardware sampling (such as Intel PEBS) this record indicates some number of samples that may have been lost. - +.IP .in +4n .nf struct { @@ -2631,7 +2631,7 @@ bit in the .I misc field indicates whether it was a context switch into or away from the current process. - +.IP .in +4n .nf struct { @@ -2654,7 +2654,7 @@ bit in the .I misc field indicates whether it was a context switch into or away from the current process. - +.IP .in +4n .nf struct { @@ -2691,13 +2691,13 @@ and .BR F_SETSIG operations in .BR fcntl (2). - +.PP Overflows are generated only by sampling events .RI ( sample_period must have a nonzero value). - +.PP There are two ways to generate overflow notifications. - +.PP The first is to set a .I wakeup_events or @@ -2707,7 +2707,7 @@ or bytes have been written to the mmap ring buffer. In this case, .B POLL_IN is indicated. - +.PP The other way is by use of the .B PERF_EVENT_IOC_REFRESH ioctl. @@ -2719,13 +2719,13 @@ once the counter reaches 0 .B POLL_HUP is indicated and the underlying event is disabled. - +.PP Refreshing an event group leader refreshes all siblings and refreshing with a parameter of 0 currently enables infinite refreshes; these behaviors are unsupported and should not be relied on. .\" See https://lkml.org/lkml/2011/5/24/337 - +.PP Starting with Linux 3.18, .\" commit 179033b3e064d2cd3f5f9945e76b0a0f0fbf4883 .B POLL_HUP @@ -2739,12 +2739,12 @@ instruction to get low-latency reads without having to enter the kernel. 
Note that using .I rdpmc is not necessarily faster than other methods for reading event values. - +.PP Support for this can be detected with the .I cap_usr_rdpmc field in the mmap page; documentation on how to calculate event values can be found in that section. - +.PP Originally, when rdpmc support was enabled, any process (not just ones with an active perf event) could use the rdpmc instruction to access the counters. @@ -2763,7 +2763,7 @@ file descriptors: .B PERF_EVENT_IOC_ENABLE This enables the individual event or event group specified by the file descriptor argument. - +.IP If the .B PERF_IOC_FLAG_GROUP bit is set in the ioctl argument, then all events in a group are @@ -2773,14 +2773,14 @@ enabled, even if the event specified is not the group leader .B PERF_EVENT_IOC_DISABLE This disables the individual counter or event group specified by the file descriptor argument. - +.IP Enabling or disabling the leader of a group enables or disables the entire group; that is, while the group leader is disabled, none of the counters in the group will count. Enabling or disabling a member of a group other than the leader affects only that counter; disabling a non-leader stops that counter from counting but doesn't affect any other counter. - +.IP If the .B PERF_IOC_FLAG_GROUP bit is set in the ioctl argument, then all events in a group are @@ -2810,7 +2810,7 @@ multiplexing or .I time_running values. - +.IP If the .B PERF_IOC_FLAG_GROUP bit is set in the ioctl argument, then all events in a group are @@ -2819,7 +2819,7 @@ reset, even if the event specified is not the group leader .TP .B PERF_EVENT_IOC_PERIOD This updates the overflow period for the event. - +.IP Since Linux 3.7 (on ARM) .\" commit 3581fe0ef37ce12ac7a4f74831168352ae848edc and Linux 3.14 (all other architectures), @@ -2827,10 +2827,10 @@ and Linux 3.14 (all other architectures), the new period takes effect immediately. On older kernels, the new period did not take effect until after the next overflow. 
- +.IP The argument is a pointer to a 64-bit value containing the desired new period. - +.IP Prior to Linux 2.6.36, .\" commit ad0cf3478de8677f720ee06393b3147819568d6a this ioctl always failed due to a bug @@ -2840,20 +2840,20 @@ in the kernel. This tells the kernel to report event notifications to the specified file descriptor rather than the default one. The file descriptors must all be on the same CPU. - +.IP The argument specifies the desired file descriptor, or \-1 if output should be ignored. .TP .BR PERF_EVENT_IOC_SET_FILTER " (since Linux 2.6.33)" .\" commit 6fb2915df7f0747d9044da9dbff5b46dc2e20830 This adds an ftrace filter to this event. - +.IP The argument is a pointer to the desired ftrace filter. .TP .BR PERF_EVENT_IOC_ID " (since Linux 3.12)" .\" commit cf4957f17f2a89984915ea808876d9c82225b862 This returns the event ID value for the given event file descriptor. - +.IP The argument is a pointer to a 64-bit unsigned integer to hold the result. .TP @@ -2864,7 +2864,7 @@ program to an existing kprobe tracepoint event. You need .B CAP_SYS_ADMIN privileges to use this ioctl. - +.IP The argument is a BPF program file descriptor that was created by a previous .BR bpf (2) @@ -2883,7 +2883,7 @@ process has created on other processes. It enables or disables only the group leaders, not any other members in the groups. .SS perf_event related configuration files - +.PP Files in .I /proc/sys/kernel/ .RS 4 @@ -2892,7 +2892,7 @@ Files in The .I perf_event_paranoid file can be set to restrict access to the performance counters. - +.IP .PD 0 .RS .IP 2 4 @@ -2930,10 +2930,10 @@ Maximum number of pages an unprivileged user can .BR mlock (2). The default is 516 (kB). .RE - +.PP Files in .I /sys/bus/event_source/devices/ - +.PP .RS 4 Since Linux 2.6.34, the kernel supports having multiple PMUs available for monitoring. @@ -2954,7 +2954,7 @@ to indicate that you wish to use this PMU. 
If this file is 1, then direct user-space access to the performance counter registers is allowed via the rdpmc instruction. This can be disabled by echoing 0 to the file. - +.IP As of Linux 4.0 .\" a66734297f78707ce39d756b656bfae861d53f62 .\" 7911d3f7af14a614617e38245fedf98a724e46a9 @@ -2970,7 +2970,7 @@ subfields available for programming the various fields in the .I perf_event_attr struct. - +.IP The content of each file is the name of the config field, followed by a colon, followed by a series of integer bit ranges separated by commas. @@ -2991,7 +2991,7 @@ expressed in terms of the fields found in the previously mentioned directory. These are not necessarily complete lists of all events supported by a PMU, but usually a subset of events deemed useful or interesting. - +.IP The content of each file is a list of attribute names separated by commas. Each entry has an optional value (either hex or decimal). @@ -3158,7 +3158,7 @@ is larger than the maximum specified in Returned on many (but not all) architectures when an unsupported .IR exclude_hv ", " exclude_idle ", " exclude_user ", or " exclude_kernel setting is specified. - +.IP It can also happen, as with .BR EACCES , when the requested event requires @@ -3186,7 +3186,7 @@ and should not be used in programs intended to be portable. Glibc does not provide a wrapper for this system call; call it using .BR syscall (2). See the example below. - +.PP The official way of knowing if .BR perf_event_open () support is enabled is checking @@ -3200,7 +3200,7 @@ option to is needed to properly get overflow signals in threads. This was introduced in Linux 2.6.32. .\" commit ba0a6c9f6fceed11c6a99e8326f0477fe383e6b5 - +.PP Prior to Linux 2.6.33 (at least for x86), .\" commit b690081d4d3f6a23541493f1682835c3cd5c54a1 the kernel did not check @@ -3210,40 +3210,40 @@ This means to see if a given set of events works you have to .BR perf_event_open (), start, then read before you know for sure you can get valid measurements. 
- +.PP Prior to Linux 2.6.34, .\" FIXME . cannot find a kernel commit for this one event constraints were not enforced by the kernel. In that case, some events would silently return "0" if the kernel scheduled them in an improper counter slot. - +.PP Prior to Linux 2.6.34, there was a bug when multiplexing where the wrong results could be returned. .\" commit 45e16a6834b6af098702e5ea6c9a40de42ff77d8 - +.PP Kernels from Linux 2.6.35 to Linux 2.6.39 can quickly crash the kernel if "inherit" is enabled and many threads are started. .\" commit 38b435b16c36b0d863efcf3f07b34a6fac9873fd - +.PP Prior to Linux 2.6.35, .\" commit 050735b08ca8a016bbace4445fa025b88fee770b .B PERF_FORMAT_GROUP did not work with attached processes. - +.PP There is a bug in the kernel code between Linux 2.6.36 and Linux 3.0 that ignores the "watermark" field and acts as if a wakeup_event was chosen if the union has a nonzero value in it. .\" commit 4ec8363dfc1451f8c8f86825731fe712798ada02 - +.PP From Linux 2.6.31 to Linux 3.4, the .B PERF_IOC_FLAG_GROUP ioctl argument was broken and would repeatedly operate on the event specified rather than iterating across all sibling events in a group. .\" commit 724b6daa13e100067c30cfc4d1ad06629609dc4e - +.PP From Linux 3.4 to Linux 3.11, the mmap .\" commit fa7315871046b9a4c48627905691dbde57e51033 .I cap_usr_rdpmc @@ -3255,7 +3255,7 @@ Code should migrate to the new and .I cap_user_time fields instead. - +.PP Always double-check your results! Various generalized events have had wrong values. For example, retired branches measured diff --git a/man2/perfmonctl.2 b/man2/perfmonctl.2 index 1c5cce9de..531fbc773 100644 --- a/man2/perfmonctl.2 +++ b/man2/perfmonctl.2 @@ -44,7 +44,7 @@ PMU (performance monitoring unit). The PMU consists of PMD (performance monitoring data) registers and PMC (performance monitoring control) registers, which gather hardware statistics. 
- +.PP .BR perfmonctl () applies the operation .I cmd @@ -54,7 +54,7 @@ The number of arguments is defined by \fInarg\fR. The .I fd argument specifies the perfmon context to operate on. - +.PP Supported values for .I cmd are: @@ -64,14 +64,14 @@ are: .BI "perfmonctl(int " fd ", PFM_CREATE_CONTEXT, pfarg_context_t *" ctxt ", 1); .fi Set up a context. - +.IP The .I fd parameter is ignored. A new perfmon context is created as specified in .I ctxt and its file descriptor is returned in \fIctxt->ctx_fd\fR. - +.IP The file descriptor can be used in subsequent calls to .BR perfmonctl () and can be used to read event notifications (type @@ -83,7 +83,7 @@ The file descriptor is pollable using .BR poll (2), and .BR epoll (7). - +.IP The context can be destroyed by calling .BR close (2) on the file descriptor. @@ -204,5 +204,5 @@ Glibc does not provide a wrapper for this system call; call it using .BR syscall (2). .SH SEE ALSO .BR gprof (1) - +.PP The perfmon2 interface specification diff --git a/man2/personality.2 b/man2/personality.2 index 9a45993d0..9e8200f5b 100644 --- a/man2/personality.2 +++ b/man2/personality.2 @@ -45,7 +45,7 @@ signal numbers into signal actions. The execution domain system allows Linux to provide limited support for binaries compiled under other UNIX-like operating systems. - +.PP If .I persona is not @@ -57,7 +57,7 @@ Specifying .IR persona as 0xffffffff provides a way of retrieving the current persona without changing it. - +.PP A list of the available execution domains can be found in .IR <sys/personality.h> . The execution domain is a 32-bit value in which the top three diff --git a/man2/pivot_root.2 b/man2/pivot_root.2 index 0e18142c1..fae0e7368 100644 --- a/man2/pivot_root.2 +++ b/man2/pivot_root.2 @@ -24,14 +24,14 @@ of the calling process. .\" The .\" .B CAP_SYS_ADMIN .\" capability is required.
- +.PP The typical use of .BR pivot_root () is during system startup, when the system mounts a temporary root filesystem (e.g., an \fBinitrd\fP), then mounts the real root filesystem, and eventually turns the latter into the current root of all relevant processes or threads. - +.PP .BR pivot_root () may or may not change the current root and the current working directory of any processes or threads which use the old @@ -43,7 +43,7 @@ at the old root operate correctly in either case. An easy way to ensure this is to change their root and current working directory to \fInew_root\fP before invoking .BR pivot_root (). - +.PP The paragraph above is intentionally vague because the implementation of .BR pivot_root () may change in the future. @@ -59,14 +59,14 @@ In the future, there may be a mechanism for kernel threads to explicitly relinquish any access to the filesystem, such that this fairly intrusive mechanism can be removed from .BR pivot_root (). - +.PP Note that this also applies to the calling process: .BR pivot_root () may or may not affect its current working directory. It is therefore recommended to call \fBchdir("/")\fP immediately after .BR pivot_root (). - +.PP The following restrictions apply to \fInew_root\fP and \fIput_old\fP: .IP \- 3 They must be directories. @@ -83,14 +83,14 @@ No other filesystem may be mounted on \fIput_old\fP. See also .BR pivot_root (8) for additional usage examples. - +.PP If the current root is not a mount point (e.g., after .BR chroot (2) or .BR pivot_root (), see also below), not the old root directory, but the mount point of that filesystem is mounted on \fIput_old\fP. - +.PP \fInew_root\fP does not have to be a mount point. In this case, \fI/proc/mounts\fP will show the mount point of the filesystem containing @@ -132,7 +132,7 @@ Glibc does not provide a wrapper for this system call; call it using .BR pivot_root () should not have to change root and current working directory of all other processes in the system. 
- +.PP Some of the more obscure uses of .BR pivot_root () may quickly lead to diff --git a/man2/pkey_alloc.2 b/man2/pkey_alloc.2 index 00b96ba81..537feba13 100644 --- a/man2/pkey_alloc.2 +++ b/man2/pkey_alloc.2 @@ -36,7 +36,7 @@ pkey_alloc, pkey_free \- allocate or free a protection key .BR pkey_alloc () allocates a protection key (pkey) and allows it to be passed to .BR pkey_mprotect (2). - +.PP The .BR pkey_alloc () .I flags @@ -53,7 +53,7 @@ frees a protection key and makes it available for later allocations. After a protection key has been freed, it may no longer be used in any protection-key-related operations. - +.PP An application should not call .BR pkey_free () on any protection key which has been assigned to an address @@ -87,7 +87,7 @@ The number of keys available is architecture-specific and implementation-specific and may be reduced by kernel-internal use of certain keys. There are currently 15 keys available to user programs on x86. - +.IP This error will also be returned if the processor or operating system does not support protection keys. Applications should always be prepared to handle this error, since @@ -113,7 +113,7 @@ It can be used in lieu of any other mechanism for detecting pkey support and will simply fail with the error .B ENOSPC if the operating system has no pkey support. - +.PP The kernel guarantees that the contents of the hardware rights register (PKRU) will be preserved only for allocated protection keys. diff --git a/man2/poll.2 b/man2/poll.2 index 4b3c15f76..3a6e351a2 100644 --- a/man2/poll.2 +++ b/man2/poll.2 @@ -50,7 +50,7 @@ performs a similar task to .BR select (2): it waits for one of a set of file descriptors to become ready to perform I/O. - +.PP The set of file descriptors to be monitored is specified in the .I fds argument, which is an array of structures of the following form: @@ -69,7 +69,7 @@ The caller should specify the number of items in the .I fds array in .IR nfds . 
- +.PP The field .I fd contains a file descriptor for an open file. @@ -85,7 +85,7 @@ call: simply negate the .I fd field. Note, however, that this technique can't be used to ignore file descriptor 0.) - +.PP The field .I events is an input parameter, a bit mask specifying the events the application @@ -100,7 +100,7 @@ are and .B POLLNVAL (see below). - +.PP The field .I revents is an output parameter, filled by the kernel with the events that @@ -119,12 +119,12 @@ or field, and will be set in the .I revents field whenever the corresponding condition is true.) - +.PP If none of the events requested (and no error) has occurred for any of the file descriptors, then .BR poll () blocks until one of the events occurs. - +.PP The .I timeout argument specifies the number of milliseconds that @@ -151,7 +151,7 @@ Specifying a of zero causes .BR poll () to return immediately, even if no file descriptors are ready. - +.PP The bits that may be set/returned in .I events and @@ -289,7 +289,7 @@ See the description of for an explanation of why .BR ppoll () is necessary. - +.PP If the .I sigmask argument is specified as NULL, then @@ -301,7 +301,7 @@ differs from only in the precision of the .I timeout argument). - +.PP The .I tmo_p argument specifies an upper limit on the amount of time that @@ -317,7 +317,7 @@ struct timespec { }; .fi .in - +.PP If .I tmo_p is specified as NULL, then @@ -362,7 +362,7 @@ the glibc (and the old Linux libc) .BR poll () wrapper function provides emulation using .BR select (2). - +.PP The .BR ppoll () system call was added to Linux in kernel 2.6.16. @@ -389,7 +389,7 @@ Portable programs may wish to check for .B EAGAIN and loop, just as with .BR EINTR . - +.PP Some implementations define the nonstandard constant .B INFTIM with the value \-1 for use as a @@ -397,7 +397,7 @@ with the value \-1 for use as a for .BR poll (). This constant is not provided in glibc. 
- +.PP For a discussion of what may happen if a file descriptor being monitored by .BR poll () is closed in another thread, see @@ -416,7 +416,7 @@ Thus, the glibc function does not modify its .I tmo_p argument. - +.PP The raw .BR ppoll () system call has a fifth argument, diff --git a/man2/posix_fadvise.2 b/man2/posix_fadvise.2 index 959c2c2a7..918ca7aff 100644 --- a/man2/posix_fadvise.2 +++ b/man2/posix_fadvise.2 @@ -54,14 +54,14 @@ Programs can use to announce an intention to access file data in a specific pattern in the future, thus allowing the kernel to perform appropriate optimizations. - +.PP The \fIadvice\fP applies to a (not necessarily existent) region starting at \fIoffset\fP and extending for \fIlen\fP bytes (or until the end of the file if \fIlen\fP is 0) within the file referred to by \fIfd\fP. The \fIadvice\fP is not binding; it merely constitutes an expectation on behalf of the application. - +.PP Permissible values for \fIadvice\fP include: .TP .B POSIX_FADV_NORMAL @@ -79,14 +79,14 @@ The specified data will be accessed in random order. .TP .B POSIX_FADV_NOREUSE The specified data will be accessed only once. - +.IP In kernels before 2.6.18, \fBPOSIX_FADV_NOREUSE\fP had the same semantics as \fBPOSIX_FADV_WILLNEED\fP. This was probably a bug; since kernel 2.6.18, this flag is a no-op. .TP .B POSIX_FADV_WILLNEED The specified data will be accessed in the near future. - +.IP \fBPOSIX_FADV_WILLNEED\fP initiates a nonblocking read of the specified region into the page cache. The amount of data read may be decreased by the kernel depending @@ -96,7 +96,7 @@ and more is rarely useful.) .TP .B POSIX_FADV_DONTNEED The specified data will not be accessed in the near future. - +.IP \fBPOSIX_FADV_DONTNEED\fP attempts to free cached pages associated with the specified region. This is useful, for example, while streaming large @@ -104,7 +104,7 @@ files. 
A program may periodically request the kernel to free cached data that has already been used, so that more useful cached pages are not discarded instead. - +.IP Requests to discard partial pages are ignored. It is preferable to preserve needed data than discard unneeded data. If the application requires that data be considered for discarding, then @@ -112,7 +112,7 @@ If the application requires that data be considered for discarding, then and .I len must be page-aligned. - +.IP The implementation .I may attempt to write back dirty pages in the specified region, @@ -152,7 +152,7 @@ the underlying system call is called Library support has been provided since glibc version 2.2, via the wrapper function .BR posix_fadvise (). - +.PP Since Linux 3.18, .\" commit d3ac21cacc24790eb45d735769f35753f5b56ceb support for the underlying system call is optional, @@ -174,12 +174,12 @@ default size for the backing device; \fBPOSIX_FADV_SEQUENTIAL\fP doubles this size, and \fBPOSIX_FADV_RANDOM\fP disables file readahead entirely. These changes affect the entire file, not just the specified region (but other open file handles to the same file are unaffected). - +.PP The contents of the kernel buffer cache can be cleared via the .IR /proc/sys/vm/drop_caches interface described in .BR proc (5). - +.PP One can obtain a snapshot of which pages of a file are resident in the buffer cache by opening a file, mapping it with .BR mmap (2), @@ -210,7 +210,7 @@ Therefore, these architectures define a version of the system call that orders the arguments suitably, but is otherwise exactly the same as .BR posix_fadvise (). 
- +.PP For example, since Linux 2.6.14, ARM has the following system call: .PP .in +4n diff --git a/man2/prctl.2 b/man2/prctl.2 index 2ccdcf060..456f10964 100644 --- a/man2/prctl.2 +++ b/man2/prctl.2 @@ -124,7 +124,7 @@ The capability bounding set dictates whether the process can receive the capability through a file's permitted capability set on a subsequent call to .BR execve (2). - +.IP If the capability specified in .I arg2 is not valid, then the call fails with the error @@ -138,7 +138,7 @@ capability within its user namespace, then drop the capability specified by from the calling thread's capability bounding set. Any children of the calling thread will inherit the newly reduced bounding set. - +.IP The call fails with the error: .B EPERM if the calling thread does not have the @@ -160,7 +160,7 @@ set the "child subreaper" attribute of the calling process; if .I arg2 is zero, unset the attribute. - +.IP A subreaper fulfills the role of .BR init (1) for its descendant processes. @@ -177,14 +177,14 @@ will receive a signal and will be able to .BR wait (2) on the process to discover its termination status. - +.IP The setting of this bit is not inherited by children created by .BR fork (2) and .BR clone (2). The setting is preserved across .BR execve (2). - +.IP Establishing a subreaper process is useful in session management frameworks where a hierarchical group of processes is managed by a subreaper process that needs to be informed when one of the processes\(emfor example, @@ -205,7 +205,7 @@ in the location pointed to by Set the state of the "dumpable" flag, which determines whether core dumps are produced for the calling process upon delivery of a signal whose default behavior is to produce a core dump. - +.IP In kernels up to and including 2.6.12, .I arg2 must be either 0 @@ -227,7 +227,7 @@ for security reasons, this feature has been removed. .I /proc/sys/fs/\:suid_dumpable in .BR proc (5).) - +.IP Normally, this flag is set to 1. 
However, it is reset to the current value contained in the file .IR /proc/sys/fs/\:suid_dumpable @@ -262,7 +262,7 @@ Processes that are not dumpable can not be attached via see .BR ptrace (2) for further details. - +.IP If a process is not dumpable, the ownership of files in the process's .IR /proc/[pid] @@ -302,7 +302,7 @@ FP32 or FP64 ABIs. When more restrictive code is linked in, the overall requirement for the process is to use the more restrictive floating-point mode. - +.IP Because the kernel has no means of knowing in advance which mode the process should be executed in, and because these restrictions can @@ -310,7 +310,7 @@ change over the lifetime of the process, the .B PR_SET_FP_MODE operation is provided to allow control of the floating-point mode from user space. - +.IP .\" https://dmz-portal.mips.com/wiki/MIPS_O32_ABI_-_FR0_and_FR1_Interlinking The .I (unsigned int) arg2 @@ -327,7 +327,7 @@ and 64-bit registers are represented as a pair of registers (even- and odd- numbered, with the even-numbered register containing the lower 32 bits, and the odd-numbered register containing the higher 32 bits). - +.IP When this bit is .I set (on supported hardware), @@ -337,8 +337,8 @@ mode). Note that modern MIPS implementations (MIPS R6 and newer) support .B FR=1 mode only. - - +.IP +.IP Applications that use the O32 FP32 ABI can operate only when this bit is .I unset .RB ( FR=0 ; @@ -401,10 +401,10 @@ are ignored. Get the current floating-point mode (see the description of .B PR_SET_FP_MODE for details). - +.IP On success, the call returns a bit mask which represents the current floating-point mode. - +.IP The arguments .IR arg2 , .IR arg3 , @@ -523,7 +523,7 @@ Usually these fields are set by the kernel and dynamic loader (see for more information) and a regular application should not use this feature. However, there are cases, such as self-modifying programs, where a program might find it useful to change its own memory map. 
- +.IP The calling process must have the .BR CAP_SYS_RESOURCE capability. @@ -537,7 +537,7 @@ The and .I arg5 arguments must be zero if unused. - +.IP .\" commit 52b3694157e3aa6df871e283115652ec6f2d31e0 Since Linux 3.10, this feature is available all the time. @@ -669,7 +669,7 @@ Provides one-shot access to all the addresses by passing in a The .I arg4 argument should provide the size of the struct. - +.IP This feature is available only if the kernel is built with the .BR CONFIG_CHECKPOINT_RESTORE option enabled. @@ -682,7 +682,7 @@ This allows user space to find a compatible struct. The .I arg4 argument should be a pointer to an unsigned int. - +.IP This feature is available only if the kernel is built with the .BR CONFIG_CHECKPOINT_RESTORE option enabled. @@ -702,7 +702,7 @@ and .IR arg5 .\" commit e9d1b4f3c60997fe197bf0243cb4a41a44387a88 arguments must be zero. - +.IP MPX is a hardware-assisted mechanism for performing bounds checking on pointers. It consists of a set of registers storing bounds information @@ -715,7 +715,7 @@ These tables are called "bounds tables" and the MPX .BR prctl () operations control whether the kernel manages their allocation and freeing. - +.IP When management is enabled, the kernel will take over allocation and freeing of the bounds tables. It does this by trapping the #BR exceptions that result @@ -725,28 +725,28 @@ it allocates the table and populates the bounds directory with the location of the new table. For freeing, the kernel checks to see if bounds tables are present for memory which is not allocated, and frees them if so. - +.IP Before enabling MPX management using .BR PR_MPX_ENABLE_MANAGEMENT , the application must first have allocated a user-space buffer for the bounds directory and placed the location of that directory in the .I bndcfgu register. - +.IP These calls will fail if the CPU or kernel does not support MPX. Kernel support for MPX is enabled via the .BR CONFIG_X86_INTEL_MPX configuration option. 
You can check whether the CPU supports MPX by looking for the 'mpx' CPUID bit, like with the following command: - +.IP cat /proc/cpuinfo | grep ' mpx ' - +.IP A thread may not switch in or out of long (64-bit) mode while MPX is enabled. - +.IP All threads in a process are affected by these calls. - +.IP The child of a .BR fork (2) inherits the state of MPX management. @@ -755,7 +755,7 @@ During MPX management is reset to a state as if .BR PR_MPX_DISABLE_MANAGEMENT had been called. - +.IP For further information on Intel MPX, see the kernel source file .IR Documentation/x86/intel_mpx.txt . .TP @@ -807,7 +807,7 @@ and .BR clone (2), and preserved across .BR execve (2). - +.IP Since Linux 4.10, the value of a thread's .I no_new_privs @@ -816,7 +816,7 @@ bit can be viewed via the field in the .IR /proc/[pid]/status file. - +.IP For more information, see the kernel source file .IR Documentation/prctl/no_new_privs.txt . See also @@ -847,7 +847,7 @@ or a binary that has associated capabilities (see .BR capabilities (7)). This value is preserved across .BR execve (2). - +.IP .IR Warning : .\" https://bugzilla.kernel.org/show_bug.cgi?id=43300 the "parent" in this case is considered to be the @@ -887,7 +887,7 @@ is .BR PR_SET_PTRACER_ANY , the ptrace restrictions introduced by Yama are effectively disabled for the calling process. - +.IP For further information, see the kernel source file .IR Documentation/security/Yama.txt . .TP @@ -901,12 +901,12 @@ The more recent .BR seccomp (2) system call provides a superset of the functionality of .BR PR_SET_SECCOMP . - +.IP The seccomp mode is selected via .IR arg2 . (The seccomp constants are defined in .IR .) - +.IP With .IR arg2 set to @@ -929,7 +929,7 @@ This operation is available only if the kernel is configured with .B CONFIG_SECCOMP enabled. - +.IP With .IR arg2 set to @@ -944,7 +944,7 @@ arbitrary system calls and system call arguments. 
This mode is available only if the kernel is configured with .B CONFIG_SECCOMP_FILTER enabled. - +.IP If .BR SECCOMP_MODE_FILTER filters permit @@ -959,7 +959,7 @@ If the filters permit .BR prctl () calls, then additional filters can be added; they are run in order until the first non-allow result is seen. - +.IP For further information, see the kernel source file .IR Documentation/prctl/seccomp_filter.txt . .TP @@ -980,7 +980,7 @@ This operation is available only if the kernel is configured with .B CONFIG_SECCOMP enabled. - +.IP Since Linux 3.8, the .IR Seccomp field of the @@ -1089,14 +1089,14 @@ is less than or equal to zero, .\" The minimum value is 1? Seems a little strange. the "current" timer slack is reset to the thread's "default" timer slack value. - +.IP The "current" timer slack is used by the kernel to group timer expirations for the calling thread that are close to one another; as a consequence, timer expirations for the thread may be up to the specified number of nanoseconds late (but will never expire early). Grouping timer expirations can help reduce system power consumption by minimizing CPU wake-ups. - +.IP The timer expirations affected by timer slack are those set by .BR select (2), .BR pselect (2), @@ -1116,11 +1116,11 @@ and .BR pthread_rwlock_timedwrlock (3), and .BR sem_timedwait (3)). - +.IP Timer slack is not applied to threads that are scheduled under a real-time scheduling policy (see .BR sched_setscheduler (2)). - +.IP When a new thread is created, the two timer slack values are made the same as the "current" value of the creating thread. @@ -1133,7 +1133,7 @@ The timer slack values of are 50,000 nanoseconds (50 microseconds). The timer slack values are preserved across .BR execve (2). - +.IP Since Linux 4.6, the "current" timer slack value of any process can be examined and changed via the file .IR /proc/[pid]/timerslack_ns . 
diff --git a/man2/pread.2 b/man2/pread.2 index 07171b71c..a686724cf 100644 --- a/man2/pread.2 +++ b/man2/pread.2 @@ -84,13 +84,13 @@ returns the number of bytes read and .BR pwrite () returns the number of bytes written. - +.PP Note that it is not an error for a successful call to transfer fewer bytes than requested (see .BR read (2) and .BR write (2)). - +.PP On error, \-1 is returned and .I errno is set to indicate the cause of the error. @@ -147,7 +147,7 @@ The glibc and .BR pwrite () wrapper functions transparently deal with the change. - +.PP On some 32-bit architectures, the calling signature for these system calls differs, for the reasons described in diff --git a/man2/process_vm_readv.2 b/man2/process_vm_readv.2 index b0dec11b2..6d2ed8567 100644 --- a/man2/process_vm_readv.2 +++ b/man2/process_vm_readv.2 @@ -70,7 +70,7 @@ of the calling process ("the local process") and the process identified by ("the remote process"). The data moves directly between the address spaces of the two processes, without passing through kernel space. - +.PP The .BR process_vm_readv () system call transfers data from the remote process to the local process. @@ -95,7 +95,7 @@ and .IR liovcnt specifies the number of elements in .IR local_iov . - +.PP The .BR process_vm_writev () system call is the converse of @@ -109,7 +109,7 @@ and .IR remote_iov have the same meaning as for .BR process_vm_readv (). - +.PP The .I local_iov and @@ -119,7 +119,7 @@ arguments point to an array of structures, defined in .IR <sys/uio.h> as: - +.PP .in +4n .nf struct iovec { @@ -128,7 +128,7 @@ struct iovec { }; .fi .in - +.PP Buffers are processed in array order. This means that .BR process_vm_readv () @@ -142,7 +142,7 @@ Likewise, is completely read before proceeding to .IR remote_iov[1] , and so on. - +.PP Similarly, .BR process_vm_writev () writes out the entire contents of @@ -153,7 +153,7 @@ and it completely fills .I remote_iov[0] before proceeding to .IR remote_iov[1] .
- +.PP The lengths of .I remote_iov[i].iov_len and @@ -161,11 +161,11 @@ and do not have to be the same. Thus, it is possible to split a single local buffer into multiple remote buffers, or vice versa. - +.PP The .I flags argument is currently unused and must be set to 0. - +.PP The values specified in the .I liovcnt and @@ -178,7 +178,7 @@ or accessible via the call .IR sysconf(_SC_IOV_MAX) ). .\" In time, glibc might provide a wrapper that works around this limit, .\" as is done for readv()/writev() - +.PP The count arguments and .IR local_iov are checked before doing any transfers. @@ -188,7 +188,7 @@ is invalid, or the addresses refer to regions that are inaccessible to the local process, none of the vectors will be processed and an error will be returned immediately. - +.PP Note, however, that these system calls do not check the memory regions in the remote process until just before doing the read/write. Consequently, a partial read/write (see RETURN VALUE) @@ -208,7 +208,7 @@ elements and have them merge back into a single write entry. The first read entry goes up to the page boundary, while the second starts on the next page boundary.) - +.PP Permission to read from or write to another process is governed by a ptrace access mode .B PTRACE_MODE_ATTACH_REALCREDS @@ -230,7 +230,7 @@ These system calls won't perform a partial transfer that splits a single element.) The caller should check the return value to determine whether a partial read/write occurred. - +.PP On error, \-1 is returned and .I errno is set appropriately. @@ -292,7 +292,7 @@ The data transfers performed by and .BR process_vm_writev () are not guaranteed to be atomic in any way. 
- +.PP These system calls were designed to permit fast message passing by allowing messages to be exchanged with a single copy operation (rather than the double copy that would be required diff --git a/man2/ptrace.2 b/man2/ptrace.2 index 022d55e84..73f459ef0 100644 --- a/man2/ptrace.2 +++ b/man2/ptrace.2 @@ -131,9 +131,9 @@ Therefore, "tracee" always means "(one) thread", never "a (possibly multithreaded) process". Ptrace commands are always sent to a specific tracee using a call of the form - +.PP ptrace(PTRACE_foo, pid, ...) - +.PP where .I pid is the thread ID of the corresponding Linux thread. @@ -486,11 +486,11 @@ A by the tracer will return a .I status value such that - +.IP .nf status>>8 == (SIGTRAP | (PTRACE_EVENT_CLONE<<8)) .fi - +.IP The PID of the new process can be retrieved with .BR PTRACE_GETEVENTMSG . .IP @@ -523,11 +523,11 @@ A by the tracer will return a .I status value such that - +.IP .nf status>>8 == (SIGTRAP | (PTRACE_EVENT_EXEC<<8)) .fi - +.IP If the execing thread is not a thread group leader, the thread ID is reset to thread group leader's ID before this stop. Since Linux 3.0, the former thread ID can be retrieved with @@ -540,11 +540,11 @@ A by the tracer will return a .I status value such that - +.IP .nf status>>8 == (SIGTRAP | (PTRACE_EVENT_EXIT<<8)) .fi - +.IP The tracee's exit status can be retrieved with .BR PTRACE_GETEVENTMSG . .IP @@ -572,11 +572,11 @@ A by the tracer will return a .I status value such that - +.IP .nf status>>8 == (SIGTRAP | (PTRACE_EVENT_FORK<<8)) .fi - +.IP The PID of the new process can be retrieved with .BR PTRACE_GETEVENTMSG . .TP @@ -605,11 +605,11 @@ A by the tracer will return a .I status value such that - +.IP .nf status>>8 == (SIGTRAP | (PTRACE_EVENT_VFORK<<8)) .fi - +.IP The PID of the new process can be retrieved with .BR PTRACE_GETEVENTMSG . 
.TP @@ -621,11 +621,11 @@ A by the tracer will return a .I status value such that - +.IP .nf status>>8 == (SIGTRAP | (PTRACE_EVENT_VFORK_DONE<<8)) .fi - +.IP The PID of the new process can (since Linux 2.6.18) be retrieved with .BR PTRACE_GETEVENTMSG . .TP @@ -639,11 +639,11 @@ A by the tracer will return a .I status value such that - +.IP .nf status>>8 == (SIGTRAP | (PTRACE_EVENT_SECCOMP<<8)) .fi - +.IP While this triggers a .BR PTRACE_EVENT stop, it is similar to a syscall-enter-stop. @@ -851,7 +851,7 @@ See the "Attaching and detaching" subsection for additional information. and .I data are ignored.) - +.IP Permission to perform a .BR PTRACE_ATTACH is governed by a ptrace access mode @@ -911,7 +911,7 @@ and must be zero. .I data contains a bit mask of ptrace options to activate immediately. - +.IP Permission to perform a .BR PTRACE_SEIZE is governed by a ptrace access mode @@ -923,7 +923,7 @@ check; see below. .\" commit f8e529ed941ba2bbcbf310b575d968159ce7e895 This operation allows the tracer to dump the tracee's classic BPF filters. - +.IP .I addr is an integer specifying the index of the filter to be dumped. The most recently installed filter has the index 0. @@ -932,13 +932,13 @@ If is greater than the number of installed filters, the operation fails with the error .BR ENOENT . - +.IP .I data is either a pointer to a .IR "struct sock_filter" array that is large enough to store the BPF program, or NULL if the program is not to be stored. - +.IP Upon success, the return value is the number of instructions in the BPF program. If @@ -946,7 +946,7 @@ If was NULL, then this return value can be used to correctly size the .IR "struct sock_filter" array passed in a subsequent call. - +.IP This operation fails with the error .B EACCES if the caller does not have the @@ -956,7 +956,7 @@ If the filter referred to by .I addr is not a classic BPF filter, the operation fails with the error .BR EMEDIUMTYPE .
- +.IP This operation is available if the kernel was configured with both the .B CONFIG_SECCOMP_FILTER and the @@ -1305,9 +1305,9 @@ ptrace stops that are not signal-delivery-stops is a cause of confusion among ptrace users. One typical scenario is that the tracer observes group-stop, mistakes it for signal-delivery-stop, restarts the tracee with - +.PP ptrace(PTRACE_restart, pid, 0, stopsig) - +.PP with the intention of injecting .IR stopsig , but @@ -1472,9 +1472,9 @@ An additional bit is set in the higher byte of the status word: the value .I status>>8 will be - +.PP (SIGTRAP | PTRACE_EVENT_foo << 8). - +.PP The following events exist: .TP .B PTRACE_EVENT_VFORK @@ -1731,7 +1731,7 @@ of ptrace stops has changed between kernel versions. This documents the behavior from their introduction until Linux 4.7 (inclusive). The behavior in later kernel versions is documented in the next section. - +.PP A .BR PTRACE_EVENT_SECCOMP stop occurs whenever a @@ -1741,7 +1741,7 @@ This is independent of which methods was used to restart the system call. Notably, seccomp still runs even if the tracee was restarted using .BR PTRACE_SYSEMU and this system call is unconditionally skipped. - +.PP Restarts from this stop will behave as if the stop had occurred right before the system call in question. In particular, both @@ -1771,7 +1771,7 @@ Note that seccomp no longer runs (and no .B PTRACE_EVENT_SECCOMP will be reported) if the system call is skipped due to .BR PTRACE_SYSEMU . - +.PP Functionally, a .B PTRACE_EVENT_SECCOMP stop functions comparably @@ -1782,7 +1782,7 @@ the system call number may be changed and any other modified registers are visible to the to-be-executed system call as well). Note that there may be, but need not have been a preceding syscall-entry-stop. 
- +.PP After a .BR PTRACE_EVENT_SECCOMP stop, seccomp will be rerun, with a @@ -1840,9 +1840,9 @@ may succeed and return some random value if current ptrace-stop is not documented as returning a meaningful event message. .LP The call - +.PP ptrace(PTRACE_SETOPTIONS, pid, 0, PTRACE_O_flags); - +.PP affects one tracee. The tracee's current flags are replaced. Flags are inherited by new tracees created and "auto-attached" via active @@ -1879,13 +1879,13 @@ recommended practice is to always pass 0 in .IR sig .) .SS Attaching and detaching A thread can be attached to the tracer using the call - +.PP ptrace(PTRACE_ATTACH, pid, 0, 0); - +.PP or - +.PP ptrace(PTRACE_SEIZE, pid, 0, PTRACE_O_flags); - +.PP .B PTRACE_ATTACH sends .B SIGSTOP @@ -1930,17 +1930,17 @@ use command. .LP The request - +.PP ptrace(PTRACE_TRACEME, 0, 0, 0); - +.PP turns the calling thread into a tracee. The thread continues to run (doesn't enter ptrace-stop). A common practice is to follow the .B PTRACE_TRACEME with - +.PP raise(SIGSTOP); - +.PP and allow the parent (which is our tracer now) to observe our signal-delivery-stop. .LP @@ -1969,9 +1969,9 @@ is delivered to the children, causing them to enter signal-delivery-stop after they exit the system call which created them. .LP Detaching of the tracee is performed by: - +.PP ptrace(PTRACE_DETACH, pid, 0, sig); - +.PP .B PTRACE_DETACH is a restarting operation; therefore it requires the tracee to be in ptrace-stop. @@ -2112,7 +2112,7 @@ the tracer should clean up all its internal data structures describing the threads of this process, and retain only one data structure\(emone which describes the single still running tracee, with - +.PP thread ID == thread group ID == process ID. .LP Example: two threads call @@ -2317,7 +2317,7 @@ whether or not the "target" process is dumpable, and the results of checks performed by any enabled Linux Security Module (LSM)\(emfor example, SELinux, Yama, or Smack\(emand by the commoncap LSM (which is always invoked). 
- +.PP Prior to Linux 2.6.27, all access checks were of a single type. Since Linux 2.6.27, .\" commit 006ebb40d3d65338bd74abb03b945f8d60e362bd @@ -2422,7 +2422,7 @@ and accesses to various pseudofiles (e.g., under .IR /proc ). These names are used in other manual pages to provide a simple shorthand for labeling the different kernel checks. - +.PP The algorithm employed for ptrace access mode checking determines whether the calling process is allowed to perform the corresponding action on the target process. @@ -2443,7 +2443,7 @@ employ the caller's filesystem UID and GID. .BR credentials (7), the filesystem UID and GID almost always have the same values as the corresponding effective IDs.) - +.IP Otherwise, the access mode specifies .BR PTRACE_MODE_REALCREDS , so use the caller's real UID and GID for the checks in the next step. @@ -2544,7 +2544,7 @@ a compromised process can ptrace-attach to other sensitive processes (e.g., a GPG agent or an SSH session) owned by the user in order to gain additional credentials that may exist in memory and thus expand the scope of the attack. - +.PP More precisely, the Yama LSM limits two types of operations: .IP * 3 Any operation that performs a ptrace access mode @@ -2553,7 +2553,7 @@ check\(emfor example, .BR ptrace () .BR PTRACE_ATTACH . (See the "Ptrace access mode checking" discussion above.) - +.IP .IP * .BR ptrace () .BR PTRACE_TRACEME . @@ -2568,7 +2568,7 @@ file with one of the following values: No additional restrictions on operations that perform .BR PTRACE_MODE_ATTACH checks (beyond those imposed by the commoncap and other LSMs). - +.IP The use of .BR PTRACE_TRACEME is unchanged. @@ -2583,7 +2583,7 @@ it must have a predefined relationship with the target process. By default, the predefined relationship is that the target process must be a descendant of the caller. - +.IP A target process can employ the .BR prctl (2) .B PR_SET_PTRACER @@ -2593,7 +2593,7 @@ operations on the target. 
See the kernel source file .IR Documentation/security/Yama.txt for further details. - +.IP The use of .BR PTRACE_TRACEME is unchanged. @@ -2611,7 +2611,7 @@ No process may perform .BR PTRACE_MODE_ATTACH operations or trace children that employ .BR PTRACE_TRACEME . - +.IP Once this value has been written to the file, it cannot be changed. .PP With respect to values 1 and 2, @@ -2713,9 +2713,9 @@ from an file descriptor. The usual symptom of this bug is that when you attach to a quiescent process with the command - +.PP strace \-p - +.PP then, instead of the usual and expected one-line output such as .nf diff --git a/man2/query_module.2 b/man2/query_module.2 index 3416d7e91..143ab1033 100644 --- a/man2/query_module.2 +++ b/man2/query_module.2 @@ -23,7 +23,7 @@ No declaration of this system call is provided in glibc headers; see NOTES. .SH DESCRIPTION .IR Note : This system call is present only in kernels before Linux 2.6. - +.PP .BR query_module () requests information from the kernel about loadable modules. The returned information is placed in the buffer pointed to by @@ -40,7 +40,7 @@ Some operations require to identify a currently loaded module, some allow .I name to be NULL, indicating the kernel proper. - +.PP The following values can be specified for .IR which : .TP @@ -180,7 +180,7 @@ can be obtained from .IR /proc/kallsyms , and the files under the directory .IR /sys/module . - +.PP The .BR query_module () system call is not supported by glibc. diff --git a/man2/quotactl.2 b/man2/quotactl.2 index ef64fa135..525e75ab4 100644 --- a/man2/quotactl.2 +++ b/man2/quotactl.2 @@ -46,7 +46,7 @@ The soft limit can be exceeded, but warnings will ensue. Moreover, the user can't exceed the soft limit for more than grace period duration (one week by default) at a time; after this, the soft limit counts as a hard limit. - +.PP The .BR quotactl () call manipulates disk quotas. @@ -73,12 +73,12 @@ for project quotas. The .I subcmd value is described below. 
- +.PP The .I special argument is a pointer to a null-terminated string containing the pathname of the (mounted) block special device for the filesystem being manipulated. - +.PP The .I addr argument is the address of an optional, command-specific, data structure @@ -86,7 +86,7 @@ that is copied in or out of the system. The interpretation of .I addr is given with each command below. - +.PP The .I subcmd value is one of the following: @@ -205,7 +205,7 @@ This operation is the same as but it returns quota information for the next ID greater than or equal to .IR id that has a quota set. - +.IP The .I addr argument is a pointer to a @@ -216,7 +216,7 @@ except for the addition of a .I dqb_id field that is used to return the ID for which quota information is being returned: - +.IP .nf .in +4n struct nextdqblk { @@ -373,7 +373,7 @@ The and .I id arguments are ignored. - +.IP This operation is obsolete and was removed in Linux 2.4.22. Files in .I /proc/sys/fs/quota/ @@ -392,7 +392,7 @@ to be a pointer to an .I "unsigned int" that contains a combination of the following flags (defined in .IR ): - +.IP .nf .in +4n #define XFS_QUOTA_UDQ_ACCT (1<<0) /* User quota @@ -409,7 +409,7 @@ that contains a combination of the following flags (defined in enforcement */ .in .fi - +.IP This operation requires privilege .RB ( CAP_SYS_ADMIN ). The @@ -442,7 +442,7 @@ argument is a pointer to an structure, which is defined in .I as follows: - +.IP .nf .in +4n /* All the blk units are in BBs (Basic Blocks) of @@ -492,7 +492,7 @@ struct fs_disk_quota { }; .in .fi - +.IP Unprivileged users may retrieve only their own quotas; a privileged user .RB ( CAP_SYS_ADMIN ) @@ -541,7 +541,7 @@ filesystem. The .I fs_quota_stat structure itself is defined as follows: - +.IP .nf .in +4n #define FS_QSTAT_VERSION 1 /* fs_quota_stat.qs_version */ @@ -572,7 +572,7 @@ struct fs_quota_stat { }; .in .fi - +.IP The .I id argument is ignored. @@ -590,7 +590,7 @@ the project quota file. 
The .I fs_quota_statv structure itself is defined as follows: - +.IP .nf .in +4n #define FS_QSTATV_VERSION1 1 /* fs_quota_statv.qs_version */ @@ -625,7 +625,7 @@ struct fs_quota_statv { }; .in .fi - +.IP The .I qs_version field of the structure should be filled with the version of the structure @@ -652,7 +652,7 @@ structure) which identify what types of quota should be removed .I cmd argument is ignored, but should remain valid in order to pass preliminary quotactl syscall handler checks). - +.IP Quotas must have already been turned off. The .I id diff --git a/man2/read.2 b/man2/read.2 index abb2de9a0..3c74e9477 100644 --- a/man2/read.2 +++ b/man2/read.2 @@ -49,7 +49,7 @@ bytes from file descriptor .I fd into the buffer starting at .IR buf . - +.PP On files that support seeking, the read operation commences at the file offset, and the file offset is incremented by the number of bytes read. @@ -57,7 +57,7 @@ If the file offset is at or past the end of file, no bytes are read, and .BR read () returns zero. - +.PP If .I count is zero, @@ -72,7 +72,7 @@ does not check for errors, a with a .I count of 0 returns zero and has no other effects. - +.PP According to POSIX.1, if .I count is greater than @@ -89,7 +89,7 @@ because we are reading from a pipe, or from a terminal), or because .BR read () was interrupted by a signal. See also NOTES. - +.PP On error, \-1 is returned, and .I errno is set appropriately. @@ -179,7 +179,7 @@ and .I ssize_t are, respectively, unsigned and signed integer data types specified by POSIX.1. - +.PP On Linux, .BR read () (and similar system calls) will transfer at most @@ -187,7 +187,7 @@ On Linux, returning the number of bytes actually transferred. .\" commit e28cc71572da38a5a12c1cfe4d7032017adccf69 (This is true on both 32-bit and 64-bit systems.) - +.PP On NFS filesystems, reading small amounts of data will update the timestamp only the first time, subsequent calls may not do so. 
This is caused @@ -206,13 +206,13 @@ increase server load and decrease performance. .SH BUGS According to POSIX.1-2008/SUSv4 Section XSI 2.9.7 ("Thread Interactions with Regular File Operations"): - +.PP .RS 4 All of the following functions shall be atomic with respect to each other in the effects specified in POSIX.1-2008 when they operate on regular files or symbolic links: ... .RE - +.PP Among the APIs subsequently listed are .BR read () and diff --git a/man2/readahead.2 b/man2/readahead.2 index 529ebc505..7f7f408f4 100644 --- a/man2/readahead.2 +++ b/man2/readahead.2 @@ -41,7 +41,7 @@ initiates readahead on a file so that subsequent reads from that file will be satisfied from the cache, and not block on disk I/O (assuming the readahead was initiated early enough and that other activity on the system did not in the meantime flush pages from the cache). - +.PP The .I fd argument is a file descriptor identifying the file which is diff --git a/man2/readdir.2 b/man2/readdir.2 index 4665f3d75..798cf19a3 100644 --- a/man2/readdir.2 +++ b/man2/readdir.2 @@ -122,7 +122,7 @@ structure yourself. However, probably you should use .BR readdir (3) instead. - +.PP This system call does not exist on x86-64. .SH SEE ALSO .BR getdents (2), diff --git a/man2/readv.2 b/man2/readv.2 index 1524d7a1b..21aa28d1e 100644 --- a/man2/readv.2 +++ b/man2/readv.2 @@ -166,7 +166,7 @@ but adds a fourth argument, .IR offset , which specifies the file offset at which the input operation is to be performed. - +.PP The .BR pwritev () system call combines the functionality of @@ -179,13 +179,13 @@ but adds a fourth argument, .IR offset , which specifies the file offset at which the output operation is to be performed. - +.PP The file offset is not changed by these system calls. The file referred to by .I fd must be capable of seeking. 
.SS preadv2() and pwritev2() - +.PP These system calls are similar to .BR preadv () and @@ -193,7 +193,7 @@ and calls, but add a fifth argument, .IR flags , which modifies the behavior on a per-call basis. - +.PP Unlike .BR preadv () and @@ -201,7 +201,7 @@ and if the .I offset argument is \-1, then the current file offset is used and updated. - +.PP The .I flags argument contains a bitwise OR of zero or more of the following flags: @@ -245,13 +245,13 @@ return the number of bytes read; and .BR pwritev2 () return the number of bytes written. - +.PP Note that it is not an error for a successful call to transfer fewer bytes than requested (see .BR read (2) and .BR write (2)). - +.PP On error, \-1 is returned, and \fIerrno\fP is set appropriately. .SH ERRORS The errors are as given for @@ -287,7 +287,7 @@ An unknown flag is specified in \fIflags\fP. and .BR pwritev () first appeared in Linux 2.6.30; library support was added in glibc 2.10. - +.PP .BR preadv2 () and .BR pwritev2 () @@ -302,7 +302,7 @@ POSIX.1-2001, POSIX.1-2008, .\" and \fIint\fP as the return type. .\" The readv/writev system calls were buggy before Linux 1.3.40. .\" (Says release.libc.) - +.PP .BR preadv (), .BR pwritev (): nonstandard, but present also on the modern BSDs. @@ -334,9 +334,9 @@ corresponding GNU C library wrapper functions shown in the SYNOPSIS. The final argument, .IR offset , is unpacked by the wrapper functions into two arguments in the system calls: - +.PP .BI " unsigned long " pos_l ", unsigned long " pos - +.PP These arguments contain, respectively, the low order and high order 32 bits of .IR offset . .SS Historical C library/kernel differences @@ -365,7 +365,7 @@ The wrapper function for .BR writev () performed the analogous task using a temporary buffer and a call to .BR write (2). - +.PP The need for this extra effort in the glibc wrapper functions went away with Linux 2.2 and later. However, glibc continued to provide this behavior until version 2.10. 
@@ -379,7 +379,7 @@ the glibc wrapper functions always just directly invoke the system calls. .SH EXAMPLE The following code sample demonstrates the use of .BR writev (): - +.PP .in +4n .nf char *str0 = "hello "; diff --git a/man2/reboot.2 b/man2/reboot.2 index 2dabab8d0..17a3db3ee 100644 --- a/man2/reboot.2 +++ b/man2/reboot.2 @@ -79,7 +79,7 @@ and since 2.5.71 also are permitted as values for .IR magic2 . (The hexadecimal values of these constants are meaningful.) - +.PP The .I cmd argument can have the following values: diff --git a/man2/recv.2 b/man2/recv.2 index e8d2d9ec8..35717597c 100644 --- a/man2/recv.2 +++ b/man2/recv.2 @@ -82,11 +82,11 @@ is generally equivalent to .BR read (2) (but see NOTES). Also, the following call - +.PP recv(sockfd, buf, len, flags); - +.PP is equivalent to - +.PP recvfrom(sockfd, buf, len, flags, NULL, NULL); .PP All three calls return the length of the message on successful @@ -164,7 +164,7 @@ is passed as normal data via The original destination address of the datagram that caused the error is supplied via .IR msg_name . - +.IP The error is supplied in a .I sock_extended_err structure: @@ -251,7 +251,7 @@ netlink (since Linux 2.6.22), and UNIX datagram (since Linux 3.4) sockets: return the real length of the packet or datagram, even when it was longer than the passed buffer. - +.IP For use with Internet stream sockets, see .BR tcp (7). .TP @@ -269,7 +269,7 @@ places the received message into the buffer .IR buf . The caller must specify the size of the buffer in .IR len . - +.PP If .I src_addr is not NULL, @@ -297,7 +297,7 @@ The returned address is truncated if the buffer provided is too small; in this case, .I addrlen will return a value greater than was supplied to the call. - +.PP If the caller is not interested in the source address, .I src_addr and @@ -312,7 +312,7 @@ call is normally used only on a socket (see .BR connect (2)). 
It is equivalent to the call: - +.PP recvfrom(fd, buf, len, flags, NULL, 0); .\" .SS recvmsg() @@ -356,14 +356,14 @@ will contain the length of the returned address. If the application does not need to know the source address, .I msg_name can be specified as NULL. - +.PP The fields .I msg_iov and .I msg_iovlen describe scatter-gather locations, as discussed in .BR readv (2). - +.PP The field .IR msg_control , which has length @@ -433,14 +433,14 @@ if an error occurred. In the event of an error, .I errno is set to indicate the error. - +.PP When a stream socket peer has performed an orderly shutdown, the return value will be 0 (the traditional "end-of-file" return). - +.PP Datagram sockets in various domains (e.g., the UNIX and Internet domains) permit zero-length datagrams. When such a datagram is received, the return value is 0. - +.PP The value 0 may also be returned if the requested number of bytes to receive from a stream socket was 0. .SH ERRORS @@ -518,13 +518,13 @@ In this circumstance, has no effect (the datagram remains pending), while .BR recv () consumes the pending datagram. - +.PP The .I socklen_t type was invented by POSIX. See also .BR accept (2). - +.PP According to POSIX.1, .\" POSIX.1-2001, POSIX.1-2008 the @@ -540,7 +540,7 @@ but glibc currently types it as .\" The problem is an underlying kernel issue: the size of the .\" __kernel_size_t type used to type this field varies .\" across architectures, but socklen_t is always 32 bits. - +.PP See .BR recvmmsg (2) for information about a Linux-specific system call diff --git a/man2/recvmmsg.2 b/man2/recvmmsg.2 index e1b7750d2..e09c987b8 100644 --- a/man2/recvmmsg.2 +++ b/man2/recvmmsg.2 @@ -52,11 +52,11 @@ using a single system call. A further extension over .BR recvmsg (2) is support for a timeout on the receive operation. - +.PP The .I sockfd argument is the file descriptor of the socket to receive data from. 
- +.PP The .I msgvec argument is a pointer to an array of @@ -64,13 +64,13 @@ argument is a pointer to an array of structures. The size of this array is specified in .IR vlen . - +.PP The .I mmsghdr structure is defined in .I <sys/socket.h> as: - +.PP .in +4n .nf struct mmsghdr { @@ -92,7 +92,7 @@ field is the number of bytes returned for the message in the entry. This field has the same value as the return value of a single .BR recvmsg (2) on the header. - +.PP The .I flags argument contains flags ORed together. @@ -119,7 +119,7 @@ may overrun by a small amount.) If .I timeout is NULL, then the operation blocks indefinitely. - +.PP A blocking .BR recvmmsg () call blocks until @@ -130,7 +130,7 @@ A nonblocking call reads as many messages as are available (up to the limit specified by .IR vlen ) and returns immediately. - +.PP On return from .BR recvmmsg (), successive elements of @@ -188,7 +188,7 @@ to receive multiple messages on a socket and stores them in multiple buffers. The call returns if all buffers are filled or if the timeout specified has expired. - +.PP The following snippet periodically generates UDP datagrams containing a random number: .in +4n @@ -198,7 +198,7 @@ containing a random number: .B " sleep 0.25; done" .fi .in - +.PP These datagrams are read by the example application, which can give the following output: .in +4n diff --git a/man2/remap_file_pages.2 b/man2/remap_file_pages.2 index cf45c45dd..06af8fad3 100644 --- a/man2/remap_file_pages.2 +++ b/man2/remap_file_pages.2 @@ -50,7 +50,7 @@ This change was made because the kernel code for this system call was complex, and it is believed to be little used or perhaps even completely unused. While it had some use cases in database applications on 32-bit systems, those use cases don't exist on 64-bit systems.
- +.PP The .BR remap_file_pages () system call is used to create a nonlinear mapping, that is, a mapping @@ -62,7 +62,7 @@ over using repeated calls to .BR mmap (2) is that the former approach does not require the kernel to create additional VMA (Virtual Memory Area) data structures. - +.PP To create a nonlinear mapping we perform the following steps: .TP 3 1. @@ -91,7 +91,7 @@ within the mapping: is a file offset in units of the system page size; .I size is the length of the region in bytes. - +.PP The .I addr argument serves two purposes. @@ -109,7 +109,7 @@ identified by and .I size will be placed. - +.PP The values specified in .I addr and @@ -123,11 +123,11 @@ to the nearest multiple of the page size. .\" This rounding is weird, and not consistent with the treatment of .\" the analogous arguments for munmap()/mprotect() and for mlock(). .\" MTK, 14 Sep 2005 - +.PP The .I prot argument must be specified as 0. - +.PP The .I flags argument has the same meaning as for diff --git a/man2/rename.2 b/man2/rename.2 index 1882b5720..7ea919672 100644 --- a/man2/rename.2 +++ b/man2/rename.2 @@ -83,10 +83,10 @@ are unaffected. Open file descriptors for .I oldpath are also unaffected. - +.PP Various restrictions determine whether or not the rename operation succeeds: see ERRORS below. - +.PP If .I newpath already exists, it will be atomically replaced, so that there is @@ -98,7 +98,7 @@ However, there will probably be a window in which both and .I newpath refer to the file being renamed. - +.PP If .I oldpath and @@ -106,7 +106,7 @@ and are existing hard links referring to the same file, then .BR rename () does nothing, and returns a success status. - +.PP If .I newpath exists but the operation fails for some reason, @@ -114,13 +114,13 @@ exists but the operation fails for some reason, guarantees to leave an instance of .I newpath in place. - +.PP .I oldpath can specify a directory. In this case, .I newpath must either not exist, or it must specify an empty directory. 
- +.PP If .I oldpath refers to a symbolic link, the link is renamed; if @@ -132,7 +132,7 @@ The system call operates in exactly the same way as .BR rename (), except for the differences described here. - +.PP If the pathname given in .I oldpath is relative, then it is interpreted relative to the directory @@ -142,7 +142,7 @@ referred to by the file descriptor the calling process, as is done by .BR rename () for a relative pathname). - +.PP If .I oldpath is relative and @@ -154,13 +154,13 @@ then is interpreted relative to the current working directory of the calling process (like .BR rename ()). - +.PP If .I oldpath is absolute, then .I olddirfd is ignored. - +.PP The interpretation of .I newpath is as for @@ -168,7 +168,7 @@ is as for except that a relative pathname is interpreted relative to the directory referred to by the file descriptor .IR newdirfd . - +.PP See .BR openat (2) for an explanation of the need for @@ -184,7 +184,7 @@ call with a zero .I flags argument is equivalent to .BR renameat (). - +.PP The .I flags argument is a bit mask consisting of zero or more of the following flags: @@ -205,7 +205,7 @@ of the rename. Return an error if .IR newpath already exists. - +.IP .B RENAME_NOREPLACE can't be employed together with .BR RENAME_EXCHANGE . @@ -215,14 +215,14 @@ can't be employed together with .\" commit 787fb6bc9682ec7c05fb5d9561b57100fbc1cc41 This operation makes sense only for overlay/union filesystem implementations. - +.IP Specifying .B RENAME_WHITEOUT creates a "whiteout" object at the source of the rename at the same time as performing the rename. The whole operation is atomic, so that if the rename succeeds then the whiteout will also have been created. - +.IP A "whiteout" is an object that has special meaning in union/overlay filesystem constructs. In these constructs, @@ -230,7 +230,7 @@ multiple layers exist and only the top one is ever modified. 
A whiteout on an upper layer will effectively hide a matching file in the lower layer, making it appear as if the file didn't exist. - +.IP When a file that exists on the lower layer is renamed, the file is first copied up (if not already on the upper layer) and then renamed on the upper, read-write layer. @@ -238,19 +238,19 @@ At the same time, the source file needs to be "whiteouted" (so that the version of the source file in the lower layer is rendered invisible). The whole operation needs to be done atomically. - +.IP When not part of a union/overlay, the whiteout appears as a character device with a {0,0} device number. - +.IP .B RENAME_WHITEOUT requires the same privileges as creating a device node (i.e., the .BR CAP_MKNOD capability). - +.IP .B RENAME_WHITEOUT can't be employed together with .BR RENAME_EXCHANGE . - +.IP .B RENAME_WHITEOUT requires support from the underlying filesystem. Among the filesystems that provide that support are @@ -477,17 +477,17 @@ capability. .BR renameat () was added to Linux in kernel 2.6.16; library support was added to glibc in version 2.4. - +.PP .BR renameat2 () was added to Linux in kernel 3.15. .\" FIXME . glibc support is pending. .SH CONFORMING TO .BR rename (): 4.3BSD, C89, C99, POSIX.1-2001, POSIX.1-2008. - +.PP .BR renameat (): POSIX.1-2008. - +.PP .BR renameat2 () is Linux-specific. .SH NOTES diff --git a/man2/request_key.2 b/man2/request_key.2 index fde40b597..44bcc358a 100644 --- a/man2/request_key.2 +++ b/man2/request_key.2 @@ -35,7 +35,7 @@ If the key is found or created, attaches it to the keyring whose ID is specified in .I dest_keyring and returns the key's serial number. - +.PP .BR request_key () first recursively searches for a matching key in all of the keyrings attached to the calling process. @@ -67,13 +67,13 @@ If the key is not found and .I callout is NULL, then the call fails with the error .BR ENOKEY . 
- +.PP If the key is not found and .I callout is not NULL, then the kernel attempts to invoke a user-space program to instantiate the key. The details are given below. - +.PP The .I dest_keyring serial number may be that of a valid keyring for which the caller has @@ -105,7 +105,7 @@ When the is specified as 0 and no key construction has been performed, then no additional linking is done. - +.PP Otherwise, if .I dest_keyring is 0 and a new key is constructed, the new key will be linked @@ -263,7 +263,7 @@ with a new session keyring that contains a link to the authorization key, V. .\" The request-key(8) program can be invoked in circumstances *other* than .\" when triggered by request_key(2). For example, upcalls from places such .\" as the DNS resolver. - +.IP This program is supplied with the following command-line arguments: .RS .IP [0] 4 @@ -358,7 +358,7 @@ The purpose of this negatively instantiated key is to prevent (that require expensive .BR request-key (8) upcalls) for a key that can't (at the moment) be positively instantiated. - +.PP Once the key has been instantiated, the authorization key .RB ( KEY_SPEC_REQKEY_AUTH_KEY ) is revoked, and the destination keyring @@ -366,7 +366,7 @@ is revoked, and the destination keyring is no longer accessible from the .BR request-key (8) program. - +.PP If a key is created, then\(emregardless of whether it is a valid key or a negatively instantiated key\(emit will displace any other key with the same type and description from the keyring specified in @@ -455,11 +455,11 @@ and arguments for the system call are taken from the values supplied in the command-line arguments. The call specifies the session keyring as the target keyring. - +.PP In order to demonstrate this program, we first create a suitable entry in the file .IR /etc/request-key.conf . 
- +.PP .in +4n .nf $ sudo sh @@ -468,7 +468,7 @@ $ sudo sh # \fBexit\fP .fi .in - +.PP This entry specifies that when a new "user" key with the prefix "mtk:" must be instantiated, that task should be performed via the .BR keyctl (1) @@ -493,11 +493,11 @@ See for details of these .I % specifiers. - +.PP Then we run the program and check the contents of .IR /proc/keys to verify that the requested key has been instantiated: - +.PP .in +4n .nf $ \fB./t_request_key user mtk:key1 "Payload data"\fP @@ -505,7 +505,7 @@ $ \fBgrep \(aq2dddaf50\(aq /proc/keys\fP 2dddaf50 I--Q--- 1 perm 3f010000 1000 1000 user mtk:key1: 12 .fi .in - +.PP For another example of the use of this program, see .BR keyctl (2). .SS Program source @@ -559,7 +559,7 @@ main(int argc, char *argv[]) .BR user\-keyring (7), .BR user\-session\-keyring (7), .BR request\-key (8) - +.PP The kernel source files .IR Documentation/security/keys.txt and diff --git a/man2/restart_syscall.2 b/man2/restart_syscall.2 index 04da60ee1..a77cedda8 100644 --- a/man2/restart_syscall.2 +++ b/man2/restart_syscall.2 @@ -50,7 +50,7 @@ is later resumed after receiving a .BR SIGCONT signal. This system call is designed only for internal use by the kernel. - +.PP .BR restart_syscall () is used for restarting only those system calls that, when restarted, should adjust their time-related parameters\(emnamely @@ -103,7 +103,7 @@ This system call is Linux-specific. There is no glibc wrapper for this system call, because it is intended for use only by the kernel and should never be called by applications. - +.PP The kernel uses .BR restart_syscall () to ensure that when a system call is restarted @@ -125,7 +125,7 @@ Notable examples of system calls that suffer this problem are .BR select (2), and .BR pselect (2). 
- +.PP From user space, the operation of .BR restart_syscall () is largely invisible: diff --git a/man2/rt_sigqueueinfo.2 b/man2/rt_sigqueueinfo.2 index a7bd7d30b..3932c51bc 100644 --- a/man2/rt_sigqueueinfo.2 +++ b/man2/rt_sigqueueinfo.2 @@ -47,13 +47,13 @@ by establishing a signal handler with the .BR sigaction (2) .B SA_SIGINFO flag. - +.PP These system calls are not intended for direct application use; they are provided to allow the implementation of .BR sigqueue (3) and .BR pthread_sigqueue (3). - +.PP The .BR rt_sigqueueinfo () system call sends the signal @@ -65,7 +65,7 @@ to the thread group with the ID corresponds to the traditional UNIX process ID.) The signal will be delivered to an arbitrary member of the thread group (i.e., one of the threads that is not currently blocking the signal). - +.PP The .I uinfo argument specifies the data to accompany the signal. @@ -114,7 +114,7 @@ field to the value specified in .IR sig , so that the receiver of the signal can also obtain the signal number via that field. - +.PP The .BR rt_tgsigqueueinfo () system call is like @@ -180,7 +180,7 @@ Since these system calls are not intended for application use, there are no glibc wrapper functions; use .BR syscall (2) in the unlikely case that you want to call them directly. - +.PP As with .BR kill (2), the null signal (0) can be used to check if the specified process diff --git a/man2/s390_pci_mmio_write.2 b/man2/s390_pci_mmio_write.2 index 93e98be81..e0cb1ef15 100644 --- a/man2/s390_pci_mmio_write.2 +++ b/man2/s390_pci_mmio_write.2 @@ -54,7 +54,7 @@ data from the PCI MMIO memory location specified by .IR mmio_addr to the user-space buffer .IR user_buffer . - +.PP These system calls must be used instead of the simple assignment or data-transfer operations that are used to access the PCI MMIO memory areas mapped to user space on the Linux System z platform. 
diff --git a/man2/s390_runtime_instr.2 b/man2/s390_runtime_instr.2 index 2ef271d68..76d1503cc 100644 --- a/man2/s390_runtime_instr.2 +++ b/man2/s390_runtime_instr.2 @@ -36,7 +36,7 @@ The .BR s390_runtime_instr () system call starts or stops CPU run-time instrumentation for the calling thread. - +.PP The .IR command argument controls whether run-time instrumentation is started @@ -44,7 +44,7 @@ argument controls whether run-time instrumentation is started 1) or stopped .RB ( S390_RUNTIME_INSTR_STOP , 2) for the calling thread. - +.PP The .IR signum argument specifies the number of a real-time signal. diff --git a/man2/sched_get_priority_max.2 b/man2/sched_get_priority_max.2 index 81afb0527..f2c955114 100644 --- a/man2/sched_get_priority_max.2 +++ b/man2/sched_get_priority_max.2 @@ -56,7 +56,7 @@ and .BR SCHED_DEADLINE . Further details about these policies can be found in .BR sched (7). - +.PP Processes with numerically higher priority values are scheduled before processes with numerically lower priority values. Thus, the value @@ -65,7 +65,7 @@ returned by will be greater than the value returned by .BR sched_get_priority_min (). - +.PP Linux allows the static priority range 1 to 99 for the .B SCHED_FIFO and @@ -73,7 +73,7 @@ and policies, and the priority 0 for the remaining policies. Scheduling priority ranges for the various policies are not alterable. - +.PP The range of scheduling priorities may vary on other POSIX systems, thus it is a good idea for portable applications to use a virtual priority range and map it to the interval given by @@ -86,7 +86,7 @@ a spread of at least 32 between the maximum and the minimum values for .B SCHED_FIFO and .BR SCHED_RR . 
- +.PP POSIX systems on which .BR sched_get_priority_max () and diff --git a/man2/sched_rr_get_interval.2 b/man2/sched_rr_get_interval.2 index d15fc7c31..239263bf8 100644 --- a/man2/sched_rr_get_interval.2 +++ b/man2/sched_rr_get_interval.2 @@ -44,11 +44,11 @@ the round-robin time quantum for the process identified by The specified process should be running under the .B SCHED_RR scheduling policy. - +.PP The .I timespec structure has the following form: - +.PP .in +4n .nf struct timespec { @@ -57,7 +57,7 @@ struct timespec { }; .fi .in - +.PP If .I pid is zero, the time quantum for the calling process is written into @@ -114,7 +114,7 @@ quantum has varied somewhat across kernel versions. This method of adjusting the quantum was removed .\" commit a4ec24b48ddef1e93f7578be53270f0b95ad666c starting with Linux 2.6.24. - +.PP Linux 3.9 added .\" commit ce0dbbbb30aee6a835511d5be446462388ba9eee a new mechanism for adjusting (and viewing) the diff --git a/man2/sched_setaffinity.2 b/man2/sched_setaffinity.2 index f02a5c75a..2d9b3c5ad 100644 --- a/man2/sched_setaffinity.2 +++ b/man2/sched_setaffinity.2 @@ -60,14 +60,14 @@ Restricting a thread to run on a single CPU also avoids the performance cost caused by the cache invalidation that occurs when a thread ceases to execute on one CPU and then recommences execution on a different CPU. - +.PP A CPU affinity mask is represented by the .I cpu_set_t structure, a "CPU set", pointed to by .IR mask . A set of macros for manipulating CPU sets is described in .BR CPU_SET (3). - +.PP .BR sched_setaffinity () sets the CPU affinity mask of the thread whose ID is .I pid @@ -82,14 +82,14 @@ is the length (in bytes) of the data pointed to by .IR mask . Normally this argument would be specified as .IR "sizeof(cpu_set_t)" . - +.PP If the thread specified by .I pid is not currently running on one of the CPUs specified in .IR mask , then that thread is migrated to one of the CPUs specified in .IR mask . 
- +.PP .BR sched_getaffinity () writes the affinity mask of the thread whose ID is .I pid @@ -174,7 +174,7 @@ runs if the "cpuset" mechanism described in is being used. These restrictions on the actual set of CPUs on which the thread will run are silently imposed by the kernel. - +.PP There are various ways of determining the number of CPUs available on the system, including: inspecting the contents of .IR /proc/cpuinfo ; @@ -186,7 +186,7 @@ and .BR _SC_NPROCESSORS_ONLN parameters; and inspecting the list of CPU directories under .IR /sys/devices/system/cpu/ . - +.PP .BR sched (7) has a description of the Linux scheduling scheme. .PP @@ -206,7 +206,7 @@ will set the attribute for the main thread of the thread group. .BR pthread_setaffinity_np (3) instead of .BR sched_setaffinity ().) - +.PP The .I isolcpus boot option can be used to isolate one or more CPUs at boot time, @@ -224,7 +224,7 @@ As noted in that file, is the preferred mechanism of isolating CPUs (versus the alternative of manually setting the CPU affinity of all processes on the system). - +.PP A child created via .BR fork (2) inherits its parent's CPU affinity mask. @@ -256,9 +256,9 @@ meaning that the maximum CPU number that can be represented is 1023. .\" and https://sourceware.org/ml/libc-alpha/2013-07/msg00288.html If the kernel CPU affinity mask is larger than 1024, then calls of the form: - +.PP sched_getaffinity(pid, sizeof(cpu_set_t), &mask); - +.PP will fail with the error .BR EINVAL , the error produced by the underlying system call for the case where the @@ -279,7 +279,7 @@ of the required mask using .BR sched_getaffinity () calls with increasing mask sizes (until the call does not fail with the error .BR EINVAL ). 
- +.PP Be aware that .BR CPU_ALLOC (3) may allocate a slightly larger CPU set than requested @@ -303,16 +303,16 @@ The program takes three command-line arguments: the CPU number for the parent, the CPU number for the child, and the number of loop iterations that both processes should perform. - +.PP As the sample runs below demonstrate, the amount of real and CPU time consumed when running the program will depend on intra-core caching effects and whether the processes are using the same CPU. - +.PP We first employ .BR lscpu (1) to determine that this (x86) system has two cores, each with two CPUs: - +.PP .in +4n .nf $ \fBlscpu | grep -i 'core.*:|socket'\fP @@ -321,12 +321,12 @@ Core(s) per socket: 2 Socket(s): 1 .fi .in - +.PP We then time the operation of the example program for three cases: both processes running on the same CPU; both processes running on different CPUs on the same core; and both processes running on different CPUs on different cores. - +.PP .in +4n .nf $ \fBtime \-p ./a.out 0 0 100000000\fP diff --git a/man2/sched_setattr.2 b/man2/sched_setattr.2 index ea08ab08a..92e86eb4f 100644 --- a/man2/sched_setattr.2 +++ b/man2/sched_setattr.2 @@ -49,7 +49,7 @@ If .I pid equals zero, the scheduling policy and attributes of the calling thread will be set. - +.PP Currently, Linux supports the following "normal" (i.e., non-real-time) scheduling policies as values that may be specified in .IR policy : @@ -94,7 +94,7 @@ The argument is a pointer to a structure that defines the new scheduling policy and attributes for the specified thread. This structure has the following form: - +.PP .in +4n .nf struct sched_attr { @@ -112,7 +112,7 @@ struct sched_attr { }; .fi .in - +.PP The fields of this structure are as follows: .TP .B size @@ -218,7 +218,7 @@ If equals zero, the scheduling policy and attributes of the calling thread will be retrieved. 
- +.PP The .I size argument should be set to the size of the @@ -228,7 +228,7 @@ The value must be at least as large as the size of the initially published .I sched_attr structure, or the call fails with the error .BR EINVAL . - +.PP The retrieved scheduling attributes are placed in the fields of the .I sched_attr structure pointed to by @@ -238,7 +238,7 @@ The kernel sets to the size of its .I sched_attr structure. - +.PP If the caller-provided .I attr buffer is larger than the kernel's @@ -254,7 +254,7 @@ fails with the error As with .BR sched_setattr (), these semantics allow for future extensibility of the interface. - +.PP The .I flags argument is provided to allow for future extensions to the interface; diff --git a/man2/sched_setparam.2 b/man2/sched_setparam.2 index 861ecb192..ab83900c8 100644 --- a/man2/sched_setparam.2 +++ b/man2/sched_setparam.2 @@ -57,13 +57,13 @@ policy of the process identified by See .BR sched (7) for a description of the scheduling policies supported under Linux. - +.PP .BR sched_getparam () retrieves the scheduling parameters for the process identified by \fIpid\fP. If \fIpid\fP is zero, then the parameters of the calling process are retrieved. - +.PP .BR sched_setparam () checks the validity of \fIparam\fP for the scheduling policy of the thread. @@ -72,11 +72,11 @@ range given by .BR sched_get_priority_min (2) and .BR sched_get_priority_max (2). - +.PP For a discussion of the privileges and resource limits related to scheduling priority and policy, see .BR sched (7). - +.PP POSIX systems on which .BR sched_setparam () and diff --git a/man2/sched_setscheduler.2 b/man2/sched_setscheduler.2 index 7a6a8314e..9c866597e 100644 --- a/man2/sched_setscheduler.2 +++ b/man2/sched_setscheduler.2 @@ -45,11 +45,11 @@ sets both the scheduling policy and parameters for the thread whose ID is specified in \fIpid\fP. If \fIpid\fP equals zero, the scheduling policy and parameters of the calling thread will be set. 
- +.PP The scheduling parameters are specified in the .I param argument, which is a pointer to a structure of the following form: - +.PP .nf .in +4n struct sched_param { @@ -59,13 +59,13 @@ struct sched_param { }; .in .fi - +.PP In the current implementation, the structure contains only one field, .IR sched_priority . The interpretation of .I param depends on the selected policy. - +.PP Currently, Linux supports the following "normal" (i.e., non-real-time) scheduling policies as values that may be specified in .IR policy : @@ -86,7 +86,7 @@ low priority background jobs. For each of the above policies, .IR param\->sched_priority must be 0. - +.PP Various "real-time" policies are also supported, for special time-critical applications that need precise control over the way in which runnable threads are selected for execution. @@ -112,7 +112,7 @@ and with the specified .IR policy . On Linux, these system calls return, respectively, 1 and 99. - +.PP Since Linux 2.6.32, the .B SCHED_RESET_ON_FORK flag can be ORed in @@ -125,7 +125,7 @@ do not inherit privileged scheduling policies. See .BR sched (7) for details. - +.PP .BR sched_getscheduler () returns the current scheduling policy of the thread identified by \fIpid\fP. @@ -178,7 +178,7 @@ That page also describes an additional policy, .BR SCHED_DEADLINE , which is settable only via .BR sched_setattr (2). - +.PP POSIX systems on which .BR sched_setscheduler () and @@ -186,7 +186,7 @@ and are available define .B _POSIX_PRIORITY_SCHEDULING in \fI<unistd.h>\fP. - +.PP POSIX.1 does not detail the permissions that an unprivileged thread requires in order to call .BR sched_setscheduler (), diff --git a/man2/sched_yield.2 b/man2/sched_yield.2 index 3b3701932..2851e6054 100644 --- a/man2/sched_yield.2 +++ b/man2/sched_yield.2 @@ -56,14 +56,14 @@ If the calling thread is the only thread in the highest priority list at that time, it will continue to run after a call to .BR sched_yield ().
- +.PP POSIX systems on which .BR sched_yield () is available define .B _POSIX_PRIORITY_SCHEDULING in .IR <unistd.h> . - +.PP Strategic calls to .BR sched_yield () can improve performance by giving other threads or processes diff --git a/man2/seccomp.2 b/man2/seccomp.2 index caaff6c2f..55d9b65eb 100644 --- a/man2/seccomp.2 +++ b/man2/seccomp.2 @@ -45,7 +45,7 @@ The .BR seccomp () system call operates on the Secure Computing (seccomp) state of the calling process. - +.PP Currently, Linux supports the following .IR operation values: @@ -65,7 +65,7 @@ signal. Strict secure computing mode is useful for number-crunching applications that may need to execute untrusted byte code, perhaps obtained by reading from a pipe or socket. - +.IP Note that although the calling thread can no longer call .BR sigprocmask (2), it can use @@ -92,19 +92,19 @@ or by using .BR setrlimit (2) to set the hard limit for .BR RLIMIT_CPU . - +.IP This operation is available only if the kernel is configured with .BR CONFIG_SECCOMP enabled. - +.IP The value of .IR flags must be 0, and .IR args must be NULL. - +.IP This operation is functionally identical to the call: - +.IP prctl(PR_SET_SECCOMP, SECCOMP_MODE_STRICT); .TP .BR SECCOMP_SET_MODE_FILTER @@ -121,7 +121,7 @@ fails, returning .BR EINVAL in .IR errno . - +.IP If .BR fork (2) or @@ -133,7 +133,7 @@ If is allowed, the existing filters will be preserved across a call to .BR execve (2). - +.IP In order to use the .BR SECCOMP_SET_MODE_FILTER operation, either the caller must have the @@ -143,9 +143,9 @@ capability in its user namespace, or the thread must already have the bit set. If that bit was not already set by an ancestor of this thread, the thread must make the following call: - +.IP prctl(PR_SET_NO_NEW_PRIVS, 1); - +.IP Otherwise, the .BR SECCOMP_SET_MODE_FILTER operation will fail and return @@ -164,7 +164,7 @@ return 0 without actually making the system call.
Thus, the program might be tricked into retaining superuser privileges in circumstances where it is possible to influence it to do dangerous things because it did not actually drop privileges.) - +.IP If .BR prctl (2) or @@ -172,19 +172,19 @@ or is allowed by the attached filter, further filters may be added. This will increase evaluation time, but allows for further reduction of the attack surface during execution of a thread. - +.IP The .BR SECCOMP_SET_MODE_FILTER operation is available only if the kernel is configured with .BR CONFIG_SECCOMP_FILTER enabled. - +.IP When .IR flags is 0, this operation is functionally identical to the call: - +.IP prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, args); - +.IP The recognized .IR flags are: @@ -197,7 +197,7 @@ A "filter tree" is the ordered list of filters attached to a thread. (Attaching identical filters in separate .BR seccomp () calls results in different filters from this perspective.) - +.IP If any thread cannot synchronize to the same filter tree, the call will not attach the new seccomp filter, and will fail, returning the first thread ID found that cannot synchronize. @@ -211,7 +211,7 @@ When adding filters via .BR SECCOMP_SET_MODE_FILTER , .IR args points to a filter program: - +.PP .in +4n .nf struct sock_fprog { @@ -221,9 +221,9 @@ struct sock_fprog { }; .fi .in - +.PP Each program must contain one or more BPF instructions: - +.PP .in +4n .nf struct sock_filter { /* Filter block */ @@ -234,7 +234,7 @@ struct sock_filter { /* Filter block */ }; .fi .in - +.PP When executing the instructions, the BPF program operates on the system call information made available (i.e., use the .BR BPF_ABS @@ -245,7 +245,7 @@ addressing mode) as a (read-only) .\" that would need to use ptrace to catch the call an directly .\" modify the registers before continuing with the call. 
buffer of the following form: - +.PP .in +4n .nf struct seccomp_data { @@ -257,14 +257,14 @@ struct seccomp_data { }; .fi .in - +.PP Because numbering of system calls varies between architectures and some architectures (e.g., x86-64) allow user-space code to use the calling conventions of multiple architectures, it is usually necessary to verify the value of the .IR arch field. - +.PP It is strongly recommended to use a whitelisting approach whenever possible because such an approach is more robust and simple. A blacklist will have to be updated whenever a potentially @@ -272,7 +272,7 @@ dangerous system call is added (or a dangerous flag or option if those are blacklisted), and it is often possible to alter the representation of a value without altering its meaning, leading to a blacklist bypass. - +.PP The .IR arch field is not unique for all calling conventions. @@ -294,7 +294,7 @@ is used on the system call number to tell the two ABIs apart. .\" will have a value that is not all-ones, and this will trigger .\" an extra instruction in system_call to mask off the extra bit, .\" so that the syscall table indexing still works. - +.PP This means that in order to create a seccomp-based blacklist for system calls performed through the x86-64 ABI, it is necessary to not only check that @@ -305,7 +305,7 @@ but also to explicitly reject all system calls that contain .BR __X32_SYSCALL_BIT in .IR nr . - +.PP The .I instruction_pointer field provides the address of the machine-language instruction that @@ -319,7 +319,7 @@ made the system call. and .BR mprotect (2) system calls to prevent the program from subverting such checks.) - +.PP When checking values from .IR args against a blacklist, keep in mind that arguments are often @@ -333,7 +333,7 @@ a system call that takes an argument of type .IR int , the more-significant half of the argument register is ignored by the system call, but visible in the seccomp data. 
- +.PP A seccomp filter returns a 32-bit value consisting of two parts: the most significant 16 bits (corresponding to the mask defined by the constant @@ -342,7 +342,7 @@ contain one of the "action" values listed below; the least significant 16-bits (defined by the constant .BR SECCOMP_RET_DATA ) are "data" to be associated with this return value. - +.PP If multiple filters exist, they are \fIall\fP executed, in reverse order of their addition to the filter tree\(emthat is, the most recently installed filter is executed first. @@ -366,7 +366,7 @@ The return value for the evaluation of a given system call is the first-seen .BR SECCOMP_RET_ACTION value of highest precedence (along with its accompanying data) returned by execution of all of the filters. - +.PP In decreasing order of precedence, the values that may be returned by a seccomp filter are: .TP @@ -382,7 +382,7 @@ signal Even if a signal handler has been registered and otherwise catches .BR SIGSYS , the handler will be ignored in this case and the task always terminates. - +.IP .\" See these commits: .\" seccomp: dump core when using SECCOMP_RET_KILL (b25e67161c295c98acda92123b2dd1e7d8642901) .\" seccomp: Only dump core when single-threaded (d7276e321ff8a53106a59c85ca46d03e34288893) @@ -455,7 +455,7 @@ the system call is not executed and returns a failure status with .I errno set to .BR ENOSYS . - +.IP A tracer will be notified if it requests .BR PTRACE_O_TRACESECCOMP using @@ -466,14 +466,14 @@ and the .BR SECCOMP_RET_DATA portion of the filter's return value will be available to the tracer via .BR PTRACE_GETEVENTMSG . - +.IP The tracer can skip the system call by changing the system call number to \-1. Alternatively, the tracer can change the system call requested by changing the system call to a valid system call number. If the tracer asks to skip the system call, then the system call will appear to return the value that the tracer puts in the return value register. 
- +.IP .\" This was changed in ce6526e8afa4. .\" A related hole, using PTRACE_SYSCALL instead of SECCOMP_RET_TRACE, was .\" changed in arch-specific commits, e.g. 93e35efb8de4 for X86 and @@ -591,21 +591,21 @@ Rather than hand-coding seccomp filters as shown in the example below, you may prefer to employ the .I libseccomp library, which provides a front-end for generating seccomp filters. - +.PP The .IR Seccomp field of the .IR /proc/[pid]/status file provides a method of viewing the seccomp mode of a process; see .BR proc (5). - +.PP .BR seccomp () provides a superset of the functionality provided by the .BR prctl (2) .BR PR_SET_SECCOMP operation (which does not support .IR flags ). - +.PP Since Linux 4.4, the .BR prctl (2) .B PTRACE_SECCOMP_GET_FILTER @@ -658,11 +658,11 @@ that the example program should attempt to execute using .BR execve (2) system call). Some example runs of the program are shown below. - +.PP First, we display the architecture that we are running on (x86-64) and then construct a shell function that looks up system call numbers on this architecture: - +.PP .nf .in +4n $ \fBuname -m\fP @@ -673,25 +673,25 @@ $ \fBsyscall_nr() { }\fP .in .fi - +.PP When the BPF filter rejects a system call (case [2] above), it causes the system call to fail with the error number specified on the command line. 
In the experiments shown here, we'll use error number 99: - +.PP .nf .in +4n $ \fBerrno 99\fP EADDRNOTAVAIL 99 Cannot assign requested address .in .fi - +.PP In the following example, we attempt to run the command .BR whoami (1), but the BPF filter rejects the .BR execve (2) system call, so that the command is not even executed: - +.PP .nf .in +4n $ \fBsyscall_nr execve\fP @@ -704,13 +704,13 @@ $ \fB./a.out 59 0xC000003E 99 /bin/whoami\fP execv: Cannot assign requested address .in .fi - +.PP In the next example, the BPF filter rejects the .BR write (2) system call, so that, although it is successfully started, the .BR whoami (1) command is not able to write output: - +.PP .nf .in +4n $ \fBsyscall_nr write\fP @@ -718,12 +718,12 @@ $ \fBsyscall_nr write\fP $ \fB./a.out 1 0xC000003E 99 /bin/whoami\fP .in .fi - +.PP In the final example, the BPF filter rejects a system call that is not used by the .BR whoami (1) command, so it is able to successfully execute and produce output: - +.PP .nf .in +4n $ \fBsyscall_nr preadv\fP diff --git a/man2/select.2 b/man2/select.2 index 4c4cc9e68..0b1a1314e 100644 --- a/man2/select.2 +++ b/man2/select.2 @@ -144,13 +144,13 @@ will be watched for exceptional conditions. .B POLLPRI in .BR poll (2).) - +.PP On exit, each of the file descriptor sets is modified in place to indicate which file descriptors actually changed status. (Thus, if using .BR select () within a loop, the sets must be reinitialized before each call.) - +.PP Each of the three file descriptor sets may be specified as NULL if no file descriptors are to be watched for the corresponding class of events. @@ -260,7 +260,7 @@ avoiding the race.) The time structures involved are defined in .I and look like - +.PP .in +4n .nf struct timeval { @@ -269,9 +269,9 @@ struct timeval { }; .fi .in - +.PP and - +.PP .in +4n .nf struct timespec { @@ -280,7 +280,7 @@ struct timespec { }; .fi .in - +.PP (However, see below on the POSIX.1 versions.) 
.PP Some code calls @@ -395,7 +395,7 @@ in undefined behavior. Moreover, POSIX requires .I fd to be a valid file descriptor. - +.PP On some other UNIX systems, .\" Darwin, according to a report by Jeremy Sequoia, relayed by Josh Triplett .BR select () @@ -412,7 +412,7 @@ Portable programs may wish to check for .B EAGAIN and loop, just as with .BR EINTR . - +.PP On systems that lack .BR pselect (), reliable (and more portable) signal trapping can be achieved @@ -425,7 +425,7 @@ in the main program. (To avoid possibly blocking when writing to a pipe that may be full or reading from a pipe that may be empty, nonblocking I/O is used when reading from and writing to the pipe.) - +.PP Concerning the types involved, the classical situation is that the two fields of a .I timeval @@ -434,7 +434,7 @@ structure are typed as (as shown above), and the structure is defined in .IR . The POSIX.1 situation is - +.PP .in +4n .nf struct timeval { @@ -443,7 +443,7 @@ struct timeval { }; .fi .in - +.PP where the structure is defined in .I and the data types @@ -464,7 +464,7 @@ for .BR select () and .BR pselect (). - +.PP Under glibc 2.0, .I gives the wrong prototype for @@ -486,7 +486,7 @@ and the event notifications provided by .BR poll (2) (and .BR epoll (7)): - +.PP .nf .in +4n #define POLLIN_SET (POLLRDNORM | POLLRDBAND | POLLIN | POLLHUP | @@ -524,7 +524,7 @@ However, in the glibc implementation, the .IR fd_set type is fixed in size. See also BUGS. - +.PP The .BR pselect () interface described in this page is implemented by glibc. @@ -532,7 +532,7 @@ The underlying Linux system call is named .BR pselect6 (). This system call has somewhat different behavior from the glibc wrapper function. - +.PP The Linux .BR pselect6 () system call modifies its @@ -547,7 +547,7 @@ function does not modify its .I timeout argument; this is the behavior required by POSIX.1-2001. 
- +.PP The final argument of the .BR pselect6 () system call is not a @@ -588,7 +588,7 @@ macros operating according to that limit. To monitor file descriptors greater than 1023, use .BR poll (2) instead. - +.PP According to POSIX, .BR select () should check all specified file descriptors in the three file descriptor sets, @@ -600,13 +600,13 @@ that the process currently has open. According to POSIX, any such file descriptor that is specified in one of the sets should result in the error .BR EBADF . - +.PP Glibc 2.0 provided a version of .BR pselect () that did not take a .I sigmask argument. - +.PP Starting with version 2.1, glibc provided an emulation of .BR pselect () that was implemented using @@ -619,7 +619,7 @@ was designed to prevent. Modern versions of glibc use the (race-free) .BR pselect () system call on kernels where it is provided. - +.PP Under Linux, .BR select () may report a socket file descriptor as "ready for reading", while @@ -635,7 +635,7 @@ Thus it may be safer to use .B O_NONBLOCK on sockets that should not block. .\" Maybe the kernel should have returned EIO in such a situation? - +.PP On Linux, .BR select () also modifies @@ -701,6 +701,6 @@ main(void) .BR write (2), .BR epoll (7), .BR time (7) - +.PP For a tutorial with discussion and examples, see .BR select_tut (2). diff --git a/man2/select_tut.2 b/man2/select_tut.2 index 967fd9b5d..5dfcea8df 100644 --- a/man2/select_tut.2 +++ b/man2/select_tut.2 @@ -76,7 +76,7 @@ is used to efficiently monitor multiple file descriptors, to see if any of them is, or becomes, "ready"; that is, to see whether I/O becomes possible, or an "exceptional condition" has occurred on any of the file descriptors. - +.PP Its principal arguments are three "sets" of file descriptors: \fIreadfds\fP, \fIwritefds\fP, and \fIexceptfds\fP. Each set is declared as type @@ -487,13 +487,13 @@ On success, .BR select () returns the total number of file descriptors still present in the file descriptor sets. 
- +.PP If .BR select () timed out, then the return value will be zero. The file descriptors set should be all empty (but may not be on some systems). - +.PP A return value of \-1 indicates an error, with \fIerrno\fP being set appropriately. In the case of an error, the contents of the returned sets and @@ -811,7 +811,7 @@ Another idea is to set nonblocking I/O using .BR fcntl (2). This also has its problems because you end up using inefficient timeouts. - +.PP The program does not handle more than one simultaneous connection at a time, although it could easily be extended to do this with a linked list of buffers\(emone for each connection. diff --git a/man2/semctl.2 b/man2/semctl.2 index 6316787d1..057c286d2 100644 --- a/man2/semctl.2 +++ b/man2/semctl.2 @@ -65,7 +65,7 @@ This function has three or four arguments, depending on When there are four, the fourth has the type .IR "union semun" . The \fIcalling program\fP must define this union as follows: - +.PP .nf .in +4n union semun { @@ -341,7 +341,7 @@ returns \-1 with .I errno indicating the error. - +.PP Otherwise, the system call returns a nonnegative value depending on .I cmd as follows: @@ -459,7 +459,7 @@ or greater than the implementation limit .SH CONFORMING TO POSIX.1-2001, POSIX.1-2008, SVr4. .\" SVr4 documents more error conditions EINVAL and EOVERFLOW. - +.PP POSIX.1 specifies the .\" POSIX.1-2001, POSIX.1-2008 .I sem_nsems @@ -484,7 +484,7 @@ Applications intended to be portable to such old systems may need to include these header files. .\" Like Linux, the FreeBSD man pages still document .\" the inclusion of these header files. - +.PP The .BR IPC_INFO , .B SEM_STAT @@ -542,7 +542,7 @@ and explicitly notes that this value is set by a successful call, with the implication that no other interface affects the .I sempid value. - +.PP While some implementations conform to the behavior specified in POSIX.1, others do not. 
(The fault here probably lies with POSIX.1 inasmuch as it likely failed @@ -560,7 +560,7 @@ on process termination as a consequence of the use of the .B SEM_UNDO flag (see .BR semop (2)). - +.PP Linux also updates .I sempid for diff --git a/man2/semget.2 b/man2/semget.2 index 30cf2fef9..4f4c45bde 100644 --- a/man2/semget.2 +++ b/man2/semget.2 @@ -224,7 +224,7 @@ Applications intended to be portable to such old systems may need to include these header files. .\" Like Linux, the FreeBSD man pages still document .\" the inclusion of these header files. - +.PP .B IPC_PRIVATE isn't a flag field but a .I key_t @@ -247,7 +247,7 @@ it should explicitly initialize the semaphores to the desired values. .\" In truth, every one of the many implementations that I've tested sets .\" the values to zero, but I suppose there is/was some obscure .\" implementation out there that does not. - +.PP Initialization can be done using .BR semctl (2) .B SETVAL diff --git a/man2/semop.2 b/man2/semop.2 index 677a0d465..a669c5aaf 100644 --- a/man2/semop.2 +++ b/man2/semop.2 @@ -113,7 +113,7 @@ performed immediately depends on the presence of the flag in the individual .I sem_flg fields, as noted below. - +.PP Each operation is performed on the .IR sem_num \-th semaphore of the semaphore set, where the first semaphore of the set @@ -288,7 +288,7 @@ then .BR semtimedop () behaves exactly like .BR semop (). - +.PP Note that if .BR semtimedop () is interrupted by a signal, causing the call to fail with the error @@ -400,7 +400,7 @@ Applications intended to be portable to such old systems may need to include these header files. .\" Like Linux, the FreeBSD man pages still document .\" the inclusion of these header files. - +.PP The .I sem_undo structures of a process aren't inherited by the child produced by @@ -414,7 +414,7 @@ is never automatically restarted after being interrupted by a signal handler, regardless of the setting of the .B SA_RESTART flag when establishing a signal handler. 
- +.PP A semaphore adjustment .RI ( semadj ) value is a per-process, per-semaphore integer that is the negated sum @@ -447,7 +447,7 @@ flag allows more than one process to share a list; see .BR clone (2) for details. - +.PP The \fIsemval\fP, \fIsempid\fP, \fIsemzcnt\fP, and \fIsemncnt\fP values for a semaphore can all be retrieved using appropriate .BR semctl (2) @@ -509,7 +509,7 @@ is specified for a semaphore operation). Linux adopts a third approach: decreasing the semaphore value as far as possible (i.e., to zero) and allowing process termination to proceed immediately. - +.PP In kernels 2.6.x, x <= 10, there is a bug that in some circumstances prevents a thread that is waiting for a semaphore value to become zero from being woken up when the value does actually become zero. diff --git a/man2/send.2 b/man2/send.2 index 70d07cff7..286c1eba8 100644 --- a/man2/send.2 +++ b/man2/send.2 @@ -81,11 +81,11 @@ argument, is equivalent to .BR write (2). Also, the following call - +.PP send(sockfd, buf, len, flags); - +.PP is equivalent to - +.PP sendto(sockfd, buf, len, flags, NULL, 0); .PP The argument @@ -219,7 +219,7 @@ as the socket option (see .BR tcp (7)), with the difference that this flag can be set on a per-call basis. - +.IP Since Linux 2.6, this flag is also supported for UDP sockets, and informs the kernel to package all of the data sent in calls with this flag set into a single datagram which is transmitted only when a call is performed @@ -285,14 +285,14 @@ It points to a buffer containing the address; the field should be set to the size of the address. For a connected socket, these fields should be specified as NULL and 0, respectively. - +.PP The .I msg_iov and .I msg_iovlen fields specify scatter-gather locations, as for .BR writev (2). - +.PP You may send control information using the .I msg_control and @@ -303,7 +303,7 @@ per socket by the value in .IR /proc/sys/net/core/optmem_max ; see .BR socket (7). - +.PP The .I msg_flags field is ignored.
@@ -445,7 +445,7 @@ but glibc currently types it as .\" The problem is an underlying kernel issue: the size of the .\" __kernel_size_t type used to type this field varies .\" across architectures, but socklen_t is always 32 bits. - +.PP See .BR sendmmsg (2) for information about a Linux-specific system call diff --git a/man2/sendfile.2 b/man2/sendfile.2 index fc0c47806..0bf66ce2a 100644 --- a/man2/sendfile.2 +++ b/man2/sendfile.2 @@ -54,12 +54,12 @@ is more efficient than the combination of and .BR write (2), which would require transferring data to and from user space. - +.PP .I in_fd should be a file descriptor opened for reading and .I out_fd should be a descriptor opened for writing. - +.PP If .I offset is not NULL, then it points @@ -80,24 +80,24 @@ does not modify the file offset of otherwise the file offset is adjusted to reflect the number of bytes read from .IR in_fd . - +.PP If .I offset is NULL, then data will be read from .IR in_fd starting at the file offset, and the file offset will be updated by the call. - +.PP .I count is the number of bytes to copy between the file descriptors. - +.PP The .IR in_fd argument must correspond to a file which supports .BR mmap (2)-like operations (i.e., it cannot be a socket). - +.PP In Linux kernels before 2.6.33, .I out_fd must refer to a socket. @@ -114,7 +114,7 @@ Note that a successful call to may write fewer bytes than requested; the caller should be prepared to retry the call if there were unsent bytes. See also NOTES. - +.PP On error, \-1 is returned, and .I errno is set appropriately. @@ -174,7 +174,7 @@ The include file is present since glibc 2.1. .SH CONFORMING TO Not specified in POSIX.1-2001, nor in other standards. - +.PP Other UNIX systems implement .BR sendfile () with different semantics and prototypes. @@ -185,7 +185,7 @@ will transfer at most 0x7ffff000 (2,147,479,552) bytes, returning the number of bytes actually transferred. 
.\" commit e28cc71572da38a5a12c1cfe4d7032017adccf69 (This is true on both 32-bit and 64-bit systems.) - +.PP If you plan to use .BR sendfile () for sending files to a TCP socket, but need @@ -195,13 +195,13 @@ it useful to employ the option, described in .BR tcp (7), to minimize the number of packets and to tune performance. - +.PP In Linux 2.4 and earlier, .I out_fd could also refer to a regular file; this possibility went away in the Linux 2.6.x kernel series, but was restored in Linux 2.6.33. - +.PP The original Linux .BR sendfile () system call was not designed to handle large file offsets. @@ -213,7 +213,7 @@ argument. The glibc .BR sendfile () wrapper function transparently deals with the kernel differences. - +.PP Applications may wish to fall back to .BR read (2)/ write (2) in the case where @@ -222,7 +222,7 @@ fails with .B EINVAL or .BR ENOSYS . - +.PP If .I out_fd refers to a socket or pipe with zero-copy support, callers must ensure the @@ -231,7 +231,7 @@ transferred portions of the file referred to by remain unmodified until the reader on the other end of .I out_fd has consumed the transferred data. - +.PP The Linux-specific .BR splice (2) call supports transferring data between arbitrary file descriptors diff --git a/man2/sendmmsg.2 b/man2/sendmmsg.2 index 056c74771..b813aa9e2 100644 --- a/man2/sendmmsg.2 +++ b/man2/sendmmsg.2 @@ -46,12 +46,12 @@ that allows the caller to transmit multiple messages on a socket using a single system call. (This has performance benefits for some applications.) .\" See commit 228e548e602061b08ee8e8966f567c12aa079682 - +.PP The .I sockfd argument is the file descriptor of the socket on which data is to be transmitted. - +.PP The .I msgvec argument is a pointer to an array of @@ -59,13 +59,13 @@ argument is a pointer to an array of structures. The size of this array is specified in .IR vlen . 
- +.PP The .I mmsghdr structure is defined in .I as: - +.PP .in +4n .nf struct mmsghdr { @@ -88,13 +88,13 @@ field is used to return the number of bytes sent from the message in (i.e., the same as the return value from a single .BR sendmsg (2) call). - +.PP The .I flags argument contains flags ORed together. The flags are the same as for .BR sendmsg (2). - +.PP A blocking .BR sendmmsg () call blocks until @@ -104,7 +104,7 @@ A nonblocking call sends as many messages as possible (up to the limit specified by .IR vlen ) and returns immediately. - +.PP On return from .BR sendmmsg (), the @@ -126,7 +126,7 @@ if this is less than the caller can retry with a further .BR sendmmsg () call to send the remaining messages. - +.PP On error, \-1 is returned, and .I errno is set to indicate the error. @@ -174,7 +174,7 @@ and .I three in two distinct UDP datagrams using one system call. The contents of the first datagram originates from a pair of buffers. - +.PP .nf #define _GNU_SOURCE #include diff --git a/man2/set_mempolicy.2 b/man2/set_mempolicy.2 index b136444fe..c4a68f36e 100644 --- a/man2/set_mempolicy.2 +++ b/man2/set_mempolicy.2 @@ -46,12 +46,12 @@ to the values specified by the and .I maxnode arguments. - +.PP A NUMA machine has different memory controllers with different distances to specific CPUs. The memory policy defines from which node memory is allocated for the thread. - +.PP This system call defines the default policy for the thread. The thread policy governs allocation of pages in the process's address space outside of memory ranges @@ -72,7 +72,7 @@ The policy is applied only when a new page is allocated for the thread. For anonymous memory this is when the page is first touched by the thread. - +.PP The .I mode argument must specify one of @@ -89,7 +89,7 @@ require the caller to specify the node or nodes to which the mode applies, via the .I nodemask argument. 
- +.PP The .I mode argument may also include an optional @@ -133,7 +133,7 @@ is zero, the .I nodemask argument is ignored. - +.PP Where a .I nodemask is required, it must contain at least one node that is on-line, @@ -154,7 +154,7 @@ the memory policy reverts to This effectively overrides the specified policy until the process's cpuset context includes one or more of the nodes specified by .IR nodemask . - +.PP The .I mode argument must include one of the following values: @@ -184,7 +184,7 @@ node ID specified in and so forth, until none of the specified nodes contain free memory. Pages will not be allocated from any node not specified in the .IR nodemask . - +.IP .TP .B MPOL_INTERLEAVE This mode interleaves page allocations across the nodes specified in @@ -308,7 +308,7 @@ Memory policy is not remembered if the page is swapped out. When such a page is paged back in, it will use the policy of the thread or memory range that is in effect at the time the page is allocated. - +.PP For information on library support, see .BR numa (7). .SH SEE ALSO diff --git a/man2/set_thread_area.2 b/man2/set_thread_area.2 index 82861c4db..ffc5b03b5 100644 --- a/man2/set_thread_area.2 +++ b/man2/set_thread_area.2 @@ -25,10 +25,10 @@ Linux dedicates three global descriptor table (GDT) entries for thread-local storage. For more information about the GDT, see the Intel Software Developer's Manual or the AMD Architecture Programming Manual. - +.PP Both of these system calls take an argument that is a pointer to a structure of the following type: - +.PP .nf .in +4n struct user_desc { @@ -44,13 +44,13 @@ struct user_desc { }; .in .fi - +.PP .BR get_thread_area () reads the GDT entry indicated by .I u_info\->entry_number and fills in the rest of the fields in .IR u_info . - +.PP .BR set_thread_area () sets a TLS entry in the GDT. 
.PP diff --git a/man2/set_tid_address.2 b/man2/set_tid_address.2 index d05608dee..915a81239 100644 --- a/man2/set_tid_address.2 +++ b/man2/set_tid_address.2 @@ -82,9 +82,9 @@ if the thread is sharing memory with other threads, then 0 is written at the address specified in .I clear_child_tid and the kernel performs the following operation: - +.PP futex(clear_child_tid, FUTEX_WAKE, 1, NULL, NULL, 0); - +.PP The effect of this operation is to wake a single thread that is performing a futex wait on the memory location. Errors from the futex wake operation are ignored. diff --git a/man2/seteuid.2 b/man2/seteuid.2 index 7bbb82468..f6fed9e25 100644 --- a/man2/seteuid.2 +++ b/man2/seteuid.2 @@ -56,7 +56,7 @@ _POSIX_C_SOURCE\ >=\ 200112L sets the effective user ID of the calling process. Unprivileged processes may only set the effective user ID to the real user ID, the effective user ID or the saved set-user-ID. - +.PP Precisely the same holds for .BR setegid () with "group" instead of "user". @@ -70,7 +70,7 @@ On success, zero is returned. On error, \-1 is returned, and .I errno is set appropriately. - +.PP .IR Note : there are cases where .BR seteuid () @@ -91,7 +91,7 @@ capability in its user namespace) and .I euid does not match the current real user ID, current effective user ID, or current saved set-user-ID. - +.IP In the case of .BR setegid (): the calling process is not privileged (does not have the @@ -124,7 +124,7 @@ with the difference that the change in implementation from to .BI setresgid(\-1, " egid" ", \-1)" occurred in glibc 2.2 or 2.3 (depending on the hardware architecture). - +.PP According to POSIX.1, .BR seteuid () .RB ( setegid ()) diff --git a/man2/setfsgid.2 b/man2/setfsgid.2 index 8985124a1..4b5e17954 100644 --- a/man2/setfsgid.2 +++ b/man2/setfsgid.2 @@ -48,7 +48,7 @@ In fact, whenever the effective group ID is changed, the filesystem group ID will also be changed to the new value of the effective group ID. 
- +.PP Explicit calls to .BR setfsuid (2) and @@ -59,7 +59,7 @@ corresponding change in the real and effective user and group IDs. A change in the normal user IDs for a program such as the NFS server is a security hole that can expose it to unwanted signals. (But see below.) - +.PP .BR setfsgid () will succeed only if the caller is the superuser or if .I fsgid @@ -87,7 +87,7 @@ for a discussion of why the use of both and .BR setfsgid () is nowadays unneeded. - +.PP The original Linux .BR setfsgid () system call supported only 16-bit group IDs. diff --git a/man2/setfsuid.2 b/man2/setfsuid.2 index 4c49ccecf..42ec4338c 100644 --- a/man2/setfsuid.2 +++ b/man2/setfsuid.2 @@ -48,7 +48,7 @@ In fact, whenever the effective user ID is changed, the filesystem user ID will also be changed to the new value of the effective user ID. - +.PP Explicit calls to .BR setfsuid () and @@ -59,7 +59,7 @@ corresponding change in the real and effective user and group IDs. A change in the normal user IDs for a program such as the NFS server is a security hole that can expose it to unwanted signals. (But see below.) - +.PP .BR setfsuid () will succeed only if the caller is the superuser or if .I fsuid @@ -95,7 +95,7 @@ Thus, is nowadays unneeded and should be avoided in new applications (likewise for .BR setfsgid (2)). - +.PP The original Linux .BR setfsuid () system call supported only 16-bit user IDs. diff --git a/man2/setgid.2 b/man2/setgid.2 index a991b911d..ace6a23ec 100644 --- a/man2/setgid.2 +++ b/man2/setgid.2 @@ -43,7 +43,7 @@ If the calling process is privileged (has the .B CAP_SETGID capability in its user namespace), the real GID and saved set-group-ID are also set. 
- +.PP Under Linux, .BR setgid () is implemented like the POSIX version with the diff --git a/man2/setns.2 b/man2/setns.2 index 8f90f82e2..86af37b41 100644 --- a/man2/setns.2 +++ b/man2/setns.2 @@ -18,7 +18,7 @@ setns \- reassociate thread with a namespace .SH DESCRIPTION Given a file descriptor referring to a namespace, reassociate the calling thread with that namespace. - +.PP The .I fd argument is a file descriptor referring to one of the namespace entries in a @@ -31,7 +31,7 @@ The calling thread will be reassociated with the corresponding namespace, subject to any constraints imposed by the .I nstype argument. - +.PP The .I nstype argument specifies which type of namespace @@ -83,7 +83,7 @@ and wants to ensure that the namespace is of a particular type. .IR fd if the file descriptor was opened by another process and, for example, passed to the caller via a UNIX domain socket.) - +.PP If .I fd refers to a PID namespace, the semantics are somewhat different @@ -99,7 +99,7 @@ is a descendant (child, grandchild, etc.) of the PID namespace of the caller. For further details on PID namespaces, see .BR pid_namespaces (7). - +.PP A process reassociating itself with a user namespace must have the .B CAP_SYS_ADMIN .\" See kernel/user_namespace.c:userns_install() [3.8 source] @@ -126,7 +126,7 @@ filesystem-related attributes flag) with another process. For further details on user namespaces, see .BR user_namespaces (7). - +.PP A process may not be reassociated with a new mount namespace if it is multithreaded. .\" Above check is in fs/namespace.c:mntns_install() [3.8 source] @@ -140,7 +140,7 @@ in the target mount namespace. See .BR user_namespaces (7) for details on the interaction of user namespaces and mount namespaces. - +.PP Using .BR setns () to change the caller's cgroup namespace does not change @@ -217,7 +217,7 @@ The remaining arguments specify a command and its arguments.
The program opens the namespace file, joins that namespace using .BR setns (), and executes the specified command inside that namespace. - +.PP The following shell session demonstrates the use of this program (compiled as a binary named .IR ns_exec ) @@ -227,7 +227,7 @@ example program in the .BR clone (2) man page (compiled as a binary named .IR newuts ). - +.PP We begin by executing the example program in .BR clone (2) in the background. @@ -235,7 +235,7 @@ That program creates a child in a separate UTS namespace. The child changes the hostname in its namespace, and then both processes display the hostnames in their UTS namespaces, so that we can see that they are different. - +.PP .nf .in +4n $ \fBsu\fP # Need privilege for namespace operations @@ -249,12 +249,12 @@ uts.nodename in parent: antero antero .in .fi - +.PP We then run the program shown below, using it to execute a shell. Inside that shell, we verify that the hostname is the one set by the child created by the first program: - +.PP .nf .in +4n # \fB./ns_exec /proc/3550/ns/uts /bin/bash\fP diff --git a/man2/setpgid.2 b/man2/setpgid.2 index 977659abc..7f26910d4 100644 --- a/man2/setpgid.2 +++ b/man2/setpgid.2 @@ -105,7 +105,7 @@ The preferred, POSIX.1-specified ways of doing this are: for retrieving the calling process's PGID; and .BR setpgid (), for setting a process's PGID. - +.PP .BR setpgid () sets the PGID of the process specified by .I pid @@ -130,12 +130,12 @@ and In this case, the \fIpgid\fP specifies an existing process group to be joined and the session ID of that group must match the session ID of the joining process. - +.PP The POSIX.1 version of .BR getpgrp (), which takes no arguments, returns the PGID of the calling process. - +.PP .BR getpgid () returns the PGID of the process specified by .IR pid . @@ -146,12 +146,12 @@ is zero, the process ID of the calling process is used. necessary, and the POSIX.1 .BR getpgrp () is preferred for that task.)
- +.PP The System\ V-style .BR setpgrp (), which takes no arguments, is equivalent to .IR "setpgid(0,\ 0)" . - +.PP The BSD-specific .BR setpgrp () call, which takes arguments @@ -159,9 +159,9 @@ call, which takes arguments and .IR pgid , is a wrapper function that calls - +.PP setpgid(pid, pgid) - +.PP .\" The true BSD setpgrp() system call differs in allowing the PGID .\" to be set to arbitrary values, rather than being restricted to .\" PGIDs in the same session. @@ -172,15 +172,15 @@ function is no longer exposed by calls should be replaced with the .BR setpgid () call shown above. - +.PP The BSD-specific .BR getpgrp () call, which takes a single .I pid argument, is a wrapper function that calls - +.PP getpgid(pid) - +.PP Since glibc 2.19, the BSD-specific .BR getpgrp () function is no longer exposed by @@ -200,11 +200,11 @@ return zero. On error, \-1 is returned, and .I errno is set appropriately. - +.PP The POSIX.1 .BR getpgrp () always returns the PGID of the caller. - +.PP .BR getpgid (), and the BSD-specific .BR getpgrp () @@ -252,7 +252,7 @@ and the version of .BR getpgrp () with no arguments conform to POSIX.1-2001. - +.PP POSIX.1-2001 also specifies .BR getpgid () and the version of @@ -261,7 +261,7 @@ that takes no arguments. (POSIX.1-2008 marks this .BR setpgrp () specification as obsolete.) - +.PP The version of .BR getpgrp () with one argument and the version of @@ -274,12 +274,12 @@ A child created via inherits its parent's process group ID. The PGID is preserved across an .BR execve (2). - +.PP Each process group is a member of a session and each process is a member of the session of which its process group is a member. (See .BR credentials (7).) - +.PP A session can have a controlling terminal. At any time, one (and only one) of the process groups in the session can be the foreground process group @@ -306,7 +306,7 @@ and .BR tcsetpgrp (3) functions are used to get/set the foreground process group of the controlling terminal. 
- +.PP The .BR setpgid () and @@ -314,7 +314,7 @@ and calls are used by programs such as .BR bash (1) to create process groups in order to implement shell job control. - +.PP If the termination of a process causes a process group to become orphaned, and if any member of the newly orphaned process group is stopped, then a .B SIGHUP diff --git a/man2/setresuid.2 b/man2/setresuid.2 index 9cc4d7468..a2d855545 100644 --- a/man2/setresuid.2 +++ b/man2/setresuid.2 @@ -39,22 +39,22 @@ setresuid, setresgid \- set real, effective and saved user or group ID .BR setresuid () sets the real user ID, the effective user ID, and the saved set-user-ID of the calling process. - +.PP An unprivileged process may change its real UID, effective UID, and saved set-user-ID, each to one of: the current real UID, the current effective UID or the current saved set-user-ID. - +.PP A privileged process (on Linux, one having the \fBCAP_SETUID\fP capability) may set its real UID, effective UID, and saved set-user-ID to arbitrary values. - +.PP If one of the arguments equals \-1, the corresponding value is not changed. - +.PP Regardless of what changes are made to the real UID, effective UID, and saved set-user-ID, the filesystem UID is always set to the same value as the (possibly new) effective UID. - +.PP Completely analogously, .BR setresgid () sets the real GID, effective GID, and saved set-group-ID @@ -66,7 +66,7 @@ On success, zero is returned. On error, \-1 is returned, and .I errno is set appropriately. - +.PP .IR Note : there are cases where .BR setresuid () @@ -122,7 +122,7 @@ they also appear on HP-UX and some of the BSDs. Under HP-UX and FreeBSD, the prototype is found in .IR <unistd.h> . Under Linux, the prototype is provided by glibc since version 2.3.2. 
- +.PP The original Linux .BR setresuid () and diff --git a/man2/setreuid.2 b/man2/setreuid.2 index dae346e41..f0b9c1faa 100644 --- a/man2/setreuid.2 +++ b/man2/setreuid.2 @@ -73,22 +73,22 @@ _XOPEN_SOURCE\ >=\ 500 .SH DESCRIPTION .BR setreuid () sets real and effective user IDs of the calling process. - +.PP Supplying a value of \-1 for either the real or effective user ID forces the system to leave that ID unchanged. - +.PP Unprivileged processes may only set the effective user ID to the real user ID, the effective user ID, or the saved set-user-ID. - +.PP Unprivileged users may only set the real user ID to the real user ID or the effective user ID. - +.PP If the real user ID is set (i.e., .I ruid is not \-1) or the effective user ID is set to a value not equal to the previous real user ID, the saved set-user-ID will be set to the new effective user ID. - +.PP Completely analogously, .BR setregid () sets real and effective group ID's of the calling process, @@ -98,7 +98,7 @@ On success, zero is returned. On error, \-1 is returned, and .I errno is set appropriately. - +.PP .IR Note : there are cases where .BR setreuid () @@ -158,7 +158,7 @@ first appeared in 4.2BSD). Setting the effective user (group) ID to the saved set-user-ID (saved set-group-ID) is possible since Linux 1.1.37 (1.1.38). - +.PP POSIX.1 does not specify all of the UID changes that Linux permits for an unprivileged process. For @@ -175,10 +175,10 @@ and the effective group ID can be changed to the value of the real group ID or the saved set-group-ID. The precise details of what ID changes are permitted vary across implementations. - +.PP POSIX.1 makes no specification about the effect of these calls on the saved set-user-ID and saved set-group-ID. 
- +.PP The original Linux .BR setreuid () and diff --git a/man2/setsid.2 b/man2/setsid.2 index 31986a311..7d2eb4fe8 100644 --- a/man2/setsid.2 +++ b/man2/setsid.2 @@ -47,10 +47,10 @@ The calling process is the leader of the new session The calling process also becomes the process group leader of a new process group in the session (i.e., its process group ID is made the same as its process ID). - +.PP The calling process will be the only process in the new process group and in the new session. - +.PP Initially, the new session has no controlling terminal. For details of how a session acquires a controlling terminal, see .BR credentials (7). @@ -76,7 +76,7 @@ A child created via inherits its parent's session ID. The session ID is preserved across an .BR execve (2). - +.PP A process group leader is a process whose process group ID equals its PID. Disallowing a process group leader from calling .BR setsid () @@ -93,14 +93,14 @@ and have the parent .BR _exit (2), while the child (which by definition can't be a process group leader) calls .BR setsid (). - +.PP If a session has a controlling terminal, and the .B CLOCAL flag for that terminal is not set, and a terminal hangup occurs, then the session leader is sent a .BR SIGHUP signal. - +.PP If a process that is a session leader terminates, then a .B SIGHUP signal is sent to each process in the foreground diff --git a/man2/setuid.2 b/man2/setuid.2 index 75d7aafb9..fef714279 100644 --- a/man2/setuid.2 +++ b/man2/setuid.2 @@ -76,7 +76,7 @@ On success, zero is returned. On error, \-1 is returned, and .I errno is set appropriately. - +.PP .IR Note : there are cases where .BR setuid () @@ -136,7 +136,7 @@ If .I uid is different from the old effective UID, the process will be forbidden from leaving core dumps. - +.PP The original Linux .BR setuid () system call supported only 16-bit user IDs. 
diff --git a/man2/sgetmask.2 b/man2/sgetmask.2 index e8789a882..80b530834 100644 --- a/man2/sgetmask.2 +++ b/man2/sgetmask.2 @@ -39,15 +39,15 @@ These system calls are obsolete. use .BR sigprocmask (2) instead. - +.PP .BR sgetmask () returns the signal mask of the calling process. - +.PP .BR ssetmask () sets the signal mask of the calling process to the value given in .IR newmask . The previous signal mask is returned. - +.PP The signal masks dealt with by these two system calls are plain bit masks (unlike the .I sigset_t @@ -76,12 +76,12 @@ These system calls are Linux-specific. Glibc does not provide wrappers for these obsolete system calls; in the unlikely event that you want to call them, use .BR syscall (2). - +.PP These system calls are unaware of signal numbers greater than 31 (i.e., real-time signals). - +.PP These system calls do not exist on x86-64. - +.PP It is not possible to block .B SIGSTOP or diff --git a/man2/shmctl.2 b/man2/shmctl.2 index eb2d35151..f6a82a8e5 100644 --- a/man2/shmctl.2 +++ b/man2/shmctl.2 @@ -298,7 +298,7 @@ operation returns the identifier of the shared memory segment whose index was given in .IR shmid . Other operations return 0 on success. - +.PP On error, \-1 is returned, and .I errno is set appropriately. @@ -362,7 +362,7 @@ or the owner and the process was not privileged (Linux: did not have the .B CAP_SYS_ADMIN capability). - +.IP Or (in kernels before 2.6.9), .B SHM_LOCK or @@ -392,7 +392,7 @@ Applications intended to be portable to such old systems may need to include these header files. .\" Like Linux, the FreeBSD man pages still document .\" the inclusion of these header files. - +.PP The .BR IPC_INFO , .B SHM_STAT @@ -404,7 +404,7 @@ program to provide information on allocated resources. In the future, these may be modified or moved to a .I /proc filesystem interface. 
- +.PP Linux permits a process to attach .RB ( shmat (2)) a shared memory segment that has already been marked for deletion @@ -412,7 +412,7 @@ using .IR shmctl(IPC_RMID) . This feature is not available on other UNIX implementations; portable applications should avoid relying on it. - +.PP Various fields in a \fIstruct shmid_ds\fP were typed as .I short under Linux 2.2 diff --git a/man2/shmget.2 b/man2/shmget.2 index 09663fac0..023e1899a 100644 --- a/man2/shmget.2 +++ b/man2/shmget.2 @@ -118,19 +118,19 @@ Used in conjunction with .B SHM_HUGETLB to select alternative hugetlb page sizes (respectively, 2 MB and 1 GB) on systems that support multiple hugetlb page sizes. - +.IP More generally, the desired huge page size can be configured by encoding the base-2 logarithm of the desired page size in the six bits at the offset .BR SHM_HUGE_SHIFT . Thus, the above two constants are defined as: - +.IP .nf .in +4 #define SHM_HUGE_2MB (21 << SHM_HUGE_SHIFT) #define SHM_HUGE_1GB (30 << SHM_HUGE_SHIFT) .in .fi - +.IP For some additional details, see the discussion of the similarly named constants in .BR mmap (2). @@ -271,7 +271,7 @@ capability). .SH CONFORMING TO POSIX.1-2001, POSIX.1-2008, SVr4. .\" SVr4 documents an additional error condition EEXIST. - +.PP .B SHM_HUGETLB and .B SHM_NORESERVE @@ -289,7 +289,7 @@ Applications intended to be portable to such old systems may need to include these header files. .\" Like Linux, the FreeBSD man pages still document .\" the inclusion of these header files. - +.PP .B IPC_PRIVATE isn't a flag field but a .I key_t @@ -308,15 +308,15 @@ call: .B SHMALL System-wide limit on the total amount of shared memory, measured in units of the system page size. - +.IP On Linux, this limit can be read and modified via .IR /proc/sys/kernel/shmall . 
Since Linux 3.16, .\" commit 060028bac94bf60a65415d1d55a359c3a17d5c31 the default value for this limit is: - +.IP ULONG_MAX - 2^24 - +.IP The effect of this value (which is suitable for both 32-bit and 64-bit systems) is to impose no limitation on allocations. @@ -327,12 +327,12 @@ applications simply raised the existing limit without first checking its current value. Such applications would cause the value to overflow if the limit was set at .BR ULONG_MAX . - +.IP From Linux 2.4 up to Linux 3.15, the default value for this limit was: - +.IP SHMMAX / PAGE_SIZE * (SHMMNI / 16) - +.IP If .B SHMMAX and @@ -343,15 +343,15 @@ as the limit on the total memory used by all shared memory segments. .TP .B SHMMAX Maximum size in bytes for a shared memory segment. - +.IP On Linux, this limit can be read and modified via .IR /proc/sys/kernel/shmmax . Since Linux 3.16, .\" commit 060028bac94bf60a65415d1d55a359c3a17d5c31 the default value for this limit is: - +.IP ULONG_MAX - 2^24 - +.IP The effect of this value (which is suitable for both 32-bit and 64-bit systems) is to impose no limitation on allocations. @@ -360,10 +360,10 @@ See the description of for a discussion of why this default value (rather than .BR ULONG_MAX ) is used. - +.IP From Linux 2.2 up to Linux 3.15, the default value of this limit was 0x2000000 (32MB). - +.IP Because it is not possible to map just part of a shared memory segment, the amount of virtual memory places another limit on the maximum size of a usable segment: @@ -380,7 +380,7 @@ is the effective minimum size). System-wide limit on the number of shared memory segments. In Linux 2.2, the default value for this limit was 128; since Linux 2.4, the default value is 4096. - +.IP On Linux, this limit can be read and modified via .IR /proc/sys/kernel/shmmni . 
.\" Kernels between 2.4.x and 2.6.8 had an off-by-one error that meant diff --git a/man2/shmop.2 b/man2/shmop.2 index 94d97df8a..ff3cc25c7 100644 --- a/man2/shmop.2 +++ b/man2/shmop.2 @@ -173,7 +173,7 @@ returns the address of the attached shared memory segment; on error, is returned, and .I errno is set to indicate the cause of the error. - +.PP On success, .BR shmdt () returns 0; on error \-1 is returned, and @@ -228,7 +228,7 @@ is not aligned on a page boundary. .SH CONFORMING TO POSIX.1-2001, POSIX.1-2008, SVr4. .\" SVr4 documents an additional error condition EMFILE. - +.PP In SVID 3 (or perhaps earlier), the type of the \fIshmaddr\fP argument was changed from .I "char\ *" @@ -245,15 +245,15 @@ into After a .BR fork (2), the child inherits the attached shared memory segments. - +.PP After an .BR execve (2), all attached shared memory segments are detached from the process. - +.PP Upon .BR _exit (2), all attached shared memory segments are detached from the process. - +.PP Using .BR shmat () with diff --git a/man2/sigaction.2 b/man2/sigaction.2 index 278c67fbd..6ac9c4b26 100644 --- a/man2/sigaction.2 +++ b/man2/sigaction.2 @@ -202,7 +202,7 @@ This flag is meaningful only when establishing a handler for .BR SIGCHLD , or when setting that signal's disposition to .BR SIG_DFL . - +.IP If the .B SA_NOCLDWAIT flag is set when establishing a handler for @@ -271,7 +271,7 @@ the signal handler address is passed via the .IR act.sa_sigaction field. This handler takes three arguments, as follows: - +.PP .nf .in +4n void @@ -281,7 +281,7 @@ handler(int sig, siginfo_t *info, void *ucontext) } .in .fi - +.PP The .I siginfo_t data type is a structure with the following fields: @@ -332,7 +332,7 @@ siginfo_t { } .fi .in - +.PP .IR si_signo ", " si_errno " and " si_code are defined for all signals. .RI ( si_errno @@ -446,7 +446,7 @@ On some architectures, these signals also fill in the .I si_trapno field. 
- +.IP Some suberrors of .BR SIGBUS , in particular @@ -480,7 +480,7 @@ the tracee will be shown as delivering the event. and .I si_addr_lsb are Linux-specific extensions. - +.IP The .BR SEGV_BNDERR suberror of @@ -489,7 +489,7 @@ populates .IR si_lower and .IR si_upper . - +.IP The .BR SEGV_PKUERR suberror of @@ -545,7 +545,7 @@ event, will contain .BR SIGTRAP and have the ptrace event in the high byte: - +.PP .nf (SIGTRAP | PTRACE_EVENT_foo << 8). .fi @@ -846,7 +846,7 @@ During an .BR execve (2), the dispositions of handled signals are reset to the default; the dispositions of ignored signals are left unchanged. - +.PP According to POSIX, the behavior of a process is undefined after it ignores a .BR SIGFPE , @@ -943,7 +943,7 @@ used internally by the NPTL threading implementation. See .BR nptl (7) for details. - +.PP The original Linux system call was named .BR sigaction (). However, with the addition of real-time signals in Linux 2.2, diff --git a/man2/sigaltstack.2 b/man2/sigaltstack.2 index 88bfffa30..94ec8d230 100644 --- a/man2/sigaltstack.2 +++ b/man2/sigaltstack.2 @@ -88,7 +88,7 @@ of these tasks, then the other argument can be specified as NULL. The .I stack_t type used to type the arguments of this function is defined as follows: - +.PP .in +4n .nf typedef struct { @@ -98,7 +98,7 @@ typedef struct { } stack_t; .fi .in - +.PP To establish a new alternate signal stack, the fields of this structure are set as follows: .TP @@ -244,7 +244,7 @@ had to know the direction of stack growth. .SH EXAMPLE The following code segment demonstrates the use of .BR sigaltstack (): - +.PP .in +4n .nf stack_t ss; diff --git a/man2/sigpending.2 b/man2/sigpending.2 index 6322f8411..1296c6d46 100644 --- a/man2/sigpending.2 +++ b/man2/sigpending.2 @@ -67,16 +67,16 @@ POSIX.1-2001, POSIX.1-2008. See .BR sigsetops (3) for details on manipulating signal sets. 
- +.PP If a signal is both blocked and has a disposition of "ignored", it is .I not added to the mask of pending signals when generated. - +.PP The set of signals that is pending for a thread is the union of the set of signals that is pending for that thread and the set of signals that is pending for the process as a whole; see .BR signal (7). - +.PP A child created via .BR fork (2) initially has an empty pending signal set; diff --git a/man2/sigprocmask.2 b/man2/sigprocmask.2 index 66e14cf95..630042ec1 100644 --- a/man2/sigprocmask.2 +++ b/man2/sigprocmask.2 @@ -62,7 +62,7 @@ blocked for the caller (see also .BR signal (7) for more details). - +.PP The behavior of the call is dependent on the value of .IR how , as follows. @@ -86,7 +86,7 @@ If .I oldset is non-NULL, the previous value of the signal mask is stored in .IR oldset . - +.PP If .I set is NULL, then the signal mask is unchanged (i.e., @@ -95,12 +95,12 @@ is ignored), but the current value of the signal mask is nevertheless returned in .I oldset (if it is not NULL). - +.PP A set of functions for modifying and inspecting variables of type .I sigset_t ("signal sets") is described in .BR sigsetops (3). - +.PP The use of .BR sigprocmask () is unspecified in a multithreaded process; see @@ -131,15 +131,15 @@ POSIX.1-2001, POSIX.1-2008. It is not possible to block .BR SIGKILL " or " SIGSTOP . Attempts to do so are silently ignored. - +.PP Each of the threads in a process has its own signal mask. - +.PP A child created via .BR fork (2) inherits a copy of its parent's signal mask; the signal mask is preserved across .BR execve (2). - +.PP If .BR SIGBUS , .BR SIGFPE , @@ -157,7 +157,7 @@ or See .BR sigsetops (3) for details on manipulating signal sets. - +.PP Note that it is permissible (although not very useful) to specify both .I set and @@ -165,7 +165,7 @@ and as NULL. 
.\" .SS C library/kernel differences - +.PP The kernel's definition of .IR sigset_t differs in size from that used @@ -175,7 +175,7 @@ In this manual page, the former is referred to as (it is nevertheless named .I sigset_t in the kernel sources). - +.PP The glibc wrapper function for .BR sigprocmask () silently ignores attempts to block the two real-time signals that @@ -183,7 +183,7 @@ are used internally by the NPTL threading implementation. See .BR nptl (7) for details. - +.PP The original Linux system call was named .BR sigprocmask (). However, with the addition of real-time signals in Linux 2.2, @@ -212,7 +212,7 @@ This argument is currently required to have a fixed architecture specific value .IR sizeof(kernel_sigset_t) ). .\" sizeof(kernel_sigset_t) == _NSIG / 8, .\" which equals to 8 on most architectures, but e.g. on MIPS it's 16. - +.PP The glibc .BR sigprocmask () wrapper function hides these details from us, transparently calling diff --git a/man2/sigreturn.2 b/man2/sigreturn.2 index 5029ed3c2..7a8bf1500 100644 --- a/man2/sigreturn.2 +++ b/man2/sigreturn.2 @@ -42,14 +42,14 @@ it saves various pieces of process context (processor status word, registers, signal mask, and signal stack settings) into the user-space stack. .\" See arch/x86/kernel/signal.c::__setup_frame() [in 3.17 source code] - +.PP The kernel also arranges that, during the transition back to user mode, the signal handler is called, and that, upon return from the handler, control passes to a piece of user-space code commonly called the "signal trampoline". The signal trampoline code in turn calls .BR sigreturn (). - +.PP This .BR sigreturn () call undoes everything that was @@ -80,7 +80,7 @@ be called directly. Details of the arguments (if any) passed to .BR sigreturn () vary depending on the architecture. - +.PP Once upon a time, UNIX systems placed the signal trampoline code onto the user stack. 
Nowadays, pages of the user stack are protected so as to @@ -103,7 +103,7 @@ and sets the flag in the .IR sa_flags field. - +.PP The saved process context information is placed in a .I ucontext_t structure (see @@ -112,7 +112,7 @@ That structure is visible within the signal handler as the third argument of a handler established with the .BR SA_SIGINFO flag. - +.PP On some other UNIX systems, the operation of the signal trampoline differs a little. In particular, on some systems, upon transitioning back to user mode, diff --git a/man2/sigsuspend.2 b/man2/sigsuspend.2 index 04c0fe416..60164e31e 100644 --- a/man2/sigsuspend.2 +++ b/man2/sigsuspend.2 @@ -50,7 +50,7 @@ mask given by .I mask and then suspends the process until delivery of a signal whose action is to invoke a signal handler or to terminate a process. - +.PP If the signal terminates the process, then .BR sigsuspend () does not return. @@ -59,7 +59,7 @@ If the signal is caught, then returns after the signal handler returns, and the signal mask is restored to the state before the call to .BR sigsuspend (). - +.PP It is not possible to block .B SIGKILL or diff --git a/man2/sigwaitinfo.2 b/man2/sigwaitinfo.2 index 0a94ae556..9a3952799 100644 --- a/man2/sigwaitinfo.2 +++ b/man2/sigwaitinfo.2 @@ -54,7 +54,7 @@ is pending is already pending for the calling thread, .BR sigwaitinfo () will return immediately.) - +.PP .BR sigwaitinfo () removes the signal from the set of pending signals and returns the signal number as its function result. @@ -151,18 +151,18 @@ a thread other than the one calling .BR sigwaitinfo () or .BR sigtimedwait ()). - +.PP The set of signals that is pending for a given thread is the union of the set of signals that is pending specifically for that thread and the set of signals that is pending for the process as a whole (see .BR signal (7)). - +.PP Attempts to wait for .B SIGKILL and .B SIGSTOP are silently ignored. 
- +.PP If multiple threads of a process are blocked waiting for the same signal(s) in .BR sigwaitinfo () @@ -171,7 +171,7 @@ or then exactly one of the threads will actually receive the signal if it becomes pending for the process as a whole; which of the threads receives the signal is indeterminate. - +.PP .BR sigwaitinfo () or .BR sigtimedwait (), @@ -183,7 +183,7 @@ or the .BR SIGFPE signal that results from an arithmetic error. Such signals can be caught only via signal handler. - +.PP POSIX leaves the meaning of a NULL value for the .I timeout argument of @@ -198,7 +198,7 @@ On Linux, .BR sigwaitinfo () is a library function implemented on top of .BR sigtimedwait (). - +.PP The glibc wrapper functions for .BR sigwaitinfo () and @@ -208,7 +208,7 @@ are used internally by the NPTL threading implementation. See .BR nptl (7) for details. - +.PP The original Linux system call was named .BR sigtimedwait (). However, with the addition of real-time signals in Linux 2.2, diff --git a/man2/socket.2 b/man2/socket.2 index f6aaec27c..5cc07fb48 100644 --- a/man2/socket.2 +++ b/man2/socket.2 @@ -353,13 +353,13 @@ supported within this domain. Other errors may be generated by the underlying protocol modules. .SH CONFORMING TO POSIX.1-2001, POSIX.1-2008, 4.4BSD. - +.PP The .B SOCK_NONBLOCK and .B SOCK_CLOEXEC flags are Linux-specific. - +.PP .BR socket () appeared in 4.2BSD. It is generally portable to/from @@ -371,7 +371,7 @@ POSIX.1 does not require the inclusion of and this header file is not required on Linux. However, some historical (BSD) implementations required this header file, and portable applications are probably wise to include it. - +.PP The manifest constants used under 4.x BSD for protocol families are .BR PF_UNIX , @@ -384,7 +384,7 @@ families. However, already the BSD man page promises: "The protocol family generally is the same as the address family", and subsequent standards use AF_* everywhere. - +.PP The .B AF_ALG protocol type was added in Linux 2.6.38. 
@@ -420,7 +420,7 @@ is shown in .BR tcp (7), .BR udp (7), .BR unix (7) - +.PP \(lqAn Introductory 4.3BSD Interprocess Communication Tutorial\(rq and \(lqBSD Interprocess Communication Tutorial\(rq, diff --git a/man2/socketcall.2 b/man2/socketcall.2 index 6f084b372..668e48e0b 100644 --- a/man2/socketcall.2 +++ b/man2/socketcall.2 @@ -158,7 +158,7 @@ system call; instead .BR accept (2), .BR bind (2), and so on really are implemented as separate system calls. - +.PP On x86-32, .BR socketcall () was historically the only entry point for the sockets API. diff --git a/man2/socketpair.2 b/man2/socketpair.2 index 6113b0bdc..24b104f91 100644 --- a/man2/socketpair.2 +++ b/man2/socketpair.2 @@ -60,7 +60,7 @@ and using the optionally specified .IR protocol . For further details of these arguments, see .BR socket (2). - +.PP The file descriptors used in referencing the new sockets are returned in .IR sv [0] and @@ -71,7 +71,7 @@ On success, zero is returned. On error, \-1 is returned, and .I errno is set appropriately. - +.PP On Linux (and other systems), .BR socketpair () does not modify @@ -113,7 +113,7 @@ On Linux, the only supported domain for this call is (or synonymously, .BR AF_LOCAL ). (Most implementations have the same restriction.) - +.PP Since Linux 2.6.27, .BR socketpair () supports the @@ -124,7 +124,7 @@ flags in the .I type argument, as described in .BR socket (2). - +.PP POSIX.1 does not require the inclusion of .IR , and this header file is not required on Linux. diff --git a/man2/splice.2 b/man2/splice.2 index cc565d916..5dc44d4bf 100644 --- a/man2/splice.2 +++ b/man2/splice.2 @@ -47,7 +47,7 @@ bytes of data from the file descriptor to the file descriptor .IR fd_out , where one of the file descriptors must refer to a pipe. - +.PP The following semantics apply for .I fd_in and @@ -85,7 +85,7 @@ Analogous statements apply for .I fd_out and .IR off_out . 
- +.PP The .I flags argument is a bit mask that is composed by ORing together @@ -137,14 +137,14 @@ Upon successful completion, .BR splice () returns the number of bytes spliced to or from the pipe. - +.PP A return value of 0 means end of input. If .I fd_in refers to a pipe, then this means that there was no data to transfer, and it would not make sense to block because there are no writers connected to the write end of the pipe. - +.PP On error, .BR splice () returns \-1 and diff --git a/man2/spu_create.2 b/man2/spu_create.2 index d5401fa95..dbd72cda3 100644 --- a/man2/spu_create.2 +++ b/man2/spu_create.2 @@ -56,7 +56,7 @@ is successful, a directory is created at .I pathname and it is populated with the files described in .BR spufs (7). - +.PP When a context is created, the returned file descriptor can only be passed to .BR spu_run (2), @@ -75,7 +75,7 @@ directory) once the last reference to the context has gone; this usually occurs when the file descriptor returned by .BR spu_create () is closed. - +.PP The .I flags argument can be zero or any bitwise OR-ed @@ -94,7 +94,7 @@ functionally related to each other and which share common scheduling parameters\(empriority and policy. In the future, gang scheduling may be implemented causing the group to be switched in and out as a single unit.) - +.IP A new directory will be created at the location specified by the .I pathname argument. @@ -107,7 +107,7 @@ Create a context that is not affected by the SPU scheduler. Once the context is run, it will not be scheduled out until it is destroyed by the creating process. - +.IP Because the context cannot be removed from the SPU, some functionality is disabled for .BR SPU_CREATE_NOSCHED @@ -118,7 +118,7 @@ available in this context directory in Additionally, .BR SPU_CREATE_NOSCHED contexts cannot dump a core file when crashing. 
- +.IP Creating .BR SPU_CREATE_NOSCHED contexts requires the @@ -131,7 +131,7 @@ Isolated contexts are protected from some PPE (PowerPC Processing Element) operations, such as access to the SPU local store and the NPC register. - +.IP Creating .B SPU_CREATE_ISOLATE contexts also requires the diff --git a/man2/spu_run.2 b/man2/spu_run.2 index 027c00e9d..528c8c139 100644 --- a/man2/spu_run.2 +++ b/man2/spu_run.2 @@ -50,7 +50,7 @@ that refers to a specific SPU context. When the context gets scheduled to a physical SPU, it starts execution at the instruction pointer passed in .IR npc . - +.PP Execution of SPU code happens synchronously, meaning that .BR spu_run () blocks while the SPU is still running. @@ -59,7 +59,7 @@ to execute SPU code in parallel with other code on either the main CPU or other SPUs, a new thread of execution must be created first (e.g., using .BR pthread_create (3)). - +.PP When .BR spu_run () returns, the current value of the SPU program counter is written to @@ -69,7 +69,7 @@ so successive calls to can use the same .I npc pointer. - +.PP The .I event argument provides a buffer for an extended status code. @@ -79,7 +79,7 @@ context was created with the flag, then this buffer is populated by the Linux kernel before .BR spu_run () returns. - +.PP The status code may be one (or more) of the following constants: .TP .B SPE_EVENT_DMA_ALIGNMENT @@ -108,7 +108,7 @@ register. On error, it returns \-1 and sets .I errno to one of the error codes listed below. - +.PP The .I spu_status register value is a bit mask of status codes and @@ -209,7 +209,7 @@ The following is an example of running a simple, one-instruction SPU program with the .BR spu_run () system call. - +.PP .nf #include #include diff --git a/man2/stat.2 b/man2/stat.2 index af55b1fbf..61cbc9d3a 100644 --- a/man2/stat.2 +++ b/man2/stat.2 @@ -114,7 +114,7 @@ retrieve information about the file pointed to by the differences for .BR fstatat () are described below. 
- +.PP .BR lstat () is identical to .BR stat (), @@ -122,7 +122,7 @@ except that if .I pathname is a symbolic link, then it returns information about the link itself, not the file that it refers to. - +.PP .BR fstat () is identical to .BR stat (), @@ -163,7 +163,7 @@ struct stat { }; .fi .in - +.PP .IR Note : the order of fields in the .I stat @@ -174,7 +174,7 @@ the definition above does not show the padding bytes that may be present between some fields on various architectures. Consult the glibc and kernel source code if you need to know the details. - +.PP .\" Background: inode attributes are modified with i_mutex held, but .\" read by stat() without taking the mutex. .IR Note : @@ -199,7 +199,7 @@ or the old .I st_uid together with the new .IR st_mode . - +.PP The fields in the .I stat structure are as follows: @@ -267,7 +267,7 @@ The system call operates in exactly the same way as .BR stat (), except for the differences described here. - +.PP If the pathname given in .I pathname is relative, then it is interpreted relative to the directory @@ -277,7 +277,7 @@ referred to by the file descriptor the calling process, as is done by .BR stat () for a relative pathname). - +.PP If .I pathname is relative and @@ -289,13 +289,13 @@ then is interpreted relative to the current working directory of the calling process (like .BR stat ()). - +.PP If .I pathname is absolute, then .I dirfd is ignored. - +.PP .I flags can either be 0, or include one or more of the following flags ORed: .TP @@ -451,10 +451,10 @@ SVr4, 4.3BSD, POSIX.1-2001, POSIX.1-2008. .\" and .\" .BR lstat () .\" error conditions EINTR, EMULTIHOP, ENOLINK, and EOVERFLOW. - +.PP .BR fstatat (): POSIX.1-2008. - +.PP According to POSIX.1-2001, .BR lstat () on a symbolic link need return valid information only in the @@ -468,7 +468,7 @@ POSIX.1-2008 tightens the specification, requiring .BR lstat () to return valid information in all fields except the mode bits in .IR st_mode . 
- +.PP Use of the .I st_blocks and @@ -498,7 +498,7 @@ and as .IR time_t that recorded timestamps with one-second precision. - +.PP Since kernel 2.5.48, the .I stat structure supports nanosecond resolution for the three file timestamp fields. @@ -548,7 +548,7 @@ Similar remarks apply for .BR fstat () and .BR lstat (). - +.PP The kernel-internal versions of the .I stat structure dealt with by the different versions are, respectively: @@ -609,13 +609,13 @@ and repacking the returned information if required for old binaries. .\" interface, rather than the libc-kernel interface. .\" .\" (Note that the details depend on gcc being used as c compiler.) - +.PP On modern 64-bit systems, life is simpler: there is a single .BR stat () system call and the kernel deals with a .I stat structure that contains fields of a sufficient size. - +.PP The underlying system call employed by the glibc .BR fstatat () wrapper function is actually called diff --git a/man2/statfs.2 b/man2/statfs.2 index 7c618c53b..ba469a10f 100644 --- a/man2/statfs.2 +++ b/man2/statfs.2 @@ -44,7 +44,7 @@ is the pathname of any file within the mounted filesystem. is a pointer to a .I statfs structure defined approximately as follows: - +.PP .in +4n .nf struct statfs { @@ -156,7 +156,7 @@ Filesystem types: Most of these MAGIC constants are defined in .IR /usr/include/linux/magic.h , and some are hardcoded in kernel sources. - +.PP The .IR f_flags is a bit mask indicating mount options for the filesystem. @@ -204,7 +204,7 @@ Nobody knows what is supposed to contain (but see below). .PP Fields that are undefined for a particular filesystem are set to 0. - +.PP .BR fstatfs () returns the same information about an open file referenced by descriptor .IR fd . @@ -288,7 +288,7 @@ or compare these fields to local variables in a program. Using .I "unsigned\ int" for such variables suffices on most systems. 
- +.PP The original Linux .BR statfs () and @@ -310,12 +310,12 @@ The glibc and .BR fstatfs () wrapper functions transparently deal with the kernel differences. - +.PP Some systems have only \fI<sys/vfs.h>\fP, other systems also have \fI<sys/statfs.h>\fP, where the former includes the latter. So it seems including the former is the best choice. - +.PP LSB has deprecated the library calls .BR statfs () and @@ -350,7 +350,7 @@ is defined as .IR "struct { int val[2]; }" . The same holds for FreeBSD, except that it uses the include file .IR <sys/mount.h> . - +.PP The general idea is that .I f_fsid contains some random stuff such that the pair diff --git a/man2/statx.2 b/man2/statx.2 index 8b9453d60..a53c73ed6 100644 --- a/man2/statx.2 +++ b/man2/statx.2 @@ -183,13 +183,13 @@ flag). In this case, .I dirfd can refer to any type of file, not just a directory. - +.IP If .I dirfd is .BR AT_FDCWD , the call operates on the current working directory. - +.IP This flag is Linux-specific; define .B _GNU_SOURCE .\" Before glibc 2.16, defining _ATFILE_SOURCE sufficed @@ -433,7 +433,7 @@ The bits in .I stx_attributes_mask correspond bit-by-bit to .IR stx_attributes . - +.PP The flags are as follows: .TP .B STATX_ATTR_COMPRESSED diff --git a/man2/subpage_prot.2 b/man2/subpage_prot.2 index 0499b9dd9..a60835818 100644 --- a/man2/subpage_prot.2 +++ b/man2/subpage_prot.2 @@ -44,14 +44,14 @@ The PowerPC-specific system call provides the facility to control the access permissions on individual 4kB subpages on systems configured with a page size of 64kB. - +.PP The protection map is applied to the memory pages in the region starting at .I addr and continuing for .I len bytes. Both of these arguments must be aligned to a 64-kB boundary. - +.PP The protection map is specified in the buffer pointed to by .IR map . The map has 2 bits per 4kB subpage; @@ -98,7 +98,7 @@ This system call is Linux-specific. .SH NOTES Glibc does not provide a wrapper for this system call; call it using .BR syscall (2). 
- +.PP Normal page protections (at the 64-kB page level) also apply; the subpage protection mechanism is an additional constraint, so putting 0 in a 2-bit field won't allow writes to a page that is otherwise @@ -128,6 +128,6 @@ hardware pages (on machines with hardware 64-kB page support). .SH SEE ALSO .BR mprotect (2), .BR syscall (2) - +.PP .IR Documentation/vm/hugetlbpage.txt in the Linux kernel source tree diff --git a/man2/swapon.2 b/man2/swapon.2 index e304b1df2..01a5ca77d 100644 --- a/man2/swapon.2 +++ b/man2/swapon.2 @@ -169,7 +169,7 @@ argument was introduced in Linux 1.3.2. .SH NOTES The partition or path must be prepared with .BR mkswap (8). - +.PP There is an upper limit on the number of swap files that may be used, defined by the kernel constant .BR MAX_SWAPFILES . @@ -189,7 +189,7 @@ Since kernel 2.6.32, the limit is further decreased by 1 if the kernel is built with the .B CONFIG_MEMORY_FAILURE option. - +.PP Discard of swap pages was introduced in kernel 2.6.29, then made conditional on the diff --git a/man2/symlink.2 b/man2/symlink.2 index 8c558d5fb..699ac71f6 100644 --- a/man2/symlink.2 +++ b/man2/symlink.2 @@ -78,27 +78,27 @@ creates a symbolic link named .I linkpath which contains the string .IR target . - +.PP Symbolic links are interpreted at run time as if the contents of the link had been substituted into the path being followed to find a file or directory. - +.PP Symbolic links may contain .I .. path components, which (if used at the start of the link) refer to the parent directories of that in which the link resides. - +.PP A symbolic link (also known as a soft link) may point to an existing file or to a nonexistent one; the latter case is known as a dangling link. - +.PP The permissions of a symbolic link are irrelevant; the ownership is ignored when following the link, but is checked when removal or renaming of the link is requested and the link is in a directory with the sticky bit .RB ( S_ISVTX ) set. 
- +.PP If .I linkpath exists, it will @@ -110,7 +110,7 @@ The system call operates in exactly the same way as .BR symlink (), except for the differences described here. - +.PP If the pathname given in .I linkpath is relative, then it is interpreted relative to the directory @@ -120,7 +120,7 @@ referred to by the file descriptor the calling process, as is done by .BR symlink () for a relative pathname). - +.PP If .I linkpath is relative and @@ -132,7 +132,7 @@ then is interpreted relative to the current working directory of the calling process (like .BR symlink ()). - +.PP If .I linkpath is absolute, then @@ -235,14 +235,14 @@ SVr4, 4.3BSD, POSIX.1-2001, POSIX.1-2008. .\" See .\" .BR open (2) .\" re multiple files with the same name, and NFS. - +.PP .BR symlinkat (): POSIX.1-2008. .SH NOTES No checking of .I target is done. - +.PP Deleting the name referred to by a symbolic link will actually delete the file (unless it also has other hard links). If this behavior is not desired, use diff --git a/man2/sync.2 b/man2/sync.2 index 8734f606d..42904bc43 100644 --- a/man2/sync.2 +++ b/man2/sync.2 @@ -68,7 +68,7 @@ _GNU_SOURCE .BR sync () causes all pending modifications to filesystem metadata and cached file data to be written to the underlying filesystems. - +.PP .BR syncfs () is like .BR sync (), @@ -84,7 +84,7 @@ to indicate the error. .SH ERRORS .BR sync () is always successful. - +.PP .BR syncfs () can fail for at least the following reason: .TP @@ -98,7 +98,7 @@ library support was added to glibc in version 2.14. .SH CONFORMING TO .BR sync (): POSIX.1-2001, POSIX.1-2008, SVr4, 4.3BSD. - +.PP .BR syncfs () is Linux-specific. .SH NOTES @@ -110,7 +110,7 @@ In glibc 2.2.1 and earlier, it was "int sync(void)", and .BR sync () always returned 0. 
- +.PP According to the standard specification (e.g., POSIX.1-2001), .BR sync () schedules the writes, but may return before the actual diff --git a/man2/sync_file_range.2 b/man2/sync_file_range.2 index 1e24d3b1a..c0a6c920a 100644 --- a/man2/sync_file_range.2 +++ b/man2/sync_file_range.2 @@ -44,7 +44,7 @@ permits fine control when synchronizing the open file referred to by the file descriptor .I fd with disk. - +.PP .I offset is the starting byte of the file range to be synchronized. .I nbytes @@ -58,7 +58,7 @@ Synchronization is in units of the system page size: is rounded down to a page boundary; .I (offset+nbytes-1) is rounded up to a page boundary. - +.PP The .I flags bit-mask argument can include any of the following values: @@ -105,7 +105,7 @@ will detect any I/O errors or .B ENOSPC conditions and will return these to the caller. - +.PP Useful combinations of the .I flags bits are: @@ -205,7 +205,7 @@ system call that orders the arguments suitably: .PP The behavior of this system call is otherwise exactly the same as .BR sync_file_range (). - +.PP A system call with this signature first appeared on the ARM architecture in Linux 2.6.20, with the name .BR arm_sync_file_range (). diff --git a/man2/syscalls.2 b/man2/syscalls.2 index 41fbe2c47..0fc639448 100644 --- a/man2/syscalls.2 +++ b/man2/syscalls.2 @@ -46,7 +46,7 @@ as the name of the system call that it invokes. For example, glibc contains a function .BR truncate () which invokes the underlying "truncate" system call. - +.PP Often the glibc wrapper function is quite thin, doing little work other than copying arguments to the right registers before invoking the system call, @@ -64,7 +64,7 @@ the wrapper function negates the returned error number (to make it positive), copies it to .IR errno , and returns \-1 to the caller of the wrapper. - +.PP Sometimes, however, the wrapper function does some extra work before invoking the system call. 
For example, nowadays there are (for reasons described below) two @@ -625,7 +625,7 @@ On many platforms, including x86-32, socket calls are all multiplexed .BR socketcall (2) and similarly System\ V IPC calls are multiplexed through .BR ipc (2). - +.PP Although slots are reserved for them in the system call table, the following system calls are not implemented in the standard kernel: .BR afs_syscall (2), \" __NR_afs_syscall is 53 on Linux 2.6.22/i386 @@ -670,7 +670,7 @@ and .BR putpmsg (2) calls are for kernels patched to support STREAMS, and may never be in the standard kernel. - +.PP There was briefly .BR set_zone_reclaim (2), added in Linux 2.6.13, and removed in 2.6.16; @@ -692,7 +692,7 @@ proprietary operating-system emulation, such as parisc, sparc, sparc64, and alpha, there are many additional system calls; mips64 also contains a full set of 32-bit system calls. - +.PP Over time, changes to the interfaces of some system calls have been necessary. One reason for such changes was the need to increase the size of @@ -790,7 +790,7 @@ symbolic links. These system calls supersede the older system calls which, except in the case of the "stat" calls, have the same name without the "64" suffix. - +.IP On newer platforms that only have 64-bit file access and 32-bit UIDs/GIDs (e.g., alpha, ia64, s390x, x86-64), there is just a single version of the UID/GID and file access system calls. diff --git a/man2/sysctl.2 b/man2/sysctl.2 index 4c4d916ad..5624a6ac1 100644 --- a/man2/sysctl.2 +++ b/man2/sysctl.2 @@ -44,7 +44,7 @@ There is no glibc wrapper for this system call; see NOTES. .SH DESCRIPTION .B Do not use this system call! See NOTES. - +.PP The .BR _sysctl () call reads and/or writes kernel parameters. @@ -128,7 +128,7 @@ uses of this system call result in warnings in the kernel log. Remove it from your programs now; use the .I /proc/sys interface instead. - +.PP This system call is available only if the kernel was configured with the .B CONFIG_SYSCTL_SYSCALL option. 
diff --git a/man2/sysinfo.2 b/man2/sysinfo.2 index 5b27dac89..801ff446c 100644 --- a/man2/sysinfo.2 +++ b/man2/sysinfo.2 @@ -43,11 +43,11 @@ sysinfo \- return system information .BR sysinfo () returns certain statistics on memory and swap usage, as well as the load average. - +.PP Until Linux 2.3.16, .BR sysinfo () returned information in the following structure: - +.PP .nf .in +4n struct sysinfo { @@ -67,10 +67,10 @@ struct sysinfo { .PP In the above structure, the sizes of the memory and swap fields are given in bytes. - +.PP Since Linux 2.3.23 (i386) and Linux 2.3.48 (all architectures) the structure is: - +.PP .nf .in +4n struct sysinfo { diff --git a/man2/syslog.2 b/man2/syslog.2 index 8946c0e2d..740e78431 100644 --- a/man2/syslog.2 +++ b/man2/syslog.2 @@ -54,7 +54,7 @@ which talks to see .BR syslog (3) for details. - +.PP This page describes the kernel .BR syslog () system call, which is used to control the kernel @@ -158,7 +158,7 @@ to See the discussion of .IR /proc/sys/kernel/printk , below. - +.IP The .I bufp and @@ -181,7 +181,7 @@ to See the discussion of .IR /proc/sys/kernel/printk , below. - +.IP The .I bufp and @@ -311,7 +311,7 @@ KERN_NOTICE 5 Normal but significant condition KERN_INFO 6 Informational KERN_DEBUG 7 Debug-level messages .TE - +.sp 1 The kernel .IR printk() routine will print a message on the @@ -330,7 +330,7 @@ For \fItype\fP 10, .BR syslog () returns the total size of the kernel log buffer. For other values of \fItype\fP, 0 is returned on success. - +.PP In case of error, \-1 is returned, and \fIerrno\fP is set to indicate the error. .SH ERRORS diff --git a/man2/tee.2 b/man2/tee.2 index 7fe173b17..8e80d9d56 100644 --- a/man2/tee.2 +++ b/man2/tee.2 @@ -53,7 +53,7 @@ It does not consume the data that is duplicated from .IR fd_in ; therefore, that data can be copied by a subsequent .BR splice (2). 
- +.PP .I flags is a bit mask that is composed by ORing together zero or more of the following values: @@ -89,7 +89,7 @@ A return value of 0 means that there was no data to transfer, and it would not make sense to block, because there are no writers connected to the write end of the pipe referred to by .IR fd_in . - +.PP On error, .BR tee () returns \-1 and @@ -138,7 +138,7 @@ program using the .BR tee () system call. Here is an example of its use: - +.PP .in +4n .nf $ \fBdate |./a.out out.log | cat\fP diff --git a/man2/time.2 b/man2/time.2 index 4453221ed..955a59c42 100644 --- a/man2/time.2 +++ b/man2/time.2 @@ -37,7 +37,7 @@ time \- get time in seconds .BR time () returns the time as the number of seconds since the Epoch, 1970-01-01 00:00:00 +0000 (UTC). - +.PP If .I tloc is non-NULL, @@ -52,7 +52,7 @@ appropriately. .B EFAULT .I tloc points outside your accessible address space (but see BUGS). - +.IP On systems where the C library .BR time () wrapper function invokes an implementation provided by the @@ -82,7 +82,7 @@ and the Epoch, because of leap seconds and because system clocks are not required to be synchronized to a standard reference. The intention is that the interpretation of seconds since the Epoch values be consistent; see POSIX.1-2008 Rationale A.4.15 for further rationale. - +.PP On Linux, a call to .BR time () with @@ -110,7 +110,7 @@ successful reports that the time is a few seconds the Epoch, so the C library wrapper function never sets .I errno as a result of this call. - +.PP The .I tloc argument is obsolescent and should always be NULL in new code. diff --git a/man2/timer_create.2 b/man2/timer_create.2 index a75208607..d51111ac5 100644 --- a/man2/timer_create.2 +++ b/man2/timer_create.2 @@ -52,7 +52,7 @@ The ID of the new timer is returned in the buffer pointed to by which must be a non-null pointer. This ID is unique within the process, until the timer is deleted. The new timer is initially disarmed. 
- +.PP The .I clockid argument specifies the clock that the new timer uses to measure time. @@ -119,7 +119,7 @@ returned by a call to .BR clock_getcpuclockid (3) or .BR pthread_getcpuclockid (3). - +.PP The .I sevp argument points to a @@ -128,7 +128,7 @@ structure that specifies how the caller should be notified when the timer expires. For the definition and general details of this structure, see .BR sigevent (7). - +.PP The .I sevp.sigev_notify field can have the following values: @@ -223,12 +223,12 @@ POSIX.1-2001, POSIX.1-2008. .SH NOTES A program may create multiple interval timers using .BR timer_create (). - +.PP Timers are not inherited by the child of a .BR fork (2), and are disarmed and deleted during an .BR execve (2). - +.PP The kernel preallocates a "queued real-time signal" for each timer created using .BR timer_create (). @@ -236,7 +236,7 @@ Consequently, the number of timers is limited by the .BR RLIMIT_SIGPENDING resource limit (see .BR setrlimit (2)). - +.PP The timers created by .BR timer_create () are commonly known as "POSIX (interval) timers". @@ -326,7 +326,7 @@ Assuming that the timer expired at least once while the program slept, the signal handler will be invoked, and the handler displays some information about the timer notification. The program terminates after one invocation of the signal handler. - +.PP In the following example run, the program sleeps for 1 second, after creating a timer that has a frequency of 100 nanoseconds. By the time the signal is unblocked and delivered, diff --git a/man2/timer_getoverrun.2 b/man2/timer_getoverrun.2 index c274805e4..313797330 100644 --- a/man2/timer_getoverrun.2 +++ b/man2/timer_getoverrun.2 @@ -53,7 +53,7 @@ via signals .RB ( SIGEV_SIGNAL ), and via threads .RB ( SIGEV_THREAD ). - +.PP When expiration notifications are delivered via a signal, overruns can occur as follows. 
Regardless of whether or not a real-time signal is used for @@ -73,7 +73,7 @@ In this interval, further timer expirations may occur. The timer overrun count is the number of additional timer expirations that occurred between the time when the signal was generated and when it was delivered or accepted. - +.PP Timer overruns can also occur when expiration notifications are delivered via invocation of a thread, since there may be an arbitrary delay between an expiration of the timer @@ -108,7 +108,7 @@ structure (see This allows an application to avoid the overhead of making a system call to obtain the overrun count, but is a nonportable extension to POSIX.1. - +.PP POSIX.1 discusses timer overruns only in the context of timer notifications using signals. .\" FIXME . Austin bug filed, 11 Feb 09 diff --git a/man2/timer_settime.2 b/man2/timer_settime.2 index 982ce28e8..00100c25b 100644 --- a/man2/timer_settime.2 +++ b/man2/timer_settime.2 @@ -60,7 +60,7 @@ the new interval for the timer. The .I itimerspec structure is defined as follows: - +.PP .in +4n .nf struct timespec { @@ -74,7 +74,7 @@ struct itimerspec { }; .fi .in - +.PP Each of the substructures of the .I itimerspec structure is a @@ -84,7 +84,7 @@ in seconds and nanoseconds. These time values are measured according to the clock that was specified when the timer was created by .BR timer_create (2). - +.PP If .I new_value->it_value specifies a nonzero value (i.e., either subfield is nonzero), then @@ -98,7 +98,7 @@ If specifies a zero value (i.e., both subfields are zero), then the timer is disarmed. - +.PP The .I new_value->it_interval field specifies the period of the timer, in seconds and nanoseconds. @@ -110,7 +110,7 @@ If specifies a zero value, then the timer expires just once, at the time specified by .IR it_value . 
- +.PP By default, the initial expiration time specified in .I new_value->it_value is interpreted relative to the current time on the timer's @@ -131,7 +131,7 @@ and the overrun count (see .BR timer_getoverrun (2)) will be set correctly. .\" By experiment: the overrun count is set correctly, for CLOCK_REALTIME. - +.PP If the value of the .B CLOCK_REALTIME clock is adjusted while an absolute timer based on that clock is armed, @@ -141,7 +141,7 @@ Adjustments to the clock have no effect on relative timers based on that clock. .\" Similar remarks might apply with respect to process and thread CPU time .\" clocks, but these clocks are not currently (2.6.28) settable on Linux. - +.PP If .I old_value is not NULL, then it points to a buffer @@ -150,7 +150,7 @@ that is used to return the previous interval of the timer (in and the amount of time until the timer would previously have next expired (in .IR old_value->it_value ). - +.PP .BR timer_gettime () returns the time until next expiration, and the interval, for the timer specified by diff --git a/man2/timerfd_create.2 b/man2/timerfd_create.2 index 7e7d8439b..3c7824cfd 100644 --- a/man2/timerfd_create.2 +++ b/man2/timerfd_create.2 @@ -47,7 +47,7 @@ with the advantage that the file descriptor may be monitored by .BR poll (2), and .BR epoll (7). - +.PP The use of these three system calls is analogous to the use of .BR timer_create (2), .BR timer_settime (2), @@ -111,7 +111,7 @@ capability in order to set a timer against this clock. .PP The current value of each of these clocks can be retrieved using .BR clock_gettime (2). - +.PP Starting with Linux 2.6.27, the following values may be bitwise ORed in .IR flags to change the behavior of @@ -143,7 +143,7 @@ must be specified as zero. arms (starts) or disarms (stops) the timer referred to by the file descriptor .IR fd . - +.PP The .I new_value argument specifies the initial expiration and interval for the timer. @@ -176,7 +176,7 @@ to a nonzero value arms the timer. 
Setting both fields of .I new_value.it_value to zero disarms the timer. - +.PP Setting one or both fields of .I new_value.it_interval to nonzero values specifies the period, in seconds and nanoseconds, @@ -185,7 +185,7 @@ If both fields of .I new_value.it_interval are zero, the timer expires just once, at the time specified by .IR new_value.it_value . - +.PP By default, the initial expiration time specified in .I new_value @@ -197,7 +197,7 @@ specifies a time relative to the current value of the clock specified by An absolute timeout can be selected via the .I flags argument. - +.PP The .I flags argument is a bit mask that can include the following values: @@ -246,7 +246,7 @@ an structure that contains the current setting of the timer referred to by the file descriptor .IR fd . - +.PP The .I it_value field returns the amount of time @@ -256,7 +256,7 @@ then the timer is currently disarmed. This field always contains a relative value, regardless of whether the .BR TFD_TIMER_ABSTIME flag was specified when setting the timer. - +.PP The .I it_interval field returns the interval of the timer. @@ -300,7 +300,7 @@ A will fail with the error .B EINVAL if the size of the supplied buffer is less than 8 bytes. - +.IP If the associated clock is either .BR CLOCK_REALTIME or @@ -386,7 +386,7 @@ returns a new file descriptor. On error, \-1 is returned and .I errno is set to indicate the error. - +.PP .BR timerfd_settime () and .BR timerfd_gettime () @@ -480,7 +480,7 @@ The second argument specifies the interval for the timer, in seconds. The third argument specifies the number of times the program should allow the timer to expire before terminating. The second and third command-line arguments are optional. - +.PP The following shell session demonstrates the use of the program: .in +4n .nf diff --git a/man2/times.2 b/man2/times.2 index e860671f7..b8b0905b4 100644 --- a/man2/times.2 +++ b/man2/times.2 @@ -146,7 +146,7 @@ This nonconformance is rectified in Linux 2.6.9 and later. 
.\" See the description of times() in XSH, which says: .\" The times of a terminated child process are included... when wait() .\" or waitpid() returns the process ID of this terminated child. - +.PP On Linux, the .I buf argument can be specified as NULL, with the result that @@ -164,7 +164,7 @@ but this value is measured in units of .BR CLOCKS_PER_SEC , not the clock ticks used by .BR times (). - +.PP On Linux, the "arbitrary point in the past" from which the return value of .BR times () is measured has varied across kernel versions. diff --git a/man2/tkill.2 b/man2/tkill.2 index 7c80d6438..4181bb3a2 100644 --- a/man2/tkill.2 +++ b/man2/tkill.2 @@ -51,7 +51,7 @@ in the thread group can be used to send a signal only to a process (i.e., thread group) as a whole, and the signal will be delivered to an arbitrary thread within that process.) - +.PP .BR tkill () is an obsolete predecessor to .BR tgkill (). @@ -84,7 +84,7 @@ Avoid using this system call. .\" measurable, one could exhaust all but 1-2 available pid values, .\" possibly by lowering the max pid parameter in /proc, forcing .\" the same tid to be reused rapidly. - +.PP These are the raw system call interfaces, meant for internal thread library use. .SH RETURN VALUE @@ -132,7 +132,7 @@ See the description of in .BR clone (2) for an explanation of thread groups. - +.PP Glibc does not provide wrappers for these system calls; call them using .BR syscall (2). .SH SEE ALSO diff --git a/man2/truncate.2 b/man2/truncate.2 index 520f6e64c..2799f9813 100644 --- a/man2/truncate.2 +++ b/man2/truncate.2 @@ -223,7 +223,7 @@ POSIX.1-2001, POSIX.1-2008, .BR ftruncate () can also be used to set the size of a POSIX shared memory object; see .BR shm_open (7). - +.PP The details in DESCRIPTION are for XSI-compliant systems. For non-XSI-compliant systems, the POSIX standard allows two behaviors for @@ -244,7 +244,7 @@ and to be used to extend a file beyond its current length: a notable example on Linux is VFAT. 
.\" At the very least: OSF/1, Solaris 7, and FreeBSD conform, mtk, Jan 2002 - +.PP The original Linux .BR truncate () and @@ -258,7 +258,7 @@ system calls that handle large files. However, these details can be ignored by applications using glibc, whose wrapper functions transparently employ the more recent system calls where they are available. - +.PP On some 32-bit architectures, the calling signature for these system calls differ, for the reasons described in diff --git a/man2/umask.2 b/man2/umask.2 index 530cb6609..756176bc3 100644 --- a/man2/umask.2 +++ b/man2/umask.2 @@ -47,7 +47,7 @@ sets the calling process's file mode creation mask (umask) to & 0777 (i.e., only the file permission bits of .I mask are used), and returns the previous value of the mask. - +.PP The umask is used by .BR open (2), .BR mkdir (2), @@ -62,7 +62,7 @@ argument to .BR open (2) and .BR mkdir (2). - +.PP Alternatively, if the parent directory has a default ACL (see .BR acl (5)), the umask is ignored, the default ACL is inherited, @@ -71,19 +71,19 @@ and permission bits absent in the .I mode argument are turned off. For example, the following default ACL is equivalent to a umask of 022: - +.PP u::rwx,g::r-x,o::r-x - +.PP Combining the effect of this default ACL with a .I mode argument of 0666 (rw-rw-rw-), the resulting file permissions would be 0644 (rw-r--r--). - +.PP The constants that should be used to specify .I mask are described in .BR inode (7). - +.PP The typical default value for the process umask is .I S_IWGRP\ |\ S_IWOTH (octal 022). @@ -116,7 +116,7 @@ A child process created via inherits its parent's umask. The umask is left unchanged by .BR execve (2). - +.PP It is impossible to use .BR umask () to fetch a process's umask without at the same time changing it. @@ -125,7 +125,7 @@ A second call to would then be needed to restore the umask. The nonatomicity of these two steps provides the potential for races in multithreaded programs. 
- +.PP Since Linux 4.7, the umask of any process can be viewed via the .I Umask field of @@ -133,7 +133,7 @@ field of Inspecting this field in .IR /proc/self/status allows a process to retrieve its umask without at the same time changing it. - +.PP The umask setting also affects the permissions assigned to POSIX IPC objects .RB ( mq_open (3), .BR sem_open (3), diff --git a/man2/umount.2 b/man2/umount.2 index 56c374f90..f3278707b 100644 --- a/man2/umount.2 +++ b/man2/umount.2 @@ -47,11 +47,11 @@ remove the attachment of the (topmost) filesystem mounted on .\" Note: the kernel naming differs from the glibc naming .\" umount2 is the glibc name for what the kernel now calls umount .\" and umount is the glibc name for oldumount - +.PP Appropriate privilege (Linux: the .B CAP_SYS_ADMIN capability) is required to unmount filesystems. - +.PP Linux 2.1.116 added the .BR umount2 () system call, which, like @@ -186,7 +186,7 @@ This means that .BR umount () of any peer in a set of shared mounts will cause all of its peers to be unmounted and all of their slaves to be unmounted as well. - +.PP This propagation of unmount activity can be particularly surprising on systems where every mount point is shared by default. On such systems, @@ -194,7 +194,7 @@ recursively bind mounting the root directory of the filesystem onto a subdirectory and then later unmounting that subdirectory with .BR MNT_DETACH will cause every mount in the mount namespace to be lazily unmounted. - +.PP To ensure .BR umount () does not propagate in this fashion, diff --git a/man2/unimplemented.2 b/man2/unimplemented.2 index 9b20d1bfa..ac0876698 100644 --- a/man2/unimplemented.2 +++ b/man2/unimplemented.2 @@ -44,7 +44,7 @@ Note that and .BR ulimit (3) are implemented as library functions. - +.PP Some system calls, like .BR alloc_hugepages (2), .BR free_hugepages (2), @@ -53,7 +53,7 @@ Some system calls, like and .BR vm86 (2) exist only on certain architectures. 
- +.PP Some system calls, like .BR ipc (2), .BR create_module (2), diff --git a/man2/unlink.2 b/man2/unlink.2 index e0a86d125..24e1477dc 100644 --- a/man2/unlink.2 +++ b/man2/unlink.2 @@ -69,13 +69,13 @@ deletes a name from the filesystem. If that name was the last link to a file and no processes have the file open, the file is deleted and the space it was using is made available for reuse. - +.PP If the name was the last link to a file but any processes still have the file open, the file will remain in existence until the last file descriptor referring to it is closed. - +.PP If the name referred to a symbolic link, the link is removed. - +.PP If the name referred to a socket, FIFO, or device, the name for it is removed but processes which have the object open may continue to use it. @@ -92,7 +92,7 @@ includes the .B AT_REMOVEDIR flag) except for the differences described here. - +.PP If the pathname given in .I pathname is relative, then it is interpreted relative to the directory @@ -104,7 +104,7 @@ the calling process, as is done by and .BR rmdir (2) for a relative pathname). - +.PP If the pathname given in .I pathname is relative and @@ -118,13 +118,13 @@ directory of the calling process (like .BR unlink () and .BR rmdir (2)). - +.PP If the pathname given in .I pathname is absolute, then .I dirfd is ignored. - +.PP .I flags is a bit mask that can either be specified as 0, or by ORing together flag values that control the operation of @@ -280,7 +280,7 @@ library support was added to glibc in version 2.4. SVr4, 4.3BSD, POSIX.1-2001, POSIX.1-2008. .\" SVr4 documents additional error .\" conditions EINTR, EMULTIHOP, ETXTBSY, ENOLINK. - +.PP .BR unlinkat (): POSIX.1-2008. .SH NOTES diff --git a/man2/unshare.2 b/man2/unshare.2 index ccf4a904e..a9b09b487 100644 --- a/man2/unshare.2 +++ b/man2/unshare.2 @@ -39,12 +39,12 @@ or while other parts, such as virtual memory, may be shared by explicit request when creating a process or thread using .BR clone (2). 
- +.PP The main use of .BR unshare () is to allow a process to control its shared execution context without creating a new process. - +.PP The .I flags argument is a bit mask that specifies which parts of @@ -197,7 +197,7 @@ also automatically implies requires that the user ID and group ID of the calling process are mapped to user IDs and group IDs in the user namespace of the calling process at the time of the call. - +.IP For further information on user namespaces, see .BR user_namespaces (7). .TP @@ -308,7 +308,7 @@ and the call would cause the limit on the number of nested user namespaces to be exceeded. See .BR user_namespaces (7). - +.IP From Linux 3.11 to Linux 4.8, the error diagnosed in this case was .BR EUSERS . .TP @@ -413,7 +413,7 @@ $ \fBsudo ./unshare -m /bin/bash\fP mnt:[4026532325] .fi .in - +.PP The differing output of the two .BR readlink (1) commands shows that the two shells are in different mount namespaces. @@ -488,6 +488,6 @@ main(int argc, char *argv[]) .BR setns (2), .BR vfork (2), .BR namespaces (7) - +.PP .I Documentation/unshare.txt in the Linux kernel source tree diff --git a/man2/uselib.2 b/man2/uselib.2 index c38fa225b..625d075ef 100644 --- a/man2/uselib.2 +++ b/man2/uselib.2 @@ -88,7 +88,7 @@ Therefore, in order to employ this system call, it was sufficient to manually declare the interface in your code; alternatively, you could invoke the system call using .BR syscall (2). - +.PP In ancient libc versions, .BR uselib () was used to load @@ -111,7 +111,7 @@ so that this dynamic library can load the remaining libraries needed This is also the state of affairs in libc5. .LP glibc2 does not use this call. 
- +.PP Since Linux 3.15, .\" commit 69369a7003735d0d8ef22097e27a55a8bad9557a this system call is available only when the kernel is configured with the diff --git a/man2/userfaultfd.2 b/man2/userfaultfd.2 index eb88da483..2a3acd317 100644 --- a/man2/userfaultfd.2 +++ b/man2/userfaultfd.2 @@ -44,7 +44,7 @@ handling to a user-space application, and returns a file descriptor that refers to the new object. The new userfaultfd object is configured using .BR ioctl (2). - +.PP Once the userfaultfd object is configured, the application can use .BR read (2) to receive userfaultfd notifications. @@ -53,7 +53,7 @@ depending on the value of .I flags used for the creation of the userfaultfd or subsequent calls to .BR fcntl (2). - +.PP The following values may be bitwise ORed in .IR flags to change the behavior of @@ -89,7 +89,7 @@ them using the operations described in .BR ioctl_userfaultfd (2). When servicing the page fault events, the fault-handling thread can trigger a wake-up for the sleeping thread. - +.PP It is possible for the faulting threads and the fault-handling threads to run in the context of different processes. In this case, these threads may belong to different programs, @@ -99,7 +99,7 @@ In such non-cooperative mode, the process that monitors userfaultfd and handles page faults needs to be aware of the changes in the virtual memory layout of the faulting process to avoid memory corruption. - +.PP Starting from Linux 4.11, userfaultfd can also notify the fault-handling threads about changes in the virtual memory layout of the faulting process. @@ -123,7 +123,7 @@ soon as the userfaultfd manager executes The userfaultfd manager should carefully synchronize calls to .B UFFDIO_COPY with the processing of events. - +.PP The current asynchronous model of the event delivery is optimal for single threaded non-cooperative userfaultfd manager implementations. 
.\" Regarding the preceding sentence, Mike Rapoport says: @@ -131,7 +131,7 @@ single threaded non-cooperative userfaultfd manager implementations. .\" problematic for multi-threaded monitor. I even suspect that it would be .\" impossible to ensure synchronization between page faults and non-page .\" fault events in multi-threaded monitor. - +.PP .\" FIXME elaborate about non-cooperating mode, describe its limitations .\" for kernels before 4.11, features added in 4.11 .\" and limitations remaining in 4.11 @@ -151,7 +151,7 @@ This operation must be performed before any of the other operations described below (or those operations fail with the .BR EINVAL error). - +.PP After a successful .B UFFDIO_API operation, @@ -171,21 +171,21 @@ or .B UFFDIO_ZERO .BR ioctl (2) operations to resolve the page fault. - +.PP Details of the various .BR ioctl (2) operations can be found in .BR ioctl_userfaultfd (2). - +.PP Since Linux 4.11, events other than page-fault may be enabled during .B UFFDIO_API operation. - +.PP Up to Linux 4.11, userfaultfd can be used only with anonymous private memory mappings. Since Linux 4.11, userfaultfd can also be used with hugetlbfs and shared memory mappings. - +.PP .\" .SS Reading from the userfaultfd structure Each @@ -194,7 +194,7 @@ from the userfaultfd file descriptor returns one or more .I uffd_msg structures, each of which describes a page-fault event or an event required for the non-cooperative userfaultfd usage: - +.PP .nf .in +4n struct uffd_msg { @@ -228,7 +228,7 @@ struct uffd_msg { } __packed; .in .fi - +.PP If multiple events are available and the supplied buffer is large enough, .BR read (2) returns as many events as will fit in the supplied buffer. @@ -240,7 +240,7 @@ structure, the .BR read (2) fails with the error .BR EINVAL . 
- +.PP The fields set in the .I uffd_msg structure are as follows: @@ -255,7 +255,7 @@ The non-page-fault events are generated only when appropriate feature is enabled during API handshake with .B UFFDIO_API .BR ioctl (2). - +.IP The following values can appear in the .I event field: @@ -418,7 +418,7 @@ Insufficient kernel memory was available. The .BR userfaultfd () system call first appeared in Linux 4.3. - +.PP The support for hugetlbfs and shared memory areas and non-page-fault events was added in Linux 4.11. .SH CONFORMING TO @@ -428,7 +428,7 @@ portable. .SH NOTES Glibc does not provide a wrapper for this system call; call it using .BR syscall (2). - +.PP The userfaultfd mechanism can be used as an alternative to traditional user-space paging techniques based on the use of the .BR SIGSEGV @@ -445,7 +445,7 @@ The program creates two threads, one of which acts as the page-fault handler for the process, for the pages in a demand-page zero region created using .BR mmap (2). - +.PP The program takes one command-line argument, which is the number of pages that will be created in a mapping whose page faults will be handled via userfaultfd. @@ -457,13 +457,13 @@ and registers the address range of that mapping using the operation. The program then creates a second thread that will perform the task of handling page faults. - +.PP The main thread then walks through the pages of the mapping fetching bytes from successive pages. Because the pages have not yet been accessed, the first access of a byte in each page will trigger a page-fault event on the userfaultfd file descriptor. - +.PP Each of the page-fault events is handled by the second thread, which sits in a loop processing input from the userfaultfd file descriptor. In each loop iteration, the second thread first calls @@ -478,9 +478,9 @@ the faulting region using the .B UFFDIO_COPY .BR ioctl (2) operation. 
- +.PP The following is an example of what we see when running the program: - +.PP .nf .in +4n $ \fB./userfaultfd_demo 3\fP @@ -719,7 +719,7 @@ main(int argc, char *argv[]) .BR ioctl_userfaultfd (2), .BR madvise (2), .BR mmap (2) - +.PP .IR Documentation/vm/userfaultfd.txt in the Linux kernel source tree - +.PP diff --git a/man2/utime.2 b/man2/utime.2 index 44d335636..d07dd6d23 100644 --- a/man2/utime.2 +++ b/man2/utime.2 @@ -47,7 +47,7 @@ utime, utimes \- change file last access and modification times .B Note: modern applications may prefer to use the interfaces described in .BR utimensat (2). - +.PP The .BR utime () system call @@ -58,23 +58,23 @@ to the fields of .I times respectively. - +.PP If .I times is NULL, then the access and modification times of the file are set to the current time. - +.PP Changing timestamps is permitted when: either the process has appropriate privileges, or the effective user ID equals the user ID of the file, or .I times is NULL and the process has write permission for the file. - +.PP The .I utimbuf structure is: - +.PP .in +4n .nf struct utimbuf { @@ -83,12 +83,12 @@ struct utimbuf { }; .fi .in - +.PP The .BR utime () system call allows specification of timestamps with a resolution of 1 second. - +.PP The .BR utimes () system call @@ -101,7 +101,7 @@ structures, which allow a precision of 1 microsecond for specifying timestamps. The .I timeval structure is: - +.PP .in +4n .nf struct timeval { diff --git a/man2/utimensat.2 b/man2/utimensat.2 index 7c3c29e1e..81c5f2880 100644 --- a/man2/utimensat.2 +++ b/man2/utimensat.2 @@ -76,7 +76,7 @@ and .BR utimes (2), which permit only second and microsecond precision, respectively, when setting file timestamps. - +.PP With .BR utimensat () the file is specified via the pathname given in @@ -86,7 +86,7 @@ With the file whose timestamps are to be updated is specified via an open file descriptor, .IR fd . 
- +.PP For both calls, the new file timestamps are specified in the array .IR times : .IR times [0] @@ -110,7 +110,7 @@ struct timespec { .PP Updated file timestamps are set to the greatest value supported by the filesystem that is not greater than the specified time. - +.PP If the .I tv_nsec field of one of the @@ -129,7 +129,7 @@ In both of these cases, the value of the corresponding .I tv_sec .\" 2.6.22 was broken: it is not ignored field is ignored. - +.PP If .I times is NULL, then both timestamps are set to the current time. @@ -167,7 +167,7 @@ and neither field is .BR UTIME_OMIT ), either condition 2 or 3 above must apply. - +.PP If both .I tv_nsec fields are specified as @@ -190,7 +190,7 @@ for a relative pathname). See .BR openat (2) for an explanation of why this can be useful. - +.PP If .I pathname is relative and @@ -202,13 +202,13 @@ then is interpreted relative to the current working directory of the calling process (like .BR utimes (2)). - +.PP If .I pathname is absolute, then .I dirfd is ignored. - +.PP The .I flags field is a bit mask that may be 0, or include the following constant, @@ -408,7 +408,7 @@ Search permission is denied for one of the prefix components of .BR utimensat () was added to Linux in kernel 2.6.22; glibc support was added with version 2.6. - +.PP Support for .BR futimens () first appeared in glibc 2.6. @@ -425,7 +425,7 @@ T{ .BR futimens () T} Thread safety MT-Safe .TE - +.sp 1 .SH CONFORMING TO .BR futimens () and @@ -435,7 +435,7 @@ are specified in POSIX.1-2008. .BR utimensat () obsoletes .BR futimesat (2). - +.PP On Linux, timestamps cannot be changed for a file marked immutable, and the only change permitted for files marked append-only is to set the timestamps to the current time. @@ -444,7 +444,7 @@ set the timestamps to the current time. and .BR utimes (2) on Linux.) 
- +.PP If both .I tv_nsec fields are specified as @@ -477,7 +477,7 @@ is implemented as: utimensat(fd, NULL, times, 0); .fi - +.PP Note, however, that the glibc wrapper for .BR utimensat () disallows passing NULL as the value for diff --git a/man2/vfork.2 b/man2/vfork.2 index c054917ef..fcca9a942 100644 --- a/man2/vfork.2 +++ b/man2/vfork.2 @@ -115,7 +115,7 @@ established by the parent process and flushing the parent's .BR stdio (3) buffers), but may call .BR _exit (2). - +.PP As with .BR fork (2), the child process created by @@ -126,7 +126,7 @@ the .BR vfork () call differs only in the treatment of the virtual address space, as described above. - +.PP Signals sent to the parent arrive after the child releases the parent's memory (i.e., after the child terminates @@ -163,7 +163,7 @@ held in a register. 4.3BSD; POSIX.1-2001 (but marked OBSOLETE). POSIX.1-2008 removes the specification of .BR vfork (). - +.PP The requirements put on .BR vfork () by the standards are weaker than those put on @@ -242,7 +242,7 @@ changes memory, those changes may result in an inconsistent process state from the perspective of the parent process (e.g., memory changes would be visible in the parent, but changes to the state of open file descriptors would not be visible). - +.PP When .BR vfork () is called in a multithreaded process, @@ -279,7 +279,7 @@ LinuxThreads threading library. (See .BR pthreads (7) for a description of Linux threading libraries.) - +.PP A call to .BR vfork () is equivalent to calling @@ -287,7 +287,7 @@ is equivalent to calling with .I flags specified as: - +.PP CLONE_VM | CLONE_VFORK | SIGCHLD .SS History The diff --git a/man2/vmsplice.2 b/man2/vmsplice.2 index ced462663..018266e8f 100644 --- a/man2/vmsplice.2 +++ b/man2/vmsplice.2 @@ -54,14 +54,14 @@ into a pipe. The file descriptor .I fd must refer to a pipe. 
- +.PP The pointer .I iov points to an array of .I iovec structures as defined in .IR <sys/uio.h> : - +.PP .in +4n .nf struct iovec { @@ -70,7 +70,7 @@ struct iovec { }; .in .fi - +.PP The .I flags argument is a bit mask that is composed by ORing together diff --git a/man2/wait.2 b/man2/wait.2 index b86bcc95a..de5027ad1 100644 --- a/man2/wait.2 +++ b/man2/wait.2 @@ -94,7 +94,7 @@ In the case of a terminated child, performing a wait allows the system to release the resources associated with the child; if a wait is not performed, then the terminated child remains in a "zombie" state (see NOTES below). - +.PP If a child has already changed state, then these calls return immediately. Otherwise, they block until either a child changes state or a signal handler interrupts the call (assuming that system calls @@ -118,7 +118,7 @@ is equivalent to: waitpid(\-1, &wstatus, 0); .fi - +.PP The .BR waitpid () system call suspends execution of the calling process until a @@ -131,7 +131,7 @@ waits only for terminated children, but this behavior is modifiable via the .I options argument, as described below. - +.PP The value of .I pid can be: @@ -221,7 +221,7 @@ returns true if the child produced a core dump. This macro should be employed only if .B WIFSIGNALED returned true. - +.IP This macro is not specified in POSIX.1-2001 and is not available on some UNIX implementations (e.g., AIX, SunOS). Therefore, enclose its use inside @@ -249,7 +249,7 @@ The .BR waitid () system call (available since Linux 2.6.9) provides more precise control over which child state changes to wait for. - +.PP The .I idtype and @@ -362,7 +362,7 @@ after the call returns. .BR wait (): on success, returns the process ID of the terminated child; on error, \-1 is returned. - +.PP .BR waitpid (): on success, returns the process ID of the child whose state has changed; if @@ -371,7 +371,7 @@ was specified and one or more child(ren) specified by .I pid exist, but have not yet changed state, then 0 is returned. 
On error, \-1 is returned. - +.PP .BR waitid (): returns 0 on success or if @@ -446,7 +446,7 @@ are adopted by operation); .BR init (1) automatically performs a wait to remove the zombies. - +.PP POSIX.1-2001 specifies that if the disposition of .B SIGCHLD is set to @@ -475,7 +475,7 @@ Note that even though the default disposition of is "ignore", explicitly setting the disposition to .B SIG_IGN results in different treatment of zombie process children.) - +.PP Linux 2.6 conforms to the POSIX requirements. However, Linux 2.4 (and earlier) does not: if a @@ -547,7 +547,7 @@ flag is automatically implied if the child is being ptraced. .BR wait () is actually a library function that (in glibc) is implemented as a call to .BR wait4 (2). - +.PP On some architectures, there is no .BR waitpid () system call; @@ -555,7 +555,7 @@ system call; instead, this interface is implemented via a C library wrapper function that calls .BR wait4 (2). - +.PP The raw .BR waitid () system call takes a fifth argument, of type @@ -599,7 +599,7 @@ using the integer supplied on the command line as the exit status. The parent process executes a loop that monitors the child using .BR waitpid (), and uses the W*() macros described above to analyze the wait status value. - +.PP The following shell session demonstrates the use of the program: .in +4n .nf diff --git a/man2/wait4.2 b/man2/wait4.2 index d3dded339..5a121a40a 100644 --- a/man2/wait4.2 +++ b/man2/wait4.2 @@ -72,7 +72,7 @@ These functions are obsolete; use or .BR waitid (2) in new programs. - +.PP The .BR wait3 () and @@ -139,7 +139,7 @@ As for .BR waitpid (2). .SH CONFORMING TO 4.3BSD. - +.PP SUSv1 included a specification of .BR wait3 (); SUSv2 included diff --git a/man2/write.2 b/man2/write.2 index 37e2ce36c..6a39b5b55 100644 --- a/man2/write.2 +++ b/man2/write.2 @@ -51,7 +51,7 @@ bytes from the buffer pointed .I buf to the file referred to by the file descriptor .IR fd . 
- +.PP The number of bytes written may be less than .I count if, for example, @@ -65,7 +65,7 @@ handler after having written less than bytes. (See also .BR pipe (7).) - +.PP For a seekable file (i.e., one to which .BR lseek (2) may be applied, for example, a regular file) @@ -79,14 +79,14 @@ with the file offset is first set to the end of the file before writing. The adjustment of the file offset and the write operation are performed as an atomic step. - +.PP POSIX requires that a .BR read (2) that can be proved to occur after a .BR write () has returned will return the new data. Note that not all filesystems are POSIX conforming. - +.PP According to POSIX.1, if .I count is greater than @@ -99,10 +99,10 @@ nothing was written). It is not an error if this number is smaller than the number of bytes requested; this may happen for example because the disk device was filled. See also NOTES. - +.PP On error, \-1 is returned, and \fIerrno\fP is set appropriately. - +.PP If \fIcount\fP is zero and .I fd refers to a regular file, then @@ -206,7 +206,7 @@ Other errors may occur, depending on the object connected to SVr4, 4.3BSD, POSIX.1-2001. .\" SVr4 documents additional error .\" conditions EDEADLK, ENOLCK, ENOLNK, ENOSR, ENXIO, or ERANGE. - +.PP Under SVr4 a write may be interrupted and return .B EINTR at any point, @@ -218,7 +218,7 @@ and .I ssize_t are, respectively, unsigned and signed integer data types specified by POSIX.1. - +.PP A successful return from .BR write () does not make any guarantee that data has been committed to disk. @@ -227,7 +227,7 @@ that space has successfully been reserved for the data. The only way to be sure is to call .BR fsync (2) after you are done writing all your data. 
- +.PP If a .BR write () is interrupted by a signal handler before any bytes are written, @@ -235,7 +235,7 @@ then the call fails with the error .BR EINTR ; if it is interrupted after at least one byte has been written, the call succeeds, and returns the number of bytes written. - +.PP On Linux, .BR write () (and similar system calls) will transfer at most @@ -246,13 +246,13 @@ returning the number of bytes actually transferred. .SH BUGS According to POSIX.1-2008/SUSv4 Section XSI 2.9.7 ("Thread Interactions with Regular File Operations"): - +.PP .RS 4 All of the following functions shall be atomic with respect to each other in the effects specified in POSIX.1-2008 when they operate on regular files or symbolic links: ... .RE - +.PP Among the APIs subsequently listed are .BR write () and