mirror of https://github.com/mkerrisk/man-pages
420 lines
12 KiB
Groff
420 lines
12 KiB
Groff
.\" Copyright (C) 2019 Michael Kerrisk <mtk.manpages@gmail.com>
|
|
.\" A very few fragments remain from an earlier page written by
|
|
.\" Werner Almesberger in 2000
|
|
.\"
|
|
.\" %%%LICENSE_START(VERBATIM)
|
|
.\" Permission is granted to make and distribute verbatim copies of this
|
|
.\" manual provided the copyright notice and this permission notice are
|
|
.\" preserved on all copies.
|
|
.\"
|
|
.\" Permission is granted to copy and distribute modified versions of this
|
|
.\" manual under the conditions for verbatim copying, provided that the
|
|
.\" entire resulting derived work is distributed under the terms of a
|
|
.\" permission notice identical to this one.
|
|
.\"
|
|
.\" Since the Linux kernel and libraries are constantly changing, this
|
|
.\" manual page may be incorrect or out-of-date. The author(s) assume no
|
|
.\" responsibility for errors or omissions, or for damages resulting from
|
|
.\" the use of the information contained herein. The author(s) may not
|
|
.\" have taken the same level of care in the production of this manual,
|
|
.\" which is licensed free of charge, as they might when working
|
|
.\" professionally.
|
|
.\"
|
|
.\" Formatted or processed versions of this manual, if unaccompanied by
|
|
.\" the source, must acknowledge the copyright and authors of this work.
|
|
.\" %%%LICENSE_END
|
|
.\"
|
|
.TH PIVOT_ROOT 2 2020-11-01 "Linux" "Linux Programmer's Manual"
|
|
.SH NAME
|
|
pivot_root \- change the root mount
|
|
.SH SYNOPSIS
|
|
.BI "int pivot_root(const char *" new_root ", const char *" put_old );
|
|
.PP
|
|
.IR Note :
|
|
There is no glibc wrapper for this system call; see NOTES.
|
|
.SH DESCRIPTION
|
|
.BR pivot_root ()
|
|
changes the root mount in the mount namespace of the calling process.
|
|
More precisely, it moves the root mount to the
|
|
directory \fIput_old\fP and makes \fInew_root\fP the new root mount.
|
|
The calling process must have the
|
|
.B CAP_SYS_ADMIN
|
|
capability in the user namespace that owns the caller's mount namespace.
|
|
.PP
|
|
.BR pivot_root ()
|
|
changes the root directory and the current working directory
|
|
of each process or thread in the same mount namespace to
|
|
.I new_root
|
|
if they point to the old root directory.
|
|
(See also NOTES.)
|
|
On the other hand,
|
|
.BR pivot_root ()
|
|
does not change the caller's current working directory
|
|
(unless it is on the old root directory),
|
|
and thus it should be followed by a
|
|
\fBchdir("/")\fP call.
|
|
.PP
|
|
The following restrictions apply:
|
|
.IP \- 3
|
|
.IR new_root
|
|
and
|
|
.IR put_old
|
|
must be directories.
|
|
.IP \-
|
|
.I new_root
|
|
and
|
|
.I put_old
|
|
must not be on the same mount as the current root.
|
|
.IP \-
|
|
\fIput_old\fP must be at or underneath \fInew_root\fP;
|
|
that is, adding some nonnegative
|
|
number of "\fI/..\fP" suffixes to the pathname pointed to by
|
|
.I put_old
|
|
must yield the same directory as \fInew_root\fP.
|
|
.IP \-
|
|
.I new_root
|
|
must be a path to a mount point, but can't be
|
|
.IR """/""" .
|
|
A path that is not already a mount point can be converted into one by
|
|
bind mounting the path onto itself.
|
|
.IP \-
|
|
The propagation type of the parent mount of
|
|
.IR new_root
|
|
and the parent mount of the current root directory must not be
|
|
.BR MS_SHARED ;
|
|
similarly, if
|
|
.I put_old
|
|
is an existing mount point, its propagation type must not be
|
|
.BR MS_SHARED .
|
|
These restrictions ensure that
|
|
.BR pivot_root ()
|
|
never propagates any changes to another mount namespace.
|
|
.IP \-
|
|
The current root directory must be a mount point.
|
|
.SH RETURN VALUE
|
|
On success, zero is returned.
|
|
On error, \-1 is returned, and
|
|
\fIerrno\fP is set appropriately.
|
|
.SH ERRORS
|
|
.BR pivot_root ()
|
|
may fail with any of the same errors as
|
|
.BR stat (2).
|
|
Additionally, it may fail with the following errors:
|
|
.TP
|
|
.B EBUSY
|
|
.\" Reconfirmed that the following error occurs on Linux 5.0 by
|
|
.\" specifying 'new_root' as "/rootfs" and 'put_old' as
|
|
.\" "/rootfs/oldrootfs", and *not* bind mounting "/rootfs" on top of
|
|
.\" itself. Of course, this is an odd situation, since a later check
|
|
.\" in the kernel code will in any case yield EINVAL if 'new_root' is
|
|
.\" not a mount point. However, when the system call was first added,
|
|
.\" 'new_root' was not required to be a mount point. So, this
|
|
.\" error is nowadays probably just the result of crufty accumulation.
|
|
.\" This error can also occur if we bind mount "/" on top of itself
|
|
.\" and try to specify "/" as the 'new' (again, an odd situation). So,
|
|
.\" the EBUSY check in the kernel does still seem necessary to prevent
|
|
.\" that case. Furthermore, the "or put_old" piece is probably
|
|
.\" redundant text (although the check is in the kernel), since,
|
|
.\" in another check, 'put_old' is required to be under 'new_root'.
|
|
.I new_root
|
|
or
|
|
.I put_old
|
|
is on the current root mount.
|
|
(This error covers the pathological case where
|
|
.I new_root
|
|
is
|
|
.IR """/""" .)
|
|
.TP
|
|
.B EINVAL
|
|
.I new_root
|
|
is not a mount point.
|
|
.TP
|
|
.B EINVAL
|
|
\fIput_old\fP is not at or underneath \fInew_root\fP.
|
|
.TP
|
|
.B EINVAL
|
|
The current root directory is not a mount point
|
|
(because of an earlier
|
|
.BR chroot (2)).
|
|
.TP
|
|
.B EINVAL
|
|
The current root is on the rootfs (initial ramfs) mount; see NOTES.
|
|
.TP
|
|
.B EINVAL
|
|
Either the mount point at
|
|
.IR new_root ,
|
|
or the parent mount of that mount point,
|
|
has propagation type
|
|
.BR MS_SHARED .
|
|
.TP
|
|
.B EINVAL
|
|
.I put_old
|
|
is a mount point and has the propagation type
|
|
.BR MS_SHARED .
|
|
.TP
|
|
.B ENOTDIR
|
|
\fInew_root\fP or \fIput_old\fP is not a directory.
|
|
.TP
|
|
.B EPERM
|
|
The calling process does not have the
|
|
.B CAP_SYS_ADMIN
|
|
capability.
|
|
.SH VERSIONS
|
|
.BR pivot_root ()
|
|
was introduced in Linux 2.3.41.
|
|
.SH CONFORMING TO
|
|
.BR pivot_root ()
|
|
is Linux-specific and hence is not portable.
|
|
.SH NOTES
|
|
Glibc does not provide a wrapper for this system call; call it using
|
|
.BR syscall (2).
|
|
.PP
|
|
A command-line interface for this system call is provided by
|
|
.BR pivot_root (8).
|
|
.PP
|
|
.BR pivot_root ()
|
|
allows the caller to switch to a new root filesystem while at the same time
|
|
placing the old root mount at a location under
|
|
.I new_root
|
|
from where it can subsequently be unmounted.
|
|
(The fact that it moves all processes that have a root directory
|
|
or current working directory on the old root directory to the
|
|
new root frees the old root directory of users,
|
|
allowing the old root mount to be unmounted more easily.)
|
|
.PP
|
|
One use of
|
|
.BR pivot_root ()
|
|
is during system startup, when the
|
|
system mounts a temporary root filesystem (e.g., an
|
|
.BR initrd (4)),
|
|
then mounts the real root filesystem, and eventually turns the latter into
|
|
the root directory of all relevant processes and threads.
|
|
A modern use is to set up a root filesystem during
|
|
the creation of a container.
|
|
.PP
|
|
The fact that
|
|
.BR pivot_root ()
|
|
modifies process root and current working directories in the
|
|
manner noted in DESCRIPTION
|
|
is necessary in order to prevent kernel threads from keeping the old
|
|
root mount busy with their root and current working directories,
|
|
even if they never access
|
|
the filesystem in any way.
|
|
.PP
|
|
The rootfs (initial ramfs) cannot be
|
|
.BR pivot_root ()ed.
|
|
The recommended method of changing the root filesystem in this case is
|
|
to delete everything in rootfs, overmount rootfs with the new root, attach
|
|
.IR stdin / stdout / stderr
|
|
to the new
|
|
.IR /dev/console ,
|
|
and exec the new
|
|
.BR init (1).
|
|
Helper programs for this process exist; see
|
|
.BR switch_root (8).
|
|
.\"
|
|
.SS pivot_root(\(dq.\(dq, \(dq.\(dq)
|
|
.I new_root
|
|
and
|
|
.I put_old
|
|
may be the same directory.
|
|
In particular, the following sequence allows a pivot-root operation
|
|
without needing to create and remove a temporary directory:
|
|
.PP
|
|
.in +4n
|
|
.EX
|
|
chdir(new_root);
|
|
pivot_root(".", ".");
|
|
umount2(".", MNT_DETACH);
|
|
.EE
|
|
.in
|
|
.PP
|
|
This sequence succeeds because the
|
|
.BR pivot_root ()
|
|
call stacks the old root mount point
|
|
on top of the new root mount point at
|
|
.IR / .
|
|
At that point, the calling process's root directory and current
|
|
working directory refer to the new root mount point
|
|
.RI ( new_root ).
|
|
During the subsequent
|
|
.BR umount2 ()
|
|
call, resolution of
|
|
.IR """."""
|
|
starts with
|
|
.I new_root
|
|
and then moves up the list of mounts stacked at
|
|
.IR / ,
|
|
with the result that the old root mount point is unmounted.
|
|
.\"
|
|
.SS Historical notes
|
|
For many years, this manual page carried the following text:
|
|
.RS
|
|
.PP
|
|
.BR pivot_root ()
|
|
may or may not change the current root and the current
|
|
working directory of any processes or threads which use the old
|
|
root directory.
|
|
The caller of
|
|
.BR pivot_root ()
|
|
must ensure that processes with root or current working directory
|
|
at the old root operate correctly in either case.
|
|
An easy way to ensure this is to change their
|
|
root and current working directory to \fInew_root\fP before invoking
|
|
.BR pivot_root ().
|
|
.RE
|
|
.PP
|
|
This text, written before the system call implementation was
|
|
even finalized in the kernel, was probably intended to warn users
|
|
at that time that the implementation might change before final release.
|
|
However, the behavior stated in DESCRIPTION
|
|
has remained consistent since this system call
|
|
was first implemented and will not change now.
|
|
.SH EXAMPLES
|
|
.\" FIXME
|
|
.\" Would it be better, because simpler, to use unshare(2)
|
|
.\" rather than clone(2) in the example below?
|
|
The program below demonstrates the use of
|
|
.BR pivot_root ()
|
|
inside a mount namespace that is created using
|
|
.BR clone (2).
|
|
After pivoting to the root directory named in the program's
|
|
first command-line argument, the child created by
|
|
.BR clone (2)
|
|
then executes the program named in the remaining command-line arguments.
|
|
.PP
|
|
We demonstrate the program by creating a directory that will serve as
|
|
the new root filesystem and placing a copy of the (statically linked)
|
|
.BR busybox (1)
|
|
executable in that directory.
|
|
.PP
|
|
.in +4n
|
|
.EX
|
|
$ \fBmkdir /tmp/rootfs\fP
|
|
$ \fBls \-id /tmp/rootfs\fP # Show inode number of new root directory
|
|
319459 /tmp/rootfs
|
|
$ \fBcp $(which busybox) /tmp/rootfs\fP
|
|
$ \fBPS1=\(aqbbsh$ \(aq sudo ./pivot_root_demo /tmp/rootfs /busybox sh\fP
|
|
bbsh$ \fBPATH=/\fP
|
|
bbsh$ \fBbusybox ln busybox ln\fP
|
|
bbsh$ \fBln busybox echo\fP
|
|
bbsh$ \fBln busybox ls\fP
|
|
bbsh$ \fBls\fP
|
|
busybox echo ln ls
|
|
bbsh$ \fBls \-id /\fP # Compare with inode number above
|
|
319459 /
|
|
bbsh$ \fBecho \(aqhello world\(aq\fP
|
|
hello world
|
|
.EE
|
|
.in
|
|
.SS Program source
|
|
\&
|
|
.PP
|
|
.EX
|
|
/* pivot_root_demo.c */
|
|
|
|
#define _GNU_SOURCE
|
|
#include <sched.h>
|
|
#include <stdio.h>
|
|
#include <stdlib.h>
|
|
#include <unistd.h>
|
|
#include <sys/wait.h>
|
|
#include <sys/syscall.h>
|
|
#include <sys/mount.h>
|
|
#include <sys/stat.h>
|
|
#include <limits.h>
|
|
#include <sys/mman.h>
|
|
|
|
#define errExit(msg) do { perror(msg); exit(EXIT_FAILURE); \e
|
|
} while (0)
|
|
|
|
/* Invoke the pivot_root system call directly through syscall(2),
   passing both pathnames through unchanged. Returns the result of
   syscall(2) converted to int. */

static int
pivot_root(const char *new_root, const char *put_old)
{
    long status;

    status = syscall(SYS_pivot_root, new_root, put_old);
    return (int) status;
}
|
|
|
|
#define STACK_SIZE (1024 * 1024)
|
|
|
|
static int              /* Startup function for cloned child */
child(void *arg)
{
    /* \(aqarg\(aq is the vector handed over by main() (&argv[1]):
       args[0] names the new root directory; args[1] and onward
       form the command (plus its arguments) to be executed */

    char **args = arg;
    char *new_root = args[0];
    const char *put_old = "/oldrootfs";
    char path[PATH_MAX];

    /* Ensure that \(aqnew_root\(aq and its parent mount don\(aqt have
       shared propagation (which would cause pivot_root() to
       return an error), and prevent propagation of mount
       events to the initial mount namespace */

    if (mount(NULL, "/", NULL, MS_REC | MS_PRIVATE, NULL) == \-1)
        errExit("mount\-MS_PRIVATE");

    /* Ensure that \(aqnew_root\(aq is a mount point */

    if (mount(new_root, new_root, NULL, MS_BIND, NULL) == \-1)
        errExit("mount\-MS_BIND");

    /* Create directory to which old root will be pivoted */

    snprintf(path, sizeof(path), "%s/%s", new_root, put_old);
    if (mkdir(path, 0777) == \-1)
        errExit("mkdir");

    /* And pivot the root filesystem */

    if (pivot_root(new_root, path) == \-1)
        errExit("pivot_root");

    /* Switch the current working directory to "/" */

    if (chdir("/") == \-1)
        errExit("chdir");

    /* Unmount old root and remove mount point; failures here are
       reported but do not prevent the command from being run */

    if (umount2(put_old, MNT_DETACH) == \-1)
        perror("umount2");
    if (rmdir(put_old) == \-1)
        perror("rmdir");

    /* Execute the command specified in argv[2]... */

    execv(args[1], &args[1]);
    errExit("execv");
}
|
|
|
|
int
|
|
main(int argc, char *argv[])
|
|
{
|
|
/* Create a child process in a new mount namespace */
|
|
|
|
char *stack = mmap(NULL, STACK_SIZE, PROT_READ | PROT_WRITE,
|
|
MAP_PRIVATE | MAP_ANONYMOUS | MAP_STACK, \-1, 0);
|
|
if (stack == MAP_FAILED)
|
|
errExit("mmap");
|
|
|
|
if (clone(child, stack + STACK_SIZE,
|
|
CLONE_NEWNS | SIGCHLD, &argv[1]) == \-1)
|
|
errExit("clone");
|
|
|
|
/* Parent falls through to here; wait for child */
|
|
|
|
if (wait(NULL) == \-1)
|
|
errExit("wait");
|
|
|
|
exit(EXIT_SUCCESS);
|
|
}
|
|
.EE
|
|
.SH SEE ALSO
|
|
.BR chdir (2),
|
|
.BR chroot (2),
|
|
.BR mount (2),
|
|
.BR stat (2),
|
|
.BR initrd (4),
|
|
.BR mount_namespaces (7),
|
|
.BR pivot_root (8),
|
|
.BR switch_root (8)
|