mirror of https://github.com/mkerrisk/man-pages
3808 lines
97 KiB
Groff
3808 lines
97 KiB
Groff
.\" Copyright (c) 2012, Vincent Weaver
|
|
.\"
|
|
.\" %%%LICENSE_START(GPLv2+_DOC_FULL)
|
|
.\" This is free documentation; you can redistribute it and/or
|
|
.\" modify it under the terms of the GNU General Public License as
|
|
.\" published by the Free Software Foundation; either version 2 of
|
|
.\" the License, or (at your option) any later version.
|
|
.\"
|
|
.\" The GNU General Public License's references to "object code"
|
|
.\" and "executables" are to be interpreted as the output of any
|
|
.\" document formatting or typesetting system, including
|
|
.\" intermediate and printed output.
|
|
.\"
|
|
.\" This manual is distributed in the hope that it will be useful,
|
|
.\" but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
.\" MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
.\" GNU General Public License for more details.
|
|
.\"
|
|
.\" You should have received a copy of the GNU General Public
|
|
.\" License along with this manual; if not, see
|
|
.\" <http://www.gnu.org/licenses/>.
|
|
.\" %%%LICENSE_END
|
|
.\"
|
|
.\" This document is based on the perf_event.h header file, the
|
|
.\" tools/perf/design.txt file, and a lot of bitter experience.
|
|
.\"
|
|
.TH PERF_EVENT_OPEN 2 2021-03-22 "Linux" "Linux Programmer's Manual"
|
|
.SH NAME
|
|
perf_event_open \- set up performance monitoring
|
|
.SH SYNOPSIS
|
|
.nf
|
|
.BR "#include <linux/perf_event.h>" " /* Definition of " PERF_* " constants */"
|
|
.BR "#include <linux/hw_breakpoint.h>" " /* Definition of " HW_* " constants */"
|
|
.BR "#include <sys/syscall.h>" " /* Definition of " SYS_* " constants */"
|
|
.B #include <unistd.h>
|
|
.PP
|
|
.BI "int syscall(SYS_perf_event_open, struct perf_event_attr *" attr ,
|
|
.BI " pid_t " pid ", int " cpu ", int " group_fd \
|
|
", unsigned long " flags );
|
|
.fi
|
|
.PP
|
|
.IR Note :
|
|
glibc provides no wrapper for
|
|
.BR perf_event_open (),
|
|
necessitating the use of
|
|
.BR syscall (2).
|
|
.SH DESCRIPTION
|
|
Given a list of parameters,
|
|
.BR perf_event_open ()
|
|
returns a file descriptor, for use in subsequent system calls
|
|
.RB ( read "(2), " mmap "(2), " prctl "(2), " fcntl "(2), etc.)."
|
|
.PP
|
|
A call to
|
|
.BR perf_event_open ()
|
|
creates a file descriptor that allows measuring performance
|
|
information.
|
|
Each file descriptor corresponds to one
|
|
event that is measured; these can be grouped together
|
|
to measure multiple events simultaneously.
|
|
.PP
|
|
Events can be enabled and disabled in two ways: via
|
|
.BR ioctl (2)
|
|
and via
|
|
.BR prctl (2).
|
|
When an event is disabled it does not count or generate overflows but does
|
|
continue to exist and maintain its count value.
|
|
.PP
|
|
Events come in two flavors: counting and sampled.
|
|
A
|
|
.I counting
|
|
event is one that is used for counting the aggregate number of events
|
|
that occur.
|
|
In general, counting event results are gathered with a
|
|
.BR read (2)
|
|
call.
|
|
A
|
|
.I sampling
|
|
event periodically writes measurements to a buffer that can then
|
|
be accessed via
|
|
.BR mmap (2).
|
|
.SS Arguments
|
|
The
|
|
.I pid
|
|
and
|
|
.I cpu
|
|
arguments allow specifying which process and CPU to monitor:
|
|
.TP
|
|
.BR "pid == 0" " and " "cpu == \-1"
|
|
This measures the calling process/thread on any CPU.
|
|
.TP
|
|
.BR "pid == 0" " and " "cpu >= 0"
|
|
This measures the calling process/thread only
|
|
when running on the specified CPU.
|
|
.TP
|
|
.BR "pid > 0" " and " "cpu == \-1"
|
|
This measures the specified process/thread on any CPU.
|
|
.TP
|
|
.BR "pid > 0" " and " "cpu >= 0"
|
|
This measures the specified process/thread only
|
|
when running on the specified CPU.
|
|
.TP
|
|
.BR "pid == \-1" " and " "cpu >= 0"
|
|
This measures all processes/threads on the specified CPU.
|
|
This requires
|
|
.B CAP_PERFMON
|
|
(since Linux 5.8) or
|
|
.B CAP_SYS_ADMIN
|
|
capability or a
|
|
.I /proc/sys/kernel/perf_event_paranoid
|
|
value of less than 1.
|
|
.TP
|
|
.BR "pid == \-1" " and " "cpu == \-1"
|
|
This setting is invalid and will return an error.
|
|
.PP
|
|
When
|
|
.I pid
|
|
is greater than zero, permission to perform this system call
|
|
is governed by
|
|
.B CAP_PERFMON
|
|
(since Linux 5.9) and a ptrace access mode
|
|
.B PTRACE_MODE_READ_REALCREDS
|
|
check on older Linux versions; see
|
|
.BR ptrace (2).
|
|
.PP
|
|
The
|
|
.I group_fd
|
|
argument allows event groups to be created.
|
|
An event group has one event which is the group leader.
|
|
The leader is created first, with
|
|
.IR group_fd " = \-1."
|
|
The rest of the group members are created with subsequent
|
|
.BR perf_event_open ()
|
|
calls with
|
|
.I group_fd
|
|
being set to the file descriptor of the group leader.
|
|
(A single event on its own is created with
|
|
.IR group_fd " = \-1"
|
|
and is considered to be a group with only 1 member.)
|
|
An event group is scheduled onto the CPU as a unit: it will
|
|
be put onto the CPU only if all of the events in the group can be put onto
|
|
the CPU.
|
|
This means that the values of the member events can be
|
|
meaningfully compared\(emadded, divided (to get ratios), and so on\(emwith each
|
|
other, since they have counted events for the same set of executed
|
|
instructions.
|
|
.PP
|
|
The
|
|
.I flags
|
|
argument is formed by ORing together zero or more of the following values:
|
|
.TP
|
|
.BR PERF_FLAG_FD_CLOEXEC " (since Linux 3.14)"
|
|
.\" commit a21b0b354d4ac39be691f51c53562e2c24443d9e
|
|
This flag enables the close-on-exec flag for the created
|
|
event file descriptor,
|
|
so that the file descriptor is automatically closed on
|
|
.BR execve (2).
|
|
Setting the close-on-exec flag at creation time, rather than later with
|
|
.BR fcntl (2),
|
|
avoids potential race conditions where the calling thread invokes
|
|
.BR perf_event_open ()
|
|
and
|
|
.BR fcntl (2)
|
|
at the same time as another thread calls
|
|
.BR fork (2)
|
|
then
|
|
.BR execve (2).
|
|
.TP
|
|
.BR PERF_FLAG_FD_NO_GROUP
|
|
This flag tells the event to ignore the
|
|
.I group_fd
|
|
parameter except for the purpose of setting up output redirection
|
|
using the
|
|
.B PERF_FLAG_FD_OUTPUT
|
|
flag.
|
|
.TP
|
|
.BR PERF_FLAG_FD_OUTPUT " (broken since Linux 2.6.35)"
|
|
.\" commit ac9721f3f54b27a16c7e1afb2481e7ee95a70318
|
|
This flag re-routes the event's sampled output to instead
|
|
be included in the mmap buffer of the event specified by
|
|
.IR group_fd .
|
|
.TP
|
|
.BR PERF_FLAG_PID_CGROUP " (since Linux 2.6.39)"
|
|
.\" commit e5d1367f17ba6a6fed5fd8b74e4d5720923e0c25
|
|
This flag activates per-container system-wide monitoring.
|
|
A container
|
|
is an abstraction that isolates a set of resources for finer-grained
|
|
control (CPUs, memory, etc.).
|
|
In this mode, the event is measured
|
|
only if the thread running on the monitored CPU belongs to the designated
|
|
container (cgroup).
|
|
The cgroup is identified by passing a file descriptor
|
|
opened on its directory in the cgroupfs filesystem.
|
|
For instance, if the
|
|
cgroup to monitor is called
|
|
.IR test ,
|
|
then a file descriptor opened on
|
|
.I /dev/cgroup/test
|
|
(assuming cgroupfs is mounted on
|
|
.IR /dev/cgroup )
|
|
must be passed as the
|
|
.I pid
|
|
parameter.
|
|
cgroup monitoring is available only
|
|
for system-wide events and may therefore require extra permissions.
|
|
.PP
|
|
The
|
|
.I perf_event_attr
|
|
structure provides detailed configuration information
|
|
for the event being created.
|
|
.PP
|
|
.in +4n
|
|
.EX
|
|
struct perf_event_attr {
|
|
__u32 type; /* Type of event */
|
|
__u32 size; /* Size of attribute structure */
|
|
__u64 config; /* Type\-specific configuration */
|
|
|
|
union {
|
|
__u64 sample_period; /* Period of sampling */
|
|
__u64 sample_freq; /* Frequency of sampling */
|
|
};
|
|
|
|
__u64 sample_type; /* Specifies values included in sample */
|
|
__u64 read_format; /* Specifies values returned in read */
|
|
|
|
__u64 disabled : 1, /* off by default */
|
|
inherit : 1, /* children inherit it */
|
|
pinned : 1, /* must always be on PMU */
|
|
exclusive : 1, /* only group on PMU */
|
|
exclude_user : 1, /* don\(aqt count user */
|
|
exclude_kernel : 1, /* don\(aqt count kernel */
|
|
exclude_hv : 1, /* don\(aqt count hypervisor */
|
|
exclude_idle : 1, /* don\(aqt count when idle */
|
|
mmap : 1, /* include mmap data */
|
|
comm : 1, /* include comm data */
|
|
freq : 1, /* use freq, not period */
|
|
inherit_stat : 1, /* per task counts */
|
|
enable_on_exec : 1, /* next exec enables */
|
|
task : 1, /* trace fork/exit */
|
|
watermark : 1, /* wakeup_watermark */
|
|
precise_ip : 2, /* skid constraint */
|
|
mmap_data : 1, /* non\-exec mmap data */
|
|
sample_id_all : 1, /* sample_type all events */
|
|
exclude_host : 1, /* don\(aqt count in host */
|
|
exclude_guest : 1, /* don\(aqt count in guest */
|
|
exclude_callchain_kernel : 1,
|
|
/* exclude kernel callchains */
|
|
exclude_callchain_user : 1,
|
|
/* exclude user callchains */
|
|
mmap2 : 1, /* include mmap with inode data */
|
|
comm_exec : 1, /* flag comm events that are
|
|
due to exec */
|
|
use_clockid : 1, /* use clockid for time fields */
|
|
context_switch : 1, /* context switch data */
|
|
write_backward : 1, /* Write ring buffer from end
|
|
to beginning */
|
|
namespaces : 1, /* include namespaces data */
|
|
ksymbol : 1, /* include ksymbol events */
|
|
bpf_event : 1, /* include bpf events */
|
|
aux_output : 1, /* generate AUX records
|
|
instead of events */
|
|
cgroup : 1, /* include cgroup events */
|
|
text_poke : 1, /* include text poke events */
|
|
|
|
__reserved_1 : 30;
|
|
|
|
union {
|
|
__u32 wakeup_events; /* wakeup every n events */
|
|
__u32 wakeup_watermark; /* bytes before wakeup */
|
|
};
|
|
|
|
__u32 bp_type; /* breakpoint type */
|
|
|
|
union {
|
|
__u64 bp_addr; /* breakpoint address */
|
|
__u64 kprobe_func; /* for perf_kprobe */
|
|
__u64 uprobe_path; /* for perf_uprobe */
|
|
__u64 config1; /* extension of config */
|
|
};
|
|
|
|
union {
|
|
__u64 bp_len; /* breakpoint length */
|
|
__u64 kprobe_addr; /* with kprobe_func == NULL */
|
|
__u64 probe_offset; /* for perf_[k,u]probe */
|
|
__u64 config2; /* extension of config1 */
|
|
};
|
|
__u64 branch_sample_type; /* enum perf_branch_sample_type */
|
|
__u64 sample_regs_user; /* user regs to dump on samples */
|
|
__u32 sample_stack_user; /* size of stack to dump on
|
|
samples */
|
|
__s32 clockid; /* clock to use for time fields */
|
|
__u64 sample_regs_intr; /* regs to dump on samples */
|
|
__u32 aux_watermark; /* aux bytes before wakeup */
|
|
__u16 sample_max_stack; /* max frames in callchain */
|
|
__u16 __reserved_2; /* align to u64 */
|
|
|
|
};
|
|
.EE
|
|
.in
|
|
.PP
|
|
The fields of the
|
|
.I perf_event_attr
|
|
structure are described in more detail below:
|
|
.TP
|
|
.I type
|
|
This field specifies the overall event type.
|
|
It has one of the following values:
|
|
.RS
|
|
.TP
|
|
.B PERF_TYPE_HARDWARE
|
|
This indicates one of the "generalized" hardware events provided
|
|
by the kernel.
|
|
See the
|
|
.I config
|
|
field definition for more details.
|
|
.TP
|
|
.B PERF_TYPE_SOFTWARE
|
|
This indicates one of the software-defined events provided by the kernel
|
|
(even if no hardware support is available).
|
|
.TP
|
|
.B PERF_TYPE_TRACEPOINT
|
|
This indicates a tracepoint
|
|
provided by the kernel tracepoint infrastructure.
|
|
.TP
|
|
.B PERF_TYPE_HW_CACHE
|
|
This indicates a hardware cache event.
|
|
This has a special encoding, described in the
|
|
.I config
|
|
field definition.
|
|
.TP
|
|
.B PERF_TYPE_RAW
|
|
This indicates a "raw" implementation-specific event in the
|
|
.IR config " field."
|
|
.TP
|
|
.BR PERF_TYPE_BREAKPOINT " (since Linux 2.6.33)"
|
|
.\" commit 24f1e32c60c45c89a997c73395b69c8af6f0a84e
|
|
This indicates a hardware breakpoint as provided by the CPU.
|
|
Breakpoints can be read/write accesses to an address as well as
|
|
execution of an instruction address.
|
|
.TP
|
|
dynamic PMU
|
|
Since Linux 2.6.38,
|
|
.\" commit 2e80a82a49c4c7eca4e35734380f28298ba5db19
|
|
.BR perf_event_open ()
|
|
can support multiple PMUs.
|
|
To enable this, a value exported by the kernel can be used in the
|
|
.I type
|
|
field to indicate which PMU to use.
|
|
The value to use can be found in the sysfs filesystem:
|
|
there is a subdirectory per PMU instance under
|
|
.IR /sys/bus/event_source/devices .
|
|
In each subdirectory there is a
|
|
.I type
|
|
file whose content is an integer that can be used in the
|
|
.I type
|
|
field.
|
|
For instance,
|
|
.I /sys/bus/event_source/devices/cpu/type
|
|
contains the value for the core CPU PMU, which is usually 4.
|
|
.TP
|
|
.BR kprobe " and " uprobe " (since Linux 4.17)"
|
|
.\" commit 65074d43fc77bcae32776724b7fa2696923c78e4
|
|
.\" commit e12f03d7031a977356e3d7b75a68c2185ff8d155
|
|
.\" commit 33ea4b24277b06dbc55d7f5772a46f029600255e
|
|
These two dynamic PMUs create a kprobe/uprobe and attach it to the
|
|
file descriptor generated by
.BR perf_event_open ().
|
|
The kprobe/uprobe will be destroyed on the destruction of the file descriptor.
|
|
See fields
|
|
.IR kprobe_func ,
|
|
.IR uprobe_path ,
|
|
.IR kprobe_addr ,
|
|
and
|
|
.I probe_offset
|
|
for more details.
|
|
.RE
|
|
.TP
|
|
.I "size"
|
|
The size of the
|
|
.I perf_event_attr
|
|
structure for forward/backward compatibility.
|
|
Set this using
|
|
.I sizeof(struct perf_event_attr)
|
|
to allow the kernel to see
|
|
the struct size at the time of compilation.
|
|
.IP
|
|
The related define
|
|
.B PERF_ATTR_SIZE_VER0
|
|
is set to 64; this was the size of the first published struct.
|
|
.B PERF_ATTR_SIZE_VER1
|
|
is 72, corresponding to the addition of breakpoints in Linux 2.6.33.
|
|
.\" commit cb5d76999029ae7a517cb07dfa732c1b5a934fc2
|
|
.\" this was added much later when PERF_ATTR_SIZE_VER2 happened
|
|
.\" but the actual attr_size had increased in 2.6.33
|
|
.B PERF_ATTR_SIZE_VER2
|
|
is 80 corresponding to the addition of branch sampling in Linux 3.4.
|
|
.\" commit cb5d76999029ae7a517cb07dfa732c1b5a934fc2
|
|
.B PERF_ATTR_SIZE_VER3
|
|
is 96 corresponding to the addition
|
|
of
|
|
.I sample_regs_user
|
|
and
|
|
.I sample_stack_user
|
|
in Linux 3.7.
|
|
.\" commit 1659d129ed014b715b0b2120e6fd929bdd33ed03
|
|
.B PERF_ATTR_SIZE_VER4
|
|
is 104 corresponding to the addition of
|
|
.I sample_regs_intr
|
|
in Linux 3.19.
|
|
.\" commit 60e2364e60e86e81bc6377f49779779e6120977f
|
|
.B PERF_ATTR_SIZE_VER5
|
|
is 112 corresponding to the addition of
|
|
.I aux_watermark
|
|
in Linux 4.1.
|
|
.\" commit 1a5941312414c71dece6717da9a0fa1303127afa
|
|
.TP
|
|
.I "config"
|
|
This specifies which event you want, in conjunction with
|
|
the
|
|
.I type
|
|
field.
|
|
The
|
|
.I config1
|
|
and
|
|
.I config2
|
|
fields are also taken into account in cases where 64 bits is not
|
|
enough to fully specify the event.
|
|
The encoding of these fields is event dependent.
|
|
.IP
|
|
There are various ways to set the
|
|
.I config
|
|
field that are dependent on the value of the previously
|
|
described
|
|
.I type
|
|
field.
|
|
What follows are various possible settings for
|
|
.I config
|
|
separated out by
|
|
.IR type .
|
|
.IP
|
|
If
|
|
.I type
|
|
is
|
|
.BR PERF_TYPE_HARDWARE ,
|
|
we are measuring one of the generalized hardware CPU events.
|
|
Not all of these are available on all platforms.
|
|
Set
|
|
.I config
|
|
to one of the following:
|
|
.RS 12
|
|
.TP
|
|
.B PERF_COUNT_HW_CPU_CYCLES
|
|
Total cycles.
|
|
Be wary of what happens during CPU frequency scaling.
|
|
.TP
|
|
.B PERF_COUNT_HW_INSTRUCTIONS
|
|
Retired instructions.
|
|
Be careful, these can be affected by various
|
|
issues, most notably hardware interrupt counts.
|
|
.TP
|
|
.B PERF_COUNT_HW_CACHE_REFERENCES
|
|
Cache accesses.
|
|
Usually this indicates Last Level Cache accesses but this may
|
|
vary depending on your CPU.
|
|
This may include prefetches and coherency messages; again this
|
|
depends on the design of your CPU.
|
|
.TP
|
|
.B PERF_COUNT_HW_CACHE_MISSES
|
|
Cache misses.
|
|
Usually this indicates Last Level Cache misses; this is intended to be
|
|
used in conjunction with the
|
|
.B PERF_COUNT_HW_CACHE_REFERENCES
|
|
event to calculate cache miss rates.
|
|
.TP
|
|
.B PERF_COUNT_HW_BRANCH_INSTRUCTIONS
|
|
Retired branch instructions.
|
|
Prior to Linux 2.6.35, this used
|
|
the wrong event on AMD processors.
|
|
.\" commit f287d332ce835f77a4f5077d2c0ef1e3f9ea42d2
|
|
.TP
|
|
.B PERF_COUNT_HW_BRANCH_MISSES
|
|
Mispredicted branch instructions.
|
|
.TP
|
|
.B PERF_COUNT_HW_BUS_CYCLES
|
|
Bus cycles, which can be different from total cycles.
|
|
.TP
|
|
.BR PERF_COUNT_HW_STALLED_CYCLES_FRONTEND " (since Linux 3.0)"
|
|
.\" commit 8f62242246351b5a4bc0c1f00c0c7003edea128a
|
|
Stalled cycles during issue.
|
|
.TP
|
|
.BR PERF_COUNT_HW_STALLED_CYCLES_BACKEND " (since Linux 3.0)"
|
|
.\" commit 8f62242246351b5a4bc0c1f00c0c7003edea128a
|
|
Stalled cycles during retirement.
|
|
.TP
|
|
.BR PERF_COUNT_HW_REF_CPU_CYCLES " (since Linux 3.3)"
|
|
.\" commit c37e17497e01fc0f5d2d6feb5723b210b3ab8890
|
|
Total cycles; not affected by CPU frequency scaling.
|
|
.RE
|
|
.IP
|
|
If
|
|
.I type
|
|
is
|
|
.BR PERF_TYPE_SOFTWARE ,
|
|
we are measuring software events provided by the kernel.
|
|
Set
|
|
.I config
|
|
to one of the following:
|
|
.RS 12
|
|
.TP
|
|
.B PERF_COUNT_SW_CPU_CLOCK
|
|
This reports the CPU clock, a high-resolution per-CPU timer.
|
|
.TP
|
|
.B PERF_COUNT_SW_TASK_CLOCK
|
|
This reports a clock count specific to the task that is running.
|
|
.TP
|
|
.B PERF_COUNT_SW_PAGE_FAULTS
|
|
This reports the number of page faults.
|
|
.TP
|
|
.B PERF_COUNT_SW_CONTEXT_SWITCHES
|
|
This counts context switches.
|
|
Until Linux 2.6.34, these were all reported as user-space
|
|
events; after that they are reported as happening in the kernel.
|
|
.\" commit e49a5bd38159dfb1928fd25b173bc9de4bbadb21
|
|
.TP
|
|
.B PERF_COUNT_SW_CPU_MIGRATIONS
|
|
This reports the number of times the process
|
|
has migrated to a new CPU.
|
|
.TP
|
|
.B PERF_COUNT_SW_PAGE_FAULTS_MIN
|
|
This counts the number of minor page faults.
|
|
These did not require disk I/O to handle.
|
|
.TP
|
|
.B PERF_COUNT_SW_PAGE_FAULTS_MAJ
|
|
This counts the number of major page faults.
|
|
These required disk I/O to handle.
|
|
.TP
|
|
.BR PERF_COUNT_SW_ALIGNMENT_FAULTS " (since Linux 2.6.33)"
|
|
.\" commit f7d7986060b2890fc26db6ab5203efbd33aa2497
|
|
This counts the number of alignment faults.
|
|
These happen when unaligned memory accesses happen; the kernel
|
|
can handle these but it reduces performance.
|
|
This happens only on some architectures (never on x86).
|
|
.TP
|
|
.BR PERF_COUNT_SW_EMULATION_FAULTS " (since Linux 2.6.33)"
|
|
.\" commit f7d7986060b2890fc26db6ab5203efbd33aa2497
|
|
This counts the number of emulation faults.
|
|
The kernel sometimes traps on unimplemented instructions
|
|
and emulates them for user space.
|
|
This can negatively impact performance.
|
|
.TP
|
|
.BR PERF_COUNT_SW_DUMMY " (since Linux 3.12)"
|
|
.\" commit fa0097ee690693006ab1aea6c01ad3c851b65c77
|
|
This is a placeholder event that counts nothing.
|
|
Informational sample record types such as mmap or comm
|
|
must be associated with an active event.
|
|
This dummy event allows gathering such records without requiring
|
|
a counting event.
|
|
.RE
|
|
.PP
|
|
.RS
|
|
If
|
|
.I type
|
|
is
|
|
.BR PERF_TYPE_TRACEPOINT ,
|
|
then we are measuring kernel tracepoints.
|
|
The value to use in
|
|
.I config
|
|
can be obtained from under debugfs
|
|
.I tracing/events/*/*/id
|
|
if ftrace is enabled in the kernel.
|
|
.RE
|
|
.PP
|
|
.RS
|
|
If
|
|
.I type
|
|
is
|
|
.BR PERF_TYPE_HW_CACHE ,
|
|
then we are measuring a hardware CPU cache event.
|
|
To calculate the appropriate
|
|
.I config
|
|
value, use the following equation:
|
|
.RS 4
|
|
.PP
|
|
.in +4n
|
|
.EX
|
|
config = (perf_hw_cache_id) |
|
|
(perf_hw_cache_op_id << 8) |
|
|
(perf_hw_cache_op_result_id << 16);
|
|
.EE
|
|
.in
|
|
.PP
|
|
where
|
|
.I perf_hw_cache_id
|
|
is one of:
|
|
.RS 4
|
|
.TP
|
|
.B PERF_COUNT_HW_CACHE_L1D
|
|
for measuring Level 1 Data Cache
|
|
.TP
|
|
.B PERF_COUNT_HW_CACHE_L1I
|
|
for measuring Level 1 Instruction Cache
|
|
.TP
|
|
.B PERF_COUNT_HW_CACHE_LL
|
|
for measuring Last-Level Cache
|
|
.TP
|
|
.B PERF_COUNT_HW_CACHE_DTLB
|
|
for measuring the Data TLB
|
|
.TP
|
|
.B PERF_COUNT_HW_CACHE_ITLB
|
|
for measuring the Instruction TLB
|
|
.TP
|
|
.B PERF_COUNT_HW_CACHE_BPU
|
|
for measuring the branch prediction unit
|
|
.TP
|
|
.BR PERF_COUNT_HW_CACHE_NODE " (since Linux 3.1)"
|
|
.\" commit 89d6c0b5bdbb1927775584dcf532d98b3efe1477
|
|
for measuring local memory accesses
|
|
.RE
|
|
.PP
|
|
and
|
|
.I perf_hw_cache_op_id
|
|
is one of:
|
|
.RS 4
|
|
.TP
|
|
.B PERF_COUNT_HW_CACHE_OP_READ
|
|
for read accesses
|
|
.TP
|
|
.B PERF_COUNT_HW_CACHE_OP_WRITE
|
|
for write accesses
|
|
.TP
|
|
.B PERF_COUNT_HW_CACHE_OP_PREFETCH
|
|
for prefetch accesses
|
|
.RE
|
|
.PP
|
|
and
|
|
.I perf_hw_cache_op_result_id
|
|
is one of:
|
|
.RS 4
|
|
.TP
|
|
.B PERF_COUNT_HW_CACHE_RESULT_ACCESS
|
|
to measure accesses
|
|
.TP
|
|
.B PERF_COUNT_HW_CACHE_RESULT_MISS
|
|
to measure misses
|
|
.RE
|
|
.RE
|
|
.PP
|
|
If
|
|
.I type
|
|
is
|
|
.BR PERF_TYPE_RAW ,
|
|
then a custom "raw"
|
|
.I config
|
|
value is needed.
|
|
Most CPUs support events that are not covered by the "generalized" events.
|
|
These are implementation defined; see your CPU manual (for example
|
|
the Intel Volume 3B documentation or the AMD BIOS and Kernel Developer
|
|
Guide).
|
|
The libpfm4 library can be used to translate from the name in the
|
|
architectural manuals to the raw hex value
|
|
.BR perf_event_open ()
|
|
expects in this field.
|
|
.PP
|
|
If
|
|
.I type
|
|
is
|
|
.BR PERF_TYPE_BREAKPOINT ,
|
|
then leave
|
|
.I config
|
|
set to zero.
|
|
Its parameters are set in other places.
|
|
.PP
|
|
If
|
|
.I type
|
|
is
|
|
.B kprobe
|
|
or
|
|
.BR uprobe ,
|
|
set
|
|
.I retprobe
|
|
(bit 0 of
|
|
.IR config ,
|
|
see
|
|
.IR /sys/bus/event_source/devices/[k,u]probe/format/retprobe )
|
|
for kretprobe/uretprobe.
|
|
See fields
|
|
.IR kprobe_func ,
|
|
.IR uprobe_path ,
|
|
.IR kprobe_addr ,
|
|
and
|
|
.I probe_offset
|
|
for more details.
|
|
.RE
|
|
.TP
|
|
.IR kprobe_func ", " uprobe_path ", " kprobe_addr ", and " probe_offset
|
|
These fields describe the kprobe/uprobe for dynamic PMUs
|
|
.B kprobe
|
|
and
|
|
.BR uprobe .
|
|
For
|
|
.BR kprobe :
|
|
use
|
|
.I kprobe_func
|
|
and
|
|
.IR probe_offset ,
|
|
or use
|
|
.I kprobe_addr
|
|
and leave
|
|
.I kprobe_func
|
|
as NULL.
|
|
For
|
|
.BR uprobe :
|
|
use
|
|
.I uprobe_path
|
|
and
|
|
.IR probe_offset .
|
|
.TP
|
|
.IR sample_period ", " sample_freq
|
|
A "sampling" event is one that generates an overflow notification
|
|
every N events, where N is given by
|
|
.IR sample_period .
|
|
A sampling event has
|
|
.IR sample_period " > 0."
|
|
When an overflow occurs, requested data is recorded
|
|
in the mmap buffer.
|
|
The
|
|
.I sample_type
|
|
field controls what data is recorded on each overflow.
|
|
.IP
|
|
.I sample_freq
|
|
can be used if you wish to use frequency rather than period.
|
|
In this case, you set the
|
|
.I freq
|
|
flag.
|
|
The kernel will adjust the sampling period
|
|
to try and achieve the desired rate.
|
|
The rate of adjustment is a
|
|
timer tick.
|
|
.TP
|
|
.I sample_type
|
|
The various bits in this field specify which values to include
|
|
in the sample.
|
|
They will be recorded in a ring-buffer,
|
|
which is available to user space using
|
|
.BR mmap (2).
|
|
The order in which the values are saved in the
|
|
sample is documented in the MMAP Layout subsection below;
|
|
it is not the
|
|
.I "enum perf_event_sample_format"
|
|
order.
|
|
.RS
|
|
.TP
|
|
.B PERF_SAMPLE_IP
|
|
Records instruction pointer.
|
|
.TP
|
|
.B PERF_SAMPLE_TID
|
|
Records the process and thread IDs.
|
|
.TP
|
|
.B PERF_SAMPLE_TIME
|
|
Records a timestamp.
|
|
.TP
|
|
.B PERF_SAMPLE_ADDR
|
|
Records an address, if applicable.
|
|
.TP
|
|
.B PERF_SAMPLE_READ
|
|
Records counter values for all events in a group, not just the group leader.
|
|
.TP
|
|
.B PERF_SAMPLE_CALLCHAIN
|
|
Records the callchain (stack backtrace).
|
|
.TP
|
|
.B PERF_SAMPLE_ID
|
|
Records a unique ID for the opened event's group leader.
|
|
.TP
|
|
.B PERF_SAMPLE_CPU
|
|
Records CPU number.
|
|
.TP
|
|
.B PERF_SAMPLE_PERIOD
|
|
Records the current sampling period.
|
|
.TP
|
|
.B PERF_SAMPLE_STREAM_ID
|
|
Records a unique ID for the opened event.
|
|
Unlike
|
|
.B PERF_SAMPLE_ID
|
|
the actual ID is returned, not the group leader's.
|
|
This ID is the same as the one returned by
|
|
.BR PERF_FORMAT_ID .
|
|
.TP
|
|
.B PERF_SAMPLE_RAW
|
|
Records additional data, if applicable.
|
|
Usually returned by tracepoint events.
|
|
.TP
|
|
.BR PERF_SAMPLE_BRANCH_STACK " (since Linux 3.4)"
|
|
.\" commit bce38cd53e5ddba9cb6d708c4ef3d04a4016ec7e
|
|
This provides a record of recent branches, as provided
|
|
by CPU branch sampling hardware (such as Intel Last Branch Record).
|
|
Not all hardware supports this feature.
|
|
.IP
|
|
See the
|
|
.I branch_sample_type
|
|
field for how to filter which branches are reported.
|
|
.TP
|
|
.BR PERF_SAMPLE_REGS_USER " (since Linux 3.7)"
|
|
.\" commit 4018994f3d8785275ef0e7391b75c3462c029e56
|
|
Records the current user-level CPU register state
|
|
(the values in the process before the kernel was called).
|
|
.TP
|
|
.BR PERF_SAMPLE_STACK_USER " (since Linux 3.7)"
|
|
.\" commit c5ebcedb566ef17bda7b02686e0d658a7bb42ee7
|
|
Records the user level stack, allowing stack unwinding.
|
|
.TP
|
|
.BR PERF_SAMPLE_WEIGHT " (since Linux 3.10)"
|
|
.\" commit c3feedf2aaf9ac8bad6f19f5d21e4ee0b4b87e9c
|
|
Records a hardware provided weight value that expresses how
|
|
costly the sampled event was.
|
|
This allows the hardware to highlight expensive events in
|
|
a profile.
|
|
.TP
|
|
.BR PERF_SAMPLE_DATA_SRC " (since Linux 3.10)"
|
|
.\" commit d6be9ad6c960f43800a6f118932bc8a5a4eadcd1
|
|
Records the data source: where in the memory hierarchy
|
|
the data associated with the sampled instruction came from.
|
|
This is available only if the underlying hardware
|
|
supports this feature.
|
|
.TP
|
|
.BR PERF_SAMPLE_IDENTIFIER " (since Linux 3.12)"
|
|
.\" commit ff3d527cebc1fa3707c617bfe9e74f53fcfb0955
|
|
Places the
|
|
.B SAMPLE_ID
|
|
value in a fixed position in the record,
|
|
either at the beginning (for sample events) or at the end
|
|
(if a non-sample event).
|
|
.IP
|
|
This was necessary because a sample stream may have
|
|
records from various different event sources with different
|
|
.I sample_type
|
|
settings.
|
|
Parsing the event stream properly was not possible because the
|
|
format of the record was needed to find
|
|
.BR SAMPLE_ID ,
|
|
but
|
|
the format could not be found without knowing what
|
|
event the sample belonged to (causing a circular
|
|
dependency).
|
|
.IP
|
|
The
|
|
.B PERF_SAMPLE_IDENTIFIER
|
|
setting makes the event stream always parsable
|
|
by putting
|
|
.B SAMPLE_ID
|
|
in a fixed location, even though
|
|
it means having duplicate
|
|
.B SAMPLE_ID
|
|
values in records.
|
|
.TP
|
|
.BR PERF_SAMPLE_TRANSACTION " (since Linux 3.13)"
|
|
.\" commit fdfbbd07e91f8fe387140776f3fd94605f0c89e5
|
|
Records reasons for transactional memory abort events
|
|
(for example, from Intel TSX transactional memory support).
|
|
.IP
|
|
The
|
|
.I precise_ip
|
|
setting must be greater than 0 and a transactional memory abort
|
|
event must be measured or no values will be recorded.
|
|
Also note that some perf_event measurements, such as sampled
|
|
cycle counting, may cause extraneous aborts (by causing an
|
|
interrupt during a transaction).
|
|
.TP
|
|
.BR PERF_SAMPLE_REGS_INTR " (since Linux 3.19)"
|
|
.\" commit 60e2364e60e86e81bc6377f49779779e6120977f
|
|
Records a subset of the current CPU register state
|
|
as specified by
|
|
.IR sample_regs_intr .
|
|
Unlike
|
|
.B PERF_SAMPLE_REGS_USER
|
|
the register values will return kernel register
|
|
state if the overflow happened while kernel
|
|
code is running.
|
|
If the CPU supports hardware sampling of
|
|
register state (e.g., PEBS on Intel x86) and
|
|
.I precise_ip
|
|
is set higher than zero then the register
|
|
values returned are those captured by
|
|
hardware at the time of the sampled
|
|
instruction's retirement.
|
|
.TP
|
|
.BR PERF_SAMPLE_PHYS_ADDR " (since Linux 4.13)"
|
|
.\" commit fc7ce9c74c3ad232b084d80148654f926d01ece7
|
|
Records the physical address of data, as in
|
|
.BR PERF_SAMPLE_ADDR .
|
|
.TP
|
|
.BR PERF_SAMPLE_CGROUP " (since Linux 5.7)"
|
|
.\" commit 96aaab686505c449e24d76e76507290dcc30e008
|
|
Records the (perf_event) cgroup ID of the process.
|
|
This corresponds to the
|
|
.I id
|
|
field in the
|
|
.B PERF_RECORD_CGROUP
|
|
event.
|
|
.RE
|
|
.TP
|
|
.I read_format
|
|
This field specifies the format of the data returned by
|
|
.BR read (2)
|
|
on a
|
|
.BR perf_event_open ()
|
|
file descriptor.
|
|
.RS
|
|
.TP
|
|
.B PERF_FORMAT_TOTAL_TIME_ENABLED
|
|
Adds the 64-bit
|
|
.I time_enabled
|
|
field.
|
|
This can be used to calculate estimated totals if
|
|
the PMU is overcommitted and multiplexing is happening.
|
|
.TP
|
|
.B PERF_FORMAT_TOTAL_TIME_RUNNING
|
|
Adds the 64-bit
|
|
.I time_running
|
|
field.
|
|
This can be used to calculate estimated totals if
|
|
the PMU is overcommitted and multiplexing is happening.
|
|
.TP
|
|
.B PERF_FORMAT_ID
|
|
Adds a 64-bit unique value that corresponds to the event group.
|
|
.TP
|
|
.B PERF_FORMAT_GROUP
|
|
Allows all counter values in an event group to be read with one read.
|
|
.RE
|
|
.TP
|
|
.I disabled
|
|
The
|
|
.I disabled
|
|
bit specifies whether the counter starts out disabled or enabled.
|
|
If disabled, the event can later be enabled by
|
|
.BR ioctl (2),
|
|
.BR prctl (2),
|
|
or
|
|
.IR enable_on_exec .
|
|
.IP
|
|
When creating an event group, typically the group leader is initialized
|
|
with
|
|
.I disabled
|
|
set to 1 and any child events are initialized with
|
|
.I disabled
|
|
set to 0.
|
|
Despite
|
|
.I disabled
|
|
being 0, the child events will not start until the group leader
|
|
is enabled.
|
|
.TP
|
|
.I inherit
|
|
The
|
|
.I inherit
|
|
bit specifies that this counter should count events of child
|
|
tasks as well as the task specified.
|
|
This applies only to new children, not to any existing children at
|
|
the time the counter is created (nor to any new children of
|
|
existing children).
|
|
.IP
|
|
Inherit does not work for some combinations of
|
|
.IR read_format
|
|
values, such as
|
|
.BR PERF_FORMAT_GROUP .
|
|
.TP
|
|
.I pinned
|
|
The
|
|
.I pinned
|
|
bit specifies that the counter should always be on the CPU if at all
|
|
possible.
|
|
It applies only to hardware counters and only to group leaders.
|
|
If a pinned counter cannot be put onto the CPU (e.g., because there are
|
|
not enough hardware counters or because of a conflict with some other
|
|
event), then the counter goes into an 'error' state, where reads
|
|
return end-of-file (i.e.,
|
|
.BR read (2)
|
|
returns 0) until the counter is subsequently enabled or disabled.
|
|
.TP
|
|
.I exclusive
|
|
The
|
|
.I exclusive
|
|
bit specifies that when this counter's group is on the CPU,
|
|
it should be the only group using the CPU's counters.
|
|
In the future this may allow monitoring programs to
|
|
support PMU features that need to run alone so that they do not
|
|
disrupt other hardware counters.
|
|
.IP
|
|
Note that many unexpected situations may prevent events with the
|
|
.I exclusive
|
|
bit set from ever running.
|
|
This includes any users running a system-wide
|
|
measurement as well as any kernel use of the performance counters
|
|
(including the commonly enabled NMI Watchdog Timer interface).
|
|
.TP
|
|
.I exclude_user
|
|
If this bit is set, the count excludes events that happen in user space.
|
|
.TP
|
|
.I exclude_kernel
|
|
If this bit is set, the count excludes events that happen in kernel space.
|
|
.TP
|
|
.I exclude_hv
|
|
If this bit is set, the count excludes events that happen in the
|
|
hypervisor.
|
|
This is mainly for PMUs that have built-in support for handling this
|
|
(such as POWER).
|
|
Extra support is needed for handling hypervisor measurements on most
|
|
machines.
|
|
.TP
|
|
.I exclude_idle
|
|
If set, don't count when the CPU is running the idle task.
|
|
While you can currently enable this for any event type, it is ignored
|
|
for all but software events.
|
|
.TP
|
|
.I mmap
|
|
The
|
|
.I mmap
|
|
bit enables generation of
|
|
.B PERF_RECORD_MMAP
|
|
samples for every
|
|
.BR mmap (2)
|
|
call that has
|
|
.B PROT_EXEC
|
|
set.
|
|
This allows tools to notice new executable code being mapped into
|
|
a program (dynamic shared libraries for example)
|
|
so that addresses can be mapped back to the original code.
|
|
.TP
|
|
.I comm
|
|
The
|
|
.I comm
|
|
bit enables tracking of process command name as modified by the
|
|
.BR execve (2)
|
|
and
|
|
.BR prctl (PR_SET_NAME)
|
|
system calls as well as writing to
|
|
.IR /proc/self/comm .
|
|
If the
|
|
.I comm_exec
|
|
flag is also successfully set (possible since Linux 3.16),
|
|
.\" commit 82b897782d10fcc4930c9d4a15b175348fdd2871
|
|
then the misc flag
|
|
.B PERF_RECORD_MISC_COMM_EXEC
|
|
can be used to differentiate the
|
|
.BR execve (2)
|
|
case from the others.
|
|
.TP
|
|
.I freq
|
|
If this bit is set, then
|
|
.I sample_frequency
|
|
not
|
|
.I sample_period
|
|
is used when setting up the sampling interval.
|
|
.TP
|
|
.I inherit_stat
|
|
This bit enables saving of event counts on context switch for
|
|
inherited tasks.
|
|
This is meaningful only if the
|
|
.I inherit
|
|
field is set.
|
|
.TP
|
|
.I enable_on_exec
|
|
If this bit is set, a counter is automatically
|
|
enabled after a call to
|
|
.BR execve (2).
|
|
.TP
|
|
.I task
|
|
If this bit is set, then
|
|
fork/exit notifications are included in the ring buffer.
|
|
.TP
|
|
.I watermark
|
|
If set, have an overflow notification happen when we cross the
|
|
.I wakeup_watermark
|
|
boundary.
|
|
Otherwise, overflow notifications happen after
|
|
.I wakeup_events
|
|
samples.
|
|
.TP
|
|
.IR precise_ip " (since Linux 2.6.35)"
|
|
.\" commit ab608344bcbde4f55ec4cd911b686b0ce3eae076
|
|
This controls the amount of skid.
|
|
Skid is how many instructions
|
|
execute between an event of interest happening and the kernel
|
|
being able to stop and record the event.
|
|
Smaller skid is
|
|
better and allows more accurate reporting of which events
|
|
correspond to which instructions, but hardware is often limited
|
|
with how small this can be.
|
|
.IP
|
|
The possible values of this field are the following:
|
|
.RS
|
|
.IP 0 3
|
|
.B SAMPLE_IP
|
|
can have arbitrary skid.
|
|
.IP 1
|
|
.B SAMPLE_IP
|
|
must have constant skid.
|
|
.IP 2
|
|
.B SAMPLE_IP
|
|
requested to have 0 skid.
|
|
.IP 3
|
|
.B SAMPLE_IP
|
|
must have 0 skid.
|
|
See also the description of
|
|
.BR PERF_RECORD_MISC_EXACT_IP .
|
|
.RE
|
|
.TP
|
|
.IR mmap_data " (since Linux 2.6.36)"
|
|
.\" commit 3af9e859281bda7eb7c20b51879cf43aa788ac2e
|
|
This is the counterpart of the
|
|
.I mmap
|
|
field.
|
|
This enables generation of
|
|
.B PERF_RECORD_MMAP
|
|
samples for
|
|
.BR mmap (2)
|
|
calls that do not have
|
|
.B PROT_EXEC
|
|
set (for example data and SysV shared memory).
|
|
.TP
|
|
.IR sample_id_all " (since Linux 2.6.38)"
|
|
.\" commit c980d1091810df13f21aabbce545fd98f545bbf7
|
|
If set, then TID, TIME, ID, STREAM_ID, and CPU can
|
|
additionally be included in
|
|
.RB non- PERF_RECORD_SAMPLE s
|
|
if the corresponding
|
|
.I sample_type
|
|
is selected.
|
|
.IP
|
|
If
|
|
.B PERF_SAMPLE_IDENTIFIER
|
|
is specified, then an additional ID value is included
|
|
as the last value to ease parsing the record stream.
|
|
This may lead to the
|
|
.I id
|
|
value appearing twice.
|
|
.IP
|
|
The layout is described by this pseudo-structure:
|
|
.IP
|
|
.in +4n
|
|
.EX
|
|
struct sample_id {
|
|
{ u32 pid, tid; } /* if PERF_SAMPLE_TID set */
|
|
{ u64 time; } /* if PERF_SAMPLE_TIME set */
|
|
{ u64 id; } /* if PERF_SAMPLE_ID set */
|
|
{ u64 stream_id;} /* if PERF_SAMPLE_STREAM_ID set */
|
|
{ u32 cpu, res; } /* if PERF_SAMPLE_CPU set */
|
|
{ u64 id; } /* if PERF_SAMPLE_IDENTIFIER set */
|
|
};
|
|
.EE
|
|
.in
|
|
.TP
|
|
.IR exclude_host " (since Linux 3.2)"
|
|
.\" commit a240f76165e6255384d4bdb8139895fac7988799
|
|
When conducting measurements that include processes running
|
|
VM instances (i.e., have executed a
|
|
.B KVM_RUN
|
|
.BR ioctl (2)),
|
|
only measure events happening inside a guest instance.
|
|
This is only meaningful outside the guests; this setting does
|
|
not change counts gathered inside of a guest.
|
|
Currently, this functionality is x86 only.
|
|
.TP
|
|
.IR exclude_guest " (since Linux 3.2)"
|
|
.\" commit a240f76165e6255384d4bdb8139895fac7988799
|
|
When conducting measurements that include processes running
|
|
VM instances (i.e., have executed a
|
|
.B KVM_RUN
|
|
.BR ioctl (2)),
|
|
do not measure events happening inside guest instances.
|
|
This is only meaningful outside the guests; this setting does
|
|
not change counts gathered inside of a guest.
|
|
Currently, this functionality is x86 only.
|
|
.TP
|
|
.IR exclude_callchain_kernel " (since Linux 3.7)"
|
|
.\" commit d077526485d5c9b12fe85d0b2b3b7041e6bc5f91
|
|
Do not include kernel callchains.
|
|
.TP
|
|
.IR exclude_callchain_user " (since Linux 3.7)"
|
|
.\" commit d077526485d5c9b12fe85d0b2b3b7041e6bc5f91
|
|
Do not include user callchains.
|
|
.TP
|
|
.IR mmap2 " (since Linux 3.16)"
|
|
.\" commit 13d7a2410fa637f450a29ecb515ac318ee40c741
|
|
.\" This is tricky; was committed during 3.12 development
|
|
.\" but right before release was disabled.
|
|
.\" So while you could select mmap2 starting with 3.12
|
|
.\" it did not work until 3.16
|
|
.\" commit a5a5ba72843dd05f991184d6cb9a4471acce1005
|
|
Generate an extended executable mmap record that contains enough
|
|
additional information to uniquely identify shared mappings.
|
|
The
|
|
.I mmap
|
|
flag must also be set for this to work.
|
|
.TP
|
|
.IR comm_exec " (since Linux 3.16)"
|
|
.\" commit 82b897782d10fcc4930c9d4a15b175348fdd2871
|
|
This is purely a feature-detection flag; it does not change
|
|
kernel behavior.
|
|
If this flag can successfully be set, then, when
|
|
.I comm
|
|
is enabled, the
|
|
.B PERF_RECORD_MISC_COMM_EXEC
|
|
flag will be set in the
|
|
.I misc
|
|
field of a comm record header if the rename event being
|
|
reported was caused by a call to
|
|
.BR execve (2).
|
|
This allows tools to distinguish between the various
|
|
types of process renaming.
|
|
.TP
|
|
.IR use_clockid " (since Linux 4.1)"
|
|
.\" commit 34f439278cef7b1177f8ce24f9fc81dfc6221d3b
|
|
This allows selecting which internal Linux clock to use
|
|
when generating timestamps via the
|
|
.I clockid
|
|
field.
|
|
This can make it easier to correlate perf sample times with
|
|
timestamps generated by other tools.
|
|
.TP
|
|
.IR context_switch " (since Linux 4.3)"
|
|
.\" commit 45ac1403f564f411c6a383a2448688ba8dd705a4
|
|
This enables the generation of
|
|
.B PERF_RECORD_SWITCH
|
|
records when a context switch occurs.
|
|
It also enables the generation of
|
|
.B PERF_RECORD_SWITCH_CPU_WIDE
|
|
records when sampling in CPU-wide mode.
|
|
This functionality is in addition to existing tracepoint and
|
|
software events for measuring context switches.
|
|
The advantage of this method is that it will give full
|
|
information even with strict
|
|
.I perf_event_paranoid
|
|
settings.
|
|
.TP
|
|
.IR write_backward " (since Linux 4.6)"
|
|
.\" commit 9ecda41acb971ebd07c8fb35faf24005c0baea12
|
|
This causes the ring buffer to be written from the end to the beginning.
|
|
This is to support reading from an overwritable ring buffer.
|
|
.TP
|
|
.IR namespaces " (since Linux 4.11)"
|
|
.\" commit e422267322cd319e2695a535e47c5b1feeac45eb
|
|
This enables the generation of
|
|
.B PERF_RECORD_NAMESPACES
|
|
records when a task enters a new namespace.
|
|
Each namespace has a combination of device and inode numbers.
|
|
.TP
|
|
.IR ksymbol " (since Linux 5.0)"
|
|
.\" commit 76193a94522f1d4edf2447a536f3f796ce56343b
|
|
This enables the generation of
|
|
.B PERF_RECORD_KSYMBOL
|
|
records when new kernel symbols are registered or unregistered.
|
|
This is useful for analyzing dynamic kernel functions like eBPF.
|
|
.TP
|
|
.IR bpf_event " (since Linux 5.0)"
|
|
.\" commit 6ee52e2a3fe4ea35520720736e6791df1fb67106
|
|
This enables the generation of
|
|
.B PERF_RECORD_BPF_EVENT
|
|
records when an eBPF program is loaded or unloaded.
|
|
.TP
|
|
.IR auxevent " (since Linux 5.4)"
|
|
.\" commit ab43762ef010967e4ccd53627f70a2eecbeafefb
|
|
This allows normal (non-AUX) events to generate data for AUX events
|
|
if the hardware supports it.
|
|
.TP
|
|
.IR cgroup " (since Linux 5.7)"
|
|
.\" commit 96aaab686505c449e24d76e76507290dcc30e008
|
|
This enables the generation of
|
|
.B PERF_RECORD_CGROUP
|
|
records when a new cgroup is created (and activated).
|
|
.TP
|
|
.IR text_poke " (since Linux 5.8)"
|
|
.\" commit e17d43b93e544f5016c0251d2074c15568d5d963
|
|
This enables the generation of
|
|
.B PERF_RECORD_TEXT_POKE
|
|
records when there's a change to the kernel text
|
|
(i.e., self-modifying code).
|
|
.TP
|
|
.IR wakeup_events ", " wakeup_watermark
|
|
This union sets how many samples
|
|
.RI ( wakeup_events )
|
|
or bytes
|
|
.RI ( wakeup_watermark )
|
|
happen before an overflow notification happens.
|
|
Which one is used is selected by the
|
|
.I watermark
|
|
bit flag.
|
|
.IP
|
|
.I wakeup_events
|
|
counts only
|
|
.B PERF_RECORD_SAMPLE
|
|
record types.
|
|
To receive overflow notification for all
|
|
.B PERF_RECORD
|
|
types choose watermark and set
|
|
.I wakeup_watermark
|
|
to 1.
|
|
.IP
|
|
Prior to Linux 3.0, setting
|
|
.\" commit f506b3dc0ec454a16d40cab9ee5d75435b39dc50
|
|
.I wakeup_events
|
|
to 0 resulted in no overflow notifications;
|
|
more recent kernels treat 0 the same as 1.
|
|
.TP
|
|
.IR bp_type " (since Linux 2.6.33)"
|
|
.\" commit 24f1e32c60c45c89a997c73395b69c8af6f0a84e
|
|
This chooses the breakpoint type.
|
|
It is one of:
|
|
.RS
|
|
.TP
|
|
.B HW_BREAKPOINT_EMPTY
|
|
No breakpoint.
|
|
.TP
|
|
.B HW_BREAKPOINT_R
|
|
Count when we read the memory location.
|
|
.TP
|
|
.B HW_BREAKPOINT_W
|
|
Count when we write the memory location.
|
|
.TP
|
|
.B HW_BREAKPOINT_RW
|
|
Count when we read or write the memory location.
|
|
.TP
|
|
.B HW_BREAKPOINT_X
|
|
Count when we execute code at the memory location.
|
|
.PP
|
|
The values can be combined via a bitwise or, but the
|
|
combination of
|
|
.B HW_BREAKPOINT_R
|
|
or
|
|
.B HW_BREAKPOINT_W
|
|
with
|
|
.B HW_BREAKPOINT_X
|
|
is not allowed.
|
|
.RE
|
|
.TP
|
|
.IR bp_addr " (since Linux 2.6.33)"
|
|
.\" commit 24f1e32c60c45c89a997c73395b69c8af6f0a84e
|
|
This is the address of the breakpoint.
|
|
For execution breakpoints, this is the memory address of the instruction
|
|
of interest; for read and write breakpoints, it is the memory address
|
|
of the memory location of interest.
|
|
.TP
|
|
.IR config1 " (since Linux 2.6.39)"
|
|
.\" commit a7e3ed1e470116c9d12c2f778431a481a6be8ab6
|
|
.I config1
|
|
is used for setting events that need an extra register or otherwise
|
|
do not fit in the regular config field.
|
|
Raw OFFCORE_EVENTS on Nehalem/Westmere/SandyBridge use this field
|
|
on Linux 3.3 and later kernels.
|
|
.TP
|
|
.IR bp_len " (since Linux 2.6.33)"
|
|
.\" commit 24f1e32c60c45c89a997c73395b69c8af6f0a84e
|
|
.I bp_len
|
|
is the length of the breakpoint being measured if
|
|
.I type
|
|
is
|
|
.BR PERF_TYPE_BREAKPOINT .
|
|
Options are
|
|
.BR HW_BREAKPOINT_LEN_1 ,
|
|
.BR HW_BREAKPOINT_LEN_2 ,
|
|
.BR HW_BREAKPOINT_LEN_4 ,
|
|
and
|
|
.BR HW_BREAKPOINT_LEN_8 .
|
|
For an execution breakpoint, set this to
|
|
.IR sizeof(long) .
|
|
.TP
|
|
.IR config2 " (since Linux 2.6.39)"
|
|
.\" commit a7e3ed1e470116c9d12c2f778431a481a6be8ab6
|
|
.I config2
|
|
is a further extension of the
|
|
.I config1
|
|
field.
|
|
.TP
|
|
.IR branch_sample_type " (since Linux 3.4)"
|
|
.\" commit bce38cd53e5ddba9cb6d708c4ef3d04a4016ec7e
|
|
If
|
|
.B PERF_SAMPLE_BRANCH_STACK
|
|
is enabled, then this specifies what branches to include
|
|
in the branch record.
|
|
.IP
|
|
The first part of the value is the privilege level, which
|
|
is a combination of one or more of the values listed below.
|
|
If the user does not set privilege level explicitly, the kernel
|
|
will use the event's privilege level.
|
|
Event and branch privilege levels do not have to match.
|
|
.RS
|
|
.TP
|
|
.B PERF_SAMPLE_BRANCH_USER
|
|
Branch target is in user space.
|
|
.TP
|
|
.B PERF_SAMPLE_BRANCH_KERNEL
|
|
Branch target is in kernel space.
|
|
.TP
|
|
.B PERF_SAMPLE_BRANCH_HV
|
|
Branch target is in hypervisor.
|
|
.TP
|
|
.B PERF_SAMPLE_BRANCH_PLM_ALL
|
|
A convenience value that is the three preceding values ORed together.
|
|
.PP
|
|
In addition to the privilege value, at least one of the
|
|
following bits must be set.
|
|
.TP
|
|
.B PERF_SAMPLE_BRANCH_ANY
|
|
Any branch type.
|
|
.TP
|
|
.B PERF_SAMPLE_BRANCH_ANY_CALL
|
|
Any call branch (includes direct calls, indirect calls, and far jumps).
|
|
.TP
|
|
.B PERF_SAMPLE_BRANCH_IND_CALL
|
|
Indirect calls.
|
|
.TP
|
|
.BR PERF_SAMPLE_BRANCH_CALL " (since Linux 4.4)"
|
|
.\" commit c229bf9dc179d2023e185c0f705bdf68484c1e73
|
|
Direct calls.
|
|
.TP
|
|
.B PERF_SAMPLE_BRANCH_ANY_RETURN
|
|
Any return branch.
|
|
.TP
|
|
.BR PERF_SAMPLE_BRANCH_IND_JUMP " (since Linux 4.2)"
|
|
.\" commit c9fdfa14c3792c0160849c484e83aa57afd80ccc
|
|
Indirect jumps.
|
|
.TP
|
|
.BR PERF_SAMPLE_BRANCH_COND " (since Linux 3.16)"
|
|
.\" commit bac52139f0b7ab31330e98fd87fc5a2664951050
|
|
Conditional branches.
|
|
.TP
|
|
.BR PERF_SAMPLE_BRANCH_ABORT_TX " (since Linux 3.11)"
|
|
.\" commit 135c5612c460f89657c4698fe2ea753f6f667963
|
|
Transactional memory aborts.
|
|
.TP
|
|
.BR PERF_SAMPLE_BRANCH_IN_TX " (since Linux 3.11)"
|
|
.\" commit 135c5612c460f89657c4698fe2ea753f6f667963
|
|
Branch in transactional memory transaction.
|
|
.TP
|
|
.BR PERF_SAMPLE_BRANCH_NO_TX " (since Linux 3.11)"
|
|
.\" commit 135c5612c460f89657c4698fe2ea753f6f667963
|
|
Branch not in transactional memory transaction.
|
|
.TP
.BR PERF_SAMPLE_BRANCH_CALL_STACK " (since Linux 4.1)"
|
|
.\" commit 2c44b1936bb3b135a3fac8b3493394d42e51cf70
|
|
Branch is part of a hardware-generated call stack.
|
|
This requires hardware support, currently only found
|
|
on Intel x86 Haswell or newer.
|
|
.RE
|
|
.TP
|
|
.IR sample_regs_user " (since Linux 3.7)"
|
|
.\" commit 4018994f3d8785275ef0e7391b75c3462c029e56
|
|
This bit mask defines the set of user CPU registers to dump on samples.
|
|
The layout of the register mask is architecture-specific and
|
|
is described in the kernel header file
|
|
.IR arch/ARCH/include/uapi/asm/perf_regs.h .
|
|
.TP
|
|
.IR sample_stack_user " (since Linux 3.7)"
|
|
.\" commit c5ebcedb566ef17bda7b02686e0d658a7bb42ee7
|
|
This defines the size of the user stack to dump if
|
|
.B PERF_SAMPLE_STACK_USER
|
|
is specified.
|
|
.TP
|
|
.IR clockid " (since Linux 4.1)"
|
|
.\" commit 34f439278cef7b1177f8ce24f9fc81dfc6221d3b
|
|
If
|
|
.I use_clockid
|
|
is set, then this field selects which internal Linux timer to
|
|
use for timestamps.
|
|
The available timers are defined in
|
|
.IR linux/time.h ,
|
|
with
|
|
.BR CLOCK_MONOTONIC ,
|
|
.BR CLOCK_MONOTONIC_RAW ,
|
|
.BR CLOCK_REALTIME ,
|
|
.BR CLOCK_BOOTTIME ,
|
|
and
|
|
.B CLOCK_TAI
|
|
currently supported.
|
|
.TP
|
|
.IR aux_watermark " (since Linux 4.1)"
|
|
.\" commit 1a5941312414c71dece6717da9a0fa1303127afa
|
|
This specifies how much data is required to trigger a
|
|
.B PERF_RECORD_AUX
|
|
sample.
|
|
.TP
|
|
.IR sample_max_stack " (since Linux 4.8)"
|
|
.\" commit 97c79a38cd454602645f0470ffb444b3b75ce574
|
|
When
|
|
.I sample_type
|
|
includes
|
|
.BR PERF_SAMPLE_CALLCHAIN ,
|
|
this field specifies how many stack frames to report when
|
|
generating the callchain.
|
|
.SS Reading results
|
|
Once a
|
|
.BR perf_event_open ()
|
|
file descriptor has been opened, the values
|
|
of the events can be read from the file descriptor.
|
|
The values that are there are specified by the
|
|
.I read_format
|
|
field in the
|
|
.I attr
|
|
structure at open time.
|
|
.PP
|
|
If you attempt to read into a buffer that is not big enough to hold the
|
|
data, the error
|
|
.B ENOSPC
|
|
results.
|
|
.PP
|
|
Here is the layout of the data returned by a read:
|
|
.IP * 2
|
|
If
|
|
.B PERF_FORMAT_GROUP
|
|
was specified to allow reading all events in a group at once:
|
|
.IP
|
|
.in +4n
|
|
.EX
|
|
struct read_format {
|
|
u64 nr; /* The number of events */
|
|
u64 time_enabled; /* if PERF_FORMAT_TOTAL_TIME_ENABLED */
|
|
u64 time_running; /* if PERF_FORMAT_TOTAL_TIME_RUNNING */
|
|
struct {
|
|
u64 value; /* The value of the event */
|
|
u64 id; /* if PERF_FORMAT_ID */
|
|
} values[nr];
|
|
};
|
|
.EE
|
|
.in
|
|
.IP *
|
|
If
|
|
.B PERF_FORMAT_GROUP
|
|
was
|
|
.I not
|
|
specified:
|
|
.IP
|
|
.in +4n
|
|
.EX
|
|
struct read_format {
|
|
u64 value; /* The value of the event */
|
|
u64 time_enabled; /* if PERF_FORMAT_TOTAL_TIME_ENABLED */
|
|
u64 time_running; /* if PERF_FORMAT_TOTAL_TIME_RUNNING */
|
|
u64 id; /* if PERF_FORMAT_ID */
|
|
};
|
|
.EE
|
|
.in
|
|
.PP
|
|
The values read are as follows:
|
|
.TP
|
|
.I nr
|
|
The number of events in this file descriptor.
|
|
Available only if
|
|
.B PERF_FORMAT_GROUP
|
|
was specified.
|
|
.TP
|
|
.IR time_enabled ", " time_running
|
|
Total time the event was enabled and running.
|
|
Normally these values are the same.
|
|
Multiplexing happens if the number of events is more than the
|
|
number of available PMU counter slots.
|
|
In that case the events run only part of the time and the
|
|
.I time_enabled
|
|
and
|
|
.I time_running
|
|
values can be used to scale an estimated value for the count.
|
|
.TP
|
|
.I value
|
|
An unsigned 64-bit value containing the counter result.
|
|
.TP
|
|
.I id
|
|
A globally unique value for this particular event; only present if
|
|
.B PERF_FORMAT_ID
|
|
was specified in
|
|
.IR read_format .
|
|
.SS MMAP layout
|
|
When using
|
|
.BR perf_event_open ()
|
|
in sampled mode, asynchronous events
|
|
(like counter overflow or
|
|
.B PROT_EXEC
|
|
mmap tracking)
|
|
are logged into a ring-buffer.
|
|
This ring-buffer is created and accessed through
|
|
.BR mmap (2).
|
|
.PP
|
|
The mmap size should be 1+2^n pages, where the first page is a
|
|
metadata page
|
|
.RI ( "struct perf_event_mmap_page" )
|
|
that contains various
|
|
bits of information such as where the ring-buffer head is.
|
|
.PP
|
|
Before kernel 2.6.39, there is a bug that means you must allocate an mmap
|
|
ring buffer when sampling even if you do not plan to access it.
|
|
.PP
|
|
The structure of the first metadata mmap page is as follows:
|
|
.PP
|
|
.in +4n
|
|
.EX
|
|
struct perf_event_mmap_page {
|
|
__u32 version; /* version number of this structure */
|
|
__u32 compat_version; /* lowest version this is compat with */
|
|
__u32 lock; /* seqlock for synchronization */
|
|
__u32 index; /* hardware counter identifier */
|
|
__s64 offset; /* add to hardware counter value */
|
|
__u64 time_enabled; /* time event active */
|
|
__u64 time_running; /* time event on CPU */
|
|
union {
|
|
__u64 capabilities;
|
|
struct {
|
|
__u64 cap_usr_time / cap_usr_rdpmc / cap_bit0 : 1,
|
|
cap_bit0_is_deprecated : 1,
|
|
cap_user_rdpmc : 1,
|
|
cap_user_time : 1,
|
|
cap_user_time_zero : 1,
|
|
};
|
|
};
|
|
__u16 pmc_width;
|
|
__u16 time_shift;
|
|
__u32 time_mult;
|
|
__u64 time_offset;
|
|
__u64 __reserved[120]; /* Pad to 1 k */
|
|
__u64 data_head; /* head in the data section */
|
|
__u64 data_tail; /* user\-space written tail */
|
|
__u64 data_offset; /* where the buffer starts */
|
|
__u64 data_size; /* data buffer size */
|
|
__u64 aux_head;
|
|
__u64 aux_tail;
|
|
__u64 aux_offset;
|
|
__u64 aux_size;
|
|
|
|
}
|
|
.EE
|
|
.in
|
|
.PP
|
|
The following list describes the fields in the
|
|
.I perf_event_mmap_page
|
|
structure in more detail:
|
|
.TP
|
|
.I version
|
|
Version number of this structure.
|
|
.TP
|
|
.I compat_version
|
|
The lowest version this is compatible with.
|
|
.TP
|
|
.I lock
|
|
A seqlock for synchronization.
|
|
.TP
|
|
.I index
|
|
A unique hardware counter identifier.
|
|
.TP
|
|
.I offset
|
|
When using rdpmc for reads this offset value
|
|
must be added to the one returned by rdpmc to get
|
|
the current total event count.
|
|
.TP
|
|
.I time_enabled
|
|
Time the event was active.
|
|
.TP
|
|
.I time_running
|
|
Time the event was running.
|
|
.TP
|
|
.IR cap_usr_time " / " cap_usr_rdpmc " / " cap_bit0 " (since Linux 3.4)"
|
|
.\" commit c7206205d00ab375839bd6c7ddb247d600693c09
|
|
There was a bug in the definition of
|
|
.I cap_usr_time
|
|
and
|
|
.I cap_usr_rdpmc
|
|
from Linux 3.4 until Linux 3.11.
|
|
Both bits were defined to point to the same location, so it was
|
|
impossible to know if
|
|
.I cap_usr_time
|
|
or
|
|
.I cap_usr_rdpmc
|
|
were actually set.
|
|
.IP
|
|
Starting with Linux 3.12, these are renamed to
|
|
.\" commit fa7315871046b9a4c48627905691dbde57e51033
|
|
.I cap_bit0
|
|
and you should use the
|
|
.I cap_user_time
|
|
and
|
|
.I cap_user_rdpmc
|
|
fields instead.
|
|
.TP
|
|
.IR cap_bit0_is_deprecated " (since Linux 3.12)"
|
|
.\" commit fa7315871046b9a4c48627905691dbde57e51033
|
|
If set, this bit indicates that the kernel supports
|
|
the properly separated
|
|
.I cap_user_time
|
|
and
|
|
.I cap_user_rdpmc
|
|
bits.
|
|
.IP
|
|
If not set, it indicates an older kernel where
|
|
.I cap_usr_time
|
|
and
|
|
.I cap_usr_rdpmc
|
|
map to the same bit and thus both features should
|
|
be used with caution.
|
|
.TP
|
|
.IR cap_user_rdpmc " (since Linux 3.12)"
|
|
.\" commit fa7315871046b9a4c48627905691dbde57e51033
|
|
If the hardware supports user-space read of performance counters
|
|
without syscall (this is the "rdpmc" instruction on x86), then
|
|
the following code can be used to do a read:
|
|
.IP
|
|
.in +4n
|
|
.EX
|
|
u32 seq, time_mult, time_shift, idx, width;
|
|
u64 count, enabled, running;
|
|
u64 cyc, time_offset;
|
|
|
|
do {
|
|
seq = pc\->lock;
|
|
barrier();
|
|
enabled = pc\->time_enabled;
|
|
running = pc\->time_running;
|
|
|
|
if (pc\->cap_usr_time && enabled != running) {
|
|
cyc = rdtsc();
|
|
time_offset = pc\->time_offset;
|
|
time_mult = pc\->time_mult;
|
|
time_shift = pc\->time_shift;
|
|
}
|
|
|
|
idx = pc\->index;
|
|
count = pc\->offset;
|
|
|
|
if (pc\->cap_usr_rdpmc && idx) {
|
|
width = pc\->pmc_width;
|
|
count += rdpmc(idx \- 1);
|
|
}
|
|
|
|
barrier();
|
|
} while (pc\->lock != seq);
|
|
.EE
|
|
.in
|
|
.TP
|
|
.IR cap_user_time " (since Linux 3.12)"
|
|
.\" commit fa7315871046b9a4c48627905691dbde57e51033
|
|
This bit indicates the hardware has a constant, nonstop
|
|
timestamp counter (TSC on x86).
|
|
.TP
|
|
.IR cap_user_time_zero " (since Linux 3.12)"
|
|
.\" commit fa7315871046b9a4c48627905691dbde57e51033
|
|
Indicates the presence of
|
|
.I time_zero
|
|
which allows mapping timestamp values to
|
|
the hardware clock.
|
|
.TP
|
|
.I pmc_width
|
|
If
|
|
.IR cap_user_rdpmc ,
|
|
this field provides the bit-width of the value
|
|
read using the rdpmc or equivalent instruction.
|
|
This can be used to sign extend the result like:
|
|
.IP
|
|
.in +4n
|
|
.EX
|
|
pmc <<= 64 \- pmc_width;
|
|
pmc >>= 64 \- pmc_width; // signed shift right
|
|
count += pmc;
|
|
.EE
|
|
.in
|
|
.TP
|
|
.IR time_shift ", " time_mult ", " time_offset
|
|
.IP
|
|
If
|
|
.IR cap_user_time ,
|
|
these fields can be used to compute the time
|
|
delta since
|
|
.I time_enabled
|
|
(in nanoseconds) using rdtsc or similar.
|
|
.IP
|
|
.in +4n
|
|
.EX
|
|
u64 quot, rem;
|
|
u64 delta;
|
|
|
|
quot = cyc >> time_shift;
|
|
rem = cyc & (((u64)1 << time_shift) \- 1);
|
|
delta = time_offset + quot * time_mult +
|
|
((rem * time_mult) >> time_shift);
|
|
.EE
|
|
.in
|
|
.IP
|
|
Where
|
|
.IR time_offset ,
|
|
.IR time_mult ,
|
|
.IR time_shift ,
|
|
and
|
|
.I cyc
|
|
are read in the
|
|
seqcount loop described above.
|
|
This delta can then be added to
|
|
enabled and possibly running (if idx), improving the scaling:
|
|
.IP
|
|
.in +4n
|
|
.EX
|
|
enabled += delta;
|
|
if (idx)
|
|
running += delta;
|
|
quot = count / running;
|
|
rem = count % running;
|
|
count = quot * enabled + (rem * enabled) / running;
|
|
.EE
|
|
.in
|
|
.TP
|
|
.IR time_zero " (since Linux 3.12)"
|
|
.\" commit fa7315871046b9a4c48627905691dbde57e51033
|
|
.IP
|
|
If
|
|
.I cap_user_time_zero
|
|
is set, then the hardware clock (the TSC timestamp counter on x86)
|
|
can be calculated from the
|
|
.IR time_zero ,
|
|
.IR time_mult ,
|
|
and
|
|
.I time_shift
|
|
values:
|
|
.IP
|
|
.in +4n
|
|
.EX
|
|
time = timestamp \- time_zero;
|
|
quot = time / time_mult;
|
|
rem = time % time_mult;
|
|
cyc = (quot << time_shift) + (rem << time_shift) / time_mult;
|
|
.EE
|
|
.in
|
|
.IP
|
|
And vice versa:
|
|
.IP
|
|
.in +4n
|
|
.EX
|
|
quot = cyc >> time_shift;
|
|
rem = cyc & (((u64)1 << time_shift) \- 1);
|
|
timestamp = time_zero + quot * time_mult +
|
|
((rem * time_mult) >> time_shift);
|
|
.EE
|
|
.in
|
|
.TP
|
|
.I data_head
|
|
This points to the head of the data section.
|
|
The value continuously increases, it does not wrap.
|
|
The value needs to be manually wrapped by the size of the mmap buffer
|
|
before accessing the samples.
|
|
.IP
|
|
On SMP-capable platforms, after reading the
|
|
.I data_head
|
|
value,
|
|
user space should issue an rmb().
|
|
.TP
|
|
.I data_tail
|
|
When the mapping is
|
|
.BR PROT_WRITE ,
|
|
the
|
|
.I data_tail
|
|
value should be written by user space to reflect the last read data.
|
|
In this case, the kernel will not overwrite unread data.
|
|
.TP
|
|
.IR data_offset " (since Linux 4.1)"
|
|
.\" commit e8c6deac69629c0cb97c3d3272f8631ef17f8f0f
|
|
Contains the offset of the location in the mmap buffer
|
|
where perf sample data begins.
|
|
.TP
|
|
.IR data_size " (since Linux 4.1)"
|
|
.\" commit e8c6deac69629c0cb97c3d3272f8631ef17f8f0f
|
|
Contains the size of the perf sample region within
|
|
the mmap buffer.
|
|
.TP
|
|
.IR aux_head ", " aux_tail ", " aux_offset ", " aux_size " (since Linux 4.1)"
|
|
.\" commit 45bfb2e50471abbbfd83d40d28c986078b0d24ff
|
|
The AUX region allows
|
|
.BR mmap (2)-ing
|
|
a separate sample buffer for
|
|
high-bandwidth data streams (separate from the main perf sample buffer).
|
|
An example of a high-bandwidth stream is instruction tracing support,
|
|
as is found in newer Intel processors.
|
|
.IP
|
|
To set up an AUX area, first
|
|
.I aux_offset
|
|
needs to be set with an offset greater than
|
|
.IR data_offset + data_size
|
|
and
|
|
.I aux_size
|
|
needs to be set to the desired buffer size.
|
|
The desired offset and size must be page aligned, and the size
|
|
must be a power of two.
|
|
These values are then passed to mmap in order to map the AUX buffer.
|
|
Pages in the AUX buffer are included as part of the
|
|
.B RLIMIT_MEMLOCK
|
|
resource limit (see
|
|
.BR setrlimit (2)),
|
|
and also as part of the
|
|
.I perf_event_mlock_kb
|
|
allowance.
|
|
.IP
|
|
By default, the AUX buffer will be truncated if it will not fit
|
|
in the available space in the ring buffer.
|
|
If the AUX buffer is mapped as a read only buffer, then it will
|
|
operate in ring buffer mode where old data will be overwritten
|
|
by new.
|
|
In overwrite mode, it might not be possible to infer where the
|
|
new data began, and it is the consumer's job to disable
|
|
measurement while reading to avoid possible data races.
|
|
.IP
|
|
The
|
|
.I aux_head
|
|
and
|
|
.I aux_tail
|
|
ring buffer pointers have the same behavior and ordering
|
|
rules as the previously described
|
|
.I data_head
|
|
and
|
|
.IR data_tail .
|
|
.PP
|
|
The following 2^n ring-buffer pages have the layout described below.
|
|
.PP
|
|
If
|
|
.I perf_event_attr.sample_id_all
|
|
is set, then all event types will
|
|
have the sample_type selected fields related to where/when (identity)
|
|
an event took place (TID, TIME, ID, CPU, STREAM_ID) described in
|
|
.B PERF_RECORD_SAMPLE
|
|
below; these fields will be stashed just after the
|
|
.I perf_event_header
|
|
and the fields already present for the existing
|
|
fields, that is, at the end of the payload.
|
|
This allows a newer perf.data
|
|
file to be supported by older perf tools, with the new optional
|
|
fields being ignored.
|
|
.PP
|
|
The mmap values start with a header:
|
|
.PP
|
|
.in +4n
|
|
.EX
|
|
struct perf_event_header {
|
|
__u32 type;
|
|
__u16 misc;
|
|
__u16 size;
|
|
};
|
|
.EE
|
|
.in
|
|
.PP
|
|
Below, we describe the
|
|
.I perf_event_header
|
|
fields in more detail.
|
|
For ease of reading,
|
|
the fields with shorter descriptions are presented first.
|
|
.TP
|
|
.I size
|
|
This indicates the size of the record.
|
|
.TP
|
|
.I misc
|
|
The
|
|
.I misc
|
|
field contains additional information about the sample.
|
|
.IP
|
|
The CPU mode can be determined from this value by masking with
|
|
.B PERF_RECORD_MISC_CPUMODE_MASK
|
|
and looking for one of the following (note these are not
|
|
bit masks, only one can be set at a time):
|
|
.RS
|
|
.TP
|
|
.B PERF_RECORD_MISC_CPUMODE_UNKNOWN
|
|
Unknown CPU mode.
|
|
.TP
|
|
.B PERF_RECORD_MISC_KERNEL
|
|
Sample happened in the kernel.
|
|
.TP
|
|
.B PERF_RECORD_MISC_USER
|
|
Sample happened in user code.
|
|
.TP
|
|
.B PERF_RECORD_MISC_HYPERVISOR
|
|
Sample happened in the hypervisor.
|
|
.TP
|
|
.BR PERF_RECORD_MISC_GUEST_KERNEL " (since Linux 2.6.35)"
|
|
.\" commit 39447b386c846bbf1c56f6403c5282837486200f
|
|
Sample happened in the guest kernel.
|
|
.TP
|
|
.BR PERF_RECORD_MISC_GUEST_USER " (since Linux 2.6.35)"
|
|
.\" commit 39447b386c846bbf1c56f6403c5282837486200f
|
|
Sample happened in guest user code.
|
|
.RE
|
|
.PP
|
|
.RS
|
|
Since the following three statuses are generated by
|
|
different record types, they alias to the same bit:
|
|
.TP
|
|
.BR PERF_RECORD_MISC_MMAP_DATA " (since Linux 3.10)"
|
|
.\" commit 2fe85427e3bf65d791700d065132772fc26e4d75
|
|
This is set when the mapping is not executable;
|
|
otherwise the mapping is executable.
|
|
.TP
|
|
.BR PERF_RECORD_MISC_COMM_EXEC " (since Linux 3.16)"
|
|
.\" commit 82b897782d10fcc4930c9d4a15b175348fdd2871
|
|
This is set for a
|
|
.B PERF_RECORD_COMM
|
|
record on Linux 3.16 and later kernels
|
|
if a process name change was caused by an
|
|
.BR execve (2)
|
|
system call.
|
|
.TP
|
|
.BR PERF_RECORD_MISC_SWITCH_OUT " (since Linux 4.3)"
|
|
.\" commit 45ac1403f564f411c6a383a2448688ba8dd705a4
|
|
When a
|
|
.B PERF_RECORD_SWITCH
|
|
or
|
|
.B PERF_RECORD_SWITCH_CPU_WIDE
|
|
record is generated, this bit indicates that the
|
|
context switch is away from the current process
|
|
(instead of into the current process).
|
|
.RE
|
|
.PP
|
|
.RS
|
|
In addition, the following bits can be set:
|
|
.TP
|
|
.B PERF_RECORD_MISC_EXACT_IP
|
|
This indicates that the content of
|
|
.B PERF_SAMPLE_IP
|
|
points
|
|
to the actual instruction that triggered the event.
|
|
See also
|
|
.IR perf_event_attr.precise_ip .
|
|
.TP
|
|
.BR PERF_RECORD_MISC_EXT_RESERVED " (since Linux 2.6.35)"
|
|
.\" commit 1676b8a077c352085d52578fb4f29350b58b6e74
|
|
This indicates there is extended data available (currently not used).
|
|
.TP
|
|
.B PERF_RECORD_MISC_PROC_MAP_PARSE_TIMEOUT
|
|
.\" commit 930e6fcd2bcce9bcd9d4aa7e755678d33f3fe6f4
|
|
This bit is not set by the kernel.
|
|
It is reserved for the user-space perf utility to indicate that
|
|
.I /proc/[pid]/maps
|
|
parsing was taking too long and was stopped, and thus the mmap
|
|
records may be truncated.
|
|
.RE
|
|
.TP
|
|
.I type
|
|
The
|
|
.I type
|
|
value is one of the below.
|
|
The values in the corresponding record (that follows the header)
|
|
depend on the
|
|
.I type
|
|
selected as shown.
|
|
.RS
|
|
.TP 4
|
|
.B PERF_RECORD_MMAP
|
|
The MMAP events record the
|
|
.B PROT_EXEC
|
|
mappings so that we can correlate
|
|
user-space IPs to code.
|
|
They have the following structure:
|
|
.IP
|
|
.in +4n
|
|
.EX
|
|
struct {
|
|
struct perf_event_header header;
|
|
u32 pid, tid;
|
|
u64 addr;
|
|
u64 len;
|
|
u64 pgoff;
|
|
char filename[];
|
|
};
|
|
.EE
|
|
.in
|
|
.RS
|
|
.TP
|
|
.I pid
|
|
is the process ID.
|
|
.TP
|
|
.I tid
|
|
is the thread ID.
|
|
.TP
|
|
.I addr
|
|
is the address of the allocated memory.
|
|
.TP
.I len
|
|
is the length of the allocated memory.
|
|
.TP
.I pgoff
|
|
is the page offset of the allocated memory.
|
|
.TP
.I filename
|
|
is a string describing the backing of the allocated memory.
|
|
.RE
|
|
.TP
|
|
.B PERF_RECORD_LOST
|
|
This record indicates when events are lost.
|
|
.IP
|
|
.in +4n
|
|
.EX
|
|
struct {
|
|
struct perf_event_header header;
|
|
u64 id;
|
|
u64 lost;
|
|
struct sample_id sample_id;
|
|
};
|
|
.EE
|
|
.in
|
|
.RS
|
|
.TP
|
|
.I id
|
|
is the unique event ID for the samples that were lost.
|
|
.TP
|
|
.I lost
|
|
is the number of events that were lost.
|
|
.RE
|
|
.TP
|
|
.B PERF_RECORD_COMM
|
|
This record indicates a change in the process name.
|
|
.IP
|
|
.in +4n
|
|
.EX
|
|
struct {
|
|
struct perf_event_header header;
|
|
u32 pid;
|
|
u32 tid;
|
|
char comm[];
|
|
struct sample_id sample_id;
|
|
};
|
|
.EE
|
|
.in
|
|
.RS
|
|
.TP
|
|
.I pid
|
|
is the process ID.
|
|
.TP
|
|
.I tid
|
|
is the thread ID.
|
|
.TP
|
|
.I comm
|
|
is a string containing the new name of the process.
|
|
.RE
|
|
.TP
|
|
.B PERF_RECORD_EXIT
|
|
This record indicates a process exit event.
|
|
.IP
|
|
.in +4n
|
|
.EX
|
|
struct {
|
|
struct perf_event_header header;
|
|
u32 pid, ppid;
|
|
u32 tid, ptid;
|
|
u64 time;
|
|
struct sample_id sample_id;
|
|
};
|
|
.EE
|
|
.in
|
|
.TP
|
|
.BR PERF_RECORD_THROTTLE ", " PERF_RECORD_UNTHROTTLE
|
|
This record indicates a throttle/unthrottle event.
|
|
.IP
|
|
.in +4n
|
|
.EX
|
|
struct {
|
|
struct perf_event_header header;
|
|
u64 time;
|
|
u64 id;
|
|
u64 stream_id;
|
|
struct sample_id sample_id;
|
|
};
|
|
.EE
|
|
.in
|
|
.TP
|
|
.B PERF_RECORD_FORK
|
|
This record indicates a fork event.
|
|
.IP
|
|
.in +4n
|
|
.EX
|
|
struct {
|
|
struct perf_event_header header;
|
|
u32 pid, ppid;
|
|
u32 tid, ptid;
|
|
u64 time;
|
|
struct sample_id sample_id;
|
|
};
|
|
.EE
|
|
.in
|
|
.TP
|
|
.B PERF_RECORD_READ
|
|
This record indicates a read event.
|
|
.IP
|
|
.in +4n
|
|
.EX
|
|
struct {
|
|
struct perf_event_header header;
|
|
u32 pid, tid;
|
|
struct read_format values;
|
|
struct sample_id sample_id;
|
|
};
|
|
.EE
|
|
.in
|
|
.TP
|
|
.B PERF_RECORD_SAMPLE
|
|
This record indicates a sample.
|
|
.IP
|
|
.in +4n
|
|
.EX
|
|
struct {
|
|
struct perf_event_header header;
|
|
u64 sample_id; /* if PERF_SAMPLE_IDENTIFIER */
|
|
u64 ip; /* if PERF_SAMPLE_IP */
|
|
u32 pid, tid; /* if PERF_SAMPLE_TID */
|
|
u64 time; /* if PERF_SAMPLE_TIME */
|
|
u64 addr; /* if PERF_SAMPLE_ADDR */
|
|
u64 id; /* if PERF_SAMPLE_ID */
|
|
u64 stream_id; /* if PERF_SAMPLE_STREAM_ID */
|
|
u32 cpu, res; /* if PERF_SAMPLE_CPU */
|
|
u64 period; /* if PERF_SAMPLE_PERIOD */
|
|
struct read_format v;
|
|
/* if PERF_SAMPLE_READ */
|
|
u64 nr; /* if PERF_SAMPLE_CALLCHAIN */
|
|
u64 ips[nr]; /* if PERF_SAMPLE_CALLCHAIN */
|
|
u32 size; /* if PERF_SAMPLE_RAW */
|
|
char data[size]; /* if PERF_SAMPLE_RAW */
|
|
u64 bnr; /* if PERF_SAMPLE_BRANCH_STACK */
|
|
struct perf_branch_entry lbr[bnr];
|
|
/* if PERF_SAMPLE_BRANCH_STACK */
|
|
u64 abi; /* if PERF_SAMPLE_REGS_USER */
|
|
u64 regs[weight(mask)];
|
|
/* if PERF_SAMPLE_REGS_USER */
|
|
u64 size; /* if PERF_SAMPLE_STACK_USER */
|
|
char data[size]; /* if PERF_SAMPLE_STACK_USER */
|
|
u64 dyn_size; /* if PERF_SAMPLE_STACK_USER &&
|
|
size != 0 */
|
|
u64 weight; /* if PERF_SAMPLE_WEIGHT */
|
|
u64 data_src; /* if PERF_SAMPLE_DATA_SRC */
|
|
u64 transaction; /* if PERF_SAMPLE_TRANSACTION */
|
|
u64 abi; /* if PERF_SAMPLE_REGS_INTR */
|
|
u64 regs[weight(mask)];
|
|
/* if PERF_SAMPLE_REGS_INTR */
|
|
u64 phys_addr; /* if PERF_SAMPLE_PHYS_ADDR */
|
|
u64 cgroup; /* if PERF_SAMPLE_CGROUP */
|
|
};
|
|
.EE
|
|
.in
|
|
.RS 4
|
|
.TP 4
|
|
.I sample_id
|
|
If
|
|
.B PERF_SAMPLE_IDENTIFIER
|
|
is enabled, a 64-bit unique ID is included.
|
|
This is a duplication of the
|
|
.B PERF_SAMPLE_ID
|
|
.I id
|
|
value, but included at the beginning of the sample
|
|
so parsers can easily obtain the value.
|
|
.TP
|
|
.I ip
|
|
If
|
|
.B PERF_SAMPLE_IP
|
|
is enabled, then a 64-bit instruction
|
|
pointer value is included.
|
|
.TP
|
|
.IR pid ", " tid
|
|
If
|
|
.B PERF_SAMPLE_TID
|
|
is enabled, then a 32-bit process ID
|
|
and 32-bit thread ID are included.
|
|
.TP
|
|
.I time
|
|
If
|
|
.B PERF_SAMPLE_TIME
|
|
is enabled, then a 64-bit timestamp
|
|
is included.
|
|
This is obtained via local_clock() which is a hardware timestamp
|
|
if available and the jiffies value if not.
|
|
.TP
|
|
.I addr
|
|
If
|
|
.B PERF_SAMPLE_ADDR
|
|
is enabled, then a 64-bit address is included.
|
|
This is usually the address of a tracepoint,
|
|
breakpoint, or software event; otherwise the value is 0.
|
|
.TP
|
|
.I id
|
|
If
|
|
.B PERF_SAMPLE_ID
|
|
is enabled, a 64-bit unique ID is included.
|
|
If the event is a member of an event group, the group leader ID is returned.
|
|
This ID is the same as the one returned by
|
|
.BR PERF_FORMAT_ID .
|
|
.TP
|
|
.I stream_id
|
|
If
|
|
.B PERF_SAMPLE_STREAM_ID
|
|
is enabled, a 64-bit unique ID is included.
|
|
Unlike
|
|
.B PERF_SAMPLE_ID
|
|
the actual ID is returned, not the group leader.
|
|
This ID is the same as the one returned by
|
|
.BR PERF_FORMAT_ID .
|
|
.TP
|
|
.IR cpu ", " res
|
|
If
|
|
.B PERF_SAMPLE_CPU
|
|
is enabled, this is a 32-bit value indicating
|
|
which CPU was being used, in addition to a reserved (unused)
|
|
32-bit value.
|
|
.TP
|
|
.I period
|
|
If
|
|
.B PERF_SAMPLE_PERIOD
|
|
is enabled, a 64-bit value indicating
|
|
the current sampling period is written.
|
|
.TP
|
|
.I v
|
|
If
|
|
.B PERF_SAMPLE_READ
|
|
is enabled, a structure of type read_format
|
|
is included which has values for all events in the event group.
|
|
The values included depend on the
|
|
.I read_format
|
|
value used at
|
|
.BR perf_event_open ()
|
|
time.
|
|
.TP
|
|
.IR nr ", " ips[nr]
|
|
If
|
|
.B PERF_SAMPLE_CALLCHAIN
|
|
is enabled, then a 64-bit number is included
|
|
which indicates how many 64-bit instruction pointers will
|
|
follow.
|
|
This is the current callchain.
|
|
.TP
|
|
.IR size ", " data[size]
|
|
If
|
|
.B PERF_SAMPLE_RAW
|
|
is enabled, then a 32-bit value indicating size
|
|
is included followed by an array of 8-bit values of length size.
|
|
The values are padded with 0 to have 64-bit alignment.
|
|
.IP
|
|
This RAW record data is opaque with respect to the ABI.
|
|
The ABI doesn't make any promises with respect to the stability
|
|
of its content; it may vary depending
|
|
on event, hardware, and kernel version.
|
|
.TP
|
|
.IR bnr ", " lbr[bnr]
|
|
If
|
|
.B PERF_SAMPLE_BRANCH_STACK
|
|
is enabled, then a 64-bit value indicating
|
|
the number of records is included, followed by
|
|
.I bnr
|
|
.I perf_branch_entry
|
|
structures which each include the fields:
|
|
.RS
|
|
.TP
|
|
.I from
|
|
This indicates the source instruction (may not be a branch).
|
|
.TP
|
|
.I to
|
|
The branch target.
|
|
.TP
|
|
.I mispred
|
|
The branch target was mispredicted.
|
|
.TP
|
|
.I predicted
|
|
The branch target was predicted.
|
|
.TP
|
|
.IR in_tx " (since Linux 3.11)"
|
|
.\" commit 135c5612c460f89657c4698fe2ea753f6f667963
|
|
The branch was in a transactional memory transaction.
|
|
.TP
|
|
.IR abort " (since Linux 3.11)"
|
|
.\" commit 135c5612c460f89657c4698fe2ea753f6f667963
|
|
The branch was in an aborted transactional memory transaction.
|
|
.TP
|
|
.IR cycles " (since Linux 4.3)"
|
|
.\" commit 71ef3c6b9d4665ee7afbbe4c208a98917dcfc32f
|
|
This reports the number of cycles elapsed since the
|
|
previous branch stack update.
|
|
.PP
|
|
The entries are from most to least recent, so the first entry
|
|
has the most recent branch.
|
|
.PP
|
|
Support for
|
|
.IR mispred ,
|
|
.IR predicted ,
|
|
and
|
|
.I cycles
|
|
is optional; if not supported, those
|
|
values will be 0.
|
|
.PP
|
|
The type of branches recorded is specified by the
|
|
.I branch_sample_type
|
|
field.
|
|
.RE
|
|
.TP
|
|
.IR abi ", " regs[weight(mask)]
|
|
If
|
|
.B PERF_SAMPLE_REGS_USER
|
|
is enabled, then the user CPU registers are recorded.
|
|
.IP
|
|
The
|
|
.I abi
|
|
field is one of
|
|
.BR PERF_SAMPLE_REGS_ABI_NONE ,
|
|
.BR PERF_SAMPLE_REGS_ABI_32 ,
|
|
or
|
|
.BR PERF_SAMPLE_REGS_ABI_64 .
|
|
.IP
|
|
The
|
|
.I regs
|
|
field is an array of the CPU registers that were specified by
|
|
the
|
|
.I sample_regs_user
|
|
attr field.
|
|
The number of values is the number of bits set in the
|
|
.I sample_regs_user
|
|
bit mask.
|
|
.TP
|
|
.IR size ", " data[size] ", " dyn_size
|
|
If
|
|
.B PERF_SAMPLE_STACK_USER
|
|
is enabled, then the user stack is recorded.
|
|
This can be used to generate stack backtraces.
|
|
.I size
|
|
is the size requested by the user in
|
|
.I sample_stack_user
|
|
or else the maximum record size.
|
|
.I data
|
|
is the stack data (a raw dump of the memory pointed to by the
|
|
stack pointer at the time of sampling).
|
|
.I dyn_size
|
|
is the amount of data actually dumped (can be less than
|
|
.IR size ).
|
|
Note that
|
|
.I dyn_size
|
|
is omitted if
|
|
.I size
|
|
is 0.
|
|
.TP
|
|
.I weight
|
|
If
|
|
.B PERF_SAMPLE_WEIGHT
|
|
is enabled, then a 64-bit value provided by the hardware
|
|
is recorded that indicates how costly the event was.
|
|
This allows expensive events to stand out more clearly
|
|
in profiles.
|
|
.TP
|
|
.I data_src
|
|
If
|
|
.B PERF_SAMPLE_DATA_SRC
|
|
is enabled, then a 64-bit value is recorded that is made up of
|
|
the following fields:
|
|
.RS
|
|
.TP 4
|
|
.I mem_op
|
|
Type of opcode, a bitwise combination of:
|
|
.IP
|
|
.PD 0
|
|
.RS
|
|
.TP 24
|
|
.B PERF_MEM_OP_NA
|
|
Not available
|
|
.TP
|
|
.B PERF_MEM_OP_LOAD
|
|
Load instruction
|
|
.TP
|
|
.B PERF_MEM_OP_STORE
|
|
Store instruction
|
|
.TP
|
|
.B PERF_MEM_OP_PFETCH
|
|
Prefetch
|
|
.TP
|
|
.B PERF_MEM_OP_EXEC
|
|
Executable code
|
|
.RE
|
|
.PD
|
|
.TP
|
|
.I mem_lvl
|
|
Memory hierarchy level hit or miss, a bitwise combination of
|
|
the following, shifted left by
|
|
.BR PERF_MEM_LVL_SHIFT :
|
|
.IP
|
|
.PD 0
|
|
.RS
|
|
.TP 24
|
|
.B PERF_MEM_LVL_NA
|
|
Not available
|
|
.TP
|
|
.B PERF_MEM_LVL_HIT
|
|
Hit
|
|
.TP
|
|
.B PERF_MEM_LVL_MISS
|
|
Miss
|
|
.TP
|
|
.B PERF_MEM_LVL_L1
|
|
Level 1 cache
|
|
.TP
|
|
.B PERF_MEM_LVL_LFB
|
|
Line fill buffer
|
|
.TP
|
|
.B PERF_MEM_LVL_L2
|
|
Level 2 cache
|
|
.TP
|
|
.B PERF_MEM_LVL_L3
|
|
Level 3 cache
|
|
.TP
|
|
.B PERF_MEM_LVL_LOC_RAM
|
|
Local DRAM
|
|
.TP
|
|
.B PERF_MEM_LVL_REM_RAM1
|
|
Remote DRAM 1 hop
|
|
.TP
|
|
.B PERF_MEM_LVL_REM_RAM2
|
|
Remote DRAM 2 hops
|
|
.TP
|
|
.B PERF_MEM_LVL_REM_CCE1
|
|
Remote cache 1 hop
|
|
.TP
|
|
.B PERF_MEM_LVL_REM_CCE2
|
|
Remote cache 2 hops
|
|
.TP
|
|
.B PERF_MEM_LVL_IO
|
|
I/O memory
|
|
.TP
|
|
.B PERF_MEM_LVL_UNC
|
|
Uncached memory
|
|
.RE
|
|
.PD
|
|
.TP
|
|
.I mem_snoop
|
|
Snoop mode, a bitwise combination of the following, shifted left by
|
|
.BR PERF_MEM_SNOOP_SHIFT :
|
|
.IP
|
|
.PD 0
|
|
.RS
|
|
.TP 24
|
|
.B PERF_MEM_SNOOP_NA
|
|
Not available
|
|
.TP
|
|
.B PERF_MEM_SNOOP_NONE
|
|
No snoop
|
|
.TP
|
|
.B PERF_MEM_SNOOP_HIT
|
|
Snoop hit
|
|
.TP
|
|
.B PERF_MEM_SNOOP_MISS
|
|
Snoop miss
|
|
.TP
|
|
.B PERF_MEM_SNOOP_HITM
|
|
Snoop hit modified
|
|
.RE
|
|
.PD
|
|
.TP
|
|
.I mem_lock
|
|
Lock instruction, a bitwise combination of the following, shifted left by
|
|
.BR PERF_MEM_LOCK_SHIFT :
|
|
.IP
|
|
.PD 0
|
|
.RS
|
|
.TP 24
|
|
.B PERF_MEM_LOCK_NA
|
|
Not available
|
|
.TP
|
|
.B PERF_MEM_LOCK_LOCKED
|
|
Locked transaction
|
|
.RE
|
|
.PD
|
|
.TP
|
|
.I mem_dtlb
|
|
TLB access hit or miss, a bitwise combination of the following, shifted
|
|
left by
|
|
.BR PERF_MEM_TLB_SHIFT :
|
|
.IP
|
|
.PD 0
|
|
.RS
|
|
.TP 24
|
|
.B PERF_MEM_TLB_NA
|
|
Not available
|
|
.TP
|
|
.B PERF_MEM_TLB_HIT
|
|
Hit
|
|
.TP
|
|
.B PERF_MEM_TLB_MISS
|
|
Miss
|
|
.TP
|
|
.B PERF_MEM_TLB_L1
|
|
Level 1 TLB
|
|
.TP
|
|
.B PERF_MEM_TLB_L2
|
|
Level 2 TLB
|
|
.TP
|
|
.B PERF_MEM_TLB_WK
|
|
Hardware walker
|
|
.TP
|
|
.B PERF_MEM_TLB_OS
|
|
OS fault handler
|
|
.RE
|
|
.PD
|
|
.RE
|
|
.TP
|
|
.I transaction
|
|
If the
|
|
.B PERF_SAMPLE_TRANSACTION
|
|
flag is set, then a 64-bit field is recorded describing
|
|
the sources of any transactional memory aborts.
|
|
.IP
|
|
The field is a bitwise combination of the following values:
|
|
.RS
|
|
.TP
|
|
.B PERF_TXN_ELISION
|
|
Abort from an elision type transaction (Intel-CPU-specific).
|
|
.TP
|
|
.B PERF_TXN_TRANSACTION
|
|
Abort from a generic transaction.
|
|
.TP
|
|
.B PERF_TXN_SYNC
|
|
Synchronous abort (related to the reported instruction).
|
|
.TP
|
|
.B PERF_TXN_ASYNC
|
|
Asynchronous abort (not related to the reported instruction).
|
|
.TP
|
|
.B PERF_TXN_RETRY
|
|
Retryable abort (retrying the transaction may have succeeded).
|
|
.TP
|
|
.B PERF_TXN_CONFLICT
|
|
Abort due to memory conflicts with other threads.
|
|
.TP
|
|
.B PERF_TXN_CAPACITY_WRITE
|
|
Abort due to write capacity overflow.
|
|
.TP
|
|
.B PERF_TXN_CAPACITY_READ
|
|
Abort due to read capacity overflow.
|
|
.RE
|
|
.IP
|
|
In addition, a user-specified abort code can be obtained from
|
|
the high 32 bits of the field by shifting right by
|
|
.B PERF_TXN_ABORT_SHIFT
|
|
and masking with the value
|
|
.BR PERF_TXN_ABORT_MASK .
|
|
.TP
|
|
.IR abi ", " regs[weight(mask)]
|
|
If
|
|
.B PERF_SAMPLE_REGS_INTR
|
|
is enabled, then the CPU registers at the time of the interrupt are recorded.
|
|
.IP
|
|
The
|
|
.I abi
|
|
field is one of
|
|
.BR PERF_SAMPLE_REGS_ABI_NONE ,
|
|
.BR PERF_SAMPLE_REGS_ABI_32 ,
|
|
or
|
|
.BR PERF_SAMPLE_REGS_ABI_64 .
|
|
.IP
|
|
The
|
|
.I regs
|
|
field is an array of the CPU registers that were specified by
|
|
the
|
|
.I sample_regs_intr
|
|
attr field.
|
|
The number of values is the number of bits set in the
|
|
.I sample_regs_intr
|
|
bit mask.
|
|
.TP
|
|
.I phys_addr
|
|
If the
|
|
.B PERF_SAMPLE_PHYS_ADDR
|
|
flag is set, then the 64-bit physical address is recorded.
|
|
.TP
|
|
.I cgroup
|
|
If the
|
|
.B PERF_SAMPLE_CGROUP
|
|
flag is set,
|
|
then the 64-bit cgroup ID (for the perf_event subsystem) is recorded.
|
|
To get the pathname of the cgroup, the ID should match to one in a
|
|
.BR PERF_RECORD_CGROUP " record."
|
|
.RE
|
|
.TP
|
|
.B PERF_RECORD_MMAP2
|
|
This record includes extended information on
|
|
.BR mmap (2)
|
|
calls returning executable mappings.
|
|
The format is similar to that of the
|
|
.B PERF_RECORD_MMAP
|
|
record, but includes extra values that allow uniquely identifying
|
|
shared mappings.
|
|
.IP
|
|
.in +4n
|
|
.EX
|
|
struct {
|
|
struct perf_event_header header;
|
|
u32 pid;
|
|
u32 tid;
|
|
u64 addr;
|
|
u64 len;
|
|
u64 pgoff;
|
|
u32 maj;
|
|
u32 min;
|
|
u64 ino;
|
|
u64 ino_generation;
|
|
u32 prot;
|
|
u32 flags;
|
|
char filename[];
|
|
struct sample_id sample_id;
|
|
};
|
|
.EE
|
|
.in
|
|
.RS
|
|
.TP
|
|
.I pid
|
|
is the process ID.
|
|
.TP
|
|
.I tid
|
|
is the thread ID.
|
|
.TP
|
|
.I addr
|
|
is the address of the allocated memory.
|
|
.TP
|
|
.I len
|
|
is the length of the allocated memory.
|
|
.TP
|
|
.I pgoff
|
|
is the page offset of the allocated memory.
|
|
.TP
|
|
.I maj
|
|
is the major ID of the underlying device.
|
|
.TP
|
|
.I min
|
|
is the minor ID of the underlying device.
|
|
.TP
|
|
.I ino
|
|
is the inode number.
|
|
.TP
|
|
.I ino_generation
|
|
is the inode generation.
|
|
.TP
|
|
.I prot
|
|
is the protection information.
|
|
.TP
|
|
.I flags
|
|
is the flags information.
|
|
.TP
|
|
.I filename
|
|
is a string describing the backing of the allocated memory.
|
|
.RE
|
|
.TP
|
|
.BR PERF_RECORD_AUX " (since Linux 4.1)"
|
|
.\" commit 68db7e98c3a6ebe7284b6cf14906ed7c55f3f7f0
|
|
This record reports that new data is available in the separate
|
|
AUX buffer region.
|
|
.IP
|
|
.in +4n
|
|
.EX
|
|
struct {
|
|
struct perf_event_header header;
|
|
u64 aux_offset;
|
|
u64 aux_size;
|
|
u64 flags;
|
|
struct sample_id sample_id;
|
|
};
|
|
.EE
|
|
.in
|
|
.RS
|
|
.TP
|
|
.I aux_offset
|
|
offset in the AUX mmap region where the new data begins.
|
|
.TP
|
|
.I aux_size
|
|
size of the data made available.
|
|
.TP
|
|
.I flags
|
|
describes the AUX update.
|
|
.RS
|
|
.TP
|
|
.B PERF_AUX_FLAG_TRUNCATED
|
|
if set, then the data returned was truncated to fit the available
|
|
buffer size.
|
|
.TP
|
|
.B PERF_AUX_FLAG_OVERWRITE
|
|
.\" commit 2023a0d2829e521fe6ad6b9907f3f90bfbf57142
|
|
if set, then the data returned has overwritten previous data.
|
|
.RE
|
|
.RE
|
|
.TP
|
|
.BR PERF_RECORD_ITRACE_START " (since Linux 4.1)"
|
|
.\" ec0d7729bbaed4b9d2d3fada693278e13a3d1368
|
|
This record indicates which process has initiated an instruction
|
|
trace event, allowing tools to properly correlate the instruction
|
|
addresses in the AUX buffer with the proper executable.
|
|
.IP
|
|
.in +4n
|
|
.EX
|
|
struct {
|
|
struct perf_event_header header;
|
|
u32 pid;
|
|
u32 tid;
|
|
};
|
|
.EE
|
|
.in
|
|
.RS
|
|
.TP
|
|
.I pid
|
|
process ID of the thread starting an instruction trace.
|
|
.TP
|
|
.I tid
|
|
thread ID of the thread starting an instruction trace.
|
|
.RE
|
|
.TP
|
|
.BR PERF_RECORD_LOST_SAMPLES " (since Linux 4.2)"
|
|
.\" f38b0dbb491a6987e198aa6b428db8692a6480f8
|
|
When using hardware sampling (such as Intel PEBS) this record
|
|
indicates some number of samples that may have been lost.
|
|
.IP
|
|
.in +4n
|
|
.EX
|
|
struct {
|
|
struct perf_event_header header;
|
|
u64 lost;
|
|
struct sample_id sample_id;
|
|
};
|
|
.EE
|
|
.in
|
|
.RS
|
|
.TP
|
|
.I lost
|
|
the number of potentially lost samples.
|
|
.RE
|
|
.TP
|
|
.BR PERF_RECORD_SWITCH " (since Linux 4.3)"
|
|
.\" commit 45ac1403f564f411c6a383a2448688ba8dd705a4
|
|
This record indicates a context switch has happened.
|
|
The
|
|
.B PERF_RECORD_MISC_SWITCH_OUT
|
|
bit in the
|
|
.I misc
|
|
field indicates whether it was a context switch into
|
|
or away from the current process.
|
|
.IP
|
|
.in +4n
|
|
.EX
|
|
struct {
|
|
struct perf_event_header header;
|
|
struct sample_id sample_id;
|
|
};
|
|
.EE
|
|
.in
|
|
.TP
|
|
.BR PERF_RECORD_SWITCH_CPU_WIDE " (since Linux 4.3)"
|
|
.\" commit 45ac1403f564f411c6a383a2448688ba8dd705a4
|
|
As with
|
|
.B PERF_RECORD_SWITCH
|
|
this record indicates a context switch has happened,
|
|
but it only occurs when sampling in CPU-wide mode
|
|
and provides additional information on the process
|
|
being switched to/from.
|
|
The
|
|
.B PERF_RECORD_MISC_SWITCH_OUT
|
|
bit in the
|
|
.I misc
|
|
field indicates whether it was a context switch into
|
|
or away from the current process.
|
|
.IP
|
|
.in +4n
|
|
.EX
|
|
struct {
|
|
struct perf_event_header header;
|
|
u32 next_prev_pid;
|
|
u32 next_prev_tid;
|
|
struct sample_id sample_id;
|
|
};
|
|
.EE
|
|
.in
|
|
.RS
|
|
.TP
|
|
.I next_prev_pid
|
|
The process ID of the previous (if switching in)
|
|
or next (if switching out) process on the CPU.
|
|
.TP
|
|
.I next_prev_tid
|
|
The thread ID of the previous (if switching in)
|
|
or next (if switching out) thread on the CPU.
|
|
.RE
|
|
.TP
|
|
.BR PERF_RECORD_NAMESPACES " (since Linux 4.11)"
|
|
.\" commit e422267322cd319e2695a535e47c5b1feeac45eb
|
|
This record includes various namespace information of a process.
|
|
.IP
|
|
.in +4n
|
|
.EX
|
|
struct {
|
|
struct perf_event_header header;
|
|
u32 pid;
|
|
u32 tid;
|
|
u64 nr_namespaces;
|
|
struct { u64 dev, inode; } [nr_namespaces];
|
|
struct sample_id sample_id;
|
|
};
|
|
.EE
|
|
.in
|
|
.RS
|
|
.TP
|
|
.I pid
|
|
is the process ID
|
|
.TP
|
|
.I tid
|
|
is the thread ID
|
|
.TP
|
|
.I nr_namespaces
|
|
is the number of namespaces in this record
|
|
.RE
|
|
.IP
|
|
Each namespace has
|
|
.I dev
|
|
and
|
|
.I inode
|
|
fields and is recorded in the
|
|
fixed position shown below:
|
|
.RS
|
|
.TP
|
|
.BR NET_NS_INDEX = 0
|
|
Network namespace
|
|
.TP
|
|
.BR UTS_NS_INDEX = 1
|
|
UTS namespace
|
|
.TP
|
|
.BR IPC_NS_INDEX = 2
|
|
IPC namespace
|
|
.TP
|
|
.BR PID_NS_INDEX = 3
|
|
PID namespace
|
|
.TP
|
|
.BR USER_NS_INDEX = 4
|
|
User namespace
|
|
.TP
|
|
.BR MNT_NS_INDEX = 5
|
|
Mount namespace
|
|
.TP
|
|
.BR CGROUP_NS_INDEX = 6
|
|
Cgroup namespace
|
|
.RE
|
|
.TP
|
|
.BR PERF_RECORD_KSYMBOL " (since Linux 5.0)"
|
|
.\" commit 76193a94522f1d4edf2447a536f3f796ce56343b
|
|
This record indicates kernel symbol register/unregister events.
|
|
.IP
|
|
.in +4n
|
|
.EX
|
|
struct {
|
|
struct perf_event_header header;
|
|
u64 addr;
|
|
u32 len;
|
|
u16 ksym_type;
|
|
u16 flags;
|
|
char name[];
|
|
struct sample_id sample_id;
|
|
};
|
|
.EE
|
|
.in
|
|
.RS
|
|
.TP
|
|
.I addr
|
|
is the address of the kernel symbol.
|
|
.TP
|
|
.I len
|
|
is the length of the kernel symbol.
|
|
.TP
|
|
.I ksym_type
|
|
is the type of the kernel symbol.
|
|
Currently the following types are available:
|
|
.RS
|
|
.TP
|
|
.B PERF_RECORD_KSYMBOL_TYPE_BPF
|
|
The kernel symbol is a BPF function.
|
|
.RE
|
|
.TP
|
|
.I flags
|
|
If the
|
|
.B PERF_RECORD_KSYMBOL_FLAGS_UNREGISTER
|
|
flag is set, then this event is for unregistering the kernel symbol.
|
|
.RE
|
|
.TP
|
|
.BR PERF_RECORD_BPF_EVENT " (since Linux 5.0)"
|
|
.\" commit 6ee52e2a3fe4ea35520720736e6791df1fb67106
|
|
This record indicates that a BPF program was loaded or unloaded.
|
|
.IP
|
|
.in +4n
|
|
.EX
|
|
struct {
|
|
struct perf_event_header header;
|
|
u16 type;
|
|
u16 flags;
|
|
u32 id;
|
|
u8 tag[BPF_TAG_SIZE];
|
|
struct sample_id sample_id;
|
|
};
|
|
.EE
|
|
.in
|
|
.RS
|
|
.TP
|
|
.I type
|
|
is one of the following values:
|
|
.RS
|
|
.TP
|
|
.B PERF_BPF_EVENT_PROG_LOAD
|
|
A BPF program is loaded
|
|
.TP
|
|
.B PERF_BPF_EVENT_PROG_UNLOAD
|
|
A BPF program is unloaded
|
|
.RE
|
|
.TP
|
|
.I id
|
|
is the ID of the BPF program.
|
|
.TP
|
|
.I tag
|
|
is the tag of the BPF program.
|
|
Currently,
|
|
.B BPF_TAG_SIZE
|
|
is defined as 8.
|
|
.RE
|
|
.TP
|
|
.BR PERF_RECORD_CGROUP " (since Linux 5.7)"
|
|
.\" commit 96aaab686505c449e24d76e76507290dcc30e008
|
|
This record indicates a new cgroup is created and activated.
|
|
.IP
|
|
.in +4n
|
|
.EX
|
|
struct {
|
|
struct perf_event_header header;
|
|
u64 id;
|
|
char path[];
|
|
struct sample_id sample_id;
|
|
};
|
|
.EE
|
|
.in
|
|
.RS
|
|
.TP
|
|
.I id
|
|
is the cgroup identifier.
|
|
This can also be retrieved by
|
|
.BR name_to_handle_at (2)
|
|
on the cgroup path (as a file handle).
|
|
.TP
|
|
.I path
|
|
is the path of the cgroup from the root.
|
|
.RE
|
|
.TP
|
|
.BR PERF_RECORD_TEXT_POKE " (since Linux 5.8)"
|
|
.\" commit e17d43b93e544f5016c0251d2074c15568d5d963
|
|
This record indicates a change in the kernel text.
|
|
This includes addition and removal of the text
|
|
(in which case the old or new length, respectively, is zero).
|
|
.IP
|
|
.in +4n
|
|
.EX
|
|
struct {
|
|
struct perf_event_header header;
|
|
u64 addr;
|
|
u16 old_len;
|
|
u16 new_len;
|
|
u8 bytes[];
|
|
struct sample_id sample_id;
|
|
};
|
|
.EE
|
|
.in
|
|
.RS
|
|
.TP
|
|
.I addr
|
|
is the address of the change
|
|
.TP
|
|
.I old_len
|
|
is the old length
|
|
.TP
|
|
.I new_len
|
|
is the new length
|
|
.TP
|
|
.I bytes
|
|
contains old bytes immediately followed by new bytes.
|
|
.RE
|
|
.RE
|
|
.SS Overflow handling
|
|
Events can be set to notify when a threshold is crossed,
|
|
indicating an overflow.
|
|
Overflow conditions can be captured by monitoring the
|
|
event file descriptor with
|
|
.BR poll (2),
|
|
.BR select (2),
|
|
or
|
|
.BR epoll (7).
|
|
Alternatively, the overflow events can be captured via a signal handler,
|
|
by enabling I/O signaling on the file descriptor; see the discussion of the
|
|
.BR F_SETOWN
|
|
and
|
|
.BR F_SETSIG
|
|
operations in
|
|
.BR fcntl (2).
|
|
.PP
|
|
Overflows are generated only by sampling events
|
|
.RI ( sample_period
|
|
must have a nonzero value).
|
|
.PP
|
|
There are two ways to generate overflow notifications.
|
|
.PP
|
|
The first is to set a
|
|
.I wakeup_events
|
|
or
|
|
.I wakeup_watermark
|
|
value that will trigger if a certain number of samples
|
|
or bytes have been written to the mmap ring buffer.
|
|
In this case,
|
|
.B POLL_IN
|
|
is indicated.
|
|
.PP
|
|
The other way is by use of the
|
|
.B PERF_EVENT_IOC_REFRESH
|
|
ioctl.
|
|
This ioctl adds to a counter that decrements each time the event overflows.
|
|
When nonzero,
|
|
.B POLL_IN
|
|
is indicated, but
|
|
once the counter reaches 0
|
|
.B POLL_HUP
|
|
is indicated and
|
|
the underlying event is disabled.
|
|
.PP
|
|
Refreshing an event group leader refreshes all siblings and
|
|
refreshing with a parameter of 0 currently enables infinite
|
|
refreshes;
|
|
these behaviors are unsupported and should not be relied on.
|
|
.\" See https://lkml.org/lkml/2011/5/24/337
|
|
.PP
|
|
Starting with Linux 3.18,
|
|
.\" commit 179033b3e064d2cd3f5f9945e76b0a0f0fbf4883
|
|
.B POLL_HUP
|
|
is indicated if the event being monitored is attached to a different
|
|
process and that process exits.
|
|
.SS rdpmc instruction
|
|
Starting with Linux 3.4 on x86, you can use the
|
|
.\" commit c7206205d00ab375839bd6c7ddb247d600693c09
|
|
.I rdpmc
|
|
instruction to get low-latency reads without having to enter the kernel.
|
|
Note that using
|
|
.I rdpmc
|
|
is not necessarily faster than other methods for reading event values.
|
|
.PP
|
|
Support for this can be detected with the
|
|
.I cap_usr_rdpmc
|
|
field in the mmap page; documentation on how
|
|
to calculate event values can be found in that section.
|
|
.PP
|
|
Originally, when rdpmc support was enabled, any process (not just ones
|
|
with an active perf event) could use the rdpmc instruction to access
|
|
the counters.
|
|
Starting with Linux 4.0,
|
|
.\" 7911d3f7af14a614617e38245fedf98a724e46a9
|
|
rdpmc support is only allowed if an event is currently enabled
|
|
in a process's context.
|
|
To restore the old behavior, write the value 2 to
|
|
.IR /sys/devices/cpu/rdpmc .
|
|
.SS perf_event ioctl calls
|
|
Various ioctls act on
|
|
.BR perf_event_open ()
|
|
file descriptors:
|
|
.TP
|
|
.B PERF_EVENT_IOC_ENABLE
|
|
This enables the individual event or event group specified by the
|
|
file descriptor argument.
|
|
.IP
|
|
If the
|
|
.B PERF_IOC_FLAG_GROUP
|
|
bit is set in the ioctl argument, then all events in a group are
|
|
enabled, even if the event specified is not the group leader
|
|
(but see BUGS).
|
|
.TP
|
|
.B PERF_EVENT_IOC_DISABLE
|
|
This disables the individual counter or event group specified by the
|
|
file descriptor argument.
|
|
.IP
|
|
Enabling or disabling the leader of a group enables or disables the
|
|
entire group; that is, while the group leader is disabled, none of the
|
|
counters in the group will count.
|
|
Enabling or disabling a member of a group other than the leader
|
|
affects only that counter; disabling a non-leader
|
|
stops that counter from counting but doesn't affect any other counter.
|
|
.IP
|
|
If the
|
|
.B PERF_IOC_FLAG_GROUP
|
|
bit is set in the ioctl argument, then all events in a group are
|
|
disabled, even if the event specified is not the group leader
|
|
(but see BUGS).
|
|
.TP
|
|
.B PERF_EVENT_IOC_REFRESH
|
|
Non-inherited overflow counters can use this
|
|
to enable a counter for a number of overflows specified by the argument,
|
|
after which it is disabled.
|
|
Subsequent calls of this ioctl add the argument value to the current
|
|
count.
|
|
An overflow notification with
|
|
.B POLL_IN
|
|
set will happen on each overflow until the
|
|
count reaches 0; when that happens a notification with
|
|
.B POLL_HUP
|
|
set is sent and the event is disabled.
|
|
Using an argument of 0 is considered undefined behavior.
|
|
.TP
|
|
.B PERF_EVENT_IOC_RESET
|
|
Reset the event count specified by the
|
|
file descriptor argument to zero.
|
|
This resets only the counts; there is no way to reset the
|
|
multiplexing
|
|
.I time_enabled
|
|
or
|
|
.I time_running
|
|
values.
|
|
.IP
|
|
If the
|
|
.B PERF_IOC_FLAG_GROUP
|
|
bit is set in the ioctl argument, then all events in a group are
|
|
reset, even if the event specified is not the group leader
|
|
(but see BUGS).
|
|
.TP
|
|
.B PERF_EVENT_IOC_PERIOD
|
|
This updates the overflow period for the event.
|
|
.IP
|
|
Since Linux 3.7 (on ARM)
|
|
.\" commit 3581fe0ef37ce12ac7a4f74831168352ae848edc
|
|
and Linux 3.14 (all other architectures),
|
|
.\" commit bad7192b842c83e580747ca57104dd51fe08c223
|
|
the new period takes effect immediately.
|
|
On older kernels, the new period did not take effect until
|
|
after the next overflow.
|
|
.IP
|
|
The argument is a pointer to a 64-bit value containing the
|
|
desired new period.
|
|
.IP
|
|
Prior to Linux 2.6.36,
|
|
.\" commit ad0cf3478de8677f720ee06393b3147819568d6a
|
|
this ioctl always failed due to a bug
|
|
in the kernel.
|
|
.TP
|
|
.B PERF_EVENT_IOC_SET_OUTPUT
|
|
This tells the kernel to report event notifications to the specified
|
|
file descriptor rather than the default one.
|
|
The file descriptors must all be on the same CPU.
|
|
.IP
|
|
The argument specifies the desired file descriptor, or \-1 if
|
|
output should be ignored.
|
|
.TP
|
|
.BR PERF_EVENT_IOC_SET_FILTER " (since Linux 2.6.33)"
|
|
.\" commit 6fb2915df7f0747d9044da9dbff5b46dc2e20830
|
|
This adds an ftrace filter to this event.
|
|
.IP
|
|
The argument is a pointer to the desired ftrace filter.
|
|
.TP
|
|
.BR PERF_EVENT_IOC_ID " (since Linux 3.12)"
|
|
.\" commit cf4957f17f2a89984915ea808876d9c82225b862
|
|
This returns the event ID value for the given event file descriptor.
|
|
.IP
|
|
The argument is a pointer to a 64-bit unsigned integer
|
|
to hold the result.
|
|
.TP
|
|
.BR PERF_EVENT_IOC_SET_BPF " (since Linux 4.1)"
|
|
.\" commit 2541517c32be2531e0da59dfd7efc1ce844644f5
|
|
This allows attaching a Berkeley Packet Filter (BPF)
|
|
program to an existing kprobe tracepoint event.
|
|
You need
|
|
.B CAP_PERFMON
|
|
(since Linux 5.8) or
|
|
.B CAP_SYS_ADMIN
|
|
privileges to use this ioctl.
|
|
.IP
|
|
The argument is a BPF program file descriptor that was created by
|
|
a previous
|
|
.BR bpf (2)
|
|
system call.
|
|
.TP
|
|
.BR PERF_EVENT_IOC_PAUSE_OUTPUT " (since Linux 4.7)"
|
|
.\" commit 86e7972f690c1017fd086cdfe53d8524e68c661c
|
|
This allows pausing and resuming the event's ring-buffer.
|
|
A paused ring-buffer does not prevent generation of samples,
|
|
but simply discards them.
|
|
The discarded samples are considered lost, and cause a
|
|
.BR PERF_RECORD_LOST
|
|
sample to be generated when possible.
|
|
An overflow signal may still be triggered by the discarded sample
|
|
even though the ring-buffer remains empty.
|
|
.IP
|
|
The argument is an unsigned 32-bit integer.
|
|
A nonzero value pauses the ring-buffer, while a
|
|
zero value resumes the ring-buffer.
|
|
.TP
|
|
.BR PERF_EVENT_IOC_MODIFY_ATTRIBUTES " (since Linux 4.17)"
|
|
.\" commit 32ff77e8cc9e66cc4fb38098f64fd54cc8f54573
|
|
This allows modifying an existing event without the overhead
|
|
of closing and reopening a new event.
|
|
Currently this is supported only for breakpoint events.
|
|
.IP
|
|
The argument is a pointer to a
|
|
.I perf_event_attr
|
|
structure containing the updated event settings.
|
|
.TP
|
|
.BR PERF_EVENT_IOC_QUERY_BPF " (since Linux 4.16)"
|
|
.\" commit f371b304f12e31fe30207c41ca7754564e0ea4dc
|
|
This allows querying which Berkeley Packet Filter (BPF)
|
|
programs are attached to an existing kprobe tracepoint.
|
|
You can only attach one BPF program per event, but you can
|
|
have multiple events attached to a tracepoint.
|
|
Querying this value on one tracepoint event returns the ID
|
|
of all BPF programs in all events attached to the tracepoint.
|
|
You need
|
|
.B CAP_PERFMON
|
|
(since Linux 5.8) or
|
|
.B CAP_SYS_ADMIN
|
|
privileges to use this ioctl.
|
|
.IP
|
|
The argument is a pointer to a structure
|
|
.in +4n
|
|
.EX
|
|
struct perf_event_query_bpf {
|
|
__u32 ids_len;
|
|
__u32 prog_cnt;
|
|
__u32 ids[0];
|
|
};
|
|
.EE
|
|
.in
|
|
.IP
|
|
The
|
|
.I ids_len
|
|
field indicates the number of ids that can fit in the provided
|
|
.I ids
|
|
array.
|
|
The
|
|
.I prog_cnt
|
|
value is filled in by the kernel with the number of attached
|
|
BPF programs.
|
|
The
|
|
.I ids
|
|
array is filled with the ID of each attached BPF program.
|
|
If there are more programs than will fit in the array, then the
|
|
kernel will return
|
|
.B ENOSPC
|
|
and
|
|
.I ids_len
|
|
will indicate the number of program IDs that were successfully copied.
|
|
.\"
|
|
.SS Using prctl(2)
|
|
A process can enable or disable all currently open event groups
|
|
using the
|
|
.BR prctl (2)
|
|
.B PR_TASK_PERF_EVENTS_ENABLE
|
|
and
|
|
.B PR_TASK_PERF_EVENTS_DISABLE
|
|
operations.
|
|
This applies only to events created locally by the calling process.
|
|
This does not apply to events created by other processes attached
|
|
to the calling process or inherited events from a parent process.
|
|
Only group leaders are enabled and disabled,
|
|
not any other members of the groups.
|
|
.SS perf_event related configuration files
|
|
Files in
|
|
.I /proc/sys/kernel/
|
|
.RS 4
|
|
.TP
|
|
.I /proc/sys/kernel/perf_event_paranoid
|
|
The
|
|
.I perf_event_paranoid
|
|
file can be set to restrict access to the performance counters.
|
|
.IP
|
|
.PD 0
|
|
.RS
|
|
.IP 2 4
|
|
allow only user-space measurements (default since Linux 4.6).
|
|
.\" default changed in commit 0161028b7c8aebef64194d3d73e43bc3b53b5c66
|
|
.IP 1
|
|
allow both kernel and user measurements (default before Linux 4.6).
|
|
.IP 0
|
|
allow access to CPU-specific data but not raw tracepoint samples.
|
|
.IP \-1
|
|
no restrictions.
|
|
.RE
|
|
.PD
|
|
.IP
|
|
The existence of the
|
|
.I perf_event_paranoid
|
|
file is the official method for determining if a kernel supports
|
|
.BR perf_event_open ().
|
|
.TP
|
|
.I /proc/sys/kernel/perf_event_max_sample_rate
|
|
This sets the maximum sample rate.
|
|
Setting this too high can allow
|
|
users to sample at a rate that impacts overall machine performance
|
|
and potentially lock up the machine.
|
|
The default value is
|
|
100000 (samples per second).
|
|
.TP
|
|
.I /proc/sys/kernel/perf_event_max_stack
|
|
.\" Introduced in c5dfd78eb79851e278b7973031b9ca363da87a7e
|
|
This file sets the maximum depth of stack frame entries reported
|
|
when generating a call trace.
|
|
.TP
|
|
.I /proc/sys/kernel/perf_event_mlock_kb
|
|
Maximum amount of memory (in kilobytes) an unprivileged user can
|
|
.BR mlock (2).
|
|
The default is 516 (kB).
|
|
.RE
|
|
.PP
|
|
Files in
|
|
.I /sys/bus/event_source/devices/
|
|
.PP
|
|
.RS 4
|
|
Since Linux 2.6.34, the kernel supports having multiple PMUs
|
|
available for monitoring.
|
|
Information on how to program these PMUs can be found under
|
|
.IR /sys/bus/event_source/devices/ .
|
|
Each subdirectory corresponds to a different PMU.
|
|
.TP
|
|
.IR /sys/bus/event_source/devices/*/type " (since Linux 2.6.38)"
|
|
.\" commit abe43400579d5de0078c2d3a760e6598e183f871
|
|
This contains an integer that can be used in the
|
|
.I type
|
|
field of
|
|
.I perf_event_attr
|
|
to indicate that you wish to use this PMU.
|
|
.TP
|
|
.IR /sys/bus/event_source/devices/cpu/rdpmc " (since Linux 3.4)"
|
|
.\" commit 0c9d42ed4cee2aa1dfc3a260b741baae8615744f
|
|
If this file is 1, then direct user-space access to the
|
|
performance counter registers is allowed via the rdpmc instruction.
|
|
This can be disabled by echoing 0 to the file.
|
|
.IP
|
|
As of Linux 4.0
|
|
.\" a66734297f78707ce39d756b656bfae861d53f62
|
|
.\" 7911d3f7af14a614617e38245fedf98a724e46a9
|
|
the behavior has changed, so that 1 now means only allow access
|
|
to processes with active perf events, with 2 indicating the old
|
|
allow-anyone-access behavior.
|
|
.TP
|
|
.IR /sys/bus/event_source/devices/*/format/ " (since Linux 3.4)"
|
|
.\" commit 641cc938815dfd09f8fa1ec72deb814f0938ac33
|
|
This subdirectory contains information on the architecture-specific
|
|
subfields available for programming the various
|
|
.I config
|
|
fields in the
|
|
.I perf_event_attr
|
|
struct.
|
|
.IP
|
|
The content of each file is the name of the config field, followed
|
|
by a colon, followed by a series of integer bit ranges separated by
|
|
commas.
|
|
For example, the file
|
|
.I event
|
|
may contain the value
|
|
.I config1:1,6\-10,44
|
|
which indicates that event is an attribute that occupies bits 1, 6\(en10, and 44
|
|
of
|
|
.IR perf_event_attr::config1 .
|
|
.TP
|
|
.IR /sys/bus/event_source/devices/*/events/ " (since Linux 3.4)"
|
|
.\" commit 641cc938815dfd09f8fa1ec72deb814f0938ac33
|
|
This subdirectory contains files with predefined events.
|
|
The contents are strings describing the event settings
|
|
expressed in terms of the fields found in the previously mentioned
|
|
.I ./format/
|
|
directory.
|
|
These are not necessarily complete lists of all events supported by
|
|
a PMU, but usually a subset of events deemed useful or interesting.
|
|
.IP
|
|
The content of each file is a list of attribute names
|
|
separated by commas.
|
|
Each entry has an optional value (either hex or decimal).
|
|
If no value is specified, then it is assumed to be a single-bit
|
|
field with a value of 1.
|
|
An example entry may look like this:
|
|
.IR event=0x2,inv,ldlat=3 .
|
|
.TP
|
|
.I /sys/bus/event_source/devices/*/uevent
|
|
This file is the standard kernel device interface
|
|
for injecting hotplug events.
|
|
.TP
|
|
.IR /sys/bus/event_source/devices/*/cpumask " (since Linux 3.7)"
|
|
.\" commit 314d9f63f385096580e9e2a06eaa0745d92fe4ac
|
|
The
|
|
.I cpumask
|
|
file contains a comma-separated list of integers that
|
|
indicate a representative CPU number for each socket (package)
|
|
on the motherboard.
|
|
This is needed when setting up uncore or northbridge events, as
|
|
those PMUs present socket-wide events.
|
|
.RE
|
|
.SH RETURN VALUE
|
|
On success,
|
|
.BR perf_event_open ()
|
|
returns the new file descriptor.
|
|
On error, \-1 is returned and
|
|
.I errno
|
|
is set to indicate the error.
|
|
.SH ERRORS
|
|
The errors returned by
|
|
.BR perf_event_open ()
|
|
can be inconsistent, and may
|
|
vary across processor architectures and performance monitoring units.
|
|
.TP
|
|
.B E2BIG
|
|
Returned if the
|
|
.I perf_event_attr
|
|
.I size
|
|
value is too small
|
|
(smaller than
|
|
.BR PERF_ATTR_SIZE_VER0 ),
|
|
too big (larger than the page size),
|
|
or larger than the kernel supports and the extra bytes are not zero.
|
|
When
|
|
.B E2BIG
|
|
is returned, the
|
|
.I perf_event_attr
|
|
.I size
|
|
field is overwritten by the kernel to be the size of the structure
|
|
it was expecting.
|
|
.TP
|
|
.B EACCES
|
|
Returned when the requested event requires
|
|
.B CAP_PERFMON
|
|
(since Linux 5.8) or
|
|
.B CAP_SYS_ADMIN
|
|
permissions (or a more permissive perf_event paranoid setting).
|
|
Some common cases where an unprivileged process
|
|
may encounter this error:
|
|
attaching to a process owned by a different user;
|
|
monitoring all processes on a given CPU (i.e., specifying the
|
|
.I pid
|
|
argument as \-1);
|
|
and not setting
|
|
.I exclude_kernel
|
|
when the paranoid setting requires it.
|
|
.TP
|
|
.B EBADF
|
|
Returned if the
|
|
.I group_fd
|
|
file descriptor is not valid, or, if
|
|
.B PERF_FLAG_PID_CGROUP
|
|
is set,
|
|
the cgroup file descriptor in
|
|
.I pid
|
|
is not valid.
|
|
.TP
|
|
.BR EBUSY " (since Linux 4.1)"
|
|
.\" bed5b25ad9c8a2f5d735ef0bc746ec870c01c1b0
|
|
Returned if another event already has exclusive
|
|
access to the PMU.
|
|
.TP
|
|
.B EFAULT
|
|
Returned if the
|
|
.I attr
|
|
pointer points at an invalid memory address.
|
|
.TP
|
|
.B EINVAL
|
|
Returned if the specified event is invalid.
|
|
There are many possible reasons for this.
|
|
A non-exhaustive list:
|
|
.I sample_freq
|
|
is higher than the maximum setting;
|
|
the
|
|
.I cpu
|
|
to monitor does not exist;
|
|
.I read_format
|
|
is out of range;
|
|
.I sample_type
|
|
is out of range;
|
|
the
|
|
.I flags
|
|
value is out of range;
|
|
.I exclusive
|
|
or
|
|
.I pinned
|
|
set and the event is not a group leader;
|
|
the event
|
|
.I config
|
|
values are out of range or set reserved bits;
|
|
the generic event selected is not supported; or
|
|
there is not enough room to add the selected event.
|
|
.TP
|
|
.B EINTR
|
|
Returned when trying to mix perf and ftrace handling
|
|
for a uprobe.
|
|
.TP
|
|
.B EMFILE
|
|
Each opened event uses one file descriptor.
|
|
If a large number of events are opened,
|
|
the per-process limit on the number of open file descriptors will be reached,
|
|
and no more events can be created.
|
|
.TP
|
|
.B ENODEV
|
|
Returned when the event involves a feature not supported
|
|
by the current CPU.
|
|
.TP
|
|
.B ENOENT
|
|
Returned if the
|
|
.I type
|
|
setting is not valid.
|
|
This error is also returned for
|
|
some unsupported generic events.
|
|
.TP
|
|
.B ENOSPC
|
|
Prior to Linux 3.3, if there was not enough room for the event,
|
|
.\" commit aa2bc1ade59003a379ffc485d6da2d92ea3370a6
|
|
.B ENOSPC
|
|
was returned.
|
|
In Linux 3.3, this was changed to
|
|
.BR EINVAL .
|
|
.B ENOSPC
|
|
is still returned if you try to add more breakpoint events
|
|
than supported by the hardware.
|
|
.TP
|
|
.B ENOSYS
|
|
Returned if
|
|
.B PERF_SAMPLE_STACK_USER
|
|
is set in
|
|
.I sample_type
|
|
and it is not supported by hardware.
|
|
.TP
|
|
.B EOPNOTSUPP
|
|
Returned if an event requiring a specific hardware feature is
|
|
requested but there is no hardware support.
|
|
This includes requesting low-skid events if not supported,
|
|
branch tracing if it is not available, sampling if no PMU
|
|
interrupt is available, and branch stacks for software events.
|
|
.TP
|
|
.BR EOVERFLOW " (since Linux 4.8)"
|
|
.\" 97c79a38cd454602645f0470ffb444b3b75ce574
|
|
Returned if
|
|
.B PERF_SAMPLE_CALLCHAIN
|
|
is requested and
|
|
.I sample_max_stack
|
|
is larger than the maximum specified in
|
|
.IR /proc/sys/kernel/perf_event_max_stack .
|
|
.TP
|
|
.B EPERM
|
|
Returned on many (but not all) architectures when an unsupported
|
|
.IR exclude_hv ", " exclude_idle ", " exclude_user ", or " exclude_kernel
|
|
setting is specified.
|
|
.IP
|
|
It can also happen, as with
|
|
.BR EACCES ,
|
|
when the requested event requires
|
|
.B CAP_PERFMON
|
|
(since Linux 5.8) or
|
|
.B CAP_SYS_ADMIN
|
|
permissions (or a more permissive perf_event paranoid setting).
|
|
This includes setting a breakpoint on a kernel address,
|
|
and (since Linux 3.13) setting a kernel function-trace tracepoint.
|
|
.\" commit a4e95fc2cbb31d70a65beffeaf8773f881328c34
|
|
.TP
|
|
.B ESRCH
|
|
Returned if attempting to attach to a process that does not exist.
|
|
.SH VERSIONS
|
|
.BR perf_event_open ()
|
|
was introduced in Linux 2.6.31 but was called
|
|
.\" commit 0793a61d4df8daeac6492dbf8d2f3e5713caae5e
|
|
.BR perf_counter_open ().
|
|
It was renamed in Linux 2.6.32.
|
|
.\" commit cdd6c482c9ff9c55475ee7392ec8f672eddb7be6
|
|
.SH CONFORMING TO
|
|
This
|
|
.BR perf_event_open ()
|
|
system call is Linux-specific
|
|
and should not be used in programs intended to be portable.
|
|
.SH NOTES
|
|
The official way of knowing if
|
|
.BR perf_event_open ()
|
|
support is enabled is checking
|
|
for the existence of the file
|
|
.IR /proc/sys/kernel/perf_event_paranoid .
|
|
.PP
|
|
.B CAP_PERFMON
|
|
capability (since Linux 5.8) provides a secure approach to
|
|
performance monitoring and observability operations in a system
|
|
according to the principle of least privilege (POSIX IEEE 1003.1e).
|
|
Accessing system performance monitoring and observability operations
|
|
using
|
|
.B CAP_PERFMON
|
|
rather than the much more powerful
|
|
.B CAP_SYS_ADMIN
|
|
excludes chances to misuse credentials and makes operations more secure.
|
|
.B CAP_SYS_ADMIN
|
|
usage for secure system performance monitoring and observability
|
|
is discouraged in favor of the
|
|
.B CAP_PERFMON
|
|
capability.
|
|
.SH BUGS
|
|
The
|
|
.B F_SETOWN_EX
|
|
option to
|
|
.BR fcntl (2)
|
|
is needed to properly get overflow signals in threads.
|
|
This was introduced in Linux 2.6.32.
|
|
.\" commit ba0a6c9f6fceed11c6a99e8326f0477fe383e6b5
|
|
.PP
|
|
Prior to Linux 2.6.33 (at least for x86),
|
|
.\" commit b690081d4d3f6a23541493f1682835c3cd5c54a1
|
|
the kernel did not check
|
|
if events could be scheduled together until read time.
|
|
The same happens on all known kernels if the NMI watchdog is enabled.
|
|
This means to see if a given set of events works you have to
|
|
.BR perf_event_open (),
|
|
start, then read before you know for sure you
|
|
can get valid measurements.
|
|
.PP
|
|
Prior to Linux 2.6.34,
|
|
.\" FIXME . cannot find a kernel commit for this one
|
|
event constraints were not enforced by the kernel.
|
|
In that case, some events would silently return "0" if the kernel
|
|
scheduled them in an improper counter slot.
|
|
.PP
|
|
Prior to Linux 2.6.34, there was a bug when multiplexing where the
|
|
wrong results could be returned.
|
|
.\" commit 45e16a6834b6af098702e5ea6c9a40de42ff77d8
|
|
.PP
|
|
On Linux 2.6.35 to Linux 2.6.39, the kernel can quickly crash if
|
|
"inherit" is enabled and many threads are started.
|
|
.\" commit 38b435b16c36b0d863efcf3f07b34a6fac9873fd
|
|
.PP
|
|
Prior to Linux 2.6.35,
|
|
.\" commit 050735b08ca8a016bbace4445fa025b88fee770b
|
|
.B PERF_FORMAT_GROUP
|
|
did not work with attached processes.
|
|
.PP
|
|
There is a bug in the kernel code between
|
|
Linux 2.6.36 and Linux 3.0 that ignores the
|
|
"watermark" field and acts as if a wakeup_event
|
|
was chosen if the union has a
|
|
nonzero value in it.
|
|
.\" commit 4ec8363dfc1451f8c8f86825731fe712798ada02
|
|
.PP
|
|
From Linux 2.6.31 to Linux 3.4, the
|
|
.B PERF_IOC_FLAG_GROUP
|
|
ioctl argument was broken and would repeatedly operate
|
|
on the event specified rather than iterating across
|
|
all sibling events in a group.
|
|
.\" commit 724b6daa13e100067c30cfc4d1ad06629609dc4e
|
|
.PP
|
|
From Linux 3.4 to Linux 3.11, the mmap
|
|
.\" commit fa7315871046b9a4c48627905691dbde57e51033
|
|
.I cap_usr_rdpmc
|
|
and
|
|
.I cap_usr_time
|
|
bits mapped to the same location.
|
|
Code should migrate to the new
|
|
.I cap_user_rdpmc
|
|
and
|
|
.I cap_user_time
|
|
fields instead.
|
|
.PP
|
|
Always double-check your results!
|
|
Various generalized events have had wrong values.
|
|
For example, retired branches measured
|
|
the wrong thing on AMD machines until Linux 2.6.35.
|
|
.\" commit f287d332ce835f77a4f5077d2c0ef1e3f9ea42d2
|
|
.SH EXAMPLES
|
|
The following is a short example that measures the total
|
|
instruction count of a call to
|
|
.BR printf (3).
|
|
.PP
|
|
.EX
|
|
#include <stdlib.h>
|
|
#include <stdio.h>
|
|
#include <unistd.h>
|
|
#include <string.h>
|
|
#include <sys/ioctl.h>
|
|
#include <linux/perf_event.h>
|
|
#include <asm/unistd.h>
|
|
|
|
static long
|
|
perf_event_open(struct perf_event_attr *hw_event, pid_t pid,
|
|
int cpu, int group_fd, unsigned long flags)
|
|
{
|
|
int ret;
|
|
|
|
ret = syscall(__NR_perf_event_open, hw_event, pid, cpu,
|
|
group_fd, flags);
|
|
return ret;
|
|
}
|
|
|
|
int
|
|
main(int argc, char **argv)
|
|
{
|
|
struct perf_event_attr pe;
|
|
long long count;
|
|
int fd;
|
|
|
|
memset(&pe, 0, sizeof(pe));
|
|
pe.type = PERF_TYPE_HARDWARE;
|
|
pe.size = sizeof(pe);
|
|
pe.config = PERF_COUNT_HW_INSTRUCTIONS;
|
|
pe.disabled = 1;
|
|
pe.exclude_kernel = 1;
|
|
pe.exclude_hv = 1;
|
|
|
|
fd = perf_event_open(&pe, 0, \-1, \-1, 0);
|
|
if (fd == \-1) {
|
|
fprintf(stderr, "Error opening leader %llx\en", pe.config);
|
|
exit(EXIT_FAILURE);
|
|
}
|
|
|
|
ioctl(fd, PERF_EVENT_IOC_RESET, 0);
|
|
ioctl(fd, PERF_EVENT_IOC_ENABLE, 0);
|
|
|
|
printf("Measuring instruction count for this printf\en");
|
|
|
|
ioctl(fd, PERF_EVENT_IOC_DISABLE, 0);
|
|
read(fd, &count, sizeof(count));
|
|
|
|
printf("Used %lld instructions\en", count);
|
|
|
|
close(fd);
|
|
}
|
|
.EE
|
|
.SH SEE ALSO
|
|
.BR perf (1),
|
|
.BR fcntl (2),
|
|
.BR mmap (2),
|
|
.BR open (2),
|
|
.BR prctl (2),
|
|
.BR read (2)
|
|
.PP
|
|
.IR Documentation/admin\-guide/perf\-security.rst
|
|
in the kernel source tree
|