mirror of https://github.com/mkerrisk/man-pages
2096 lines
49 KiB
Groff
2096 lines
49 KiB
Groff
.\" Hey Emacs! This file is -*- nroff -*- source.
|
|
.\"
|
|
.\" Copyright (c) 2012, Vincent Weaver
|
|
.\"
|
|
.\" This is free documentation; you can redistribute it and/or
|
|
.\" modify it under the terms of the GNU General Public License as
|
|
.\" published by the Free Software Foundation; either version 2 of
|
|
.\" the License, or (at your option) any later version.
|
|
.\"
|
|
.\" The GNU General Public License's references to "object code"
|
|
.\" and "executables" are to be interpreted as the output of any
|
|
.\" document formatting or typesetting system, including
|
|
.\" intermediate and printed output.
|
|
.\"
|
|
.\" This manual is distributed in the hope that it will be useful,
|
|
.\" but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
.\" MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
.\" GNU General Public License for more details.
|
|
.\"
|
|
.\" You should have received a copy of the GNU General Public
|
|
.\" License along with this manual; if not, see
|
|
.\" <http://www.gnu.org/licenses/>.
|
|
.\"
|
|
.\" This document is based on the perf_event.h header file, the
|
|
.\" tools/perf/design.txt file, and a lot of bitter experience.
|
|
.\"
|
|
.TH PERF_EVENT_OPEN 2 2013-02-04 "Linux" "Linux Programmer's Manual"
|
|
.SH NAME
|
|
perf_event_open \- set up performance monitoring
|
|
.SH SYNOPSIS
|
|
.nf
|
|
.B #include <linux/perf_event.h>
|
|
.B #include <linux/hw_breakpoint.h>
|
|
.sp
|
|
.BI "int perf_event_open(struct perf_event_attr *" attr ,
|
|
.BI " pid_t " pid ", int " cpu ", int " group_fd ,
|
|
.BI " unsigned long " flags );
|
|
.fi
|
|
|
|
.IR Note :
|
|
There is no glibc wrapper for this system call; see NOTES.
|
|
.SH DESCRIPTION
|
|
Given a list of parameters,
|
|
.BR perf_event_open ()
|
|
returns a file descriptor, for use in subsequent system calls
|
|
.RB ( read "(2), " mmap "(2), " prctl "(2), " fcntl "(2), etc.)."
|
|
.PP
|
|
A call to
|
|
.BR perf_event_open ()
|
|
creates a file descriptor that allows measuring performance
|
|
information.
|
|
Each file descriptor corresponds to one
|
|
event that is measured; these can be grouped together
|
|
to measure multiple events simultaneously.
|
|
.PP
|
|
Events can be enabled and disabled in two ways: via
|
|
.BR ioctl (2)
|
|
and via
|
|
.BR prctl (2) .
|
|
When an event is disabled it does not count or generate overflows but does
|
|
continue to exist and maintain its count value.
|
|
.PP
|
|
Events come in two flavors: counting and sampled.
|
|
A
|
|
.I counting
|
|
event is one that is used for counting the aggregate number of events
|
|
that occur.
|
|
In general, counting event results are gathered with a
|
|
.BR read (2)
|
|
call.
|
|
A
|
|
.I sampling
|
|
event periodically writes measurements to a buffer that can then
|
|
be accessed via
|
|
.BR mmap (2) .
|
|
.SS Arguments
|
|
.P
|
|
The argument
|
|
.I pid
|
|
allows events to be attached to processes in various ways.
|
|
If
|
|
.I pid
|
|
is 0, measurements happen on the current thread; if
|
|
.I pid
|
|
is greater than 0, the process indicated by
|
|
.I pid
|
|
is measured, and if
|
|
.I pid
|
|
is \-1, all processes are counted.
|
|
|
|
The
|
|
.I cpu
|
|
argument allows measurements to be specific to a CPU.
|
|
If
|
|
.I cpu
|
|
is greater than or equal to 0,
|
|
measurements are restricted to the specified CPU;
|
|
if
|
|
.I cpu
|
|
is \-1, the events are measured on all CPUs.
|
|
.P
|
|
Note that the combination of
|
|
.IR pid " == \-1"
|
|
and
|
|
.IR cpu " == \-1"
|
|
is not valid.
|
|
.P
|
|
A
|
|
.IR pid " > 0"
|
|
and
|
|
.IR cpu " == \-1"
|
|
setting measures per-process and follows that process to whatever CPU the
|
|
process gets scheduled to.
|
|
Per-process events can be created by any user.
|
|
.P
|
|
A
|
|
.IR pid " == \-1"
|
|
and
|
|
.IR cpu " >= 0"
|
|
setting is per-CPU and measures all processes on the specified CPU.
|
|
Per-CPU events need the
|
|
.B CAP_SYS_ADMIN
|
|
capability or a
|
|
.I /proc/sys/kernel/perf_event_paranoid
|
|
value of less than 1.
|
|
.P
|
|
The
|
|
.I group_fd
|
|
argument allows event groups to be created.
|
|
An event group has one event which is the group leader.
|
|
The leader is created first, with
|
|
.IR group_fd " = \-1."
|
|
The rest of the group members are created with subsequent
|
|
.BR perf_event_open ()
|
|
calls with
|
|
.IR group_fd
|
|
being set to the fd of the group leader.
|
|
(A single event on its own is created with
|
|
.IR group_fd " = \-1"
|
|
and is considered to be a group with only 1 member.)
|
|
An event group is scheduled onto the CPU as a unit: it will only
|
|
be put onto the CPU if all of the events in the group can be put onto
|
|
the CPU.
|
|
This means that the values of the member events can be
|
|
meaningfully compared, added, divided (to get ratios), etc., with each
|
|
other, since they have counted events for the same set of executed
|
|
instructions.
|
|
.P
|
|
The
|
|
.I flags
|
|
argument takes one of the following values:
|
|
.TP
|
|
.BR PERF_FLAG_FD_NO_GROUP
|
|
.\" FIXME The following sentence is unclear
|
|
This flag allows creating an event as part of an event group but
|
|
having no group leader.
|
|
It is unclear why this is useful.
|
|
.\" FIXME So, why is it useful?
|
|
.TP
|
|
.BR PERF_FLAG_FD_OUTPUT
|
|
This flag re-routes the output from an event to the group leader.
|
|
.TP
|
|
.BR PERF_FLAG_PID_CGROUP " (Since Linux 2.6.39)."
|
|
This flag activates per-container system-wide monitoring.
|
|
A container
|
|
is an abstraction that isolates a set of resources for finer grain
|
|
control (CPUs, memory, etc.).
|
|
In this mode, the event is measured
|
|
only if the thread running on the monitored CPU belongs to the designated
|
|
container (cgroup).
|
|
The cgroup is identified by passing a file descriptor
|
|
opened on its directory in the cgroupfs filesystem.
|
|
For instance, if the
|
|
cgroup to monitor is called
|
|
.IR test ,
|
|
then a file descriptor opened on
|
|
.I /dev/cgroup/test
|
|
(assuming cgroupfs is mounted on
|
|
.IR /dev/cgroup )
|
|
must be passed as the
|
|
.I pid
|
|
parameter.
|
|
cgroup monitoring is only available
|
|
for system-wide events and may therefore require extra permissions.
|
|
.P
|
|
The
|
|
.I perf_event_attr
|
|
structure provides detailed configuration information
|
|
for the event being created.
|
|
|
|
.in +4n
|
|
.nf
|
|
struct perf_event_attr {
|
|
__u32 type; /* Type of event */
|
|
__u32 size; /* Size of attribute structure */
|
|
__u64 config; /* Type-specific configuration */
|
|
|
|
union {
|
|
__u64 sample_period; /* Period of sampling */
|
|
__u64 sample_freq; /* Frequency of sampling */
|
|
};
|
|
|
|
__u64 sample_type; /* Specifies values included in sample */
|
|
__u64 read_format; /* Specifies values returned in read */
|
|
|
|
__u64 disabled : 1, /* off by default */
|
|
inherit : 1, /* children inherit it */
|
|
pinned : 1, /* must always be on PMU */
|
|
exclusive : 1, /* only group on PMU */
|
|
exclude_user : 1, /* don't count user */
|
|
exclude_kernel : 1, /* don't count kernel */
|
|
exclude_hv : 1, /* don't count hypervisor */
|
|
exclude_idle : 1, /* don't count when idle */
|
|
mmap : 1, /* include mmap data */
|
|
comm : 1, /* include comm data */
|
|
freq : 1, /* use freq, not period */
|
|
inherit_stat : 1, /* per task counts */
|
|
enable_on_exec : 1, /* next exec enables */
|
|
task : 1, /* trace fork/exit */
|
|
watermark : 1, /* wakeup_watermark */
|
|
precise_ip : 2, /* skid constraint */
|
|
mmap_data : 1, /* non-exec mmap data */
|
|
sample_id_all : 1, /* sample_type all events */
|
|
exclude_host : 1, /* don't count in host */
|
|
exclude_guest : 1, /* don't count in guest */
|
|
exclude_callchain_kernel : 1,
|
|
/* exclude kernel callchains */
|
|
exclude_callchain_user : 1,
|
|
/* exclude user callchains */
|
|
__reserved_1 : 41;
|
|
|
|
union {
|
|
__u32 wakeup_events; /* wakeup every n events */
|
|
__u32 wakeup_watermark; /* bytes before wakeup */
|
|
};
|
|
|
|
__u32 bp_type; /* breakpoint type */
|
|
|
|
union {
|
|
__u64 bp_addr; /* breakpoint address */
|
|
__u64 config1; /* extension of config */
|
|
};
|
|
|
|
union {
|
|
__u64 bp_len; /* breakpoint length */
|
|
__u64 config2; /* extension of config1 */
|
|
};
|
|
__u64 branch_sample_type; /* enum perf_branch_sample_type */
|
|
__u64 sample_regs_user; /* user regs to dump on samples */
|
|
__u32 sample_stack_user; /* size of stack to dump on
|
|
samples */
|
|
__u32 __reserved_2; /* Align to u64 */
|
|
|
|
};
|
|
.fi
|
|
.in
|
|
|
|
The fields of the
|
|
.I perf_event_attr
|
|
structure are described in more detail below:
|
|
|
|
.TP
|
|
.I type
|
|
This field specifies the overall event type.
|
|
It has one of the following values:
|
|
.RS
|
|
.TP
|
|
.B PERF_TYPE_HARDWARE
|
|
This indicates one of the "generalized" hardware events provided
|
|
by the kernel.
|
|
See the
|
|
.I config
|
|
field definition for more details.
|
|
.TP
|
|
.B PERF_TYPE_SOFTWARE
|
|
This indicates one of the software-defined events provided by the kernel
|
|
(even if no hardware support is available).
|
|
.TP
|
|
.B PERF_TYPE_TRACEPOINT
|
|
This indicates a tracepoint
|
|
provided by the kernel tracepoint infrastructure.
|
|
.TP
|
|
.B PERF_TYPE_HW_CACHE
|
|
This indicates a hardware cache event.
|
|
This has a special encoding, described in the
|
|
.I config
|
|
field definition.
|
|
.TP
|
|
.B PERF_TYPE_RAW
|
|
This indicates a "raw" implementation-specific event in the
|
|
.IR config " field."
|
|
.TP
|
|
.BR PERF_TYPE_BREAKPOINT " (Since Linux 2.6.33)"
|
|
This indicates a hardware breakpoint as provided by the CPU.
|
|
Breakpoints can be read/write accesses to an address as well as
|
|
execution of an instruction address.
|
|
.TP
|
|
.RB "dynamic PMU"
|
|
Since Linux 2.6.39,
|
|
.BR perf_event_open ()
|
|
can support multiple PMUs.
|
|
To enable this, a value exported by the kernel can be used in the
|
|
.I type
|
|
field to indicate which PMU to use.
|
|
The value to use can be found in the sysfs filesystem:
|
|
there is a subdirectory per PMU instance under
|
|
.IR /sys/bus/event_source/devices .
|
|
In each sub-directory there is a
|
|
.I type
|
|
file whose content is an integer that can be used in the
|
|
.I type
|
|
field.
|
|
For instance,
|
|
.I /sys/bus/event_source/devices/cpu/type
|
|
contains the value for the core CPU PMU, which is usually 4.
|
|
.RE
|
|
|
|
.TP
|
|
.I "size"
|
|
The size of the
|
|
.I perf_event_attr
|
|
structure for forward/backward compatibility.
|
|
Set this using
|
|
.I sizeof(struct perf_event_attr)
|
|
to allow the kernel to see
|
|
the struct size at the time of compilation.
|
|
|
|
The related define
|
|
.B PERF_ATTR_SIZE_VER0
|
|
is set to 64; this was the size of the first published struct.
|
|
.B PERF_ATTR_SIZE_VER1
|
|
is 72, corresponding to the addition of breakpoints in Linux 2.6.33.
|
|
.B PERF_ATTR_SIZE_VER2
|
|
is 80 corresponding to the addition of branch sampling in Linux 3.4.
|
|
.B PERF_ATTR_SIZE_VER3
|
|
is 96 corresponding to the addition
|
|
of sample_regs_user and sample_stack_user in Linux 3.7.
|
|
|
|
.TP
|
|
.I "config"
|
|
This specifies which event you want, in conjunction with
|
|
the
|
|
.I type
|
|
field.
|
|
The
|
|
.IR config1 " and " config2
|
|
fields are also taken into account in cases where 64 bits is not
|
|
enough to fully specify the event.
|
|
The encoding of these fields is event dependent.
|
|
|
|
The most significant bit (bit 63) of
|
|
.I config
|
|
signifies CPU-specific (raw) counter configuration data;
|
|
if the most significant bit is unset, the next 7 bits are an event
|
|
type and the rest of the bits are the event identifier.
|
|
|
|
There are various ways to set the
|
|
.I config
|
|
field that are dependent on the value of the previously
|
|
described
|
|
.I type
|
|
field.
|
|
What follows are various possible settings for
|
|
.I config
|
|
separated out by
|
|
.IR type .
|
|
|
|
If
|
|
.I type
|
|
is
|
|
.BR PERF_TYPE_HARDWARE ,
|
|
we are measuring one of the generalized hardware CPU events.
|
|
Not all of these are available on all platforms.
|
|
Set
|
|
.I config
|
|
to one of the following:
|
|
.RS 12
|
|
.TP
|
|
.B PERF_COUNT_HW_CPU_CYCLES
|
|
Total cycles.
|
|
Be wary of what happens during CPU frequency scaling.
|
|
.TP
|
|
.B PERF_COUNT_HW_INSTRUCTIONS
|
|
Retired instructions.
|
|
Be careful, these can be affected by various
|
|
issues, most notably hardware interrupt counts.
|
|
.TP
|
|
.B PERF_COUNT_HW_CACHE_REFERENCES
|
|
Cache accesses.
|
|
Usually this indicates Last Level Cache accesses but this may
|
|
vary depending on your CPU.
|
|
This may include prefetches and coherency messages; again this
|
|
depends on the design of your CPU.
|
|
.TP
|
|
.B PERF_COUNT_HW_CACHE_MISSES
|
|
Cache misses.
|
|
Usually this indicates Last Level Cache misses; this is intended to be
|
|
used in conjunction with the
|
|
.B PERF_COUNT_HW_CACHE_REFERENCES
|
|
event to calculate cache miss rates.
|
|
.TP
|
|
.B PERF_COUNT_HW_BRANCH_INSTRUCTIONS
|
|
Retired branch instructions.
|
|
Prior to Linux 2.6.34, this used
|
|
the wrong event on AMD processors.
|
|
.TP
|
|
.B PERF_COUNT_HW_BRANCH_MISSES
|
|
Mispredicted branch instructions.
|
|
.TP
|
|
.B PERF_COUNT_HW_BUS_CYCLES
|
|
Bus cycles, which can be different from total cycles.
|
|
.TP
|
|
.BR PERF_COUNT_HW_STALLED_CYCLES_FRONTEND " (Since Linux 3.0)"
|
|
Stalled cycles during issue.
|
|
.TP
|
|
.BR PERF_COUNT_HW_STALLED_CYCLES_BACKEND " (Since Linux 3.0)"
|
|
Stalled cycles during retirement.
|
|
.TP
|
|
.BR PERF_COUNT_HW_REF_CPU_CYCLES " (Since Linux 3.3)"
|
|
Total cycles; not affected by CPU frequency scaling.
|
|
.RE
|
|
.IP
|
|
If
|
|
.I type
|
|
is
|
|
.BR PERF_TYPE_SOFTWARE ,
|
|
we are measuring software events provided by the kernel.
|
|
Set
|
|
.I config
|
|
to one of the following:
|
|
.RS 12
|
|
.TP
|
|
.B PERF_COUNT_SW_CPU_CLOCK
|
|
This reports the CPU clock, a high-resolution per-CPU timer.
|
|
.TP
|
|
.B PERF_COUNT_SW_TASK_CLOCK
|
|
This reports a clock count specific to the task that is running.
|
|
.TP
|
|
.B PERF_COUNT_SW_PAGE_FAULTS
|
|
This reports the number of page faults.
|
|
.TP
|
|
.B PERF_COUNT_SW_CONTEXT_SWITCHES
|
|
This counts context switches.
|
|
Until Linux 2.6.34, these were all reported as user-space
|
|
events; after that they are reported as happening in the kernel.
|
|
.TP
|
|
.B PERF_COUNT_SW_CPU_MIGRATIONS
|
|
This reports the number of times the process
|
|
has migrated to a new CPU.
|
|
.TP
|
|
.B PERF_COUNT_SW_PAGE_FAULTS_MIN
|
|
This counts the number of minor page faults.
|
|
These did not require disk I/O to handle.
|
|
.TP
|
|
.B PERF_COUNT_SW_PAGE_FAULTS_MAJ
|
|
This counts the number of major page faults.
|
|
These required disk I/O to handle.
|
|
.TP
|
|
.BR PERF_COUNT_SW_ALIGNMENT_FAULTS " (Since Linux 2.6.33)"
|
|
This counts the number of alignment faults.
|
|
These happen when unaligned memory accesses happen; the kernel
|
|
can handle these but it reduces performance.
|
|
This only happens on some architectures (never on x86).
|
|
.TP
|
|
.BR PERF_COUNT_SW_EMULATION_FAULTS " (Since Linux 2.6.33)"
|
|
This counts the number of emulation faults.
|
|
The kernel sometimes traps on unimplemented instructions
|
|
and emulates them for user space.
|
|
This can negatively impact performance.
|
|
.RE
|
|
.RE
|
|
|
|
|
|
.RS
|
|
If
|
|
.I type
|
|
is
|
|
.BR PERF_TYPE_TRACEPOINT ,
|
|
then we are measuring kernel tracepoints.
|
|
The value to use in
|
|
.I config
|
|
can be obtained from under debugfs
|
|
.I tracing/events/*/*/id
|
|
if ftrace is enabled in the kernel.
|
|
|
|
.RE
|
|
|
|
.RS
|
|
If
|
|
.I type
|
|
is
|
|
.BR PERF_TYPE_HW_CACHE ,
|
|
then we are measuring a hardware CPU cache event.
|
|
To calculate the appropriate
|
|
.I config
|
|
value use the following equation:
|
|
.RS 4
|
|
.nf
|
|
|
|
(perf_hw_cache_id) | (perf_hw_cache_op_id << 8) |
|
|
(perf_hw_cache_op_result_id << 16)
|
|
.fi
|
|
.P
|
|
where
|
|
.I perf_hw_cache_id
|
|
is one of:
|
|
.RS 4
|
|
.TP
|
|
.B PERF_COUNT_HW_CACHE_L1D
|
|
for measuring Level 1 Data Cache
|
|
.TP
|
|
.B PERF_COUNT_HW_CACHE_L1I
|
|
for measuring Level 1 Instruction Cache
|
|
.TP
|
|
.B PERF_COUNT_HW_CACHE_LL
|
|
for measuring Last-Level Cache
|
|
.TP
|
|
.B PERF_COUNT_HW_CACHE_DTLB
|
|
for measuring the Data TLB
|
|
.TP
|
|
.B PERF_COUNT_HW_CACHE_ITLB
|
|
for measuring the Instruction TLB
|
|
.TP
|
|
.B PERF_COUNT_HW_CACHE_BPU
|
|
for measuring the branch prediction unit
|
|
.TP
|
|
.BR PERF_COUNT_HW_CACHE_NODE " (Since Linux 3.0)"
|
|
for measuring local memory accesses
|
|
.RE
|
|
|
|
.P
|
|
and
|
|
.I perf_hw_cache_op_id
|
|
is one of
|
|
.RS 4
|
|
.TP
|
|
.B PERF_COUNT_HW_CACHE_OP_READ
|
|
for read accesses
|
|
.TP
|
|
.B PERF_COUNT_HW_CACHE_OP_WRITE
|
|
for write accesses
|
|
.TP
|
|
.B PERF_COUNT_HW_CACHE_OP_PREFETCH
|
|
for prefetch accesses
|
|
.RE
|
|
|
|
.P
|
|
and
|
|
.I perf_hw_cache_op_result_id
|
|
is one of
|
|
.RS 4
|
|
.TP
|
|
.B PERF_COUNT_HW_CACHE_RESULT_ACCESS
|
|
to measure accesses
|
|
.TP
|
|
.B PERF_COUNT_HW_CACHE_RESULT_MISS
|
|
to measure misses
|
|
.RE
|
|
.RE
|
|
|
|
If
|
|
.I type
|
|
is
|
|
.BR PERF_TYPE_RAW ,
|
|
then a custom "raw"
|
|
.I config
|
|
value is needed.
|
|
Most CPUs support events that are not covered by the "generalized" events.
|
|
These are implementation defined; see your CPU manual (for example
|
|
the Intel Volume 3B documentation or the AMD BIOS and Kernel Developer
|
|
Guide).
|
|
The libpfm4 library can be used to translate from the name in the
|
|
architectural manuals to the raw hex value
|
|
.BR perf_event_open ()
|
|
expects in this field.
|
|
|
|
If
|
|
.I type
|
|
is
|
|
.BR PERF_TYPE_BREAKPOINT ,
|
|
then leave
|
|
.I config
|
|
set to zero.
|
|
Its parameters are set in other places.
|
|
.RE
|
|
.TP
|
|
.IR sample_period ", " sample_freq
|
|
A "sampling" counter is one that generates an interrupt
|
|
every N events, where N is given by
|
|
.IR sample_period .
|
|
A sampling counter has
|
|
.IR sample_period " > 0."
|
|
When an overflow interrupt occurs, requested data is recorded
|
|
in the mmap buffer.
|
|
The
|
|
.I sample_type
|
|
field controls what data is recorded on each interrupt.
|
|
|
|
.I sample_freq
|
|
can be used if you wish to use frequency rather than period.
|
|
In this case you set the
|
|
.I freq
|
|
flag.
|
|
The kernel will adjust the sampling period
|
|
to try and achieve the desired rate.
|
|
The rate of adjustment is a
|
|
timer tick.
|
|
|
|
|
|
.TP
|
|
.I "sample_type"
|
|
The various bits in this field specify which values to include
|
|
in the sample.
|
|
They will be recorded in a ring-buffer,
|
|
which is available to user-space using
|
|
.BR mmap (2).
|
|
The order in which the values are saved in the
|
|
sample is documented in the MMAP Layout subsection below;
|
|
it is not the
|
|
.I "enum perf_event_sample_format"
|
|
order.
|
|
.RS
|
|
.TP
|
|
.B PERF_SAMPLE_IP
|
|
Records instruction pointer.
|
|
.TP
|
|
.B PERF_SAMPLE_TID
|
|
Records the process and thread IDs.
|
|
.TP
|
|
.B PERF_SAMPLE_TIME
|
|
Records a timestamp.
|
|
.TP
|
|
.B PERF_SAMPLE_ADDR
|
|
Records an address, if applicable.
|
|
.TP
|
|
.B PERF_SAMPLE_READ
|
|
Record counter values for all events in a group, not just the group leader.
|
|
.TP
|
|
.B PERF_SAMPLE_CALLCHAIN
|
|
Records the callchain (stack backtrace).
|
|
.TP
|
|
.B PERF_SAMPLE_ID
|
|
Records a unique ID for the opened event's group leader.
|
|
.TP
|
|
.B PERF_SAMPLE_CPU
|
|
Records CPU number.
|
|
.TP
|
|
.B PERF_SAMPLE_PERIOD
|
|
Records the current sampling period.
|
|
.TP
|
|
.B PERF_SAMPLE_STREAM_ID
|
|
Records a unique ID for the opened event.
|
|
Unlike
|
|
.B PERF_SAMPLE_ID
|
|
the actual ID is returned, not the group leader.
|
|
This ID is the same as the one returned by PERF_FORMAT_ID.
|
|
.TP
|
|
.B PERF_SAMPLE_RAW
|
|
Records additional data, if applicable.
|
|
Usually returned by tracepoint events.
|
|
.TP
|
|
.BR PERF_SAMPLE_BRANCH_STACK " (Since Linux 3.4)"
|
|
Records the branch stack.
|
|
See branch_sample_type.
|
|
.TP
|
|
.BR PERF_SAMPLE_REGS_USER " (Since Linux 3.7)"
|
|
Records the current register state.
|
|
.TP
|
|
.BR PERF_SAMPLE_STACK_USER " (Since Linux 3.7)"
|
|
[To be documented]
|
|
.RE
|
|
|
|
.TP
|
|
.IR "read_format"
|
|
This field specifies the format of the data returned by
|
|
.BR read (2)
|
|
on a
|
|
.BR perf_event_open ()
|
|
file descriptor.
|
|
.RS
|
|
.TP
|
|
.B PERF_FORMAT_TOTAL_TIME_ENABLED
|
|
Adds the 64-bit "time_enabled" field.
|
|
This can be used to calculate estimated totals if
|
|
the PMU is overcommitted and multiplexing is happening.
|
|
.TP
|
|
.B PERF_FORMAT_TOTAL_TIME_RUNNING
|
|
Adds the 64-bit "time_running" field.
|
|
This can be used to calculate estimated totals if
|
|
the PMU is overcommitted and multiplexing is happening.
|
|
.TP
|
|
.B PERF_FORMAT_ID
|
|
Adds a 64-bit unique value that corresponds to the event group.
|
|
.TP
|
|
.B PERF_FORMAT_GROUP
|
|
Allows all counter values in an event group to be read with one read.
|
|
.RE
|
|
|
|
.TP
|
|
.IR "disabled"
|
|
The
|
|
.I disabled
|
|
bit specifies whether the counter starts out disabled or enabled.
|
|
If disabled, the event can later be enabled by
|
|
.BR ioctl (2),
|
|
.BR prctl (2),
|
|
or
|
|
.IR enable_on_exec .
|
|
|
|
.TP
|
|
.IR "inherit"
|
|
The
|
|
.I inherit
|
|
bit specifies that this counter should count events of child
|
|
tasks as well as the task specified.
|
|
This only applies to new children, not to any existing children at
|
|
the time the counter is created (nor to any new children of
|
|
existing children).
|
|
|
|
Inherit does not work for some combinations of
|
|
.IR read_format s,
|
|
such as
|
|
.BR PERF_FORMAT_GROUP .
|
|
|
|
.TP
|
|
.IR "pinned"
|
|
The
|
|
.I pinned
|
|
bit specifies that the counter should always be on the CPU if at all
|
|
possible.
|
|
It only applies to hardware counters and only to group leaders.
|
|
If a pinned counter cannot be put onto the CPU (e.g., because there are
|
|
not enough hardware counters or because of a conflict with some other
|
|
event), then the counter goes into an 'error' state, where reads
|
|
return end-of-file (i.e.,
|
|
.BR read (2)
|
|
returns 0) until the counter is subsequently enabled or disabled.
|
|
|
|
.TP
|
|
.IR "exclusive"
|
|
The
|
|
.I exclusive
|
|
bit specifies that when this counter's group is on the CPU,
|
|
it should be the only group using the CPU's counters.
|
|
In the future this may allow monitoring programs to
|
|
support PMU features that need to run alone so that they do not
|
|
disrupt other hardware counters.
|
|
|
|
.TP
|
|
.IR "exclude_user"
|
|
If this bit is set, the count excludes events that happen in user-space.
|
|
|
|
.TP
|
|
.IR "exclude_kernel"
|
|
If this bit is set, the count excludes events that happen in kernel-space.
|
|
|
|
.TP
|
|
.IR "exclude_hv"
|
|
If this bit is set, the count excludes events that happen in the
|
|
hypervisor.
|
|
This is mainly for PMUs that have built-in support for handling this
|
|
(such as POWER).
|
|
Extra support is needed for handling hypervisor measurements on most
|
|
machines.
|
|
|
|
.TP
|
|
.IR "exclude_idle"
|
|
If set, don't count when the CPU is idle.
|
|
|
|
.TP
|
|
.IR "mmap"
|
|
The
|
|
.I mmap
|
|
bit enables recording of exec mmap events.
|
|
|
|
.TP
|
|
.IR "comm"
|
|
The
|
|
.I comm
|
|
bit enables tracking of process command name as modified by the
|
|
.BR exec (2)
|
|
and
|
|
.BR prctl (PR_SET_NAME)
|
|
system calls.
|
|
Unfortunately for tools,
|
|
there is no way to distinguish one system call versus the other.
|
|
|
|
.TP
|
|
.IR "freq"
|
|
If this bit is set, then
|
|
.I sample_freq
|
|
not
|
|
.I sample_period
|
|
is used when setting up the sampling interval.
|
|
|
|
.TP
|
|
.IR "inherit_stat"
|
|
This bit enables saving of event counts on context switch for
|
|
inherited tasks.
|
|
This is only meaningful if the
|
|
.I inherit
|
|
field is set.
|
|
|
|
.TP
|
|
.IR "enable_on_exec"
|
|
If this bit is set, a counter is automatically
|
|
enabled after a call to
|
|
.BR exec (2).
|
|
|
|
.TP
|
|
.IR "task"
|
|
If this bit is set, then
|
|
fork/exit notifications are included in the ring buffer.
|
|
|
|
.TP
|
|
.IR "watermark"
|
|
If set, have a sampling interrupt happen when we cross the
|
|
.I wakeup_watermark
|
|
boundary.
|
|
Otherwise interrupts happen after
|
|
.I wakeup_events
|
|
samples.
|
|
|
|
.TP
|
|
.IR "precise_ip" " (Since Linux 2.6.35)"
|
|
This controls the amount of skid.
|
|
Skid is how many instructions
|
|
execute between an event of interest happening and the kernel
|
|
being able to stop and record the event.
|
|
Smaller skid is
|
|
better and allows more accurate reporting of which events
|
|
correspond to which instructions, but hardware is often limited
|
|
with how small this can be.
|
|
|
|
The values of this are the following:
|
|
.RS
|
|
.TP
|
|
0 -
|
|
.B SAMPLE_IP
|
|
can have arbitrary skid
|
|
.TP
|
|
1 -
|
|
.B SAMPLE_IP
|
|
must have constant skid
|
|
.TP
|
|
2 -
|
|
.B SAMPLE_IP
|
|
requested to have 0 skid
|
|
.TP
|
|
3 -
|
|
.B SAMPLE_IP
|
|
must have 0 skid.
|
|
See also
|
|
.BR PERF_RECORD_MISC_EXACT_IP .
|
|
.RE
|
|
|
|
.TP
|
|
.IR "mmap_data" " (Since Linux 2.6.36)"
|
|
The counterpart of the
|
|
.I mmap
|
|
field, but enables including data mmap events
|
|
in the ring-buffer.
|
|
|
|
.TP
|
|
.IR "sample_id_all" " (Since Linux 2.6.38)"
|
|
If set, then TID, TIME, ID, CPU, and STREAM_ID can
|
|
additionally be included in
|
|
.RB non- PERF_RECORD_SAMPLE s
|
|
if the corresponding
|
|
.I sample_type
|
|
is selected.
|
|
|
|
.TP
|
|
.IR "exclude_host" " (Since Linux 3.2)"
|
|
Do not measure time spent in VM host.
|
|
|
|
.TP
|
|
.IR "exclude_guest" " (Since Linux 3.2)"
|
|
Do not measure time spent in VM guest.
|
|
|
|
.TP
|
|
.IR "exclude_callchain_kernel" " (Since Linux 3.7)"
|
|
Do not include kernel callchains.
|
|
|
|
.TP
|
|
.IR "exclude_callchain_user" " (Since Linux 3.7)"
|
|
Do not include user callchains.
|
|
|
|
.TP
|
|
.IR "wakeup_events" ", " "wakeup_watermark"
|
|
This union sets how many samples
|
|
.RI ( wakeup_events )
|
|
or bytes
|
|
.RI ( wakeup_watermark )
|
|
happen before an overflow signal happens.
|
|
Which one is used is selected by the
|
|
.I watermark
|
|
bitflag.
|
|
|
|
.TP
|
|
.IR "bp_type" " (Since Linux 2.6.33)"
|
|
This chooses the breakpoint type.
|
|
It is one of:
|
|
.RS
|
|
.TP
|
|
.BR HW_BREAKPOINT_EMPTY
|
|
no breakpoint
|
|
.TP
|
|
.BR HW_BREAKPOINT_R
|
|
count when we read the memory location
|
|
.TP
|
|
.BR HW_BREAKPOINT_W
|
|
count when we write the memory location
|
|
.TP
|
|
.BR HW_BREAKPOINT_RW
|
|
count when we read or write the memory location
|
|
.TP
|
|
.BR HW_BREAKPOINT_X
|
|
count when we execute code at the memory location
|
|
|
|
.LP
|
|
The values can be combined via a bitwise or, but the
|
|
combination of
|
|
.B HW_BREAKPOINT_R
|
|
or
|
|
.B HW_BREAKPOINT_W
|
|
with
|
|
.B HW_BREAKPOINT_X
|
|
is not allowed.
|
|
.RE
|
|
|
|
.TP
|
|
.IR "bp_addr" " (Since Linux 2.6.33)"
|
|
.I bp_addr
|
|
is the address of the breakpoint.
|
|
For execution breakpoints this is the memory address of the instruction
|
|
of interest; for read and write breakpoints it is the memory address
|
|
of the memory location of interest.
|
|
|
|
.TP
|
|
.IR "config1" " (Since Linux 2.6.39)"
|
|
.I config1
|
|
is used for setting events that need an extra register or otherwise
|
|
do not fit in the regular config field.
|
|
Raw OFFCORE_EVENTS on Nehalem/Westmere/SandyBridge use this field
|
|
on 3.3 and later kernels.
|
|
|
|
.TP
|
|
.IR "bp_len" " (Since Linux 2.6.33)"
|
|
.I bp_len
|
|
is the length of the breakpoint being measured if
|
|
.I type
|
|
is
|
|
.BR PERF_TYPE_BREAKPOINT .
|
|
Options are
|
|
.BR HW_BREAKPOINT_LEN_1 ,
|
|
.BR HW_BREAKPOINT_LEN_2 ,
|
|
.BR HW_BREAKPOINT_LEN_4 ,
|
|
.BR HW_BREAKPOINT_LEN_8 .
|
|
For an execution breakpoint, set this to
|
|
.IR sizeof(long) .
|
|
|
|
.TP
|
|
.IR "config2" " (Since Linux 2.6.39)"
|
|
|
|
.I config2
|
|
is a further extension of the
|
|
.I config1
|
|
field.
|
|
|
|
.TP
|
|
.IR "branch_sample_type" " (Since Linux 3.4)"
|
|
This is used with the CPU's hardware branch sampling, if available.
|
|
It can have one of the following values:
|
|
.RS
|
|
.TP
|
|
.B PERF_SAMPLE_BRANCH_USER
|
|
Branch target is in user space
|
|
.TP
|
|
.B PERF_SAMPLE_BRANCH_KERNEL
|
|
Branch target is in kernel space
|
|
.TP
|
|
.B PERF_SAMPLE_BRANCH_HV
|
|
Branch target is in hypervisor
|
|
.TP
|
|
.B PERF_SAMPLE_BRANCH_ANY
|
|
Any branch type.
|
|
.TP
|
|
.B PERF_SAMPLE_BRANCH_ANY_CALL
|
|
Any call branch
|
|
.TP
|
|
.B PERF_SAMPLE_BRANCH_ANY_RETURN
|
|
Any return branch
|
|
.TP
|
|
.BR PERF_SAMPLE_BRANCH_IND_CALL
|
|
Indirect calls
|
|
.TP
|
|
.BR PERF_SAMPLE_BRANCH_PLM_ALL
|
|
User, kernel, and hv
|
|
.RE
|
|
|
|
.TP
|
|
.IR "sample_regs_user" " (Since Linux 3.7)"
|
|
This defines the set of user registers to dump on samples.
|
|
See
|
|
.\" FIXME: The following reference seems to be not quite right:
|
|
.IR asm/perf_regs.h .
|
|
|
|
.TP
|
|
.IR "sample_stack_user" " (Since Linux 3.7)"
|
|
This defines the size of the user stack to dump on sample.
|
|
|
|
.RE
|
|
|
|
.SS "Reading Results"
|
|
Once a
|
|
.BR perf_event_open ()
|
|
file descriptor has been opened, the values
|
|
of the events can be read from the file descriptor.
|
|
The values that are there are specified by the
|
|
.I read_format
|
|
field in the
|
|
.I attr
|
|
structure at open time.
|
|
|
|
If you attempt to read into a buffer that is not big enough to hold the
|
|
data,
|
|
.B ENOSPC
|
|
is returned.
|
|
|
|
Here is the layout of the data returned by a read:
|
|
|
|
If
|
|
.B PERF_FORMAT_GROUP
|
|
was specified to allow reading all events in a group at once:
|
|
|
|
.in +4n
|
|
.nf
|
|
struct read_format {
|
|
u64 nr; /* The number of events */
|
|
u64 time_enabled; /* if PERF_FORMAT_TOTAL_TIME_ENABLED */
|
|
u64 time_running; /* if PERF_FORMAT_TOTAL_TIME_RUNNING */
|
|
struct {
|
|
u64 value; /* The value of the event */
|
|
u64 id; /* if PERF_FORMAT_ID */
|
|
} values[nr];
|
|
};
|
|
.fi
|
|
.in
|
|
|
|
If
|
|
.B PERF_FORMAT_GROUP
|
|
was
|
|
.I not
|
|
specified, then the read values look as follows:
|
|
|
|
.in +4n
|
|
.nf
|
|
struct read_format {
|
|
u64 value; /* The value of the event */
|
|
u64 time_enabled; /* if PERF_FORMAT_TOTAL_TIME_ENABLED */
|
|
u64 time_running; /* if PERF_FORMAT_TOTAL_TIME_RUNNING */
|
|
u64 id; /* if PERF_FORMAT_ID */
|
|
};
|
|
.fi
|
|
.in
|
|
|
|
The values read are described in more detail below.
|
|
.RS
|
|
.TP
|
|
.I nr
|
|
The number of events in this file descriptor.
|
|
Only available if
|
|
.B PERF_FORMAT_GROUP
|
|
was specified.
|
|
|
|
.TP
|
|
.IR time_enabled ", " time_running
|
|
Total time the event was enabled and running.
|
|
Normally these are the same.
|
|
If more events are started
|
|
than available counter slots on the PMU, then multiplexing
|
|
happens and events only run part of the time.
|
|
In that case the
|
|
.I time_enabled
|
|
and
|
|
.I time_running
|
|
values can be used to scale an estimated value for the count.
|
|
|
|
.TP
|
|
.I value
|
|
An unsigned 64-bit value containing the counter result.
|
|
|
|
.TP
|
|
.I id
|
|
A globally unique value for this particular event, only there if
|
|
.B PERF_FORMAT_ID
|
|
was specified in read_format.
|
|
|
|
.RE
|
|
.RE
|
|
|
|
|
|
|
|
.SS "MMAP Layout"
|
|
|
|
When using
|
|
.BR perf_event_open ()
|
|
in sampled mode, asynchronous events
|
|
(like counter overflow or
|
|
.B PROT_EXEC
|
|
mmap tracking)
|
|
are logged into a ring-buffer.
|
|
This ring-buffer is created and accessed through
|
|
.BR mmap (2).
|
|
|
|
The mmap size should be 1+2^n pages, where the first page is a
|
|
metadata page
|
|
.RI ( "struct perf_event_mmap_page" )
|
|
that contains various
|
|
bits of information such as where the ring-buffer head is.
|
|
|
|
Before kernel 2.6.39, there is a bug that means you must allocate a mmap
|
|
ring buffer when sampling even if you do not plan to access it.
|
|
|
|
The structure of the first metadata mmap page is as follows:
|
|
|
|
.in +4n
|
|
.nf
|
|
struct perf_event_mmap_page {
|
|
__u32 version; /* version number of this structure */
|
|
__u32 compat_version; /* lowest version this is compat with */
|
|
__u32 lock; /* seqlock for synchronization */
|
|
__u32 index; /* hardware counter identifier */
|
|
__s64 offset; /* add to hardware counter value */
|
|
__u64 time_enabled; /* time event active */
|
|
__u64 time_running; /* time event on CPU */
|
|
union {
|
|
__u64 capabilities;
|
|
__u64 cap_usr_time : 1,
|
|
cap_usr_rdpmc : 1,
|
|
};
|
|
__u16 pmc_width;
|
|
__u16 time_shift;
|
|
__u32 time_mult;
|
|
__u64 time_offset;
|
|
__u64 __reserved[120]; /* Pad to 1k */
|
|
__u64 data_head; /* head in the data section */
|
|
__u64 data_tail; /* user-space written tail */
|
|
};
|
|
.fi
|
|
.in
|
|
|
|
|
|
|
|
The following looks at the fields in the
|
|
.I perf_event_mmap_page
|
|
structure in more detail.
|
|
|
|
.RS 4
|
|
|
|
.TP
|
|
.I version
|
|
Version number of this structure.
|
|
|
|
.TP
|
|
.I compat_version
|
|
The lowest version this is compatible with.
|
|
|
|
.TP
|
|
.I lock
|
|
A seqlock for synchronization.
|
|
|
|
.TP
|
|
.I index
|
|
A unique hardware counter identifier.
|
|
|
|
.TP
|
|
.I offset
|
|
.\" FIXME clarify
|
|
Add this to hardware counter value??
|
|
|
|
.TP
|
|
.I time_enabled
|
|
Time the event was active.
|
|
|
|
.TP
|
|
.I time_running
|
|
Time the event was running.
|
|
|
|
.TP
|
|
.I cap_usr_time
|
|
User time capability.
|
|
|
|
.TP
|
|
.I cap_usr_rdpmc
|
|
If the hardware supports user-space read of performance counters
|
|
without syscall (this is the "rdpmc" instruction on x86), then
|
|
the following code can be used to do a read:
|
|
|
|
.in +4n
|
|
.nf
|
|
u32 seq, time_mult, time_shift, idx, width;
|
|
u64 count, enabled, running;
|
|
u64 cyc, time_offset;
|
|
s64 pmc = 0;
|
|
|
|
do {
|
|
seq = pc\->lock;
|
|
barrier();
|
|
enabled = pc\->time_enabled;
|
|
running = pc\->time_running;
|
|
|
|
if (pc\->cap_usr_time && enabled != running) {
|
|
cyc = rdtsc();
|
|
time_offset = pc\->time_offset;
|
|
time_mult = pc\->time_mult;
|
|
time_shift = pc\->time_shift;
|
|
}
|
|
|
|
idx = pc\->index;
|
|
count = pc\->offset;
|
|
|
|
if (pc\->cap_usr_rdpmc && idx) {
|
|
width = pc\->pmc_width;
|
|
pmc = rdpmc(idx \- 1);
|
|
}
|
|
|
|
barrier();
|
|
} while (pc\->lock != seq);
|
|
.fi
|
|
.in
|
|
|
|
|
|
|
|
.TP
|
|
.I pmc_width
|
|
If
|
|
.IR cap_usr_rdpmc ,
|
|
this field provides the bit-width of the value
|
|
read using the rdpmc or equivalent instruction.
|
|
This can be used to sign extend the result like:
|
|
|
|
.in +4n
|
|
.nf
|
|
pmc <<= 64 \- pmc_width;
|
|
pmc >>= 64 \- pmc_width; // signed shift right
|
|
count += pmc;
|
|
.fi
|
|
.in
|
|
|
|
|
|
.TP
|
|
.IR time_shift ", " time_mult ", " time_offset
|
|
|
|
If
|
|
.IR cap_usr_time ,
|
|
these fields can be used to compute the time
|
|
delta since time_enabled (in nanoseconds) using rdtsc or similar.
|
|
.nf
|
|
|
|
u64 quot, rem;
|
|
u64 delta;
|
|
quot = (cyc >> time_shift);
|
|
rem = cyc & ((1 << time_shift) \- 1);
|
|
delta = time_offset + quot * time_mult +
|
|
((rem * time_mult) >> time_shift);
|
|
.fi
|
|
|
|
Where
|
|
.IR time_offset ,
|
|
.IR time_mult ,
|
|
.IR time_shift ,
|
|
and
|
|
.IR cyc
|
|
are read in the
|
|
seqcount loop described above.
|
|
This delta can then be added to
|
|
enabled and possibly running (if idx), improving the scaling:
|
|
.nf
|
|
|
|
enabled += delta;
|
|
if (idx)
|
|
running += delta;
|
|
quot = count / running;
|
|
rem = count % running;
|
|
count = quot * enabled + (rem * enabled) / running;
|
|
.fi
|
|
|
|
.TP
|
|
.I data_head
|
|
This points to the head of the data section.
|
|
The value continuously increases; it does not wrap.
|
|
The value needs to be manually wrapped by the size of the mmap buffer
|
|
before accessing the samples.
|
|
|
|
On SMP-capable platforms, after reading the data_head value,
|
|
user-space should issue an rmb().
|
|
|
|
.TP
|
|
.I data_tail
|
|
When the mapping is
|
|
.BR PROT_WRITE ,
|
|
the
|
|
.I data_tail
|
|
value should be written by user space to reflect the last read data.
|
|
In this case the kernel will not over-write unread data.
|
|
|
|
.RE
|
|
|
|
|
|
The following 2^n ring-buffer pages have the layout described below.
|
|
|
|
If
|
|
.I perf_event_attr.sample_id_all
|
|
is set, then all event types will
|
|
have the sample_type selected fields related to where/when (identity)
|
|
an event took place (TID, TIME, ID, CPU, STREAM_ID) described in
|
|
.B PERF_RECORD_SAMPLE
|
|
below; they will be stashed just after the
|
|
.I perf_event_header
|
|
and the fields already present for the existing
|
|
fields, i.e., at the end of the payload.
|
|
That way a newer perf.data
|
|
file will be supported by older perf tools, with these new optional
|
|
fields being ignored.
|
|
|
|
The mmap values start with a header:
|
|
|
|
.in +4n
|
|
.nf
|
|
struct perf_event_header {
|
|
__u32 type;
|
|
__u16 misc;
|
|
__u16 size;
|
|
};
|
|
.fi
|
|
.in
|
|
|
|
Below, we describe the
|
|
.I perf_event_header
|
|
fields in more detail.
|
|
|
|
.TP
|
|
.I type
|
|
The
|
|
.I type
|
|
value is one of the below.
|
|
The values in the corresponding record (that follows the header)
|
|
depend on the
|
|
.I type
|
|
selected as shown.
|
|
|
|
.RS
|
|
.TP 4
|
|
.B PERF_RECORD_MMAP
|
|
The MMAP events record the
|
|
.B PROT_EXEC
|
|
mappings so that we can correlate
|
|
user space IPs to code.
|
|
They have the following structure:
|
|
|
|
.in +4n
|
|
.nf
|
|
struct {
|
|
struct perf_event_header header;
|
|
u32 pid, tid;
|
|
u64 addr;
|
|
u64 len;
|
|
u64 pgoff;
|
|
char filename[];
|
|
};
|
|
.fi
|
|
.in
|
|
|
|
.TP
|
|
.B PERF_RECORD_LOST
|
|
This record indicates when events are lost.
|
|
|
|
.in +4n
|
|
.nf
|
|
struct {
|
|
struct perf_event_header header;
|
|
u64 id;
|
|
u64 lost;
|
|
};
|
|
.fi
|
|
.in
|
|
|
|
.RS
|
|
.TP
|
|
.I id
|
|
is the unique event ID for the samples that were lost.
|
|
.TP
|
|
.I lost
|
|
is the number of events that were lost.
|
|
.RE
|
|
|
|
.TP
|
|
.B PERF_RECORD_COMM
|
|
This record indicates a change in the process name.
|
|
|
|
.in +4n
|
|
.nf
|
|
struct {
|
|
struct perf_event_header header;
|
|
u32 pid, tid;
|
|
char comm[];
|
|
};
|
|
.fi
|
|
.in
|
|
|
|
.TP
|
|
.B PERF_RECORD_EXIT
|
|
This record indicates a process exit event.
|
|
|
|
.in +4n
|
|
.nf
|
|
struct {
|
|
struct perf_event_header header;
|
|
u32 pid, ppid;
|
|
u32 tid, ptid;
|
|
u64 time;
|
|
};
|
|
.fi
|
|
.in
|
|
|
|
.TP
|
|
.BR PERF_RECORD_THROTTLE ", " PERF_RECORD_UNTHROTTLE
|
|
This record indicates a throttle/unthrottle event.
|
|
|
|
.in +4n
|
|
.nf
|
|
struct {
|
|
struct perf_event_header header;
|
|
u64 time;
|
|
u64 id;
|
|
u64 stream_id;
|
|
};
|
|
.fi
|
|
.in
|
|
|
|
.TP
|
|
.B PERF_RECORD_FORK
|
|
This record indicates a fork event.
|
|
|
|
.in +4n
|
|
.nf
|
|
struct {
|
|
struct perf_event_header header;
|
|
u32 pid, ppid;
|
|
u32 tid, ptid;
|
|
u64 time;
|
|
};
|
|
.fi
|
|
.in
|
|
|
|
.TP
|
|
.B PERF_RECORD_READ
|
|
This record indicates a read event.
|
|
|
|
.in +4n
|
|
.nf
|
|
struct {
|
|
struct perf_event_header header;
|
|
u32 pid, tid;
|
|
struct read_format values;
|
|
};
|
|
.fi
|
|
.in
|
|
|
|
.TP
|
|
.B PERF_RECORD_SAMPLE
|
|
This record indicates a sample.
|
|
|
|
.in +4n
|
|
.nf
|
|
struct {
|
|
struct perf_event_header header;
|
|
u64 ip; /* if PERF_SAMPLE_IP */
|
|
u32 pid, tid; /* if PERF_SAMPLE_TID */
|
|
u64 time; /* if PERF_SAMPLE_TIME */
|
|
u64 addr; /* if PERF_SAMPLE_ADDR */
|
|
u64 id; /* if PERF_SAMPLE_ID */
|
|
u64 stream_id; /* if PERF_SAMPLE_STREAM_ID */
|
|
u32 cpu, res; /* if PERF_SAMPLE_CPU */
|
|
u64 period; /* if PERF_SAMPLE_PERIOD */
|
|
struct read_format v; /* if PERF_SAMPLE_READ */
|
|
u64 nr; /* if PERF_SAMPLE_CALLCHAIN */
|
|
u64 ips[nr]; /* if PERF_SAMPLE_CALLCHAIN */
|
|
u32 size; /* if PERF_SAMPLE_RAW */
|
|
char data[size]; /* if PERF_SAMPLE_RAW */
|
|
u64 bnr; /* if PERF_SAMPLE_BRANCH_STACK */
|
|
struct perf_branch_entry lbr[bnr];
|
|
/* if PERF_SAMPLE_BRANCH_STACK */
|
|
u64 abi; /* if PERF_SAMPLE_REGS_USER */
|
|
u64 regs[weight(mask)];
|
|
/* if PERF_SAMPLE_REGS_USER */
|
|
u64 size; /* if PERF_SAMPLE_STACK_USER */
|
|
char data[size]; /* if PERF_SAMPLE_STACK_USER */
|
|
u64 dyn_size; /* if PERF_SAMPLE_STACK_USER */
|
|
};
|
|
.fi
|
|
|
|
.RS
|
|
.TP
|
|
.I ip
|
|
If
|
|
.B PERF_SAMPLE_IP
|
|
is enabled, then a 64-bit instruction
|
|
pointer value is included.
|
|
|
|
.TP
|
|
.IR pid ", " tid
|
|
If
|
|
.B PERF_SAMPLE_TID
|
|
is enabled, then a 32-bit process ID
|
|
and 32-bit thread ID are included.
|
|
|
|
.TP
|
|
.I time
|
|
If
|
|
.B PERF_SAMPLE_TIME
|
|
is enabled, then a 64-bit timestamp
|
|
is included.
|
|
This is obtained via local_clock() which is a hardware timestamp
|
|
if available and the jiffies value if not.
|
|
|
|
.TP
|
|
.I addr
|
|
If
|
|
.B PERF_SAMPLE_ADDR
|
|
is enabled, then a 64-bit address is included.
|
|
This is usually the address of a tracepoint,
|
|
breakpoint, or software event; otherwise the value is 0.
|
|
|
|
.TP
|
|
.I id
|
|
If
|
|
.B PERF_SAMPLE_ID
|
|
is enabled, a 64-bit unique ID is included.
|
|
If the event is a member of an event group, the group leader ID is returned.
|
|
This ID is the same as the one returned by
|
|
.BR PERF_FORMAT_ID .
|
|
|
|
.TP
|
|
.I stream_id
|
|
If
|
|
.B PERF_SAMPLE_STREAM_ID
|
|
is enabled, a 64-bit unique ID is included.
|
|
Unlike
|
|
.B PERF_SAMPLE_ID
|
|
the actual ID is returned, not the group leader.
|
|
This ID is the same as the one returned by
|
|
.BR PERF_FORMAT_ID .
|
|
|
|
.TP
|
|
.IR cpu ", " res
|
|
If
|
|
.B PERF_SAMPLE_CPU
|
|
is enabled, this is a 32-bit value indicating
|
|
which CPU was being used, in addition to a reserved (unused)
|
|
32-bit value.
|
|
|
|
.TP
|
|
.I period
|
|
If
|
|
.B PERF_SAMPLE_PERIOD
|
|
is enabled, a 64-bit value indicating
|
|
the current sampling period is written.
|
|
|
|
.TP
|
|
.I v
|
|
If
|
|
.B PERF_SAMPLE_READ
|
|
is enabled, a structure of type read_format
|
|
is included which has values for all events in the event group.
|
|
The values included depend on the
|
|
.I read_format
|
|
value used at
|
|
.BR perf_event_open ()
|
|
time.
|
|
|
|
.TP
|
|
.IR nr ", " ips[nr]
|
|
If
|
|
.B PERF_SAMPLE_CALLCHAIN
|
|
is enabled, then a 64-bit number is included
|
|
which indicates how many 64-bit instruction pointers will
|
|
follow.
|
|
This is the current callchain.
|
|
|
|
.TP
|
|
.IR size ", " data
|
|
If
|
|
.B PERF_SAMPLE_RAW
|
|
is enabled, then a 32-bit value indicating size
|
|
is included followed by an array of 8-bit values of length size.
|
|
The values are padded with 0 to have 64-bit alignment.
|
|
|
|
This RAW record data is opaque with respect to the ABI.
|
|
The ABI doesn't make any promises with respect to the stability
|
|
of its content; it may vary depending
|
|
on event, hardware, and kernel version.
|
|
|
|
.TP
|
|
.IR bnr ", " lbr[bnr]
|
|
If
|
|
.B PERF_SAMPLE_BRANCH_STACK
|
|
is enabled, then a 64-bit value indicating
|
|
the number of records is included, followed by
|
|
.I bnr
|
|
.I perf_branch_entry
|
|
structures.
|
|
These structures have from, to, and flags values indicating
|
|
the from and to addresses from the branches on the callstack.
|
|
|
|
.TP
|
|
.IR abi ", " regs[weight(mask)]
|
|
If
|
|
.B PERF_SAMPLE_REGS_USER
|
|
is enabled, then
|
|
[to be documented].
|
|
|
|
The
|
|
.I abi
|
|
field is one of
|
|
.BR PERF_SAMPLE_REGS_ABI_NONE ", " PERF_SAMPLE_REGS_ABI_32 " or "
|
|
.BR PERF_SAMPLE_REGS_ABI_64 .
|
|
|
|
.TP
|
|
.IR size ", " data[size] ", " dyn_size
|
|
If
|
|
.B PERF_SAMPLE_STACK_USER
|
|
is enabled, then
|
|
[to be documented].
|
|
|
|
.RE
|
|
|
|
.RE
|
|
|
|
|
|
.TP
|
|
.I misc
|
|
The
|
|
.I misc
|
|
field contains additional information about the sample.
|
|
|
|
The CPU mode can be determined from this value by masking with
|
|
.B PERF_RECORD_MISC_CPUMODE_MASK
|
|
and looking for one of the following (note these are not
|
|
bitmasks, only one can be set at a time):
|
|
.RS
|
|
.TP
|
|
.B PERF_RECORD_MISC_CPUMODE_UNKNOWN
|
|
Unknown CPU mode.
|
|
.TP
|
|
.B PERF_RECORD_MISC_KERNEL
|
|
Sample happened in the kernel.
|
|
.TP
|
|
.B PERF_RECORD_MISC_USER
|
|
Sample happened in user code.
|
|
.TP
|
|
.B PERF_RECORD_MISC_HYPERVISOR
|
|
Sample happened in the hypervisor.
|
|
.TP
|
|
.B PERF_RECORD_MISC_GUEST_KERNEL
|
|
Sample happened in the guest kernel.
|
|
.TP
|
|
.B PERF_RECORD_MISC_GUEST_USER
|
|
Sample happened in guest user code.
|
|
.RE
|
|
|
|
In addition, one of the following bits can be set:
|
|
.RS
|
|
.TP
|
|
.B PERF_RECORD_MISC_EXACT_IP
|
|
This indicates that the content of
|
|
.B PERF_SAMPLE_IP
|
|
points
|
|
to the actual instruction that triggered the event.
|
|
See also
|
|
.IR perf_event_attr.precise_ip .
|
|
|
|
.TP
|
|
.B PERF_RECORD_MISC_EXT_RESERVED
|
|
This indicates there is extended data available (currently not used).
|
|
|
|
.RE
|
|
|
|
.TP
|
|
.I size
|
|
This indicates the size of the record.
|
|
|
|
.RE
|
|
|
|
.SS "Signal Overflow"
|
|
|
|
Events can be set to deliver a signal when a threshold is crossed.
|
|
The signal handler is set up using the
|
|
.BR poll (2),
|
|
.BR select (2),
|
|
.BR epoll (2)
|
|
and
|
|
.BR fcntl (2)
|
|
system calls.
|
|
|
|
To generate signals, sampling must be enabled
|
|
.RI ( sample_period
|
|
must have a non-zero value).
|
|
|
|
There are two ways to generate signals.
|
|
|
|
The first is to set a
|
|
.I wakeup_events
|
|
or
|
|
.I wakeup_watermark
|
|
value that will generate a signal if a certain number of samples
|
|
or bytes have been written to the mmap ring buffer.
|
|
In this case a signal of type
|
|
.B POLL_IN
|
|
is sent.
|
|
|
|
The other way is by use of the
|
|
.B PERF_EVENT_IOC_REFRESH
|
|
ioctl.
|
|
This ioctl adds to a counter that decrements each time the event overflows.
|
|
When non-zero, a
|
|
.B POLL_IN
|
|
signal is sent on overflow, but
|
|
once the value reaches 0, a signal is sent of type
|
|
.B POLL_HUP
|
|
and
|
|
the underlying event is disabled.
|
|
|
|
Note: on newer kernels (definitely noticed with 3.2)
|
|
.\" FIXME(Vince) : Find out when this was introduced
|
|
a signal is provided for every overflow, even if
|
|
.I wakeup_events
|
|
is not set.
|
|
|
|
.SS "rdpmc instruction"
|
|
Starting with Linux 3.4 on x86, you can use the
|
|
.I rdpmc
|
|
instruction to get low-latency reads without having to enter the kernel.
|
|
Note that using
|
|
.I rdpmc
|
|
is not necessarily faster than other methods for reading event values.
|
|
|
|
Support for this can be detected with the
|
|
.I cap_usr_rdpmc
|
|
field in the mmap page; documentation on how
|
|
to calculate event values can be found in that section.
|
|
|
|
.SS "perf_event ioctl calls"
|
|
.PP
|
|
Various ioctls act on
|
|
.BR perf_event_open ()
|
|
file descriptors:
|
|
|
|
.TP
|
|
.B PERF_EVENT_IOC_ENABLE
|
|
Enables the individual event or event group specified by the
|
|
file descriptor argument.
|
|
|
|
The ioctl argument is ignored.
|
|
|
|
.TP
|
|
.B PERF_EVENT_IOC_DISABLE
|
|
Disables the individual counter or event group specified by the
|
|
file descriptor argument.
|
|
|
|
Enabling or disabling the leader of a group enables or disables the
|
|
entire group; that is, while the group leader is disabled, none of the
|
|
counters in the group will count.
|
|
Enabling or disabling a member of a group other than the leader only
|
|
affects that counter; disabling a non-leader
|
|
stops that counter from counting but doesn't affect any other counter.
|
|
|
|
The ioctl argument is ignored.
|
|
|
|
.TP
|
|
.B PERF_EVENT_IOC_REFRESH
|
|
Non-inherited overflow counters can use this
|
|
to enable a counter for a number of overflows specified by the argument,
|
|
after which it is disabled.
|
|
Subsequent calls of this ioctl add the argument value to the current
|
|
count.
|
|
A signal with
|
|
.B POLL_IN
|
|
set will happen on each overflow until the
|
|
count reaches 0; when that happens a signal with
|
|
.B POLL_HUP
|
|
set is sent and the event is disabled.
|
|
Using an argument of 0 is considered undefined behavior.
|
|
|
|
.TP
|
|
.B PERF_EVENT_IOC_RESET
|
|
Reset the event count specified by the
|
|
file descriptor argument to zero.
|
|
This only resets the counts; there is no way to reset the
|
|
multiplexing
|
|
.I time_enabled
|
|
or
|
|
.I time_running
|
|
values.
|
|
When sent to a group leader, only
|
|
the leader is reset (child events are not).
|
|
|
|
The ioctl argument is ignored.
|
|
|
|
.TP
|
|
.B PERF_EVENT_IOC_PERIOD
|
|
IOC_PERIOD is the command to update the period; it
|
|
does not update the current period but instead defers the change until the next period.
|
|
|
|
The argument is a pointer to a 64-bit value containing the
|
|
desired new period.
|
|
|
|
.TP
|
|
.B PERF_EVENT_IOC_SET_OUTPUT
|
|
This tells the kernel to report event notifications to the specified
|
|
file descriptor rather than the default one.
|
|
The file descriptors must all be on the same CPU.
|
|
|
|
The argument specifies the desired file descriptor, or \-1 if
|
|
output should be ignored.
|
|
|
|
.TP
|
|
.BR PERF_EVENT_IOC_SET_FILTER " (Since Linux 2.6.33)"
|
|
This adds an ftrace filter to this event.
|
|
|
|
The argument is a pointer to the desired ftrace filter.
|
|
|
|
.SS "Using prctl"
|
|
A process can enable or disable all the event groups that are
|
|
attached to it using the
|
|
.BR prctl (2)
|
|
.B PR_TASK_PERF_EVENTS_ENABLE
|
|
and
|
|
.B PR_TASK_PERF_EVENTS_DISABLE
|
|
operations.
|
|
This applies to all counters on the current process, whether created by
|
|
this process or by another, and does not affect any counters that this
|
|
process has created on other processes.
|
|
It only enables or disables
|
|
the group leaders, not any other members in the groups.
|
|
|
|
.SS perf_event related configuration files
|
|
|
|
Files in
|
|
.I /proc/sys/kernel/
|
|
|
|
.RS 4
|
|
.TP
|
|
.I /proc/sys/kernel/perf_event_paranoid
|
|
|
|
The
|
|
.I perf_event_paranoid
|
|
file can be set to restrict access to the performance counters.
|
|
|
|
2 - only allow user-space measurements
|
|
|
|
1 - (default) allow both kernel and user measurements
|
|
|
|
0 - allow access to CPU-specific data but not raw tracepoint samples
|
|
|
|
\-1 - no restrictions
|
|
|
|
The existence of the
|
|
.I perf_event_paranoid
|
|
file is the official method for determining if a kernel supports
|
|
.BR perf_event_open ().
|
|
|
|
.TP
|
|
.I /proc/sys/kernel/perf_event_max_sample_rate
|
|
|
|
This sets the maximum sample rate.
|
|
Setting this too high can allow
|
|
users to sample at a rate that impacts overall machine performance
|
|
and potentially lock up the machine.
|
|
The default value is
|
|
100000 (samples per second).
|
|
|
|
.TP
|
|
.I /proc/sys/kernel/perf_event_mlock_kb
|
|
|
|
Maximum number of pages an unprivileged user can
.BR mlock (2).
|
|
The default is 516 (kB).
|
|
.RE
|
|
|
|
Files in
|
|
.I /sys/bus/event_source/devices/
|
|
|
|
.RS 4
|
|
Since Linux 2.6.34 the kernel supports having multiple PMUs
|
|
available for monitoring.
|
|
Information on how to program these PMUs can be found under
|
|
.IR /sys/bus/event_source/devices/ .
|
|
Each subdirectory corresponds to a different PMU.
|
|
|
|
.TP
|
|
.I /sys/bus/event_source/devices/*/type
|
|
This contains an integer that can be used in the
|
|
.I type
|
|
field of perf_event_attr to indicate you wish to use this PMU.
|
|
|
|
.TP
|
|
.I /sys/bus/event_source/devices/*/rdpmc
|
|
[To be documented]
|
|
|
|
.TP
|
|
.I /sys/bus/event_source/devices/*/format/
|
|
This sub-directory contains information on what bits in the
|
|
.I config
|
|
field of perf_event_attr correspond to.
|
|
|
|
.TP
|
|
.I /sys/bus/event_source/devices/*/events/
|
|
This sub-directory contains files with pre-defined events.
|
|
The contents are strings describing the event settings
|
|
expressed in terms of the fields found in the
|
|
.I ./format/
|
|
directory.
|
|
These are not necessarily complete lists of all events supported by
|
|
a PMU, but usually a subset of events deemed useful or interesting.
|
|
|
|
.TP
|
|
.I /sys/bus/event_source/devices/*/uevent
|
|
[To be documented]
|
|
|
|
.RE
|
|
|
|
|
|
.SH "RETURN VALUE"
|
|
.BR perf_event_open ()
|
|
returns the new file descriptor, or \-1 if an error occurred
|
|
(in which case,
|
|
.I errno
|
|
is set appropriately).
|
|
.SH ERRORS
|
|
.TP
|
|
.B EINVAL
|
|
Returned if the specified event is not available.
|
|
.TP
|
|
.B ENOSPC
|
|
Prior to Linux 3.3, if there was not enough room for the event,
|
|
.B ENOSPC
|
|
was returned.
|
|
Linus did not like this, and this was changed to
|
|
.BR EINVAL .
|
|
.B ENOSPC
|
|
is still returned if you try to read results into
|
|
too small of a buffer.
|
|
|
|
.SH VERSIONS
|
|
|
|
.BR perf_event_open ()
|
|
was introduced in Linux 2.6.31 but was called
|
|
.BR perf_counter_open ().
|
|
It was renamed in Linux 2.6.32.
|
|
|
|
.SH CONFORMING TO
|
|
|
|
This
|
|
.BR perf_event_open ()
|
|
system call is Linux-specific
|
|
and should not be used in programs intended to be portable.
|
|
|
|
.SH NOTES
|
|
Glibc does not provide a wrapper for this system call; call it using
|
|
.BR syscall (2).
|
|
See the example below.
|
|
|
|
The official way of knowing if
|
|
.BR perf_event_open ()
|
|
support is enabled is checking
|
|
for the existence of the file
|
|
.IR /proc/sys/kernel/perf_event_paranoid .
|
|
|
|
.SH BUGS
|
|
|
|
The
|
|
.B F_SETOWN_EX
|
|
option to
|
|
.BR fcntl (2)
|
|
is needed to properly get overflow signals in threads.
|
|
This was introduced in Linux 2.6.32.
|
|
|
|
Prior to Linux 2.6.33 (at least for x86) the kernel did not check
|
|
if events could be scheduled together until read time.
|
|
The same happens on all known kernels if the NMI watchdog is enabled.
|
|
This means to see if a given set of events works you have to
|
|
.BR perf_event_open (),
|
|
start, then read before you know for sure you
|
|
can get valid measurements.
|
|
|
|
Prior to Linux 2.6.34 event constraints were not enforced by the kernel.
|
|
In that case, some events would silently return "0" if the kernel
|
|
scheduled them in an improper counter slot.
|
|
|
|
Prior to Linux 2.6.34 there was a bug when multiplexing where the
|
|
wrong results could be returned.
|
|
|
|
Kernels from Linux 2.6.35 to Linux 2.6.39 can quickly crash the kernel if
|
|
"inherit" is enabled and many threads are started.
|
|
|
|
Prior to Linux 2.6.35,
|
|
.B PERF_FORMAT_GROUP
|
|
did not work with attached processes.
|
|
|
|
In older Linux 2.6 versions,
|
|
refreshing an event group leader refreshed all siblings,
|
|
and refreshing with a parameter of 0 enabled infinite refresh.
|
|
This behavior is unsupported and should not be relied on.
|
|
|
|
There is a bug in the kernel code between
|
|
Linux 2.6.36 and Linux 3.0 that ignores the
|
|
"watermark" field and acts as if a wakeup_event
|
|
was chosen if the union has a
|
|
non-zero value in it.
|
|
|
|
Always double-check your results!
|
|
Various generalized events have had wrong values.
|
|
For example, retired branches measured
|
|
the wrong thing on AMD machines until Linux 2.6.35.
|
|
|
|
.SH EXAMPLE
|
|
The following is a short example that measures the total
|
|
instruction count of a call to
|
|
.BR printf (3).
|
|
.nf
|
|
|
|
#include <stdlib.h>
|
|
#include <stdio.h>
|
|
#include <unistd.h>
|
|
#include <string.h>
|
|
#include <sys/ioctl.h>
|
|
#include <linux/perf_event.h>
|
|
#include <asm/unistd.h>
|
|
|
|
long
|
|
perf_event_open(struct perf_event_attr *hw_event, pid_t pid,
|
|
int cpu, int group_fd, unsigned long flags)
|
|
{
|
|
int ret;
|
|
|
|
ret = syscall(__NR_perf_event_open, hw_event, pid, cpu,
|
|
group_fd, flags);
|
|
return ret;
|
|
}
|
|
|
|
|
|
int
|
|
main(int argc, char **argv)
|
|
{
|
|
|
|
struct perf_event_attr pe;
|
|
long long count;
|
|
int fd;
|
|
|
|
memset(&pe, 0, sizeof(struct perf_event_attr));
|
|
pe.type = PERF_TYPE_HARDWARE;
|
|
pe.size = sizeof(struct perf_event_attr);
|
|
pe.config = PERF_COUNT_HW_INSTRUCTIONS;
|
|
pe.disabled = 1;
|
|
pe.exclude_kernel = 1;
|
|
pe.exclude_hv = 1;
|
|
|
|
fd = perf_event_open(&pe, 0, \-1, \-1, 0);
|
|
if (fd == \-1) {
|
|
fprintf(stderr, "Error opening leader %llx\\n", pe.config);
|
|
exit(EXIT_FAILURE);
|
|
}
|
|
|
|
ioctl(fd, PERF_EVENT_IOC_RESET, 0);
|
|
ioctl(fd, PERF_EVENT_IOC_ENABLE, 0);
|
|
|
|
printf("Measuring instruction count for this printf\\n");
|
|
|
|
ioctl(fd, PERF_EVENT_IOC_DISABLE, 0);
|
|
read(fd, &count, sizeof(long long));
|
|
|
|
printf("Used %lld instructions\\n", count);
|
|
|
|
close(fd);
|
|
}
|
|
.fi
|
|
|
|
.SH "SEE ALSO"
|
|
.BR fcntl (2),
|
|
.BR mmap (2),
|
|
.BR open (2),
|
|
.BR prctl (2),
|
|
.BR read (2)
|