mirror of https://github.com/mkerrisk/man-pages
2287 lines
56 KiB
Groff
2287 lines
56 KiB
Groff
.\" Copyright (c) 2012, Vincent Weaver
|
|
.\"
|
|
.\" %%%LICENSE_START(GPLv2+_DOC_FULL)
|
|
.\" This is free documentation; you can redistribute it and/or
|
|
.\" modify it under the terms of the GNU General Public License as
|
|
.\" published by the Free Software Foundation; either version 2 of
|
|
.\" the License, or (at your option) any later version.
|
|
.\"
|
|
.\" The GNU General Public License's references to "object code"
|
|
.\" and "executables" are to be interpreted as the output of any
|
|
.\" document formatting or typesetting system, including
|
|
.\" intermediate and printed output.
|
|
.\"
|
|
.\" This manual is distributed in the hope that it will be useful,
|
|
.\" but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
.\" MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
.\" GNU General Public License for more details.
|
|
.\"
|
|
.\" You should have received a copy of the GNU General Public
|
|
.\" License along with this manual; if not, see
|
|
.\" <http://www.gnu.org/licenses/>.
|
|
.\" %%%LICENSE_END
|
|
.\"
|
|
.\" This document is based on the perf_event.h header file, the
|
|
.\" tools/perf/design.txt file, and a lot of bitter experience.
|
|
.\"
|
|
.TH PERF_EVENT_OPEN 2 2013-09-13 "Linux" "Linux Programmer's Manual"
|
|
.SH NAME
|
|
perf_event_open \- set up performance monitoring
|
|
.SH SYNOPSIS
|
|
.nf
|
|
.B #include <linux/perf_event.h>
|
|
.B #include <linux/hw_breakpoint.h>
|
|
.sp
|
|
.BI "int perf_event_open(struct perf_event_attr *" attr ,
|
|
.BI " pid_t " pid ", int " cpu ", int " group_fd ,
|
|
.BI " unsigned long " flags );
|
|
.fi
|
|
|
|
.IR Note :
|
|
There is no glibc wrapper for this system call; see NOTES.
|
|
.SH DESCRIPTION
|
|
Given a list of parameters,
|
|
.BR perf_event_open ()
|
|
returns a file descriptor, for use in subsequent system calls
|
|
.RB ( read "(2), " mmap "(2), " prctl "(2), " fcntl "(2), etc.)."
|
|
.PP
|
|
A call to
|
|
.BR perf_event_open ()
|
|
creates a file descriptor that allows measuring performance
|
|
information.
|
|
Each file descriptor corresponds to one
|
|
event that is measured; these can be grouped together
|
|
to measure multiple events simultaneously.
|
|
.PP
|
|
Events can be enabled and disabled in two ways: via
|
|
.BR ioctl (2)
|
|
and via
|
|
.BR prctl (2) .
|
|
When an event is disabled it does not count or generate overflows but does
|
|
continue to exist and maintain its count value.
|
|
.PP
|
|
Events come in two flavors: counting and sampled.
|
|
A
|
|
.I counting
|
|
event is one that is used for counting the aggregate number of events
|
|
that occur.
|
|
In general, counting event results are gathered with a
|
|
.BR read (2)
|
|
call.
|
|
A
|
|
.I sampling
|
|
event periodically writes measurements to a buffer that can then
|
|
be accessed via
|
|
.BR mmap (2) .
|
|
.SS Arguments
|
|
.P
|
|
The argument
|
|
.I pid
|
|
allows events to be attached to processes in various ways.
|
|
If
|
|
.I pid
|
|
is 0, measurements happen on the current thread, if
|
|
.I pid
|
|
is greater than 0, the process indicated by
|
|
.I pid
|
|
is measured, and if
|
|
.I pid
|
|
is \-1, all processes are counted.
|
|
|
|
The
|
|
.I cpu
|
|
argument allows measurements to be specific to a CPU.
|
|
If
|
|
.I cpu
|
|
is greater than or equal to 0,
|
|
measurements are restricted to the specified CPU;
|
|
if
|
|
.I cpu
|
|
is \-1, the events are measured on all CPUs.
|
|
.P
|
|
Note that the combination of
|
|
.IR pid " == \-1"
|
|
and
|
|
.IR cpu " == \-1"
|
|
is not valid.
|
|
.P
|
|
A
|
|
.IR pid " > 0"
|
|
and
|
|
.IR cpu " == \-1"
|
|
setting measures per-process and follows that process to whatever CPU the
|
|
process gets scheduled to.
|
|
Per-process events can be created by any user.
|
|
.P
|
|
A
|
|
.IR pid " == \-1"
|
|
and
|
|
.IR cpu " >= 0"
|
|
setting is per-CPU and measures all processes on the specified CPU.
|
|
Per-CPU events need the
|
|
.B CAP_SYS_ADMIN
|
|
capability or a
|
|
.I /proc/sys/kernel/perf_event_paranoid
|
|
value of less than 1.
|
|
.P
|
|
The
|
|
.I group_fd
|
|
argument allows event groups to be created.
|
|
An event group has one event which is the group leader.
|
|
The leader is created first, with
|
|
.IR group_fd " = \-1."
|
|
The rest of the group members are created with subsequent
|
|
.BR perf_event_open ()
|
|
calls with
|
|
.IR group_fd
|
|
being set to the fd of the group leader.
|
|
(A single event on its own is created with
|
|
.IR group_fd " = \-1"
|
|
and is considered to be a group with only 1 member.)
|
|
An event group is scheduled onto the CPU as a unit: it will
|
|
be put onto the CPU only if all of the events in the group can be put onto
|
|
the CPU.
|
|
This means that the values of the member events can be
|
|
meaningfully compared, added, divided (to get ratios), etc., with each
|
|
other, since they have counted events for the same set of executed
|
|
instructions.
|
|
.P
|
|
The
|
|
.I flags
|
|
argument is formed by ORing together zero or more of the following values:
|
|
.TP
|
|
.BR PERF_FLAG_FD_NO_GROUP
|
|
.\" FIXME The following sentence is unclear
|
|
This flag allows creating an event as part of an event group but
|
|
having no group leader.
|
|
It is unclear why this is useful.
|
|
.\" FIXME So, why is it useful?
|
|
.TP
|
|
.BR PERF_FLAG_FD_OUTPUT
|
|
This flag re-routes the output from an event to the group leader.
|
|
.TP
|
|
.BR PERF_FLAG_PID_CGROUP " (Since Linux 2.6.39)"
|
|
This flag activates per-container system-wide monitoring.
|
|
A container
|
|
is an abstraction that isolates a set of resources for finer grain
|
|
control (CPUs, memory, etc.).
|
|
In this mode, the event is measured
|
|
only if the thread running on the monitored CPU belongs to the designated
|
|
container (cgroup).
|
|
The cgroup is identified by passing a file descriptor
|
|
opened on its directory in the cgroupfs filesystem.
|
|
For instance, if the
|
|
cgroup to monitor is called
|
|
.IR test ,
|
|
then a file descriptor opened on
|
|
.I /dev/cgroup/test
|
|
(assuming cgroupfs is mounted on
|
|
.IR /dev/cgroup )
|
|
must be passed as the
|
|
.I pid
|
|
parameter.
|
|
cgroup monitoring is available only
|
|
for system-wide events and may therefore require extra permissions.
|
|
.P
|
|
The
|
|
.I perf_event_attr
|
|
structure provides detailed configuration information
|
|
for the event being created.
|
|
|
|
.in +4n
|
|
.nf
|
|
struct perf_event_attr {
|
|
__u32 type; /* Type of event */
|
|
__u32 size; /* Size of attribute structure */
|
|
__u64 config; /* Type-specific configuration */
|
|
|
|
union {
|
|
__u64 sample_period; /* Period of sampling */
|
|
__u64 sample_freq; /* Frequency of sampling */
|
|
};
|
|
|
|
__u64 sample_type; /* Specifies values included in sample */
|
|
__u64 read_format; /* Specifies values returned in read */
|
|
|
|
__u64 disabled : 1, /* off by default */
|
|
inherit : 1, /* children inherit it */
|
|
pinned : 1, /* must always be on PMU */
|
|
exclusive : 1, /* only group on PMU */
|
|
exclude_user : 1, /* don't count user */
|
|
exclude_kernel : 1, /* don't count kernel */
|
|
exclude_hv : 1, /* don't count hypervisor */
|
|
exclude_idle : 1, /* don't count when idle */
|
|
mmap : 1, /* include mmap data */
|
|
comm : 1, /* include comm data */
|
|
freq : 1, /* use freq, not period */
|
|
inherit_stat : 1, /* per task counts */
|
|
enable_on_exec : 1, /* next exec enables */
|
|
task : 1, /* trace fork/exit */
|
|
watermark : 1, /* wakeup_watermark */
|
|
precise_ip : 2, /* skid constraint */
|
|
mmap_data : 1, /* non-exec mmap data */
|
|
sample_id_all : 1, /* sample_type all events */
|
|
exclude_host : 1, /* don't count in host */
|
|
exclude_guest : 1, /* don't count in guest */
|
|
exclude_callchain_kernel : 1,
|
|
/* exclude kernel callchains */
|
|
exclude_callchain_user : 1,
|
|
/* exclude user callchains */
|
|
__reserved_1 : 41;
|
|
|
|
union {
|
|
__u32 wakeup_events; /* wakeup every n events */
|
|
__u32 wakeup_watermark; /* bytes before wakeup */
|
|
};
|
|
|
|
__u32 bp_type; /* breakpoint type */
|
|
|
|
union {
|
|
__u64 bp_addr; /* breakpoint address */
|
|
__u64 config1; /* extension of config */
|
|
};
|
|
|
|
union {
|
|
__u64 bp_len; /* breakpoint length */
|
|
__u64 config2; /* extension of config1 */
|
|
};
|
|
__u64 branch_sample_type; /* enum perf_branch_sample_type */
|
|
__u64 sample_regs_user; /* user regs to dump on samples */
|
|
__u32 sample_stack_user; /* size of stack to dump on
|
|
samples */
|
|
__u32 __reserved_2; /* Align to u64 */
|
|
|
|
};
|
|
.fi
|
|
.in
|
|
|
|
The fields of the
|
|
.I perf_event_attr
|
|
structure are described in more detail below:
|
|
.TP
|
|
.I type
|
|
This field specifies the overall event type.
|
|
It has one of the following values:
|
|
.RS
|
|
.TP
|
|
.B PERF_TYPE_HARDWARE
|
|
This indicates one of the "generalized" hardware events provided
|
|
by the kernel.
|
|
See the
|
|
.I config
|
|
field definition for more details.
|
|
.TP
|
|
.B PERF_TYPE_SOFTWARE
|
|
This indicates one of the software-defined events provided by the kernel
|
|
(even if no hardware support is available).
|
|
.TP
|
|
.B PERF_TYPE_TRACEPOINT
|
|
This indicates a tracepoint
|
|
provided by the kernel tracepoint infrastructure.
|
|
.TP
|
|
.B PERF_TYPE_HW_CACHE
|
|
This indicates a hardware cache event.
|
|
This has a special encoding, described in the
|
|
.I config
|
|
field definition.
|
|
.TP
|
|
.B PERF_TYPE_RAW
|
|
This indicates a "raw" implementation-specific event in the
|
|
.IR config " field."
|
|
.TP
|
|
.BR PERF_TYPE_BREAKPOINT " (Since Linux 2.6.33)"
|
|
This indicates a hardware breakpoint as provided by the CPU.
|
|
Breakpoints can be read/write accesses to an address as well as
|
|
execution of an instruction address.
|
|
.TP
|
|
.RB "dynamic PMU"
|
|
Since Linux 2.6.39,
|
|
.BR perf_event_open ()
|
|
can support multiple PMUs.
|
|
To enable this, a value exported by the kernel can be used in the
|
|
.I type
|
|
field to indicate which PMU to use.
|
|
The value to use can be found in the sysfs filesystem:
|
|
there is a subdirectory per PMU instance under
|
|
.IR /sys/bus/event_source/devices .
|
|
In each sub-directory there is a
|
|
.I type
|
|
file whose content is an integer that can be used in the
|
|
.I type
|
|
field.
|
|
For instance,
|
|
.I /sys/bus/event_source/devices/cpu/type
|
|
contains the value for the core CPU PMU, which is usually 4.
|
|
.RE
|
|
.TP
|
|
.I "size"
|
|
The size of the
|
|
.I perf_event_attr
|
|
structure for forward/backward compatibility.
|
|
Set this using
|
|
.I sizeof(struct perf_event_attr)
|
|
to allow the kernel to see
|
|
the struct size at the time of compilation.
|
|
|
|
The related define
|
|
.B PERF_ATTR_SIZE_VER0
|
|
is set to 64; this was the size of the first published struct.
|
|
.B PERF_ATTR_SIZE_VER1
|
|
is 72, corresponding to the addition of breakpoints in Linux 2.6.33.
|
|
.B PERF_ATTR_SIZE_VER2
|
|
is 80 corresponding to the addition of branch sampling in Linux 3.4.
|
|
.B PERF_ATTR_SIZE_VER3
|
|
is 96 corresponding to the addition
|
|
of
|
|
.I sample_regs_user
|
|
and
|
|
.I sample_stack_user
|
|
in Linux 3.7.
|
|
.TP
|
|
.I "config"
|
|
This specifies which event you want, in conjunction with
|
|
the
|
|
.I type
|
|
field.
|
|
The
|
|
.IR config1 " and " config2
|
|
fields are also taken into account in cases where 64 bits is not
|
|
enough to fully specify the event.
|
|
The encoding of these fields is event dependent.
|
|
|
|
The most significant bit (bit 63) of
|
|
.I config
|
|
signifies CPU-specific (raw) counter configuration data;
|
|
if the most significant bit is unset, the next 7 bits are an event
|
|
type and the rest of the bits are the event identifier.
|
|
|
|
There are various ways to set the
|
|
.I config
|
|
field that are dependent on the value of the previously
|
|
described
|
|
.I type
|
|
field.
|
|
What follows are various possible settings for
|
|
.I config
|
|
separated out by
|
|
.IR type .
|
|
|
|
If
|
|
.I type
|
|
is
|
|
.BR PERF_TYPE_HARDWARE ,
|
|
we are measuring one of the generalized hardware CPU events.
|
|
Not all of these are available on all platforms.
|
|
Set
|
|
.I config
|
|
to one of the following:
|
|
.RS 12
|
|
.TP
|
|
.B PERF_COUNT_HW_CPU_CYCLES
|
|
Total cycles.
|
|
Be wary of what happens during CPU frequency scaling.
|
|
.TP
|
|
.B PERF_COUNT_HW_INSTRUCTIONS
|
|
Retired instructions.
|
|
Be careful, these can be affected by various
|
|
issues, most notably hardware interrupt counts.
|
|
.TP
|
|
.B PERF_COUNT_HW_CACHE_REFERENCES
|
|
Cache accesses.
|
|
Usually this indicates Last Level Cache accesses but this may
|
|
vary depending on your CPU.
|
|
This may include prefetches and coherency messages; again this
|
|
depends on the design of your CPU.
|
|
.TP
|
|
.B PERF_COUNT_HW_CACHE_MISSES
|
|
Cache misses.
|
|
Usually this indicates Last Level Cache misses; this is intended to be
|
|
used in conjunction with the
|
|
.B PERF_COUNT_HW_CACHE_REFERENCES
|
|
event to calculate cache miss rates.
|
|
.TP
|
|
.B PERF_COUNT_HW_BRANCH_INSTRUCTIONS
|
|
Retired branch instructions.
|
|
Prior to Linux 2.6.34, this used
|
|
the wrong event on AMD processors.
|
|
.TP
|
|
.B PERF_COUNT_HW_BRANCH_MISSES
|
|
Mispredicted branch instructions.
|
|
.TP
|
|
.B PERF_COUNT_HW_BUS_CYCLES
|
|
Bus cycles, which can be different from total cycles.
|
|
.TP
|
|
.BR PERF_COUNT_HW_STALLED_CYCLES_FRONTEND " (Since Linux 3.0)"
|
|
Stalled cycles during issue.
|
|
.TP
|
|
.BR PERF_COUNT_HW_STALLED_CYCLES_BACKEND " (Since Linux 3.0)"
|
|
Stalled cycles during retirement.
|
|
.TP
|
|
.BR PERF_COUNT_HW_REF_CPU_CYCLES " (Since Linux 3.3)"
|
|
Total cycles; not affected by CPU frequency scaling.
|
|
.RE
|
|
.IP
|
|
If
|
|
.I type
|
|
is
|
|
.BR PERF_TYPE_SOFTWARE ,
|
|
we are measuring software events provided by the kernel.
|
|
Set
|
|
.I config
|
|
to one of the following:
|
|
.RS 12
|
|
.TP
|
|
.B PERF_COUNT_SW_CPU_CLOCK
|
|
This reports the CPU clock, a high-resolution per-CPU timer.
|
|
.TP
|
|
.B PERF_COUNT_SW_TASK_CLOCK
|
|
This reports a clock count specific to the task that is running.
|
|
.TP
|
|
.B PERF_COUNT_SW_PAGE_FAULTS
|
|
This reports the number of page faults.
|
|
.TP
|
|
.B PERF_COUNT_SW_CONTEXT_SWITCHES
|
|
This counts context switches.
|
|
Until Linux 2.6.34, these were all reported as user-space
|
|
events, after that they are reported as happening in the kernel.
|
|
.TP
|
|
.B PERF_COUNT_SW_CPU_MIGRATIONS
|
|
This reports the number of times the process
|
|
has migrated to a new CPU.
|
|
.TP
|
|
.B PERF_COUNT_SW_PAGE_FAULTS_MIN
|
|
This counts the number of minor page faults.
|
|
These did not require disk I/O to handle.
|
|
.TP
|
|
.B PERF_COUNT_SW_PAGE_FAULTS_MAJ
|
|
This counts the number of major page faults.
|
|
These required disk I/O to handle.
|
|
.TP
|
|
.BR PERF_COUNT_SW_ALIGNMENT_FAULTS " (Since Linux 2.6.33)"
|
|
This counts the number of alignment faults.
|
|
These happen when unaligned memory accesses happen; the kernel
|
|
can handle these but it reduces performance.
|
|
This happens only on some architectures (never on x86).
|
|
.TP
|
|
.BR PERF_COUNT_SW_EMULATION_FAULTS " (Since Linux 2.6.33)"
|
|
This counts the number of emulation faults.
|
|
The kernel sometimes traps on unimplemented instructions
|
|
and emulates them for user space.
|
|
This can negatively impact performance.
|
|
.RE
|
|
|
|
.RS
|
|
If
|
|
.I type
|
|
is
|
|
.BR PERF_TYPE_TRACEPOINT ,
|
|
then we are measuring kernel tracepoints.
|
|
The value to use in
|
|
.I config
|
|
can be obtained from under debugfs
|
|
.I tracing/events/*/*/id
|
|
if ftrace is enabled in the kernel.
|
|
.RE
|
|
|
|
.RS
|
|
If
|
|
.I type
|
|
is
|
|
.BR PERF_TYPE_HW_CACHE ,
|
|
then we are measuring a hardware CPU cache event.
|
|
To calculate the appropriate
|
|
.I config
|
|
value use the following equation:
|
|
.RS 4
|
|
.nf
|
|
|
|
(perf_hw_cache_id) | (perf_hw_cache_op_id << 8) |
|
|
(perf_hw_cache_op_result_id << 16)
|
|
.fi
|
|
.P
|
|
where
|
|
.I perf_hw_cache_id
|
|
is one of:
|
|
.RS 4
|
|
.TP
|
|
.B PERF_COUNT_HW_CACHE_L1D
|
|
for measuring Level 1 Data Cache
|
|
.TP
|
|
.B PERF_COUNT_HW_CACHE_L1I
|
|
for measuring Level 1 Instruction Cache
|
|
.TP
|
|
.B PERF_COUNT_HW_CACHE_LL
|
|
for measuring Last-Level Cache
|
|
.TP
|
|
.B PERF_COUNT_HW_CACHE_DTLB
|
|
for measuring the Data TLB
|
|
.TP
|
|
.B PERF_COUNT_HW_CACHE_ITLB
|
|
for measuring the Instruction TLB
|
|
.TP
|
|
.B PERF_COUNT_HW_CACHE_BPU
|
|
for measuring the branch prediction unit
|
|
.TP
|
|
.BR PERF_COUNT_HW_CACHE_NODE " (Since Linux 3.0)"
|
|
for measuring local memory accesses
|
|
.RE
|
|
.P
|
|
and
|
|
.I perf_hw_cache_op_id
|
|
is one of
|
|
.RS 4
|
|
.TP
|
|
.B PERF_COUNT_HW_CACHE_OP_READ
|
|
for read accesses
|
|
.TP
|
|
.B PERF_COUNT_HW_CACHE_OP_WRITE
|
|
for write accesses
|
|
.TP
|
|
.B PERF_COUNT_HW_CACHE_OP_PREFETCH
|
|
for prefetch accesses
|
|
.RE
|
|
.P
|
|
and
|
|
.I perf_hw_cache_op_result_id
|
|
is one of
|
|
.RS 4
|
|
.TP
|
|
.B PERF_COUNT_HW_CACHE_RESULT_ACCESS
|
|
to measure accesses
|
|
.TP
|
|
.B PERF_COUNT_HW_CACHE_RESULT_MISS
|
|
to measure misses
|
|
.RE
|
|
.RE
|
|
|
|
If
|
|
.I type
|
|
is
|
|
.BR PERF_TYPE_RAW ,
|
|
then a custom "raw"
|
|
.I config
|
|
value is needed.
|
|
Most CPUs support events that are not covered by the "generalized" events.
|
|
These are implementation defined; see your CPU manual (for example
|
|
the Intel Volume 3B documentation or the AMD BIOS and Kernel Developer
|
|
Guide).
|
|
The libpfm4 library can be used to translate from the name in the
|
|
architectural manuals to the raw hex value
|
|
.BR perf_event_open ()
|
|
expects in this field.
|
|
|
|
If
|
|
.I type
|
|
is
|
|
.BR PERF_TYPE_BREAKPOINT ,
|
|
then leave
|
|
.I config
|
|
set to zero.
|
|
Its parameters are set in other places.
|
|
.RE
|
|
.TP
|
|
.IR sample_period ", " sample_freq
|
|
A "sampling" counter is one that generates an interrupt
|
|
every N events, where N is given by
|
|
.IR sample_period .
|
|
A sampling counter has
|
|
.IR sample_period " > 0."
|
|
When an overflow interrupt occurs, requested data is recorded
|
|
in the mmap buffer.
|
|
The
|
|
.I sample_type
|
|
field controls what data is recorded on each interrupt.
|
|
|
|
.I sample_freq
|
|
can be used if you wish to use frequency rather than period.
|
|
In this case you set the
|
|
.I freq
|
|
flag.
|
|
The kernel will adjust the sampling period
|
|
to try and achieve the desired rate.
|
|
The rate of adjustment is a
|
|
timer tick.
|
|
.TP
|
|
.I "sample_type"
|
|
The various bits in this field specify which values to include
|
|
in the sample.
|
|
They will be recorded in a ring-buffer,
|
|
which is available to user space using
|
|
.BR mmap (2).
|
|
The order in which the values are saved in the
|
|
sample is documented in the MMAP Layout subsection below;
|
|
it is not the
|
|
.I "enum perf_event_sample_format"
|
|
order.
|
|
.RS
|
|
.TP
|
|
.B PERF_SAMPLE_IP
|
|
Records instruction pointer.
|
|
.TP
|
|
.B PERF_SAMPLE_TID
|
|
Records the process and thread IDs.
|
|
.TP
|
|
.B PERF_SAMPLE_TIME
|
|
Records a timestamp.
|
|
.TP
|
|
.B PERF_SAMPLE_ADDR
|
|
Records an address, if applicable.
|
|
.TP
|
|
.B PERF_SAMPLE_READ
|
|
Records counter values for all events in a group, not just the group leader.
|
|
.TP
|
|
.B PERF_SAMPLE_CALLCHAIN
|
|
Records the callchain (stack backtrace).
|
|
.TP
|
|
.B PERF_SAMPLE_ID
|
|
Records a unique ID for the opened event's group leader.
|
|
.TP
|
|
.B PERF_SAMPLE_CPU
|
|
Records CPU number.
|
|
.TP
|
|
.B PERF_SAMPLE_PERIOD
|
|
Records the current sampling period.
|
|
.TP
|
|
.B PERF_SAMPLE_STREAM_ID
|
|
Records a unique ID for the opened event.
|
|
Unlike
|
|
.B PERF_SAMPLE_ID
|
|
the actual ID is returned, not the group leader.
|
|
This ID is the same as the one returned by PERF_FORMAT_ID.
|
|
.TP
|
|
.B PERF_SAMPLE_RAW
|
|
Records additional data, if applicable.
|
|
Usually returned by tracepoint events.
|
|
.TP
|
|
.BR PERF_SAMPLE_BRANCH_STACK " (Since Linux 3.4)"
|
|
This provides a record of recent branches, as provided
|
|
by CPU branch sampling hardware (such as Intel Last Branch Record).
|
|
Not all hardware supports this feature.
|
|
|
|
See the
|
|
.I branch_sample_type
|
|
field for how to filter which branches are reported.
|
|
.TP
|
|
.BR PERF_SAMPLE_REGS_USER " (Since Linux 3.7)"
|
|
Records the current user-level CPU register state
|
|
(the values in the process before the kernel was called).
|
|
.TP
|
|
.BR PERF_SAMPLE_STACK_USER " (Since Linux 3.7)"
|
|
Records the user level stack, allowing stack unwinding.
|
|
.TP
|
|
.BR PERF_SAMPLE_WEIGHT " (Since Linux 3.10)"
|
|
Records a hardware provided weight value that expresses how
|
|
costly the sampled event was.
|
|
This allows the hardware to highlight expensive events in
|
|
a profile.
|
|
.TP
|
|
.BR PERF_SAMPLE_DATA_SRC " (Since Linux 3.10)"
|
|
Records the data source: where in the memory hierarchy
|
|
the data associated with the sampled instruction came from.
|
|
This is only available if the underlying hardware
|
|
supports this feature.
|
|
.RE
|
|
.TP
|
|
.IR "read_format"
|
|
This field specifies the format of the data returned by
|
|
.BR read (2)
|
|
on a
|
|
.BR perf_event_open ()
|
|
file descriptor.
|
|
.RS
|
|
.TP
|
|
.B PERF_FORMAT_TOTAL_TIME_ENABLED
|
|
Adds the 64-bit
|
|
.I time_enabled
|
|
field.
|
|
This can be used to calculate estimated totals if
|
|
the PMU is overcommitted and multiplexing is happening.
|
|
.TP
|
|
.B PERF_FORMAT_TOTAL_TIME_RUNNING
|
|
Adds the 64-bit
|
|
.I time_running
|
|
field.
|
|
This can be used to calculate estimated totals if
|
|
the PMU is overcommitted and multiplexing is happening.
|
|
.TP
|
|
.B PERF_FORMAT_ID
|
|
Adds a 64-bit unique value that corresponds to the event group.
|
|
.TP
|
|
.B PERF_FORMAT_GROUP
|
|
Allows all counter values in an event group to be read with one read.
|
|
.RE
|
|
.TP
|
|
.IR "disabled"
|
|
The
|
|
.I disabled
|
|
bit specifies whether the counter starts out disabled or enabled.
|
|
If disabled, the event can later be enabled by
|
|
.BR ioctl (2),
|
|
.BR prctl (2),
|
|
or
|
|
.IR enable_on_exec .
|
|
.TP
|
|
.IR "inherit"
|
|
The
|
|
.I inherit
|
|
bit specifies that this counter should count events of child
|
|
tasks as well as the task specified.
|
|
This applies only to new children, not to any existing children at
|
|
the time the counter is created (nor to any new children of
|
|
existing children).
|
|
|
|
Inherit does not work for some combinations of
|
|
.IR read_format s,
|
|
such as
|
|
.BR PERF_FORMAT_GROUP .
|
|
.TP
|
|
.IR "pinned"
|
|
The
|
|
.I pinned
|
|
bit specifies that the counter should always be on the CPU if at all
|
|
possible.
|
|
It applies only to hardware counters and only to group leaders.
|
|
If a pinned counter cannot be put onto the CPU (e.g., because there are
|
|
not enough hardware counters or because of a conflict with some other
|
|
event), then the counter goes into an 'error' state, where reads
|
|
return end-of-file (i.e.,
|
|
.BR read (2)
|
|
returns 0) until the counter is subsequently enabled or disabled.
|
|
.TP
|
|
.IR "exclusive"
|
|
The
|
|
.I exclusive
|
|
bit specifies that when this counter's group is on the CPU,
|
|
it should be the only group using the CPU's counters.
|
|
In the future this may allow monitoring programs to
|
|
support PMU features that need to run alone so that they do not
|
|
disrupt other hardware counters.
|
|
.TP
|
|
.IR "exclude_user"
|
|
If this bit is set, the count excludes events that happen in user space.
|
|
.TP
|
|
.IR "exclude_kernel"
|
|
If this bit is set, the count excludes events that happen in kernel space.
|
|
.TP
|
|
.IR "exclude_hv"
|
|
If this bit is set, the count excludes events that happen in the
|
|
hypervisor.
|
|
This is mainly for PMUs that have built-in support for handling this
|
|
(such as POWER).
|
|
Extra support is needed for handling hypervisor measurements on most
|
|
machines.
|
|
.TP
|
|
.IR "exclude_idle"
|
|
If set, don't count when the CPU is idle.
|
|
.TP
|
|
.IR "mmap"
|
|
The
|
|
.I mmap
|
|
bit enables recording of exec mmap events.
|
|
.TP
|
|
.IR "comm"
|
|
The
|
|
.I comm
|
|
bit enables tracking of process command name as modified by the
|
|
.BR exec (2)
|
|
and
|
|
.BR prctl (PR_SET_NAME)
|
|
system calls.
|
|
Unfortunately for tools,
|
|
there is no way to distinguish one system call versus the other.
|
|
.TP
|
|
.IR "freq"
|
|
If this bit is set, then
|
|
.I sample_freq
|
|
not
|
|
.I sample_period
|
|
is used when setting up the sampling interval.
|
|
.TP
|
|
.IR "inherit_stat"
|
|
This bit enables saving of event counts on context switch for
|
|
inherited tasks.
|
|
This is meaningful only if the
|
|
.I inherit
|
|
field is set.
|
|
.TP
|
|
.IR "enable_on_exec"
|
|
If this bit is set, a counter is automatically
|
|
enabled after a call to
|
|
.BR exec (2).
|
|
.TP
|
|
.IR "task"
|
|
If this bit is set, then
|
|
fork/exit notifications are included in the ring buffer.
|
|
.TP
|
|
.IR "watermark"
|
|
If set, have a sampling interrupt happen when we cross the
|
|
.I wakeup_watermark
|
|
boundary.
|
|
Otherwise interrupts happen after
|
|
.I wakeup_events
|
|
samples.
|
|
.TP
|
|
.IR "precise_ip" " (Since Linux 2.6.35)"
|
|
This controls the amount of skid.
|
|
Skid is how many instructions
|
|
execute between an event of interest happening and the kernel
|
|
being able to stop and record the event.
|
|
Smaller skid is
|
|
better and allows more accurate reporting of which events
|
|
correspond to which instructions, but hardware is often limited
|
|
with how small this can be.
|
|
|
|
The values of this are the following:
|
|
.RS
|
|
.TP
|
|
0 -
|
|
.B SAMPLE_IP
|
|
can have arbitrary skid.
|
|
.TP
|
|
1 -
|
|
.B SAMPLE_IP
|
|
must have constant skid.
|
|
.TP
|
|
2 -
|
|
.B SAMPLE_IP
|
|
requested to have 0 skid.
|
|
.TP
|
|
3 -
|
|
.B SAMPLE_IP
|
|
must have 0 skid.
|
|
See also
|
|
.BR PERF_RECORD_MISC_EXACT_IP .
|
|
.RE
|
|
.TP
|
|
.IR "mmap_data" " (Since Linux 2.6.36)"
|
|
The counterpart of the
|
|
.I mmap
|
|
field, but enables including data mmap events
|
|
in the ring-buffer.
|
|
.TP
|
|
.IR "sample_id_all" " (Since Linux 2.6.38)"
|
|
If set, then TID, TIME, ID, CPU, and STREAM_ID can
|
|
additionally be included in
|
|
.RB non- PERF_RECORD_SAMPLE s
|
|
if the corresponding
|
|
.I sample_type
|
|
is selected.
|
|
.TP
|
|
.IR "exclude_host" " (Since Linux 3.2)"
|
|
Do not measure time spent in VM host.
|
|
.TP
|
|
.IR "exclude_guest" " (Since Linux 3.2)"
|
|
Do not measure time spent in VM guest.
|
|
.TP
|
|
.IR "exclude_callchain_kernel" " (Since Linux 3.7)"
|
|
Do not include kernel callchains.
|
|
.TP
|
|
.IR "exclude_callchain_user" " (Since Linux 3.7)"
|
|
Do not include user callchains.
|
|
.TP
|
|
.IR "wakeup_events" ", " "wakeup_watermark"
|
|
This union sets how many samples
|
|
.RI ( wakeup_events )
|
|
or bytes
|
|
.RI ( wakeup_watermark )
|
|
happen before an overflow signal happens.
|
|
Which one is used is selected by the
|
|
.I watermark
|
|
bitflag.
|
|
|
|
.I wakeup_events
|
|
only counts
|
|
.B PERF_RECORD_SAMPLE
|
|
record types.
|
|
To receive a signal for every incoming
|
|
.B PERF_RECORD
|
|
type set
|
|
.I wakeup_watermark
|
|
to 1.
|
|
.TP
|
|
.IR "bp_type" " (Since Linux 2.6.33)"
|
|
This chooses the breakpoint type.
|
|
It is one of:
|
|
.RS
|
|
.TP
|
|
.BR HW_BREAKPOINT_EMPTY
|
|
No breakpoint.
|
|
.TP
|
|
.BR HW_BREAKPOINT_R
|
|
Count when we read the memory location.
|
|
.TP
|
|
.BR HW_BREAKPOINT_W
|
|
Count when we write the memory location.
|
|
.TP
|
|
.BR HW_BREAKPOINT_RW
|
|
Count when we read or write the memory location.
|
|
.TP
|
|
.BR HW_BREAKPOINT_X
|
|
Count when we execute code at the memory location.
|
|
.LP
|
|
The values can be combined via a bitwise or, but the
|
|
combination of
|
|
.B HW_BREAKPOINT_R
|
|
or
|
|
.B HW_BREAKPOINT_W
|
|
with
|
|
.B HW_BREAKPOINT_X
|
|
is not allowed.
|
|
.RE
|
|
.TP
|
|
.IR "bp_addr" " (Since Linux 2.6.33)"
|
|
.I bp_addr
|
|
is the address of the breakpoint.
|
|
For execution breakpoints this is the memory address of the instruction
|
|
of interest; for read and write breakpoints it is the memory address
|
|
of the memory location of interest.
|
|
.TP
|
|
.IR "config1" " (Since Linux 2.6.39)"
|
|
.I config1
|
|
is used for setting events that need an extra register or otherwise
|
|
do not fit in the regular config field.
|
|
Raw OFFCORE_EVENTS on Nehalem/Westmere/SandyBridge use this field
|
|
on 3.3 and later kernels.
|
|
.TP
|
|
.IR "bp_len" " (Since Linux 2.6.33)"
|
|
.I bp_len
|
|
is the length of the breakpoint being measured if
|
|
.I type
|
|
is
|
|
.BR PERF_TYPE_BREAKPOINT .
|
|
Options are
|
|
.BR HW_BREAKPOINT_LEN_1 ,
|
|
.BR HW_BREAKPOINT_LEN_2 ,
|
|
.BR HW_BREAKPOINT_LEN_4 ,
|
|
.BR HW_BREAKPOINT_LEN_8 .
|
|
For an execution breakpoint, set this to
|
|
.IR sizeof(long) .
|
|
.TP
|
|
.IR "config2" " (Since Linux 2.6.39)"
|
|
|
|
.I config2
|
|
is a further extension of the
|
|
.I config1
|
|
field.
|
|
.TP
|
|
.IR "branch_sample_type" " (Since Linux 3.4)"
|
|
If
|
|
.B PERF_SAMPLE_BRANCH_STACK
|
|
is enabled, then this specifies what branches to include
|
|
in the branch record.
|
|
|
|
The first part of the value is the privilege level, which
|
|
is a combination of one of the following values.
|
|
If the user does not set privilege level explicitly, the kernel
|
|
will use the event's privilege level.
|
|
Event and branch privilege levels do not have to match.
|
|
.RS
|
|
.TP
|
|
.B PERF_SAMPLE_BRANCH_USER
|
|
Branch target is in user space.
|
|
.TP
|
|
.B PERF_SAMPLE_BRANCH_KERNEL
|
|
Branch target is in kernel space.
|
|
.TP
|
|
.B PERF_SAMPLE_BRANCH_HV
|
|
Branch target is in hypervisor.
|
|
.TP
|
|
.B PERF_SAMPLE_BRANCH_PLM_ALL
|
|
A convenience value that is the three preceding values ORed together.
|
|
|
|
.P
|
|
In addition to the privilege value, at least one of the
|
|
following bits must be set.
|
|
|
|
.TP
|
|
.B PERF_SAMPLE_BRANCH_ANY
|
|
Any branch type.
|
|
.TP
|
|
.B PERF_SAMPLE_BRANCH_ANY_CALL
|
|
Any call branch.
|
|
.TP
|
|
.B PERF_SAMPLE_BRANCH_ANY_RETURN
|
|
Any return branch.
|
|
.TP
|
|
.B PERF_SAMPLE_BRANCH_IND_CALL
|
|
Indirect calls.
|
|
.TP
|
|
.BR PERF_SAMPLE_BRANCH_ABORT_TX " (Since Linux 3.11)"
|
|
Transactional memory aborts.
|
|
.TP
|
|
.BR PERF_SAMPLE_BRANCH_IN_TX " (Since Linux 3.11)"
|
|
Branch in transactional memory transaction.
|
|
.TP
|
|
.BR PERF_SAMPLE_BRANCH_NO_TX " (Since Linux 3.11)"
|
|
Branch not in transactional memory transaction.
|
|
.RE
|
|
|
|
.TP
|
|
.IR "sample_regs_user" " (Since Linux 3.7)"
|
|
This bitmask defines the set of user CPU registers to dump on samples.
|
|
The layout of the register mask is architecture specific and
|
|
described in the kernel header
|
|
.IR arch/ARCH/include/uapi/asm/perf_regs.h .
|
|
.TP
|
|
.IR "sample_stack_user" " (Since Linux 3.7)"
|
|
This defines the size of the user stack to dump if
|
|
.B PERF_SAMPLE_STACK_USER
|
|
is specified.
|
|
.SS Reading results
|
|
Once a
|
|
.BR perf_event_open ()
|
|
file descriptor has been opened, the values
|
|
of the events can be read from the file descriptor.
|
|
The values that are there are specified by the
|
|
.I read_format
|
|
field in the
|
|
.I attr
|
|
structure at open time.
|
|
|
|
If you attempt to read into a buffer that is not big enough to hold the
|
|
data,
|
|
.B ENOSPC
|
|
is returned.
|
|
|
|
Here is the layout of the data returned by a read:
|
|
.IP * 2
|
|
If
|
|
.B PERF_FORMAT_GROUP
|
|
was specified to allow reading all events in a group at once:
|
|
|
|
.in +4n
|
|
.nf
|
|
struct read_format {
|
|
u64 nr; /* The number of events */
|
|
u64 time_enabled; /* if PERF_FORMAT_TOTAL_TIME_ENABLED */
|
|
u64 time_running; /* if PERF_FORMAT_TOTAL_TIME_RUNNING */
|
|
struct {
|
|
u64 value; /* The value of the event */
|
|
u64 id; /* if PERF_FORMAT_ID */
|
|
} values[nr];
|
|
};
|
|
.fi
|
|
.in
|
|
.IP *
|
|
If
|
|
.B PERF_FORMAT_GROUP
|
|
was
|
|
.I not
|
|
specified:
|
|
|
|
.in +4n
|
|
.nf
|
|
struct read_format {
|
|
u64 value; /* The value of the event */
|
|
u64 time_enabled; /* if PERF_FORMAT_TOTAL_TIME_ENABLED */
|
|
u64 time_running; /* if PERF_FORMAT_TOTAL_TIME_RUNNING */
|
|
u64 id; /* if PERF_FORMAT_ID */
|
|
};
|
|
.fi
|
|
.in
|
|
.PP
|
|
The values read are as follows:
|
|
.TP
|
|
.I nr
|
|
The number of events in this file descriptor.
|
|
Only available if
|
|
.B PERF_FORMAT_GROUP
|
|
was specified.
|
|
.TP
|
|
.IR time_enabled ", " time_running
|
|
Total time the event was enabled and running.
|
|
Normally these are the same.
|
|
If more events are started
|
|
than available counter slots on the PMU, then multiplexing
|
|
happens and events run only part of the time.
|
|
In that case the
|
|
.I time_enabled
|
|
and
|
|
.I time_running
|
|
values can be used to scale an estimated value for the count.
|
|
.TP
|
|
.I value
|
|
An unsigned 64-bit value containing the counter result.
|
|
.TP
|
|
.I id
|
|
A globally unique value for this particular event, only there if
|
|
.B PERF_FORMAT_ID
|
|
was specified in
|
|
.IR read_format .
|
|
.SS MMAP layout
|
|
When using
|
|
.BR perf_event_open ()
|
|
in sampled mode, asynchronous events
|
|
(like counter overflow or
|
|
.B PROT_EXEC
|
|
mmap tracking)
|
|
are logged into a ring-buffer.
|
|
This ring-buffer is created and accessed through
|
|
.BR mmap (2).
|
|
|
|
The mmap size should be 1+2^n pages, where the first page is a
|
|
metadata page
|
|
.RI ( "struct perf_event_mmap_page" )
|
|
that contains various
|
|
bits of information such as where the ring-buffer head is.
|
|
|
|
Before kernel 2.6.39, there is a bug that means you must allocate a mmap
|
|
ring buffer when sampling even if you do not plan to access it.
|
|
|
|
The structure of the first metadata mmap page is as follows:
|
|
|
|
.in +4n
|
|
.nf
|
|
struct perf_event_mmap_page {
|
|
__u32 version; /* version number of this structure */
|
|
__u32 compat_version; /* lowest version this is compat with */
|
|
__u32 lock; /* seqlock for synchronization */
|
|
__u32 index; /* hardware counter identifier */
|
|
__s64 offset; /* add to hardware counter value */
|
|
__u64 time_enabled; /* time event active */
|
|
__u64 time_running; /* time event on CPU */
|
|
union {
|
|
__u64 capabilities;
|
|
__u64 cap_usr_time : 1,
|
|
cap_usr_rdpmc : 1;
|
|
};
|
|
__u16 pmc_width;
|
|
__u16 time_shift;
|
|
__u32 time_mult;
|
|
__u64 time_offset;
|
|
__u64 __reserved[120]; /* Pad to 1k */
|
|
__u64 data_head; /* head in the data section */
|
|
__u64 data_tail; /* user-space written tail */
|
|
};
|
|
.fi
|
|
.in
|
|
|
|
The following looks at the fields in the
|
|
.I perf_event_mmap_page
|
|
structure in more detail:
|
|
.TP
|
|
.I version
|
|
Version number of this structure.
|
|
.TP
|
|
.I compat_version
|
|
The lowest version this is compatible with.
|
|
.TP
|
|
.I lock
|
|
A seqlock for synchronization.
|
|
.TP
|
|
.I index
|
|
A unique hardware counter identifier.
|
|
.TP
|
|
.I offset
|
|
.\" FIXME clarify
|
|
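When the counter is read directly from user space (for example, with rdpmc),
this value must be added to the raw hardware counter value
to obtain the event count.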
|
|
.TP
|
|
.I time_enabled
|
|
Time the event was active.
|
|
.TP
|
|
.I time_running
|
|
Time the event was running.
|
|
.TP
|
|
.I cap_usr_time
|
|
User time capability.
|
|
.TP
|
|
.I cap_usr_rdpmc
|
|
If the hardware supports user-space read of performance counters
|
|
without syscall (this is the "rdpmc" instruction on x86), then
|
|
the following code can be used to do a read:
|
|
|
|
.in +4n
|
|
.nf
|
|
u32 seq, time_mult, time_shift, idx, width;
|
|
u64 count, enabled, running;
|
|
u64 cyc, time_offset;
|
|
s64 pmc = 0;
|
|
|
|
do {
|
|
seq = pc\->lock;
|
|
barrier();
|
|
enabled = pc\->time_enabled;
|
|
running = pc\->time_running;
|
|
|
|
if (pc\->cap_usr_time && enabled != running) {
|
|
cyc = rdtsc();
|
|
time_offset = pc\->time_offset;
|
|
time_mult = pc\->time_mult;
|
|
time_shift = pc\->time_shift;
|
|
}
|
|
|
|
idx = pc\->index;
|
|
count = pc\->offset;
|
|
|
|
if (pc\->cap_usr_rdpmc && idx) {
|
|
width = pc\->pmc_width;
|
|
pmc = rdpmc(idx \- 1);
|
|
}
|
|
|
|
barrier();
|
|
} while (pc\->lock != seq);
|
|
.fi
|
|
.in
|
|
.TP
|
|
.I pmc_width
|
|
If
|
|
.IR cap_usr_rdpmc ,
|
|
this field provides the bit-width of the value
|
|
read using the rdpmc or equivalent instruction.
|
|
This can be used to sign extend the result like:
|
|
|
|
.in +4n
|
|
.nf
|
|
pmc <<= 64 \- pmc_width;
|
|
pmc >>= 64 \- pmc_width; // signed shift right
|
|
count += pmc;
|
|
.fi
|
|
.in
|
|
.TP
|
|
.IR time_shift ", " time_mult ", " time_offset
|
|
|
|
If
|
|
.IR cap_usr_time ,
|
|
these fields can be used to compute the time
|
|
delta since time_enabled (in nanoseconds) using rdtsc or similar.
|
|
.nf
|
|
|
|
u64 quot, rem;
|
|
u64 delta;
|
|
quot = (cyc >> time_shift);
|
|
rem = cyc & ((1 << time_shift) \- 1);
|
|
delta = time_offset + quot * time_mult +
|
|
((rem * time_mult) >> time_shift);
|
|
.fi
|
|
|
|
Where
|
|
.IR time_offset ,
|
|
.IR time_mult ,
|
|
.IR time_shift ,
|
|
and
|
|
.IR cyc
|
|
are read in the
|
|
seqcount loop described above.
|
|
This delta can then be added to
|
|
enabled and possibly running (if idx), improving the scaling:
|
|
.nf
|
|
|
|
enabled += delta;
|
|
if (idx)
|
|
running += delta;
|
|
quot = count / running;
|
|
rem = count % running;
|
|
count = quot * enabled + (rem * enabled) / running;
|
|
.fi
|
|
.TP
|
|
.I data_head
|
|
This points to the head of the data section.
|
|
The value continuously increases; it does not wrap.
|
|
The value needs to be manually wrapped by the size of the mmap buffer
|
|
before accessing the samples.
|
|
|
|
On SMP-capable platforms, after reading the data_head value,
|
|
user space should issue an rmb().
|
|
.TP
|
|
.I data_tail
|
|
When the mapping is
|
|
.BR PROT_WRITE ,
|
|
the
|
|
.I data_tail
|
|
value should be written by user space to reflect the last read data.
|
|
In this case the kernel will not over-write unread data.
|
|
.PP
|
|
The following 2^n ring-buffer pages have the layout described below.
|
|
|
|
If
|
|
.I perf_event_attr.sample_id_all
|
|
is set, then all event types will
|
|
have the sample_type selected fields related to where/when (identity)
|
|
an event took place (TID, TIME, ID, CPU, STREAM_ID) described in
|
|
.B PERF_RECORD_SAMPLE
|
|
below; they will be stashed just after the
|
|
.I perf_event_header
|
|
and the fields already present for the existing
|
|
fields, that is, at the end of the payload.
|
|
That way a newer perf.data
|
|
file will be supported by older perf tools, with these new optional
|
|
fields being ignored.
|
|
|
|
The mmap values start with a header:
|
|
|
|
.in +4n
|
|
.nf
|
|
struct perf_event_header {
|
|
__u32 type;
|
|
__u16 misc;
|
|
__u16 size;
|
|
};
|
|
.fi
|
|
.in
|
|
|
|
Below, we describe the
|
|
.I perf_event_header
|
|
fields in more detail.
|
|
For ease of reading,
|
|
the fields with shorter descriptions are presented first.
|
|
.TP
|
|
.I size
|
|
This indicates the size of the record.
|
|
.TP
|
|
.I misc
|
|
The
|
|
.I misc
|
|
field contains additional information about the sample.
|
|
|
|
The CPU mode can be determined from this value by masking with
|
|
.B PERF_RECORD_MISC_CPUMODE_MASK
|
|
and looking for one of the following (note these are not
|
|
bit masks, only one can be set at a time):
|
|
.RS
|
|
.TP
|
|
.B PERF_RECORD_MISC_CPUMODE_UNKNOWN
|
|
Unknown CPU mode.
|
|
.TP
|
|
.B PERF_RECORD_MISC_KERNEL
|
|
Sample happened in the kernel.
|
|
.TP
|
|
.B PERF_RECORD_MISC_USER
|
|
Sample happened in user code.
|
|
.TP
|
|
.B PERF_RECORD_MISC_HYPERVISOR
|
|
Sample happened in the hypervisor.
|
|
.TP
|
|
.B PERF_RECORD_MISC_GUEST_KERNEL
|
|
Sample happened in the guest kernel.
|
|
.TP
|
|
.B PERF_RECORD_MISC_GUEST_USER
|
|
Sample happened in guest user code.
|
|
.RE
|
|
|
|
.RS
|
|
In addition, one of the following bits can be set:
|
|
.TP
|
|
.B PERF_RECORD_MISC_MMAP_DATA
|
|
This is set when the mapping is not executable;
|
|
otherwise the mapping is executable.
|
|
.TP
|
|
.B PERF_RECORD_MISC_EXACT_IP
|
|
This indicates that the content of
|
|
.B PERF_SAMPLE_IP
|
|
points
|
|
to the actual instruction that triggered the event.
|
|
See also
|
|
.IR perf_event_attr.precise_ip .
|
|
.TP
|
|
.B PERF_RECORD_MISC_EXT_RESERVED
|
|
This indicates there is extended data available (currently not used).
|
|
.RE
|
|
.TP
|
|
.I type
|
|
The
|
|
.I type
|
|
value is one of the below.
|
|
The values in the corresponding record (that follows the header)
|
|
depend on the
|
|
.I type
|
|
selected as shown.
|
|
.RS
|
|
.TP 4
|
|
.B PERF_RECORD_MMAP
|
|
The MMAP events record the
|
|
.B PROT_EXEC
|
|
mappings so that we can correlate
|
|
user-space IPs to code.
|
|
They have the following structure:
|
|
|
|
.in +4n
|
|
.nf
|
|
struct {
|
|
struct perf_event_header header;
|
|
u32 pid, tid;
|
|
u64 addr;
|
|
u64 len;
|
|
u64 pgoff;
|
|
char filename[];
|
|
};
|
|
.fi
|
|
.in
|
|
.TP
|
|
.B PERF_RECORD_LOST
|
|
This record indicates when events are lost.
|
|
|
|
.in +4n
|
|
.nf
|
|
struct {
|
|
struct perf_event_header header;
|
|
u64 id;
|
|
u64 lost;
|
|
};
|
|
.fi
|
|
.in
|
|
.RS
|
|
.TP
|
|
.I id
|
|
is the unique event ID for the samples that were lost.
|
|
.TP
|
|
.I lost
|
|
is the number of events that were lost.
|
|
.RE
|
|
.TP
|
|
.B PERF_RECORD_COMM
|
|
This record indicates a change in the process name.
|
|
|
|
.in +4n
|
|
.nf
|
|
struct {
|
|
struct perf_event_header header;
|
|
u32 pid, tid;
|
|
char comm[];
|
|
};
|
|
.fi
|
|
.in
|
|
.TP
|
|
.B PERF_RECORD_EXIT
|
|
This record indicates a process exit event.
|
|
|
|
.in +4n
|
|
.nf
|
|
struct {
|
|
struct perf_event_header header;
|
|
u32 pid, ppid;
|
|
u32 tid, ptid;
|
|
u64 time;
|
|
};
|
|
.fi
|
|
.in
|
|
.TP
|
|
.BR PERF_RECORD_THROTTLE ", " PERF_RECORD_UNTHROTTLE
|
|
This record indicates a throttle/unthrottle event.
|
|
|
|
.in +4n
|
|
.nf
|
|
struct {
|
|
struct perf_event_header header;
|
|
u64 time;
|
|
u64 id;
|
|
u64 stream_id;
|
|
};
|
|
.fi
|
|
.in
|
|
.TP
|
|
.B PERF_RECORD_FORK
|
|
This record indicates a fork event.
|
|
|
|
.in +4n
|
|
.nf
|
|
struct {
|
|
struct perf_event_header header;
|
|
u32 pid, ppid;
|
|
u32 tid, ptid;
|
|
u64 time;
|
|
};
|
|
.fi
|
|
.in
|
|
.TP
|
|
.B PERF_RECORD_READ
|
|
This record indicates a read event.
|
|
|
|
.in +4n
|
|
.nf
|
|
struct {
|
|
struct perf_event_header header;
|
|
u32 pid, tid;
|
|
struct read_format values;
|
|
};
|
|
.fi
|
|
.in
|
|
.TP
|
|
.B PERF_RECORD_SAMPLE
|
|
This record indicates a sample.
|
|
|
|
.in +4n
|
|
.nf
|
|
struct {
|
|
struct perf_event_header header;
|
|
u64 ip; /* if PERF_SAMPLE_IP */
|
|
u32 pid, tid; /* if PERF_SAMPLE_TID */
|
|
u64 time; /* if PERF_SAMPLE_TIME */
|
|
u64 addr; /* if PERF_SAMPLE_ADDR */
|
|
u64 id; /* if PERF_SAMPLE_ID */
|
|
u64 stream_id; /* if PERF_SAMPLE_STREAM_ID */
|
|
u32 cpu, res; /* if PERF_SAMPLE_CPU */
|
|
u64 period; /* if PERF_SAMPLE_PERIOD */
|
|
struct read_format v; /* if PERF_SAMPLE_READ */
|
|
u64 nr; /* if PERF_SAMPLE_CALLCHAIN */
|
|
u64 ips[nr]; /* if PERF_SAMPLE_CALLCHAIN */
|
|
u32 size; /* if PERF_SAMPLE_RAW */
|
|
char data[size]; /* if PERF_SAMPLE_RAW */
|
|
u64 bnr; /* if PERF_SAMPLE_BRANCH_STACK */
|
|
struct perf_branch_entry lbr[bnr];
|
|
/* if PERF_SAMPLE_BRANCH_STACK */
|
|
u64 abi; /* if PERF_SAMPLE_REGS_USER */
|
|
u64 regs[weight(mask)];
|
|
/* if PERF_SAMPLE_REGS_USER */
|
|
u64 size; /* if PERF_SAMPLE_STACK_USER */
|
|
char data[size]; /* if PERF_SAMPLE_STACK_USER */
|
|
u64 dyn_size; /* if PERF_SAMPLE_STACK_USER */
|
|
u64 weight; /* if PERF_SAMPLE_WEIGHT */
|
|
u64 data_src; /* if PERF_SAMPLE_DATA_SRC */
|
|
};
|
|
.fi
|
|
.RS 4
|
|
.TP 4
|
|
.I ip
|
|
If
|
|
.B PERF_SAMPLE_IP
|
|
is enabled, then a 64-bit instruction
|
|
pointer value is included.
|
|
.TP
|
|
.IR pid ", " tid
|
|
If
|
|
.B PERF_SAMPLE_TID
|
|
is enabled, then a 32-bit process ID
|
|
and 32-bit thread ID are included.
|
|
.TP
|
|
.I time
|
|
If
|
|
.B PERF_SAMPLE_TIME
|
|
is enabled, then a 64-bit timestamp
|
|
is included.
|
|
This is obtained via local_clock() which is a hardware timestamp
|
|
if available and the jiffies value if not.
|
|
.TP
|
|
.I addr
|
|
If
|
|
.B PERF_SAMPLE_ADDR
|
|
is enabled, then a 64-bit address is included.
|
|
This is usually the address of a tracepoint,
|
|
breakpoint, or software event; otherwise the value is 0.
|
|
.TP
|
|
.I id
|
|
If
|
|
.B PERF_SAMPLE_ID
|
|
is enabled, a 64-bit unique ID is included.
|
|
If the event is a member of an event group, the group leader ID is returned.
|
|
This ID is the same as the one returned by
|
|
.BR PERF_FORMAT_ID .
|
|
.TP
|
|
.I stream_id
|
|
If
|
|
.B PERF_SAMPLE_STREAM_ID
|
|
is enabled, a 64-bit unique ID is included.
|
|
Unlike
|
|
.B PERF_SAMPLE_ID
|
|
the actual ID is returned, not the group leader.
|
|
This ID is the same as the one returned by
|
|
.BR PERF_FORMAT_ID .
|
|
.TP
|
|
.IR cpu ", " res
|
|
If
|
|
.B PERF_SAMPLE_CPU
|
|
is enabled, this is a 32-bit value indicating
|
|
which CPU was being used, in addition to a reserved (unused)
|
|
32-bit value.
|
|
.TP
|
|
.I period
|
|
If
|
|
.B PERF_SAMPLE_PERIOD
|
|
is enabled, a 64-bit value indicating
|
|
the current sampling period is written.
|
|
.TP
|
|
.I v
|
|
If
|
|
.B PERF_SAMPLE_READ
|
|
is enabled, a structure of type read_format
|
|
is included which has values for all events in the event group.
|
|
The values included depend on the
|
|
.I read_format
|
|
value used at
|
|
.BR perf_event_open ()
|
|
time.
|
|
.TP
|
|
.IR nr ", " ips[nr]
|
|
If
|
|
.B PERF_SAMPLE_CALLCHAIN
|
|
is enabled, then a 64-bit number is included
|
|
which indicates how many 64-bit instruction pointers will
|
|
follow.
|
|
This is the current callchain.
|
|
.TP
|
|
.IR size ", " data[size]
|
|
If
|
|
.B PERF_SAMPLE_RAW
|
|
is enabled, then a 32-bit value indicating size
|
|
is included followed by an array of 8-bit values of length size.
|
|
The values are padded with 0 to have 64-bit alignment.
|
|
|
|
This RAW record data is opaque with respect to the ABI.
|
|
The ABI doesn't make any promises with respect to the stability
|
|
of its content; it may vary depending
|
|
on event, hardware, and kernel version.
|
|
.TP
|
|
.IR bnr ", " lbr[bnr]
|
|
If
|
|
.B PERF_SAMPLE_BRANCH_STACK
|
|
is enabled, then a 64-bit value indicating
|
|
the number of records is included, followed by
|
|
.I bnr
|
|
.I perf_branch_entry
|
|
structures which each include the fields:
|
|
.RS
|
|
.TP
|
|
.I from
|
|
This indicates the source instruction (may not be a branch).
|
|
.TP
|
|
.I to
|
|
The branch target.
|
|
.TP
|
|
.I mispred
|
|
The branch target was mispredicted.
|
|
.TP
|
|
.I predicted
|
|
The branch target was predicted.
|
|
.TP
|
|
.IR in_tx " (Since Linux 3.11)"
|
|
The branch was in a transactional memory transaction.
|
|
.TP
|
|
.IR abort " (Since Linux 3.11)"
|
|
The branch was in an aborted transactional memory transaction.
|
|
|
|
.P
|
|
The entries are from most to least recent, so the first entry
|
|
has the most recent branch.
|
|
|
|
Support for
|
|
.I mispred
|
|
and
|
|
.I predicted
|
|
is optional; if not supported, both
|
|
values will be 0.
|
|
|
|
The type of branches recorded is specified by the
|
|
.I branch_sample_type
|
|
field.
|
|
.RE
|
|
|
|
.TP
|
|
.IR abi ", " regs[weight(mask)]
|
|
If
|
|
.B PERF_SAMPLE_REGS_USER
|
|
is enabled, then the user CPU registers are recorded.
|
|
|
|
The
|
|
.I abi
|
|
field is one of
|
|
.BR PERF_SAMPLE_REGS_ABI_NONE ", " PERF_SAMPLE_REGS_ABI_32 " or "
|
|
.BR PERF_SAMPLE_REGS_ABI_64 .
|
|
|
|
The
|
|
.I regs
|
|
field is an array of the CPU registers that were specified by
|
|
the
|
|
.I sample_regs_user
|
|
attr field.
|
|
The number of values is the number of bits set in the
|
|
.I sample_regs_user
|
|
bitmask.
|
|
.TP
|
|
.IR size ", " data[size] ", " dyn_size
|
|
If
|
|
.B PERF_SAMPLE_STACK_USER
|
|
is enabled, then record the user stack to enable backtracing.
|
|
.I size
|
|
is the size requested by the user in
|
|
.I sample_stack_user
|
|
or else the maximum record size.
|
|
.I data
|
|
is the stack data.
|
|
.I dyn_size
|
|
is the amount of data actually dumped (can be less than
|
|
.IR size ).
|
|
.TP
|
|
.I weight
|
|
If
|
|
.B PERF_SAMPLE_WEIGHT
|
|
is enabled, then a 64-bit value provided by the hardware
|
|
is recorded that indicates how costly the event was.
|
|
This allows expensive events to stand out more clearly
|
|
in profiles.
|
|
.TP
|
|
.I data_src
|
|
If
|
|
.B PERF_SAMPLE_DATA_SRC
|
|
is enabled, then a 64-bit value is recorded that is made up of
|
|
the following fields:
|
|
.RS
|
|
.TP 4
|
|
.I mem_op
|
|
Type of opcode, a bitwise combination of:
|
|
|
|
.PD 0
|
|
.RS
|
|
.TP 24
|
|
.B PERF_MEM_OP_NA
|
|
Not available
|
|
.TP
|
|
.B PERF_MEM_OP_LOAD
|
|
Load instruction
|
|
.TP
|
|
.B PERF_MEM_OP_STORE
|
|
Store instruction
|
|
.TP
|
|
.B PERF_MEM_OP_PFETCH
|
|
Prefetch
|
|
.TP
|
|
.B PERF_MEM_OP_EXEC
|
|
Executable code
|
|
.RE
|
|
.PD
|
|
.TP
|
|
.I mem_lvl
|
|
Memory hierarchy level hit or miss, a bitwise combination of:
|
|
|
|
.PD 0
|
|
.RS
|
|
.TP 24
|
|
.B PERF_MEM_LVL_NA
|
|
Not available
|
|
.TP
|
|
.B PERF_MEM_LVL_HIT
|
|
Hit
|
|
.TP
|
|
.B PERF_MEM_LVL_MISS
|
|
Miss
|
|
.TP
|
|
.B PERF_MEM_LVL_L1
|
|
Level 1 cache
|
|
.TP
|
|
.B PERF_MEM_LVL_LFB
|
|
Line fill buffer
|
|
.TP
|
|
.B PERF_MEM_LVL_L2
|
|
Level 2 cache
|
|
.TP
|
|
.B PERF_MEM_LVL_L3
|
|
Level 3 cache
|
|
.TP
|
|
.B PERF_MEM_LVL_LOC_RAM
|
|
Local DRAM
|
|
.TP
|
|
.B PERF_MEM_LVL_REM_RAM1
|
|
Remote DRAM 1 hop
|
|
.TP
|
|
.B PERF_MEM_LVL_REM_RAM2
|
|
Remote DRAM 2 hops
|
|
.TP
|
|
.B PERF_MEM_LVL_REM_CCE1
|
|
Remote cache 1 hop
|
|
.TP
|
|
.B PERF_MEM_LVL_REM_CCE2
|
|
Remote cache 2 hops
|
|
.TP
|
|
.B PERF_MEM_LVL_IO
|
|
I/O memory
|
|
.TP
|
|
.B PERF_MEM_LVL_UNC
|
|
Uncached memory
|
|
.RE
|
|
.PD
|
|
.TP
|
|
.I mem_snoop
|
|
Snoop mode, a bitwise combination of:
|
|
|
|
.PD 0
|
|
.RS
|
|
.TP 24
|
|
.B PERF_MEM_SNOOP_NA
|
|
Not available
|
|
.TP
|
|
.B PERF_MEM_SNOOP_NONE
|
|
No snoop
|
|
.TP
|
|
.B PERF_MEM_SNOOP_HIT
|
|
Snoop hit
|
|
.TP
|
|
.B PERF_MEM_SNOOP_MISS
|
|
Snoop miss
|
|
.TP
|
|
.B PERF_MEM_SNOOP_HITM
|
|
Snoop hit modified
|
|
.RE
|
|
.PD
|
|
.TP
|
|
.I mem_lock
|
|
Lock instruction, a bitwise combination of:
|
|
|
|
.PD 0
|
|
.RS
|
|
.TP 24
|
|
.B PERF_MEM_LOCK_NA
|
|
Not available
|
|
.TP
|
|
.B PERF_MEM_LOCK_LOCKED
|
|
Locked transaction
|
|
.RE
|
|
.PD
|
|
.TP
|
|
.I mem_dtlb
|
|
TLB access hit or miss, a bitwise combination of:
|
|
|
|
.PD 0
|
|
.RS
|
|
.TP 24
|
|
.B PERF_MEM_TLB_NA
|
|
Not available
|
|
.TP
|
|
.B PERF_MEM_TLB_HIT
|
|
Hit
|
|
.TP
|
|
.B PERF_MEM_TLB_MISS
|
|
Miss
|
|
.TP
|
|
.B PERF_MEM_TLB_L1
|
|
Level 1 TLB
|
|
.TP
|
|
.B PERF_MEM_TLB_L2
|
|
Level 2 TLB
|
|
.TP
|
|
.B PERF_MEM_TLB_WK
|
|
Hardware walker
|
|
.TP
|
|
.B PERF_MEM_TLB_OS
|
|
OS fault handler
|
|
.RE
|
|
.PD
|
|
.RE
|
|
.RE
|
|
.RE
|
|
.RE
|
|
.SS Signal overflow
|
|
Events can be set to deliver a signal when a threshold is crossed.
|
|
The signal handler is set up using the
|
|
.BR poll (2),
|
|
.BR select (2),
|
|
.BR epoll (7)
|
|
and
|
|
.BR fcntl (2)
|
|
system calls.
|
|
|
|
To generate signals, sampling must be enabled
|
|
.RI ( sample_period
|
|
must have a non-zero value).
|
|
|
|
There are two ways to generate signals.
|
|
|
|
The first is to set a
|
|
.I wakeup_events
|
|
or
|
|
.I wakeup_watermark
|
|
value that will generate a signal if a certain number of samples
|
|
or bytes have been written to the mmap ring buffer.
|
|
In this case a signal of type
|
|
.B POLL_IN
|
|
is sent.
|
|
|
|
The other way is by use of the
|
|
.B PERF_EVENT_IOC_REFRESH
|
|
ioctl.
|
|
This ioctl adds to a counter that decrements each time the event overflows.
|
|
When non-zero, a
|
|
.B POLL_IN
|
|
signal is sent on overflow, but
|
|
once the value reaches 0, a signal is sent of type
|
|
.B POLL_HUP
|
|
and
|
|
the underlying event is disabled.
|
|
|
|
Note: on newer kernels (definitely noticed with 3.2)
|
|
.\" FIXME(Vince) : Find out when this was introduced
|
|
a signal is provided for every overflow, even if
|
|
.I wakeup_events
|
|
is not set.
|
|
.SS rdpmc instruction
|
|
Starting with Linux 3.4 on x86, you can use the
|
|
.I rdpmc
|
|
instruction to get low-latency reads without having to enter the kernel.
|
|
Note that using
|
|
.I rdpmc
|
|
is not necessarily faster than other methods for reading event values.
|
|
|
|
Support for this can be detected with the
|
|
.I cap_usr_rdpmc
|
|
field in the mmap page; documentation on how
|
|
to calculate event values can be found in that section.
|
|
.SS perf_event ioctl calls
|
|
.PP
|
|
Various ioctls act on
|
|
.BR perf_event_open ()
|
|
file descriptors:
|
|
.TP
|
|
.B PERF_EVENT_IOC_ENABLE
|
|
Enables the individual event or event group specified by the
|
|
file descriptor argument.
|
|
|
|
If the
|
|
.B PERF_IOC_FLAG_GROUP
|
|
bit is set in the ioctl argument, then all events in a group are
|
|
enabled, even if the event specified is not the group leader
|
|
(but see BUGS).
|
|
.TP
|
|
.B PERF_EVENT_IOC_DISABLE
|
|
Disables the individual counter or event group specified by the
|
|
file descriptor argument.
|
|
|
|
Enabling or disabling the leader of a group enables or disables the
|
|
entire group; that is, while the group leader is disabled, none of the
|
|
counters in the group will count.
|
|
Enabling or disabling a member of a group other than the leader
|
|
affects only that counter; disabling a non-leader
|
|
stops that counter from counting but doesn't affect any other counter.
|
|
|
|
If the
|
|
.B PERF_IOC_FLAG_GROUP
|
|
bit is set in the ioctl argument, then all events in a group are
|
|
disabled, even if the event specified is not the group leader
|
|
(but see BUGS).
|
|
.TP
|
|
.B PERF_EVENT_IOC_REFRESH
|
|
Non-inherited overflow counters can use this
|
|
to enable a counter for a number of overflows specified by the argument,
|
|
after which it is disabled.
|
|
Subsequent calls of this ioctl add the argument value to the current
|
|
count.
|
|
A signal with
|
|
.B POLL_IN
|
|
set will happen on each overflow until the
|
|
count reaches 0; when that happens a signal with
|
|
.B POLL_HUP
|
|
set is sent and the event is disabled.
|
|
Using an argument of 0 is considered undefined behavior.
|
|
.TP
|
|
.B PERF_EVENT_IOC_RESET
|
|
Reset the event count specified by the
|
|
file descriptor argument to zero.
|
|
This resets only the counts; there is no way to reset the
|
|
multiplexing
|
|
.I time_enabled
|
|
or
|
|
.I time_running
|
|
values.
|
|
|
|
If the
|
|
.B PERF_IOC_FLAG_GROUP
|
|
bit is set in the ioctl argument, then all events in a group are
|
|
reset, even if the event specified is not the group leader
|
|
(but see BUGS).
|
|
.TP
|
|
.B PERF_EVENT_IOC_PERIOD
|
|
IOC_PERIOD is the command to update the period; it
|
|
does not update the current period but instead defers the change until the next one.
|
|
|
|
The argument is a pointer to a 64-bit value containing the
|
|
desired new period.
|
|
.TP
|
|
.B PERF_EVENT_IOC_SET_OUTPUT
|
|
This tells the kernel to report event notifications to the specified
|
|
file descriptor rather than the default one.
|
|
The file descriptors must all be on the same CPU.
|
|
|
|
The argument specifies the desired file descriptor, or \-1 if
|
|
output should be ignored.
|
|
.TP
|
|
.BR PERF_EVENT_IOC_SET_FILTER " (Since Linux 2.6.33)"
|
|
This adds an ftrace filter to this event.
|
|
|
|
The argument is a pointer to the desired ftrace filter.
|
|
.SS Using prctl
|
|
A process can enable or disable all the event groups that are
|
|
attached to it using the
|
|
.BR prctl (2)
|
|
.B PR_TASK_PERF_EVENTS_ENABLE
|
|
and
|
|
.B PR_TASK_PERF_EVENTS_DISABLE
|
|
operations.
|
|
This applies to all counters on the current process, whether created by
|
|
this process or by another, and does not affect any counters that this
|
|
process has created on other processes.
|
|
It enables or disables only
|
|
the group leaders, not any other members in the groups.
|
|
.SS perf_event related configuration files
|
|
Files in
|
|
.I /proc/sys/kernel/
|
|
.RS 4
|
|
.TP
|
|
.I /proc/sys/kernel/perf_event_paranoid
|
|
|
|
The
|
|
.I perf_event_paranoid
|
|
file can be set to restrict access to the performance counters.
|
|
.RS
|
|
.IP 2 4
|
|
only allow user-space measurements.
|
|
.IP 1
|
|
allow both kernel and user measurements (default).
|
|
.IP 0
|
|
allow access to CPU-specific data but not raw tracepoint samples.
|
|
.IP \-1
|
|
no restrictions.
|
|
.RE
|
|
.IP
|
|
The existence of the
|
|
.I perf_event_paranoid
|
|
file is the official method for determining if a kernel supports
|
|
.BR perf_event_open ().
|
|
.TP
|
|
.I /proc/sys/kernel/perf_event_max_sample_rate
|
|
|
|
This sets the maximum sample rate.
|
|
Setting this too high can allow
|
|
users to sample at a rate that impacts overall machine performance
|
|
and potentially lock up the machine.
|
|
The default value is
|
|
100000 (samples per second).
|
|
.TP
|
|
.I /proc/sys/kernel/perf_event_mlock_kb
|
|
|
|
Maximum number of pages an unprivileged user can
.BR mlock (2).
|
|
The default is 516 (kB).
|
|
|
|
.RE
|
|
Files in
|
|
.I /sys/bus/event_source/devices/
|
|
.RS 4
|
|
Since Linux 2.6.34 the kernel supports having multiple PMUs
|
|
available for monitoring.
|
|
Information on how to program these PMUs can be found under
|
|
.IR /sys/bus/event_source/devices/ .
|
|
Each subdirectory corresponds to a different PMU.
|
|
.TP
|
|
.IR /sys/bus/event_source/devices/*/type " (Since Linux 2.6.38)"
|
|
This contains an integer that can be used in the
|
|
.I type
|
|
field of perf_event_attr to indicate you wish to use this PMU.
|
|
.TP
|
|
.IR /sys/bus/event_source/devices/*/rdpmc " (Since Linux 3.4)"
|
|
If this file is 1, then direct user-space access to the
|
|
performance counter registers is allowed via the rdpmc instruction.
|
|
This can be disabled by echoing 0 to the file.
|
|
.TP
|
|
.IR /sys/bus/event_source/devices/*/format/ " (Since Linux 3.4)"
|
|
This sub-directory contains information on the architecture-specific
|
|
sub-fields available for programming the various
|
|
.I config
|
|
fields in the perf_event_attr struct.
|
|
|
|
The content of each file is the name of the config field, followed
|
|
by a colon, followed by a series of integer bit ranges separated by
|
|
commas.
|
|
For example, the file
|
|
.I event
|
|
may contain the value
|
|
.I config1:1,6-10,44
|
|
which indicates that event is an attribute that occupies bits 1, 6-10, and 44
|
|
of perf_event_attr::config1.
|
|
.TP
|
|
.IR /sys/bus/event_source/devices/*/events/ " (Since Linux 3.4)"
|
|
This sub-directory contains files with pre-defined events.
|
|
The contents are strings describing the event settings
|
|
expressed in terms of the fields found in the previously mentioned
|
|
.I ./format/
|
|
directory.
|
|
These are not necessarily complete lists of all events supported by
|
|
a PMU, but usually a subset of events deemed useful or interesting.
|
|
|
|
The content of each file is a list of attribute names
|
|
separated by commas.
|
|
Each entry has an optional value (either hex or decimal).
|
|
If no value is specified, then it is assumed to be a single-bit
|
|
field with a value of 1.
|
|
An example entry may look like this:
|
|
.IR event=0x2,inv,ldlat=3 .
|
|
.TP
|
|
.I /sys/bus/event_source/devices/*/uevent
|
|
This file is the standard kernel device interface
|
|
for injecting hotplug events.
|
|
.TP
|
|
.IR /sys/bus/event_source/devices/*/cpumask " (Since Linux 3.7)"
|
|
The
|
|
.I cpumask
|
|
file contains a comma-separated list of integers that
|
|
indicate a representative CPU number for each socket (package)
|
|
on the motherboard.
|
|
This is needed when setting up uncore or northbridge events, as
|
|
those PMUs present socket-wide events.
|
|
.RE
|
|
.SH RETURN VALUE
|
|
.BR perf_event_open ()
|
|
returns the new file descriptor, or \-1 if an error occurred
|
|
(in which case,
|
|
.I errno
|
|
is set appropriately).
|
|
.SH ERRORS
|
|
.TP
|
|
.B E2BIG
|
|
Returned if the perf_event_attr
|
|
.I size
|
|
value is too small
|
|
(smaller than
|
|
.BR PERF_ATTR_SIZE_VER0 ),
|
|
too big (larger than the page size),
|
|
or larger than the kernel supports and the extra bytes are not zero.
|
|
When
|
|
.B E2BIG
|
|
is returned, the perf_event_attr
|
|
.I size
|
|
field is over-written by the kernel to be the size of the structure
|
|
it was expecting.
|
|
.TP
|
|
.B EINVAL
|
|
Returned if the specified event is not available.
|
|
.TP
|
|
.B ENOSPC
|
|
Prior to Linux 3.3, if there was not enough room for the event,
|
|
.B ENOSPC
|
|
was returned.
|
|
Linus did not like this, and this was changed to
|
|
.BR EINVAL .
|
|
.B ENOSPC
|
|
is still returned if you try to read results into
|
|
too small a buffer.
|
|
.SH VERSION
|
|
.BR perf_event_open ()
|
|
was introduced in Linux 2.6.31 but was called
|
|
.BR perf_counter_open ().
|
|
It was renamed in Linux 2.6.32.
|
|
.SH CONFORMING TO
|
|
This
|
|
.BR perf_event_open ()
|
|
system call is Linux-specific
|
|
and should not be used in programs intended to be portable.
|
|
.SH NOTES
|
|
Glibc does not provide a wrapper for this system call; call it using
|
|
.BR syscall (2).
|
|
See the example below.
|
|
|
|
The official way of knowing if
|
|
.BR perf_event_open ()
|
|
support is enabled is checking
|
|
for the existence of the file
|
|
.IR /proc/sys/kernel/perf_event_paranoid .
|
|
.SH BUGS
|
|
The
|
|
.B F_SETOWN_EX
|
|
option to
|
|
.BR fcntl (2)
|
|
is needed to properly get overflow signals in threads.
|
|
This was introduced in Linux 2.6.32.
|
|
|
|
Prior to Linux 2.6.33 (at least for x86) the kernel did not check
|
|
if events could be scheduled together until read time.
|
|
The same happens on all known kernels if the NMI watchdog is enabled.
|
|
This means to see if a given set of events works you have to
|
|
.BR perf_event_open (),
|
|
start, then read before you know for sure you
|
|
can get valid measurements.
|
|
|
|
Prior to Linux 2.6.34 event constraints were not enforced by the kernel.
|
|
In that case, some events would silently return "0" if the kernel
|
|
scheduled them in an improper counter slot.
|
|
|
|
Prior to Linux 2.6.34 there was a bug when multiplexing where the
|
|
wrong results could be returned.
|
|
|
|
Kernels from Linux 2.6.35 to Linux 2.6.39 can quickly crash the kernel if
|
|
"inherit" is enabled and many threads are started.
|
|
|
|
Prior to Linux 2.6.35,
|
|
.B PERF_FORMAT_GROUP
|
|
did not work with attached processes.
|
|
|
|
In older Linux 2.6 versions,
|
|
refreshing an event group leader refreshed all siblings,
|
|
and refreshing with a parameter of 0 enabled infinite refresh.
|
|
This behavior is unsupported and should not be relied on.
|
|
|
|
There is a bug in the kernel code between
|
|
Linux 2.6.36 and Linux 3.0 that ignores the
|
|
"watermark" field and acts as if a wakeup_event
|
|
was chosen if the union has a
|
|
non-zero value in it.
|
|
|
|
From Linux 2.6.31 to Linux 3.4, the
|
|
.B PERF_IOC_FLAG_GROUP
|
|
ioctl argument was broken and would repeatedly operate
|
|
on the event specified rather than iterating across
|
|
all sibling events in a group.
|
|
|
|
Always double-check your results!
|
|
Various generalized events have had wrong values.
|
|
For example, retired branches measured
|
|
the wrong thing on AMD machines until Linux 2.6.35.
|
|
.SH EXAMPLE
|
|
The following is a short example that measures the total
|
|
instruction count of a call to
|
|
.BR printf (3).
|
|
.nf
|
|
|
|
#include <stdlib.h>
|
|
#include <stdio.h>
|
|
#include <unistd.h>
|
|
#include <string.h>
|
|
#include <sys/ioctl.h>
|
|
#include <linux/perf_event.h>
|
|
#include <asm/unistd.h>
|
|
|
|
long
|
|
perf_event_open(struct perf_event_attr *hw_event, pid_t pid,
|
|
int cpu, int group_fd, unsigned long flags)
|
|
{
|
|
int ret;
|
|
|
|
ret = syscall(__NR_perf_event_open, hw_event, pid, cpu,
|
|
group_fd, flags);
|
|
return ret;
|
|
}
|
|
|
|
int
|
|
main(int argc, char **argv)
|
|
{
|
|
struct perf_event_attr pe;
|
|
long long count;
|
|
int fd;
|
|
|
|
memset(&pe, 0, sizeof(struct perf_event_attr));
|
|
pe.type = PERF_TYPE_HARDWARE;
|
|
pe.size = sizeof(struct perf_event_attr);
|
|
pe.config = PERF_COUNT_HW_INSTRUCTIONS;
|
|
pe.disabled = 1;
|
|
pe.exclude_kernel = 1;
|
|
pe.exclude_hv = 1;
|
|
|
|
fd = perf_event_open(&pe, 0, \-1, \-1, 0);
|
|
if (fd == \-1) {
|
|
fprintf(stderr, "Error opening leader %llx\\n", pe.config);
|
|
exit(EXIT_FAILURE);
|
|
}
|
|
|
|
ioctl(fd, PERF_EVENT_IOC_RESET, 0);
|
|
ioctl(fd, PERF_EVENT_IOC_ENABLE, 0);
|
|
|
|
printf("Measuring instruction count for this printf\\n");
|
|
|
|
ioctl(fd, PERF_EVENT_IOC_DISABLE, 0);
|
|
read(fd, &count, sizeof(long long));
|
|
|
|
printf("Used %lld instructions\\n", count);
|
|
|
|
close(fd);
|
|
}
|
|
.fi
|
|
.SH SEE ALSO
|
|
.BR fcntl (2),
|
|
.BR mmap (2),
|
|
.BR open (2),
|
|
.BR prctl (2),
|
|
.BR read (2)
|