perf_event_open.2: Minor fixes

Signed-off-by: Michael Kerrisk <mtk.manpages@gmail.com>
This commit is contained in:
Michael Kerrisk 2013-02-03 12:55:04 +01:00
parent f2b1d7209e
commit 7db515ef59
1 changed files with 215 additions and 141 deletions

View File

@ -203,33 +203,35 @@ struct perf_event_attr {
__u64 sample_type; /* Specifies values included in sample */
__u64 read_format; /* Specifies values returned in read */
__u64 disabled : 1, /* off by default */
inherit : 1, /* children inherit it */
pinned : 1, /* must always be on PMU */
exclusive : 1, /* only group on PMU */
exclude_user : 1, /* don't count user */
exclude_kernel : 1, /* don't count kernel */
__u64 disabled : 1, /* off by default */
inherit : 1, /* children inherit it */
pinned : 1, /* must always be on PMU */
exclusive : 1, /* only group on PMU */
exclude_user : 1, /* don't count user */
exclude_kernel : 1, /* don't count kernel */
exclude_hv : 1, /* don't count hypervisor */
exclude_idle : 1, /* don't count when idle */
mmap : 1, /* include mmap data */
comm : 1, /* include comm data */
freq : 1, /* use freq, not period */
inherit_stat : 1, /* per task counts */
enable_on_exec : 1, /* next exec enables */
task : 1, /* trace fork/exit */
watermark : 1, /* wakeup_watermark */
precise_ip : 2, /* skid constraint */
mmap_data : 1, /* non-exec mmap data */
exclude_idle : 1, /* don't count when idle */
mmap : 1, /* include mmap data */
comm : 1, /* include comm data */
freq : 1, /* use freq, not period */
inherit_stat : 1, /* per task counts */
enable_on_exec : 1, /* next exec enables */
task : 1, /* trace fork/exit */
watermark : 1, /* wakeup_watermark */
precise_ip : 2, /* skid constraint */
mmap_data : 1, /* non-exec mmap data */
sample_id_all : 1, /* sample_type all events */
exclude_host : 1, /* don't count in host */
exclude_guest : 1, /* don't count in guest */
exclude_callchain_kernel : 1, /* exclude kernel callchains */
exclude_callchain_user : 1, /* exclude user callchains */
exclude_host : 1, /* don't count in host */
exclude_guest : 1, /* don't count in guest */
exclude_callchain_kernel : 1,
/* exclude kernel callchains */
exclude_callchain_user : 1,
/* exclude user callchains */
__reserved_1 : 41;
union {
__u32 wakeup_events; /* wakeup every n events */
__u32 wakeup_watermark; /* bytes before wakeup */
__u32 wakeup_watermark; /* bytes before wakeup */
};
__u32 bp_type; /* breakpoint type */
@ -243,10 +245,11 @@ struct perf_event_attr {
__u64 bp_len; /* breakpoint length */
__u64 config2; /* extension of config1 */
};
__u64 branch_sample_type; /* enum perf_branch_sample_type */
__u64 sample_regs_user; /* user regs to dump on samples */
__u32 sample_stack_user; /* size of stack to dump on samples */
__u32 __reserved_2; /* Align to u64. */
__u64 branch_sample_type; /* enum perf_branch_sample_type */
__u64 sample_regs_user; /* user regs to dump on samples */
__u32 sample_stack_user; /* size of stack to dump on
samples */
__u32 __reserved_2; /* Align to u64 */
};
.fi
@ -294,7 +297,7 @@ execution of an instruction address.
.TP
.RB "dynamic PMU"
Since Linux 2.6.39,
.BR perf_event_open()
.BR perf_event_open ()
can support multiple PMUs.
To enable this, a value exported by the kernel can be used in the
.I type
@ -462,7 +465,7 @@ This only happens on some architectures (never on x86).
.BR PERF_COUNT_SW_EMULATION_FAULTS " (Since Linux 2.6.33)"
This counts the number of emulation faults.
The kernel sometimes traps on unimplemented instructions
and emulates them for userspace.
and emulates them for user space.
This can negatively impact performance.
.RE
.RE
@ -501,7 +504,7 @@ value use the following equation:
where
.I perf_hw_cache_id
is one of:
.RS
.RS 4
.TP
.B PERF_COUNT_HW_CACHE_L1D
for measuring Level 1 Data Cache
@ -529,7 +532,7 @@ for measuring local memory accesses
and
.I perf_hw_cache_op_id
is one of
.RS
.RS 4
.TP
.B PERF_COUNT_HW_CACHE_OP_READ
for read accesses
@ -545,7 +548,7 @@ for prefetch accesses
and
.I perf_hw_cache_op_result_id
is one of
.RS
.RS 4
.TP
.B PERF_COUNT_HW_CACHE_RESULT_ACCESS
to measure accesses
@ -622,7 +625,7 @@ order.
Records instruction pointer.
.TP
.B PERF_SAMPLE_TID
Records the process and thread ids.
Records the process and thread IDs.
.TP
.B PERF_SAMPLE_TIME
Records a timestamp.
@ -657,7 +660,8 @@ Records additional data, if applicable.
Usually returned by tracepoint events.
.TP
.BR PERF_SAMPLE_BRANCH_STACK " (Since Linux 3.4)"
Records the branch stack. See branch_sample_type.
Records the branch stack.
See branch_sample_type.
.TP
.BR PERF_SAMPLE_REGS_USER " (Since Linux 3.7)"
Records the current register state.
@ -671,7 +675,7 @@ Records the current register state.
This field specifies the format of the data returned by
.BR read (2)
on a
.BR perf_event_open()
.BR perf_event_open ()
file descriptor.
.RS
.TP
@ -915,7 +919,7 @@ count when we read or write the memory location
count when we execute code at the memory location
.LP
The values can be combined via a bitwsie or, but the
The values can be combined via a bitwise or, but the
combination of
.B HW_BREAKPOINT_R
or
@ -998,7 +1002,9 @@ User, kernel, and hv
.TP
.IR "sample_regs_user" " (Since Linux 3.7)"
This defines the set of user registers to dump on samples.
See asm/perf_regs.h.
See
.\" FIXME: The following reference seems to be not quite right:
.IR asm/perf_regs.h .
.TP
.IR "sample_stack_user" " (Since Linux 3.7)"
@ -1008,12 +1014,14 @@ This defines the size of the user stack to dump on sample.
.SS "Reading Results"
Once a
.BR perf_event_open()
.BR perf_event_open ()
file descriptor has been opened, the values
of the events can be read from the file descriptor.
The values that are there are specified by the
.I read_format
field in the attr structure at open time.
field in the
.I attr
structure at open time.
If you attempt to read into a buffer that is not big enough to hold the
data
@ -1097,7 +1105,7 @@ was specified in read_format.
.SS "MMAP Layout"
When using
.BR perf_event_open()
.BR perf_event_open ()
in sampled mode, asynchronous events
(like counter overflow or
.B PROT_EXEC
@ -1120,13 +1128,13 @@ The structure of the first metadata mmap page is as follows:
.in +4n
.nf
struct perf_event_mmap_page {
__u32 version; /* version number of this structure */
__u32 version; /* version number of this structure */
__u32 compat_version; /* lowest version this is compat with */
__u32 lock; /* seqlock for synchronization */
__u32 index; /* hardware counter identifier */
__s64 offset; /* add to hardware counter value */
__u64 time_enabled; /* time event active */
__u64 time_running; /* time event on CPU */
__u32 lock; /* seqlock for synchronization */
__u32 index; /* hardware counter identifier */
__s64 offset; /* add to hardware counter value */
__u64 time_enabled; /* time event active */
__u64 time_running; /* time event on CPU */
union {
__u64 capabilities;
__u64 cap_usr_time : 1,
@ -1136,9 +1144,9 @@ struct perf_event_mmap_page {
__u16 time_shift;
__u32 time_mult;
__u64 time_offset;
__u64 __reserved[120]; /* Pad to 1k */
__u64 __reserved[120]; /* Pad to 1k */
__u64 data_head; /* head in the data section */
__u64 data_tail; /* user-space written tail */
__u64 data_tail; /* user-space written tail */
}
.fi
.in
@ -1149,7 +1157,7 @@ The following looks at the fields in the
.I perf_event_mmap_page
structure in more detail.
.RS
.RS 4
.TP
.I version
@ -1248,7 +1256,7 @@ count += pmc;
If
.IR cap_usr_time ,
these fields can be used to compute the time
delta since time_enabled (in ns) using rdtsc or similar.
delta since time_enabled (in nanoseconds) using rdtsc or similar.
.nf
u64 quot, rem;
@ -1259,7 +1267,13 @@ delta since time_enabled (in ns) using rdtsc or similar.
((rem * time_mult) >> time_shift);
.fi
Where time_offset,time_mult,time_shift and cyc are read in the
Where
.IR time_offset ,
.IR time_mult ,
.IR time_shift ,
and
.IR cyc
are read in the
seqcount loop described above.
This delta can then be added to
enabled and possibly running (if idx), improving the scaling:
@ -1276,8 +1290,8 @@ enabled and possible running (if idx), improving the scaling:
.TP
.I data_head
This points to the head of the data section.
The value continuously increases, it does not wrap. The value
needs to be manually wrapped by the size of the mmap buffer
The value continuously increases, it does not wrap.
The value needs to be manually wrapped by the size of the mmap buffer
before accessing the samples.
On SMP-capable platforms, after reading the data_head value,
@ -1287,8 +1301,9 @@ user-space should issue an rmb().
.I data_tail
When the mapping is
.BR PROT_WRITE ,
the data_tail value should be written by
userspace to reflect the last read data.
the
.I data_tail
value should be written by user space to reflect the last read data.
In this case the kernel will not over-write unread data.
.RE
@ -1303,7 +1318,8 @@ have the sample_type selected fields related to where/when (identity)
an event took place (TID, TIME, ID, CPU, STREAM_ID) described in
.B PERF_RECORD_SAMPLE
below, it will be stashed just after the
perf_event_header and the fields already present for the existing
.I perf_event_header
and the fields already present for the existing
fields, i.e., at the end of the payload.
That way a newer perf.data
file will be supported by older perf tools, with these new optional
@ -1336,12 +1352,12 @@ depend on the
selected as shown.
.RS
.TP
.TP 4
.B PERF_RECORD_MMAP
The MMAP events record the
.B PROT_EXEC
mappings so that we can correlate
userspace IPs to code.
user space IPs to code.
They have the following structure:
.in +4n
@ -1461,96 +1477,124 @@ This record indicates a sample.
.nf
struct {
struct perf_event_header header;
u64 ip; /* if PERF_SAMPLE_IP */
u32 pid, tid; /* if PERF_SAMPLE_TID */
u64 time; /* if PERF_SAMPLE_TIME */
u64 addr; /* if PERF_SAMPLE_ADDR */
u64 id; /* if PERF_SAMPLE_ID */
u64 stream_id; /* if PERF_SAMPLE_STREAM_ID */
u32 cpu, res; /* if PERF_SAMPLE_CPU */
u64 period; /* if PERF_SAMPLE_PERIOD */
u64 ip; /* if PERF_SAMPLE_IP */
u32 pid, tid; /* if PERF_SAMPLE_TID */
u64 time; /* if PERF_SAMPLE_TIME */
u64 addr; /* if PERF_SAMPLE_ADDR */
u64 id; /* if PERF_SAMPLE_ID */
u64 stream_id; /* if PERF_SAMPLE_STREAM_ID */
u32 cpu, res; /* if PERF_SAMPLE_CPU */
u64 period; /* if PERF_SAMPLE_PERIOD */
struct read_format v; /* if PERF_SAMPLE_READ */
u64 nr; /* if PERF_SAMPLE_CALLCHAIN */
u64 ips[nr]; /* if PERF_SAMPLE_CALLCHAIN */
u32 size; /* if PERF_SAMPLE_RAW */
char data[size]; /* if PERF_SAMPLE_RAW */
u64 bnr; /* if PERF_SAMPLE_BRANCH_STACK */
struct perf_branch_entry
lbr[bnr]; /* if PERF_SAMPLE_BRANCH_STACK */
u64 abi; /* if PERF_SAMPLE_REGS_USER */
u64 regs[weight(mask)]; /* if PERF_SAMPLE_REGS_USER */
u64 size; /* if PERF_SAMPLE_STACK_USER */
char data[size]; /* if PERF_SAMPLE_STACK_USER */
u64 dyn_size; /* if PERF_SAMPLE_STACK_USER */
u64 nr; /* if PERF_SAMPLE_CALLCHAIN */
u64 ips[nr]; /* if PERF_SAMPLE_CALLCHAIN */
u32 size; /* if PERF_SAMPLE_RAW */
char data[size]; /* if PERF_SAMPLE_RAW */
u64 bnr; /* if PERF_SAMPLE_BRANCH_STACK */
struct perf_branch_entry lbr[bnr];
/* if PERF_SAMPLE_BRANCH_STACK */
u64 abi; /* if PERF_SAMPLE_REGS_USER */
u64 regs[weight(mask)];
/* if PERF_SAMPLE_REGS_USER */
u64 size; /* if PERF_SAMPLE_STACK_USER */
char data[size]; /* if PERF_SAMPLE_STACK_USER */
u64 dyn_size; /* if PERF_SAMPLE_STACK_USER */
};
.fi
.RS
.TP
.I ip
If PERF_SAMPLE_IP is enabled then a 64-bit instruction
If
.B PERF_SAMPLE_IP
is enabled, then a 64-bit instruction
pointer value is included.
.TP
.IR pid , tid
If PERF_SAMPLE_TID is enabled then a 32-bit process id
and 32-bit thread id are included.
.IR pid ", " tid
If
.B PERF_SAMPLE_TID
is enabled, then a 32-bit process ID
and 32-bit thread ID are included.
.TP
.I time
If PERF_SAMPLE_TIME is enabled then a 64-bit timestamp
If
.B PERF_SAMPLE_TIME
is enabled, then a 64-bit timestamp
is included.
This is obtained via local_clock() which is a hardware timestamp
if available and the jiffies value if not.
.TP
.I addr
If PERF_SAMPLE_ADDR is enabled than a 64-bit address is included.
If
.B PERF_SAMPLE_ADDR
is enabled, then a 64-bit address is included.
This is usually the address of a tracepoint,
breakpoint, or software event; otherwise the value is 0.
.TP
.I id
If PERF_SAMPLE_ID is enabled a 64-bit unique ID is included.
If
.B PERF_SAMPLE_ID
is enabled, a 64-bit unique ID is included.
If the event is a member of an event group, the group leader ID is returned.
This ID is the same as the one returned by PERF_FORMAT_ID.
This ID is the same as the one returned by
.BR PERF_FORMAT_ID .
.TP
.I stream_id
If PERF_SAMPLE_STREAM_ID is enabled a 64-bit unique ID is included.
If
.B PERF_SAMPLE_STREAM_ID
is enabled, a 64-bit unique ID is included.
Unlike
.B PERF_SAMPLE_ID
the actual ID is returned, not the group leader.
This ID is the same as the one returned by PERF_FORMAT_ID.
This ID is the same as the one returned by
.BR PERF_FORMAT_ID .
.TP
.IR cpu , res
If PERF_SAMPLE_CPU is enabled this is a 32-bit value indicating
.IR cpu ", " res
If
.B PERF_SAMPLE_CPU
is enabled, this is a 32-bit value indicating
which CPU was being used, in addition to a reserved (unused)
32-bit value.
.TP
.I period
If PERF_SAMPLE_PERIOD is enabled a 64-bit value indicating
If
.B PERF_SAMPLE_PERIOD
is enabled, a 64-bit value indicating
the current sampling period is written.
.TP
.I v
If PERF_SAMPLE_READ is enabled a structure of type read_format
If
.B PERF_SAMPLE_READ
is enabled, a structure of type read_format
is included which has values for all events in the event group.
The values included depend on the
.I read_format
value used at perf_event_open() time.
value used at
.BR perf_event_open ()
time.
.TP
.IR nr , ips[nr]
If PERF_SAMPLE_CALLCHAIN is enabled then a 64-bit number is included
.IR nr ", " ips[nr]
If
.B PERF_SAMPLE_CALLCHAIN
is enabled, then a 64-bit number is included
which indicates how many following 64-bit instruction pointers will
follow. This is the current callchain.
follow.
This is the current callchain.
.TP
.IR size , data
If PERF_SAMPLE_RAW is enabled then a 32-bit value indicating size
.IR size ", " data
If
.B PERF_SAMPLE_RAW
is enabled, then a 32-bit value indicating size
is included followed by an array of 8-bit values of length size.
The values are padded with 0 to have 64-bit alignment.
@ -1560,26 +1604,35 @@ of its content, it may vary depending
on event, hardware, and kernel version.
.TP
.IR bnr , lbr[bnr]
If PERF_SAMPLE_BRANCH_STACK is enabled then a 64-bit value indicating
the number of records is included, followed by bnr perf_branch_entry
structures. These structures have from, to, and flags values indicating
.IR bnr ", " lbr[bnr]
If
.B PERF_SAMPLE_BRANCH_STACK
is enabled, then a 64-bit value indicating
the number of records is included, followed by
.I bnr
.I perf_branch_entry
structures.
These structures have from, to, and flags values indicating
the from and to addresses from the branches on the callstack.
.TP
.IR abi , regs[weight(mask)]
If PERF_SAMPLE_REGS_USER is enabled then
.IR abi ", " regs[weight(mask)]
If
.B PERF_SAMPLE_REGS_USER
is enabled, then
[to be documented].
The
.I abi
field is one of
.BR PERF_SAMPLE_REGS_ABI_NONE ", " PERF_SAMPLE_REGS_ABI_32 " or "
.BR PERF_SAMPLE_REGS_ABI_64 ". "
.BR PERF_SAMPLE_REGS_ABI_64 .
.TP
.IR size , data[size] , dyn_size
If PERF_SAMPLE_STACK_USER is enabled then
.IR size ", " data[size] ", " dyn_size
If
.B PERF_SAMPLE_STACK_USER
is enabled, then
[to be documented].
.RE
@ -1618,7 +1671,7 @@ Sample happened in the guest kernel.
Sample happened in guest user code.
.RE
In addition one of the following bits can be set:
In addition, one of the following bits can be set:
.RS
.TP
.B PERF_RECORD_MISC_EXACT_IP
@ -1664,18 +1717,24 @@ or
.I wakeup_watermark
value that will generate a signal if a certain number of samples
or bytes have been written to the mmap ring buffer.
In this case a signal of type POLL_IN is sent.
In this case a signal of type
.B POLL_IN
is sent.
The other way is by use of the
.I PERF_EVENT_IOC_REFRESH
.B PERF_EVENT_IOC_REFRESH
ioctl.
This ioctl adds to a counter that decrements each time the event overflows.
When non-zero, a POLL_IN signal is sent on overflow, but
once the value reaches 0, a signal is sent of type POLL_HUP and
When non-zero, a
.B POLL_IN
signal is sent on overflow, but
once the value reaches 0, a signal is sent of type
.B POLL_HUP
and
the underlying event is disabled.
Note: on newer kernels (definitely noticed with 3.2)
.\" FIXME : Find out when this was introduced
.\" FIXME(Vince) : Find out when this was introduced
a signal is provided for every overflow, even if
.I wakeup_events
is not set.
@ -1696,18 +1755,20 @@ to calculate event values can be found in that section.
.SS "perf_event ioctl calls"
.PP
Various ioctls act on
.BR perf_event_open()
.BR perf_event_open ()
file descriptors
.TP
.B PERF_EVENT_IOC_ENABLE
Enables the individual event or event group specified by the fd.
Enables the individual event or event group specified by the
file descriptor argument.
The ioctl argument is ignored.
.TP
.B PERF_EVENT_IOC_DISABLE
Disables the individual counter or event group specified by the fd.
Disables the individual counter or event group specified by the
file descriptor argument.
Enabling or disabling the leader of a group enables or disables the
entire group; that is, while the group leader is disabled, none of the
@ -1725,14 +1786,18 @@ to enable a counter for a number of overflows specified by the argument,
after which it is disabled.
Subsequent calls of this ioctl add the argument value to the current
count.
A signal with POLL_IN set will happen on each overflow until the
count reaches 0; when that happens a signal with POLL_HUP set is
sent and the event is disabled.
A signal with
.B POLL_IN
set will happen on each overflow until the
count reaches 0; when that happens a signal with
.B POLL_HUP
set is sent and the event is disabled.
Using an argument of 0 is considered undefined behavior.
.TP
.B PERF_EVENT_IOC_RESET
Reset the event count specified by the fd to zero.
Reset the event count specified by the
file descriptor argument to zero.
This only resets the counts; there is no way to reset the
multiplexing
.I time_enabled
@ -1783,18 +1848,18 @@ the group leaders, not any other members in the groups.
.SS perf_event related configuration files
Files in /proc/sys/kernel/
Files in
.I /proc/sys/kernel/
.RS
.RS 4
.TP
.I
/proc/sys/kernel/perf_event_paranoid
.I /proc/sys/kernel/perf_event_paranoid
The
.I perf_event_paranoid
file can be set to restrict access to the performance counters.
2 - only allow userspace measurements
2 - only allow user-space measurements
1 - (default) allow both kernel and user measurements
@ -1805,32 +1870,35 @@ file can be set to restrict access to the performance counters.
The existence of the
.I perf_event_paranoid
file is the official method for determining if a kernel supports
.BR perf_event_open().
.BR perf_event_open ().
.TP
.I /proc/sys/kernel/perf_event_max_sample_rate
This sets the maximum sample rate. Setting this too high can allow
This sets the maximum sample rate.
Setting this too high can allow
users to sample at a rate that impacts overall machine performance
and potentially lock up the machine. The default value is
and potentially lock up the machine.
The default value is
100000 (samples per second).
.TP
.I /proc/sys/kernel/perf_event_mlock_kb
Maximum number of pages an unprivledged user can mlock (2) .
Maximum number of pages an unprivileged user can
.BR mlock (2).
The default is 516 (kB).
.RE
Files in /sys/bus/event_source/devices/
Files in
.I /sys/bus/event_source/devices/
.RS 4
Since Linux 2.6.34 the kernel supports having multiple PMUs
available for monitoring.
Information on how to program these PMUs can be found under
.IR /sys/bus/event_source/devices/ .
Each subdirectory corresponds to a different PMU.
.RS
.TP
.I /sys/bus/event_source/devices/*/type
This contains an integer that can be used in the
@ -1894,25 +1962,28 @@ It was renamed in Linux 2.6.32.
.SH CONFORMING TO
This call is specific to Linux
This
.BR perf_event_open ()
system call is Linux-specific
and should not be used in programs intended to be portable.
.SH NOTES
Glibc does not provide a wrapper for this system call; call it using
.BR syscall (2).
See the example below.
The official way of knowing if
.BR perf_event_open()
.BR perf_event_open ()
support is enabled is checking
for the existence of the file
.I /proc/sys/kernel/perf_event_paranoid
.IR /proc/sys/kernel/perf_event_paranoid .
.SH BUGS
The
.B F_SETOWN_EX
option to
.IR fcntl (2)
.BR fcntl (2)
is needed to properly get overflow signals in threads.
This was introduced in Linux 2.6.32.
@ -1949,14 +2020,15 @@ Linux 2.6.36 and Linux 3.0 that ignores the
was chosen if the union has a
non-zero value in it.
Always double-check your results! Various generalized events
have had wrong values.
Always double-check your results!
Various generalized events have had wrong values.
For example, retired branches measured
the wrong thing on AMD machines until Linux 2.6.35.
.SH EXAMPLE
The following is a short example that measures the total
instruction count of a call to printf().
instruction count of a call to
.BR printf (3).
.nf
#include <stdlib.h>
@ -1967,13 +2039,14 @@ instruction count of a call to printf().
#include <linux/perf_event.h>
#include <asm/unistd.h>
long perf_event_open( struct perf_event_attr *hw_event, pid_t pid,
int cpu, int group_fd, unsigned long flags )
long
perf_event_open(struct perf_event_attr *hw_event, pid_t pid,
int cpu, int group_fd, unsigned long flags)
{
int ret;
ret = syscall( __NR_perf_event_open, hw_event, pid, cpu,
group_fd, flags );
ret = syscall(__NR_perf_event_open, hw_event, pid, cpu,
group_fd, flags);
return ret;
}
@ -1995,8 +2068,9 @@ main(int argc, char **argv)
pe.exclude_hv = 1;
fd = perf_event_open(&pe, 0, \-1, \-1, 0);
if (fd < 0) {
if (fd == \-1) {
fprintf(stderr, "Error opening leader %llx\\n", pe.config);
exit(EXIT_FAILURE);
}
ioctl(fd, PERF_EVENT_IOC_RESET, 0);