perf_event_open(2) System Calls Manual perf_event_open(2) perf_event_open - set up performance monitoring LIBRARY Standard C library (libc, -lc) #include <linux/perf_event.h> /* PERF_* */ #include <linux/hw_breakpoint.h> /* HW_* */ #include <sys/syscall.h> /* SYS_* */ #include <unistd.h> int syscall(SYS_perf_event_open, struct perf_event_attr *attr, pid_t pid, int cpu, int group_fd, unsigned long flags); Note: glibc provides no wrapper for perf_event_open(), necessitating the use of syscall(2). Given a list of parameters, perf_event_open() returns a file descriptor, for use in subsequent system calls (read(2), mmap(2), prctl(2), fcntl(2), etc.). perf_event_open() , . ; . : ioctl(2) prctl(2). , , . : (counting) (sampled). . , read(2). , mmap(2). pid cpu : pid == 0 cpu == -1 / . pid == 0 cpu >= 0 / , . pid > 0 cpu == -1 / . pid > 0 cpu >= 0 / , . pid == -1 cpu >= 0 This measures all processes/threads on the specified CPU. This requires CAP_PERFMON (since Linux 5.8) or CAP_SYS_ADMIN capability or a /proc/sys/kernel/perf_event_paranoid value of less than 1. pid == -1 cpu == -1 . When pid is greater than zero, permission to perform this system call is governed by CAP_PERFMON (since Linux 5.9) and a ptrace access mode PTRACE_MODE_READ_REALCREDS check on older Linux versions; see ptrace(2). The group_fd argument allows event groups to be created. An event group has one event which is the group leader. The leader is created first, with group_fd = -1. The rest of the group members are created with subsequent perf_event_open() calls with group_fd being set to the file descriptor of the group leader. (A single event on its own is created with group_fd = -1 and is considered to be a group with only 1 member.) An event group is scheduled onto the CPU as a unit: it will be put onto the CPU only if all of the events in the group can be put onto the CPU. This means that the values of the member events can be meaningfully compared --added, divided (to get ratios), and so on-- with each other, since they have counted events for the same set of executed instructions. 
flags : PERF_FLAG_FD_CLOEXEC ( Linux 3.14) close-on-exec , execve(2). close-on-exec , fcntl(2), , perf_event_open() fcntl(2) fork(2) execve(2). PERF_FLAG_FD_NO_GROUP group_fd, PERF_FLAG_FD_OUTPUT. PERF_FLAG_FD_OUTPUT ( , Linux 2.6.35) mmap , group_fd. PERF_FLAG_PID_CGROUP ( Linux 2.6.39) . -- , (, . .). , , (cgroup). cgroup , cgroupfs. , cgroup test, /dev/cgroup/test (, cgroupfs /dev/cgroup) pid. cgroup . perf_event_attr . struct perf_event_attr { __u32 type; /* Type of event */ __u32 size; /* Size of attribute structure */ __u64 config; /* Type-specific configuration */ union { __u64 sample_period; /* Period of sampling */ __u64 sample_freq; /* Frequency of sampling */ }; __u64 sample_type; /* Specifies values included in sample */ __u64 read_format; /* Specifies values returned in read */ __u64 disabled : 1, /* off by default */ inherit : 1, /* children inherit it */ pinned : 1, /* must always be on PMU */ exclusive : 1, /* only group on PMU */ exclude_user : 1, /* don't count user */ exclude_kernel : 1, /* don't count kernel */ exclude_hv : 1, /* don't count hypervisor */ exclude_idle : 1, /* don't count when idle */ mmap : 1, /* include mmap data */ comm : 1, /* include comm data */ freq : 1, /* use freq, not period */ inherit_stat : 1, /* per task counts */ enable_on_exec : 1, /* next exec enables */ task : 1, /* trace fork/exit */ watermark : 1, /* wakeup_watermark */ precise_ip : 2, /* skid constraint */ mmap_data : 1, /* non-exec mmap data */ sample_id_all : 1, /* sample_type all events */ exclude_host : 1, /* don't count in host */ exclude_guest : 1, /* don't count in guest */ exclude_callchain_kernel : 1, /* exclude kernel callchains */ exclude_callchain_user : 1, /* exclude user callchains */ mmap2 : 1, /* include mmap with inode data */ comm_exec : 1, /* flag comm events that are due to exec */ use_clockid : 1, /* use clockid for time fields */ context_switch : 1, /* context switch data */ write_backward : 1, /* Write ring buffer from end to beginning 
*/ namespaces : 1, /* include namespaces data */ ksymbol : 1, /* include ksymbol events */ bpf_event : 1, /* include bpf events */ aux_output : 1, /* generate AUX records instead of events */ cgroup : 1, /* include cgroup events */ text_poke : 1, /* include text poke events */ build_id : 1, /* use build id in mmap2 events */ inherit_thread : 1, /* children only inherit */ /* if cloned with CLONE_THREAD */ remove_on_exec : 1, /* event is removed from task on exec */ sigtrap : 1, /* send synchronous SIGTRAP on event */ __reserved_1 : 26; union { __u32 wakeup_events; /* wakeup every n events */ __u32 wakeup_watermark; /* bytes before wakeup */ }; __u32 bp_type; /* breakpoint type */ union { __u64 bp_addr; /* breakpoint address */ __u64 kprobe_func; /* for perf_kprobe */ __u64 uprobe_path; /* for perf_uprobe */ __u64 config1; /* extension of config */ }; union { __u64 bp_len; /* breakpoint length */ __u64 kprobe_addr; /* with kprobe_func == NULL */ __u64 probe_offset; /* for perf_[k,u]probe */ __u64 config2; /* extension of config1 */ }; __u64 branch_sample_type; /* enum perf_branch_sample_type */ __u64 sample_regs_user; /* user regs to dump on samples */ __u32 sample_stack_user; /* size of stack to dump on samples */ __s32 clockid; /* clock to use for time fields */ __u64 sample_regs_intr; /* regs to dump on samples */ __u32 aux_watermark; /* aux bytes before wakeup */ __u16 sample_max_stack; /* max frames in callchain */ __u16 __reserved_2; /* align to u64 */ __u32 aux_sample_size; /* max aux sample size */ __u32 __reserved_3; /* align to u64 */ __u64 sig_data; /* user data for sigtrap */ }; perf_event_attr: type . : PERF_TYPE_HARDWARE <<>> , . config. PERF_TYPE_SOFTWARE , (, ). PERF_TYPE_TRACEPOINT , . PERF_TYPE_HW_CACHE . , config. PERF_TYPE_RAW <<>>, config. PERF_TYPE_BREAKPOINT ( Linux 2.6.33) , . / , . PMU Linux 2.6.38, perf_event_open() PMU. PMU, type, . sysfs: PMU /sys/bus/event_source/devices. type, , type. , /sys/bus/event_source/devices/cpu/type PMU , , 4. 
kprobe uprobe ( Linux 4.17) PMU kprobe/uprobe , perf_event_open. kprobe/uprobe . kprobe_func, uprobe_path, kprobe_addr probe_offset. size perf_event_attr . sizeof(struct perf_event_attr) . PERF_ATTR_SIZE_VER0 64; . PERF_ATTR_SIZE_VER1 72, Linux 2.6.33 . PERF_ATTR_SIZE_VER2 80, Linux 3.4 . PERF_ATTR_SIZE_VER3 96, Linux 3.7 sample_regs_user sample_stack_user. PERF_ATTR_SIZE_VER4 104, Linux 3.19 sample_regs_intr. PERF_ATTR_SIZE_VER5 112, in Linux 4.1 aux_watermark. config type. config1 config2 , 64 . . config, type. config type. type PERF_TYPE_HARDWARE, . . config : PERF_COUNT_HW_CPU_CYCLES . , . PERF_COUNT_HW_INSTRUCTIONS (retired instructions). , , , . PERF_COUNT_HW_CACHE_REFERENCES . , , -. ; , . PERF_COUNT_HW_CACHE_MISSES . , ; PERF_COUNT_HW_CACHE_REFERENCES . PERF_COUNT_HW_BRANCH_INSTRUCTIONS . Linux 2.6.35 AMD. PERF_COUNT_HW_BRANCH_MISSES . PERF_COUNT_HW_BUS_CYCLES , . PERF_COUNT_HW_STALLED_CYCLES_FRONTEND ( Linux 3.0) . PERF_COUNT_HW_STALLED_CYCLES_BACKEND ( Linux 3.0) . PERF_COUNT_HW_REF_CPU_CYCLES ( Linux 3.3) ; . type PERF_TYPE_SOFTWARE, , . config : PERF_COUNT_SW_CPU_CLOCK , , . PERF_COUNT_SW_TASK_CLOCK . PERF_COUNT_SW_PAGE_FAULTS . PERF_COUNT_SW_CONTEXT_SWITCHES . Linux 2.6.34, , . PERF_COUNT_SW_CPU_MIGRATIONS . PERF_COUNT_SW_PAGE_FAULTS_MIN . -. PERF_COUNT_SW_PAGE_FAULTS_MAJ . -. PERF_COUNT_SW_ALIGNMENT_FAULTS ( Linux 2.6.33) . , ; , . ( x86 -- ). PERF_COUNT_SW_EMULATION_FAULTS ( Linux 2.6.33) . . . PERF_COUNT_SW_DUMMY ( Linux 3.12) , . This is a placeholder event that counts nothing. , mmap comm, . . PERF_COUNT_SW_BPF_OUTPUT (since Linux 4.4) This is used to generate raw sample data from BPF. BPF programs can write to this event using bpf_perf_event_output helper. PERF_COUNT_SW_CGROUP_SWITCHES (since Linux 5.13) This counts context switches to a task in a different cgroup. In other words, if the next task is in the same cgroup, it won't count the switch. type PERF_TYPE_TRACEPOINT, . , config, debugfs tracing/events/*/*/id, ftrace . 
type PERF_TYPE_HW_CACHE, . config : config = (perf_hw_cache_id) | (perf_hw_cache_op_id << 8) | (perf_hw_cache_op_result_id << 16); perf_hw_cache_id : PERF_COUNT_HW_CACHE_L1D 1- PERF_COUNT_HW_CACHE_L1I 1- PERF_COUNT_HW_CACHE_LL PERF_COUNT_HW_CACHE_DTLB TLB PERF_COUNT_HW_CACHE_ITLB TLB PERF_COUNT_HW_CACHE_BPU PERF_COUNT_HW_CACHE_NODE ( Linux 3.1) perf_hw_cache_op_id : PERF_COUNT_HW_CACHE_OP_READ PERF_COUNT_HW_CACHE_OP_WRITE PERF_COUNT_HW_CACHE_OP_PREFETCH perf_hw_cache_op_result_id : PERF_COUNT_HW_CACHE_RESULT_ACCESS PERF_COUNT_HW_CACHE_RESULT_MISS type PERF_TYPE_RAW, <<>> config. , <<>> . ; (, Intel Volume 3B AMD BIOS ). perf_event_open() libpfm4. type PERF_TYPE_BREAKPOINT, config 0. . type kprobe uprobe, retprobe ( 0 config, /sys/bus/event_source/devices/[k,u]probe/format/retprobe) kretprobe/uretprobe. kprobe_func, uprobe_path, kprobe_addr probe_offset. kprobe_func uprobe_path kprobe_addr probe_offset kprobe/uprobe PMU kprobe uprobe. kprobe: kprobe_func probe_offset, kprobe_addr kprobe_func NULL. uprobe: uprobe_path probe_offset. sample_period sample_freq <<>> N , N sample_period. sample_period > 0. , mmap. sample_type . , , sample_freq. freq. , . . sample_type . , mmap(2). << MMAP>> ; enum perf_event_sample_format. PERF_SAMPLE_IP . PERF_SAMPLE_TID . PERF_SAMPLE_TIME . PERF_SAMPLE_ADDR ( ). PERF_SAMPLE_READ , . PERF_SAMPLE_CALLCHAIN ( ). PERF_SAMPLE_ID . PERF_SAMPLE_CPU . PERF_SAMPLE_PERIOD . PERF_SAMPLE_STREAM_ID . PERF_SAMPLE_ID , . PERF_FORMAT_ID. PERF_SAMPLE_RAW , . . PERF_SAMPLE_BRANCH_STACK ( Linux 3.4) , (, Intel Last Branch Record). . branch_sample_type , . PERF_SAMPLE_REGS_USER ( Linux 3.7) ( ). PERF_SAMPLE_STACK_USER ( Linux 3.7) . PERF_SAMPLE_WEIGHT ( Linux 3.10) , , . . PERF_SAMPLE_DATA_SRC ( Linux 3.10) : , . , . PERF_SAMPLE_IDENTIFIER ( Linux 3.12) SAMPLE_ID , ( ) ( ). , sample_type. , SAMPLE_ID, ( ). PERF_SAMPLE_IDENTIFIER , SAMPLE_ID , SAMPLE_ID . PERF_SAMPLE_TRANSACTION ( Linux 3.13) (, Intel TSX). precise_ip 0 . , perf_event, , ( ). 
PERF_SAMPLE_REGS_INTR ( Linux 3.19) , sample_regs_intr. PERF_SAMPLE_REGS_USER , . (PEBS Intel x86) precise_ip , , . PERF_SAMPLE_PHYS_ADDR ( Linux 4.13) Records physical address of data like in PERF_SAMPLE_ADDR. PERF_SAMPLE_CGROUP ( Linux 5.7) Records (perf_event) cgroup ID of the process. This corresponds to the id field in the PERF_RECORD_CGROUP event. PERF_SAMPLE_DATA_PAGE_SIZE (since Linux 5.11) Records page size of data like in PERF_SAMPLE_ADDR. PERF_SAMPLE_CODE_PAGE_SIZE (since Linux 5.11) Records page size of ip like in PERF_SAMPLE_IP. PERF_SAMPLE_WEIGHT_STRUCT (since Linux 5.12) Records hardware provided weight values like in PERF_SAMPLE_WEIGHT, but it can represent multiple values in a struct. This shares the same space as PERF_SAMPLE_WEIGHT, so users can apply either of those, not both. It has the following format and the meaning of each field is dependent on the hardware implementation. union perf_sample_weight { u64 full; /* PERF_SAMPLE_WEIGHT */ struct { /* PERF_SAMPLE_WEIGHT_STRUCT */ u32 var1_dw; u16 var2_w; u16 var3_w; }; }; read_format , read(2) perf_event_open(). PERF_FORMAT_TOTAL_TIME_ENABLED 64- time_enabled. , PMU . PERF_FORMAT_TOTAL_TIME_RUNNING 64- time_running. , PMU . PERF_FORMAT_ID 64- , . PERF_FORMAT_GROUP . PERF_FORMAT_LOST (since Linux 6.0) Adds a 64-bit value that is the number of lost samples for this event. This would be only meaningful when sample_period or sample_freq is set. disabled disabled , . , ioctl(2), prctl(2) enable_on_exec. , disabled 1, disabled 0. disabled 0, , . inherit inherit , , . , ( ). read_format, PERF_FORMAT_GROUP. pinned pinned , , . . (, ), <<>>, (. ., read(2) 0) , . exclusive exclusive , , . , PMU, . , exclusive . , ( NMI Watchdog Timer). exclude_user , , . exclude_kernel , , . exclude_hv , , . PMU, ( POWER). . exclude_idle , . , . mmap mmap PERF_RECORD_MMAP mmap(2) PROT_EXEC. , (, ) , . comm comm , execve(2) prctl(PR_SET_NAME), /proc/self/comm. comm_exec (, Linux 3.16), PERF_RECORD_MISC_COMM_EXEC, execve(2) . 
freq , sample_freq, sample_period. inherit_stat . , inherit. enable_on_exec , execve(2). task , fork/exit. watermark , wakeup_watermark. , wakeup_events . precise_ip ( Linux 2.6.35) (skid). -- , . : , , . : 0 SAMPLE_IP . 1 SAMPLE_IP . 2 SAMPLE_IP . 3 SAMPLE_IP 0 (skid). I PERF_RECORD_MISC_EXACT_IP(). mmap_data ( Linux 2.6.36) mmap. PERF_RECORD_MMAP mmap(2), PROT_EXEC (, SysV). sample_id_all ( Linux 2.6.38) , TID, TIME, ID, STREAM_ID -PERF_RECORD_SAMPLE, sample_type. PERF_SAMPLE_IDENTIFIER, ID . id . -: struct sample_id { { u32 pid, tid; } /* PERF_SAMPLE_TID */ { u64 time; } /* PERF_SAMPLE_TIME */ { u64 id; } /* PERF_SAMPLE_ID */ { u64 stream_id;} /* PERF_SAMPLE_STREAM_ID */ { u32 cpu, res; } /* PERF_SAMPLE_CPU */ { u64 id; } /* PERF_SAMPLE_IDENTIFIER */ }; exclude_host ( Linux 3.2) , , VM (. . I ioctl(2) KVM_RUN), , . ; , . x86. exclude_guest ( Linux 3.2) , , VM (. . I ioctl(2) KVM_RUN), , . ; , . x86. exclude_callchain_kernel ( Linux 3.7) . exclude_callchain_user ( Linux 3.7) . mmap2 ( Linux 3.16) mmap, , . mmap. comm_exec ( Linux 3.16) , . , comm, PERF_RECORD_MISC_COMM_EXEC misc comm, , execve(2). . use_clockid ( Linux 4.1) clockid Linux, . , . context_switch ( Linux 4.3) PERF_RECORD_SWITCH . PERF_RECORD_SWITCH_CPU_WIDE CPU-wide. . , perf_event_paranoid. write_backward ( Linux 4.6) This causes the ring buffer to be written from the end to the beginning. This is to support reading from an overwritable ring buffer. namespaces ( Linux 4.11) This enables the generation of PERF_RECORD_NAMESPACES records when a task enters a new namespace. Each namespace has a combination of device and inode numbers. ksymbol ( Linux 5.0) This enables the generation of PERF_RECORD_KSYMBOL records when new kernel symbols are registered or unregistered. This is useful for analyzing dynamic kernel functions like eBPF. bpf_event ( Linux 5.0) This enables the generation of PERF_RECORD_BPF_EVENT records when an eBPF program is loaded or unloaded. 
aux_output (since Linux 5.4) This allows normal (non-AUX) events to generate data for AUX events if the hardware supports it. cgroup ( Linux 5.7) This enables the generation of PERF_RECORD_CGROUP records when a new cgroup is created (and activated). text_poke ( Linux 5.8) This enables the generation of PERF_RECORD_TEXT_POKE records when there's a change to the kernel text (i.e., self-modifying code). build_id (since Linux 5.12) This changes the contents in the PERF_RECORD_MMAP2 to have a build-id instead of device and inode numbers. inherit_thread (since Linux 5.13) This disables the inheritance of the event to a child process. Only new threads in the same process (which is cloned with CLONE_THREAD) will inherit the event. remove_on_exec (since Linux 5.13) This closes the event when it starts a new process image by execve(2). sigtrap (since Linux 5.13) This enables synchronous signal delivery of SIGTRAP on event overflow. wakeup_events wakeup_watermark (wakeup_events) (wakeup_watermark) . watermark. wakeup_events PERF_RECORD_SAMPLE. PERF_RECORD watermark wakeup_watermark 1. Linux 3.0 wakeup_events 0 ; 0 1. bp_type ( Linux 2.6.33) . : HW_BREAKPOINT_EMPTY . HW_BREAKPOINT_R , . HW_BREAKPOINT_W , . HW_BREAKPOINT_RW , . HW_BREAKPOINT_X , . , HW_BREAKPOINT_R HW_BREAKPOINT_W HW_BREAKPOINT_X . bp_addr ( Linux 2.6.33) . ; . config1 ( Linux 2.6.39) config1 , config. Linux 3.3 OFFCORE_EVENTS Nehalem/Westmere/SandyBridge. bp_len ( Linux 2.6.33) bp_len , type PERF_TYPE_BREAKPOINT. HW_BREAKPOINT_LEN_1, HW_BREAKPOINT_LEN_2, HW_BREAKPOINT_LEN_4 HW_BREAKPOINT_LEN_8. sizeof(long). config2 ( Linux 2.6.39) config2 -- config1. branch_sample_type ( Linux 3.4) PERF_SAMPLE_BRANCH_STACK, . , . , . . PERF_SAMPLE_BRANCH_USER PERF_SAMPLE_BRANCH_KERNEL PERF_SAMPLE_BRANCH_HV . PERF_SAMPLE_BRANCH_PLM_ALL OR. , , : PERF_SAMPLE_BRANCH_ANY . PERF_SAMPLE_BRANCH_ANY_CALL ( , ). PERF_SAMPLE_BRANCH_IND_CALL . PERF_SAMPLE_BRANCH_CALL ( Linux 4.4) . PERF_SAMPLE_BRANCH_ANY_RETURN . 
PERF_SAMPLE_BRANCH_IND_JUMP ( Linux 4.2) . PERF_SAMPLE_BRANCH_COND ( Linux 3.16) . PERF_SAMPLE_BRANCH_ABORT_TX ( Linux 3.11) . PERF_SAMPLE_BRANCH_IN_TX ( Linux 3.11) . PERF_SAMPLE_BRANCH_NO_TX ( Linux 3.11) . PERF_SAMPLE_BRANCH_CALL_STACK ( Linux 4.1) . , Intel x86 Haswell . sample_regs_user ( Linux 3.7) , . arch/ARCH/include/uapi/asm/perf_regs.h. sample_stack_user ( Linux 3.7) , PERF_SAMPLE_STACK_USER. clockid ( Linux 4.1) use_clockid, Linux, . linux/time.h; CLOCK_MONOTONIC, CLOCK_MONOTONIC_RAW, CLOCK_REALTIME, CLOCK_BOOTTIME CLOCK_TAI. aux_watermark ( Linux 4.1) PERF_RECORD_AUX. sample_max_stack ( Linux 4.8) sample_type PERF_SAMPLE_CALLCHAIN, . aux_sample_size (since Linux 5.5) When PERF_SAMPLE_AUX flag is set, specify the desired size of AUX data. Note that it can get smaller data than the specified size. sig_data (since Linux 5.13) This data will be copied to user's signal handler (through si_perf in the siginfo_t) to disambiguate which event triggered the signal. perf_event_open(), . read_format attr . , ENOSPC. , : o PERF_FORMAT_GROUP : struct read_format { u64 nr; /* The number of events */ u64 time_enabled; /* if PERF_FORMAT_TOTAL_TIME_ENABLED */ u64 time_running; /* if PERF_FORMAT_TOTAL_TIME_RUNNING */ struct { u64 value; /* The value of the event */ u64 id; /* if PERF_FORMAT_ID */ u64 lost; /* if PERF_FORMAT_LOST */ } values[nr]; }; o PERF_FORMAT_GROUP : struct read_format { u64 value; /* The value of the event */ u64 time_enabled; /* if PERF_FORMAT_TOTAL_TIME_ENABLED */ u64 time_running; /* if PERF_FORMAT_TOTAL_TIME_RUNNING */ u64 id; /* if PERF_FORMAT_ID */ u64 lost; /* if PERF_FORMAT_LOST */ }; : nr . , PERF_FORMAT_GROUP. time_enabled time_running . , . , PMU, . time_enabled time running . value 64- , -. id ; , read_format PERF_FORMAT_ID. lost The number of lost samples of this event; only present if PERF_FORMAT_LOST was specified in read_format. MMAP perf_event_open() , ( PROT_EXEC mmap) . mmap(2). 
The mmap size should be 1+2^n pages, where the first page is a metadata page (struct perf_event_mmap_page) that contains various bits of information such as where the ring-buffer head is. Before Linux 2.6.39, there is a bug that means you must allocate an mmap ring buffer when sampling even if you do not plan to access it. mmap: struct perf_event_mmap_page { __u32 version; /* version number of this structure */ __u32 compat_version; /* lowest version this is compat with */ __u32 lock; /* seqlock for synchronization */ __u32 index; /* hardware counter identifier */ __s64 offset; /* add to hardware counter value */ __u64 time_enabled; /* time event active */ __u64 time_running; /* time event on CPU */ union { __u64 capabilities; struct { __u64 cap_usr_time / cap_usr_rdpmc / cap_bit0 : 1, cap_bit0_is_deprecated : 1, cap_user_rdpmc : 1, cap_user_time : 1, cap_user_time_zero : 1, }; }; __u16 pmc_width; __u16 time_shift; __u32 time_mult; __u64 time_offset; __u64 __reserved[120]; /* Pad to 1 k */ __u64 data_head; /* head in the data section */ __u64 data_tail; /* user-space written tail */ __u64 data_offset; /* where the buffer starts */ __u64 data_size; /* data buffer size */ __u64 aux_head; __u64 aux_tail; __u64 aux_offset; __u64 aux_size; } perf_event_mmap_page : version . compat_version , . lock seqlock . index . offset rdpmc rdpmc . time_enabled . time_running . cap_usr_time / cap_usr_rdpmc / cap_bit0 ( Linux 3.4) Linux 3.4 Linux 3.11 cap_usr_time cap_usr_rdpmc. , : cap_usr_time cap_usr_rdpmc. Linux 3.12, cap_bit0 cap_user_time cap_user_rdpmc. cap_bit0_is_deprecated ( Linux 3.12) , , cap_user_time cap_user_rdpmc. , , cap_usr_time cap_usr_rdpmc , . 
cap_user_rdpmc ( Linux 3.12) ( <> x86), : u32 seq, time_mult, time_shift, idx, width; u64 count, enabled, running; u64 cyc, time_offset; do { seq = pc->lock; barrier(); enabled = pc->time_enabled; running = pc->time_running; if (pc->cap_usr_time && enabled != running) { cyc = rdtsc(); time_offset = pc->time_offset; time_mult = pc->time_mult; time_shift = pc->time_shift; } idx = pc->index; count = pc->offset; if (pc->cap_usr_rdpmc && idx) { width = pc->pmc_width; count += rdpmc(idx - 1); } barrier(); } while (pc->lock != seq); cap_user_time ( Linux 3.12) , , (TSC x86). cap_user_time_zero ( Linux 3.12) time_zero, . pmc_width cap_usr_rdpmc, ( ) , rdpmc . : pmc <<= 64 - pmc_width; pmc >>= 64 - pmc_width; /* signed shift right */ count += pmc; time_shift time_mult time_offset cap_usr_time, , time_enabled ( ) rdtsc . u64 quot, rem; u64 delta; quot = cyc >> time_shift; rem = cyc & (((u64)1 << time_shift) - 1); delta = time_offset + quot * time_mult + ((rem * time_mult) >> time_shift); time_offset, time_mult, time_shift cyc seqcount, . , , ( idx) : enabled += delta; if (idx) running += delta; quot = count / running; rem = count % running; count = quot * enabled + (rem * enabled) / running; time_zero ( Linux 3.12) cap_user_time_zero, ( TSC x86) time_zero, time_mult time_shift: time = timestamp - time_zero; quot = time / time_mult; rem = time % time_mult; cyc = (quot << time_shift) + (rem << time_shift) / time_mult; : quot = cyc >> time_shift; rem = cyc & (((u64)1 << time_shift) - 1); timestamp = time_zero + quot * time_mult + ((rem * time_mult) >> time_shift); data_head . , . -- mmap. SMP data_head rmb(). data_tail PROT_WRITE, data_tail . . data_offset ( Linux 4.1) perf mmap. data_size ( Linux 4.1) perf mmap. aux_head aux_tail aux_offset aux_size (since Linux 4.1) The AUX region allows mmap(2)-ing a separate sample buffer for high-bandwidth data streams (separate from the main perf sample buffer). 
An example of a high-bandwidth stream is instruction tracing support, as is found in newer Intel processors. AUX, aux_offset data_offset+data_size, aux_size . , . mmap AUX. AUX RLIMIT_MEMLOCK ( setrlimit(2)), perf_event_mlock_kb. AUX , . AUX , , . , . aux_head aux_tail , data_head data_tail. 2^n. perf_event_attr.sample_id_all, sample_type, / () (TID, TIME, ID, CPU, STREAM_ID), PERF_RECORD_SAMPLE ; perf_event_header , . perf.data perf, . mmap : struct perf_event_header { __u32 type; __u16 misc; __u16 size; }; perf_event_header . . size . misc misc . , PERF_RECORD_MISC_CPUMODE_MASK (, , ): PERF_RECORD_MISC_CPUMODE_UNKNOWN . PERF_RECORD_MISC_KERNEL . PERF_RECORD_MISC_USER . PERF_RECORD_MISC_HYPERVISOR . PERF_RECORD_MISC_GUEST_KERNEL ( Linux 2.6.35) . PERF_RECORD_MISC_GUEST_USER ( Linux 2.6.35) . , : PERF_RECORD_MISC_MMAP_DATA ( Linux 3.10) , ; . PERF_RECORD_MISC_COMM_EXEC ( Linux 3.16) PERF_RECORD_COMM Linux 3.16, execve(2). PERF_RECORD_MISC_SWITCH_OUT ( Linux 4.3) PERF_RECORD_SWITCH PERF_RECORD_SWITCH_CPU_WIDE , ( ). : PERF_RECORD_MISC_EXACT_IP , PERF_SAMPLE_IP , - . perf_event_attr.precise_ip. PERF_RECORD_MISC_SWITCH_OUT_PREEMPT (since Linux 4.17) When a PERF_RECORD_SWITCH or PERF_RECORD_SWITCH_CPU_WIDE record is generated, this indicates the context switch was a preemption. PERF_RECORD_MISC_MMAP_BUILD_ID (since Linux 5.12) This indicates that the content of PERF_RECORD_MMAP2 contains build-ID data instead of device major and minor numbers as well as the inode number. PERF_RECORD_MISC_EXT_RESERVED ( Linux 2.6.35) , ( ). PERF_RECORD_MISC_PROC_MAP_PARSE_TIMEOUT This bit is not set by the kernel. It is reserved for the user-space perf utility to indicate that /proc/pid/maps parsing was taking too long and was stopped, and thus the mmap records may be truncated. type type -- . ( ) type . PERF_RECORD_MMAP MMAP PROT_EXEC , IP . : struct { struct perf_event_header header; u32 pid, tid; u64 addr; u64 len; u64 pgoff; char filename[]; }; pid . tid . addr . len . pgoff . 
filename , . PERF_RECORD_LOST . struct { struct perf_event_header header; u64 id; u64 lost; struct sample_id sample_id; }; id ID . lost . PERF_RECORD_COMM . struct { struct perf_event_header header; u32 pid; u32 tid; char comm[]; struct sample_id sample_id; }; pid . tid . comm , . PERF_RECORD_EXIT . struct { struct perf_event_header header; u32 pid, ppid; u32 tid, ptid; u64 time; struct sample_id sample_id; }; PERF_RECORD_THROTTLE PERF_RECORD_UNTHROTTLE / (throttle). struct { struct perf_event_header header; u64 time; u64 id; u64 stream_id; struct sample_id sample_id; }; PERF_RECORD_FORK (fork) . struct { struct perf_event_header header; u32 pid, ppid; u32 tid, ptid; u64 time; struct sample_id sample_id; }; PERF_RECORD_READ . struct { struct perf_event_header header; u32 pid, tid; struct read_format values; struct sample_id sample_id; }; PERF_RECORD_SAMPLE . struct { struct perf_event_header header; u64 sample_id; /* if PERF_SAMPLE_IDENTIFIER */ u64 ip; /* if PERF_SAMPLE_IP */ u32 pid, tid; /* if PERF_SAMPLE_TID */ u64 time; /* if PERF_SAMPLE_TIME */ u64 addr; /* if PERF_SAMPLE_ADDR */ u64 id; /* if PERF_SAMPLE_ID */ u64 stream_id; /* if PERF_SAMPLE_STREAM_ID */ u32 cpu, res; /* if PERF_SAMPLE_CPU */ u64 period; /* if PERF_SAMPLE_PERIOD */ struct read_format v; /* if PERF_SAMPLE_READ */ u64 nr; /* if PERF_SAMPLE_CALLCHAIN */ u64 ips[nr]; /* if PERF_SAMPLE_CALLCHAIN */ u32 size; /* if PERF_SAMPLE_RAW */ char data[size]; /* if PERF_SAMPLE_RAW */ u64 bnr; /* if PERF_SAMPLE_BRANCH_STACK */ struct perf_branch_entry lbr[bnr]; /* if PERF_SAMPLE_BRANCH_STACK */ u64 abi; /* if PERF_SAMPLE_REGS_USER */ u64 regs[weight(mask)]; /* if PERF_SAMPLE_REGS_USER */ u64 size; /* if PERF_SAMPLE_STACK_USER */ char data[size]; /* if PERF_SAMPLE_STACK_USER */ u64 dyn_size; /* if PERF_SAMPLE_STACK_USER && size != 0 */ union perf_sample_weight weight; /* if PERF_SAMPLE_WEIGHT */ /* || PERF_SAMPLE_WEIGHT_STRUCT */ u64 data_src; /* if PERF_SAMPLE_DATA_SRC */ u64 transaction; /* if 
PERF_SAMPLE_TRANSACTION */ u64 abi; /* if PERF_SAMPLE_REGS_INTR */ u64 regs[weight(mask)]; /* if PERF_SAMPLE_REGS_INTR */ u64 phys_addr; /* if PERF_SAMPLE_PHYS_ADDR */ u64 cgroup; /* if PERF_SAMPLE_CGROUP */ u64 data_page_size; /* if PERF_SAMPLE_DATA_PAGE_SIZE */ u64 code_page_size; /* if PERF_SAMPLE_CODE_PAGE_SIZE */ u64 size; /* if PERF_SAMPLE_AUX */ char data[size]; /* if PERF_SAMPLE_AUX */ }; sample_id PERF_SAMPLE_IDENTIFIER, 64- . id PERF_SAMPLE_ID, , . ip PERF_SAMPLE_IP, 64- . pid tid PERF_SAMPLE_TID, 32- 32- . time PERF_SAMPLE_TIME, 64- . local_clock(), , , , . addr PERF_SAMPLE_ADDR, 64- . , , ; 0. id PERF_SAMPLE_ID, 64- . , . , PERF_FORMAT_ID. stream_id PERF_SAMPLE_STREAM_ID, 64- . PERF_SAMPLE_ID , . PERF_FORMAT_ID. cpu res PERF_SAMPLE_CPU, 32- , , , () 32- . period PERF_SAMPLE_PERIOD, 64- , . v PERF_SAMPLE_READ, read_format, . read_format, perf_event_open(). nr ips[nr] PERF_SAMPLE_CALLCHAIN, 64- , 64- . . size data[size] PERF_SAMPLE_RAW, 32- , 8- . 64-. , . , , . bnr lbr[bnr] PERF_SAMPLE_BRANCH_STACK, 64- , , bnr perf_branch_entry, : from ( ). to . mispred . predicted . in_tx ( Linux 3.11) . abort ( Linux 3.11) . cycles ( Linux 4.3) , . , , . mispred, predicted cycles ; , 0. branch_sample_type. abi regs[weight(mask)] PERF_SAMPLE_REGS_USER, . abi PERF_SAMPLE_REGS_ABI_NONE, PERF_SAMPLE_REGS_ABI_32 PERF_SAMPLE_REGS_ABI_64. regs , sample_regs_user. , sample_regs_user. size data[size] dyn_size PERF_SAMPLE_STACK_USER, . . size -- , sample_stack_user . data -- ( , ). dyn_size -- , ( size). , dyn_size , size 0. weight If PERF_SAMPLE_WEIGHT or PERF_SAMPLE_WEIGHT_STRUCT is enabled, then a 64-bit value provided by the hardware is recorded that indicates how costly the event was. This allows expensive events to stand out more clearly in profiles. 
data_src PERF_SAMPLE_DATA_SRC, 64- , : mem_op , : PERF_MEM_OP_NA PERF_MEM_OP_LOAD PERF_MEM_OP_STORE PERF_MEM_OP_PFETCH PERF_MEM_OP_EXEC mem_lvl , , PERF_MEM_LVL_SHIFT: PERF_MEM_LVL_NA PERF_MEM_LVL_HIT PERF_MEM_LVL_MISS PERF_MEM_LVL_L1 1 PERF_MEM_LVL_LFB PERF_MEM_LVL_L2 2 PERF_MEM_LVL_L3 3 PERF_MEM_LVL_LOC_RAM PERF_MEM_LVL_REM_RAM1 1 PERF_MEM_LVL_REM_RAM2 2 PERF_MEM_LVL_REM_CCE1 1 PERF_MEM_LVL_REM_CCE2 2 PERF_MEM_LVL_IO - PERF_MEM_LVL_UNC mem_snoop (snoop mode), , PERF_MEM_SNOOP_SHIFT: PERF_MEM_SNOOP_NA PERF_MEM_SNOOP_NONE PERF_MEM_SNOOP_HIT PERF_MEM_SNOOP_MISS PERF_MEM_SNOOP_HITM mem_lock , , PERF_MEM_LOCK_SHIFT: PERF_MEM_LOCK_NA PERF_MEM_LOCK_LOCKED mem_dtlb TLB, , PERF_MEM_TLB_SHIFT: PERF_MEM_TLB_NA PERF_MEM_TLB_HIT PERF_MEM_TLB_MISS PERF_MEM_TLB_L1 1 TLB PERF_MEM_TLB_L2 2 TLB PERF_MEM_TLB_WK PERF_MEM_TLB_OS transaction PERF_SAMPLE_TRANSACTION, 64- , . : PERF_TXN_ELISION - ( Intel). PERF_TXN_TRANSACTION - . PERF_TXN_SYNC ( ). PERF_TXN_ASYNC ( ). PERF_TXN_RETRY ( ). PERF_TXN_CONFLICT - . PERF_TXN_CAPACITY_WRITE - . PERF_TXN_CAPACITY_READ - . , , 32 PERF_TXN_ABORT_SHIFT PERF_TXN_ABORT_MASK. abi regs[weight(mask)] PERF_SAMPLE_REGS_INTR, . abi PERF_SAMPLE_REGS_ABI_NONE, PERF_SAMPLE_REGS_ABI_32 PERF_SAMPLE_REGS_ABI_64. regs , sample_regs_intr. , sample_regs_intr. phys_addr If the PERF_SAMPLE_PHYS_ADDR flag is set, then the 64-bit physical address is recorded. cgroup If the PERF_SAMPLE_CGROUP flag is set, then the 64-bit cgroup ID (for the perf_event subsystem) is recorded. To get the pathname of the cgroup, the ID should match to one in a PERF_RECORD_CGROUP. data_page_size If the PERF_SAMPLE_DATA_PAGE_SIZE flag is set, then the 64-bit page size value of the data address is recorded. code_page_size If the PERF_SAMPLE_CODE_PAGE_SIZE flag is set, then the 64-bit page size value of the ip address is recorded. size data[size] If PERF_SAMPLE_AUX is enabled, a snapshot of the aux buffer is recorded. 
PERF_RECORD_MMAP2 This record includes extended information on mmap(2) calls returning executable mappings. The format is similar to that of the PERF_RECORD_MMAP record, but includes extra values that allow uniquely identifying shared mappings. Depending on the PERF_RECORD_MISC_MMAP_BUILD_ID bit in the header, the extra values have different layout and meanings. struct { struct perf_event_header header; u32 pid; u32 tid; u64 addr; u64 len; u64 pgoff; union { struct { u32 maj; u32 min; u64 ino; u64 ino_generation; }; struct { /* if PERF_RECORD_MISC_MMAP_BUILD_ID */ u8 build_id_size; u8 __reserved_1; u16 __reserved_2; u8 build_id[20]; }; }; u32 prot; u32 flags; char filename[]; struct sample_id sample_id; }; pid . tid . addr . len . pgoff . maj . min . ino inode. ino_generation inode. build_id_size is the actual size of the build_id field (up to 20). build_id is raw data used to identify a binary. prot . flags . filename , . PERF_RECORD_AUX ( Linux 4.1) AUX. struct { struct perf_event_header header; u64 aux_offset; u64 aux_size; u64 flags; struct sample_id sample_id; }; aux_offset AUX mmap, . aux_size . flags AUX. PERF_AUX_FLAG_TRUNCATED , . PERF_AUX_FLAG_OVERWRITE , . PERF_RECORD_ITRACE_START ( Linux 4.1) , , AUX . struct { struct perf_event_header header; u32 pid; u32 tid; }; pid , . tid , . PERF_RECORD_LOST_SAMPLES ( Linux 4.2) ( Intel PEBS), , . struct { struct perf_event_header header; u64 lost; struct sample_id sample_id; }; lost . PERF_RECORD_SWITCH ( Linux 4.3) . PERF_RECORD_MISC_SWITCH_OUT misc -- . struct { struct perf_event_header header; struct sample_id sample_id; }; PERF_RECORD_SWITCH_CPU_WIDE ( Linux 4.3) PERF_RECORD_SWITCH , , CPU-wide /. PERF_RECORD_MISC_SWITCH_OUT misc : . struct { struct perf_event_header header; u32 next_prev_pid; u32 next_prev_tid; struct sample_id sample_id; }; next_prev_pid ( ) ( ) . next_prev_tid ( ) ( ) . PERF_RECORD_NAMESPACES ( Linux 4.11) This record includes various namespace information of a process. 
struct { struct perf_event_header header; u32 pid; u32 tid; u64 nr_namespaces; struct { u64 dev, inode } [nr_namespaces]; struct sample_id sample_id; }; pid is the process ID. tid is the thread ID. nr_namespaces is the number of namespaces in this record. Each namespace has dev and inode fields and is recorded at a fixed position, as listed below: NET_NS_INDEX=0 Network namespace UTS_NS_INDEX=1 UTS namespace IPC_NS_INDEX=2 IPC namespace PID_NS_INDEX=3 PID namespace USER_NS_INDEX=4 User namespace MNT_NS_INDEX=5 Mount namespace CGROUP_NS_INDEX=6 Cgroup namespace PERF_RECORD_KSYMBOL (since Linux 5.0) This record indicates kernel symbol register/unregister events. struct { struct perf_event_header header; u64 addr; u32 len; u16 ksym_type; u16 flags; char name[]; struct sample_id sample_id; }; addr is the address of the kernel symbol. len is the length of the kernel symbol. ksym_type is the type of the kernel symbol. Currently the following types are available: PERF_RECORD_KSYMBOL_TYPE_BPF The kernel symbol is a BPF function. flags If the PERF_RECORD_KSYMBOL_FLAGS_UNREGISTER flag is set, then this event is for unregistering the kernel symbol. PERF_RECORD_BPF_EVENT (since Linux 5.0) This record indicates that a BPF program was loaded or unloaded. struct { struct perf_event_header header; u16 type; u16 flags; u32 id; u8 tag[BPF_TAG_SIZE]; struct sample_id sample_id; }; type is one of the following values: PERF_BPF_EVENT_PROG_LOAD A BPF program is loaded. PERF_BPF_EVENT_PROG_UNLOAD A BPF program is unloaded. id is the ID of the BPF program. tag is the tag of the BPF program. Currently, BPF_TAG_SIZE is defined as 8. PERF_RECORD_CGROUP (since Linux 5.7) This record indicates that a new cgroup was created and activated. struct { struct perf_event_header header; u64 id; char path[]; struct sample_id sample_id; }; id is the cgroup identifier. This can also be retrieved by name_to_handle_at(2) on the cgroup path (as a file handle). path is the path of the cgroup from the root. 
PERF_RECORD_TEXT_POKE ( Linux 5.8) This record indicates a change in the kernel text. This includes addition and removal of the text and the corresponding length is zero in this case. struct { struct perf_event_header header; u64 addr; u16 old_len; u16 new_len; u8 bytes[]; struct sample_id sample_id; }; addr is the address of the change old_len is the old length new_len is the new length bytes contains old bytes immediately followed by new bytes. , , . , poll(2), select(2) epoll(7). , - ; F_SETOWN F_SETSIG fcntl(2). ( sample_period ). . : wakeup_events wakeup_watermark, mmap. POLL_IN. : ioctl PERF_EVENT_IOC_REFRESH. ioctl , . 0, POLL_IN, , 0, POLL_HUP . , , 0 ; . Linux 3.18, POLL_HUP , . rdpmc Linux 3.4 x86, rdpmc . , rdpmc . cap_usr_rdpmc mmap; . , rdpmc , ( perf) rdpmc . Linux 4.0 rdpmc , . 2 /sys/devices/cpu/rdpmc. ioctl perf_event perf_event_open() ioctl: PERF_EVENT_IOC_ENABLE , . ioctl PERF_IOC_FLAG_GROUP, , ( ). PERF_EVENT_IOC_DISABLE , . ; , . ( ) ; . ioctl PERF_IOC_FLAG_GROUP, , ( ). PERF_EVENT_IOC_REFRESH , ( ). ioctl . POLL_IN 0; , POLL_HUP . 0 . PERF_EVENT_IOC_RESET ( ) , . ; time_enabled time_running. ioctl PERF_IOC_FLAG_GROUP, , ( ). PERF_EVENT_IOC_PERIOD . Linux 3.7 ( ARM) Linux 3.14 ( ), . . 64- , . Linux 2.6.36 ioctl - . PERF_EVENT_IOC_SET_OUTPUT , . . -1, . PERF_EVENT_IOC_SET_FILTER ( Linux 2.6.33) ftrace . ftrace. PERF_EVENT_IOC_ID ( Linux 3.12) . 64- , . PERF_EVENT_IOC_SET_BPF ( Linux 4.1) This allows attaching a Berkeley Packet Filter (BPF) program to an existing kprobe tracepoint event. You need CAP_PERFMON (since Linux 5.8) or CAP_SYS_ADMIN privileges to use this ioctl. BPF, bpf(2). PERF_EVENT_IOC_PAUSE_OUTPUT ( Linux 4.7) . , . , , PERF_RECORD_LOST. - - , . 32- . , -- . PERF_EVENT_MODIFY_ATTRIBUTES ( Linux 4.17) . (breakpoint). perf_event_attr . PERF_EVENT_IOC_QUERY_BPF ( Linux 4.16) This allows querying which Berkeley Packet Filter (BPF) programs are attached to an existing kprobe tracepoint. 
You can only attach one BPF program per event, but you can have multiple events attached to a tracepoint. Querying this value on one tracepoint event returns the ID of all BPF programs in all events attached to the tracepoint. You need CAP_PERFMON (since Linux 5.8) or CAP_SYS_ADMIN privileges to use this ioctl. struct perf_event_query_bpf { __u32 ids_len; __u32 prog_cnt; __u32 ids[0]; }; ids_len ids. prog_cnt BPF. ids BPF. , , ENOSPC ids_len , . prctl(2) , , ( prctl(2) PR_TASK_PERF_EVENTS_ENABLE PR_TASK_PERF_EVENTS_DISABLE). . , , , . , . perf_event /proc/sys/kernel/ /proc/sys/kernel/perf_event_paranoid perf_event_paranoid . 2 ( Linux 4.6). 1 ( Linux 4.6). 0 , . -1 . perf_event_paranoid -- perf_event_open(). /proc/sys/kernel/perf_event_max_sample_rate . , , , . 100000 ( ). /proc/sys/kernel/perf_event_max_stack , . /proc/sys/kernel/perf_event_mlock_kb , mlock(2). 516 (). /sys/bus/event_source/devices/ Linux 2.6.34, PMU . PMU /sys/bus/event_source/devices/. PMU. /sys/bus/event_source/devices/*/type ( Linux 2.6.38) , type perf_event_attr, , PMU. /sys/bus/event_source/devices/cpu/rdpmc ( Linux 3.4) 1, rdpmc . 0 . Linux 4.0 , 1 perf, 2 . /sys/bus/event_source/devices/*/format/ ( Linux 3.4) , config perf_event_attr. The content of each file is the name of the config field, followed by a colon, followed by a series of integer bit ranges separated by commas. For example, the file event may contain the value config1:1,6-10,44 which indicates that event is an attribute that occupies bits 1,6-10, and 44 of perf_event_attr::config1. /sys/bus/event_source/devices/*/events/ ( Linux 3.4) . -- , ./format/ . , PMU, , . -- . ( ). , 1. : event=0x2,inv,ldlat=3. /sys/bus/event_source/devices/*/uevent -- . /sys/bus/event_source/devices/*/cpumask ( Linux 3.7) cpumask ( ), () . , PMU . On success, perf_event_open() returns the new file descriptor. On error, -1 is returned and errno is set to indicate the error. , perf_event_open(), . 
E2BIG , perf_event_attr size ( PERF_ATTR_SIZE_VER0), ( ) . E2BIG size perf_event_attr , . EACCES Returned when the requested event requires CAP_PERFMON (since Linux 5.8) or CAP_SYS_ADMIN permissions (or a more permissive perf_event paranoid setting). Some common cases where an unprivileged process may encounter this error: attaching to a process owned by a different user; monitoring all processes on a given CPU (i.e., specifying the pid argument as -1); and not setting exclude_kernel when the paranoid setting requires it. EBADF , group_fd , PERF_FLAG_PID_CGROUP cgroup pid . EBUSY ( Linux 4.1) , PMU. EFAULT , attr . EINTR Returned when trying to mix perf and ftrace handling for a uprobe. EINVAL , . . : sample_freq , ; cpu ; read_format ; sample_type ; flags ; exclusive pinned ; config ; ; . EMFILE . , . ENODEV , , . ENOENT , type . . ENOSPC Linux 3.3, ENOSPC. Linux 3.3 EINVAL. ENOSPC , , . ENOSYS , PERF_SAMPLE_STACK_USER sample_type . EOPNOTSUPP , , . (low-skid) , , , PMU . EOVERFLOW ( Linux 4.8) , PERF_SAMPLE_CALLCHAIN sample_max_stack , /proc/sys/kernel/perf_event_max_stack. EPERM ( ) , exclude_hv, exclude_idle, exclude_user exclude_kernel. It can also happen, as with EACCES, when the requested event requires CAP_PERFMON (since Linux 5.8) or CAP_SYS_ADMIN permissions (or a more permissive perf_event paranoid setting). This includes setting a breakpoint on a kernel address, and (since Linux 3.13) setting a kernel function-trace tracepoint. ESRCH . Linux. perf_event_open() Linux 2.6.31, perf_counter_open(). Linux 2.6.32. perf_event_open() /proc/sys/kernel/perf_event_paranoid. The CAP_PERFMON capability (since Linux 5.8) provides a secure approach to performance monitoring and observability operations in a system according to the principle of least privilege (POSIX IEEE 1003.1e). 
Accessing system performance monitoring and observability operations using CAP_PERFMON rather than the much more powerful CAP_SYS_ADMIN excludes chances to misuse credentials and makes operations more secure. CAP_SYS_ADMIN usage for secure system performance monitoring and observability is discouraged in favor of the CAP_PERFMON capability. F_SETOWN_EX fcntl(2) . Linux 2.6.32. Linux 2.6.33 ( , x86), . NMI. , , perf_event_open(), , , , . Linux 2.6.34 . <<0>>, . Linux 2.6.34 , . Linux 2.6.35 Linux 2.6.39 , <<>> . Linux 2.6.35 PERF_FORMAT_GROUP . Linux 2.6.36 Linux 3.0, - <> , wakeup_event, . Linux 2.6.31 Linux 3.4 ioctl PERF_IOC_FLAG_GROUP , . Linux 3.4 Linux 3.11, mmap cap_usr_rdpmc cap_usr_time . , cap_user_rdpmc cap_user_time. ! . , AMD Linux 2.6.35. printf(3). #include #include #include #include #include #include #include static long perf_event_open(struct perf_event_attr *hw_event, pid_t pid, int cpu, int group_fd, unsigned long flags) { int ret; ret = syscall(SYS_perf_event_open, hw_event, pid, cpu, group_fd, flags); return ret; } int main(void) { int fd; long long count; struct perf_event_attr pe; memset(&pe, 0, sizeof(pe)); pe.type = PERF_TYPE_HARDWARE; pe.size = sizeof(pe); pe.config = PERF_COUNT_HW_INSTRUCTIONS; pe.disabled = 1; pe.exclude_kernel = 1; pe.exclude_hv = 1; fd = perf_event_open(&pe, 0, -1, -1, 0); if (fd == -1) { fprintf(stderr, "Error opening leader %llx\n", pe.config); exit(EXIT_FAILURE); } ioctl(fd, PERF_EVENT_IOC_RESET, 0); ioctl(fd, PERF_EVENT_IOC_ENABLE, 0); printf("Measuring instruction count for this printf\n"); ioctl(fd, PERF_EVENT_IOC_DISABLE, 0); read(fd, &count, sizeof(count)); printf("Used %lld instructions\n", count); close(fd); } . perf(1), fcntl(2), mmap(2), open(2), prctl(2), read(2) Documentation/admin-guide/perf-security.rst in the kernel source tree Alexey, Azamat Hackimov , kogamatranslator49 , Kogan, Max Is , Yuri Kozlov ; GNU 3 , . . , , . Linux man-pages 6.06 19 2023 . perf_event_open(2)