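bpf(2)                     System Calls Manual                     bpf(2)

NAME
       bpf - perform a command on an extended BPF map or program

SYNOPSIS
       #include <linux/bpf.h>

       int bpf(int cmd, union bpf_attr *attr, unsigned int size);

DESCRIPTION
       The bpf() system call performs a range of operations related to extended Berkeley Packet Filters. Extended BPF (or eBPF) is similar to the original ("classic") BPF (cBPF) used to filter network packets. For both cBPF and eBPF programs, the kernel statically analyzes the programs before loading them, in order to ensure that they cannot harm the running system.

       eBPF extends cBPF in multiple ways, including the ability to call a fixed set of in-kernel helper functions (via the BPF_CALL opcode extension provided by eBPF) and access shared data structures such as eBPF maps.

   Extended BPF Design/Architecture
       eBPF maps are a generic data structure for storage of different data types. Data types are generally treated as binary blobs, so a user just specifies the size of the key and the size of the value at map-creation time. In other words, a key/value for a given map can have an arbitrary structure.

       A user process can create multiple maps (with key/value-pairs being opaque bytes of data) and access them via file descriptors. Different eBPF programs can access the same maps in parallel. It's up to the user process and eBPF program to decide what they store inside maps.

       There's one special map type, called a program array. This type of map stores file descriptors referring to other eBPF programs. When a lookup in the map is performed, the program flow is redirected in-place to the beginning of another eBPF program and does not return back to the calling program. The level of nesting has a fixed limit of 32, so that infinite loops cannot be crafted. At run time, the program file descriptors stored in the map can be modified, so program functionality can be altered based on specific requirements. All programs referred to in a program-array map must have been previously loaded into the kernel via bpf(). If a map lookup fails, the current program continues its execution. See BPF_MAP_TYPE_PROG_ARRAY below for further details.

       Generally, eBPF programs are loaded by the user process and automatically unloaded when the process exits. In some cases, for example, tc-bpf(8), the program will continue to stay alive inside the kernel even after the process that loaded the program exits. In that case, the tc subsystem holds a reference to the eBPF program after the file descriptor has been closed by the user-space program. Thus, whether a specific program continues to live inside the kernel depends on how it is further attached to a given kernel subsystem after it was loaded via bpf().

       Each eBPF program is a set of instructions that is safe to run until its completion. An in-kernel verifier statically determines that the eBPF program terminates and is safe to execute. During verification, the kernel increments reference counts for each of the maps that the eBPF program uses, so that the attached maps can't be removed until the program is unloaded.

       eBPF programs can be attached to different events. These events can be the arrival of network packets, tracing events, classification events by network queueing disciplines (for eBPF programs attached to a tc(8) classifier), and other types that may be added in the future. A new event triggers execution of the eBPF program, which may store information about the event in eBPF maps. Beyond storing data, eBPF programs may call a fixed set of in-kernel helper functions.

       The same eBPF program can be attached to multiple events and different eBPF programs can access the same map:

           tracing     tracing    tracing    packet      packet     packet
           event A     event B    event C    on eth0     on eth1    on eth2
            |             |         |          |           |          ^
            |             |         |          |           v          |
            --> tracing <--     tracing      socket    tc ingress   tc egress
                 prog_1          prog_2      prog_3    classifier    action
                 |  |              |           |         prog_4      prog_5
              |---  -----|  |------|          map_3        |           |
            map_1       map_2                              --| map_4 |--

   Arguments
       The operation to be performed by the bpf() system call is determined by the cmd argument. Each operation takes an accompanying argument, provided via attr, which is a pointer to a union of type bpf_attr (see below). The unused fields and padding must be zeroed out before the call. The size argument is the size of the union pointed to by attr.

       The value provided in cmd is one of the following:

       BPF_MAP_CREATE
              Create a map and return a file descriptor that refers to the map. The close-on-exec file descriptor flag (see fcntl(2)) is automatically enabled for the new file descriptor.

       BPF_MAP_LOOKUP_ELEM
              Look up an element by key in a specified map and return its value.

       BPF_MAP_UPDATE_ELEM
              Create or update an element (key/value pair) in a specified map.

       BPF_MAP_DELETE_ELEM
              Look up and delete an element by key in a specified map.

       BPF_MAP_GET_NEXT_KEY
              Look up an element by key in a specified map and return the key of the next element.

       BPF_PROG_LOAD
              Verify and load an eBPF program, returning a new file descriptor associated with the program. The close-on-exec file descriptor flag (see fcntl(2)) is automatically enabled for the new file descriptor.
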
bpf_attr , bpf(): union bpf_attr { struct { /* Used by BPF_MAP_CREATE */ __u32 map_type; __u32 key_size; /* size of key in bytes */ __u32 value_size; /* size of value in bytes */ __u32 max_entries; /* maximum number of entries in a map */ }; struct { /* Used by BPF_MAP_*_ELEM and BPF_MAP_GET_NEXT_KEY commands */ __u32 map_fd; __aligned_u64 key; union { __aligned_u64 value; __aligned_u64 next_key; }; __u64 flags; }; struct { /* Used by BPF_PROG_LOAD */ __u32 prog_type; __u32 insn_cnt; __aligned_u64 insns; /* 'const struct bpf_insn *' */ __aligned_u64 license; /* 'const char *' */ __u32 log_level; /* verbosity level of verifier */ __u32 log_size; /* size of user buffer */ __aligned_u64 log_buf; /* user supplied 'char *' buffer */ __u32 kern_version; /* checked when prog_type=kprobe (since Linux 4.1) */ }; } __attribute__((aligned(8))); eBPF , . eBPF , . : o o o o bpf(). cmd. BPF_MAP_CREATE BPF_MAP_CREATE , , . int bpf_create_map(enum bpf_map_type map_type, unsigned int key_size, unsigned int value_size, unsigned int max_entries) { union bpf_attr attr = { .map_type = map_type, .key_size = key_size, .value_size = value_size, .max_entries = max_entries }; return bpf(BPF_MAP_CREATE, &attr, sizeof(attr)); } , map_type, , key_size, value_size max_entries. . -1 errno EINVAL, EPERM ENOMEM. key_size value_size , bpf_map_*_elem() key value , value_size. , key_size 8 eBPF bpf_map_lookup_elem(map_fd, fp - 4) , bpf_map_lookup_elem(map_fd, void *key) 8 , key, fp - 4 ( fp -- ) . , value_size 1 eBPF value = bpf_map_lookup_elem(...); *(u32 *) value = 1; , value value_size, 1 . 
map_type: enum bpf_map_type { BPF_MAP_TYPE_UNSPEC, /* Reserve 0 as invalid map type */ BPF_MAP_TYPE_HASH, BPF_MAP_TYPE_ARRAY, BPF_MAP_TYPE_PROG_ARRAY, BPF_MAP_TYPE_PERF_EVENT_ARRAY, BPF_MAP_TYPE_PERCPU_HASH, BPF_MAP_TYPE_PERCPU_ARRAY, BPF_MAP_TYPE_STACK_TRACE, BPF_MAP_TYPE_CGROUP_ARRAY, BPF_MAP_TYPE_LRU_HASH, BPF_MAP_TYPE_LRU_PERCPU_HASH, BPF_MAP_TYPE_LPM_TRIE, BPF_MAP_TYPE_ARRAY_OF_MAPS, BPF_MAP_TYPE_HASH_OF_MAPS, BPF_MAP_TYPE_DEVMAP, BPF_MAP_TYPE_SOCKMAP, BPF_MAP_TYPE_CPUMAP, BPF_MAP_TYPE_XSKMAP, BPF_MAP_TYPE_SOCKHASH, BPF_MAP_TYPE_CGROUP_STORAGE, BPF_MAP_TYPE_REUSEPORT_SOCKARRAY, BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE, BPF_MAP_TYPE_QUEUE, BPF_MAP_TYPE_STACK, /* See /usr/include/linux/bpf.h for the full list. */ }; map_type . eBPF bpf_map_lookup_elem() bpf_map_update_elem(). . BPF_MAP_LOOKUP_ELEM BPF_MAP_LOOKUP_ELEM key , fd. int bpf_lookup_elem(int fd, const void *key, void *value) { union bpf_attr attr = { .map_fd = fd, .key = ptr_to_u64(key), .value = ptr_to_u64(value), }; return bpf(BPF_MAP_LOOKUP_ELEM, &attr, sizeof(attr)); } , value, value_size . , -1, errno ENOENT. BPF_MAP_UPDATE_ELEM BPF_MAP_UPDATE_ELEM key/value , fd. int bpf_update_elem(int fd, const void *key, const void *value, uint64_t flags) { union bpf_attr attr = { .map_fd = fd, .key = ptr_to_u64(key), .value = ptr_to_u64(value), .flags = flags, }; return bpf(BPF_MAP_UPDATE_ELEM, &attr, sizeof(attr)); } flags : BPF_ANY . BPF_NOEXIST , . BPF_EXIST . . -1, errno EINVAL, EPERM, ENOMEM E2BIG. E2BIG , max_entries, . EEXIST , flags BPF_NOEXIST key . ENOENT , flags BPF_EXIST key . BPF_MAP_DELETE_ELEM BPF_MAP_DELETE_ELEM key , fd. int bpf_delete_elem(int fd, const void *key) { union bpf_attr attr = { .map_fd = fd, .key = ptr_to_u64(key), }; return bpf(BPF_MAP_DELETE_ELEM, &attr, sizeof(attr)); } . , -1, errno ENOENT. BPF_MAP_GET_NEXT_KEY BPF_MAP_GET_NEXT_KEY key , fd, next_key . 
int bpf_get_next_key(int fd, const void *key, void *next_key) { union bpf_attr attr = { .map_fd = fd, .key = ptr_to_u64(key), .next_key = ptr_to_u64(next_key), }; return bpf(BPF_MAP_GET_NEXT_KEY, &attr, sizeof(attr)); } key , next_key . key , next_key . key -- , -1 errno ENOENT. errno: ENOMEM, EFAULT, EPERM EINVAL. . close(map_fd) , map_fd. , , , ( ). eBPF : BPF_MAP_TYPE_HASH - : o . eBPF , . o / / . o map_update_elem() max_entries ( , eBPF ). o map_update_elem() . - . BPF_MAP_TYPE_ARRAY : o . / JIT lookup(), - -. , value_size eBPF. , array_map_lookup_elem() <<>> / JIT, . o . o , . o map_delete_elem() EINVAL, . o map_update_elem() ; -. : __sync_fetch_and_add() 32 64- . , , , , . . : o <<>> eBPF: 1, () 0, -- <<>> , eBPF . o (buckets). o , , . BPF_MAP_TYPE_PROG_ARRAY ( Linux 4.2) -- , , eBPF. key_size value_size . bpf_tail_call(). , eBPF void bpf_tail_call(void *context, void *prog_map, unsigned int index); , , , . eBPF. . . eBPF ( , / (32)), eBPF. . , , . , eBPF. , , , . eBPF BPF_PROG_LOAD eBPF . , eBPF. 
char bpf_log_buf[LOG_BUF_SIZE]; int bpf_prog_load(enum bpf_prog_type type, const struct bpf_insn *insns, int insn_cnt, const char *license) { union bpf_attr attr = { .prog_type = type, .insns = ptr_to_u64(insns), .insn_cnt = insn_cnt, .license = ptr_to_u64(license), .log_buf = ptr_to_u64(bpf_log_buf), .log_size = LOG_BUF_SIZE, .log_level = 1, }; return bpf(BPF_PROG_LOAD, &attr, sizeof(attr)); } prog_type : enum bpf_prog_type { BPF_PROG_TYPE_UNSPEC, /* Reserve 0 as invalid program type */ BPF_PROG_TYPE_SOCKET_FILTER, BPF_PROG_TYPE_KPROBE, BPF_PROG_TYPE_SCHED_CLS, BPF_PROG_TYPE_SCHED_ACT, BPF_PROG_TYPE_TRACEPOINT, BPF_PROG_TYPE_XDP, BPF_PROG_TYPE_PERF_EVENT, BPF_PROG_TYPE_CGROUP_SKB, BPF_PROG_TYPE_CGROUP_SOCK, BPF_PROG_TYPE_LWT_IN, BPF_PROG_TYPE_LWT_OUT, BPF_PROG_TYPE_LWT_XMIT, BPF_PROG_TYPE_SOCK_OPS, BPF_PROG_TYPE_SK_SKB, BPF_PROG_TYPE_CGROUP_DEVICE, BPF_PROG_TYPE_SK_MSG, BPF_PROG_TYPE_RAW_TRACEPOINT, BPF_PROG_TYPE_CGROUP_SOCK_ADDR, BPF_PROG_TYPE_LWT_SEG6LOCAL, BPF_PROG_TYPE_LIRC_MODE2, BPF_PROG_TYPE_SK_REUSEPORT, BPF_PROG_TYPE_FLOW_DISSECTOR, /* See /usr/include/linux/bpf.h for the full list. */ }; eBPF . bpf_attr : o insns -- struct bpf_insn. o insn_cnt -- , insns. o license -- , GPL, , gpl_only ( , , <>). o log_buf -- , , . , , eBPF . , . o log_size -- , log_buf. , -1, errno ENOSPC. o log_level verbosity level of the verifier. A value of zero means that the verifier will not provide a log; in this case, log_buf must be a null pointer, and log_size must be zero. close(2) , BPF_PROG_LOAD, eBPF ( ). eBPF eBPF, eBPF . , eBPF (kprobe, ) , . , , , eBPF, , , . eBPF The eBPF program type (prog_type) determines the subset of kernel helper functions that the program may call. The program type also determines the program input (context)--the format of struct bpf_context (which is the data blob passed into the eBPF program as the first argument). , , ( ). , () -- , -- . , eBPF , . 
: BPF_PROG_TYPE_SOCKET_FILTER ( Linux 3.19) , BPF_PROG_TYPE_SOCKET_FILTER : bpf_map_lookup_elem(map_fd, void *key) /* map_fd */ bpf_map_update_elem(map_fd, void *key, void *value) /* / */ bpf_map_delete_elem(map_fd, void *key) /* map_fd */ bpf_context struct __sk_buff. BPF_PROG_TYPE_KPROBE ( Linux 4.1) [ ] BPF_PROG_TYPE_SCHED_CLS ( Linux 4.1) [ ] BPF_PROG_TYPE_SCHED_ACT ( Linux 4.1) [ ] , . -. Linux 3.19, prog_fd sockfd, socket(2) : setsockopt(sockfd, SOL_SOCKET, SO_ATTACH_BPF, &prog_fd, sizeof(prog_fd)); Linux 4.1, eBPF, prog_fd, perf event_fd, perf_event_open(2) : ioctl(event_fd, PERF_EVENT_IOC_SET_BPF, prog_fd); : BPF_MAP_CREATE , eBPF. BPF_PROG_LOAD , eBPF. . -1, errno . E2BIG eBPF max_entries ( ) . EACCES BPF_PROG_LOAD: , , . - / . bpf() log_level = 1 log_buf , . EAGAIN For BPF_PROG_LOAD, indicates that needed resources are blocked. This happens when the verifier detects pending signals while it is checking the validity of the bpf program. In this case, just call bpf() again with the same parameters. EBADF fd . EFAULT (key, value, log_buf insns) . EINVAL cmd . EINVAL BPF_MAP_CREATE: map_type. EINVAL BPF_MAP_*_ELEM: union bpf_attr, , . EINVAL BPF_PROG_LOAD: . eBPF - , , , . ENOENT BPF_MAP_LOOKUP_ELEM BPF_MAP_DELETE_ELEM: key . ENOMEM . EPERM ( CAP_SYS_ADMIN). Linux. Linux 3.18. Prior to Linux 4.4, all bpf() commands require the caller to have the CAP_SYS_ADMIN capability. From Linux 4.4 onwards, an unprivileged user may create limited programs of type BPF_PROG_TYPE_SOCKET_FILTER and associated maps. However they may not store kernel pointers within the maps and are presently limited to the following helper functions: o get_random o get_smp_processor_id o tail_call o ktime_get_ns Unprivileged access may be blocked by writing the value 1 to the file /proc/sys/kernel/unprivileged_bpf_disabled. eBPF ( ) . , fork(2) , eBPF. , , eBPF, UNIX. , eBPF, dup(2) . eBPF , . eBPF C, ( clang) - eBPF. C , , , , . samples/bpf/*_kern.c . 
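       The kernel contains a just-in-time (JIT) compiler that translates eBPF bytecode into native machine code for better performance. Before Linux 4.15, the JIT compiler is disabled by default, but its operation can be controlled by writing one of the following integer strings to the file /proc/sys/net/core/bpf_jit_enable:

       0      Disable JIT compilation (default).

       1      Normal compilation.

       2      Debugging mode. The generated opcodes are dumped in hexadecimal into the kernel log. These opcodes can then be disassembled using the program tools/net/bpf_jit_disasm.c provided in the kernel source tree.

       Since Linux 4.15, the kernel may be configured with the CONFIG_BPF_JIT_ALWAYS_ON option. In this case, the JIT compiler is always enabled, and the bpf_jit_enable is initialized to 1 and is immutable. (This kernel configuration option was provided as a mitigation for one of the Spectre attacks against the BPF interpreter.)

       The JIT compiler for eBPF is currently available for the following architectures:

       o  x86-64 (since Linux 3.18; cBPF since Linux 3.0);
       o  ARM32 (since Linux 3.18; cBPF since Linux 3.4);
       o  SPARC 32 (since Linux 3.18; cBPF since Linux 3.5);
       o  ARM-64 (since Linux 3.18);
       o  s390 (since Linux 4.1; cBPF since Linux 3.7);
       o  PowerPC 64 (since Linux 4.8; cBPF since Linux 3.1);
       o  SPARC 64 (since Linux 4.12);
       o  x86-32 (since Linux 4.18);
       o  MIPS 64 (since Linux 4.18; cBPF since Linux 3.16);
       o  riscv (since Linux 5.1).

EXAMPLES
/* bpf+sockets example: * 1. create array map of 256 elements * 2. load program that counts number of packets received * r0 = skb->data[ETH_HLEN + offsetof(struct iphdr, protocol)] * map[r0]++ * 3. attach prog_fd to raw socket via setsockopt() * 4. 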
print number of received TCP/UDP packets every second */ int main(int argc, char *argv[]) { int sock, map_fd, prog_fd, key; long long value = 0, tcp_cnt, udp_cnt; map_fd = bpf_create_map(BPF_MAP_TYPE_ARRAY, sizeof(key), sizeof(value), 256); if (map_fd < 0) { printf("failed to create map '%s'\n", strerror(errno)); /* likely not run as root */ return 1; } struct bpf_insn prog[] = { BPF_MOV64_REG(BPF_REG_6, BPF_REG_1), /* r6 = r1 */ BPF_LD_ABS(BPF_B, ETH_HLEN + offsetof(struct iphdr, protocol)), /* r0 = ip->proto */ BPF_STX_MEM(BPF_W, BPF_REG_10, BPF_REG_0, -4), /* *(u32 *)(fp - 4) = r0 */ BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), /* r2 = fp */ BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -4), /* r2 = r2 - 4 */ BPF_LD_MAP_FD(BPF_REG_1, map_fd), /* r1 = map_fd */ BPF_CALL_FUNC(BPF_FUNC_map_lookup_elem), /* r0 = map_lookup(r1, r2) */ BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 2), /* if (r0 == 0) goto pc+2 */ BPF_MOV64_IMM(BPF_REG_1, 1), /* r1 = 1 */ BPF_XADD(BPF_DW, BPF_REG_0, BPF_REG_1, 0, 0), /* lock *(u64 *) r0 += r1 */ BPF_MOV64_IMM(BPF_REG_0, 0), /* r0 = 0 */ BPF_EXIT_INSN(), /* return r0 */ }; prog_fd = bpf_prog_load(BPF_PROG_TYPE_SOCKET_FILTER, prog, sizeof(prog) / sizeof(prog[0]), "GPL"); sock = open_raw_sock("lo"); assert(setsockopt(sock, SOL_SOCKET, SO_ATTACH_BPF, &prog_fd, sizeof(prog_fd)) == 0); for (;;) { key = IPPROTO_TCP; assert(bpf_lookup_elem(map_fd, &key, &tcp_cnt) == 0); key = IPPROTO_UDP; assert(bpf_lookup_elem(map_fd, &key, &udp_cnt) == 0); printf("TCP %lld UDP %lld packets\n", tcp_cnt, udp_cnt); sleep(1); } return 0; }

       Some complete working code can be found in the samples/bpf directory in the kernel source tree.

SEE ALSO
       seccomp(2), bpf-helpers(7), socket(7), tc(8), tc-bpf(8)

       Both classic and extended BPF are explained in the kernel source file Documentation/networking/filter.txt.

TRANSLATION
       The Russian translation of this manual page was prepared by Artyom Kunyov, Azamat Hackimov, Dmitriy Ovchinnikov, Dmitry Bolkhovskikh, ITriskTI and Yuri Kozlov (the translators' e-mail addresses were lost in extraction).

       This translation is free documentation; see the GNU General Public License version 3 or later for the copyright conditions. No liability is assumed.

       If you find an error in the translation of this manual page, please report it to the translators.

Linux man-pages 6.06 1 2023 . bpf(2)