bpf(2) System Calls Manual bpf(2) bpf - BPF #include <linux/bpf.h> int bpf(int cmd, union bpf_attr *attr, unsigned int size); bpf() , Berkeley. BPF ( eBPF) ("") BPF, . cBPF, eBPF , , . eBPF cBPF , ( BPF_CALL, eBPF) , eBPF. / BPF eBPF - . , . , / . ( /, ) . eBPF . , . , eBPF. , eBPF . 32, . , , , . , , bpf(). , . BPF_MAP_TYPE_PROG_ARRAY . eBPF . , , tc-bpf(8), , . tc eBPF , . , , , , bpf(). eBPF , . , eBPF . , eBPF, , . eBPF . , , ( eBPF, tc(8)), , . eBPF, eBPF. , eBPF . eBPF eBPF : A B C eth0 eth1 eth2 | | | | | ^ | | | | v | --> <-- tc tc _1 _2 _3 | | | | _4 _5 |--- -----| |------| _3 | | _1 _2 --| _4 |-- , bpf(), cmd. , attr, bpf_attr (. ). . size - , attr. , cmd, : BPF_MAP_CREATE , . close-on-exec (. fcntl(2)) . BPF_MAP_LOOKUP_ELEM . BPF_MAP_UPDATE_ELEM ( /) . BPF_MAP_DELETE_ELEM . BPF_MAP_GET_NEXT_KEY . BPF_PROG_LOAD eBPF, , . close-on-exec (. fcntl(2)) . bpf_attr , bpf(): union bpf_attr { struct { /* Used by BPF_MAP_CREATE */ __u32 map_type; __u32 key_size; /* size of key in bytes */ __u32 value_size; /* size of value in bytes */ __u32 max_entries; /* maximum number of entries in a map */ }; struct { /* Used by BPF_MAP_*_ELEM and BPF_MAP_GET_NEXT_KEY commands */ __u32 map_fd; __aligned_u64 key; union { __aligned_u64 value; __aligned_u64 next_key; }; __u64 flags; }; struct { /* Used by BPF_PROG_LOAD */ __u32 prog_type; __u32 insn_cnt; __aligned_u64 insns; /* 'const struct bpf_insn *' */ __aligned_u64 license; /* 'const char *' */ __u32 log_level; /* verbosity level of verifier */ __u32 log_size; /* size of user buffer */ __aligned_u64 log_buf; /* user supplied 'char *' buffer */ __u32 kern_version; /* checked when prog_type=kprobe (since Linux 4.1) */ }; } __attribute__((aligned(8))); eBPF maps . eBPF, . : o o o o - , bpf() . cmd . BPF_MAP_CREATE BPF_MAP_CREATE , . 
int bpf_create_map(enum bpf_map_type map_type, unsigned int key_size, unsigned int value_size, unsigned int max_entries) { union bpf_attr attr = { .map_type = map_type, .key_size = key_size, .value_size = value_size, .max_entries = max_entries }; return bpf(BPF_MAP_CREATE, &attr, sizeof(attr)); } , map_type , key_size, value_size max_entries. . -1, errno EINVAL, EPERM ENOMEM. key_size value_size , bpf_map_*_elem() key , c value , value_size. , key_size 8 eBPF bpf_map_lookup_elem(map_fd, fp - 4) , bpf_map_lookup_elem(map_fd, void *key) , 8 , key, fp -4 ( fp - ) . , man avalue_size 1 bpf value = bpf_map_lookup_elem(...); *(u32 *) value = 1; , value 1 value_size. map_type : enum bpf_map_type { BPF_MAP_TYPE_UNSPEC, /* Reserve 0 as invalid map type */ BPF_MAP_TYPE_HASH, BPF_MAP_TYPE_ARRAY, BPF_MAP_TYPE_PROG_ARRAY, BPF_MAP_TYPE_PERF_EVENT_ARRAY, BPF_MAP_TYPE_PERCPU_HASH, BPF_MAP_TYPE_PERCPU_ARRAY, BPF_MAP_TYPE_STACK_TRACE, BPF_MAP_TYPE_CGROUP_ARRAY, BPF_MAP_TYPE_LRU_HASH, BPF_MAP_TYPE_LRU_PERCPU_HASH, BPF_MAP_TYPE_LPM_TRIE, BPF_MAP_TYPE_ARRAY_OF_MAPS, BPF_MAP_TYPE_HASH_OF_MAPS, BPF_MAP_TYPE_DEVMAP, BPF_MAP_TYPE_SOCKMAP, BPF_MAP_TYPE_CPUMAP, BPF_MAP_TYPE_XSKMAP, BPF_MAP_TYPE_SOCKHASH, BPF_MAP_TYPE_CGROUP_STORAGE, BPF_MAP_TYPE_REUSEPORT_SOCKARRAY, BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE, BPF_MAP_TYPE_QUEUE, BPF_MAP_TYPE_STACK, /* See /usr/include/linux/bpf.h for the full list. */ }; map_type . eBPF bpf_map_lookup_elem() bpf_map_update_elem(). . BPF_MAP_LOOKUP_ELEM BPF_MAP_LOOKUP_ELEM key , fd. int bpf_lookup_elem(int fd, const void *key, void *value) { union bpf_attr attr = { .map_fd = fd, .key = ptr_to_u64(key), .value = ptr_to_u64(value), }; return bpf(BPF_MAP_LOOKUP_ELEM, &attr, sizeof(attr)); } , value, value_size . , -1 errno ENOENT. BPF_MAP_UPDATE_ELEM BPF_MAP_UPDATE_ELEM key/value , fd. 
int bpf_update_elem(int fd, const void *key, const void *value, uint64_t flags) { union bpf_attr attr = { .map_fd = fd, .key = ptr_to_u64(key), .value = ptr_to_u64(value), .flags = flags, }; return bpf(BPF_MAP_UPDATE_ELEM, &attr, sizeof(attr)); } flags : BPF_ANY . BPF_NOEXIST , . BPF_EXIST . . -1, errno EINVAL, EPERM, ENOMEM E2BIG. BPF_MAP_DELETE_ELEM BPF_MAP_DELETE_ELEM , key, , fd. int bpf_delete_elem(int fd, const void *key) { union bpf_attr attr = { .map_fd = fd, .key = ptr_to_u64(key), }; return bpf(BPF_MAP_DELETE_ELEM, &attr, sizeof(attr)); } . , -1, errno ENOENT. BPF_MAP_GET_NEXT_KEY BPF_MAP_GET_NEXT_KEY key , fd next_key . int bpf_get_next_key(int fd, const void *key, void *next_key) { union bpf_attr attr = { .map_fd = fd, .key = ptr_to_u64(key), .next_key = ptr_to_u64(next_key), }; return bpf(BPF_MAP_GET_NEXT_KEY, &attr, sizeof(attr)); } key , next_key . key , next_key . key , -1, errno ENOENT. errno ENOMEM, EFAULT, EPERM EINVAL. . close(map_fd) , map_fd. , , (, ). eBPF map types : BPF_MAP_TYPE_HASH -, : o . , eBPF , . o key/value. o map_update_elem() , max_entries ( , eBPF ). o map_update_elem() - . - . BPF_MAP_TYPE_ARRAY : o . verifier/JIT lookup(), . , value_size eBPF. , array_map_lookup_elem() "" /JIT-, . o . o . o map_delete_elem() EINVAL, . o map_update_elem() nonatomic; -. , : __sync_fetch_and_add() 32- 64- . , , , , , . . : o "" eBPF: 1 , () 0, "" , eBPF . o . o , , . BPF_MAP_TYPE_PROG_ARRAY (since Linux 4.2) - , , eBPF. , key_size, value_size . bpf_tail_call(). , eBPF void bpf_tail_call(void *context, void *prog_map, unsigned int index); , , , . eBPF. . , . eBPF ( , / 32 ), eBPF. . , , . , eBPF. , , , . eBPF BPF_PROG_LOAD eBPF . - , eBPF. 
char bpf_log_buf[LOG_BUF_SIZE]; int bpf_prog_load(enum bpf_prog_type type, const struct bpf_insn *insns, int insn_cnt, const char *license) { union bpf_attr attr = { .prog_type = type, .insns = ptr_to_u64(insns), .insn_cnt = insn_cnt, .license = ptr_to_u64(license), .log_buf = ptr_to_u64(bpf_log_buf), .log_size = LOG_BUF_SIZE, .log_level = 1, }; return bpf(BPF_PROG_LOAD, &attr, sizeof(attr)); } prog_type : enum bpf_prog_type { BPF_PROG_TYPE_UNSPEC, /* Reserve 0 as invalid program type */ BPF_PROG_TYPE_SOCKET_FILTER, BPF_PROG_TYPE_KPROBE, BPF_PROG_TYPE_SCHED_CLS, BPF_PROG_TYPE_SCHED_ACT, BPF_PROG_TYPE_TRACEPOINT, BPF_PROG_TYPE_XDP, BPF_PROG_TYPE_PERF_EVENT, BPF_PROG_TYPE_CGROUP_SKB, BPF_PROG_TYPE_CGROUP_SOCK, BPF_PROG_TYPE_LWT_IN, BPF_PROG_TYPE_LWT_OUT, BPF_PROG_TYPE_LWT_XMIT, BPF_PROG_TYPE_SOCK_OPS, BPF_PROG_TYPE_SK_SKB, BPF_PROG_TYPE_CGROUP_DEVICE, BPF_PROG_TYPE_SK_MSG, BPF_PROG_TYPE_RAW_TRACEPOINT, BPF_PROG_TYPE_CGROUP_SOCK_ADDR, BPF_PROG_TYPE_LWT_SEG6LOCAL, BPF_PROG_TYPE_LIRC_MODE2, BPF_PROG_TYPE_SK_REUSEPORT, BPF_PROG_TYPE_FLOW_DISSECTOR, /* See /usr/include/linux/bpf.h for the full list. */ }; eBPF . bpf_attr : o insns - struct bpf_insn. o insn_cnt - , insns. o license - , GPL gpl_only. ( , , , "Dual BSD/GPL"). o log_buf - , . , , , , eBPF . . o log_size - , log_buf. , -1, errno ENOSPC. o log_level - . , ; log_buf , log_size . close(2) , BPF_PROG_LOAD, eBPF (, ). eBPF eBPF, eBPF . , eBPF (, kprobe, ) , . , , , eBPF, . eBPF eBPF (prog_type) , . () struct bpf_context, , eBPF . , , ( ). , () , . , eBPF , . : BPF_PROG_TYPE_SOCKET_FILTER ( Linux 3.19) BPF_PROG_TYPE_SOCKET_FILTER : bpf_map_lookup_elem(map_fd, void *key) /* look up key in a map_fd */ bpf_map_update_elem(map_fd, void *key, void *value) /* update key/value */ bpf_map_delete_elem(map_fd, void *key) /* delete key in a map_fd */ bpf_context struct __sk_buff. BPF_PROG_TYPE_KPROBE ( Linux 4.1) [ ] BPF_PROG_TYPE_SCHED_CLS ( Linux 4.1) [ ] BPF_PROG_TYPE_SCHED_ACT ( Linux 4.1) [ ] , . -. 
Linux 3.19, prog_fd sockfd, socket(2).: setsockopt(sockfd, SOL_SOCKET, SO_ATTACH_BPF, &prog_fd, sizeof(prog_fd)); Linux 4.1, eBPF, prog_fd, perf, event_fd, perf_event_open(2): ioctl(event_fd, PERF_EVENT_IOC_SET_BPF, prog_fd); : BPF_MAP_CREATE , eBPF. BPF_PROG_LOAD , eBPF. . -1, errno . E2BIG eBPF , max_entries ( ). EACCES BPF_PROG_LOAD, , , , . , /, , , , . bpf() log_level = 1 log_buf , . EAGAIN BPF_PROG_LOAD , . , bpf. bpf() . EBADF fd . EFAULT (key value, log_buf, insns) . EINVAL cmd . EINVAL BPF_MAP_CREATE map_type, . EINVAL BPF_MAP_*_ELEM union bpf_attr, , . EINVAL BPF_PROG_LOAD . eBPF - , , , . ENOENT BPF_MAP_LOOKUP_ELEM BPF_MAP_DELETE_ELEM , key . ENOMEM . EPERM ( CAP_SYS_ADMIN). Linux. Linux 3.18. Linux 4.4 bpf() , CAP_SYS_ADMIN. Linux 4.4, BPF_PROG_TYPE_SOCKET_FILTER . : o get_random o get_smp_processor_id o tail_call o ktime_get_ns 1 /proc/sys/kernel/unprivileged_bpf_disabled. eBPF ( ) . , fork(2) , eBPF. , , eBPF, UNIX. , eBPF, , dup(2) . eBPF , , , . eBPF , ( clang) - bpf. C , , , , . samples/bpf/*_kern.c . JIT-, - bpf . Linux 4.15 JIT- , , /proc/sys/net/core/bpf_jit_enable: 0 JIT- ( ). 1 . 2 . . tools/net/bpf_jit_disasm.c, . Linux 4.15, CONFIG_BPF_JIT_ALWAYS_ON. JIT- , bpf_jit_enable 1 . Spectre BPF. JIT- bpf : o x86-64 ( Linux 3.18; cBPF Linux 3.0); o ARM32 ( Linux 3.18; cBPF Linux 3.4); o SPARC 32 ( Linux 3.18; cBPF Linux 3.5); o ARM-64 ( Linux 3.18); o s390 ( Linux 4.1; cBPF Linux 3.7); o PowerPC 64 ( Linux 4.8; cBPF Linux 3.1); o SPARC 64 ( Linux 4.12); o x86-32 ( Linux 4.18); o MIPS 64 ( Linux 4.18; cBPF Linux 3.16); o riscv ( Linux 5.1). /* bpf+sockets example: * 1. create array map of 256 elements * 2. load program that counts number of packets received * r0 = skb->data[ETH_HLEN + offsetof(struct iphdr, protocol)] * map[r0]++ * 3. attach prog_fd to raw socket via setsockopt() * 4. 
print number of received TCP/UDP packets every second */ int main(int argc, char *argv[]) { int sock, map_fd, prog_fd, key; long long value = 0, tcp_cnt, udp_cnt; map_fd = bpf_create_map(BPF_MAP_TYPE_ARRAY, sizeof(key), sizeof(value), 256); if (map_fd < 0) { printf("failed to create map '%s'\n", strerror(errno)); /* likely not run as root */ return 1; } struct bpf_insn prog[] = { BPF_MOV64_REG(BPF_REG_6, BPF_REG_1), /* r6 = r1 */ BPF_LD_ABS(BPF_B, ETH_HLEN + offsetof(struct iphdr, protocol)), /* r0 = ip->proto */ BPF_STX_MEM(BPF_W, BPF_REG_10, BPF_REG_0, -4), /* *(u32 *)(fp - 4) = r0 */ BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), /* r2 = fp */ BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -4), /* r2 = r2 - 4 */ BPF_LD_MAP_FD(BPF_REG_1, map_fd), /* r1 = map_fd */ BPF_CALL_FUNC(BPF_FUNC_map_lookup_elem), /* r0 = map_lookup(r1, r2) */ BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 2), /* if (r0 == 0) goto pc+2 */ BPF_MOV64_IMM(BPF_REG_1, 1), /* r1 = 1 */ BPF_XADD(BPF_DW, BPF_REG_0, BPF_REG_1, 0, 0), /* lock *(u64 *) r0 += r1 */ BPF_MOV64_IMM(BPF_REG_0, 0), /* r0 = 0 */ BPF_EXIT_INSN(), /* return r0 */ }; prog_fd = bpf_prog_load(BPF_PROG_TYPE_SOCKET_FILTER, prog, sizeof(prog) / sizeof(prog[0]), "GPL"); sock = open_raw_sock("lo"); assert(setsockopt(sock, SOL_SOCKET, SO_ATTACH_BPF, &prog_fd, sizeof(prog_fd)) == 0); for (;;) { key = IPPROTO_TCP; assert(bpf_lookup_elem(map_fd, &key, &tcp_cnt) == 0); key = IPPROTO_UDP; assert(bpf_lookup_elem(map_fd, &key, &udp_cnt) == 0); printf("TCP %lld UDP %lld packets\n", tcp_cnt, udp_cnt); sleep(1); } return 0; } samples/bpf . seccomp(2), bpf-helpers(7), socket(7), tc(8), tc-bpf(8) , BPF Documentation/networking/filter.txt. () Aleksandr Felda Kirill Rekhov ; GNU (GNU General Public License - GPL, 3 ) , - . - , , () () () <>. Linux 6.9.1 15 2024 . bpf(2)