函数原型
1 | /* |
上面的几个参数都是输入/输出参数,既传递输入值,存储返回结果。
源码分析
源码位置:fs/select.c
入口函数:sys_select
我们在程序中调用select函数时,系统调用该函数。该函数只是处理了下超时时间参数,然后调用core_sys_select
函数,最后计算剩余时间值。1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53asmlinkage long sys_select(int n, fd_set __user *inp, fd_set __user *outp,fd_set __user *exp, struct timeval __user *tvp)
{
s64 timeout = -1;
struct timeval tv;
int ret;
if (tvp) {
/* 将超时时间拷贝到内核空间 */
if (copy_from_user(&tv, tvp, sizeof(tv)))
return -EFAULT;
/* 非法时间 */
if (tv.tv_sec < 0 || tv.tv_usec < 0)
return -EINVAL;
/* Cast to u64 to make GCC stop complaining */
if ((u64)tv.tv_sec >= (u64)MAX_INT64_SECONDS)
timeout = -1; /* infinite */
else {
/* 计算出超时时间(转换为始终周期数) */
timeout = DIV_ROUND_UP(tv.tv_usec, USEC_PER_SEC/HZ);
timeout += tv.tv_sec * HZ;
}
}
ret = core_sys_select(n, inp, outp, exp, &timeout);
if (tvp) {
struct timeval rtv;
if (current->personality & STICKY_TIMEOUTS)
goto sticky;
/* rtv 是剩余值*/
rtv.tv_usec = jiffies_to_usecs(do_div((*(u64*)&timeout), HZ));
rtv.tv_sec = timeout;
if (timeval_compare(&rtv, &tv) >= 0)
rtv =smlinkage long sys_select() tv;
if (copy_to_user(tvp, &rtv, sizeof(rtv))) {
sticky:
/*
* If an application puts its timeval in read-only
* memory, we don't want the Linux-specific update to
* the timeval to cause a fault after the select has
* completed successfully. However, because we're not
* updating the timeval, we can't restart the system
* call.
*/
if (ret == -ERESTARTNOHAND)
ret = -EINTR;
}
}
return ret;
}
函数core_sys_select
该函数主要做了以下事情:
- fix传入的n值,使其不会超过文件描述符的最大值。这就是select能同时处理的连接受限的原因。
- 为描述符分配空间,将描述符从用户空间拷贝到内核空间。
- 调用
do_select
执行真正的IO复用。 - 将3的结果从用户内核空间拷贝的用户空间,返回给程序。
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80static int core_sys_select(int n, fd_set __user *inp, fd_set __user *outp, fd_set __user *exp, s64 *timeout)
{
fd_set_bits fds; /* 参考后面的数据结构 */
void *bits;
int ret, max_fds;
unsigned int size;
struct fdtable *fdt;
/* Allocate small arguments on the stack to save memory and be faster */
long stack_fds[SELECT_STACK_ALLOC/sizeof(long)];
ret = -EINVAL;
if (n < 0)
goto out_nofds;
/* max_fds can increase, so grab it once to avoid race */
rcu_read_lock();
/* 获取当前进程的文件描述符表 */
fdt = files_fdtable(current->files);
max_fds = fdt->max_fds;
rcu_read_unlock();
/* 修正用户传入的第一个参数:fd_set中文件描述符的最大值 */
if (n > max_fds)
n = max_fds;
/*
* We need 6 bitmaps (in/out/ex for both incoming and outgoing),
* since we used fdset we need to allocate memory in units of
* long-words.
*/
size = FDS_BYTES(n); //n个bit位需要的字节数
bits = stack_fds;
/* 读、写、异常,输入&输出,共需6倍大小的描述符空间 */
if (size > sizeof(stack_fds) / 6) {
/* Not enough space in on-stack array; must use kmalloc */
ret = -ENOMEM;
bits = kmalloc(6 * size, GFP_KERNEL);
if (!bits)
goto out_nofds;
}
fds.in = bits;
fds.out = bits + size;
fds.ex = bits + 2*size;
fds.res_in = bits + 3*size;
fds.res_out = bits + 4*size;
fds.res_ex = bits + 5*size;
/* get_fd_set仅仅调用copy_from_user从用户空间拷贝了fd_set */
if ((ret = get_fd_set(n, inp, fds.in)) ||
(ret = get_fd_set(n, outp, fds.out)) ||
(ret = get_fd_set(n, exp, fds.ex)))
goto out;
zero_fd_set(n, fds.res_in);
zero_fd_set(n, fds.res_out);
zero_fd_set(n, fds.res_ex);
ret = do_select(n, &fds, timeout);
if (ret < 0)
goto out;
if (!ret) {
ret = -ERESTARTNOHAND;
if (signal_pending(current))
goto out;
ret = 0;
}
/*
* 把结果集拷贝到用户空间
*/
if (set_fd_set(n, inp, fds.res_in) ||
set_fd_set(n, outp, fds.res_out) ||
set_fd_set(n, exp, fds.res_ex))
ret = -EFAULT;
out:
if (bits != stack_fds)
kfree(bits);
out_nofds:
return ret;
}
函数do_select
该函数会遍历所有的描述符,返回有事件发生的描述符。其中涉及到poll_wait相关的进程休眠、唤醒相关的代码没有去看,但这不影响我们对select的理解。1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109int do_select(int n, fd_set_bits *fds, s64 *timeout)
{
struct poll_wqueues table;
poll_table *wait;
int retval, i;
rcu_read_lock();
/*
* 根据已经打开fd的位图检查用户打开的fd,要求对应的fd必须打开,
* 并且返回最大的fd
*/
retval = max_select_fd(n, fds);
rcu_read_unlock();
if (retval < 0)
return retval;
n = retval;
/*将当前进程放入自已的等待队列table, 并将该等待队列加入到该测试表wait*/
poll_initwait(&table);
wait = &table.pt;
if (!*timeout)
wait = NULL;
retval = 0;
for (;;) {
unsigned long *rinp, *routp, *rexp, *inp, *outp, *exp;
long __timeout;
set_current_state(TASK_INTERRUPTIBLE);
inp = fds->in; outp = fds->out; exp = fds->ex;
rinp = fds->res_in; routp = fds->res_out; rexp = fds->res_ex;
for (i = 0; i < n; ++rinp, ++routp, ++rexp) {
unsigned long in, out, ex, all_bits, bit = 1, mask, j;
unsigned long res_in = 0, res_out = 0, res_ex = 0;
const struct file_operations *f_op = NULL;
struct file *file = NULL;
in = *inp++; out = *outp++; ex = *exp++;
all_bits = in | out | ex;
/* 由于数据描述符表是用的long,所以每次i要加上long所占的bit位数 */
if (all_bits == 0) {
i += __NFDBITS;
continue;
/* 有事件发生,遍历每一个bit */
for (j = 0; j < __NFDBITS; ++j, ++i, bit <<= 1) {
int fput_needed;
if (i >= n)
break;
if (!(bit & all_bits))
continue;
file = fget_light(i, &fput_needed);
if (file) {
f_op = file->f_op;
mask = DEFAULT_POLLMASK;
if (f_op && f_op->poll)
mask = (*f_op->poll)(file, retval ? NULL : wait);
fput_light(file, fput_needed);
if ((mask & POLLIN_SET) && (in & bit)) {
res_in |= bit;
retval++;
}
if ((mask & POLLOUT_SET) && (out & bit)) {
res_out |= bit;
retval++;
}
if ((mask & POLLEX_SET) && (ex & bit)) {
res_ex |= bit;
retval++;
}
}
cond_resched();
}
if (res_in)
*rinp = res_in;
if (res_out)
*routp = res_out;
if (res_ex)
*rexp = res_ex;
}
wait = NULL;
if (retval || !*timeout || signal_pending(current))
break;
if(table.error) {
retval = table.error;
break;
}
if (*timeout < 0) {
/* Wait indefinitely */
__timeout = MAX_SCHEDULE_TIMEOUT;
} else if (unlikely(*timeout >= (s64)MAX_SCHEDULE_TIMEOUT - 1)) {
/* Wait for longer than MAX_SCHEDULE_TIMEOUT. Do it in a loop */
__timeout = MAX_SCHEDULE_TIMEOUT - 1;
*timeout -= __timeout;
} else {
__timeout = *timeout;
*timeout = 0;
}
__timeout = schedule_timeout(__timeout);
if (*timeout >= 0)
*timeout += __timeout;
}
__set_current_state(TASK_RUNNING);
poll_freewait(&table);
return retval;
}
总结
select主要做了3件事情:
- 将需要监测的描述符从用户空间拷贝到内核空间;
- 遍历描述符,返回有事件发生的描述符;
- 将发生的描述符从内核空间拷贝到用户空间。
由于需要将全部描述符在内核空间和用户空间的来回拷贝、以及遍历,造成了效率的不高;而且描述符的个数也受系统最大文件描述符的限制。
一些数据数据结构
fd_set_bits1
2
3
4
5
6
7
8
9
10
11typedef struct {
unsigned long *in, *out, *ex;
unsigned long *res_in, *res_out, *res_ex;
} fd_set_bits;
/*
* How many longwords for "nr" bits?
*/
#define FDS_BITPERLONG (8*sizeof(long))
#define FDS_LONGS(nr) ((nr)+FDS_BITPERLONG-1)/FDS_BITPERLONG)
#define FDS_BYTES(nr) (FDS_LONG(nr)*sizeof(long))
fd_set_bits,标识出可读、可写、异常描述符表的输入和输出。
下面三个宏分别表示:long类型的bit数、nr个bit位需要几个long类型存放、nr个bt位存放需要的字节数。1
2
3
4typedef stuct poll_table_stuct{
poll_queue_proc qproc;
unsigned long key;
} poll_table;
poll_table:对每个文件进行poll操作时,判读是否能够非阻塞的进行key值(poll事件组成)标识的I/O操作;如果不能,则调用回调函数qproc将进程添加到文件的poll等待队列中。1
2
3
4
5
6struct poll_table_entry {
stuct file *file;
unsigned long key;
wait_queue_t wait;
wait_queue_head_t *wait_address;
}
poll_table_entry: 用于阻塞进程并将进程添加到文件的poll等待队列中,一个文件对应一个poll_table_entry。1
2
3
4
5
6
7
8
9stuct poll_wqueues {
poll_table pt;
struct poll_table_page *table;
struct task_struct *polling_task;
int triggered;
int error;
int inline_index;
struct poll_table_entry inline_entries[N_INLINE_POLL_ENTRIES];
};
poll_wqueues: 用于在select/poll时,如果需要阻塞进程,将进程添加到描述表标识的所有文件的poll等待队列中,以便任意一个文件可进行非阻塞I/O操作时唤醒进程。