select源码学习

函数原型

1
2
3
4
5
6
7
8
9
10
11
12
13
/*
* maxfdp1: 指定待检测的描述符的个数
* readset: 读描述符集合
* writeset: 写描述符集合
* execeptset: 异常描述符集合
* timeout: 超时时间(一直、指定时间、不等待)
*
* 返回值:
* 0: 超时
* -1: 错误
* >0: 可进行读、写、异常操作描述符的大小。
*/

int select(int maxfdp1,fd_set *readset,fd_set *writeset,fd_set *exceptset,const struct timeval *timeout)

上面的几个参数都是输入/输出参数,既传递输入值,存储返回结果。


源码分析

源码位置:fs/select.c

入口函数:sys_select

我们在程序中调用select函数时,系统调用该函数。该函数只是处理了下超时时间参数,然后调用core_sys_select函数,最后计算剩余时间值。

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
asmlinkage long sys_select(int n, fd_set __user *inp, fd_set __user *outp,fd_set __user *exp, struct timeval __user *tvp)
{

s64 timeout = -1;
struct timeval tv;
int ret;

if (tvp) {
/* 将超时时间拷贝到内核空间 */
if (copy_from_user(&tv, tvp, sizeof(tv)))
return -EFAULT;

/* 非法时间 */
if (tv.tv_sec < 0 || tv.tv_usec < 0)
return -EINVAL;

/* Cast to u64 to make GCC stop complaining */
if ((u64)tv.tv_sec >= (u64)MAX_INT64_SECONDS)
timeout = -1; /* infinite */
else {
/* 计算出超时时间(转换为始终周期数) */
timeout = DIV_ROUND_UP(tv.tv_usec, USEC_PER_SEC/HZ);
timeout += tv.tv_sec * HZ;
}
}

ret = core_sys_select(n, inp, outp, exp, &timeout);
if (tvp) {
struct timeval rtv;

if (current->personality & STICKY_TIMEOUTS)
goto sticky;
/* rtv 是剩余值*/
rtv.tv_usec = jiffies_to_usecs(do_div((*(u64*)&timeout), HZ));
rtv.tv_sec = timeout;
if (timeval_compare(&rtv, &tv) >= 0)
rtv =smlinkage long sys_select() tv;
if (copy_to_user(tvp, &rtv, sizeof(rtv))) {
sticky:
/*
* If an application puts its timeval in read-only
* memory, we don't want the Linux-specific update to
* the timeval to cause a fault after the select has
* completed successfully. However, because we're not
* updating the timeval, we can't restart the system
* call.
*/

if (ret == -ERESTARTNOHAND)
ret = -EINTR;
}
}

return ret;
}

函数core_sys_select

该函数主要做了以下事情:

  1. fix传入的n值,使其不会超过文件描述符的最大值。这就是select能同时处理的连接受限的原因。
  2. 为描述符分配空间,将描述符从用户空间拷贝到内核空间。
  3. 调用do_select执行真正的IO复用。
  4. 将3的结果从用户内核空间拷贝的用户空间,返回给程序。
    1
    2
    3
    4
    5
    6
    7
    8
    9
    10
    11
    12
    13
    14
    15
    16
    17
    18
    19
    20
    21
    22
    23
    24
    25
    26
    27
    28
    29
    30
    31
    32
    33
    34
    35
    36
    37
    38
    39
    40
    41
    42
    43
    44
    45
    46
    47
    48
    49
    50
    51
    52
    53
    54
    55
    56
    57
    58
    59
    60
    61
    62
    63
    64
    65
    66
    67
    68
    69
    70
    71
    72
    73
    74
    75
    76
    77
    78
    79
    80
    static int core_sys_select(int n, fd_set __user *inp, fd_set __user *outp, fd_set __user *exp, s64 *timeout)
    {

    fd_set_bits fds; /* 参考后面的数据结构 */
    void *bits;
    int ret, max_fds;
    unsigned int size;
    struct fdtable *fdt;
    /* Allocate small arguments on the stack to save memory and be faster */
    long stack_fds[SELECT_STACK_ALLOC/sizeof(long)];

    ret = -EINVAL;
    if (n < 0)
    goto out_nofds;

    /* max_fds can increase, so grab it once to avoid race */
    rcu_read_lock();
    /* 获取当前进程的文件描述符表 */
    fdt = files_fdtable(current->files);
    max_fds = fdt->max_fds;
    rcu_read_unlock();
    /* 修正用户传入的第一个参数:fd_set中文件描述符的最大值 */
    if (n > max_fds)
    n = max_fds;

    /*
    * We need 6 bitmaps (in/out/ex for both incoming and outgoing),
    * since we used fdset we need to allocate memory in units of
    * long-words.
    */

    size = FDS_BYTES(n); //n个bit位需要的字节数
    bits = stack_fds;
    /* 读、写、异常,输入&输出,共需6倍大小的描述符空间 */
    if (size > sizeof(stack_fds) / 6) {
    /* Not enough space in on-stack array; must use kmalloc */
    ret = -ENOMEM;
    bits = kmalloc(6 * size, GFP_KERNEL);
    if (!bits)
    goto out_nofds;
    }
    fds.in = bits;
    fds.out = bits + size;
    fds.ex = bits + 2*size;
    fds.res_in = bits + 3*size;
    fds.res_out = bits + 4*size;
    fds.res_ex = bits + 5*size;

    /* get_fd_set仅仅调用copy_from_user从用户空间拷贝了fd_set */
    if ((ret = get_fd_set(n, inp, fds.in)) ||
    (ret = get_fd_set(n, outp, fds.out)) ||
    (ret = get_fd_set(n, exp, fds.ex)))
    goto out;
    zero_fd_set(n, fds.res_in);
    zero_fd_set(n, fds.res_out);
    zero_fd_set(n, fds.res_ex);

    ret = do_select(n, &fds, timeout);

    if (ret < 0)
    goto out;
    if (!ret) {
    ret = -ERESTARTNOHAND;
    if (signal_pending(current))
    goto out;
    ret = 0;
    }

    /*
    * 把结果集拷贝到用户空间
    */

    if (set_fd_set(n, inp, fds.res_in) ||
    set_fd_set(n, outp, fds.res_out) ||
    set_fd_set(n, exp, fds.res_ex))
    ret = -EFAULT;

    out:
    if (bits != stack_fds)
    kfree(bits);
    out_nofds:
    return ret;
    }
函数do_select

该函数会遍历所有的描述符,返回有事件发生的描述符。其中涉及到poll_wait相关的进程休眠、唤醒相关的代码没有去看,但这不影响我们对select的理解。

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
int do_select(int n, fd_set_bits *fds, s64 *timeout)
{

struct poll_wqueues table;
poll_table *wait;
int retval, i;

rcu_read_lock();
/*
* 根据已经打开fd的位图检查用户打开的fd,要求对应的fd必须打开,
* 并且返回最大的fd
*/

retval = max_select_fd(n, fds);
rcu_read_unlock();

if (retval < 0)
return retval;
n = retval;

/*将当前进程放入自已的等待队列table, 并将该等待队列加入到该测试表wait*/
poll_initwait(&table);
wait = &table.pt;
if (!*timeout)
wait = NULL;
retval = 0;
for (;;) {
unsigned long *rinp, *routp, *rexp, *inp, *outp, *exp;
long __timeout;

set_current_state(TASK_INTERRUPTIBLE);

inp = fds->in; outp = fds->out; exp = fds->ex;
rinp = fds->res_in; routp = fds->res_out; rexp = fds->res_ex;

for (i = 0; i < n; ++rinp, ++routp, ++rexp) {
unsigned long in, out, ex, all_bits, bit = 1, mask, j;
unsigned long res_in = 0, res_out = 0, res_ex = 0;
const struct file_operations *f_op = NULL;
struct file *file = NULL;

in = *inp++; out = *outp++; ex = *exp++;
all_bits = in | out | ex;
/* 由于数据描述符表是用的long,所以每次i要加上long所占的bit位数 */
if (all_bits == 0) {
i += __NFDBITS;
continue;
/* 有事件发生,遍历每一个bit */
for (j = 0; j < __NFDBITS; ++j, ++i, bit <<= 1) {
int fput_needed;
if (i >= n)
break;
if (!(bit & all_bits))
continue;
file = fget_light(i, &fput_needed);
if (file) {
f_op = file->f_op;
mask = DEFAULT_POLLMASK;
if (f_op && f_op->poll)
mask = (*f_op->poll)(file, retval ? NULL : wait);
fput_light(file, fput_needed);
if ((mask & POLLIN_SET) && (in & bit)) {
res_in |= bit;
retval++;
}
if ((mask & POLLOUT_SET) && (out & bit)) {
res_out |= bit;
retval++;
}
if ((mask & POLLEX_SET) && (ex & bit)) {
res_ex |= bit;
retval++;
}
}
cond_resched();
}
if (res_in)
*rinp = res_in;
if (res_out)
*routp = res_out;
if (res_ex)
*rexp = res_ex;
}
wait = NULL;
if (retval || !*timeout || signal_pending(current))
break;
if(table.error) {
retval = table.error;
break;
}
if (*timeout < 0) {
/* Wait indefinitely */
__timeout = MAX_SCHEDULE_TIMEOUT;
} else if (unlikely(*timeout >= (s64)MAX_SCHEDULE_TIMEOUT - 1)) {
/* Wait for longer than MAX_SCHEDULE_TIMEOUT. Do it in a loop */
__timeout = MAX_SCHEDULE_TIMEOUT - 1;
*timeout -= __timeout;
} else {
__timeout = *timeout;
*timeout = 0;
}
__timeout = schedule_timeout(__timeout);
if (*timeout >= 0)
*timeout += __timeout;
}
__set_current_state(TASK_RUNNING);

poll_freewait(&table);

return retval;
}

总结

select主要做了3件事情:

  1. 将需要监测的描述符从用户空间拷贝到内核空间;
  2. 遍历描述符,返回有事件发生的描述符;
  3. 将发生的描述符从内核空间拷贝到用户空间。

由于需要将全部描述符在内核空间和用户空间的来回拷贝、以及遍历,造成了效率的不高;而且描述符的个数也受系统最大文件描述符的限制。


一些数据数据结构
fd_set_bits

1
2
3
4
5
6
7
8
9
10
11
typedef struct {
unsigned long *in, *out, *ex;
unsigned long *res_in, *res_out, *res_ex;
} fd_set_bits;

/*
* How many longwords for "nr" bits?
*/

#define FDS_BITPERLONG (8*sizeof(long))
#define FDS_LONGS(nr) ((nr)+FDS_BITPERLONG-1)/FDS_BITPERLONG)
#define FDS_BYTES(nr) (FDS_LONG(nr)*sizeof(long))

fd_set_bits,标识出可读、可写、异常描述符表的输入和输出。
下面三个宏分别表示:long类型的bit数、nr个bit位需要几个long类型存放、nr个bt位存放需要的字节数。

1
2
3
4
typedef stuct poll_table_stuct{
poll_queue_proc qproc;
unsigned long key;
} poll_table;

poll_table:对每个文件进行poll操作时,判读是否能够非阻塞的进行key值(poll事件组成)标识的I/O操作;如果不能,则调用回调函数qproc将进程添加到文件的poll等待队列中。

1
2
3
4
5
6
struct poll_table_entry {
stuct file *file;
unsigned long key;
wait_queue_t wait;
wait_queue_head_t *wait_address;
}

poll_table_entry: 用于阻塞进程并将进程添加到文件的poll等待队列中,一个文件对应一个poll_table_entry。

1
2
3
4
5
6
7
8
9
stuct poll_wqueues {
poll_table pt;
struct poll_table_page *table;
struct task_struct *polling_task;
int triggered;
int error;
int inline_index;
struct poll_table_entry inline_entries[N_INLINE_POLL_ENTRIES];
};

poll_wqueues: 用于在select/poll时,如果需要阻塞进程,将进程添加到描述表标识的所有文件的poll等待队列中,以便任意一个文件可进行非阻塞I/O操作时唤醒进程。