select源码学习 // wangzhilong's blog

函数原型
源码分析
总结

函数原型

/*
 * maxfdp1: 指定待检测的描述符的个数
 * readset: 读描述符集合
 * writeset: 写描述符集合
 * execeptset: 异常描述符集合
 * timeout: 超时时间（一直、指定时间、不等待）
 *
 * 返回值：
 * 0: 超时
 * -1: 错误
 * >0: 可进行读、写、异常操作描述符的大小。
 */
int select(int maxfdp1,fd_set *readset,fd_set *writeset,fd_set *exceptset,const struct timeval *timeout)

上面的几个参数都是输入/输出参数，既传递输入值，存储返回结果。

源码分析

源码位置:fs/select.c

入口函数：sys_select

我们在程序中调用select函数时，系统调用该函数。该函数只是处理了下超时时间参数，然后调用core_sys_select函数，最后计算剩余时间值。

asmlinkage long sys_select(int n, fd_set __user *inp, fd_set __user *outp,fd_set __user *exp, struct timeval __user *tvp)
{
    s64 timeout = -1;
    struct timeval tv;
    int ret;

    if (tvp) {
        /* 将超时时间拷贝到内核空间 */
        if (copy_from_user(&tv, tvp, sizeof(tv)))
            return -EFAULT;

        /* 非法时间 */
        if (tv.tv_sec < 0 || tv.tv_usec < 0)
            return -EINVAL;

        /* Cast to u64 to make GCC stop complaining */
        if ((u64)tv.tv_sec >= (u64)MAX_INT64_SECONDS)
            timeout = -1;   /* infinite */
        else {
            /* 计算出超时时间(转换为始终周期数) */
            timeout = DIV_ROUND_UP(tv.tv_usec, USEC_PER_SEC/HZ);
            timeout += tv.tv_sec * HZ;
        }
    }
    
    ret = core_sys_select(n, inp, outp, exp, &timeout);
    if (tvp) {
        struct timeval rtv;

        if (current->personality & STICKY_TIMEOUTS)
            goto sticky;
        /* rtv 是剩余值*/
        rtv.tv_usec = jiffies_to_usecs(do_div((*(u64*)&timeout), HZ));
        rtv.tv_sec = timeout;
        if (timeval_compare(&rtv, &tv) >= 0)
            rtv =smlinkage long sys_select() tv;
        if (copy_to_user(tvp, &rtv, sizeof(rtv))) {
sticky:
            /*
             * If an application puts its timeval in read-only
             * memory, we don't want the Linux-specific update to
             * the timeval to cause a fault after the select has
             * completed successfully. However, because we're not
             * updating the timeval, we can't restart the system
             * call.
             */
            if (ret == -ERESTARTNOHAND)
                ret = -EINTR;
        }
    }

    return ret;
}

函数core_sys_select

该函数主要做了以下事情：

fix传入的n值，使其不会超过文件描述符的最大值。这就是select能同时处理的连接受限的原因。
为描述符分配空间，将描述符从用户空间拷贝到内核空间。
调用do_select执行真正的IO复用。

将3的结果从用户内核空间拷贝的用户空间，返回给程序。

static int core_sys_select(int n, fd_set __user *inp, fd_set __user *outp, fd_set __user *exp, s64 *timeout)
{
    fd_set_bits fds; /* 参考后面的数据结构 */
    void *bits;
    int ret, max_fds;
    unsigned int size;
    struct fdtable *fdt;
    /* Allocate small arguments on the stack to save memory and be faster */
    long stack_fds[SELECT_STACK_ALLOC/sizeof(long)];

    ret = -EINVAL;
    if (n < 0)
        goto out_nofds;

    /* max_fds can increase, so grab it once to avoid race */
    rcu_read_lock();
    /* 获取当前进程的文件描述符表 */
    fdt = files_fdtable(current->files);
    max_fds = fdt->max_fds;
    rcu_read_unlock();
    /* 修正用户传入的第一个参数:fd_set中文件描述符的最大值 */
    if (n > max_fds)
        n = max_fds;

    /*
     * We need 6 bitmaps (in/out/ex for both incoming and outgoing),
     * since we used fdset we need to allocate memory in units of
     * long-words.
     */
    size = FDS_BYTES(n); //n个bit位需要的字节数
    bits = stack_fds;
    /* 读、写、异常，输入&输出，共需6倍大小的描述符空间 */
    if (size > sizeof(stack_fds) / 6) {
        /* Not enough space in on-stack array; must use kmalloc */
        ret = -ENOMEM;
        bits = kmalloc(6 * size, GFP_KERNEL);
        if (!bits)
            goto out_nofds;
    }
    fds.in      = bits;
    fds.out     = bits +   size;
    fds.ex      = bits + 2*size;
    fds.res_in  = bits + 3*size;
    fds.res_out = bits + 4*size;
    fds.res_ex  = bits + 5*size;

    /* get_fd_set仅仅调用copy_from_user从用户空间拷贝了fd_set */
    if ((ret = get_fd_set(n, inp, fds.in)) ||
        (ret = get_fd_set(n, outp, fds.out)) ||
        (ret = get_fd_set(n, exp, fds.ex)))
        goto out;
    zero_fd_set(n, fds.res_in);
    zero_fd_set(n, fds.res_out);
    zero_fd_set(n, fds.res_ex);

    ret = do_select(n, &fds, timeout);

    if (ret < 0)
        goto out;
    if (!ret) {
        ret = -ERESTARTNOHAND;
        if (signal_pending(current))
            goto out;
        ret = 0;
    }

    /*
     * 把结果集拷贝到用户空间
     */
    if (set_fd_set(n, inp, fds.res_in) ||
        set_fd_set(n, outp, fds.res_out) ||
        set_fd_set(n, exp, fds.res_ex))
        ret = -EFAULT;

out:
    if (bits != stack_fds)
        kfree(bits);
out_nofds:
    return ret;
}

函数do_select

该函数会遍历所有的描述符，返回有事件发生的描述符。其中涉及到poll_wait相关的进程休眠、唤醒相关的代码没有去看，但这不影响我们对select的理解。

int do_select(int n, fd_set_bits *fds, s64 *timeout)
{
    struct poll_wqueues table;
    poll_table *wait;
    int retval, i;

    rcu_read_lock();
    /*
     * 根据已经打开fd的位图检查用户打开的fd，要求对应的fd必须打开，
     * 并且返回最大的fd
    */
    retval = max_select_fd(n, fds);
    rcu_read_unlock();

    if (retval < 0)
        return retval;
    n = retval;

    /*将当前进程放入自已的等待队列table, 并将该等待队列加入到该测试表wait*/
    poll_initwait(&table);
    wait = &table.pt;
    if (!*timeout)
        wait = NULL;
    retval = 0;
    for (;;) {
        unsigned long *rinp, *routp, *rexp, *inp, *outp, *exp;
        long __timeout;

        set_current_state(TASK_INTERRUPTIBLE);

        inp = fds->in; outp = fds->out; exp = fds->ex;
        rinp = fds->res_in; routp = fds->res_out; rexp = fds->res_ex;

        for (i = 0; i < n; ++rinp, ++routp, ++rexp) {
            unsigned long in, out, ex, all_bits, bit = 1, mask, j;
            unsigned long res_in = 0, res_out = 0, res_ex = 0;
            const struct file_operations *f_op = NULL;
            struct file *file = NULL;

            in = *inp++; out = *outp++; ex = *exp++;
            all_bits = in | out | ex;
            /* 由于数据描述符表是用的long，所以每次i要加上long所占的bit位数 */
            if (all_bits == 0) {
                i += __NFDBITS;
                continue;
            /* 有事件发生，遍历每一个bit */
            for (j = 0; j < __NFDBITS; ++j, ++i, bit <<= 1) {
                int fput_needed;
                if (i >= n)
                    break;
                if (!(bit & all_bits))
                    continue;
                file = fget_light(i, &fput_needed);
                if (file) {
                    f_op = file->f_op;
                    mask = DEFAULT_POLLMASK;
                    if (f_op && f_op->poll)
                        mask = (*f_op->poll)(file, retval ? NULL : wait);
                    fput_light(file, fput_needed);
                    if ((mask & POLLIN_SET) && (in & bit)) {
                        res_in |= bit;
                        retval++;
                    }
                    if ((mask & POLLOUT_SET) && (out & bit)) {
                        res_out |= bit;
                        retval++;
                    }
                    if ((mask & POLLEX_SET) && (ex & bit)) {
                        res_ex |= bit;
                        retval++;
                    }
                }
                cond_resched();
            }
            if (res_in)
                *rinp = res_in;
            if (res_out)
                *routp = res_out;
            if (res_ex)
                *rexp = res_ex;
        }
        wait = NULL;
        if (retval || !*timeout || signal_pending(current))
            break;
        if(table.error) {
            retval = table.error;
            break;
        }
      if (*timeout < 0) {
            /* Wait indefinitely */
            __timeout = MAX_SCHEDULE_TIMEOUT;
        } else if (unlikely(*timeout >= (s64)MAX_SCHEDULE_TIMEOUT - 1)) {
            /* Wait for longer than MAX_SCHEDULE_TIMEOUT. Do it in a loop */
            __timeout = MAX_SCHEDULE_TIMEOUT - 1;
            *timeout -= __timeout;
        } else {
            __timeout = *timeout;
            *timeout = 0;
        }
        __timeout = schedule_timeout(__timeout);
        if (*timeout >= 0)
            *timeout += __timeout;
    }
    __set_current_state(TASK_RUNNING);

    poll_freewait(&table);

    return retval;
}

总结

select主要做了3件事情：

将需要监测的描述符从用户空间拷贝到内核空间；
遍历描述符，返回有事件发生的描述符；
将发生的描述符从内核空间拷贝到用户空间。

由于需要将全部描述符在内核空间和用户空间的来回拷贝、以及遍历，造成了效率的不高；而且描述符的个数也受系统最大文件描述符的限制。

一些数据数据结构
fd_set_bits

typedef struct {
    unsigned long *in, *out, *ex;
    unsigned long *res_in, *res_out, *res_ex;
} fd_set_bits;

/*
 * How many longwords for "nr" bits?
 */
 #define FDS_BITPERLONG (8*sizeof(long))
 #define FDS_LONGS(nr) ((nr)+FDS_BITPERLONG-1)/FDS_BITPERLONG)
 #define FDS_BYTES(nr) (FDS_LONG(nr)*sizeof(long))

fd_set_bits，标识出可读、可写、异常描述符表的输入和输出。
下面三个宏分别表示：long类型的bit数、nr个bit位需要几个long类型存放、nr个bt位存放需要的字节数。

typedef stuct poll_table_stuct{
    poll_queue_proc qproc;
    unsigned long key;
} poll_table;

poll_table：对每个文件进行poll操作时，判读是否能够非阻塞的进行key值（poll事件组成）标识的I/O操作；如果不能，则调用回调函数qproc将进程添加到文件的poll等待队列中。

struct poll_table_entry {
    stuct file *file;
    unsigned long key;
    wait_queue_t wait;
    wait_queue_head_t *wait_address;
}

poll_table_entry: 用于阻塞进程并将进程添加到文件的poll等待队列中，一个文件对应一个poll_table_entry。

stuct poll_wqueues {
    poll_table pt;
    struct poll_table_page *table;
    struct task_struct *polling_task;
    int triggered;
    int error;
    int inline_index;
    struct poll_table_entry inline_entries[N_INLINE_POLL_ENTRIES];
};

poll_wqueues: 用于在select/poll时，如果需要阻塞进程，将进程添加到描述表标识的所有文件的poll等待队列中，以便任意一个文件可进行非阻塞I/O操作时唤醒进程。