select、poll、epoll解析

224 阅读 0 评论 148 点赞

我是靠谱客的博主饱满黑夜，这篇文章主要介绍select、poll、epoll解析，现在分享给大家，希望可以做个参考。

一、select

#include <sys/select.h>
#include <sys/time.h>

int select(int nfds, fd_set *readfds, fd_set *writefds, fd_set *exceptfds, struct timeval *timeout);

void FD_CLR(int fd, fd_set *set);
int  FD_ISSET(int fd, fd_set *set);
void FD_SET(int fd, fd_set *set);
void FD_ZERO(fd_set *set);
// nfds应设置为readfds、writefds、exceptfds这三个集合中最大的文件描述符值加1

static int do_select(int n, fd_set_bits *fds, struct timespec64 *end_time)
{
  // ... ...
  poll_initwait(&table);  // init_poll_funcptr(&pwq->pt, __pollwait); 注册回调函数
  for (;;) {
    // ... ...
    for (i = 0; i < n; ++rinp, ++routp, ++rexp) {  // 遍历所有fd
      // ... ...
      f = fdget(i);
      if (f.file) {
        if (f_op->poll) {  // 调用对应的poll方法
          // 对每个fd进行I/O事件检测
          mask = (*f_op->poll)(f.file, wait);  // 返回一个描述读写操作是否就绪的mask掩码
        }
        fdput(f);
        // 根据mask掩码判断是否有就绪的fd
        if ((mask & POLLIN_SET) && (in & bit)) {
            retval++;
        }
        if ((mask & POLLOUT_SET) && (out & bit)) {
          retval++;
        }
        if ((mask & POLLEX_SET) && (ex & bit)) {
          retval++;
        }
      }
      cond_resched();  // 让出CPU给其他进程运行，下次执行时从这里开始
    }
    // 如果有准备好的fd，或者超时，或者有信号打断，退出循环
    if (retval || timed_out || signal_pending(current))
      break;
    // 当do_select()没有达到上述三种情况时，会让当前进程去休眠一段时间，
    // 等待fd设备或定时器来唤醒自己，然后再继续循环看看哪些fd可用，提高效率
    if (!poll_schedule_timeout(&table, TASK_INTERRUPTIBLE, to, slack))
      timed_out = 1;
  }
}

步骤：

1、用copy_from_user从用户空间拷贝fd_set到内核空间

2、注册回调函数__pollwait

3、遍历所有fd，调用对应的poll方法（会调用到__pollwait），poll函数返回一个mask掩码，表示读写操作是否就绪。

4、根据mask掩码判断是否有就绪的fd

5、把fd_set从内核空间拷贝到用户空间

缺点：

1、每次调用select，都需要把fd_set集合从用户态拷贝到内核态。（fd很多时开销会很大）

2、每次调用select都需要在内核遍历传进来的所有fd。（fd很多时开销会很大）

3、select支持的文件描述符数量太小，默认是1024。

include/uapi/linux/posix_types.h

#define __FD_SETSIZE    1024

二、poll

poll的实现和select几乎一样，不同的地方在于把fd_set替换成了pollfd。调用者向poll()传入pollfd数组，内核使用链表分块保存这些文件描述符，因此poll()没有最大文件描述符数量的限制，但是数量过大后性能也会下降。同时用户态到内核态之间的拷贝问题依然存在。

#include <poll.h>

int poll(struct pollfd *fds, nfds_t nfds, int timeout);

struct pollfd {

  int   fd;       /* file descriptor */

  short events;   /* requested events */

  short revents;  /* returned events */

};

// events and revents可能的取值

POLLIN:There is data to read.

POLLPRI:There is urgent data to read (e.g., out-of-band data on TCP socket).

POLLOUT:Writing now will not block.

POLLRDHUP:Stream socket peer closed connection, or shut down writing half of connection.

POLLERR:Error condition (output only).

POLLHUP:Hang up (output only).

POLLNVAL:Invalid request: fd not open (output only).

三、epoll

#include <sys/epoll.h>

int epoll_create(int size);

int epoll_ctl(int epfd, int op, int fd, struct epoll_event *event);

int epoll_wait(int epfd, struct epoll_event *events, int maxevents, int timeout);

1、epoll_create()

int epoll_create(int size);

// 创建一个epoll的句柄，size用来告诉内核这个监听的数目一共有多大（自Linux 2.6.8起该参数被忽略，但仍必须大于0）。

// 当创建好epoll句柄后，它就会占用一个fd值。在使用完epoll后，必须调用close()关闭，否则可能导致fd被耗尽。

// 返回值：成功时返回非负的文件描述符(fd)，失败时返回-1并设置errno

当进程调用epoll_create()创建一个epoll对象时，内核会创建一个eventpoll结构体。

struct eventpoll {
    // ... ...
    /* List of ready file descriptors */
    struct list_head rdllist;  // 双链表中存放着将要通过epoll_wait返回给用户的满足条件的事件
    /* RB tree root used to store monitored fd structs */
    struct rb_root rbr; // 红黑树根节点，树存储着所有添加到epoll中的需要监控的事件
    // ... ...
};

在epoll对象中，每一个事件event都会建立一个epitem结构体。

struct epitem {
    union {
        /* RB tree node links this structure to the eventpoll RB tree */
        struct rb_node rbn;  // 红黑树节点
        // ... ...
    };
    /* List header used to link this structure to the eventpoll ready list */
    struct list_head rdllink;  // 双向链表节点
    /* The file descriptor information this item refers to */
    struct epoll_filefd ffd;  // 事件句柄信息
    struct eventpoll *ep; // 指向其所属的eventpoll对象
    /* The structure that describe the interested events and the source fd */
    struct epoll_event event;  // 期待发生的事件类型
};

2、epoll_ctl()

事件注册函数，当op为EPOLL_CTL_ADD时，会为该fd注册回调函数ep_poll_callback；之后当该fd上的事件就绪时，内核会调用这个回调函数，它会将事件的rdllink添加到rdllist双链表中。

int epoll_ctl(int epfd, int op, int fd, struct epoll_event *event);
// 事件注册函数
// epfd:epoll_create()的返回值

// op:表示动作
EPOLL_CTL_ADD:注册新的fd到epfd;
EPOLL_CTL_MOD:修改已经注册的fd的监听事件;
EPOLL_CTL_DEL:从epfd中删除一个fd;

// fd:需要监听的fd
// event:告诉内核需要监听什么事件
// 返回值：成功返回0，失败返回-1并设置errno

typedef union epoll_data {
  void        *ptr;
  int          fd;
  __uint32_t   u32;
  __uint64_t   u64;
} epoll_data_t;

struct epoll_event {
  __uint32_t   events;  /* Epoll events */
  epoll_data_t data;    /* User data variable */
};

// events表示对应的文件描述符可以进行的操作
EPOLLIN:可读；
EPOLLOUT:可写；
EPOLLRDHUP:流式socket的对端关闭了连接，或关闭了连接的写半部；
EPOLLPRI:有紧急的数据可读；
EPOLLERR:发生错误；
EPOLLHUP:被挂断；
EPOLLET:将EPOLL设置为边缘触发模式(Edge Triggered)；
EPOLLONESHOT:只监听一次事件，当监听完这次事件之后，如果还需要继续监听这个socket，需要再次把这个socket加入到EPOLL队列里。

3、epoll_wait()

只需要检查rdllist是否为空，如果不为空，则把发生的事件复制到用户态，同时将事件数量返回给用户。

int epoll_wait(int epfd, struct epoll_event *events, int maxevents, int timeout);
// 等待事件的产生
// events:从内核得到事件的集合
// maxevents:告诉内核这个events数组有多大，该值必须大于0（与epoll_create()时的size无关）
// timeout:超时时间，毫秒，0表示立即返回，-1表示无限期等待，也就是永久阻塞
// 返回值：需要处理的fd数目，0表示timeout，-1表示出错（并设置errno）

对于select的三个缺点，epoll都可以避免。

第一个缺点，epoll的解决方案在epoll_ctl函数中，每次注册新的事件到句柄中时(EPOLL_CTL_ADD)，会把该fd及其监听事件拷贝进内核，而不是在

epoll_wait的时候重复拷贝。epoll保证了每个fd在整个过程中只会拷贝一次。

对于第二个缺点，epoll的解决方案不像select或poll一样每次都把current轮流加入fd对应的设备等待队列中，而只在epoll_ctl时把current挂一遍，

并为每个fd指定一个回调函数。当设备就绪，唤醒等待队列上的等待者时，就会调用这个回调函数，而这个回调函数会把就绪的fd加入一个就绪链表。

epoll_wait的工作实际上就是在这个就绪链表中查看有没有就绪的fd（也会休眠一会，然后再判断）。

4、总结

通过红黑树和双链表数据结构，并结合回调机制，使得epoll高效。

（1）进程调用epoll_create()时，创建一个epoll对象，对应内核创建的一个eventpoll结构体。

（2）调用epoll_ctl()时，如果op为EPOLL_CTL_ADD，向epoll对象中添加事件，这些事件会挂载在红黑树上（rbn）。重复添加的事件就可以通过红黑树高效地识别出来。

（3）所添加到epoll中的事件会与设备驱动程序建立回调关系。也就是说当相应的事件发生时会调用回调函数ep_poll_callback，它会将事件的rdllink添加到rdllist双链表中。

（4）调用epoll_wait()时，检查是否有事件发生时，只需要检查eventpoll对象中的rdllist双链表中是否有epitem元素即可。