epoll源码分析

- 主要数据结构
- epoll_create()函数实现
- - ep_alloc()：初始化结构
  - 初始化eventpoll
- epoll_ctl()函数实现
- - ep_insert回调函数的实现
  - ep_ptable_queue_proc函数
  - ep_poll_callback
- epoll_wait函数
- - SYSCALL_DEFINE4(epoll_wait, ...)
  - ep_poll
  - ep_send_events

cmd

主要数据结构

struct eventpoll
{
    spinlock_t lock;	// 自旋锁
    struct mutex mtx;	// 防止使用时被删除
    wait_queue_head_t wq;	// sys_epoll_wait() 使用的等待队列
    wait_queue_head_t poll_wait;	// file->poll() 使用的等待队列
    struct list_head rdllist;	// 事件满足条件的链表, 事件就绪, 准备在epoll_wait时写入用户空间
    struct rb_root rbr;	
    struct epitem *ovflist;	// 将事件到达的fd进行连接, 并发送至用户空间
    struct user_struct *user;
};

struct epitem
{
	struct rb_node rbn;
    struct list_head rdllink;	// 事件的就绪队列
    struct epoll_filed ffd;
    int nwait;
    struct list_ead pwqlist;	// poll 等待队列
    struct eventpoll *ep;		// 属于哪个结构体
    struct list_head fllink;	// 链接fd对应的file链表
    struct epoll_event event;	// 事件
};

struct epoll_filefd
{
    struct file *file;
    int fd;
};

static inline void __list_add(struct list_head *new, struct list_head *prev, struct list_head *next)
{
    next->prev = new;
    new->prev = prev;
    new->next = next;
    prev->next = new;
}

static inline void list_add_tail(struct list_head *new, struct list_head *head)
{
    __list_add(new, head->prev, head);
}

// 检查是否为空链表
static inline int waitqueue_active(wait_queue_t *q)
{
    return !list_empty(&q->task_list);
}

epoll_create()函数实现

ep_alloc()：初始化结构

通过get_cunrent_user获得用户信息；
通过kzalloc()分配空间；
失败则释放获得的用户信息并退出；
初始化自旋锁
加锁
初始化等待队列、就绪队列、红黑树根；
ep保存用户信息、红黑树头、等待队列和就绪队列。

初始化eventpoll

// fs/eventpoll.c
static int ep_alloc(strut eventpoll **pep)
{
    ...
    struct user_struct *user;
    struct eventpoll *ep;
    user = get_current_user();	// 获得用户的信息
    ep = kzalloc(sizeog(*ep), GFP_KERNEL);	// 申请空间, kzalloc()与kalloc基本一样, 只是会在申请空间之后将其清零, 避免以前的数据影响.
    /*加锁*/
    spin_lock_init(&ep->lock);
    mutex_init(&ep->mtx);
    
    /*队列的初始化*/
    init_waitqueue_head(&ep->wq);
    init_waitqueue_head(&ep->poll_wait);
    INIT_LIST_HEAED(&ep->rdllist);
    ...
}

// kernel/wait.c
void init_waitqueue_head(wwait_queue_head_t *q)
{
    spin_lock_init(&q->lock);
    INIT_LIST_HEAD(&q->task_list);
}

// epoll_create()的系统调用
SYSCALL_DEFINE1(epoll_create1, int, flags)
{
    int errno, fd = -1;
    struct eventpoll *ep;
    ...
    errnno = ep_alloc(&ep);
    ...
    // 重点 : 创建匿名文件, 并返回文件描述符
    fd = anon_inode_getfd("[eventpoll]", &eventpoll_fops, ep, flags & O_CLOEXEC);
}
SYSCALL_DEFINE1(epoll_create, int, size)
{
    if(size <= 0)
        return EINVAL;
    // 这里我一直认为是size, 但是源码是这样, 并且也没有人改, 应该是我没有理解吧.
    return sys_epoll_create1(0);
}

// 创建匿名文件，返回匿名文件的文件描述符
// 创建文件描述符, inode 指针的指向
int anon_inode_getfd(const char *name, const struct file_operations *fops,
		     void *priv, int flags)
{
	struct qstr this;
	struct dentry *dentry;
	struct file *file;
	int error, fd;

    // 完成文件描述符的分配
	error = get_unused_fd_flags(flags);
	if (error < 0)
		return error;
	fd = error;
    
	error = -ENOMEM;
    // 初始化即将放入目录项中的文件名
	this.name = name;
	this.len = strlen(name);
	this.hash = 0;
    // 文件名信息加入到目录项中
	dentry = d_alloc(anon_inode_mnt->mnt_sb->s_root, &this);
	if (!dentry)
		goto err_put_unused_fd;
	// 为文件分配空间, 映射地址
	...

	return fd;
err_dput:
	...
}

// 文件描述符的分配
// 创建文件, 文件, 返回文件描述符
// 这里传入的 start的值为 0
int alloc_fd(unsigned start, unsigned flags)
{
    // currnet指向当前进程的指针, 通过current获得进程的文件表
	struct files_struct *files = current->files;
	unsigned int fd;
	int error;
	struct fdtable *fdt;

	spin_lock(&files->file_lock);
repeat:
    // 通过进程的打开文件列表获得文件描述符位图结构, 说白了就是打开进程的文件描述符表
	fdt = files_fdtable(files);
	fd = start;
	if (fd < files->next_fd)
		fd = files->next_fd;
	if (fd < fdt->max_fds)
        // 在文件描述符表中找到一个空闲的bit位, 找到空闲的bit位就相当于找到一个文件描述符
		fd = find_next_zero_bit(fdt->open_fds->fds_bits,fdt->max_fds, fd);
    // 通过fd值, 判断是否对文件描述符表进行扩展
	error = expand_files(files, fd);
    // 错误判断
	...
	if (start <= files->next_fd)
		files->next_fd = fd + 1;
    
	FD_SET(fd, fdt->open_fds);
	if (flags & O_CLOEXEC)
		FD_SET(fd, fdt->close_on_exec);
	else
		FD_CLR(fd, fdt->close_on_exec);
	...
}

函数从SYSCALL_DEFINE1(epoll_create)开始，调用ep_alloc函数对event_poll分配内存空间，然后初始化红黑树、就序列表等。
最重要的是alloc_fd函数的调用，为epoll创建一个存放数据的匿名文件，并未文件设置好inode索引号，文件名存放在目录项中。
最后返回文件打开的文件描述符，将文件信息放入进程的task_stuct结构中的文件描述符表中。
epoll_create就是进程在内核中创建了一个从epoll文件描述符到eventpoll结构的通道。

epoll_ctl()函数实现

struct ep_pqueue
{
    poll_table pt;
    struct epitem *epi;
};

通过copy_from_user将数据从用户空间拷贝到内核空间；
通过fget()获得epoll_create()创建的匿名文件的文件指针；
进行epoll_ctl()传入的op方法的判断

SYSCALL_DEFINE4(epoll_create, int, epfd, int, op, int, fd, struct epoll_event __user*, event)
{
    struct epoll_event epds;
    ...
    // 从用户态拷贝到内核态
    if(ep_op_has_event(op) && copy_from_user(&epds, event, sizeof(struct epoll_event)))
        goto error_return;
        // 获取调用 epoll_create()函数返回的文件描述符后, 获得创建匿名文件的文件指针.
    file = fget(epfd);
    if (!file)
        goto error_return;
    tfile = fget(fd);
    if (!tfile)
        goto error_fput;
    ...
    epi = ep_find(ep, tfile, fd);   // 查找的原因在于ep是否已经存在了
    /*
    ep_find : 二叉查找文件
        1. 通过将ep_set_ffd()将文件描述符和文件指针加入到一个结构体中
        2. 调用ep_cmp_ffd()进行节点与要查找的文件进行比较, 二叉搜索
    */
    error = -EINVAL;
    // 调用 epoll_ctl()函数的方法
    switch (op) 
    {
    case EPOLL_CTL_ADD: // 添加
        if (!epi) 
        {
            epds.events |= POLLERR | POLLHUP;
            // 调用ep_insert()函数, 设置好回调函数
            error = ep_insert(ep, &epds, tfile, fd);
        } 
        else
            error = -EEXIST;
        break;
    case EPOLL_CTL_DEL: // 删除
        if (epi)
            error = ep_remove(ep, epi);
        else
            error = -ENOENT;
        break;
    case EPOLL_CTL_MOD: //修改
        if (epi) 
        {
            epds.events |= POLLERR | POLLHUP;
            error = ep_modify(ep, epi, &epds);
        } 
        else
            error = -ENOENT;
        break;
    }
    ...
}

ep_insert回调函数的实现

调用kmem_cache_alloc申请缓存空间；
调用ep_set_ffd将文件描述符和文件指针加入到ffd结构体中；
调用init_poll_funcptr()设置回调函数为ep_ptable_queue_proc；
调用ep_rbtree_insert()将ep加入到ep1中；
将struct epitem epi插入到红黑树中

static int ep_insert(struct eventpoll *ep, struct epoll_event *event, struct file *file, int fd)
{
    ...
    struct epitem *epi;
    struct ep_pqueue epq;
    // 从slab里面分配缓存空间
    if(!(epi = kmem_cache_alloc(epi_cache, GFP_KERNEL)))
        return -ENOMEM;

    // 初始化
    INIT_LIST_HEAD(&epi->rdllink);
    INIT_LIST_HEAD(&epi->fllink);
    INIT_LIST_HEAD(&epi->pwqlist);
    ...

    // 保存 epi 以便回调时使用
    epq.epi = epi;
    // 设置好 poll 回调函数为ep_ptable_queue_proc
    init_poll_funcptr(&epq.pt, ep_ptable_queue_proc);
    // 调用事件 poll 函数来获取当前事件位, 利用它来调用注册函数 ep_ptable_queue_proc
    revents = tfile->f_op->poll(tfile, &epq.pt);
    ...

    // 将其加入到红黑树中
    ep_rbtree_insert(ep, epi);
    /*
    ep_rbtree_insert : 插入二叉树中
        1. 通过ep_cmp_ffd进行二叉搜索
        2. 调用rb_link_node rb_insert_color 将 ep结构加入到epi中
    */

    // 返回的事件位(revent)与最初设置的事件位(events)相与进行判断, 判断是否有时间到来, 同时还要保证返回给epoll_wait()的就绪队列不为空
    if ((revents & event->events) && !ep_is_linked(&epi->rdllink)) 
    {
        list_add_tail(&epi->rdllink, &ep->rdllist);
        // 检查等待队列是否为空
        if (waitqueue_active(&ep->wq))
            wake_up_locked(&ep->wq);
        // 等待队列不为空, 则增加唤醒次数
        if (waitqueue_active(&ep->poll_wait))
            pwake++;
    }
    ...
}

// 将qproc注册到 poll_table_struct 中
// 因为执行 f_op->poll() 时. XXX_poll() 函数会执行 poll_wait() 回调函数, 而 poll_wait()又会调用 poll_table_struct 中的 qproc
static inline void init_poll_funcptr(poll_table *pt, poll_queue_proc qproc)
{
    pt->qproc = qproc;
}

ep_ptable_queue_proc函数

将申请缓存空间；
设置poll被唤醒时要调用的回调函数；
将其加入到等待队列中。
首先将eppoll_entry的whead指向fd的设备等待队列, 再初始化eppoll_entry的base变量指向epitem，最后通过add_wait_queue将epoll_entry挂载到fd的设备等待队列上。完成这个动作后，epoll_entry已经被挂载到fd的设备等待队列。

// 当 poll 函数唤醒时就调用该函数
static void ep_ptable_queue_proc(struct file *file, wait_queue_head_t *whead, poll_table *pt)
{
    // 从注册的结构中struct ep_pqueue中获取项epi
    struct epitem *epi = ep_item_from_epqueue(pt);

    // eppoll_entry主要完成epitem和epitem事件发生时的callback（ep_poll_callback）函数之间的关联
    struct eppoll_entry *pwq;
    // 申请eppoll_entry 缓存, 加入到等待队列中, 和链表中
    if (epi->nwait >= 0 && (pwq = kmem_cache_alloc(pwq_cache,GFP_KERNEL))) 
    {
        // 初始化等待队列函数的入口. 也就是 poll 醒来时要调用的回调函数
        init_waitqueue_func_entry(&pwq->wait, ep_poll_callback);
        pwq->whead = whead;
        pwq->base = epi;
        // 加入到等待队列中
        add_wait_queue(whead, &pwq->wait);
        // 将等待队列 llink 的链表挂载到 eptiem等待链表中
        list_add_tail(&pwq->llink, &epi->pwqlist);
        epi->nwait++;
    } 
    ...
}

ep_poll_callback

加锁；
进行事件判断，没有事件则goto out_unlink；
event_poll是否加入就绪队列中，goto is_list；
调用list_add_tail加入链表；
将epi结构体加入到就绪队列中；
is_list段：调用waitqueue_active判断就绪队列是否为空，不为空计数值增加；
out_list段：解除所有锁；
退出。
ep_poll_callback函数主要的功能是将被监视文件的等待事件就绪时，将文件对应的epitem实例添加到就绪队列中，当用户调用epoll_wait()时，内核会将就绪队列中的事件报告给用户。

// poll 到来时, 调用的回调函数. 判断poll 事件是否到来, 是否加入到就绪队列中了
static int ep_poll_callback(wait_queue_t *wait, unsigned mode, int sync, void *key)
{
    struct epitem *epi = ep_item_from_wait(wait);
    ...
    // 事件epi在是否在准备列表中
    if (ep_is_linked(&epi->rdllink))
        goto is_linked;
    // 重要 : 将 fd 加入到 epoll 监听的就绪队列中
    list_add_tail(&epi->rdllink, &ep->rdllist);
    ...
}

以SYSCALL_DEFINE4(epoll_ctr, …)开始，函数首先分配空间，将结构从用户空间复制到内核空间中；
在进行方法(op)判断之前，先采用ep_find函数进行查找，以确保该数据已经设置好回调函数；
然后使用fget函数获取该epoll的匿名文件描述符，最后进行方法判断。

epoll_wait函数

SYSCALL_DEFINE4(epoll_wait, …)

判断最大值合法性；
获取匿名文件的文件指针，取得文件信息；
调用ep_poll阻塞自己，等待有消息的到来

// epoll_wait()函数的调用
SYSCALL_DEFINE4(epoll_wait, int, epfd, struct epoll_event __user *,events,
 int, maxevents, int, timeout)
{
    int error;
    struct file *file;
    struct eventpoll *ep;
    ...
    // 判断最大值没有超过能承受的范围   
    if (maxevents <= 0 || maxevents > EP_MAX_EVENTS)
        return -EINVAL;
    // 获取epoll_create()函数创建的匿名文件的文件描述符
    file = fget(epfd);
    if (!file)
        goto error_return;
    ...

    error = -EINVAL;
    // 判断类型是不是epoll类型
    if (!is_file_epoll(file))
        goto error_fput;
    // 获取文件中保存的信息
    ep = file->private_data;
    // 重点，有消息到来时会执行，否则阻塞自己
    // 等待就绪队列不为空, 也就是等待有要接受的消息到来
    error = ep_poll(ep, events, maxevents, timeout);
    ...
}

ep_poll

先设置好时间长度，如果传过来的已经设置好的事件，就设置为传入的时间，没有的话就默认一个初始事件；
初始化就绪的等待队列；
对有没有进程准备好、有没有消息传来或时间进行判断，如果满足就退出循环，执行进程；
如果没有满足就执行进程调度，让出CPu，等待一段时间后返回该进程进行判断；
调用ep_send_events将就绪链表中的数据返回给用户空间。

// 设置进程的为可中断, 一直等待有事件的到来. 没哟到来就被调度
static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events, int maxevents, long timeout)
{
    int res, eavail;
    unsigned long flags;
    long jtimeout;
    wait_queue_t wait;
    ...

    // 设置运行的时间节拍
    jtimeout = (timeout < 0 || timeout >= EP_MAX_MSTIMEO) ?
        MAX_SCHEDULE_TIMEOUT : (timeout * HZ + 999) / 1000;

    if (list_empty(&ep->rdllist)) 
    {
        // 初始化 wait 等待队列的入口
        init_waitqueue_entry(&wait, current);
        wait.flags |= WQ_FLAG_EXCLUSIVE;
        __add_wait_queue(&ep->wq, &wait);

        //  等带回调函数, 进程设置为是可中断的
        for (;;) 
        {
            // 设置进程此时的状态是可中断的
            set_current_state(TASK_INTERRUPTIBLE);
            // 链表不为空, 或者时间到了
            if (!list_empty(&ep->rdllist) || !jtimeout)
                break;
            ...
            // 在时间节拍到达之前, 让出 cpu 调用其他进程, 时间节拍到达时, 重新通过中断回调该进程
            jtimeout = schedule_timeout(jtimeout);
            /*
            // 可用于在阻塞态的进程让出cpu, 一定时间后再回调查看
            extern inline void schedule_timeout(int timeout)
            {
                current->timeout = jiffies + timeout;
                current->state = TASK_INTERRUPTIBLE;
                schedule();
                current->timeout = 0;
            }
            */
            spin_lock_irqsave(&ep->lock, flags);
        }
        __remove_wait_queue(&ep->wq, &wait);

        // 将进程状态设置为运行态
        set_current_state(TASK_RUNNING);
    }
    eavail = !list_empty(&ep->rdllist);
    ...
    // 没有被中断, 有就绪队列, 并向用户空间发送成功, 并调用ep_send_event()返回给用户空间就绪的epoll事件
    if (!res && eavail &&
        !(res = ep_send_events(ep, events, maxevents)) && jtimeout)
        goto retry;

    return res;
}

ep_send_events

将就绪队列的链表复制到一个新的链表中，并且清空就绪队列；
将新保存的链表调用__put_uer，将就绪队列从内核空间复制用户空间；
判断是否是下降沿有效；
如果在将队列返回给用户空间的时候又有新消息到来的时候，需要重新将就绪消息放入ovflist中，同时增加唤醒的次数。

// 向用户空间发送就绪事件队列
static int ep_send_events(struct eventpoll *ep, struct epoll_event __user *events,
              int maxevents)
{
    int eventcnt, error = -EFAULT, pwake = 0;
    unsigned int revents;
    unsigned long flags;
    struct epitem *epi, *nepi;
    struct list_head txlist;

    INIT_LIST_HEAD(&txlist);
    ...
    // 将 rdllist 就绪队列的链表放入到 txlist 链表中, 此时 rdllist 就为空链表了
    list_splice(&ep->rdllist, &txlist);
    INIT_LIST_HEAD(&ep->rdllist);
    // 清空ovflist, 原因是后面执行将链表复制到用户空间的时侯还有消息到来, 就保存在ovflist中
    ep->ovflist = NULL;
    spin_unlock_irqrestore(&ep->lock, flags);

    // 将就绪的队列放入传回到用户空间
    for (eventcnt = 0; !list_empty(&txlist) && eventcnt < maxevents;)
    {
        // 取出第一个消息数据到来的结构, 然后移除这个数据结构
        epi = list_first_entry(&txlist, struct epitem, rdllink);
        list_del_init(&epi->rdllink);

        // 确保有需要文件的存在, 并且也有操作的掩码存在
        revents = epi->ffd.file->f_op->poll(epi->ffd.file, NULL);
        revents &= epi->event.events;
        // 就绪队列发送至用户空间
        if (revents) {
            if (__put_user(revents, &events[eventcnt].events) ||__put_user(epi->event.data, &events[eventcnt].data))
                goto errxit;
            if (epi->event.events & EPOLLONESHOT) 
                epi->event.events &= EP_PRIVATE_BITS;
            // 唤醒消息的个数
            eventcnt++;
        }
        // 是否是下降沿有效
        if (!(epi->event.events & EPOLLET) && (revents & epi->event.events))
            list_add_tail(&epi->rdllink, &ep->rdllist);
    }
    error = 0;

errxit:
    spin_lock_irqsave(&ep->lock, flags);
    // 在发送给用户空间的时侯又有数据传来时, 将事件放入到 ovflist 队列中. 在将数据加入到就绪队列事件中
    for (nepi = ep->ovflist; (epi = nepi) != NULL; nepi = epi->next, epi->next = EP_UNACTIVE_PTR)
    {
        if (!ep_is_linked(&epi->rdllink) && (epi->event.events & ~EP_PRIVATE_BITS))
            list_add_tail(&epi->rdllink, &ep->rdllist);
    }
    ep->ovflist = EP_UNACTIVE_PTR;
    ... 
        if (waitqueue_active(&ep->wq))
            wake_up_locked(&ep->wq);
        // 如果 poll 队列不为空, 则唤醒的次数++
        if (waitqueue_active(&ep->poll_wait))
            pwake++;
    }
    ...
    // 返回累计的值
    return eventcnt == 0 ? error: eventcnt;
}

调用SYSCALL_DEFINE(epoll_wait, …)函数，获得匿名文件的文件指针，在通过调用ep_poll函数，进行时间片的设置，在时间片结束后就绪队列为空，就退出等待；
如果时间设置的是负数，ep_poll函数会调用schedule_timeout，执行进程调度，当设置的时间片结束后又继续回到ep_poll进程查看就绪队列是否为空，为空的话就继续进程调度，此时wait又变成阻塞态；
当就绪队列准备好后，就退出进程调度，执行ep_send_events函数，主要是为了将就绪队列的链表从内核空间发送给用户空间。