/* * This structure is stored inside the "private_data" member of the file * structure and represents the main data structure for the eventpoll * interface. */ struct eventpoll { /* Protect the access to this structure */ spinlock_t lock;
/* * This mutex is used to ensure that files are not removed * while epoll is using them. This is held during the event * collection loop, the file cleanup path, the epoll file exit * code and the ctl operations. */ struct mutex mtx;
/* Wait queue used by sys_epoll_wait() */ //这个队列里存放的是执行epoll_wait从而等待的进程队列 wait_queue_head_t wq;
/* Wait queue used by file->poll() */ //这个队列里存放的是该eventloop作为poll对象的一个实例,加入到等待的队列 //这是因为eventpoll本身也是一个file, 所以也会有poll操作 wait_queue_head_t poll_wait;
/* List of ready file descriptors */ //这里存放的是事件就绪的fd列表,链表的每个元素是下面的epitem struct list_head rdllist;
/* RB tree root used to store monitored fd structs */ //这是用来快速查找fd的红黑树 struct rb_root_cached rbr;
/* * This is a single linked list that chains all the "struct epitem" that * happened while transferring ready events to userspace w/out * holding ->lock. */ struct epitem *ovflist;
/* wakeup_source used when ep_scan_ready_list is running */ struct wakeup_source *ws;
/* The user that created the eventpoll descriptor */ struct user_struct *user;
/* * Each file descriptor added to the eventpoll interface will * have an entry of this type linked to the "rbr" RB tree. * Avoid increasing the size of this struct, there can be many thousands * of these on a server and we do not want this to take another cache line. */ struct epitem { union { /* RB tree node links this structure to the eventpoll RB tree */ struct rb_node rbn; /* Used to free the struct epitem */ struct rcu_head rcu; };
/* List header used to link this structure to the eventpoll ready list */ //将这个epitem连接到eventpoll 里面的rdllist的list指针 struct list_head rdllink;
/* * Works together "struct eventpoll"->ovflist in keeping the * single linked chain of items. */ struct epitem *next;
/* The file descriptor information this item refers to */ //epoll监听的fd struct epoll_filefd ffd;
/* Number of active wait queue attached to poll operations */ //一个文件可以被多个epoll实例所监听,这里就记录了当前文件被监听的次数 int nwait;
/* List containing poll wait queues */ struct list_head pwqlist;
/* The "container" of this item */ //当前epollitem所属的eventpoll struct eventpoll *ep;
/* List header used to link this item to the "struct file" items list */ struct list_head fllink;
/* wakeup_source used when EPOLLWAKEUP is set */ struct wakeup_source __rcu *ws;
/* The structure that describe the interested events and the source fd */ struct epoll_event event; };
/*
 * Per-wait-queue bookkeeping record used by the poll hooks: one of these
 * ties a "struct epitem" to a wait queue head of the file it monitors.
 */
struct eppoll_entry {
	/* Node chaining this entry into the owning "struct epitem" */
	struct list_head llink;

	/* Back-pointer to the "struct epitem" that contains this entry */
	struct epitem *base;

	/*
	 * The wait queue entry that gets linked onto the target file's
	 * wait queue head.
	 */
	wait_queue_entry_t wait;

	/* The wait queue head on which "wait" has been queued */
	wait_queue_head_t *whead;
};
/* * Try to lookup the file inside our RB tree, Since we grabbed "mtx" * above, we can be sure to be able to use the item looked up by * ep_find() till we release the mutex. */ epi = ep_find(ep, tf.file, fd);
/*
 * Identifies a monitored target by the pair (file pointer, descriptor
 * number).
 */
struct epoll_filefd {
	/* Pointer to the target "struct file" corresponding to the fd */
	struct file *file;
	/* Target file descriptor number */
	int fd;
} __packed;
/*
 * This is the callback that is used to add our wait queue to the
 * target file wakeup lists.
 *
 * @file:  the target file (not used by this function body)
 * @whead: the wait queue head of the target file to hook into
 * @pt:    the poll table; the owning "struct epitem" is recovered from it
 *         via ep_item_from_epqueue()
 *
 * On success a new "struct eppoll_entry" is queued on @whead with
 * ep_poll_callback() as its wakeup function, linked into epi->pwqlist,
 * and epi->nwait is incremented. If epi->nwait is already negative, or
 * the allocation fails, epi->nwait is set to -1 so the caller can detect
 * the error (the function itself returns nothing).
 */
static void ep_ptable_queue_proc(struct file *file, wait_queue_head_t *whead,
				 poll_table *pt)
{
	struct epitem *epi = ep_item_from_epqueue(pt);
	struct eppoll_entry *pwq;

	/* Skip the allocation entirely once an earlier call has failed. */
	if (epi->nwait >= 0 && (pwq = kmem_cache_alloc(pwq_cache, GFP_KERNEL))) {
		init_waitqueue_func_entry(&pwq->wait, ep_poll_callback);
		pwq->whead = whead;
		pwq->base = epi;
		/* EPOLLEXCLUSIVE items are queued as exclusive waiters. */
		if (epi->event.events & EPOLLEXCLUSIVE)
			add_wait_queue_exclusive(whead, &pwq->wait);
		else
			add_wait_queue(whead, &pwq->wait);
		list_add_tail(&pwq->llink, &epi->pwqlist);
		epi->nwait++;
	} else {
		/* We have to signal that an error occurred */
		epi->nwait = -1;
	}
}
/* * This is the callback that is passed to the wait queue wakeup * mechanism. It is called by the stored file descriptors when they * have events to report. */ static int ep_poll_callback(wait_queue_entry_t *wait, unsigned mode, int sync, void *key) { int pwake = 0; unsigned long flags; struct epitem *epi = ep_item_from_wait(wait); struct eventpoll *ep = epi->ep;
/* * Check the events coming with the callback. At this stage, not * every device reports the events in the "key" parameter of the * callback. We need to be able to handle both cases here, hence the * test for "key" != NULL before the event match test. */ if (key && !((unsigned long) key & epi->event.events)) goto out_unlock;
接下来,判断是否需要把该事件传递给用户空间。
if (unlikely(ep->ovflist != EP_UNACTIVE_PTR)) { if (epi->next == EP_UNACTIVE_PTR) { epi->next = ep->ovflist; ep->ovflist = epi; if (epi->ws) { /* * Activate ep->ws since epi->ws may get * deactivated at any time. */ __pm_stay_awake(ep->ws); } } goto out_unlock; }
/* If this file is already in the ready list we exit soon */ if (!ep_is_linked(&epi->rdllink)) { list_add_tail(&epi->rdllink, &ep->rdllist); ep_pm_stay_awake_rcu(epi); }
/* * Wake up ( if active ) both the eventpoll wait list and the ->poll() * wait list. */ if (waitqueue_active(&ep->wq)) { if ((epi->event.events & EPOLLEXCLUSIVE) && !((unsigned long)key & POLLFREE)) { switch ((unsigned long)key & EPOLLINOUT_BITS) { case POLLIN: if (epi->event.events & POLLIN) ewake = 1; break; case POLLOUT: if (epi->event.events & POLLOUT) ewake = 1; break; case 0: ewake = 1; break; } } wake_up_locked(&ep->wq); }
查找epoll实例
epoll_wait函数首先进行一系列的检查,例如传入的maxevents必须大于0且不能超过EP_MAX_EVENTS。
/* The maximum number of event must be greater than zero */ if (maxevents <= 0 || maxevents > EP_MAX_EVENTS) return -EINVAL;
/* Verify that the area passed by the user is writeable */ if (!access_ok(VERIFY_WRITE, events, maxevents * sizeof(struct epoll_event))) return -EFAULT;
/* Get the "struct file *" for the eventpoll file */ f = fdget(epfd); if (!f.file) return -EBADF;
/* * We have to check that the file structure underneath the fd * the user passed to us _is_ an eventpoll file. */ error = -EINVAL; if (!is_file_epoll(f.file)) goto error_fput;
还是通过读取epoll实例对应匿名文件的private_data得到eventpoll实例。
/* * At this point it is safe to assume that the "private_data" contains * our own data structure. */ ep = f.file->private_data;
接下来调用ep_poll来完成对应的事件收集并传递到用户空间。
/* Time to fish for events ... */ error = ep_poll(ep, events, maxevents, timeout);
if (!ep_events_available(ep)) { /* * Busy poll timed out. Drop NAPI ID for now, we can add * it back in when we have moved a socket with a valid NAPI * ID onto the ready list. */ ep_reset_busy_poll_napi_id(ep);
/* * We don't have any available event to return to the caller. * We need to sleep here, and we will be wake up by * ep_poll_callback() when events will become available. */ init_waitqueue_entry(&wait, current); __add_wait_queue_exclusive(&ep->wq, &wait);
//这个循环里,当前进程可能会被唤醒,唤醒的途径包括 //1.当前进程超时 //2.当前进行收到一个signal信号 //3.某个描述字上有事件发生 //对应的1.2.3都会通过break跳出循环 //第4个可能是当前进程被CPU重新调度,进入for循环的判断,如果没有满足1.2.3的条件,就又重新进入休眠 for (;;) { /* * We don't want to sleep if the ep_poll_callback() sends us * a wakeup in between. That's why we set the task state * to TASK_INTERRUPTIBLE before doing the checks. */ set_current_state(TASK_INTERRUPTIBLE); /* * Always short-circuit for fatal signals to allow * threads to make a timely exit without the chance of * finding more events available and fetching * repeatedly. */ if (fatal_signal_pending(current)) { res = -EINTR; break; } if (ep_events_available(ep) || timed_out) break; if (signal_pending(current)) { res = -EINTR; break; }
spin_unlock_irqrestore(&ep->lock, flags);
//通过调用schedule_hrtimeout_range,当前进程进入休眠,CPU时间被调度器调度给其他进程使用 if (!schedule_hrtimeout_range(to, slack, HRTIMER_MODE_ABS)) timed_out = 1;
//ep_send_events将事件拷贝到用户空间 /* * Try to transfer events to user space. In case we get 0 events and * there's still timeout left over, we go trying again in search of * more luck. */ if (!res && eavail && !(res = ep_send_events(ep, events, maxevents)) && !timed_out) goto fetch_events;
/* * If the event mask intersect the caller-requested one, * deliver the event to userspace. Again, ep_scan_ready_list() * is holding "mtx", so no operations coming from userspace * can change the item. */ if (revents) { if (__put_user(revents, &uevent->events) || __put_user(epi->event.data, &uevent->data)) { list_add(&epi->rdllink, head); ep_pm_stay_awake(epi); return eventcnt ? eventcnt : -EFAULT; } eventcnt++; uevent++;
//这里是Level-triggered的处理,可以看到,在Level-triggered的情况下,这个事件被重新加回到ready list里面 //这样,下一轮epoll_wait的时候,这个事件会被重新check else if (!(epi->event.events & EPOLLET)) { /* * If this file has been added with Level * Trigger mode, we need to insert back inside * the ready list, so that the next call to * epoll_wait() will check again the events * availability. At this point, no one can insert * into ep->rdllist besides us. The epoll_ctl() * callers are locked out by * ep_scan_ready_list() holding "mtx" and the * poll callback will queue them in ep->ovflist. */ list_add_tail(&epi->rdllink, &ep->rdllist); ep_pm_stay_awake(epi); }
int core_sys_select(int n, fd_set __user *inp, fd_set __user *outp, fd_set __user *exp, struct timespec64 *end_time) { fd_set_bits fds; void *bits; int ret, max_fds; size_t size, alloc_size; struct fdtable *fdt; /* Allocate small arguments on the stack to save memory and be faster */ long stack_fds[SELECT_STACK_ALLOC/sizeof(long)];
ret = -EINVAL; if (n < 0) goto out_nofds;
/* max_fds can increase, so grab it once to avoid race */ rcu_read_lock(); fdt = files_fdtable(current->files); max_fds = fdt->max_fds; rcu_read_unlock(); if (n > max_fds) n = max_fds;
/* * We need 6 bitmaps (in/out/ex for both incoming and outgoing), * since we used fdset we need to allocate memory in units of * long-words. */ size = FDS_BYTES(n); bits = stack_fds; if (size > sizeof(stack_fds) / 6) { /* Not enough space in on-stack array; must use kmalloc */ ret = -ENOMEM; if (size > (SIZE_MAX / 6)) goto out_nofds;
for (i = 0; i < n; ++rinp, ++routp, ++rexp) { unsigned long in, out, ex, all_bits, bit = 1, mask, j; unsigned long res_in = 0, res_out = 0, res_ex = 0;
in = *inp++; out = *outp++; ex = *exp++; all_bits = in | out | ex; if (all_bits == 0) { i += BITS_PER_LONG; continue; } if (!poll_schedule_timeout(&table, TASK_INTERRUPTIBLE, to, slack)) timed_out = 1; ...