LCOV - code coverage report
Current view: top level - fs - eventpoll.c (source / functions) Hit Total Coverage
Test: coverage.info Lines: 12 635 1.9 %
Date: 2023-04-06 08:38:28 Functions: 2 45 4.4 %

          Line data    Source code
       1             : // SPDX-License-Identifier: GPL-2.0-or-later
       2             : /*
       3             :  *  fs/eventpoll.c (Efficient event retrieval implementation)
       4             :  *  Copyright (C) 2001,...,2009  Davide Libenzi
       5             :  *
       6             :  *  Davide Libenzi <davidel@xmailserver.org>
       7             :  */
       8             : 
       9             : #include <linux/init.h>
      10             : #include <linux/kernel.h>
      11             : #include <linux/sched/signal.h>
      12             : #include <linux/fs.h>
      13             : #include <linux/file.h>
      14             : #include <linux/signal.h>
      15             : #include <linux/errno.h>
      16             : #include <linux/mm.h>
      17             : #include <linux/slab.h>
      18             : #include <linux/poll.h>
      19             : #include <linux/string.h>
      20             : #include <linux/list.h>
      21             : #include <linux/hash.h>
      22             : #include <linux/spinlock.h>
      23             : #include <linux/syscalls.h>
      24             : #include <linux/rbtree.h>
      25             : #include <linux/wait.h>
      26             : #include <linux/eventpoll.h>
      27             : #include <linux/mount.h>
      28             : #include <linux/bitops.h>
      29             : #include <linux/mutex.h>
      30             : #include <linux/anon_inodes.h>
      31             : #include <linux/device.h>
      32             : #include <linux/uaccess.h>
      33             : #include <asm/io.h>
      34             : #include <asm/mman.h>
      35             : #include <linux/atomic.h>
      36             : #include <linux/proc_fs.h>
      37             : #include <linux/seq_file.h>
      38             : #include <linux/compat.h>
      39             : #include <linux/rculist.h>
      40             : #include <net/busy_poll.h>
      41             : 
      42             : /*
      43             :  * LOCKING:
      44             :  * There are three levels of locking required by epoll :
      45             :  *
      46             :  * 1) epmutex (mutex)
      47             :  * 2) ep->mtx (mutex)
      48             :  * 3) ep->lock (rwlock)
      49             :  *
      50             :  * The acquire order is the one listed above, from 1 to 3.
      51             :  * We need a rwlock (ep->lock) because we manipulate objects
      52             :  * from inside the poll callback, that might be triggered from
      53             :  * a wake_up() that in turn might be called from IRQ context.
      54             :  * So we can't sleep inside the poll callback and hence we need
      55             :  * a spinlock. During the event transfer loop (from kernel to
      56             :  * user space) we could end up sleeping due to a copy_to_user(), so
      57             :  * we need a lock that will allow us to sleep. This lock is a
      58             :  * mutex (ep->mtx). It is acquired during the event transfer loop,
      59             :  * during epoll_ctl(EPOLL_CTL_DEL) and during eventpoll_release_file().
      60             :  * Then we also need a global mutex to serialize eventpoll_release_file()
      61             :  * and ep_free().
      62             :  * This mutex is acquired by ep_free() during the epoll file
      63             :  * cleanup path and it is also acquired by eventpoll_release_file()
      64             :  * if a file has been pushed inside an epoll set and it is then
      65             :  * close()d without a previous call to epoll_ctl(EPOLL_CTL_DEL).
      66             :  * It is also acquired when inserting an epoll fd onto another epoll
      67             :  * fd. We do this so that we walk the epoll tree and ensure that this
      68             :  * insertion does not create a cycle of epoll file descriptors, which
      69             :  * could lead to deadlock. We need a global mutex to prevent two
      70             :  * simultaneous inserts (A into B and B into A) from racing and
      71             :  * constructing a cycle without either insert observing that it is
      72             :  * going to.
      73             :  * It is necessary to acquire multiple "ep->mtx"es at once in the
      74             :  * case when one epoll fd is added to another. In this case, we
      75             :  * always acquire the locks in the order of nesting (i.e. after
      76             :  * epoll_ctl(e1, EPOLL_CTL_ADD, e2), e1->mtx will always be acquired
      77             :  * before e2->mtx). Since we disallow cycles of epoll file
      78             :  * descriptors, this ensures that the mutexes are well-ordered. In
      79             :  * order to communicate this nesting to lockdep, when walking a tree
      80             :  * of epoll file descriptors, we use the current recursion depth as
      81             :  * the lockdep subkey.
      82             :  * It is possible to drop the "ep->mtx" and to use the global
      83             :  * mutex "epmutex" (together with "ep->lock") to have it working,
      84             :  * but having "ep->mtx" will make the interface more scalable.
      85             :  * Events that require holding "epmutex" are very rare, while for
      86             :  * normal operations the epoll private "ep->mtx" will guarantee
      87             :  * a better scalability.
      88             :  */
      89             : 
      90             : /* Epoll private bits inside the event mask */
      91             : #define EP_PRIVATE_BITS (EPOLLWAKEUP | EPOLLONESHOT | EPOLLET | EPOLLEXCLUSIVE)
      92             : 
      93             : #define EPOLLINOUT_BITS (EPOLLIN | EPOLLOUT)
      94             : 
      95             : #define EPOLLEXCLUSIVE_OK_BITS (EPOLLINOUT_BITS | EPOLLERR | EPOLLHUP | \
      96             :                                 EPOLLWAKEUP | EPOLLET | EPOLLEXCLUSIVE)
      97             : 
      98             : /* Maximum number of nesting allowed inside epoll sets */
      99             : #define EP_MAX_NESTS 4
     100             : 
     101             : #define EP_MAX_EVENTS (INT_MAX / sizeof(struct epoll_event))
     102             : 
     103             : #define EP_UNACTIVE_PTR ((void *) -1L)
     104             : 
     105             : #define EP_ITEM_COST (sizeof(struct epitem) + sizeof(struct eppoll_entry))
     106             : 
     107             : struct epoll_filefd {
     108             :         struct file *file;
     109             :         int fd;
     110             : } __packed;
     111             : 
     112             : /* Wait structure used by the poll hooks */
     113             : struct eppoll_entry {
     114             :         /* List header used to link this structure to the "struct epitem" */
     115             :         struct eppoll_entry *next;
     116             : 
     117             :         /* The "base" pointer is set to the container "struct epitem" */
     118             :         struct epitem *base;
     119             : 
     120             :         /*
     121             :          * Wait queue item that will be linked to the target file wait
     122             :          * queue head.
     123             :          */
     124             :         wait_queue_entry_t wait;
     125             : 
     126             :         /* The wait queue head that linked the "wait" wait queue item */
     127             :         wait_queue_head_t *whead;
     128             : };
     129             : 
     130             : /*
     131             :  * Each file descriptor added to the eventpoll interface will
     132             :  * have an entry of this type linked to the "rbr" RB tree.
     133             :  * Avoid increasing the size of this struct, there can be many thousands
     134             :  * of these on a server and we do not want this to take another cache line.
     135             :  */
     136             : struct epitem {
     137             :         union {
     138             :                 /* RB tree node links this structure to the eventpoll RB tree */
     139             :                 struct rb_node rbn;
     140             :                 /* Used to free the struct epitem */
     141             :                 struct rcu_head rcu;
     142             :         };
     143             : 
     144             :         /* List header used to link this structure to the eventpoll ready list */
     145             :         struct list_head rdllink;
     146             : 
     147             :         /*
     148             :          * Works together with "struct eventpoll"->ovflist in keeping the
     149             :          * single linked chain of items.
     150             :          */
     151             :         struct epitem *next;
     152             : 
     153             :         /* The file descriptor information this item refers to */
     154             :         struct epoll_filefd ffd;
     155             : 
     156             :         /* List containing poll wait queues */
     157             :         struct eppoll_entry *pwqlist;
     158             : 
     159             :         /* The "container" of this item */
     160             :         struct eventpoll *ep;
     161             : 
     162             :         /* List header used to link this item to the "struct file" items list */
     163             :         struct hlist_node fllink;
     164             : 
     165             :         /* wakeup_source used when EPOLLWAKEUP is set */
     166             :         struct wakeup_source __rcu *ws;
     167             : 
     168             :         /* The structure that describes the interested events and the source fd */
     169             :         struct epoll_event event;
     170             : };
     171             : 
     172             : /*
     173             :  * This structure is stored inside the "private_data" member of the file
     174             :  * structure and represents the main data structure for the eventpoll
     175             :  * interface.
     176             :  */
     177             : struct eventpoll {
     178             :         /*
     179             :          * This mutex is used to ensure that files are not removed
     180             :          * while epoll is using them. This is held during the event
     181             :          * collection loop, the file cleanup path, the epoll file exit
     182             :          * code and the ctl operations.
     183             :          */
     184             :         struct mutex mtx;
     185             : 
     186             :         /* Wait queue used by sys_epoll_wait() */
     187             :         wait_queue_head_t wq;
     188             : 
     189             :         /* Wait queue used by file->poll() */
     190             :         wait_queue_head_t poll_wait;
     191             : 
     192             :         /* List of ready file descriptors */
     193             :         struct list_head rdllist;
     194             : 
     195             :         /* Lock which protects rdllist and ovflist */
     196             :         rwlock_t lock;
     197             : 
     198             :         /* RB tree root used to store monitored fd structs */
     199             :         struct rb_root_cached rbr;
     200             : 
     201             :         /*
     202             :          * This is a single linked list that chains all the "struct epitem" that
     203             :          * happened while transferring ready events to userspace w/out
     204             :          * holding ->lock.
     205             :          */
     206             :         struct epitem *ovflist;
     207             : 
     208             :         /* wakeup_source used when ep_scan_ready_list is running */
     209             :         struct wakeup_source *ws;
     210             : 
     211             :         /* The user that created the eventpoll descriptor */
     212             :         struct user_struct *user;
     213             : 
     214             :         struct file *file;
     215             : 
     216             :         /* used to optimize loop detection check */
     217             :         u64 gen;
     218             :         struct hlist_head refs;
     219             : 
     220             : #ifdef CONFIG_NET_RX_BUSY_POLL
     221             :         /* used to track busy poll napi_id */
     222             :         unsigned int napi_id;
     223             : #endif
     224             : 
     225             : #ifdef CONFIG_DEBUG_LOCK_ALLOC
     226             :         /* tracks wakeup nests for lockdep validation */
     227             :         u8 nests;
     228             : #endif
     229             : };
     230             : 
     231             : /* Wrapper struct used by poll queueing */
     232             : struct ep_pqueue {
     233             :         poll_table pt;
     234             :         struct epitem *epi;
     235             : };
     236             : 
     237             : /*
     238             :  * Configuration options available inside /proc/sys/fs/epoll/
     239             :  */
     240             : /* Maximum number of epoll watched descriptors, per user */
     241             : static long max_user_watches __read_mostly;
     242             : 
     243             : /*
     244             :  * This mutex is used to serialize ep_free() and eventpoll_release_file().
     245             :  */
     246             : static DEFINE_MUTEX(epmutex);
     247             : 
     248             : static u64 loop_check_gen = 0;
     249             : 
     250             : /* Used to check for epoll file descriptor inclusion loops */
     251             : static struct eventpoll *inserting_into;
     252             : 
     253             : /* Slab cache used to allocate "struct epitem" */
     254             : static struct kmem_cache *epi_cache __read_mostly;
     255             : 
     256             : /* Slab cache used to allocate "struct eppoll_entry" */
     257             : static struct kmem_cache *pwq_cache __read_mostly;
     258             : 
     259             : /*
     260             :  * List of files with newly added links, where we may need to limit the number
     261             :  * of emanating paths. Protected by the epmutex.
     262             :  */
     263             : struct epitems_head {
     264             :         struct hlist_head epitems;
     265             :         struct epitems_head *next;
     266             : };
     267             : static struct epitems_head *tfile_check_list = EP_UNACTIVE_PTR;
     268             : 
     269             : static struct kmem_cache *ephead_cache __read_mostly;
     270             : 
     271             : static inline void free_ephead(struct epitems_head *head)
     272             : {
     273           0 :         if (head)
     274           0 :                 kmem_cache_free(ephead_cache, head);
     275             : }
     276             : 
     277             : static void list_file(struct file *file)
     278             : {
     279             :         struct epitems_head *head;
     280             : 
     281           0 :         head = container_of(file->f_ep, struct epitems_head, epitems);
     282           0 :         if (!head->next) {
     283           0 :                 head->next = tfile_check_list;
     284           0 :                 tfile_check_list = head;
     285             :         }
     286             : }
     287             : 
     288           0 : static void unlist_file(struct epitems_head *head)
     289             : {
     290           0 :         struct epitems_head *to_free = head;
     291           0 :         struct hlist_node *p = rcu_dereference(hlist_first_rcu(&head->epitems));
     292           0 :         if (p) {
     293           0 :                 struct epitem *epi= container_of(p, struct epitem, fllink);
     294           0 :                 spin_lock(&epi->ffd.file->f_lock);
     295           0 :                 if (!hlist_empty(&head->epitems))
     296           0 :                         to_free = NULL;
     297           0 :                 head->next = NULL;
     298           0 :                 spin_unlock(&epi->ffd.file->f_lock);
     299             :         }
     300           0 :         free_ephead(to_free);
     301           0 : }
     302             : 
     303             : #ifdef CONFIG_SYSCTL
     304             : 
     305             : #include <linux/sysctl.h>
     306             : 
     307             : static long long_zero;
     308             : static long long_max = LONG_MAX;
     309             : 
     310             : static struct ctl_table epoll_table[] = {
     311             :         {
     312             :                 .procname       = "max_user_watches",
     313             :                 .data           = &max_user_watches,
     314             :                 .maxlen         = sizeof(max_user_watches),
     315             :                 .mode           = 0644,
     316             :                 .proc_handler   = proc_doulongvec_minmax,
     317             :                 .extra1         = &long_zero,
     318             :                 .extra2         = &long_max,
     319             :         },
     320             :         { }
     321             : };
     322             : 
     323           1 : static void __init epoll_sysctls_init(void)
     324             : {
     325           1 :         register_sysctl("fs/epoll", epoll_table);
     326           1 : }
     327             : #else
     328             : #define epoll_sysctls_init() do { } while (0)
     329             : #endif /* CONFIG_SYSCTL */
     330             : 
     331             : static const struct file_operations eventpoll_fops;
     332             : 
     333             : static inline int is_file_epoll(struct file *f)
     334             : {
     335           0 :         return f->f_op == &eventpoll_fops;
     336             : }
     337             : 
     338             : /* Setup the structure that is used as key for the RB tree */
     339             : static inline void ep_set_ffd(struct epoll_filefd *ffd,
     340             :                               struct file *file, int fd)
     341             : {
     342           0 :         ffd->file = file;
     343           0 :         ffd->fd = fd;
     344             : }
     345             : 
     346             : /* Compare RB tree keys */
     347             : static inline int ep_cmp_ffd(struct epoll_filefd *p1,
     348             :                              struct epoll_filefd *p2)
     349             : {
     350           0 :         return (p1->file > p2->file ? +1:
     351           0 :                 (p1->file < p2->file ? -1 : p1->fd - p2->fd));
     352             : }
     353             : 
     354             : /* Tells us if the item is currently linked */
     355             : static inline int ep_is_linked(struct epitem *epi)
     356             : {
     357           0 :         return !list_empty(&epi->rdllink);
     358             : }
     359             : 
     360             : static inline struct eppoll_entry *ep_pwq_from_wait(wait_queue_entry_t *p)
     361             : {
     362           0 :         return container_of(p, struct eppoll_entry, wait);
     363             : }
     364             : 
     365             : /* Get the "struct epitem" from a wait queue pointer */
     366             : static inline struct epitem *ep_item_from_wait(wait_queue_entry_t *p)
     367             : {
     368           0 :         return container_of(p, struct eppoll_entry, wait)->base;
     369             : }
     370             : 
     371             : /**
     372             :  * ep_events_available - Checks if ready events might be available.
     373             :  *
     374             :  * @ep: Pointer to the eventpoll context.
     375             :  *
     376             :  * Return: a value different than %zero if ready events are available,
     377             :  *          or %zero otherwise.
     378             :  */
     379             : static inline int ep_events_available(struct eventpoll *ep)
     380             : {
     381           0 :         return !list_empty_careful(&ep->rdllist) ||
     382           0 :                 READ_ONCE(ep->ovflist) != EP_UNACTIVE_PTR;
     383             : }
     384             : 
     385             : #ifdef CONFIG_NET_RX_BUSY_POLL
     386             : static bool ep_busy_loop_end(void *p, unsigned long start_time)
     387             : {
     388             :         struct eventpoll *ep = p;
     389             : 
     390             :         return ep_events_available(ep) || busy_loop_timeout(start_time);
     391             : }
     392             : 
     393             : /*
     394             :  * Busy poll if globally on and supporting sockets found && no events,
     395             :  * busy loop will return if need_resched or ep_events_available.
     396             :  *
     397             :  * we must do our busy polling with irqs enabled
     398             :  */
     399             : static bool ep_busy_loop(struct eventpoll *ep, int nonblock)
     400             : {
     401             :         unsigned int napi_id = READ_ONCE(ep->napi_id);
     402             : 
     403             :         if ((napi_id >= MIN_NAPI_ID) && net_busy_loop_on()) {
     404             :                 napi_busy_loop(napi_id, nonblock ? NULL : ep_busy_loop_end, ep, false,
     405             :                                BUSY_POLL_BUDGET);
     406             :                 if (ep_events_available(ep))
     407             :                         return true;
     408             :                 /*
     409             :                  * Busy poll timed out.  Drop NAPI ID for now, we can add
     410             :                  * it back in when we have moved a socket with a valid NAPI
     411             :                  * ID onto the ready list.
     412             :                  */
     413             :                 ep->napi_id = 0;
     414             :                 return false;
     415             :         }
     416             :         return false;
     417             : }
     418             : 
     419             : /*
     420             :  * Set epoll busy poll NAPI ID from sk.
     421             :  */
     422             : static inline void ep_set_busy_poll_napi_id(struct epitem *epi)
     423             : {
     424             :         struct eventpoll *ep;
     425             :         unsigned int napi_id;
     426             :         struct socket *sock;
     427             :         struct sock *sk;
     428             : 
     429             :         if (!net_busy_loop_on())
     430             :                 return;
     431             : 
     432             :         sock = sock_from_file(epi->ffd.file);
     433             :         if (!sock)
     434             :                 return;
     435             : 
     436             :         sk = sock->sk;
     437             :         if (!sk)
     438             :                 return;
     439             : 
     440             :         napi_id = READ_ONCE(sk->sk_napi_id);
     441             :         ep = epi->ep;
     442             : 
     443             :         /* Non-NAPI IDs can be rejected
     444             :          *      or
     445             :          * Nothing to do if we already have this ID
     446             :          */
     447             :         if (napi_id < MIN_NAPI_ID || napi_id == ep->napi_id)
     448             :                 return;
     449             : 
     450             :         /* record NAPI ID for use in next busy poll */
     451             :         ep->napi_id = napi_id;
     452             : }
     453             : 
     454             : #else
     455             : 
     456             : static inline bool ep_busy_loop(struct eventpoll *ep, int nonblock)
     457             : {
     458             :         return false;
     459             : }
     460             : 
     461             : static inline void ep_set_busy_poll_napi_id(struct epitem *epi)
     462             : {
     463             : }
     464             : 
     465             : #endif /* CONFIG_NET_RX_BUSY_POLL */
     466             : 
     467             : /*
     468             :  * As described in commit 0ccf831cb ("lockdep: annotate epoll"),
     469             :  * the use of wait queues used by epoll is done in a very controlled
     470             :  * manner. Wake ups can nest inside each other, but are never done
     471             :  * with the same locking. For example:
     472             :  *
     473             :  *   dfd = socket(...);
     474             :  *   efd1 = epoll_create();
     475             :  *   efd2 = epoll_create();
     476             :  *   epoll_ctl(efd1, EPOLL_CTL_ADD, dfd, ...);
     477             :  *   epoll_ctl(efd2, EPOLL_CTL_ADD, efd1, ...);
     478             :  *
     479             :  * When a packet arrives to the device underneath "dfd", the net code will
     480             :  * issue a wake_up() on its poll wake list. Epoll (efd1) has installed a
     481             :  * callback wakeup entry on that queue, and the wake_up() performed by the
     482             :  * "dfd" net code will end up in ep_poll_callback(). At this point epoll
     483             :  * (efd1) notices that it may have some event ready, so it needs to wake up
     484             :  * the waiters on its poll wait list (efd2). So it calls ep_poll_safewake()
     485             :  * that ends up in another wake_up(), after having checked about the
     486             :  * recursion constraints. That is, no more than EP_MAX_POLLWAKE_NESTS, to
     487             :  * avoid stack blasting.
     488             :  *
     489             :  * When CONFIG_DEBUG_LOCK_ALLOC is enabled, make sure lockdep can handle
     490             :  * this special case of epoll.
     491             :  */
     492             : #ifdef CONFIG_DEBUG_LOCK_ALLOC
     493             : 
     494             : static void ep_poll_safewake(struct eventpoll *ep, struct epitem *epi,
     495             :                              unsigned pollflags)
     496             : {
     497             :         struct eventpoll *ep_src;
     498             :         unsigned long flags;
     499             :         u8 nests = 0;
     500             : 
     501             :         /*
     502             :          * To set the subclass or nesting level for spin_lock_irqsave_nested()
     503             :          * it might be natural to create a per-cpu nest count. However, since
     504             :          * we can recurse on ep->poll_wait.lock, and a non-raw spinlock can
     505             :          * schedule() in the -rt kernel, the per-cpu variables are no longer
     506             :          * protected. Thus, we are introducing a per eventpoll nest field.
     507             :          * If we are not being called from ep_poll_callback(), epi is NULL and
     508             :          * we are at the first level of nesting, 0. Otherwise, we are being
     509             :          * called from ep_poll_callback() and if a previous wakeup source is
     510             :          * not an epoll file itself, we are at depth 1 since the wakeup source
     511             :          * is depth 0. If the wakeup source is a previous epoll file in the
     512             :          * wakeup chain then we use its nests value and record ours as
     513             :          * nests + 1. The previous epoll file nests value is stable since it's
     514             :          * already holding its own poll_wait.lock.
     515             :          */
     516             :         if (epi) {
     517             :                 if ((is_file_epoll(epi->ffd.file))) {
     518             :                         ep_src = epi->ffd.file->private_data;
     519             :                         nests = ep_src->nests;
     520             :                 } else {
     521             :                         nests = 1;
     522             :                 }
     523             :         }
     524             :         spin_lock_irqsave_nested(&ep->poll_wait.lock, flags, nests);
     525             :         ep->nests = nests + 1;
     526             :         wake_up_locked_poll(&ep->poll_wait, EPOLLIN | pollflags);
     527             :         ep->nests = 0;
     528             :         spin_unlock_irqrestore(&ep->poll_wait.lock, flags);
     529             : }
     530             : 
     531             : #else
     532             : 
     533             : static void ep_poll_safewake(struct eventpoll *ep, struct epitem *epi,
     534             :                              unsigned pollflags)
     535             : {
     536           0 :         wake_up_poll(&ep->poll_wait, EPOLLIN | pollflags);
     537             : }
     538             : 
     539             : #endif
     540             : 
     541             : static void ep_remove_wait_queue(struct eppoll_entry *pwq)
     542             : {
     543             :         wait_queue_head_t *whead;
     544             : 
     545             :         rcu_read_lock();
     546             :         /*
     547             :          * If it is cleared by POLLFREE, it should be rcu-safe.
     548             :          * If we read NULL we need a barrier paired with
     549             :          * smp_store_release() in ep_poll_callback(), otherwise
     550             :          * we rely on whead->lock.
     551             :          */
     552           0 :         whead = smp_load_acquire(&pwq->whead);
     553           0 :         if (whead)
     554           0 :                 remove_wait_queue(whead, &pwq->wait);
     555             :         rcu_read_unlock();
     556             : }
     557             : 
     558             : /*
     559             :  * This function unregisters poll callbacks from the associated file
     560             :  * descriptor.  Must be called with "mtx" held (or "epmutex" if called from
     561             :  * ep_free).
     562             :  */
     563           0 : static void ep_unregister_pollwait(struct eventpoll *ep, struct epitem *epi)
     564             : {
     565           0 :         struct eppoll_entry **p = &epi->pwqlist;
     566             :         struct eppoll_entry *pwq;
     567             : 
     568           0 :         while ((pwq = *p) != NULL) {
     569           0 :                 *p = pwq->next;
     570           0 :                 ep_remove_wait_queue(pwq);
     571           0 :                 kmem_cache_free(pwq_cache, pwq);
     572             :         }
     573           0 : }
     574             : 
     575             : /* call only when ep->mtx is held */
     576             : static inline struct wakeup_source *ep_wakeup_source(struct epitem *epi)
     577             : {
     578           0 :         return rcu_dereference_check(epi->ws, lockdep_is_held(&epi->ep->mtx));
     579             : }
     580             : 
     581             : /* call only when ep->mtx is held */
     582             : static inline void ep_pm_stay_awake(struct epitem *epi)
     583             : {
     584           0 :         struct wakeup_source *ws = ep_wakeup_source(epi);
     585             : 
     586           0 :         if (ws)
     587           0 :                 __pm_stay_awake(ws);
     588             : }
     589             : 
     590             : static inline bool ep_has_wakeup_source(struct epitem *epi)
     591             : {
     592           0 :         return rcu_access_pointer(epi->ws) ? true : false;
     593             : }
     594             : 
     595             : /* call when ep->mtx cannot be held (ep_poll_callback) */
     596             : static inline void ep_pm_stay_awake_rcu(struct epitem *epi)
     597             : {
     598             :         struct wakeup_source *ws;
     599             : 
     600             :         rcu_read_lock();
     601           0 :         ws = rcu_dereference(epi->ws);
     602           0 :         if (ws)
     603           0 :                 __pm_stay_awake(ws);
     604             :         rcu_read_unlock();
     605             : }
     606             : 
     607             : 
     608             : /*
     609             :  * ep->mtx needs to be held because we could be hit by
     610             :  * eventpoll_release_file() and epoll_ctl().
     611             :  */
     612           0 : static void ep_start_scan(struct eventpoll *ep, struct list_head *txlist)
     613             : {
     614             :         /*
     615             :          * Steal the ready list, and re-init the original one to the
     616             :          * empty list. Also, set ep->ovflist to NULL so that events
     617             :          * happening while looping w/out locks, are not lost. We cannot
     618             :          * have the poll callback to queue directly on ep->rdllist,
     619             :          * because we want the "sproc" callback to be able to do it
     620             :          * in a lockless way.
     621             :          */
     622             :         lockdep_assert_irqs_enabled();
     623           0 :         write_lock_irq(&ep->lock);
     624           0 :         list_splice_init(&ep->rdllist, txlist);
     625           0 :         WRITE_ONCE(ep->ovflist, NULL);
     626           0 :         write_unlock_irq(&ep->lock);
     627           0 : }
     628             : 
     629           0 : static void ep_done_scan(struct eventpoll *ep,
     630             :                          struct list_head *txlist)
     631             : {
     632             :         struct epitem *epi, *nepi;
     633             : 
     634           0 :         write_lock_irq(&ep->lock);
     635             :         /*
     636             :          * During the time we spent inside the "sproc" callback, some
     637             :          * other events might have been queued by the poll callback.
     638             :          * We re-insert them inside the main ready-list here.
     639             :          */
     640           0 :         for (nepi = READ_ONCE(ep->ovflist); (epi = nepi) != NULL;
     641           0 :              nepi = epi->next, epi->next = EP_UNACTIVE_PTR) {
     642             :                 /*
     643             :                  * We need to check if the item is already in the list.
     644             :                  * During the "sproc" callback execution time, items are
     645             :                  * queued into ->ovflist but the "txlist" might already
     646             :                  * contain them, and the list_splice() below takes care of them.
     647             :                  */
     648           0 :                 if (!ep_is_linked(epi)) {
     649             :                         /*
     650             :                          * ->ovflist is LIFO, so we have to reverse it in order
     651             :                          * to keep the events in FIFO order.
     652             :                          */
     653           0 :                         list_add(&epi->rdllink, &ep->rdllist);
     654             :                         ep_pm_stay_awake(epi);
     655             :                 }
     656             :         }
     657             :         /*
     658             :          * We need to set back ep->ovflist to EP_UNACTIVE_PTR, so that after
     659             :          * releasing the lock, events will be queued in the normal way inside
     660             :          * ep->rdllist.
     661             :          */
     662           0 :         WRITE_ONCE(ep->ovflist, EP_UNACTIVE_PTR);
     663             : 
     664             :         /*
     665             :          * Quickly re-inject items left on "txlist".
     666             :          */
     667           0 :         list_splice(txlist, &ep->rdllist);
     668           0 :         __pm_relax(ep->ws);
     669             : 
     670           0 :         if (!list_empty(&ep->rdllist)) {
     671           0 :                 if (waitqueue_active(&ep->wq))
     672           0 :                         wake_up(&ep->wq);
     673             :         }
     674             : 
     675           0 :         write_unlock_irq(&ep->lock);
     676           0 : }
     677             : 
     678           0 : static void epi_rcu_free(struct rcu_head *head)
     679             : {
     680           0 :         struct epitem *epi = container_of(head, struct epitem, rcu);
     681           0 :         kmem_cache_free(epi_cache, epi);
     682           0 : }
     683             : 
     684             : /*
     685             :  * Removes a "struct epitem" from the eventpoll RB tree and deallocates
     686             :  * all the associated resources. Must be called with "mtx" held.
     687             :  */
     688           0 : static int ep_remove(struct eventpoll *ep, struct epitem *epi)
     689             : {
     690           0 :         struct file *file = epi->ffd.file;
     691             :         struct epitems_head *to_free;
     692             :         struct hlist_head *head;
     693             : 
     694             :         lockdep_assert_irqs_enabled();
     695             : 
     696             :         /*
     697             :          * Removes poll wait queue hooks.
     698             :          */
     699           0 :         ep_unregister_pollwait(ep, epi);
     700             : 
     701             :         /* Remove the current item from the list of epoll hooks */
     702           0 :         spin_lock(&file->f_lock);
     703           0 :         to_free = NULL;
     704           0 :         head = file->f_ep;
     705           0 :         if (head->first == &epi->fllink && !epi->fllink.next) {
     706           0 :                 file->f_ep = NULL;
     707           0 :                 if (!is_file_epoll(file)) {
     708             :                         struct epitems_head *v;
     709           0 :                         v = container_of(head, struct epitems_head, epitems);
     710           0 :                         if (!smp_load_acquire(&v->next))
     711           0 :                                 to_free = v;
     712             :                 }
     713             :         }
     714           0 :         hlist_del_rcu(&epi->fllink);
     715           0 :         spin_unlock(&file->f_lock);
     716           0 :         free_ephead(to_free);
     717             : 
     718           0 :         rb_erase_cached(&epi->rbn, &ep->rbr);
     719             : 
     720           0 :         write_lock_irq(&ep->lock);
     721           0 :         if (ep_is_linked(epi))
     722           0 :                 list_del_init(&epi->rdllink);
     723           0 :         write_unlock_irq(&ep->lock);
     724             : 
     725           0 :         wakeup_source_unregister(ep_wakeup_source(epi));
     726             :         /*
     727             :          * At this point it is safe to free the eventpoll item. Use the union
     728             :          * field epi->rcu, since we are trying to minimize the size of
     729             :          * 'struct epitem'. The 'rbn' field is no longer in use. Protected by
     730             :          * ep->mtx. The rcu read side, reverse_path_check_proc(), does not make
     731             :          * use of the rbn field.
     732             :          */
     733           0 :         call_rcu(&epi->rcu, epi_rcu_free);
     734             : 
     735           0 :         percpu_counter_dec(&ep->user->epoll_watches);
     736             : 
     737           0 :         return 0;
     738             : }
     739             : 
     740           0 : static void ep_free(struct eventpoll *ep)
     741             : {
     742             :         struct rb_node *rbp;
     743             :         struct epitem *epi;
     744             : 
     745             :         /* We need to release all tasks waiting for this file */
     746           0 :         if (waitqueue_active(&ep->poll_wait))
     747           0 :                 ep_poll_safewake(ep, NULL, 0);
     748             : 
     749             :         /*
     750             :          * We need to lock this because we could be hit by
     751             :          * eventpoll_release_file() while we're freeing the "struct eventpoll".
     752             :          * We do not need to hold "ep->mtx" here because the epoll file
     753             :          * is on the way to be removed and no one has references to it
     754             :          * anymore. The only hit might come from eventpoll_release_file() but
     755             :          * holding "epmutex" is sufficient here.
     756             :          */
     757           0 :         mutex_lock(&epmutex);
     758             : 
     759             :         /*
     760             :          * Walks through the whole tree by unregistering poll callbacks.
     761             :          */
     762           0 :         for (rbp = rb_first_cached(&ep->rbr); rbp; rbp = rb_next(rbp)) {
     763           0 :                 epi = rb_entry(rbp, struct epitem, rbn);
     764             : 
     765           0 :                 ep_unregister_pollwait(ep, epi);
     766           0 :                 cond_resched();
     767             :         }
     768             : 
     769             :         /*
     770             :          * Walks through the whole tree by freeing each "struct epitem". At this
     771             :          * point we are sure no poll callbacks will be lingering around, and also by
     772             :          * holding "epmutex" we can be sure that no file cleanup code will hit
     773             :          * us during this operation. So we can avoid the lock on "ep->lock".
     774             :          * We do not need to lock ep->mtx, either, we only do it to prevent
     775             :          * a lockdep warning.
     776             :          */
     777           0 :         mutex_lock(&ep->mtx);
     778           0 :         while ((rbp = rb_first_cached(&ep->rbr)) != NULL) {
     779           0 :                 epi = rb_entry(rbp, struct epitem, rbn);
     780           0 :                 ep_remove(ep, epi);
     781           0 :                 cond_resched();
     782             :         }
     783           0 :         mutex_unlock(&ep->mtx);
     784             : 
     785           0 :         mutex_unlock(&epmutex);
     786           0 :         mutex_destroy(&ep->mtx);
     787           0 :         free_uid(ep->user);
     788           0 :         wakeup_source_unregister(ep->ws);
     789           0 :         kfree(ep);
     790           0 : }
     791             : 
     792           0 : static int ep_eventpoll_release(struct inode *inode, struct file *file)
     793             : {
     794           0 :         struct eventpoll *ep = file->private_data;
     795             : 
     796           0 :         if (ep)
     797           0 :                 ep_free(ep);
     798             : 
     799           0 :         return 0;
     800             : }
     801             : 
     802             : static __poll_t ep_item_poll(const struct epitem *epi, poll_table *pt, int depth);
     803             : 
     804           0 : static __poll_t __ep_eventpoll_poll(struct file *file, poll_table *wait, int depth)
     805             : {
     806           0 :         struct eventpoll *ep = file->private_data;
     807           0 :         LIST_HEAD(txlist);
     808             :         struct epitem *epi, *tmp;
     809             :         poll_table pt;
     810           0 :         __poll_t res = 0;
     811             : 
     812           0 :         init_poll_funcptr(&pt, NULL);
     813             : 
     814             :         /* Insert inside our poll wait queue */
     815           0 :         poll_wait(file, &ep->poll_wait, wait);
     816             : 
     817             :         /*
     818             :          * Proceed to find out if wanted events are really available inside
     819             :          * the ready list.
     820             :          */
     821           0 :         mutex_lock_nested(&ep->mtx, depth);
     822           0 :         ep_start_scan(ep, &txlist);
     823           0 :         list_for_each_entry_safe(epi, tmp, &txlist, rdllink) {
     824           0 :                 if (ep_item_poll(epi, &pt, depth + 1)) {
     825             :                         res = EPOLLIN | EPOLLRDNORM;
     826             :                         break;
     827             :                 } else {
     828             :                         /*
     829             :                          * Item has been dropped into the ready list by the poll
     830             :                          * callback, but it's not actually ready, as far as
     831             :                          * caller requested events goes. We can remove it here.
     832             :                          */
     833           0 :                         __pm_relax(ep_wakeup_source(epi));
     834           0 :                         list_del_init(&epi->rdllink);
     835             :                 }
     836             :         }
     837           0 :         ep_done_scan(ep, &txlist);
     838           0 :         mutex_unlock(&ep->mtx);
     839           0 :         return res;
     840             : }
     841             : 
     842             : /*
     843             :  * Differs from ep_eventpoll_poll() in that internal callers already have
     844             :  * the ep->mtx so we need to start from depth=1, such that mutex_lock_nested()
     845             :  * is correctly annotated.
     846             :  */
     847           0 : static __poll_t ep_item_poll(const struct epitem *epi, poll_table *pt,
     848             :                                  int depth)
     849             : {
     850           0 :         struct file *file = epi->ffd.file;
     851             :         __poll_t res;
     852             : 
     853           0 :         pt->_key = epi->event.events;
     854           0 :         if (!is_file_epoll(file))
     855             :                 res = vfs_poll(file, pt);
     856             :         else
     857           0 :                 res = __ep_eventpoll_poll(file, pt, depth);
     858           0 :         return res & epi->event.events;
     859             : }
     860             : 
     861           0 : static __poll_t ep_eventpoll_poll(struct file *file, poll_table *wait)
     862             : {
     863           0 :         return __ep_eventpoll_poll(file, wait, 0);
     864             : }
     865             : 
     866             : #ifdef CONFIG_PROC_FS
     867           0 : static void ep_show_fdinfo(struct seq_file *m, struct file *f)
     868             : {
     869           0 :         struct eventpoll *ep = f->private_data;
     870             :         struct rb_node *rbp;
     871             : 
     872           0 :         mutex_lock(&ep->mtx);
     873           0 :         for (rbp = rb_first_cached(&ep->rbr); rbp; rbp = rb_next(rbp)) {
     874           0 :                 struct epitem *epi = rb_entry(rbp, struct epitem, rbn);
     875           0 :                 struct inode *inode = file_inode(epi->ffd.file);
     876             : 
     877           0 :                 seq_printf(m, "tfd: %8d events: %8x data: %16llx "
     878             :                            " pos:%lli ino:%lx sdev:%x\n",
     879             :                            epi->ffd.fd, epi->event.events,
     880           0 :                            (long long)epi->event.data,
     881           0 :                            (long long)epi->ffd.file->f_pos,
     882           0 :                            inode->i_ino, inode->i_sb->s_dev);
     883           0 :                 if (seq_has_overflowed(m))
     884             :                         break;
     885             :         }
     886           0 :         mutex_unlock(&ep->mtx);
     887           0 : }
     888             : #endif
     889             : 
     890             : /* File callbacks that implement the eventpoll file behaviour */
     891             : static const struct file_operations eventpoll_fops = {
     892             : #ifdef CONFIG_PROC_FS
     893             :         .show_fdinfo    = ep_show_fdinfo,
     894             : #endif
     895             :         .release        = ep_eventpoll_release,
     896             :         .poll           = ep_eventpoll_poll,
     897             :         .llseek         = noop_llseek,
     898             : };
     899             : 
     900             : /*
     901             :  * This is called from eventpoll_release() to unlink files from the eventpoll
     902             :  * interface. We need to have this facility to cleanup correctly files that are
     903             :  * closed without being removed from the eventpoll interface.
     904             :  */
     905           0 : void eventpoll_release_file(struct file *file)
     906             : {
     907             :         struct eventpoll *ep;
     908             :         struct epitem *epi;
     909             :         struct hlist_node *next;
     910             : 
     911             :         /*
     912             :          * We don't want to get "file->f_lock" because it is not
     913             :          * necessary. It is not necessary because we're in the "struct file"
     914             :          * cleanup path, and this means that no one is using this file anymore.
     915             :          * So, for example, epoll_ctl() cannot hit here since if we reach this
     916             :          * point, the file counter already went to zero and fget() would fail.
     917             :          * The only hit might come from ep_free(), but holding "epmutex"
     918             :          * will correctly serialize the operation. We do need to acquire
     919             :          * "ep->mtx" after "epmutex" because ep_remove() requires it when called
     920             :          * from anywhere but ep_free().
     921             :          *
     922             :          * Besides, ep_remove() takes "file->f_lock", so we can't hold it here.
     923             :          */
     924           0 :         mutex_lock(&epmutex);
     925           0 :         if (unlikely(!file->f_ep)) {
     926           0 :                 mutex_unlock(&epmutex);
     927           0 :                 return;
     928             :         }
     929           0 :         hlist_for_each_entry_safe(epi, next, file->f_ep, fllink) {
     930           0 :                 ep = epi->ep;
     931           0 :                 mutex_lock_nested(&ep->mtx, 0);
     932           0 :                 ep_remove(ep, epi);
     933           0 :                 mutex_unlock(&ep->mtx);
     934             :         }
     935           0 :         mutex_unlock(&epmutex);
     936             : }
     937             : 
     938           0 : static int ep_alloc(struct eventpoll **pep)
     939             : {
     940             :         int error;
     941             :         struct user_struct *user;
     942             :         struct eventpoll *ep;
     943             : 
     944           0 :         user = get_current_user();
     945           0 :         error = -ENOMEM;
     946           0 :         ep = kzalloc(sizeof(*ep), GFP_KERNEL);
     947           0 :         if (unlikely(!ep))
     948             :                 goto free_uid;
     949             : 
     950           0 :         mutex_init(&ep->mtx);
     951             :         rwlock_init(&ep->lock);
     952           0 :         init_waitqueue_head(&ep->wq);
     953           0 :         init_waitqueue_head(&ep->poll_wait);
     954           0 :         INIT_LIST_HEAD(&ep->rdllist);
     955           0 :         ep->rbr = RB_ROOT_CACHED;
     956           0 :         ep->ovflist = EP_UNACTIVE_PTR;
     957           0 :         ep->user = user;
     958             : 
     959           0 :         *pep = ep;
     960             : 
     961           0 :         return 0;
     962             : 
     963             : free_uid:
     964           0 :         free_uid(user);
     965           0 :         return error;
     966             : }
     967             : 
     968             : /*
     969             :  * Search the file inside the eventpoll tree. The RB tree operations
     970             :  * are protected by the "mtx" mutex, and ep_find() must be called with
     971             :  * "mtx" held.
     972             :  */
     973             : static struct epitem *ep_find(struct eventpoll *ep, struct file *file, int fd)
     974             : {
     975             :         int kcmp;
     976             :         struct rb_node *rbp;
     977           0 :         struct epitem *epi, *epir = NULL;
     978             :         struct epoll_filefd ffd;
     979             : 
     980           0 :         ep_set_ffd(&ffd, file, fd);
     981           0 :         for (rbp = ep->rbr.rb_root.rb_node; rbp; ) {
     982           0 :                 epi = rb_entry(rbp, struct epitem, rbn);
     983           0 :                 kcmp = ep_cmp_ffd(&ffd, &epi->ffd);
     984           0 :                 if (kcmp > 0)
     985           0 :                         rbp = rbp->rb_right;
     986           0 :                 else if (kcmp < 0)
     987           0 :                         rbp = rbp->rb_left;
     988             :                 else {
     989             :                         epir = epi;
     990             :                         break;
     991             :                 }
     992             :         }
     993             : 
     994             :         return epir;
     995             : }
     996             : 
     997             : #ifdef CONFIG_KCMP
     998           0 : static struct epitem *ep_find_tfd(struct eventpoll *ep, int tfd, unsigned long toff)
     999             : {
    1000             :         struct rb_node *rbp;
    1001             :         struct epitem *epi;
    1002             : 
    1003           0 :         for (rbp = rb_first_cached(&ep->rbr); rbp; rbp = rb_next(rbp)) {
    1004           0 :                 epi = rb_entry(rbp, struct epitem, rbn);
    1005           0 :                 if (epi->ffd.fd == tfd) {
    1006           0 :                         if (toff == 0)
    1007             :                                 return epi;
    1008             :                         else
    1009           0 :                                 toff--;
    1010             :                 }
    1011           0 :                 cond_resched();
    1012             :         }
    1013             : 
    1014             :         return NULL;
    1015             : }
    1016             : 
    1017           0 : struct file *get_epoll_tfile_raw_ptr(struct file *file, int tfd,
    1018             :                                      unsigned long toff)
    1019             : {
    1020             :         struct file *file_raw;
    1021             :         struct eventpoll *ep;
    1022             :         struct epitem *epi;
    1023             : 
    1024           0 :         if (!is_file_epoll(file))
    1025             :                 return ERR_PTR(-EINVAL);
    1026             : 
    1027           0 :         ep = file->private_data;
    1028             : 
    1029           0 :         mutex_lock(&ep->mtx);
    1030           0 :         epi = ep_find_tfd(ep, tfd, toff);
    1031           0 :         if (epi)
    1032           0 :                 file_raw = epi->ffd.file;
    1033             :         else
    1034             :                 file_raw = ERR_PTR(-ENOENT);
    1035           0 :         mutex_unlock(&ep->mtx);
    1036             : 
    1037           0 :         return file_raw;
    1038             : }
    1039             : #endif /* CONFIG_KCMP */
    1040             : 
    1041             : /*
    1042             :  * Adds a new entry to the tail of the list in a lockless way, i.e.
    1043             :  * multiple CPUs are allowed to call this function concurrently.
    1044             :  *
    1045             :  * Beware: it is necessary to prevent any other modifications of the
    1046             :  *         existing list until all changes are completed, in other words
    1047             :  *         concurrent list_add_tail_lockless() calls should be protected
    1048             :  *         with a read lock, where write lock acts as a barrier which
    1049             :  *         makes sure all list_add_tail_lockless() calls are fully
    1050             :  *         completed.
    1051             :  *
    1052             :  *        Also an element can be locklessly added to the list only in one
    1053             :  *        direction i.e. either to the tail or to the head, otherwise
    1054             :  *        concurrent access will corrupt the list.
    1055             :  *
    1056             :  * Return: %false if element has been already added to the list, %true
    1057             :  * otherwise.
    1058             :  */
    1059             : static inline bool list_add_tail_lockless(struct list_head *new,
    1060             :                                           struct list_head *head)
    1061             : {
    1062             :         struct list_head *prev;
    1063             : 
    1064             :         /*
    1065             :          * This is simple 'new->next = head' operation, but cmpxchg()
    1066             :          * is used in order to detect that same element has been just
    1067             :          * added to the list from another CPU: the winner observes
    1068             :          * new->next == new.
    1069             :          */
    1070           0 :         if (!try_cmpxchg(&new->next, &new, head))
    1071             :                 return false;
    1072             : 
    1073             :         /*
    1074             :          * Initially ->next of a new element must be updated with the head
    1075             :          * (we are inserting to the tail) and only then pointers are atomically
    1076             :          * exchanged.  XCHG guarantees memory ordering, thus ->next should be
    1077             :          * updated before pointers are actually swapped and pointers are
    1078             :          * swapped before prev->next is updated.
    1079             :          */
    1080             : 
    1081           0 :         prev = xchg(&head->prev, new);
    1082             : 
    1083             :         /*
    1084             :          * It is safe to modify prev->next and new->prev, because a new element
    1085             :          * is added only to the tail and new->next is updated before XCHG.
    1086             :          */
    1087             : 
    1088           0 :         prev->next = new;
    1089           0 :         new->prev = prev;
    1090             : 
    1091             :         return true;
    1092             : }
    1093             : 
    1094             : /*
    1095             :  * Chains a new epi entry to the tail of the ep->ovflist in a lockless way,
    1096             :  * i.e. multiple CPUs are allowed to call this function concurrently.
    1097             :  *
    1098             :  * Return: %false if epi element has been already chained, %true otherwise.
    1099             :  */
    1100             : static inline bool chain_epi_lockless(struct epitem *epi)
    1101             : {
    1102           0 :         struct eventpoll *ep = epi->ep;
    1103             : 
    1104             :         /* Fast preliminary check */
    1105           0 :         if (epi->next != EP_UNACTIVE_PTR)
    1106             :                 return false;
    1107             : 
    1108             :         /* Check that the same epi has not been just chained from another CPU */
    1109           0 :         if (cmpxchg(&epi->next, EP_UNACTIVE_PTR, NULL) != EP_UNACTIVE_PTR)
    1110             :                 return false;
    1111             : 
    1112             :         /* Atomically exchange tail */
    1113           0 :         epi->next = xchg(&ep->ovflist, epi);
    1114             : 
    1115             :         return true;
    1116             : }
    1117             : 
    1118             : /*
    1119             :  * This is the callback that is passed to the wait queue wakeup
    1120             :  * mechanism. It is called by the stored file descriptors when they
    1121             :  * have events to report.
    1122             :  *
    1123             :  * This callback takes a read lock in order not to contend with concurrent
    1124             :  * events from another file descriptor, thus all modifications to ->rdllist
    1125             :  * or ->ovflist are lockless.  The read lock is paired with the write lock
    1126             :  * taken in ep_start_scan() and ep_done_scan(), which stops all list
    1127             :  * modifications and guarantees that the lists' state is seen correctly.
    1128             :  *
    1129             :  * Another thing worth mentioning is that ep_poll_callback() can be called
    1130             :  * concurrently for the same @epi from different CPUs if the poll table was
    1131             :  * initialized with several wait queue entries.  Multiple wakeups of a single
    1132             :  * wait queue from different CPUs are serialized by wq.lock, but the case when
    1133             :  * multiple wait queues are used should be detected accordingly.  This is
    1134             :  * detected using the cmpxchg() operation.
    1135             :  */
    1136           0 : static int ep_poll_callback(wait_queue_entry_t *wait, unsigned mode, int sync, void *key)
    1137             : {
    1138           0 :         int pwake = 0;
    1139           0 :         struct epitem *epi = ep_item_from_wait(wait);
    1140           0 :         struct eventpoll *ep = epi->ep;
    1141           0 :         __poll_t pollflags = key_to_poll(key);
    1142             :         unsigned long flags;
    1143           0 :         int ewake = 0;
    1144             : 
    1145           0 :         read_lock_irqsave(&ep->lock, flags);
    1146             : 
    1147             :         ep_set_busy_poll_napi_id(epi);
    1148             : 
    1149             :         /*
    1150             :          * If the event mask does not contain any poll(2) event, we consider the
    1151             :          * descriptor to be disabled. This condition is likely the effect of the
    1152             :          * EPOLLONESHOT bit that disables the descriptor when an event is received,
    1153             :          * until the next EPOLL_CTL_MOD will be issued.
    1154             :          */
    1155           0 :         if (!(epi->event.events & ~EP_PRIVATE_BITS))
    1156             :                 goto out_unlock;
    1157             : 
    1158             :         /*
    1159             :          * Check the events coming with the callback. At this stage, not
    1160             :          * every device reports the events in the "key" parameter of the
    1161             :          * callback. We need to be able to handle both cases here, hence the
    1162             :          * test for "key" != NULL before the event match test.
    1163             :          */
    1164           0 :         if (pollflags && !(pollflags & epi->event.events))
    1165             :                 goto out_unlock;
    1166             : 
    1167             :         /*
    1168             :          * If we are transferring events to userspace, we can hold no locks
    1169             :          * (because we're accessing user memory, and because of linux f_op->poll()
    1170             :          * semantics). All the events that happen during that period of time are
    1171             :          * chained in ep->ovflist and requeued later on.
    1172             :          */
    1173           0 :         if (READ_ONCE(ep->ovflist) != EP_UNACTIVE_PTR) {
    1174           0 :                 if (chain_epi_lockless(epi))
    1175             :                         ep_pm_stay_awake_rcu(epi);
    1176           0 :         } else if (!ep_is_linked(epi)) {
    1177             :                 /* In the usual case, add event to ready list. */
    1178           0 :                 if (list_add_tail_lockless(&epi->rdllink, &ep->rdllist))
    1179             :                         ep_pm_stay_awake_rcu(epi);
    1180             :         }
    1181             : 
    1182             :         /*
    1183             :          * Wake up ( if active ) both the eventpoll wait list and the ->poll()
    1184             :          * wait list.
    1185             :          */
    1186           0 :         if (waitqueue_active(&ep->wq)) {
    1187           0 :                 if ((epi->event.events & EPOLLEXCLUSIVE) &&
    1188           0 :                                         !(pollflags & POLLFREE)) {
    1189           0 :                         switch (pollflags & EPOLLINOUT_BITS) {
    1190             :                         case EPOLLIN:
    1191           0 :                                 if (epi->event.events & EPOLLIN)
    1192           0 :                                         ewake = 1;
    1193             :                                 break;
    1194             :                         case EPOLLOUT:
    1195           0 :                                 if (epi->event.events & EPOLLOUT)
    1196           0 :                                         ewake = 1;
    1197             :                                 break;
    1198             :                         case 0:
    1199           0 :                                 ewake = 1;
    1200           0 :                                 break;
    1201             :                         }
    1202             :                 }
    1203           0 :                 wake_up(&ep->wq);
    1204             :         }
    1205           0 :         if (waitqueue_active(&ep->poll_wait))
    1206           0 :                 pwake++;
    1207             : 
    1208             : out_unlock:
    1209           0 :         read_unlock_irqrestore(&ep->lock, flags);
    1210             : 
    1211             :         /* We have to call this outside the lock */
    1212           0 :         if (pwake)
    1213           0 :                 ep_poll_safewake(ep, epi, pollflags & EPOLL_URING_WAKE);
    1214             : 
    1215           0 :         if (!(epi->event.events & EPOLLEXCLUSIVE))
    1216           0 :                 ewake = 1;
    1217             : 
    1218           0 :         if (pollflags & POLLFREE) {
    1219             :                 /*
    1220             :                  * If we race with ep_remove_wait_queue() it can miss
    1221             :                  * ->whead = NULL and do another remove_wait_queue() after
    1222             :                  * us, so we can't use __remove_wait_queue().
    1223             :                  */
    1224           0 :                 list_del_init(&wait->entry);
    1225             :                 /*
    1226             :                  * ->whead != NULL protects us from the race with ep_free()
    1227             :                  * or ep_remove(), ep_remove_wait_queue() takes whead->lock
    1228             :                  * held by the caller. Once we nullify it, nothing protects
    1229             :                  * ep/epi or even wait.
    1230             :                  */
    1231           0 :                 smp_store_release(&ep_pwq_from_wait(wait)->whead, NULL);
    1232             :         }
    1233             : 
    1234           0 :         return ewake;
    1235             : }
    1236             : 
    1237             : /*
    1238             :  * This is the callback that is used to add our wait queue entry to the
    1239             :  * target file's wakeup lists; on allocation failure it clears epq->epi.
    1240             :  */
    1241           0 : static void ep_ptable_queue_proc(struct file *file, wait_queue_head_t *whead,
    1242             :                                  poll_table *pt)
    1243             : {
    1244           0 :         struct ep_pqueue *epq = container_of(pt, struct ep_pqueue, pt);
    1245           0 :         struct epitem *epi = epq->epi;
    1246             :         struct eppoll_entry *pwq;
    1247             : 
    1248           0 :         if (unlikely(!epi))     // an earlier allocation has failed
    1249             :                 return;
    1250             : 
    1251           0 :         pwq = kmem_cache_alloc(pwq_cache, GFP_KERNEL);
    1252           0 :         if (unlikely(!pwq)) {
    1253           0 :                 epq->epi = NULL;
    1254           0 :                 return;
    1255             :         }
    1256             :         /* Hook ep_poll_callback() into @whead and push the entry onto epi->pwqlist */
    1257           0 :         init_waitqueue_func_entry(&pwq->wait, ep_poll_callback);
    1258           0 :         pwq->whead = whead;
    1259           0 :         pwq->base = epi;
    1260           0 :         if (epi->event.events & EPOLLEXCLUSIVE)
    1261           0 :                 add_wait_queue_exclusive(whead, &pwq->wait);
    1262             :         else
    1263           0 :                 add_wait_queue(whead, &pwq->wait);
    1264           0 :         pwq->next = epi->pwqlist;
    1265           0 :         epi->pwqlist = pwq;
    1266             : }
    1267             : /* Link @epi into @ep's cached rbtree, ordered by ep_cmp_ffd(); keys that do not compare greater go left. */
    1268           0 : static void ep_rbtree_insert(struct eventpoll *ep, struct epitem *epi)
    1269             : {
    1270             :         int kcmp;
    1271           0 :         struct rb_node **p = &ep->rbr.rb_root.rb_node, *parent = NULL;
    1272             :         struct epitem *epic;
    1273           0 :         bool leftmost = true;
    1274             :         /* Descend to the insertion slot; any right turn means @epi is not the leftmost node */
    1275           0 :         while (*p) {
    1276           0 :                 parent = *p;
    1277           0 :                 epic = rb_entry(parent, struct epitem, rbn);
    1278           0 :                 kcmp = ep_cmp_ffd(&epi->ffd, &epic->ffd);
    1279           0 :                 if (kcmp > 0) {
    1280           0 :                         p = &parent->rb_right;
    1281           0 :                         leftmost = false;
    1282             :                 } else
    1283           0 :                         p = &parent->rb_left;
    1284             :         }
    1285           0 :         rb_link_node(&epi->rbn, parent, p);
    1286           0 :         rb_insert_color_cached(&epi->rbn, &ep->rbr, leftmost);
    1287           0 : }
    1288             : 
    1289             : 
    1290             : 
    1291             : #define PATH_ARR_SIZE 5
    1292             : /*
    1293             :  * These are the numbers of paths of length 1 to 5 that we allow to emanate
    1294             :  * from a single file of interest. For example, we allow 1000 paths of length
    1295             :  * 1 to emanate from each file of interest. This essentially represents the
    1296             :  * potential wakeup paths, which need to be limited in order to avoid massive
    1297             :  * uncontrolled wakeup storms. The common use case should be a single ep which
    1298             :  * is connected to n file sources. In this case each file source has 1 path
    1299             :  * of length 1. Thus, the numbers below should be more than sufficient. These
    1300             :  * path limits are enforced during an EPOLL_CTL_ADD operation, since a modify
    1301             :  * or delete can't add additional paths. Protected by the epmutex.
    1302             :  */
    1303             : static const int path_limits[PATH_ARR_SIZE] = { 1000, 500, 100, 50, 10 };
    1304             : static int path_count[PATH_ARR_SIZE];
    1305             : /* Count one more path at nesting level @nests; returns -1 once path_limits[nests] is exceeded, 0 otherwise. */
    1306             : static int path_count_inc(int nests)
    1307             : {
    1308             :         /* Allow an arbitrary number of depth 1 paths: level 0 is never counted, so path_limits[0] is not enforced here */
    1309           0 :         if (nests == 0)
    1310             :                 return 0;
    1311             : 
    1312           0 :         if (++path_count[nests] > path_limits[nests])
    1313             :                 return -1;
    1314             :         return 0;
    1315             : }
    1316             : /* Reset every per-level entry of path_count[] to zero. */
    1317             : static void path_count_init(void)
    1318             : {
    1319             :         int i;
    1320             : 
    1321           0 :         for (i = 0; i < PATH_ARR_SIZE; i++)
    1322           0 :                 path_count[i] = 0;
    1323             : }
    1324             : /* Walk the epitems on @refs: an eventpoll with empty ->refs is counted at @depth, others recurse at @depth + 1; returns 0 or -1. */
    1325           0 : static int reverse_path_check_proc(struct hlist_head *refs, int depth)
    1326             : {
    1327           0 :         int error = 0;
    1328             :         struct epitem *epi;
    1329             : 
    1330           0 :         if (depth > EP_MAX_NESTS) /* too deep nesting */
    1331             :                 return -1;
    1332             : 
    1333             :         /* CTL_DEL can remove links here, but that can't increase our count. Note: the loop-local 'refs' shadows the parameter. */
    1334           0 :         hlist_for_each_entry_rcu(epi, refs, fllink) {
    1335           0 :                 struct hlist_head *refs = &epi->ep->refs;
    1336           0 :                 if (hlist_empty(refs))
    1337             :                         error = path_count_inc(depth);
    1338             :                 else
    1339           0 :                         error = reverse_path_check_proc(refs, depth + 1);
    1340           0 :                 if (error != 0)
    1341             :                         break;
    1342             :         }
    1343             :         return error;
    1344             : }
    1345             : 
    1346             : /**
    1347             :  * reverse_path_check - The tfile_check_list is a list of epitems_head, which
    1348             :  *                      have links that are proposed to be newly added. We need
    1349             :  *                      to make sure that those added links don't add too many
    1350             :  *                      paths such that we will spend all our time waking up
    1351             :  *                      eventpoll objects.
    1352             :  *
    1353             :  * Return: %zero if the proposed links don't create too many paths,
    1354             :  *          %-1 otherwise.
    1355             :  */
    1356           0 : static int reverse_path_check(void)
    1357             : {
    1358             :         struct epitems_head *p;
    1359             :         /* Each listed head is checked independently: the path counters are reset per head */
    1360           0 :         for (p = tfile_check_list; p != EP_UNACTIVE_PTR; p = p->next) {
    1361             :                 int error;
    1362           0 :                 path_count_init();
    1363             :                 rcu_read_lock();
    1364           0 :                 error = reverse_path_check_proc(&p->epitems, 0);
    1365             :                 rcu_read_unlock();
    1366           0 :                 if (error)
    1367             :                         return error;
    1368             :         }
    1369             :         return 0;
    1370             : }
    1371             : /* Register the eventpoll-wide "eventpoll" wakeup source if missing, then one for @epi named after the target file's dentry; returns 0 or -ENOMEM. */
    1372           0 : static int ep_create_wakeup_source(struct epitem *epi)
    1373             : {
    1374             :         struct name_snapshot n;
    1375             :         struct wakeup_source *ws;
    1376             : 
    1377           0 :         if (!epi->ep->ws) {
    1378           0 :                 epi->ep->ws = wakeup_source_register(NULL, "eventpoll");
    1379           0 :                 if (!epi->ep->ws)
    1380             :                         return -ENOMEM;
    1381             :         }
    1382             :         /* The dentry name is only needed for the duration of the registration call */
    1383           0 :         take_dentry_name_snapshot(&n, epi->ffd.file->f_path.dentry);
    1384           0 :         ws = wakeup_source_register(NULL, n.name.name);
    1385           0 :         release_dentry_name_snapshot(&n);
    1386             : 
    1387           0 :         if (!ws)
    1388             :                 return -ENOMEM;
    1389           0 :         rcu_assign_pointer(epi->ws, ws);
    1390             : 
    1391           0 :         return 0;
    1392             : }
    1393             : 
    1394             : /* Rare code path, only used when EPOLL_CTL_MOD removes a wakeup source: clears epi->ws, then unregisters it. */
    1395           0 : static noinline void ep_destroy_wakeup_source(struct epitem *epi)
    1396             : {
    1397           0 :         struct wakeup_source *ws = ep_wakeup_source(epi);
    1398             : 
    1399           0 :         RCU_INIT_POINTER(epi->ws, NULL);
    1400             : 
    1401             :         /*
    1402             :          * Wait for ep_pm_stay_awake_rcu() to finish.  synchronize_rcu() is
    1403             :          * also used internally by wakeup_source_remove() (called by
    1404             :          * wakeup_source_unregister()), so we cannot use call_rcu().
    1405             :          */
    1406           0 :         synchronize_rcu();
    1407           0 :         wakeup_source_unregister(ws);
    1408           0 : }
    1409             : /* Add @epi to @file's ->f_ep hlist, installing a list head first if the file has none; returns 0 or -ENOMEM. */
    1410           0 : static int attach_epitem(struct file *file, struct epitem *epi)
    1411             : {
    1412           0 :         struct epitems_head *to_free = NULL;
    1413           0 :         struct hlist_head *head = NULL;
    1414           0 :         struct eventpoll *ep = NULL;
    1415             : 
    1416           0 :         if (is_file_epoll(file))
    1417           0 :                 ep = file->private_data;
    1418             :         /* epoll files use their own ->refs as head; other files get an allocated epitems_head */
    1419           0 :         if (ep) {
    1420           0 :                 head = &ep->refs;
    1421           0 :         } else if (!READ_ONCE(file->f_ep)) {
    1422             : allocate:
    1423           0 :                 to_free = kmem_cache_zalloc(ephead_cache, GFP_KERNEL);
    1424           0 :                 if (!to_free)
    1425             :                         return -ENOMEM;
    1426           0 :                 head = &to_free->epitems;
    1427             :         }
    1428           0 :         spin_lock(&file->f_lock);
    1429           0 :         if (!file->f_ep) {
    1430           0 :                 if (unlikely(!head)) {
    1431           0 :                         spin_unlock(&file->f_lock);
    1432             :                         goto allocate;
    1433             :                 }
    1434           0 :                 file->f_ep = head;
    1435           0 :                 to_free = NULL;
    1436             :         }
    1437           0 :         hlist_add_head_rcu(&epi->fllink, file->f_ep);
    1438           0 :         spin_unlock(&file->f_lock);
    1439             :         free_ephead(to_free);
    1440             :         return 0;
    1441             : }
    1442             : 
    1443             : /*
    1444             :  * Add a new item for @tfile/@fd to @ep; returns 0 or a negative errno. Must be called with "mtx" held.
    1445             :  */
    1446           0 : static int ep_insert(struct eventpoll *ep, const struct epoll_event *event,
    1447             :                      struct file *tfile, int fd, int full_check)
    1448             : {
    1449           0 :         int error, pwake = 0;
    1450             :         __poll_t revents;
    1451             :         struct epitem *epi;
    1452             :         struct ep_pqueue epq;
    1453           0 :         struct eventpoll *tep = NULL;
    1454             : 
    1455           0 :         if (is_file_epoll(tfile))
    1456           0 :                 tep = tfile->private_data;
    1457             : 
    1458             :         lockdep_assert_irqs_enabled();
    1459             : 
    1460           0 :         if (unlikely(percpu_counter_compare(&ep->user->epoll_watches,
    1461             :                                             max_user_watches) >= 0))
    1462             :                 return -ENOSPC;
    1463           0 :         percpu_counter_inc(&ep->user->epoll_watches);
    1464             : 
    1465           0 :         if (!(epi = kmem_cache_zalloc(epi_cache, GFP_KERNEL))) {
    1466           0 :                 percpu_counter_dec(&ep->user->epoll_watches);
    1467           0 :                 return -ENOMEM;
    1468             :         }
    1469             : 
    1470             :         /* Item initialization follows here ... */
    1471           0 :         INIT_LIST_HEAD(&epi->rdllink);
    1472           0 :         epi->ep = ep;
    1473           0 :         ep_set_ffd(&epi->ffd, tfile, fd);
    1474           0 :         epi->event = *event;
    1475           0 :         epi->next = EP_UNACTIVE_PTR;
    1476             : 
    1477           0 :         if (tep)
    1478           0 :                 mutex_lock_nested(&tep->mtx, 1);
    1479             :         /* Add the current item to the list of active epoll hooks for this file */
    1480           0 :         if (unlikely(attach_epitem(tfile, epi) < 0)) {
    1481           0 :                 if (tep)
    1482           0 :                         mutex_unlock(&tep->mtx);
    1483           0 :                 kmem_cache_free(epi_cache, epi);
    1484           0 :                 percpu_counter_dec(&ep->user->epoll_watches);
    1485           0 :                 return -ENOMEM;
    1486             :         }
    1487             : 
    1488           0 :         if (full_check && !tep)
    1489           0 :                 list_file(tfile);
    1490             : 
    1491             :         /*
    1492             :          * Add the current item to the RB tree. All RB tree operations are
    1493             :          * protected by "mtx", and ep_insert() is called with "mtx" held.
    1494             :          */
    1495           0 :         ep_rbtree_insert(ep, epi);
    1496           0 :         if (tep)
    1497           0 :                 mutex_unlock(&tep->mtx);
    1498             : 
    1499             :         /* Now check if we've created too many backpaths */
    1500           0 :         if (unlikely(full_check && reverse_path_check())) {
    1501           0 :                 ep_remove(ep, epi);
    1502           0 :                 return -EINVAL;
    1503             :         }
    1504             : 
    1505           0 :         if (epi->event.events & EPOLLWAKEUP) {
    1506           0 :                 error = ep_create_wakeup_source(epi);
    1507           0 :                 if (error) {
    1508           0 :                         ep_remove(ep, epi);
    1509           0 :                         return error;
    1510             :                 }
    1511             :         }
    1512             : 
    1513             :         /* Initialize the poll table using the queue callback */
    1514           0 :         epq.epi = epi;
    1515           0 :         init_poll_funcptr(&epq.pt, ep_ptable_queue_proc);
    1516             : 
    1517             :         /*
    1518             :          * Attach the item to the poll hooks and get current event bits.
    1519             :          * We can safely use the file* here because its usage count has
    1520             :          * been increased by the caller of this function. Note that after
    1521             :          * this operation completes, the poll callback can start hitting
    1522             :          * the new item.
    1523             :          */
    1524           0 :         revents = ep_item_poll(epi, &epq.pt, 1);
    1525             : 
    1526             :         /*
    1527             :          * We have to check if something went wrong during the poll wait queue
    1528             :          * install process. Namely, an allocation for a wait queue failed due to
    1529             :          * high memory pressure (ep_ptable_queue_proc() then clears epq.epi).
    1530             :          */
    1531           0 :         if (unlikely(!epq.epi)) {
    1532           0 :                 ep_remove(ep, epi);
    1533           0 :                 return -ENOMEM;
    1534             :         }
    1535             : 
    1536             :         /* We have to drop the new item inside our item list to keep track of it */
    1537           0 :         write_lock_irq(&ep->lock);
    1538             : 
    1539             :         /* Record the NAPI ID of the new item if present */
    1540             :         ep_set_busy_poll_napi_id(epi);
    1541             : 
    1542             :         /* If the file is already "ready" we drop it inside the ready list */
    1543           0 :         if (revents && !ep_is_linked(epi)) {
    1544           0 :                 list_add_tail(&epi->rdllink, &ep->rdllist);
    1545           0 :                 ep_pm_stay_awake(epi);
    1546             : 
    1547             :                 /* Notify waiting tasks that events are available */
    1548           0 :                 if (waitqueue_active(&ep->wq))
    1549           0 :                         wake_up(&ep->wq);
    1550           0 :                 if (waitqueue_active(&ep->poll_wait))
    1551           0 :                         pwake++;
    1552             :         }
    1553             : 
    1554           0 :         write_unlock_irq(&ep->lock);
    1555             : 
    1556             :         /* We have to call this outside the lock */
    1557           0 :         if (pwake)
    1558           0 :                 ep_poll_safewake(ep, NULL, 0);
    1559             : 
    1560             :         return 0;
    1561             : }
    1562             : 
    1563             : /*
    1564             :  * Modify the interest event mask by dropping an event if the new mask
    1565             :  * has a match in the current file status. Must be called with "mtx" held.
    1566             :  */
    1567           0 : static int ep_modify(struct eventpoll *ep, struct epitem *epi,
    1568             :                      const struct epoll_event *event)
    1569             : {
    1570           0 :         int pwake = 0;
    1571             :         poll_table pt;
    1572             : 
    1573             :         lockdep_assert_irqs_enabled();
    1574             : 
    1575           0 :         init_poll_funcptr(&pt, NULL);
    1576             : 
    1577             :         /*
    1578             :          * Set the new event interest mask before calling f_op->poll();
    1579             :          * otherwise we might miss an event that happens between the
    1580             :          * f_op->poll() call and the new event set registering.
    1581             :          */
    1582           0 :         epi->event.events = event->events; /* need barrier below */
    1583           0 :         epi->event.data = event->data; /* protected by mtx */
    1584           0 :         if (epi->event.events & EPOLLWAKEUP) {
    1585           0 :                 if (!ep_has_wakeup_source(epi))
    1586           0 :                         ep_create_wakeup_source(epi); /* NOTE(review): error return is ignored here - confirm best-effort is intended */
    1587           0 :         } else if (ep_has_wakeup_source(epi)) {
    1588           0 :                 ep_destroy_wakeup_source(epi);
    1589             :         }
    1590             : 
    1591             :         /*
    1592             :          * The following barrier has two effects:
    1593             :          *
    1594             :          * 1) Flush epi changes above to other CPUs.  This ensures
    1595             :          *    we do not miss events from ep_poll_callback if an
    1596             :          *    event occurs immediately after we call f_op->poll().
    1597             :          *    We need this because we did not take ep->lock while
    1598             :          *    changing epi above (but ep_poll_callback does take
    1599             :          *    ep->lock).
    1600             :          *
    1601             :          * 2) We also need to ensure we do not miss _past_ events
    1602             :          *    when calling f_op->poll().  This barrier also
    1603             :          *    pairs with the barrier in wq_has_sleeper (see
    1604             :          *    comments for wq_has_sleeper).
    1605             :          *
    1606             :          * This barrier guarantees that ep_poll_callback or f_op->poll
    1607             :          * (or both) will notice the readiness of an item.
    1608             :          */
    1609           0 :         smp_mb();
    1610             : 
    1611             :         /*
    1612             :          * Get current event bits. We can safely use the file* here because
    1613             :          * its usage count has been increased by the caller of this function.
    1614             :          * If the item is "hot" and it is not registered inside the ready
    1615             :          * list, push it inside.
    1616             :          */
    1617           0 :         if (ep_item_poll(epi, &pt, 1)) {
    1618           0 :                 write_lock_irq(&ep->lock);
    1619           0 :                 if (!ep_is_linked(epi)) {
    1620           0 :                         list_add_tail(&epi->rdllink, &ep->rdllist);
    1621           0 :                         ep_pm_stay_awake(epi);
    1622             : 
    1623             :                         /* Notify waiting tasks that events are available */
    1624           0 :                         if (waitqueue_active(&ep->wq))
    1625           0 :                                 wake_up(&ep->wq);
    1626           0 :                         if (waitqueue_active(&ep->poll_wait))
    1627           0 :                                 pwake++;
    1628             :                 }
    1629           0 :                 write_unlock_irq(&ep->lock);
    1630             :         }
    1631             : 
    1632             :         /* We have to call this outside the lock */
    1633           0 :         if (pwake)
    1634           0 :                 ep_poll_safewake(ep, NULL, 0);
    1635             :         /* Every path through this function returns 0 */
    1636           0 :         return 0;
    1637             : }
    1638             : 
    1639           0 : static int ep_send_events(struct eventpoll *ep,
    1640             :                           struct epoll_event __user *events, int maxevents)
    1641             : {
    1642             :         struct epitem *epi, *tmp;
    1643           0 :         LIST_HEAD(txlist);
    1644             :         poll_table pt;
    1645           0 :         int res = 0;
    1646             : 
    1647             :         /*
    1648             :          * Always short-circuit for fatal signals to allow threads to make a
    1649             :          * timely exit without the chance of finding more events available and
    1650             :          * fetching repeatedly.
    1651             :          */
    1652           0 :         if (fatal_signal_pending(current))
    1653             :                 return -EINTR;
    1654             : 
    1655           0 :         init_poll_funcptr(&pt, NULL);
    1656             : 
    1657           0 :         mutex_lock(&ep->mtx);
    1658           0 :         ep_start_scan(ep, &txlist);
    1659             : 
    1660             :         /*
    1661             :          * We can loop without lock because we are passed a task private list.
    1662             :          * Items cannot vanish during the loop we are holding ep->mtx.
    1663             :          */
    1664           0 :         list_for_each_entry_safe(epi, tmp, &txlist, rdllink) {
    1665             :                 struct wakeup_source *ws;
    1666             :                 __poll_t revents;
    1667             : 
    1668           0 :                 if (res >= maxevents)
    1669             :                         break;
    1670             : 
    1671             :                 /*
    1672             :                  * Activate ep->ws before deactivating epi->ws to prevent
    1673             :                  * triggering auto-suspend here (in case we reactive epi->ws
    1674             :                  * below).
    1675             :                  *
    1676             :                  * This could be rearranged to delay the deactivation of epi->ws
    1677             :                  * instead, but then epi->ws would temporarily be out of sync
    1678             :                  * with ep_is_linked().
    1679             :                  */
    1680           0 :                 ws = ep_wakeup_source(epi);
    1681           0 :                 if (ws) {
    1682           0 :                         if (ws->active)
    1683           0 :                                 __pm_stay_awake(ep->ws);
    1684           0 :                         __pm_relax(ws);
    1685             :                 }
    1686             : 
    1687           0 :                 list_del_init(&epi->rdllink);
    1688             : 
    1689             :                 /*
    1690             :                  * If the event mask intersect the caller-requested one,
    1691             :                  * deliver the event to userspace. Again, we are holding ep->mtx,
    1692             :                  * so no operations coming from userspace can change the item.
    1693             :                  */
    1694           0 :                 revents = ep_item_poll(epi, &pt, 1);
    1695           0 :                 if (!revents)
    1696           0 :                         continue;
    1697             : 
    1698           0 :                 events = epoll_put_uevent(revents, epi->event.data, events);
    1699           0 :                 if (!events) {
    1700           0 :                         list_add(&epi->rdllink, &txlist);
    1701           0 :                         ep_pm_stay_awake(epi);
    1702           0 :                         if (!res)
    1703           0 :                                 res = -EFAULT;
    1704             :                         break;
    1705             :                 }
    1706           0 :                 res++;
    1707           0 :                 if (epi->event.events & EPOLLONESHOT)
    1708           0 :                         epi->event.events &= EP_PRIVATE_BITS;
    1709           0 :                 else if (!(epi->event.events & EPOLLET)) {
    1710             :                         /*
    1711             :                          * If this file has been added with Level
    1712             :                          * Trigger mode, we need to insert back inside
    1713             :                          * the ready list, so that the next call to
    1714             :                          * epoll_wait() will check again the events
    1715             :                          * availability. At this point, no one can insert
    1716             :                          * into ep->rdllist besides us. The epoll_ctl()
    1717             :                          * callers are locked out by
    1718             :                          * ep_scan_ready_list() holding "mtx" and the
    1719             :                          * poll callback will queue them in ep->ovflist.
    1720             :                          */
    1721           0 :                         list_add_tail(&epi->rdllink, &ep->rdllist);
    1722             :                         ep_pm_stay_awake(epi);
    1723             :                 }
    1724             :         }
    1725           0 :         ep_done_scan(ep, &txlist);
    1726           0 :         mutex_unlock(&ep->mtx);
    1727             : 
    1728           0 :         return res;
    1729             : }
    1730             : 
    1731           0 : static struct timespec64 *ep_timeout_to_timespec(struct timespec64 *to, long ms)
    1732             : {
    1733             :         struct timespec64 now;
    1734             : 
    1735           0 :         if (ms < 0)
    1736             :                 return NULL;
    1737             : 
    1738           0 :         if (!ms) {
    1739           0 :                 to->tv_sec = 0;
    1740           0 :                 to->tv_nsec = 0;
    1741           0 :                 return to;
    1742             :         }
    1743             : 
    1744           0 :         to->tv_sec = ms / MSEC_PER_SEC;
    1745           0 :         to->tv_nsec = NSEC_PER_MSEC * (ms % MSEC_PER_SEC);
    1746             : 
    1747           0 :         ktime_get_ts64(&now);
    1748           0 :         *to = timespec64_add_safe(now, *to);
    1749           0 :         return to;
    1750             : }
    1751             : 
    1752             : /*
    1753             :  * autoremove_wake_function, but remove even on failure to wake up, because we
    1754             :  * know that default_wake_function/ttwu will only fail if the thread is already
    1755             :  * woken, and in that case the ep_poll loop will remove the entry anyways, not
    1756             :  * try to reuse it.
    1757             :  */
    1758           0 : static int ep_autoremove_wake_function(struct wait_queue_entry *wq_entry,
    1759             :                                        unsigned int mode, int sync, void *key)
    1760             : {
    1761           0 :         int ret = default_wake_function(wq_entry, mode, sync, key);
    1762             : 
    1763           0 :         list_del_init(&wq_entry->entry);
    1764           0 :         return ret;
    1765             : }
    1766             : 
    1767             : /**
    1768             :  * ep_poll - Retrieves ready events, and delivers them to the caller-supplied
    1769             :  *           event buffer.
    1770             :  *
    1771             :  * @ep: Pointer to the eventpoll context.
    1772             :  * @events: Pointer to the userspace buffer where the ready events should be
    1773             :  *          stored.
    1774             :  * @maxevents: Size (in terms of number of events) of the caller event buffer.
    1775             :  * @timeout: Maximum timeout for the ready events fetch operation, in
    1776             :  *           timespec. If the timeout is zero, the function will not block,
    1777             :  *           while if the @timeout ptr is NULL, the function will block
    1778             :  *           until at least one event has been retrieved (or an error
    1779             :  *           occurred).
    1780             :  *
    1781             :  * Return: the number of ready events which have been fetched, or an
    1782             :  *          error code, in case of error.
    1783             :  */
    1784           0 : static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events,
    1785             :                    int maxevents, struct timespec64 *timeout)
    1786             : {
    1787           0 :         int res, eavail, timed_out = 0;
    1788           0 :         u64 slack = 0;
    1789             :         wait_queue_entry_t wait;
    1790           0 :         ktime_t expires, *to = NULL;
    1791             : 
    1792             :         lockdep_assert_irqs_enabled();
    1793             : 
    1794           0 :         if (timeout && (timeout->tv_sec | timeout->tv_nsec)) {
    1795           0 :                 slack = select_estimate_accuracy(timeout);
    1796           0 :                 to = &expires;
    1797           0 :                 *to = timespec64_to_ktime(*timeout);
    1798           0 :         } else if (timeout) {
    1799             :                 /*
    1800             :                  * Avoid the unnecessary trip to the wait queue loop, if the
    1801             :                  * caller specified a non blocking operation.
    1802             :                  */
    1803           0 :                 timed_out = 1;
    1804             :         }
    1805             : 
    1806             :         /*
    1807             :          * This call is racy: We may or may not see events that are being added
    1808             :          * to the ready list under the lock (e.g., in IRQ callbacks). For cases
    1809             :          * with a non-zero timeout, this thread will check the ready list under
    1810             :          * lock and will add to the wait queue.  For cases with a zero
    1811             :          * timeout, the user by definition should not care and will have to
    1812             :          * recheck again.
    1813             :          */
    1814             :         eavail = ep_events_available(ep);
    1815             : 
    1816             :         while (1) {
    1817           0 :                 if (eavail) {
    1818             :                         /*
    1819             :                          * Try to transfer events to user space. In case we get
    1820             :                          * 0 events and there's still timeout left over, we go
    1821             :                          * trying again in search of more luck.
    1822             :                          */
    1823           0 :                         res = ep_send_events(ep, events, maxevents);
    1824           0 :                         if (res)
    1825             :                                 return res;
    1826             :                 }
    1827             : 
    1828           0 :                 if (timed_out)
    1829             :                         return 0;
    1830             : 
    1831           0 :                 eavail = ep_busy_loop(ep, timed_out);
    1832             :                 if (eavail)
    1833             :                         continue;
    1834             : 
    1835           0 :                 if (signal_pending(current))
    1836             :                         return -EINTR;
    1837             : 
    1838             :                 /*
    1839             :                  * Internally init_wait() uses autoremove_wake_function(),
    1840             :                  * thus wait entry is removed from the wait queue on each
    1841             :                  * wakeup. Why is this important? In case of several waiters
    1842             :                  * each new wakeup will hit the next waiter, giving it the
    1843             :                  * chance to harvest new event. Otherwise wakeup can be
    1844             :                  * lost. This is also good performance-wise, because on
    1845             :                  * normal wakeup path no need to call __remove_wait_queue()
    1846             :                  * explicitly, thus ep->lock is not taken, which halts the
    1847             :                  * event delivery.
    1848             :                  *
    1849             :                  * In fact, we now use an even more aggressive function that
    1850             :                  * unconditionally removes, because we don't reuse the wait
    1851             :                  * entry between loop iterations. This lets us also avoid the
    1852             :                  * performance issue if a process is killed, causing all of its
    1853             :                  * threads to wake up without being removed normally.
    1854             :                  */
    1855           0 :                 init_wait(&wait);
    1856           0 :                 wait.func = ep_autoremove_wake_function;
    1857             : 
    1858           0 :                 write_lock_irq(&ep->lock);
    1859             :                 /*
    1860             :                  * Barrierless variant, waitqueue_active() is called under
    1861             :                  * the same lock on wakeup ep_poll_callback() side, so it
    1862             :                  * is safe to avoid an explicit barrier.
    1863             :                  */
    1864           0 :                 __set_current_state(TASK_INTERRUPTIBLE);
    1865             : 
    1866             :                 /*
    1867             :                  * Do the final check under the lock. ep_scan_ready_list()
    1868             :                  * plays with two lists (->rdllist and ->ovflist) and there
    1869             :                  * is always a race when both lists are empty for short
    1870             :                  * period of time although events are pending, so lock is
    1871             :                  * important.
    1872             :                  */
    1873           0 :                 eavail = ep_events_available(ep);
    1874           0 :                 if (!eavail)
    1875           0 :                         __add_wait_queue_exclusive(&ep->wq, &wait);
    1876             : 
    1877           0 :                 write_unlock_irq(&ep->lock);
    1878             : 
    1879           0 :                 if (!eavail)
    1880           0 :                         timed_out = !schedule_hrtimeout_range(to, slack,
    1881             :                                                               HRTIMER_MODE_ABS);
    1882           0 :                 __set_current_state(TASK_RUNNING);
    1883             : 
    1884             :                 /*
    1885             :                  * We were woken up, thus go and try to harvest some events.
    1886             :                  * If timed out and still on the wait queue, recheck eavail
    1887             :                  * carefully under lock, below.
    1888             :                  */
    1889           0 :                 eavail = 1;
    1890             : 
    1891           0 :                 if (!list_empty_careful(&wait.entry)) {
    1892           0 :                         write_lock_irq(&ep->lock);
    1893             :                         /*
    1894             :                          * If the thread timed out and is not on the wait queue,
    1895             :                          * it means that the thread was woken up after its
    1896             :                          * timeout expired before it could reacquire the lock.
    1897             :                          * Thus, when wait.entry is empty, it needs to harvest
    1898             :                          * events.
    1899             :                          */
    1900           0 :                         if (timed_out)
    1901           0 :                                 eavail = list_empty(&wait.entry);
    1902           0 :                         __remove_wait_queue(&ep->wq, &wait);
    1903           0 :                         write_unlock_irq(&ep->lock);
    1904             :                 }
    1905             :         }
    1906             : }
    1907             : 
    1908             : /**
    1909             :  * ep_loop_check_proc - verify that adding an epoll file inside another
    1910             :  *                      epoll structure does not violate the constraints, in
    1911             :  *                      terms of closed loops, or too deep chains (which can
    1912             :  *                      result in excessive stack usage).
    1913             :  *
    1914             :  * @ep: the &struct eventpoll to be currently checked.
    1915             :  * @depth: Current depth of the path being checked.
    1916             :  *
    1917             :  * Return: %zero if adding the epoll @file inside current epoll
    1918             :  *          structure @ep does not violate the constraints, or %-1 otherwise.
    1919             :  */
    1920           0 : static int ep_loop_check_proc(struct eventpoll *ep, int depth)
    1921             : {
    1922           0 :         int error = 0;
    1923             :         struct rb_node *rbp;
    1924             :         struct epitem *epi;
    1925             : 
    1926           0 :         mutex_lock_nested(&ep->mtx, depth + 1);
    1927           0 :         ep->gen = loop_check_gen;
    1928           0 :         for (rbp = rb_first_cached(&ep->rbr); rbp; rbp = rb_next(rbp)) {
    1929           0 :                 epi = rb_entry(rbp, struct epitem, rbn);
    1930           0 :                 if (unlikely(is_file_epoll(epi->ffd.file))) {
    1931             :                         struct eventpoll *ep_tovisit;
    1932           0 :                         ep_tovisit = epi->ffd.file->private_data;
    1933           0 :                         if (ep_tovisit->gen == loop_check_gen)
    1934           0 :                                 continue;
    1935           0 :                         if (ep_tovisit == inserting_into || depth > EP_MAX_NESTS)
    1936             :                                 error = -1;
    1937             :                         else
    1938           0 :                                 error = ep_loop_check_proc(ep_tovisit, depth + 1);
    1939           0 :                         if (error != 0)
    1940             :                                 break;
    1941             :                 } else {
    1942             :                         /*
    1943             :                          * If we've reached a file that is not associated with
    1944             :                          * an ep, then we need to check if the newly added
    1945             :                          * links are going to add too many wakeup paths. We do
    1946             :                          * this by adding it to the tfile_check_list, if it's
    1947             :                          * not already there, and calling reverse_path_check()
    1948             :                          * during ep_insert().
    1949             :                          */
    1950           0 :                         list_file(epi->ffd.file);
    1951             :                 }
    1952             :         }
    1953           0 :         mutex_unlock(&ep->mtx);
    1954             : 
    1955           0 :         return error;
    1956             : }
    1957             : 
    1958             : /**
    1959             :  * ep_loop_check - Performs a check to verify that adding an epoll file (@to)
    1960             :  *                 into another epoll file (represented by @ep) does not create
    1961             :  *                 closed loops or too deep chains.
    1962             :  *
    1963             :  * @ep: Pointer to the epoll we are inserting into.
    1964             :  * @to: Pointer to the epoll to be inserted.
    1965             :  *
    1966             :  * Return: %zero if adding the epoll @to inside the epoll @ep
    1967             :  * does not violate the constraints, or %-1 otherwise.
    1968             :  */
    1969             : static int ep_loop_check(struct eventpoll *ep, struct eventpoll *to)
    1970             : {
    1971           0 :         inserting_into = ep;
    1972           0 :         return ep_loop_check_proc(to, 0);
    1973             : }
    1974             : 
    1975             : static void clear_tfile_check_list(void)
    1976             : {
    1977             :         rcu_read_lock();
    1978           0 :         while (tfile_check_list != EP_UNACTIVE_PTR) {
    1979           0 :                 struct epitems_head *head = tfile_check_list;
    1980           0 :                 tfile_check_list = head->next;
    1981           0 :                 unlist_file(head);
    1982             :         }
    1983             :         rcu_read_unlock();
    1984             : }
    1985             : 
    1986             : /*
    1987             :  * Open an eventpoll file descriptor.
    1988             :  */
    1989           0 : static int do_epoll_create(int flags)
    1990             : {
    1991             :         int error, fd;
    1992           0 :         struct eventpoll *ep = NULL;
    1993             :         struct file *file;
    1994             : 
    1995             :         /* Check the EPOLL_* constant for consistency.  */
    1996             :         BUILD_BUG_ON(EPOLL_CLOEXEC != O_CLOEXEC);
    1997             : 
    1998           0 :         if (flags & ~EPOLL_CLOEXEC)
    1999             :                 return -EINVAL;
    2000             :         /*
    2001             :          * Create the internal data structure ("struct eventpoll").
    2002             :          */
    2003           0 :         error = ep_alloc(&ep);
    2004           0 :         if (error < 0)
    2005             :                 return error;
    2006             :         /*
    2007             :          * Creates all the items needed to setup an eventpoll file. That is,
    2008             :          * a file structure and a free file descriptor.
    2009             :          */
    2010           0 :         fd = get_unused_fd_flags(O_RDWR | (flags & O_CLOEXEC));
    2011           0 :         if (fd < 0) {
    2012             :                 error = fd;
    2013             :                 goto out_free_ep;
    2014             :         }
    2015           0 :         file = anon_inode_getfile("[eventpoll]", &eventpoll_fops, ep,
    2016             :                                  O_RDWR | (flags & O_CLOEXEC));
    2017           0 :         if (IS_ERR(file)) {
    2018           0 :                 error = PTR_ERR(file);
    2019             :                 goto out_free_fd;
    2020             :         }
    2021           0 :         ep->file = file;
    2022           0 :         fd_install(fd, file);
    2023           0 :         return fd;
    2024             : 
    2025             : out_free_fd:
    2026           0 :         put_unused_fd(fd);
    2027             : out_free_ep:
    2028           0 :         ep_free(ep);
    2029           0 :         return error;
    2030             : }
    2031             : 
    2032           0 : SYSCALL_DEFINE1(epoll_create1, int, flags)
    2033             : {
    2034           0 :         return do_epoll_create(flags);
    2035             : }
    2036             : 
    2037           0 : SYSCALL_DEFINE1(epoll_create, int, size)
    2038             : {
    2039           0 :         if (size <= 0)
    2040             :                 return -EINVAL;
    2041             : 
    2042           0 :         return do_epoll_create(0);
    2043             : }
    2044             : 
    2045             : static inline int epoll_mutex_lock(struct mutex *mutex, int depth,
    2046             :                                    bool nonblock)
    2047             : {
    2048           0 :         if (!nonblock) {
    2049           0 :                 mutex_lock_nested(mutex, depth);
    2050             :                 return 0;
    2051             :         }
    2052           0 :         if (mutex_trylock(mutex))
    2053             :                 return 0;
    2054             :         return -EAGAIN;
    2055             : }
    2056             : 
    2057           0 : int do_epoll_ctl(int epfd, int op, int fd, struct epoll_event *epds,
    2058             :                  bool nonblock)
    2059             : {
    2060             :         int error;
    2061           0 :         int full_check = 0;
    2062             :         struct fd f, tf;
    2063             :         struct eventpoll *ep;
    2064             :         struct epitem *epi;
    2065           0 :         struct eventpoll *tep = NULL;
    2066             : 
    2067           0 :         error = -EBADF;
    2068           0 :         f = fdget(epfd);
    2069           0 :         if (!f.file)
    2070             :                 goto error_return;
    2071             : 
    2072             :         /* Get the "struct file *" for the target file */
    2073           0 :         tf = fdget(fd);
    2074           0 :         if (!tf.file)
    2075             :                 goto error_fput;
    2076             : 
    2077             :         /* The target file descriptor must support poll */
    2078           0 :         error = -EPERM;
    2079           0 :         if (!file_can_poll(tf.file))
    2080             :                 goto error_tgt_fput;
    2081             : 
    2082             :         /* Check if EPOLLWAKEUP is allowed */
    2083           0 :         if (ep_op_has_event(op))
    2084           0 :                 ep_take_care_of_epollwakeup(epds);
    2085             : 
    2086             :         /*
    2087             :          * We have to check that the file structure underneath the file descriptor
    2088             :          * the user passed to us _is_ an eventpoll file. And also we do not permit
    2089             :          * adding an epoll file descriptor inside itself.
    2090             :          */
    2091           0 :         error = -EINVAL;
    2092           0 :         if (f.file == tf.file || !is_file_epoll(f.file))
    2093             :                 goto error_tgt_fput;
    2094             : 
    2095             :         /*
    2096             :          * epoll adds to the wakeup queue at EPOLL_CTL_ADD time only,
    2097             :          * so EPOLLEXCLUSIVE is not allowed for a EPOLL_CTL_MOD operation.
    2098             :          * Also, we do not currently support nested exclusive wakeups.
    2099             :          */
    2100           0 :         if (ep_op_has_event(op) && (epds->events & EPOLLEXCLUSIVE)) {
    2101           0 :                 if (op == EPOLL_CTL_MOD)
    2102             :                         goto error_tgt_fput;
    2103           0 :                 if (op == EPOLL_CTL_ADD && (is_file_epoll(tf.file) ||
    2104           0 :                                 (epds->events & ~EPOLLEXCLUSIVE_OK_BITS)))
    2105             :                         goto error_tgt_fput;
    2106             :         }
    2107             : 
    2108             :         /*
    2109             :          * At this point it is safe to assume that the "private_data" contains
    2110             :          * our own data structure.
    2111             :          */
    2112           0 :         ep = f.file->private_data;
    2113             : 
    2114             :         /*
    2115             :          * When we insert an epoll file descriptor inside another epoll file
    2116             :          * descriptor, there is the chance of creating closed loops, which are
    2117             :          * better handled here than in more critical paths. While we are
    2118             :          * checking for loops we also determine the list of files reachable
    2119             :          * and hang them on the tfile_check_list, so we can check that we
    2120             :          * haven't created too many possible wakeup paths.
    2121             :          *
    2122             :          * We do not need to take the global 'epmutex' on EPOLL_CTL_ADD when
    2123             :          * the epoll file descriptor is attaching directly to a wakeup source,
    2124             :          * unless the epoll file descriptor is nested. The purpose of taking the
    2125             :          * 'epmutex' on add is to prevent complex topologies such as loops and
    2126             :          * deep wakeup paths from forming in parallel through multiple
    2127             :          * EPOLL_CTL_ADD operations.
    2128             :          */
    2129           0 :         error = epoll_mutex_lock(&ep->mtx, 0, nonblock);
    2130           0 :         if (error)
    2131             :                 goto error_tgt_fput;
    2132           0 :         if (op == EPOLL_CTL_ADD) {
    2133           0 :                 if (READ_ONCE(f.file->f_ep) || ep->gen == loop_check_gen ||
    2134           0 :                     is_file_epoll(tf.file)) {
    2135           0 :                         mutex_unlock(&ep->mtx);
    2136           0 :                         error = epoll_mutex_lock(&epmutex, 0, nonblock);
    2137           0 :                         if (error)
    2138             :                                 goto error_tgt_fput;
    2139           0 :                         loop_check_gen++;
    2140           0 :                         full_check = 1;
    2141           0 :                         if (is_file_epoll(tf.file)) {
    2142           0 :                                 tep = tf.file->private_data;
    2143           0 :                                 error = -ELOOP;
    2144           0 :                                 if (ep_loop_check(ep, tep) != 0)
    2145             :                                         goto error_tgt_fput;
    2146             :                         }
    2147           0 :                         error = epoll_mutex_lock(&ep->mtx, 0, nonblock);
    2148           0 :                         if (error)
    2149             :                                 goto error_tgt_fput;
    2150             :                 }
    2151             :         }
    2152             : 
    2153             :         /*
    2154             :          * Try to lookup the file inside our RB tree. Since we grabbed "mtx"
    2155             :          * above, we can be sure to be able to use the item looked up by
    2156             :          * ep_find() till we release the mutex.
    2157             :          */
    2158           0 :         epi = ep_find(ep, tf.file, fd);
    2159             : 
    2160           0 :         error = -EINVAL;
    2161           0 :         switch (op) {
    2162             :         case EPOLL_CTL_ADD:
    2163           0 :                 if (!epi) {
    2164           0 :                         epds->events |= EPOLLERR | EPOLLHUP;
    2165           0 :                         error = ep_insert(ep, epds, tf.file, fd, full_check);
    2166             :                 } else
    2167             :                         error = -EEXIST;
    2168             :                 break;
    2169             :         case EPOLL_CTL_DEL:
    2170           0 :                 if (epi)
    2171           0 :                         error = ep_remove(ep, epi);
    2172             :                 else
    2173             :                         error = -ENOENT;
    2174             :                 break;
    2175             :         case EPOLL_CTL_MOD:
    2176           0 :                 if (epi) {
    2177           0 :                         if (!(epi->event.events & EPOLLEXCLUSIVE)) {
    2178           0 :                                 epds->events |= EPOLLERR | EPOLLHUP;
    2179           0 :                                 error = ep_modify(ep, epi, epds);
    2180             :                         }
    2181             :                 } else
    2182             :                         error = -ENOENT;
    2183             :                 break;
    2184             :         }
    2185           0 :         mutex_unlock(&ep->mtx);
    2186             : 
    2187             : error_tgt_fput:
    2188           0 :         if (full_check) {
    2189             :                 clear_tfile_check_list();
    2190           0 :                 loop_check_gen++;
    2191           0 :                 mutex_unlock(&epmutex);
    2192             :         }
    2193             : 
    2194           0 :         fdput(tf);
    2195             : error_fput:
    2196           0 :         fdput(f);
    2197             : error_return:
    2198             : 
    2199           0 :         return error;
    2200             : }
    2201             : 
    2202             : /*
    2203             :  * The following function implements the controller interface for
    2204             :  * the eventpoll file that enables the insertion/removal/change of
    2205             :  * file descriptors inside the interest set.
    2206             :  */
    2207           0 : SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd,
    2208             :                 struct epoll_event __user *, event)
    2209             : {
    2210             :         struct epoll_event epds;
    2211             : 
    2212           0 :         if (ep_op_has_event(op) &&
    2213           0 :             copy_from_user(&epds, event, sizeof(struct epoll_event)))
    2214             :                 return -EFAULT;
    2215             : 
    2216           0 :         return do_epoll_ctl(epfd, op, fd, &epds, false);
    2217             : }
    2218             : 
    2219             : /*
    2220             :  * Implement the event wait interface for the eventpoll file. It is the kernel
    2221             :  * part of the user space epoll_wait(2).
    2222             :  */
    2223           0 : static int do_epoll_wait(int epfd, struct epoll_event __user *events,
    2224             :                          int maxevents, struct timespec64 *to)
    2225             : {
    2226             :         int error;
    2227             :         struct fd f;
    2228             :         struct eventpoll *ep;
    2229             : 
    2230             :         /* The maximum number of events must be greater than zero */
    2231           0 :         if (maxevents <= 0 || maxevents > EP_MAX_EVENTS)
    2232             :                 return -EINVAL;
    2233             : 
    2234             :         /* Verify that the area passed by the user is writeable */
    2235           0 :         if (!access_ok(events, maxevents * sizeof(struct epoll_event)))
    2236             :                 return -EFAULT;
    2237             : 
    2238             :         /* Get the "struct file *" for the eventpoll file */
    2239           0 :         f = fdget(epfd);
    2240           0 :         if (!f.file)
    2241             :                 return -EBADF;
    2242             : 
    2243             :         /*
    2244             :          * We have to check that the file structure underneath the fd
    2245             :          * the user passed to us _is_ an eventpoll file.
    2246             :          */
    2247           0 :         error = -EINVAL;
    2248           0 :         if (!is_file_epoll(f.file))
    2249             :                 goto error_fput;
    2250             : 
    2251             :         /*
    2252             :          * At this point it is safe to assume that the "private_data" contains
    2253             :          * our own data structure.
    2254             :          */
    2255           0 :         ep = f.file->private_data;
    2256             : 
    2257             :         /* Time to fish for events ... */
    2258           0 :         error = ep_poll(ep, events, maxevents, to);
    2259             : 
    2260             : error_fput:
    2261           0 :         fdput(f);
    2262             :         return error;
    2263             : }
    2264             : 
    2265           0 : SYSCALL_DEFINE4(epoll_wait, int, epfd, struct epoll_event __user *, events,
    2266             :                 int, maxevents, int, timeout)
    2267             : {
    2268             :         struct timespec64 to;
    2269             : 
    2270           0 :         return do_epoll_wait(epfd, events, maxevents,
    2271             :                              ep_timeout_to_timespec(&to, timeout));
    2272             : }
    2273             : 
    2274             : /*
    2275             :  * Implement the event wait interface for the eventpoll file. It is the kernel
    2276             :  * part of the user space epoll_pwait(2).
    2277             :  */
    2278           0 : static int do_epoll_pwait(int epfd, struct epoll_event __user *events,
    2279             :                           int maxevents, struct timespec64 *to,
    2280             :                           const sigset_t __user *sigmask, size_t sigsetsize)
    2281             : {
    2282             :         int error;
    2283             : 
    2284             :         /*
    2285             :          * If the caller wants a certain signal mask to be set during the wait,
    2286             :          * we apply it here.
    2287             :          */
    2288           0 :         error = set_user_sigmask(sigmask, sigsetsize);
    2289           0 :         if (error)
    2290             :                 return error;
    2291             : 
    2292           0 :         error = do_epoll_wait(epfd, events, maxevents, to);
    2293             : 
    2294           0 :         restore_saved_sigmask_unless(error == -EINTR);
    2295             : 
    2296           0 :         return error;
    2297             : }
    2298             : 
    2299           0 : SYSCALL_DEFINE6(epoll_pwait, int, epfd, struct epoll_event __user *, events,
    2300             :                 int, maxevents, int, timeout, const sigset_t __user *, sigmask,
    2301             :                 size_t, sigsetsize)
    2302             : {
    2303             :         struct timespec64 to;
    2304             : 
    2305           0 :         return do_epoll_pwait(epfd, events, maxevents,
    2306             :                               ep_timeout_to_timespec(&to, timeout),
    2307             :                               sigmask, sigsetsize);
    2308             : }
    2309             : 
    2310           0 : SYSCALL_DEFINE6(epoll_pwait2, int, epfd, struct epoll_event __user *, events,
    2311             :                 int, maxevents, const struct __kernel_timespec __user *, timeout,
    2312             :                 const sigset_t __user *, sigmask, size_t, sigsetsize)
    2313             : {
    2314           0 :         struct timespec64 ts, *to = NULL;
    2315             : 
    2316           0 :         if (timeout) {
    2317           0 :                 if (get_timespec64(&ts, timeout))
    2318             :                         return -EFAULT;
    2319           0 :                 to = &ts;
    2320           0 :                 if (poll_select_set_timeout(to, ts.tv_sec, ts.tv_nsec))
    2321             :                         return -EINVAL;
    2322             :         }
    2323             : 
    2324           0 :         return do_epoll_pwait(epfd, events, maxevents, to,
    2325             :                               sigmask, sigsetsize);
    2326             : }
    2327             : 
    2328             : #ifdef CONFIG_COMPAT
    2329             : static int do_compat_epoll_pwait(int epfd, struct epoll_event __user *events,
    2330             :                                  int maxevents, struct timespec64 *timeout,
    2331             :                                  const compat_sigset_t __user *sigmask,
    2332             :                                  compat_size_t sigsetsize)
    2333             : {
    2334             :         long err;
    2335             : 
    2336             :         /*
    2337             :          * If the caller wants a certain signal mask to be set during the wait,
    2338             :          * we apply it here.
    2339             :          */
    2340             :         err = set_compat_user_sigmask(sigmask, sigsetsize);
    2341             :         if (err)
    2342             :                 return err;
    2343             : 
    2344             :         err = do_epoll_wait(epfd, events, maxevents, timeout);
    2345             : 
    2346             :         restore_saved_sigmask_unless(err == -EINTR);
    2347             : 
    2348             :         return err;
    2349             : }
    2350             : 
    2351             : COMPAT_SYSCALL_DEFINE6(epoll_pwait, int, epfd,
    2352             :                        struct epoll_event __user *, events,
    2353             :                        int, maxevents, int, timeout,
    2354             :                        const compat_sigset_t __user *, sigmask,
    2355             :                        compat_size_t, sigsetsize)
    2356             : {
    2357             :         struct timespec64 to;
    2358             : 
    2359             :         return do_compat_epoll_pwait(epfd, events, maxevents,
    2360             :                                      ep_timeout_to_timespec(&to, timeout),
    2361             :                                      sigmask, sigsetsize);
    2362             : }
    2363             : 
    2364             : COMPAT_SYSCALL_DEFINE6(epoll_pwait2, int, epfd,
    2365             :                        struct epoll_event __user *, events,
    2366             :                        int, maxevents,
    2367             :                        const struct __kernel_timespec __user *, timeout,
    2368             :                        const compat_sigset_t __user *, sigmask,
    2369             :                        compat_size_t, sigsetsize)
    2370             : {
    2371             :         struct timespec64 ts, *to = NULL;
    2372             : 
    2373             :         if (timeout) {
    2374             :                 if (get_timespec64(&ts, timeout))
    2375             :                         return -EFAULT;
    2376             :                 to = &ts;
    2377             :                 if (poll_select_set_timeout(to, ts.tv_sec, ts.tv_nsec))
    2378             :                         return -EINVAL;
    2379             :         }
    2380             : 
    2381             :         return do_compat_epoll_pwait(epfd, events, maxevents, to,
    2382             :                                      sigmask, sigsetsize);
    2383             : }
    2384             : 
    2385             : #endif
    2386             : 
    2387           1 : static int __init eventpoll_init(void)
    2388             : {
    2389             :         struct sysinfo si;
    2390             : 
    2391           1 :         si_meminfo(&si);
    2392             :         /*
    2393             :          * Allows top 4% of lomem to be allocated for epoll watches (per user).
    2394             :          */
    2395           1 :         max_user_watches = (((si.totalram - si.totalhigh) / 25) << PAGE_SHIFT) /
    2396             :                 EP_ITEM_COST;
    2397           1 :         BUG_ON(max_user_watches < 0);
    2398             : 
    2399             :         /*
    2400             :          * We can have many thousands of epitems, so prevent this from
    2401             :          * using an extra cache line on 64-bit (and smaller) CPUs
    2402             :          */
    2403             :         BUILD_BUG_ON(sizeof(void *) <= 8 && sizeof(struct epitem) > 128);
    2404             : 
    2405             :         /* Allocates slab cache used to allocate "struct epitem" items */
    2406           1 :         epi_cache = kmem_cache_create("eventpoll_epi", sizeof(struct epitem),
    2407             :                         0, SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_ACCOUNT, NULL);
    2408             : 
    2409             :         /* Allocates slab cache used to allocate "struct eppoll_entry" */
    2410           1 :         pwq_cache = kmem_cache_create("eventpoll_pwq",
    2411             :                 sizeof(struct eppoll_entry), 0, SLAB_PANIC|SLAB_ACCOUNT, NULL);
    2412           1 :         epoll_sysctls_init();
    2413             : 
    2414           1 :         ephead_cache = kmem_cache_create("ep_head",
    2415             :                 sizeof(struct epitems_head), 0, SLAB_PANIC|SLAB_ACCOUNT, NULL);
    2416             : 
    2417           1 :         return 0;
    2418             : }
    2419             : fs_initcall(eventpoll_init);

Generated by: LCOV version 1.14