Line data Source code
1 : // SPDX-License-Identifier: GPL-2.0-only
2 : /*
3 : * fs/eventfd.c
4 : *
5 : * Copyright (C) 2007 Davide Libenzi <davidel@xmailserver.org>
6 : *
7 : */
8 :
9 : #include <linux/file.h>
10 : #include <linux/poll.h>
11 : #include <linux/init.h>
12 : #include <linux/fs.h>
13 : #include <linux/sched/signal.h>
14 : #include <linux/kernel.h>
15 : #include <linux/slab.h>
16 : #include <linux/list.h>
17 : #include <linux/spinlock.h>
18 : #include <linux/anon_inodes.h>
19 : #include <linux/syscalls.h>
20 : #include <linux/export.h>
21 : #include <linux/kref.h>
22 : #include <linux/eventfd.h>
23 : #include <linux/proc_fs.h>
24 : #include <linux/seq_file.h>
25 : #include <linux/idr.h>
26 : #include <linux/uio.h>
27 :
/* ID allocator backing the "id" field of every eventfd context. */
static DEFINE_IDA(eventfd_ida);
29 :
30 : struct eventfd_ctx {
31 : struct kref kref;
32 : wait_queue_head_t wqh;
33 : /*
34 : * Every time that a write(2) is performed on an eventfd, the
35 : * value of the __u64 being written is added to "count" and a
36 : * wakeup is performed on "wqh". A read(2) will return the "count"
37 : * value to userspace, and will reset "count" to zero. The kernel
38 : * side eventfd_signal() also, adds to the "count" counter and
39 : * issue a wakeup.
40 : */
41 : __u64 count;
42 : unsigned int flags;
43 : int id;
44 : };
45 :
46 0 : __u64 eventfd_signal_mask(struct eventfd_ctx *ctx, __u64 n, unsigned mask)
47 : {
48 : unsigned long flags;
49 :
50 : /*
51 : * Deadlock or stack overflow issues can happen if we recurse here
52 : * through waitqueue wakeup handlers. If the caller users potentially
53 : * nested waitqueues with custom wakeup handlers, then it should
54 : * check eventfd_signal_allowed() before calling this function. If
55 : * it returns false, the eventfd_signal() call should be deferred to a
56 : * safe context.
57 : */
58 0 : if (WARN_ON_ONCE(current->in_eventfd))
59 : return 0;
60 :
61 0 : spin_lock_irqsave(&ctx->wqh.lock, flags);
62 0 : current->in_eventfd = 1;
63 0 : if (ULLONG_MAX - ctx->count < n)
64 0 : n = ULLONG_MAX - ctx->count;
65 0 : ctx->count += n;
66 0 : if (waitqueue_active(&ctx->wqh))
67 0 : wake_up_locked_poll(&ctx->wqh, EPOLLIN | mask);
68 0 : current->in_eventfd = 0;
69 0 : spin_unlock_irqrestore(&ctx->wqh.lock, flags);
70 :
71 0 : return n;
72 : }
73 :
74 : /**
75 : * eventfd_signal - Adds @n to the eventfd counter.
76 : * @ctx: [in] Pointer to the eventfd context.
77 : * @n: [in] Value of the counter to be added to the eventfd internal counter.
78 : * The value cannot be negative.
79 : *
80 : * This function is supposed to be called by the kernel in paths that do not
81 : * allow sleeping. In this function we allow the counter to reach the ULLONG_MAX
82 : * value, and we signal this as overflow condition by returning a EPOLLERR
83 : * to poll(2).
84 : *
85 : * Returns the amount by which the counter was incremented. This will be less
86 : * than @n if the counter has overflowed.
87 : */
88 0 : __u64 eventfd_signal(struct eventfd_ctx *ctx, __u64 n)
89 : {
90 0 : return eventfd_signal_mask(ctx, n, 0);
91 : }
92 : EXPORT_SYMBOL_GPL(eventfd_signal);
93 :
94 0 : static void eventfd_free_ctx(struct eventfd_ctx *ctx)
95 : {
96 0 : if (ctx->id >= 0)
97 0 : ida_simple_remove(&eventfd_ida, ctx->id);
98 0 : kfree(ctx);
99 0 : }
100 :
101 0 : static void eventfd_free(struct kref *kref)
102 : {
103 0 : struct eventfd_ctx *ctx = container_of(kref, struct eventfd_ctx, kref);
104 :
105 0 : eventfd_free_ctx(ctx);
106 0 : }
107 :
108 : /**
109 : * eventfd_ctx_put - Releases a reference to the internal eventfd context.
110 : * @ctx: [in] Pointer to eventfd context.
111 : *
112 : * The eventfd context reference must have been previously acquired either
113 : * with eventfd_ctx_fdget() or eventfd_ctx_fileget().
114 : */
115 0 : void eventfd_ctx_put(struct eventfd_ctx *ctx)
116 : {
117 0 : kref_put(&ctx->kref, eventfd_free);
118 0 : }
119 : EXPORT_SYMBOL_GPL(eventfd_ctx_put);
120 :
121 0 : static int eventfd_release(struct inode *inode, struct file *file)
122 : {
123 0 : struct eventfd_ctx *ctx = file->private_data;
124 :
125 0 : wake_up_poll(&ctx->wqh, EPOLLHUP);
126 0 : eventfd_ctx_put(ctx);
127 0 : return 0;
128 : }
129 :
130 0 : static __poll_t eventfd_poll(struct file *file, poll_table *wait)
131 : {
132 0 : struct eventfd_ctx *ctx = file->private_data;
133 0 : __poll_t events = 0;
134 : u64 count;
135 :
136 0 : poll_wait(file, &ctx->wqh, wait);
137 :
138 : /*
139 : * All writes to ctx->count occur within ctx->wqh.lock. This read
140 : * can be done outside ctx->wqh.lock because we know that poll_wait
141 : * takes that lock (through add_wait_queue) if our caller will sleep.
142 : *
143 : * The read _can_ therefore seep into add_wait_queue's critical
144 : * section, but cannot move above it! add_wait_queue's spin_lock acts
145 : * as an acquire barrier and ensures that the read be ordered properly
146 : * against the writes. The following CAN happen and is safe:
147 : *
148 : * poll write
149 : * ----------------- ------------
150 : * lock ctx->wqh.lock (in poll_wait)
151 : * count = ctx->count
152 : * __add_wait_queue
153 : * unlock ctx->wqh.lock
154 : * lock ctx->qwh.lock
155 : * ctx->count += n
156 : * if (waitqueue_active)
157 : * wake_up_locked_poll
158 : * unlock ctx->qwh.lock
159 : * eventfd_poll returns 0
160 : *
161 : * but the following, which would miss a wakeup, cannot happen:
162 : *
163 : * poll write
164 : * ----------------- ------------
165 : * count = ctx->count (INVALID!)
166 : * lock ctx->qwh.lock
167 : * ctx->count += n
168 : * **waitqueue_active is false**
169 : * **no wake_up_locked_poll!**
170 : * unlock ctx->qwh.lock
171 : * lock ctx->wqh.lock (in poll_wait)
172 : * __add_wait_queue
173 : * unlock ctx->wqh.lock
174 : * eventfd_poll returns 0
175 : */
176 0 : count = READ_ONCE(ctx->count);
177 :
178 0 : if (count > 0)
179 0 : events |= EPOLLIN;
180 0 : if (count == ULLONG_MAX)
181 0 : events |= EPOLLERR;
182 0 : if (ULLONG_MAX - 1 > count)
183 0 : events |= EPOLLOUT;
184 :
185 0 : return events;
186 : }
187 :
188 0 : void eventfd_ctx_do_read(struct eventfd_ctx *ctx, __u64 *cnt)
189 : {
190 : lockdep_assert_held(&ctx->wqh.lock);
191 :
192 0 : *cnt = (ctx->flags & EFD_SEMAPHORE) ? 1 : ctx->count;
193 0 : ctx->count -= *cnt;
194 0 : }
195 : EXPORT_SYMBOL_GPL(eventfd_ctx_do_read);
196 :
197 : /**
198 : * eventfd_ctx_remove_wait_queue - Read the current counter and removes wait queue.
199 : * @ctx: [in] Pointer to eventfd context.
200 : * @wait: [in] Wait queue to be removed.
201 : * @cnt: [out] Pointer to the 64-bit counter value.
202 : *
203 : * Returns %0 if successful, or the following error codes:
204 : *
205 : * -EAGAIN : The operation would have blocked.
206 : *
207 : * This is used to atomically remove a wait queue entry from the eventfd wait
208 : * queue head, and read/reset the counter value.
209 : */
210 0 : int eventfd_ctx_remove_wait_queue(struct eventfd_ctx *ctx, wait_queue_entry_t *wait,
211 : __u64 *cnt)
212 : {
213 : unsigned long flags;
214 :
215 0 : spin_lock_irqsave(&ctx->wqh.lock, flags);
216 0 : eventfd_ctx_do_read(ctx, cnt);
217 0 : __remove_wait_queue(&ctx->wqh, wait);
218 0 : if (*cnt != 0 && waitqueue_active(&ctx->wqh))
219 0 : wake_up_locked_poll(&ctx->wqh, EPOLLOUT);
220 0 : spin_unlock_irqrestore(&ctx->wqh.lock, flags);
221 :
222 0 : return *cnt != 0 ? 0 : -EAGAIN;
223 : }
224 : EXPORT_SYMBOL_GPL(eventfd_ctx_remove_wait_queue);
225 :
226 0 : static ssize_t eventfd_read(struct kiocb *iocb, struct iov_iter *to)
227 : {
228 0 : struct file *file = iocb->ki_filp;
229 0 : struct eventfd_ctx *ctx = file->private_data;
230 0 : __u64 ucnt = 0;
231 0 : DECLARE_WAITQUEUE(wait, current);
232 :
233 0 : if (iov_iter_count(to) < sizeof(ucnt))
234 : return -EINVAL;
235 0 : spin_lock_irq(&ctx->wqh.lock);
236 0 : if (!ctx->count) {
237 0 : if ((file->f_flags & O_NONBLOCK) ||
238 0 : (iocb->ki_flags & IOCB_NOWAIT)) {
239 0 : spin_unlock_irq(&ctx->wqh.lock);
240 0 : return -EAGAIN;
241 : }
242 0 : __add_wait_queue(&ctx->wqh, &wait);
243 : for (;;) {
244 0 : set_current_state(TASK_INTERRUPTIBLE);
245 0 : if (ctx->count)
246 : break;
247 0 : if (signal_pending(current)) {
248 0 : __remove_wait_queue(&ctx->wqh, &wait);
249 0 : __set_current_state(TASK_RUNNING);
250 0 : spin_unlock_irq(&ctx->wqh.lock);
251 0 : return -ERESTARTSYS;
252 : }
253 0 : spin_unlock_irq(&ctx->wqh.lock);
254 0 : schedule();
255 0 : spin_lock_irq(&ctx->wqh.lock);
256 : }
257 0 : __remove_wait_queue(&ctx->wqh, &wait);
258 0 : __set_current_state(TASK_RUNNING);
259 : }
260 0 : eventfd_ctx_do_read(ctx, &ucnt);
261 0 : current->in_eventfd = 1;
262 0 : if (waitqueue_active(&ctx->wqh))
263 0 : wake_up_locked_poll(&ctx->wqh, EPOLLOUT);
264 0 : current->in_eventfd = 0;
265 0 : spin_unlock_irq(&ctx->wqh.lock);
266 0 : if (unlikely(copy_to_iter(&ucnt, sizeof(ucnt), to) != sizeof(ucnt)))
267 : return -EFAULT;
268 :
269 0 : return sizeof(ucnt);
270 : }
271 :
272 0 : static ssize_t eventfd_write(struct file *file, const char __user *buf, size_t count,
273 : loff_t *ppos)
274 : {
275 0 : struct eventfd_ctx *ctx = file->private_data;
276 : ssize_t res;
277 : __u64 ucnt;
278 0 : DECLARE_WAITQUEUE(wait, current);
279 :
280 0 : if (count < sizeof(ucnt))
281 : return -EINVAL;
282 0 : if (copy_from_user(&ucnt, buf, sizeof(ucnt)))
283 : return -EFAULT;
284 0 : if (ucnt == ULLONG_MAX)
285 : return -EINVAL;
286 0 : spin_lock_irq(&ctx->wqh.lock);
287 0 : res = -EAGAIN;
288 0 : if (ULLONG_MAX - ctx->count > ucnt)
289 : res = sizeof(ucnt);
290 0 : else if (!(file->f_flags & O_NONBLOCK)) {
291 0 : __add_wait_queue(&ctx->wqh, &wait);
292 0 : for (res = 0;;) {
293 0 : set_current_state(TASK_INTERRUPTIBLE);
294 0 : if (ULLONG_MAX - ctx->count > ucnt) {
295 : res = sizeof(ucnt);
296 : break;
297 : }
298 0 : if (signal_pending(current)) {
299 : res = -ERESTARTSYS;
300 : break;
301 : }
302 0 : spin_unlock_irq(&ctx->wqh.lock);
303 0 : schedule();
304 0 : spin_lock_irq(&ctx->wqh.lock);
305 : }
306 0 : __remove_wait_queue(&ctx->wqh, &wait);
307 0 : __set_current_state(TASK_RUNNING);
308 : }
309 0 : if (likely(res > 0)) {
310 0 : ctx->count += ucnt;
311 0 : current->in_eventfd = 1;
312 0 : if (waitqueue_active(&ctx->wqh))
313 0 : wake_up_locked_poll(&ctx->wqh, EPOLLIN);
314 0 : current->in_eventfd = 0;
315 : }
316 0 : spin_unlock_irq(&ctx->wqh.lock);
317 :
318 0 : return res;
319 : }
320 :
#ifdef CONFIG_PROC_FS
/*
 * ->show_fdinfo handler: print the counter (sampled under the wait queue
 * lock) and the eventfd id.
 */
static void eventfd_show_fdinfo(struct seq_file *m, struct file *f)
{
	struct eventfd_ctx *ctx = f->private_data;
	unsigned long long snapshot;

	spin_lock_irq(&ctx->wqh.lock);
	snapshot = ctx->count;
	spin_unlock_irq(&ctx->wqh.lock);

	seq_printf(m, "eventfd-count: %16llx\n", snapshot);
	seq_printf(m, "eventfd-id: %d\n", ctx->id);
}
#endif
333 :
334 : static const struct file_operations eventfd_fops = {
335 : #ifdef CONFIG_PROC_FS
336 : .show_fdinfo = eventfd_show_fdinfo,
337 : #endif
338 : .release = eventfd_release,
339 : .poll = eventfd_poll,
340 : .read_iter = eventfd_read,
341 : .write = eventfd_write,
342 : .llseek = noop_llseek,
343 : };
344 :
345 : /**
346 : * eventfd_fget - Acquire a reference of an eventfd file descriptor.
347 : * @fd: [in] Eventfd file descriptor.
348 : *
349 : * Returns a pointer to the eventfd file structure in case of success, or the
350 : * following error pointer:
351 : *
352 : * -EBADF : Invalid @fd file descriptor.
353 : * -EINVAL : The @fd file descriptor is not an eventfd file.
354 : */
355 0 : struct file *eventfd_fget(int fd)
356 : {
357 : struct file *file;
358 :
359 0 : file = fget(fd);
360 0 : if (!file)
361 : return ERR_PTR(-EBADF);
362 0 : if (file->f_op != &eventfd_fops) {
363 0 : fput(file);
364 0 : return ERR_PTR(-EINVAL);
365 : }
366 :
367 : return file;
368 : }
369 : EXPORT_SYMBOL_GPL(eventfd_fget);
370 :
371 : /**
372 : * eventfd_ctx_fdget - Acquires a reference to the internal eventfd context.
373 : * @fd: [in] Eventfd file descriptor.
374 : *
375 : * Returns a pointer to the internal eventfd context, otherwise the error
376 : * pointers returned by the following functions:
377 : *
378 : * eventfd_fget
379 : */
380 0 : struct eventfd_ctx *eventfd_ctx_fdget(int fd)
381 : {
382 : struct eventfd_ctx *ctx;
383 0 : struct fd f = fdget(fd);
384 0 : if (!f.file)
385 : return ERR_PTR(-EBADF);
386 0 : ctx = eventfd_ctx_fileget(f.file);
387 0 : fdput(f);
388 : return ctx;
389 : }
390 : EXPORT_SYMBOL_GPL(eventfd_ctx_fdget);
391 :
392 : /**
393 : * eventfd_ctx_fileget - Acquires a reference to the internal eventfd context.
394 : * @file: [in] Eventfd file pointer.
395 : *
396 : * Returns a pointer to the internal eventfd context, otherwise the error
397 : * pointer:
398 : *
399 : * -EINVAL : The @fd file descriptor is not an eventfd file.
400 : */
401 0 : struct eventfd_ctx *eventfd_ctx_fileget(struct file *file)
402 : {
403 : struct eventfd_ctx *ctx;
404 :
405 0 : if (file->f_op != &eventfd_fops)
406 : return ERR_PTR(-EINVAL);
407 :
408 0 : ctx = file->private_data;
409 0 : kref_get(&ctx->kref);
410 0 : return ctx;
411 : }
412 : EXPORT_SYMBOL_GPL(eventfd_ctx_fileget);
413 :
414 0 : static int do_eventfd(unsigned int count, int flags)
415 : {
416 : struct eventfd_ctx *ctx;
417 : struct file *file;
418 : int fd;
419 :
420 : /* Check the EFD_* constants for consistency. */
421 : BUILD_BUG_ON(EFD_CLOEXEC != O_CLOEXEC);
422 : BUILD_BUG_ON(EFD_NONBLOCK != O_NONBLOCK);
423 :
424 0 : if (flags & ~EFD_FLAGS_SET)
425 : return -EINVAL;
426 :
427 0 : ctx = kmalloc(sizeof(*ctx), GFP_KERNEL);
428 0 : if (!ctx)
429 : return -ENOMEM;
430 :
431 0 : kref_init(&ctx->kref);
432 0 : init_waitqueue_head(&ctx->wqh);
433 0 : ctx->count = count;
434 0 : ctx->flags = flags;
435 0 : ctx->id = ida_simple_get(&eventfd_ida, 0, 0, GFP_KERNEL);
436 :
437 0 : flags &= EFD_SHARED_FCNTL_FLAGS;
438 0 : flags |= O_RDWR;
439 0 : fd = get_unused_fd_flags(flags);
440 0 : if (fd < 0)
441 : goto err;
442 :
443 0 : file = anon_inode_getfile("[eventfd]", &eventfd_fops, ctx, flags);
444 0 : if (IS_ERR(file)) {
445 0 : put_unused_fd(fd);
446 0 : fd = PTR_ERR(file);
447 0 : goto err;
448 : }
449 :
450 0 : file->f_mode |= FMODE_NOWAIT;
451 0 : fd_install(fd, file);
452 0 : return fd;
453 : err:
454 0 : eventfd_free_ctx(ctx);
455 0 : return fd;
456 : }
457 :
458 0 : SYSCALL_DEFINE2(eventfd2, unsigned int, count, int, flags)
459 : {
460 0 : return do_eventfd(count, flags);
461 : }
462 :
463 0 : SYSCALL_DEFINE1(eventfd, unsigned int, count)
464 : {
465 0 : return do_eventfd(count, 0);
466 : }
467 :
|