// SPDX-License-Identifier: GPL-2.0-only
/*
 * fs/eventfd.c
 *
 * Copyright (C) 2007 Davide Libenzi <davidel@xmailserver.org>
 *
 */

#include <linux/file.h>
#include <linux/poll.h>
#include <linux/init.h>
#include <linux/fs.h>
#include <linux/sched/signal.h>
#include <linux/kernel.h>
#include <linux/slab.h>
#include <linux/list.h>
#include <linux/spinlock.h>
#include <linux/anon_inodes.h>
#include <linux/syscalls.h>
#include <linux/export.h>
#include <linux/kref.h>
#include <linux/eventfd.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/idr.h>
#include <linux/uio.h>

static DEFINE_IDA(eventfd_ida);

struct eventfd_ctx {
	struct kref kref;
	wait_queue_head_t wqh;
	/*
	 * Every time that a write(2) is performed on an eventfd, the
	 * value of the __u64 being written is added to "count" and a
	 * wakeup is performed on "wqh". If the EFD_SEMAPHORE flag was
	 * not specified, a read(2) will return the "count" value to
	 * userspace and reset "count" to zero. The kernel-side
	 * eventfd_signal() also adds to the "count" counter and issues
	 * a wakeup.
	 */
	__u64 count;
	unsigned int flags;
	int id;
};

__u64 eventfd_signal_mask(struct eventfd_ctx *ctx, __u64 n, __poll_t mask)
{
	unsigned long flags;

	/*
	 * Deadlock or stack overflow issues can happen if we recurse here
	 * through waitqueue wakeup handlers. If the caller uses potentially
	 * nested waitqueues with custom wakeup handlers, then it should
	 * check eventfd_signal_allowed() before calling this function. If
	 * it returns false, the eventfd_signal() call should be deferred to a
	 * safe context.
	 */
	if (WARN_ON_ONCE(current->in_eventfd))
		return 0;

	spin_lock_irqsave(&ctx->wqh.lock, flags);
	current->in_eventfd = 1;
	if (ULLONG_MAX - ctx->count < n)
		n = ULLONG_MAX - ctx->count;
	ctx->count += n;
	if (waitqueue_active(&ctx->wqh))
		wake_up_locked_poll(&ctx->wqh, EPOLLIN | mask);
	current->in_eventfd = 0;
	spin_unlock_irqrestore(&ctx->wqh.lock, flags);

	return n;
}

/**
 * eventfd_signal - Adds @n to the eventfd counter.
 * @ctx: [in] Pointer to the eventfd context.
 * @n: [in] Value to be added to the eventfd internal counter.
 *     The value cannot be negative.
 *
 * This function is supposed to be called by the kernel in paths that do not
 * allow sleeping. In this function we allow the counter to reach the ULLONG_MAX
 * value, and we signal this as an overflow condition by returning an EPOLLERR
 * to poll(2).
 *
 * Returns the amount by which the counter was incremented. This will be less
 * than @n if the counter has overflowed.
 */
__u64 eventfd_signal(struct eventfd_ctx *ctx, __u64 n)
{
	return eventfd_signal_mask(ctx, n, 0);
}
EXPORT_SYMBOL_GPL(eventfd_signal);
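
/*
 * A minimal sketch of the deferral pattern described above
 * eventfd_signal_mask(): when eventfd_signal_allowed() reports that
 * signalling from the current context could recurse through nested
 * waitqueue wakeup handlers, punt the eventfd_signal() call to a
 * workqueue. The "demo_" names are hypothetical, not part of this
 * file; assumes <linux/workqueue.h>.
 */
struct demo_notifier {
	struct eventfd_ctx *ctx;
	struct work_struct work;
};

static void demo_signal_work(struct work_struct *work)
{
	struct demo_notifier *n = container_of(work, struct demo_notifier, work);

	/* Workqueue (process) context: signalling is always safe here. */
	eventfd_signal(n->ctx, 1);
}

static void demo_notifier_init(struct demo_notifier *n, struct eventfd_ctx *ctx)
{
	n->ctx = ctx;
	INIT_WORK(&n->work, demo_signal_work);
}

static void demo_notify(struct demo_notifier *n)
{
	if (eventfd_signal_allowed())
		eventfd_signal(n->ctx, 1);
	else
		schedule_work(&n->work);	/* defer to a safe context */
}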

static void eventfd_free_ctx(struct eventfd_ctx *ctx)
{
	if (ctx->id >= 0)
		ida_simple_remove(&eventfd_ida, ctx->id);
	kfree(ctx);
}

static void eventfd_free(struct kref *kref)
{
	struct eventfd_ctx *ctx = container_of(kref, struct eventfd_ctx, kref);

	eventfd_free_ctx(ctx);
}

/**
 * eventfd_ctx_put - Releases a reference to the internal eventfd context.
 * @ctx: [in] Pointer to eventfd context.
 *
 * The eventfd context reference must have been previously acquired either
 * with eventfd_ctx_fdget() or eventfd_ctx_fileget().
 */
void eventfd_ctx_put(struct eventfd_ctx *ctx)
{
	kref_put(&ctx->kref, eventfd_free);
}
EXPORT_SYMBOL_GPL(eventfd_ctx_put);

static int eventfd_release(struct inode *inode, struct file *file)
{
	struct eventfd_ctx *ctx = file->private_data;

	wake_up_poll(&ctx->wqh, EPOLLHUP);
	eventfd_ctx_put(ctx);
	return 0;
}

static __poll_t eventfd_poll(struct file *file, poll_table *wait)
{
	struct eventfd_ctx *ctx = file->private_data;
	__poll_t events = 0;
	u64 count;

	poll_wait(file, &ctx->wqh, wait);

	/*
	 * All writes to ctx->count occur within ctx->wqh.lock. This read
	 * can be done outside ctx->wqh.lock because we know that poll_wait
	 * takes that lock (through add_wait_queue) if our caller will sleep.
	 *
	 * The read _can_ therefore seep into add_wait_queue's critical
	 * section, but cannot move above it! add_wait_queue's spin_lock acts
	 * as an acquire barrier and ensures that the read is ordered properly
	 * against the writes. The following CAN happen and is safe:
	 *
	 *     poll                               write
	 *     -----------------                  ------------
	 *     lock ctx->wqh.lock (in poll_wait)
	 *     count = ctx->count
	 *     __add_wait_queue
	 *     unlock ctx->wqh.lock
	 *                                        lock ctx->wqh.lock
	 *                                        ctx->count += n
	 *                                        if (waitqueue_active)
	 *                                          wake_up_locked_poll
	 *                                        unlock ctx->wqh.lock
	 *     eventfd_poll returns 0
	 *
	 * but the following, which would miss a wakeup, cannot happen:
	 *
	 *     poll                               write
	 *     -----------------                  ------------
	 *     count = ctx->count (INVALID!)
	 *                                        lock ctx->wqh.lock
	 *                                        ctx->count += n
	 *                                        **waitqueue_active is false**
	 *                                        **no wake_up_locked_poll!**
	 *                                        unlock ctx->wqh.lock
	 *     lock ctx->wqh.lock (in poll_wait)
	 *     __add_wait_queue
	 *     unlock ctx->wqh.lock
	 *     eventfd_poll returns 0
	 */
	count = READ_ONCE(ctx->count);

	if (count > 0)
		events |= EPOLLIN;
	if (count == ULLONG_MAX)
		events |= EPOLLERR;
	if (ULLONG_MAX - 1 > count)
		events |= EPOLLOUT;

	return events;
}

void eventfd_ctx_do_read(struct eventfd_ctx *ctx, __u64 *cnt)
{
	lockdep_assert_held(&ctx->wqh.lock);

	*cnt = (ctx->flags & EFD_SEMAPHORE) ? 1 : ctx->count;
	ctx->count -= *cnt;
}
EXPORT_SYMBOL_GPL(eventfd_ctx_do_read);
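
/*
 * A minimal sketch of the locking contract above: callers of
 * eventfd_ctx_do_read() hold ctx->wqh.lock themselves, consume the
 * counter (1 if EFD_SEMAPHORE, the whole value otherwise), then wake
 * any blocked writers. "demo_consume" is hypothetical, not part of
 * this file; it mirrors what eventfd_read() does below.
 */
static __u64 demo_consume(struct eventfd_ctx *ctx)
{
	__u64 cnt;

	spin_lock_irq(&ctx->wqh.lock);
	eventfd_ctx_do_read(ctx, &cnt);
	if (waitqueue_active(&ctx->wqh))
		wake_up_locked_poll(&ctx->wqh, EPOLLOUT);	/* writers may proceed */
	spin_unlock_irq(&ctx->wqh.lock);

	return cnt;
}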

/**
 * eventfd_ctx_remove_wait_queue - Reads the current counter and removes the wait queue entry.
 * @ctx: [in] Pointer to eventfd context.
 * @wait: [in] Wait queue entry to be removed.
 * @cnt: [out] Pointer to the 64-bit counter value.
 *
 * Returns %0 if successful, or the following error code:
 *
 * -EAGAIN : The operation would have blocked.
 *
 * This is used to atomically remove a wait queue entry from the eventfd wait
 * queue head, and read/reset the counter value.
 */
int eventfd_ctx_remove_wait_queue(struct eventfd_ctx *ctx, wait_queue_entry_t *wait,
				  __u64 *cnt)
{
	unsigned long flags;

	spin_lock_irqsave(&ctx->wqh.lock, flags);
	eventfd_ctx_do_read(ctx, cnt);
	__remove_wait_queue(&ctx->wqh, wait);
	if (*cnt != 0 && waitqueue_active(&ctx->wqh))
		wake_up_locked_poll(&ctx->wqh, EPOLLOUT);
	spin_unlock_irqrestore(&ctx->wqh.lock, flags);

	return *cnt != 0 ? 0 : -EAGAIN;
}
EXPORT_SYMBOL_GPL(eventfd_ctx_remove_wait_queue);
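
/*
 * A hypothetical teardown sketch for the API above: a consumer that
 * earlier hooked a custom wait_queue_entry_t into the eventfd's
 * waitqueue (e.g. via vfs_poll() on the eventfd file) detaches it and
 * atomically drains whatever count accumulated in the meantime.
 * "demo_detach" is illustrative only, not part of this file.
 */
static void demo_detach(struct eventfd_ctx *ctx, wait_queue_entry_t *wait)
{
	__u64 cnt;

	/* -EAGAIN here just means the counter was zero at removal. */
	if (eventfd_ctx_remove_wait_queue(ctx, wait, &cnt) == 0)
		pr_debug("drained %llu pending events\n", (unsigned long long)cnt);
}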

static ssize_t eventfd_read(struct kiocb *iocb, struct iov_iter *to)
{
	struct file *file = iocb->ki_filp;
	struct eventfd_ctx *ctx = file->private_data;
	__u64 ucnt = 0;

	if (iov_iter_count(to) < sizeof(ucnt))
		return -EINVAL;
	spin_lock_irq(&ctx->wqh.lock);
	if (!ctx->count) {
		if ((file->f_flags & O_NONBLOCK) ||
		    (iocb->ki_flags & IOCB_NOWAIT)) {
			spin_unlock_irq(&ctx->wqh.lock);
			return -EAGAIN;
		}

		if (wait_event_interruptible_locked_irq(ctx->wqh, ctx->count)) {
			spin_unlock_irq(&ctx->wqh.lock);
			return -ERESTARTSYS;
		}
	}
	eventfd_ctx_do_read(ctx, &ucnt);
	current->in_eventfd = 1;
	if (waitqueue_active(&ctx->wqh))
		wake_up_locked_poll(&ctx->wqh, EPOLLOUT);
	current->in_eventfd = 0;
	spin_unlock_irq(&ctx->wqh.lock);
	if (unlikely(copy_to_iter(&ucnt, sizeof(ucnt), to) != sizeof(ucnt)))
		return -EFAULT;

	return sizeof(ucnt);
}

static ssize_t eventfd_write(struct file *file, const char __user *buf, size_t count,
			     loff_t *ppos)
{
	struct eventfd_ctx *ctx = file->private_data;
	ssize_t res;
	__u64 ucnt;

	if (count < sizeof(ucnt))
		return -EINVAL;
	if (copy_from_user(&ucnt, buf, sizeof(ucnt)))
		return -EFAULT;
	if (ucnt == ULLONG_MAX)
		return -EINVAL;
	spin_lock_irq(&ctx->wqh.lock);
	res = -EAGAIN;
	if (ULLONG_MAX - ctx->count > ucnt)
		res = sizeof(ucnt);
	else if (!(file->f_flags & O_NONBLOCK)) {
		res = wait_event_interruptible_locked_irq(ctx->wqh,
				ULLONG_MAX - ctx->count > ucnt);
		if (!res)
			res = sizeof(ucnt);
	}
	if (likely(res > 0)) {
		ctx->count += ucnt;
		current->in_eventfd = 1;
		if (waitqueue_active(&ctx->wqh))
			wake_up_locked_poll(&ctx->wqh, EPOLLIN);
		current->in_eventfd = 0;
	}
	spin_unlock_irq(&ctx->wqh.lock);

	return res;
}

#ifdef CONFIG_PROC_FS
static void eventfd_show_fdinfo(struct seq_file *m, struct file *f)
{
	struct eventfd_ctx *ctx = f->private_data;

	spin_lock_irq(&ctx->wqh.lock);
	seq_printf(m, "eventfd-count: %16llx\n",
		   (unsigned long long)ctx->count);
	spin_unlock_irq(&ctx->wqh.lock);
	seq_printf(m, "eventfd-id: %d\n", ctx->id);
	seq_printf(m, "eventfd-semaphore: %d\n",
		   !!(ctx->flags & EFD_SEMAPHORE));
}
#endif

static const struct file_operations eventfd_fops = {
#ifdef CONFIG_PROC_FS
	.show_fdinfo = eventfd_show_fdinfo,
#endif
	.release = eventfd_release,
	.poll = eventfd_poll,
	.read_iter = eventfd_read,
	.write = eventfd_write,
	.llseek = noop_llseek,
};

/**
 * eventfd_fget - Acquire a reference to an eventfd file descriptor.
 * @fd: [in] Eventfd file descriptor.
 *
 * Returns a pointer to the eventfd file structure in case of success, or one
 * of the following error pointers:
 *
 * -EBADF  : Invalid @fd file descriptor.
 * -EINVAL : The @fd file descriptor is not an eventfd file.
 */
struct file *eventfd_fget(int fd)
{
	struct file *file;

	file = fget(fd);
	if (!file)
		return ERR_PTR(-EBADF);
	if (file->f_op != &eventfd_fops) {
		fput(file);
		return ERR_PTR(-EINVAL);
	}

	return file;
}
EXPORT_SYMBOL_GPL(eventfd_fget);

/**
 * eventfd_ctx_fdget - Acquires a reference to the internal eventfd context.
 * @fd: [in] Eventfd file descriptor.
 *
 * Returns a pointer to the internal eventfd context, otherwise the error
 * pointers returned by the following functions:
 *
 * eventfd_fget
 */
struct eventfd_ctx *eventfd_ctx_fdget(int fd)
{
	struct eventfd_ctx *ctx;
	struct fd f = fdget(fd);

	if (!f.file)
		return ERR_PTR(-EBADF);
	ctx = eventfd_ctx_fileget(f.file);
	fdput(f);
	return ctx;
}
EXPORT_SYMBOL_GPL(eventfd_ctx_fdget);
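
/*
 * A minimal sketch of the usual in-kernel consumer pattern for the
 * helpers above: translate a userspace-supplied fd into a long-lived
 * context reference at setup time, signal it later, and drop the
 * reference on teardown. The "demo_" names are hypothetical, not part
 * of this file.
 */
static struct eventfd_ctx *demo_ctx;

static int demo_setup(int fd)
{
	struct eventfd_ctx *ctx = eventfd_ctx_fdget(fd);

	if (IS_ERR(ctx))
		return PTR_ERR(ctx);	/* -EBADF or -EINVAL */
	demo_ctx = ctx;
	return 0;
}

static void demo_teardown(void)
{
	eventfd_signal(demo_ctx, 1);	/* final kick before dropping the ref */
	eventfd_ctx_put(demo_ctx);
	demo_ctx = NULL;
}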

/**
 * eventfd_ctx_fileget - Acquires a reference to the internal eventfd context.
 * @file: [in] Eventfd file pointer.
 *
 * Returns a pointer to the internal eventfd context, otherwise the error
 * pointer:
 *
 * -EINVAL : The @file is not an eventfd file.
 */
struct eventfd_ctx *eventfd_ctx_fileget(struct file *file)
{
	struct eventfd_ctx *ctx;

	if (file->f_op != &eventfd_fops)
		return ERR_PTR(-EINVAL);

	ctx = file->private_data;
	kref_get(&ctx->kref);
	return ctx;
}
EXPORT_SYMBOL_GPL(eventfd_ctx_fileget);

static int do_eventfd(unsigned int count, int flags)
{
	struct eventfd_ctx *ctx;
	struct file *file;
	int fd;

	/* Check the EFD_* constants for consistency. */
	BUILD_BUG_ON(EFD_CLOEXEC != O_CLOEXEC);
	BUILD_BUG_ON(EFD_NONBLOCK != O_NONBLOCK);

	if (flags & ~EFD_FLAGS_SET)
		return -EINVAL;

	ctx = kmalloc(sizeof(*ctx), GFP_KERNEL);
	if (!ctx)
		return -ENOMEM;

	kref_init(&ctx->kref);
	init_waitqueue_head(&ctx->wqh);
	ctx->count = count;
	ctx->flags = flags;
	ctx->id = ida_simple_get(&eventfd_ida, 0, 0, GFP_KERNEL);

	flags &= EFD_SHARED_FCNTL_FLAGS;
	flags |= O_RDWR;
	fd = get_unused_fd_flags(flags);
	if (fd < 0)
		goto err;

	file = anon_inode_getfile("[eventfd]", &eventfd_fops, ctx, flags);
	if (IS_ERR(file)) {
		put_unused_fd(fd);
		fd = PTR_ERR(file);
		goto err;
	}

	file->f_mode |= FMODE_NOWAIT;
	fd_install(fd, file);
	return fd;
err:
	eventfd_free_ctx(ctx);
	return fd;
}

SYSCALL_DEFINE2(eventfd2, unsigned int, count, int, flags)
{
	return do_eventfd(count, flags);
}

SYSCALL_DEFINE1(eventfd, unsigned int, count)
{
	return do_eventfd(count, 0);
}
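
/*
 * A minimal userspace sketch (not part of this file) of the semantics
 * implemented above, via the glibc eventfd(2) wrapper: writes add to
 * the counter; without EFD_SEMAPHORE a read returns the whole counter
 * and resets it to zero, while with EFD_SEMAPHORE each read returns 1
 * and decrements by 1. Build this as a normal user program, not as
 * kernel code.
 *
 *	#include <sys/eventfd.h>
 *	#include <unistd.h>
 *	#include <stdint.h>
 *	#include <stdio.h>
 *
 *	int main(void)
 *	{
 *		uint64_t v;
 *		int efd = eventfd(0, EFD_SEMAPHORE);
 *
 *		if (efd < 0)
 *			return 1;
 *		v = 3;
 *		write(efd, &v, sizeof(v));	// count = 3
 *		read(efd, &v, sizeof(v));	// v == 1, count drops to 2
 *		printf("got %llu\n", (unsigned long long)v);
 *		close(efd);
 *		return 0;
 *	}
 */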