Line data Source code
1 : // SPDX-License-Identifier: GPL-2.0-only
2 : /*
3 : * fs/eventfd.c
4 : *
5 : * Copyright (C) 2007 Davide Libenzi <davidel@xmailserver.org>
6 : *
7 : */
8 :
9 : #include <linux/file.h>
10 : #include <linux/poll.h>
11 : #include <linux/init.h>
12 : #include <linux/fs.h>
13 : #include <linux/sched/signal.h>
14 : #include <linux/kernel.h>
15 : #include <linux/slab.h>
16 : #include <linux/list.h>
17 : #include <linux/spinlock.h>
18 : #include <linux/anon_inodes.h>
19 : #include <linux/syscalls.h>
20 : #include <linux/export.h>
21 : #include <linux/kref.h>
22 : #include <linux/eventfd.h>
23 : #include <linux/proc_fs.h>
24 : #include <linux/seq_file.h>
25 : #include <linux/idr.h>
26 : #include <linux/uio.h>
27 :
28 : static DEFINE_IDA(eventfd_ida);
29 :
30 : struct eventfd_ctx {
31 : struct kref kref;
32 : wait_queue_head_t wqh;
33 : /*
34 : * Every time that a write(2) is performed on an eventfd, the
35 : * value of the __u64 being written is added to "count" and a
36 : * wakeup is performed on "wqh". A read(2) will return the "count"
37 : * value to userspace, and will reset "count" to zero. The kernel
38 : * side eventfd_signal() also, adds to the "count" counter and
39 : * issue a wakeup.
40 : */
41 : __u64 count;
42 : unsigned int flags;
43 : int id;
44 : };
45 :
46 0 : __u64 eventfd_signal_mask(struct eventfd_ctx *ctx, __u64 n, unsigned mask)
47 : {
48 : unsigned long flags;
49 :
50 : /*
51 : * Deadlock or stack overflow issues can happen if we recurse here
52 : * through waitqueue wakeup handlers. If the caller users potentially
53 : * nested waitqueues with custom wakeup handlers, then it should
54 : * check eventfd_signal_allowed() before calling this function. If
55 : * it returns false, the eventfd_signal() call should be deferred to a
56 : * safe context.
57 : */
58 0 : if (WARN_ON_ONCE(current->in_eventfd))
59 : return 0;
60 :
61 0 : spin_lock_irqsave(&ctx->wqh.lock, flags);
62 0 : current->in_eventfd = 1;
63 0 : if (ULLONG_MAX - ctx->count < n)
64 0 : n = ULLONG_MAX - ctx->count;
65 0 : ctx->count += n;
66 0 : if (waitqueue_active(&ctx->wqh))
67 0 : wake_up_locked_poll(&ctx->wqh, EPOLLIN | mask);
68 0 : current->in_eventfd = 0;
69 0 : spin_unlock_irqrestore(&ctx->wqh.lock, flags);
70 :
71 0 : return n;
72 : }
73 :
74 : /**
75 : * eventfd_signal - Adds @n to the eventfd counter.
76 : * @ctx: [in] Pointer to the eventfd context.
77 : * @n: [in] Value of the counter to be added to the eventfd internal counter.
78 : * The value cannot be negative.
79 : *
80 : * This function is supposed to be called by the kernel in paths that do not
81 : * allow sleeping. In this function we allow the counter to reach the ULLONG_MAX
82 : * value, and we signal this as overflow condition by returning a EPOLLERR
83 : * to poll(2).
84 : *
85 : * Returns the amount by which the counter was incremented. This will be less
86 : * than @n if the counter has overflowed.
87 : */
88 0 : __u64 eventfd_signal(struct eventfd_ctx *ctx, __u64 n)
89 : {
90 0 : return eventfd_signal_mask(ctx, n, 0);
91 : }
92 : EXPORT_SYMBOL_GPL(eventfd_signal);
93 :
94 0 : static void eventfd_free_ctx(struct eventfd_ctx *ctx)
95 : {
96 0 : if (ctx->id >= 0)
97 0 : ida_simple_remove(&eventfd_ida, ctx->id);
98 0 : kfree(ctx);
99 0 : }
100 :
101 0 : static void eventfd_free(struct kref *kref)
102 : {
103 0 : struct eventfd_ctx *ctx = container_of(kref, struct eventfd_ctx, kref);
104 :
105 0 : eventfd_free_ctx(ctx);
106 0 : }
107 :
108 : /**
109 : * eventfd_ctx_put - Releases a reference to the internal eventfd context.
110 : * @ctx: [in] Pointer to eventfd context.
111 : *
112 : * The eventfd context reference must have been previously acquired either
113 : * with eventfd_ctx_fdget() or eventfd_ctx_fileget().
114 : */
115 0 : void eventfd_ctx_put(struct eventfd_ctx *ctx)
116 : {
117 0 : kref_put(&ctx->kref, eventfd_free);
118 0 : }
119 : EXPORT_SYMBOL_GPL(eventfd_ctx_put);
120 :
121 0 : static int eventfd_release(struct inode *inode, struct file *file)
122 : {
123 0 : struct eventfd_ctx *ctx = file->private_data;
124 :
125 0 : wake_up_poll(&ctx->wqh, EPOLLHUP);
126 0 : eventfd_ctx_put(ctx);
127 0 : return 0;
128 : }
129 :
130 0 : static __poll_t eventfd_poll(struct file *file, poll_table *wait)
131 : {
132 0 : struct eventfd_ctx *ctx = file->private_data;
133 0 : __poll_t events = 0;
134 : u64 count;
135 :
136 0 : poll_wait(file, &ctx->wqh, wait);
137 :
138 : /*
139 : * All writes to ctx->count occur within ctx->wqh.lock. This read
140 : * can be done outside ctx->wqh.lock because we know that poll_wait
141 : * takes that lock (through add_wait_queue) if our caller will sleep.
142 : *
143 : * The read _can_ therefore seep into add_wait_queue's critical
144 : * section, but cannot move above it! add_wait_queue's spin_lock acts
145 : * as an acquire barrier and ensures that the read be ordered properly
146 : * against the writes. The following CAN happen and is safe:
147 : *
148 : * poll write
149 : * ----------------- ------------
150 : * lock ctx->wqh.lock (in poll_wait)
151 : * count = ctx->count
152 : * __add_wait_queue
153 : * unlock ctx->wqh.lock
154 : * lock ctx->qwh.lock
155 : * ctx->count += n
156 : * if (waitqueue_active)
157 : * wake_up_locked_poll
158 : * unlock ctx->qwh.lock
159 : * eventfd_poll returns 0
160 : *
161 : * but the following, which would miss a wakeup, cannot happen:
162 : *
163 : * poll write
164 : * ----------------- ------------
165 : * count = ctx->count (INVALID!)
166 : * lock ctx->qwh.lock
167 : * ctx->count += n
168 : * **waitqueue_active is false**
169 : * **no wake_up_locked_poll!**
170 : * unlock ctx->qwh.lock
171 : * lock ctx->wqh.lock (in poll_wait)
172 : * __add_wait_queue
173 : * unlock ctx->wqh.lock
174 : * eventfd_poll returns 0
175 : */
176 0 : count = READ_ONCE(ctx->count);
177 :
178 0 : if (count > 0)
179 0 : events |= EPOLLIN;
180 0 : if (count == ULLONG_MAX)
181 0 : events |= EPOLLERR;
182 0 : if (ULLONG_MAX - 1 > count)
183 0 : events |= EPOLLOUT;
184 :
185 0 : return events;
186 : }
187 :
188 0 : void eventfd_ctx_do_read(struct eventfd_ctx *ctx, __u64 *cnt)
189 : {
190 : lockdep_assert_held(&ctx->wqh.lock);
191 :
192 0 : *cnt = (ctx->flags & EFD_SEMAPHORE) ? 1 : ctx->count;
193 0 : ctx->count -= *cnt;
194 0 : }
195 : EXPORT_SYMBOL_GPL(eventfd_ctx_do_read);
196 :
197 : /**
198 : * eventfd_ctx_remove_wait_queue - Read the current counter and removes wait queue.
199 : * @ctx: [in] Pointer to eventfd context.
200 : * @wait: [in] Wait queue to be removed.
201 : * @cnt: [out] Pointer to the 64-bit counter value.
202 : *
203 : * Returns %0 if successful, or the following error codes:
204 : *
205 : * -EAGAIN : The operation would have blocked.
206 : *
207 : * This is used to atomically remove a wait queue entry from the eventfd wait
208 : * queue head, and read/reset the counter value.
209 : */
210 0 : int eventfd_ctx_remove_wait_queue(struct eventfd_ctx *ctx, wait_queue_entry_t *wait,
211 : __u64 *cnt)
212 : {
213 : unsigned long flags;
214 :
215 0 : spin_lock_irqsave(&ctx->wqh.lock, flags);
216 0 : eventfd_ctx_do_read(ctx, cnt);
217 0 : __remove_wait_queue(&ctx->wqh, wait);
218 0 : if (*cnt != 0 && waitqueue_active(&ctx->wqh))
219 0 : wake_up_locked_poll(&ctx->wqh, EPOLLOUT);
220 0 : spin_unlock_irqrestore(&ctx->wqh.lock, flags);
221 :
222 0 : return *cnt != 0 ? 0 : -EAGAIN;
223 : }
224 : EXPORT_SYMBOL_GPL(eventfd_ctx_remove_wait_queue);
225 :
226 0 : static ssize_t eventfd_read(struct kiocb *iocb, struct iov_iter *to)
227 : {
228 0 : struct file *file = iocb->ki_filp;
229 0 : struct eventfd_ctx *ctx = file->private_data;
230 0 : __u64 ucnt = 0;
231 :
232 0 : if (iov_iter_count(to) < sizeof(ucnt))
233 : return -EINVAL;
234 0 : spin_lock_irq(&ctx->wqh.lock);
235 0 : if (!ctx->count) {
236 0 : if ((file->f_flags & O_NONBLOCK) ||
237 0 : (iocb->ki_flags & IOCB_NOWAIT)) {
238 0 : spin_unlock_irq(&ctx->wqh.lock);
239 0 : return -EAGAIN;
240 : }
241 :
242 0 : if (wait_event_interruptible_locked_irq(ctx->wqh, ctx->count)) {
243 0 : spin_unlock_irq(&ctx->wqh.lock);
244 0 : return -ERESTARTSYS;
245 : }
246 : }
247 0 : eventfd_ctx_do_read(ctx, &ucnt);
248 0 : current->in_eventfd = 1;
249 0 : if (waitqueue_active(&ctx->wqh))
250 0 : wake_up_locked_poll(&ctx->wqh, EPOLLOUT);
251 0 : current->in_eventfd = 0;
252 0 : spin_unlock_irq(&ctx->wqh.lock);
253 0 : if (unlikely(copy_to_iter(&ucnt, sizeof(ucnt), to) != sizeof(ucnt)))
254 : return -EFAULT;
255 :
256 0 : return sizeof(ucnt);
257 : }
258 :
259 0 : static ssize_t eventfd_write(struct file *file, const char __user *buf, size_t count,
260 : loff_t *ppos)
261 : {
262 0 : struct eventfd_ctx *ctx = file->private_data;
263 : ssize_t res;
264 : __u64 ucnt;
265 :
266 0 : if (count < sizeof(ucnt))
267 : return -EINVAL;
268 0 : if (copy_from_user(&ucnt, buf, sizeof(ucnt)))
269 : return -EFAULT;
270 0 : if (ucnt == ULLONG_MAX)
271 : return -EINVAL;
272 0 : spin_lock_irq(&ctx->wqh.lock);
273 0 : res = -EAGAIN;
274 0 : if (ULLONG_MAX - ctx->count > ucnt)
275 : res = sizeof(ucnt);
276 0 : else if (!(file->f_flags & O_NONBLOCK)) {
277 0 : res = wait_event_interruptible_locked_irq(ctx->wqh,
278 : ULLONG_MAX - ctx->count > ucnt);
279 0 : if (!res)
280 0 : res = sizeof(ucnt);
281 : }
282 0 : if (likely(res > 0)) {
283 0 : ctx->count += ucnt;
284 0 : current->in_eventfd = 1;
285 0 : if (waitqueue_active(&ctx->wqh))
286 0 : wake_up_locked_poll(&ctx->wqh, EPOLLIN);
287 0 : current->in_eventfd = 0;
288 : }
289 0 : spin_unlock_irq(&ctx->wqh.lock);
290 :
291 0 : return res;
292 : }
293 :
#ifdef CONFIG_PROC_FS
/*
 * ->show_fdinfo() handler: emits the current counter (hex) and the
 * context id for this eventfd.
 */
static void eventfd_show_fdinfo(struct seq_file *m, struct file *f)
{
	struct eventfd_ctx *efd = f->private_data;

	/* The counter is only stable while wqh.lock is held. */
	spin_lock_irq(&efd->wqh.lock);
	seq_printf(m, "eventfd-count: %16llx\n",
		   (unsigned long long)efd->count);
	spin_unlock_irq(&efd->wqh.lock);

	seq_printf(m, "eventfd-id: %d\n", efd->id);
}
#endif
306 :
307 : static const struct file_operations eventfd_fops = {
308 : #ifdef CONFIG_PROC_FS
309 : .show_fdinfo = eventfd_show_fdinfo,
310 : #endif
311 : .release = eventfd_release,
312 : .poll = eventfd_poll,
313 : .read_iter = eventfd_read,
314 : .write = eventfd_write,
315 : .llseek = noop_llseek,
316 : };
317 :
318 : /**
319 : * eventfd_fget - Acquire a reference of an eventfd file descriptor.
320 : * @fd: [in] Eventfd file descriptor.
321 : *
322 : * Returns a pointer to the eventfd file structure in case of success, or the
323 : * following error pointer:
324 : *
325 : * -EBADF : Invalid @fd file descriptor.
326 : * -EINVAL : The @fd file descriptor is not an eventfd file.
327 : */
328 0 : struct file *eventfd_fget(int fd)
329 : {
330 : struct file *file;
331 :
332 0 : file = fget(fd);
333 0 : if (!file)
334 : return ERR_PTR(-EBADF);
335 0 : if (file->f_op != &eventfd_fops) {
336 0 : fput(file);
337 0 : return ERR_PTR(-EINVAL);
338 : }
339 :
340 : return file;
341 : }
342 : EXPORT_SYMBOL_GPL(eventfd_fget);
343 :
344 : /**
345 : * eventfd_ctx_fdget - Acquires a reference to the internal eventfd context.
346 : * @fd: [in] Eventfd file descriptor.
347 : *
348 : * Returns a pointer to the internal eventfd context, otherwise the error
349 : * pointers returned by the following functions:
350 : *
351 : * eventfd_fget
352 : */
353 0 : struct eventfd_ctx *eventfd_ctx_fdget(int fd)
354 : {
355 : struct eventfd_ctx *ctx;
356 0 : struct fd f = fdget(fd);
357 0 : if (!f.file)
358 : return ERR_PTR(-EBADF);
359 0 : ctx = eventfd_ctx_fileget(f.file);
360 0 : fdput(f);
361 : return ctx;
362 : }
363 : EXPORT_SYMBOL_GPL(eventfd_ctx_fdget);
364 :
365 : /**
366 : * eventfd_ctx_fileget - Acquires a reference to the internal eventfd context.
367 : * @file: [in] Eventfd file pointer.
368 : *
369 : * Returns a pointer to the internal eventfd context, otherwise the error
370 : * pointer:
371 : *
372 : * -EINVAL : The @fd file descriptor is not an eventfd file.
373 : */
374 0 : struct eventfd_ctx *eventfd_ctx_fileget(struct file *file)
375 : {
376 : struct eventfd_ctx *ctx;
377 :
378 0 : if (file->f_op != &eventfd_fops)
379 : return ERR_PTR(-EINVAL);
380 :
381 0 : ctx = file->private_data;
382 0 : kref_get(&ctx->kref);
383 0 : return ctx;
384 : }
385 : EXPORT_SYMBOL_GPL(eventfd_ctx_fileget);
386 :
387 0 : static int do_eventfd(unsigned int count, int flags)
388 : {
389 : struct eventfd_ctx *ctx;
390 : struct file *file;
391 : int fd;
392 :
393 : /* Check the EFD_* constants for consistency. */
394 : BUILD_BUG_ON(EFD_CLOEXEC != O_CLOEXEC);
395 : BUILD_BUG_ON(EFD_NONBLOCK != O_NONBLOCK);
396 :
397 0 : if (flags & ~EFD_FLAGS_SET)
398 : return -EINVAL;
399 :
400 0 : ctx = kmalloc(sizeof(*ctx), GFP_KERNEL);
401 0 : if (!ctx)
402 : return -ENOMEM;
403 :
404 0 : kref_init(&ctx->kref);
405 0 : init_waitqueue_head(&ctx->wqh);
406 0 : ctx->count = count;
407 0 : ctx->flags = flags;
408 0 : ctx->id = ida_simple_get(&eventfd_ida, 0, 0, GFP_KERNEL);
409 :
410 0 : flags &= EFD_SHARED_FCNTL_FLAGS;
411 0 : flags |= O_RDWR;
412 0 : fd = get_unused_fd_flags(flags);
413 0 : if (fd < 0)
414 : goto err;
415 :
416 0 : file = anon_inode_getfile("[eventfd]", &eventfd_fops, ctx, flags);
417 0 : if (IS_ERR(file)) {
418 0 : put_unused_fd(fd);
419 0 : fd = PTR_ERR(file);
420 0 : goto err;
421 : }
422 :
423 0 : file->f_mode |= FMODE_NOWAIT;
424 0 : fd_install(fd, file);
425 0 : return fd;
426 : err:
427 0 : eventfd_free_ctx(ctx);
428 0 : return fd;
429 : }
430 :
431 0 : SYSCALL_DEFINE2(eventfd2, unsigned int, count, int, flags)
432 : {
433 0 : return do_eventfd(count, flags);
434 : }
435 :
436 0 : SYSCALL_DEFINE1(eventfd, unsigned int, count)
437 : {
438 0 : return do_eventfd(count, 0);
439 : }
440 :
|