Line data Source code
1 : // SPDX-License-Identifier: GPL-2.0-only
2 : /*
3 : * linux/fs/buffer.c
4 : *
5 : * Copyright (C) 1991, 1992, 2002 Linus Torvalds
6 : */
7 :
8 : /*
9 : * Start bdflush() with kernel_thread not syscall - Paul Gortmaker, 12/95
10 : *
11 : * Removed a lot of unnecessary code and simplified things now that
12 : * the buffer cache isn't our primary cache - Andrew Tridgell 12/96
13 : *
14 : * Speed up hash, lru, and free list operations. Use gfp() for allocating
15 : * hash table, use SLAB cache for buffer heads. SMP threading. -DaveM
16 : *
17 : * Added 32k buffer block sizes - these are required for older ARM systems. - RMK
18 : *
19 : * async buffer flushing, 1999 Andrea Arcangeli <andrea@suse.de>
20 : */
21 :
22 : #include <linux/kernel.h>
23 : #include <linux/sched/signal.h>
24 : #include <linux/syscalls.h>
25 : #include <linux/fs.h>
26 : #include <linux/iomap.h>
27 : #include <linux/mm.h>
28 : #include <linux/percpu.h>
29 : #include <linux/slab.h>
30 : #include <linux/capability.h>
31 : #include <linux/blkdev.h>
32 : #include <linux/file.h>
33 : #include <linux/quotaops.h>
34 : #include <linux/highmem.h>
35 : #include <linux/export.h>
36 : #include <linux/backing-dev.h>
37 : #include <linux/writeback.h>
38 : #include <linux/hash.h>
39 : #include <linux/suspend.h>
40 : #include <linux/buffer_head.h>
41 : #include <linux/task_io_accounting_ops.h>
42 : #include <linux/bio.h>
43 : #include <linux/cpu.h>
44 : #include <linux/bitops.h>
45 : #include <linux/mpage.h>
46 : #include <linux/bit_spinlock.h>
47 : #include <linux/pagevec.h>
48 : #include <linux/sched/mm.h>
49 : #include <trace/events/block.h>
50 : #include <linux/fscrypt.h>
51 : #include <linux/fsverity.h>
52 :
53 : #include "internal.h"
54 :
55 : static int fsync_buffers_list(spinlock_t *lock, struct list_head *list);
56 : static void submit_bh_wbc(blk_opf_t opf, struct buffer_head *bh,
57 : struct writeback_control *wbc);
58 :
59 : #define BH_ENTRY(list) list_entry((list), struct buffer_head, b_assoc_buffers)
60 :
61 0 : inline void touch_buffer(struct buffer_head *bh)
62 : {
63 0 : trace_block_touch_buffer(bh);
64 0 : folio_mark_accessed(bh->b_folio);
65 0 : }
66 : EXPORT_SYMBOL(touch_buffer);
67 :
68 0 : void __lock_buffer(struct buffer_head *bh)
69 : {
70 0 : wait_on_bit_lock_io(&bh->b_state, BH_Lock, TASK_UNINTERRUPTIBLE);
71 0 : }
72 : EXPORT_SYMBOL(__lock_buffer);
73 :
74 0 : void unlock_buffer(struct buffer_head *bh)
75 : {
76 0 : clear_bit_unlock(BH_Lock, &bh->b_state);
77 0 : smp_mb__after_atomic();
78 0 : wake_up_bit(&bh->b_state, BH_Lock);
79 0 : }
80 : EXPORT_SYMBOL(unlock_buffer);
81 :
82 : /*
83 : * Returns whether the folio has dirty or writeback buffers. If all the buffers
84 : * are unlocked and clean then the folio_test_dirty information is stale. If
85 : * any of the buffers are locked, it is assumed they are locked for IO.
86 : */
87 0 : void buffer_check_dirty_writeback(struct folio *folio,
88 : bool *dirty, bool *writeback)
89 : {
90 : struct buffer_head *head, *bh;
91 0 : *dirty = false;
92 0 : *writeback = false;
93 :
94 0 : BUG_ON(!folio_test_locked(folio));
95 :
96 0 : head = folio_buffers(folio);
97 0 : if (!head)
98 : return;
99 :
100 0 : if (folio_test_writeback(folio))
101 0 : *writeback = true;
102 :
103 : bh = head;
104 : do {
105 0 : if (buffer_locked(bh))
106 0 : *writeback = true;
107 :
108 0 : if (buffer_dirty(bh))
109 0 : *dirty = true;
110 :
111 0 : bh = bh->b_this_page;
112 0 : } while (bh != head);
113 : }
114 : EXPORT_SYMBOL(buffer_check_dirty_writeback);
115 :
116 : /*
117 : * Block until a buffer comes unlocked. This doesn't stop it
118 : * from becoming locked again - you have to lock it yourself
119 : * if you want to preserve its state.
120 : */
121 0 : void __wait_on_buffer(struct buffer_head * bh)
122 : {
123 0 : wait_on_bit_io(&bh->b_state, BH_Lock, TASK_UNINTERRUPTIBLE);
124 0 : }
125 : EXPORT_SYMBOL(__wait_on_buffer);
126 :
127 0 : static void buffer_io_error(struct buffer_head *bh, char *msg)
128 : {
129 0 : if (!test_bit(BH_Quiet, &bh->b_state))
130 0 : printk_ratelimited(KERN_ERR
131 : "Buffer I/O error on dev %pg, logical block %llu%s\n",
132 : bh->b_bdev, (unsigned long long)bh->b_blocknr, msg);
133 0 : }
134 :
135 : /*
136 : * End-of-IO handler helper function which does not touch the bh after
137 : * unlocking it.
138 : * Note: unlock_buffer() sort-of does touch the bh after unlocking it, but
139 : * a race there is benign: unlock_buffer() only uses the bh's address for
140 : * hashing after unlocking the buffer, so it doesn't actually touch the bh
141 : * itself.
142 : */
143 0 : static void __end_buffer_read_notouch(struct buffer_head *bh, int uptodate)
144 : {
145 0 : if (uptodate) {
146 : set_buffer_uptodate(bh);
147 : } else {
148 : /* This happens, due to failed read-ahead attempts. */
149 : clear_buffer_uptodate(bh);
150 : }
151 0 : unlock_buffer(bh);
152 0 : }
153 :
154 : /*
155 : * Default synchronous end-of-IO handler. Just mark it up-to-date and
156 : * unlock the buffer.
157 : */
158 0 : void end_buffer_read_sync(struct buffer_head *bh, int uptodate)
159 : {
160 0 : __end_buffer_read_notouch(bh, uptodate);
161 0 : put_bh(bh);
162 0 : }
163 : EXPORT_SYMBOL(end_buffer_read_sync);
164 :
165 0 : void end_buffer_write_sync(struct buffer_head *bh, int uptodate)
166 : {
167 0 : if (uptodate) {
168 : set_buffer_uptodate(bh);
169 : } else {
170 0 : buffer_io_error(bh, ", lost sync page write");
171 0 : mark_buffer_write_io_error(bh);
172 : clear_buffer_uptodate(bh);
173 : }
174 0 : unlock_buffer(bh);
175 0 : put_bh(bh);
176 0 : }
177 : EXPORT_SYMBOL(end_buffer_write_sync);
178 :
179 : /*
180 : * Various filesystems appear to want __find_get_block to be non-blocking.
181 : * But it's the page lock which protects the buffers. To get around this,
182 : * we get exclusion from try_to_free_buffers with the blockdev mapping's
183 : * private_lock.
184 : *
185 : * Hack idea: for the blockdev mapping, private_lock contention
186 : * may be quite high. This code could TryLock the page, and if that
187 : * succeeds, there is no need to take private_lock.
188 : */
189 : static struct buffer_head *
190 0 : __find_get_block_slow(struct block_device *bdev, sector_t block)
191 : {
192 0 : struct inode *bd_inode = bdev->bd_inode;
193 0 : struct address_space *bd_mapping = bd_inode->i_mapping;
194 0 : struct buffer_head *ret = NULL;
195 : pgoff_t index;
196 : struct buffer_head *bh;
197 : struct buffer_head *head;
198 : struct page *page;
199 0 : int all_mapped = 1;
200 : static DEFINE_RATELIMIT_STATE(last_warned, HZ, 1);
201 :
202 0 : index = block >> (PAGE_SHIFT - bd_inode->i_blkbits);
203 0 : page = find_get_page_flags(bd_mapping, index, FGP_ACCESSED);
204 0 : if (!page)
205 : goto out;
206 :
207 0 : spin_lock(&bd_mapping->private_lock);
208 0 : if (!page_has_buffers(page))
209 : goto out_unlock;
210 0 : head = page_buffers(page);
211 0 : bh = head;
212 : do {
213 0 : if (!buffer_mapped(bh))
214 : all_mapped = 0;
215 0 : else if (bh->b_blocknr == block) {
216 0 : ret = bh;
217 : get_bh(bh);
218 : goto out_unlock;
219 : }
220 0 : bh = bh->b_this_page;
221 0 : } while (bh != head);
222 :
223 : /* We might be here because some of the buffers on this page are
224 : * not mapped. This is due to various races between
225 : * file I/O on the block device and getblk. It gets dealt with
226 : * elsewhere; don't report a buffer error if we had some unmapped buffers.
227 : */
228 0 : ratelimit_set_flags(&last_warned, RATELIMIT_MSG_ON_RELEASE);
229 0 : if (all_mapped && __ratelimit(&last_warned)) {
230 0 : printk("__find_get_block_slow() failed. block=%llu, "
231 : "b_blocknr=%llu, b_state=0x%08lx, b_size=%zu, "
232 : "device %pg blocksize: %d\n",
233 : (unsigned long long)block,
234 : (unsigned long long)bh->b_blocknr,
235 : bh->b_state, bh->b_size, bdev,
236 : 1 << bd_inode->i_blkbits);
237 : }
238 : out_unlock:
239 0 : spin_unlock(&bd_mapping->private_lock);
240 0 : put_page(page);
241 : out:
242 0 : return ret;
243 : }
244 :
245 0 : static void end_buffer_async_read(struct buffer_head *bh, int uptodate)
246 : {
247 : unsigned long flags;
248 : struct buffer_head *first;
249 : struct buffer_head *tmp;
250 : struct folio *folio;
251 0 : int folio_uptodate = 1;
252 :
253 0 : BUG_ON(!buffer_async_read(bh));
254 :
255 0 : folio = bh->b_folio;
256 0 : if (uptodate) {
257 : set_buffer_uptodate(bh);
258 : } else {
259 0 : clear_buffer_uptodate(bh);
260 0 : buffer_io_error(bh, ", async page read");
261 : folio_set_error(folio);
262 : }
263 :
264 : /*
265 : * Be _very_ careful from here on. Bad things can happen if
266 : * two buffer heads end IO at almost the same time and both
267 : * decide that the page is now completely done.
268 : */
269 0 : first = folio_buffers(folio);
270 0 : spin_lock_irqsave(&first->b_uptodate_lock, flags);
271 0 : clear_buffer_async_read(bh);
272 0 : unlock_buffer(bh);
273 0 : tmp = bh;
274 : do {
275 0 : if (!buffer_uptodate(tmp))
276 0 : folio_uptodate = 0;
277 0 : if (buffer_async_read(tmp)) {
278 0 : BUG_ON(!buffer_locked(tmp));
279 : goto still_busy;
280 : }
281 0 : tmp = tmp->b_this_page;
282 0 : } while (tmp != bh);
283 0 : spin_unlock_irqrestore(&first->b_uptodate_lock, flags);
284 :
285 : /*
286 : * If all of the buffers are uptodate then we can set the page
287 : * uptodate.
288 : */
289 0 : if (folio_uptodate)
290 : folio_mark_uptodate(folio);
291 0 : folio_unlock(folio);
292 0 : return;
293 :
294 : still_busy:
295 0 : spin_unlock_irqrestore(&first->b_uptodate_lock, flags);
296 : return;
297 : }
298 :
299 : struct postprocess_bh_ctx {
300 : struct work_struct work;
301 : struct buffer_head *bh;
302 : };
303 :
304 : static void verify_bh(struct work_struct *work)
305 : {
306 : struct postprocess_bh_ctx *ctx =
307 : container_of(work, struct postprocess_bh_ctx, work);
308 : struct buffer_head *bh = ctx->bh;
309 : bool valid;
310 :
311 : valid = fsverity_verify_blocks(bh->b_folio, bh->b_size, bh_offset(bh));
312 : end_buffer_async_read(bh, valid);
313 : kfree(ctx);
314 : }
315 :
316 : static bool need_fsverity(struct buffer_head *bh)
317 : {
318 0 : struct folio *folio = bh->b_folio;
319 0 : struct inode *inode = folio->mapping->host;
320 :
321 0 : return fsverity_active(inode) &&
322 : /* needed by ext4 */
323 : folio->index < DIV_ROUND_UP(inode->i_size, PAGE_SIZE);
324 : }
325 :
326 : static void decrypt_bh(struct work_struct *work)
327 : {
328 : struct postprocess_bh_ctx *ctx =
329 : container_of(work, struct postprocess_bh_ctx, work);
330 : struct buffer_head *bh = ctx->bh;
331 : int err;
332 :
333 : err = fscrypt_decrypt_pagecache_blocks(bh->b_folio, bh->b_size,
334 : bh_offset(bh));
335 : if (err == 0 && need_fsverity(bh)) {
336 : /*
337 : * We use different work queues for decryption and for verity
338 : * because verity may require reading metadata pages that need
339 : * decryption, and we shouldn't recurse to the same workqueue.
340 : */
341 : INIT_WORK(&ctx->work, verify_bh);
342 : fsverity_enqueue_verify_work(&ctx->work);
343 : return;
344 : }
345 : end_buffer_async_read(bh, err == 0);
346 : kfree(ctx);
347 : }
348 :
349 : /*
350 : * I/O completion handler for block_read_full_folio() - pages
351 : * which come unlocked at the end of I/O.
352 : */
353 0 : static void end_buffer_async_read_io(struct buffer_head *bh, int uptodate)
354 : {
355 0 : struct inode *inode = bh->b_folio->mapping->host;
356 0 : bool decrypt = fscrypt_inode_uses_fs_layer_crypto(inode);
357 0 : bool verify = need_fsverity(bh);
358 :
359 : /* Decrypt (with fscrypt) and/or verify (with fsverity) if needed. */
360 : if (uptodate && (decrypt || verify)) {
361 : struct postprocess_bh_ctx *ctx =
362 : kmalloc(sizeof(*ctx), GFP_ATOMIC);
363 :
364 : if (ctx) {
365 : ctx->bh = bh;
366 : if (decrypt) {
367 : INIT_WORK(&ctx->work, decrypt_bh);
368 : fscrypt_enqueue_decrypt_work(&ctx->work);
369 : } else {
370 : INIT_WORK(&ctx->work, verify_bh);
371 : fsverity_enqueue_verify_work(&ctx->work);
372 : }
373 : return;
374 : }
375 : uptodate = 0;
376 : }
377 0 : end_buffer_async_read(bh, uptodate);
378 : }
379 :
380 : /*
381 : * Completion handler for block_write_full_page() - pages which are unlocked
382 : * during I/O, and which have PageWriteback cleared upon I/O completion.
383 : */
384 0 : void end_buffer_async_write(struct buffer_head *bh, int uptodate)
385 : {
386 : unsigned long flags;
387 : struct buffer_head *first;
388 : struct buffer_head *tmp;
389 : struct folio *folio;
390 :
391 0 : BUG_ON(!buffer_async_write(bh));
392 :
393 0 : folio = bh->b_folio;
394 0 : if (uptodate) {
395 : set_buffer_uptodate(bh);
396 : } else {
397 0 : buffer_io_error(bh, ", lost async page write");
398 0 : mark_buffer_write_io_error(bh);
399 0 : clear_buffer_uptodate(bh);
400 : folio_set_error(folio);
401 : }
402 :
403 0 : first = folio_buffers(folio);
404 0 : spin_lock_irqsave(&first->b_uptodate_lock, flags);
405 :
406 0 : clear_buffer_async_write(bh);
407 0 : unlock_buffer(bh);
408 0 : tmp = bh->b_this_page;
409 0 : while (tmp != bh) {
410 0 : if (buffer_async_write(tmp)) {
411 0 : BUG_ON(!buffer_locked(tmp));
412 : goto still_busy;
413 : }
414 0 : tmp = tmp->b_this_page;
415 : }
416 0 : spin_unlock_irqrestore(&first->b_uptodate_lock, flags);
417 0 : folio_end_writeback(folio);
418 0 : return;
419 :
420 : still_busy:
421 0 : spin_unlock_irqrestore(&first->b_uptodate_lock, flags);
422 : return;
423 : }
424 : EXPORT_SYMBOL(end_buffer_async_write);
425 :
426 : /*
427 : * If a page's buffers are under async read-in (end_buffer_async_read
428 : * completion) then there is a possibility that another thread of
429 : * control could lock one of the buffers after it has completed
430 : * but while some of the other buffers have not completed. This
431 : * locked buffer would confuse end_buffer_async_read() into not unlocking
432 : * the page. So the absence of BH_Async_Read tells end_buffer_async_read()
433 : * that this buffer is not under async I/O.
434 : *
435 : * The page comes unlocked when it has no locked buffer_async buffers
436 : * left.
437 : *
438 : * PageLocked prevents anyone starting new async I/O reads any of
439 : * the buffers.
440 : *
441 : * PageWriteback is used to prevent simultaneous writeout of the same
442 : * page.
443 : *
444 : * PageLocked prevents anyone from starting writeback of a page which is
445 : * under read I/O (PageWriteback is only ever set against a locked page).
446 : */
447 0 : static void mark_buffer_async_read(struct buffer_head *bh)
448 : {
449 0 : bh->b_end_io = end_buffer_async_read_io;
450 0 : set_buffer_async_read(bh);
451 0 : }
452 :
453 : static void mark_buffer_async_write_endio(struct buffer_head *bh,
454 : bh_end_io_t *handler)
455 : {
456 0 : bh->b_end_io = handler;
457 0 : set_buffer_async_write(bh);
458 : }
459 :
460 0 : void mark_buffer_async_write(struct buffer_head *bh)
461 : {
462 0 : mark_buffer_async_write_endio(bh, end_buffer_async_write);
463 0 : }
464 : EXPORT_SYMBOL(mark_buffer_async_write);
465 :
466 :
467 : /*
468 : * fs/buffer.c contains helper functions for buffer-backed address space's
469 : * fsync functions. A common requirement for buffer-based filesystems is
470 : * that certain data from the backing blockdev needs to be written out for
471 : * a successful fsync(). For example, ext2 indirect blocks need to be
472 : * written back and waited upon before fsync() returns.
473 : *
474 : * The functions mark_buffer_inode_dirty(), fsync_inode_buffers(),
475 : * inode_has_buffers() and invalidate_inode_buffers() are provided for the
476 : * management of a list of dependent buffers at ->i_mapping->private_list.
477 : *
478 : * Locking is a little subtle: try_to_free_buffers() will remove buffers
479 : * from their controlling inode's queue when they are being freed. But
480 : * try_to_free_buffers() will be operating against the *blockdev* mapping
481 : * at the time, not against the S_ISREG file which depends on those buffers.
482 : * So the locking for private_list is via the private_lock in the address_space
483 : * which backs the buffers. Which is different from the address_space
484 : * against which the buffers are listed. So for a particular address_space,
485 : * mapping->private_lock does *not* protect mapping->private_list! In fact,
486 : * mapping->private_list will always be protected by the backing blockdev's
487 : * ->private_lock.
488 : *
489 : * Which introduces a requirement: all buffers on an address_space's
490 : * ->private_list must be from the same address_space: the blockdev's.
491 : *
492 : * address_spaces which do not place buffers at ->private_list via these
493 : * utility functions are free to use private_lock and private_list for
494 : * whatever they want. The only requirement is that list_empty(private_list)
495 : * be true at clear_inode() time.
496 : *
497 : * FIXME: clear_inode should not call invalidate_inode_buffers(). The
498 : * filesystems should do that. invalidate_inode_buffers() should just go
499 : * BUG_ON(!list_empty).
500 : *
501 : * FIXME: mark_buffer_dirty_inode() is a data-plane operation. It should
502 : * take an address_space, not an inode. And it should be called
503 : * mark_buffer_dirty_fsync() to clearly define why those buffers are being
504 : * queued up.
505 : *
506 : * FIXME: mark_buffer_dirty_inode() doesn't need to add the buffer to the
507 : * list if it is already on a list. Because if the buffer is on a list,
508 : * it *must* already be on the right one. If not, the filesystem is being
509 : * silly. This will save a ton of locking. But first we have to ensure
510 : * that buffers are taken *off* the old inode's list when they are freed
511 : * (presumably in truncate). That requires careful auditing of all
512 : * filesystems (do it inside bforget()). It could also be done by bringing
513 : * b_inode back.
514 : */
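/*
 * A minimal usage sketch, assuming an ext2-like filesystem that dirties an
 * indirect block while writing data and later needs it written out by
 * fsync().  Queuing the dependency as the data is written:
 *
 *	mark_buffer_dirty_inode(bh, inode);
 *	brelse(bh);
 *
 * and then, from the filesystem's ->fsync() method (its exact shape is an
 * assumption, not dictated by this file):
 *
 *	err = sync_mapping_buffers(inode->i_mapping);
 */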
515 :
516 : /*
517 : * The buffer's backing address_space's private_lock must be held
518 : */
519 0 : static void __remove_assoc_queue(struct buffer_head *bh)
520 : {
521 0 : list_del_init(&bh->b_assoc_buffers);
522 0 : WARN_ON(!bh->b_assoc_map);
523 0 : bh->b_assoc_map = NULL;
524 0 : }
525 :
526 44 : int inode_has_buffers(struct inode *inode)
527 : {
528 88 : return !list_empty(&inode->i_data.private_list);
529 : }
530 :
531 : /*
532 : * osync is designed to support O_SYNC io. It waits synchronously for
533 : * all already-submitted IO to complete, but does not queue any new
534 : * writes to the disk.
535 : *
536 : * To do O_SYNC writes, just queue the buffer writes with write_dirty_buffer
537 : * as you dirty the buffers, and then use osync_buffers_list() to wait for
538 : * completion. Any other dirty buffers which are not yet queued for
539 : * write will not be flushed to disk by the osync.
540 : */
541 0 : static int osync_buffers_list(spinlock_t *lock, struct list_head *list)
542 : {
543 : struct buffer_head *bh;
544 : struct list_head *p;
545 0 : int err = 0;
546 :
547 : spin_lock(lock);
548 : repeat:
549 0 : list_for_each_prev(p, list) {
550 0 : bh = BH_ENTRY(p);
551 0 : if (buffer_locked(bh)) {
552 0 : get_bh(bh);
553 0 : spin_unlock(lock);
554 0 : wait_on_buffer(bh);
555 0 : if (!buffer_uptodate(bh))
556 0 : err = -EIO;
557 0 : brelse(bh);
558 : spin_lock(lock);
559 : goto repeat;
560 : }
561 : }
562 0 : spin_unlock(lock);
563 0 : return err;
564 : }
565 :
566 0 : void emergency_thaw_bdev(struct super_block *sb)
567 : {
568 0 : while (sb->s_bdev && !thaw_bdev(sb->s_bdev))
569 0 : printk(KERN_WARNING "Emergency Thaw on %pg\n", sb->s_bdev);
570 0 : }
571 :
572 : /**
573 : * sync_mapping_buffers - write out & wait upon a mapping's "associated" buffers
574 : * @mapping: the mapping which wants those buffers written
575 : *
576 : * Starts I/O against the buffers at mapping->private_list, and waits upon
577 : * that I/O.
578 : *
579 : * Basically, this is a convenience function for fsync().
580 : * @mapping is a file or directory which needs those buffers to be written for
581 : * a successful fsync().
582 : */
583 0 : int sync_mapping_buffers(struct address_space *mapping)
584 : {
585 0 : struct address_space *buffer_mapping = mapping->private_data;
586 :
587 0 : if (buffer_mapping == NULL || list_empty(&mapping->private_list))
588 : return 0;
589 :
590 0 : return fsync_buffers_list(&buffer_mapping->private_lock,
591 : &mapping->private_list);
592 : }
593 : EXPORT_SYMBOL(sync_mapping_buffers);
594 :
595 : /*
596 : * Called when we've recently written block `bblock', and it is known that
597 : * `bblock' was for a buffer_boundary() buffer. This means that the block at
598 : * `bblock + 1' is probably a dirty indirect block. Hunt it down and, if it's
599 : * dirty, schedule it for IO. So that indirects merge nicely with their data.
600 : */
601 0 : void write_boundary_block(struct block_device *bdev,
602 : sector_t bblock, unsigned blocksize)
603 : {
604 0 : struct buffer_head *bh = __find_get_block(bdev, bblock + 1, blocksize);
605 0 : if (bh) {
606 0 : if (buffer_dirty(bh))
607 0 : write_dirty_buffer(bh, 0);
608 : put_bh(bh);
609 : }
610 0 : }
611 :
612 0 : void mark_buffer_dirty_inode(struct buffer_head *bh, struct inode *inode)
613 : {
614 0 : struct address_space *mapping = inode->i_mapping;
615 0 : struct address_space *buffer_mapping = bh->b_folio->mapping;
616 :
617 0 : mark_buffer_dirty(bh);
618 0 : if (!mapping->private_data) {
619 0 : mapping->private_data = buffer_mapping;
620 : } else {
621 0 : BUG_ON(mapping->private_data != buffer_mapping);
622 : }
623 0 : if (!bh->b_assoc_map) {
624 0 : spin_lock(&buffer_mapping->private_lock);
625 0 : list_move_tail(&bh->b_assoc_buffers,
626 : &mapping->private_list);
627 0 : bh->b_assoc_map = mapping;
628 0 : spin_unlock(&buffer_mapping->private_lock);
629 : }
630 0 : }
631 : EXPORT_SYMBOL(mark_buffer_dirty_inode);
632 :
633 : /*
634 : * Add a page to the dirty page list.
635 : *
636 : * It is a sad fact of life that this function is called from several places
637 : * deeply under spinlocking. It may not sleep.
638 : *
639 : * If the page has buffers, the uptodate buffers are set dirty, to preserve
640 : * dirty-state coherency between the page and the buffers. If the page does
641 : * not have buffers then when they are later attached they will all be set
642 : * dirty.
643 : *
644 : * The buffers are dirtied before the page is dirtied. There's a small race
645 : * window in which a writepage caller may see the page cleanness but not the
646 : * buffer dirtiness. That's fine. If this code were to set the page dirty
647 : * before the buffers, a concurrent writepage caller could clear the page dirty
648 : * bit, see a bunch of clean buffers and we'd end up with dirty buffers/clean
649 : * page on the dirty page list.
650 : *
651 : * We use private_lock to lock against try_to_free_buffers while using the
652 : * page's buffer list. Also use this to protect against clean buffers being
653 : * added to the page after it was set dirty.
654 : *
655 : * FIXME: may need to call ->reservepage here as well. That's rather up to the
656 : * address_space though.
657 : */
658 0 : bool block_dirty_folio(struct address_space *mapping, struct folio *folio)
659 : {
660 : struct buffer_head *head;
661 : bool newly_dirty;
662 :
663 0 : spin_lock(&mapping->private_lock);
664 0 : head = folio_buffers(folio);
665 0 : if (head) {
666 : struct buffer_head *bh = head;
667 :
668 : do {
669 0 : set_buffer_dirty(bh);
670 0 : bh = bh->b_this_page;
671 0 : } while (bh != head);
672 : }
673 : /*
674 : * Lock out page's memcg migration to keep PageDirty
675 : * synchronized with per-memcg dirty page counters.
676 : */
677 0 : folio_memcg_lock(folio);
678 0 : newly_dirty = !folio_test_set_dirty(folio);
679 0 : spin_unlock(&mapping->private_lock);
680 :
681 0 : if (newly_dirty)
682 0 : __folio_mark_dirty(folio, mapping, 1);
683 :
684 0 : folio_memcg_unlock(folio);
685 :
686 0 : if (newly_dirty)
687 0 : __mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
688 :
689 0 : return newly_dirty;
690 : }
691 : EXPORT_SYMBOL(block_dirty_folio);
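/*
 * A sketch of how block_dirty_folio() is typically wired up, assuming a
 * conventional buffer-backed filesystem; example_read_folio and
 * example_writepage stand in for the filesystem's own callbacks:
 *
 *	static const struct address_space_operations example_aops = {
 *		.dirty_folio		= block_dirty_folio,
 *		.invalidate_folio	= block_invalidate_folio,
 *		.read_folio		= example_read_folio,
 *		.writepage		= example_writepage,
 *	};
 */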
692 :
693 : /*
694 : * Write out and wait upon a list of buffers.
695 : *
696 : * We have conflicting pressures: we want to make sure that all
697 : * initially dirty buffers get waited on, but that any subsequently
698 : * dirtied buffers don't. After all, we don't want fsync to last
699 : * forever if somebody is actively writing to the file.
700 : *
701 : * Do this in two main stages: first we copy dirty buffers to a
702 : * temporary inode list, queueing the writes as we go. Then we clean
703 : * up, waiting for those writes to complete.
704 : *
705 : * During this second stage, any subsequent updates to the file may end
706 : * up refiling the buffer on the original inode's dirty list again, so
707 : * there is a chance we will end up with a buffer queued for write but
708 : * not yet completed on that list. So, as a final cleanup we go through
709 : * the osync code to catch these locked, dirty buffers without requeuing
710 : * any newly dirty buffers for write.
711 : */
712 0 : static int fsync_buffers_list(spinlock_t *lock, struct list_head *list)
713 : {
714 : struct buffer_head *bh;
715 : struct list_head tmp;
716 : struct address_space *mapping;
717 0 : int err = 0, err2;
718 : struct blk_plug plug;
719 :
720 0 : INIT_LIST_HEAD(&tmp);
721 0 : blk_start_plug(&plug);
722 :
723 : spin_lock(lock);
724 0 : while (!list_empty(list)) {
725 0 : bh = BH_ENTRY(list->next);
726 0 : mapping = bh->b_assoc_map;
727 0 : __remove_assoc_queue(bh);
728 : /* Avoid race with mark_buffer_dirty_inode() which does
729 : * a lockless check and we rely on seeing the dirty bit */
730 0 : smp_mb();
731 0 : if (buffer_dirty(bh) || buffer_locked(bh)) {
732 0 : list_add(&bh->b_assoc_buffers, &tmp);
733 0 : bh->b_assoc_map = mapping;
734 0 : if (buffer_dirty(bh)) {
735 0 : get_bh(bh);
736 0 : spin_unlock(lock);
737 : /*
738 : * Ensure any pending I/O completes so that
739 : * write_dirty_buffer() actually writes the
740 : * current contents - it is a noop if I/O is
741 : * still in flight on potentially older
742 : * contents.
743 : */
744 0 : write_dirty_buffer(bh, REQ_SYNC);
745 :
746 : /*
747 : * Kick off IO for the previous mapping. Note
748 : * that we will not run the very last mapping,
749 : * wait_on_buffer() will do that for us
750 : * through sync_buffer().
751 : */
752 0 : brelse(bh);
753 : spin_lock(lock);
754 : }
755 : }
756 : }
757 :
758 0 : spin_unlock(lock);
759 0 : blk_finish_plug(&plug);
760 : spin_lock(lock);
761 :
762 0 : while (!list_empty(&tmp)) {
763 0 : bh = BH_ENTRY(tmp.prev);
764 0 : get_bh(bh);
765 0 : mapping = bh->b_assoc_map;
766 0 : __remove_assoc_queue(bh);
767 : /* Avoid race with mark_buffer_dirty_inode() which does
768 : * a lockless check and we rely on seeing the dirty bit */
769 0 : smp_mb();
770 0 : if (buffer_dirty(bh)) {
771 0 : list_add(&bh->b_assoc_buffers,
772 : &mapping->private_list);
773 0 : bh->b_assoc_map = mapping;
774 : }
775 0 : spin_unlock(lock);
776 0 : wait_on_buffer(bh);
777 0 : if (!buffer_uptodate(bh))
778 0 : err = -EIO;
779 0 : brelse(bh);
780 : spin_lock(lock);
781 : }
782 :
783 0 : spin_unlock(lock);
784 0 : err2 = osync_buffers_list(lock, list);
785 0 : if (err)
786 : return err;
787 : else
788 : return err2;
789 : }
790 :
791 : /*
792 : * Invalidate any and all dirty buffers on a given inode. We are
793 : * probably unmounting the fs, but that doesn't mean we have already
794 : * done a sync(). Just drop the buffers from the inode list.
795 : *
796 : * NOTE: we take the inode's blockdev's mapping's private_lock. Which
797 : * assumes that all the buffers are against the blockdev. Not true
798 : * for reiserfs.
799 : */
800 0 : void invalidate_inode_buffers(struct inode *inode)
801 : {
802 0 : if (inode_has_buffers(inode)) {
803 0 : struct address_space *mapping = &inode->i_data;
804 0 : struct list_head *list = &mapping->private_list;
805 0 : struct address_space *buffer_mapping = mapping->private_data;
806 :
807 0 : spin_lock(&buffer_mapping->private_lock);
808 0 : while (!list_empty(list))
809 0 : __remove_assoc_queue(BH_ENTRY(list->next));
810 0 : spin_unlock(&buffer_mapping->private_lock);
811 : }
812 0 : }
813 : EXPORT_SYMBOL(invalidate_inode_buffers);
814 :
815 : /*
816 : * Remove any clean buffers from the inode's buffer list. This is called
817 : * when we're trying to free the inode itself. Those buffers can pin it.
818 : *
819 : * Returns true if all buffers were removed.
820 : */
821 0 : int remove_inode_buffers(struct inode *inode)
822 : {
823 0 : int ret = 1;
824 :
825 0 : if (inode_has_buffers(inode)) {
826 0 : struct address_space *mapping = &inode->i_data;
827 0 : struct list_head *list = &mapping->private_list;
828 0 : struct address_space *buffer_mapping = mapping->private_data;
829 :
830 0 : spin_lock(&buffer_mapping->private_lock);
831 0 : while (!list_empty(list)) {
832 0 : struct buffer_head *bh = BH_ENTRY(list->next);
833 0 : if (buffer_dirty(bh)) {
834 : ret = 0;
835 : break;
836 : }
837 0 : __remove_assoc_queue(bh);
838 : }
839 0 : spin_unlock(&buffer_mapping->private_lock);
840 : }
841 0 : return ret;
842 : }
843 :
844 : /*
845 : * Create the appropriate buffers when given a folio for the data area and
846 : * the size of each buffer. Use the bh->b_this_page linked list to
847 : * follow the buffers created. Return NULL if unable to create more
848 : * buffers.
849 : *
850 : * The retry flag is used to differentiate async IO (paging, swapping)
851 : * which may not fail from ordinary buffer allocations.
852 : */
853 0 : struct buffer_head *folio_alloc_buffers(struct folio *folio, unsigned long size,
854 : bool retry)
855 : {
856 : struct buffer_head *bh, *head;
857 0 : gfp_t gfp = GFP_NOFS | __GFP_ACCOUNT;
858 : long offset;
859 : struct mem_cgroup *memcg, *old_memcg;
860 :
861 0 : if (retry)
862 0 : gfp |= __GFP_NOFAIL;
863 :
864 : /* The folio lock pins the memcg */
865 0 : memcg = folio_memcg(folio);
866 0 : old_memcg = set_active_memcg(memcg);
867 :
868 0 : head = NULL;
869 0 : offset = folio_size(folio);
870 0 : while ((offset -= size) >= 0) {
871 0 : bh = alloc_buffer_head(gfp);
872 0 : if (!bh)
873 : goto no_grow;
874 :
875 0 : bh->b_this_page = head;
876 0 : bh->b_blocknr = -1;
877 0 : head = bh;
878 :
879 0 : bh->b_size = size;
880 :
881 : /* Link the buffer to its folio */
882 0 : folio_set_bh(bh, folio, offset);
883 : }
884 : out:
885 : set_active_memcg(old_memcg);
886 0 : return head;
887 : /*
888 : * In case anything failed, we just free everything we got.
889 : */
890 : no_grow:
891 0 : if (head) {
892 : do {
893 0 : bh = head;
894 0 : head = head->b_this_page;
895 0 : free_buffer_head(bh);
896 0 : } while (head);
897 : }
898 :
899 : goto out;
900 : }
901 : EXPORT_SYMBOL_GPL(folio_alloc_buffers);
902 :
903 0 : struct buffer_head *alloc_page_buffers(struct page *page, unsigned long size,
904 : bool retry)
905 : {
906 0 : return folio_alloc_buffers(page_folio(page), size, retry);
907 : }
908 : EXPORT_SYMBOL_GPL(alloc_page_buffers);
909 :
910 : static inline void
911 : link_dev_buffers(struct page *page, struct buffer_head *head)
912 : {
913 : struct buffer_head *bh, *tail;
914 :
915 : bh = head;
916 : do {
917 0 : tail = bh;
918 0 : bh = bh->b_this_page;
919 0 : } while (bh);
920 0 : tail->b_this_page = head;
921 0 : attach_page_private(page, head);
922 : }
923 :
924 : static sector_t blkdev_max_block(struct block_device *bdev, unsigned int size)
925 : {
926 0 : sector_t retval = ~((sector_t)0);
927 0 : loff_t sz = bdev_nr_bytes(bdev);
928 :
929 0 : if (sz) {
930 0 : unsigned int sizebits = blksize_bits(size);
931 0 : retval = (sz >> sizebits);
932 : }
933 : return retval;
934 : }
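/*
 * Worked example: a 1 GiB device (sz = 1 << 30) with 4096-byte blocks
 * (sizebits = 12) gives end block 1 << 18 = 262144, i.e. the first block
 * number that lies beyond the end of the device.
 */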
935 :
936 : /*
937 : * Initialise the state of a blockdev page's buffers.
938 : */
939 : static sector_t
940 0 : init_page_buffers(struct page *page, struct block_device *bdev,
941 : sector_t block, int size)
942 : {
943 0 : struct buffer_head *head = page_buffers(page);
944 0 : struct buffer_head *bh = head;
945 0 : int uptodate = PageUptodate(page);
946 0 : sector_t end_block = blkdev_max_block(bdev, size);
947 :
948 : do {
949 0 : if (!buffer_mapped(bh)) {
950 0 : bh->b_end_io = NULL;
951 0 : bh->b_private = NULL;
952 0 : bh->b_bdev = bdev;
953 0 : bh->b_blocknr = block;
954 0 : if (uptodate)
955 : set_buffer_uptodate(bh);
956 0 : if (block < end_block)
957 : set_buffer_mapped(bh);
958 : }
959 0 : block++;
960 0 : bh = bh->b_this_page;
961 0 : } while (bh != head);
962 :
963 : /*
964 : * Caller needs to validate requested block against end of device.
965 : */
966 0 : return end_block;
967 : }
968 :
969 : /*
970 : * Create the page-cache page that contains the requested block.
971 : *
972 : * This is used purely for blockdev mappings.
973 : */
974 : static int
975 0 : grow_dev_page(struct block_device *bdev, sector_t block,
976 : pgoff_t index, int size, int sizebits, gfp_t gfp)
977 : {
978 0 : struct inode *inode = bdev->bd_inode;
979 : struct page *page;
980 : struct buffer_head *bh;
981 : sector_t end_block;
982 0 : int ret = 0;
983 : gfp_t gfp_mask;
984 :
985 0 : gfp_mask = mapping_gfp_constraint(inode->i_mapping, ~__GFP_FS) | gfp;
986 :
987 : /*
988 : * XXX: __getblk_slow() can not really deal with failure and
989 : * will endlessly loop on improvised global reclaim. Prefer
990 : * looping in the allocator rather than here, at least that
991 : * code knows what it's doing.
992 : */
993 0 : gfp_mask |= __GFP_NOFAIL;
994 :
995 0 : page = find_or_create_page(inode->i_mapping, index, gfp_mask);
996 :
997 0 : BUG_ON(!PageLocked(page));
998 :
999 0 : if (page_has_buffers(page)) {
1000 0 : bh = page_buffers(page);
1001 0 : if (bh->b_size == size) {
1002 0 : end_block = init_page_buffers(page, bdev,
1003 : (sector_t)index << sizebits,
1004 : size);
1005 0 : goto done;
1006 : }
1007 0 : if (!try_to_free_buffers(page_folio(page)))
1008 : goto failed;
1009 : }
1010 :
1011 : /*
1012 : * Allocate some buffers for this page
1013 : */
1014 0 : bh = alloc_page_buffers(page, size, true);
1015 :
1016 : /*
1017 : * Link the page to the buffers and initialise them. Take the
1018 : * lock to be atomic wrt __find_get_block(), which does not
1019 : * run under the page lock.
1020 : */
1021 0 : spin_lock(&inode->i_mapping->private_lock);
1022 0 : link_dev_buffers(page, bh);
1023 0 : end_block = init_page_buffers(page, bdev, (sector_t)index << sizebits,
1024 : size);
1025 0 : spin_unlock(&inode->i_mapping->private_lock);
1026 : done:
1027 0 : ret = (block < end_block) ? 1 : -ENXIO;
1028 : failed:
1029 0 : unlock_page(page);
1030 0 : put_page(page);
1031 0 : return ret;
1032 : }
1033 :
1034 : /*
1035 : * Create buffers for the specified block device block's page. If
1036 : * that page was dirty, the buffers are set dirty also.
1037 : */
1038 : static int
1039 0 : grow_buffers(struct block_device *bdev, sector_t block, int size, gfp_t gfp)
1040 : {
1041 : pgoff_t index;
1042 : int sizebits;
1043 :
1044 0 : sizebits = PAGE_SHIFT - __ffs(size);
1045 0 : index = block >> sizebits;
1046 :
1047 : /*
1048 : * Check for a block which wants to lie outside our maximum possible
1049 : * pagecache index. (this comparison is done using sector_t types).
1050 : */
1051 : if (unlikely(index != block >> sizebits)) {
1052 : printk(KERN_ERR "%s: requested out-of-range block %llu for "
1053 : "device %pg\n",
1054 : __func__, (unsigned long long)block,
1055 : bdev);
1056 : return -EIO;
1057 : }
1058 :
1059 : /* Create a page with the proper size buffers.. */
1060 0 : return grow_dev_page(bdev, block, index, size, sizebits, gfp);
1061 : }
1062 :
1063 : static struct buffer_head *
1064 0 : __getblk_slow(struct block_device *bdev, sector_t block,
1065 : unsigned size, gfp_t gfp)
1066 : {
1067 : /* Size must be multiple of hard sectorsize */
1068 0 : if (unlikely(size & (bdev_logical_block_size(bdev)-1) ||
1069 : (size < 512 || size > PAGE_SIZE))) {
1070 0 : printk(KERN_ERR "getblk(): invalid block size %d requested\n",
1071 : size);
1072 0 : printk(KERN_ERR "logical block size: %d\n",
1073 : bdev_logical_block_size(bdev));
1074 :
1075 0 : dump_stack();
1076 0 : return NULL;
1077 : }
1078 :
1079 : for (;;) {
1080 : struct buffer_head *bh;
1081 : int ret;
1082 :
1083 0 : bh = __find_get_block(bdev, block, size);
1084 0 : if (bh)
1085 : return bh;
1086 :
1087 0 : ret = grow_buffers(bdev, block, size, gfp);
1088 0 : if (ret < 0)
1089 : return NULL;
1090 : }
1091 : }
1092 :
1093 : /*
1094 : * The relationship between dirty buffers and dirty pages:
1095 : *
1096 : * Whenever a page has any dirty buffers, the page's dirty bit is set, and
1097 : * the page is tagged dirty in the page cache.
1098 : *
1099 : * At all times, the dirtiness of the buffers represents the dirtiness of
1100 : * subsections of the page. If the page has buffers, the page dirty bit is
1101 : * merely a hint about the true dirty state.
1102 : *
1103 : * When a page is set dirty in its entirety, all its buffers are marked dirty
1104 : * (if the page has buffers).
1105 : *
1106 : * When a buffer is marked dirty, its page is dirtied, but the page's other
1107 : * buffers are not.
1108 : *
1109 : * Also, when blockdev buffers are explicitly read with bread(), they
1110 : * individually become uptodate. But their backing page remains not
1111 : * uptodate - even if all of its buffers are uptodate. A subsequent
1112 : * block_read_full_folio() against that folio will discover all the uptodate
1113 : * buffers, will set the folio uptodate and will perform no I/O.
1114 : */
1115 :
1116 : /**
1117 : * mark_buffer_dirty - mark a buffer_head as needing writeout
1118 : * @bh: the buffer_head to mark dirty
1119 : *
1120 : * mark_buffer_dirty() will set the dirty bit against the buffer, then set
1121 : * its backing page dirty, then tag the page as dirty in the page cache
1122 : * and then attach the address_space's inode to its superblock's dirty
1123 : * inode list.
1124 : *
1125 : * mark_buffer_dirty() is atomic. It takes bh->b_folio->mapping->private_lock,
1126 : * i_pages lock and mapping->host->i_lock.
1127 : */
1128 0 : void mark_buffer_dirty(struct buffer_head *bh)
1129 : {
1130 0 : WARN_ON_ONCE(!buffer_uptodate(bh));
1131 :
1132 0 : trace_block_dirty_buffer(bh);
1133 :
1134 : /*
1135 : * Very *carefully* optimize the it-is-already-dirty case.
1136 : *
1137 : * Don't let the final "is it dirty" escape to before we
1138 : * perhaps modified the buffer.
1139 : */
1140 0 : if (buffer_dirty(bh)) {
1141 0 : smp_mb();
1142 0 : if (buffer_dirty(bh))
1143 : return;
1144 : }
1145 :
1146 0 : if (!test_set_buffer_dirty(bh)) {
1147 0 : struct folio *folio = bh->b_folio;
1148 0 : struct address_space *mapping = NULL;
1149 :
1150 0 : folio_memcg_lock(folio);
1151 0 : if (!folio_test_set_dirty(folio)) {
1152 0 : mapping = folio->mapping;
1153 0 : if (mapping)
1154 0 : __folio_mark_dirty(folio, mapping, 0);
1155 : }
1156 0 : folio_memcg_unlock(folio);
1157 0 : if (mapping)
1158 0 : __mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
1159 : }
1160 : }
1161 : EXPORT_SYMBOL(mark_buffer_dirty);
1162 :
1163 0 : void mark_buffer_write_io_error(struct buffer_head *bh)
1164 : {
1165 : struct super_block *sb;
1166 :
1167 0 : set_buffer_write_io_error(bh);
1168 : /* FIXME: do we need to set this in both places? */
1169 0 : if (bh->b_folio && bh->b_folio->mapping)
1170 0 : mapping_set_error(bh->b_folio->mapping, -EIO);
1171 0 : if (bh->b_assoc_map)
1172 0 : mapping_set_error(bh->b_assoc_map, -EIO);
1173 : rcu_read_lock();
1174 0 : sb = READ_ONCE(bh->b_bdev->bd_super);
1175 0 : if (sb)
1176 0 : errseq_set(&sb->s_wb_err, -EIO);
1177 : rcu_read_unlock();
1178 0 : }
1179 : EXPORT_SYMBOL(mark_buffer_write_io_error);
1180 :
1181 : /*
1182 : * Decrement a buffer_head's reference count. If all buffers against a page
1183 : * have zero reference count, are clean and unlocked, and if the page is clean
1184 : * and unlocked then try_to_free_buffers() may strip the buffers from the page
1185 : * in preparation for freeing it (sometimes, rarely, buffers are removed from
1186 : * a page but it ends up not being freed, and buffers may later be reattached).
1187 : */
1188 0 : void __brelse(struct buffer_head * buf)
1189 : {
1190 0 : if (atomic_read(&buf->b_count)) {
1191 : put_bh(buf);
1192 : return;
1193 : }
1194 0 : WARN(1, KERN_ERR "VFS: brelse: Trying to free free buffer\n");
1195 : }
1196 : EXPORT_SYMBOL(__brelse);
1197 :
1198 : /*
1199 : * bforget() is like brelse(), except it discards any
1200 : * potentially dirty data.
1201 : */
1202 0 : void __bforget(struct buffer_head *bh)
1203 : {
1204 0 : clear_buffer_dirty(bh);
1205 0 : if (bh->b_assoc_map) {
1206 0 : struct address_space *buffer_mapping = bh->b_folio->mapping;
1207 :
1208 0 : spin_lock(&buffer_mapping->private_lock);
1209 0 : list_del_init(&bh->b_assoc_buffers);
1210 0 : bh->b_assoc_map = NULL;
1211 0 : spin_unlock(&buffer_mapping->private_lock);
1212 : }
1213 0 : __brelse(bh);
1214 0 : }
1215 : EXPORT_SYMBOL(__bforget);
1216 :
1217 0 : static struct buffer_head *__bread_slow(struct buffer_head *bh)
1218 : {
1219 0 : lock_buffer(bh);
1220 0 : if (buffer_uptodate(bh)) {
1221 0 : unlock_buffer(bh);
1222 0 : return bh;
1223 : } else {
1224 0 : get_bh(bh);
1225 0 : bh->b_end_io = end_buffer_read_sync;
1226 0 : submit_bh(REQ_OP_READ, bh);
1227 0 : wait_on_buffer(bh);
1228 0 : if (buffer_uptodate(bh))
1229 : return bh;
1230 : }
1231 0 : brelse(bh);
1232 0 : return NULL;
1233 : }
1234 :
1235 : /*
1236 : * Per-cpu buffer LRU implementation. To reduce the cost of __find_get_block().
1237 : * The bhs[] array is sorted - newest buffer is at bhs[0]. Buffers have their
1238 : * refcount elevated by one when they're in an LRU. A buffer can only appear
1239 : * once in a particular CPU's LRU. A single buffer can be present in multiple
1240 : * CPU's LRUs at the same time.
1241 : *
1242 : * This is a transparent caching front-end to sb_bread(), sb_getblk() and
1243 : * sb_find_get_block().
1244 : *
1245 : * The LRUs themselves only need locking against invalidate_bh_lrus. We use
1246 : * a local interrupt disable for that.
1247 : */
1248 :
1249 : #define BH_LRU_SIZE 16
1250 :
1251 : struct bh_lru {
1252 : struct buffer_head *bhs[BH_LRU_SIZE];
1253 : };
1254 :
1255 : static DEFINE_PER_CPU(struct bh_lru, bh_lrus) = {{ NULL }};
1256 :
1257 : #ifdef CONFIG_SMP
1258 : #define bh_lru_lock() local_irq_disable()
1259 : #define bh_lru_unlock() local_irq_enable()
1260 : #else
1261 : #define bh_lru_lock() preempt_disable()
1262 : #define bh_lru_unlock() preempt_enable()
1263 : #endif
1264 :
1265 0 : static inline void check_irqs_on(void)
1266 : {
1267 : #ifdef irqs_disabled
1268 0 : BUG_ON(irqs_disabled());
1269 : #endif
1270 0 : }
1271 :
1272 : /*
1273 : * Install a buffer_head into this cpu's LRU. If not already in the LRU, it is
1274 : * inserted at the front, and the buffer_head at the back if any is evicted.
1275 : * Or, if already in the LRU it is moved to the front.
1276 : */
1277 0 : static void bh_lru_install(struct buffer_head *bh)
1278 : {
1279 0 : struct buffer_head *evictee = bh;
1280 : struct bh_lru *b;
1281 : int i;
1282 :
1283 0 : check_irqs_on();
1284 0 : bh_lru_lock();
1285 :
1286 : /*
1287 : * The refcount of a buffer_head in the bh_lru prevents dropping the
1288 : * attached page (i.e., try_to_free_buffers), which could cause
1289 : * page migration to fail.
1290 : * Skip putting upcoming bh into bh_lru until migration is done.
1291 : */
1292 0 : if (lru_cache_disabled()) {
1293 0 : bh_lru_unlock();
1294 0 : return;
1295 : }
1296 :
1297 : b = this_cpu_ptr(&bh_lrus);
1298 0 : for (i = 0; i < BH_LRU_SIZE; i++) {
1299 0 : swap(evictee, b->bhs[i]);
1300 0 : if (evictee == bh) {
1301 0 : bh_lru_unlock();
1302 0 : return;
1303 : }
1304 : }
1305 :
1306 0 : get_bh(bh);
1307 0 : bh_lru_unlock();
1308 0 : brelse(evictee);
1309 : }
1310 :
1311 : /*
1312 : * Look up the bh in this cpu's LRU. If it's there, move it to the head.
1313 : */
1314 : static struct buffer_head *
1315 0 : lookup_bh_lru(struct block_device *bdev, sector_t block, unsigned size)
1316 : {
1317 0 : struct buffer_head *ret = NULL;
1318 : unsigned int i;
1319 :
1320 0 : check_irqs_on();
1321 0 : bh_lru_lock();
1322 0 : for (i = 0; i < BH_LRU_SIZE; i++) {
1323 0 : struct buffer_head *bh = __this_cpu_read(bh_lrus.bhs[i]);
1324 :
1325 0 : if (bh && bh->b_blocknr == block && bh->b_bdev == bdev &&
1326 0 : bh->b_size == size) {
1327 0 : if (i) {
1328 0 : while (i) {
1329 0 : __this_cpu_write(bh_lrus.bhs[i],
1330 : __this_cpu_read(bh_lrus.bhs[i - 1]));
1331 0 : i--;
1332 : }
1333 0 : __this_cpu_write(bh_lrus.bhs[0], bh);
1334 : }
1335 0 : get_bh(bh);
1336 0 : ret = bh;
1337 0 : break;
1338 : }
1339 : }
1340 0 : bh_lru_unlock();
1341 0 : return ret;
1342 : }
1343 :
1344 : /*
1345 : * Perform a pagecache lookup for the matching buffer. If it's there, refresh
1346 : * it in the LRU and mark it as accessed. If it is not present then return
1347 : * NULL.
1348 : */
1349 : struct buffer_head *
1350 0 : __find_get_block(struct block_device *bdev, sector_t block, unsigned size)
1351 : {
1352 0 : struct buffer_head *bh = lookup_bh_lru(bdev, block, size);
1353 :
1354 0 : if (bh == NULL) {
1355 : /* __find_get_block_slow will mark the page accessed */
1356 0 : bh = __find_get_block_slow(bdev, block);
1357 0 : if (bh)
1358 0 : bh_lru_install(bh);
1359 : } else
1360 : touch_buffer(bh);
1361 :
1362 0 : return bh;
1363 : }
1364 : EXPORT_SYMBOL(__find_get_block);
1365 :
1366 : /*
1367 : * __getblk_gfp() will locate (and, if necessary, create) the buffer_head
1368 : * which corresponds to the passed block_device, block and size. The
1369 : * returned buffer has its reference count incremented.
1370 : *
1371 : * __getblk_gfp() will lock up the machine if grow_dev_page's
1372 : * try_to_free_buffers() attempt is failing. FIXME, perhaps?
1373 : */
1374 : struct buffer_head *
1375 0 : __getblk_gfp(struct block_device *bdev, sector_t block,
1376 : unsigned size, gfp_t gfp)
1377 : {
1378 0 : struct buffer_head *bh = __find_get_block(bdev, block, size);
1379 :
1380 : might_sleep();
1381 0 : if (bh == NULL)
1382 0 : bh = __getblk_slow(bdev, block, size, gfp);
1383 0 : return bh;
1384 : }
1385 : EXPORT_SYMBOL(__getblk_gfp);
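/*
 * A usage sketch, assuming a caller that is about to overwrite a whole
 * block and therefore has no need to read the old contents first
 * (__getblk() is the convenience wrapper around __getblk_gfp()):
 *
 *	bh = __getblk(bdev, block, blocksize);
 *	lock_buffer(bh);
 *	memset(bh->b_data, 0, bh->b_size);
 *	set_buffer_uptodate(bh);
 *	unlock_buffer(bh);
 *	mark_buffer_dirty(bh);
 *	brelse(bh);
 */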
1386 :
1387 : /*
1388 : * Do async read-ahead on a buffer..
1389 : */
1390 0 : void __breadahead(struct block_device *bdev, sector_t block, unsigned size)
1391 : {
1392 0 : struct buffer_head *bh = __getblk(bdev, block, size);
1393 0 : if (likely(bh)) {
1394 0 : bh_readahead(bh, REQ_RAHEAD);
1395 0 : brelse(bh);
1396 : }
1397 0 : }
1398 : EXPORT_SYMBOL(__breadahead);
1399 :
1400 : /**
1401 : * __bread_gfp() - reads a specified block and returns the bh
1402 : * @bdev: the block_device to read from
1403 : * @block: number of block
1404 : * @size: size (in bytes) to read
1405 : * @gfp: page allocation flag
1406 : *
1407 : * Reads a specified block, and returns buffer head that contains it.
1408 : * If @gfp is zero, the page cache can be allocated from a non-movable
1409 : * area so that it does not get in the way of page migration.
1410 : * It returns NULL if the block was unreadable.
1411 : */
1412 : struct buffer_head *
1413 0 : __bread_gfp(struct block_device *bdev, sector_t block,
1414 : unsigned size, gfp_t gfp)
1415 : {
1416 0 : struct buffer_head *bh = __getblk_gfp(bdev, block, size, gfp);
1417 :
1418 0 : if (likely(bh) && !buffer_uptodate(bh))
1419 0 : bh = __bread_slow(bh);
1420 0 : return bh;
1421 : }
1422 : EXPORT_SYMBOL(__bread_gfp);
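/*
 * A usage sketch of the common synchronous read pattern (error handling
 * trimmed; "dst" is just a caller-provided buffer):
 *
 *	struct buffer_head *bh = __bread(bdev, block, blocksize);
 *	if (!bh)
 *		return -EIO;
 *	memcpy(dst, bh->b_data, bh->b_size);
 *	brelse(bh);
 */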
1423 :
1424 : static void __invalidate_bh_lrus(struct bh_lru *b)
1425 : {
1426 : int i;
1427 :
1428 0 : for (i = 0; i < BH_LRU_SIZE; i++) {
1429 0 : brelse(b->bhs[i]);
1430 0 : b->bhs[i] = NULL;
1431 : }
1432 : }
1433 : /*
1434 : * invalidate_bh_lrus() is called rarely - but not only at unmount.
1435 : * This doesn't race because it runs in each cpu either in irq
1436 : * or with preempt disabled.
1437 : */
1438 0 : static void invalidate_bh_lru(void *arg)
1439 : {
1440 0 : struct bh_lru *b = &get_cpu_var(bh_lrus);
1441 :
1442 0 : __invalidate_bh_lrus(b);
1443 0 : put_cpu_var(bh_lrus);
1444 0 : }
1445 :
1446 0 : bool has_bh_in_lru(int cpu, void *dummy)
1447 : {
1448 0 : struct bh_lru *b = per_cpu_ptr(&bh_lrus, cpu);
1449 : int i;
1450 :
1451 0 : for (i = 0; i < BH_LRU_SIZE; i++) {
1452 0 : if (b->bhs[i])
1453 : return true;
1454 : }
1455 :
1456 : return false;
1457 : }
1458 :
1459 0 : void invalidate_bh_lrus(void)
1460 : {
1461 0 : on_each_cpu_cond(has_bh_in_lru, invalidate_bh_lru, NULL, 1);
1462 0 : }
1463 : EXPORT_SYMBOL_GPL(invalidate_bh_lrus);
1464 :
1465 : /*
1466 : * It's called from workqueue context so we need a bh_lru_lock to close
1467 : * the race with preemption/irq.
1468 : */
1469 0 : void invalidate_bh_lrus_cpu(void)
1470 : {
1471 : struct bh_lru *b;
1472 :
1473 0 : bh_lru_lock();
1474 0 : b = this_cpu_ptr(&bh_lrus);
1475 0 : __invalidate_bh_lrus(b);
1476 0 : bh_lru_unlock();
1477 0 : }
1478 :
1479 0 : void set_bh_page(struct buffer_head *bh,
1480 : struct page *page, unsigned long offset)
1481 : {
1482 0 : bh->b_page = page;
1483 0 : BUG_ON(offset >= PAGE_SIZE);
1484 0 : if (PageHighMem(page))
1485 : /*
1486 : * This catches illegal uses and preserves the offset:
1487 : */
1488 : bh->b_data = (char *)(0 + offset);
1489 : else
1490 0 : bh->b_data = page_address(page) + offset;
1491 0 : }
1492 : EXPORT_SYMBOL(set_bh_page);
1493 :
1494 0 : void folio_set_bh(struct buffer_head *bh, struct folio *folio,
1495 : unsigned long offset)
1496 : {
1497 0 : bh->b_folio = folio;
1498 0 : BUG_ON(offset >= folio_size(folio));
1499 0 : if (folio_test_highmem(folio))
1500 : /*
1501 : * This catches illegal uses and preserves the offset:
1502 : */
1503 : bh->b_data = (char *)(0 + offset);
1504 : else
1505 0 : bh->b_data = folio_address(folio) + offset;
1506 0 : }
1507 : EXPORT_SYMBOL(folio_set_bh);
1508 :
1509 : /*
1510 : * Called when truncating a buffer on a page completely.
1511 : */
1512 :
1513 : /* Bits that are cleared during an invalidate */
1514 : #define BUFFER_FLAGS_DISCARD \
1515 : (1 << BH_Mapped | 1 << BH_New | 1 << BH_Req | \
1516 : 1 << BH_Delay | 1 << BH_Unwritten)
1517 :
1518 0 : static void discard_buffer(struct buffer_head * bh)
1519 : {
1520 : unsigned long b_state;
1521 :
1522 0 : lock_buffer(bh);
1523 0 : clear_buffer_dirty(bh);
1524 0 : bh->b_bdev = NULL;
1525 0 : b_state = READ_ONCE(bh->b_state);
1526 : do {
1527 0 : } while (!try_cmpxchg(&bh->b_state, &b_state,
1528 : b_state & ~BUFFER_FLAGS_DISCARD));
1529 0 : unlock_buffer(bh);
1530 0 : }
1531 :
1532 : /**
1533 : * block_invalidate_folio - Invalidate part or all of a buffer-backed folio.
1534 : * @folio: The folio which is affected.
1535 : * @offset: start of the range to invalidate
1536 : * @length: length of the range to invalidate
1537 : *
1538 : * block_invalidate_folio() is called when all or part of the folio has been
1539 : * invalidated by a truncate operation.
1540 : *
1541 : * block_invalidate_folio() does not have to release all buffers, but it must
1542 : * ensure that no dirty buffer is left outside @offset and that no I/O
1543 : * is underway against any of the blocks which are outside the truncation
1544 : * point. Because the caller is about to free (and possibly reuse) those
1545 : * blocks on-disk.
1546 : */
1547 0 : void block_invalidate_folio(struct folio *folio, size_t offset, size_t length)
1548 : {
1549 : struct buffer_head *head, *bh, *next;
1550 0 : size_t curr_off = 0;
1551 0 : size_t stop = length + offset;
1552 :
1553 0 : BUG_ON(!folio_test_locked(folio));
1554 :
1555 : /*
1556 : * Check for overflow
1557 : */
1558 0 : BUG_ON(stop > folio_size(folio) || stop < length);
1559 :
1560 0 : head = folio_buffers(folio);
1561 0 : if (!head)
1562 : return;
1563 :
1564 : bh = head;
1565 : do {
1566 0 : size_t next_off = curr_off + bh->b_size;
1567 0 : next = bh->b_this_page;
1568 :
1569 : /*
1570 : * Are we still fully in range ?
1571 : */
1572 0 : if (next_off > stop)
1573 : goto out;
1574 :
1575 : /*
1576 : * is this block fully invalidated?
1577 : */
1578 0 : if (offset <= curr_off)
1579 0 : discard_buffer(bh);
1580 0 : curr_off = next_off;
1581 0 : bh = next;
1582 0 : } while (bh != head);
1583 :
1584 : /*
1585 : * We release buffers only if the entire folio is being invalidated.
1586 : * The get_block cached value has been unconditionally invalidated,
1587 : * so real IO is not possible anymore.
1588 : */
1589 0 : if (length == folio_size(folio))
1590 0 : filemap_release_folio(folio, 0);
1591 : out:
1592 : return;
1593 : }
1594 : EXPORT_SYMBOL(block_invalidate_folio);
1595 :
1596 : /*
1597 : * We attach and possibly dirty the buffers atomically wrt
1598 : * block_dirty_folio() via private_lock. try_to_free_buffers
1599 : * is already excluded via the folio lock.
1600 : */
1601 0 : void folio_create_empty_buffers(struct folio *folio, unsigned long blocksize,
1602 : unsigned long b_state)
1603 : {
1604 : struct buffer_head *bh, *head, *tail;
1605 :
1606 0 : head = folio_alloc_buffers(folio, blocksize, true);
1607 0 : bh = head;
1608 : do {
1609 0 : bh->b_state |= b_state;
1610 0 : tail = bh;
1611 0 : bh = bh->b_this_page;
1612 0 : } while (bh);
1613 0 : tail->b_this_page = head;
1614 :
1615 0 : spin_lock(&folio->mapping->private_lock);
1616 0 : if (folio_test_uptodate(folio) || folio_test_dirty(folio)) {
1617 : bh = head;
1618 : do {
1619 0 : if (folio_test_dirty(folio))
1620 : set_buffer_dirty(bh);
1621 0 : if (folio_test_uptodate(folio))
1622 : set_buffer_uptodate(bh);
1623 0 : bh = bh->b_this_page;
1624 0 : } while (bh != head);
1625 : }
1626 0 : folio_attach_private(folio, head);
1627 0 : spin_unlock(&folio->mapping->private_lock);
1628 0 : }
1629 : EXPORT_SYMBOL(folio_create_empty_buffers);
1630 :
1631 0 : void create_empty_buffers(struct page *page,
1632 : unsigned long blocksize, unsigned long b_state)
1633 : {
1634 0 : folio_create_empty_buffers(page_folio(page), blocksize, b_state);
1635 0 : }
1636 : EXPORT_SYMBOL(create_empty_buffers);
1637 :
1638 : /**
1639 : * clean_bdev_aliases: clean a range of buffers in block device
1640 : * @bdev: Block device to clean buffers in
1641 : * @block: Start of a range of blocks to clean
1642 : * @len: Number of blocks to clean
1643 : *
1644 : * We are taking a range of blocks for data and we don't want writeback of any
1645 : * buffer-cache aliases starting from return from this function and until the
1646 : * moment when something will explicitly mark the buffer dirty (hopefully that
1647 : * will not happen until we will free that block ;-) We don't even need to mark
1648 : * it not-uptodate - nobody can expect anything from a newly allocated buffer
1649 : * anyway. We used to use unmap_buffer() for such invalidation, but that was
1650 : * wrong. We definitely don't want to mark the alias unmapped, for example - it
1651 : * would confuse anyone who might pick it with bread() afterwards...
1652 : *
1653 : * Also.. Note that bforget() doesn't lock the buffer. So there can be
1654 : * writeout I/O going on against recently-freed buffers. We don't wait on that
1655 : * I/O in bforget() - it's more efficient to wait on the I/O only if we really
1656 : * need to. That happens here.
1657 : */
1658 0 : void clean_bdev_aliases(struct block_device *bdev, sector_t block, sector_t len)
1659 : {
1660 0 : struct inode *bd_inode = bdev->bd_inode;
1661 0 : struct address_space *bd_mapping = bd_inode->i_mapping;
1662 : struct folio_batch fbatch;
1663 0 : pgoff_t index = block >> (PAGE_SHIFT - bd_inode->i_blkbits);
1664 : pgoff_t end;
1665 : int i, count;
1666 : struct buffer_head *bh;
1667 : struct buffer_head *head;
1668 :
1669 0 : end = (block + len - 1) >> (PAGE_SHIFT - bd_inode->i_blkbits);
1670 0 : folio_batch_init(&fbatch);
1671 0 : while (filemap_get_folios(bd_mapping, &index, end, &fbatch)) {
1672 0 : count = folio_batch_count(&fbatch);
1673 0 : for (i = 0; i < count; i++) {
1674 0 : struct folio *folio = fbatch.folios[i];
1675 :
1676 0 : if (!folio_buffers(folio))
1677 0 : continue;
1678 : /*
1679 : * We use folio lock instead of bd_mapping->private_lock
1680 : * to pin buffers here since we can afford to sleep and
1681 : * it scales better than a global spinlock.
1682 : */
1683 0 : folio_lock(folio);
1684 : /* Recheck when the folio is locked which pins bhs */
1685 0 : head = folio_buffers(folio);
1686 0 : if (!head)
1687 : goto unlock_page;
1688 : bh = head;
1689 : do {
1690 0 : if (!buffer_mapped(bh) || (bh->b_blocknr < block))
1691 : goto next;
1692 0 : if (bh->b_blocknr >= block + len)
1693 : break;
1694 0 : clear_buffer_dirty(bh);
1695 0 : wait_on_buffer(bh);
1696 : clear_buffer_req(bh);
1697 : next:
1698 0 : bh = bh->b_this_page;
1699 0 : } while (bh != head);
1700 : unlock_page:
1701 0 : folio_unlock(folio);
1702 : }
1703 0 : folio_batch_release(&fbatch);
1704 0 : cond_resched();
1705 : /* End of range already reached? */
1706 0 : if (index > end || !index)
1707 : break;
1708 : }
1709 0 : }
1710 : EXPORT_SYMBOL(clean_bdev_aliases);
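/*
 * A usage sketch, assuming a filesystem that has just allocated @len
 * on-disk blocks starting at @block for ordinary file data and wants any
 * stale buffer-cache aliases dropped before it starts using them:
 *
 *	clean_bdev_aliases(sb->s_bdev, block, len);
 */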
1711 :
1712 : /*
1713 : * Size is a power-of-two in the range 512..PAGE_SIZE,
1714 : * and the case we care about most is PAGE_SIZE.
1715 : *
1716 : * So this *could* possibly be written with those
1717 : * constraints in mind (relevant mostly if some
1718 : * architecture has a slow bit-scan instruction)
1719 : */
1720 : static inline int block_size_bits(unsigned int blocksize)
1721 : {
1722 0 : return ilog2(blocksize);
1723 : }
1724 :
1725 0 : static struct buffer_head *folio_create_buffers(struct folio *folio,
1726 : struct inode *inode,
1727 : unsigned int b_state)
1728 : {
1729 0 : BUG_ON(!folio_test_locked(folio));
1730 :
1731 0 : if (!folio_buffers(folio))
1732 0 : folio_create_empty_buffers(folio,
1733 0 : 1 << READ_ONCE(inode->i_blkbits),
1734 : b_state);
1735 0 : return folio_buffers(folio);
1736 : }
1737 :
1738 : /*
1739 : * NOTE! All mapped/uptodate combinations are valid:
1740 : *
1741 : * Mapped Uptodate Meaning
1742 : *
1743 : * No No "unknown" - must do get_block()
1744 : * No Yes "hole" - zero-filled
1745 : * Yes No "allocated" - allocated on disk, not read in
1746 : * Yes Yes "valid" - allocated and up-to-date in memory.
1747 : *
1748 : * "Dirty" is valid only with the last case (mapped+uptodate).
1749 : */
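/*
 * Illustrative sketch, not part of buffer.c: how a read path might act on
 * the four mapped/uptodate combinations listed above. The helper name and
 * return convention (0 = nothing to read, 1 = caller must read, <0 = error)
 * are made up; the state predicates are the real buffer_head tests.
 */
static int myfs_prepare_block_read(struct inode *inode, sector_t block,
				   struct buffer_head *bh, get_block_t *get_block)
{
	if (buffer_uptodate(bh))	/* "hole" or "valid": data is in memory */
		return 0;
	if (!buffer_mapped(bh)) {	/* "unknown": ask the filesystem first */
		int err = get_block(inode, block, bh, 0);
		if (err)
			return err;
	}
	if (buffer_mapped(bh) && !buffer_uptodate(bh))
		return 1;		/* "allocated": on disk but not read in */
	return 0;			/* a hole or already uptodate: nothing to read */
}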
1750 :
1751 : /*
1752 : * While block_write_full_page is writing back the dirty buffers under
1753 : * the page lock, whoever dirtied the buffers may decide to clean them
1754 : * again at any time. We handle that by only looking at the buffer
1755 : * state inside lock_buffer().
1756 : *
1757 : * If block_write_full_page() is called for regular writeback
1758 : * (wbc->sync_mode == WB_SYNC_NONE) then it will redirty a page which has a
1759 : * locked buffer. This only can happen if someone has written the buffer
1760 :  * locked buffer. This can only happen if someone has written the buffer
1761 : * prevents this contention from occurring.
1762 : *
1763 : * If block_write_full_page() is called with wbc->sync_mode ==
1764 : * WB_SYNC_ALL, the writes are posted using REQ_SYNC; this
1765 : * causes the writes to be flagged as synchronous writes.
1766 : */
1767 0 : int __block_write_full_page(struct inode *inode, struct page *page,
1768 : get_block_t *get_block, struct writeback_control *wbc,
1769 : bh_end_io_t *handler)
1770 : {
1771 : int err;
1772 : sector_t block;
1773 : sector_t last_block;
1774 : struct buffer_head *bh, *head;
1775 : unsigned int blocksize, bbits;
1776 0 : int nr_underway = 0;
1777 0 : blk_opf_t write_flags = wbc_to_write_flags(wbc);
1778 :
1779 0 : head = folio_create_buffers(page_folio(page), inode,
1780 : (1 << BH_Dirty) | (1 << BH_Uptodate));
1781 :
1782 : /*
1783 : * Be very careful. We have no exclusion from block_dirty_folio
1784 : * here, and the (potentially unmapped) buffers may become dirty at
1785 : * any time. If a buffer becomes dirty here after we've inspected it
1786 : * then we just miss that fact, and the page stays dirty.
1787 : *
1788 : * Buffers outside i_size may be dirtied by block_dirty_folio;
1789 : * handle that here by just cleaning them.
1790 : */
1791 :
1792 0 : bh = head;
1793 0 : blocksize = bh->b_size;
1794 0 : bbits = block_size_bits(blocksize);
1795 :
1796 0 : block = (sector_t)page->index << (PAGE_SHIFT - bbits);
1797 0 : last_block = (i_size_read(inode) - 1) >> bbits;
1798 :
1799 : /*
1800 : * Get all the dirty buffers mapped to disk addresses and
1801 : * handle any aliases from the underlying blockdev's mapping.
1802 : */
1803 : do {
1804 0 : if (block > last_block) {
1805 : /*
1806 : * mapped buffers outside i_size will occur, because
1807 : * this page can be outside i_size when there is a
1808 : * truncate in progress.
1809 : */
1810 : /*
1811 : * The buffer was zeroed by block_write_full_page()
1812 : */
1813 0 : clear_buffer_dirty(bh);
1814 : set_buffer_uptodate(bh);
1815 0 : } else if ((!buffer_mapped(bh) || buffer_delay(bh)) &&
1816 0 : buffer_dirty(bh)) {
1817 0 : WARN_ON(bh->b_size != blocksize);
1818 0 : err = get_block(inode, block, bh, 1);
1819 0 : if (err)
1820 : goto recover;
1821 0 : clear_buffer_delay(bh);
1822 0 : if (buffer_new(bh)) {
1823 : /* blockdev mappings never come here */
1824 0 : clear_buffer_new(bh);
1825 0 : clean_bdev_bh_alias(bh);
1826 : }
1827 : }
1828 0 : bh = bh->b_this_page;
1829 0 : block++;
1830 0 : } while (bh != head);
1831 :
1832 : do {
1833 0 : if (!buffer_mapped(bh))
1834 0 : continue;
1835 : /*
1836 : * If it's a fully non-blocking write attempt and we cannot
1837 : * lock the buffer then redirty the page. Note that this can
1838 : * potentially cause a busy-wait loop from writeback threads
1839 : * and kswapd activity, but those code paths have their own
1840 : * higher-level throttling.
1841 : */
1842 0 : if (wbc->sync_mode != WB_SYNC_NONE) {
1843 : lock_buffer(bh);
1844 0 : } else if (!trylock_buffer(bh)) {
1845 0 : redirty_page_for_writepage(wbc, page);
1846 0 : continue;
1847 : }
1848 0 : if (test_clear_buffer_dirty(bh)) {
1849 : mark_buffer_async_write_endio(bh, handler);
1850 : } else {
1851 : unlock_buffer(bh);
1852 : }
1853 0 : } while ((bh = bh->b_this_page) != head);
1854 :
1855 : /*
1856 : * The page and its buffers are protected by PageWriteback(), so we can
1857 : * drop the bh refcounts early.
1858 : */
1859 0 : BUG_ON(PageWriteback(page));
1860 0 : set_page_writeback(page);
1861 :
1862 : do {
1863 0 : struct buffer_head *next = bh->b_this_page;
1864 0 : if (buffer_async_write(bh)) {
1865 0 : submit_bh_wbc(REQ_OP_WRITE | write_flags, bh, wbc);
1866 0 : nr_underway++;
1867 : }
1868 0 : bh = next;
1869 0 : } while (bh != head);
1870 0 : unlock_page(page);
1871 :
1872 0 : err = 0;
1873 : done:
1874 0 : if (nr_underway == 0) {
1875 : /*
1876 : * The page was marked dirty, but the buffers were
1877 : * clean. Someone wrote them back by hand with
1878 : * write_dirty_buffer/submit_bh. A rare case.
1879 : */
1880 0 : end_page_writeback(page);
1881 :
1882 : /*
1883 : * The page and buffer_heads can be released at any time from
1884 : * here on.
1885 : */
1886 : }
1887 0 : return err;
1888 :
1889 : recover:
1890 : /*
1891 : * ENOSPC, or some other error. We may already have added some
1892 : * blocks to the file, so we need to write these out to avoid
1893 : * exposing stale data.
1894 : * The page is currently locked and not marked for writeback
1895 : */
1896 : bh = head;
1897 : /* Recovery: lock and submit the mapped buffers */
1898 : do {
1899 0 : if (buffer_mapped(bh) && buffer_dirty(bh) &&
1900 0 : !buffer_delay(bh)) {
1901 0 : lock_buffer(bh);
1902 : mark_buffer_async_write_endio(bh, handler);
1903 : } else {
1904 : /*
1905 : * The buffer may have been set dirty during
1906 : * attachment to a dirty page.
1907 : */
1908 : clear_buffer_dirty(bh);
1909 : }
1910 0 : } while ((bh = bh->b_this_page) != head);
1911 0 : SetPageError(page);
1912 0 : BUG_ON(PageWriteback(page));
1913 0 : mapping_set_error(page->mapping, err);
1914 0 : set_page_writeback(page);
1915 : do {
1916 0 : struct buffer_head *next = bh->b_this_page;
1917 0 : if (buffer_async_write(bh)) {
1918 0 : clear_buffer_dirty(bh);
1919 0 : submit_bh_wbc(REQ_OP_WRITE | write_flags, bh, wbc);
1920 0 : nr_underway++;
1921 : }
1922 0 : bh = next;
1923 0 : } while (bh != head);
1924 0 : unlock_page(page);
1925 0 : goto done;
1926 : }
1927 : EXPORT_SYMBOL(__block_write_full_page);
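/*
 * Illustrative sketch, not part of buffer.c: __block_write_full_page() is
 * for filesystems that need a custom per-buffer completion handler instead
 * of plain end_buffer_async_write() (for example to update ordered-write or
 * journalling state). "myfs_journalled_writepage" and "myfs_get_block" are
 * hypothetical; the bh_end_io_t and get_block_t signatures are real.
 */
static void myfs_end_buffer_write(struct buffer_head *bh, int uptodate)
{
	/* Filesystem-specific bookkeeping would go here (hypothetical). */
	end_buffer_async_write(bh, uptodate);
}

static int myfs_journalled_writepage(struct page *page,
				     struct writeback_control *wbc)
{
	return __block_write_full_page(page->mapping->host, page,
				       myfs_get_block, wbc,
				       myfs_end_buffer_write);
}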
1928 :
1929 : /*
1930 : * If a page has any new buffers, zero them out here, and mark them uptodate
1931 : * and dirty so they'll be written out (in order to prevent uninitialised
1932 : * block data from leaking). And clear the new bit.
1933 : */
1934 0 : void page_zero_new_buffers(struct page *page, unsigned from, unsigned to)
1935 : {
1936 : unsigned int block_start, block_end;
1937 : struct buffer_head *head, *bh;
1938 :
1939 0 : BUG_ON(!PageLocked(page));
1940 0 : if (!page_has_buffers(page))
1941 : return;
1942 :
1943 0 : bh = head = page_buffers(page);
1944 0 : block_start = 0;
1945 : do {
1946 0 : block_end = block_start + bh->b_size;
1947 :
1948 0 : if (buffer_new(bh)) {
1949 0 : if (block_end > from && block_start < to) {
1950 0 : if (!PageUptodate(page)) {
1951 : unsigned start, size;
1952 :
1953 0 : start = max(from, block_start);
1954 0 : size = min(to, block_end) - start;
1955 :
1956 0 : zero_user(page, start, size);
1957 : set_buffer_uptodate(bh);
1958 : }
1959 :
1960 0 : clear_buffer_new(bh);
1961 0 : mark_buffer_dirty(bh);
1962 : }
1963 : }
1964 :
1965 0 : block_start = block_end;
1966 0 : bh = bh->b_this_page;
1967 0 : } while (bh != head);
1968 : }
1969 : EXPORT_SYMBOL(page_zero_new_buffers);
1970 :
1971 : static void
1972 0 : iomap_to_bh(struct inode *inode, sector_t block, struct buffer_head *bh,
1973 : const struct iomap *iomap)
1974 : {
1975 0 : loff_t offset = block << inode->i_blkbits;
1976 :
1977 0 : bh->b_bdev = iomap->bdev;
1978 :
1979 : /*
1980 : * Block points to offset in file we need to map, iomap contains
1981 : * the offset at which the map starts. If the map ends before the
1982 : * current block, then do not map the buffer and let the caller
1983 : * handle it.
1984 : */
1985 0 : BUG_ON(offset >= iomap->offset + iomap->length);
1986 :
1987 0 : switch (iomap->type) {
1988 : case IOMAP_HOLE:
1989 : /*
1990 : * If the buffer is not up to date or beyond the current EOF,
1991 : * we need to mark it as new to ensure sub-block zeroing is
1992 : * executed if necessary.
1993 : */
1994 0 : if (!buffer_uptodate(bh) ||
1995 0 : (offset >= i_size_read(inode)))
1996 : set_buffer_new(bh);
1997 : break;
1998 : case IOMAP_DELALLOC:
1999 0 : if (!buffer_uptodate(bh) ||
2000 0 : (offset >= i_size_read(inode)))
2001 : set_buffer_new(bh);
2002 0 : set_buffer_uptodate(bh);
2003 0 : set_buffer_mapped(bh);
2004 : set_buffer_delay(bh);
2005 : break;
2006 : case IOMAP_UNWRITTEN:
2007 : /*
2008 : * For unwritten regions, we always need to ensure that regions
2009 : * in the block we are not writing to are zeroed. Mark the
2010 : * buffer as new to ensure this.
2011 : */
2012 0 : set_buffer_new(bh);
2013 : set_buffer_unwritten(bh);
2014 : fallthrough;
2015 : case IOMAP_MAPPED:
2016 0 : if ((iomap->flags & IOMAP_F_NEW) ||
2017 0 : offset >= i_size_read(inode))
2018 : set_buffer_new(bh);
2019 0 : bh->b_blocknr = (iomap->addr + offset - iomap->offset) >>
2020 0 : inode->i_blkbits;
2021 : set_buffer_mapped(bh);
2022 : break;
2023 : }
2024 0 : }
2025 :
2026 0 : int __block_write_begin_int(struct folio *folio, loff_t pos, unsigned len,
2027 : get_block_t *get_block, const struct iomap *iomap)
2028 : {
2029 0 : unsigned from = pos & (PAGE_SIZE - 1);
2030 0 : unsigned to = from + len;
2031 0 : struct inode *inode = folio->mapping->host;
2032 : unsigned block_start, block_end;
2033 : sector_t block;
2034 0 : int err = 0;
2035 : unsigned blocksize, bbits;
2036 0 : struct buffer_head *bh, *head, *wait[2], **wait_bh=wait;
2037 :
2038 0 : BUG_ON(!folio_test_locked(folio));
2039 : BUG_ON(from > PAGE_SIZE);
2040 0 : BUG_ON(to > PAGE_SIZE);
2041 0 : BUG_ON(from > to);
2042 :
2043 0 : head = folio_create_buffers(folio, inode, 0);
2044 0 : blocksize = head->b_size;
2045 0 : bbits = block_size_bits(blocksize);
2046 :
2047 0 : block = (sector_t)folio->index << (PAGE_SHIFT - bbits);
2048 :
2049 0 : for(bh = head, block_start = 0; bh != head || !block_start;
2050 0 : block++, block_start=block_end, bh = bh->b_this_page) {
2051 0 : block_end = block_start + blocksize;
2052 0 : if (block_end <= from || block_start >= to) {
2053 0 : if (folio_test_uptodate(folio)) {
2054 0 : if (!buffer_uptodate(bh))
2055 : set_buffer_uptodate(bh);
2056 : }
2057 0 : continue;
2058 : }
2059 0 : if (buffer_new(bh))
2060 : clear_buffer_new(bh);
2061 0 : if (!buffer_mapped(bh)) {
2062 0 : WARN_ON(bh->b_size != blocksize);
2063 0 : if (get_block) {
2064 0 : err = get_block(inode, block, bh, 1);
2065 0 : if (err)
2066 : break;
2067 : } else {
2068 0 : iomap_to_bh(inode, block, bh, iomap);
2069 : }
2070 :
2071 0 : if (buffer_new(bh)) {
2072 0 : clean_bdev_bh_alias(bh);
2073 0 : if (folio_test_uptodate(folio)) {
2074 0 : clear_buffer_new(bh);
2075 0 : set_buffer_uptodate(bh);
2076 0 : mark_buffer_dirty(bh);
2077 0 : continue;
2078 : }
2079 0 : if (block_end > to || block_start < from)
2080 0 : folio_zero_segments(folio,
2081 : to, block_end,
2082 : block_start, from);
2083 0 : continue;
2084 : }
2085 : }
2086 0 : if (folio_test_uptodate(folio)) {
2087 0 : if (!buffer_uptodate(bh))
2088 : set_buffer_uptodate(bh);
2089 0 : continue;
2090 : }
2091 0 : if (!buffer_uptodate(bh) && !buffer_delay(bh) &&
2092 0 : !buffer_unwritten(bh) &&
2093 0 : (block_start < from || block_end > to)) {
2094 0 : bh_read_nowait(bh, 0);
2095 0 : *wait_bh++=bh;
2096 : }
2097 : }
2098 : /*
2099 : * If we issued read requests - let them complete.
2100 : */
2101 0 : while(wait_bh > wait) {
2102 0 : wait_on_buffer(*--wait_bh);
2103 0 : if (!buffer_uptodate(*wait_bh))
2104 0 : err = -EIO;
2105 : }
2106 0 : if (unlikely(err))
2107 0 : page_zero_new_buffers(&folio->page, from, to);
2108 0 : return err;
2109 : }
2110 :
2111 0 : int __block_write_begin(struct page *page, loff_t pos, unsigned len,
2112 : get_block_t *get_block)
2113 : {
2114 0 : return __block_write_begin_int(page_folio(page), pos, len, get_block,
2115 : NULL);
2116 : }
2117 : EXPORT_SYMBOL(__block_write_begin);
2118 :
2119 0 : static int __block_commit_write(struct inode *inode, struct page *page,
2120 : unsigned from, unsigned to)
2121 : {
2122 : unsigned block_start, block_end;
2123 0 : int partial = 0;
2124 : unsigned blocksize;
2125 : struct buffer_head *bh, *head;
2126 :
2127 0 : bh = head = page_buffers(page);
2128 0 : blocksize = bh->b_size;
2129 :
2130 0 : block_start = 0;
2131 : do {
2132 0 : block_end = block_start + blocksize;
2133 0 : if (block_end <= from || block_start >= to) {
2134 0 : if (!buffer_uptodate(bh))
2135 0 : partial = 1;
2136 : } else {
2137 0 : set_buffer_uptodate(bh);
2138 0 : mark_buffer_dirty(bh);
2139 : }
2140 0 : if (buffer_new(bh))
2141 : clear_buffer_new(bh);
2142 :
2143 0 : block_start = block_end;
2144 0 : bh = bh->b_this_page;
2145 0 : } while (bh != head);
2146 :
2147 : /*
2148 : * If this is a partial write which happened to make all buffers
2149 : * uptodate then we can optimize away a bogus read_folio() for
2150 : * the next read(). Here we 'discover' whether the page went
2151 : * uptodate as a result of this (potentially partial) write.
2152 : */
2153 0 : if (!partial)
2154 : SetPageUptodate(page);
2155 0 : return 0;
2156 : }
2157 :
2158 : /*
2159 : * block_write_begin takes care of the basic task of block allocation and
2160 : * bringing partial write blocks uptodate first.
2161 : *
2162 : * The filesystem needs to handle block truncation upon failure.
2163 : */
2164 0 : int block_write_begin(struct address_space *mapping, loff_t pos, unsigned len,
2165 : struct page **pagep, get_block_t *get_block)
2166 : {
2167 0 : pgoff_t index = pos >> PAGE_SHIFT;
2168 : struct page *page;
2169 : int status;
2170 :
2171 0 : page = grab_cache_page_write_begin(mapping, index);
2172 0 : if (!page)
2173 : return -ENOMEM;
2174 :
2175 0 : status = __block_write_begin(page, pos, len, get_block);
2176 0 : if (unlikely(status)) {
2177 0 : unlock_page(page);
2178 0 : put_page(page);
2179 0 : page = NULL;
2180 : }
2181 :
2182 0 : *pagep = page;
2183 0 : return status;
2184 : }
2185 : EXPORT_SYMBOL(block_write_begin);
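/*
 * Illustrative sketch, not part of buffer.c: a minimal ->write_begin built
 * on block_write_begin(). As the comment above says, the filesystem must
 * handle block truncation on failure; the truncate helper named here is
 * hypothetical, as is myfs_get_block.
 */
static int myfs_write_begin(struct file *file, struct address_space *mapping,
			    loff_t pos, unsigned len,
			    struct page **pagep, void **fsdata)
{
	int ret;

	ret = block_write_begin(mapping, pos, len, pagep, myfs_get_block);
	if (ret < 0)
		myfs_truncate_failed_write(mapping->host, pos + len);	/* hypothetical */
	return ret;
}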
2186 :
2187 0 : int block_write_end(struct file *file, struct address_space *mapping,
2188 : loff_t pos, unsigned len, unsigned copied,
2189 : struct page *page, void *fsdata)
2190 : {
2191 0 : struct inode *inode = mapping->host;
2192 : unsigned start;
2193 :
2194 0 : start = pos & (PAGE_SIZE - 1);
2195 :
2196 0 : if (unlikely(copied < len)) {
2197 : /*
2198 : * The buffers that were written will now be uptodate, so
2199 : * we don't have to worry about a read_folio reading them
2200 : * and overwriting a partial write. However if we have
2201 : * encountered a short write and only partially written
2202 : * into a buffer, it will not be marked uptodate, so a
2203 : * read_folio might come in and destroy our partial write.
2204 : *
2205 : * Do the simplest thing, and just treat any short write to a
2206 : * non uptodate page as a zero-length write, and force the
2207 : * caller to redo the whole thing.
2208 : */
2209 0 : if (!PageUptodate(page))
2210 0 : copied = 0;
2211 :
2212 0 : page_zero_new_buffers(page, start+copied, start+len);
2213 : }
2214 0 : flush_dcache_page(page);
2215 :
2216 : /* This could be a short (even 0-length) commit */
2217 0 : __block_commit_write(inode, page, start, start+copied);
2218 :
2219 0 : return copied;
2220 : }
2221 : EXPORT_SYMBOL(block_write_end);
2222 :
2223 0 : int generic_write_end(struct file *file, struct address_space *mapping,
2224 : loff_t pos, unsigned len, unsigned copied,
2225 : struct page *page, void *fsdata)
2226 : {
2227 0 : struct inode *inode = mapping->host;
2228 0 : loff_t old_size = inode->i_size;
2229 0 : bool i_size_changed = false;
2230 :
2231 0 : copied = block_write_end(file, mapping, pos, len, copied, page, fsdata);
2232 :
2233 : /*
2234 : * No need to use i_size_read() here, the i_size cannot change under us
2235 : * because we hold i_rwsem.
2236 : *
2237 : * But it's important to update i_size while still holding page lock:
2238 : * page writeout could otherwise come in and zero beyond i_size.
2239 : */
2240 0 : if (pos + copied > inode->i_size) {
2241 0 : i_size_write(inode, pos + copied);
2242 0 : i_size_changed = true;
2243 : }
2244 :
2245 0 : unlock_page(page);
2246 0 : put_page(page);
2247 :
2248 0 : if (old_size < pos)
2249 0 : pagecache_isize_extended(inode, old_size, pos);
2250 : /*
2251 : * Don't mark the inode dirty under page lock. First, it unnecessarily
2252 : * makes the holding time of page lock longer. Second, it forces lock
2253 : * ordering of page lock and transaction start for journaling
2254 : * filesystems.
2255 : */
2256 0 : if (i_size_changed)
2257 : mark_inode_dirty(inode);
2258 0 : return copied;
2259 : }
2260 : EXPORT_SYMBOL(generic_write_end);
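/*
 * Illustrative sketch, not part of buffer.c: how the buffer_head based
 * helpers in this file are typically wired into address_space_operations.
 * The myfs_* callbacks are hypothetical wrappers of the kind sketched around
 * the individual helpers; block_dirty_folio() and block_invalidate_folio()
 * come from <linux/buffer_head.h>.
 */
static const struct address_space_operations myfs_aops = {
	.dirty_folio		= block_dirty_folio,
	.invalidate_folio	= block_invalidate_folio,
	.read_folio		= myfs_read_folio,	/* hypothetical */
	.writepage		= myfs_writepage,	/* hypothetical */
	.write_begin		= myfs_write_begin,	/* hypothetical */
	.write_end		= generic_write_end,
	.bmap			= myfs_bmap,		/* hypothetical */
	.is_partially_uptodate	= block_is_partially_uptodate,
};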
2261 :
2262 : /*
2263 : * block_is_partially_uptodate checks whether buffers within a folio are
2264 : * uptodate or not.
2265 : *
2266 : * Returns true if all buffers which correspond to the specified part
2267 : * of the folio are uptodate.
2268 : */
2269 0 : bool block_is_partially_uptodate(struct folio *folio, size_t from, size_t count)
2270 : {
2271 : unsigned block_start, block_end, blocksize;
2272 : unsigned to;
2273 : struct buffer_head *bh, *head;
2274 0 : bool ret = true;
2275 :
2276 0 : head = folio_buffers(folio);
2277 0 : if (!head)
2278 : return false;
2279 0 : blocksize = head->b_size;
2280 0 : to = min_t(unsigned, folio_size(folio) - from, count);
2281 0 : to = from + to;
2282 0 : if (from < blocksize && to > folio_size(folio) - blocksize)
2283 : return false;
2284 :
2285 : bh = head;
2286 : block_start = 0;
2287 : do {
2288 0 : block_end = block_start + blocksize;
2289 0 : if (block_end > from && block_start < to) {
2290 0 : if (!buffer_uptodate(bh)) {
2291 : ret = false;
2292 : break;
2293 : }
2294 0 : if (block_end >= to)
2295 : break;
2296 : }
2297 0 : block_start = block_end;
2298 0 : bh = bh->b_this_page;
2299 0 : } while (bh != head);
2300 :
2301 : return ret;
2302 : }
2303 : EXPORT_SYMBOL(block_is_partially_uptodate);
2304 :
2305 : /*
2306 : * Generic "read_folio" function for block devices that have the normal
2307 : * get_block functionality. This is most of the block device filesystems.
2308 : * Reads the folio asynchronously --- the unlock_buffer() and
2309 : * set/clear_buffer_uptodate() functions propagate buffer state into the
2310 : * folio once IO has completed.
2311 : */
2312 0 : int block_read_full_folio(struct folio *folio, get_block_t *get_block)
2313 : {
2314 0 : struct inode *inode = folio->mapping->host;
2315 : sector_t iblock, lblock;
2316 : struct buffer_head *bh, *head, *arr[MAX_BUF_PER_PAGE];
2317 : unsigned int blocksize, bbits;
2318 : int nr, i;
2319 0 : int fully_mapped = 1;
2320 0 : bool page_error = false;
2321 0 : loff_t limit = i_size_read(inode);
2322 :
2323 : /* This is needed for ext4. */
2324 : if (IS_ENABLED(CONFIG_FS_VERITY) && IS_VERITY(inode))
2325 : limit = inode->i_sb->s_maxbytes;
2326 :
2327 : VM_BUG_ON_FOLIO(folio_test_large(folio), folio);
2328 :
2329 0 : head = folio_create_buffers(folio, inode, 0);
2330 0 : blocksize = head->b_size;
2331 0 : bbits = block_size_bits(blocksize);
2332 :
2333 0 : iblock = (sector_t)folio->index << (PAGE_SHIFT - bbits);
2334 0 : lblock = (limit+blocksize-1) >> bbits;
2335 0 : bh = head;
2336 0 : nr = 0;
2337 0 : i = 0;
2338 :
2339 : do {
2340 0 : if (buffer_uptodate(bh))
2341 0 : continue;
2342 :
2343 0 : if (!buffer_mapped(bh)) {
2344 0 : int err = 0;
2345 :
2346 0 : fully_mapped = 0;
2347 0 : if (iblock < lblock) {
2348 0 : WARN_ON(bh->b_size != blocksize);
2349 0 : err = get_block(inode, iblock, bh, 0);
2350 0 : if (err) {
2351 0 : folio_set_error(folio);
2352 0 : page_error = true;
2353 : }
2354 : }
2355 0 : if (!buffer_mapped(bh)) {
2356 0 : folio_zero_range(folio, i * blocksize,
2357 : blocksize);
2358 0 : if (!err)
2359 : set_buffer_uptodate(bh);
2360 0 : continue;
2361 : }
2362 : /*
2363 : * get_block() might have updated the buffer
2364 : * synchronously
2365 : */
2366 0 : if (buffer_uptodate(bh))
2367 0 : continue;
2368 : }
2369 0 : arr[nr++] = bh;
2370 0 : } while (i++, iblock++, (bh = bh->b_this_page) != head);
2371 :
2372 0 : if (fully_mapped)
2373 : folio_set_mappedtodisk(folio);
2374 :
2375 0 : if (!nr) {
2376 : /*
2377 : * All buffers are uptodate - we can set the folio uptodate
2378 : * as well. But not if get_block() returned an error.
2379 : */
2380 0 : if (!page_error)
2381 : folio_mark_uptodate(folio);
2382 0 : folio_unlock(folio);
2383 0 : return 0;
2384 : }
2385 :
2386 : /* Stage two: lock the buffers */
2387 0 : for (i = 0; i < nr; i++) {
2388 0 : bh = arr[i];
2389 0 : lock_buffer(bh);
2390 0 : mark_buffer_async_read(bh);
2391 : }
2392 :
2393 : /*
2394 : * Stage 3: start the IO. Check for uptodateness
2395 : * inside the buffer lock in case another process reading
2396 : * the underlying blockdev brought it uptodate (the sct fix).
2397 : */
2398 0 : for (i = 0; i < nr; i++) {
2399 0 : bh = arr[i];
2400 0 : if (buffer_uptodate(bh))
2401 0 : end_buffer_async_read(bh, 1);
2402 : else
2403 : submit_bh(REQ_OP_READ, bh);
2404 : }
2405 : return 0;
2406 : }
2407 : EXPORT_SYMBOL(block_read_full_folio);
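/*
 * Illustrative sketch, not part of buffer.c: a ->read_folio implementation
 * is usually just a thin wrapper that supplies the filesystem's get_block
 * routine. myfs_get_block is hypothetical.
 */
static int myfs_read_folio(struct file *file, struct folio *folio)
{
	return block_read_full_folio(folio, myfs_get_block);
}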
2408 :
2409 : /* utility function for filesystems that need to do work on expanding
2410 : * truncates. Uses filesystem pagecache writes to allow the filesystem to
2411 : * deal with the hole.
2412 : */
2413 0 : int generic_cont_expand_simple(struct inode *inode, loff_t size)
2414 : {
2415 0 : struct address_space *mapping = inode->i_mapping;
2416 0 : const struct address_space_operations *aops = mapping->a_ops;
2417 : struct page *page;
2418 0 : void *fsdata = NULL;
2419 : int err;
2420 :
2421 0 : err = inode_newsize_ok(inode, size);
2422 0 : if (err)
2423 : goto out;
2424 :
2425 0 : err = aops->write_begin(NULL, mapping, size, 0, &page, &fsdata);
2426 0 : if (err)
2427 : goto out;
2428 :
2429 0 : err = aops->write_end(NULL, mapping, size, 0, 0, page, fsdata);
2430 0 : BUG_ON(err > 0);
2431 :
2432 : out:
2433 0 : return err;
2434 : }
2435 : EXPORT_SYMBOL(generic_cont_expand_simple);
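/*
 * Illustrative sketch, not part of buffer.c: a filesystem that cannot
 * represent holes can call generic_cont_expand_simple() from its ->setattr
 * path when the file grows, so the newly exposed tail is instantiated and
 * zeroed through the page cache. The surrounding setattr plumbing is elided;
 * only the size check and the call are the point here.
 */
static int myfs_extend_on_setattr(struct inode *inode, struct iattr *attr)
{
	if ((attr->ia_valid & ATTR_SIZE) && attr->ia_size > i_size_read(inode))
		return generic_cont_expand_simple(inode, attr->ia_size);
	return 0;
}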
2436 :
2437 0 : static int cont_expand_zero(struct file *file, struct address_space *mapping,
2438 : loff_t pos, loff_t *bytes)
2439 : {
2440 0 : struct inode *inode = mapping->host;
2441 0 : const struct address_space_operations *aops = mapping->a_ops;
2442 0 : unsigned int blocksize = i_blocksize(inode);
2443 : struct page *page;
2444 0 : void *fsdata = NULL;
2445 : pgoff_t index, curidx;
2446 : loff_t curpos;
2447 : unsigned zerofrom, offset, len;
2448 0 : int err = 0;
2449 :
2450 0 : index = pos >> PAGE_SHIFT;
2451 0 : offset = pos & ~PAGE_MASK;
2452 :
2453 0 : while (index > (curidx = (curpos = *bytes)>>PAGE_SHIFT)) {
2454 0 : zerofrom = curpos & ~PAGE_MASK;
2455 0 : if (zerofrom & (blocksize-1)) {
2456 0 : *bytes |= (blocksize-1);
2457 0 : (*bytes)++;
2458 : }
2459 0 : len = PAGE_SIZE - zerofrom;
2460 :
2461 0 : err = aops->write_begin(file, mapping, curpos, len,
2462 : &page, &fsdata);
2463 0 : if (err)
2464 : goto out;
2465 0 : zero_user(page, zerofrom, len);
2466 0 : err = aops->write_end(file, mapping, curpos, len, len,
2467 : page, fsdata);
2468 0 : if (err < 0)
2469 : goto out;
2470 0 : BUG_ON(err != len);
2471 0 : err = 0;
2472 :
2473 0 : balance_dirty_pages_ratelimited(mapping);
2474 :
2475 0 : if (fatal_signal_pending(current)) {
2476 : err = -EINTR;
2477 : goto out;
2478 : }
2479 : }
2480 :
2481 : /* page covers the boundary, find the boundary offset */
2482 0 : if (index == curidx) {
2483 0 : zerofrom = curpos & ~PAGE_MASK;
2484 : /* if we are expanding the file, the last block will be filled */
2485 0 : if (offset <= zerofrom) {
2486 : goto out;
2487 : }
2488 0 : if (zerofrom & (blocksize-1)) {
2489 0 : *bytes |= (blocksize-1);
2490 0 : (*bytes)++;
2491 : }
2492 0 : len = offset - zerofrom;
2493 :
2494 0 : err = aops->write_begin(file, mapping, curpos, len,
2495 : &page, &fsdata);
2496 0 : if (err)
2497 : goto out;
2498 0 : zero_user(page, zerofrom, len);
2499 0 : err = aops->write_end(file, mapping, curpos, len, len,
2500 : page, fsdata);
2501 0 : if (err < 0)
2502 : goto out;
2503 0 : BUG_ON(err != len);
2504 : err = 0;
2505 : }
2506 : out:
2507 0 : return err;
2508 : }
2509 :
2510 : /*
2511 :  * For moronic filesystems that do not allow holes in files.
2512 : * We may have to extend the file.
2513 : */
2514 0 : int cont_write_begin(struct file *file, struct address_space *mapping,
2515 : loff_t pos, unsigned len,
2516 : struct page **pagep, void **fsdata,
2517 : get_block_t *get_block, loff_t *bytes)
2518 : {
2519 0 : struct inode *inode = mapping->host;
2520 0 : unsigned int blocksize = i_blocksize(inode);
2521 : unsigned int zerofrom;
2522 : int err;
2523 :
2524 0 : err = cont_expand_zero(file, mapping, pos, bytes);
2525 0 : if (err)
2526 : return err;
2527 :
2528 0 : zerofrom = *bytes & ~PAGE_MASK;
2529 0 : if (pos+len > *bytes && zerofrom & (blocksize-1)) {
2530 0 : *bytes |= (blocksize-1);
2531 0 : (*bytes)++;
2532 : }
2533 :
2534 0 : return block_write_begin(mapping, pos, len, pagep, get_block);
2535 : }
2536 : EXPORT_SYMBOL(cont_write_begin);
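/*
 * Illustrative sketch, not part of buffer.c: cont_write_begin() needs a
 * pointer to the filesystem's notion of "bytes initialised so far" so it can
 * zero-fill everything between the old end of data and the write position.
 * The per-inode loff_t field used here (i_allocated_bytes in a hypothetical
 * myfs_inode_info, reached via MYFS_I()) and myfs_get_block are assumptions
 * made only for the example.
 */
static int myfs_cont_write_begin(struct file *file,
				 struct address_space *mapping,
				 loff_t pos, unsigned len,
				 struct page **pagep, void **fsdata)
{
	struct myfs_inode_info *mi = MYFS_I(mapping->host);	/* hypothetical */

	return cont_write_begin(file, mapping, pos, len, pagep, fsdata,
				myfs_get_block, &mi->i_allocated_bytes);
}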
2537 :
2538 0 : int block_commit_write(struct page *page, unsigned from, unsigned to)
2539 : {
2540 0 : struct inode *inode = page->mapping->host;
2541 0 : __block_commit_write(inode,page,from,to);
2542 0 : return 0;
2543 : }
2544 : EXPORT_SYMBOL(block_commit_write);
2545 :
2546 : /*
2547 : * block_page_mkwrite() is not allowed to change the file size as it gets
2548 : * called from a page fault handler when a page is first dirtied. Hence we must
2549 : * be careful to check for EOF conditions here. We set the page up correctly
2550 : * for a written page which means we get ENOSPC checking when writing into
2551 : * holes and correct delalloc and unwritten extent mapping on filesystems that
2552 : * support these features.
2553 : *
2554 :  * We are not allowed to take the i_rwsem here so we have to play games to
2555 : * protect against truncate races as the page could now be beyond EOF. Because
2556 : * truncate writes the inode size before removing pages, once we have the
2557 : * page lock we can determine safely if the page is beyond EOF. If it is not
2558 : * beyond EOF, then the page is guaranteed safe against truncation until we
2559 : * unlock the page.
2560 : *
2561 : * Direct callers of this function should protect against filesystem freezing
2562 : * using sb_start_pagefault() - sb_end_pagefault() functions.
2563 : */
2564 0 : int block_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf,
2565 : get_block_t get_block)
2566 : {
2567 0 : struct page *page = vmf->page;
2568 0 : struct inode *inode = file_inode(vma->vm_file);
2569 : unsigned long end;
2570 : loff_t size;
2571 : int ret;
2572 :
2573 0 : lock_page(page);
2574 0 : size = i_size_read(inode);
2575 0 : if ((page->mapping != inode->i_mapping) ||
2576 0 : (page_offset(page) > size)) {
2577 : /* We overload EFAULT to mean page got truncated */
2578 : ret = -EFAULT;
2579 : goto out_unlock;
2580 : }
2581 :
2582 : /* page is wholly or partially inside EOF */
2583 0 : if (((page->index + 1) << PAGE_SHIFT) > size)
2584 0 : end = size & ~PAGE_MASK;
2585 : else
2586 : end = PAGE_SIZE;
2587 :
2588 0 : ret = __block_write_begin(page, 0, end, get_block);
2589 0 : if (!ret)
2590 0 : ret = block_commit_write(page, 0, end);
2591 :
2592 0 : if (unlikely(ret < 0))
2593 : goto out_unlock;
2594 0 : set_page_dirty(page);
2595 0 : wait_for_stable_page(page);
2596 0 : return 0;
2597 : out_unlock:
2598 0 : unlock_page(page);
2599 0 : return ret;
2600 : }
2601 : EXPORT_SYMBOL(block_page_mkwrite);
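/*
 * Illustrative sketch, not part of buffer.c: a ->page_mkwrite handler built
 * on block_page_mkwrite(). The freeze protection is the caller's job, as the
 * comment above says; block_page_mkwrite_return() (from
 * <linux/buffer_head.h>) converts the errno into a VM_FAULT_* code.
 * myfs_get_block is hypothetical.
 */
static vm_fault_t myfs_page_mkwrite(struct vm_fault *vmf)
{
	struct inode *inode = file_inode(vmf->vma->vm_file);
	int err;

	sb_start_pagefault(inode->i_sb);
	err = block_page_mkwrite(vmf->vma, vmf, myfs_get_block);
	sb_end_pagefault(inode->i_sb);
	return block_page_mkwrite_return(err);
}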
2602 :
2603 0 : int block_truncate_page(struct address_space *mapping,
2604 : loff_t from, get_block_t *get_block)
2605 : {
2606 0 : pgoff_t index = from >> PAGE_SHIFT;
2607 0 : unsigned offset = from & (PAGE_SIZE-1);
2608 : unsigned blocksize;
2609 : sector_t iblock;
2610 : unsigned length, pos;
2611 0 : struct inode *inode = mapping->host;
2612 : struct page *page;
2613 : struct buffer_head *bh;
2614 0 : int err = 0;
2615 :
2616 0 : blocksize = i_blocksize(inode);
2617 0 : length = offset & (blocksize - 1);
2618 :
2619 : /* Block boundary? Nothing to do */
2620 0 : if (!length)
2621 : return 0;
2622 :
2623 0 : length = blocksize - length;
2624 0 : iblock = (sector_t)index << (PAGE_SHIFT - inode->i_blkbits);
2625 :
2626 0 : page = grab_cache_page(mapping, index);
2627 0 : if (!page)
2628 : return -ENOMEM;
2629 :
2630 0 : if (!page_has_buffers(page))
2631 0 : create_empty_buffers(page, blocksize, 0);
2632 :
2633 : /* Find the buffer that contains "offset" */
2634 0 : bh = page_buffers(page);
2635 0 : pos = blocksize;
2636 0 : while (offset >= pos) {
2637 0 : bh = bh->b_this_page;
2638 0 : iblock++;
2639 0 : pos += blocksize;
2640 : }
2641 :
2642 0 : if (!buffer_mapped(bh)) {
2643 0 : WARN_ON(bh->b_size != blocksize);
2644 0 : err = get_block(inode, iblock, bh, 0);
2645 0 : if (err)
2646 : goto unlock;
2647 : /* unmapped? It's a hole - nothing to do */
2648 0 : if (!buffer_mapped(bh))
2649 : goto unlock;
2650 : }
2651 :
2652 : /* Ok, it's mapped. Make sure it's up-to-date */
2653 0 : if (PageUptodate(page))
2654 : set_buffer_uptodate(bh);
2655 :
2656 0 : if (!buffer_uptodate(bh) && !buffer_delay(bh) && !buffer_unwritten(bh)) {
2657 0 : err = bh_read(bh, 0);
2658 : /* Uhhuh. Read error. Complain and punt. */
2659 0 : if (err < 0)
2660 : goto unlock;
2661 : }
2662 :
2663 0 : zero_user(page, offset, length);
2664 0 : mark_buffer_dirty(bh);
2665 :
2666 : unlock:
2667 0 : unlock_page(page);
2668 0 : put_page(page);
2669 :
2670 0 : return err;
2671 : }
2672 : EXPORT_SYMBOL(block_truncate_page);
2673 :
2674 : /*
2675 : * The generic ->writepage function for buffer-backed address_spaces
2676 : */
2677 0 : int block_write_full_page(struct page *page, get_block_t *get_block,
2678 : struct writeback_control *wbc)
2679 : {
2680 0 : struct inode * const inode = page->mapping->host;
2681 0 : loff_t i_size = i_size_read(inode);
2682 0 : const pgoff_t end_index = i_size >> PAGE_SHIFT;
2683 : unsigned offset;
2684 :
2685 : /* Is the page fully inside i_size? */
2686 0 : if (page->index < end_index)
2687 0 : return __block_write_full_page(inode, page, get_block, wbc,
2688 : end_buffer_async_write);
2689 :
2690 : /* Is the page fully outside i_size? (truncate in progress) */
2691 0 : offset = i_size & (PAGE_SIZE-1);
2692 0 : if (page->index >= end_index+1 || !offset) {
2693 0 : unlock_page(page);
2694 0 : return 0; /* don't care */
2695 : }
2696 :
2697 : /*
2698 : * The page straddles i_size. It must be zeroed out on each and every
2699 : * writepage invocation because it may be mmapped. "A file is mapped
2700 : * in multiples of the page size. For a file that is not a multiple of
2701 : * the page size, the remaining memory is zeroed when mapped, and
2702 : * writes to that region are not written out to the file."
2703 : */
2704 0 : zero_user_segment(page, offset, PAGE_SIZE);
2705 0 : return __block_write_full_page(inode, page, get_block, wbc,
2706 : end_buffer_async_write);
2707 : }
2708 : EXPORT_SYMBOL(block_write_full_page);
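/*
 * Illustrative sketch, not part of buffer.c: when no custom completion
 * handler is needed, ->writepage can simply call block_write_full_page(),
 * which also takes care of the partial page at EOF as described above.
 * myfs_get_block is hypothetical.
 */
static int myfs_writepage(struct page *page, struct writeback_control *wbc)
{
	return block_write_full_page(page, myfs_get_block, wbc);
}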
2709 :
2710 0 : sector_t generic_block_bmap(struct address_space *mapping, sector_t block,
2711 : get_block_t *get_block)
2712 : {
2713 0 : struct inode *inode = mapping->host;
2714 0 : struct buffer_head tmp = {
2715 0 : .b_size = i_blocksize(inode),
2716 : };
2717 :
2718 0 : get_block(inode, block, &tmp, 0);
2719 0 : return tmp.b_blocknr;
2720 : }
2721 : EXPORT_SYMBOL(generic_block_bmap);
2722 :
2723 0 : static void end_bio_bh_io_sync(struct bio *bio)
2724 : {
2725 0 : struct buffer_head *bh = bio->bi_private;
2726 :
2727 0 : if (unlikely(bio_flagged(bio, BIO_QUIET)))
2728 0 : set_bit(BH_Quiet, &bh->b_state);
2729 :
2730 0 : bh->b_end_io(bh, !bio->bi_status);
2731 0 : bio_put(bio);
2732 0 : }
2733 :
2734 0 : static void submit_bh_wbc(blk_opf_t opf, struct buffer_head *bh,
2735 : struct writeback_control *wbc)
2736 : {
2737 0 : const enum req_op op = opf & REQ_OP_MASK;
2738 : struct bio *bio;
2739 :
2740 0 : BUG_ON(!buffer_locked(bh));
2741 0 : BUG_ON(!buffer_mapped(bh));
2742 0 : BUG_ON(!bh->b_end_io);
2743 0 : BUG_ON(buffer_delay(bh));
2744 0 : BUG_ON(buffer_unwritten(bh));
2745 :
2746 : /*
2747 : * Only clear out a write error when rewriting
2748 : */
2749 0 : if (test_set_buffer_req(bh) && (op == REQ_OP_WRITE))
2750 : clear_buffer_write_io_error(bh);
2751 :
2752 0 : if (buffer_meta(bh))
2753 0 : opf |= REQ_META;
2754 0 : if (buffer_prio(bh))
2755 0 : opf |= REQ_PRIO;
2756 :
2757 0 : bio = bio_alloc(bh->b_bdev, 1, opf, GFP_NOIO);
2758 :
2759 0 : fscrypt_set_bio_crypt_ctx_bh(bio, bh, GFP_NOIO);
2760 :
2761 0 : bio->bi_iter.bi_sector = bh->b_blocknr * (bh->b_size >> 9);
2762 :
2763 0 : bio_add_page(bio, bh->b_page, bh->b_size, bh_offset(bh));
2764 0 : BUG_ON(bio->bi_iter.bi_size != bh->b_size);
2765 :
2766 0 : bio->bi_end_io = end_bio_bh_io_sync;
2767 0 : bio->bi_private = bh;
2768 :
2769 : /* Take care of bh's that straddle the end of the device */
2770 0 : guard_bio_eod(bio);
2771 :
2772 : if (wbc) {
2773 : wbc_init_bio(wbc, bio);
2774 : wbc_account_cgroup_owner(wbc, bh->b_page, bh->b_size);
2775 : }
2776 :
2777 0 : submit_bio(bio);
2778 0 : }
2779 :
2780 0 : void submit_bh(blk_opf_t opf, struct buffer_head *bh)
2781 : {
2782 0 : submit_bh_wbc(opf, bh, NULL);
2783 0 : }
2784 : EXPORT_SYMBOL(submit_bh);
2785 :
2786 0 : void write_dirty_buffer(struct buffer_head *bh, blk_opf_t op_flags)
2787 : {
2788 0 : lock_buffer(bh);
2789 0 : if (!test_clear_buffer_dirty(bh)) {
2790 : unlock_buffer(bh);
2791 : return;
2792 : }
2793 0 : bh->b_end_io = end_buffer_write_sync;
2794 0 : get_bh(bh);
2795 0 : submit_bh(REQ_OP_WRITE | op_flags, bh);
2796 : }
2797 : EXPORT_SYMBOL(write_dirty_buffer);
2798 :
2799 : /*
2800 : * For a data-integrity writeout, we need to wait upon any in-progress I/O
2801 : * and then start new I/O and then wait upon it. The caller must have a ref on
2802 : * the buffer_head.
2803 : */
2804 0 : int __sync_dirty_buffer(struct buffer_head *bh, blk_opf_t op_flags)
2805 : {
2806 0 : WARN_ON(atomic_read(&bh->b_count) < 1);
2807 0 : lock_buffer(bh);
2808 0 : if (test_clear_buffer_dirty(bh)) {
2809 : /*
2810 : * The bh should be mapped, but it might not be if the
2811 : * device was hot-removed. Not much we can do but fail the I/O.
2812 : */
2813 0 : if (!buffer_mapped(bh)) {
2814 0 : unlock_buffer(bh);
2815 0 : return -EIO;
2816 : }
2817 :
2818 0 : get_bh(bh);
2819 0 : bh->b_end_io = end_buffer_write_sync;
2820 0 : submit_bh(REQ_OP_WRITE | op_flags, bh);
2821 0 : wait_on_buffer(bh);
2822 0 : if (!buffer_uptodate(bh))
2823 : return -EIO;
2824 : } else {
2825 : unlock_buffer(bh);
2826 : }
2827 : return 0;
2828 : }
2829 : EXPORT_SYMBOL(__sync_dirty_buffer);
2830 :
2831 0 : int sync_dirty_buffer(struct buffer_head *bh)
2832 : {
2833 0 : return __sync_dirty_buffer(bh, REQ_SYNC);
2834 : }
2835 : EXPORT_SYMBOL(sync_dirty_buffer);
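/*
 * Illustrative sketch, not part of buffer.c: the classic pattern for a
 * synchronous metadata update - read the block, modify it, mark the buffer
 * dirty and wait for the write with sync_dirty_buffer(). sb_bread() is the
 * ordinary buffer-cache read helper from <linux/buffer_head.h>; the on-disk
 * layout details are hypothetical.
 */
static int myfs_update_super_block(struct super_block *sb, sector_t block,
				   const void *data, size_t len)
{
	struct buffer_head *bh;
	int err;

	bh = sb_bread(sb, block);
	if (!bh)
		return -EIO;
	memcpy(bh->b_data, data, len);		/* assumes len <= bh->b_size */
	mark_buffer_dirty(bh);
	err = sync_dirty_buffer(bh);		/* waits for old and new I/O */
	brelse(bh);
	return err;
}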
2836 :
2837 : /*
2838 : * try_to_free_buffers() checks if all the buffers on this particular folio
2839 : * are unused, and releases them if so.
2840 : *
2841 : * Exclusion against try_to_free_buffers may be obtained by either
2842 : * locking the folio or by holding its mapping's private_lock.
2843 : *
2844 : * If the folio is dirty but all the buffers are clean then we need to
2845 : * be sure to mark the folio clean as well. This is because the folio
2846 : * may be against a block device, and a later reattachment of buffers
2847 : * to a dirty folio will set *all* buffers dirty. Which would corrupt
2848 : * filesystem data on the same device.
2849 : *
2850 : * The same applies to regular filesystem folios: if all the buffers are
2851 : * clean then we set the folio clean and proceed. To do that, we require
2852 : * total exclusion from block_dirty_folio(). That is obtained with
2853 : * private_lock.
2854 : *
2855 : * try_to_free_buffers() is non-blocking.
2856 : */
2857 : static inline int buffer_busy(struct buffer_head *bh)
2858 : {
2859 0 : return atomic_read(&bh->b_count) |
2860 0 : (bh->b_state & ((1 << BH_Dirty) | (1 << BH_Lock)));
2861 : }
2862 :
2863 : static bool
2864 0 : drop_buffers(struct folio *folio, struct buffer_head **buffers_to_free)
2865 : {
2866 0 : struct buffer_head *head = folio_buffers(folio);
2867 : struct buffer_head *bh;
2868 :
2869 0 : bh = head;
2870 : do {
2871 0 : if (buffer_busy(bh))
2872 : goto failed;
2873 0 : bh = bh->b_this_page;
2874 0 : } while (bh != head);
2875 :
2876 : do {
2877 0 : struct buffer_head *next = bh->b_this_page;
2878 :
2879 0 : if (bh->b_assoc_map)
2880 0 : __remove_assoc_queue(bh);
2881 0 : bh = next;
2882 0 : } while (bh != head);
2883 0 : *buffers_to_free = head;
2884 0 : folio_detach_private(folio);
2885 0 : return true;
2886 : failed:
2887 : return false;
2888 : }
2889 :
2890 0 : bool try_to_free_buffers(struct folio *folio)
2891 : {
2892 0 : struct address_space * const mapping = folio->mapping;
2893 0 : struct buffer_head *buffers_to_free = NULL;
2894 0 : bool ret = 0;
2895 :
2896 0 : BUG_ON(!folio_test_locked(folio));
2897 0 : if (folio_test_writeback(folio))
2898 : return false;
2899 :
2900 0 : if (mapping == NULL) { /* can this still happen? */
2901 0 : ret = drop_buffers(folio, &buffers_to_free);
2902 0 : goto out;
2903 : }
2904 :
2905 0 : spin_lock(&mapping->private_lock);
2906 0 : ret = drop_buffers(folio, &buffers_to_free);
2907 :
2908 : /*
2909 : * If the filesystem writes its buffers by hand (eg ext3)
2910 : * then we can have clean buffers against a dirty folio. We
2911 : * clean the folio here; otherwise the VM will never notice
2912 : * that the filesystem did any IO at all.
2913 : *
2914 : * Also, during truncate, discard_buffer will have marked all
2915 : * the folio's buffers clean. We discover that here and clean
2916 : * the folio also.
2917 : *
2918 : * private_lock must be held over this entire operation in order
2919 : * to synchronise against block_dirty_folio and prevent the
2920 : * dirty bit from being lost.
2921 : */
2922 0 : if (ret)
2923 : folio_cancel_dirty(folio);
2924 0 : spin_unlock(&mapping->private_lock);
2925 : out:
2926 0 : if (buffers_to_free) {
2927 : struct buffer_head *bh = buffers_to_free;
2928 :
2929 : do {
2930 0 : struct buffer_head *next = bh->b_this_page;
2931 0 : free_buffer_head(bh);
2932 0 : bh = next;
2933 0 : } while (bh != buffers_to_free);
2934 : }
2935 : return ret;
2936 : }
2937 : EXPORT_SYMBOL(try_to_free_buffers);
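/*
 * Illustrative sketch, not part of buffer.c: a ->release_folio that lets the
 * filesystem veto freeing (for instance while the folio's buffers are still
 * referenced by its journal) before falling back to try_to_free_buffers().
 * myfs_folio_has_journalled_data() is hypothetical; the release_folio
 * signature is the real aops one.
 */
static bool myfs_release_folio(struct folio *folio, gfp_t gfp)
{
	if (myfs_folio_has_journalled_data(folio))	/* hypothetical */
		return false;
	return try_to_free_buffers(folio);
}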
2938 :
2939 : /*
2940 : * Buffer-head allocation
2941 : */
2942 : static struct kmem_cache *bh_cachep __read_mostly;
2943 :
2944 : /*
2945 : * Once the number of bh's in the machine exceeds this level, we start
2946 : * stripping them in writeback.
2947 : */
2948 : static unsigned long max_buffer_heads;
2949 :
2950 : int buffer_heads_over_limit;
2951 :
2952 : struct bh_accounting {
2953 : int nr; /* Number of live bh's */
2954 : int ratelimit; /* Limit cacheline bouncing */
2955 : };
2956 :
2957 : static DEFINE_PER_CPU(struct bh_accounting, bh_accounting) = {0, 0};
2958 :
2959 : static void recalc_bh_state(void)
2960 : {
2961 : int i;
2962 0 : int tot = 0;
2963 :
2964 0 : if (__this_cpu_inc_return(bh_accounting.ratelimit) - 1 < 4096)
2965 : return;
2966 0 : __this_cpu_write(bh_accounting.ratelimit, 0);
2967 0 : for_each_online_cpu(i)
2968 0 : tot += per_cpu(bh_accounting, i).nr;
2969 0 : buffer_heads_over_limit = (tot > max_buffer_heads);
2970 : }
2971 :
2972 0 : struct buffer_head *alloc_buffer_head(gfp_t gfp_flags)
2973 : {
2974 0 : struct buffer_head *ret = kmem_cache_zalloc(bh_cachep, gfp_flags);
2975 0 : if (ret) {
2976 0 : INIT_LIST_HEAD(&ret->b_assoc_buffers);
2977 0 : spin_lock_init(&ret->b_uptodate_lock);
2978 0 : preempt_disable();
2979 0 : __this_cpu_inc(bh_accounting.nr);
2980 0 : recalc_bh_state();
2981 0 : preempt_enable();
2982 : }
2983 0 : return ret;
2984 : }
2985 : EXPORT_SYMBOL(alloc_buffer_head);
2986 :
2987 0 : void free_buffer_head(struct buffer_head *bh)
2988 : {
2989 0 : BUG_ON(!list_empty(&bh->b_assoc_buffers));
2990 0 : kmem_cache_free(bh_cachep, bh);
2991 0 : preempt_disable();
2992 0 : __this_cpu_dec(bh_accounting.nr);
2993 0 : recalc_bh_state();
2994 0 : preempt_enable();
2995 0 : }
2996 : EXPORT_SYMBOL(free_buffer_head);
2997 :
2998 0 : static int buffer_exit_cpu_dead(unsigned int cpu)
2999 : {
3000 : int i;
3001 0 : struct bh_lru *b = &per_cpu(bh_lrus, cpu);
3002 :
3003 0 : for (i = 0; i < BH_LRU_SIZE; i++) {
3004 0 : brelse(b->bhs[i]);
3005 0 : b->bhs[i] = NULL;
3006 : }
3007 0 : this_cpu_add(bh_accounting.nr, per_cpu(bh_accounting, cpu).nr);
3008 0 : per_cpu(bh_accounting, cpu).nr = 0;
3009 0 : return 0;
3010 : }
3011 :
3012 : /**
3013 : * bh_uptodate_or_lock - Test whether the buffer is uptodate
3014 : * @bh: struct buffer_head
3015 : *
3016 : * Return true if the buffer is up-to-date and false,
3017 :  * Returns 1 if the buffer is up-to-date, or 0 with the
3018 :  * buffer locked if it is not.
3019 0 : int bh_uptodate_or_lock(struct buffer_head *bh)
3020 : {
3021 0 : if (!buffer_uptodate(bh)) {
3022 0 : lock_buffer(bh);
3023 0 : if (!buffer_uptodate(bh))
3024 : return 0;
3025 : unlock_buffer(bh);
3026 : }
3027 : return 1;
3028 : }
3029 : EXPORT_SYMBOL(bh_uptodate_or_lock);
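/*
 * Illustrative sketch, not part of buffer.c: bh_uptodate_or_lock() pairs
 * naturally with __bh_read() - if the buffer is already uptodate nothing is
 * locked and nothing is read, otherwise we hold the lock that __bh_read()
 * expects. This is essentially what the bh_read() wrapper in
 * <linux/buffer_head.h> does.
 */
static int myfs_read_block_if_needed(struct buffer_head *bh)
{
	if (bh_uptodate_or_lock(bh))
		return 0;
	/* Buffer is locked and not uptodate: submit the read and wait. */
	return __bh_read(bh, 0, true);
}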
3030 :
3031 : /**
3032 : * __bh_read - Submit read for a locked buffer
3033 : * @bh: struct buffer_head
3034 :  * @op_flags: extra REQ_* flags to be ORed with REQ_OP_READ
3035 :  * @wait: wait until the read finishes
3036 :  *
3037 :  * Returns zero on success or when not waiting, and -EIO on error.
3038 : */
3039 0 : int __bh_read(struct buffer_head *bh, blk_opf_t op_flags, bool wait)
3040 : {
3041 0 : int ret = 0;
3042 :
3043 0 : BUG_ON(!buffer_locked(bh));
3044 :
3045 0 : get_bh(bh);
3046 0 : bh->b_end_io = end_buffer_read_sync;
3047 0 : submit_bh(REQ_OP_READ | op_flags, bh);
3048 0 : if (wait) {
3049 0 : wait_on_buffer(bh);
3050 0 : if (!buffer_uptodate(bh))
3051 0 : ret = -EIO;
3052 : }
3053 0 : return ret;
3054 : }
3055 : EXPORT_SYMBOL(__bh_read);
3056 :
3057 : /**
3058 : * __bh_read_batch - Submit read for a batch of unlocked buffers
3059 :  * __bh_read_batch - Submit reads for a batch of unlocked buffers
3060 :  * @nr: number of buffers in the batch
3061 :  * @bhs: a batch of struct buffer_head
3062 :  * @op_flags: extra REQ_* flags to be ORed with REQ_OP_READ
3063 :  * @force_lock: if set, block until each buffer's lock is acquired; otherwise
3064 :  *              skip any buffer that cannot be locked immediately.
3065 :  *
3066 :  * Returns nothing; completion is reported through each buffer's end_io handler.
3067 0 : void __bh_read_batch(int nr, struct buffer_head *bhs[],
3068 : blk_opf_t op_flags, bool force_lock)
3069 : {
3070 : int i;
3071 :
3072 0 : for (i = 0; i < nr; i++) {
3073 0 : struct buffer_head *bh = bhs[i];
3074 :
3075 0 : if (buffer_uptodate(bh))
3076 0 : continue;
3077 :
3078 0 : if (force_lock)
3079 : lock_buffer(bh);
3080 : else
3081 0 : if (!trylock_buffer(bh))
3082 0 : continue;
3083 :
3084 0 : if (buffer_uptodate(bh)) {
3085 0 : unlock_buffer(bh);
3086 0 : continue;
3087 : }
3088 :
3089 0 : bh->b_end_io = end_buffer_read_sync;
3090 0 : get_bh(bh);
3091 : submit_bh(REQ_OP_READ | op_flags, bh);
3092 : }
3093 0 : }
3094 : EXPORT_SYMBOL(__bh_read_batch);
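/*
 * Illustrative sketch, not part of buffer.c: batched readahead of metadata
 * blocks. sb_getblk() only attaches buffer_heads; __bh_read_batch() with
 * force_lock=false then starts I/O on whichever of them are not uptodate and
 * not locked by somebody else. The block numbers and the batch size of 16
 * are hypothetical inputs chosen for the example.
 */
static void myfs_readahead_blocks(struct super_block *sb,
				  const sector_t *blocks, int nr)
{
	struct buffer_head *bhs[16];
	int i, cnt = min(nr, 16);

	for (i = 0; i < cnt; i++) {
		bhs[i] = sb_getblk(sb, blocks[i]);
		if (!bhs[i]) {
			cnt = i;
			break;
		}
	}
	__bh_read_batch(cnt, bhs, REQ_RAHEAD, false);
	/* The submitted reads hold their own references; drop ours. */
	for (i = 0; i < cnt; i++)
		brelse(bhs[i]);
}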
3095 :
3096 1 : void __init buffer_init(void)
3097 : {
3098 : unsigned long nrpages;
3099 : int ret;
3100 :
3101 1 : bh_cachep = kmem_cache_create("buffer_head",
3102 : sizeof(struct buffer_head), 0,
3103 : (SLAB_RECLAIM_ACCOUNT|SLAB_PANIC|
3104 : SLAB_MEM_SPREAD),
3105 : NULL);
3106 :
3107 : /*
3108 : * Limit the bh occupancy to 10% of ZONE_NORMAL
3109 : */
3110 1 : nrpages = (nr_free_buffer_pages() * 10) / 100;
3111 1 : max_buffer_heads = nrpages * (PAGE_SIZE / sizeof(struct buffer_head));
3112 1 : ret = cpuhp_setup_state_nocalls(CPUHP_FS_BUFF_DEAD, "fs/buffer:dead",
3113 : NULL, buffer_exit_cpu_dead);
3114 1 : WARN_ON(ret < 0);
3115 1 : }
|