Line data Source code
1 : // SPDX-License-Identifier: GPL-2.0-only
2 : /*
3 : * linux/fs/buffer.c
4 : *
5 : * Copyright (C) 1991, 1992, 2002 Linus Torvalds
6 : */
7 :
8 : /*
9 : * Start bdflush() with kernel_thread not syscall - Paul Gortmaker, 12/95
10 : *
11 : * Removed a lot of unnecessary code and simplified things now that
12 : * the buffer cache isn't our primary cache - Andrew Tridgell 12/96
13 : *
14 : * Speed up hash, lru, and free list operations. Use gfp() for allocating
15 : * hash table, use SLAB cache for buffer heads. SMP threading. -DaveM
16 : *
17 : * Added 32k buffer block sizes - these are required for older ARM systems. - RMK
18 : *
19 : * async buffer flushing, 1999 Andrea Arcangeli <andrea@suse.de>
20 : */
21 :
22 : #include <linux/kernel.h>
23 : #include <linux/sched/signal.h>
24 : #include <linux/syscalls.h>
25 : #include <linux/fs.h>
26 : #include <linux/iomap.h>
27 : #include <linux/mm.h>
28 : #include <linux/percpu.h>
29 : #include <linux/slab.h>
30 : #include <linux/capability.h>
31 : #include <linux/blkdev.h>
32 : #include <linux/file.h>
33 : #include <linux/quotaops.h>
34 : #include <linux/highmem.h>
35 : #include <linux/export.h>
36 : #include <linux/backing-dev.h>
37 : #include <linux/writeback.h>
38 : #include <linux/hash.h>
39 : #include <linux/suspend.h>
40 : #include <linux/buffer_head.h>
41 : #include <linux/task_io_accounting_ops.h>
42 : #include <linux/bio.h>
43 : #include <linux/cpu.h>
44 : #include <linux/bitops.h>
45 : #include <linux/mpage.h>
46 : #include <linux/bit_spinlock.h>
47 : #include <linux/pagevec.h>
48 : #include <linux/sched/mm.h>
49 : #include <trace/events/block.h>
50 : #include <linux/fscrypt.h>
51 : #include <linux/fsverity.h>
52 :
53 : #include "internal.h"
54 :
55 : static int fsync_buffers_list(spinlock_t *lock, struct list_head *list);
56 : static void submit_bh_wbc(blk_opf_t opf, struct buffer_head *bh,
57 : struct writeback_control *wbc);
58 :
59 : #define BH_ENTRY(list) list_entry((list), struct buffer_head, b_assoc_buffers)
60 :
61 0 : inline void touch_buffer(struct buffer_head *bh)
62 : {
63 0 : trace_block_touch_buffer(bh);
64 0 : folio_mark_accessed(bh->b_folio);
65 0 : }
66 : EXPORT_SYMBOL(touch_buffer);
67 :
68 0 : void __lock_buffer(struct buffer_head *bh)
69 : {
70 0 : wait_on_bit_lock_io(&bh->b_state, BH_Lock, TASK_UNINTERRUPTIBLE);
71 0 : }
72 : EXPORT_SYMBOL(__lock_buffer);
73 :
74 0 : void unlock_buffer(struct buffer_head *bh)
75 : {
76 0 : clear_bit_unlock(BH_Lock, &bh->b_state);
77 0 : smp_mb__after_atomic();
78 0 : wake_up_bit(&bh->b_state, BH_Lock);
79 0 : }
80 : EXPORT_SYMBOL(unlock_buffer);
81 :
82 : /*
83 : * Returns whether the folio has dirty or writeback buffers. If all the buffers
84 : * are unlocked and clean then the folio_test_dirty information is stale. If
85 : * any of the buffers are locked, it is assumed they are locked for IO.
86 : */
87 0 : void buffer_check_dirty_writeback(struct folio *folio,
88 : bool *dirty, bool *writeback)
89 : {
90 : struct buffer_head *head, *bh;
91 0 : *dirty = false;
92 0 : *writeback = false;
93 :
94 0 : BUG_ON(!folio_test_locked(folio));
95 :
96 0 : head = folio_buffers(folio);
97 0 : if (!head)
98 : return;
99 :
100 0 : if (folio_test_writeback(folio))
101 0 : *writeback = true;
102 :
103 : bh = head;
104 : do {
105 0 : if (buffer_locked(bh))
106 0 : *writeback = true;
107 :
108 0 : if (buffer_dirty(bh))
109 0 : *dirty = true;
110 :
111 0 : bh = bh->b_this_page;
112 0 : } while (bh != head);
113 : }
114 : EXPORT_SYMBOL(buffer_check_dirty_writeback);
115 :
116 : /*
117 : * Block until a buffer comes unlocked. This doesn't stop it
118 : * from becoming locked again - you have to lock it yourself
119 : * if you want to preserve its state.
120 : */
121 0 : void __wait_on_buffer(struct buffer_head * bh)
122 : {
123 0 : wait_on_bit_io(&bh->b_state, BH_Lock, TASK_UNINTERRUPTIBLE);
124 0 : }
125 : EXPORT_SYMBOL(__wait_on_buffer);
126 :
127 0 : static void buffer_io_error(struct buffer_head *bh, char *msg)
128 : {
129 0 : if (!test_bit(BH_Quiet, &bh->b_state))
130 0 : printk_ratelimited(KERN_ERR
131 : "Buffer I/O error on dev %pg, logical block %llu%s\n",
132 : bh->b_bdev, (unsigned long long)bh->b_blocknr, msg);
133 0 : }
134 :
135 : /*
136 : * End-of-IO handler helper function which does not touch the bh after
137 : * unlocking it.
138 : * Note: unlock_buffer() sort-of does touch the bh after unlocking it, but
139 : * a race there is benign: unlock_buffer() only uses the bh's address for
140 : * hashing after unlocking the buffer, so it doesn't actually touch the bh
141 : * itself.
142 : */
143 0 : static void __end_buffer_read_notouch(struct buffer_head *bh, int uptodate)
144 : {
145 0 : if (uptodate) {
146 : set_buffer_uptodate(bh);
147 : } else {
148 : /* This happens, due to failed read-ahead attempts. */
149 : clear_buffer_uptodate(bh);
150 : }
151 0 : unlock_buffer(bh);
152 0 : }
153 :
154 : /*
155 : * Default synchronous end-of-IO handler.. Just mark it up-to-date and
156 : * unlock the buffer.
157 : */
158 0 : void end_buffer_read_sync(struct buffer_head *bh, int uptodate)
159 : {
160 0 : __end_buffer_read_notouch(bh, uptodate);
161 0 : put_bh(bh);
162 0 : }
163 : EXPORT_SYMBOL(end_buffer_read_sync);
164 :
165 0 : void end_buffer_write_sync(struct buffer_head *bh, int uptodate)
166 : {
167 0 : if (uptodate) {
168 : set_buffer_uptodate(bh);
169 : } else {
170 0 : buffer_io_error(bh, ", lost sync page write");
171 0 : mark_buffer_write_io_error(bh);
172 : clear_buffer_uptodate(bh);
173 : }
174 0 : unlock_buffer(bh);
175 0 : put_bh(bh);
176 0 : }
177 : EXPORT_SYMBOL(end_buffer_write_sync);
178 :
179 : /*
180 : * Various filesystems appear to want __find_get_block to be non-blocking.
181 : * But it's the page lock which protects the buffers. To get around this,
182 : * we get exclusion from try_to_free_buffers with the blockdev mapping's
183 : * private_lock.
184 : *
185 : * Hack idea: for the blockdev mapping, private_lock contention
186 : * may be quite high. This code could TryLock the page, and if that
187 : * succeeds, there is no need to take private_lock.
188 : */
189 : static struct buffer_head *
190 0 : __find_get_block_slow(struct block_device *bdev, sector_t block)
191 : {
192 0 : struct inode *bd_inode = bdev->bd_inode;
193 0 : struct address_space *bd_mapping = bd_inode->i_mapping;
194 0 : struct buffer_head *ret = NULL;
195 : pgoff_t index;
196 : struct buffer_head *bh;
197 : struct buffer_head *head;
198 : struct page *page;
199 0 : int all_mapped = 1;
200 : static DEFINE_RATELIMIT_STATE(last_warned, HZ, 1);
201 :
202 0 : index = block >> (PAGE_SHIFT - bd_inode->i_blkbits);
203 0 : page = find_get_page_flags(bd_mapping, index, FGP_ACCESSED);
204 0 : if (!page)
205 : goto out;
206 :
207 0 : spin_lock(&bd_mapping->private_lock);
208 0 : if (!page_has_buffers(page))
209 : goto out_unlock;
210 0 : head = page_buffers(page);
211 0 : bh = head;
212 : do {
213 0 : if (!buffer_mapped(bh))
214 : all_mapped = 0;
215 0 : else if (bh->b_blocknr == block) {
216 0 : ret = bh;
217 : get_bh(bh);
218 : goto out_unlock;
219 : }
220 0 : bh = bh->b_this_page;
221 0 : } while (bh != head);
222 :
223 : /* we might be here because some of the buffers on this page are
224 : * not mapped. This is due to various races between
225 : * file io on the block device and getblk. It gets dealt with
226 : * elsewhere, don't buffer_error if we had some unmapped buffers
227 : */
228 0 : ratelimit_set_flags(&last_warned, RATELIMIT_MSG_ON_RELEASE);
229 0 : if (all_mapped && __ratelimit(&last_warned)) {
230 0 : printk("__find_get_block_slow() failed. block=%llu, "
231 : "b_blocknr=%llu, b_state=0x%08lx, b_size=%zu, "
232 : "device %pg blocksize: %d\n",
233 : (unsigned long long)block,
234 : (unsigned long long)bh->b_blocknr,
235 : bh->b_state, bh->b_size, bdev,
236 : 1 << bd_inode->i_blkbits);
237 : }
238 : out_unlock:
239 0 : spin_unlock(&bd_mapping->private_lock);
240 0 : put_page(page);
241 : out:
242 0 : return ret;
243 : }
244 :
245 0 : static void end_buffer_async_read(struct buffer_head *bh, int uptodate)
246 : {
247 : unsigned long flags;
248 : struct buffer_head *first;
249 : struct buffer_head *tmp;
250 : struct folio *folio;
251 0 : int folio_uptodate = 1;
252 :
253 0 : BUG_ON(!buffer_async_read(bh));
254 :
255 0 : folio = bh->b_folio;
256 0 : if (uptodate) {
257 : set_buffer_uptodate(bh);
258 : } else {
259 0 : clear_buffer_uptodate(bh);
260 0 : buffer_io_error(bh, ", async page read");
261 : folio_set_error(folio);
262 : }
263 :
264 : /*
265 : * Be _very_ careful from here on. Bad things can happen if
266 : * two buffer heads end IO at almost the same time and both
267 : * decide that the page is now completely done.
268 : */
269 0 : first = folio_buffers(folio);
270 0 : spin_lock_irqsave(&first->b_uptodate_lock, flags);
271 0 : clear_buffer_async_read(bh);
272 0 : unlock_buffer(bh);
273 0 : tmp = bh;
274 : do {
275 0 : if (!buffer_uptodate(tmp))
276 0 : folio_uptodate = 0;
277 0 : if (buffer_async_read(tmp)) {
278 0 : BUG_ON(!buffer_locked(tmp));
279 : goto still_busy;
280 : }
281 0 : tmp = tmp->b_this_page;
282 0 : } while (tmp != bh);
283 0 : spin_unlock_irqrestore(&first->b_uptodate_lock, flags);
284 :
285 : /*
286 : * If all of the buffers are uptodate then we can set the page
287 : * uptodate.
288 : */
289 0 : if (folio_uptodate)
290 : folio_mark_uptodate(folio);
291 0 : folio_unlock(folio);
292 0 : return;
293 :
294 : still_busy:
295 0 : spin_unlock_irqrestore(&first->b_uptodate_lock, flags);
296 : return;
297 : }
298 :
299 : struct postprocess_bh_ctx {
300 : struct work_struct work;
301 : struct buffer_head *bh;
302 : };
303 :
304 : static void verify_bh(struct work_struct *work)
305 : {
306 : struct postprocess_bh_ctx *ctx =
307 : container_of(work, struct postprocess_bh_ctx, work);
308 : struct buffer_head *bh = ctx->bh;
309 : bool valid;
310 :
311 : valid = fsverity_verify_blocks(page_folio(bh->b_page), bh->b_size,
312 : bh_offset(bh));
313 : end_buffer_async_read(bh, valid);
314 : kfree(ctx);
315 : }
316 :
317 : static bool need_fsverity(struct buffer_head *bh)
318 : {
319 0 : struct page *page = bh->b_page;
320 0 : struct inode *inode = page->mapping->host;
321 :
322 0 : return fsverity_active(inode) &&
323 : /* needed by ext4 */
324 : page->index < DIV_ROUND_UP(inode->i_size, PAGE_SIZE);
325 : }
326 :
327 : static void decrypt_bh(struct work_struct *work)
328 : {
329 : struct postprocess_bh_ctx *ctx =
330 : container_of(work, struct postprocess_bh_ctx, work);
331 : struct buffer_head *bh = ctx->bh;
332 : int err;
333 :
334 : err = fscrypt_decrypt_pagecache_blocks(page_folio(bh->b_page),
335 : bh->b_size, bh_offset(bh));
336 : if (err == 0 && need_fsverity(bh)) {
337 : /*
338 : * We use different work queues for decryption and for verity
339 : * because verity may require reading metadata pages that need
340 : * decryption, and we shouldn't recurse to the same workqueue.
341 : */
342 : INIT_WORK(&ctx->work, verify_bh);
343 : fsverity_enqueue_verify_work(&ctx->work);
344 : return;
345 : }
346 : end_buffer_async_read(bh, err == 0);
347 : kfree(ctx);
348 : }
349 :
350 : /*
351 : * I/O completion handler for block_read_full_folio() - pages
352 : * which come unlocked at the end of I/O.
353 : */
354 0 : static void end_buffer_async_read_io(struct buffer_head *bh, int uptodate)
355 : {
356 0 : struct inode *inode = bh->b_folio->mapping->host;
357 0 : bool decrypt = fscrypt_inode_uses_fs_layer_crypto(inode);
358 0 : bool verify = need_fsverity(bh);
359 :
360 : /* Decrypt (with fscrypt) and/or verify (with fsverity) if needed. */
361 : if (uptodate && (decrypt || verify)) {
362 : struct postprocess_bh_ctx *ctx =
363 : kmalloc(sizeof(*ctx), GFP_ATOMIC);
364 :
365 : if (ctx) {
366 : ctx->bh = bh;
367 : if (decrypt) {
368 : INIT_WORK(&ctx->work, decrypt_bh);
369 : fscrypt_enqueue_decrypt_work(&ctx->work);
370 : } else {
371 : INIT_WORK(&ctx->work, verify_bh);
372 : fsverity_enqueue_verify_work(&ctx->work);
373 : }
374 : return;
375 : }
376 : uptodate = 0;
377 : }
378 0 : end_buffer_async_read(bh, uptodate);
379 : }
380 :
381 : /*
382 : * Completion handler for block_write_full_page() - pages which are unlocked
383 : * during I/O, and which have PageWriteback cleared upon I/O completion.
384 : */
385 0 : void end_buffer_async_write(struct buffer_head *bh, int uptodate)
386 : {
387 : unsigned long flags;
388 : struct buffer_head *first;
389 : struct buffer_head *tmp;
390 : struct folio *folio;
391 :
392 0 : BUG_ON(!buffer_async_write(bh));
393 :
394 0 : folio = bh->b_folio;
395 0 : if (uptodate) {
396 : set_buffer_uptodate(bh);
397 : } else {
398 0 : buffer_io_error(bh, ", lost async page write");
399 0 : mark_buffer_write_io_error(bh);
400 0 : clear_buffer_uptodate(bh);
401 : folio_set_error(folio);
402 : }
403 :
404 0 : first = folio_buffers(folio);
405 0 : spin_lock_irqsave(&first->b_uptodate_lock, flags);
406 :
407 0 : clear_buffer_async_write(bh);
408 0 : unlock_buffer(bh);
409 0 : tmp = bh->b_this_page;
410 0 : while (tmp != bh) {
411 0 : if (buffer_async_write(tmp)) {
412 0 : BUG_ON(!buffer_locked(tmp));
413 : goto still_busy;
414 : }
415 0 : tmp = tmp->b_this_page;
416 : }
417 0 : spin_unlock_irqrestore(&first->b_uptodate_lock, flags);
418 0 : folio_end_writeback(folio);
419 0 : return;
420 :
421 : still_busy:
422 0 : spin_unlock_irqrestore(&first->b_uptodate_lock, flags);
423 : return;
424 : }
425 : EXPORT_SYMBOL(end_buffer_async_write);
426 :
427 : /*
428 : * If a page's buffers are under async read-in (end_buffer_async_read
429 : * completion) then there is a possibility that another thread of
430 : * control could lock one of the buffers after it has completed
431 : * but while some of the other buffers have not completed. This
432 : * locked buffer would confuse end_buffer_async_read() into not unlocking
433 : * the page. So the absence of BH_Async_Read tells end_buffer_async_read()
434 : * that this buffer is not under async I/O.
435 : *
436 : * The page comes unlocked when it has no locked buffer_async buffers
437 : * left.
438 : *
439 : * PageLocked prevents anyone starting new async I/O reads any of
440 : * the buffers.
441 : *
442 : * PageWriteback is used to prevent simultaneous writeout of the same
443 : * page.
444 : *
445 : * PageLocked prevents anyone from starting writeback of a page which is
446 : * under read I/O (PageWriteback is only ever set against a locked page).
447 : */
448 0 : static void mark_buffer_async_read(struct buffer_head *bh)
449 : {
450 0 : bh->b_end_io = end_buffer_async_read_io;
451 0 : set_buffer_async_read(bh);
452 0 : }
453 :
454 : static void mark_buffer_async_write_endio(struct buffer_head *bh,
455 : bh_end_io_t *handler)
456 : {
457 0 : bh->b_end_io = handler;
458 0 : set_buffer_async_write(bh);
459 : }
460 :
461 0 : void mark_buffer_async_write(struct buffer_head *bh)
462 : {
463 0 : mark_buffer_async_write_endio(bh, end_buffer_async_write);
464 0 : }
465 : EXPORT_SYMBOL(mark_buffer_async_write);
466 :
467 :
468 : /*
469 : * fs/buffer.c contains helper functions for buffer-backed address space's
470 : * fsync functions. A common requirement for buffer-based filesystems is
471 : * that certain data from the backing blockdev needs to be written out for
472 : * a successful fsync(). For example, ext2 indirect blocks need to be
473 : * written back and waited upon before fsync() returns.
474 : *
475 : * The functions mark_buffer_dirty_inode(), sync_mapping_buffers(),
476 : * inode_has_buffers() and invalidate_inode_buffers() are provided for the
477 : * management of a list of dependent buffers at ->i_mapping->private_list.
478 : *
479 : * Locking is a little subtle: try_to_free_buffers() will remove buffers
480 : * from their controlling inode's queue when they are being freed. But
481 : * try_to_free_buffers() will be operating against the *blockdev* mapping
482 : * at the time, not against the S_ISREG file which depends on those buffers.
483 : * So the locking for private_list is via the private_lock in the address_space
484 : * which backs the buffers. Which is different from the address_space
485 : * against which the buffers are listed. So for a particular address_space,
486 : * mapping->private_lock does *not* protect mapping->private_list! In fact,
487 : * mapping->private_list will always be protected by the backing blockdev's
488 : * ->private_lock.
489 : *
490 : * Which introduces a requirement: all buffers on an address_space's
491 : * ->private_list must be from the same address_space: the blockdev's.
492 : *
493 : * address_spaces which do not place buffers at ->private_list via these
494 : * utility functions are free to use private_lock and private_list for
495 : * whatever they want. The only requirement is that list_empty(private_list)
496 : * be true at clear_inode() time.
497 : *
498 : * FIXME: clear_inode should not call invalidate_inode_buffers(). The
499 : * filesystems should do that. invalidate_inode_buffers() should just go
500 : * BUG_ON(!list_empty).
501 : *
502 : * FIXME: mark_buffer_dirty_inode() is a data-plane operation. It should
503 : * take an address_space, not an inode. And it should be called
504 : * mark_buffer_dirty_fsync() to clearly define why those buffers are being
505 : * queued up.
506 : *
507 : * FIXME: mark_buffer_dirty_inode() doesn't need to add the buffer to the
508 : * list if it is already on a list. Because if the buffer is on a list,
509 : * it *must* already be on the right one. If not, the filesystem is being
510 : * silly. This will save a ton of locking. But first we have to ensure
511 : * that buffers are taken *off* the old inode's list when they are freed
512 : * (presumably in truncate). That requires careful auditing of all
513 : * filesystems (do it inside bforget()). It could also be done by bringing
514 : * b_inode back.
515 : */
516 :
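A minimal usage sketch (editor's illustration, not code from this file): a hypothetical filesystem, here called examplefs, combining mark_buffer_dirty_inode() and sync_mapping_buffers() in its fsync path as described above.

/* Editor's sketch - hypothetical examplefs code, not part of buffer.c. */
static int examplefs_fsync(struct file *file, loff_t start, loff_t end,
			   int datasync)
{
	struct inode *inode = file_inode(file);
	int ret, err;

	/* Flush and wait on the file's own pagecache first. */
	ret = file_write_and_wait_range(file, start, end);

	/*
	 * Dependent blockdev buffers (e.g. indirect blocks) were queued on
	 * inode->i_mapping->private_list via mark_buffer_dirty_inode() when
	 * they were modified; write them out and wait upon them.
	 */
	err = sync_mapping_buffers(inode->i_mapping);
	if (!ret)
		ret = err;
	return ret;
}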
517 : /*
518 : * The buffer's backing address_space's private_lock must be held
519 : */
520 0 : static void __remove_assoc_queue(struct buffer_head *bh)
521 : {
522 0 : list_del_init(&bh->b_assoc_buffers);
523 0 : WARN_ON(!bh->b_assoc_map);
524 0 : bh->b_assoc_map = NULL;
525 0 : }
526 :
527 34 : int inode_has_buffers(struct inode *inode)
528 : {
529 68 : return !list_empty(&inode->i_data.private_list);
530 : }
531 :
532 : /*
533 : * osync is designed to support O_SYNC io. It waits synchronously for
534 : * all already-submitted IO to complete, but does not queue any new
535 : * writes to the disk.
536 : *
537 : * To do O_SYNC writes, just queue the buffer writes with write_dirty_buffer
538 : * as you dirty the buffers, and then use osync_inode_buffers to wait for
539 : * completion. Any other dirty buffers which are not yet queued for
540 : * write will not be flushed to disk by the osync.
541 : */
542 0 : static int osync_buffers_list(spinlock_t *lock, struct list_head *list)
543 : {
544 : struct buffer_head *bh;
545 : struct list_head *p;
546 0 : int err = 0;
547 :
548 : spin_lock(lock);
549 : repeat:
550 0 : list_for_each_prev(p, list) {
551 0 : bh = BH_ENTRY(p);
552 0 : if (buffer_locked(bh)) {
553 0 : get_bh(bh);
554 0 : spin_unlock(lock);
555 0 : wait_on_buffer(bh);
556 0 : if (!buffer_uptodate(bh))
557 0 : err = -EIO;
558 0 : brelse(bh);
559 : spin_lock(lock);
560 : goto repeat;
561 : }
562 : }
563 0 : spin_unlock(lock);
564 0 : return err;
565 : }
566 :
567 0 : void emergency_thaw_bdev(struct super_block *sb)
568 : {
569 0 : while (sb->s_bdev && !thaw_bdev(sb->s_bdev))
570 0 : printk(KERN_WARNING "Emergency Thaw on %pg\n", sb->s_bdev);
571 0 : }
572 :
573 : /**
574 : * sync_mapping_buffers - write out & wait upon a mapping's "associated" buffers
575 : * @mapping: the mapping which wants those buffers written
576 : *
577 : * Starts I/O against the buffers at mapping->private_list, and waits upon
578 : * that I/O.
579 : *
580 : * Basically, this is a convenience function for fsync().
581 : * @mapping is a file or directory which needs those buffers to be written for
582 : * a successful fsync().
583 : */
584 0 : int sync_mapping_buffers(struct address_space *mapping)
585 : {
586 0 : struct address_space *buffer_mapping = mapping->private_data;
587 :
588 0 : if (buffer_mapping == NULL || list_empty(&mapping->private_list))
589 : return 0;
590 :
591 0 : return fsync_buffers_list(&buffer_mapping->private_lock,
592 : &mapping->private_list);
593 : }
594 : EXPORT_SYMBOL(sync_mapping_buffers);
595 :
596 : /*
597 : * Called when we've recently written block `bblock', and it is known that
598 : * `bblock' was for a buffer_boundary() buffer. This means that the block at
599 : * `bblock + 1' is probably a dirty indirect block. Hunt it down and, if it's
600 : * dirty, schedule it for IO. So that indirects merge nicely with their data.
601 : */
602 0 : void write_boundary_block(struct block_device *bdev,
603 : sector_t bblock, unsigned blocksize)
604 : {
605 0 : struct buffer_head *bh = __find_get_block(bdev, bblock + 1, blocksize);
606 0 : if (bh) {
607 0 : if (buffer_dirty(bh))
608 0 : write_dirty_buffer(bh, 0);
609 : put_bh(bh);
610 : }
611 0 : }
612 :
613 0 : void mark_buffer_dirty_inode(struct buffer_head *bh, struct inode *inode)
614 : {
615 0 : struct address_space *mapping = inode->i_mapping;
616 0 : struct address_space *buffer_mapping = bh->b_folio->mapping;
617 :
618 0 : mark_buffer_dirty(bh);
619 0 : if (!mapping->private_data) {
620 0 : mapping->private_data = buffer_mapping;
621 : } else {
622 0 : BUG_ON(mapping->private_data != buffer_mapping);
623 : }
624 0 : if (!bh->b_assoc_map) {
625 0 : spin_lock(&buffer_mapping->private_lock);
626 0 : list_move_tail(&bh->b_assoc_buffers,
627 : &mapping->private_list);
628 0 : bh->b_assoc_map = mapping;
629 0 : spin_unlock(&buffer_mapping->private_lock);
630 : }
631 0 : }
632 : EXPORT_SYMBOL(mark_buffer_dirty_inode);
633 :
634 : /*
635 : * Add a page to the dirty page list.
636 : *
637 : * It is a sad fact of life that this function is called from several places
638 : * deeply under spinlocking. It may not sleep.
639 : *
640 : * If the page has buffers, the uptodate buffers are set dirty, to preserve
641 : * dirty-state coherency between the page and the buffers. If the page does
642 : * not have buffers then when they are later attached they will all be set
643 : * dirty.
644 : *
645 : * The buffers are dirtied before the page is dirtied. There's a small race
646 : * window in which a writepage caller may see the page cleanness but not the
647 : * buffer dirtiness. That's fine. If this code were to set the page dirty
648 : * before the buffers, a concurrent writepage caller could clear the page dirty
649 : * bit, see a bunch of clean buffers and we'd end up with dirty buffers/clean
650 : * page on the dirty page list.
651 : *
652 : * We use private_lock to lock against try_to_free_buffers while using the
653 : * page's buffer list. Also use this to protect against clean buffers being
654 : * added to the page after it was set dirty.
655 : *
656 : * FIXME: may need to call ->reservepage here as well. That's rather up to the
657 : * address_space though.
658 : */
659 0 : bool block_dirty_folio(struct address_space *mapping, struct folio *folio)
660 : {
661 : struct buffer_head *head;
662 : bool newly_dirty;
663 :
664 0 : spin_lock(&mapping->private_lock);
665 0 : head = folio_buffers(folio);
666 0 : if (head) {
667 : struct buffer_head *bh = head;
668 :
669 : do {
670 0 : set_buffer_dirty(bh);
671 0 : bh = bh->b_this_page;
672 0 : } while (bh != head);
673 : }
674 : /*
675 : * Lock out page's memcg migration to keep PageDirty
676 : * synchronized with per-memcg dirty page counters.
677 : */
678 0 : folio_memcg_lock(folio);
679 0 : newly_dirty = !folio_test_set_dirty(folio);
680 0 : spin_unlock(&mapping->private_lock);
681 :
682 0 : if (newly_dirty)
683 0 : __folio_mark_dirty(folio, mapping, 1);
684 :
685 0 : folio_memcg_unlock(folio);
686 :
687 0 : if (newly_dirty)
688 0 : __mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
689 :
690 0 : return newly_dirty;
691 : }
692 : EXPORT_SYMBOL(block_dirty_folio);
693 :
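For reference, a minimal sketch (editor's illustration, not code from this file) of how a buffer-backed filesystem typically wires block_dirty_folio() into its address_space_operations rather than calling it directly; the remaining operations are omitted here.

/* Editor's sketch - hypothetical examplefs aops, not part of buffer.c. */
static const struct address_space_operations examplefs_aops = {
	.dirty_folio		= block_dirty_folio,
	.invalidate_folio	= block_invalidate_folio,
	/* read_folio, writepages, bmap etc. omitted for brevity */
};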
694 : /*
695 : * Write out and wait upon a list of buffers.
696 : *
697 : * We have conflicting pressures: we want to make sure that all
698 : * initially dirty buffers get waited on, but that any subsequently
699 : * dirtied buffers don't. After all, we don't want fsync to last
700 : * forever if somebody is actively writing to the file.
701 : *
702 : * Do this in two main stages: first we copy dirty buffers to a
703 : * temporary inode list, queueing the writes as we go. Then we clean
704 : * up, waiting for those writes to complete.
705 : *
706 : * During this second stage, any subsequent updates to the file may end
707 : * up refiling the buffer on the original inode's dirty list again, so
708 : * there is a chance we will end up with a buffer queued for write but
709 : * not yet completed on that list. So, as a final cleanup we go through
710 : * the osync code to catch these locked, dirty buffers without requeuing
711 : * any newly dirty buffers for write.
712 : */
713 0 : static int fsync_buffers_list(spinlock_t *lock, struct list_head *list)
714 : {
715 : struct buffer_head *bh;
716 : struct list_head tmp;
717 : struct address_space *mapping;
718 0 : int err = 0, err2;
719 : struct blk_plug plug;
720 :
721 0 : INIT_LIST_HEAD(&tmp);
722 0 : blk_start_plug(&plug);
723 :
724 : spin_lock(lock);
725 0 : while (!list_empty(list)) {
726 0 : bh = BH_ENTRY(list->next);
727 0 : mapping = bh->b_assoc_map;
728 0 : __remove_assoc_queue(bh);
729 : /* Avoid race with mark_buffer_dirty_inode() which does
730 : * a lockless check and we rely on seeing the dirty bit */
731 0 : smp_mb();
732 0 : if (buffer_dirty(bh) || buffer_locked(bh)) {
733 0 : list_add(&bh->b_assoc_buffers, &tmp);
734 0 : bh->b_assoc_map = mapping;
735 0 : if (buffer_dirty(bh)) {
736 0 : get_bh(bh);
737 0 : spin_unlock(lock);
738 : /*
739 : * Ensure any pending I/O completes so that
740 : * write_dirty_buffer() actually writes the
741 : * current contents - it is a noop if I/O is
742 : * still in flight on potentially older
743 : * contents.
744 : */
745 0 : write_dirty_buffer(bh, REQ_SYNC);
746 :
747 : /*
748 : * Kick off IO for the previous mapping. Note
749 : * that we will not run the very last mapping,
750 : * wait_on_buffer() will do that for us
751 : * through sync_buffer().
752 : */
753 0 : brelse(bh);
754 : spin_lock(lock);
755 : }
756 : }
757 : }
758 :
759 0 : spin_unlock(lock);
760 0 : blk_finish_plug(&plug);
761 : spin_lock(lock);
762 :
763 0 : while (!list_empty(&tmp)) {
764 0 : bh = BH_ENTRY(tmp.prev);
765 0 : get_bh(bh);
766 0 : mapping = bh->b_assoc_map;
767 0 : __remove_assoc_queue(bh);
768 : /* Avoid race with mark_buffer_dirty_inode() which does
769 : * a lockless check and we rely on seeing the dirty bit */
770 0 : smp_mb();
771 0 : if (buffer_dirty(bh)) {
772 0 : list_add(&bh->b_assoc_buffers,
773 : &mapping->private_list);
774 0 : bh->b_assoc_map = mapping;
775 : }
776 0 : spin_unlock(lock);
777 0 : wait_on_buffer(bh);
778 0 : if (!buffer_uptodate(bh))
779 0 : err = -EIO;
780 0 : brelse(bh);
781 : spin_lock(lock);
782 : }
783 :
784 0 : spin_unlock(lock);
785 0 : err2 = osync_buffers_list(lock, list);
786 0 : if (err)
787 : return err;
788 : else
789 : return err2;
790 : }
791 :
792 : /*
793 : * Invalidate any and all dirty buffers on a given inode. We are
794 : * probably unmounting the fs, but that doesn't mean we have already
795 : * done a sync(). Just drop the buffers from the inode list.
796 : *
797 : * NOTE: we take the inode's blockdev's mapping's private_lock. Which
798 : * assumes that all the buffers are against the blockdev. Not true
799 : * for reiserfs.
800 : */
801 0 : void invalidate_inode_buffers(struct inode *inode)
802 : {
803 0 : if (inode_has_buffers(inode)) {
804 0 : struct address_space *mapping = &inode->i_data;
805 0 : struct list_head *list = &mapping->private_list;
806 0 : struct address_space *buffer_mapping = mapping->private_data;
807 :
808 0 : spin_lock(&buffer_mapping->private_lock);
809 0 : while (!list_empty(list))
810 0 : __remove_assoc_queue(BH_ENTRY(list->next));
811 0 : spin_unlock(&buffer_mapping->private_lock);
812 : }
813 0 : }
814 : EXPORT_SYMBOL(invalidate_inode_buffers);
815 :
816 : /*
817 : * Remove any clean buffers from the inode's buffer list. This is called
818 : * when we're trying to free the inode itself. Those buffers can pin it.
819 : *
820 : * Returns true if all buffers were removed.
821 : */
822 0 : int remove_inode_buffers(struct inode *inode)
823 : {
824 0 : int ret = 1;
825 :
826 0 : if (inode_has_buffers(inode)) {
827 0 : struct address_space *mapping = &inode->i_data;
828 0 : struct list_head *list = &mapping->private_list;
829 0 : struct address_space *buffer_mapping = mapping->private_data;
830 :
831 0 : spin_lock(&buffer_mapping->private_lock);
832 0 : while (!list_empty(list)) {
833 0 : struct buffer_head *bh = BH_ENTRY(list->next);
834 0 : if (buffer_dirty(bh)) {
835 : ret = 0;
836 : break;
837 : }
838 0 : __remove_assoc_queue(bh);
839 : }
840 0 : spin_unlock(&buffer_mapping->private_lock);
841 : }
842 0 : return ret;
843 : }
844 :
845 : /*
846 : * Create the appropriate buffers when given a page for the data area and
847 : * the size of each buffer. Use the bh->b_this_page linked list to
848 : * follow the buffers created. Return NULL if unable to create more
849 : * buffers.
850 : *
851 : * The retry flag is used to differentiate async IO (paging, swapping),
852 : * which may not fail, from ordinary buffer allocations.
853 : */
854 0 : struct buffer_head *alloc_page_buffers(struct page *page, unsigned long size,
855 : bool retry)
856 : {
857 : struct buffer_head *bh, *head;
858 0 : gfp_t gfp = GFP_NOFS | __GFP_ACCOUNT;
859 : long offset;
860 : struct mem_cgroup *memcg, *old_memcg;
861 :
862 0 : if (retry)
863 0 : gfp |= __GFP_NOFAIL;
864 :
865 : /* The page lock pins the memcg */
866 0 : memcg = page_memcg(page);
867 0 : old_memcg = set_active_memcg(memcg);
868 :
869 0 : head = NULL;
870 0 : offset = PAGE_SIZE;
871 0 : while ((offset -= size) >= 0) {
872 0 : bh = alloc_buffer_head(gfp);
873 0 : if (!bh)
874 : goto no_grow;
875 :
876 0 : bh->b_this_page = head;
877 0 : bh->b_blocknr = -1;
878 0 : head = bh;
879 :
880 0 : bh->b_size = size;
881 :
882 : /* Link the buffer to its page */
883 0 : set_bh_page(bh, page, offset);
884 : }
885 : out:
886 : set_active_memcg(old_memcg);
887 0 : return head;
888 : /*
889 : * In case anything failed, we just free everything we got.
890 : */
891 : no_grow:
892 0 : if (head) {
893 : do {
894 0 : bh = head;
895 0 : head = head->b_this_page;
896 0 : free_buffer_head(bh);
897 0 : } while (head);
898 : }
899 :
900 : goto out;
901 : }
902 : EXPORT_SYMBOL_GPL(alloc_page_buffers);
903 :
904 : static inline void
905 : link_dev_buffers(struct page *page, struct buffer_head *head)
906 : {
907 : struct buffer_head *bh, *tail;
908 :
909 : bh = head;
910 : do {
911 0 : tail = bh;
912 0 : bh = bh->b_this_page;
913 0 : } while (bh);
914 0 : tail->b_this_page = head;
915 0 : attach_page_private(page, head);
916 : }
917 :
918 : static sector_t blkdev_max_block(struct block_device *bdev, unsigned int size)
919 : {
920 0 : sector_t retval = ~((sector_t)0);
921 0 : loff_t sz = bdev_nr_bytes(bdev);
922 :
923 0 : if (sz) {
924 0 : unsigned int sizebits = blksize_bits(size);
925 0 : retval = (sz >> sizebits);
926 : }
927 : return retval;
928 : }
929 :
930 : /*
931 : * Initialise the state of a blockdev page's buffers.
932 : */
933 : static sector_t
934 0 : init_page_buffers(struct page *page, struct block_device *bdev,
935 : sector_t block, int size)
936 : {
937 0 : struct buffer_head *head = page_buffers(page);
938 0 : struct buffer_head *bh = head;
939 0 : int uptodate = PageUptodate(page);
940 0 : sector_t end_block = blkdev_max_block(bdev, size);
941 :
942 : do {
943 0 : if (!buffer_mapped(bh)) {
944 0 : bh->b_end_io = NULL;
945 0 : bh->b_private = NULL;
946 0 : bh->b_bdev = bdev;
947 0 : bh->b_blocknr = block;
948 0 : if (uptodate)
949 : set_buffer_uptodate(bh);
950 0 : if (block < end_block)
951 : set_buffer_mapped(bh);
952 : }
953 0 : block++;
954 0 : bh = bh->b_this_page;
955 0 : } while (bh != head);
956 :
957 : /*
958 : * Caller needs to validate requested block against end of device.
959 : */
960 0 : return end_block;
961 : }
962 :
963 : /*
964 : * Create the page-cache page that contains the requested block.
965 : *
966 : * This is used purely for blockdev mappings.
967 : */
968 : static int
969 0 : grow_dev_page(struct block_device *bdev, sector_t block,
970 : pgoff_t index, int size, int sizebits, gfp_t gfp)
971 : {
972 0 : struct inode *inode = bdev->bd_inode;
973 : struct page *page;
974 : struct buffer_head *bh;
975 : sector_t end_block;
976 0 : int ret = 0;
977 : gfp_t gfp_mask;
978 :
979 0 : gfp_mask = mapping_gfp_constraint(inode->i_mapping, ~__GFP_FS) | gfp;
980 :
981 : /*
982 : * XXX: __getblk_slow() can not really deal with failure and
983 : * will endlessly loop on improvised global reclaim. Prefer
984 : * looping in the allocator rather than here, at least that
985 : * code knows what it's doing.
986 : */
987 0 : gfp_mask |= __GFP_NOFAIL;
988 :
989 0 : page = find_or_create_page(inode->i_mapping, index, gfp_mask);
990 :
991 0 : BUG_ON(!PageLocked(page));
992 :
993 0 : if (page_has_buffers(page)) {
994 0 : bh = page_buffers(page);
995 0 : if (bh->b_size == size) {
996 0 : end_block = init_page_buffers(page, bdev,
997 : (sector_t)index << sizebits,
998 : size);
999 0 : goto done;
1000 : }
1001 0 : if (!try_to_free_buffers(page_folio(page)))
1002 : goto failed;
1003 : }
1004 :
1005 : /*
1006 : * Allocate some buffers for this page
1007 : */
1008 0 : bh = alloc_page_buffers(page, size, true);
1009 :
1010 : /*
1011 : * Link the page to the buffers and initialise them. Take the
1012 : * lock to be atomic wrt __find_get_block(), which does not
1013 : * run under the page lock.
1014 : */
1015 0 : spin_lock(&inode->i_mapping->private_lock);
1016 0 : link_dev_buffers(page, bh);
1017 0 : end_block = init_page_buffers(page, bdev, (sector_t)index << sizebits,
1018 : size);
1019 0 : spin_unlock(&inode->i_mapping->private_lock);
1020 : done:
1021 0 : ret = (block < end_block) ? 1 : -ENXIO;
1022 : failed:
1023 0 : unlock_page(page);
1024 0 : put_page(page);
1025 0 : return ret;
1026 : }
1027 :
1028 : /*
1029 : * Create buffers for the specified block device block's page. If
1030 : * that page was dirty, the buffers are set dirty also.
1031 : */
1032 : static int
1033 0 : grow_buffers(struct block_device *bdev, sector_t block, int size, gfp_t gfp)
1034 : {
1035 : pgoff_t index;
1036 : int sizebits;
1037 :
1038 0 : sizebits = PAGE_SHIFT - __ffs(size);
1039 0 : index = block >> sizebits;
1040 :
1041 : /*
1042 : * Check for a block which wants to lie outside our maximum possible
1043 : * pagecache index. (this comparison is done using sector_t types).
1044 : */
1045 : if (unlikely(index != block >> sizebits)) {
1046 : printk(KERN_ERR "%s: requested out-of-range block %llu for "
1047 : "device %pg\n",
1048 : __func__, (unsigned long long)block,
1049 : bdev);
1050 : return -EIO;
1051 : }
1052 :
1053 : /* Create a page with the proper size buffers.. */
1054 0 : return grow_dev_page(bdev, block, index, size, sizebits, gfp);
1055 : }
1056 :
1057 : static struct buffer_head *
1058 0 : __getblk_slow(struct block_device *bdev, sector_t block,
1059 : unsigned size, gfp_t gfp)
1060 : {
1061 : /* Size must be multiple of hard sectorsize */
1062 0 : if (unlikely(size & (bdev_logical_block_size(bdev)-1) ||
1063 : (size < 512 || size > PAGE_SIZE))) {
1064 0 : printk(KERN_ERR "getblk(): invalid block size %d requested\n",
1065 : size);
1066 0 : printk(KERN_ERR "logical block size: %d\n",
1067 : bdev_logical_block_size(bdev));
1068 :
1069 0 : dump_stack();
1070 0 : return NULL;
1071 : }
1072 :
1073 : for (;;) {
1074 : struct buffer_head *bh;
1075 : int ret;
1076 :
1077 0 : bh = __find_get_block(bdev, block, size);
1078 0 : if (bh)
1079 : return bh;
1080 :
1081 0 : ret = grow_buffers(bdev, block, size, gfp);
1082 0 : if (ret < 0)
1083 : return NULL;
1084 : }
1085 : }
1086 :
1087 : /*
1088 : * The relationship between dirty buffers and dirty pages:
1089 : *
1090 : * Whenever a page has any dirty buffers, the page's dirty bit is set, and
1091 : * the page is tagged dirty in the page cache.
1092 : *
1093 : * At all times, the dirtiness of the buffers represents the dirtiness of
1094 : * subsections of the page. If the page has buffers, the page dirty bit is
1095 : * merely a hint about the true dirty state.
1096 : *
1097 : * When a page is set dirty in its entirety, all its buffers are marked dirty
1098 : * (if the page has buffers).
1099 : *
1100 : * When a buffer is marked dirty, its page is dirtied, but the page's other
1101 : * buffers are not.
1102 : *
1103 : * Also. When blockdev buffers are explicitly read with bread(), they
1104 : * individually become uptodate. But their backing page remains not
1105 : * uptodate - even if all of its buffers are uptodate. A subsequent
1106 : * block_read_full_folio() against that folio will discover all the uptodate
1107 : * buffers, will set the folio uptodate and will perform no I/O.
1108 : */
1109 :
1110 : /**
1111 : * mark_buffer_dirty - mark a buffer_head as needing writeout
1112 : * @bh: the buffer_head to mark dirty
1113 : *
1114 : * mark_buffer_dirty() will set the dirty bit against the buffer, then set
1115 : * its backing page dirty, then tag the page as dirty in the page cache
1116 : * and then attach the address_space's inode to its superblock's dirty
1117 : * inode list.
1118 : *
1119 : * mark_buffer_dirty() is atomic. It takes bh->b_folio->mapping->private_lock,
1120 : * i_pages lock and mapping->host->i_lock.
1121 : */
1122 0 : void mark_buffer_dirty(struct buffer_head *bh)
1123 : {
1124 0 : WARN_ON_ONCE(!buffer_uptodate(bh));
1125 :
1126 0 : trace_block_dirty_buffer(bh);
1127 :
1128 : /*
1129 : * Very *carefully* optimize the it-is-already-dirty case.
1130 : *
1131 : * Don't let the final "is it dirty" escape to before we
1132 : * perhaps modified the buffer.
1133 : */
1134 0 : if (buffer_dirty(bh)) {
1135 0 : smp_mb();
1136 0 : if (buffer_dirty(bh))
1137 : return;
1138 : }
1139 :
1140 0 : if (!test_set_buffer_dirty(bh)) {
1141 0 : struct folio *folio = bh->b_folio;
1142 0 : struct address_space *mapping = NULL;
1143 :
1144 0 : folio_memcg_lock(folio);
1145 0 : if (!folio_test_set_dirty(folio)) {
1146 0 : mapping = folio->mapping;
1147 0 : if (mapping)
1148 0 : __folio_mark_dirty(folio, mapping, 0);
1149 : }
1150 0 : folio_memcg_unlock(folio);
1151 0 : if (mapping)
1152 0 : __mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
1153 : }
1154 : }
1155 : EXPORT_SYMBOL(mark_buffer_dirty);
1156 :
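A hedged sketch (editor's illustration, not code from this file) of the read-modify-write pattern mark_buffer_dirty() supports; the block number, offset and on-disk field are hypothetical.

/* Editor's sketch - hypothetical metadata update, not part of buffer.c. */
static int examplefs_update_counter(struct super_block *sb, sector_t blocknr,
				    unsigned int offset, u32 value)
{
	struct buffer_head *bh = sb_bread(sb, blocknr);
	int err;

	if (!bh)
		return -EIO;

	/* Modify the in-memory copy, then mark the buffer dirty. */
	*(__le32 *)(bh->b_data + offset) = cpu_to_le32(value);
	mark_buffer_dirty(bh);

	/* Optional: write it out now instead of leaving it to writeback. */
	err = sync_dirty_buffer(bh);
	brelse(bh);
	return err;
}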
1157 0 : void mark_buffer_write_io_error(struct buffer_head *bh)
1158 : {
1159 : struct super_block *sb;
1160 :
1161 0 : set_buffer_write_io_error(bh);
1162 : /* FIXME: do we need to set this in both places? */
1163 0 : if (bh->b_folio && bh->b_folio->mapping)
1164 0 : mapping_set_error(bh->b_folio->mapping, -EIO);
1165 0 : if (bh->b_assoc_map)
1166 0 : mapping_set_error(bh->b_assoc_map, -EIO);
1167 : rcu_read_lock();
1168 0 : sb = READ_ONCE(bh->b_bdev->bd_super);
1169 0 : if (sb)
1170 0 : errseq_set(&sb->s_wb_err, -EIO);
1171 : rcu_read_unlock();
1172 0 : }
1173 : EXPORT_SYMBOL(mark_buffer_write_io_error);
1174 :
1175 : /*
1176 : * Decrement a buffer_head's reference count. If all buffers against a page
1177 : * have zero reference count, are clean and unlocked, and if the page is clean
1178 : * and unlocked then try_to_free_buffers() may strip the buffers from the page
1179 : * in preparation for freeing it (sometimes, rarely, buffers are removed from
1180 : * a page but it ends up not being freed, and buffers may later be reattached).
1181 : */
1182 0 : void __brelse(struct buffer_head * buf)
1183 : {
1184 0 : if (atomic_read(&buf->b_count)) {
1185 : put_bh(buf);
1186 : return;
1187 : }
1188 0 : WARN(1, KERN_ERR "VFS: brelse: Trying to free free buffer\n");
1189 : }
1190 : EXPORT_SYMBOL(__brelse);
1191 :
1192 : /*
1193 : * bforget() is like brelse(), except it discards any
1194 : * potentially dirty data.
1195 : */
1196 0 : void __bforget(struct buffer_head *bh)
1197 : {
1198 0 : clear_buffer_dirty(bh);
1199 0 : if (bh->b_assoc_map) {
1200 0 : struct address_space *buffer_mapping = bh->b_folio->mapping;
1201 :
1202 0 : spin_lock(&buffer_mapping->private_lock);
1203 0 : list_del_init(&bh->b_assoc_buffers);
1204 0 : bh->b_assoc_map = NULL;
1205 0 : spin_unlock(&buffer_mapping->private_lock);
1206 : }
1207 0 : __brelse(bh);
1208 0 : }
1209 : EXPORT_SYMBOL(__bforget);
1210 :
1211 0 : static struct buffer_head *__bread_slow(struct buffer_head *bh)
1212 : {
1213 0 : lock_buffer(bh);
1214 0 : if (buffer_uptodate(bh)) {
1215 0 : unlock_buffer(bh);
1216 0 : return bh;
1217 : } else {
1218 0 : get_bh(bh);
1219 0 : bh->b_end_io = end_buffer_read_sync;
1220 0 : submit_bh(REQ_OP_READ, bh);
1221 0 : wait_on_buffer(bh);
1222 0 : if (buffer_uptodate(bh))
1223 : return bh;
1224 : }
1225 0 : brelse(bh);
1226 0 : return NULL;
1227 : }
1228 :
1229 : /*
1230 : * Per-cpu buffer LRU implementation. To reduce the cost of __find_get_block().
1231 : * The bhs[] array is sorted - newest buffer is at bhs[0]. Buffers have their
1232 : * refcount elevated by one when they're in an LRU. A buffer can only appear
1233 : * once in a particular CPU's LRU. A single buffer can be present in multiple
1234 : * CPU's LRUs at the same time.
1235 : *
1236 : * This is a transparent caching front-end to sb_bread(), sb_getblk() and
1237 : * sb_find_get_block().
1238 : *
1239 : * The LRUs themselves only need locking against invalidate_bh_lrus. We use
1240 : * a local interrupt disable for that.
1241 : */
1242 :
1243 : #define BH_LRU_SIZE 16
1244 :
1245 : struct bh_lru {
1246 : struct buffer_head *bhs[BH_LRU_SIZE];
1247 : };
1248 :
1249 : static DEFINE_PER_CPU(struct bh_lru, bh_lrus) = {{ NULL }};
1250 :
1251 : #ifdef CONFIG_SMP
1252 : #define bh_lru_lock() local_irq_disable()
1253 : #define bh_lru_unlock() local_irq_enable()
1254 : #else
1255 : #define bh_lru_lock() preempt_disable()
1256 : #define bh_lru_unlock() preempt_enable()
1257 : #endif
1258 :
1259 0 : static inline void check_irqs_on(void)
1260 : {
1261 : #ifdef irqs_disabled
1262 0 : BUG_ON(irqs_disabled());
1263 : #endif
1264 0 : }
1265 :
1266 : /*
1267 : * Install a buffer_head into this cpu's LRU. If not already in the LRU, it is
1268 : * inserted at the front, and the buffer_head at the back if any is evicted.
1269 : * Or, if already in the LRU it is moved to the front.
1270 : */
1271 0 : static void bh_lru_install(struct buffer_head *bh)
1272 : {
1273 0 : struct buffer_head *evictee = bh;
1274 : struct bh_lru *b;
1275 : int i;
1276 :
1277 0 : check_irqs_on();
1278 0 : bh_lru_lock();
1279 :
1280 : /*
1281 : * the refcount of buffer_head in bh_lru prevents dropping the
1282 : * attached page(i.e., try_to_free_buffers) so it could cause
1283 : * failing page migration.
1284 : * Skip putting upcoming bh into bh_lru until migration is done.
1285 : */
1286 0 : if (lru_cache_disabled()) {
1287 0 : bh_lru_unlock();
1288 0 : return;
1289 : }
1290 :
1291 : b = this_cpu_ptr(&bh_lrus);
1292 0 : for (i = 0; i < BH_LRU_SIZE; i++) {
1293 0 : swap(evictee, b->bhs[i]);
1294 0 : if (evictee == bh) {
1295 0 : bh_lru_unlock();
1296 0 : return;
1297 : }
1298 : }
1299 :
1300 0 : get_bh(bh);
1301 0 : bh_lru_unlock();
1302 0 : brelse(evictee);
1303 : }
1304 :
1305 : /*
1306 : * Look up the bh in this cpu's LRU. If it's there, move it to the head.
1307 : */
1308 : static struct buffer_head *
1309 0 : lookup_bh_lru(struct block_device *bdev, sector_t block, unsigned size)
1310 : {
1311 0 : struct buffer_head *ret = NULL;
1312 : unsigned int i;
1313 :
1314 0 : check_irqs_on();
1315 0 : bh_lru_lock();
1316 0 : for (i = 0; i < BH_LRU_SIZE; i++) {
1317 0 : struct buffer_head *bh = __this_cpu_read(bh_lrus.bhs[i]);
1318 :
1319 0 : if (bh && bh->b_blocknr == block && bh->b_bdev == bdev &&
1320 0 : bh->b_size == size) {
1321 0 : if (i) {
1322 0 : while (i) {
1323 0 : __this_cpu_write(bh_lrus.bhs[i],
1324 : __this_cpu_read(bh_lrus.bhs[i - 1]));
1325 0 : i--;
1326 : }
1327 0 : __this_cpu_write(bh_lrus.bhs[0], bh);
1328 : }
1329 0 : get_bh(bh);
1330 0 : ret = bh;
1331 0 : break;
1332 : }
1333 : }
1334 0 : bh_lru_unlock();
1335 0 : return ret;
1336 : }
1337 :
1338 : /*
1339 : * Perform a pagecache lookup for the matching buffer. If it's there, refresh
1340 : * it in the LRU and mark it as accessed. If it is not present then return
1341 : * NULL
1342 : */
1343 : struct buffer_head *
1344 0 : __find_get_block(struct block_device *bdev, sector_t block, unsigned size)
1345 : {
1346 0 : struct buffer_head *bh = lookup_bh_lru(bdev, block, size);
1347 :
1348 0 : if (bh == NULL) {
1349 : /* __find_get_block_slow will mark the page accessed */
1350 0 : bh = __find_get_block_slow(bdev, block);
1351 0 : if (bh)
1352 0 : bh_lru_install(bh);
1353 : } else
1354 : touch_buffer(bh);
1355 :
1356 0 : return bh;
1357 : }
1358 : EXPORT_SYMBOL(__find_get_block);
1359 :
1360 : /*
1361 : * __getblk_gfp() will locate (and, if necessary, create) the buffer_head
1362 : * which corresponds to the passed block_device, block and size. The
1363 : * returned buffer has its reference count incremented.
1364 : *
1365 : * __getblk_gfp() will lock up the machine if grow_dev_page's
1366 : * try_to_free_buffers() attempt is failing. FIXME, perhaps?
1367 : */
1368 : struct buffer_head *
1369 0 : __getblk_gfp(struct block_device *bdev, sector_t block,
1370 : unsigned size, gfp_t gfp)
1371 : {
1372 0 : struct buffer_head *bh = __find_get_block(bdev, block, size);
1373 :
1374 : might_sleep();
1375 0 : if (bh == NULL)
1376 0 : bh = __getblk_slow(bdev, block, size, gfp);
1377 0 : return bh;
1378 : }
1379 : EXPORT_SYMBOL(__getblk_gfp);
1380 :
1381 : /*
1382 : * Do async read-ahead on a buffer..
1383 : */
1384 0 : void __breadahead(struct block_device *bdev, sector_t block, unsigned size)
1385 : {
1386 0 : struct buffer_head *bh = __getblk(bdev, block, size);
1387 0 : if (likely(bh)) {
1388 0 : bh_readahead(bh, REQ_RAHEAD);
1389 0 : brelse(bh);
1390 : }
1391 0 : }
1392 : EXPORT_SYMBOL(__breadahead);
1393 :
1394 : /**
1395 : * __bread_gfp() - reads a specified block and returns the bh
1396 : * @bdev: the block_device to read from
1397 : * @block: number of block
1398 : * @size: size (in bytes) to read
1399 : * @gfp: page allocation flag
1400 : *
1401 : * Reads a specified block, and returns buffer head that contains it.
1402 : * If you set @gfp to zero, the page cache can be allocated from a
1403 : * non-movable area so that the pinned buffer page does not hinder page migration.
1404 : * It returns NULL if the block was unreadable.
1405 : */
1406 : struct buffer_head *
1407 0 : __bread_gfp(struct block_device *bdev, sector_t block,
1408 : unsigned size, gfp_t gfp)
1409 : {
1410 0 : struct buffer_head *bh = __getblk_gfp(bdev, block, size, gfp);
1411 :
1412 0 : if (likely(bh) && !buffer_uptodate(bh))
1413 0 : bh = __bread_slow(bh);
1414 0 : return bh;
1415 : }
1416 : EXPORT_SYMBOL(__bread_gfp);
1417 :
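A brief usage sketch (editor's illustration, not code from this file): reading a metadata block with __bread() and hinting at a neighbouring block with __breadahead(); the block numbers are hypothetical.

/* Editor's sketch - hypothetical caller, not part of buffer.c. */
static int examplefs_read_sb_block(struct block_device *bdev,
				   unsigned int blocksize)
{
	struct buffer_head *bh;

	__breadahead(bdev, 2, blocksize);	/* likely to want block 2 soon */

	bh = __bread(bdev, 1, blocksize);	/* NULL means the block was unreadable */
	if (!bh)
		return -EIO;

	/* ... parse bh->b_data ... */

	brelse(bh);
	return 0;
}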
1418 : static void __invalidate_bh_lrus(struct bh_lru *b)
1419 : {
1420 : int i;
1421 :
1422 0 : for (i = 0; i < BH_LRU_SIZE; i++) {
1423 0 : brelse(b->bhs[i]);
1424 0 : b->bhs[i] = NULL;
1425 : }
1426 : }
1427 : /*
1428 : * invalidate_bh_lrus() is called rarely - but not only at unmount.
1429 : * This doesn't race because it runs in each cpu either in irq
1430 : * or with preempt disabled.
1431 : */
1432 0 : static void invalidate_bh_lru(void *arg)
1433 : {
1434 0 : struct bh_lru *b = &get_cpu_var(bh_lrus);
1435 :
1436 0 : __invalidate_bh_lrus(b);
1437 0 : put_cpu_var(bh_lrus);
1438 0 : }
1439 :
1440 0 : bool has_bh_in_lru(int cpu, void *dummy)
1441 : {
1442 0 : struct bh_lru *b = per_cpu_ptr(&bh_lrus, cpu);
1443 : int i;
1444 :
1445 0 : for (i = 0; i < BH_LRU_SIZE; i++) {
1446 0 : if (b->bhs[i])
1447 : return true;
1448 : }
1449 :
1450 : return false;
1451 : }
1452 :
1453 0 : void invalidate_bh_lrus(void)
1454 : {
1455 0 : on_each_cpu_cond(has_bh_in_lru, invalidate_bh_lru, NULL, 1);
1456 0 : }
1457 : EXPORT_SYMBOL_GPL(invalidate_bh_lrus);
1458 :
1459 : /*
1460 : * It's called from workqueue context so we need a bh_lru_lock to close
1461 : * the race with preemption/irq.
1462 : */
1463 0 : void invalidate_bh_lrus_cpu(void)
1464 : {
1465 : struct bh_lru *b;
1466 :
1467 0 : bh_lru_lock();
1468 0 : b = this_cpu_ptr(&bh_lrus);
1469 0 : __invalidate_bh_lrus(b);
1470 0 : bh_lru_unlock();
1471 0 : }
1472 :
1473 0 : void set_bh_page(struct buffer_head *bh,
1474 : struct page *page, unsigned long offset)
1475 : {
1476 0 : bh->b_page = page;
1477 0 : BUG_ON(offset >= PAGE_SIZE);
1478 0 : if (PageHighMem(page))
1479 : /*
1480 : * This catches illegal uses and preserves the offset:
1481 : */
1482 : bh->b_data = (char *)(0 + offset);
1483 : else
1484 0 : bh->b_data = page_address(page) + offset;
1485 0 : }
1486 : EXPORT_SYMBOL(set_bh_page);
1487 :
1488 : /*
1489 : * Called when truncating a buffer on a page completely.
1490 : */
1491 :
1492 : /* Bits that are cleared during an invalidate */
1493 : #define BUFFER_FLAGS_DISCARD \
1494 : (1 << BH_Mapped | 1 << BH_New | 1 << BH_Req | \
1495 : 1 << BH_Delay | 1 << BH_Unwritten)
1496 :
1497 0 : static void discard_buffer(struct buffer_head * bh)
1498 : {
1499 : unsigned long b_state;
1500 :
1501 0 : lock_buffer(bh);
1502 0 : clear_buffer_dirty(bh);
1503 0 : bh->b_bdev = NULL;
1504 0 : b_state = READ_ONCE(bh->b_state);
1505 : do {
1506 0 : } while (!try_cmpxchg(&bh->b_state, &b_state,
1507 : b_state & ~BUFFER_FLAGS_DISCARD));
1508 0 : unlock_buffer(bh);
1509 0 : }
1510 :
1511 : /**
1512 : * block_invalidate_folio - Invalidate part or all of a buffer-backed folio.
1513 : * @folio: The folio which is affected.
1514 : * @offset: start of the range to invalidate
1515 : * @length: length of the range to invalidate
1516 : *
1517 : * block_invalidate_folio() is called when all or part of the folio has been
1518 : * invalidated by a truncate operation.
1519 : *
1520 : * block_invalidate_folio() does not have to release all buffers, but it must
1521 : * ensure that no dirty buffer is left outside @offset and that no I/O
1522 : * is underway against any of the blocks which are outside the truncation
1523 : * point. Because the caller is about to free (and possibly reuse) those
1524 : * blocks on-disk.
1525 : */
1526 0 : void block_invalidate_folio(struct folio *folio, size_t offset, size_t length)
1527 : {
1528 : struct buffer_head *head, *bh, *next;
1529 0 : size_t curr_off = 0;
1530 0 : size_t stop = length + offset;
1531 :
1532 0 : BUG_ON(!folio_test_locked(folio));
1533 :
1534 : /*
1535 : * Check for overflow
1536 : */
1537 0 : BUG_ON(stop > folio_size(folio) || stop < length);
1538 :
1539 0 : head = folio_buffers(folio);
1540 0 : if (!head)
1541 : return;
1542 :
1543 : bh = head;
1544 : do {
1545 0 : size_t next_off = curr_off + bh->b_size;
1546 0 : next = bh->b_this_page;
1547 :
1548 : /*
1549 : * Are we still fully in range ?
1550 : */
1551 0 : if (next_off > stop)
1552 : goto out;
1553 :
1554 : /*
1555 : * is this block fully invalidated?
1556 : */
1557 0 : if (offset <= curr_off)
1558 0 : discard_buffer(bh);
1559 0 : curr_off = next_off;
1560 0 : bh = next;
1561 0 : } while (bh != head);
1562 :
1563 : /*
1564 : * We release buffers only if the entire folio is being invalidated.
1565 : * The get_block cached value has been unconditionally invalidated,
1566 : * so real IO is not possible anymore.
1567 : */
1568 0 : if (length == folio_size(folio))
1569 0 : filemap_release_folio(folio, 0);
1570 : out:
1571 : return;
1572 : }
1573 : EXPORT_SYMBOL(block_invalidate_folio);
1574 :
1575 :
1576 : /*
1577 : * We attach and possibly dirty the buffers atomically wrt
1578 : * block_dirty_folio() via private_lock. try_to_free_buffers
1579 : * is already excluded via the page lock.
1580 : */
1581 0 : void create_empty_buffers(struct page *page,
1582 : unsigned long blocksize, unsigned long b_state)
1583 : {
1584 : struct buffer_head *bh, *head, *tail;
1585 :
1586 0 : head = alloc_page_buffers(page, blocksize, true);
1587 0 : bh = head;
1588 : do {
1589 0 : bh->b_state |= b_state;
1590 0 : tail = bh;
1591 0 : bh = bh->b_this_page;
1592 0 : } while (bh);
1593 0 : tail->b_this_page = head;
1594 :
1595 0 : spin_lock(&page->mapping->private_lock);
1596 0 : if (PageUptodate(page) || PageDirty(page)) {
1597 : bh = head;
1598 : do {
1599 0 : if (PageDirty(page))
1600 : set_buffer_dirty(bh);
1601 0 : if (PageUptodate(page))
1602 : set_buffer_uptodate(bh);
1603 0 : bh = bh->b_this_page;
1604 0 : } while (bh != head);
1605 : }
1606 0 : attach_page_private(page, head);
1607 0 : spin_unlock(&page->mapping->private_lock);
1608 0 : }
1609 : EXPORT_SYMBOL(create_empty_buffers);
1610 :
1611 : /**
1612 : * clean_bdev_aliases: clean a range of buffers in block device
1613 : * @bdev: Block device to clean buffers in
1614 : * @block: Start of a range of blocks to clean
1615 : * @len: Number of blocks to clean
1616 : *
1617 : * We are taking a range of blocks for data and we don't want writeback of any
1618 : * buffer-cache aliases from the moment this function returns until the moment
1619 : * something explicitly marks the buffer dirty again (hopefully that
1620 : * will not happen until we free that block ;-) We don't even need to mark
1621 : * it not-uptodate - nobody can expect anything from a newly allocated buffer
1622 : * anyway. We used to use unmap_buffer() for such invalidation, but that was
1623 : * wrong. We definitely don't want to mark the alias unmapped, for example - it
1624 : * would confuse anyone who might pick it with bread() afterwards...
1625 : *
1626 : * Also.. Note that bforget() doesn't lock the buffer. So there can be
1627 : * writeout I/O going on against recently-freed buffers. We don't wait on that
1628 : * I/O in bforget() - it's more efficient to wait on the I/O only if we really
1629 : * need to. That happens here.
1630 : */
1631 0 : void clean_bdev_aliases(struct block_device *bdev, sector_t block, sector_t len)
1632 : {
1633 0 : struct inode *bd_inode = bdev->bd_inode;
1634 0 : struct address_space *bd_mapping = bd_inode->i_mapping;
1635 : struct folio_batch fbatch;
1636 0 : pgoff_t index = block >> (PAGE_SHIFT - bd_inode->i_blkbits);
1637 : pgoff_t end;
1638 : int i, count;
1639 : struct buffer_head *bh;
1640 : struct buffer_head *head;
1641 :
1642 0 : end = (block + len - 1) >> (PAGE_SHIFT - bd_inode->i_blkbits);
1643 0 : folio_batch_init(&fbatch);
1644 0 : while (filemap_get_folios(bd_mapping, &index, end, &fbatch)) {
1645 0 : count = folio_batch_count(&fbatch);
1646 0 : for (i = 0; i < count; i++) {
1647 0 : struct folio *folio = fbatch.folios[i];
1648 :
1649 0 : if (!folio_buffers(folio))
1650 0 : continue;
1651 : /*
1652 : * We use folio lock instead of bd_mapping->private_lock
1653 : * to pin buffers here since we can afford to sleep and
1654 : * it scales better than a global spinlock lock.
1655 : */
1656 0 : folio_lock(folio);
1657 : /* Recheck when the folio is locked which pins bhs */
1658 0 : head = folio_buffers(folio);
1659 0 : if (!head)
1660 : goto unlock_page;
1661 : bh = head;
1662 : do {
1663 0 : if (!buffer_mapped(bh) || (bh->b_blocknr < block))
1664 : goto next;
1665 0 : if (bh->b_blocknr >= block + len)
1666 : break;
1667 0 : clear_buffer_dirty(bh);
1668 0 : wait_on_buffer(bh);
1669 : clear_buffer_req(bh);
1670 : next:
1671 0 : bh = bh->b_this_page;
1672 0 : } while (bh != head);
1673 : unlock_page:
1674 0 : folio_unlock(folio);
1675 : }
1676 0 : folio_batch_release(&fbatch);
1677 0 : cond_resched();
1678 : /* End of range already reached? */
1679 0 : if (index > end || !index)
1680 : break;
1681 : }
1682 0 : }
1683 : EXPORT_SYMBOL(clean_bdev_aliases);
1684 :
1685 : /*
1686 : * Size is a power-of-two in the range 512..PAGE_SIZE,
1687 : * and the case we care about most is PAGE_SIZE.
1688 : *
1689 : * So this *could* possibly be written with those
1690 : * constraints in mind (relevant mostly if some
1691 : * architecture has a slow bit-scan instruction)
1692 : */
1693 : static inline int block_size_bits(unsigned int blocksize)
1694 : {
1695 0 : return ilog2(blocksize);
1696 : }
1697 :
1698 0 : static struct buffer_head *create_page_buffers(struct page *page, struct inode *inode, unsigned int b_state)
1699 : {
1700 0 : BUG_ON(!PageLocked(page));
1701 :
1702 0 : if (!page_has_buffers(page))
1703 0 : create_empty_buffers(page, 1 << READ_ONCE(inode->i_blkbits),
1704 : b_state);
1705 0 : return page_buffers(page);
1706 : }
1707 :
1708 : /*
1709 : * NOTE! All mapped/uptodate combinations are valid:
1710 : *
1711 : * Mapped  Uptodate  Meaning
1712 : *
1713 : * No      No        "unknown" - must do get_block()
1714 : * No      Yes       "hole" - zero-filled
1715 : * Yes     No        "allocated" - allocated on disk, not read in
1716 : * Yes     Yes       "valid" - allocated and up-to-date in memory.
1717 : *
1718 : * "Dirty" is valid only with the last case (mapped+uptodate).
1719 : */
1720 :
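The table above describes exactly what a filesystem's get_block_t callback is expected to produce in the buffer_head it is handed. Purely as an illustration (myfs_get_block() and MYFS_DATA_START are invented names, not part of this file), a filesystem whose data blocks sit at a fixed linear offset on disk could report the "mapped" state like this; later sketches in this listing reuse this callback:

/*
 * Hypothetical sketch only.  map_bh() sets BH_Mapped and fills in
 * b_bdev, b_blocknr and b_size from the superblock.  A real callback
 * would allocate a block when create != 0 and set BH_New on freshly
 * allocated blocks so the generic code knows to zero around them.
 */
static int myfs_get_block(struct inode *inode, sector_t iblock,
			  struct buffer_head *bh_result, int create)
{
	map_bh(bh_result, inode->i_sb, MYFS_DATA_START + iblock);
	return 0;
}
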
1721 : /*
1722 : * While block_write_full_page is writing back the dirty buffers under
1723 : * the page lock, whoever dirtied the buffers may decide to clean them
1724 : * again at any time. We handle that by only looking at the buffer
1725 : * state inside lock_buffer().
1726 : *
1727 : * If block_write_full_page() is called for regular writeback
1728 : * (wbc->sync_mode == WB_SYNC_NONE) then it will redirty a page which has a
1729 : * locked buffer. This only can happen if someone has written the buffer
1730 : * directly, with submit_bh(). At the address_space level PageWriteback
1731 : * prevents this contention from occurring.
1732 : *
1733 : * If block_write_full_page() is called with wbc->sync_mode ==
1734 : * WB_SYNC_ALL, the writes are posted using REQ_SYNC; this
1735 : * causes the writes to be flagged as synchronous writes.
1736 : */
1737 0 : int __block_write_full_page(struct inode *inode, struct page *page,
1738 : get_block_t *get_block, struct writeback_control *wbc,
1739 : bh_end_io_t *handler)
1740 : {
1741 : int err;
1742 : sector_t block;
1743 : sector_t last_block;
1744 : struct buffer_head *bh, *head;
1745 : unsigned int blocksize, bbits;
1746 0 : int nr_underway = 0;
1747 0 : blk_opf_t write_flags = wbc_to_write_flags(wbc);
1748 :
1749 0 : head = create_page_buffers(page, inode,
1750 : (1 << BH_Dirty)|(1 << BH_Uptodate));
1751 :
1752 : /*
1753 : * Be very careful. We have no exclusion from block_dirty_folio
1754 : * here, and the (potentially unmapped) buffers may become dirty at
1755 : * any time. If a buffer becomes dirty here after we've inspected it
1756 : * then we just miss that fact, and the page stays dirty.
1757 : *
1758 : * Buffers outside i_size may be dirtied by block_dirty_folio;
1759 : * handle that here by just cleaning them.
1760 : */
1761 :
1762 0 : bh = head;
1763 0 : blocksize = bh->b_size;
1764 0 : bbits = block_size_bits(blocksize);
1765 :
1766 0 : block = (sector_t)page->index << (PAGE_SHIFT - bbits);
1767 0 : last_block = (i_size_read(inode) - 1) >> bbits;
1768 :
1769 : /*
1770 : * Get all the dirty buffers mapped to disk addresses and
1771 : * handle any aliases from the underlying blockdev's mapping.
1772 : */
1773 : do {
1774 0 : if (block > last_block) {
1775 : /*
1776 : * mapped buffers outside i_size will occur, because
1777 : * this page can be outside i_size when there is a
1778 : * truncate in progress.
1779 : */
1780 : /*
1781 : * The buffer was zeroed by block_write_full_page()
1782 : */
1783 0 : clear_buffer_dirty(bh);
1784 : set_buffer_uptodate(bh);
1785 0 : } else if ((!buffer_mapped(bh) || buffer_delay(bh)) &&
1786 0 : buffer_dirty(bh)) {
1787 0 : WARN_ON(bh->b_size != blocksize);
1788 0 : err = get_block(inode, block, bh, 1);
1789 0 : if (err)
1790 : goto recover;
1791 0 : clear_buffer_delay(bh);
1792 0 : if (buffer_new(bh)) {
1793 : /* blockdev mappings never come here */
1794 0 : clear_buffer_new(bh);
1795 0 : clean_bdev_bh_alias(bh);
1796 : }
1797 : }
1798 0 : bh = bh->b_this_page;
1799 0 : block++;
1800 0 : } while (bh != head);
1801 :
1802 : do {
1803 0 : if (!buffer_mapped(bh))
1804 0 : continue;
1805 : /*
1806 : * If it's a fully non-blocking write attempt and we cannot
1807 : * lock the buffer then redirty the page. Note that this can
1808 : * potentially cause a busy-wait loop from writeback threads
1809 : * and kswapd activity, but those code paths have their own
1810 : * higher-level throttling.
1811 : */
1812 0 : if (wbc->sync_mode != WB_SYNC_NONE) {
1813 : lock_buffer(bh);
1814 0 : } else if (!trylock_buffer(bh)) {
1815 0 : redirty_page_for_writepage(wbc, page);
1816 0 : continue;
1817 : }
1818 0 : if (test_clear_buffer_dirty(bh)) {
1819 : mark_buffer_async_write_endio(bh, handler);
1820 : } else {
1821 : unlock_buffer(bh);
1822 : }
1823 0 : } while ((bh = bh->b_this_page) != head);
1824 :
1825 : /*
1826 : * The page and its buffers are protected by PageWriteback(), so we can
1827 : * drop the bh refcounts early.
1828 : */
1829 0 : BUG_ON(PageWriteback(page));
1830 0 : set_page_writeback(page);
1831 :
1832 : do {
1833 0 : struct buffer_head *next = bh->b_this_page;
1834 0 : if (buffer_async_write(bh)) {
1835 0 : submit_bh_wbc(REQ_OP_WRITE | write_flags, bh, wbc);
1836 0 : nr_underway++;
1837 : }
1838 0 : bh = next;
1839 0 : } while (bh != head);
1840 0 : unlock_page(page);
1841 :
1842 0 : err = 0;
1843 : done:
1844 0 : if (nr_underway == 0) {
1845 : /*
1846 : * The page was marked dirty, but the buffers were
1847 : * clean. Someone wrote them back by hand with
1848 : * write_dirty_buffer/submit_bh. A rare case.
1849 : */
1850 0 : end_page_writeback(page);
1851 :
1852 : /*
1853 : * The page and buffer_heads can be released at any time from
1854 : * here on.
1855 : */
1856 : }
1857 0 : return err;
1858 :
1859 : recover:
1860 : /*
1861 : * ENOSPC, or some other error. We may already have added some
1862 : * blocks to the file, so we need to write these out to avoid
1863 : * exposing stale data.
1864 : * The page is currently locked and not marked for writeback
1865 : */
1866 : bh = head;
1867 : /* Recovery: lock and submit the mapped buffers */
1868 : do {
1869 0 : if (buffer_mapped(bh) && buffer_dirty(bh) &&
1870 0 : !buffer_delay(bh)) {
1871 0 : lock_buffer(bh);
1872 : mark_buffer_async_write_endio(bh, handler);
1873 : } else {
1874 : /*
1875 : * The buffer may have been set dirty during
1876 : * attachment to a dirty page.
1877 : */
1878 : clear_buffer_dirty(bh);
1879 : }
1880 0 : } while ((bh = bh->b_this_page) != head);
1881 0 : SetPageError(page);
1882 0 : BUG_ON(PageWriteback(page));
1883 0 : mapping_set_error(page->mapping, err);
1884 0 : set_page_writeback(page);
1885 : do {
1886 0 : struct buffer_head *next = bh->b_this_page;
1887 0 : if (buffer_async_write(bh)) {
1888 0 : clear_buffer_dirty(bh);
1889 0 : submit_bh_wbc(REQ_OP_WRITE | write_flags, bh, wbc);
1890 0 : nr_underway++;
1891 : }
1892 0 : bh = next;
1893 0 : } while (bh != head);
1894 0 : unlock_page(page);
1895 0 : goto done;
1896 : }
1897 : EXPORT_SYMBOL(__block_write_full_page);
1898 :
1899 : /*
1900 : * If a page has any new buffers, zero them out here, and mark them uptodate
1901 : * and dirty so they'll be written out (in order to prevent uninitialised
1902 : * block data from leaking). And clear the new bit.
1903 : */
1904 0 : void page_zero_new_buffers(struct page *page, unsigned from, unsigned to)
1905 : {
1906 : unsigned int block_start, block_end;
1907 : struct buffer_head *head, *bh;
1908 :
1909 0 : BUG_ON(!PageLocked(page));
1910 0 : if (!page_has_buffers(page))
1911 : return;
1912 :
1913 0 : bh = head = page_buffers(page);
1914 0 : block_start = 0;
1915 : do {
1916 0 : block_end = block_start + bh->b_size;
1917 :
1918 0 : if (buffer_new(bh)) {
1919 0 : if (block_end > from && block_start < to) {
1920 0 : if (!PageUptodate(page)) {
1921 : unsigned start, size;
1922 :
1923 0 : start = max(from, block_start);
1924 0 : size = min(to, block_end) - start;
1925 :
1926 0 : zero_user(page, start, size);
1927 : set_buffer_uptodate(bh);
1928 : }
1929 :
1930 0 : clear_buffer_new(bh);
1931 0 : mark_buffer_dirty(bh);
1932 : }
1933 : }
1934 :
1935 0 : block_start = block_end;
1936 0 : bh = bh->b_this_page;
1937 0 : } while (bh != head);
1938 : }
1939 : EXPORT_SYMBOL(page_zero_new_buffers);
1940 :
1941 : static void
1942 0 : iomap_to_bh(struct inode *inode, sector_t block, struct buffer_head *bh,
1943 : const struct iomap *iomap)
1944 : {
1945 0 : loff_t offset = block << inode->i_blkbits;
1946 :
1947 0 : bh->b_bdev = iomap->bdev;
1948 :
1949 : /*
1950 : * Block points to offset in file we need to map, iomap contains
1951 : * the offset at which the map starts. If the map ends before the
1952 : * current block, then do not map the buffer and let the caller
1953 : * handle it.
1954 : */
1955 0 : BUG_ON(offset >= iomap->offset + iomap->length);
1956 :
1957 0 : switch (iomap->type) {
1958 : case IOMAP_HOLE:
1959 : /*
1960 : * If the buffer is not up to date or beyond the current EOF,
1961 : * we need to mark it as new to ensure sub-block zeroing is
1962 : * executed if necessary.
1963 : */
1964 0 : if (!buffer_uptodate(bh) ||
1965 0 : (offset >= i_size_read(inode)))
1966 : set_buffer_new(bh);
1967 : break;
1968 : case IOMAP_DELALLOC:
1969 0 : if (!buffer_uptodate(bh) ||
1970 0 : (offset >= i_size_read(inode)))
1971 : set_buffer_new(bh);
1972 0 : set_buffer_uptodate(bh);
1973 0 : set_buffer_mapped(bh);
1974 : set_buffer_delay(bh);
1975 : break;
1976 : case IOMAP_UNWRITTEN:
1977 : /*
1978 : * For unwritten regions, we always need to ensure that regions
1979 : * in the block we are not writing to are zeroed. Mark the
1980 : * buffer as new to ensure this.
1981 : */
1982 0 : set_buffer_new(bh);
1983 : set_buffer_unwritten(bh);
1984 : fallthrough;
1985 : case IOMAP_MAPPED:
1986 0 : if ((iomap->flags & IOMAP_F_NEW) ||
1987 0 : offset >= i_size_read(inode))
1988 : set_buffer_new(bh);
1989 0 : bh->b_blocknr = (iomap->addr + offset - iomap->offset) >>
1990 0 : inode->i_blkbits;
1991 : set_buffer_mapped(bh);
1992 : break;
1993 : }
1994 0 : }
1995 :
1996 0 : int __block_write_begin_int(struct folio *folio, loff_t pos, unsigned len,
1997 : get_block_t *get_block, const struct iomap *iomap)
1998 : {
1999 0 : unsigned from = pos & (PAGE_SIZE - 1);
2000 0 : unsigned to = from + len;
2001 0 : struct inode *inode = folio->mapping->host;
2002 : unsigned block_start, block_end;
2003 : sector_t block;
2004 0 : int err = 0;
2005 : unsigned blocksize, bbits;
2006 0 : struct buffer_head *bh, *head, *wait[2], **wait_bh=wait;
2007 :
2008 0 : BUG_ON(!folio_test_locked(folio));
2009 : BUG_ON(from > PAGE_SIZE);
2010 0 : BUG_ON(to > PAGE_SIZE);
2011 0 : BUG_ON(from > to);
2012 :
2013 0 : head = create_page_buffers(&folio->page, inode, 0);
2014 0 : blocksize = head->b_size;
2015 0 : bbits = block_size_bits(blocksize);
2016 :
2017 0 : block = (sector_t)folio->index << (PAGE_SHIFT - bbits);
2018 :
2019 0 : for(bh = head, block_start = 0; bh != head || !block_start;
2020 0 : block++, block_start=block_end, bh = bh->b_this_page) {
2021 0 : block_end = block_start + blocksize;
2022 0 : if (block_end <= from || block_start >= to) {
2023 0 : if (folio_test_uptodate(folio)) {
2024 0 : if (!buffer_uptodate(bh))
2025 : set_buffer_uptodate(bh);
2026 : }
2027 0 : continue;
2028 : }
2029 0 : if (buffer_new(bh))
2030 : clear_buffer_new(bh);
2031 0 : if (!buffer_mapped(bh)) {
2032 0 : WARN_ON(bh->b_size != blocksize);
2033 0 : if (get_block) {
2034 0 : err = get_block(inode, block, bh, 1);
2035 0 : if (err)
2036 : break;
2037 : } else {
2038 0 : iomap_to_bh(inode, block, bh, iomap);
2039 : }
2040 :
2041 0 : if (buffer_new(bh)) {
2042 0 : clean_bdev_bh_alias(bh);
2043 0 : if (folio_test_uptodate(folio)) {
2044 0 : clear_buffer_new(bh);
2045 0 : set_buffer_uptodate(bh);
2046 0 : mark_buffer_dirty(bh);
2047 0 : continue;
2048 : }
2049 0 : if (block_end > to || block_start < from)
2050 0 : folio_zero_segments(folio,
2051 : to, block_end,
2052 : block_start, from);
2053 0 : continue;
2054 : }
2055 : }
2056 0 : if (folio_test_uptodate(folio)) {
2057 0 : if (!buffer_uptodate(bh))
2058 : set_buffer_uptodate(bh);
2059 0 : continue;
2060 : }
2061 0 : if (!buffer_uptodate(bh) && !buffer_delay(bh) &&
2062 0 : !buffer_unwritten(bh) &&
2063 0 : (block_start < from || block_end > to)) {
2064 0 : bh_read_nowait(bh, 0);
2065 0 : *wait_bh++=bh;
2066 : }
2067 : }
2068 : /*
2069 : * If we issued read requests - let them complete.
2070 : */
2071 0 : while(wait_bh > wait) {
2072 0 : wait_on_buffer(*--wait_bh);
2073 0 : if (!buffer_uptodate(*wait_bh))
2074 0 : err = -EIO;
2075 : }
2076 0 : if (unlikely(err))
2077 0 : page_zero_new_buffers(&folio->page, from, to);
2078 0 : return err;
2079 : }
2080 :
2081 0 : int __block_write_begin(struct page *page, loff_t pos, unsigned len,
2082 : get_block_t *get_block)
2083 : {
2084 0 : return __block_write_begin_int(page_folio(page), pos, len, get_block,
2085 : NULL);
2086 : }
2087 : EXPORT_SYMBOL(__block_write_begin);
2088 :
2089 0 : static int __block_commit_write(struct inode *inode, struct page *page,
2090 : unsigned from, unsigned to)
2091 : {
2092 : unsigned block_start, block_end;
2093 0 : int partial = 0;
2094 : unsigned blocksize;
2095 : struct buffer_head *bh, *head;
2096 :
2097 0 : bh = head = page_buffers(page);
2098 0 : blocksize = bh->b_size;
2099 :
2100 0 : block_start = 0;
2101 : do {
2102 0 : block_end = block_start + blocksize;
2103 0 : if (block_end <= from || block_start >= to) {
2104 0 : if (!buffer_uptodate(bh))
2105 0 : partial = 1;
2106 : } else {
2107 0 : set_buffer_uptodate(bh);
2108 0 : mark_buffer_dirty(bh);
2109 : }
2110 0 : if (buffer_new(bh))
2111 : clear_buffer_new(bh);
2112 :
2113 0 : block_start = block_end;
2114 0 : bh = bh->b_this_page;
2115 0 : } while (bh != head);
2116 :
2117 : /*
2118 : * If this is a partial write which happened to make all buffers
2119 : * uptodate then we can optimize away a bogus read_folio() for
2120 : * the next read(). Here we 'discover' whether the page went
2121 : * uptodate as a result of this (potentially partial) write.
2122 : */
2123 0 : if (!partial)
2124 : SetPageUptodate(page);
2125 0 : return 0;
2126 : }
2127 :
2128 : /*
2129 : * block_write_begin takes care of the basic task of block allocation and
2130 : * bringing partial write blocks uptodate first.
2131 : *
2132 : * The filesystem needs to handle block truncation upon failure.
2133 : */
2134 0 : int block_write_begin(struct address_space *mapping, loff_t pos, unsigned len,
2135 : struct page **pagep, get_block_t *get_block)
2136 : {
2137 0 : pgoff_t index = pos >> PAGE_SHIFT;
2138 : struct page *page;
2139 : int status;
2140 :
2141 0 : page = grab_cache_page_write_begin(mapping, index);
2142 0 : if (!page)
2143 : return -ENOMEM;
2144 :
2145 0 : status = __block_write_begin(page, pos, len, get_block);
2146 0 : if (unlikely(status)) {
2147 0 : unlock_page(page);
2148 0 : put_page(page);
2149 0 : page = NULL;
2150 : }
2151 :
2152 0 : *pagep = page;
2153 0 : return status;
2154 : }
2155 : EXPORT_SYMBOL(block_write_begin);
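A filesystem's ->write_begin is often just a thin wrapper around this helper. A minimal sketch, assuming the hypothetical myfs_get_block() callback from earlier (and remembering that, per the comment above, the filesystem must truncate any blocks it allocated if this fails):

static int myfs_write_begin(struct file *file, struct address_space *mapping,
			    loff_t pos, unsigned len,
			    struct page **pagep, void **fsdata)
{
	/* block allocation and partial-block read-in happen in here */
	return block_write_begin(mapping, pos, len, pagep, myfs_get_block);
}

Such a ->write_begin is normally paired with generic_write_end() below as the ->write_end hook.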
2156 :
2157 0 : int block_write_end(struct file *file, struct address_space *mapping,
2158 : loff_t pos, unsigned len, unsigned copied,
2159 : struct page *page, void *fsdata)
2160 : {
2161 0 : struct inode *inode = mapping->host;
2162 : unsigned start;
2163 :
2164 0 : start = pos & (PAGE_SIZE - 1);
2165 :
2166 0 : if (unlikely(copied < len)) {
2167 : /*
2168 : * The buffers that were written will now be uptodate, so
2169 : * we don't have to worry about a read_folio reading them
2170 : * and overwriting a partial write. However if we have
2171 : * encountered a short write and only partially written
2172 : * into a buffer, it will not be marked uptodate, so a
2173 : * read_folio might come in and destroy our partial write.
2174 : *
2175 : * Do the simplest thing, and just treat any short write to a
2176 : * non uptodate page as a zero-length write, and force the
2177 : * caller to redo the whole thing.
2178 : */
2179 0 : if (!PageUptodate(page))
2180 0 : copied = 0;
2181 :
2182 0 : page_zero_new_buffers(page, start+copied, start+len);
2183 : }
2184 0 : flush_dcache_page(page);
2185 :
2186 : /* This could be a short (even 0-length) commit */
2187 0 : __block_commit_write(inode, page, start, start+copied);
2188 :
2189 0 : return copied;
2190 : }
2191 : EXPORT_SYMBOL(block_write_end);
2192 :
2193 0 : int generic_write_end(struct file *file, struct address_space *mapping,
2194 : loff_t pos, unsigned len, unsigned copied,
2195 : struct page *page, void *fsdata)
2196 : {
2197 0 : struct inode *inode = mapping->host;
2198 0 : loff_t old_size = inode->i_size;
2199 0 : bool i_size_changed = false;
2200 :
2201 0 : copied = block_write_end(file, mapping, pos, len, copied, page, fsdata);
2202 :
2203 : /*
2204 : * No need to use i_size_read() here, the i_size cannot change under us
2205 : * because we hold i_rwsem.
2206 : *
2207 : * But it's important to update i_size while still holding page lock:
2208 : * page writeout could otherwise come in and zero beyond i_size.
2209 : */
2210 0 : if (pos + copied > inode->i_size) {
2211 0 : i_size_write(inode, pos + copied);
2212 0 : i_size_changed = true;
2213 : }
2214 :
2215 0 : unlock_page(page);
2216 0 : put_page(page);
2217 :
2218 0 : if (old_size < pos)
2219 0 : pagecache_isize_extended(inode, old_size, pos);
2220 : /*
2221 : * Don't mark the inode dirty under page lock. First, it unnecessarily
2222 : * lengthens the time the page lock is held. Second, it forces lock
2223 : * ordering of page lock and transaction start for journaling
2224 : * filesystems.
2225 : */
2226 0 : if (i_size_changed)
2227 : mark_inode_dirty(inode);
2228 0 : return copied;
2229 : }
2230 : EXPORT_SYMBOL(generic_write_end);
2231 :
2232 : /*
2233 : * block_is_partially_uptodate checks whether buffers within a folio are
2234 : * uptodate or not.
2235 : *
2236 : * Returns true if all buffers which correspond to the specified part
2237 : * of the folio are uptodate.
2238 : */
2239 0 : bool block_is_partially_uptodate(struct folio *folio, size_t from, size_t count)
2240 : {
2241 : unsigned block_start, block_end, blocksize;
2242 : unsigned to;
2243 : struct buffer_head *bh, *head;
2244 0 : bool ret = true;
2245 :
2246 0 : head = folio_buffers(folio);
2247 0 : if (!head)
2248 : return false;
2249 0 : blocksize = head->b_size;
2250 0 : to = min_t(unsigned, folio_size(folio) - from, count);
2251 0 : to = from + to;
2252 0 : if (from < blocksize && to > folio_size(folio) - blocksize)
2253 : return false;
2254 :
2255 : bh = head;
2256 : block_start = 0;
2257 : do {
2258 0 : block_end = block_start + blocksize;
2259 0 : if (block_end > from && block_start < to) {
2260 0 : if (!buffer_uptodate(bh)) {
2261 : ret = false;
2262 : break;
2263 : }
2264 0 : if (block_end >= to)
2265 : break;
2266 : }
2267 0 : block_start = block_end;
2268 0 : bh = bh->b_this_page;
2269 0 : } while (bh != head);
2270 :
2271 : return ret;
2272 : }
2273 : EXPORT_SYMBOL(block_is_partially_uptodate);
2274 :
2275 : /*
2276 : * Generic "read_folio" function for block devices that have the normal
2277 : * get_block functionality. This is most of the block device filesystems.
2278 : * Reads the folio asynchronously --- the unlock_buffer() and
2279 : * set/clear_buffer_uptodate() functions propagate buffer state into the
2280 : * folio once IO has completed.
2281 : */
2282 0 : int block_read_full_folio(struct folio *folio, get_block_t *get_block)
2283 : {
2284 0 : struct inode *inode = folio->mapping->host;
2285 : sector_t iblock, lblock;
2286 : struct buffer_head *bh, *head, *arr[MAX_BUF_PER_PAGE];
2287 : unsigned int blocksize, bbits;
2288 : int nr, i;
2289 0 : int fully_mapped = 1;
2290 0 : bool page_error = false;
2291 0 : loff_t limit = i_size_read(inode);
2292 :
2293 : /* This is needed for ext4. */
2294 : if (IS_ENABLED(CONFIG_FS_VERITY) && IS_VERITY(inode))
2295 : limit = inode->i_sb->s_maxbytes;
2296 :
2297 : VM_BUG_ON_FOLIO(folio_test_large(folio), folio);
2298 :
2299 0 : head = create_page_buffers(&folio->page, inode, 0);
2300 0 : blocksize = head->b_size;
2301 0 : bbits = block_size_bits(blocksize);
2302 :
2303 0 : iblock = (sector_t)folio->index << (PAGE_SHIFT - bbits);
2304 0 : lblock = (limit+blocksize-1) >> bbits;
2305 0 : bh = head;
2306 0 : nr = 0;
2307 0 : i = 0;
2308 :
2309 : do {
2310 0 : if (buffer_uptodate(bh))
2311 0 : continue;
2312 :
2313 0 : if (!buffer_mapped(bh)) {
2314 0 : int err = 0;
2315 :
2316 0 : fully_mapped = 0;
2317 0 : if (iblock < lblock) {
2318 0 : WARN_ON(bh->b_size != blocksize);
2319 0 : err = get_block(inode, iblock, bh, 0);
2320 0 : if (err) {
2321 0 : folio_set_error(folio);
2322 0 : page_error = true;
2323 : }
2324 : }
2325 0 : if (!buffer_mapped(bh)) {
2326 0 : folio_zero_range(folio, i * blocksize,
2327 : blocksize);
2328 0 : if (!err)
2329 : set_buffer_uptodate(bh);
2330 0 : continue;
2331 : }
2332 : /*
2333 : * get_block() might have updated the buffer
2334 : * synchronously
2335 : */
2336 0 : if (buffer_uptodate(bh))
2337 0 : continue;
2338 : }
2339 0 : arr[nr++] = bh;
2340 0 : } while (i++, iblock++, (bh = bh->b_this_page) != head);
2341 :
2342 0 : if (fully_mapped)
2343 : folio_set_mappedtodisk(folio);
2344 :
2345 0 : if (!nr) {
2346 : /*
2347 : * All buffers are uptodate - we can set the folio uptodate
2348 : * as well. But not if get_block() returned an error.
2349 : */
2350 0 : if (!page_error)
2351 : folio_mark_uptodate(folio);
2352 0 : folio_unlock(folio);
2353 0 : return 0;
2354 : }
2355 :
2356 : /* Stage two: lock the buffers */
2357 0 : for (i = 0; i < nr; i++) {
2358 0 : bh = arr[i];
2359 0 : lock_buffer(bh);
2360 0 : mark_buffer_async_read(bh);
2361 : }
2362 :
2363 : /*
2364 : * Stage 3: start the IO. Check for uptodateness
2365 : * inside the buffer lock in case another process reading
2366 : * the underlying blockdev brought it uptodate (the sct fix).
2367 : */
2368 0 : for (i = 0; i < nr; i++) {
2369 0 : bh = arr[i];
2370 0 : if (buffer_uptodate(bh))
2371 0 : end_buffer_async_read(bh, 1);
2372 : else
2373 : submit_bh(REQ_OP_READ, bh);
2374 : }
2375 : return 0;
2376 : }
2377 : EXPORT_SYMBOL(block_read_full_folio);
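The matching ->read_folio is usually a one-liner; a sketch using the same hypothetical callback:

static int myfs_read_folio(struct file *file, struct folio *folio)
{
	return block_read_full_folio(folio, myfs_get_block);
}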
2378 :
2379 : /* utility function for filesystems that need to do work on expanding
2380 : * truncates. Uses filesystem pagecache writes to allow the filesystem to
2381 : * deal with the hole.
2382 : */
2383 0 : int generic_cont_expand_simple(struct inode *inode, loff_t size)
2384 : {
2385 0 : struct address_space *mapping = inode->i_mapping;
2386 0 : const struct address_space_operations *aops = mapping->a_ops;
2387 : struct page *page;
2388 0 : void *fsdata = NULL;
2389 : int err;
2390 :
2391 0 : err = inode_newsize_ok(inode, size);
2392 0 : if (err)
2393 : goto out;
2394 :
2395 0 : err = aops->write_begin(NULL, mapping, size, 0, &page, &fsdata);
2396 0 : if (err)
2397 : goto out;
2398 :
2399 0 : err = aops->write_end(NULL, mapping, size, 0, 0, page, fsdata);
2400 0 : BUG_ON(err > 0);
2401 :
2402 : out:
2403 0 : return err;
2404 : }
2405 : EXPORT_SYMBOL(generic_cont_expand_simple);
2406 :
2407 0 : static int cont_expand_zero(struct file *file, struct address_space *mapping,
2408 : loff_t pos, loff_t *bytes)
2409 : {
2410 0 : struct inode *inode = mapping->host;
2411 0 : const struct address_space_operations *aops = mapping->a_ops;
2412 0 : unsigned int blocksize = i_blocksize(inode);
2413 : struct page *page;
2414 0 : void *fsdata = NULL;
2415 : pgoff_t index, curidx;
2416 : loff_t curpos;
2417 : unsigned zerofrom, offset, len;
2418 0 : int err = 0;
2419 :
2420 0 : index = pos >> PAGE_SHIFT;
2421 0 : offset = pos & ~PAGE_MASK;
2422 :
2423 0 : while (index > (curidx = (curpos = *bytes)>>PAGE_SHIFT)) {
2424 0 : zerofrom = curpos & ~PAGE_MASK;
2425 0 : if (zerofrom & (blocksize-1)) {
2426 0 : *bytes |= (blocksize-1);
2427 0 : (*bytes)++;
2428 : }
2429 0 : len = PAGE_SIZE - zerofrom;
2430 :
2431 0 : err = aops->write_begin(file, mapping, curpos, len,
2432 : &page, &fsdata);
2433 0 : if (err)
2434 : goto out;
2435 0 : zero_user(page, zerofrom, len);
2436 0 : err = aops->write_end(file, mapping, curpos, len, len,
2437 : page, fsdata);
2438 0 : if (err < 0)
2439 : goto out;
2440 0 : BUG_ON(err != len);
2441 0 : err = 0;
2442 :
2443 0 : balance_dirty_pages_ratelimited(mapping);
2444 :
2445 0 : if (fatal_signal_pending(current)) {
2446 : err = -EINTR;
2447 : goto out;
2448 : }
2449 : }
2450 :
2451 : /* page covers the boundary, find the boundary offset */
2452 0 : if (index == curidx) {
2453 0 : zerofrom = curpos & ~PAGE_MASK;
2454 : /* if we will expand the thing last block will be filled */
2455 0 : if (offset <= zerofrom) {
2456 : goto out;
2457 : }
2458 0 : if (zerofrom & (blocksize-1)) {
2459 0 : *bytes |= (blocksize-1);
2460 0 : (*bytes)++;
2461 : }
2462 0 : len = offset - zerofrom;
2463 :
2464 0 : err = aops->write_begin(file, mapping, curpos, len,
2465 : &page, &fsdata);
2466 0 : if (err)
2467 : goto out;
2468 0 : zero_user(page, zerofrom, len);
2469 0 : err = aops->write_end(file, mapping, curpos, len, len,
2470 : page, fsdata);
2471 0 : if (err < 0)
2472 : goto out;
2473 0 : BUG_ON(err != len);
2474 : err = 0;
2475 : }
2476 : out:
2477 0 : return err;
2478 : }
2479 :
2480 : /*
2481 : * For moronic filesystems that do not allow holes in files.
2482 : * We may have to extend the file.
2483 : */
2484 0 : int cont_write_begin(struct file *file, struct address_space *mapping,
2485 : loff_t pos, unsigned len,
2486 : struct page **pagep, void **fsdata,
2487 : get_block_t *get_block, loff_t *bytes)
2488 : {
2489 0 : struct inode *inode = mapping->host;
2490 0 : unsigned int blocksize = i_blocksize(inode);
2491 : unsigned int zerofrom;
2492 : int err;
2493 :
2494 0 : err = cont_expand_zero(file, mapping, pos, bytes);
2495 0 : if (err)
2496 : return err;
2497 :
2498 0 : zerofrom = *bytes & ~PAGE_MASK;
2499 0 : if (pos+len > *bytes && zerofrom & (blocksize-1)) {
2500 0 : *bytes |= (blocksize-1);
2501 0 : (*bytes)++;
2502 : }
2503 :
2504 0 : return block_write_begin(mapping, pos, len, pagep, get_block);
2505 : }
2506 : EXPORT_SYMBOL(cont_write_begin);
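A hedged usage sketch: @bytes points at wherever the filesystem tracks how far the file contents have been initialised (FAT, for example, keeps such a field per inode). MYFS_I() and i_initialized_up_to are invented names:

static int myfs_cont_write_begin(struct file *file,
				 struct address_space *mapping,
				 loff_t pos, unsigned len,
				 struct page **pagep, void **fsdata)
{
	struct inode *inode = mapping->host;

	/* zero-fills from the old initialised size up to @pos first */
	return cont_write_begin(file, mapping, pos, len, pagep, fsdata,
				myfs_get_block,
				&MYFS_I(inode)->i_initialized_up_to);
}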
2507 :
2508 0 : int block_commit_write(struct page *page, unsigned from, unsigned to)
2509 : {
2510 0 : struct inode *inode = page->mapping->host;
2511 0 : __block_commit_write(inode,page,from,to);
2512 0 : return 0;
2513 : }
2514 : EXPORT_SYMBOL(block_commit_write);
2515 :
2516 : /*
2517 : * block_page_mkwrite() is not allowed to change the file size as it gets
2518 : * called from a page fault handler when a page is first dirtied. Hence we must
2519 : * be careful to check for EOF conditions here. We set the page up correctly
2520 : * for a written page which means we get ENOSPC checking when writing into
2521 : * holes and correct delalloc and unwritten extent mapping on filesystems that
2522 : * support these features.
2523 : *
2524 : * We are not allowed to take the i_mutex here so we have to play games to
2525 : * protect against truncate races as the page could now be beyond EOF. Because
2526 : * truncate writes the inode size before removing pages, once we have the
2527 : * page lock we can determine safely if the page is beyond EOF. If it is not
2528 : * beyond EOF, then the page is guaranteed safe against truncation until we
2529 : * unlock the page.
2530 : *
2531 : * Direct callers of this function should protect against filesystem freezing
2532 : * using sb_start_pagefault() - sb_end_pagefault() functions.
2533 : */
2534 0 : int block_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf,
2535 : get_block_t get_block)
2536 : {
2537 0 : struct page *page = vmf->page;
2538 0 : struct inode *inode = file_inode(vma->vm_file);
2539 : unsigned long end;
2540 : loff_t size;
2541 : int ret;
2542 :
2543 0 : lock_page(page);
2544 0 : size = i_size_read(inode);
2545 0 : if ((page->mapping != inode->i_mapping) ||
2546 0 : (page_offset(page) > size)) {
2547 : /* We overload EFAULT to mean page got truncated */
2548 : ret = -EFAULT;
2549 : goto out_unlock;
2550 : }
2551 :
2552 : /* page is wholly or partially inside EOF */
2553 0 : if (((page->index + 1) << PAGE_SHIFT) > size)
2554 0 : end = size & ~PAGE_MASK;
2555 : else
2556 : end = PAGE_SIZE;
2557 :
2558 0 : ret = __block_write_begin(page, 0, end, get_block);
2559 0 : if (!ret)
2560 0 : ret = block_commit_write(page, 0, end);
2561 :
2562 0 : if (unlikely(ret < 0))
2563 : goto out_unlock;
2564 0 : set_page_dirty(page);
2565 0 : wait_for_stable_page(page);
2566 0 : return 0;
2567 : out_unlock:
2568 0 : unlock_page(page);
2569 0 : return ret;
2570 : }
2571 : EXPORT_SYMBOL(block_page_mkwrite);
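Following the rule stated above, a ->page_mkwrite handler that calls this directly should bracket it with freeze protection. A sketch (myfs_get_block() is the hypothetical mapping callback; block_page_mkwrite_return() translates the error into a vm_fault_t):

static vm_fault_t myfs_page_mkwrite(struct vm_fault *vmf)
{
	struct super_block *sb = file_inode(vmf->vma->vm_file)->i_sb;
	int err;

	sb_start_pagefault(sb);
	err = block_page_mkwrite(vmf->vma, vmf, myfs_get_block);
	sb_end_pagefault(sb);

	return block_page_mkwrite_return(err);
}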
2572 :
2573 0 : int block_truncate_page(struct address_space *mapping,
2574 : loff_t from, get_block_t *get_block)
2575 : {
2576 0 : pgoff_t index = from >> PAGE_SHIFT;
2577 0 : unsigned offset = from & (PAGE_SIZE-1);
2578 : unsigned blocksize;
2579 : sector_t iblock;
2580 : unsigned length, pos;
2581 0 : struct inode *inode = mapping->host;
2582 : struct page *page;
2583 : struct buffer_head *bh;
2584 : int err;
2585 :
2586 0 : blocksize = i_blocksize(inode);
2587 0 : length = offset & (blocksize - 1);
2588 :
2589 : /* Block boundary? Nothing to do */
2590 0 : if (!length)
2591 : return 0;
2592 :
2593 0 : length = blocksize - length;
2594 0 : iblock = (sector_t)index << (PAGE_SHIFT - inode->i_blkbits);
2595 :
2596 0 : page = grab_cache_page(mapping, index);
2597 0 : err = -ENOMEM;
2598 0 : if (!page)
2599 : goto out;
2600 :
2601 0 : if (!page_has_buffers(page))
2602 0 : create_empty_buffers(page, blocksize, 0);
2603 :
2604 : /* Find the buffer that contains "offset" */
2605 0 : bh = page_buffers(page);
2606 0 : pos = blocksize;
2607 0 : while (offset >= pos) {
2608 0 : bh = bh->b_this_page;
2609 0 : iblock++;
2610 0 : pos += blocksize;
2611 : }
2612 :
2613 0 : err = 0;
2614 0 : if (!buffer_mapped(bh)) {
2615 0 : WARN_ON(bh->b_size != blocksize);
2616 0 : err = get_block(inode, iblock, bh, 0);
2617 0 : if (err)
2618 : goto unlock;
2619 : /* unmapped? It's a hole - nothing to do */
2620 0 : if (!buffer_mapped(bh))
2621 : goto unlock;
2622 : }
2623 :
2624 : /* Ok, it's mapped. Make sure it's up-to-date */
2625 0 : if (PageUptodate(page))
2626 : set_buffer_uptodate(bh);
2627 :
2628 0 : if (!buffer_uptodate(bh) && !buffer_delay(bh) && !buffer_unwritten(bh)) {
2629 0 : err = bh_read(bh, 0);
2630 : /* Uhhuh. Read error. Complain and punt. */
2631 0 : if (err < 0)
2632 : goto unlock;
2633 : }
2634 :
2635 0 : zero_user(page, offset, length);
2636 0 : mark_buffer_dirty(bh);
2637 0 : err = 0;
2638 :
2639 : unlock:
2640 0 : unlock_page(page);
2641 0 : put_page(page);
2642 : out:
2643 : return err;
2644 : }
2645 : EXPORT_SYMBOL(block_truncate_page);
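The typical caller is the filesystem's truncate path, which uses this to zero the tail of the block that the new EOF lands in. A minimal sketch with the hypothetical callback:

static int myfs_zero_tail_block(struct inode *inode, loff_t newsize)
{
	return block_truncate_page(inode->i_mapping, newsize, myfs_get_block);
}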
2646 :
2647 : /*
2648 : * The generic ->writepage function for buffer-backed address_spaces
2649 : */
2650 0 : int block_write_full_page(struct page *page, get_block_t *get_block,
2651 : struct writeback_control *wbc)
2652 : {
2653 0 : struct inode * const inode = page->mapping->host;
2654 0 : loff_t i_size = i_size_read(inode);
2655 0 : const pgoff_t end_index = i_size >> PAGE_SHIFT;
2656 : unsigned offset;
2657 :
2658 : /* Is the page fully inside i_size? */
2659 0 : if (page->index < end_index)
2660 0 : return __block_write_full_page(inode, page, get_block, wbc,
2661 : end_buffer_async_write);
2662 :
2663 : /* Is the page fully outside i_size? (truncate in progress) */
2664 0 : offset = i_size & (PAGE_SIZE-1);
2665 0 : if (page->index >= end_index+1 || !offset) {
2666 0 : unlock_page(page);
2667 0 : return 0; /* don't care */
2668 : }
2669 :
2670 : /*
2671 : * The page straddles i_size. It must be zeroed out on each and every
2672 : * writepage invocation because it may be mmapped. "A file is mapped
2673 : * in multiples of the page size. For a file that is not a multiple of
2674 : * the page size, the remaining memory is zeroed when mapped, and
2675 : * writes to that region are not written out to the file."
2676 : */
2677 0 : zero_user_segment(page, offset, PAGE_SIZE);
2678 0 : return __block_write_full_page(inode, page, get_block, wbc,
2679 : end_buffer_async_write);
2680 : }
2681 : EXPORT_SYMBOL(block_write_full_page);
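The usual ->writepage wrapper is again a one-liner; a sketch assuming the hypothetical myfs_get_block():

static int myfs_writepage(struct page *page, struct writeback_control *wbc)
{
	return block_write_full_page(page, myfs_get_block, wbc);
}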
2682 :
2683 0 : sector_t generic_block_bmap(struct address_space *mapping, sector_t block,
2684 : get_block_t *get_block)
2685 : {
2686 0 : struct inode *inode = mapping->host;
2687 0 : struct buffer_head tmp = {
2688 0 : .b_size = i_blocksize(inode),
2689 : };
2690 :
2691 0 : get_block(inode, block, &tmp, 0);
2692 0 : return tmp.b_blocknr;
2693 : }
2694 : EXPORT_SYMBOL(generic_block_bmap);
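This is what typically backs the FIBMAP ioctl for such filesystems; the ->bmap hook is usually just a wrapper like the following sketch:

static sector_t myfs_bmap(struct address_space *mapping, sector_t block)
{
	return generic_block_bmap(mapping, block, myfs_get_block);
}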
2695 :
2696 0 : static void end_bio_bh_io_sync(struct bio *bio)
2697 : {
2698 0 : struct buffer_head *bh = bio->bi_private;
2699 :
2700 0 : if (unlikely(bio_flagged(bio, BIO_QUIET)))
2701 0 : set_bit(BH_Quiet, &bh->b_state);
2702 :
2703 0 : bh->b_end_io(bh, !bio->bi_status);
2704 0 : bio_put(bio);
2705 0 : }
2706 :
2707 0 : static void submit_bh_wbc(blk_opf_t opf, struct buffer_head *bh,
2708 : struct writeback_control *wbc)
2709 : {
2710 0 : const enum req_op op = opf & REQ_OP_MASK;
2711 : struct bio *bio;
2712 :
2713 0 : BUG_ON(!buffer_locked(bh));
2714 0 : BUG_ON(!buffer_mapped(bh));
2715 0 : BUG_ON(!bh->b_end_io);
2716 0 : BUG_ON(buffer_delay(bh));
2717 0 : BUG_ON(buffer_unwritten(bh));
2718 :
2719 : /*
2720 : * Only clear out a write error when rewriting
2721 : */
2722 0 : if (test_set_buffer_req(bh) && (op == REQ_OP_WRITE))
2723 : clear_buffer_write_io_error(bh);
2724 :
2725 0 : if (buffer_meta(bh))
2726 0 : opf |= REQ_META;
2727 0 : if (buffer_prio(bh))
2728 0 : opf |= REQ_PRIO;
2729 :
2730 0 : bio = bio_alloc(bh->b_bdev, 1, opf, GFP_NOIO);
2731 :
2732 0 : fscrypt_set_bio_crypt_ctx_bh(bio, bh, GFP_NOIO);
2733 :
2734 0 : bio->bi_iter.bi_sector = bh->b_blocknr * (bh->b_size >> 9);
2735 :
2736 0 : bio_add_page(bio, bh->b_page, bh->b_size, bh_offset(bh));
2737 0 : BUG_ON(bio->bi_iter.bi_size != bh->b_size);
2738 :
2739 0 : bio->bi_end_io = end_bio_bh_io_sync;
2740 0 : bio->bi_private = bh;
2741 :
2742 : /* Take care of bh's that straddle the end of the device */
2743 0 : guard_bio_eod(bio);
2744 :
2745 : if (wbc) {
2746 : wbc_init_bio(wbc, bio);
2747 : wbc_account_cgroup_owner(wbc, bh->b_page, bh->b_size);
2748 : }
2749 :
2750 0 : submit_bio(bio);
2751 0 : }
2752 :
2753 0 : void submit_bh(blk_opf_t opf, struct buffer_head *bh)
2754 : {
2755 0 : submit_bh_wbc(opf, bh, NULL);
2756 0 : }
2757 : EXPORT_SYMBOL(submit_bh);
2758 :
2759 0 : void write_dirty_buffer(struct buffer_head *bh, blk_opf_t op_flags)
2760 : {
2761 0 : lock_buffer(bh);
2762 0 : if (!test_clear_buffer_dirty(bh)) {
2763 : unlock_buffer(bh);
2764 : return;
2765 : }
2766 0 : bh->b_end_io = end_buffer_write_sync;
2767 0 : get_bh(bh);
2768 0 : submit_bh(REQ_OP_WRITE | op_flags, bh);
2769 : }
2770 : EXPORT_SYMBOL(write_dirty_buffer);
2771 :
2772 : /*
2773 : * For a data-integrity writeout, we need to wait upon any in-progress I/O
2774 : * and then start new I/O and then wait upon it. The caller must have a ref on
2775 : * the buffer_head.
2776 : */
2777 0 : int __sync_dirty_buffer(struct buffer_head *bh, blk_opf_t op_flags)
2778 : {
2779 0 : WARN_ON(atomic_read(&bh->b_count) < 1);
2780 0 : lock_buffer(bh);
2781 0 : if (test_clear_buffer_dirty(bh)) {
2782 : /*
2783 : * The bh should be mapped, but it might not be if the
2784 : * device was hot-removed. Not much we can do but fail the I/O.
2785 : */
2786 0 : if (!buffer_mapped(bh)) {
2787 0 : unlock_buffer(bh);
2788 0 : return -EIO;
2789 : }
2790 :
2791 0 : get_bh(bh);
2792 0 : bh->b_end_io = end_buffer_write_sync;
2793 0 : submit_bh(REQ_OP_WRITE | op_flags, bh);
2794 0 : wait_on_buffer(bh);
2795 0 : if (!buffer_uptodate(bh))
2796 : return -EIO;
2797 : } else {
2798 : unlock_buffer(bh);
2799 : }
2800 : return 0;
2801 : }
2802 : EXPORT_SYMBOL(__sync_dirty_buffer);
2803 :
2804 0 : int sync_dirty_buffer(struct buffer_head *bh)
2805 : {
2806 0 : return __sync_dirty_buffer(bh, REQ_SYNC);
2807 : }
2808 : EXPORT_SYMBOL(sync_dirty_buffer);
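A sketch of the classic synchronous metadata-update pattern built from sb_bread(), mark_buffer_dirty() and sync_dirty_buffer(); the block number and the byte being patched are purely illustrative:

static int myfs_set_super_flag(struct super_block *sb, sector_t block,
			       unsigned int offset, u8 value)
{
	struct buffer_head *bh;
	int err;

	bh = sb_bread(sb, block);		/* read or find the cached block */
	if (!bh)
		return -EIO;

	((u8 *)bh->b_data)[offset] = value;
	mark_buffer_dirty(bh);
	err = sync_dirty_buffer(bh);		/* write it out and wait */
	brelse(bh);
	return err;
}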
2809 :
2810 : /*
2811 : * try_to_free_buffers() checks if all the buffers on this particular folio
2812 : * are unused, and releases them if so.
2813 : *
2814 : * Exclusion against try_to_free_buffers may be obtained by either
2815 : * locking the folio or by holding its mapping's private_lock.
2816 : *
2817 : * If the folio is dirty but all the buffers are clean then we need to
2818 : * be sure to mark the folio clean as well. This is because the folio
2819 : * may be against a block device, and a later reattachment of buffers
2820 : * to a dirty folio will set *all* buffers dirty. Which would corrupt
2821 : * filesystem data on the same device.
2822 : *
2823 : * The same applies to regular filesystem folios: if all the buffers are
2824 : * clean then we set the folio clean and proceed. To do that, we require
2825 : * total exclusion from block_dirty_folio(). That is obtained with
2826 : * private_lock.
2827 : *
2828 : * try_to_free_buffers() is non-blocking.
2829 : */
2830 : static inline int buffer_busy(struct buffer_head *bh)
2831 : {
2832 0 : return atomic_read(&bh->b_count) |
2833 0 : (bh->b_state & ((1 << BH_Dirty) | (1 << BH_Lock)));
2834 : }
2835 :
2836 : static bool
2837 0 : drop_buffers(struct folio *folio, struct buffer_head **buffers_to_free)
2838 : {
2839 0 : struct buffer_head *head = folio_buffers(folio);
2840 : struct buffer_head *bh;
2841 :
2842 0 : bh = head;
2843 : do {
2844 0 : if (buffer_busy(bh))
2845 : goto failed;
2846 0 : bh = bh->b_this_page;
2847 0 : } while (bh != head);
2848 :
2849 : do {
2850 0 : struct buffer_head *next = bh->b_this_page;
2851 :
2852 0 : if (bh->b_assoc_map)
2853 0 : __remove_assoc_queue(bh);
2854 0 : bh = next;
2855 0 : } while (bh != head);
2856 0 : *buffers_to_free = head;
2857 0 : folio_detach_private(folio);
2858 0 : return true;
2859 : failed:
2860 : return false;
2861 : }
2862 :
2863 0 : bool try_to_free_buffers(struct folio *folio)
2864 : {
2865 0 : struct address_space * const mapping = folio->mapping;
2866 0 : struct buffer_head *buffers_to_free = NULL;
2867 0 : bool ret = 0;
2868 :
2869 0 : BUG_ON(!folio_test_locked(folio));
2870 0 : if (folio_test_writeback(folio))
2871 : return false;
2872 :
2873 0 : if (mapping == NULL) { /* can this still happen? */
2874 0 : ret = drop_buffers(folio, &buffers_to_free);
2875 0 : goto out;
2876 : }
2877 :
2878 0 : spin_lock(&mapping->private_lock);
2879 0 : ret = drop_buffers(folio, &buffers_to_free);
2880 :
2881 : /*
2882 : * If the filesystem writes its buffers by hand (eg ext3)
2883 : * then we can have clean buffers against a dirty folio. We
2884 : * clean the folio here; otherwise the VM will never notice
2885 : * that the filesystem did any IO at all.
2886 : *
2887 : * Also, during truncate, discard_buffer will have marked all
2888 : * the folio's buffers clean. We discover that here and clean
2889 : * the folio also.
2890 : *
2891 : * private_lock must be held over this entire operation in order
2892 : * to synchronise against block_dirty_folio and prevent the
2893 : * dirty bit from being lost.
2894 : */
2895 0 : if (ret)
2896 : folio_cancel_dirty(folio);
2897 0 : spin_unlock(&mapping->private_lock);
2898 : out:
2899 0 : if (buffers_to_free) {
2900 : struct buffer_head *bh = buffers_to_free;
2901 :
2902 : do {
2903 0 : struct buffer_head *next = bh->b_this_page;
2904 0 : free_buffer_head(bh);
2905 0 : bh = next;
2906 0 : } while (bh != buffers_to_free);
2907 : }
2908 : return ret;
2909 : }
2910 : EXPORT_SYMBOL(try_to_free_buffers);
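For filesystems that keep no private folio state beyond the buffers themselves, the ->release_folio hook can simply defer to this function. A sketch:

static bool myfs_release_folio(struct folio *folio, gfp_t gfp)
{
	/* drop clean, unused buffer_heads; fails if any are busy */
	return try_to_free_buffers(folio);
}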
2911 :
2912 : /*
2913 : * Buffer-head allocation
2914 : */
2915 : static struct kmem_cache *bh_cachep __read_mostly;
2916 :
2917 : /*
2918 : * Once the number of bh's in the machine exceeds this level, we start
2919 : * stripping them in writeback.
2920 : */
2921 : static unsigned long max_buffer_heads;
2922 :
2923 : int buffer_heads_over_limit;
2924 :
2925 : struct bh_accounting {
2926 : int nr; /* Number of live bh's */
2927 : int ratelimit; /* Limit cacheline bouncing */
2928 : };
2929 :
2930 : static DEFINE_PER_CPU(struct bh_accounting, bh_accounting) = {0, 0};
2931 :
2932 : static void recalc_bh_state(void)
2933 : {
2934 : int i;
2935 0 : int tot = 0;
2936 :
2937 0 : if (__this_cpu_inc_return(bh_accounting.ratelimit) - 1 < 4096)
2938 : return;
2939 0 : __this_cpu_write(bh_accounting.ratelimit, 0);
2940 0 : for_each_online_cpu(i)
2941 0 : tot += per_cpu(bh_accounting, i).nr;
2942 0 : buffer_heads_over_limit = (tot > max_buffer_heads);
2943 : }
2944 :
2945 0 : struct buffer_head *alloc_buffer_head(gfp_t gfp_flags)
2946 : {
2947 0 : struct buffer_head *ret = kmem_cache_zalloc(bh_cachep, gfp_flags);
2948 0 : if (ret) {
2949 0 : INIT_LIST_HEAD(&ret->b_assoc_buffers);
2950 0 : spin_lock_init(&ret->b_uptodate_lock);
2951 0 : preempt_disable();
2952 0 : __this_cpu_inc(bh_accounting.nr);
2953 0 : recalc_bh_state();
2954 0 : preempt_enable();
2955 : }
2956 0 : return ret;
2957 : }
2958 : EXPORT_SYMBOL(alloc_buffer_head);
2959 :
2960 0 : void free_buffer_head(struct buffer_head *bh)
2961 : {
2962 0 : BUG_ON(!list_empty(&bh->b_assoc_buffers));
2963 0 : kmem_cache_free(bh_cachep, bh);
2964 0 : preempt_disable();
2965 0 : __this_cpu_dec(bh_accounting.nr);
2966 0 : recalc_bh_state();
2967 0 : preempt_enable();
2968 0 : }
2969 : EXPORT_SYMBOL(free_buffer_head);
2970 :
2971 0 : static int buffer_exit_cpu_dead(unsigned int cpu)
2972 : {
2973 : int i;
2974 0 : struct bh_lru *b = &per_cpu(bh_lrus, cpu);
2975 :
2976 0 : for (i = 0; i < BH_LRU_SIZE; i++) {
2977 0 : brelse(b->bhs[i]);
2978 0 : b->bhs[i] = NULL;
2979 : }
2980 0 : this_cpu_add(bh_accounting.nr, per_cpu(bh_accounting, cpu).nr);
2981 0 : per_cpu(bh_accounting, cpu).nr = 0;
2982 0 : return 0;
2983 : }
2984 :
2985 : /**
2986 : * bh_uptodate_or_lock - Test whether the buffer is uptodate
2987 : * @bh: struct buffer_head
2988 : *
2989 : * Returns 1 if the buffer is up-to-date, or 0, with the
2990 : * buffer locked, if it is not.
2991 : */
2992 0 : int bh_uptodate_or_lock(struct buffer_head *bh)
2993 : {
2994 0 : if (!buffer_uptodate(bh)) {
2995 0 : lock_buffer(bh);
2996 0 : if (!buffer_uptodate(bh))
2997 : return 0;
2998 : unlock_buffer(bh);
2999 : }
3000 : return 1;
3001 : }
3002 : EXPORT_SYMBOL(bh_uptodate_or_lock);
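A sketch of the usual "make sure this buffer's contents are valid" pattern, which is roughly what the bh_read() helper used elsewhere in this file expands to:

static int myfs_ensure_uptodate(struct buffer_head *bh)
{
	if (bh_uptodate_or_lock(bh))
		return 0;		/* already uptodate, buffer not locked */

	/* buffer is now locked and not uptodate: read it and wait */
	return __bh_read(bh, 0, true);
}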
3003 :
3004 : /**
3005 : * __bh_read - Submit read for a locked buffer
3006 : * @bh: struct buffer_head
3007 : * @op_flags: appending REQ_OP_* flags besides REQ_OP_READ
3008 : * @wait: wait until the read finishes
3009 : *
3010 : * Returns zero on success (or when not waiting), and -EIO on error.
3011 : */
3012 0 : int __bh_read(struct buffer_head *bh, blk_opf_t op_flags, bool wait)
3013 : {
3014 0 : int ret = 0;
3015 :
3016 0 : BUG_ON(!buffer_locked(bh));
3017 :
3018 0 : get_bh(bh);
3019 0 : bh->b_end_io = end_buffer_read_sync;
3020 0 : submit_bh(REQ_OP_READ | op_flags, bh);
3021 0 : if (wait) {
3022 0 : wait_on_buffer(bh);
3023 0 : if (!buffer_uptodate(bh))
3024 0 : ret = -EIO;
3025 : }
3026 0 : return ret;
3027 : }
3028 : EXPORT_SYMBOL(__bh_read);
3029 :
3030 : /**
3031 : * __bh_read_batch - Submit read for a batch of unlocked buffers
3032 : * @nr: entry number of the buffer batch
3033 : * @bhs: a batch of struct buffer_head
3034 : * @op_flags: appending REQ_OP_* flags besides REQ_OP_READ
3035 : * @force_lock: lock each buffer unconditionally if set, otherwise skip any
3036 : * buffer that cannot be locked without blocking.
3037 : *
3038 : * The reads are submitted asynchronously; already-uptodate buffers are skipped.
3039 : */
3040 0 : void __bh_read_batch(int nr, struct buffer_head *bhs[],
3041 : blk_opf_t op_flags, bool force_lock)
3042 : {
3043 : int i;
3044 :
3045 0 : for (i = 0; i < nr; i++) {
3046 0 : struct buffer_head *bh = bhs[i];
3047 :
3048 0 : if (buffer_uptodate(bh))
3049 0 : continue;
3050 :
3051 0 : if (force_lock)
3052 : lock_buffer(bh);
3053 : else
3054 0 : if (!trylock_buffer(bh))
3055 0 : continue;
3056 :
3057 0 : if (buffer_uptodate(bh)) {
3058 0 : unlock_buffer(bh);
3059 0 : continue;
3060 : }
3061 :
3062 0 : bh->b_end_io = end_buffer_read_sync;
3063 0 : get_bh(bh);
3064 : submit_bh(REQ_OP_READ | op_flags, bh);
3065 : }
3066 0 : }
3067 : EXPORT_SYMBOL(__bh_read_batch);
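A sketch of opportunistic metadata readahead built on this helper. With @force_lock false, buffers someone else is already reading are simply skipped; the reference taken by sb_getblk() can be dropped immediately because the read path holds its own reference across the I/O:

static void myfs_readahead_blocks(struct super_block *sb, sector_t start,
				  unsigned int nr)
{
	struct buffer_head *bhs[8];
	unsigned int i, got = 0;

	if (nr > ARRAY_SIZE(bhs))
		nr = ARRAY_SIZE(bhs);

	for (i = 0; i < nr; i++) {
		struct buffer_head *bh = sb_getblk(sb, start + i);

		if (bh)
			bhs[got++] = bh;
	}

	__bh_read_batch(got, bhs, REQ_RAHEAD, false);

	for (i = 0; i < got; i++)
		brelse(bhs[i]);
}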
3068 :
3069 1 : void __init buffer_init(void)
3070 : {
3071 : unsigned long nrpages;
3072 : int ret;
3073 :
3074 1 : bh_cachep = kmem_cache_create("buffer_head",
3075 : sizeof(struct buffer_head), 0,
3076 : (SLAB_RECLAIM_ACCOUNT|SLAB_PANIC|
3077 : SLAB_MEM_SPREAD),
3078 : NULL);
3079 :
3080 : /*
3081 : * Limit the bh occupancy to 10% of ZONE_NORMAL
3082 : */
3083 1 : nrpages = (nr_free_buffer_pages() * 10) / 100;
3084 1 : max_buffer_heads = nrpages * (PAGE_SIZE / sizeof(struct buffer_head));
3085 1 : ret = cpuhp_setup_state_nocalls(CPUHP_FS_BUFF_DEAD, "fs/buffer:dead",
3086 : NULL, buffer_exit_cpu_dead);
3087 1 : WARN_ON(ret < 0);
3088 1 : }