Merge 54126fafea ("Merge tag 'vfs-6.9.iomap' of git://git.kernel.org/pub/scm/linux/kernel/git/vfs/vfs") into android-mainline
Steps on the way to v6.9-rc1 Signed-off-by: Lee Jones <joneslee@google.com> Change-Id: I580f3caa96d65fed2aefa6da1d873ddc91837ebf
This commit is contained in:
@@ -251,6 +251,7 @@ void bio_init(struct bio *bio, struct block_device *bdev, struct bio_vec *table,
|
||||
bio->bi_opf = opf;
|
||||
bio->bi_flags = 0;
|
||||
bio->bi_ioprio = 0;
|
||||
bio->bi_write_hint = 0;
|
||||
bio->bi_status = 0;
|
||||
bio->bi_iter.bi_sector = 0;
|
||||
bio->bi_iter.bi_size = 0;
|
||||
@@ -816,6 +817,7 @@ static int __bio_clone(struct bio *bio, struct bio *bio_src, gfp_t gfp)
|
||||
{
|
||||
bio_set_flag(bio, BIO_CLONED);
|
||||
bio->bi_ioprio = bio_src->bi_ioprio;
|
||||
bio->bi_write_hint = bio_src->bi_write_hint;
|
||||
bio->bi_iter = bio_src->bi_iter;
|
||||
|
||||
if (bio->bi_bdev) {
|
||||
|
||||
@@ -172,6 +172,7 @@ static struct bio *blk_crypto_fallback_clone_bio(struct bio *bio_src)
|
||||
if (bio_flagged(bio_src, BIO_REMAPPED))
|
||||
bio_set_flag(bio, BIO_REMAPPED);
|
||||
bio->bi_ioprio = bio_src->bi_ioprio;
|
||||
bio->bi_write_hint = bio_src->bi_write_hint;
|
||||
bio->bi_iter.bi_sector = bio_src->bi_iter.bi_sector;
|
||||
bio->bi_iter.bi_size = bio_src->bi_iter.bi_size;
|
||||
|
||||
|
||||
@@ -810,6 +810,10 @@ static struct request *attempt_merge(struct request_queue *q,
|
||||
if (rq_data_dir(req) != rq_data_dir(next))
|
||||
return NULL;
|
||||
|
||||
/* Don't merge requests with different write hints. */
|
||||
if (req->write_hint != next->write_hint)
|
||||
return NULL;
|
||||
|
||||
if (req->ioprio != next->ioprio)
|
||||
return NULL;
|
||||
|
||||
@@ -937,6 +941,10 @@ bool blk_rq_merge_ok(struct request *rq, struct bio *bio)
|
||||
if (!bio_crypt_rq_ctx_compatible(rq, bio))
|
||||
return false;
|
||||
|
||||
/* Don't merge requests with different write hints. */
|
||||
if (rq->write_hint != bio->bi_write_hint)
|
||||
return false;
|
||||
|
||||
if (rq->ioprio != bio_prio(bio))
|
||||
return false;
|
||||
|
||||
|
||||
@@ -2584,6 +2584,7 @@ static void blk_mq_bio_to_request(struct request *rq, struct bio *bio,
|
||||
rq->cmd_flags |= REQ_FAILFAST_MASK;
|
||||
|
||||
rq->__sector = bio->bi_iter.bi_sector;
|
||||
rq->write_hint = bio->bi_write_hint;
|
||||
blk_rq_bio_prep(rq, bio, nr_segs);
|
||||
|
||||
/* This can't fail, since GFP_NOIO includes __GFP_DIRECT_RECLAIM. */
|
||||
@@ -3175,6 +3176,7 @@ int blk_rq_prep_clone(struct request *rq, struct request *rq_src,
|
||||
}
|
||||
rq->nr_phys_segments = rq_src->nr_phys_segments;
|
||||
rq->ioprio = rq_src->ioprio;
|
||||
rq->write_hint = rq_src->write_hint;
|
||||
|
||||
if (rq->bio && blk_crypto_rq_bio_prep(rq, rq->bio, gfp_mask) < 0)
|
||||
goto free_and_out;
|
||||
|
||||
@@ -169,6 +169,7 @@ static struct bio *bounce_clone_bio(struct bio *bio_src)
|
||||
if (bio_flagged(bio_src, BIO_REMAPPED))
|
||||
bio_set_flag(bio, BIO_REMAPPED);
|
||||
bio->bi_ioprio = bio_src->bi_ioprio;
|
||||
bio->bi_write_hint = bio_src->bi_write_hint;
|
||||
bio->bi_iter.bi_sector = bio_src->bi_iter.bi_sector;
|
||||
bio->bi_iter.bi_size = bio_src->bi_iter.bi_size;
|
||||
|
||||
|
||||
+4
-1
@@ -73,6 +73,7 @@ static ssize_t __blkdev_direct_IO_simple(struct kiocb *iocb,
|
||||
bio_init(&bio, bdev, vecs, nr_pages, dio_bio_write_op(iocb));
|
||||
}
|
||||
bio.bi_iter.bi_sector = pos >> SECTOR_SHIFT;
|
||||
bio.bi_write_hint = file_inode(iocb->ki_filp)->i_write_hint;
|
||||
bio.bi_ioprio = iocb->ki_ioprio;
|
||||
|
||||
ret = bio_iov_iter_get_pages(&bio, iter);
|
||||
@@ -203,6 +204,7 @@ static ssize_t __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
|
||||
|
||||
for (;;) {
|
||||
bio->bi_iter.bi_sector = pos >> SECTOR_SHIFT;
|
||||
bio->bi_write_hint = file_inode(iocb->ki_filp)->i_write_hint;
|
||||
bio->bi_private = dio;
|
||||
bio->bi_end_io = blkdev_bio_end_io;
|
||||
bio->bi_ioprio = iocb->ki_ioprio;
|
||||
@@ -321,6 +323,7 @@ static ssize_t __blkdev_direct_IO_async(struct kiocb *iocb,
|
||||
dio->flags = 0;
|
||||
dio->iocb = iocb;
|
||||
bio->bi_iter.bi_sector = pos >> SECTOR_SHIFT;
|
||||
bio->bi_write_hint = file_inode(iocb->ki_filp)->i_write_hint;
|
||||
bio->bi_end_io = blkdev_bio_end_io_async;
|
||||
bio->bi_ioprio = iocb->ki_ioprio;
|
||||
|
||||
@@ -482,7 +485,7 @@ static void blkdev_readahead(struct readahead_control *rac)
|
||||
}
|
||||
|
||||
static int blkdev_map_blocks(struct iomap_writepage_ctx *wpc,
|
||||
struct inode *inode, loff_t offset)
|
||||
struct inode *inode, loff_t offset, unsigned int len)
|
||||
{
|
||||
loff_t isize = i_size_read(inode);
|
||||
|
||||
|
||||
+8
-4
@@ -55,7 +55,7 @@
|
||||
|
||||
static int fsync_buffers_list(spinlock_t *lock, struct list_head *list);
|
||||
static void submit_bh_wbc(blk_opf_t opf, struct buffer_head *bh,
|
||||
struct writeback_control *wbc);
|
||||
enum rw_hint hint, struct writeback_control *wbc);
|
||||
|
||||
#define BH_ENTRY(list) list_entry((list), struct buffer_head, b_assoc_buffers)
|
||||
|
||||
@@ -1889,7 +1889,8 @@ int __block_write_full_folio(struct inode *inode, struct folio *folio,
|
||||
do {
|
||||
struct buffer_head *next = bh->b_this_page;
|
||||
if (buffer_async_write(bh)) {
|
||||
submit_bh_wbc(REQ_OP_WRITE | write_flags, bh, wbc);
|
||||
submit_bh_wbc(REQ_OP_WRITE | write_flags, bh,
|
||||
inode->i_write_hint, wbc);
|
||||
nr_underway++;
|
||||
}
|
||||
bh = next;
|
||||
@@ -1944,7 +1945,8 @@ recover:
|
||||
struct buffer_head *next = bh->b_this_page;
|
||||
if (buffer_async_write(bh)) {
|
||||
clear_buffer_dirty(bh);
|
||||
submit_bh_wbc(REQ_OP_WRITE | write_flags, bh, wbc);
|
||||
submit_bh_wbc(REQ_OP_WRITE | write_flags, bh,
|
||||
inode->i_write_hint, wbc);
|
||||
nr_underway++;
|
||||
}
|
||||
bh = next;
|
||||
@@ -2756,6 +2758,7 @@ static void end_bio_bh_io_sync(struct bio *bio)
|
||||
}
|
||||
|
||||
static void submit_bh_wbc(blk_opf_t opf, struct buffer_head *bh,
|
||||
enum rw_hint write_hint,
|
||||
struct writeback_control *wbc)
|
||||
{
|
||||
const enum req_op op = opf & REQ_OP_MASK;
|
||||
@@ -2783,6 +2786,7 @@ static void submit_bh_wbc(blk_opf_t opf, struct buffer_head *bh,
|
||||
fscrypt_set_bio_crypt_ctx_bh(bio, bh, GFP_NOIO);
|
||||
|
||||
bio->bi_iter.bi_sector = bh->b_blocknr * (bh->b_size >> 9);
|
||||
bio->bi_write_hint = write_hint;
|
||||
|
||||
__bio_add_page(bio, bh->b_page, bh->b_size, bh_offset(bh));
|
||||
|
||||
@@ -2802,7 +2806,7 @@ static void submit_bh_wbc(blk_opf_t opf, struct buffer_head *bh,
|
||||
|
||||
void submit_bh(blk_opf_t opf, struct buffer_head *bh)
|
||||
{
|
||||
submit_bh_wbc(opf, bh, NULL);
|
||||
submit_bh_wbc(opf, bh, WRITE_LIFE_NOT_SET, NULL);
|
||||
}
|
||||
EXPORT_SYMBOL(submit_bh);
|
||||
|
||||
|
||||
@@ -410,6 +410,8 @@ dio_bio_alloc(struct dio *dio, struct dio_submit *sdio,
|
||||
bio->bi_end_io = dio_bio_end_io;
|
||||
if (dio->is_pinned)
|
||||
bio_set_flag(bio, BIO_PAGE_PINNED);
|
||||
bio->bi_write_hint = file_inode(dio->iocb->ki_filp)->i_write_hint;
|
||||
|
||||
sdio->bio = bio;
|
||||
sdio->logical_offset_in_bio = sdio->cur_page_fs_offset;
|
||||
}
|
||||
|
||||
@@ -24,6 +24,7 @@
|
||||
#include <linux/blkdev.h>
|
||||
#include <linux/quotaops.h>
|
||||
#include <linux/part_stat.h>
|
||||
#include <linux/rw_hint.h>
|
||||
#include <crypto/hash.h>
|
||||
|
||||
#include <linux/fscrypt.h>
|
||||
|
||||
+40
-24
@@ -27,6 +27,7 @@
|
||||
#include <linux/memfd.h>
|
||||
#include <linux/compat.h>
|
||||
#include <linux/mount.h>
|
||||
#include <linux/rw_hint.h>
|
||||
|
||||
#include <linux/poll.h>
|
||||
#include <asm/siginfo.h>
|
||||
@@ -268,8 +269,15 @@ static int f_getowner_uids(struct file *filp, unsigned long arg)
|
||||
}
|
||||
#endif
|
||||
|
||||
static bool rw_hint_valid(enum rw_hint hint)
|
||||
static bool rw_hint_valid(u64 hint)
|
||||
{
|
||||
BUILD_BUG_ON(WRITE_LIFE_NOT_SET != RWH_WRITE_LIFE_NOT_SET);
|
||||
BUILD_BUG_ON(WRITE_LIFE_NONE != RWH_WRITE_LIFE_NONE);
|
||||
BUILD_BUG_ON(WRITE_LIFE_SHORT != RWH_WRITE_LIFE_SHORT);
|
||||
BUILD_BUG_ON(WRITE_LIFE_MEDIUM != RWH_WRITE_LIFE_MEDIUM);
|
||||
BUILD_BUG_ON(WRITE_LIFE_LONG != RWH_WRITE_LIFE_LONG);
|
||||
BUILD_BUG_ON(WRITE_LIFE_EXTREME != RWH_WRITE_LIFE_EXTREME);
|
||||
|
||||
switch (hint) {
|
||||
case RWH_WRITE_LIFE_NOT_SET:
|
||||
case RWH_WRITE_LIFE_NONE:
|
||||
@@ -283,34 +291,40 @@ static bool rw_hint_valid(enum rw_hint hint)
|
||||
}
|
||||
}
|
||||
|
||||
static long fcntl_rw_hint(struct file *file, unsigned int cmd,
|
||||
unsigned long arg)
|
||||
static long fcntl_get_rw_hint(struct file *file, unsigned int cmd,
|
||||
unsigned long arg)
|
||||
{
|
||||
struct inode *inode = file_inode(file);
|
||||
u64 __user *argp = (u64 __user *)arg;
|
||||
enum rw_hint hint;
|
||||
u64 h;
|
||||
u64 hint = READ_ONCE(inode->i_write_hint);
|
||||
|
||||
switch (cmd) {
|
||||
case F_GET_RW_HINT:
|
||||
h = inode->i_write_hint;
|
||||
if (copy_to_user(argp, &h, sizeof(*argp)))
|
||||
return -EFAULT;
|
||||
return 0;
|
||||
case F_SET_RW_HINT:
|
||||
if (copy_from_user(&h, argp, sizeof(h)))
|
||||
return -EFAULT;
|
||||
hint = (enum rw_hint) h;
|
||||
if (!rw_hint_valid(hint))
|
||||
return -EINVAL;
|
||||
if (copy_to_user(argp, &hint, sizeof(*argp)))
|
||||
return -EFAULT;
|
||||
return 0;
|
||||
}
|
||||
|
||||
inode_lock(inode);
|
||||
inode->i_write_hint = hint;
|
||||
inode_unlock(inode);
|
||||
return 0;
|
||||
default:
|
||||
static long fcntl_set_rw_hint(struct file *file, unsigned int cmd,
|
||||
unsigned long arg)
|
||||
{
|
||||
struct inode *inode = file_inode(file);
|
||||
u64 __user *argp = (u64 __user *)arg;
|
||||
u64 hint;
|
||||
|
||||
if (copy_from_user(&hint, argp, sizeof(hint)))
|
||||
return -EFAULT;
|
||||
if (!rw_hint_valid(hint))
|
||||
return -EINVAL;
|
||||
}
|
||||
|
||||
WRITE_ONCE(inode->i_write_hint, hint);
|
||||
|
||||
/*
|
||||
* file->f_mapping->host may differ from inode. As an example,
|
||||
* blkdev_open() modifies file->f_mapping.
|
||||
*/
|
||||
if (file->f_mapping->host != inode)
|
||||
WRITE_ONCE(file->f_mapping->host->i_write_hint, hint);
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
static long do_fcntl(int fd, unsigned int cmd, unsigned long arg,
|
||||
@@ -416,8 +430,10 @@ static long do_fcntl(int fd, unsigned int cmd, unsigned long arg,
|
||||
err = memfd_fcntl(filp, cmd, argi);
|
||||
break;
|
||||
case F_GET_RW_HINT:
|
||||
err = fcntl_get_rw_hint(filp, cmd, arg);
|
||||
break;
|
||||
case F_SET_RW_HINT:
|
||||
err = fcntl_rw_hint(filp, cmd, arg);
|
||||
err = fcntl_set_rw_hint(filp, cmd, arg);
|
||||
break;
|
||||
default:
|
||||
break;
|
||||
|
||||
+1
-1
@@ -2465,7 +2465,7 @@ out:
|
||||
}
|
||||
|
||||
static int gfs2_map_blocks(struct iomap_writepage_ctx *wpc, struct inode *inode,
|
||||
loff_t offset)
|
||||
loff_t offset, unsigned int len)
|
||||
{
|
||||
int ret;
|
||||
|
||||
|
||||
@@ -20,6 +20,7 @@
|
||||
#include <linux/ratelimit.h>
|
||||
#include <linux/list_lru.h>
|
||||
#include <linux/iversion.h>
|
||||
#include <linux/rw_hint.h>
|
||||
#include <trace/events/writeback.h>
|
||||
#include "internal.h"
|
||||
|
||||
|
||||
+297
-308
@@ -1,7 +1,7 @@
|
||||
// SPDX-License-Identifier: GPL-2.0
|
||||
/*
|
||||
* Copyright (C) 2010 Red Hat, Inc.
|
||||
* Copyright (C) 2016-2019 Christoph Hellwig.
|
||||
* Copyright (C) 2016-2023 Christoph Hellwig.
|
||||
*/
|
||||
#include <linux/module.h>
|
||||
#include <linux/compiler.h>
|
||||
@@ -95,6 +95,44 @@ static inline bool ifs_block_is_dirty(struct folio *folio,
|
||||
return test_bit(block + blks_per_folio, ifs->state);
|
||||
}
|
||||
|
||||
/*
* Locate the first contiguous run of dirty blocks in @folio, scanning the
* per-block dirty state in @ifs from the block containing *@range_start up to
* (but not including) the end block derived from @range_end.
*
* On success *@range_start is rewritten to the file position of the first
* dirty block and the length of the run in bytes is returned.  Returns 0 when
* no dirty block is found before the end block; *@range_start is left
* untouched in that case.
*
* NOTE(review): the first scan loop tests start_blk before comparing it with
* end_blk, so this appears to rely on the caller passing
* *@range_start < @range_end - confirm against callers.
*/
static unsigned ifs_find_dirty_range(struct folio *folio,
|
||||
struct iomap_folio_state *ifs, u64 *range_start, u64 range_end)
|
||||
{
|
||||
struct inode *inode = folio->mapping->host;
|
||||
/* Block index within the folio at which the search starts. */
unsigned start_blk =
|
||||
offset_in_folio(folio, *range_start) >> inode->i_blkbits;
|
||||
/*
* Exclusive end block: the block index of @range_end within the folio,
* capped at the number of blocks in the folio.  Presumably min_not_zero()
* is used because a @range_end at the very end of the folio has an
* in-folio offset of 0 - TODO confirm.
*/
unsigned end_blk = min_not_zero(
|
||||
offset_in_folio(folio, range_end) >> inode->i_blkbits,
|
||||
i_blocks_per_folio(inode, folio));
|
||||
unsigned nblks = 1;
|
||||
|
||||
/* Skip leading clean blocks; give up if the end block is reached. */
while (!ifs_block_is_dirty(folio, ifs, start_blk))
|
||||
if (++start_blk == end_blk)
|
||||
return 0;
|
||||
|
||||
/* Extend the run over every following dirty block below end_blk. */
while (start_blk + nblks < end_blk) {
|
||||
if (!ifs_block_is_dirty(folio, ifs, start_blk + nblks))
|
||||
break;
|
||||
nblks++;
|
||||
}
|
||||
|
||||
/* Report the run as a file position plus a byte length. */
*range_start = folio_pos(folio) + (start_blk << inode->i_blkbits);
|
||||
return nblks << inode->i_blkbits;
|
||||
}
|
||||
|
||||
/*
* Find the next dirty byte range of @folio inside [*@range_start, @range_end).
*
* Returns 0 when the range is empty (*@range_start >= @range_end).  When the
* folio carries per-block state in folio->private, the search is delegated to
* ifs_find_dirty_range(), which may advance *@range_start.  Without per-block
* state the whole remaining range is reported, i.e. @range_end - *@range_start
* is returned and *@range_start is not modified.
*/
static unsigned iomap_find_dirty_range(struct folio *folio, u64 *range_start,
|
||||
u64 range_end)
|
||||
{
|
||||
struct iomap_folio_state *ifs = folio->private;
|
||||
|
||||
/* Nothing left to look at. */
if (*range_start >= range_end)
|
||||
return 0;
|
||||
|
||||
if (ifs)
|
||||
return ifs_find_dirty_range(folio, ifs, range_start, range_end);
|
||||
/* No per-block tracking: treat the entire remaining range as dirty. */
return range_end - *range_start;
|
||||
}
|
||||
|
||||
static void ifs_clear_range_dirty(struct folio *folio,
|
||||
struct iomap_folio_state *ifs, size_t off, size_t len)
|
||||
{
|
||||
@@ -1454,15 +1492,10 @@ out_unlock:
|
||||
EXPORT_SYMBOL_GPL(iomap_page_mkwrite);
|
||||
|
||||
static void iomap_finish_folio_write(struct inode *inode, struct folio *folio,
|
||||
size_t len, int error)
|
||||
size_t len)
|
||||
{
|
||||
struct iomap_folio_state *ifs = folio->private;
|
||||
|
||||
if (error) {
|
||||
folio_set_error(folio);
|
||||
mapping_set_error(inode->i_mapping, error);
|
||||
}
|
||||
|
||||
WARN_ON_ONCE(i_blocks_per_folio(inode, folio) > 1 && !ifs);
|
||||
WARN_ON_ONCE(ifs && atomic_read(&ifs->write_bytes_pending) <= 0);
|
||||
|
||||
@@ -1479,40 +1512,29 @@ static u32
|
||||
iomap_finish_ioend(struct iomap_ioend *ioend, int error)
|
||||
{
|
||||
struct inode *inode = ioend->io_inode;
|
||||
struct bio *bio = &ioend->io_inline_bio;
|
||||
struct bio *last = ioend->io_bio, *next;
|
||||
u64 start = bio->bi_iter.bi_sector;
|
||||
loff_t offset = ioend->io_offset;
|
||||
bool quiet = bio_flagged(bio, BIO_QUIET);
|
||||
struct bio *bio = &ioend->io_bio;
|
||||
struct folio_iter fi;
|
||||
u32 folio_count = 0;
|
||||
|
||||
for (bio = &ioend->io_inline_bio; bio; bio = next) {
|
||||
struct folio_iter fi;
|
||||
|
||||
/*
|
||||
* For the last bio, bi_private points to the ioend, so we
|
||||
* need to explicitly end the iteration here.
|
||||
*/
|
||||
if (bio == last)
|
||||
next = NULL;
|
||||
else
|
||||
next = bio->bi_private;
|
||||
|
||||
/* walk all folios in bio, ending page IO on them */
|
||||
bio_for_each_folio_all(fi, bio) {
|
||||
iomap_finish_folio_write(inode, fi.folio, fi.length,
|
||||
error);
|
||||
folio_count++;
|
||||
}
|
||||
bio_put(bio);
|
||||
}
|
||||
/* The ioend has been freed by bio_put() */
|
||||
|
||||
if (unlikely(error && !quiet)) {
|
||||
printk_ratelimited(KERN_ERR
|
||||
if (error) {
|
||||
mapping_set_error(inode->i_mapping, error);
|
||||
if (!bio_flagged(bio, BIO_QUIET)) {
|
||||
pr_err_ratelimited(
|
||||
"%s: writeback error on inode %lu, offset %lld, sector %llu",
|
||||
inode->i_sb->s_id, inode->i_ino, offset, start);
|
||||
inode->i_sb->s_id, inode->i_ino,
|
||||
ioend->io_offset, ioend->io_sector);
|
||||
}
|
||||
}
|
||||
|
||||
/* walk all folios in bio, ending page IO on them */
|
||||
bio_for_each_folio_all(fi, bio) {
|
||||
if (error)
|
||||
folio_set_error(fi.folio);
|
||||
iomap_finish_folio_write(inode, fi.folio, fi.length);
|
||||
folio_count++;
|
||||
}
|
||||
|
||||
bio_put(bio); /* frees the ioend */
|
||||
return folio_count;
|
||||
}
|
||||
|
||||
@@ -1553,7 +1575,7 @@ EXPORT_SYMBOL_GPL(iomap_finish_ioends);
|
||||
static bool
|
||||
iomap_ioend_can_merge(struct iomap_ioend *ioend, struct iomap_ioend *next)
|
||||
{
|
||||
if (ioend->io_bio->bi_status != next->io_bio->bi_status)
|
||||
if (ioend->io_bio.bi_status != next->io_bio.bi_status)
|
||||
return false;
|
||||
if ((ioend->io_flags & IOMAP_F_SHARED) ^
|
||||
(next->io_flags & IOMAP_F_SHARED))
|
||||
@@ -1618,47 +1640,46 @@ EXPORT_SYMBOL_GPL(iomap_sort_ioends);
|
||||
|
||||
static void iomap_writepage_end_bio(struct bio *bio)
|
||||
{
|
||||
struct iomap_ioend *ioend = bio->bi_private;
|
||||
|
||||
iomap_finish_ioend(ioend, blk_status_to_errno(bio->bi_status));
|
||||
iomap_finish_ioend(iomap_ioend_from_bio(bio),
|
||||
blk_status_to_errno(bio->bi_status));
|
||||
}
|
||||
|
||||
/*
|
||||
* Submit the final bio for an ioend.
|
||||
*
|
||||
* If @error is non-zero, it means that we have a situation where some part of
|
||||
* the submission process has failed after we've marked pages for writeback
|
||||
* and unlocked them. In this situation, we need to fail the bio instead of
|
||||
* submitting it. This typically only happens on a filesystem shutdown.
|
||||
* the submission process has failed after we've marked pages for writeback.
|
||||
* We cannot cancel ioend directly in that case, so call the bio end I/O handler
|
||||
* with the error status here to run the normal I/O completion handler to clear
|
||||
* the writeback bit and let the file system process the errors.
|
||||
*/
|
||||
static int
|
||||
iomap_submit_ioend(struct iomap_writepage_ctx *wpc, struct iomap_ioend *ioend,
|
||||
int error)
|
||||
static int iomap_submit_ioend(struct iomap_writepage_ctx *wpc, int error)
|
||||
{
|
||||
ioend->io_bio->bi_private = ioend;
|
||||
ioend->io_bio->bi_end_io = iomap_writepage_end_bio;
|
||||
|
||||
if (wpc->ops->prepare_ioend)
|
||||
error = wpc->ops->prepare_ioend(ioend, error);
|
||||
if (error) {
|
||||
/*
|
||||
* If we're failing the IO now, just mark the ioend with an
|
||||
* error and finish it. This will run IO completion immediately
|
||||
* as there is only one reference to the ioend at this point in
|
||||
* time.
|
||||
*/
|
||||
ioend->io_bio->bi_status = errno_to_blk_status(error);
|
||||
bio_endio(ioend->io_bio);
|
||||
if (!wpc->ioend)
|
||||
return error;
|
||||
|
||||
/*
|
||||
* Let the file systems prepare the I/O submission and hook in an I/O
|
||||
* completion handler. This also needs to happen even after a
|
||||
* failure happened so that the file system end I/O handler gets called
|
||||
* to clean up.
|
||||
*/
|
||||
if (wpc->ops->prepare_ioend)
|
||||
error = wpc->ops->prepare_ioend(wpc->ioend, error);
|
||||
|
||||
if (error) {
|
||||
wpc->ioend->io_bio.bi_status = errno_to_blk_status(error);
|
||||
bio_endio(&wpc->ioend->io_bio);
|
||||
} else {
|
||||
submit_bio(&wpc->ioend->io_bio);
|
||||
}
|
||||
|
||||
submit_bio(ioend->io_bio);
|
||||
return 0;
|
||||
wpc->ioend = NULL;
|
||||
return error;
|
||||
}
|
||||
|
||||
static struct iomap_ioend *
|
||||
iomap_alloc_ioend(struct inode *inode, struct iomap_writepage_ctx *wpc,
|
||||
loff_t offset, sector_t sector, struct writeback_control *wbc)
|
||||
static struct iomap_ioend *iomap_alloc_ioend(struct iomap_writepage_ctx *wpc,
|
||||
struct writeback_control *wbc, struct inode *inode, loff_t pos)
|
||||
{
|
||||
struct iomap_ioend *ioend;
|
||||
struct bio *bio;
|
||||
@@ -1666,63 +1687,42 @@ iomap_alloc_ioend(struct inode *inode, struct iomap_writepage_ctx *wpc,
|
||||
bio = bio_alloc_bioset(wpc->iomap.bdev, BIO_MAX_VECS,
|
||||
REQ_OP_WRITE | wbc_to_write_flags(wbc),
|
||||
GFP_NOFS, &iomap_ioend_bioset);
|
||||
bio->bi_iter.bi_sector = sector;
|
||||
bio->bi_iter.bi_sector = iomap_sector(&wpc->iomap, pos);
|
||||
bio->bi_end_io = iomap_writepage_end_bio;
|
||||
wbc_init_bio(wbc, bio);
|
||||
bio->bi_write_hint = inode->i_write_hint;
|
||||
|
||||
ioend = container_of(bio, struct iomap_ioend, io_inline_bio);
|
||||
ioend = iomap_ioend_from_bio(bio);
|
||||
INIT_LIST_HEAD(&ioend->io_list);
|
||||
ioend->io_type = wpc->iomap.type;
|
||||
ioend->io_flags = wpc->iomap.flags;
|
||||
ioend->io_inode = inode;
|
||||
ioend->io_size = 0;
|
||||
ioend->io_folios = 0;
|
||||
ioend->io_offset = offset;
|
||||
ioend->io_bio = bio;
|
||||
ioend->io_sector = sector;
|
||||
ioend->io_offset = pos;
|
||||
ioend->io_sector = bio->bi_iter.bi_sector;
|
||||
|
||||
wpc->nr_folios = 0;
|
||||
return ioend;
|
||||
}
|
||||
|
||||
/*
|
||||
* Allocate a new bio, and chain the old bio to the new one.
|
||||
*
|
||||
* Note that we have to perform the chaining in this unintuitive order
|
||||
* so that the bi_private linkage is set up in the right direction for the
|
||||
* traversal in iomap_finish_ioend().
|
||||
*/
|
||||
static struct bio *
|
||||
iomap_chain_bio(struct bio *prev)
|
||||
{
|
||||
struct bio *new;
|
||||
|
||||
new = bio_alloc(prev->bi_bdev, BIO_MAX_VECS, prev->bi_opf, GFP_NOFS);
|
||||
bio_clone_blkg_association(new, prev);
|
||||
new->bi_iter.bi_sector = bio_end_sector(prev);
|
||||
|
||||
bio_chain(prev, new);
|
||||
bio_get(prev); /* for iomap_finish_ioend */
|
||||
submit_bio(prev);
|
||||
return new;
|
||||
}
|
||||
|
||||
static bool
|
||||
iomap_can_add_to_ioend(struct iomap_writepage_ctx *wpc, loff_t offset,
|
||||
sector_t sector)
|
||||
static bool iomap_can_add_to_ioend(struct iomap_writepage_ctx *wpc, loff_t pos)
|
||||
{
|
||||
if ((wpc->iomap.flags & IOMAP_F_SHARED) !=
|
||||
(wpc->ioend->io_flags & IOMAP_F_SHARED))
|
||||
return false;
|
||||
if (wpc->iomap.type != wpc->ioend->io_type)
|
||||
return false;
|
||||
if (offset != wpc->ioend->io_offset + wpc->ioend->io_size)
|
||||
if (pos != wpc->ioend->io_offset + wpc->ioend->io_size)
|
||||
return false;
|
||||
if (sector != bio_end_sector(wpc->ioend->io_bio))
|
||||
if (iomap_sector(&wpc->iomap, pos) !=
|
||||
bio_end_sector(&wpc->ioend->io_bio))
|
||||
return false;
|
||||
/*
|
||||
* Limit ioend bio chain lengths to minimise IO completion latency. This
|
||||
* also prevents long tight loops ending page writeback on all the
|
||||
* folios in the ioend.
|
||||
*/
|
||||
if (wpc->ioend->io_folios >= IOEND_BATCH_SIZE)
|
||||
if (wpc->nr_folios >= IOEND_BATCH_SIZE)
|
||||
return false;
|
||||
return true;
|
||||
}
|
||||
@@ -1730,255 +1730,238 @@ iomap_can_add_to_ioend(struct iomap_writepage_ctx *wpc, loff_t offset,
|
||||
/*
|
||||
* Test to see if we have an existing ioend structure that we could append to
|
||||
* first; otherwise finish off the current ioend and start another.
|
||||
*
|
||||
* If a new ioend is created and cached, the old ioend is submitted to the block
|
||||
* layer instantly. Batching optimisations are provided by higher level block
|
||||
* plugging.
|
||||
*
|
||||
* At the end of a writeback pass, there will be a cached ioend remaining on the
|
||||
* writepage context that the caller will need to submit.
|
||||
*/
|
||||
static void
|
||||
iomap_add_to_ioend(struct inode *inode, loff_t pos, struct folio *folio,
|
||||
struct iomap_folio_state *ifs, struct iomap_writepage_ctx *wpc,
|
||||
struct writeback_control *wbc, struct list_head *iolist)
|
||||
static int iomap_add_to_ioend(struct iomap_writepage_ctx *wpc,
|
||||
struct writeback_control *wbc, struct folio *folio,
|
||||
struct inode *inode, loff_t pos, unsigned len)
|
||||
{
|
||||
sector_t sector = iomap_sector(&wpc->iomap, pos);
|
||||
unsigned len = i_blocksize(inode);
|
||||
struct iomap_folio_state *ifs = folio->private;
|
||||
size_t poff = offset_in_folio(folio, pos);
|
||||
int error;
|
||||
|
||||
if (!wpc->ioend || !iomap_can_add_to_ioend(wpc, pos, sector)) {
|
||||
if (wpc->ioend)
|
||||
list_add(&wpc->ioend->io_list, iolist);
|
||||
wpc->ioend = iomap_alloc_ioend(inode, wpc, pos, sector, wbc);
|
||||
if (!wpc->ioend || !iomap_can_add_to_ioend(wpc, pos)) {
|
||||
new_ioend:
|
||||
error = iomap_submit_ioend(wpc, 0);
|
||||
if (error)
|
||||
return error;
|
||||
wpc->ioend = iomap_alloc_ioend(wpc, wbc, inode, pos);
|
||||
}
|
||||
|
||||
if (!bio_add_folio(wpc->ioend->io_bio, folio, len, poff)) {
|
||||
wpc->ioend->io_bio = iomap_chain_bio(wpc->ioend->io_bio);
|
||||
bio_add_folio_nofail(wpc->ioend->io_bio, folio, len, poff);
|
||||
}
|
||||
if (!bio_add_folio(&wpc->ioend->io_bio, folio, len, poff))
|
||||
goto new_ioend;
|
||||
|
||||
if (ifs)
|
||||
atomic_add(len, &ifs->write_bytes_pending);
|
||||
wpc->ioend->io_size += len;
|
||||
wbc_account_cgroup_owner(wbc, &folio->page, len);
|
||||
return 0;
|
||||
}
|
||||
|
||||
/*
|
||||
* We implement an immediate ioend submission policy here to avoid needing to
|
||||
* chain multiple ioends and hence nest mempool allocations which can violate
|
||||
* the forward progress guarantees we need to provide. The current ioend we're
|
||||
* adding blocks to is cached in the writepage context, and if the new block
|
||||
* doesn't append to the cached ioend, it will create a new ioend and cache that
|
||||
* instead.
|
||||
*
|
||||
* If a new ioend is created and cached, the old ioend is returned and queued
|
||||
* locally for submission once the entire page is processed or an error has been
|
||||
* detected. While ioends are submitted immediately after they are completed,
|
||||
* batching optimisations are provided by higher level block plugging.
|
||||
*
|
||||
* At the end of a writeback pass, there will be a cached ioend remaining on the
|
||||
* writepage context that the caller will need to submit.
|
||||
*/
|
||||
static int
|
||||
iomap_writepage_map(struct iomap_writepage_ctx *wpc,
|
||||
struct writeback_control *wbc, struct inode *inode,
|
||||
struct folio *folio, u64 end_pos)
|
||||
static int iomap_writepage_map_blocks(struct iomap_writepage_ctx *wpc,
|
||||
struct writeback_control *wbc, struct folio *folio,
|
||||
struct inode *inode, u64 pos, unsigned dirty_len,
|
||||
unsigned *count)
|
||||
{
|
||||
struct iomap_folio_state *ifs = folio->private;
|
||||
struct iomap_ioend *ioend, *next;
|
||||
unsigned len = i_blocksize(inode);
|
||||
unsigned nblocks = i_blocks_per_folio(inode, folio);
|
||||
u64 pos = folio_pos(folio);
|
||||
int error = 0, count = 0, i;
|
||||
LIST_HEAD(submit_list);
|
||||
int error;
|
||||
|
||||
WARN_ON_ONCE(end_pos <= pos);
|
||||
do {
|
||||
unsigned map_len;
|
||||
|
||||
if (!ifs && nblocks > 1) {
|
||||
ifs = ifs_alloc(inode, folio, 0);
|
||||
iomap_set_range_dirty(folio, 0, end_pos - pos);
|
||||
}
|
||||
|
||||
WARN_ON_ONCE(ifs && atomic_read(&ifs->write_bytes_pending) != 0);
|
||||
|
||||
/*
|
||||
* Walk through the folio to find areas to write back. If we
|
||||
* run off the end of the current map or find the current map
|
||||
* invalid, grab a new one.
|
||||
*/
|
||||
for (i = 0; i < nblocks && pos < end_pos; i++, pos += len) {
|
||||
if (ifs && !ifs_block_is_dirty(folio, ifs, i))
|
||||
continue;
|
||||
|
||||
error = wpc->ops->map_blocks(wpc, inode, pos);
|
||||
error = wpc->ops->map_blocks(wpc, inode, pos, dirty_len);
|
||||
if (error)
|
||||
break;
|
||||
trace_iomap_writepage_map(inode, &wpc->iomap);
|
||||
if (WARN_ON_ONCE(wpc->iomap.type == IOMAP_INLINE))
|
||||
continue;
|
||||
if (wpc->iomap.type == IOMAP_HOLE)
|
||||
continue;
|
||||
iomap_add_to_ioend(inode, pos, folio, ifs, wpc, wbc,
|
||||
&submit_list);
|
||||
count++;
|
||||
}
|
||||
if (count)
|
||||
wpc->ioend->io_folios++;
|
||||
trace_iomap_writepage_map(inode, pos, dirty_len, &wpc->iomap);
|
||||
|
||||
WARN_ON_ONCE(!wpc->ioend && !list_empty(&submit_list));
|
||||
WARN_ON_ONCE(!folio_test_locked(folio));
|
||||
WARN_ON_ONCE(folio_test_writeback(folio));
|
||||
WARN_ON_ONCE(folio_test_dirty(folio));
|
||||
map_len = min_t(u64, dirty_len,
|
||||
wpc->iomap.offset + wpc->iomap.length - pos);
|
||||
WARN_ON_ONCE(!folio->private && map_len < dirty_len);
|
||||
|
||||
switch (wpc->iomap.type) {
|
||||
case IOMAP_INLINE:
|
||||
WARN_ON_ONCE(1);
|
||||
error = -EIO;
|
||||
break;
|
||||
case IOMAP_HOLE:
|
||||
break;
|
||||
default:
|
||||
error = iomap_add_to_ioend(wpc, wbc, folio, inode, pos,
|
||||
map_len);
|
||||
if (!error)
|
||||
(*count)++;
|
||||
break;
|
||||
}
|
||||
dirty_len -= map_len;
|
||||
pos += map_len;
|
||||
} while (dirty_len && !error);
|
||||
|
||||
/*
|
||||
* We cannot cancel the ioend directly here on error. We may have
|
||||
* already set other pages under writeback and hence we have to run I/O
|
||||
* completion to mark the error state of the pages under writeback
|
||||
* appropriately.
|
||||
*
|
||||
* Just let the file system know what portion of the folio failed to
|
||||
* map.
|
||||
*/
|
||||
if (unlikely(error)) {
|
||||
if (error && wpc->ops->discard_folio)
|
||||
wpc->ops->discard_folio(folio, pos);
|
||||
return error;
|
||||
}
|
||||
|
||||
/*
|
||||
* Check interaction of the folio with the file end.
|
||||
*
|
||||
* If the folio is entirely beyond i_size, return false. If it straddles
|
||||
* i_size, adjust end_pos and zero all data beyond i_size.
|
||||
*/
|
||||
static bool iomap_writepage_handle_eof(struct folio *folio, struct inode *inode,
|
||||
u64 *end_pos)
|
||||
{
|
||||
u64 isize = i_size_read(inode);
|
||||
|
||||
if (*end_pos > isize) {
|
||||
size_t poff = offset_in_folio(folio, isize);
|
||||
pgoff_t end_index = isize >> PAGE_SHIFT;
|
||||
|
||||
/*
|
||||
* Let the filesystem know what portion of the current page
|
||||
* failed to map. If the page hasn't been added to ioend, it
|
||||
* won't be affected by I/O completion and we must unlock it
|
||||
* now.
|
||||
* If the folio is entirely outside of i_size, skip it.
|
||||
*
|
||||
* This can happen due to a truncate operation that is in
|
||||
* progress and in that case truncate will finish it off once
|
||||
* we've dropped the folio lock.
|
||||
*
|
||||
* Note that the pgoff_t used for end_index is an unsigned long.
|
||||
* If the given offset is greater than 16TB on a 32-bit system,
|
||||
* then if we checked if the folio is fully outside i_size with
|
||||
* "if (folio->index >= end_index + 1)", "end_index + 1" would
|
||||
* overflow and evaluate to 0. Hence this folio would be
|
||||
* redirtied and written out repeatedly, which would result in
|
||||
* an infinite loop; the user program performing this operation
|
||||
* would hang. Instead, we can detect this situation by
|
||||
* checking if the folio is totally beyond i_size or if its
|
||||
* offset is just equal to the EOF.
|
||||
*/
|
||||
if (wpc->ops->discard_folio)
|
||||
wpc->ops->discard_folio(folio, pos);
|
||||
if (!count) {
|
||||
folio_unlock(folio);
|
||||
goto done;
|
||||
}
|
||||
if (folio->index > end_index ||
|
||||
(folio->index == end_index && poff == 0))
|
||||
return false;
|
||||
|
||||
/*
|
||||
* The folio straddles i_size.
|
||||
*
|
||||
* It must be zeroed out on each and every writepage invocation
|
||||
* because it may be mmapped:
|
||||
*
|
||||
* A file is mapped in multiples of the page size. For a
|
||||
* file that is not a multiple of the page size, the
|
||||
* remaining memory is zeroed when mapped, and writes to that
|
||||
* region are not written out to the file.
|
||||
*
|
||||
* Also adjust the writeback range to skip all blocks entirely
|
||||
* beyond i_size.
|
||||
*/
|
||||
folio_zero_segment(folio, poff, folio_size(folio));
|
||||
*end_pos = round_up(isize, i_blocksize(inode));
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
static int iomap_writepage_map(struct iomap_writepage_ctx *wpc,
|
||||
struct writeback_control *wbc, struct folio *folio)
|
||||
{
|
||||
struct iomap_folio_state *ifs = folio->private;
|
||||
struct inode *inode = folio->mapping->host;
|
||||
u64 pos = folio_pos(folio);
|
||||
u64 end_pos = pos + folio_size(folio);
|
||||
unsigned count = 0;
|
||||
int error = 0;
|
||||
u32 rlen;
|
||||
|
||||
WARN_ON_ONCE(!folio_test_locked(folio));
|
||||
WARN_ON_ONCE(folio_test_dirty(folio));
|
||||
WARN_ON_ONCE(folio_test_writeback(folio));
|
||||
|
||||
trace_iomap_writepage(inode, pos, folio_size(folio));
|
||||
|
||||
if (!iomap_writepage_handle_eof(folio, inode, &end_pos)) {
|
||||
folio_unlock(folio);
|
||||
return 0;
|
||||
}
|
||||
WARN_ON_ONCE(end_pos <= pos);
|
||||
|
||||
if (i_blocks_per_folio(inode, folio) > 1) {
|
||||
if (!ifs) {
|
||||
ifs = ifs_alloc(inode, folio, 0);
|
||||
iomap_set_range_dirty(folio, 0, end_pos - pos);
|
||||
}
|
||||
|
||||
/*
|
||||
* Keep the I/O completion handler from clearing the writeback
|
||||
* bit until we have submitted all blocks by adding a bias to
|
||||
* ifs->write_bytes_pending, which is dropped after submitting
|
||||
* all blocks.
|
||||
*/
|
||||
WARN_ON_ONCE(atomic_read(&ifs->write_bytes_pending) != 0);
|
||||
atomic_inc(&ifs->write_bytes_pending);
|
||||
}
|
||||
|
||||
/*
|
||||
* Set the writeback bit ASAP, as the I/O completion for the single
|
||||
* block per folio case can happen as soon as we're submitting the bio.
|
||||
*/
|
||||
folio_start_writeback(folio);
|
||||
|
||||
/*
|
||||
* Walk through the folio to find dirty areas to write back.
|
||||
*/
|
||||
while ((rlen = iomap_find_dirty_range(folio, &pos, end_pos))) {
|
||||
error = iomap_writepage_map_blocks(wpc, wbc, folio, inode,
|
||||
pos, rlen, &count);
|
||||
if (error)
|
||||
break;
|
||||
pos += rlen;
|
||||
}
|
||||
|
||||
if (count)
|
||||
wpc->nr_folios++;
|
||||
|
||||
/*
|
||||
* We can have dirty bits set past end of file in page_mkwrite path
|
||||
* while mapping the last partial folio. Hence it's better to clear
|
||||
* all the dirty bits in the folio here.
|
||||
*/
|
||||
iomap_clear_range_dirty(folio, 0, folio_size(folio));
|
||||
folio_start_writeback(folio);
|
||||
|
||||
/*
|
||||
* Usually the writeback bit is cleared by the I/O completion handler.
|
||||
* But we may end up either not actually writing any blocks, or (when
|
||||
* there are multiple blocks in a folio) all I/O might have finished
|
||||
* already at this point. In that case we need to clear the writeback
|
||||
* bit ourselves right after unlocking the page.
|
||||
*/
|
||||
folio_unlock(folio);
|
||||
|
||||
/*
|
||||
* Preserve the original error if there was one; catch
|
||||
* submission errors here and propagate into subsequent ioend
|
||||
* submissions.
|
||||
*/
|
||||
list_for_each_entry_safe(ioend, next, &submit_list, io_list) {
|
||||
int error2;
|
||||
|
||||
list_del_init(&ioend->io_list);
|
||||
error2 = iomap_submit_ioend(wpc, ioend, error);
|
||||
if (error2 && !error)
|
||||
error = error2;
|
||||
if (ifs) {
|
||||
if (atomic_dec_and_test(&ifs->write_bytes_pending))
|
||||
folio_end_writeback(folio);
|
||||
} else {
|
||||
if (!count)
|
||||
folio_end_writeback(folio);
|
||||
}
|
||||
|
||||
/*
|
||||
* We can end up here with no error and nothing to write only if we race
|
||||
* with a partial page truncate on a sub-page block sized filesystem.
|
||||
*/
|
||||
if (!count)
|
||||
folio_end_writeback(folio);
|
||||
done:
|
||||
mapping_set_error(inode->i_mapping, error);
|
||||
return error;
|
||||
}
|
||||
|
||||
/*
|
||||
* Write out a dirty page.
|
||||
*
|
||||
* For delalloc space on the page, we need to allocate space and flush it.
|
||||
* For unwritten space on the page, we need to start the conversion to
|
||||
* regular allocated space.
|
||||
*/
|
||||
static int iomap_do_writepage(struct folio *folio,
|
||||
struct writeback_control *wbc, void *data)
|
||||
{
|
||||
struct iomap_writepage_ctx *wpc = data;
|
||||
struct inode *inode = folio->mapping->host;
|
||||
u64 end_pos, isize;
|
||||
|
||||
trace_iomap_writepage(inode, folio_pos(folio), folio_size(folio));
|
||||
|
||||
/*
|
||||
* Refuse to write the folio out if we're called from reclaim context.
|
||||
*
|
||||
* This avoids stack overflows when called from deeply used stacks in
|
||||
* random callers for direct reclaim or memcg reclaim. We explicitly
|
||||
* allow reclaim from kswapd as the stack usage there is relatively low.
|
||||
*
|
||||
* This should never happen except in the case of a VM regression so
|
||||
* warn about it.
|
||||
*/
|
||||
if (WARN_ON_ONCE((current->flags & (PF_MEMALLOC|PF_KSWAPD)) ==
|
||||
PF_MEMALLOC))
|
||||
goto redirty;
|
||||
|
||||
/*
|
||||
* Is this folio beyond the end of the file?
|
||||
*
|
||||
* The folio index is less than the end_index, adjust the end_pos
|
||||
* to the highest offset that this folio should represent.
|
||||
* -----------------------------------------------------
|
||||
* | file mapping | <EOF> |
|
||||
* -----------------------------------------------------
|
||||
* | Page ... | Page N-2 | Page N-1 | Page N | |
|
||||
* ^--------------------------------^----------|--------
|
||||
* | desired writeback range | see else |
|
||||
* ---------------------------------^------------------|
|
||||
*/
|
||||
isize = i_size_read(inode);
|
||||
end_pos = folio_pos(folio) + folio_size(folio);
|
||||
if (end_pos > isize) {
|
||||
/*
|
||||
* Check whether the page to write out is beyond or straddles
|
||||
* i_size or not.
|
||||
* -------------------------------------------------------
|
||||
* | file mapping | <EOF> |
|
||||
* -------------------------------------------------------
|
||||
* | Page ... | Page N-2 | Page N-1 | Page N | Beyond |
|
||||
* ^--------------------------------^-----------|---------
|
||||
* | | Straddles |
|
||||
* ---------------------------------^-----------|--------|
|
||||
*/
|
||||
size_t poff = offset_in_folio(folio, isize);
|
||||
pgoff_t end_index = isize >> PAGE_SHIFT;
|
||||
|
||||
/*
|
||||
* Skip the page if it's fully outside i_size, e.g.
|
||||
* due to a truncate operation that's in progress. We've
|
||||
* cleaned this page and truncate will finish things off for
|
||||
* us.
|
||||
*
|
||||
* Note that the end_index is unsigned long. If the given
|
||||
* offset is greater than 16TB on a 32-bit system then if we
|
||||
* checked if the page is fully outside i_size with
|
||||
* "if (page->index >= end_index + 1)", "end_index + 1" would
|
||||
* overflow and evaluate to 0. Hence this page would be
|
||||
* redirtied and written out repeatedly, which would result in
|
||||
* an infinite loop; the user program performing this operation
|
||||
* would hang. Instead, we can detect this situation by
|
||||
* checking if the page is totally beyond i_size or if its
|
||||
* offset is just equal to the EOF.
|
||||
*/
|
||||
if (folio->index > end_index ||
|
||||
(folio->index == end_index && poff == 0))
|
||||
goto unlock;
|
||||
|
||||
/*
|
||||
* The page straddles i_size. It must be zeroed out on each
|
||||
* and every writepage invocation because it may be mmapped.
|
||||
* "A file is mapped in multiples of the page size. For a file
|
||||
* that is not a multiple of the page size, the remaining
|
||||
* memory is zeroed when mapped, and writes to that region are
|
||||
* not written out to the file."
|
||||
*/
|
||||
folio_zero_segment(folio, poff, folio_size(folio));
|
||||
end_pos = isize;
|
||||
}
|
||||
|
||||
return iomap_writepage_map(wpc, wbc, inode, folio, end_pos);
|
||||
|
||||
redirty:
|
||||
folio_redirty_for_writepage(wbc, folio);
|
||||
unlock:
|
||||
folio_unlock(folio);
|
||||
return 0;
|
||||
return iomap_writepage_map(data, wbc, folio);
|
||||
}
|
||||
|
||||
int
|
||||
@@ -1988,18 +1971,24 @@ iomap_writepages(struct address_space *mapping, struct writeback_control *wbc,
|
||||
{
|
||||
int ret;
|
||||
|
||||
/*
|
||||
* Writeback from reclaim context should never happen except in the case
|
||||
* of a VM regression so warn about it and refuse to write the data.
|
||||
*/
|
||||
if (WARN_ON_ONCE((current->flags & (PF_MEMALLOC | PF_KSWAPD)) ==
|
||||
PF_MEMALLOC))
|
||||
return -EIO;
|
||||
|
||||
wpc->ops = ops;
|
||||
ret = write_cache_pages(mapping, wbc, iomap_do_writepage, wpc);
|
||||
if (!wpc->ioend)
|
||||
return ret;
|
||||
return iomap_submit_ioend(wpc, wpc->ioend, ret);
|
||||
return iomap_submit_ioend(wpc, ret);
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(iomap_writepages);
|
||||
|
||||
static int __init iomap_init(void)
|
||||
{
|
||||
return bioset_init(&iomap_ioend_bioset, 4 * (PAGE_SIZE / SECTOR_SIZE),
|
||||
offsetof(struct iomap_ioend, io_inline_bio),
|
||||
offsetof(struct iomap_ioend, io_bio),
|
||||
BIOSET_NEED_BVECS);
|
||||
}
|
||||
fs_initcall(iomap_init);
|
||||
|
||||
@@ -380,6 +380,7 @@ static loff_t iomap_dio_bio_iter(const struct iomap_iter *iter,
|
||||
fscrypt_set_bio_crypt_ctx(bio, inode, pos >> inode->i_blkbits,
|
||||
GFP_KERNEL);
|
||||
bio->bi_iter.bi_sector = iomap_sector(iomap, pos);
|
||||
bio->bi_write_hint = inode->i_write_hint;
|
||||
bio->bi_ioprio = dio->iocb->ki_ioprio;
|
||||
bio->bi_private = dio;
|
||||
bio->bi_end_io = iomap_dio_bio_end_io;
|
||||
|
||||
+46
-2
@@ -154,7 +154,48 @@ DEFINE_EVENT(iomap_class, name, \
|
||||
TP_ARGS(inode, iomap))
|
||||
DEFINE_IOMAP_EVENT(iomap_iter_dstmap);
|
||||
DEFINE_IOMAP_EVENT(iomap_iter_srcmap);
|
||||
DEFINE_IOMAP_EVENT(iomap_writepage_map);
|
||||
|
||||
/*
 * Tracepoint "iomap_writepage_map".
 *
 * Records the inode identity (dev, ino), the pos/dirty_len pair supplied by
 * the caller, and the addr, offset, length, type, flags and block device of
 * the iomap that is passed in.
 *
 * NOTE(review): presumably emitted once per dirty range mapped during
 * writeback; the call site is not visible here - confirm.
 */
TRACE_EVENT(iomap_writepage_map,
|
||||
TP_PROTO(struct inode *inode, u64 pos, unsigned int dirty_len,
|
||||
struct iomap *iomap),
|
||||
TP_ARGS(inode, pos, dirty_len, iomap),
|
||||
TP_STRUCT__entry(
|
||||
__field(dev_t, dev)
|
||||
__field(u64, ino)
|
||||
__field(u64, pos)
|
||||
__field(u64, dirty_len)
|
||||
__field(u64, addr)
|
||||
__field(loff_t, offset)
|
||||
__field(u64, length)
|
||||
__field(u16, type)
|
||||
__field(u16, flags)
|
||||
__field(dev_t, bdev)
|
||||
),
|
||||
TP_fast_assign(
|
||||
__entry->dev = inode->i_sb->s_dev;
|
||||
__entry->ino = inode->i_ino;
|
||||
__entry->pos = pos;
|
||||
__entry->dirty_len = dirty_len;
|
||||
__entry->addr = iomap->addr;
|
||||
__entry->offset = iomap->offset;
|
||||
__entry->length = iomap->length;
|
||||
__entry->type = iomap->type;
|
||||
__entry->flags = iomap->flags;
|
||||
/* iomap->bdev may be NULL; record device number 0 in that case. */
__entry->bdev = iomap->bdev ? iomap->bdev->bd_dev : 0;
|
||||
),
|
||||
TP_printk("dev %d:%d ino 0x%llx bdev %d:%d pos 0x%llx dirty len 0x%llx "
|
||||
"addr 0x%llx offset 0x%llx length 0x%llx type %s flags %s",
|
||||
MAJOR(__entry->dev), MINOR(__entry->dev),
|
||||
__entry->ino,
|
||||
MAJOR(__entry->bdev), MINOR(__entry->bdev),
|
||||
__entry->pos,
|
||||
__entry->dirty_len,
|
||||
__entry->addr,
|
||||
__entry->offset,
|
||||
__entry->length,
|
||||
__print_symbolic(__entry->type, IOMAP_TYPE_STRINGS),
|
||||
__print_flags(__entry->flags, "|", IOMAP_F_FLAGS_STRINGS))
|
||||
);
|
||||
|
||||
TRACE_EVENT(iomap_iter,
|
||||
TP_PROTO(struct iomap_iter *iter, const void *ops,
|
||||
@@ -165,6 +206,7 @@ TRACE_EVENT(iomap_iter,
|
||||
__field(u64, ino)
|
||||
__field(loff_t, pos)
|
||||
__field(u64, length)
|
||||
__field(s64, processed)
|
||||
__field(unsigned int, flags)
|
||||
__field(const void *, ops)
|
||||
__field(unsigned long, caller)
|
||||
@@ -174,15 +216,17 @@ TRACE_EVENT(iomap_iter,
|
||||
__entry->ino = iter->inode->i_ino;
|
||||
__entry->pos = iter->pos;
|
||||
__entry->length = iomap_length(iter);
|
||||
__entry->processed = iter->processed;
|
||||
__entry->flags = iter->flags;
|
||||
__entry->ops = ops;
|
||||
__entry->caller = caller;
|
||||
),
|
||||
TP_printk("dev %d:%d ino 0x%llx pos 0x%llx length 0x%llx flags %s (0x%x) ops %ps caller %pS",
|
||||
TP_printk("dev %d:%d ino 0x%llx pos 0x%llx length 0x%llx processed %lld flags %s (0x%x) ops %ps caller %pS",
|
||||
MAJOR(__entry->dev), MINOR(__entry->dev),
|
||||
__entry->ino,
|
||||
__entry->pos,
|
||||
__entry->length,
|
||||
__entry->processed,
|
||||
__print_flags(__entry->flags, "|", IOMAP_FLAGS_STRINGS),
|
||||
__entry->flags,
|
||||
__entry->ops,
|
||||
|
||||
@@ -605,6 +605,7 @@ alloc_new:
|
||||
GFP_NOFS);
|
||||
bio->bi_iter.bi_sector = first_block << (blkbits - 9);
|
||||
wbc_init_bio(wbc, bio);
|
||||
bio->bi_write_hint = inode->i_write_hint;
|
||||
}
|
||||
|
||||
/*
|
||||
|
||||
+5
-4
@@ -112,7 +112,7 @@ xfs_end_ioend(
|
||||
* longer dirty. If we don't remove delalloc blocks here, they become
|
||||
* stale and can corrupt free space accounting on unmount.
|
||||
*/
|
||||
error = blk_status_to_errno(ioend->io_bio->bi_status);
|
||||
error = blk_status_to_errno(ioend->io_bio.bi_status);
|
||||
if (unlikely(error)) {
|
||||
if (ioend->io_flags & IOMAP_F_SHARED) {
|
||||
xfs_reflink_cancel_cow_range(ip, offset, size, true);
|
||||
@@ -179,7 +179,7 @@ STATIC void
|
||||
xfs_end_bio(
|
||||
struct bio *bio)
|
||||
{
|
||||
struct iomap_ioend *ioend = bio->bi_private;
|
||||
struct iomap_ioend *ioend = iomap_ioend_from_bio(bio);
|
||||
struct xfs_inode *ip = XFS_I(ioend->io_inode);
|
||||
unsigned long flags;
|
||||
|
||||
@@ -276,7 +276,8 @@ static int
|
||||
xfs_map_blocks(
|
||||
struct iomap_writepage_ctx *wpc,
|
||||
struct inode *inode,
|
||||
loff_t offset)
|
||||
loff_t offset,
|
||||
unsigned int len)
|
||||
{
|
||||
struct xfs_inode *ip = XFS_I(inode);
|
||||
struct xfs_mount *mp = ip->i_mount;
|
||||
@@ -444,7 +445,7 @@ xfs_prepare_ioend(
|
||||
/* send ioends that might require a transaction to the completion wq */
|
||||
if (xfs_ioend_is_append(ioend) || ioend->io_type == IOMAP_UNWRITTEN ||
|
||||
(ioend->io_flags & IOMAP_F_SHARED))
|
||||
ioend->io_bio->bi_end_io = xfs_end_bio;
|
||||
ioend->io_bio.bi_end_io = xfs_end_bio;
|
||||
return status;
|
||||
}
|
||||
|
||||
|
||||
+2
-1
@@ -125,7 +125,8 @@ static void zonefs_readahead(struct readahead_control *rac)
|
||||
* which implies that the page range can only be within the fixed inode size.
|
||||
*/
|
||||
static int zonefs_write_map_blocks(struct iomap_writepage_ctx *wpc,
|
||||
struct inode *inode, loff_t offset)
|
||||
struct inode *inode, loff_t offset,
|
||||
unsigned int len)
|
||||
{
|
||||
struct zonefs_zone *z = zonefs_inode_zone(inode);
|
||||
|
||||
|
||||
@@ -8,6 +8,7 @@
|
||||
#include <linux/scatterlist.h>
|
||||
#include <linux/prefetch.h>
|
||||
#include <linux/srcu.h>
|
||||
#include <linux/rw_hint.h>
|
||||
|
||||
struct blk_mq_tags;
|
||||
struct blk_flush_queue;
|
||||
@@ -135,6 +136,7 @@ struct request {
|
||||
struct blk_crypto_keyslot *crypt_keyslot;
|
||||
#endif
|
||||
|
||||
enum rw_hint write_hint;
|
||||
unsigned short ioprio;
|
||||
|
||||
enum mq_rq_state state;
|
||||
|
||||
@@ -10,6 +10,7 @@
|
||||
#include <linux/bvec.h>
|
||||
#include <linux/device.h>
|
||||
#include <linux/ktime.h>
|
||||
#include <linux/rw_hint.h>
|
||||
|
||||
struct bio_set;
|
||||
struct bio;
|
||||
@@ -269,6 +270,7 @@ struct bio {
|
||||
*/
|
||||
unsigned short bi_flags; /* BIO_* below */
|
||||
unsigned short bi_ioprio;
|
||||
enum rw_hint bi_write_hint;
|
||||
blk_status_t bi_status;
|
||||
atomic_t __bi_remaining;
|
||||
|
||||
|
||||
+2
-14
@@ -44,6 +44,7 @@
|
||||
#include <linux/mnt_idmapping.h>
|
||||
#include <linux/slab.h>
|
||||
#include <linux/maple_tree.h>
|
||||
#include <linux/rw_hint.h>
|
||||
|
||||
#include <asm/byteorder.h>
|
||||
#include <uapi/linux/fs.h>
|
||||
@@ -310,19 +311,6 @@ struct address_space;
|
||||
struct writeback_control;
|
||||
struct readahead_control;
|
||||
|
||||
/*
|
||||
* Write life time hint values.
|
||||
* Stored in struct inode as u8.
|
||||
*/
|
||||
enum rw_hint {
|
||||
WRITE_LIFE_NOT_SET = 0,
|
||||
WRITE_LIFE_NONE = RWH_WRITE_LIFE_NONE,
|
||||
WRITE_LIFE_SHORT = RWH_WRITE_LIFE_SHORT,
|
||||
WRITE_LIFE_MEDIUM = RWH_WRITE_LIFE_MEDIUM,
|
||||
WRITE_LIFE_LONG = RWH_WRITE_LIFE_LONG,
|
||||
WRITE_LIFE_EXTREME = RWH_WRITE_LIFE_EXTREME,
|
||||
};
|
||||
|
||||
/* Match RWF_* bits to IOCB bits */
|
||||
#define IOCB_HIPRI (__force int) RWF_HIPRI
|
||||
#define IOCB_DSYNC (__force int) RWF_DSYNC
|
||||
@@ -680,7 +668,7 @@ struct inode {
|
||||
spinlock_t i_lock; /* i_blocks, i_bytes, maybe i_size */
|
||||
unsigned short i_bytes;
|
||||
u8 i_blkbits;
|
||||
u8 i_write_hint;
|
||||
enum rw_hint i_write_hint;
|
||||
blkcnt_t i_blocks;
|
||||
|
||||
#ifdef __NEED_I_SIZE_ORDERED
|
||||
|
||||
+15
-4
@@ -293,22 +293,32 @@ struct iomap_ioend {
|
||||
struct list_head io_list; /* next ioend in chain */
|
||||
u16 io_type;
|
||||
u16 io_flags; /* IOMAP_F_* */
|
||||
u32 io_folios; /* folios added to ioend */
|
||||
struct inode *io_inode; /* file being written to */
|
||||
size_t io_size; /* size of the extent */
|
||||
loff_t io_offset; /* offset in the file */
|
||||
sector_t io_sector; /* start sector of ioend */
|
||||
struct bio *io_bio; /* bio being built */
|
||||
struct bio io_inline_bio; /* MUST BE LAST! */
|
||||
struct bio io_bio; /* MUST BE LAST! */
|
||||
};
|
||||
|
||||
/*
 * Return the struct iomap_ioend that embeds @bio as its io_bio member.
 *
 * The result is computed with container_of(), so the caller must pass a bio
 * that really is the io_bio field of a struct iomap_ioend.
 */
static inline struct iomap_ioend *iomap_ioend_from_bio(struct bio *bio)
|
||||
{
|
||||
return container_of(bio, struct iomap_ioend, io_bio);
|
||||
}
|
||||
|
||||
struct iomap_writeback_ops {
|
||||
/*
|
||||
* Required, maps the blocks so that writeback can be performed on
|
||||
* the range starting at offset.
|
||||
*
|
||||
* Can return arbitrarily large regions, but we need to call into it at
|
||||
* least once per folio to allow the file systems to synchronize with
|
||||
* the write path that could be invalidating mappings.
|
||||
*
|
||||
* An existing mapping from a previous call to this method can be reused
|
||||
* by the file system if it is still valid.
|
||||
*/
|
||||
int (*map_blocks)(struct iomap_writepage_ctx *wpc, struct inode *inode,
|
||||
loff_t offset);
|
||||
loff_t offset, unsigned len);
|
||||
|
||||
/*
|
||||
* Optional, allows the file systems to perform actions just before
|
||||
@@ -329,6 +339,7 @@ struct iomap_writepage_ctx {
|
||||
struct iomap iomap;
|
||||
struct iomap_ioend *ioend;
|
||||
const struct iomap_writeback_ops *ops;
|
||||
u32 nr_folios; /* folios added to the ioend */
|
||||
};
|
||||
|
||||
void iomap_finish_ioends(struct iomap_ioend *ioend, int error);
|
||||
|
||||
@@ -0,0 +1,24 @@
|
||||
/* SPDX-License-Identifier: GPL-2.0 */
|
||||
#ifndef _LINUX_RW_HINT_H
|
||||
#define _LINUX_RW_HINT_H
|
||||
|
||||
#include <linux/build_bug.h>
|
||||
#include <linux/compiler_attributes.h>
|
||||
#include <uapi/linux/fcntl.h>
|
||||
|
||||
/*
 * Block storage write lifetime hint values.
 *
 * Each enumerator takes the value of the matching RWH_WRITE_LIFE_* constant.
 * The enum is declared __packed; this header asserts that its size is one
 * byte, except under sparse, which ignores __packed on enums.
 */
|
||||
enum rw_hint {
|
||||
WRITE_LIFE_NOT_SET = RWH_WRITE_LIFE_NOT_SET,
|
||||
WRITE_LIFE_NONE = RWH_WRITE_LIFE_NONE,
|
||||
WRITE_LIFE_SHORT = RWH_WRITE_LIFE_SHORT,
|
||||
WRITE_LIFE_MEDIUM = RWH_WRITE_LIFE_MEDIUM,
|
||||
WRITE_LIFE_LONG = RWH_WRITE_LIFE_LONG,
|
||||
WRITE_LIFE_EXTREME = RWH_WRITE_LIFE_EXTREME,
|
||||
} __packed;
|
||||
|
||||
/* Sparse ignores __packed annotations on enums, hence the #ifndef below. */
|
||||
#ifndef __CHECKER__
|
||||
static_assert(sizeof(enum rw_hint) == 1);
|
||||
#endif
|
||||
|
||||
#endif /* _LINUX_RW_HINT_H */
|
||||
Reference in New Issue
Block a user