diff options
Diffstat (limited to 'fs')
120 files changed, 3099 insertions, 1510 deletions
diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c index 48ae509f2ac2..030ab44fce18 100644 --- a/fs/btrfs/block-group.c +++ b/fs/btrfs/block-group.c @@ -499,12 +499,16 @@ static void fragment_free_space(struct btrfs_block_group *block_group) * used yet since their free space will be released as soon as the transaction * commits. */ -u64 add_new_free_space(struct btrfs_block_group *block_group, u64 start, u64 end) +int add_new_free_space(struct btrfs_block_group *block_group, u64 start, u64 end, + u64 *total_added_ret) { struct btrfs_fs_info *info = block_group->fs_info; - u64 extent_start, extent_end, size, total_added = 0; + u64 extent_start, extent_end, size; int ret; + if (total_added_ret) + *total_added_ret = 0; + while (start < end) { ret = find_first_extent_bit(&info->excluded_extents, start, &extent_start, &extent_end, @@ -517,10 +521,12 @@ u64 add_new_free_space(struct btrfs_block_group *block_group, u64 start, u64 end start = extent_end + 1; } else if (extent_start > start && extent_start < end) { size = extent_start - start; - total_added += size; ret = btrfs_add_free_space_async_trimmed(block_group, start, size); - BUG_ON(ret); /* -ENOMEM or logic error */ + if (ret) + return ret; + if (total_added_ret) + *total_added_ret += size; start = extent_end + 1; } else { break; @@ -529,13 +535,15 @@ u64 add_new_free_space(struct btrfs_block_group *block_group, u64 start, u64 end if (start < end) { size = end - start; - total_added += size; ret = btrfs_add_free_space_async_trimmed(block_group, start, size); - BUG_ON(ret); /* -ENOMEM or logic error */ + if (ret) + return ret; + if (total_added_ret) + *total_added_ret += size; } - return total_added; + return 0; } /* @@ -779,8 +787,13 @@ next: if (key.type == BTRFS_EXTENT_ITEM_KEY || key.type == BTRFS_METADATA_ITEM_KEY) { - total_found += add_new_free_space(block_group, last, - key.objectid); + u64 space_added; + + ret = add_new_free_space(block_group, last, key.objectid, + &space_added); + if (ret) + goto out; + total_found += space_added; if (key.type == BTRFS_METADATA_ITEM_KEY) last = key.objectid + fs_info->nodesize; @@ -795,11 +808,10 @@ next: } path->slots[0]++; } - ret = 0; - - total_found += add_new_free_space(block_group, last, - block_group->start + block_group->length); + ret = add_new_free_space(block_group, last, + block_group->start + block_group->length, + NULL); out: btrfs_free_path(path); return ret; @@ -1640,13 +1652,14 @@ void btrfs_mark_bg_unused(struct btrfs_block_group *bg) { struct btrfs_fs_info *fs_info = bg->fs_info; - trace_btrfs_add_unused_block_group(bg); spin_lock(&fs_info->unused_bgs_lock); if (list_empty(&bg->bg_list)) { btrfs_get_block_group(bg); + trace_btrfs_add_unused_block_group(bg); list_add_tail(&bg->bg_list, &fs_info->unused_bgs); - } else { + } else if (!test_bit(BLOCK_GROUP_FLAG_NEW, &bg->runtime_flags)) { /* Pull out the block group from the reclaim_bgs list. */ + trace_btrfs_add_unused_block_group(bg); list_move_tail(&bg->bg_list, &fs_info->unused_bgs); } spin_unlock(&fs_info->unused_bgs_lock); @@ -2087,6 +2100,7 @@ static int exclude_super_stripes(struct btrfs_block_group *cache) /* Shouldn't have super stripes in sequential zones */ if (zoned && nr) { + kfree(logical); btrfs_err(fs_info, "zoned: block group %llu must not contain super block", cache->start); @@ -2292,9 +2306,11 @@ static int read_one_block_group(struct btrfs_fs_info *info, btrfs_free_excluded_extents(cache); } else if (cache->used == 0) { cache->cached = BTRFS_CACHE_FINISHED; - add_new_free_space(cache, cache->start, - cache->start + cache->length); + ret = add_new_free_space(cache, cache->start, + cache->start + cache->length, NULL); btrfs_free_excluded_extents(cache); + if (ret) + goto error; } ret = btrfs_add_block_group_cache(info, cache); @@ -2668,6 +2684,7 @@ void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans) next: btrfs_delayed_refs_rsv_release(fs_info, 1); list_del_init(&block_group->bg_list); + clear_bit(BLOCK_GROUP_FLAG_NEW, &block_group->runtime_flags); } btrfs_trans_release_chunk_metadata(trans); } @@ -2707,6 +2724,13 @@ struct btrfs_block_group *btrfs_make_block_group(struct btrfs_trans_handle *tran if (!cache) return ERR_PTR(-ENOMEM); + /* + * Mark it as new before adding it to the rbtree of block groups or any + * list, so that no other task finds it and calls btrfs_mark_bg_unused() + * before the new flag is set. + */ + set_bit(BLOCK_GROUP_FLAG_NEW, &cache->runtime_flags); + cache->length = size; set_free_space_tree_thresholds(cache); cache->flags = type; @@ -2730,9 +2754,12 @@ struct btrfs_block_group *btrfs_make_block_group(struct btrfs_trans_handle *tran return ERR_PTR(ret); } - add_new_free_space(cache, chunk_offset, chunk_offset + size); - + ret = add_new_free_space(cache, chunk_offset, chunk_offset + size, NULL); btrfs_free_excluded_extents(cache); + if (ret) { + btrfs_put_block_group(cache); + return ERR_PTR(ret); + } /* * Ensure the corresponding space_info object is created and diff --git a/fs/btrfs/block-group.h b/fs/btrfs/block-group.h index f204addc3fe8..aba5dff66c19 100644 --- a/fs/btrfs/block-group.h +++ b/fs/btrfs/block-group.h @@ -70,6 +70,11 @@ enum btrfs_block_group_flags { BLOCK_GROUP_FLAG_NEEDS_FREE_SPACE, /* Indicate that the block group is placed on a sequential zone */ BLOCK_GROUP_FLAG_SEQUENTIAL_ZONE, + /* + * Indicate that block group is in the list of new block groups of a + * transaction. + */ + BLOCK_GROUP_FLAG_NEW, }; enum btrfs_caching_type { @@ -284,8 +289,8 @@ int btrfs_cache_block_group(struct btrfs_block_group *cache, bool wait); void btrfs_put_caching_control(struct btrfs_caching_control *ctl); struct btrfs_caching_control *btrfs_get_caching_control( struct btrfs_block_group *cache); -u64 add_new_free_space(struct btrfs_block_group *block_group, - u64 start, u64 end); +int add_new_free_space(struct btrfs_block_group *block_group, + u64 start, u64 end, u64 *total_added_ret); struct btrfs_trans_handle *btrfs_start_trans_remove_block_group( struct btrfs_fs_info *fs_info, const u64 chunk_offset); diff --git a/fs/btrfs/block-rsv.c b/fs/btrfs/block-rsv.c index 6279d200cf83..77684c5e0c8b 100644 --- a/fs/btrfs/block-rsv.c +++ b/fs/btrfs/block-rsv.c @@ -349,6 +349,11 @@ void btrfs_update_global_block_rsv(struct btrfs_fs_info *fs_info) } read_unlock(&fs_info->global_root_lock); + if (btrfs_fs_compat_ro(fs_info, BLOCK_GROUP_TREE)) { + num_bytes += btrfs_root_used(&fs_info->block_group_root->root_item); + min_items++; + } + /* * But we also want to reserve enough space so we can do the fallback * global reserve for an unlink, which is an additional diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 7513388b0567..9b9914e5f03d 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -3438,11 +3438,16 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device * For devices supporting discard turn on discard=async automatically, * unless it's already set or disabled. This could be turned off by * nodiscard for the same mount. + * + * The zoned mode piggy backs on the discard functionality for + * resetting a zone. There is no reason to delay the zone reset as it is + * fast enough. So, do not enable async discard for zoned mode. */ if (!(btrfs_test_opt(fs_info, DISCARD_SYNC) || btrfs_test_opt(fs_info, DISCARD_ASYNC) || btrfs_test_opt(fs_info, NODISCARD)) && - fs_info->fs_devices->discardable) { + fs_info->fs_devices->discardable && + !btrfs_is_zoned(fs_info)) { btrfs_set_and_info(fs_info, DISCARD_ASYNC, "auto enabling async discard"); } diff --git a/fs/btrfs/free-space-tree.c b/fs/btrfs/free-space-tree.c index 045ddce32eca..f169378e2ca6 100644 --- a/fs/btrfs/free-space-tree.c +++ b/fs/btrfs/free-space-tree.c @@ -1515,9 +1515,13 @@ static int load_free_space_bitmaps(struct btrfs_caching_control *caching_ctl, if (prev_bit == 0 && bit == 1) { extent_start = offset; } else if (prev_bit == 1 && bit == 0) { - total_found += add_new_free_space(block_group, - extent_start, - offset); + u64 space_added; + + ret = add_new_free_space(block_group, extent_start, + offset, &space_added); + if (ret) + goto out; + total_found += space_added; if (total_found > CACHING_CTL_WAKE_UP) { total_found = 0; wake_up(&caching_ctl->wait); @@ -1529,8 +1533,9 @@ static int load_free_space_bitmaps(struct btrfs_caching_control *caching_ctl, } } if (prev_bit == 1) { - total_found += add_new_free_space(block_group, extent_start, - end); + ret = add_new_free_space(block_group, extent_start, end, NULL); + if (ret) + goto out; extent_count++; } @@ -1569,6 +1574,8 @@ static int load_free_space_extents(struct btrfs_caching_control *caching_ctl, end = block_group->start + block_group->length; while (1) { + u64 space_added; + ret = btrfs_next_item(root, path); if (ret < 0) goto out; @@ -1583,8 +1590,11 @@ static int load_free_space_extents(struct btrfs_caching_control *caching_ctl, ASSERT(key.type == BTRFS_FREE_SPACE_EXTENT_KEY); ASSERT(key.objectid < end && key.objectid + key.offset <= end); - total_found += add_new_free_space(block_group, key.objectid, - key.objectid + key.offset); + ret = add_new_free_space(block_group, key.objectid, + key.objectid + key.offset, &space_added); + if (ret) + goto out; + total_found += space_added; if (total_found > CACHING_CTL_WAKE_UP) { total_found = 0; wake_up(&caching_ctl->wait); diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index dbbb67293e34..49cef61f6a39 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -3482,15 +3482,21 @@ zeroit: void btrfs_add_delayed_iput(struct btrfs_inode *inode) { struct btrfs_fs_info *fs_info = inode->root->fs_info; + unsigned long flags; if (atomic_add_unless(&inode->vfs_inode.i_count, -1, 1)) return; atomic_inc(&fs_info->nr_delayed_iputs); - spin_lock(&fs_info->delayed_iput_lock); + /* + * Need to be irq safe here because we can be called from either an irq + * context (see bio.c and btrfs_put_ordered_extent()) or a non-irq + * context. + */ + spin_lock_irqsave(&fs_info->delayed_iput_lock, flags); ASSERT(list_empty(&inode->delayed_iput)); list_add_tail(&inode->delayed_iput, &fs_info->delayed_iputs); - spin_unlock(&fs_info->delayed_iput_lock); + spin_unlock_irqrestore(&fs_info->delayed_iput_lock, flags); if (!test_bit(BTRFS_FS_CLEANER_RUNNING, &fs_info->flags)) wake_up_process(fs_info->cleaner_kthread); } @@ -3499,37 +3505,46 @@ static void run_delayed_iput_locked(struct btrfs_fs_info *fs_info, struct btrfs_inode *inode) { list_del_init(&inode->delayed_iput); - spin_unlock(&fs_info->delayed_iput_lock); + spin_unlock_irq(&fs_info->delayed_iput_lock); iput(&inode->vfs_inode); if (atomic_dec_and_test(&fs_info->nr_delayed_iputs)) wake_up(&fs_info->delayed_iputs_wait); - spin_lock(&fs_info->delayed_iput_lock); + spin_lock_irq(&fs_info->delayed_iput_lock); } static void btrfs_run_delayed_iput(struct btrfs_fs_info *fs_info, struct btrfs_inode *inode) { if (!list_empty(&inode->delayed_iput)) { - spin_lock(&fs_info->delayed_iput_lock); + spin_lock_irq(&fs_info->delayed_iput_lock); if (!list_empty(&inode->delayed_iput)) run_delayed_iput_locked(fs_info, inode); - spin_unlock(&fs_info->delayed_iput_lock); + spin_unlock_irq(&fs_info->delayed_iput_lock); } } void btrfs_run_delayed_iputs(struct btrfs_fs_info *fs_info) { - - spin_lock(&fs_info->delayed_iput_lock); + /* + * btrfs_put_ordered_extent() can run in irq context (see bio.c), which + * calls btrfs_add_delayed_iput() and that needs to lock + * fs_info->delayed_iput_lock. So we need to disable irqs here to + * prevent a deadlock. + */ + spin_lock_irq(&fs_info->delayed_iput_lock); while (!list_empty(&fs_info->delayed_iputs)) { struct btrfs_inode *inode; inode = list_first_entry(&fs_info->delayed_iputs, struct btrfs_inode, delayed_iput); run_delayed_iput_locked(fs_info, inode); - cond_resched_lock(&fs_info->delayed_iput_lock); + if (need_resched()) { + spin_unlock_irq(&fs_info->delayed_iput_lock); + cond_resched(); + spin_lock_irq(&fs_info->delayed_iput_lock); + } } - spin_unlock(&fs_info->delayed_iput_lock); + spin_unlock_irq(&fs_info->delayed_iput_lock); } /* @@ -3659,11 +3674,14 @@ int btrfs_orphan_cleanup(struct btrfs_root *root) found_key.type = BTRFS_INODE_ITEM_KEY; found_key.offset = 0; inode = btrfs_iget(fs_info->sb, last_objectid, root); - ret = PTR_ERR_OR_ZERO(inode); - if (ret && ret != -ENOENT) - goto out; + if (IS_ERR(inode)) { + ret = PTR_ERR(inode); + inode = NULL; + if (ret != -ENOENT) + goto out; + } - if (ret == -ENOENT && root == fs_info->tree_root) { + if (!inode && root == fs_info->tree_root) { struct btrfs_root *dead_root; int is_dead_root = 0; @@ -3724,17 +3742,17 @@ int btrfs_orphan_cleanup(struct btrfs_root *root) * deleted but wasn't. The inode number may have been reused, * but either way, we can delete the orphan item. */ - if (ret == -ENOENT || inode->i_nlink) { - if (!ret) { + if (!inode || inode->i_nlink) { + if (inode) { ret = btrfs_drop_verity_items(BTRFS_I(inode)); iput(inode); + inode = NULL; if (ret) goto out; } trans = btrfs_start_transaction(root, 1); if (IS_ERR(trans)) { ret = PTR_ERR(trans); - iput(inode); goto out; } btrfs_debug(fs_info, "auto deleting %Lu", @@ -3742,10 +3760,8 @@ int btrfs_orphan_cleanup(struct btrfs_root *root) ret = btrfs_del_orphan_item(trans, root, found_key.objectid); btrfs_end_transaction(trans); - if (ret) { - iput(inode); + if (ret) goto out; - } continue; } @@ -4847,9 +4863,6 @@ again: ret = -ENOMEM; goto out; } - ret = set_page_extent_mapped(page); - if (ret < 0) - goto out_unlock; if (!PageUptodate(page)) { ret = btrfs_read_folio(NULL, page_folio(page)); @@ -4864,6 +4877,17 @@ again: goto out_unlock; } } + + /* + * We unlock the page after the io is completed and then re-lock it + * above. release_folio() could have come in between that and cleared + * PagePrivate(), but left the page in the mapping. Set the page mapped + * here to make sure it's properly set for the subpage stuff. + */ + ret = set_page_extent_mapped(page); + if (ret < 0) + goto out_unlock; + wait_on_page_writeback(page); lock_extent(io_tree, block_start, block_end, &cached_state); @@ -7849,8 +7873,11 @@ static void btrfs_dio_submit_io(const struct iomap_iter *iter, struct bio *bio, ret = btrfs_extract_ordered_extent(bbio, dio_data->ordered); if (ret) { - bbio->bio.bi_status = errno_to_blk_status(ret); - btrfs_dio_end_io(bbio); + btrfs_finish_ordered_extent(dio_data->ordered, NULL, + file_offset, dip->bytes, + !ret); + bio->bi_status = errno_to_blk_status(ret); + iomap_dio_bio_end_io(bio); return; } } diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c index da1f84a0eb29..2637d6b157ff 100644 --- a/fs/btrfs/qgroup.c +++ b/fs/btrfs/qgroup.c @@ -4445,4 +4445,5 @@ void btrfs_qgroup_destroy_extent_records(struct btrfs_transaction *trans) ulist_free(entry->old_roots); kfree(entry); } + *root = RB_ROOT; } diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c index f37b925d587f..0249ea52bb80 100644 --- a/fs/btrfs/raid56.c +++ b/fs/btrfs/raid56.c @@ -71,7 +71,7 @@ static void rmw_rbio_work_locked(struct work_struct *work); static void index_rbio_pages(struct btrfs_raid_bio *rbio); static int alloc_rbio_pages(struct btrfs_raid_bio *rbio); -static int finish_parity_scrub(struct btrfs_raid_bio *rbio, int need_check); +static int finish_parity_scrub(struct btrfs_raid_bio *rbio); static void scrub_rbio_work_locked(struct work_struct *work); static void free_raid_bio_pointers(struct btrfs_raid_bio *rbio) @@ -2404,7 +2404,7 @@ static int alloc_rbio_essential_pages(struct btrfs_raid_bio *rbio) return 0; } -static int finish_parity_scrub(struct btrfs_raid_bio *rbio, int need_check) +static int finish_parity_scrub(struct btrfs_raid_bio *rbio) { struct btrfs_io_context *bioc = rbio->bioc; const u32 sectorsize = bioc->fs_info->sectorsize; @@ -2445,9 +2445,6 @@ static int finish_parity_scrub(struct btrfs_raid_bio *rbio, int need_check) */ clear_bit(RBIO_CACHE_READY_BIT, &rbio->flags); - if (!need_check) - goto writeback; - p_sector.page = alloc_page(GFP_NOFS); if (!p_sector.page) return -ENOMEM; @@ -2516,7 +2513,6 @@ static int finish_parity_scrub(struct btrfs_raid_bio *rbio, int need_check) q_sector.page = NULL; } -writeback: /* * time to start writing. Make bios for everything from the * higher layers (the bio_list in our rbio) and our p/q. Ignore @@ -2699,7 +2695,6 @@ static int scrub_assemble_read_bios(struct btrfs_raid_bio *rbio) static void scrub_rbio(struct btrfs_raid_bio *rbio) { - bool need_check = false; int sector_nr; int ret; @@ -2722,7 +2717,7 @@ static void scrub_rbio(struct btrfs_raid_bio *rbio) * We have every sector properly prepared. Can finish the scrub * and writeback the good content. */ - ret = finish_parity_scrub(rbio, need_check); + ret = finish_parity_scrub(rbio); wait_event(rbio->io_wait, atomic_read(&rbio->stripes_pending) == 0); for (sector_nr = 0; sector_nr < rbio->stripe_nsectors; sector_nr++) { int found_errors; diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index cf306351b148..91b6c2fdc420 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -826,8 +826,13 @@ btrfs_attach_transaction_barrier(struct btrfs_root *root) trans = start_transaction(root, 0, TRANS_ATTACH, BTRFS_RESERVE_NO_FLUSH, true); - if (trans == ERR_PTR(-ENOENT)) - btrfs_wait_for_commit(root->fs_info, 0); + if (trans == ERR_PTR(-ENOENT)) { + int ret; + + ret = btrfs_wait_for_commit(root->fs_info, 0); + if (ret) + return ERR_PTR(ret); + } return trans; } @@ -931,6 +936,7 @@ int btrfs_wait_for_commit(struct btrfs_fs_info *fs_info, u64 transid) } wait_for_commit(cur_trans, TRANS_STATE_COMPLETED); + ret = cur_trans->aborted; btrfs_put_transaction(cur_trans); out: return ret; diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 73f9ea7672db..2ecb76cf3d91 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -4078,14 +4078,6 @@ static int alloc_profile_is_valid(u64 flags, int extended) return has_single_bit_set(flags); } -static inline int balance_need_close(struct btrfs_fs_info *fs_info) -{ - /* cancel requested || normal exit path */ - return atomic_read(&fs_info->balance_cancel_req) || - (atomic_read(&fs_info->balance_pause_req) == 0 && - atomic_read(&fs_info->balance_cancel_req) == 0); -} - /* * Validate target profile against allowed profiles and return true if it's OK. * Otherwise print the error message and return false. @@ -4275,6 +4267,7 @@ int btrfs_balance(struct btrfs_fs_info *fs_info, u64 num_devices; unsigned seq; bool reducing_redundancy; + bool paused = false; int i; if (btrfs_fs_closing(fs_info) || @@ -4405,6 +4398,7 @@ int btrfs_balance(struct btrfs_fs_info *fs_info, if (ret == -ECANCELED && atomic_read(&fs_info->balance_pause_req)) { btrfs_info(fs_info, "balance: paused"); btrfs_exclop_balance(fs_info, BTRFS_EXCLOP_BALANCE_PAUSED); + paused = true; } /* * Balance can be canceled by: @@ -4433,8 +4427,8 @@ int btrfs_balance(struct btrfs_fs_info *fs_info, btrfs_update_ioctl_balance_args(fs_info, bargs); } - if ((ret && ret != -ECANCELED && ret != -ENOSPC) || - balance_need_close(fs_info)) { + /* We didn't pause, we can clean everything up. */ + if (!paused) { reset_balance_state(fs_info); btrfs_exclop_finish(fs_info); } @@ -6404,7 +6398,8 @@ int btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op, (op == BTRFS_MAP_READ || !dev_replace_is_ongoing || !dev_replace->tgtdev)) { set_io_stripe(smap, map, stripe_index, stripe_offset, stripe_nr); - *mirror_num_ret = mirror_num; + if (mirror_num_ret) + *mirror_num_ret = mirror_num; *bioc_ret = NULL; ret = 0; goto out; diff --git a/fs/btrfs/zoned.c b/fs/btrfs/zoned.c index 85b8b332add9..72b90bc19a19 100644 --- a/fs/btrfs/zoned.c +++ b/fs/btrfs/zoned.c @@ -805,6 +805,9 @@ int btrfs_check_mountopts_zoned(struct btrfs_fs_info *info) return -EINVAL; } + btrfs_clear_and_info(info, DISCARD_ASYNC, + "zoned: async discard ignored and disabled for zoned mode"); + return 0; } diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c index 6bb251a4d613..59cbfb80edbd 100644 --- a/fs/ceph/addr.c +++ b/fs/ceph/addr.c @@ -187,16 +187,42 @@ static void ceph_netfs_expand_readahead(struct netfs_io_request *rreq) struct inode *inode = rreq->inode; struct ceph_inode_info *ci = ceph_inode(inode); struct ceph_file_layout *lo = &ci->i_layout; + unsigned long max_pages = inode->i_sb->s_bdi->ra_pages; + loff_t end = rreq->start + rreq->len, new_end; + struct ceph_netfs_request_data *priv = rreq->netfs_priv; + unsigned long max_len; u32 blockoff; - u64 blockno; - /* Expand the start downward */ - blockno = div_u64_rem(rreq->start, lo->stripe_unit, &blockoff); - rreq->start = blockno * lo->stripe_unit; - rreq->len += blockoff; + if (priv) { + /* Readahead is disabled by posix_fadvise POSIX_FADV_RANDOM */ + if (priv->file_ra_disabled) + max_pages = 0; + else + max_pages = priv->file_ra_pages; + + } - /* Now, round up the length to the next block */ - rreq->len = roundup(rreq->len, lo->stripe_unit); + /* Readahead is disabled */ + if (!max_pages) + return; + + max_len = max_pages << PAGE_SHIFT; + + /* + * Try to expand the length forward by rounding up it to the next + * block, but do not exceed the file size, unless the original + * request already exceeds it. + */ + new_end = min(round_up(end, lo->stripe_unit), rreq->i_size); + if (new_end > end && new_end <= rreq->start + max_len) + rreq->len = new_end - rreq->start; + + /* Try to expand the start downward */ + div_u64_rem(rreq->start, lo->stripe_unit, &blockoff); + if (rreq->len + blockoff <= max_len) { + rreq->start -= blockoff; + rreq->len += blockoff; + } } static bool ceph_netfs_clamp_length(struct netfs_io_subrequest *subreq) @@ -362,18 +388,28 @@ static int ceph_init_request(struct netfs_io_request *rreq, struct file *file) { struct inode *inode = rreq->inode; int got = 0, want = CEPH_CAP_FILE_CACHE; + struct ceph_netfs_request_data *priv; int ret = 0; if (rreq->origin != NETFS_READAHEAD) return 0; + priv = kzalloc(sizeof(*priv), GFP_NOFS); + if (!priv) + return -ENOMEM; + if (file) { struct ceph_rw_context *rw_ctx; struct ceph_file_info *fi = file->private_data; + priv->file_ra_pages = file->f_ra.ra_pages; + priv->file_ra_disabled = file->f_mode & FMODE_RANDOM; + rw_ctx = ceph_find_rw_context(fi); - if (rw_ctx) + if (rw_ctx) { + rreq->netfs_priv = priv; return 0; + } } /* @@ -383,27 +419,40 @@ static int ceph_init_request(struct netfs_io_request *rreq, struct file *file) ret = ceph_try_get_caps(inode, CEPH_CAP_FILE_RD, want, true, &got); if (ret < 0) { dout("start_read %p, error getting cap\n", inode); - return ret; + goto out; } if (!(got & want)) { dout("start_read %p, no cache cap\n", inode); - return -EACCES; + ret = -EACCES; + goto out; + } + if (ret == 0) { + ret = -EACCES; + goto out; } - if (ret == 0) - return -EACCES; - rreq->netfs_priv = (void *)(uintptr_t)got; - return 0; + priv->caps = got; + rreq->netfs_priv = priv; + +out: + if (ret < 0) + kfree(priv); + + return ret; } static void ceph_netfs_free_request(struct netfs_io_request *rreq) { - struct ceph_inode_info *ci = ceph_inode(rreq->inode); - int got = (uintptr_t)rreq->netfs_priv; + struct ceph_netfs_request_data *priv = rreq->netfs_priv; + + if (!priv) + return; - if (got) - ceph_put_cap_refs(ci, got); + if (priv->caps) + ceph_put_cap_refs(ceph_inode(rreq->inode), priv->caps); + kfree(priv); + rreq->netfs_priv = NULL; } const struct netfs_request_ops ceph_netfs_ops = { diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c index 2321e5ddb664..e2bb0d0072da 100644 --- a/fs/ceph/caps.c +++ b/fs/ceph/caps.c @@ -3109,6 +3109,12 @@ static void __ceph_put_cap_refs(struct ceph_inode_info *ci, int had, } if (had & CEPH_CAP_FILE_WR) { if (--ci->i_wr_ref == 0) { + /* + * The Fb caps will always be took and released + * together with the Fw caps. + */ + WARN_ON_ONCE(ci->i_wb_ref); + last++; check_flushsnaps = true; if (ci->i_wrbuffer_ref_head == 0 && @@ -3560,6 +3566,15 @@ static void handle_cap_grant(struct inode *inode, } BUG_ON(cap->issued & ~cap->implemented); + /* don't let check_caps skip sending a response to MDS for revoke msgs */ + if (le32_to_cpu(grant->op) == CEPH_CAP_OP_REVOKE) { + cap->mds_wanted = 0; + if (cap == ci->i_auth_cap) + check_caps = 1; /* check auth cap only */ + else + check_caps = 2; /* check all caps */ + } + if (extra_info->inline_version > 0 && extra_info->inline_version >= ci->i_inline_version) { ci->i_inline_version = extra_info->inline_version; @@ -4086,6 +4101,7 @@ void ceph_handle_caps(struct ceph_mds_session *session, struct cap_extra_info extra_info = {}; bool queue_trunc; bool close_sessions = false; + bool do_cap_release = false; dout("handle_caps from mds%d\n", session->s_mds); @@ -4192,17 +4208,14 @@ void ceph_handle_caps(struct ceph_mds_session *session, if (!inode) { dout(" i don't have ino %llx\n", vino.ino); - if (op == CEPH_CAP_OP_IMPORT) { - cap = ceph_get_cap(mdsc, NULL); - cap->cap_ino = vino.ino; - cap->queue_release = 1; - cap->cap_id = le64_to_cpu(h->cap_id); - cap->mseq = mseq; - cap->seq = seq; - cap->issue_seq = seq; - spin_lock(&session->s_cap_lock); - __ceph_queue_cap_release(session, cap); - spin_unlock(&session->s_cap_lock); + switch (op) { + case CEPH_CAP_OP_IMPORT: + case CEPH_CAP_OP_REVOKE: + case CEPH_CAP_OP_GRANT: + do_cap_release = true; + break; + default: + break; } goto flush_cap_releases; } @@ -4252,6 +4265,14 @@ void ceph_handle_caps(struct ceph_mds_session *session, inode, ceph_ino(inode), ceph_snap(inode), session->s_mds); spin_unlock(&ci->i_ceph_lock); + switch (op) { + case CEPH_CAP_OP_REVOKE: + case CEPH_CAP_OP_GRANT: + do_cap_release = true; + break; + default: + break; + } goto flush_cap_releases; } @@ -4302,6 +4323,18 @@ flush_cap_releases: * along for the mds (who clearly thinks we still have this * cap). */ + if (do_cap_release) { + cap = ceph_get_cap(mdsc, NULL); + cap->cap_ino = vino.ino; + cap->queue_release = 1; + cap->cap_id = le64_to_cpu(h->cap_id); + cap->mseq = mseq; + cap->seq = seq; + cap->issue_seq = seq; + spin_lock(&session->s_cap_lock); + __ceph_queue_cap_release(session, cap); + spin_unlock(&session->s_cap_lock); + } ceph_flush_cap_releases(mdsc, session); goto done; diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c index cb67ac821f0e..4a2b39d9a61a 100644 --- a/fs/ceph/dir.c +++ b/fs/ceph/dir.c @@ -886,7 +886,8 @@ static int ceph_mknod(struct mnt_idmap *idmap, struct inode *dir, set_bit(CEPH_MDS_R_PARENT_LOCKED, &req->r_req_flags); req->r_args.mknod.mode = cpu_to_le32(mode); req->r_args.mknod.rdev = cpu_to_le32(rdev); - req->r_dentry_drop = CEPH_CAP_FILE_SHARED | CEPH_CAP_AUTH_EXCL; + req->r_dentry_drop = CEPH_CAP_FILE_SHARED | CEPH_CAP_AUTH_EXCL | + CEPH_CAP_XATTR_EXCL; req->r_dentry_unless = CEPH_CAP_FILE_EXCL; if (as_ctx.pagelist) { req->r_pagelist = as_ctx.pagelist; @@ -953,7 +954,8 @@ static int ceph_symlink(struct mnt_idmap *idmap, struct inode *dir, set_bit(CEPH_MDS_R_PARENT_LOCKED, &req->r_req_flags); req->r_dentry = dget(dentry); req->r_num_caps = 2; - req->r_dentry_drop = CEPH_CAP_FILE_SHARED | CEPH_CAP_AUTH_EXCL; + req->r_dentry_drop = CEPH_CAP_FILE_SHARED | CEPH_CAP_AUTH_EXCL | + CEPH_CAP_XATTR_EXCL; req->r_dentry_unless = CEPH_CAP_FILE_EXCL; if (as_ctx.pagelist) { req->r_pagelist = as_ctx.pagelist; @@ -1022,7 +1024,8 @@ static int ceph_mkdir(struct mnt_idmap *idmap, struct inode *dir, ihold(dir); set_bit(CEPH_MDS_R_PARENT_LOCKED, &req->r_req_flags); req->r_args.mkdir.mode = cpu_to_le32(mode); - req->r_dentry_drop = CEPH_CAP_FILE_SHARED | CEPH_CAP_AUTH_EXCL; + req->r_dentry_drop = CEPH_CAP_FILE_SHARED | CEPH_CAP_AUTH_EXCL | + CEPH_CAP_XATTR_EXCL; req->r_dentry_unless = CEPH_CAP_FILE_EXCL; if (as_ctx.pagelist) { req->r_pagelist = as_ctx.pagelist; @@ -1079,7 +1082,7 @@ static int ceph_link(struct dentry *old_dentry, struct inode *dir, req->r_parent = dir; ihold(dir); set_bit(CEPH_MDS_R_PARENT_LOCKED, &req->r_req_flags); - req->r_dentry_drop = CEPH_CAP_FILE_SHARED; + req->r_dentry_drop = CEPH_CAP_FILE_SHARED | CEPH_CAP_XATTR_EXCL; req->r_dentry_unless = CEPH_CAP_FILE_EXCL; /* release LINK_SHARED on source inode (mds will lock it) */ req->r_old_inode_drop = CEPH_CAP_LINK_SHARED | CEPH_CAP_LINK_EXCL; @@ -1218,7 +1221,7 @@ retry: req->r_num_caps = 2; req->r_parent = dir; ihold(dir); - req->r_dentry_drop = CEPH_CAP_FILE_SHARED; + req->r_dentry_drop = CEPH_CAP_FILE_SHARED | CEPH_CAP_XATTR_EXCL; req->r_dentry_unless = CEPH_CAP_FILE_EXCL; req->r_inode_drop = ceph_drop_caps_for_unlink(inode); @@ -1320,9 +1323,9 @@ static int ceph_rename(struct mnt_idmap *idmap, struct inode *old_dir, req->r_parent = new_dir; ihold(new_dir); set_bit(CEPH_MDS_R_PARENT_LOCKED, &req->r_req_flags); - req->r_old_dentry_drop = CEPH_CAP_FILE_SHARED; + req->r_old_dentry_drop = CEPH_CAP_FILE_SHARED | CEPH_CAP_XATTR_EXCL; req->r_old_dentry_unless = CEPH_CAP_FILE_EXCL; - req->r_dentry_drop = CEPH_CAP_FILE_SHARED; + req->r_dentry_drop = CEPH_CAP_FILE_SHARED | CEPH_CAP_XATTR_EXCL; req->r_dentry_unless = CEPH_CAP_FILE_EXCL; /* release LINK_RDCACHE on source inode (mds will lock it) */ req->r_old_inode_drop = CEPH_CAP_LINK_SHARED | CEPH_CAP_LINK_EXCL; diff --git a/fs/ceph/file.c b/fs/ceph/file.c index b1925232dc08..63efe5389783 100644 --- a/fs/ceph/file.c +++ b/fs/ceph/file.c @@ -791,7 +791,8 @@ retry: if (flags & O_CREAT) { struct ceph_file_layout lo; - req->r_dentry_drop = CEPH_CAP_FILE_SHARED | CEPH_CAP_AUTH_EXCL; + req->r_dentry_drop = CEPH_CAP_FILE_SHARED | CEPH_CAP_AUTH_EXCL | + CEPH_CAP_XATTR_EXCL; req->r_dentry_unless = CEPH_CAP_FILE_EXCL; if (as_ctx.pagelist) { req->r_pagelist = as_ctx.pagelist; diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index 4c0f22acf53d..66048a86c480 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -645,6 +645,7 @@ bad: err = -EIO; out_bad: pr_err("mds parse_reply err %d\n", err); + ceph_msg_dump(msg); return err; } @@ -3538,6 +3539,7 @@ static void handle_forward(struct ceph_mds_client *mdsc, bad: pr_err("mdsc_handle_forward decode error err=%d\n", err); + ceph_msg_dump(msg); } static int __decode_session_metadata(void **p, void *end, @@ -5258,6 +5260,7 @@ void ceph_mdsc_handle_fsmap(struct ceph_mds_client *mdsc, struct ceph_msg *msg) bad: pr_err("error decoding fsmap %d. Shutting down mount.\n", err); ceph_umount_begin(mdsc->fsc->sb); + ceph_msg_dump(msg); err_out: mutex_lock(&mdsc->mutex); mdsc->mdsmap_err = err; @@ -5326,6 +5329,7 @@ bad_unlock: bad: pr_err("error decoding mdsmap %d. Shutting down mount.\n", err); ceph_umount_begin(mdsc->fsc->sb); + ceph_msg_dump(msg); return; } diff --git a/fs/ceph/metric.c b/fs/ceph/metric.c index c47347d2e84e..cce78d769f55 100644 --- a/fs/ceph/metric.c +++ b/fs/ceph/metric.c @@ -36,6 +36,14 @@ static bool ceph_mdsc_send_metrics(struct ceph_mds_client *mdsc, s32 items = 0; s32 len; + /* Do not send the metrics until the MDS rank is ready */ + mutex_lock(&mdsc->mutex); + if (ceph_mdsmap_get_state(mdsc->mdsmap, s->s_mds) != CEPH_MDS_STATE_ACTIVE) { + mutex_unlock(&mdsc->mutex); + return false; + } + mutex_unlock(&mdsc->mutex); + len = sizeof(*head) + sizeof(*cap) + sizeof(*read) + sizeof(*write) + sizeof(*meta) + sizeof(*dlease) + sizeof(*files) + sizeof(*icaps) + sizeof(*inodes) + sizeof(*rsize) diff --git a/fs/ceph/snap.c b/fs/ceph/snap.c index 2e73ba62bd7a..343d738448dc 100644 --- a/fs/ceph/snap.c +++ b/fs/ceph/snap.c @@ -675,14 +675,17 @@ int __ceph_finish_cap_snap(struct ceph_inode_info *ci, return 0; } - /* Fb cap still in use, delay it */ - if (ci->i_wb_ref) { + /* + * Defer flushing the capsnap if the dirty buffer not flushed yet. + * And trigger to flush the buffer immediately. + */ + if (ci->i_wrbuffer_ref) { dout("%s %p %llx.%llx cap_snap %p snapc %p %llu %s s=%llu " "used WRBUFFER, delaying\n", __func__, inode, ceph_vinop(inode), capsnap, capsnap->context, capsnap->context->seq, ceph_cap_string(capsnap->dirty), capsnap->size); - capsnap->writing = 1; + ceph_queue_writeback(inode); return 0; } diff --git a/fs/ceph/super.h b/fs/ceph/super.h index d24bf0db5234..3bfddf34d488 100644 --- a/fs/ceph/super.h +++ b/fs/ceph/super.h @@ -451,6 +451,19 @@ struct ceph_inode_info { unsigned long i_work_mask; }; +struct ceph_netfs_request_data { + int caps; + + /* + * Maximum size of a file readahead request. + * The fadvise could update the bdi's default ra_pages. + */ + unsigned int file_ra_pages; + + /* Set it if fadvise disables file readahead entirely */ + bool file_ra_disabled; +}; + static inline struct ceph_inode_info * ceph_inode(const struct inode *inode) { diff --git a/fs/erofs/decompressor.c b/fs/erofs/decompressor.c index 2a29943fa5cc..cfad1eac7fd9 100644 --- a/fs/erofs/decompressor.c +++ b/fs/erofs/decompressor.c @@ -148,7 +148,7 @@ static void *z_erofs_lz4_handle_overlap(struct z_erofs_lz4_decompress_ctx *ctx, *maptype = 0; return inpage; } - kunmap_atomic(inpage); + kunmap_local(inpage); might_sleep(); src = erofs_vm_map_ram(rq->in, ctx->inpages); if (!src) @@ -162,7 +162,7 @@ docopy: src = erofs_get_pcpubuf(ctx->inpages); if (!src) { DBG_BUGON(1); - kunmap_atomic(inpage); + kunmap_local(inpage); return ERR_PTR(-EFAULT); } @@ -173,9 +173,9 @@ docopy: min_t(unsigned int, total, PAGE_SIZE - *inputmargin); if (!inpage) - inpage = kmap_atomic(*in); + inpage = kmap_local_page(*in); memcpy(tmp, inpage + *inputmargin, page_copycnt); - kunmap_atomic(inpage); + kunmap_local(inpage); inpage = NULL; tmp += page_copycnt; total -= page_copycnt; @@ -214,7 +214,7 @@ static int z_erofs_lz4_decompress_mem(struct z_erofs_lz4_decompress_ctx *ctx, int ret, maptype; DBG_BUGON(*rq->in == NULL); - headpage = kmap_atomic(*rq->in); + headpage = kmap_local_page(*rq->in); /* LZ4 decompression inplace is only safe if zero_padding is enabled */ if (erofs_sb_has_zero_padding(EROFS_SB(rq->sb))) { @@ -223,7 +223,7 @@ static int z_erofs_lz4_decompress_mem(struct z_erofs_lz4_decompress_ctx *ctx, min_t(unsigned int, rq->inputsize, rq->sb->s_blocksize - rq->pageofs_in)); if (ret) { - kunmap_atomic(headpage); + kunmap_local(headpage); return ret; } may_inplace = !((rq->pageofs_in + rq->inputsize) & @@ -261,7 +261,7 @@ static int z_erofs_lz4_decompress_mem(struct z_erofs_lz4_decompress_ctx *ctx, } if (maptype == 0) { - kunmap_atomic(headpage); + kunmap_local(headpage); } else if (maptype == 1) { vm_unmap_ram(src, ctx->inpages); } else if (maptype == 2) { @@ -289,7 +289,7 @@ static int z_erofs_lz4_decompress(struct z_erofs_decompress_req *rq, /* one optimized fast path only for non bigpcluster cases yet */ if (ctx.inpages == 1 && ctx.outpages == 1 && !rq->inplace_io) { DBG_BUGON(!*rq->out); - dst = kmap_atomic(*rq->out); + dst = kmap_local_page(*rq->out); dst_maptype = 0; goto dstmap_out; } @@ -311,7 +311,7 @@ static int z_erofs_lz4_decompress(struct z_erofs_decompress_req *rq, dstmap_out: ret = z_erofs_lz4_decompress_mem(&ctx, dst + rq->pageofs_out); if (!dst_maptype) - kunmap_atomic(dst); + kunmap_local(dst); else if (dst_maptype == 2) vm_unmap_ram(dst, ctx.outpages); return ret; @@ -328,7 +328,7 @@ static int z_erofs_transform_plain(struct z_erofs_decompress_req *rq, const unsigned int lefthalf = rq->outputsize - righthalf; const unsigned int interlaced_offset = rq->alg == Z_EROFS_COMPRESSION_SHIFTED ? 0 : rq->pageofs_out; - unsigned char *src, *dst; + u8 *src; if (outpages > 2 && rq->alg == Z_EROFS_COMPRESSION_SHIFTED) { DBG_BUGON(1); @@ -341,22 +341,19 @@ static int z_erofs_transform_plain(struct z_erofs_decompress_req *rq, } src = kmap_local_page(rq->in[inpages - 1]) + rq->pageofs_in; - if (rq->out[0]) { - dst = kmap_local_page(rq->out[0]); - memcpy(dst + rq->pageofs_out, src + interlaced_offset, - righthalf); - kunmap_local(dst); - } + if (rq->out[0]) + memcpy_to_page(rq->out[0], rq->pageofs_out, + src + interlaced_offset, righthalf); if (outpages > inpages) { DBG_BUGON(!rq->out[outpages - 1]); if (rq->out[outpages - 1] != rq->in[inpages - 1]) { - dst = kmap_local_page(rq->out[outpages - 1]); - memcpy(dst, interlaced_offset ? src : - (src + righthalf), lefthalf); - kunmap_local(dst); + memcpy_to_page(rq->out[outpages - 1], 0, src + + (interlaced_offset ? 0 : righthalf), + lefthalf); } else if (!interlaced_offset) { memmove(src, src + righthalf, lefthalf); + flush_dcache_page(rq->in[inpages - 1]); } } kunmap_local(src); diff --git a/fs/erofs/inode.c b/fs/erofs/inode.c index d70b12b81507..e12592727a54 100644 --- a/fs/erofs/inode.c +++ b/fs/erofs/inode.c @@ -183,7 +183,8 @@ static void *erofs_read_inode(struct erofs_buf *buf, inode->i_flags &= ~S_DAX; if (test_opt(&sbi->opt, DAX_ALWAYS) && S_ISREG(inode->i_mode) && - vi->datalayout == EROFS_INODE_FLAT_PLAIN) + (vi->datalayout == EROFS_INODE_FLAT_PLAIN || + vi->datalayout == EROFS_INODE_CHUNK_BASED)) inode->i_flags |= S_DAX; if (!nblks) diff --git a/fs/erofs/zdata.c b/fs/erofs/zdata.c index 5f1890e309c6..b69d89a11dd0 100644 --- a/fs/erofs/zdata.c +++ b/fs/erofs/zdata.c @@ -1035,7 +1035,7 @@ hitted: */ tight &= (fe->mode > Z_EROFS_PCLUSTER_FOLLOWED_NOINPLACE); - cur = end - min_t(unsigned int, offset + end - map->m_la, end); + cur = end - min_t(erofs_off_t, offset + end - map->m_la, end); if (!(map->m_flags & EROFS_MAP_MAPPED)) { zero_user_segment(page, cur, end); goto next_part; @@ -1841,7 +1841,7 @@ static void z_erofs_pcluster_readmore(struct z_erofs_decompress_frontend *f, } cur = map->m_la + map->m_llen - 1; - while (cur >= end) { + while ((cur >= end) && (cur < i_size_read(inode))) { pgoff_t index = cur >> PAGE_SHIFT; struct page *page; diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index a2475b8c9fb5..21b903fe546e 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -1006,14 +1006,11 @@ static void ext4_mb_choose_next_group_best_avail(struct ext4_allocation_context * fls() instead since we need to know the actual length while modifying * goal length. */ - order = fls(ac->ac_g_ex.fe_len); + order = fls(ac->ac_g_ex.fe_len) - 1; min_order = order - sbi->s_mb_best_avail_max_trim_order; if (min_order < 0) min_order = 0; - if (1 << min_order < ac->ac_o_ex.fe_len) - min_order = fls(ac->ac_o_ex.fe_len) + 1; - if (sbi->s_stripe > 0) { /* * We are assuming that stripe size is always a multiple of @@ -1021,9 +1018,16 @@ static void ext4_mb_choose_next_group_best_avail(struct ext4_allocation_context */ num_stripe_clusters = EXT4_NUM_B2C(sbi, sbi->s_stripe); if (1 << min_order < num_stripe_clusters) - min_order = fls(num_stripe_clusters); + /* + * We consider 1 order less because later we round + * up the goal len to num_stripe_clusters + */ + min_order = fls(num_stripe_clusters) - 1; } + if (1 << min_order < ac->ac_o_ex.fe_len) + min_order = fls(ac->ac_o_ex.fe_len); + for (i = order; i >= min_order; i--) { int frag_order; /* @@ -4761,8 +4765,8 @@ ext4_mb_use_preallocated(struct ext4_allocation_context *ac) int order, i; struct ext4_inode_info *ei = EXT4_I(ac->ac_inode); struct ext4_locality_group *lg; - struct ext4_prealloc_space *tmp_pa, *cpa = NULL; - ext4_lblk_t tmp_pa_start, tmp_pa_end; + struct ext4_prealloc_space *tmp_pa = NULL, *cpa = NULL; + loff_t tmp_pa_end; struct rb_node *iter; ext4_fsblk_t goal_block; @@ -4770,47 +4774,151 @@ ext4_mb_use_preallocated(struct ext4_allocation_context *ac) if (!(ac->ac_flags & EXT4_MB_HINT_DATA)) return false; - /* first, try per-file preallocation */ + /* + * first, try per-file preallocation by searching the inode pa rbtree. + * + * Here, we can't do a direct traversal of the tree because + * ext4_mb_discard_group_preallocation() can paralelly mark the pa + * deleted and that can cause direct traversal to skip some entries. + */ read_lock(&ei->i_prealloc_lock); + + if (RB_EMPTY_ROOT(&ei->i_prealloc_node)) { + goto try_group_pa; + } + + /* + * Step 1: Find a pa with logical start immediately adjacent to the + * original logical start. This could be on the left or right. + * + * (tmp_pa->pa_lstart never changes so we can skip locking for it). + */ for (iter = ei->i_prealloc_node.rb_node; iter; iter = ext4_mb_pa_rb_next_iter(ac->ac_o_ex.fe_logical, - tmp_pa_start, iter)) { + tmp_pa->pa_lstart, iter)) { tmp_pa = rb_entry(iter, struct ext4_prealloc_space, pa_node.inode_node); + } - /* all fields in this condition don't change, - * so we can skip locking for them */ - tmp_pa_start = tmp_pa->pa_lstart; - tmp_pa_end = tmp_pa->pa_lstart + EXT4_C2B(sbi, tmp_pa->pa_len); - - /* original request start doesn't lie in this PA */ - if (ac->ac_o_ex.fe_logical < tmp_pa_start || - ac->ac_o_ex.fe_logical >= tmp_pa_end) - continue; + /* + * Step 2: The adjacent pa might be to the right of logical start, find + * the left adjacent pa. After this step we'd have a valid tmp_pa whose + * logical start is towards the left of original request's logical start + */ + if (tmp_pa->pa_lstart > ac->ac_o_ex.fe_logical) { + struct rb_node *tmp; + tmp = rb_prev(&tmp_pa->pa_node.inode_node); - /* non-extent files can't have physical blocks past 2^32 */ - if (!(ext4_test_inode_flag(ac->ac_inode, EXT4_INODE_EXTENTS)) && - (tmp_pa->pa_pstart + EXT4_C2B(sbi, tmp_pa->pa_len) > - EXT4_MAX_BLOCK_FILE_PHYS)) { + if (tmp) { + tmp_pa = rb_entry(tmp, struct ext4_prealloc_space, + pa_node.inode_node); + } else { /* - * Since PAs don't overlap, we won't find any - * other PA to satisfy this. + * If there is no adjacent pa to the left then finding + * an overlapping pa is not possible hence stop searching + * inode pa tree */ - break; + goto try_group_pa; } + } + + BUG_ON(!(tmp_pa && tmp_pa->pa_lstart <= ac->ac_o_ex.fe_logical)); - /* found preallocated blocks, use them */ + /* + * Step 3: If the left adjacent pa is deleted, keep moving left to find + * the first non deleted adjacent pa. After this step we should have a + * valid tmp_pa which is guaranteed to be non deleted. + */ + for (iter = &tmp_pa->pa_node.inode_node;; iter = rb_prev(iter)) { + if (!iter) { + /* + * no non deleted left adjacent pa, so stop searching + * inode pa tree + */ + goto try_group_pa; + } + tmp_pa = rb_entry(iter, struct ext4_prealloc_space, + pa_node.inode_node); spin_lock(&tmp_pa->pa_lock); - if (tmp_pa->pa_deleted == 0 && tmp_pa->pa_free && - likely(ext4_mb_pa_goal_check(ac, tmp_pa))) { - atomic_inc(&tmp_pa->pa_count); - ext4_mb_use_inode_pa(ac, tmp_pa); + if (tmp_pa->pa_deleted == 0) { + /* + * We will keep holding the pa_lock from + * this point on because we don't want group discard + * to delete this pa underneath us. Since group + * discard is anyways an ENOSPC operation it + * should be okay for it to wait a few more cycles. + */ + break; + } else { spin_unlock(&tmp_pa->pa_lock); - read_unlock(&ei->i_prealloc_lock); - return true; } + } + + BUG_ON(!(tmp_pa && tmp_pa->pa_lstart <= ac->ac_o_ex.fe_logical)); + BUG_ON(tmp_pa->pa_deleted == 1); + + /* + * Step 4: We now have the non deleted left adjacent pa. Only this + * pa can possibly satisfy the request hence check if it overlaps + * original logical start and stop searching if it doesn't. + */ + tmp_pa_end = (loff_t)tmp_pa->pa_lstart + EXT4_C2B(sbi, tmp_pa->pa_len); + + if (ac->ac_o_ex.fe_logical >= tmp_pa_end) { spin_unlock(&tmp_pa->pa_lock); + goto try_group_pa; + } + + /* non-extent files can't have physical blocks past 2^32 */ + if (!(ext4_test_inode_flag(ac->ac_inode, EXT4_INODE_EXTENTS)) && + (tmp_pa->pa_pstart + EXT4_C2B(sbi, tmp_pa->pa_len) > + EXT4_MAX_BLOCK_FILE_PHYS)) { + /* + * Since PAs don't overlap, we won't find any other PA to + * satisfy this. + */ + spin_unlock(&tmp_pa->pa_lock); + goto try_group_pa; + } + + if (tmp_pa->pa_free && likely(ext4_mb_pa_goal_check(ac, tmp_pa))) { + atomic_inc(&tmp_pa->pa_count); + ext4_mb_use_inode_pa(ac, tmp_pa); + spin_unlock(&tmp_pa->pa_lock); + read_unlock(&ei->i_prealloc_lock); + return true; + } else { + /* + * We found a valid overlapping pa but couldn't use it because + * it had no free blocks. This should ideally never happen + * because: + * + * 1. When a new inode pa is added to rbtree it must have + * pa_free > 0 since otherwise we won't actually need + * preallocation. + * + * 2. An inode pa that is in the rbtree can only have it's + * pa_free become zero when another thread calls: + * ext4_mb_new_blocks + * ext4_mb_use_preallocated + * ext4_mb_use_inode_pa + * + * 3. Further, after the above calls make pa_free == 0, we will + * immediately remove it from the rbtree in: + * ext4_mb_new_blocks + * ext4_mb_release_context + * ext4_mb_put_pa + * + * 4. Since the pa_free becoming 0 and pa_free getting removed + * from tree both happen in ext4_mb_new_blocks, which is always + * called with i_data_sem held for data allocations, we can be + * sure that another process will never see a pa in rbtree with + * pa_free == 0. + */ + WARN_ON_ONCE(tmp_pa->pa_free == 0); } + spin_unlock(&tmp_pa->pa_lock); +try_group_pa: read_unlock(&ei->i_prealloc_lock); /* can we use group allocation? */ diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c index 321e3a888c20..05151d61b00b 100644 --- a/fs/ext4/xattr.c +++ b/fs/ext4/xattr.c @@ -1782,6 +1782,20 @@ static int ext4_xattr_set_entry(struct ext4_xattr_info *i, memmove(here, (void *)here + size, (void *)last - (void *)here + sizeof(__u32)); memset(last, 0, size); + + /* + * Update i_inline_off - moved ibody region might contain + * system.data attribute. Handling a failure here won't + * cause other complications for setting an xattr. + */ + if (!is_block && ext4_has_inline_data(inode)) { + ret = ext4_find_inline_data_nolock(inode); + if (ret) { + ext4_warning_inode(inode, + "unable to update i_inline_off"); + goto out; + } + } } else if (s->not_found) { /* Insert new name. */ size_t size = EXT4_XATTR_LEN(name_len); diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index 64b3860f50ee..8fd3b7f9fb88 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -30,12 +30,9 @@ void f2fs_stop_checkpoint(struct f2fs_sb_info *sbi, bool end_io, unsigned char reason) { f2fs_build_fault_attr(sbi, 0, 0); - set_ckpt_flags(sbi, CP_ERROR_FLAG); - if (!end_io) { + if (!end_io) f2fs_flush_merged_writes(sbi); - - f2fs_handle_stop(sbi, reason); - } + f2fs_handle_critical_error(sbi, reason, end_io); } /* diff --git a/fs/f2fs/compress.c b/fs/f2fs/compress.c index 11653fa79289..236d890f560b 100644 --- a/fs/f2fs/compress.c +++ b/fs/f2fs/compress.c @@ -55,6 +55,7 @@ struct f2fs_compress_ops { int (*init_decompress_ctx)(struct decompress_io_ctx *dic); void (*destroy_decompress_ctx)(struct decompress_io_ctx *dic); int (*decompress_pages)(struct decompress_io_ctx *dic); + bool (*is_level_valid)(int level); }; static unsigned int offset_in_cluster(struct compress_ctx *cc, pgoff_t index) @@ -308,17 +309,25 @@ static int lz4_decompress_pages(struct decompress_io_ctx *dic) return 0; } +static bool lz4_is_level_valid(int lvl) +{ +#ifdef CONFIG_F2FS_FS_LZ4HC + return !lvl || (lvl >= LZ4HC_MIN_CLEVEL && lvl <= LZ4HC_MAX_CLEVEL); +#else + return lvl == 0; +#endif +} + static const struct f2fs_compress_ops f2fs_lz4_ops = { .init_compress_ctx = lz4_init_compress_ctx, .destroy_compress_ctx = lz4_destroy_compress_ctx, .compress_pages = lz4_compress_pages, .decompress_pages = lz4_decompress_pages, + .is_level_valid = lz4_is_level_valid, }; #endif #ifdef CONFIG_F2FS_FS_ZSTD -#define F2FS_ZSTD_DEFAULT_CLEVEL 1 - static int zstd_init_compress_ctx(struct compress_ctx *cc) { zstd_parameters params; @@ -327,6 +336,7 @@ static int zstd_init_compress_ctx(struct compress_ctx *cc) unsigned int workspace_size; unsigned char level = F2FS_I(cc->inode)->i_compress_level; + /* Need to remain this for backward compatibility */ if (!level) level = F2FS_ZSTD_DEFAULT_CLEVEL; @@ -477,6 +487,11 @@ static int zstd_decompress_pages(struct decompress_io_ctx *dic) return 0; } +static bool zstd_is_level_valid(int lvl) +{ + return lvl >= zstd_min_clevel() && lvl <= zstd_max_clevel(); +} + static const struct f2fs_compress_ops f2fs_zstd_ops = { .init_compress_ctx = zstd_init_compress_ctx, .destroy_compress_ctx = zstd_destroy_compress_ctx, @@ -484,6 +499,7 @@ static const struct f2fs_compress_ops f2fs_zstd_ops = { .init_decompress_ctx = zstd_init_decompress_ctx, .destroy_decompress_ctx = zstd_destroy_decompress_ctx, .decompress_pages = zstd_decompress_pages, + .is_level_valid = zstd_is_level_valid, }; #endif @@ -542,6 +558,16 @@ bool f2fs_is_compress_backend_ready(struct inode *inode) return f2fs_cops[F2FS_I(inode)->i_compress_algorithm]; } +bool f2fs_is_compress_level_valid(int alg, int lvl) +{ + const struct f2fs_compress_ops *cops = f2fs_cops[alg]; + + if (cops->is_level_valid) + return cops->is_level_valid(lvl); + + return lvl == 0; +} + static mempool_t *compress_page_pool; static int num_compress_pages = 512; module_param(num_compress_pages, uint, 0444); @@ -743,8 +769,8 @@ void f2fs_decompress_cluster(struct decompress_io_ctx *dic, bool in_task) ret = -EFSCORRUPTED; /* Avoid f2fs_commit_super in irq context */ - if (in_task) - f2fs_save_errors(sbi, ERROR_FAIL_DECOMPRESSION); + if (!in_task) + f2fs_handle_error_async(sbi, ERROR_FAIL_DECOMPRESSION); else f2fs_handle_error(sbi, ERROR_FAIL_DECOMPRESSION); goto out_release; @@ -1215,6 +1241,7 @@ static int f2fs_write_compressed_pages(struct compress_ctx *cc, unsigned int last_index = cc->cluster_size - 1; loff_t psize; int i, err; + bool quota_inode = IS_NOQUOTA(inode); /* we should bypass data pages to proceed the kworker jobs */ if (unlikely(f2fs_cp_error(sbi))) { @@ -1222,7 +1249,7 @@ static int f2fs_write_compressed_pages(struct compress_ctx *cc, goto out_free; } - if (IS_NOQUOTA(inode)) { + if (quota_inode) { /* * We need to wait for node_write to avoid block allocation during * checkpoint. This can only happen to quota writes which can cause @@ -1344,7 +1371,7 @@ unlock_continue: set_inode_flag(inode, FI_FIRST_BLOCK_WRITTEN); f2fs_put_dnode(&dn); - if (IS_NOQUOTA(inode)) + if (quota_inode) f2fs_up_read(&sbi->node_write); else f2fs_unlock_op(sbi); @@ -1370,7 +1397,7 @@ out_put_cic: out_put_dnode: f2fs_put_dnode(&dn); out_unlock_op: - if (IS_NOQUOTA(inode)) + if (quota_inode) f2fs_up_read(&sbi->node_write); else f2fs_unlock_op(sbi); diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 7165b1202f53..5882afe71d82 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -383,6 +383,17 @@ static void f2fs_write_end_io(struct bio *bio) bio_put(bio); } +#ifdef CONFIG_BLK_DEV_ZONED +static void f2fs_zone_write_end_io(struct bio *bio) +{ + struct f2fs_bio_info *io = (struct f2fs_bio_info *)bio->bi_private; + + bio->bi_private = io->bi_private; + complete(&io->zone_wait); + f2fs_write_end_io(bio); +} +#endif + struct block_device *f2fs_target_device(struct f2fs_sb_info *sbi, block_t blk_addr, sector_t *sector) { @@ -639,6 +650,11 @@ int f2fs_init_write_merge_io(struct f2fs_sb_info *sbi) INIT_LIST_HEAD(&sbi->write_io[i][j].io_list); INIT_LIST_HEAD(&sbi->write_io[i][j].bio_list); init_f2fs_rwsem(&sbi->write_io[i][j].bio_list_lock); +#ifdef CONFIG_BLK_DEV_ZONED + init_completion(&sbi->write_io[i][j].zone_wait); + sbi->write_io[i][j].zone_pending_bio = NULL; + sbi->write_io[i][j].bi_private = NULL; +#endif } } @@ -965,6 +981,26 @@ alloc_new: return 0; } +#ifdef CONFIG_BLK_DEV_ZONED +static bool is_end_zone_blkaddr(struct f2fs_sb_info *sbi, block_t blkaddr) +{ + int devi = 0; + + if (f2fs_is_multi_device(sbi)) { + devi = f2fs_target_device_index(sbi, blkaddr); + if (blkaddr < FDEV(devi).start_blk || + blkaddr > FDEV(devi).end_blk) { + f2fs_err(sbi, "Invalid block %x", blkaddr); + return false; + } + blkaddr -= FDEV(devi).start_blk; + } + return bdev_zoned_model(FDEV(devi).bdev) == BLK_ZONED_HM && + f2fs_blkz_is_seq(sbi, devi, blkaddr) && + (blkaddr % sbi->blocks_per_blkz == sbi->blocks_per_blkz - 1); +} +#endif + void f2fs_submit_page_write(struct f2fs_io_info *fio) { struct f2fs_sb_info *sbi = fio->sbi; @@ -975,6 +1011,16 @@ void f2fs_submit_page_write(struct f2fs_io_info *fio) f2fs_bug_on(sbi, is_read_io(fio->op)); f2fs_down_write(&io->io_rwsem); + +#ifdef CONFIG_BLK_DEV_ZONED + if (f2fs_sb_has_blkzoned(sbi) && btype < META && io->zone_pending_bio) { + wait_for_completion_io(&io->zone_wait); + bio_put(io->zone_pending_bio); + io->zone_pending_bio = NULL; + io->bi_private = NULL; + } +#endif + next: if (fio->in_list) { spin_lock(&io->io_lock); @@ -1038,6 +1084,18 @@ skip: if (fio->in_list) goto next; out: +#ifdef CONFIG_BLK_DEV_ZONED + if (f2fs_sb_has_blkzoned(sbi) && btype < META && + is_end_zone_blkaddr(sbi, fio->new_blkaddr)) { + bio_get(io->bio); + reinit_completion(&io->zone_wait); + io->bi_private = io->bio->bi_private; + io->bio->bi_private = io; + io->bio->bi_end_io = f2fs_zone_write_end_io; + io->zone_pending_bio = io->bio; + __submit_merged_bio(io); + } +#endif if (is_sbi_flag_set(sbi, SBI_IS_SHUTDOWN) || !f2fs_is_checkpoint_ready(sbi)) __submit_merged_bio(io); @@ -2173,7 +2231,6 @@ submit_and_realloc: f2fs_update_iostat(F2FS_I_SB(inode), NULL, FS_DATA_READ_IO, F2FS_BLKSIZE); *last_block_in_bio = block_nr; - goto out; out: *bio_ret = bio; return ret; @@ -2775,6 +2832,7 @@ int f2fs_write_single_data_page(struct page *page, int *submitted, loff_t psize = (loff_t)(page->index + 1) << PAGE_SHIFT; unsigned offset = 0; bool need_balance_fs = false; + bool quota_inode = IS_NOQUOTA(inode); int err = 0; struct f2fs_io_info fio = { .sbi = sbi, @@ -2807,6 +2865,10 @@ int f2fs_write_single_data_page(struct page *page, int *submitted, if (S_ISDIR(inode->i_mode) && !is_sbi_flag_set(sbi, SBI_IS_CLOSE)) goto redirty_out; + + /* keep data pages in remount-ro mode */ + if (F2FS_OPTION(sbi).errors == MOUNT_ERRORS_READONLY) + goto redirty_out; goto out; } @@ -2832,19 +2894,19 @@ write: goto out; /* Dentry/quota blocks are controlled by checkpoint */ - if (S_ISDIR(inode->i_mode) || IS_NOQUOTA(inode)) { + if (S_ISDIR(inode->i_mode) || quota_inode) { /* * We need to wait for node_write to avoid block allocation during * checkpoint. This can only happen to quota writes which can cause * the below discard race condition. */ - if (IS_NOQUOTA(inode)) + if (quota_inode) f2fs_down_read(&sbi->node_write); fio.need_lock = LOCK_DONE; err = f2fs_do_write_data_page(&fio); - if (IS_NOQUOTA(inode)) + if (quota_inode) f2fs_up_read(&sbi->node_write); goto done; @@ -4067,7 +4129,6 @@ const struct address_space_operations f2fs_dblock_aops = { .migrate_folio = filemap_migrate_folio, .invalidate_folio = f2fs_invalidate_folio, .release_folio = f2fs_release_folio, - .direct_IO = noop_direct_IO, .bmap = f2fs_bmap, .swap_activate = f2fs_swap_activate, .swap_deactivate = f2fs_swap_deactivate, diff --git a/fs/f2fs/dir.c b/fs/f2fs/dir.c index 887e55988450..d635c58cf5a3 100644 --- a/fs/f2fs/dir.c +++ b/fs/f2fs/dir.c @@ -775,8 +775,15 @@ int f2fs_add_dentry(struct inode *dir, const struct f2fs_filename *fname, { int err = -EAGAIN; - if (f2fs_has_inline_dentry(dir)) + if (f2fs_has_inline_dentry(dir)) { + /* + * Should get i_xattr_sem to keep the lock order: + * i_xattr_sem -> inode_page lock used by f2fs_setxattr. + */ + f2fs_down_read(&F2FS_I(dir)->i_xattr_sem); err = f2fs_add_inline_entry(dir, fname, inode, ino, mode); + f2fs_up_read(&F2FS_I(dir)->i_xattr_sem); + } if (err == -EAGAIN) err = f2fs_add_regular_entry(dir, fname, inode, ino, mode); diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index d211ee89c158..c7cb2177b252 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -80,34 +80,34 @@ extern const char *f2fs_fault_name[FAULT_MAX]; /* * For mount options */ -#define F2FS_MOUNT_DISABLE_ROLL_FORWARD 0x00000002 -#define F2FS_MOUNT_DISCARD 0x00000004 -#define F2FS_MOUNT_NOHEAP 0x00000008 -#define F2FS_MOUNT_XATTR_USER 0x00000010 -#define F2FS_MOUNT_POSIX_ACL 0x00000020 -#define F2FS_MOUNT_DISABLE_EXT_IDENTIFY 0x00000040 -#define F2FS_MOUNT_INLINE_XATTR 0x00000080 -#define F2FS_MOUNT_INLINE_DATA 0x00000100 -#define F2FS_MOUNT_INLINE_DENTRY 0x00000200 -#define F2FS_MOUNT_FLUSH_MERGE 0x00000400 -#define F2FS_MOUNT_NOBARRIER 0x00000800 -#define F2FS_MOUNT_FASTBOOT 0x00001000 -#define F2FS_MOUNT_READ_EXTENT_CACHE 0x00002000 -#define F2FS_MOUNT_DATA_FLUSH 0x00008000 -#define F2FS_MOUNT_FAULT_INJECTION 0x00010000 -#define F2FS_MOUNT_USRQUOTA 0x00080000 -#define F2FS_MOUNT_GRPQUOTA 0x00100000 -#define F2FS_MOUNT_PRJQUOTA 0x00200000 -#define F2FS_MOUNT_QUOTA 0x00400000 -#define F2FS_MOUNT_INLINE_XATTR_SIZE 0x00800000 -#define F2FS_MOUNT_RESERVE_ROOT 0x01000000 -#define F2FS_MOUNT_DISABLE_CHECKPOINT 0x02000000 -#define F2FS_MOUNT_NORECOVERY 0x04000000 -#define F2FS_MOUNT_ATGC 0x08000000 -#define F2FS_MOUNT_MERGE_CHECKPOINT 0x10000000 -#define F2FS_MOUNT_GC_MERGE 0x20000000 -#define F2FS_MOUNT_COMPRESS_CACHE 0x40000000 -#define F2FS_MOUNT_AGE_EXTENT_CACHE 0x80000000 +#define F2FS_MOUNT_DISABLE_ROLL_FORWARD 0x00000001 +#define F2FS_MOUNT_DISCARD 0x00000002 +#define F2FS_MOUNT_NOHEAP 0x00000004 +#define F2FS_MOUNT_XATTR_USER 0x00000008 +#define F2FS_MOUNT_POSIX_ACL 0x00000010 +#define F2FS_MOUNT_DISABLE_EXT_IDENTIFY 0x00000020 +#define F2FS_MOUNT_INLINE_XATTR 0x00000040 +#define F2FS_MOUNT_INLINE_DATA 0x00000080 +#define F2FS_MOUNT_INLINE_DENTRY 0x00000100 +#define F2FS_MOUNT_FLUSH_MERGE 0x00000200 +#define F2FS_MOUNT_NOBARRIER 0x00000400 +#define F2FS_MOUNT_FASTBOOT 0x00000800 +#define F2FS_MOUNT_READ_EXTENT_CACHE 0x00001000 +#define F2FS_MOUNT_DATA_FLUSH 0x00002000 +#define F2FS_MOUNT_FAULT_INJECTION 0x00004000 +#define F2FS_MOUNT_USRQUOTA 0x00008000 +#define F2FS_MOUNT_GRPQUOTA 0x00010000 +#define F2FS_MOUNT_PRJQUOTA 0x00020000 +#define F2FS_MOUNT_QUOTA 0x00040000 +#define F2FS_MOUNT_INLINE_XATTR_SIZE 0x00080000 +#define F2FS_MOUNT_RESERVE_ROOT 0x00100000 +#define F2FS_MOUNT_DISABLE_CHECKPOINT 0x00200000 +#define F2FS_MOUNT_NORECOVERY 0x00400000 +#define F2FS_MOUNT_ATGC 0x00800000 +#define F2FS_MOUNT_MERGE_CHECKPOINT 0x01000000 +#define F2FS_MOUNT_GC_MERGE 0x02000000 +#define F2FS_MOUNT_COMPRESS_CACHE 0x04000000 +#define F2FS_MOUNT_AGE_EXTENT_CACHE 0x08000000 #define F2FS_OPTION(sbi) ((sbi)->mount_opt) #define clear_opt(sbi, option) (F2FS_OPTION(sbi).opt &= ~F2FS_MOUNT_##option) @@ -162,6 +162,7 @@ struct f2fs_mount_info { int fs_mode; /* fs mode: LFS or ADAPTIVE */ int bggc_mode; /* bggc mode: off, on or sync */ int memory_mode; /* memory mode */ + int errors; /* errors parameter */ int discard_unit; /* * discard command's offset/size should * be aligned to this unit: block, @@ -185,21 +186,21 @@ struct f2fs_mount_info { unsigned char noextensions[COMPRESS_EXT_NUM][F2FS_EXTENSION_LEN]; /* extensions */ }; -#define F2FS_FEATURE_ENCRYPT 0x0001 -#define F2FS_FEATURE_BLKZONED 0x0002 -#define F2FS_FEATURE_ATOMIC_WRITE 0x0004 -#define F2FS_FEATURE_EXTRA_ATTR 0x0008 -#define F2FS_FEATURE_PRJQUOTA 0x0010 -#define F2FS_FEATURE_INODE_CHKSUM 0x0020 -#define F2FS_FEATURE_FLEXIBLE_INLINE_XATTR 0x0040 -#define F2FS_FEATURE_QUOTA_INO 0x0080 -#define F2FS_FEATURE_INODE_CRTIME 0x0100 -#define F2FS_FEATURE_LOST_FOUND 0x0200 -#define F2FS_FEATURE_VERITY 0x0400 -#define F2FS_FEATURE_SB_CHKSUM 0x0800 -#define F2FS_FEATURE_CASEFOLD 0x1000 -#define F2FS_FEATURE_COMPRESSION 0x2000 -#define F2FS_FEATURE_RO 0x4000 +#define F2FS_FEATURE_ENCRYPT 0x00000001 +#define F2FS_FEATURE_BLKZONED 0x00000002 +#define F2FS_FEATURE_ATOMIC_WRITE 0x00000004 +#define F2FS_FEATURE_EXTRA_ATTR 0x00000008 +#define F2FS_FEATURE_PRJQUOTA 0x00000010 +#define F2FS_FEATURE_INODE_CHKSUM 0x00000020 +#define F2FS_FEATURE_FLEXIBLE_INLINE_XATTR 0x00000040 +#define F2FS_FEATURE_QUOTA_INO 0x00000080 +#define F2FS_FEATURE_INODE_CRTIME 0x00000100 +#define F2FS_FEATURE_LOST_FOUND 0x00000200 +#define F2FS_FEATURE_VERITY 0x00000400 +#define F2FS_FEATURE_SB_CHKSUM 0x00000800 +#define F2FS_FEATURE_CASEFOLD 0x00001000 +#define F2FS_FEATURE_COMPRESSION 0x00002000 +#define F2FS_FEATURE_RO 0x00004000 #define __F2FS_HAS_FEATURE(raw_super, mask) \ ((raw_super->feature & cpu_to_le32(mask)) != 0) @@ -1175,6 +1176,7 @@ enum iostat_type { /* other */ FS_DISCARD_IO, /* discard */ FS_FLUSH_IO, /* flush */ + FS_ZONE_RESET_IO, /* zone reset */ NR_IO_TYPE, }; @@ -1217,6 +1219,11 @@ struct f2fs_bio_info { struct bio *bio; /* bios to merge */ sector_t last_block_in_bio; /* last block number */ struct f2fs_io_info fio; /* store buffered io info. */ +#ifdef CONFIG_BLK_DEV_ZONED + struct completion zone_wait; /* condition value for the previous open zone to close */ + struct bio *zone_pending_bio; /* pending bio for the previous zone */ + void *bi_private; /* previous bi_private for pending bio */ +#endif struct f2fs_rwsem io_rwsem; /* blocking op for bio */ spinlock_t io_lock; /* serialize DATA/NODE IOs */ struct list_head io_list; /* track fios */ @@ -1370,6 +1377,12 @@ enum { MEMORY_MODE_LOW, /* memory mode for low memry devices */ }; +enum errors_option { + MOUNT_ERRORS_READONLY, /* remount fs ro on errors */ + MOUNT_ERRORS_CONTINUE, /* continue on errors */ + MOUNT_ERRORS_PANIC, /* panic on errors */ +}; + static inline int f2fs_test_bit(unsigned int nr, char *addr); static inline void f2fs_set_bit(unsigned int nr, char *addr); static inline void f2fs_clear_bit(unsigned int nr, char *addr); @@ -1427,6 +1440,8 @@ struct compress_data { #define F2FS_COMPRESSED_PAGE_MAGIC 0xF5F2C000 +#define F2FS_ZSTD_DEFAULT_CLEVEL 1 + #define COMPRESS_LEVEL_OFFSET 8 /* compress context */ @@ -1721,8 +1736,14 @@ struct f2fs_sb_info { struct workqueue_struct *post_read_wq; /* post read workqueue */ - unsigned char errors[MAX_F2FS_ERRORS]; /* error flags */ - spinlock_t error_lock; /* protect errors array */ + /* + * If we are in irq context, let's update error information into + * on-disk superblock in the work. + */ + struct work_struct s_error_work; + unsigned char errors[MAX_F2FS_ERRORS]; /* error flags */ + unsigned char stop_reason[MAX_STOP_REASON]; /* stop reason */ + spinlock_t error_lock; /* protect errors/stop_reason array */ bool error_dirty; /* errors of sb is dirty */ struct kmem_cache *inline_xattr_slab; /* inline xattr entry */ @@ -2941,6 +2962,8 @@ static inline void f2fs_change_bit(unsigned int nr, char *addr) #define F2FS_PROJINHERIT_FL 0x20000000 /* Create with parents projid */ #define F2FS_CASEFOLD_FL 0x40000000 /* Casefolded file */ +#define F2FS_QUOTA_DEFAULT_FL (F2FS_NOATIME_FL | F2FS_IMMUTABLE_FL) + /* Flags that should be inherited by new inodes from their parent. */ #define F2FS_FL_INHERITED (F2FS_SYNC_FL | F2FS_NODUMP_FL | F2FS_NOATIME_FL | \ F2FS_DIRSYNC_FL | F2FS_PROJINHERIT_FL | \ @@ -3394,6 +3417,8 @@ static inline int get_inline_xattr_addrs(struct inode *inode) ((is_inode_flag_set(i, FI_ACL_MODE)) ? \ (F2FS_I(i)->i_acl_mode) : ((i)->i_mode)) +#define F2FS_MIN_EXTRA_ATTR_SIZE (sizeof(__le32)) + #define F2FS_TOTAL_EXTRA_ATTR_SIZE \ (offsetof(struct f2fs_inode, i_extra_end) - \ offsetof(struct f2fs_inode, i_extra_isize)) \ @@ -3432,7 +3457,6 @@ static inline bool __is_valid_data_blkaddr(block_t blkaddr) * file.c */ int f2fs_sync_file(struct file *file, loff_t start, loff_t end, int datasync); -void f2fs_truncate_data_blocks(struct dnode_of_data *dn); int f2fs_do_truncate_blocks(struct inode *inode, u64 from, bool lock); int f2fs_truncate_blocks(struct inode *inode, u64 from, bool lock); int f2fs_truncate(struct inode *inode); @@ -3541,9 +3565,11 @@ int f2fs_enable_quota_files(struct f2fs_sb_info *sbi, bool rdonly); int f2fs_quota_sync(struct super_block *sb, int type); loff_t max_file_blocks(struct inode *inode); void f2fs_quota_off_umount(struct super_block *sb); -void f2fs_handle_stop(struct f2fs_sb_info *sbi, unsigned char reason); void f2fs_save_errors(struct f2fs_sb_info *sbi, unsigned char flag); +void f2fs_handle_critical_error(struct f2fs_sb_info *sbi, unsigned char reason, + bool irq_context); void f2fs_handle_error(struct f2fs_sb_info *sbi, unsigned char error); +void f2fs_handle_error_async(struct f2fs_sb_info *sbi, unsigned char error); int f2fs_commit_super(struct f2fs_sb_info *sbi, bool recover); int f2fs_sync_fs(struct super_block *sb, int sync); int f2fs_sanity_check_ckpt(struct f2fs_sb_info *sbi); @@ -3815,7 +3841,7 @@ void f2fs_stop_gc_thread(struct f2fs_sb_info *sbi); block_t f2fs_start_bidx_of_node(unsigned int node_ofs, struct inode *inode); int f2fs_gc(struct f2fs_sb_info *sbi, struct f2fs_gc_control *gc_control); void f2fs_build_gc_manager(struct f2fs_sb_info *sbi); -int f2fs_resize_fs(struct f2fs_sb_info *sbi, __u64 block_count); +int f2fs_resize_fs(struct file *filp, __u64 block_count); int __init f2fs_create_garbage_collection_cache(void); void f2fs_destroy_garbage_collection_cache(void); /* victim selection function for cleaning and SSR */ @@ -4213,6 +4239,7 @@ bool f2fs_compress_write_end(struct inode *inode, void *fsdata, int f2fs_truncate_partial_cluster(struct inode *inode, u64 from, bool lock); void f2fs_compress_write_end_io(struct bio *bio, struct page *page); bool f2fs_is_compress_backend_ready(struct inode *inode); +bool f2fs_is_compress_level_valid(int alg, int lvl); int __init f2fs_init_compress_mempool(void); void f2fs_destroy_compress_mempool(void); void f2fs_decompress_cluster(struct decompress_io_ctx *dic, bool in_task); @@ -4277,6 +4304,7 @@ static inline bool f2fs_is_compress_backend_ready(struct inode *inode) /* not support compression */ return false; } +static inline bool f2fs_is_compress_level_valid(int alg, int lvl) { return false; } static inline struct page *f2fs_compress_control_page(struct page *page) { WARN_ON_ONCE(1); diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 2435111a8532..093039dee992 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -149,8 +149,6 @@ static vm_fault_t f2fs_vm_page_mkwrite(struct vm_fault *vmf) zero_user_segment(page, offset, PAGE_SIZE); } set_page_dirty(page); - if (!PageUptodate(page)) - SetPageUptodate(page); f2fs_update_iostat(sbi, inode, APP_MAPPED_IO, F2FS_BLKSIZE); f2fs_update_time(sbi, REQ_TIME); @@ -546,7 +544,8 @@ static int f2fs_file_open(struct inode *inode, struct file *filp) if (err) return err; - filp->f_mode |= FMODE_NOWAIT; + filp->f_mode |= FMODE_NOWAIT | FMODE_BUF_RASYNC; + filp->f_mode |= FMODE_CAN_ODIRECT; return dquot_file_open(inode, filp); } @@ -627,11 +626,6 @@ void f2fs_truncate_data_blocks_range(struct dnode_of_data *dn, int count) dn->ofs_in_node, nr_free); } -void f2fs_truncate_data_blocks(struct dnode_of_data *dn) -{ - f2fs_truncate_data_blocks_range(dn, ADDRS_PER_BLOCK(dn->inode)); -} - static int truncate_partial_data_page(struct inode *inode, u64 from, bool cache_only) { @@ -2225,7 +2219,6 @@ static int f2fs_ioc_shutdown(struct file *filp, unsigned long arg) ret = 0; f2fs_stop_checkpoint(sbi, false, STOP_CP_REASON_SHUTDOWN); - set_sbi_flag(sbi, SBI_IS_SHUTDOWN); trace_f2fs_shutdown(sbi, in, ret); } return ret; @@ -2238,7 +2231,6 @@ static int f2fs_ioc_shutdown(struct file *filp, unsigned long arg) if (ret) goto out; f2fs_stop_checkpoint(sbi, false, STOP_CP_REASON_SHUTDOWN); - set_sbi_flag(sbi, SBI_IS_SHUTDOWN); thaw_bdev(sb->s_bdev); break; case F2FS_GOING_DOWN_METASYNC: @@ -2247,16 +2239,13 @@ static int f2fs_ioc_shutdown(struct file *filp, unsigned long arg) if (ret) goto out; f2fs_stop_checkpoint(sbi, false, STOP_CP_REASON_SHUTDOWN); - set_sbi_flag(sbi, SBI_IS_SHUTDOWN); break; case F2FS_GOING_DOWN_NOSYNC: f2fs_stop_checkpoint(sbi, false, STOP_CP_REASON_SHUTDOWN); - set_sbi_flag(sbi, SBI_IS_SHUTDOWN); break; case F2FS_GOING_DOWN_METAFLUSH: f2fs_sync_meta_pages(sbi, META, LONG_MAX, FS_META_IO); f2fs_stop_checkpoint(sbi, false, STOP_CP_REASON_SHUTDOWN); - set_sbi_flag(sbi, SBI_IS_SHUTDOWN); break; case F2FS_GOING_DOWN_NEED_FSCK: set_sbi_flag(sbi, SBI_NEED_FSCK); @@ -2593,6 +2582,11 @@ static int f2fs_defragment_range(struct f2fs_sb_info *sbi, inode_lock(inode); + if (is_inode_flag_set(inode, FI_COMPRESS_RELEASED)) { + err = -EINVAL; + goto unlock_out; + } + /* if in-place-update policy is enabled, don't waste time here */ set_inode_flag(inode, FI_OPU_WRITE); if (f2fs_should_update_inplace(inode, NULL)) { @@ -2717,6 +2711,7 @@ clear_out: clear_inode_flag(inode, FI_SKIP_WRITES); out: clear_inode_flag(inode, FI_OPU_WRITE); +unlock_out: inode_unlock(inode); if (!err) range->len = (u64)total << PAGE_SHIFT; @@ -2876,6 +2871,17 @@ static int f2fs_move_file_range(struct file *file_in, loff_t pos_in, f2fs_up_write(&F2FS_I(dst)->i_gc_rwsem[WRITE]); out_src: f2fs_up_write(&F2FS_I(src)->i_gc_rwsem[WRITE]); + if (ret) + goto out_unlock; + + src->i_mtime = src->i_ctime = current_time(src); + f2fs_mark_inode_dirty_sync(src, false); + if (src != dst) { + dst->i_mtime = dst->i_ctime = current_time(dst); + f2fs_mark_inode_dirty_sync(dst, false); + } + f2fs_update_time(sbi, REQ_TIME); + out_unlock: if (src != dst) inode_unlock(dst); @@ -3278,7 +3284,7 @@ static int f2fs_ioc_resize_fs(struct file *filp, unsigned long arg) sizeof(block_count))) return -EFAULT; - return f2fs_resize_fs(sbi, block_count); + return f2fs_resize_fs(filp, block_count); } static int f2fs_ioc_enable_verity(struct file *filp, unsigned long arg) @@ -3375,18 +3381,29 @@ out: return err; } -static int f2fs_get_compress_blocks(struct file *filp, unsigned long arg) +static int f2fs_get_compress_blocks(struct inode *inode, __u64 *blocks) { - struct inode *inode = file_inode(filp); - __u64 blocks; - if (!f2fs_sb_has_compression(F2FS_I_SB(inode))) return -EOPNOTSUPP; if (!f2fs_compressed_file(inode)) return -EINVAL; - blocks = atomic_read(&F2FS_I(inode)->i_compr_blocks); + *blocks = atomic_read(&F2FS_I(inode)->i_compr_blocks); + + return 0; +} + +static int f2fs_ioc_get_compress_blocks(struct file *filp, unsigned long arg) +{ + struct inode *inode = file_inode(filp); + __u64 blocks; + int ret; + + ret = f2fs_get_compress_blocks(inode, &blocks); + if (ret < 0) + return ret; + return put_user(blocks, (u64 __user *)arg); } @@ -3455,7 +3472,7 @@ static int f2fs_release_compress_blocks(struct file *filp, unsigned long arg) int ret; int writecount; - if (!f2fs_sb_has_compression(F2FS_I_SB(inode))) + if (!f2fs_sb_has_compression(sbi)) return -EOPNOTSUPP; if (!f2fs_compressed_file(inode)) @@ -3468,7 +3485,7 @@ static int f2fs_release_compress_blocks(struct file *filp, unsigned long arg) if (ret) return ret; - f2fs_balance_fs(F2FS_I_SB(inode), true); + f2fs_balance_fs(sbi, true); inode_lock(inode); @@ -3488,13 +3505,15 @@ static int f2fs_release_compress_blocks(struct file *filp, unsigned long arg) if (ret) goto out; + if (!atomic_read(&F2FS_I(inode)->i_compr_blocks)) { + ret = -EPERM; + goto out; + } + set_inode_flag(inode, FI_COMPRESS_RELEASED); inode->i_ctime = current_time(inode); f2fs_mark_inode_dirty_sync(inode, true); - if (!atomic_read(&F2FS_I(inode)->i_compr_blocks)) - goto out; - f2fs_down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); filemap_invalidate_lock(inode->i_mapping); @@ -3625,7 +3644,7 @@ static int f2fs_reserve_compress_blocks(struct file *filp, unsigned long arg) unsigned int reserved_blocks = 0; int ret; - if (!f2fs_sb_has_compression(F2FS_I_SB(inode))) + if (!f2fs_sb_has_compression(sbi)) return -EOPNOTSUPP; if (!f2fs_compressed_file(inode)) @@ -3641,7 +3660,7 @@ static int f2fs_reserve_compress_blocks(struct file *filp, unsigned long arg) if (atomic_read(&F2FS_I(inode)->i_compr_blocks)) goto out; - f2fs_balance_fs(F2FS_I_SB(inode), true); + f2fs_balance_fs(sbi, true); inode_lock(inode); @@ -4035,7 +4054,7 @@ static int f2fs_ioc_decompress_file(struct file *filp) if (!f2fs_compressed_file(inode)) return -EINVAL; - f2fs_balance_fs(F2FS_I_SB(inode), true); + f2fs_balance_fs(sbi, true); file_start_write(filp); inode_lock(inode); @@ -4110,7 +4129,7 @@ static int f2fs_ioc_compress_file(struct file *filp) if (!f2fs_compressed_file(inode)) return -EINVAL; - f2fs_balance_fs(F2FS_I_SB(inode), true); + f2fs_balance_fs(sbi, true); file_start_write(filp); inode_lock(inode); @@ -4238,7 +4257,7 @@ static long __f2fs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) case FS_IOC_SETFSLABEL: return f2fs_ioc_setfslabel(filp, arg); case F2FS_IOC_GET_COMPRESS_BLOCKS: - return f2fs_get_compress_blocks(filp, arg); + return f2fs_ioc_get_compress_blocks(filp, arg); case F2FS_IOC_RELEASE_COMPRESS_BLOCKS: return f2fs_release_compress_blocks(filp, arg); case F2FS_IOC_RESERVE_COMPRESS_BLOCKS: diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index 61c5f9d26018..01effd3fcb6c 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -59,7 +59,7 @@ static int gc_thread_func(void *data) if (gc_th->gc_wake) gc_th->gc_wake = false; - if (try_to_freeze()) { + if (try_to_freeze() || f2fs_readonly(sbi->sb)) { stat_other_skip_bggc_count(sbi); continue; } @@ -1797,7 +1797,7 @@ int f2fs_gc(struct f2fs_sb_info *sbi, struct f2fs_gc_control *gc_control) { int gc_type = gc_control->init_gc_type; unsigned int segno = gc_control->victim_segno; - int sec_freed = 0, seg_freed = 0, total_freed = 0; + int sec_freed = 0, seg_freed = 0, total_freed = 0, total_sec_freed = 0; int ret = 0; struct cp_control cpc; struct gc_inode_list gc_list = { @@ -1842,6 +1842,8 @@ gc_more: ret = f2fs_write_checkpoint(sbi, &cpc); if (ret) goto stop; + /* Reset due to checkpoint */ + sec_freed = 0; } } @@ -1866,15 +1868,17 @@ retry: gc_control->should_migrate_blocks); total_freed += seg_freed; - if (seg_freed == f2fs_usable_segs_in_sec(sbi, segno)) + if (seg_freed == f2fs_usable_segs_in_sec(sbi, segno)) { sec_freed++; + total_sec_freed++; + } if (gc_type == FG_GC) { sbi->cur_victim_sec = NULL_SEGNO; if (has_enough_free_secs(sbi, sec_freed, 0)) { if (!gc_control->no_bg_gc && - sec_freed < gc_control->nr_free_secs) + total_sec_freed < gc_control->nr_free_secs) goto go_gc_more; goto stop; } @@ -1901,6 +1905,8 @@ retry: ret = f2fs_write_checkpoint(sbi, &cpc); if (ret) goto stop; + /* Reset due to checkpoint */ + sec_freed = 0; } go_gc_more: segno = NULL_SEGNO; @@ -1913,7 +1919,7 @@ stop: if (gc_type == FG_GC) f2fs_unpin_all_sections(sbi, true); - trace_f2fs_gc_end(sbi->sb, ret, total_freed, sec_freed, + trace_f2fs_gc_end(sbi->sb, ret, total_freed, total_sec_freed, get_pages(sbi, F2FS_DIRTY_NODES), get_pages(sbi, F2FS_DIRTY_DENTS), get_pages(sbi, F2FS_DIRTY_IMETA), @@ -1927,7 +1933,7 @@ stop: put_gc_inode(&gc_list); if (gc_control->err_gc_skipped && !ret) - ret = sec_freed ? 0 : -EAGAIN; + ret = total_sec_freed ? 0 : -EAGAIN; return ret; } @@ -2099,8 +2105,9 @@ static void update_fs_metadata(struct f2fs_sb_info *sbi, int secs) } } -int f2fs_resize_fs(struct f2fs_sb_info *sbi, __u64 block_count) +int f2fs_resize_fs(struct file *filp, __u64 block_count) { + struct f2fs_sb_info *sbi = F2FS_I_SB(file_inode(filp)); __u64 old_block_count, shrunk_blocks; struct cp_control cpc = { CP_RESIZE, 0, 0, 0 }; unsigned int secs; @@ -2138,12 +2145,18 @@ int f2fs_resize_fs(struct f2fs_sb_info *sbi, __u64 block_count) return -EINVAL; } + err = mnt_want_write_file(filp); + if (err) + return err; + shrunk_blocks = old_block_count - block_count; secs = div_u64(shrunk_blocks, BLKS_PER_SEC(sbi)); /* stop other GC */ - if (!f2fs_down_write_trylock(&sbi->gc_lock)) - return -EAGAIN; + if (!f2fs_down_write_trylock(&sbi->gc_lock)) { + err = -EAGAIN; + goto out_drop_write; + } /* stop CP to protect MAIN_SEC in free_segment_range */ f2fs_lock_op(sbi); @@ -2163,10 +2176,20 @@ int f2fs_resize_fs(struct f2fs_sb_info *sbi, __u64 block_count) out_unlock: f2fs_unlock_op(sbi); f2fs_up_write(&sbi->gc_lock); +out_drop_write: + mnt_drop_write_file(filp); if (err) return err; - freeze_super(sbi->sb); + err = freeze_super(sbi->sb); + if (err) + return err; + + if (f2fs_readonly(sbi->sb)) { + thaw_super(sbi->sb); + return -EROFS; + } + f2fs_down_write(&sbi->gc_lock); f2fs_down_write(&sbi->cp_global_sem); diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c index cf4327ad106c..09e986b050c6 100644 --- a/fs/f2fs/inode.c +++ b/fs/f2fs/inode.c @@ -10,6 +10,8 @@ #include <linux/buffer_head.h> #include <linux/writeback.h> #include <linux/sched/mm.h> +#include <linux/lz4.h> +#include <linux/zstd.h> #include "f2fs.h" #include "node.h" @@ -202,6 +204,80 @@ void f2fs_inode_chksum_set(struct f2fs_sb_info *sbi, struct page *page) ri->i_inode_checksum = cpu_to_le32(f2fs_inode_chksum(sbi, page)); } +static bool sanity_check_compress_inode(struct inode *inode, + struct f2fs_inode *ri) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + unsigned char clevel; + + if (ri->i_compress_algorithm >= COMPRESS_MAX) { + f2fs_warn(sbi, + "%s: inode (ino=%lx) has unsupported compress algorithm: %u, run fsck to fix", + __func__, inode->i_ino, ri->i_compress_algorithm); + goto err; + } + if (le64_to_cpu(ri->i_compr_blocks) > + SECTOR_TO_BLOCK(inode->i_blocks)) { + f2fs_warn(sbi, + "%s: inode (ino=%lx) has inconsistent i_compr_blocks:%llu, i_blocks:%llu, run fsck to fix", + __func__, inode->i_ino, le64_to_cpu(ri->i_compr_blocks), + SECTOR_TO_BLOCK(inode->i_blocks)); + goto err; + } + if (ri->i_log_cluster_size < MIN_COMPRESS_LOG_SIZE || + ri->i_log_cluster_size > MAX_COMPRESS_LOG_SIZE) { + f2fs_warn(sbi, + "%s: inode (ino=%lx) has unsupported log cluster size: %u, run fsck to fix", + __func__, inode->i_ino, ri->i_log_cluster_size); + goto err; + } + + clevel = le16_to_cpu(ri->i_compress_flag) >> + COMPRESS_LEVEL_OFFSET; + switch (ri->i_compress_algorithm) { + case COMPRESS_LZO: +#ifdef CONFIG_F2FS_FS_LZO + if (clevel) + goto err_level; +#endif + break; + case COMPRESS_LZORLE: +#ifdef CONFIG_F2FS_FS_LZORLE + if (clevel) + goto err_level; +#endif + break; + case COMPRESS_LZ4: +#ifdef CONFIG_F2FS_FS_LZ4 +#ifdef CONFIG_F2FS_FS_LZ4HC + if (clevel && + (clevel < LZ4HC_MIN_CLEVEL || clevel > LZ4HC_MAX_CLEVEL)) + goto err_level; +#else + if (clevel) + goto err_level; +#endif +#endif + break; + case COMPRESS_ZSTD: +#ifdef CONFIG_F2FS_FS_ZSTD + if (clevel < zstd_min_clevel() || clevel > zstd_max_clevel()) + goto err_level; +#endif + break; + default: + goto err_level; + } + + return true; +err_level: + f2fs_warn(sbi, "%s: inode (ino=%lx) has unsupported compress level: %u, run fsck to fix", + __func__, inode->i_ino, clevel); +err: + set_sbi_flag(sbi, SBI_NEED_FSCK); + return false; +} + static bool sanity_check_inode(struct inode *inode, struct page *node_page) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); @@ -225,41 +301,77 @@ static bool sanity_check_inode(struct inode *inode, struct page *node_page) return false; } - if (f2fs_sb_has_flexible_inline_xattr(sbi) - && !f2fs_has_extra_attr(inode)) { + if (f2fs_has_extra_attr(inode)) { + if (!f2fs_sb_has_extra_attr(sbi)) { + set_sbi_flag(sbi, SBI_NEED_FSCK); + f2fs_warn(sbi, "%s: inode (ino=%lx) is with extra_attr, but extra_attr feature is off", + __func__, inode->i_ino); + return false; + } + if (fi->i_extra_isize > F2FS_TOTAL_EXTRA_ATTR_SIZE || + fi->i_extra_isize < F2FS_MIN_EXTRA_ATTR_SIZE || + fi->i_extra_isize % sizeof(__le32)) { + set_sbi_flag(sbi, SBI_NEED_FSCK); + f2fs_warn(sbi, "%s: inode (ino=%lx) has corrupted i_extra_isize: %d, max: %zu", + __func__, inode->i_ino, fi->i_extra_isize, + F2FS_TOTAL_EXTRA_ATTR_SIZE); + return false; + } + if (f2fs_sb_has_flexible_inline_xattr(sbi) && + f2fs_has_inline_xattr(inode) && + (!fi->i_inline_xattr_size || + fi->i_inline_xattr_size > MAX_INLINE_XATTR_SIZE)) { + set_sbi_flag(sbi, SBI_NEED_FSCK); + f2fs_warn(sbi, "%s: inode (ino=%lx) has corrupted i_inline_xattr_size: %d, max: %zu", + __func__, inode->i_ino, fi->i_inline_xattr_size, + MAX_INLINE_XATTR_SIZE); + return false; + } + if (f2fs_sb_has_compression(sbi) && + fi->i_flags & F2FS_COMPR_FL && + F2FS_FITS_IN_INODE(ri, fi->i_extra_isize, + i_compress_flag)) { + if (!sanity_check_compress_inode(inode, ri)) + return false; + } + } else if (f2fs_sb_has_flexible_inline_xattr(sbi)) { set_sbi_flag(sbi, SBI_NEED_FSCK); f2fs_warn(sbi, "%s: corrupted inode ino=%lx, run fsck to fix.", __func__, inode->i_ino); return false; } - if (f2fs_has_extra_attr(inode) && - !f2fs_sb_has_extra_attr(sbi)) { - set_sbi_flag(sbi, SBI_NEED_FSCK); - f2fs_warn(sbi, "%s: inode (ino=%lx) is with extra_attr, but extra_attr feature is off", - __func__, inode->i_ino); - return false; - } - - if (fi->i_extra_isize > F2FS_TOTAL_EXTRA_ATTR_SIZE || - fi->i_extra_isize % sizeof(__le32)) { - set_sbi_flag(sbi, SBI_NEED_FSCK); - f2fs_warn(sbi, "%s: inode (ino=%lx) has corrupted i_extra_isize: %d, max: %zu", - __func__, inode->i_ino, fi->i_extra_isize, - F2FS_TOTAL_EXTRA_ATTR_SIZE); - return false; - } - - if (f2fs_has_extra_attr(inode) && - f2fs_sb_has_flexible_inline_xattr(sbi) && - f2fs_has_inline_xattr(inode) && - (!fi->i_inline_xattr_size || - fi->i_inline_xattr_size > MAX_INLINE_XATTR_SIZE)) { - set_sbi_flag(sbi, SBI_NEED_FSCK); - f2fs_warn(sbi, "%s: inode (ino=%lx) has corrupted i_inline_xattr_size: %d, max: %zu", - __func__, inode->i_ino, fi->i_inline_xattr_size, - MAX_INLINE_XATTR_SIZE); - return false; + if (!f2fs_sb_has_extra_attr(sbi)) { + if (f2fs_sb_has_project_quota(sbi)) { + set_sbi_flag(sbi, SBI_NEED_FSCK); + f2fs_warn(sbi, "%s: corrupted inode ino=%lx, wrong feature flag: %u, run fsck to fix.", + __func__, inode->i_ino, F2FS_FEATURE_PRJQUOTA); + return false; + } + if (f2fs_sb_has_inode_chksum(sbi)) { + set_sbi_flag(sbi, SBI_NEED_FSCK); + f2fs_warn(sbi, "%s: corrupted inode ino=%lx, wrong feature flag: %u, run fsck to fix.", + __func__, inode->i_ino, F2FS_FEATURE_INODE_CHKSUM); + return false; + } + if (f2fs_sb_has_flexible_inline_xattr(sbi)) { + set_sbi_flag(sbi, SBI_NEED_FSCK); + f2fs_warn(sbi, "%s: corrupted inode ino=%lx, wrong feature flag: %u, run fsck to fix.", + __func__, inode->i_ino, F2FS_FEATURE_FLEXIBLE_INLINE_XATTR); + return false; + } + if (f2fs_sb_has_inode_crtime(sbi)) { + set_sbi_flag(sbi, SBI_NEED_FSCK); + f2fs_warn(sbi, "%s: corrupted inode ino=%lx, wrong feature flag: %u, run fsck to fix.", + __func__, inode->i_ino, F2FS_FEATURE_INODE_CRTIME); + return false; + } + if (f2fs_sb_has_compression(sbi)) { + set_sbi_flag(sbi, SBI_NEED_FSCK); + f2fs_warn(sbi, "%s: corrupted inode ino=%lx, wrong feature flag: %u, run fsck to fix.", + __func__, inode->i_ino, F2FS_FEATURE_COMPRESSION); + return false; + } } if (f2fs_sanity_check_inline_data(inode)) { @@ -283,39 +395,6 @@ static bool sanity_check_inode(struct inode *inode, struct page *node_page) return false; } - if (f2fs_has_extra_attr(inode) && f2fs_sb_has_compression(sbi) && - fi->i_flags & F2FS_COMPR_FL && - F2FS_FITS_IN_INODE(ri, fi->i_extra_isize, - i_log_cluster_size)) { - if (ri->i_compress_algorithm >= COMPRESS_MAX) { - set_sbi_flag(sbi, SBI_NEED_FSCK); - f2fs_warn(sbi, "%s: inode (ino=%lx) has unsupported " - "compress algorithm: %u, run fsck to fix", - __func__, inode->i_ino, - ri->i_compress_algorithm); - return false; - } - if (le64_to_cpu(ri->i_compr_blocks) > - SECTOR_TO_BLOCK(inode->i_blocks)) { - set_sbi_flag(sbi, SBI_NEED_FSCK); - f2fs_warn(sbi, "%s: inode (ino=%lx) has inconsistent " - "i_compr_blocks:%llu, i_blocks:%llu, run fsck to fix", - __func__, inode->i_ino, - le64_to_cpu(ri->i_compr_blocks), - SECTOR_TO_BLOCK(inode->i_blocks)); - return false; - } - if (ri->i_log_cluster_size < MIN_COMPRESS_LOG_SIZE || - ri->i_log_cluster_size > MAX_COMPRESS_LOG_SIZE) { - set_sbi_flag(sbi, SBI_NEED_FSCK); - f2fs_warn(sbi, "%s: inode (ino=%lx) has unsupported " - "log cluster size: %u, run fsck to fix", - __func__, inode->i_ino, - ri->i_log_cluster_size); - return false; - } - } - return true; } @@ -442,7 +521,7 @@ static int do_read_inode(struct inode *inode) if (f2fs_has_extra_attr(inode) && f2fs_sb_has_compression(sbi) && (fi->i_flags & F2FS_COMPR_FL)) { if (F2FS_FITS_IN_INODE(ri, fi->i_extra_isize, - i_log_cluster_size)) { + i_compress_flag)) { unsigned short compress_flag; atomic_set(&fi->i_compr_blocks, @@ -680,7 +759,7 @@ void f2fs_update_inode(struct inode *inode, struct page *node_page) if (f2fs_sb_has_compression(F2FS_I_SB(inode)) && F2FS_FITS_IN_INODE(ri, F2FS_I(inode)->i_extra_isize, - i_log_cluster_size)) { + i_compress_flag)) { unsigned short compress_flag; ri->i_compr_blocks = diff --git a/fs/f2fs/iostat.c b/fs/f2fs/iostat.c index 3d5bfb1ad585..f8703038e1d8 100644 --- a/fs/f2fs/iostat.c +++ b/fs/f2fs/iostat.c @@ -80,6 +80,7 @@ int __maybe_unused iostat_info_seq_show(struct seq_file *seq, void *offset) seq_puts(seq, "[OTHER]\n"); IOSTAT_INFO_SHOW("fs discard", FS_DISCARD_IO); IOSTAT_INFO_SHOW("fs flush", FS_FLUSH_IO); + IOSTAT_INFO_SHOW("fs zone reset", FS_ZONE_RESET_IO); return 0; } diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c index ad597b417fea..bee0568888da 100644 --- a/fs/f2fs/namei.c +++ b/fs/f2fs/namei.c @@ -23,7 +23,7 @@ #include <trace/events/f2fs.h> static inline bool is_extension_exist(const unsigned char *s, const char *sub, - bool tmp_ext) + bool tmp_ext, bool tmp_dot) { size_t slen = strlen(s); size_t sublen = strlen(sub); @@ -49,13 +49,27 @@ static inline bool is_extension_exist(const unsigned char *s, const char *sub, for (i = 1; i < slen - sublen; i++) { if (s[i] != '.') continue; - if (!strncasecmp(s + i + 1, sub, sublen)) - return true; + if (!strncasecmp(s + i + 1, sub, sublen)) { + if (!tmp_dot) + return true; + if (i == slen - sublen - 1 || s[i + 1 + sublen] == '.') + return true; + } } return false; } +static inline bool is_temperature_extension(const unsigned char *s, const char *sub) +{ + return is_extension_exist(s, sub, true, false); +} + +static inline bool is_compress_extension(const unsigned char *s, const char *sub) +{ + return is_extension_exist(s, sub, true, true); +} + int f2fs_update_extension_list(struct f2fs_sb_info *sbi, const char *name, bool hot, bool set) { @@ -148,7 +162,7 @@ static void set_compress_new_inode(struct f2fs_sb_info *sbi, struct inode *dir, cold_count = le32_to_cpu(sbi->raw_super->extension_count); hot_count = sbi->raw_super->hot_ext_count; for (i = cold_count; i < cold_count + hot_count; i++) - if (is_extension_exist(name, extlist[i], false)) + if (is_temperature_extension(name, extlist[i])) break; f2fs_up_read(&sbi->sb_lock); if (i < (cold_count + hot_count)) @@ -156,12 +170,12 @@ static void set_compress_new_inode(struct f2fs_sb_info *sbi, struct inode *dir, /* Don't compress unallowed extension. */ for (i = 0; i < noext_cnt; i++) - if (is_extension_exist(name, noext[i], false)) + if (is_compress_extension(name, noext[i])) return; /* Compress wanting extension. */ for (i = 0; i < ext_cnt; i++) { - if (is_extension_exist(name, ext[i], false)) { + if (is_compress_extension(name, ext[i])) { set_compress_context(inode); return; } @@ -189,7 +203,7 @@ static void set_file_temperature(struct f2fs_sb_info *sbi, struct inode *inode, cold_count = le32_to_cpu(sbi->raw_super->extension_count); hot_count = sbi->raw_super->hot_ext_count; for (i = 0; i < cold_count + hot_count; i++) - if (is_extension_exist(name, extlist[i], true)) + if (is_temperature_extension(name, extlist[i])) break; f2fs_up_read(&sbi->sb_lock); @@ -576,8 +590,8 @@ out_splice: } #endif new = d_splice_alias(inode, dentry); - err = PTR_ERR_OR_ZERO(new); - trace_f2fs_lookup_end(dir, dentry, ino, !new ? -ENOENT : err); + trace_f2fs_lookup_end(dir, !IS_ERR_OR_NULL(new) ? new : dentry, + ino, IS_ERR(new) ? PTR_ERR(new) : err); return new; out_iput: iput(inode); diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index bd1dad523796..ee2e1dd64f25 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -925,6 +925,7 @@ static int truncate_node(struct dnode_of_data *dn) static int truncate_dnode(struct dnode_of_data *dn) { + struct f2fs_sb_info *sbi = F2FS_I_SB(dn->inode); struct page *page; int err; @@ -932,19 +933,30 @@ static int truncate_dnode(struct dnode_of_data *dn) return 1; /* get direct node */ - page = f2fs_get_node_page(F2FS_I_SB(dn->inode), dn->nid); + page = f2fs_get_node_page(sbi, dn->nid); if (PTR_ERR(page) == -ENOENT) return 1; else if (IS_ERR(page)) return PTR_ERR(page); + if (IS_INODE(page) || ino_of_node(page) != dn->inode->i_ino) { + f2fs_err(sbi, "incorrect node reference, ino: %lu, nid: %u, ino_of_node: %u", + dn->inode->i_ino, dn->nid, ino_of_node(page)); + set_sbi_flag(sbi, SBI_NEED_FSCK); + f2fs_handle_error(sbi, ERROR_INVALID_NODE_REFERENCE); + f2fs_put_page(page, 1); + return -EFSCORRUPTED; + } + /* Make dnode_of_data for parameter */ dn->node_page = page; dn->ofs_in_node = 0; - f2fs_truncate_data_blocks(dn); + f2fs_truncate_data_blocks_range(dn, ADDRS_PER_BLOCK(dn->inode)); err = truncate_node(dn); - if (err) + if (err) { + f2fs_put_page(page, 1); return err; + } return 1; } @@ -1596,6 +1608,9 @@ static int __write_node_page(struct page *page, bool atomic, bool *submitted, trace_f2fs_writepage(page, NODE); if (unlikely(f2fs_cp_error(sbi))) { + /* keep node pages in remount-ro mode */ + if (F2FS_OPTION(sbi).errors == MOUNT_ERRORS_READONLY) + goto redirty_out; ClearPageUptodate(page); dec_page_count(sbi, F2FS_DIRTY_NODES); unlock_page(page); @@ -2063,7 +2078,6 @@ int f2fs_wait_on_node_pages_writeback(struct f2fs_sb_info *sbi, struct list_head *head = &sbi->fsync_node_list; unsigned long flags; unsigned int cur_seq_id = 0; - int ret2, ret = 0; while (seq_id && cur_seq_id < seq_id) { spin_lock_irqsave(&sbi->fsync_node_lock, flags); @@ -2084,16 +2098,9 @@ int f2fs_wait_on_node_pages_writeback(struct f2fs_sb_info *sbi, f2fs_wait_on_page_writeback(page, NODE, true, false); put_page(page); - - if (ret) - break; } - ret2 = filemap_check_errors(NODE_MAPPING(sbi)); - if (!ret) - ret = ret2; - - return ret; + return filemap_check_errors(NODE_MAPPING(sbi)); } static int f2fs_write_node_pages(struct address_space *mapping, @@ -3065,7 +3072,7 @@ int f2fs_flush_nat_entries(struct f2fs_sb_info *sbi, struct cp_control *cpc) struct f2fs_nm_info *nm_i = NM_I(sbi); struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_HOT_DATA); struct f2fs_journal *journal = curseg->journal; - struct nat_entry_set *setvec[SETVEC_SIZE]; + struct nat_entry_set *setvec[NAT_VEC_SIZE]; struct nat_entry_set *set, *tmp; unsigned int found; nid_t set_idx = 0; @@ -3098,7 +3105,7 @@ int f2fs_flush_nat_entries(struct f2fs_sb_info *sbi, struct cp_control *cpc) remove_nats_in_journal(sbi); while ((found = __gang_lookup_nat_set(nm_i, - set_idx, SETVEC_SIZE, setvec))) { + set_idx, NAT_VEC_SIZE, setvec))) { unsigned idx; set_idx = setvec[found - 1]->set + 1; @@ -3319,8 +3326,9 @@ void f2fs_destroy_node_manager(struct f2fs_sb_info *sbi) { struct f2fs_nm_info *nm_i = NM_I(sbi); struct free_nid *i, *next_i; - struct nat_entry *natvec[NATVEC_SIZE]; - struct nat_entry_set *setvec[SETVEC_SIZE]; + void *vec[NAT_VEC_SIZE]; + struct nat_entry **natvec = (struct nat_entry **)vec; + struct nat_entry_set **setvec = (struct nat_entry_set **)vec; nid_t nid = 0; unsigned int found; @@ -3343,7 +3351,7 @@ void f2fs_destroy_node_manager(struct f2fs_sb_info *sbi) /* destroy nat cache */ f2fs_down_write(&nm_i->nat_tree_lock); while ((found = __gang_lookup_nat_cache(nm_i, - nid, NATVEC_SIZE, natvec))) { + nid, NAT_VEC_SIZE, natvec))) { unsigned idx; nid = nat_get_nid(natvec[found - 1]) + 1; @@ -3359,8 +3367,9 @@ void f2fs_destroy_node_manager(struct f2fs_sb_info *sbi) /* destroy nat set cache */ nid = 0; + memset(vec, 0, sizeof(void *) * NAT_VEC_SIZE); while ((found = __gang_lookup_nat_set(nm_i, - nid, SETVEC_SIZE, setvec))) { + nid, NAT_VEC_SIZE, setvec))) { unsigned idx; nid = setvec[found - 1]->set + 1; diff --git a/fs/f2fs/node.h b/fs/f2fs/node.h index 906fb67a99da..5bd16a95eef8 100644 --- a/fs/f2fs/node.h +++ b/fs/f2fs/node.h @@ -35,8 +35,7 @@ #define DEF_RF_NODE_BLOCKS 0 /* vector size for gang look-up from nat cache that consists of radix tree */ -#define NATVEC_SIZE 64 -#define SETVEC_SIZE 32 +#define NAT_VEC_SIZE 32 /* return value for read_node_page */ #define LOCKED_PAGE 1 diff --git a/fs/f2fs/recovery.c b/fs/f2fs/recovery.c index 58c1a0096f7d..4e7d4ceeb084 100644 --- a/fs/f2fs/recovery.c +++ b/fs/f2fs/recovery.c @@ -360,21 +360,63 @@ static unsigned int adjust_por_ra_blocks(struct f2fs_sb_info *sbi, return ra_blocks; } +/* Detect looped node chain with Floyd's cycle detection algorithm. */ +static int sanity_check_node_chain(struct f2fs_sb_info *sbi, block_t blkaddr, + block_t *blkaddr_fast, bool *is_detecting) +{ + unsigned int ra_blocks = RECOVERY_MAX_RA_BLOCKS; + struct page *page = NULL; + int i; + + if (!*is_detecting) + return 0; + + for (i = 0; i < 2; i++) { + if (!f2fs_is_valid_blkaddr(sbi, *blkaddr_fast, META_POR)) { + *is_detecting = false; + return 0; + } + + page = f2fs_get_tmp_page(sbi, *blkaddr_fast); + if (IS_ERR(page)) + return PTR_ERR(page); + + if (!is_recoverable_dnode(page)) { + f2fs_put_page(page, 1); + *is_detecting = false; + return 0; + } + + ra_blocks = adjust_por_ra_blocks(sbi, ra_blocks, *blkaddr_fast, + next_blkaddr_of_node(page)); + + *blkaddr_fast = next_blkaddr_of_node(page); + f2fs_put_page(page, 1); + + f2fs_ra_meta_pages_cond(sbi, *blkaddr_fast, ra_blocks); + } + + if (*blkaddr_fast == blkaddr) { + f2fs_notice(sbi, "%s: Detect looped node chain on blkaddr:%u." + " Run fsck to fix it.", __func__, blkaddr); + return -EINVAL; + } + return 0; +} + static int find_fsync_dnodes(struct f2fs_sb_info *sbi, struct list_head *head, bool check_only) { struct curseg_info *curseg; struct page *page = NULL; - block_t blkaddr; - unsigned int loop_cnt = 0; - unsigned int ra_blocks = RECOVERY_MAX_RA_BLOCKS; - unsigned int free_blocks = MAIN_SEGS(sbi) * sbi->blocks_per_seg - - valid_user_blocks(sbi); + block_t blkaddr, blkaddr_fast; + bool is_detecting = true; int err = 0; /* get node pages in the current segment */ curseg = CURSEG_I(sbi, CURSEG_WARM_NODE); blkaddr = NEXT_FREE_BLKADDR(sbi, curseg); + blkaddr_fast = blkaddr; while (1) { struct fsync_inode_entry *entry; @@ -418,10 +460,8 @@ static int find_fsync_dnodes(struct f2fs_sb_info *sbi, struct list_head *head, quota_inode); if (IS_ERR(entry)) { err = PTR_ERR(entry); - if (err == -ENOENT) { - err = 0; + if (err == -ENOENT) goto next; - } f2fs_put_page(page, 1); break; } @@ -431,25 +471,14 @@ static int find_fsync_dnodes(struct f2fs_sb_info *sbi, struct list_head *head, if (IS_INODE(page) && is_dent_dnode(page)) entry->last_dentry = blkaddr; next: - /* sanity check in order to detect looped node chain */ - if (++loop_cnt >= free_blocks || - blkaddr == next_blkaddr_of_node(page)) { - f2fs_notice(sbi, "%s: detect looped node chain, blkaddr:%u, next:%u", - __func__, blkaddr, - next_blkaddr_of_node(page)); - f2fs_put_page(page, 1); - err = -EINVAL; - break; - } - - ra_blocks = adjust_por_ra_blocks(sbi, ra_blocks, blkaddr, - next_blkaddr_of_node(page)); - /* check next segment */ blkaddr = next_blkaddr_of_node(page); f2fs_put_page(page, 1); - f2fs_ra_meta_pages_cond(sbi, blkaddr, ra_blocks); + err = sanity_check_node_chain(sbi, blkaddr, &blkaddr_fast, + &is_detecting); + if (err) + break; } return err; } diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 6db410f1bb8c..0457d620011f 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -1196,6 +1196,45 @@ static void __init_discard_policy(struct f2fs_sb_info *sbi, static void __update_discard_tree_range(struct f2fs_sb_info *sbi, struct block_device *bdev, block_t lstart, block_t start, block_t len); + +#ifdef CONFIG_BLK_DEV_ZONED +static void __submit_zone_reset_cmd(struct f2fs_sb_info *sbi, + struct discard_cmd *dc, blk_opf_t flag, + struct list_head *wait_list, + unsigned int *issued) +{ + struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; + struct block_device *bdev = dc->bdev; + struct bio *bio = bio_alloc(bdev, 0, REQ_OP_ZONE_RESET | flag, GFP_NOFS); + unsigned long flags; + + trace_f2fs_issue_reset_zone(bdev, dc->di.start); + + spin_lock_irqsave(&dc->lock, flags); + dc->state = D_SUBMIT; + dc->bio_ref++; + spin_unlock_irqrestore(&dc->lock, flags); + + if (issued) + (*issued)++; + + atomic_inc(&dcc->queued_discard); + dc->queued++; + list_move_tail(&dc->list, wait_list); + + /* sanity check on discard range */ + __check_sit_bitmap(sbi, dc->di.lstart, dc->di.lstart + dc->di.len); + + bio->bi_iter.bi_sector = SECTOR_FROM_BLOCK(dc->di.start); + bio->bi_private = dc; + bio->bi_end_io = f2fs_submit_discard_endio; + submit_bio(bio); + + atomic_inc(&dcc->issued_discard); + f2fs_update_iostat(sbi, NULL, FS_ZONE_RESET_IO, dc->di.len * F2FS_BLKSIZE); +} +#endif + /* this function is copied from blkdev_issue_discard from block/blk-lib.c */ static int __submit_discard_cmd(struct f2fs_sb_info *sbi, struct discard_policy *dpolicy, @@ -1217,6 +1256,13 @@ static int __submit_discard_cmd(struct f2fs_sb_info *sbi, if (is_sbi_flag_set(sbi, SBI_NEED_FSCK)) return 0; +#ifdef CONFIG_BLK_DEV_ZONED + if (f2fs_sb_has_blkzoned(sbi) && bdev_is_zoned(bdev)) { + __submit_zone_reset_cmd(sbi, dc, flag, wait_list, issued); + return 0; + } +#endif + trace_f2fs_issue_discard(bdev, dc->di.start, dc->di.len); lstart = dc->di.lstart; @@ -1461,6 +1507,19 @@ static void __update_discard_tree_range(struct f2fs_sb_info *sbi, } } +#ifdef CONFIG_BLK_DEV_ZONED +static void __queue_zone_reset_cmd(struct f2fs_sb_info *sbi, + struct block_device *bdev, block_t blkstart, block_t lblkstart, + block_t blklen) +{ + trace_f2fs_queue_reset_zone(bdev, blkstart); + + mutex_lock(&SM_I(sbi)->dcc_info->cmd_lock); + __insert_discard_cmd(sbi, bdev, lblkstart, blkstart, blklen); + mutex_unlock(&SM_I(sbi)->dcc_info->cmd_lock); +} +#endif + static void __queue_discard_cmd(struct f2fs_sb_info *sbi, struct block_device *bdev, block_t blkstart, block_t blklen) { @@ -1724,6 +1783,19 @@ static void f2fs_wait_discard_bio(struct f2fs_sb_info *sbi, block_t blkaddr) mutex_lock(&dcc->cmd_lock); dc = __lookup_discard_cmd(sbi, blkaddr); +#ifdef CONFIG_BLK_DEV_ZONED + if (dc && f2fs_sb_has_blkzoned(sbi) && bdev_is_zoned(dc->bdev)) { + /* force submit zone reset */ + if (dc->state == D_PREP) + __submit_zone_reset_cmd(sbi, dc, REQ_SYNC, + &dcc->wait_list, NULL); + dc->ref++; + mutex_unlock(&dcc->cmd_lock); + /* wait zone reset */ + __wait_one_discard_bio(sbi, dc); + return; + } +#endif if (dc) { if (dc->state == D_PREP) { __punch_discard_cmd(sbi, dc, blkaddr); @@ -1876,9 +1948,15 @@ static int __f2fs_issue_discard_zone(struct f2fs_sb_info *sbi, blkstart, blklen); return -EIO; } - trace_f2fs_issue_reset_zone(bdev, blkstart); - return blkdev_zone_mgmt(bdev, REQ_OP_ZONE_RESET, - sector, nr_sects, GFP_NOFS); + + if (unlikely(is_sbi_flag_set(sbi, SBI_POR_DOING))) { + trace_f2fs_issue_reset_zone(bdev, blkstart); + return blkdev_zone_mgmt(bdev, REQ_OP_ZONE_RESET, + sector, nr_sects, GFP_NOFS); + } + + __queue_zone_reset_cmd(sbi, bdev, blkstart, lblkstart, blklen); + return 0; } /* For conventional zones, use regular discard if supported */ @@ -2115,7 +2193,7 @@ find_next: len = next_pos - cur_pos; if (f2fs_sb_has_blkzoned(sbi) || - (force && len < cpc->trim_minlen)) + !force || len < cpc->trim_minlen) goto skip; f2fs_issue_discard(sbi, entry->start_blkaddr + cur_pos, @@ -4768,17 +4846,17 @@ static int check_zone_write_pointer(struct f2fs_sb_info *sbi, { unsigned int wp_segno, wp_blkoff, zone_secno, zone_segno, segno; block_t zone_block, wp_block, last_valid_block; - unsigned int log_sectors_per_block = sbi->log_blocksize - SECTOR_SHIFT; int i, s, b, ret; struct seg_entry *se; if (zone->type != BLK_ZONE_TYPE_SEQWRITE_REQ) return 0; - wp_block = fdev->start_blk + (zone->wp >> log_sectors_per_block); + wp_block = fdev->start_blk + (zone->wp >> sbi->log_sectors_per_block); wp_segno = GET_SEGNO(sbi, wp_block); wp_blkoff = wp_block - START_BLOCK(sbi, wp_segno); - zone_block = fdev->start_blk + (zone->start >> log_sectors_per_block); + zone_block = fdev->start_blk + (zone->start >> + sbi->log_sectors_per_block); zone_segno = GET_SEGNO(sbi, zone_block); zone_secno = GET_SEC_FROM_SEG(sbi, zone_segno); @@ -4811,39 +4889,52 @@ static int check_zone_write_pointer(struct f2fs_sb_info *sbi, } /* - * If last valid block is beyond the write pointer, report the - * inconsistency. This inconsistency does not cause write error - * because the zone will not be selected for write operation until - * it get discarded. Just report it. + * The write pointer matches with the valid blocks or + * already points to the end of the zone. */ - if (last_valid_block >= wp_block) { - f2fs_notice(sbi, "Valid block beyond write pointer: " - "valid block[0x%x,0x%x] wp[0x%x,0x%x]", - GET_SEGNO(sbi, last_valid_block), - GET_BLKOFF_FROM_SEG0(sbi, last_valid_block), - wp_segno, wp_blkoff); + if ((last_valid_block + 1 == wp_block) || + (zone->wp == zone->start + zone->len)) return 0; - } - /* - * If there is no valid block in the zone and if write pointer is - * not at zone start, reset the write pointer. - */ - if (last_valid_block + 1 == zone_block && zone->wp != zone->start) { + if (last_valid_block + 1 == zone_block) { + /* + * If there is no valid block in the zone and if write pointer + * is not at zone start, reset the write pointer. + */ f2fs_notice(sbi, "Zone without valid block has non-zero write " "pointer. Reset the write pointer: wp[0x%x,0x%x]", wp_segno, wp_blkoff); ret = __f2fs_issue_discard_zone(sbi, fdev->bdev, zone_block, - zone->len >> log_sectors_per_block); - if (ret) { + zone->len >> sbi->log_sectors_per_block); + if (ret) f2fs_err(sbi, "Discard zone failed: %s (errno=%d)", fdev->path, ret); - return ret; - } + + return ret; } - return 0; + /* + * If there are valid blocks and the write pointer doesn't + * match with them, we need to report the inconsistency and + * fill the zone till the end to close the zone. This inconsistency + * does not cause write error because the zone will not be selected + * for write operation until it get discarded. + */ + f2fs_notice(sbi, "Valid blocks are not aligned with write pointer: " + "valid block[0x%x,0x%x] wp[0x%x,0x%x]", + GET_SEGNO(sbi, last_valid_block), + GET_BLKOFF_FROM_SEG0(sbi, last_valid_block), + wp_segno, wp_blkoff); + + ret = blkdev_issue_zeroout(fdev->bdev, zone->wp, + zone->len - (zone->wp - zone->start), + GFP_NOFS, 0); + if (ret) + f2fs_err(sbi, "Fill up zone failed: %s (errno=%d)", + fdev->path, ret); + + return ret; } static struct f2fs_dev_info *get_target_zoned_dev(struct f2fs_sb_info *sbi, @@ -4876,7 +4967,6 @@ static int fix_curseg_write_pointer(struct f2fs_sb_info *sbi, int type) struct blk_zone zone; unsigned int cs_section, wp_segno, wp_blkoff, wp_sector_off; block_t cs_zone_block, wp_block; - unsigned int log_sectors_per_block = sbi->log_blocksize - SECTOR_SHIFT; sector_t zone_sector; int err; @@ -4888,8 +4978,8 @@ static int fix_curseg_write_pointer(struct f2fs_sb_info *sbi, int type) return 0; /* report zone for the sector the curseg points to */ - zone_sector = (sector_t)(cs_zone_block - zbd->start_blk) - << log_sectors_per_block; + zone_sector = (sector_t)(cs_zone_block - zbd->start_blk) << + sbi->log_sectors_per_block; err = blkdev_report_zones(zbd->bdev, zone_sector, 1, report_one_zone_cb, &zone); if (err != 1) { @@ -4901,10 +4991,10 @@ static int fix_curseg_write_pointer(struct f2fs_sb_info *sbi, int type) if (zone.type != BLK_ZONE_TYPE_SEQWRITE_REQ) return 0; - wp_block = zbd->start_blk + (zone.wp >> log_sectors_per_block); + wp_block = zbd->start_blk + (zone.wp >> sbi->log_sectors_per_block); wp_segno = GET_SEGNO(sbi, wp_block); wp_blkoff = wp_block - START_BLOCK(sbi, wp_segno); - wp_sector_off = zone.wp & GENMASK(log_sectors_per_block - 1, 0); + wp_sector_off = zone.wp & GENMASK(sbi->log_sectors_per_block - 1, 0); if (cs->segno == wp_segno && cs->next_blkoff == wp_blkoff && wp_sector_off == 0) @@ -4931,8 +5021,8 @@ static int fix_curseg_write_pointer(struct f2fs_sb_info *sbi, int type) if (!zbd) return 0; - zone_sector = (sector_t)(cs_zone_block - zbd->start_blk) - << log_sectors_per_block; + zone_sector = (sector_t)(cs_zone_block - zbd->start_blk) << + sbi->log_sectors_per_block; err = blkdev_report_zones(zbd->bdev, zone_sector, 1, report_one_zone_cb, &zone); if (err != 1) { @@ -4950,7 +5040,7 @@ static int fix_curseg_write_pointer(struct f2fs_sb_info *sbi, int type) "Reset the zone: curseg[0x%x,0x%x]", type, cs->segno, cs->next_blkoff); err = __f2fs_issue_discard_zone(sbi, zbd->bdev, cs_zone_block, - zone.len >> log_sectors_per_block); + zone.len >> sbi->log_sectors_per_block); if (err) { f2fs_err(sbi, "Discard zone failed: %s (errno=%d)", zbd->path, err); diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index e34197a70dc1..ca31163da00a 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -164,6 +164,7 @@ enum { Opt_discard_unit, Opt_memory_mode, Opt_age_extent_cache, + Opt_errors, Opt_err, }; @@ -243,6 +244,7 @@ static match_table_t f2fs_tokens = { {Opt_discard_unit, "discard_unit=%s"}, {Opt_memory_mode, "memory=%s"}, {Opt_age_extent_cache, "age_extent_cache"}, + {Opt_errors, "errors=%s"}, {Opt_err, NULL}, }; @@ -587,14 +589,12 @@ static int f2fs_set_lz4hc_level(struct f2fs_sb_info *sbi, const char *str) { #ifdef CONFIG_F2FS_FS_LZ4HC unsigned int level; -#endif if (strlen(str) == 3) { - F2FS_OPTION(sbi).compress_level = 0; + F2FS_OPTION(sbi).compress_level = LZ4HC_DEFAULT_CLEVEL; return 0; } -#ifdef CONFIG_F2FS_FS_LZ4HC str += 3; if (str[0] != ':') { @@ -604,7 +604,7 @@ static int f2fs_set_lz4hc_level(struct f2fs_sb_info *sbi, const char *str) if (kstrtouint(str + 1, 10, &level)) return -EINVAL; - if (level < LZ4HC_MIN_CLEVEL || level > LZ4HC_MAX_CLEVEL) { + if (!f2fs_is_compress_level_valid(COMPRESS_LZ4, level)) { f2fs_info(sbi, "invalid lz4hc compress level: %d", level); return -EINVAL; } @@ -612,6 +612,10 @@ static int f2fs_set_lz4hc_level(struct f2fs_sb_info *sbi, const char *str) F2FS_OPTION(sbi).compress_level = level; return 0; #else + if (strlen(str) == 3) { + F2FS_OPTION(sbi).compress_level = 0; + return 0; + } f2fs_info(sbi, "kernel doesn't support lz4hc compression"); return -EINVAL; #endif @@ -625,7 +629,7 @@ static int f2fs_set_zstd_level(struct f2fs_sb_info *sbi, const char *str) int len = 4; if (strlen(str) == len) { - F2FS_OPTION(sbi).compress_level = 0; + F2FS_OPTION(sbi).compress_level = F2FS_ZSTD_DEFAULT_CLEVEL; return 0; } @@ -638,7 +642,7 @@ static int f2fs_set_zstd_level(struct f2fs_sb_info *sbi, const char *str) if (kstrtouint(str + 1, 10, &level)) return -EINVAL; - if (!level || level > zstd_max_clevel()) { + if (!f2fs_is_compress_level_valid(COMPRESS_ZSTD, level)) { f2fs_info(sbi, "invalid zstd compress level: %d", level); return -EINVAL; } @@ -1268,6 +1272,25 @@ static int parse_options(struct super_block *sb, char *options, bool is_remount) case Opt_age_extent_cache: set_opt(sbi, AGE_EXTENT_CACHE); break; + case Opt_errors: + name = match_strdup(&args[0]); + if (!name) + return -ENOMEM; + if (!strcmp(name, "remount-ro")) { + F2FS_OPTION(sbi).errors = + MOUNT_ERRORS_READONLY; + } else if (!strcmp(name, "continue")) { + F2FS_OPTION(sbi).errors = + MOUNT_ERRORS_CONTINUE; + } else if (!strcmp(name, "panic")) { + F2FS_OPTION(sbi).errors = + MOUNT_ERRORS_PANIC; + } else { + kfree(name); + return -EINVAL; + } + kfree(name); + break; default: f2fs_err(sbi, "Unrecognized mount option \"%s\" or missing value", p); @@ -1340,7 +1363,7 @@ default_check: return -EINVAL; } - min_size = sizeof(struct f2fs_xattr_header) / sizeof(__le32); + min_size = MIN_INLINE_XATTR_SIZE; max_size = MAX_INLINE_XATTR_SIZE; if (F2FS_OPTION(sbi).inline_xattr_size < min_size || @@ -1550,6 +1573,7 @@ static void f2fs_put_super(struct super_block *sb) { struct f2fs_sb_info *sbi = F2FS_SB(sb); int i; + int err = 0; bool done; /* unregister procfs/sysfs entries in advance to avoid race case */ @@ -1576,7 +1600,7 @@ static void f2fs_put_super(struct super_block *sb) struct cp_control cpc = { .reason = CP_UMOUNT, }; - f2fs_write_checkpoint(sbi, &cpc); + err = f2fs_write_checkpoint(sbi, &cpc); } /* be sure to wait for any on-going discard commands */ @@ -1585,7 +1609,7 @@ static void f2fs_put_super(struct super_block *sb) struct cp_control cpc = { .reason = CP_UMOUNT | CP_TRIMMED, }; - f2fs_write_checkpoint(sbi, &cpc); + err = f2fs_write_checkpoint(sbi, &cpc); } /* @@ -1602,6 +1626,19 @@ static void f2fs_put_super(struct super_block *sb) f2fs_wait_on_all_pages(sbi, F2FS_WB_CP_DATA); + if (err) { + truncate_inode_pages_final(NODE_MAPPING(sbi)); + truncate_inode_pages_final(META_MAPPING(sbi)); + } + + for (i = 0; i < NR_COUNT_TYPE; i++) { + if (!get_pages(sbi, i)) + continue; + f2fs_err(sbi, "detect filesystem reference count leak during " + "umount, type: %d, count: %lld", i, get_pages(sbi, i)); + f2fs_bug_on(sbi, 1); + } + f2fs_bug_on(sbi, sbi->fsync_node_num); f2fs_destroy_compress_inode(sbi); @@ -1622,6 +1659,9 @@ static void f2fs_put_super(struct super_block *sb) f2fs_destroy_node_manager(sbi); f2fs_destroy_segment_manager(sbi); + /* flush s_error_work before sbi destroy */ + flush_work(&sbi->s_error_work); + f2fs_destroy_post_read_wq(sbi); kvfree(sbi->ckpt); @@ -2052,12 +2092,32 @@ static int f2fs_show_options(struct seq_file *seq, struct dentry *root) else if (F2FS_OPTION(sbi).memory_mode == MEMORY_MODE_LOW) seq_printf(seq, ",memory=%s", "low"); + if (F2FS_OPTION(sbi).errors == MOUNT_ERRORS_READONLY) + seq_printf(seq, ",errors=%s", "remount-ro"); + else if (F2FS_OPTION(sbi).errors == MOUNT_ERRORS_CONTINUE) + seq_printf(seq, ",errors=%s", "continue"); + else if (F2FS_OPTION(sbi).errors == MOUNT_ERRORS_PANIC) + seq_printf(seq, ",errors=%s", "panic"); + return 0; } -static void default_options(struct f2fs_sb_info *sbi) +static void default_options(struct f2fs_sb_info *sbi, bool remount) { /* init some FS parameters */ + if (!remount) { + set_opt(sbi, READ_EXTENT_CACHE); + clear_opt(sbi, DISABLE_CHECKPOINT); + + if (f2fs_hw_support_discard(sbi) || f2fs_hw_should_discard(sbi)) + set_opt(sbi, DISCARD); + + if (f2fs_sb_has_blkzoned(sbi)) + F2FS_OPTION(sbi).discard_unit = DISCARD_UNIT_SECTION; + else + F2FS_OPTION(sbi).discard_unit = DISCARD_UNIT_BLOCK; + } + if (f2fs_sb_has_readonly(sbi)) F2FS_OPTION(sbi).active_logs = NR_CURSEG_RO_TYPE; else @@ -2080,29 +2140,23 @@ static void default_options(struct f2fs_sb_info *sbi) } F2FS_OPTION(sbi).bggc_mode = BGGC_MODE_ON; F2FS_OPTION(sbi).memory_mode = MEMORY_MODE_NORMAL; + F2FS_OPTION(sbi).errors = MOUNT_ERRORS_CONTINUE; sbi->sb->s_flags &= ~SB_INLINECRYPT; set_opt(sbi, INLINE_XATTR); set_opt(sbi, INLINE_DATA); set_opt(sbi, INLINE_DENTRY); - set_opt(sbi, READ_EXTENT_CACHE); set_opt(sbi, NOHEAP); - clear_opt(sbi, DISABLE_CHECKPOINT); set_opt(sbi, MERGE_CHECKPOINT); F2FS_OPTION(sbi).unusable_cap = 0; sbi->sb->s_flags |= SB_LAZYTIME; if (!f2fs_is_readonly(sbi)) set_opt(sbi, FLUSH_MERGE); - if (f2fs_hw_support_discard(sbi) || f2fs_hw_should_discard(sbi)) - set_opt(sbi, DISCARD); - if (f2fs_sb_has_blkzoned(sbi)) { + if (f2fs_sb_has_blkzoned(sbi)) F2FS_OPTION(sbi).fs_mode = FS_MODE_LFS; - F2FS_OPTION(sbi).discard_unit = DISCARD_UNIT_SECTION; - } else { + else F2FS_OPTION(sbi).fs_mode = FS_MODE_ADAPTIVE; - F2FS_OPTION(sbi).discard_unit = DISCARD_UNIT_BLOCK; - } #ifdef CONFIG_F2FS_FS_XATTR set_opt(sbi, XATTR_USER); @@ -2274,13 +2328,16 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data) clear_sbi_flag(sbi, SBI_NEED_SB_WRITE); } - default_options(sbi); + default_options(sbi, true); /* parse mount options */ err = parse_options(sb, data, true); if (err) goto restore_opts; + /* flush outstanding errors before changing fs state */ + flush_work(&sbi->s_error_work); + /* * Previous and new state of filesystem is RO, * so skip checking GC and FLUSH_MERGE conditions. @@ -2713,6 +2770,7 @@ static int f2fs_quota_enable(struct super_block *sb, int type, int format_id, { struct inode *qf_inode; unsigned long qf_inum; + unsigned long qf_flag = F2FS_QUOTA_DEFAULT_FL; int err; BUG_ON(!f2fs_sb_has_quota_ino(F2FS_SB(sb))); @@ -2728,7 +2786,15 @@ static int f2fs_quota_enable(struct super_block *sb, int type, int format_id, } /* Don't account quota for quota files to avoid recursion */ + inode_lock(qf_inode); qf_inode->i_flags |= S_NOQUOTA; + + if ((F2FS_I(qf_inode)->i_flags & qf_flag) != qf_flag) { + F2FS_I(qf_inode)->i_flags |= qf_flag; + f2fs_set_inode_flags(qf_inode); + } + inode_unlock(qf_inode); + err = dquot_load_quota_inode(qf_inode, type, format_id, flags); iput(qf_inode); return err; @@ -2859,18 +2925,29 @@ static int f2fs_quota_on(struct super_block *sb, int type, int format_id, return -EBUSY; } + if (path->dentry->d_sb != sb) + return -EXDEV; + err = f2fs_quota_sync(sb, type); if (err) return err; - err = dquot_quota_on(sb, type, format_id, path); + inode = d_inode(path->dentry); + + err = filemap_fdatawrite(inode->i_mapping); if (err) return err; - inode = d_inode(path->dentry); + err = filemap_fdatawait(inode->i_mapping); + if (err) + return err; + + err = dquot_quota_on(sb, type, format_id, path); + if (err) + return err; inode_lock(inode); - F2FS_I(inode)->i_flags |= F2FS_NOATIME_FL | F2FS_IMMUTABLE_FL; + F2FS_I(inode)->i_flags |= F2FS_QUOTA_DEFAULT_FL; f2fs_set_inode_flags(inode); inode_unlock(inode); f2fs_mark_inode_dirty_sync(inode, false); @@ -2895,7 +2972,7 @@ static int __f2fs_quota_off(struct super_block *sb, int type) goto out_put; inode_lock(inode); - F2FS_I(inode)->i_flags &= ~(F2FS_NOATIME_FL | F2FS_IMMUTABLE_FL); + F2FS_I(inode)->i_flags &= ~F2FS_QUOTA_DEFAULT_FL; f2fs_set_inode_flags(inode); inode_unlock(inode); f2fs_mark_inode_dirty_sync(inode, false); @@ -3926,55 +4003,73 @@ int f2fs_commit_super(struct f2fs_sb_info *sbi, bool recover) return err; } -void f2fs_handle_stop(struct f2fs_sb_info *sbi, unsigned char reason) +static void save_stop_reason(struct f2fs_sb_info *sbi, unsigned char reason) +{ + unsigned long flags; + + spin_lock_irqsave(&sbi->error_lock, flags); + if (sbi->stop_reason[reason] < GENMASK(BITS_PER_BYTE - 1, 0)) + sbi->stop_reason[reason]++; + spin_unlock_irqrestore(&sbi->error_lock, flags); +} + +static void f2fs_record_stop_reason(struct f2fs_sb_info *sbi) { struct f2fs_super_block *raw_super = F2FS_RAW_SUPER(sbi); + unsigned long flags; int err; f2fs_down_write(&sbi->sb_lock); - if (raw_super->s_stop_reason[reason] < GENMASK(BITS_PER_BYTE - 1, 0)) - raw_super->s_stop_reason[reason]++; + spin_lock_irqsave(&sbi->error_lock, flags); + if (sbi->error_dirty) { + memcpy(F2FS_RAW_SUPER(sbi)->s_errors, sbi->errors, + MAX_F2FS_ERRORS); + sbi->error_dirty = false; + } + memcpy(raw_super->s_stop_reason, sbi->stop_reason, MAX_STOP_REASON); + spin_unlock_irqrestore(&sbi->error_lock, flags); err = f2fs_commit_super(sbi, false); - if (err) - f2fs_err(sbi, "f2fs_commit_super fails to record reason:%u err:%d", - reason, err); + f2fs_up_write(&sbi->sb_lock); + if (err) + f2fs_err(sbi, "f2fs_commit_super fails to record err:%d", err); } void f2fs_save_errors(struct f2fs_sb_info *sbi, unsigned char flag) { - spin_lock(&sbi->error_lock); + unsigned long flags; + + spin_lock_irqsave(&sbi->error_lock, flags); if (!test_bit(flag, (unsigned long *)sbi->errors)) { set_bit(flag, (unsigned long *)sbi->errors); sbi->error_dirty = true; } - spin_unlock(&sbi->error_lock); + spin_unlock_irqrestore(&sbi->error_lock, flags); } static bool f2fs_update_errors(struct f2fs_sb_info *sbi) { + unsigned long flags; bool need_update = false; - spin_lock(&sbi->error_lock); + spin_lock_irqsave(&sbi->error_lock, flags); if (sbi->error_dirty) { memcpy(F2FS_RAW_SUPER(sbi)->s_errors, sbi->errors, MAX_F2FS_ERRORS); sbi->error_dirty = false; need_update = true; } - spin_unlock(&sbi->error_lock); + spin_unlock_irqrestore(&sbi->error_lock, flags); return need_update; } -void f2fs_handle_error(struct f2fs_sb_info *sbi, unsigned char error) +static void f2fs_record_errors(struct f2fs_sb_info *sbi, unsigned char error) { int err; - f2fs_save_errors(sbi, error); - f2fs_down_write(&sbi->sb_lock); if (!f2fs_update_errors(sbi)) @@ -3988,6 +4083,83 @@ out_unlock: f2fs_up_write(&sbi->sb_lock); } +void f2fs_handle_error(struct f2fs_sb_info *sbi, unsigned char error) +{ + f2fs_save_errors(sbi, error); + f2fs_record_errors(sbi, error); +} + +void f2fs_handle_error_async(struct f2fs_sb_info *sbi, unsigned char error) +{ + f2fs_save_errors(sbi, error); + + if (!sbi->error_dirty) + return; + if (!test_bit(error, (unsigned long *)sbi->errors)) + return; + schedule_work(&sbi->s_error_work); +} + +static bool system_going_down(void) +{ + return system_state == SYSTEM_HALT || system_state == SYSTEM_POWER_OFF + || system_state == SYSTEM_RESTART; +} + +void f2fs_handle_critical_error(struct f2fs_sb_info *sbi, unsigned char reason, + bool irq_context) +{ + struct super_block *sb = sbi->sb; + bool shutdown = reason == STOP_CP_REASON_SHUTDOWN; + bool continue_fs = !shutdown && + F2FS_OPTION(sbi).errors == MOUNT_ERRORS_CONTINUE; + + set_ckpt_flags(sbi, CP_ERROR_FLAG); + + if (!f2fs_hw_is_readonly(sbi)) { + save_stop_reason(sbi, reason); + + if (irq_context && !shutdown) + schedule_work(&sbi->s_error_work); + else + f2fs_record_stop_reason(sbi); + } + + /* + * We force ERRORS_RO behavior when system is rebooting. Otherwise we + * could panic during 'reboot -f' as the underlying device got already + * disabled. + */ + if (F2FS_OPTION(sbi).errors == MOUNT_ERRORS_PANIC && + !shutdown && !system_going_down() && + !is_sbi_flag_set(sbi, SBI_IS_SHUTDOWN)) + panic("F2FS-fs (device %s): panic forced after error\n", + sb->s_id); + + if (shutdown) + set_sbi_flag(sbi, SBI_IS_SHUTDOWN); + + /* continue filesystem operators if errors=continue */ + if (continue_fs || f2fs_readonly(sb)) + return; + + f2fs_warn(sbi, "Remounting filesystem read-only"); + /* + * Make sure updated value of ->s_mount_flags will be visible before + * ->s_flags update + */ + smp_wmb(); + sb->s_flags |= SB_RDONLY; +} + +static void f2fs_record_error_work(struct work_struct *work) +{ + struct f2fs_sb_info *sbi = container_of(work, + struct f2fs_sb_info, s_error_work); + + f2fs_record_stop_reason(sbi); +} + static int f2fs_scan_devices(struct f2fs_sb_info *sbi) { struct f2fs_super_block *raw_super = F2FS_RAW_SUPER(sbi); @@ -4220,14 +4392,16 @@ try_onemore: sb->s_fs_info = sbi; sbi->raw_super = raw_super; + INIT_WORK(&sbi->s_error_work, f2fs_record_error_work); memcpy(sbi->errors, raw_super->s_errors, MAX_F2FS_ERRORS); + memcpy(sbi->stop_reason, raw_super->s_stop_reason, MAX_STOP_REASON); /* precompute checksum seed for metadata */ if (f2fs_sb_has_inode_chksum(sbi)) sbi->s_chksum_seed = f2fs_chksum(sbi, ~0, raw_super->uuid, sizeof(raw_super->uuid)); - default_options(sbi); + default_options(sbi, false); /* parse mount options */ options = kstrdup((const char *)data, GFP_KERNEL); if (data && !options) { @@ -4617,6 +4791,8 @@ free_sm: f2fs_destroy_segment_manager(sbi); stop_ckpt_thread: f2fs_stop_ckpt_thread(sbi); + /* flush s_error_work before sbi destroy */ + flush_work(&sbi->s_error_work); f2fs_destroy_post_read_wq(sbi); free_devices: destroy_device_list(sbi); diff --git a/fs/f2fs/sysfs.c b/fs/f2fs/sysfs.c index 8ea05340bad9..48b7e0073884 100644 --- a/fs/f2fs/sysfs.c +++ b/fs/f2fs/sysfs.c @@ -842,68 +842,160 @@ static struct f2fs_attr f2fs_attr_##_name = { \ #define F2FS_GENERAL_RO_ATTR(name) \ static struct f2fs_attr f2fs_attr_##name = __ATTR(name, 0444, name##_show, NULL) -#define F2FS_STAT_ATTR(_struct_type, _struct_name, _name, _elname) \ -static struct f2fs_attr f2fs_attr_##_name = { \ - .attr = {.name = __stringify(_name), .mode = 0444 }, \ - .show = f2fs_sbi_show, \ - .struct_type = _struct_type, \ - .offset = offsetof(struct _struct_name, _elname), \ -} +#ifdef CONFIG_F2FS_STAT_FS +#define STAT_INFO_RO_ATTR(name, elname) \ + F2FS_RO_ATTR(STAT_INFO, f2fs_stat_info, name, elname) +#endif -F2FS_RW_ATTR(GC_THREAD, f2fs_gc_kthread, gc_urgent_sleep_time, - urgent_sleep_time); -F2FS_RW_ATTR(GC_THREAD, f2fs_gc_kthread, gc_min_sleep_time, min_sleep_time); -F2FS_RW_ATTR(GC_THREAD, f2fs_gc_kthread, gc_max_sleep_time, max_sleep_time); -F2FS_RW_ATTR(GC_THREAD, f2fs_gc_kthread, gc_no_gc_sleep_time, no_gc_sleep_time); -F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, gc_idle, gc_mode); -F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, gc_urgent, gc_mode); -F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, reclaim_segments, rec_prefree_segments); -F2FS_RW_ATTR(DCC_INFO, discard_cmd_control, max_small_discards, max_discards); -F2FS_RW_ATTR(DCC_INFO, discard_cmd_control, max_discard_request, max_discard_request); -F2FS_RW_ATTR(DCC_INFO, discard_cmd_control, min_discard_issue_time, min_discard_issue_time); -F2FS_RW_ATTR(DCC_INFO, discard_cmd_control, mid_discard_issue_time, mid_discard_issue_time); -F2FS_RW_ATTR(DCC_INFO, discard_cmd_control, max_discard_issue_time, max_discard_issue_time); -F2FS_RW_ATTR(DCC_INFO, discard_cmd_control, discard_io_aware_gran, discard_io_aware_gran); -F2FS_RW_ATTR(DCC_INFO, discard_cmd_control, discard_urgent_util, discard_urgent_util); -F2FS_RW_ATTR(DCC_INFO, discard_cmd_control, discard_granularity, discard_granularity); -F2FS_RW_ATTR(DCC_INFO, discard_cmd_control, max_ordered_discard, max_ordered_discard); -F2FS_RW_ATTR(RESERVED_BLOCKS, f2fs_sb_info, reserved_blocks, reserved_blocks); -F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, ipu_policy, ipu_policy); -F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, min_ipu_util, min_ipu_util); -F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, min_fsync_blocks, min_fsync_blocks); -F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, min_seq_blocks, min_seq_blocks); -F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, min_hot_blocks, min_hot_blocks); -F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, min_ssr_sections, min_ssr_sections); -F2FS_RW_ATTR(NM_INFO, f2fs_nm_info, ram_thresh, ram_thresh); -F2FS_RW_ATTR(NM_INFO, f2fs_nm_info, ra_nid_pages, ra_nid_pages); -F2FS_RW_ATTR(NM_INFO, f2fs_nm_info, dirty_nats_ratio, dirty_nats_ratio); -F2FS_RW_ATTR(NM_INFO, f2fs_nm_info, max_roll_forward_node_blocks, max_rf_node_blocks); -F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, max_victim_search, max_victim_search); -F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, migration_granularity, migration_granularity); -F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, dir_level, dir_level); -F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, cp_interval, interval_time[CP_TIME]); -F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, idle_interval, interval_time[REQ_TIME]); -F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, discard_idle_interval, - interval_time[DISCARD_TIME]); -F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, gc_idle_interval, interval_time[GC_TIME]); -F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, - umount_discard_timeout, interval_time[UMOUNT_DISCARD_TIMEOUT]); -#ifdef CONFIG_F2FS_IOSTAT -F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, iostat_enable, iostat_enable); -F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, iostat_period_ms, iostat_period_ms); +#define GC_THREAD_RW_ATTR(name, elname) \ + F2FS_RW_ATTR(GC_THREAD, f2fs_gc_kthread, name, elname) + +#define SM_INFO_RW_ATTR(name, elname) \ + F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, name, elname) + +#define SM_INFO_GENERAL_RW_ATTR(elname) \ + SM_INFO_RW_ATTR(elname, elname) + +#define DCC_INFO_RW_ATTR(name, elname) \ + F2FS_RW_ATTR(DCC_INFO, discard_cmd_control, name, elname) + +#define DCC_INFO_GENERAL_RW_ATTR(elname) \ + DCC_INFO_RW_ATTR(elname, elname) + +#define NM_INFO_RW_ATTR(name, elname) \ + F2FS_RW_ATTR(NM_INFO, f2fs_nm_info, name, elname) + +#define NM_INFO_GENERAL_RW_ATTR(elname) \ + NM_INFO_RW_ATTR(elname, elname) + +#define F2FS_SBI_RW_ATTR(name, elname) \ + F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, name, elname) + +#define F2FS_SBI_GENERAL_RW_ATTR(elname) \ + F2FS_SBI_RW_ATTR(elname, elname) + +#define F2FS_SBI_GENERAL_RO_ATTR(elname) \ + F2FS_RO_ATTR(F2FS_SBI, f2fs_sb_info, elname, elname) + +#ifdef CONFIG_F2FS_FAULT_INJECTION +#define FAULT_INFO_GENERAL_RW_ATTR(type, elname) \ + F2FS_RW_ATTR(type, f2fs_fault_info, elname, elname) #endif -F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, readdir_ra, readdir_ra); -F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, max_io_bytes, max_io_bytes); -F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, gc_pin_file_thresh, gc_pin_file_threshold); + +#define RESERVED_BLOCKS_GENERAL_RW_ATTR(elname) \ + F2FS_RW_ATTR(RESERVED_BLOCKS, f2fs_sb_info, elname, elname) + +#define CPRC_INFO_GENERAL_RW_ATTR(elname) \ + F2FS_RW_ATTR(CPRC_INFO, ckpt_req_control, elname, elname) + +#define ATGC_INFO_RW_ATTR(name, elname) \ + F2FS_RW_ATTR(ATGC_INFO, atgc_management, name, elname) + +/* GC_THREAD ATTR */ +GC_THREAD_RW_ATTR(gc_urgent_sleep_time, urgent_sleep_time); +GC_THREAD_RW_ATTR(gc_min_sleep_time, min_sleep_time); +GC_THREAD_RW_ATTR(gc_max_sleep_time, max_sleep_time); +GC_THREAD_RW_ATTR(gc_no_gc_sleep_time, no_gc_sleep_time); + +/* SM_INFO ATTR */ +SM_INFO_RW_ATTR(reclaim_segments, rec_prefree_segments); +SM_INFO_GENERAL_RW_ATTR(ipu_policy); +SM_INFO_GENERAL_RW_ATTR(min_ipu_util); +SM_INFO_GENERAL_RW_ATTR(min_fsync_blocks); +SM_INFO_GENERAL_RW_ATTR(min_seq_blocks); +SM_INFO_GENERAL_RW_ATTR(min_hot_blocks); +SM_INFO_GENERAL_RW_ATTR(min_ssr_sections); + +/* DCC_INFO ATTR */ +DCC_INFO_RW_ATTR(max_small_discards, max_discards); +DCC_INFO_GENERAL_RW_ATTR(max_discard_request); +DCC_INFO_GENERAL_RW_ATTR(min_discard_issue_time); +DCC_INFO_GENERAL_RW_ATTR(mid_discard_issue_time); +DCC_INFO_GENERAL_RW_ATTR(max_discard_issue_time); +DCC_INFO_GENERAL_RW_ATTR(discard_io_aware_gran); +DCC_INFO_GENERAL_RW_ATTR(discard_urgent_util); +DCC_INFO_GENERAL_RW_ATTR(discard_granularity); +DCC_INFO_GENERAL_RW_ATTR(max_ordered_discard); + +/* NM_INFO ATTR */ +NM_INFO_RW_ATTR(max_roll_forward_node_blocks, max_rf_node_blocks); +NM_INFO_GENERAL_RW_ATTR(ram_thresh); +NM_INFO_GENERAL_RW_ATTR(ra_nid_pages); +NM_INFO_GENERAL_RW_ATTR(dirty_nats_ratio); + +/* F2FS_SBI ATTR */ F2FS_RW_ATTR(F2FS_SBI, f2fs_super_block, extension_list, extension_list); +F2FS_SBI_RW_ATTR(gc_idle, gc_mode); +F2FS_SBI_RW_ATTR(gc_urgent, gc_mode); +F2FS_SBI_RW_ATTR(cp_interval, interval_time[CP_TIME]); +F2FS_SBI_RW_ATTR(idle_interval, interval_time[REQ_TIME]); +F2FS_SBI_RW_ATTR(discard_idle_interval, interval_time[DISCARD_TIME]); +F2FS_SBI_RW_ATTR(gc_idle_interval, interval_time[GC_TIME]); +F2FS_SBI_RW_ATTR(umount_discard_timeout, interval_time[UMOUNT_DISCARD_TIMEOUT]); +F2FS_SBI_RW_ATTR(gc_pin_file_thresh, gc_pin_file_threshold); +F2FS_SBI_RW_ATTR(gc_reclaimed_segments, gc_reclaimed_segs); +F2FS_SBI_GENERAL_RW_ATTR(max_victim_search); +F2FS_SBI_GENERAL_RW_ATTR(migration_granularity); +F2FS_SBI_GENERAL_RW_ATTR(dir_level); +#ifdef CONFIG_F2FS_IOSTAT +F2FS_SBI_GENERAL_RW_ATTR(iostat_enable); +F2FS_SBI_GENERAL_RW_ATTR(iostat_period_ms); +#endif +F2FS_SBI_GENERAL_RW_ATTR(readdir_ra); +F2FS_SBI_GENERAL_RW_ATTR(max_io_bytes); +F2FS_SBI_GENERAL_RW_ATTR(data_io_flag); +F2FS_SBI_GENERAL_RW_ATTR(node_io_flag); +F2FS_SBI_GENERAL_RW_ATTR(gc_remaining_trials); +F2FS_SBI_GENERAL_RW_ATTR(seq_file_ra_mul); +F2FS_SBI_GENERAL_RW_ATTR(gc_segment_mode); +F2FS_SBI_GENERAL_RW_ATTR(max_fragment_chunk); +F2FS_SBI_GENERAL_RW_ATTR(max_fragment_hole); +#ifdef CONFIG_F2FS_FS_COMPRESSION +F2FS_SBI_GENERAL_RW_ATTR(compr_written_block); +F2FS_SBI_GENERAL_RW_ATTR(compr_saved_block); +F2FS_SBI_GENERAL_RW_ATTR(compr_new_inode); +F2FS_SBI_GENERAL_RW_ATTR(compress_percent); +F2FS_SBI_GENERAL_RW_ATTR(compress_watermark); +#endif +/* atomic write */ +F2FS_SBI_GENERAL_RO_ATTR(current_atomic_write); +F2FS_SBI_GENERAL_RW_ATTR(peak_atomic_write); +F2FS_SBI_GENERAL_RW_ATTR(committed_atomic_block); +F2FS_SBI_GENERAL_RW_ATTR(revoked_atomic_block); +/* block age extent cache */ +F2FS_SBI_GENERAL_RW_ATTR(hot_data_age_threshold); +F2FS_SBI_GENERAL_RW_ATTR(warm_data_age_threshold); +F2FS_SBI_GENERAL_RW_ATTR(last_age_weight); +#ifdef CONFIG_BLK_DEV_ZONED +F2FS_SBI_GENERAL_RO_ATTR(unusable_blocks_per_sec); +#endif + +/* STAT_INFO ATTR */ +#ifdef CONFIG_F2FS_STAT_FS +STAT_INFO_RO_ATTR(cp_foreground_calls, cp_count); +STAT_INFO_RO_ATTR(cp_background_calls, bg_cp_count); +STAT_INFO_RO_ATTR(gc_foreground_calls, call_count); +STAT_INFO_RO_ATTR(gc_background_calls, bg_gc); +#endif + +/* FAULT_INFO ATTR */ #ifdef CONFIG_F2FS_FAULT_INJECTION -F2FS_RW_ATTR(FAULT_INFO_RATE, f2fs_fault_info, inject_rate, inject_rate); -F2FS_RW_ATTR(FAULT_INFO_TYPE, f2fs_fault_info, inject_type, inject_type); +FAULT_INFO_GENERAL_RW_ATTR(FAULT_INFO_RATE, inject_rate); +FAULT_INFO_GENERAL_RW_ATTR(FAULT_INFO_TYPE, inject_type); #endif -F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, data_io_flag, data_io_flag); -F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, node_io_flag, node_io_flag); -F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, gc_remaining_trials, gc_remaining_trials); -F2FS_RW_ATTR(CPRC_INFO, ckpt_req_control, ckpt_thread_ioprio, ckpt_thread_ioprio); + +/* RESERVED_BLOCKS ATTR */ +RESERVED_BLOCKS_GENERAL_RW_ATTR(reserved_blocks); + +/* CPRC_INFO ATTR */ +CPRC_INFO_GENERAL_RW_ATTR(ckpt_thread_ioprio); + +/* ATGC_INFO ATTR */ +ATGC_INFO_RW_ATTR(atgc_candidate_ratio, candidate_ratio); +ATGC_INFO_RW_ATTR(atgc_candidate_count, max_candidate_count); +ATGC_INFO_RW_ATTR(atgc_age_weight, age_weight); +ATGC_INFO_RW_ATTR(atgc_age_threshold, age_threshold); + F2FS_GENERAL_RO_ATTR(dirty_segments); F2FS_GENERAL_RO_ATTR(free_segments); F2FS_GENERAL_RO_ATTR(ovp_segments); @@ -917,10 +1009,6 @@ F2FS_GENERAL_RO_ATTR(main_blkaddr); F2FS_GENERAL_RO_ATTR(pending_discard); F2FS_GENERAL_RO_ATTR(gc_mode); #ifdef CONFIG_F2FS_STAT_FS -F2FS_STAT_ATTR(STAT_INFO, f2fs_stat_info, cp_foreground_calls, cp_count); -F2FS_STAT_ATTR(STAT_INFO, f2fs_stat_info, cp_background_calls, bg_cp_count); -F2FS_STAT_ATTR(STAT_INFO, f2fs_stat_info, gc_foreground_calls, call_count); -F2FS_STAT_ATTR(STAT_INFO, f2fs_stat_info, gc_background_calls, bg_gc); F2FS_GENERAL_RO_ATTR(moved_blocks_background); F2FS_GENERAL_RO_ATTR(moved_blocks_foreground); F2FS_GENERAL_RO_ATTR(avg_vblocks); @@ -935,8 +1023,6 @@ F2FS_FEATURE_RO_ATTR(encrypted_casefold); #endif /* CONFIG_FS_ENCRYPTION */ #ifdef CONFIG_BLK_DEV_ZONED F2FS_FEATURE_RO_ATTR(block_zoned); -F2FS_RO_ATTR(F2FS_SBI, f2fs_sb_info, unusable_blocks_per_sec, - unusable_blocks_per_sec); #endif F2FS_FEATURE_RO_ATTR(atomic_write); F2FS_FEATURE_RO_ATTR(extra_attr); @@ -956,37 +1042,9 @@ F2FS_FEATURE_RO_ATTR(casefold); F2FS_FEATURE_RO_ATTR(readonly); #ifdef CONFIG_F2FS_FS_COMPRESSION F2FS_FEATURE_RO_ATTR(compression); -F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, compr_written_block, compr_written_block); -F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, compr_saved_block, compr_saved_block); -F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, compr_new_inode, compr_new_inode); -F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, compress_percent, compress_percent); -F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, compress_watermark, compress_watermark); #endif F2FS_FEATURE_RO_ATTR(pin_file); -/* For ATGC */ -F2FS_RW_ATTR(ATGC_INFO, atgc_management, atgc_candidate_ratio, candidate_ratio); -F2FS_RW_ATTR(ATGC_INFO, atgc_management, atgc_candidate_count, max_candidate_count); -F2FS_RW_ATTR(ATGC_INFO, atgc_management, atgc_age_weight, age_weight); -F2FS_RW_ATTR(ATGC_INFO, atgc_management, atgc_age_threshold, age_threshold); - -F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, seq_file_ra_mul, seq_file_ra_mul); -F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, gc_segment_mode, gc_segment_mode); -F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, gc_reclaimed_segments, gc_reclaimed_segs); -F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, max_fragment_chunk, max_fragment_chunk); -F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, max_fragment_hole, max_fragment_hole); - -/* For atomic write */ -F2FS_RO_ATTR(F2FS_SBI, f2fs_sb_info, current_atomic_write, current_atomic_write); -F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, peak_atomic_write, peak_atomic_write); -F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, committed_atomic_block, committed_atomic_block); -F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, revoked_atomic_block, revoked_atomic_block); - -/* For block age extent cache */ -F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, hot_data_age_threshold, hot_data_age_threshold); -F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, warm_data_age_threshold, warm_data_age_threshold); -F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, last_age_weight, last_age_weight); - #define ATTR_LIST(name) (&f2fs_attr_##name.attr) static struct attribute *f2fs_attrs[] = { ATTR_LIST(gc_urgent_sleep_time), @@ -1386,12 +1444,19 @@ int __init f2fs_init_sysfs(void) ret = kobject_init_and_add(&f2fs_feat, &f2fs_feat_ktype, NULL, "features"); - if (ret) { - kobject_put(&f2fs_feat); - kset_unregister(&f2fs_kset); - } else { - f2fs_proc_root = proc_mkdir("fs/f2fs", NULL); + if (ret) + goto put_kobject; + + f2fs_proc_root = proc_mkdir("fs/f2fs", NULL); + if (!f2fs_proc_root) { + ret = -ENOMEM; + goto put_kobject; } + + return 0; +put_kobject: + kobject_put(&f2fs_feat); + kset_unregister(&f2fs_kset); return ret; } @@ -1430,23 +1495,24 @@ int f2fs_register_sysfs(struct f2fs_sb_info *sbi) if (err) goto put_feature_list_kobj; - if (f2fs_proc_root) - sbi->s_proc = proc_mkdir(sb->s_id, f2fs_proc_root); + sbi->s_proc = proc_mkdir(sb->s_id, f2fs_proc_root); + if (!sbi->s_proc) { + err = -ENOMEM; + goto put_feature_list_kobj; + } - if (sbi->s_proc) { - proc_create_single_data("segment_info", 0444, sbi->s_proc, + proc_create_single_data("segment_info", 0444, sbi->s_proc, segment_info_seq_show, sb); - proc_create_single_data("segment_bits", 0444, sbi->s_proc, + proc_create_single_data("segment_bits", 0444, sbi->s_proc, segment_bits_seq_show, sb); #ifdef CONFIG_F2FS_IOSTAT - proc_create_single_data("iostat_info", 0444, sbi->s_proc, + proc_create_single_data("iostat_info", 0444, sbi->s_proc, iostat_info_seq_show, sb); #endif - proc_create_single_data("victim_bits", 0444, sbi->s_proc, + proc_create_single_data("victim_bits", 0444, sbi->s_proc, victim_bits_seq_show, sb); - proc_create_single_data("discard_plist_info", 0444, sbi->s_proc, + proc_create_single_data("discard_plist_info", 0444, sbi->s_proc, discard_plist_seq_show, sb); - } return 0; put_feature_list_kobj: kobject_put(&sbi->s_feature_list_kobj); @@ -1462,8 +1528,7 @@ put_sb_kobj: void f2fs_unregister_sysfs(struct f2fs_sb_info *sbi) { - if (sbi->s_proc) - remove_proc_subtree(sbi->sb->s_id, f2fs_proc_root); + remove_proc_subtree(sbi->sb->s_id, f2fs_proc_root); kobject_put(&sbi->s_stat_kobj); wait_for_completion(&sbi->s_stat_kobj_unregister); diff --git a/fs/f2fs/xattr.c b/fs/f2fs/xattr.c index 213805d3592c..476b186b90a6 100644 --- a/fs/f2fs/xattr.c +++ b/fs/f2fs/xattr.c @@ -528,10 +528,12 @@ int f2fs_getxattr(struct inode *inode, int index, const char *name, if (len > F2FS_NAME_LEN) return -ERANGE; - f2fs_down_read(&F2FS_I(inode)->i_xattr_sem); + if (!ipage) + f2fs_down_read(&F2FS_I(inode)->i_xattr_sem); error = lookup_all_xattrs(inode, ipage, index, len, name, &entry, &base_addr, &base_size, &is_inline); - f2fs_up_read(&F2FS_I(inode)->i_xattr_sem); + if (!ipage) + f2fs_up_read(&F2FS_I(inode)->i_xattr_sem); if (error) return error; diff --git a/fs/f2fs/xattr.h b/fs/f2fs/xattr.h index 416d652774a3..b1811c392e6f 100644 --- a/fs/f2fs/xattr.h +++ b/fs/f2fs/xattr.h @@ -83,6 +83,7 @@ struct f2fs_xattr_entry { sizeof(struct f2fs_xattr_header) - \ sizeof(struct f2fs_xattr_entry)) +#define MIN_INLINE_XATTR_SIZE (sizeof(struct f2fs_xattr_header) / sizeof(__le32)) #define MAX_INLINE_XATTR_SIZE \ (DEF_ADDRS_PER_INODE - \ F2FS_TOTAL_EXTRA_ATTR_SIZE / sizeof(__le32) - \ diff --git a/fs/file.c b/fs/file.c index 7893ea161d77..35c62b54c9d6 100644 --- a/fs/file.c +++ b/fs/file.c @@ -1042,10 +1042,8 @@ unsigned long __fdget_pos(unsigned int fd) struct file *file = (struct file *)(v & ~3); if (file && (file->f_mode & FMODE_ATOMIC_POS)) { - if (file_count(file) > 1) { - v |= FDPUT_POS_UNLOCK; - mutex_lock(&file->f_pos_lock); - } + v |= FDPUT_POS_UNLOCK; + mutex_lock(&file->f_pos_lock); } return v; } diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c index 35bc174f9ba2..f67bef9d83c4 100644 --- a/fs/fuse/dir.c +++ b/fs/fuse/dir.c @@ -258,7 +258,7 @@ static int fuse_dentry_revalidate(struct dentry *entry, unsigned int flags) spin_unlock(&fi->lock); } kfree(forget); - if (ret == -ENOMEM) + if (ret == -ENOMEM || ret == -EINTR) goto out; if (ret || fuse_invalid_attr(&outarg.attr) || fuse_stale_inode(inode, outarg.generation, &outarg.attr)) @@ -395,8 +395,6 @@ int fuse_lookup_name(struct super_block *sb, u64 nodeid, const struct qstr *name goto out_put_forget; err = -EIO; - if (!outarg->nodeid) - goto out_put_forget; if (fuse_invalid_attr(&outarg->attr)) goto out_put_forget; diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index d66070af145d..f19d748890f0 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c @@ -1134,7 +1134,10 @@ static void process_init_reply(struct fuse_mount *fm, struct fuse_args *args, process_init_limits(fc, arg); if (arg->minor >= 6) { - u64 flags = arg->flags | (u64) arg->flags2 << 32; + u64 flags = arg->flags; + + if (flags & FUSE_INIT_EXT) + flags |= (u64) arg->flags2 << 32; ra_pages = arg->max_readahead / PAGE_SIZE; if (flags & FUSE_ASYNC_READ) @@ -1254,7 +1257,8 @@ void fuse_send_init(struct fuse_mount *fm) FUSE_ABORT_ERROR | FUSE_MAX_PAGES | FUSE_CACHE_SYMLINKS | FUSE_NO_OPENDIR_SUPPORT | FUSE_EXPLICIT_INVAL_DATA | FUSE_HANDLE_KILLPRIV_V2 | FUSE_SETXATTR_EXT | FUSE_INIT_EXT | - FUSE_SECURITY_CTX | FUSE_CREATE_SUPP_GROUP; + FUSE_SECURITY_CTX | FUSE_CREATE_SUPP_GROUP | + FUSE_HAS_EXPIRE_ONLY; #ifdef CONFIG_FUSE_DAX if (fm->fc->dax) flags |= FUSE_MAP_ALIGNMENT; diff --git a/fs/fuse/ioctl.c b/fs/fuse/ioctl.c index 8e01bfdfc430..726640fa439e 100644 --- a/fs/fuse/ioctl.c +++ b/fs/fuse/ioctl.c @@ -9,14 +9,23 @@ #include <linux/compat.h> #include <linux/fileattr.h> -static ssize_t fuse_send_ioctl(struct fuse_mount *fm, struct fuse_args *args) +static ssize_t fuse_send_ioctl(struct fuse_mount *fm, struct fuse_args *args, + struct fuse_ioctl_out *outarg) { - ssize_t ret = fuse_simple_request(fm, args); + ssize_t ret; + + args->out_args[0].size = sizeof(*outarg); + args->out_args[0].value = outarg; + + ret = fuse_simple_request(fm, args); /* Translate ENOSYS, which shouldn't be returned from fs */ if (ret == -ENOSYS) ret = -ENOTTY; + if (ret >= 0 && outarg->result == -ENOSYS) + outarg->result = -ENOTTY; + return ret; } @@ -264,13 +273,11 @@ long fuse_do_ioctl(struct file *file, unsigned int cmd, unsigned long arg, } ap.args.out_numargs = 2; - ap.args.out_args[0].size = sizeof(outarg); - ap.args.out_args[0].value = &outarg; ap.args.out_args[1].size = out_size; ap.args.out_pages = true; ap.args.out_argvar = true; - transferred = fuse_send_ioctl(fm, &ap.args); + transferred = fuse_send_ioctl(fm, &ap.args, &outarg); err = transferred; if (transferred < 0) goto out; @@ -399,12 +406,10 @@ static int fuse_priv_ioctl(struct inode *inode, struct fuse_file *ff, args.in_args[1].size = inarg.in_size; args.in_args[1].value = ptr; args.out_numargs = 2; - args.out_args[0].size = sizeof(outarg); - args.out_args[0].value = &outarg; args.out_args[1].size = inarg.out_size; args.out_args[1].value = ptr; - err = fuse_send_ioctl(fm, &args); + err = fuse_send_ioctl(fm, &args, &outarg); if (!err) { if (outarg.result < 0) err = outarg.result; diff --git a/fs/inode.c b/fs/inode.c index d37fad91c8da..8fefb69e1f84 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -1156,8 +1156,10 @@ lock: */ void lock_two_nondirectories(struct inode *inode1, struct inode *inode2) { - WARN_ON_ONCE(S_ISDIR(inode1->i_mode)); - WARN_ON_ONCE(S_ISDIR(inode2->i_mode)); + if (inode1) + WARN_ON_ONCE(S_ISDIR(inode1->i_mode)); + if (inode2) + WARN_ON_ONCE(S_ISDIR(inode2->i_mode)); lock_two_inodes(inode1, inode2, I_MUTEX_NORMAL, I_MUTEX_NONDIR2); } EXPORT_SYMBOL(lock_two_nondirectories); diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c index adb92cdb24b0..aa8967cca1a3 100644 --- a/fs/iomap/buffered-io.c +++ b/fs/iomap/buffered-io.c @@ -872,10 +872,10 @@ iomap_file_buffered_write(struct kiocb *iocb, struct iov_iter *i, while ((ret = iomap_iter(&iter, ops)) > 0) iter.processed = iomap_write_iter(&iter, i); - if (unlikely(ret < 0)) + if (unlikely(iter.pos == iocb->ki_pos)) return ret; ret = iter.pos - iocb->ki_pos; - iocb->ki_pos += ret; + iocb->ki_pos = iter.pos; return ret; } EXPORT_SYMBOL_GPL(iomap_file_buffered_write); diff --git a/fs/jbd2/checkpoint.c b/fs/jbd2/checkpoint.c index 51bd38da21cd..9ec91017a7f3 100644 --- a/fs/jbd2/checkpoint.c +++ b/fs/jbd2/checkpoint.c @@ -27,7 +27,7 @@ * * Called with j_list_lock held. */ -static inline void __buffer_unlink_first(struct journal_head *jh) +static inline void __buffer_unlink(struct journal_head *jh) { transaction_t *transaction = jh->b_cp_transaction; @@ -41,45 +41,6 @@ static inline void __buffer_unlink_first(struct journal_head *jh) } /* - * Unlink a buffer from a transaction checkpoint(io) list. - * - * Called with j_list_lock held. - */ -static inline void __buffer_unlink(struct journal_head *jh) -{ - transaction_t *transaction = jh->b_cp_transaction; - - __buffer_unlink_first(jh); - if (transaction->t_checkpoint_io_list == jh) { - transaction->t_checkpoint_io_list = jh->b_cpnext; - if (transaction->t_checkpoint_io_list == jh) - transaction->t_checkpoint_io_list = NULL; - } -} - -/* - * Move a buffer from the checkpoint list to the checkpoint io list - * - * Called with j_list_lock held - */ -static inline void __buffer_relink_io(struct journal_head *jh) -{ - transaction_t *transaction = jh->b_cp_transaction; - - __buffer_unlink_first(jh); - - if (!transaction->t_checkpoint_io_list) { - jh->b_cpnext = jh->b_cpprev = jh; - } else { - jh->b_cpnext = transaction->t_checkpoint_io_list; - jh->b_cpprev = transaction->t_checkpoint_io_list->b_cpprev; - jh->b_cpprev->b_cpnext = jh; - jh->b_cpnext->b_cpprev = jh; - } - transaction->t_checkpoint_io_list = jh; -} - -/* * Check a checkpoint buffer could be release or not. * * Requires j_list_lock @@ -183,6 +144,7 @@ __flush_batch(journal_t *journal, int *batch_count) struct buffer_head *bh = journal->j_chkpt_bhs[i]; BUFFER_TRACE(bh, "brelse"); __brelse(bh); + journal->j_chkpt_bhs[i] = NULL; } *batch_count = 0; } @@ -242,15 +204,6 @@ restart: jh = transaction->t_checkpoint_list; bh = jh2bh(jh); - if (buffer_locked(bh)) { - get_bh(bh); - spin_unlock(&journal->j_list_lock); - wait_on_buffer(bh); - /* the journal_head may have gone by now */ - BUFFER_TRACE(bh, "brelse"); - __brelse(bh); - goto retry; - } if (jh->b_transaction != NULL) { transaction_t *t = jh->b_transaction; tid_t tid = t->t_tid; @@ -285,30 +238,50 @@ restart: spin_lock(&journal->j_list_lock); goto restart; } - if (!buffer_dirty(bh)) { + if (!trylock_buffer(bh)) { + /* + * The buffer is locked, it may be writing back, or + * flushing out in the last couple of cycles, or + * re-adding into a new transaction, need to check + * it again until it's unlocked. + */ + get_bh(bh); + spin_unlock(&journal->j_list_lock); + wait_on_buffer(bh); + /* the journal_head may have gone by now */ + BUFFER_TRACE(bh, "brelse"); + __brelse(bh); + goto retry; + } else if (!buffer_dirty(bh)) { + unlock_buffer(bh); BUFFER_TRACE(bh, "remove from checkpoint"); - if (__jbd2_journal_remove_checkpoint(jh)) - /* The transaction was released; we're done */ + /* + * If the transaction was released or the checkpoint + * list was empty, we're done. + */ + if (__jbd2_journal_remove_checkpoint(jh) || + !transaction->t_checkpoint_list) goto out; - continue; + } else { + unlock_buffer(bh); + /* + * We are about to write the buffer, it could be + * raced by some other transaction shrink or buffer + * re-log logic once we release the j_list_lock, + * leave it on the checkpoint list and check status + * again to make sure it's clean. + */ + BUFFER_TRACE(bh, "queue"); + get_bh(bh); + J_ASSERT_BH(bh, !buffer_jwrite(bh)); + journal->j_chkpt_bhs[batch_count++] = bh; + transaction->t_chp_stats.cs_written++; + transaction->t_checkpoint_list = jh->b_cpnext; } - /* - * Important: we are about to write the buffer, and - * possibly block, while still holding the journal - * lock. We cannot afford to let the transaction - * logic start messing around with this buffer before - * we write it to disk, as that would break - * recoverability. - */ - BUFFER_TRACE(bh, "queue"); - get_bh(bh); - J_ASSERT_BH(bh, !buffer_jwrite(bh)); - journal->j_chkpt_bhs[batch_count++] = bh; - __buffer_relink_io(jh); - transaction->t_chp_stats.cs_written++; + if ((batch_count == JBD2_NR_BATCH) || - need_resched() || - spin_needbreak(&journal->j_list_lock)) + need_resched() || spin_needbreak(&journal->j_list_lock) || + jh2bh(transaction->t_checkpoint_list) == journal->j_chkpt_bhs[0]) goto unlock_and_flush; } @@ -322,38 +295,6 @@ restart: goto restart; } - /* - * Now we issued all of the transaction's buffers, let's deal - * with the buffers that are out for I/O. - */ -restart2: - /* Did somebody clean up the transaction in the meanwhile? */ - if (journal->j_checkpoint_transactions != transaction || - transaction->t_tid != this_tid) - goto out; - - while (transaction->t_checkpoint_io_list) { - jh = transaction->t_checkpoint_io_list; - bh = jh2bh(jh); - if (buffer_locked(bh)) { - get_bh(bh); - spin_unlock(&journal->j_list_lock); - wait_on_buffer(bh); - /* the journal_head may have gone by now */ - BUFFER_TRACE(bh, "brelse"); - __brelse(bh); - spin_lock(&journal->j_list_lock); - goto restart2; - } - - /* - * Now in whatever state the buffer currently is, we - * know that it has been written out and so we can - * drop it from the list - */ - if (__jbd2_journal_remove_checkpoint(jh)) - break; - } out: spin_unlock(&journal->j_list_lock); result = jbd2_cleanup_journal_tail(journal); @@ -409,49 +350,9 @@ int jbd2_cleanup_journal_tail(journal_t *journal) /* Checkpoint list management */ /* - * journal_clean_one_cp_list - * - * Find all the written-back checkpoint buffers in the given list and - * release them. If 'destroy' is set, clean all buffers unconditionally. - * - * Called with j_list_lock held. - * Returns 1 if we freed the transaction, 0 otherwise. - */ -static int journal_clean_one_cp_list(struct journal_head *jh, bool destroy) -{ - struct journal_head *last_jh; - struct journal_head *next_jh = jh; - - if (!jh) - return 0; - - last_jh = jh->b_cpprev; - do { - jh = next_jh; - next_jh = jh->b_cpnext; - - if (!destroy && __cp_buffer_busy(jh)) - return 0; - - if (__jbd2_journal_remove_checkpoint(jh)) - return 1; - /* - * This function only frees up some memory - * if possible so we dont have an obligation - * to finish processing. Bail out if preemption - * requested: - */ - if (need_resched()) - return 0; - } while (jh != last_jh); - - return 0; -} - -/* * journal_shrink_one_cp_list * - * Find 'nr_to_scan' written-back checkpoint buffers in the given list + * Find all the written-back checkpoint buffers in the given list * and try to release them. If the whole transaction is released, set * the 'released' parameter. Return the number of released checkpointed * buffers. @@ -459,15 +360,15 @@ static int journal_clean_one_cp_list(struct journal_head *jh, bool destroy) * Called with j_list_lock held. */ static unsigned long journal_shrink_one_cp_list(struct journal_head *jh, - unsigned long *nr_to_scan, - bool *released) + bool destroy, bool *released) { struct journal_head *last_jh; struct journal_head *next_jh = jh; unsigned long nr_freed = 0; int ret; - if (!jh || *nr_to_scan == 0) + *released = false; + if (!jh) return 0; last_jh = jh->b_cpprev; @@ -475,12 +376,15 @@ static unsigned long journal_shrink_one_cp_list(struct journal_head *jh, jh = next_jh; next_jh = jh->b_cpnext; - (*nr_to_scan)--; - if (__cp_buffer_busy(jh)) - continue; + if (destroy) { + ret = __jbd2_journal_remove_checkpoint(jh); + } else { + ret = jbd2_journal_try_remove_checkpoint(jh); + if (ret < 0) + continue; + } nr_freed++; - ret = __jbd2_journal_remove_checkpoint(jh); if (ret) { *released = true; break; @@ -488,7 +392,7 @@ static unsigned long journal_shrink_one_cp_list(struct journal_head *jh, if (need_resched()) break; - } while (jh != last_jh && *nr_to_scan); + } while (jh != last_jh); return nr_freed; } @@ -506,11 +410,11 @@ unsigned long jbd2_journal_shrink_checkpoint_list(journal_t *journal, unsigned long *nr_to_scan) { transaction_t *transaction, *last_transaction, *next_transaction; - bool released; + bool __maybe_unused released; tid_t first_tid = 0, last_tid = 0, next_tid = 0; tid_t tid = 0; unsigned long nr_freed = 0; - unsigned long nr_scanned = *nr_to_scan; + unsigned long freed; again: spin_lock(&journal->j_list_lock); @@ -539,19 +443,11 @@ again: transaction = next_transaction; next_transaction = transaction->t_cpnext; tid = transaction->t_tid; - released = false; - - nr_freed += journal_shrink_one_cp_list(transaction->t_checkpoint_list, - nr_to_scan, &released); - if (*nr_to_scan == 0) - break; - if (need_resched() || spin_needbreak(&journal->j_list_lock)) - break; - if (released) - continue; - nr_freed += journal_shrink_one_cp_list(transaction->t_checkpoint_io_list, - nr_to_scan, &released); + freed = journal_shrink_one_cp_list(transaction->t_checkpoint_list, + false, &released); + nr_freed += freed; + (*nr_to_scan) -= min(*nr_to_scan, freed); if (*nr_to_scan == 0) break; if (need_resched() || spin_needbreak(&journal->j_list_lock)) @@ -572,9 +468,8 @@ again: if (*nr_to_scan && next_tid) goto again; out: - nr_scanned -= *nr_to_scan; trace_jbd2_shrink_checkpoint_list(journal, first_tid, tid, last_tid, - nr_freed, nr_scanned, next_tid); + nr_freed, next_tid); return nr_freed; } @@ -590,7 +485,7 @@ out: void __jbd2_journal_clean_checkpoint_list(journal_t *journal, bool destroy) { transaction_t *transaction, *last_transaction, *next_transaction; - int ret; + bool released; transaction = journal->j_checkpoint_transactions; if (!transaction) @@ -601,8 +496,8 @@ void __jbd2_journal_clean_checkpoint_list(journal_t *journal, bool destroy) do { transaction = next_transaction; next_transaction = transaction->t_cpnext; - ret = journal_clean_one_cp_list(transaction->t_checkpoint_list, - destroy); + journal_shrink_one_cp_list(transaction->t_checkpoint_list, + destroy, &released); /* * This function only frees up some memory if possible so we * dont have an obligation to finish processing. Bail out if @@ -610,23 +505,12 @@ void __jbd2_journal_clean_checkpoint_list(journal_t *journal, bool destroy) */ if (need_resched()) return; - if (ret) - continue; - /* - * It is essential that we are as careful as in the case of - * t_checkpoint_list with removing the buffer from the list as - * we can possibly see not yet submitted buffers on io_list - */ - ret = journal_clean_one_cp_list(transaction-> - t_checkpoint_io_list, destroy); - if (need_resched()) - return; /* * Stop scanning if we couldn't free the transaction. This * avoids pointless scanning of transactions which still * weren't checkpointed. */ - if (!ret) + if (!released) return; } while (transaction != last_transaction); } @@ -705,7 +589,7 @@ int __jbd2_journal_remove_checkpoint(struct journal_head *jh) jbd2_journal_put_journal_head(jh); /* Is this transaction empty? */ - if (transaction->t_checkpoint_list || transaction->t_checkpoint_io_list) + if (transaction->t_checkpoint_list) return 0; /* @@ -737,6 +621,34 @@ int __jbd2_journal_remove_checkpoint(struct journal_head *jh) } /* + * Check the checkpoint buffer and try to remove it from the checkpoint + * list if it's clean. Returns -EBUSY if it is not clean, returns 1 if + * it frees the transaction, 0 otherwise. + * + * This function is called with j_list_lock held. + */ +int jbd2_journal_try_remove_checkpoint(struct journal_head *jh) +{ + struct buffer_head *bh = jh2bh(jh); + + if (!trylock_buffer(bh)) + return -EBUSY; + if (buffer_dirty(bh)) { + unlock_buffer(bh); + return -EBUSY; + } + unlock_buffer(bh); + + /* + * Buffer is clean and the IO has finished (we held the buffer + * lock) so the checkpoint is done. We can safely remove the + * buffer from this transaction. + */ + JBUFFER_TRACE(jh, "remove from checkpoint list"); + return __jbd2_journal_remove_checkpoint(jh); +} + +/* * journal_insert_checkpoint: put a committed buffer onto a checkpoint * list so that we know when it is safe to clean the transaction out of * the log. @@ -797,7 +709,6 @@ void __jbd2_journal_drop_transaction(journal_t *journal, transaction_t *transact J_ASSERT(transaction->t_forget == NULL); J_ASSERT(transaction->t_shadow_list == NULL); J_ASSERT(transaction->t_checkpoint_list == NULL); - J_ASSERT(transaction->t_checkpoint_io_list == NULL); J_ASSERT(atomic_read(&transaction->t_updates) == 0); J_ASSERT(journal->j_committing_transaction != transaction); J_ASSERT(journal->j_running_transaction != transaction); diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c index b33155dd7001..1073259902a6 100644 --- a/fs/jbd2/commit.c +++ b/fs/jbd2/commit.c @@ -1141,8 +1141,7 @@ restart_loop: spin_lock(&journal->j_list_lock); commit_transaction->t_state = T_FINISHED; /* Check if the transaction can be dropped now that we are finished */ - if (commit_transaction->t_checkpoint_list == NULL && - commit_transaction->t_checkpoint_io_list == NULL) { + if (commit_transaction->t_checkpoint_list == NULL) { __jbd2_journal_drop_transaction(journal, commit_transaction); jbd2_journal_free_transaction(commit_transaction); } diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c index 18611241f451..4d1fda1f7143 100644 --- a/fs/jbd2/transaction.c +++ b/fs/jbd2/transaction.c @@ -1784,8 +1784,7 @@ int jbd2_journal_forget(handle_t *handle, struct buffer_head *bh) * Otherwise, if the buffer has been written to disk, * it is safe to remove the checkpoint and drop it. */ - if (!buffer_dirty(bh)) { - __jbd2_journal_remove_checkpoint(jh); + if (jbd2_journal_try_remove_checkpoint(jh) >= 0) { spin_unlock(&journal->j_list_lock); goto drop; } @@ -2100,35 +2099,6 @@ void jbd2_journal_unfile_buffer(journal_t *journal, struct journal_head *jh) __brelse(bh); } -/* - * Called from jbd2_journal_try_to_free_buffers(). - * - * Called under jh->b_state_lock - */ -static void -__journal_try_to_free_buffer(journal_t *journal, struct buffer_head *bh) -{ - struct journal_head *jh; - - jh = bh2jh(bh); - - if (buffer_locked(bh) || buffer_dirty(bh)) - goto out; - - if (jh->b_next_transaction != NULL || jh->b_transaction != NULL) - goto out; - - spin_lock(&journal->j_list_lock); - if (jh->b_cp_transaction != NULL) { - /* written-back checkpointed metadata buffer */ - JBUFFER_TRACE(jh, "remove from checkpoint list"); - __jbd2_journal_remove_checkpoint(jh); - } - spin_unlock(&journal->j_list_lock); -out: - return; -} - /** * jbd2_journal_try_to_free_buffers() - try to free page buffers. * @journal: journal for operation @@ -2186,7 +2156,13 @@ bool jbd2_journal_try_to_free_buffers(journal_t *journal, struct folio *folio) continue; spin_lock(&jh->b_state_lock); - __journal_try_to_free_buffer(journal, bh); + if (!jh->b_transaction && !jh->b_next_transaction) { + spin_lock(&journal->j_list_lock); + /* Remove written-back checkpointed metadata buffer */ + if (jh->b_cp_transaction != NULL) + jbd2_journal_try_remove_checkpoint(jh); + spin_unlock(&journal->j_list_lock); + } spin_unlock(&jh->b_state_lock); jbd2_journal_put_journal_head(jh); if (buffer_jbd(bh)) diff --git a/fs/namei.c b/fs/namei.c index 91171da719c5..e56ff39a79bc 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -4874,8 +4874,7 @@ int vfs_rename(struct renamedata *rd) d_exchange(old_dentry, new_dentry); } out: - if (source) - inode_unlock(source); + inode_unlock(source); if (target) inode_unlock(target); dput(new_dentry); diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index 6e61fa3acaf1..3aefbad4cc09 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -6341,8 +6341,6 @@ static __be32 nfsd4_validate_stateid(struct nfs4_client *cl, stateid_t *stateid) if (ZERO_STATEID(stateid) || ONE_STATEID(stateid) || CLOSE_STATEID(stateid)) return status; - if (!same_clid(&stateid->si_opaque.so_clid, &cl->cl_clientid)) - return status; spin_lock(&cl->cl_lock); s = find_stateid_locked(cl, stateid); if (!s) diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c index 95d7d8790bc3..f69c451018e3 100644 --- a/fs/notify/fanotify/fanotify_user.c +++ b/fs/notify/fanotify/fanotify_user.c @@ -1623,6 +1623,20 @@ static int fanotify_events_supported(struct fsnotify_group *group, return -EINVAL; /* + * mount and sb marks are not allowed on kernel internal pseudo fs, + * like pipe_mnt, because that would subscribe to events on all the + * anonynous pipes in the system. + * + * SB_NOUSER covers all of the internal pseudo fs whose objects are not + * exposed to user's mount namespace, but there are other SB_KERNMOUNT + * fs, like nsfs, debugfs, for which the value of allowing sb and mount + * mark is questionable. For now we leave them alone. + */ + if (mark_type != FAN_MARK_INODE && + path->mnt->mnt_sb->s_flags & SB_NOUSER) + return -EINVAL; + + /* * We shouldn't have allowed setting dirent events and the directory * flags FAN_ONDIR and FAN_EVENT_ON_CHILD in mask of non-dir inode, * but because we always allowed it, error only when using new APIs. diff --git a/fs/ntfs3/attrib.c b/fs/ntfs3/attrib.c index 0b8bc66377db..a9d82bbb4729 100644 --- a/fs/ntfs3/attrib.c +++ b/fs/ntfs3/attrib.c @@ -573,7 +573,7 @@ add_alloc_in_same_attr_seg: sbi, run, vcn, lcn, to_allocate, &pre_alloc, is_mft ? ALLOCATE_MFT : ALLOCATE_DEF, &alen, is_mft ? 0 : - (sbi->record_size - + (sbi->record_size - le32_to_cpu(rec->used) + 8) / 3 + 1, diff --git a/fs/ntfs3/attrlist.c b/fs/ntfs3/attrlist.c index c0c6bcbc8c05..42631b31adf1 100644 --- a/fs/ntfs3/attrlist.c +++ b/fs/ntfs3/attrlist.c @@ -52,7 +52,7 @@ int ntfs_load_attr_list(struct ntfs_inode *ni, struct ATTRIB *attr) if (!attr->non_res) { lsize = le32_to_cpu(attr->res.data_size); - le = kmalloc(al_aligned(lsize), GFP_NOFS); + le = kmalloc(al_aligned(lsize), GFP_NOFS | __GFP_NOWARN); if (!le) { err = -ENOMEM; goto out; @@ -80,7 +80,7 @@ int ntfs_load_attr_list(struct ntfs_inode *ni, struct ATTRIB *attr) if (err < 0) goto out; - le = kmalloc(al_aligned(lsize), GFP_NOFS); + le = kmalloc(al_aligned(lsize), GFP_NOFS | __GFP_NOWARN); if (!le) { err = -ENOMEM; goto out; @@ -375,8 +375,7 @@ bool al_remove_le(struct ntfs_inode *ni, struct ATTR_LIST_ENTRY *le) * al_delete_le - Delete first le from the list which matches its parameters. */ bool al_delete_le(struct ntfs_inode *ni, enum ATTR_TYPE type, CLST vcn, - const __le16 *name, size_t name_len, - const struct MFT_REF *ref) + const __le16 *name, u8 name_len, const struct MFT_REF *ref) { u16 size; struct ATTR_LIST_ENTRY *le; diff --git a/fs/ntfs3/bitmap.c b/fs/ntfs3/bitmap.c index 9a6c6a09d70c..107e808e06ea 100644 --- a/fs/ntfs3/bitmap.c +++ b/fs/ntfs3/bitmap.c @@ -287,8 +287,8 @@ static void wnd_add_free_ext(struct wnd_bitmap *wnd, size_t bit, size_t len, /* Check bits before 'bit'. */ ib = wnd->zone_bit == wnd->zone_end || bit < wnd->zone_end ? - 0 : - wnd->zone_end; + 0 : + wnd->zone_end; while (bit > ib && wnd_is_free_hlp(wnd, bit - 1, 1)) { bit -= 1; @@ -298,8 +298,8 @@ static void wnd_add_free_ext(struct wnd_bitmap *wnd, size_t bit, size_t len, /* Check bits after 'end_in'. */ ib = wnd->zone_bit == wnd->zone_end || end_in > wnd->zone_bit ? - wnd->nbits : - wnd->zone_bit; + wnd->nbits : + wnd->zone_bit; while (end_in < ib && wnd_is_free_hlp(wnd, end_in, 1)) { end_in += 1; @@ -418,7 +418,7 @@ static void wnd_remove_free_ext(struct wnd_bitmap *wnd, size_t bit, size_t len) n3 = rb_first(&wnd->count_tree); wnd->extent_max = n3 ? rb_entry(n3, struct e_node, count.node)->count.key : - 0; + 0; return; } diff --git a/fs/ntfs3/file.c b/fs/ntfs3/file.c index 9be3e8edf4f3..1d6c824246c4 100644 --- a/fs/ntfs3/file.c +++ b/fs/ntfs3/file.c @@ -179,7 +179,7 @@ static int ntfs_zero_range(struct inode *inode, u64 vbo, u64 vbo_to) { int err = 0; struct address_space *mapping = inode->i_mapping; - u32 blocksize = 1 << inode->i_blkbits; + u32 blocksize = i_blocksize(inode); pgoff_t idx = vbo >> PAGE_SHIFT; u32 from = vbo & (PAGE_SIZE - 1); pgoff_t idx_end = (vbo_to + PAGE_SIZE - 1) >> PAGE_SHIFT; @@ -192,7 +192,7 @@ static int ntfs_zero_range(struct inode *inode, u64 vbo, u64 vbo_to) for (; idx < idx_end; idx += 1, from = 0) { page_off = (loff_t)idx << PAGE_SHIFT; to = (page_off + PAGE_SIZE) > vbo_to ? (vbo_to - page_off) : - PAGE_SIZE; + PAGE_SIZE; iblock = page_off >> inode->i_blkbits; page = find_or_create_page(mapping, idx, @@ -1078,7 +1078,7 @@ static ssize_t ntfs_file_write_iter(struct kiocb *iocb, struct iov_iter *from) goto out; ret = is_compressed(ni) ? ntfs_compress_write(iocb, from) : - __generic_file_write_iter(iocb, from); + __generic_file_write_iter(iocb, from); out: inode_unlock(inode); diff --git a/fs/ntfs3/frecord.c b/fs/ntfs3/frecord.c index 2bfcf1a989c9..16bd9faa2d28 100644 --- a/fs/ntfs3/frecord.c +++ b/fs/ntfs3/frecord.c @@ -77,7 +77,7 @@ struct ATTR_STD_INFO *ni_std(struct ntfs_inode *ni) attr = mi_find_attr(&ni->mi, NULL, ATTR_STD, NULL, 0, NULL); return attr ? resident_data_ex(attr, sizeof(struct ATTR_STD_INFO)) : - NULL; + NULL; } /* @@ -92,7 +92,7 @@ struct ATTR_STD_INFO5 *ni_std5(struct ntfs_inode *ni) attr = mi_find_attr(&ni->mi, NULL, ATTR_STD, NULL, 0, NULL); return attr ? resident_data_ex(attr, sizeof(struct ATTR_STD_INFO5)) : - NULL; + NULL; } /* @@ -236,6 +236,7 @@ struct ATTRIB *ni_find_attr(struct ntfs_inode *ni, struct ATTRIB *attr, return attr; out: + ntfs_inode_err(&ni->vfs_inode, "failed to parse mft record"); ntfs_set_state(ni->mi.sbi, NTFS_DIRTY_ERROR); return NULL; } @@ -384,7 +385,7 @@ bool ni_add_subrecord(struct ntfs_inode *ni, CLST rno, struct mft_inode **mi) * ni_remove_attr - Remove all attributes for the given type/name/id. */ int ni_remove_attr(struct ntfs_inode *ni, enum ATTR_TYPE type, - const __le16 *name, size_t name_len, bool base_only, + const __le16 *name, u8 name_len, bool base_only, const __le16 *id) { int err; @@ -517,6 +518,9 @@ out: */ static int ni_repack(struct ntfs_inode *ni) { +#if 1 + return 0; +#else int err = 0; struct ntfs_sb_info *sbi = ni->mi.sbi; struct mft_inode *mi, *mi_p = NULL; @@ -639,6 +643,7 @@ static int ni_repack(struct ntfs_inode *ni) run_close(&run); return err; +#endif } /* @@ -813,10 +818,8 @@ int ni_create_attr_list(struct ntfs_inode *ni) * Looks like one record_size is always enough. */ le = kmalloc(al_aligned(rs), GFP_NOFS); - if (!le) { - err = -ENOMEM; - goto out; - } + if (!le) + return -ENOMEM; mi_get_ref(&ni->mi, &le->ref); ni->attr_list.le = le; @@ -865,15 +868,16 @@ int ni_create_attr_list(struct ntfs_inode *ni) if (to_free > free_b) { err = -EINVAL; - goto out1; + goto out; } } /* Allocate child MFT. */ err = ntfs_look_free_mft(sbi, &rno, is_mft, ni, &mi); if (err) - goto out1; + goto out; + err = -EINVAL; /* Call mi_remove_attr() in reverse order to keep pointers 'arr_move' valid. */ while (to_free > 0) { struct ATTRIB *b = arr_move[--nb]; @@ -882,7 +886,8 @@ int ni_create_attr_list(struct ntfs_inode *ni) attr = mi_insert_attr(mi, b->type, Add2Ptr(b, name_off), b->name_len, asize, name_off); - WARN_ON(!attr); + if (!attr) + goto out; mi_get_ref(mi, &le_b[nb]->ref); le_b[nb]->id = attr->id; @@ -892,17 +897,20 @@ int ni_create_attr_list(struct ntfs_inode *ni) attr->id = le_b[nb]->id; /* Remove from primary record. */ - WARN_ON(!mi_remove_attr(NULL, &ni->mi, b)); + if (!mi_remove_attr(NULL, &ni->mi, b)) + goto out; if (to_free <= asize) break; to_free -= asize; - WARN_ON(!nb); + if (!nb) + goto out; } attr = mi_insert_attr(&ni->mi, ATTR_LIST, NULL, 0, lsize + SIZEOF_RESIDENT, SIZEOF_RESIDENT); - WARN_ON(!attr); + if (!attr) + goto out; attr->non_res = 0; attr->flags = 0; @@ -916,14 +924,12 @@ int ni_create_attr_list(struct ntfs_inode *ni) ni->attr_list.dirty = false; mark_inode_dirty(&ni->vfs_inode); - goto out; + return 0; -out1: +out: kfree(ni->attr_list.le); ni->attr_list.le = NULL; ni->attr_list.size = 0; - -out: return err; } @@ -1638,14 +1644,13 @@ int ni_delete_all(struct ntfs_inode *ni) * Return: File name attribute by its value. */ struct ATTR_FILE_NAME *ni_fname_name(struct ntfs_inode *ni, - const struct cpu_str *uni, + const struct le_str *uni, const struct MFT_REF *home_dir, struct mft_inode **mi, struct ATTR_LIST_ENTRY **le) { struct ATTRIB *attr = NULL; struct ATTR_FILE_NAME *fname; - struct le_str *fns; if (le) *le = NULL; @@ -1669,10 +1674,9 @@ next: if (uni->len != fname->name_len) goto next; - fns = (struct le_str *)&fname->name_len; - if (ntfs_cmp_names_cpu(uni, fns, NULL, false)) + if (ntfs_cmp_names(uni->name, uni->len, fname->name, uni->len, NULL, + false)) goto next; - return fname; } @@ -1757,8 +1761,8 @@ int ni_new_attr_flags(struct ntfs_inode *ni, enum FILE_ATTRIBUTE new_fa) /* Resize nonresident empty attribute in-place only. */ new_asize = (new_aflags & (ATTR_FLAG_COMPRESSED | ATTR_FLAG_SPARSED)) ? - (SIZEOF_NONRESIDENT_EX + 8) : - (SIZEOF_NONRESIDENT + 8); + (SIZEOF_NONRESIDENT_EX + 8) : + (SIZEOF_NONRESIDENT + 8); if (!mi_resize_attr(mi, attr, new_asize - le32_to_cpu(attr->size))) return -EOPNOTSUPP; @@ -2910,7 +2914,7 @@ int ni_remove_name(struct ntfs_inode *dir_ni, struct ntfs_inode *ni, /* Find name in record. */ mi_get_ref(&dir_ni->mi, &de_name->home); - fname = ni_fname_name(ni, (struct cpu_str *)&de_name->name_len, + fname = ni_fname_name(ni, (struct le_str *)&de_name->name_len, &de_name->home, &mi, &le); if (!fname) return -ENOENT; @@ -3160,8 +3164,8 @@ static bool ni_update_parent(struct ntfs_inode *ni, struct NTFS_DUP_INFO *dup, __le64 valid_le; dup->alloc_size = is_attr_ext(attr) ? - attr->nres.total_size : - attr->nres.alloc_size; + attr->nres.total_size : + attr->nres.alloc_size; dup->data_size = attr->nres.data_size; if (new_valid > data_size) diff --git a/fs/ntfs3/fslog.c b/fs/ntfs3/fslog.c index 57762c5fe68b..12f28cdf5c83 100644 --- a/fs/ntfs3/fslog.c +++ b/fs/ntfs3/fslog.c @@ -828,8 +828,8 @@ static inline struct RESTART_TABLE *extend_rsttbl(struct RESTART_TABLE *tbl, memcpy(rt + 1, tbl + 1, esize * used); rt->free_goal = free_goal == ~0u ? - cpu_to_le32(~0u) : - cpu_to_le32(sizeof(struct RESTART_TABLE) + + cpu_to_le32(~0u) : + cpu_to_le32(sizeof(struct RESTART_TABLE) + free_goal * esize); if (tbl->first_free) { @@ -1090,8 +1090,8 @@ static inline u64 base_lsn(struct ntfs_log *log, << log->file_data_bits) + ((((is_log_record_end(hdr) && h_lsn <= le64_to_cpu(hdr->record_hdr.last_end_lsn)) ? - le16_to_cpu(hdr->record_hdr.next_record_off) : - log->page_size) + + le16_to_cpu(hdr->record_hdr.next_record_off) : + log->page_size) + lsn) >> 3); @@ -1299,8 +1299,8 @@ static void log_init_pg_hdr(struct ntfs_log *log, u32 sys_page_size, log->clst_per_page = 1; log->first_page = major_ver >= 2 ? - 0x22 * page_size : - ((sys_page_size << 1) + (page_size << 1)); + 0x22 * page_size : + ((sys_page_size << 1) + (page_size << 1)); log->major_ver = major_ver; log->minor_ver = minor_ver; } @@ -1513,8 +1513,8 @@ static u32 current_log_avail(struct ntfs_log *log) * If there is no oldest lsn then start at the first page of the file. */ oldest_off = (log->l_flags & NTFSLOG_NO_OLDEST_LSN) ? - log->first_page : - (log->oldest_lsn_off & ~log->sys_page_mask); + log->first_page : + (log->oldest_lsn_off & ~log->sys_page_mask); /* * We will use the next log page offset to compute the next free page. @@ -1522,9 +1522,9 @@ static u32 current_log_avail(struct ntfs_log *log) * If we are at the first page then use the end of the file. */ next_free_off = (log->l_flags & NTFSLOG_REUSE_TAIL) ? - log->next_page + log->page_size : + log->next_page + log->page_size : log->next_page == log->first_page ? log->l_size : - log->next_page; + log->next_page; /* If the two offsets are the same then there is no available space. */ if (oldest_off == next_free_off) @@ -1535,8 +1535,8 @@ static u32 current_log_avail(struct ntfs_log *log) */ free_bytes = oldest_off < next_free_off ? - log->total_avail_pages - (next_free_off - oldest_off) : - oldest_off - next_free_off; + log->total_avail_pages - (next_free_off - oldest_off) : + oldest_off - next_free_off; free_bytes >>= log->page_bits; return free_bytes * log->reserved; @@ -1671,7 +1671,7 @@ next_tail: best_lsn1 = first_tail ? base_lsn(log, first_tail, first_file_off) : 0; best_lsn2 = second_tail ? base_lsn(log, second_tail, second_file_off) : - 0; + 0; if (first_tail && second_tail) { if (best_lsn1 > best_lsn2) { @@ -1767,7 +1767,7 @@ tail_read: page_cnt = page_pos = 1; curpage_off = seq_base == log->seq_num ? min(log->next_page, page_off) : - log->next_page; + log->next_page; wrapped_file = curpage_off == log->first_page && @@ -1826,8 +1826,8 @@ use_cur_page: ((lsn_cur >> log->file_data_bits) + ((curpage_off < (lsn_to_vbo(log, lsn_cur) & ~log->page_mask)) ? - 1 : - 0)) != expected_seq) { + 1 : + 0)) != expected_seq) { goto check_tail; } @@ -2643,8 +2643,8 @@ static inline bool check_index_root(const struct ATTRIB *attr, const struct INDEX_ROOT *root = resident_data(attr); u8 index_bits = le32_to_cpu(root->index_block_size) >= sbi->cluster_size ? - sbi->cluster_bits : - SECTOR_SHIFT; + sbi->cluster_bits : + SECTOR_SHIFT; u8 block_clst = root->index_block_clst; if (le32_to_cpu(attr->res.data_size) < sizeof(struct INDEX_ROOT) || @@ -3861,9 +3861,9 @@ check_restart_area: /* If we have a valid page then grab a pointer to the restart area. */ ra2 = rst_info.valid_page ? - Add2Ptr(rst_info.r_page, + Add2Ptr(rst_info.r_page, le16_to_cpu(rst_info.r_page->ra_off)) : - NULL; + NULL; if (rst_info.chkdsk_was_run || (ra2 && ra2->client_idx[1] == LFS_NO_CLIENT_LE)) { diff --git a/fs/ntfs3/fsntfs.c b/fs/ntfs3/fsntfs.c index 28cc421102e5..33afee0f5559 100644 --- a/fs/ntfs3/fsntfs.c +++ b/fs/ntfs3/fsntfs.c @@ -9,6 +9,7 @@ #include <linux/buffer_head.h> #include <linux/fs.h> #include <linux/kernel.h> +#include <linux/nls.h> #include "debug.h" #include "ntfs.h" @@ -173,12 +174,12 @@ int ntfs_fix_post_read(struct NTFS_RECORD_HEADER *rhdr, size_t bytes, fo = le16_to_cpu(rhdr->fix_off); fn = simple ? ((bytes >> SECTOR_SHIFT) + 1) : - le16_to_cpu(rhdr->fix_num); + le16_to_cpu(rhdr->fix_num); /* Check errors. */ if ((fo & 1) || fo + fn * sizeof(short) > SECTOR_SIZE || !fn-- || fn * SECTOR_SIZE > bytes) { - return -EINVAL; /* Native chkntfs returns ok! */ + return -E_NTFS_CORRUPT; } /* Get fixup pointer. */ @@ -1661,7 +1662,8 @@ int ntfs_vbo_to_lbo(struct ntfs_sb_info *sbi, const struct runs_tree *run, return 0; } -struct ntfs_inode *ntfs_new_inode(struct ntfs_sb_info *sbi, CLST rno, bool dir) +struct ntfs_inode *ntfs_new_inode(struct ntfs_sb_info *sbi, CLST rno, + enum RECORD_FLAG flag) { int err = 0; struct super_block *sb = sbi->sb; @@ -1673,8 +1675,7 @@ struct ntfs_inode *ntfs_new_inode(struct ntfs_sb_info *sbi, CLST rno, bool dir) ni = ntfs_i(inode); - err = mi_format_new(&ni->mi, sbi, rno, dir ? RECORD_FLAG_DIR : 0, - false); + err = mi_format_new(&ni->mi, sbi, rno, flag, false); if (err) goto out; @@ -1937,7 +1938,7 @@ int ntfs_security_init(struct ntfs_sb_info *sbi) break; sii_e = (struct NTFS_DE_SII *)ne; - if (le16_to_cpu(ne->view.data_size) < SIZEOF_SECURITY_HDR) + if (le16_to_cpu(ne->view.data_size) < sizeof(sii_e->sec_hdr)) continue; next_id = le32_to_cpu(sii_e->sec_id) + 1; @@ -1998,18 +1999,18 @@ int ntfs_get_security_by_id(struct ntfs_sb_info *sbi, __le32 security_id, goto out; t32 = le32_to_cpu(sii_e->sec_hdr.size); - if (t32 < SIZEOF_SECURITY_HDR) { + if (t32 < sizeof(struct SECURITY_HDR)) { err = -EINVAL; goto out; } - if (t32 > SIZEOF_SECURITY_HDR + 0x10000) { + if (t32 > sizeof(struct SECURITY_HDR) + 0x10000) { /* Looks like too big security. 0x10000 - is arbitrary big number. */ err = -EFBIG; goto out; } - *size = t32 - SIZEOF_SECURITY_HDR; + *size = t32 - sizeof(struct SECURITY_HDR); p = kmalloc(*size, GFP_NOFS); if (!p) { @@ -2023,14 +2024,14 @@ int ntfs_get_security_by_id(struct ntfs_sb_info *sbi, __le32 security_id, if (err) goto out; - if (memcmp(&d_security, &sii_e->sec_hdr, SIZEOF_SECURITY_HDR)) { + if (memcmp(&d_security, &sii_e->sec_hdr, sizeof(d_security))) { err = -EINVAL; goto out; } err = ntfs_read_run_nb(sbi, &ni->file.run, le64_to_cpu(sii_e->sec_hdr.off) + - SIZEOF_SECURITY_HDR, + sizeof(struct SECURITY_HDR), p, *size, NULL); if (err) goto out; @@ -2069,7 +2070,7 @@ int ntfs_insert_security(struct ntfs_sb_info *sbi, struct NTFS_DE_SDH sdh_e; struct NTFS_DE_SII sii_e; struct SECURITY_HDR *d_security; - u32 new_sec_size = size_sd + SIZEOF_SECURITY_HDR; + u32 new_sec_size = size_sd + sizeof(struct SECURITY_HDR); u32 aligned_sec_size = ALIGN(new_sec_size, 16); struct SECURITY_KEY hash_key; struct ntfs_fnd *fnd_sdh = NULL; @@ -2207,14 +2208,14 @@ int ntfs_insert_security(struct ntfs_sb_info *sbi, /* Fill SII entry. */ sii_e.de.view.data_off = cpu_to_le16(offsetof(struct NTFS_DE_SII, sec_hdr)); - sii_e.de.view.data_size = cpu_to_le16(SIZEOF_SECURITY_HDR); + sii_e.de.view.data_size = cpu_to_le16(sizeof(struct SECURITY_HDR)); sii_e.de.view.res = 0; - sii_e.de.size = cpu_to_le16(SIZEOF_SII_DIRENTRY); + sii_e.de.size = cpu_to_le16(sizeof(struct NTFS_DE_SII)); sii_e.de.key_size = cpu_to_le16(sizeof(d_security->key.sec_id)); sii_e.de.flags = 0; sii_e.de.res = 0; sii_e.sec_id = d_security->key.sec_id; - memcpy(&sii_e.sec_hdr, d_security, SIZEOF_SECURITY_HDR); + memcpy(&sii_e.sec_hdr, d_security, sizeof(struct SECURITY_HDR)); err = indx_insert_entry(indx_sii, ni, &sii_e.de, NULL, NULL, 0); if (err) @@ -2223,7 +2224,7 @@ int ntfs_insert_security(struct ntfs_sb_info *sbi, /* Fill SDH entry. */ sdh_e.de.view.data_off = cpu_to_le16(offsetof(struct NTFS_DE_SDH, sec_hdr)); - sdh_e.de.view.data_size = cpu_to_le16(SIZEOF_SECURITY_HDR); + sdh_e.de.view.data_size = cpu_to_le16(sizeof(struct SECURITY_HDR)); sdh_e.de.view.res = 0; sdh_e.de.size = cpu_to_le16(SIZEOF_SDH_DIRENTRY); sdh_e.de.key_size = cpu_to_le16(sizeof(sdh_e.key)); @@ -2231,7 +2232,7 @@ int ntfs_insert_security(struct ntfs_sb_info *sbi, sdh_e.de.res = 0; sdh_e.key.hash = d_security->key.hash; sdh_e.key.sec_id = d_security->key.sec_id; - memcpy(&sdh_e.sec_hdr, d_security, SIZEOF_SECURITY_HDR); + memcpy(&sdh_e.sec_hdr, d_security, sizeof(struct SECURITY_HDR)); sdh_e.magic[0] = cpu_to_le16('I'); sdh_e.magic[1] = cpu_to_le16('I'); @@ -2522,7 +2523,8 @@ out: /* * run_deallocate - Deallocate clusters. */ -int run_deallocate(struct ntfs_sb_info *sbi, struct runs_tree *run, bool trim) +int run_deallocate(struct ntfs_sb_info *sbi, const struct runs_tree *run, + bool trim) { CLST lcn, len; size_t idx = 0; @@ -2578,13 +2580,13 @@ static inline bool name_has_forbidden_chars(const struct le_str *fname) return false; } -static inline bool is_reserved_name(struct ntfs_sb_info *sbi, +static inline bool is_reserved_name(const struct ntfs_sb_info *sbi, const struct le_str *fname) { int port_digit; const __le16 *name = fname->name; int len = fname->len; - u16 *upcase = sbi->upcase; + const u16 *upcase = sbi->upcase; /* check for 3 chars reserved names (device names) */ /* name by itself or with any extension is forbidden */ @@ -2618,3 +2620,60 @@ bool valid_windows_name(struct ntfs_sb_info *sbi, const struct le_str *fname) return !name_has_forbidden_chars(fname) && !is_reserved_name(sbi, fname); } + +/* + * ntfs_set_label - updates current ntfs label. + */ +int ntfs_set_label(struct ntfs_sb_info *sbi, u8 *label, int len) +{ + int err; + struct ATTRIB *attr; + struct ntfs_inode *ni = sbi->volume.ni; + const u8 max_ulen = 0x80; /* TODO: use attrdef to get maximum length */ + /* Allocate PATH_MAX bytes. */ + struct cpu_str *uni = __getname(); + + if (!uni) + return -ENOMEM; + + err = ntfs_nls_to_utf16(sbi, label, len, uni, (PATH_MAX - 2) / 2, + UTF16_LITTLE_ENDIAN); + if (err < 0) + goto out; + + if (uni->len > max_ulen) { + ntfs_warn(sbi->sb, "new label is too long"); + err = -EFBIG; + goto out; + } + + ni_lock(ni); + + /* Ignore any errors. */ + ni_remove_attr(ni, ATTR_LABEL, NULL, 0, false, NULL); + + err = ni_insert_resident(ni, uni->len * sizeof(u16), ATTR_LABEL, NULL, + 0, &attr, NULL, NULL); + if (err < 0) + goto unlock_out; + + /* write new label in on-disk struct. */ + memcpy(resident_data(attr), uni->name, uni->len * sizeof(u16)); + + /* update cached value of current label. */ + if (len >= ARRAY_SIZE(sbi->volume.label)) + len = ARRAY_SIZE(sbi->volume.label) - 1; + memcpy(sbi->volume.label, label, len); + sbi->volume.label[len] = 0; + mark_inode_dirty_sync(&ni->vfs_inode); + +unlock_out: + ni_unlock(ni); + + if (!err) + err = _ni_write_inode(&ni->vfs_inode, 0); + +out: + __putname(uni); + return err; +}
\ No newline at end of file diff --git a/fs/ntfs3/index.c b/fs/ntfs3/index.c index 0a48d2d67219..124c6e822623 100644 --- a/fs/ntfs3/index.c +++ b/fs/ntfs3/index.c @@ -432,8 +432,8 @@ next_run: nbits = 8 * (data_size - vbo); ok = nbits > from ? - (*fn)((ulong *)bh->b_data, from, nbits, ret) : - false; + (*fn)((ulong *)bh->b_data, from, nbits, ret) : + false; put_bh(bh); if (ok) { @@ -1113,6 +1113,12 @@ ok: *node = in; out: + if (err == -E_NTFS_CORRUPT) { + ntfs_inode_err(&ni->vfs_inode, "directory corrupted"); + ntfs_set_state(ni->mi.sbi, NTFS_DIRTY_ERROR); + err = -EINVAL; + } + if (ib != in->index) kfree(ib); @@ -1676,8 +1682,8 @@ static int indx_insert_into_root(struct ntfs_index *indx, struct ntfs_inode *ni, /* Create alloc and bitmap attributes (if not). */ err = run_is_empty(&indx->alloc_run) ? - indx_create_allocate(indx, ni, &new_vbn) : - indx_add_allocate(indx, ni, &new_vbn); + indx_create_allocate(indx, ni, &new_vbn) : + indx_add_allocate(indx, ni, &new_vbn); /* Layout of record may be changed, so rescan root. */ root = indx_get_root(indx, ni, &attr, &mi); @@ -1868,8 +1874,8 @@ indx_insert_into_buffer(struct ntfs_index *indx, struct ntfs_inode *ni, (*indx->cmp)(new_de + 1, le16_to_cpu(new_de->key_size), up_e + 1, le16_to_cpu(up_e->key_size), ctx) < 0 ? - hdr2 : - hdr1, + hdr2 : + hdr1, new_de, NULL, ctx); indx_mark_used(indx, ni, new_vbn >> indx->idx2vbn_bits); @@ -2340,7 +2346,7 @@ int indx_delete_entry(struct ntfs_index *indx, struct ntfs_inode *ni, re, ctx, fnd->level - 1, fnd) : - indx_insert_into_root(indx, ni, re, e, + indx_insert_into_root(indx, ni, re, e, ctx, fnd, 0); kfree(re); diff --git a/fs/ntfs3/inode.c b/fs/ntfs3/inode.c index 6c560245eef4..dc7e7ab701c6 100644 --- a/fs/ntfs3/inode.c +++ b/fs/ntfs3/inode.c @@ -263,7 +263,7 @@ next_attr: goto next_attr; run = ino == MFT_REC_BITMAP ? &sbi->used.bitmap.run : - &ni->file.run; + &ni->file.run; break; case ATTR_ROOT: @@ -291,8 +291,8 @@ next_attr: goto out; mode = sb->s_root ? - (S_IFDIR | (0777 & sbi->options->fs_dmask_inv)) : - (S_IFDIR | 0777); + (S_IFDIR | (0777 & sbi->options->fs_dmask_inv)) : + (S_IFDIR | 0777); goto next_attr; case ATTR_ALLOC: @@ -450,7 +450,7 @@ end_enum: inode->i_op = &ntfs_file_inode_operations; inode->i_fop = &ntfs_file_operations; inode->i_mapping->a_ops = is_compressed(ni) ? &ntfs_aops_cmpr : - &ntfs_aops; + &ntfs_aops; if (ino != MFT_REC_MFT) init_rwsem(&ni->file.run_lock); } else if (S_ISCHR(mode) || S_ISBLK(mode) || S_ISFIFO(mode) || @@ -787,7 +787,7 @@ static ssize_t ntfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter) ret = blockdev_direct_IO(iocb, inode, iter, wr ? ntfs_get_block_direct_IO_W : - ntfs_get_block_direct_IO_R); + ntfs_get_block_direct_IO_R); if (ret > 0) end = vbo + ret; @@ -1191,11 +1191,11 @@ out: * - ntfs_symlink * - ntfs_mkdir * - ntfs_atomic_open - * + * * NOTE: if fnd != NULL (ntfs_atomic_open) then @dir is locked */ -struct inode *ntfs_create_inode(struct mnt_idmap *idmap, - struct inode *dir, struct dentry *dentry, +struct inode *ntfs_create_inode(struct mnt_idmap *idmap, struct inode *dir, + struct dentry *dentry, const struct cpu_str *uni, umode_t mode, dev_t dev, const char *symname, u32 size, struct ntfs_fnd *fnd) @@ -1309,7 +1309,7 @@ struct inode *ntfs_create_inode(struct mnt_idmap *idmap, if (err) goto out2; - ni = ntfs_new_inode(sbi, ino, fa & FILE_ATTRIBUTE_DIRECTORY); + ni = ntfs_new_inode(sbi, ino, S_ISDIR(mode) ? RECORD_FLAG_DIR : 0); if (IS_ERR(ni)) { err = PTR_ERR(ni); ni = NULL; @@ -1437,8 +1437,7 @@ struct inode *ntfs_create_inode(struct mnt_idmap *idmap, root = Add2Ptr(attr, sizeof(I30_NAME) + SIZEOF_RESIDENT); memcpy(root, dir_root, offsetof(struct INDEX_ROOT, ihdr)); - root->ihdr.de_off = - cpu_to_le32(sizeof(struct INDEX_HDR)); // 0x10 + root->ihdr.de_off = cpu_to_le32(sizeof(struct INDEX_HDR)); root->ihdr.used = cpu_to_le32(sizeof(struct INDEX_HDR) + sizeof(struct NTFS_DE)); root->ihdr.total = root->ihdr.used; @@ -1605,7 +1604,7 @@ struct inode *ntfs_create_inode(struct mnt_idmap *idmap, inode->i_op = &ntfs_file_inode_operations; inode->i_fop = &ntfs_file_operations; inode->i_mapping->a_ops = is_compressed(ni) ? &ntfs_aops_cmpr : - &ntfs_aops; + &ntfs_aops; init_rwsem(&ni->file.run_lock); } else { inode->i_op = &ntfs_special_inode_operations; diff --git a/fs/ntfs3/lznt.c b/fs/ntfs3/lznt.c index 61e161c7c567..4aae598d6d88 100644 --- a/fs/ntfs3/lznt.c +++ b/fs/ntfs3/lznt.c @@ -297,7 +297,7 @@ next: struct lznt *get_lznt_ctx(int level) { struct lznt *r = kzalloc(level ? offsetof(struct lznt, hash) : - sizeof(struct lznt), + sizeof(struct lznt), GFP_NOFS); if (r) @@ -393,8 +393,8 @@ ssize_t decompress_lznt(const void *cmpr, size_t cmpr_size, void *unc, } else { /* This chunk does not contain compressed data. */ unc_use = unc_chunk + LZNT_CHUNK_SIZE > unc_end ? - unc_end - unc_chunk : - LZNT_CHUNK_SIZE; + unc_end - unc_chunk : + LZNT_CHUNK_SIZE; if (cmpr_chunk + sizeof(chunk_hdr) + unc_use > cmpr_end) { diff --git a/fs/ntfs3/namei.c b/fs/ntfs3/namei.c index 9736b1e4a0f6..70f8c859e0ad 100644 --- a/fs/ntfs3/namei.c +++ b/fs/ntfs3/namei.c @@ -109,8 +109,8 @@ static int ntfs_create(struct mnt_idmap *idmap, struct inode *dir, { struct inode *inode; - inode = ntfs_create_inode(idmap, dir, dentry, NULL, S_IFREG | mode, - 0, NULL, 0, NULL); + inode = ntfs_create_inode(idmap, dir, dentry, NULL, S_IFREG | mode, 0, + NULL, 0, NULL); return IS_ERR(inode) ? PTR_ERR(inode) : 0; } @@ -125,8 +125,8 @@ static int ntfs_mknod(struct mnt_idmap *idmap, struct inode *dir, { struct inode *inode; - inode = ntfs_create_inode(idmap, dir, dentry, NULL, mode, rdev, - NULL, 0, NULL); + inode = ntfs_create_inode(idmap, dir, dentry, NULL, mode, rdev, NULL, 0, + NULL); return IS_ERR(inode) ? PTR_ERR(inode) : 0; } @@ -199,8 +199,8 @@ static int ntfs_symlink(struct mnt_idmap *idmap, struct inode *dir, u32 size = strlen(symname); struct inode *inode; - inode = ntfs_create_inode(idmap, dir, dentry, NULL, S_IFLNK | 0777, - 0, symname, size, NULL); + inode = ntfs_create_inode(idmap, dir, dentry, NULL, S_IFLNK | 0777, 0, + symname, size, NULL); return IS_ERR(inode) ? PTR_ERR(inode) : 0; } @@ -213,8 +213,8 @@ static int ntfs_mkdir(struct mnt_idmap *idmap, struct inode *dir, { struct inode *inode; - inode = ntfs_create_inode(idmap, dir, dentry, NULL, S_IFDIR | mode, - 0, NULL, 0, NULL); + inode = ntfs_create_inode(idmap, dir, dentry, NULL, S_IFDIR | mode, 0, + NULL, 0, NULL); return IS_ERR(inode) ? PTR_ERR(inode) : 0; } @@ -422,19 +422,10 @@ static int ntfs_atomic_open(struct inode *dir, struct dentry *dentry, * fnd contains tree's path to insert to. * If fnd is not NULL then dir is locked. */ - - /* - * Unfortunately I don't know how to get here correct 'struct nameidata *nd' - * or 'struct mnt_idmap *idmap'. - * See atomic_open in fs/namei.c. - * This is why xfstest/633 failed. - * Looks like ntfs_atomic_open must accept 'struct mnt_idmap *idmap' as argument. - */ - - inode = ntfs_create_inode(&nop_mnt_idmap, dir, dentry, uni, mode, 0, - NULL, 0, fnd); + inode = ntfs_create_inode(mnt_idmap(file->f_path.mnt), dir, dentry, uni, + mode, 0, NULL, 0, fnd); err = IS_ERR(inode) ? PTR_ERR(inode) : - finish_open(file, dentry, ntfs_file_open); + finish_open(file, dentry, ntfs_file_open); dput(d); out2: diff --git a/fs/ntfs3/ntfs.h b/fs/ntfs3/ntfs.h index 90151e56c122..98b76d1b09e7 100644 --- a/fs/ntfs3/ntfs.h +++ b/fs/ntfs3/ntfs.h @@ -95,11 +95,10 @@ enum RECORD_NUM { MFT_REC_BITMAP = 6, MFT_REC_BOOT = 7, MFT_REC_BADCLUST = 8, - //MFT_REC_QUOTA = 9, - MFT_REC_SECURE = 9, // NTFS 3.0 + MFT_REC_SECURE = 9, MFT_REC_UPCASE = 10, - MFT_REC_EXTEND = 11, // NTFS 3.0 - MFT_REC_RESERVED = 11, + MFT_REC_EXTEND = 11, + MFT_REC_RESERVED = 12, MFT_REC_FREE = 16, MFT_REC_USER = 24, }; @@ -109,7 +108,6 @@ enum ATTR_TYPE { ATTR_STD = cpu_to_le32(0x10), ATTR_LIST = cpu_to_le32(0x20), ATTR_NAME = cpu_to_le32(0x30), - // ATTR_VOLUME_VERSION on Nt4 ATTR_ID = cpu_to_le32(0x40), ATTR_SECURE = cpu_to_le32(0x50), ATTR_LABEL = cpu_to_le32(0x60), @@ -118,7 +116,6 @@ enum ATTR_TYPE { ATTR_ROOT = cpu_to_le32(0x90), ATTR_ALLOC = cpu_to_le32(0xA0), ATTR_BITMAP = cpu_to_le32(0xB0), - // ATTR_SYMLINK on Nt4 ATTR_REPARSE = cpu_to_le32(0xC0), ATTR_EA_INFO = cpu_to_le32(0xD0), ATTR_EA = cpu_to_le32(0xE0), @@ -144,6 +141,7 @@ enum FILE_ATTRIBUTE { FILE_ATTRIBUTE_ENCRYPTED = cpu_to_le32(0x00004000), FILE_ATTRIBUTE_VALID_FLAGS = cpu_to_le32(0x00007fb7), FILE_ATTRIBUTE_DIRECTORY = cpu_to_le32(0x10000000), + FILE_ATTRIBUTE_INDEX = cpu_to_le32(0x20000000) }; static_assert(sizeof(enum FILE_ATTRIBUTE) == 4); @@ -266,7 +264,7 @@ enum RECORD_FLAG { RECORD_FLAG_IN_USE = cpu_to_le16(0x0001), RECORD_FLAG_DIR = cpu_to_le16(0x0002), RECORD_FLAG_SYSTEM = cpu_to_le16(0x0004), - RECORD_FLAG_UNKNOWN = cpu_to_le16(0x0008), + RECORD_FLAG_INDEX = cpu_to_le16(0x0008), }; /* MFT Record structure. */ @@ -290,6 +288,15 @@ struct MFT_REC { #define MFTRECORD_FIXUP_OFFSET_1 offsetof(struct MFT_REC, res) #define MFTRECORD_FIXUP_OFFSET_3 offsetof(struct MFT_REC, fixups) +/* + * define MFTRECORD_FIXUP_OFFSET as MFTRECORD_FIXUP_OFFSET_3 (0x30) + * to format new mft records with bigger header (as current ntfs.sys does) + * + * define MFTRECORD_FIXUP_OFFSET as MFTRECORD_FIXUP_OFFSET_1 (0x2A) + * to format new mft records with smaller header (as old ntfs.sys did) + * Both variants are valid. + */ +#define MFTRECORD_FIXUP_OFFSET MFTRECORD_FIXUP_OFFSET_1 static_assert(MFTRECORD_FIXUP_OFFSET_1 == 0x2A); static_assert(MFTRECORD_FIXUP_OFFSET_3 == 0x30); @@ -331,18 +338,18 @@ struct ATTR_NONRESIDENT { __le64 svcn; // 0x10: Starting VCN of this segment. __le64 evcn; // 0x18: End VCN of this segment. __le16 run_off; // 0x20: Offset to packed runs. - // Unit of Compression size for this stream, expressed - // as a log of the cluster size. + // Unit of Compression size for this stream, expressed + // as a log of the cluster size. // - // 0 means file is not compressed - // 1, 2, 3, and 4 are potentially legal values if the - // stream is compressed, however the implementation - // may only choose to use 4, or possibly 3. Note - // that 4 means cluster size time 16. If convenient - // the implementation may wish to accept a - // reasonable range of legal values here (1-5?), - // even if the implementation only generates - // a smaller set of values itself. + // 0 means file is not compressed + // 1, 2, 3, and 4 are potentially legal values if the + // stream is compressed, however the implementation + // may only choose to use 4, or possibly 3. + // Note that 4 means cluster size time 16. + // If convenient the implementation may wish to accept a + // reasonable range of legal values here (1-5?), + // even if the implementation only generates + // a smaller set of values itself. u8 c_unit; // 0x22: u8 res1[5]; // 0x23: __le64 alloc_size; // 0x28: The allocated size of attribute in bytes. @@ -836,16 +843,22 @@ static_assert(sizeof(struct ATTR_DEF_ENTRY) == 0xa0); /* Object ID (0x40) */ struct OBJECT_ID { struct GUID ObjId; // 0x00: Unique Id assigned to file. - struct GUID BirthVolumeId; // 0x10: Birth Volume Id is the Object Id of the Volume on. - // which the Object Id was allocated. It never changes. - struct GUID BirthObjectId; // 0x20: Birth Object Id is the first Object Id that was - // ever assigned to this MFT Record. I.e. If the Object Id - // is changed for some reason, this field will reflect the - // original value of the Object Id. - struct GUID DomainId; // 0x30: Domain Id is currently unused but it is intended to be - // used in a network environment where the local machine is - // part of a Windows 2000 Domain. This may be used in a Windows - // 2000 Advanced Server managed domain. + + // Birth Volume Id is the Object Id of the Volume on. + // which the Object Id was allocated. It never changes. + struct GUID BirthVolumeId; //0x10: + + // Birth Object Id is the first Object Id that was + // ever assigned to this MFT Record. I.e. If the Object Id + // is changed for some reason, this field will reflect the + // original value of the Object Id. + struct GUID BirthObjectId; // 0x20: + + // Domain Id is currently unused but it is intended to be + // used in a network environment where the local machine is + // part of a Windows 2000 Domain. This may be used in a Windows + // 2000 Advanced Server managed domain. + struct GUID DomainId; // 0x30: }; static_assert(sizeof(struct OBJECT_ID) == 0x40); @@ -855,32 +868,35 @@ struct NTFS_DE_O { struct NTFS_DE de; struct GUID ObjId; // 0x10: Unique Id assigned to file. struct MFT_REF ref; // 0x20: MFT record number with this file. - struct GUID BirthVolumeId; // 0x28: Birth Volume Id is the Object Id of the Volume on - // which the Object Id was allocated. It never changes. - struct GUID BirthObjectId; // 0x38: Birth Object Id is the first Object Id that was - // ever assigned to this MFT Record. I.e. If the Object Id - // is changed for some reason, this field will reflect the - // original value of the Object Id. - // This field is valid if data_size == 0x48. - struct GUID BirthDomainId; // 0x48: Domain Id is currently unused but it is intended - // to be used in a network environment where the local - // machine is part of a Windows 2000 Domain. This may be - // used in a Windows 2000 Advanced Server managed domain. + + // Birth Volume Id is the Object Id of the Volume on + // which the Object Id was allocated. It never changes. + struct GUID BirthVolumeId; // 0x28: + + // Birth Object Id is the first Object Id that was + // ever assigned to this MFT Record. I.e. If the Object Id + // is changed for some reason, this field will reflect the + // original value of the Object Id. + // This field is valid if data_size == 0x48. + struct GUID BirthObjectId; // 0x38: + + // Domain Id is currently unused but it is intended + // to be used in a network environment where the local + // machine is part of a Windows 2000 Domain. This may be + // used in a Windows 2000 Advanced Server managed domain. + struct GUID BirthDomainId; // 0x48: }; static_assert(sizeof(struct NTFS_DE_O) == 0x58); -#define NTFS_OBJECT_ENTRY_DATA_SIZE1 \ - 0x38 // struct NTFS_DE_O.BirthDomainId is not used -#define NTFS_OBJECT_ENTRY_DATA_SIZE2 \ - 0x48 // struct NTFS_DE_O.BirthDomainId is used - /* Q Directory entry structure ( rule = 0x11 ) */ struct NTFS_DE_Q { struct NTFS_DE de; __le32 owner_id; // 0x10: Unique Id assigned to file + + /* here is 0x30 bytes of user quota. NOTE: 4 byte aligned! */ __le32 Version; // 0x14: 0x02 - __le32 flags2; // 0x18: Quota flags, see above + __le32 Flags; // 0x18: Quota flags, see above __le64 BytesUsed; // 0x1C: __le64 ChangeTime; // 0x24: __le64 WarningLimit; // 0x28: @@ -888,9 +904,9 @@ struct NTFS_DE_Q { __le64 ExceededTime; // 0x3C: // SID is placed here -}; // sizeof() = 0x44 +}__packed; // sizeof() = 0x44 -#define SIZEOF_NTFS_DE_Q 0x44 +static_assert(sizeof(struct NTFS_DE_Q) == 0x44); #define SecurityDescriptorsBlockSize 0x40000 // 256K #define SecurityDescriptorMaxSize 0x20000 // 128K @@ -912,7 +928,7 @@ struct SECURITY_HDR { */ } __packed; -#define SIZEOF_SECURITY_HDR 0x14 +static_assert(sizeof(struct SECURITY_HDR) == 0x14); /* SII Directory entry structure */ struct NTFS_DE_SII { @@ -921,7 +937,8 @@ struct NTFS_DE_SII { struct SECURITY_HDR sec_hdr; // 0x14: } __packed; -#define SIZEOF_SII_DIRENTRY 0x28 +static_assert(offsetof(struct NTFS_DE_SII, sec_hdr) == 0x14); +static_assert(sizeof(struct NTFS_DE_SII) == 0x28); /* SDH Directory entry structure */ struct NTFS_DE_SDH { @@ -1155,7 +1172,7 @@ struct REPARSE_DATA_BUFFER { #define FILE_NEED_EA 0x80 // See ntifs.h /* - *FILE_NEED_EA, indicates that the file to which the EA belongs cannot be + * FILE_NEED_EA, indicates that the file to which the EA belongs cannot be * interpreted without understanding the associated extended attributes. */ struct EA_INFO { diff --git a/fs/ntfs3/ntfs_fs.h b/fs/ntfs3/ntfs_fs.h index eb01f7e76479..629403ede6e5 100644 --- a/fs/ntfs3/ntfs_fs.h +++ b/fs/ntfs3/ntfs_fs.h @@ -53,6 +53,8 @@ enum utf16_endian; #define E_NTFS_NONRESIDENT 556 /* NTFS specific error code about punch hole. */ #define E_NTFS_NOTALIGNED 557 +/* NTFS specific error code when on-disk struct is corrupted. */ +#define E_NTFS_CORRUPT 558 /* sbi->flags */ @@ -274,7 +276,7 @@ struct ntfs_sb_info { __le16 flags; // Cached current VOLUME_INFO::flags, VOLUME_FLAG_DIRTY. u8 major_ver; u8 minor_ver; - char label[65]; + char label[256]; bool real_dirty; // Real fs state. } volume; @@ -284,7 +286,6 @@ struct ntfs_sb_info { struct ntfs_inode *ni; u32 next_id; u64 next_off; - __le32 def_security_id; } security; @@ -312,6 +313,7 @@ struct ntfs_sb_info { struct ntfs_mount_options *options; struct ratelimit_state msg_ratelimit; + struct proc_dir_entry *procdir; }; /* One MFT record(usually 1024 bytes), consists of attributes. */ @@ -465,8 +467,7 @@ int al_add_le(struct ntfs_inode *ni, enum ATTR_TYPE type, const __le16 *name, struct ATTR_LIST_ENTRY **new_le); bool al_remove_le(struct ntfs_inode *ni, struct ATTR_LIST_ENTRY *le); bool al_delete_le(struct ntfs_inode *ni, enum ATTR_TYPE type, CLST vcn, - const __le16 *name, size_t name_len, - const struct MFT_REF *ref); + const __le16 *name, u8 name_len, const struct MFT_REF *ref); int al_update(struct ntfs_inode *ni, int sync); static inline size_t al_aligned(size_t size) { @@ -525,7 +526,7 @@ struct ATTRIB *ni_load_attr(struct ntfs_inode *ni, enum ATTR_TYPE type, int ni_load_all_mi(struct ntfs_inode *ni); bool ni_add_subrecord(struct ntfs_inode *ni, CLST rno, struct mft_inode **mi); int ni_remove_attr(struct ntfs_inode *ni, enum ATTR_TYPE type, - const __le16 *name, size_t name_len, bool base_only, + const __le16 *name, u8 name_len, bool base_only, const __le16 *id); int ni_create_attr_list(struct ntfs_inode *ni); int ni_expand_list(struct ntfs_inode *ni); @@ -542,7 +543,7 @@ void ni_remove_attr_le(struct ntfs_inode *ni, struct ATTRIB *attr, struct mft_inode *mi, struct ATTR_LIST_ENTRY *le); int ni_delete_all(struct ntfs_inode *ni); struct ATTR_FILE_NAME *ni_fname_name(struct ntfs_inode *ni, - const struct cpu_str *uni, + const struct le_str *uni, const struct MFT_REF *home, struct mft_inode **mi, struct ATTR_LIST_ENTRY **entry); @@ -629,7 +630,7 @@ int ntfs_bio_fill_1(struct ntfs_sb_info *sbi, const struct runs_tree *run); int ntfs_vbo_to_lbo(struct ntfs_sb_info *sbi, const struct runs_tree *run, u64 vbo, u64 *lbo, u64 *bytes); struct ntfs_inode *ntfs_new_inode(struct ntfs_sb_info *sbi, CLST nRec, - bool dir); + enum RECORD_FLAG flag); extern const u8 s_default_security[0x50]; bool is_sd_valid(const struct SECURITY_DESCRIPTOR_RELATIVE *sd, u32 len); int ntfs_security_init(struct ntfs_sb_info *sbi); @@ -647,8 +648,10 @@ int ntfs_insert_reparse(struct ntfs_sb_info *sbi, __le32 rtag, int ntfs_remove_reparse(struct ntfs_sb_info *sbi, __le32 rtag, const struct MFT_REF *ref); void mark_as_free_ex(struct ntfs_sb_info *sbi, CLST lcn, CLST len, bool trim); -int run_deallocate(struct ntfs_sb_info *sbi, struct runs_tree *run, bool trim); +int run_deallocate(struct ntfs_sb_info *sbi, const struct runs_tree *run, + bool trim); bool valid_windows_name(struct ntfs_sb_info *sbi, const struct le_str *name); +int ntfs_set_label(struct ntfs_sb_info *sbi, u8 *label, int len); /* Globals from index.c */ int indx_used_bit(struct ntfs_index *indx, struct ntfs_inode *ni, size_t *bit); @@ -706,8 +709,8 @@ int ntfs_sync_inode(struct inode *inode); int ntfs_flush_inodes(struct super_block *sb, struct inode *i1, struct inode *i2); int inode_write_data(struct inode *inode, const void *data, size_t bytes); -struct inode *ntfs_create_inode(struct mnt_idmap *idmap, - struct inode *dir, struct dentry *dentry, +struct inode *ntfs_create_inode(struct mnt_idmap *idmap, struct inode *dir, + struct dentry *dentry, const struct cpu_str *uni, umode_t mode, dev_t dev, const char *symname, u32 size, struct ntfs_fnd *fnd); @@ -736,7 +739,7 @@ struct ATTRIB *mi_enum_attr(struct mft_inode *mi, struct ATTRIB *attr); // TODO: id? struct ATTRIB *mi_find_attr(struct mft_inode *mi, struct ATTRIB *attr, enum ATTR_TYPE type, const __le16 *name, - size_t name_len, const __le16 *id); + u8 name_len, const __le16 *id); static inline struct ATTRIB *rec_find_attr_le(struct mft_inode *rec, struct ATTR_LIST_ENTRY *le) { @@ -856,12 +859,12 @@ unsigned long ntfs_names_hash(const u16 *name, size_t len, const u16 *upcase, /* globals from xattr.c */ #ifdef CONFIG_NTFS3_FS_POSIX_ACL -struct posix_acl *ntfs_get_acl(struct mnt_idmap *idmap, - struct dentry *dentry, int type); +struct posix_acl *ntfs_get_acl(struct mnt_idmap *idmap, struct dentry *dentry, + int type); int ntfs_set_acl(struct mnt_idmap *idmap, struct dentry *dentry, struct posix_acl *acl, int type); int ntfs_init_acl(struct mnt_idmap *idmap, struct inode *inode, - struct inode *dir); + struct inode *dir); #else #define ntfs_get_acl NULL #define ntfs_set_acl NULL diff --git a/fs/ntfs3/record.c b/fs/ntfs3/record.c index 2a281cead2bc..c12ebffc94da 100644 --- a/fs/ntfs3/record.c +++ b/fs/ntfs3/record.c @@ -124,7 +124,7 @@ int mi_read(struct mft_inode *mi, bool is_mft) struct rw_semaphore *rw_lock = NULL; if (is_mounted(sbi)) { - if (!is_mft) { + if (!is_mft && mft_ni) { rw_lock = &mft_ni->file.run_lock; down_read(rw_lock); } @@ -148,7 +148,7 @@ int mi_read(struct mft_inode *mi, bool is_mft) ni_lock(mft_ni); down_write(rw_lock); } - err = attr_load_runs_vcn(mft_ni, ATTR_DATA, NULL, 0, &mft_ni->file.run, + err = attr_load_runs_vcn(mft_ni, ATTR_DATA, NULL, 0, run, vbo >> sbi->cluster_bits); if (rw_lock) { up_write(rw_lock); @@ -180,6 +180,12 @@ ok: return 0; out: + if (err == -E_NTFS_CORRUPT) { + ntfs_err(sbi->sb, "mft corrupted"); + ntfs_set_state(sbi, NTFS_DIRTY_ERROR); + err = -EINVAL; + } + return err; } @@ -296,7 +302,7 @@ struct ATTRIB *mi_enum_attr(struct mft_inode *mi, struct ATTRIB *attr) */ struct ATTRIB *mi_find_attr(struct mft_inode *mi, struct ATTRIB *attr, enum ATTR_TYPE type, const __le16 *name, - size_t name_len, const __le16 *id) + u8 name_len, const __le16 *id) { u32 type_in = le32_to_cpu(type); u32 atype; @@ -382,6 +388,8 @@ int mi_format_new(struct mft_inode *mi, struct ntfs_sb_info *sbi, CLST rno, rec->seq = cpu_to_le16(seq); rec->flags = RECORD_FLAG_IN_USE | flags; + if (MFTRECORD_FIXUP_OFFSET == MFTRECORD_FIXUP_OFFSET_3) + rec->mft_record = cpu_to_le32(rno); mi->dirty = true; diff --git a/fs/ntfs3/run.c b/fs/ntfs3/run.c index 47612d16c027..cb8cf0161177 100644 --- a/fs/ntfs3/run.c +++ b/fs/ntfs3/run.c @@ -434,8 +434,8 @@ requires_new_range: if (should_add_tail) { tail_lcn = r->lcn == SPARSE_LCN ? - SPARSE_LCN : - (r->lcn + Tovcn); + SPARSE_LCN : + (r->lcn + Tovcn); tail_vcn = r->vcn + Tovcn; tail_len = r->len - Tovcn; } diff --git a/fs/ntfs3/super.c b/fs/ntfs3/super.c index 5158dd31fd97..1a02072b6b0e 100644 --- a/fs/ntfs3/super.c +++ b/fs/ntfs3/super.c @@ -57,6 +57,7 @@ #include <linux/minmax.h> #include <linux/module.h> #include <linux/nls.h> +#include <linux/proc_fs.h> #include <linux/seq_file.h> #include <linux/statfs.h> @@ -116,8 +117,8 @@ void ntfs_inode_printk(struct inode *inode, const char *fmt, ...) /* Use static allocated buffer, if possible. */ name = atomic_dec_and_test(&s_name_buf_cnt) ? - s_name_buf : - kmalloc(sizeof(s_name_buf), GFP_NOFS); + s_name_buf : + kmalloc(sizeof(s_name_buf), GFP_NOFS); if (name) { struct dentry *de = d_find_alias(inode); @@ -257,6 +258,7 @@ enum Opt { Opt_err, }; +// clang-format off static const struct fs_parameter_spec ntfs_fs_parameters[] = { fsparam_u32("uid", Opt_uid), fsparam_u32("gid", Opt_gid), @@ -277,9 +279,13 @@ static const struct fs_parameter_spec ntfs_fs_parameters[] = { fsparam_flag_no("nocase", Opt_nocase), {} }; +// clang-format on /* * Load nls table or if @nls is utf8 then return NULL. + * + * It is good idea to use here "const char *nls". + * But load_nls accepts "char*". */ static struct nls_table *ntfs_load_nls(char *nls) { @@ -436,6 +442,103 @@ static int ntfs_fs_reconfigure(struct fs_context *fc) return 0; } +#ifdef CONFIG_PROC_FS +static struct proc_dir_entry *proc_info_root; + +/* + * ntfs3_volinfo: + * + * The content of /proc/fs/ntfs3/<dev>/volinfo + * + * ntfs3.1 + * cluster size + * number of clusters +*/ +static int ntfs3_volinfo(struct seq_file *m, void *o) +{ + struct super_block *sb = m->private; + struct ntfs_sb_info *sbi = sb->s_fs_info; + + seq_printf(m, "ntfs%d.%d\n%u\n%zu\n", sbi->volume.major_ver, + sbi->volume.minor_ver, sbi->cluster_size, + sbi->used.bitmap.nbits); + + return 0; +} + +static int ntfs3_volinfo_open(struct inode *inode, struct file *file) +{ + return single_open(file, ntfs3_volinfo, pde_data(inode)); +} + +/* read /proc/fs/ntfs3/<dev>/label */ +static int ntfs3_label_show(struct seq_file *m, void *o) +{ + struct super_block *sb = m->private; + struct ntfs_sb_info *sbi = sb->s_fs_info; + + seq_printf(m, "%s\n", sbi->volume.label); + + return 0; +} + +/* write /proc/fs/ntfs3/<dev>/label */ +static ssize_t ntfs3_label_write(struct file *file, const char __user *buffer, + size_t count, loff_t *ppos) +{ + int err; + struct super_block *sb = pde_data(file_inode(file)); + struct ntfs_sb_info *sbi = sb->s_fs_info; + ssize_t ret = count; + u8 *label = kmalloc(count, GFP_NOFS); + + if (!label) + return -ENOMEM; + + if (copy_from_user(label, buffer, ret)) { + ret = -EFAULT; + goto out; + } + while (ret > 0 && label[ret - 1] == '\n') + ret -= 1; + + err = ntfs_set_label(sbi, label, ret); + + if (err < 0) { + ntfs_err(sb, "failed (%d) to write label", err); + ret = err; + goto out; + } + + *ppos += count; + ret = count; +out: + kfree(label); + return ret; +} + +static int ntfs3_label_open(struct inode *inode, struct file *file) +{ + return single_open(file, ntfs3_label_show, pde_data(inode)); +} + +static const struct proc_ops ntfs3_volinfo_fops = { + .proc_read = seq_read, + .proc_lseek = seq_lseek, + .proc_release = single_release, + .proc_open = ntfs3_volinfo_open, +}; + +static const struct proc_ops ntfs3_label_fops = { + .proc_read = seq_read, + .proc_lseek = seq_lseek, + .proc_release = single_release, + .proc_open = ntfs3_label_open, + .proc_write = ntfs3_label_write, +}; + +#endif + static struct kmem_cache *ntfs_inode_cachep; static struct inode *ntfs_alloc_inode(struct super_block *sb) @@ -510,6 +613,16 @@ static void ntfs_put_super(struct super_block *sb) { struct ntfs_sb_info *sbi = sb->s_fs_info; +#ifdef CONFIG_PROC_FS + // Remove /proc/fs/ntfs3/.. + if (sbi->procdir) { + remove_proc_entry("label", sbi->procdir); + remove_proc_entry("volinfo", sbi->procdir); + remove_proc_entry(sb->s_id, proc_info_root); + sbi->procdir = NULL; + } +#endif + /* Mark rw ntfs as clear, if possible. */ ntfs_set_state(sbi, NTFS_DIRTY_CLEAR); @@ -711,9 +824,16 @@ static u32 true_sectors_per_clst(const struct NTFS_BOOT *boot) /* * ntfs_init_from_boot - Init internal info from on-disk boot sector. + * + * NTFS mount begins from boot - special formatted 512 bytes. + * There are two boots: the first and the last 512 bytes of volume. + * The content of boot is not changed during ntfs life. + * + * NOTE: ntfs.sys checks only first (primary) boot. + * chkdsk checks both boots. */ static int ntfs_init_from_boot(struct super_block *sb, u32 sector_size, - u64 dev_size) + u64 dev_size, struct NTFS_BOOT **boot2) { struct ntfs_sb_info *sbi = sb->s_fs_info; int err; @@ -724,6 +844,8 @@ static int ntfs_init_from_boot(struct super_block *sb, u32 sector_size, struct MFT_REC *rec; u16 fn, ao; u8 cluster_bits; + u32 boot_off = 0; + const char *hint = "Primary boot"; sbi->volume.blocks = dev_size >> PAGE_SHIFT; @@ -731,11 +853,12 @@ static int ntfs_init_from_boot(struct super_block *sb, u32 sector_size, if (!bh) return -EIO; +check_boot: err = -EINVAL; - boot = (struct NTFS_BOOT *)bh->b_data; + boot = (struct NTFS_BOOT *)Add2Ptr(bh->b_data, boot_off); if (memcmp(boot->system_id, "NTFS ", sizeof("NTFS ") - 1)) { - ntfs_err(sb, "Boot's signature is not NTFS."); + ntfs_err(sb, "%s signature is not NTFS.", hint); goto out; } @@ -748,14 +871,16 @@ static int ntfs_init_from_boot(struct super_block *sb, u32 sector_size, boot->bytes_per_sector[0]; if (boot_sector_size < SECTOR_SIZE || !is_power_of_2(boot_sector_size)) { - ntfs_err(sb, "Invalid bytes per sector %u.", boot_sector_size); + ntfs_err(sb, "%s: invalid bytes per sector %u.", hint, + boot_sector_size); goto out; } /* cluster size: 512, 1K, 2K, 4K, ... 2M */ sct_per_clst = true_sectors_per_clst(boot); if ((int)sct_per_clst < 0 || !is_power_of_2(sct_per_clst)) { - ntfs_err(sb, "Invalid sectors per cluster %u.", sct_per_clst); + ntfs_err(sb, "%s: invalid sectors per cluster %u.", hint, + sct_per_clst); goto out; } @@ -771,20 +896,20 @@ static int ntfs_init_from_boot(struct super_block *sb, u32 sector_size, if (mlcn * sct_per_clst >= sectors || mlcn2 * sct_per_clst >= sectors) { ntfs_err( sb, - "Start of MFT 0x%llx (0x%llx) is out of volume 0x%llx.", - mlcn, mlcn2, sectors); + "%s: start of MFT 0x%llx (0x%llx) is out of volume 0x%llx.", + hint, mlcn, mlcn2, sectors); goto out; } sbi->record_size = record_size = boot->record_size < 0 ? 1 << (-boot->record_size) : - (u32)boot->record_size << cluster_bits; + (u32)boot->record_size << cluster_bits; sbi->record_bits = blksize_bits(record_size); sbi->attr_size_tr = (5 * record_size >> 4); // ~320 bytes /* Check MFT record size. */ if (record_size < SECTOR_SIZE || !is_power_of_2(record_size)) { - ntfs_err(sb, "Invalid bytes per MFT record %u (%d).", + ntfs_err(sb, "%s: invalid bytes per MFT record %u (%d).", hint, record_size, boot->record_size); goto out; } @@ -796,18 +921,18 @@ static int ntfs_init_from_boot(struct super_block *sb, u32 sector_size, } sbi->index_size = boot->index_size < 0 ? - 1u << (-boot->index_size) : - (u32)boot->index_size << cluster_bits; + 1u << (-boot->index_size) : + (u32)boot->index_size << cluster_bits; /* Check index record size. */ if (sbi->index_size < SECTOR_SIZE || !is_power_of_2(sbi->index_size)) { - ntfs_err(sb, "Invalid bytes per index %u(%d).", sbi->index_size, - boot->index_size); + ntfs_err(sb, "%s: invalid bytes per index %u(%d).", hint, + sbi->index_size, boot->index_size); goto out; } if (sbi->index_size > MAXIMUM_BYTES_PER_INDEX) { - ntfs_err(sb, "Unsupported bytes per index %u.", + ntfs_err(sb, "%s: unsupported bytes per index %u.", hint, sbi->index_size); goto out; } @@ -834,7 +959,7 @@ static int ntfs_init_from_boot(struct super_block *sb, u32 sector_size, /* Compare boot's cluster and sector. */ if (sbi->cluster_size < boot_sector_size) { - ntfs_err(sb, "Invalid bytes per cluster (%u).", + ntfs_err(sb, "%s: invalid bytes per cluster (%u).", hint, sbi->cluster_size); goto out; } @@ -850,7 +975,7 @@ static int ntfs_init_from_boot(struct super_block *sb, u32 sector_size, } sbi->max_bytes_per_attr = - record_size - ALIGN(MFTRECORD_FIXUP_OFFSET_1, 8) - + record_size - ALIGN(MFTRECORD_FIXUP_OFFSET, 8) - ALIGN(((record_size >> SECTOR_SHIFT) * sizeof(short)), 8) - ALIGN(sizeof(enum ATTR_TYPE), 8); @@ -892,10 +1017,10 @@ static int ntfs_init_from_boot(struct super_block *sb, u32 sector_size, sbi->new_rec = rec; rec->rhdr.sign = NTFS_FILE_SIGNATURE; - rec->rhdr.fix_off = cpu_to_le16(MFTRECORD_FIXUP_OFFSET_1); + rec->rhdr.fix_off = cpu_to_le16(MFTRECORD_FIXUP_OFFSET); fn = (sbi->record_size >> SECTOR_SHIFT) + 1; rec->rhdr.fix_num = cpu_to_le16(fn); - ao = ALIGN(MFTRECORD_FIXUP_OFFSET_1 + sizeof(short) * fn, 8); + ao = ALIGN(MFTRECORD_FIXUP_OFFSET + sizeof(short) * fn, 8); rec->attr_off = cpu_to_le16(ao); rec->used = cpu_to_le32(ao + ALIGN(sizeof(enum ATTR_TYPE), 8)); rec->total = cpu_to_le32(sbi->record_size); @@ -930,7 +1055,34 @@ static int ntfs_init_from_boot(struct super_block *sb, u32 sector_size, err = 0; + if (bh->b_blocknr && !sb_rdonly(sb)) { + /* + * Alternative boot is ok but primary is not ok. + * Do not update primary boot here 'cause it may be faked boot. + * Let ntfs to be mounted and update boot later. + */ + *boot2 = kmemdup(boot, sizeof(*boot), GFP_NOFS | __GFP_NOWARN); + } + out: + if (err == -EINVAL && !bh->b_blocknr && dev_size > PAGE_SHIFT) { + u32 block_size = min_t(u32, sector_size, PAGE_SIZE); + u64 lbo = dev_size - sizeof(*boot); + + /* + * Try alternative boot (last sector) + */ + brelse(bh); + + sb_set_blocksize(sb, block_size); + bh = ntfs_bread(sb, lbo >> blksize_bits(block_size)); + if (!bh) + return -EINVAL; + + boot_off = lbo & (block_size - 1); + hint = "Alternative boot"; + goto check_boot; + } brelse(bh); return err; @@ -955,6 +1107,8 @@ static int ntfs_fill_super(struct super_block *sb, struct fs_context *fc) struct ATTR_DEF_ENTRY *t; u16 *shared; struct MFT_REF ref; + bool ro = sb_rdonly(sb); + struct NTFS_BOOT *boot2 = NULL; ref.high = 0; @@ -985,7 +1139,7 @@ static int ntfs_fill_super(struct super_block *sb, struct fs_context *fc) /* Parse boot. */ err = ntfs_init_from_boot(sb, bdev_logical_block_size(bdev), - bdev_nr_bytes(bdev)); + bdev_nr_bytes(bdev), &boot2); if (err) goto out; @@ -1035,6 +1189,10 @@ static int ntfs_fill_super(struct super_block *sb, struct fs_context *fc) sbi->volume.minor_ver = info->minor_ver; sbi->volume.flags = info->flags; sbi->volume.ni = ni; + if (info->flags & VOLUME_FLAG_DIRTY) { + sbi->volume.real_dirty = true; + ntfs_info(sb, "It is recommened to use chkdsk."); + } /* Load $MFTMirr to estimate recs_mirr. */ ref.low = cpu_to_le32(MFT_REC_MIRR); @@ -1069,21 +1227,16 @@ static int ntfs_fill_super(struct super_block *sb, struct fs_context *fc) iput(inode); - if (sbi->flags & NTFS_FLAGS_NEED_REPLAY) { - if (!sb_rdonly(sb)) { - ntfs_warn(sb, - "failed to replay log file. Can't mount rw!"); - err = -EINVAL; - goto out; - } - } else if (sbi->volume.flags & VOLUME_FLAG_DIRTY) { - if (!sb_rdonly(sb) && !options->force) { - ntfs_warn( - sb, - "volume is dirty and \"force\" flag is not set!"); - err = -EINVAL; - goto out; - } + if ((sbi->flags & NTFS_FLAGS_NEED_REPLAY) && !ro) { + ntfs_warn(sb, "failed to replay log file. Can't mount rw!"); + err = -EINVAL; + goto out; + } + + if ((sbi->volume.flags & VOLUME_FLAG_DIRTY) && !ro && !options->force) { + ntfs_warn(sb, "volume is dirty and \"force\" flag is not set!"); + err = -EINVAL; + goto out; } /* Load $MFT. */ @@ -1173,7 +1326,7 @@ static int ntfs_fill_super(struct super_block *sb, struct fs_context *fc) bad_len += len; bad_frags += 1; - if (sb_rdonly(sb)) + if (ro) continue; if (wnd_set_used_safe(&sbi->used.bitmap, lcn, len, &tt) || tt) { @@ -1368,6 +1521,44 @@ load_root: goto put_inode_out; } + if (boot2) { + /* + * Alternative boot is ok but primary is not ok. + * Volume is recognized as NTFS. Update primary boot. + */ + struct buffer_head *bh0 = sb_getblk(sb, 0); + if (bh0) { + if (buffer_locked(bh0)) + __wait_on_buffer(bh0); + + lock_buffer(bh0); + memcpy(bh0->b_data, boot2, sizeof(*boot2)); + set_buffer_uptodate(bh0); + mark_buffer_dirty(bh0); + unlock_buffer(bh0); + if (!sync_dirty_buffer(bh0)) + ntfs_warn(sb, "primary boot is updated"); + put_bh(bh0); + } + + kfree(boot2); + } + +#ifdef CONFIG_PROC_FS + /* Create /proc/fs/ntfs3/.. */ + if (proc_info_root) { + struct proc_dir_entry *e = proc_mkdir(sb->s_id, proc_info_root); + static_assert((S_IRUGO | S_IWUSR) == 0644); + if (e) { + proc_create_data("volinfo", S_IRUGO, e, + &ntfs3_volinfo_fops, sb); + proc_create_data("label", S_IRUGO | S_IWUSR, e, + &ntfs3_label_fops, sb); + sbi->procdir = e; + } + } +#endif + return 0; put_inode_out: @@ -1380,6 +1571,7 @@ out: put_mount_options(sbi->options); put_ntfs(sbi); sb->s_fs_info = NULL; + kfree(boot2); return err; } @@ -1473,12 +1665,14 @@ static void ntfs_fs_free(struct fs_context *fc) put_mount_options(opts); } +// clang-format off static const struct fs_context_operations ntfs_context_ops = { .parse_param = ntfs_fs_parse_param, .get_tree = ntfs_fs_get_tree, .reconfigure = ntfs_fs_reconfigure, .free = ntfs_fs_free, }; +// clang-format on /* * ntfs_init_fs_context - Initialize sbi and opts @@ -1559,6 +1753,12 @@ static int __init init_ntfs_fs(void) if (IS_ENABLED(CONFIG_NTFS3_LZX_XPRESS)) pr_info("ntfs3: Read-only LZX/Xpress compression included\n"); + +#ifdef CONFIG_PROC_FS + /* Create "/proc/fs/ntfs3" */ + proc_info_root = proc_mkdir("fs/ntfs3", NULL); +#endif + err = ntfs3_init_bitmap(); if (err) return err; @@ -1590,6 +1790,12 @@ static void __exit exit_ntfs_fs(void) kmem_cache_destroy(ntfs_inode_cachep); unregister_filesystem(&ntfs_fs_type); ntfs3_exit_bitmap(); + +#ifdef CONFIG_PROC_FS + if (proc_info_root) + remove_proc_entry("fs/ntfs3", NULL); +#endif + } MODULE_LICENSE("GPL"); diff --git a/fs/ntfs3/xattr.c b/fs/ntfs3/xattr.c index c3de60a4543f..023f314e8950 100644 --- a/fs/ntfs3/xattr.c +++ b/fs/ntfs3/xattr.c @@ -24,7 +24,7 @@ static inline size_t unpacked_ea_size(const struct EA_FULL *ea) { return ea->size ? le32_to_cpu(ea->size) : - ALIGN(struct_size(ea, name, + ALIGN(struct_size(ea, name, 1 + ea->name_len + le16_to_cpu(ea->elength)), 4); @@ -141,6 +141,7 @@ static int ntfs_read_ea(struct ntfs_inode *ni, struct EA_FULL **ea, memset(Add2Ptr(ea_p, size), 0, add_bytes); + err = -EINVAL; /* Check all attributes for consistency. */ for (off = 0; off < size; off += ea_size) { const struct EA_FULL *ef = Add2Ptr(ea_p, off); @@ -214,6 +215,9 @@ static ssize_t ntfs_list_ea(struct ntfs_inode *ni, char *buffer, ea = Add2Ptr(ea_all, off); ea_size = unpacked_ea_size(ea); + if (!ea->name_len) + break; + if (buffer) { if (ret + ea->name_len + 1 > bytes_per_buffer) { err = -ERANGE; @@ -524,8 +528,8 @@ out: /* * ntfs_get_acl - inode_operations::get_acl */ -struct posix_acl *ntfs_get_acl(struct mnt_idmap *idmap, - struct dentry *dentry, int type) +struct posix_acl *ntfs_get_acl(struct mnt_idmap *idmap, struct dentry *dentry, + int type) { struct inode *inode = d_inode(dentry); struct ntfs_inode *ni = ntfs_i(inode); @@ -592,8 +596,7 @@ static noinline int ntfs_set_acl_ex(struct mnt_idmap *idmap, case ACL_TYPE_ACCESS: /* Do not change i_mode if we are in init_acl */ if (acl && !init_acl) { - err = posix_acl_update_mode(idmap, inode, &mode, - &acl); + err = posix_acl_update_mode(idmap, inode, &mode, &acl); if (err) return err; } @@ -816,10 +819,9 @@ out: * ntfs_setxattr - inode_operations::setxattr */ static noinline int ntfs_setxattr(const struct xattr_handler *handler, - struct mnt_idmap *idmap, - struct dentry *de, struct inode *inode, - const char *name, const void *value, - size_t size, int flags) + struct mnt_idmap *idmap, struct dentry *de, + struct inode *inode, const char *name, + const void *value, size_t size, int flags) { int err = -EINVAL; struct ntfs_inode *ni = ntfs_i(inode); diff --git a/fs/ocfs2/Kconfig b/fs/ocfs2/Kconfig index 304d12186ccd..3123da7cfb30 100644 --- a/fs/ocfs2/Kconfig +++ b/fs/ocfs2/Kconfig @@ -17,9 +17,9 @@ config OCFS2_FS You'll want to install the ocfs2-tools package in order to at least get "mount.ocfs2". - Project web page: https://oss.oracle.com/projects/ocfs2 - Tools web page: https://oss.oracle.com/projects/ocfs2-tools - OCFS2 mailing lists: https://oss.oracle.com/projects/ocfs2/mailman/ + Project web page: https://ocfs2.wiki.kernel.org/ + Tools web page: https://github.com/markfasheh/ocfs2-tools + OCFS2 mailing lists: https://subspace.kernel.org/lists.linux.dev.html For more information on OCFS2, see the file <file:Documentation/filesystems/ocfs2.rst>. diff --git a/fs/overlayfs/super.c b/fs/overlayfs/super.c index 5b069f1a1e44..cc8977498c48 100644 --- a/fs/overlayfs/super.c +++ b/fs/overlayfs/super.c @@ -1460,7 +1460,7 @@ int ovl_fill_super(struct super_block *sb, struct fs_context *fc) ovl_trusted_xattr_handlers; sb->s_fs_info = ofs; sb->s_flags |= SB_POSIXACL; - sb->s_iflags |= SB_I_SKIP_SYNC; + sb->s_iflags |= SB_I_SKIP_SYNC | SB_I_IMA_UNVERIFIABLE_SIGNATURE; err = -ENOMEM; root_dentry = ovl_get_root(sb, ctx->upper.dentry, oe); diff --git a/fs/proc/kcore.c b/fs/proc/kcore.c index 5d0cf59c4926..9cb32e1a78a0 100644 --- a/fs/proc/kcore.c +++ b/fs/proc/kcore.c @@ -199,7 +199,7 @@ kclist_add_private(unsigned long pfn, unsigned long nr_pages, void *arg) ent->addr = (unsigned long)page_to_virt(p); ent->size = nr_pages << PAGE_SHIFT; - if (!virt_addr_valid(ent->addr)) + if (!virt_addr_valid((void *)ent->addr)) goto free_out; /* cut not-mapped area. ....from ppc-32 code. */ diff --git a/fs/smb/client/cached_dir.c b/fs/smb/client/cached_dir.c index bfc964b36c72..fe483f163dbc 100644 --- a/fs/smb/client/cached_dir.c +++ b/fs/smb/client/cached_dir.c @@ -568,6 +568,53 @@ static void free_cached_dir(struct cached_fid *cfid) kfree(cfid); } +static int +cifs_cfids_laundromat_thread(void *p) +{ + struct cached_fids *cfids = p; + struct cached_fid *cfid, *q; + struct list_head entry; + + while (!kthread_should_stop()) { + ssleep(1); + INIT_LIST_HEAD(&entry); + if (kthread_should_stop()) + return 0; + spin_lock(&cfids->cfid_list_lock); + list_for_each_entry_safe(cfid, q, &cfids->entries, entry) { + if (time_after(jiffies, cfid->time + HZ * 30)) { + list_del(&cfid->entry); + list_add(&cfid->entry, &entry); + cfids->num_entries--; + } + } + spin_unlock(&cfids->cfid_list_lock); + + list_for_each_entry_safe(cfid, q, &entry, entry) { + cfid->on_list = false; + list_del(&cfid->entry); + /* + * Cancel, and wait for the work to finish in + * case we are racing with it. + */ + cancel_work_sync(&cfid->lease_break); + if (cfid->has_lease) { + /* + * We lease has not yet been cancelled from + * the server so we need to drop the reference. + */ + spin_lock(&cfids->cfid_list_lock); + cfid->has_lease = false; + spin_unlock(&cfids->cfid_list_lock); + kref_put(&cfid->refcount, smb2_close_cached_fid); + } + } + } + + return 0; +} + + struct cached_fids *init_cached_dirs(void) { struct cached_fids *cfids; @@ -577,6 +624,20 @@ struct cached_fids *init_cached_dirs(void) return NULL; spin_lock_init(&cfids->cfid_list_lock); INIT_LIST_HEAD(&cfids->entries); + + /* + * since we're in a cifs function already, we know that + * this will succeed. No need for try_module_get(). + */ + __module_get(THIS_MODULE); + cfids->laundromat = kthread_run(cifs_cfids_laundromat_thread, + cfids, "cifsd-cfid-laundromat"); + if (IS_ERR(cfids->laundromat)) { + cifs_dbg(VFS, "Failed to start cfids laundromat thread.\n"); + kfree(cfids); + module_put(THIS_MODULE); + return NULL; + } return cfids; } @@ -589,6 +650,12 @@ void free_cached_dirs(struct cached_fids *cfids) struct cached_fid *cfid, *q; LIST_HEAD(entry); + if (cfids->laundromat) { + kthread_stop(cfids->laundromat); + cfids->laundromat = NULL; + module_put(THIS_MODULE); + } + spin_lock(&cfids->cfid_list_lock); list_for_each_entry_safe(cfid, q, &cfids->entries, entry) { cfid->on_list = false; diff --git a/fs/smb/client/cached_dir.h b/fs/smb/client/cached_dir.h index 2f4e764c9ca9..facc9b154d00 100644 --- a/fs/smb/client/cached_dir.h +++ b/fs/smb/client/cached_dir.h @@ -57,6 +57,7 @@ struct cached_fids { spinlock_t cfid_list_lock; int num_entries; struct list_head entries; + struct task_struct *laundromat; }; extern struct cached_fids *init_cached_dirs(void); diff --git a/fs/smb/client/cifsfs.h b/fs/smb/client/cifsfs.h index d7274eefc666..15c8cc4b6680 100644 --- a/fs/smb/client/cifsfs.h +++ b/fs/smb/client/cifsfs.h @@ -159,6 +159,6 @@ extern const struct export_operations cifs_export_ops; #endif /* CONFIG_CIFS_NFSD_EXPORT */ /* when changing internal version - update following two lines at same time */ -#define SMB3_PRODUCT_BUILD 43 -#define CIFS_VERSION "2.43" +#define SMB3_PRODUCT_BUILD 44 +#define CIFS_VERSION "2.44" #endif /* _CIFSFS_H */ diff --git a/fs/smb/client/cifsglob.h b/fs/smb/client/cifsglob.h index ca2da713c5fe..e5eec6d38d02 100644 --- a/fs/smb/client/cifsglob.h +++ b/fs/smb/client/cifsglob.h @@ -532,7 +532,7 @@ struct smb_version_operations { /* Check for STATUS_IO_TIMEOUT */ bool (*is_status_io_timeout)(char *buf); /* Check for STATUS_NETWORK_NAME_DELETED */ - void (*is_network_name_deleted)(char *buf, struct TCP_Server_Info *srv); + bool (*is_network_name_deleted)(char *buf, struct TCP_Server_Info *srv); }; struct smb_version_values { @@ -2179,7 +2179,7 @@ static inline void cifs_sg_set_buf(struct sg_table *sgtable, } while (buflen); } else { sg_set_page(&sgtable->sgl[sgtable->nents++], - virt_to_page(addr), buflen, off); + virt_to_page((void *)addr), buflen, off); } } diff --git a/fs/smb/client/cifssmb.c b/fs/smb/client/cifssmb.c index 19f7385abeec..9dee267f1893 100644 --- a/fs/smb/client/cifssmb.c +++ b/fs/smb/client/cifssmb.c @@ -3184,7 +3184,7 @@ setAclRetry: param_offset = offsetof(struct smb_com_transaction2_spi_req, InformationLevel) - 4; offset = param_offset + params; - parm_data = ((char *) &pSMB->hdr.Protocol) + offset; + parm_data = ((char *)pSMB) + sizeof(pSMB->hdr.smb_buf_length) + offset; pSMB->ParameterOffset = cpu_to_le16(param_offset); /* convert to on the wire format for POSIX ACL */ diff --git a/fs/smb/client/connect.c b/fs/smb/client/connect.c index dab7bc876507..9280e253bf09 100644 --- a/fs/smb/client/connect.c +++ b/fs/smb/client/connect.c @@ -60,7 +60,7 @@ extern bool disable_legacy_dialects; #define TLINK_IDLE_EXPIRE (600 * HZ) /* Drop the connection to not overload the server */ -#define NUM_STATUS_IO_TIMEOUT 5 +#define MAX_STATUS_IO_TIMEOUT 5 static int ip_connect(struct TCP_Server_Info *server); static int generic_ip_connect(struct TCP_Server_Info *server); @@ -1117,6 +1117,7 @@ cifs_demultiplex_thread(void *p) struct mid_q_entry *mids[MAX_COMPOUND]; char *bufs[MAX_COMPOUND]; unsigned int noreclaim_flag, num_io_timeout = 0; + bool pending_reconnect = false; noreclaim_flag = memalloc_noreclaim_save(); cifs_dbg(FYI, "Demultiplex PID: %d\n", task_pid_nr(current)); @@ -1156,6 +1157,8 @@ cifs_demultiplex_thread(void *p) cifs_dbg(FYI, "RFC1002 header 0x%x\n", pdu_length); if (!is_smb_response(server, buf[0])) continue; + + pending_reconnect = false; next_pdu: server->pdu_size = pdu_length; @@ -1213,10 +1216,13 @@ next_pdu: if (server->ops->is_status_io_timeout && server->ops->is_status_io_timeout(buf)) { num_io_timeout++; - if (num_io_timeout > NUM_STATUS_IO_TIMEOUT) { - cifs_reconnect(server, false); + if (num_io_timeout > MAX_STATUS_IO_TIMEOUT) { + cifs_server_dbg(VFS, + "Number of request timeouts exceeded %d. Reconnecting", + MAX_STATUS_IO_TIMEOUT); + + pending_reconnect = true; num_io_timeout = 0; - continue; } } @@ -1226,9 +1232,14 @@ next_pdu: if (mids[i] != NULL) { mids[i]->resp_buf_size = server->pdu_size; - if (bufs[i] && server->ops->is_network_name_deleted) - server->ops->is_network_name_deleted(bufs[i], - server); + if (bufs[i] != NULL) { + if (server->ops->is_network_name_deleted && + server->ops->is_network_name_deleted(bufs[i], + server)) { + cifs_server_dbg(FYI, + "Share deleted. Reconnect needed"); + } + } if (!mids[i]->multiRsp || mids[i]->multiEnd) mids[i]->callback(mids[i]); @@ -1263,6 +1274,11 @@ next_pdu: buf = server->smallbuf; goto next_pdu; } + + /* do this reconnect at the very end after processing all MIDs */ + if (pending_reconnect) + cifs_reconnect(server, true); + } /* end while !EXITING */ /* buffer usually freed in free_mid - need to free it here on exit */ @@ -1967,15 +1983,16 @@ void __cifs_put_smb_ses(struct cifs_ses *ses) spin_unlock(&cifs_tcp_ses_lock); return; } + spin_lock(&ses->ses_lock); + if (ses->ses_status == SES_GOOD) + ses->ses_status = SES_EXITING; + spin_unlock(&ses->ses_lock); spin_unlock(&cifs_tcp_ses_lock); /* ses_count can never go negative */ WARN_ON(ses->ses_count < 0); spin_lock(&ses->ses_lock); - if (ses->ses_status == SES_GOOD) - ses->ses_status = SES_EXITING; - if (ses->ses_status == SES_EXITING && server->ops->logoff) { spin_unlock(&ses->ses_lock); cifs_free_ipc(ses); diff --git a/fs/smb/client/dfs.c b/fs/smb/client/dfs.c index 26d14dd0482e..df3fd3b720da 100644 --- a/fs/smb/client/dfs.c +++ b/fs/smb/client/dfs.c @@ -66,6 +66,12 @@ static int get_session(struct cifs_mount_ctx *mnt_ctx, const char *full_path) return rc; } +/* + * Track individual DFS referral servers used by new DFS mount. + * + * On success, their lifetime will be shared by final tcon (dfs_ses_list). + * Otherwise, they will be put by dfs_put_root_smb_sessions() in cifs_mount(). + */ static int add_root_smb_session(struct cifs_mount_ctx *mnt_ctx) { struct smb3_fs_context *ctx = mnt_ctx->fs_ctx; @@ -80,11 +86,12 @@ static int add_root_smb_session(struct cifs_mount_ctx *mnt_ctx) INIT_LIST_HEAD(&root_ses->list); spin_lock(&cifs_tcp_ses_lock); - ses->ses_count++; + cifs_smb_ses_inc_refcount(ses); spin_unlock(&cifs_tcp_ses_lock); root_ses->ses = ses; list_add_tail(&root_ses->list, &mnt_ctx->dfs_ses_list); } + /* Select new DFS referral server so that new referrals go through it */ ctx->dfs_root_ses = ses; return 0; } @@ -143,7 +150,6 @@ static int __dfs_mount_share(struct cifs_mount_ctx *mnt_ctx) struct smb3_fs_context *ctx = mnt_ctx->fs_ctx; char *ref_path = NULL, *full_path = NULL; struct dfs_cache_tgt_iterator *tit; - struct TCP_Server_Info *server; struct cifs_tcon *tcon; char *origin_fullpath = NULL; char sep = CIFS_DIR_SEP(cifs_sb); @@ -214,7 +220,6 @@ static int __dfs_mount_share(struct cifs_mount_ctx *mnt_ctx) } while (rc == -EREMOTE); if (!rc) { - server = mnt_ctx->server; tcon = mnt_ctx->tcon; spin_lock(&tcon->tc_lock); @@ -244,7 +249,6 @@ out: int dfs_mount_share(struct cifs_mount_ctx *mnt_ctx, bool *isdfs) { struct smb3_fs_context *ctx = mnt_ctx->fs_ctx; - struct cifs_ses *ses; bool nodfs = ctx->nodfs; int rc; @@ -278,20 +282,8 @@ int dfs_mount_share(struct cifs_mount_ctx *mnt_ctx, bool *isdfs) } *isdfs = true; - /* - * Prevent DFS root session of being put in the first call to - * cifs_mount_put_conns(). If another DFS root server was not found - * while chasing the referrals (@ctx->dfs_root_ses == @ses), then we - * can safely put extra refcount of @ses. - */ - ses = mnt_ctx->ses; - mnt_ctx->ses = NULL; - mnt_ctx->server = NULL; - rc = __dfs_mount_share(mnt_ctx); - if (ses == ctx->dfs_root_ses) - cifs_put_smb_ses(ses); - - return rc; + add_root_smb_session(mnt_ctx); + return __dfs_mount_share(mnt_ctx); } /* Update dfs referral path of superblock */ diff --git a/fs/smb/client/file.c b/fs/smb/client/file.c index 879bc8e6555c..fc5acc95cd13 100644 --- a/fs/smb/client/file.c +++ b/fs/smb/client/file.c @@ -1080,8 +1080,8 @@ int cifs_close(struct inode *inode, struct file *file) cfile = file->private_data; file->private_data = NULL; dclose = kmalloc(sizeof(struct cifs_deferred_close), GFP_KERNEL); - if ((cinode->oplock == CIFS_CACHE_RHW_FLG) && - cinode->lease_granted && + if ((cifs_sb->ctx->closetimeo && cinode->oplock == CIFS_CACHE_RHW_FLG) + && cinode->lease_granted && !test_bit(CIFS_INO_CLOSE_ON_LOCK, &cinode->flags) && dclose) { if (test_and_clear_bit(CIFS_INO_MODIFIED_ATTR, &cinode->flags)) { diff --git a/fs/smb/client/ioctl.c b/fs/smb/client/ioctl.c index fff092bbc7a3..e1904b86ed96 100644 --- a/fs/smb/client/ioctl.c +++ b/fs/smb/client/ioctl.c @@ -433,16 +433,21 @@ long cifs_ioctl(struct file *filep, unsigned int command, unsigned long arg) * Dump encryption keys. This is an old ioctl that only * handles AES-128-{CCM,GCM}. */ - if (pSMBFile == NULL) - break; if (!capable(CAP_SYS_ADMIN)) { rc = -EACCES; break; } - tcon = tlink_tcon(pSMBFile->tlink); + cifs_sb = CIFS_SB(inode->i_sb); + tlink = cifs_sb_tlink(cifs_sb); + if (IS_ERR(tlink)) { + rc = PTR_ERR(tlink); + break; + } + tcon = tlink_tcon(tlink); if (!smb3_encryption_required(tcon)) { rc = -EOPNOTSUPP; + cifs_put_tlink(tlink); break; } pkey_inf.cipher_type = @@ -459,6 +464,7 @@ long cifs_ioctl(struct file *filep, unsigned int command, unsigned long arg) rc = -EFAULT; else rc = 0; + cifs_put_tlink(tlink); break; case CIFS_DUMP_FULL_KEY: /* @@ -470,8 +476,11 @@ long cifs_ioctl(struct file *filep, unsigned int command, unsigned long arg) rc = -EACCES; break; } - tcon = tlink_tcon(pSMBFile->tlink); + cifs_sb = CIFS_SB(inode->i_sb); + tlink = cifs_sb_tlink(cifs_sb); + tcon = tlink_tcon(tlink); rc = cifs_dump_full_key(tcon, (void __user *)arg); + cifs_put_tlink(tlink); break; case CIFS_IOC_NOTIFY: if (!S_ISDIR(inode->i_mode)) { diff --git a/fs/smb/client/smb2ops.c b/fs/smb/client/smb2ops.c index 87abce010974..0f62bc373ad0 100644 --- a/fs/smb/client/smb2ops.c +++ b/fs/smb/client/smb2ops.c @@ -2395,7 +2395,7 @@ smb2_is_status_io_timeout(char *buf) return false; } -static void +static bool smb2_is_network_name_deleted(char *buf, struct TCP_Server_Info *server) { struct smb2_hdr *shdr = (struct smb2_hdr *)buf; @@ -2404,7 +2404,7 @@ smb2_is_network_name_deleted(char *buf, struct TCP_Server_Info *server) struct cifs_tcon *tcon; if (shdr->Status != STATUS_NETWORK_NAME_DELETED) - return; + return false; /* If server is a channel, select the primary channel */ pserver = CIFS_SERVER_IS_CHAN(server) ? server->primary_server : server; @@ -2419,11 +2419,13 @@ smb2_is_network_name_deleted(char *buf, struct TCP_Server_Info *server) spin_unlock(&cifs_tcp_ses_lock); pr_warn_once("Server share %s deleted.\n", tcon->tree_name); - return; + return true; } } } spin_unlock(&cifs_tcp_ses_lock); + + return false; } static int diff --git a/fs/smb/client/smb2transport.c b/fs/smb/client/smb2transport.c index c6db898dab7c..7676091b3e77 100644 --- a/fs/smb/client/smb2transport.c +++ b/fs/smb/client/smb2transport.c @@ -160,7 +160,7 @@ smb2_find_smb_ses_unlocked(struct TCP_Server_Info *server, __u64 ses_id) spin_unlock(&ses->ses_lock); continue; } - ++ses->ses_count; + cifs_smb_ses_inc_refcount(ses); spin_unlock(&ses->ses_lock); return ses; } diff --git a/fs/smb/client/smbdirect.c b/fs/smb/client/smbdirect.c index 223e17c16b60..2a2aec8c6112 100644 --- a/fs/smb/client/smbdirect.c +++ b/fs/smb/client/smbdirect.c @@ -2500,7 +2500,7 @@ static ssize_t smb_extract_kvec_to_rdma(struct iov_iter *iter, if (is_vmalloc_or_module_addr((void *)kaddr)) page = vmalloc_to_page((void *)kaddr); else - page = virt_to_page(kaddr); + page = virt_to_page((void *)kaddr); if (!smb_set_sge(rdma, page, off, seg)) return -EIO; diff --git a/fs/smb/server/ksmbd_netlink.h b/fs/smb/server/ksmbd_netlink.h index fb8b2d566efb..b7521e41402e 100644 --- a/fs/smb/server/ksmbd_netlink.h +++ b/fs/smb/server/ksmbd_netlink.h @@ -352,7 +352,8 @@ enum KSMBD_TREE_CONN_STATUS { #define KSMBD_SHARE_FLAG_STREAMS BIT(11) #define KSMBD_SHARE_FLAG_FOLLOW_SYMLINKS BIT(12) #define KSMBD_SHARE_FLAG_ACL_XATTR BIT(13) -#define KSMBD_SHARE_FLAG_UPDATE BIT(14) +#define KSMBD_SHARE_FLAG_UPDATE BIT(14) +#define KSMBD_SHARE_FLAG_CROSSMNT BIT(15) /* * Tree connect request flags. diff --git a/fs/smb/server/server.c b/fs/smb/server/server.c index ced7a9e916f0..9df121bdf349 100644 --- a/fs/smb/server/server.c +++ b/fs/smb/server/server.c @@ -286,6 +286,7 @@ static void handle_ksmbd_work(struct work_struct *wk) static int queue_ksmbd_work(struct ksmbd_conn *conn) { struct ksmbd_work *work; + int err; work = ksmbd_alloc_work_struct(); if (!work) { @@ -297,7 +298,11 @@ static int queue_ksmbd_work(struct ksmbd_conn *conn) work->request_buf = conn->request_buf; conn->request_buf = NULL; - ksmbd_init_smb_server(work); + err = ksmbd_init_smb_server(work); + if (err) { + ksmbd_free_work_struct(work); + return 0; + } ksmbd_conn_enqueue_request(work); atomic_inc(&conn->r_count); diff --git a/fs/smb/server/smb2pdu.c b/fs/smb/server/smb2pdu.c index cf8822103f50..9849d7489345 100644 --- a/fs/smb/server/smb2pdu.c +++ b/fs/smb/server/smb2pdu.c @@ -87,9 +87,9 @@ struct channel *lookup_chann_list(struct ksmbd_session *sess, struct ksmbd_conn */ int smb2_get_ksmbd_tcon(struct ksmbd_work *work) { - struct smb2_hdr *req_hdr = smb2_get_msg(work->request_buf); + struct smb2_hdr *req_hdr = ksmbd_req_buf_next(work); unsigned int cmd = le16_to_cpu(req_hdr->Command); - int tree_id; + unsigned int tree_id; if (cmd == SMB2_TREE_CONNECT_HE || cmd == SMB2_CANCEL_HE || @@ -114,7 +114,7 @@ int smb2_get_ksmbd_tcon(struct ksmbd_work *work) pr_err("The first operation in the compound does not have tcon\n"); return -EINVAL; } - if (work->tcon->id != tree_id) { + if (tree_id != UINT_MAX && work->tcon->id != tree_id) { pr_err("tree id(%u) is different with id(%u) in first operation\n", tree_id, work->tcon->id); return -EINVAL; @@ -559,9 +559,9 @@ int smb2_allocate_rsp_buf(struct ksmbd_work *work) */ int smb2_check_user_session(struct ksmbd_work *work) { - struct smb2_hdr *req_hdr = smb2_get_msg(work->request_buf); + struct smb2_hdr *req_hdr = ksmbd_req_buf_next(work); struct ksmbd_conn *conn = work->conn; - unsigned int cmd = conn->ops->get_cmd_val(work); + unsigned int cmd = le16_to_cpu(req_hdr->Command); unsigned long long sess_id; /* @@ -587,7 +587,7 @@ int smb2_check_user_session(struct ksmbd_work *work) pr_err("The first operation in the compound does not have sess\n"); return -EINVAL; } - if (work->sess->id != sess_id) { + if (sess_id != ULLONG_MAX && work->sess->id != sess_id) { pr_err("session id(%llu) is different with the first operation(%lld)\n", sess_id, work->sess->id); return -EINVAL; @@ -2467,8 +2467,9 @@ static void smb2_update_xattrs(struct ksmbd_tree_connect *tcon, } } -static int smb2_creat(struct ksmbd_work *work, struct path *path, char *name, - int open_flags, umode_t posix_mode, bool is_dir) +static int smb2_creat(struct ksmbd_work *work, struct path *parent_path, + struct path *path, char *name, int open_flags, + umode_t posix_mode, bool is_dir) { struct ksmbd_tree_connect *tcon = work->tcon; struct ksmbd_share_config *share = tcon->share_conf; @@ -2495,7 +2496,7 @@ static int smb2_creat(struct ksmbd_work *work, struct path *path, char *name, return rc; } - rc = ksmbd_vfs_kern_path_locked(work, name, 0, path, 0); + rc = ksmbd_vfs_kern_path_locked(work, name, 0, parent_path, path, 0); if (rc) { pr_err("cannot get linux path (%s), err = %d\n", name, rc); @@ -2565,7 +2566,7 @@ int smb2_open(struct ksmbd_work *work) struct ksmbd_tree_connect *tcon = work->tcon; struct smb2_create_req *req; struct smb2_create_rsp *rsp; - struct path path; + struct path path, parent_path; struct ksmbd_share_config *share = tcon->share_conf; struct ksmbd_file *fp = NULL; struct file *filp = NULL; @@ -2786,7 +2787,8 @@ int smb2_open(struct ksmbd_work *work) goto err_out1; } - rc = ksmbd_vfs_kern_path_locked(work, name, LOOKUP_NO_SYMLINKS, &path, 1); + rc = ksmbd_vfs_kern_path_locked(work, name, LOOKUP_NO_SYMLINKS, + &parent_path, &path, 1); if (!rc) { file_present = true; @@ -2906,7 +2908,8 @@ int smb2_open(struct ksmbd_work *work) /*create file if not present */ if (!file_present) { - rc = smb2_creat(work, &path, name, open_flags, posix_mode, + rc = smb2_creat(work, &parent_path, &path, name, open_flags, + posix_mode, req->CreateOptions & FILE_DIRECTORY_FILE_LE); if (rc) { if (rc == -ENOENT) { @@ -3321,8 +3324,9 @@ int smb2_open(struct ksmbd_work *work) err_out: if (file_present || created) { - inode_unlock(d_inode(path.dentry->d_parent)); - dput(path.dentry); + inode_unlock(d_inode(parent_path.dentry)); + path_put(&path); + path_put(&parent_path); } ksmbd_revert_fsids(work); err_out1: @@ -5545,7 +5549,7 @@ static int smb2_create_link(struct ksmbd_work *work, struct nls_table *local_nls) { char *link_name = NULL, *target_name = NULL, *pathname = NULL; - struct path path; + struct path path, parent_path; bool file_present = false; int rc; @@ -5575,7 +5579,7 @@ static int smb2_create_link(struct ksmbd_work *work, ksmbd_debug(SMB, "target name is %s\n", target_name); rc = ksmbd_vfs_kern_path_locked(work, link_name, LOOKUP_NO_SYMLINKS, - &path, 0); + &parent_path, &path, 0); if (rc) { if (rc != -ENOENT) goto out; @@ -5605,8 +5609,9 @@ static int smb2_create_link(struct ksmbd_work *work, rc = -EINVAL; out: if (file_present) { - inode_unlock(d_inode(path.dentry->d_parent)); + inode_unlock(d_inode(parent_path.dentry)); path_put(&path); + path_put(&parent_path); } if (!IS_ERR(link_name)) kfree(link_name); @@ -6209,6 +6214,11 @@ int smb2_read(struct ksmbd_work *work) unsigned int max_read_size = conn->vals->max_read_size; WORK_BUFFERS(work, req, rsp); + if (work->next_smb2_rcv_hdr_off) { + work->send_no_response = 1; + err = -EOPNOTSUPP; + goto out; + } if (test_share_config_flag(work->tcon->share_conf, KSMBD_SHARE_FLAG_PIPE)) { @@ -8609,7 +8619,8 @@ int smb3_decrypt_req(struct ksmbd_work *work) struct smb2_transform_hdr *tr_hdr = smb2_get_msg(buf); int rc = 0; - if (buf_data_size < sizeof(struct smb2_hdr)) { + if (pdu_length < sizeof(struct smb2_transform_hdr) || + buf_data_size < sizeof(struct smb2_hdr)) { pr_err("Transform message is too small (%u)\n", pdu_length); return -ECONNABORTED; diff --git a/fs/smb/server/smb_common.c b/fs/smb/server/smb_common.c index ef20f63e55e6..c2b75d898852 100644 --- a/fs/smb/server/smb_common.c +++ b/fs/smb/server/smb_common.c @@ -388,26 +388,29 @@ static struct smb_version_cmds smb1_server_cmds[1] = { [SMB_COM_NEGOTIATE_EX] = { .proc = smb1_negotiate, }, }; -static void init_smb1_server(struct ksmbd_conn *conn) +static int init_smb1_server(struct ksmbd_conn *conn) { conn->ops = &smb1_server_ops; conn->cmds = smb1_server_cmds; conn->max_cmds = ARRAY_SIZE(smb1_server_cmds); + return 0; } -void ksmbd_init_smb_server(struct ksmbd_work *work) +int ksmbd_init_smb_server(struct ksmbd_work *work) { struct ksmbd_conn *conn = work->conn; __le32 proto; - if (conn->need_neg == false) - return; - proto = *(__le32 *)((struct smb_hdr *)work->request_buf)->Protocol; + if (conn->need_neg == false) { + if (proto == SMB1_PROTO_NUMBER) + return -EINVAL; + return 0; + } + if (proto == SMB1_PROTO_NUMBER) - init_smb1_server(conn); - else - init_smb3_11_server(conn); + return init_smb1_server(conn); + return init_smb3_11_server(conn); } int ksmbd_populate_dot_dotdot_entries(struct ksmbd_work *work, int info_level, diff --git a/fs/smb/server/smb_common.h b/fs/smb/server/smb_common.h index aeca0f46068f..f1092519c0c2 100644 --- a/fs/smb/server/smb_common.h +++ b/fs/smb/server/smb_common.h @@ -427,7 +427,7 @@ bool ksmbd_smb_request(struct ksmbd_conn *conn); int ksmbd_lookup_dialect_by_id(__le16 *cli_dialects, __le16 dialects_count); -void ksmbd_init_smb_server(struct ksmbd_work *work); +int ksmbd_init_smb_server(struct ksmbd_work *work); struct ksmbd_kstat; int ksmbd_populate_dot_dotdot_entries(struct ksmbd_work *work, diff --git a/fs/smb/server/vfs.c b/fs/smb/server/vfs.c index e35914457350..3d5d652153a5 100644 --- a/fs/smb/server/vfs.c +++ b/fs/smb/server/vfs.c @@ -63,13 +63,13 @@ int ksmbd_vfs_lock_parent(struct dentry *parent, struct dentry *child) static int ksmbd_vfs_path_lookup_locked(struct ksmbd_share_config *share_conf, char *pathname, unsigned int flags, + struct path *parent_path, struct path *path) { struct qstr last; struct filename *filename; struct path *root_share_path = &share_conf->vfs_path; int err, type; - struct path parent_path; struct dentry *d; if (pathname[0] == '\0') { @@ -84,7 +84,7 @@ static int ksmbd_vfs_path_lookup_locked(struct ksmbd_share_config *share_conf, return PTR_ERR(filename); err = vfs_path_parent_lookup(filename, flags, - &parent_path, &last, &type, + parent_path, &last, &type, root_share_path); if (err) { putname(filename); @@ -92,13 +92,13 @@ static int ksmbd_vfs_path_lookup_locked(struct ksmbd_share_config *share_conf, } if (unlikely(type != LAST_NORM)) { - path_put(&parent_path); + path_put(parent_path); putname(filename); return -ENOENT; } - inode_lock_nested(parent_path.dentry->d_inode, I_MUTEX_PARENT); - d = lookup_one_qstr_excl(&last, parent_path.dentry, 0); + inode_lock_nested(parent_path->dentry->d_inode, I_MUTEX_PARENT); + d = lookup_one_qstr_excl(&last, parent_path->dentry, 0); if (IS_ERR(d)) goto err_out; @@ -108,15 +108,22 @@ static int ksmbd_vfs_path_lookup_locked(struct ksmbd_share_config *share_conf, } path->dentry = d; - path->mnt = share_conf->vfs_path.mnt; - path_put(&parent_path); - putname(filename); + path->mnt = mntget(parent_path->mnt); + if (test_share_config_flag(share_conf, KSMBD_SHARE_FLAG_CROSSMNT)) { + err = follow_down(path, 0); + if (err < 0) { + path_put(path); + goto err_out; + } + } + + putname(filename); return 0; err_out: - inode_unlock(parent_path.dentry->d_inode); - path_put(&parent_path); + inode_unlock(d_inode(parent_path->dentry)); + path_put(parent_path); putname(filename); return -ENOENT; } @@ -412,7 +419,8 @@ static int ksmbd_vfs_stream_write(struct ksmbd_file *fp, char *buf, loff_t *pos, { char *stream_buf = NULL, *wbuf; struct mnt_idmap *idmap = file_mnt_idmap(fp->filp); - size_t size, v_len; + size_t size; + ssize_t v_len; int err = 0; ksmbd_debug(VFS, "write stream data pos : %llu, count : %zd\n", @@ -429,9 +437,9 @@ static int ksmbd_vfs_stream_write(struct ksmbd_file *fp, char *buf, loff_t *pos, fp->stream.name, fp->stream.size, &stream_buf); - if ((int)v_len < 0) { + if (v_len < 0) { pr_err("not found stream in xattr : %zd\n", v_len); - err = (int)v_len; + err = v_len; goto out; } @@ -1194,14 +1202,14 @@ static int ksmbd_vfs_lookup_in_dir(const struct path *dir, char *name, * Return: 0 on success, otherwise error */ int ksmbd_vfs_kern_path_locked(struct ksmbd_work *work, char *name, - unsigned int flags, struct path *path, - bool caseless) + unsigned int flags, struct path *parent_path, + struct path *path, bool caseless) { struct ksmbd_share_config *share_conf = work->tcon->share_conf; int err; - struct path parent_path; - err = ksmbd_vfs_path_lookup_locked(share_conf, name, flags, path); + err = ksmbd_vfs_path_lookup_locked(share_conf, name, flags, parent_path, + path); if (!err) return 0; @@ -1216,10 +1224,10 @@ int ksmbd_vfs_kern_path_locked(struct ksmbd_work *work, char *name, path_len = strlen(filepath); remain_len = path_len; - parent_path = share_conf->vfs_path; - path_get(&parent_path); + *parent_path = share_conf->vfs_path; + path_get(parent_path); - while (d_can_lookup(parent_path.dentry)) { + while (d_can_lookup(parent_path->dentry)) { char *filename = filepath + path_len - remain_len; char *next = strchrnul(filename, '/'); size_t filename_len = next - filename; @@ -1228,7 +1236,7 @@ int ksmbd_vfs_kern_path_locked(struct ksmbd_work *work, char *name, if (filename_len == 0) break; - err = ksmbd_vfs_lookup_in_dir(&parent_path, filename, + err = ksmbd_vfs_lookup_in_dir(parent_path, filename, filename_len, work->conn->um); if (err) @@ -1245,8 +1253,8 @@ int ksmbd_vfs_kern_path_locked(struct ksmbd_work *work, char *name, goto out2; else if (is_last) goto out1; - path_put(&parent_path); - parent_path = *path; + path_put(parent_path); + *parent_path = *path; next[0] = '/'; remain_len -= filename_len + 1; @@ -1254,16 +1262,17 @@ int ksmbd_vfs_kern_path_locked(struct ksmbd_work *work, char *name, err = -EINVAL; out2: - path_put(&parent_path); + path_put(parent_path); out1: kfree(filepath); } if (!err) { - err = ksmbd_vfs_lock_parent(parent_path.dentry, path->dentry); - if (err) - dput(path->dentry); - path_put(&parent_path); + err = ksmbd_vfs_lock_parent(parent_path->dentry, path->dentry); + if (err) { + path_put(path); + path_put(parent_path); + } } return err; } diff --git a/fs/smb/server/vfs.h b/fs/smb/server/vfs.h index 80039312c255..72f9fb4b48d1 100644 --- a/fs/smb/server/vfs.h +++ b/fs/smb/server/vfs.h @@ -115,8 +115,8 @@ int ksmbd_vfs_xattr_stream_name(char *stream_name, char **xattr_stream_name, int ksmbd_vfs_remove_xattr(struct mnt_idmap *idmap, const struct path *path, char *attr_name); int ksmbd_vfs_kern_path_locked(struct ksmbd_work *work, char *name, - unsigned int flags, struct path *path, - bool caseless); + unsigned int flags, struct path *parent_path, + struct path *path, bool caseless); struct dentry *ksmbd_vfs_kern_path_create(struct ksmbd_work *work, const char *name, unsigned int flags, diff --git a/fs/splice.c b/fs/splice.c index 004eb1c4ce31..3e2a31e1ce6a 100644 --- a/fs/splice.c +++ b/fs/splice.c @@ -876,6 +876,8 @@ ssize_t splice_to_socket(struct pipe_inode_info *pipe, struct file *out, msg.msg_flags |= MSG_MORE; if (remain && pipe_occupancy(pipe->head, tail) > 0) msg.msg_flags |= MSG_MORE; + if (out->f_flags & O_NONBLOCK) + msg.msg_flags |= MSG_DONTWAIT; iov_iter_bvec(&msg.msg_iter, ITER_SOURCE, bvec, bc, len - remain); diff --git a/fs/squashfs/block.c b/fs/squashfs/block.c index 6aa9c2e1e8eb..581ce9519339 100644 --- a/fs/squashfs/block.c +++ b/fs/squashfs/block.c @@ -166,6 +166,26 @@ static int squashfs_bio_read_cached(struct bio *fullbio, return 0; } +static struct page *squashfs_get_cache_page(struct address_space *mapping, + pgoff_t index) +{ + struct page *page; + + if (!mapping) + return NULL; + + page = find_get_page(mapping, index); + if (!page) + return NULL; + + if (!PageUptodate(page)) { + put_page(page); + return NULL; + } + + return page; +} + static int squashfs_bio_read(struct super_block *sb, u64 index, int length, struct bio **biop, int *block_offset) { @@ -190,11 +210,10 @@ static int squashfs_bio_read(struct super_block *sb, u64 index, int length, for (i = 0; i < page_count; ++i) { unsigned int len = min_t(unsigned int, PAGE_SIZE - offset, total_len); - struct page *page = NULL; + pgoff_t index = (read_start >> PAGE_SHIFT) + i; + struct page *page; - if (cache_mapping) - page = find_get_page(cache_mapping, - (read_start >> PAGE_SHIFT) + i); + page = squashfs_get_cache_page(cache_mapping, index); if (!page) page = alloc_page(GFP_NOIO); diff --git a/fs/xfs/libxfs/xfs_ag.c b/fs/xfs/libxfs/xfs_ag.c index ee84835ebc66..e9cc481b4ddf 100644 --- a/fs/xfs/libxfs/xfs_ag.c +++ b/fs/xfs/libxfs/xfs_ag.c @@ -985,7 +985,7 @@ xfs_ag_shrink_space( goto resv_err; err2 = __xfs_free_extent_later(*tpp, args.fsbno, delta, NULL, - true); + XFS_AG_RESV_NONE, true); if (err2) goto resv_err; diff --git a/fs/xfs/libxfs/xfs_alloc.c b/fs/xfs/libxfs/xfs_alloc.c index c20fe99405d8..3069194527dd 100644 --- a/fs/xfs/libxfs/xfs_alloc.c +++ b/fs/xfs/libxfs/xfs_alloc.c @@ -1536,7 +1536,8 @@ xfs_alloc_ag_vextent_lastblock( */ STATIC int xfs_alloc_ag_vextent_near( - struct xfs_alloc_arg *args) + struct xfs_alloc_arg *args, + uint32_t alloc_flags) { struct xfs_alloc_cur acur = {}; int error; /* error code */ @@ -1555,6 +1556,8 @@ xfs_alloc_ag_vextent_near( if (args->agbno > args->max_agbno) args->agbno = args->max_agbno; + /* Retry once quickly if we find busy extents before blocking. */ + alloc_flags |= XFS_ALLOC_FLAG_TRYFLUSH; restart: len = 0; @@ -1610,9 +1613,20 @@ restart: */ if (!acur.len) { if (acur.busy) { + /* + * Our only valid extents must have been busy. Flush and + * retry the allocation again. If we get an -EAGAIN + * error, we're being told that a deadlock was avoided + * and the current transaction needs committing before + * the allocation can be retried. + */ trace_xfs_alloc_near_busy(args); - xfs_extent_busy_flush(args->mp, args->pag, - acur.busy_gen); + error = xfs_extent_busy_flush(args->tp, args->pag, + acur.busy_gen, alloc_flags); + if (error) + goto out; + + alloc_flags &= ~XFS_ALLOC_FLAG_TRYFLUSH; goto restart; } trace_xfs_alloc_size_neither(args); @@ -1635,22 +1649,25 @@ out: * and of the form k * prod + mod unless there's nothing that large. * Return the starting a.g. block, or NULLAGBLOCK if we can't do it. */ -STATIC int /* error */ +static int xfs_alloc_ag_vextent_size( - xfs_alloc_arg_t *args) /* allocation argument structure */ + struct xfs_alloc_arg *args, + uint32_t alloc_flags) { - struct xfs_agf *agf = args->agbp->b_addr; - struct xfs_btree_cur *bno_cur; /* cursor for bno btree */ - struct xfs_btree_cur *cnt_cur; /* cursor for cnt btree */ - int error; /* error result */ - xfs_agblock_t fbno; /* start of found freespace */ - xfs_extlen_t flen; /* length of found freespace */ - int i; /* temp status variable */ - xfs_agblock_t rbno; /* returned block number */ - xfs_extlen_t rlen; /* length of returned extent */ - bool busy; - unsigned busy_gen; + struct xfs_agf *agf = args->agbp->b_addr; + struct xfs_btree_cur *bno_cur; + struct xfs_btree_cur *cnt_cur; + xfs_agblock_t fbno; /* start of found freespace */ + xfs_extlen_t flen; /* length of found freespace */ + xfs_agblock_t rbno; /* returned block number */ + xfs_extlen_t rlen; /* length of returned extent */ + bool busy; + unsigned busy_gen; + int error; + int i; + /* Retry once quickly if we find busy extents before blocking. */ + alloc_flags |= XFS_ALLOC_FLAG_TRYFLUSH; restart: /* * Allocate and initialize a cursor for the by-size btree. @@ -1708,19 +1725,25 @@ restart: error = xfs_btree_increment(cnt_cur, 0, &i); if (error) goto error0; - if (i == 0) { - /* - * Our only valid extents must have been busy. - * Make it unbusy by forcing the log out and - * retrying. - */ - xfs_btree_del_cursor(cnt_cur, - XFS_BTREE_NOERROR); - trace_xfs_alloc_size_busy(args); - xfs_extent_busy_flush(args->mp, - args->pag, busy_gen); - goto restart; - } + if (i) + continue; + + /* + * Our only valid extents must have been busy. Flush and + * retry the allocation again. If we get an -EAGAIN + * error, we're being told that a deadlock was avoided + * and the current transaction needs committing before + * the allocation can be retried. + */ + trace_xfs_alloc_size_busy(args); + error = xfs_extent_busy_flush(args->tp, args->pag, + busy_gen, alloc_flags); + if (error) + goto error0; + + alloc_flags &= ~XFS_ALLOC_FLAG_TRYFLUSH; + xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR); + goto restart; } } @@ -1800,9 +1823,21 @@ restart: args->len = rlen; if (rlen < args->minlen) { if (busy) { - xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR); + /* + * Our only valid extents must have been busy. Flush and + * retry the allocation again. If we get an -EAGAIN + * error, we're being told that a deadlock was avoided + * and the current transaction needs committing before + * the allocation can be retried. + */ trace_xfs_alloc_size_busy(args); - xfs_extent_busy_flush(args->mp, args->pag, busy_gen); + error = xfs_extent_busy_flush(args->tp, args->pag, + busy_gen, alloc_flags); + if (error) + goto error0; + + alloc_flags &= ~XFS_ALLOC_FLAG_TRYFLUSH; + xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR); goto restart; } goto out_nominleft; @@ -2435,23 +2470,25 @@ static int xfs_defer_agfl_block( struct xfs_trans *tp, xfs_agnumber_t agno, - xfs_fsblock_t agbno, + xfs_agblock_t agbno, struct xfs_owner_info *oinfo) { struct xfs_mount *mp = tp->t_mountp; struct xfs_extent_free_item *xefi; + xfs_fsblock_t fsbno = XFS_AGB_TO_FSB(mp, agno, agbno); ASSERT(xfs_extfree_item_cache != NULL); ASSERT(oinfo != NULL); + if (XFS_IS_CORRUPT(mp, !xfs_verify_fsbno(mp, fsbno))) + return -EFSCORRUPTED; + xefi = kmem_cache_zalloc(xfs_extfree_item_cache, GFP_KERNEL | __GFP_NOFAIL); - xefi->xefi_startblock = XFS_AGB_TO_FSB(mp, agno, agbno); + xefi->xefi_startblock = fsbno; xefi->xefi_blockcount = 1; xefi->xefi_owner = oinfo->oi_owner; - - if (XFS_IS_CORRUPT(mp, !xfs_verify_fsbno(mp, xefi->xefi_startblock))) - return -EFSCORRUPTED; + xefi->xefi_agresv = XFS_AG_RESV_AGFL; trace_xfs_agfl_free_defer(mp, agno, 0, agbno, 1); @@ -2470,6 +2507,7 @@ __xfs_free_extent_later( xfs_fsblock_t bno, xfs_filblks_t len, const struct xfs_owner_info *oinfo, + enum xfs_ag_resv_type type, bool skip_discard) { struct xfs_extent_free_item *xefi; @@ -2490,6 +2528,7 @@ __xfs_free_extent_later( ASSERT(agbno + len <= mp->m_sb.sb_agblocks); #endif ASSERT(xfs_extfree_item_cache != NULL); + ASSERT(type != XFS_AG_RESV_AGFL); if (XFS_IS_CORRUPT(mp, !xfs_verify_fsbext(mp, bno, len))) return -EFSCORRUPTED; @@ -2498,6 +2537,7 @@ __xfs_free_extent_later( GFP_KERNEL | __GFP_NOFAIL); xefi->xefi_startblock = bno; xefi->xefi_blockcount = (xfs_extlen_t)len; + xefi->xefi_agresv = type; if (skip_discard) xefi->xefi_flags |= XFS_EFI_SKIP_DISCARD; if (oinfo) { @@ -2568,7 +2608,7 @@ out: int /* error */ xfs_alloc_fix_freelist( struct xfs_alloc_arg *args, /* allocation argument structure */ - int flags) /* XFS_ALLOC_FLAG_... */ + uint32_t alloc_flags) { struct xfs_mount *mp = args->mp; struct xfs_perag *pag = args->pag; @@ -2584,7 +2624,7 @@ xfs_alloc_fix_freelist( ASSERT(tp->t_flags & XFS_TRANS_PERM_LOG_RES); if (!xfs_perag_initialised_agf(pag)) { - error = xfs_alloc_read_agf(pag, tp, flags, &agbp); + error = xfs_alloc_read_agf(pag, tp, alloc_flags, &agbp); if (error) { /* Couldn't lock the AGF so skip this AG. */ if (error == -EAGAIN) @@ -2600,13 +2640,13 @@ xfs_alloc_fix_freelist( */ if (xfs_perag_prefers_metadata(pag) && (args->datatype & XFS_ALLOC_USERDATA) && - (flags & XFS_ALLOC_FLAG_TRYLOCK)) { - ASSERT(!(flags & XFS_ALLOC_FLAG_FREEING)); + (alloc_flags & XFS_ALLOC_FLAG_TRYLOCK)) { + ASSERT(!(alloc_flags & XFS_ALLOC_FLAG_FREEING)); goto out_agbp_relse; } need = xfs_alloc_min_freelist(mp, pag); - if (!xfs_alloc_space_available(args, need, flags | + if (!xfs_alloc_space_available(args, need, alloc_flags | XFS_ALLOC_FLAG_CHECK)) goto out_agbp_relse; @@ -2615,7 +2655,7 @@ xfs_alloc_fix_freelist( * Can fail if we're not blocking on locks, and it's held. */ if (!agbp) { - error = xfs_alloc_read_agf(pag, tp, flags, &agbp); + error = xfs_alloc_read_agf(pag, tp, alloc_flags, &agbp); if (error) { /* Couldn't lock the AGF so skip this AG. */ if (error == -EAGAIN) @@ -2630,7 +2670,7 @@ xfs_alloc_fix_freelist( /* If there isn't enough total space or single-extent, reject it. */ need = xfs_alloc_min_freelist(mp, pag); - if (!xfs_alloc_space_available(args, need, flags)) + if (!xfs_alloc_space_available(args, need, alloc_flags)) goto out_agbp_relse; #ifdef DEBUG @@ -2668,11 +2708,12 @@ xfs_alloc_fix_freelist( */ memset(&targs, 0, sizeof(targs)); /* struct copy below */ - if (flags & XFS_ALLOC_FLAG_NORMAP) + if (alloc_flags & XFS_ALLOC_FLAG_NORMAP) targs.oinfo = XFS_RMAP_OINFO_SKIP_UPDATE; else targs.oinfo = XFS_RMAP_OINFO_AG; - while (!(flags & XFS_ALLOC_FLAG_NOSHRINK) && pag->pagf_flcount > need) { + while (!(alloc_flags & XFS_ALLOC_FLAG_NOSHRINK) && + pag->pagf_flcount > need) { error = xfs_alloc_get_freelist(pag, tp, agbp, &bno, 0); if (error) goto out_agbp_relse; @@ -2700,7 +2741,7 @@ xfs_alloc_fix_freelist( targs.resv = XFS_AG_RESV_AGFL; /* Allocate as many blocks as possible at once. */ - error = xfs_alloc_ag_vextent_size(&targs); + error = xfs_alloc_ag_vextent_size(&targs, alloc_flags); if (error) goto out_agflbp_relse; @@ -2710,7 +2751,7 @@ xfs_alloc_fix_freelist( * on a completely full ag. */ if (targs.agbno == NULLAGBLOCK) { - if (flags & XFS_ALLOC_FLAG_FREEING) + if (alloc_flags & XFS_ALLOC_FLAG_FREEING) break; goto out_agflbp_relse; } @@ -2916,6 +2957,47 @@ xfs_alloc_put_freelist( } /* + * Check that this AGF/AGI header's sequence number and length matches the AG + * number and size in fsblocks. + */ +xfs_failaddr_t +xfs_validate_ag_length( + struct xfs_buf *bp, + uint32_t seqno, + uint32_t length) +{ + struct xfs_mount *mp = bp->b_mount; + /* + * During growfs operations, the perag is not fully initialised, + * so we can't use it for any useful checking. growfs ensures we can't + * use it by using uncached buffers that don't have the perag attached + * so we can detect and avoid this problem. + */ + if (bp->b_pag && seqno != bp->b_pag->pag_agno) + return __this_address; + + /* + * Only the last AG in the filesystem is allowed to be shorter + * than the AG size recorded in the superblock. + */ + if (length != mp->m_sb.sb_agblocks) { + /* + * During growfs, the new last AG can get here before we + * have updated the superblock. Give it a pass on the seqno + * check. + */ + if (bp->b_pag && seqno != mp->m_sb.sb_agcount - 1) + return __this_address; + if (length < XFS_MIN_AG_BLOCKS) + return __this_address; + if (length > mp->m_sb.sb_agblocks) + return __this_address; + } + + return NULL; +} + +/* * Verify the AGF is consistent. * * We do not verify the AGFL indexes in the AGF are fully consistent here @@ -2934,6 +3016,9 @@ xfs_agf_verify( { struct xfs_mount *mp = bp->b_mount; struct xfs_agf *agf = bp->b_addr; + xfs_failaddr_t fa; + uint32_t agf_seqno = be32_to_cpu(agf->agf_seqno); + uint32_t agf_length = be32_to_cpu(agf->agf_length); if (xfs_has_crc(mp)) { if (!uuid_equal(&agf->agf_uuid, &mp->m_sb.sb_meta_uuid)) @@ -2945,18 +3030,26 @@ xfs_agf_verify( if (!xfs_verify_magic(bp, agf->agf_magicnum)) return __this_address; - if (!(XFS_AGF_GOOD_VERSION(be32_to_cpu(agf->agf_versionnum)) && - be32_to_cpu(agf->agf_freeblks) <= be32_to_cpu(agf->agf_length) && - be32_to_cpu(agf->agf_flfirst) < xfs_agfl_size(mp) && - be32_to_cpu(agf->agf_fllast) < xfs_agfl_size(mp) && - be32_to_cpu(agf->agf_flcount) <= xfs_agfl_size(mp))) + if (!XFS_AGF_GOOD_VERSION(be32_to_cpu(agf->agf_versionnum))) return __this_address; - if (be32_to_cpu(agf->agf_length) > mp->m_sb.sb_dblocks) + /* + * Both agf_seqno and agf_length need to validated before anything else + * block number related in the AGF or AGFL can be checked. + */ + fa = xfs_validate_ag_length(bp, agf_seqno, agf_length); + if (fa) + return fa; + + if (be32_to_cpu(agf->agf_flfirst) >= xfs_agfl_size(mp)) + return __this_address; + if (be32_to_cpu(agf->agf_fllast) >= xfs_agfl_size(mp)) + return __this_address; + if (be32_to_cpu(agf->agf_flcount) > xfs_agfl_size(mp)) return __this_address; if (be32_to_cpu(agf->agf_freeblks) < be32_to_cpu(agf->agf_longest) || - be32_to_cpu(agf->agf_freeblks) > be32_to_cpu(agf->agf_length)) + be32_to_cpu(agf->agf_freeblks) > agf_length) return __this_address; if (be32_to_cpu(agf->agf_levels[XFS_BTNUM_BNO]) < 1 || @@ -2967,38 +3060,28 @@ xfs_agf_verify( mp->m_alloc_maxlevels) return __this_address; - if (xfs_has_rmapbt(mp) && - (be32_to_cpu(agf->agf_levels[XFS_BTNUM_RMAP]) < 1 || - be32_to_cpu(agf->agf_levels[XFS_BTNUM_RMAP]) > - mp->m_rmap_maxlevels)) - return __this_address; - - if (xfs_has_rmapbt(mp) && - be32_to_cpu(agf->agf_rmap_blocks) > be32_to_cpu(agf->agf_length)) + if (xfs_has_lazysbcount(mp) && + be32_to_cpu(agf->agf_btreeblks) > agf_length) return __this_address; - /* - * during growfs operations, the perag is not fully initialised, - * so we can't use it for any useful checking. growfs ensures we can't - * use it by using uncached buffers that don't have the perag attached - * so we can detect and avoid this problem. - */ - if (bp->b_pag && be32_to_cpu(agf->agf_seqno) != bp->b_pag->pag_agno) - return __this_address; + if (xfs_has_rmapbt(mp)) { + if (be32_to_cpu(agf->agf_rmap_blocks) > agf_length) + return __this_address; - if (xfs_has_lazysbcount(mp) && - be32_to_cpu(agf->agf_btreeblks) > be32_to_cpu(agf->agf_length)) - return __this_address; + if (be32_to_cpu(agf->agf_levels[XFS_BTNUM_RMAP]) < 1 || + be32_to_cpu(agf->agf_levels[XFS_BTNUM_RMAP]) > + mp->m_rmap_maxlevels) + return __this_address; + } - if (xfs_has_reflink(mp) && - be32_to_cpu(agf->agf_refcount_blocks) > - be32_to_cpu(agf->agf_length)) - return __this_address; + if (xfs_has_reflink(mp)) { + if (be32_to_cpu(agf->agf_refcount_blocks) > agf_length) + return __this_address; - if (xfs_has_reflink(mp) && - (be32_to_cpu(agf->agf_refcount_level) < 1 || - be32_to_cpu(agf->agf_refcount_level) > mp->m_refc_maxlevels)) - return __this_address; + if (be32_to_cpu(agf->agf_refcount_level) < 1 || + be32_to_cpu(agf->agf_refcount_level) > mp->m_refc_maxlevels) + return __this_address; + } return NULL; } @@ -3226,7 +3309,7 @@ xfs_alloc_vextent_check_args( static int xfs_alloc_vextent_prepare_ag( struct xfs_alloc_arg *args, - uint32_t flags) + uint32_t alloc_flags) { bool need_pag = !args->pag; int error; @@ -3235,7 +3318,7 @@ xfs_alloc_vextent_prepare_ag( args->pag = xfs_perag_get(args->mp, args->agno); args->agbp = NULL; - error = xfs_alloc_fix_freelist(args, flags); + error = xfs_alloc_fix_freelist(args, alloc_flags); if (error) { trace_xfs_alloc_vextent_nofix(args); if (need_pag) @@ -3357,6 +3440,7 @@ xfs_alloc_vextent_this_ag( { struct xfs_mount *mp = args->mp; xfs_agnumber_t minimum_agno; + uint32_t alloc_flags = 0; int error; ASSERT(args->pag != NULL); @@ -3375,9 +3459,9 @@ xfs_alloc_vextent_this_ag( return error; } - error = xfs_alloc_vextent_prepare_ag(args, 0); + error = xfs_alloc_vextent_prepare_ag(args, alloc_flags); if (!error && args->agbp) - error = xfs_alloc_ag_vextent_size(args); + error = xfs_alloc_ag_vextent_size(args, alloc_flags); return xfs_alloc_vextent_finish(args, minimum_agno, error, false); } @@ -3406,20 +3490,20 @@ xfs_alloc_vextent_iterate_ags( xfs_agnumber_t minimum_agno, xfs_agnumber_t start_agno, xfs_agblock_t target_agbno, - uint32_t flags) + uint32_t alloc_flags) { struct xfs_mount *mp = args->mp; xfs_agnumber_t restart_agno = minimum_agno; xfs_agnumber_t agno; int error = 0; - if (flags & XFS_ALLOC_FLAG_TRYLOCK) + if (alloc_flags & XFS_ALLOC_FLAG_TRYLOCK) restart_agno = 0; restart: for_each_perag_wrap_range(mp, start_agno, restart_agno, mp->m_sb.sb_agcount, agno, args->pag) { args->agno = agno; - error = xfs_alloc_vextent_prepare_ag(args, flags); + error = xfs_alloc_vextent_prepare_ag(args, alloc_flags); if (error) break; if (!args->agbp) { @@ -3433,10 +3517,10 @@ restart: */ if (args->agno == start_agno && target_agbno) { args->agbno = target_agbno; - error = xfs_alloc_ag_vextent_near(args); + error = xfs_alloc_ag_vextent_near(args, alloc_flags); } else { args->agbno = 0; - error = xfs_alloc_ag_vextent_size(args); + error = xfs_alloc_ag_vextent_size(args, alloc_flags); } break; } @@ -3453,8 +3537,8 @@ restart: * constraining flags by the caller, drop them and retry the allocation * without any constraints being set. */ - if (flags) { - flags = 0; + if (alloc_flags & XFS_ALLOC_FLAG_TRYLOCK) { + alloc_flags &= ~XFS_ALLOC_FLAG_TRYLOCK; restart_agno = minimum_agno; goto restart; } @@ -3482,6 +3566,7 @@ xfs_alloc_vextent_start_ag( xfs_agnumber_t start_agno; xfs_agnumber_t rotorstep = xfs_rotorstep; bool bump_rotor = false; + uint32_t alloc_flags = XFS_ALLOC_FLAG_TRYLOCK; int error; ASSERT(args->pag == NULL); @@ -3508,7 +3593,7 @@ xfs_alloc_vextent_start_ag( start_agno = max(minimum_agno, XFS_FSB_TO_AGNO(mp, target)); error = xfs_alloc_vextent_iterate_ags(args, minimum_agno, start_agno, - XFS_FSB_TO_AGBNO(mp, target), XFS_ALLOC_FLAG_TRYLOCK); + XFS_FSB_TO_AGBNO(mp, target), alloc_flags); if (bump_rotor) { if (args->agno == start_agno) @@ -3535,6 +3620,7 @@ xfs_alloc_vextent_first_ag( struct xfs_mount *mp = args->mp; xfs_agnumber_t minimum_agno; xfs_agnumber_t start_agno; + uint32_t alloc_flags = XFS_ALLOC_FLAG_TRYLOCK; int error; ASSERT(args->pag == NULL); @@ -3553,7 +3639,7 @@ xfs_alloc_vextent_first_ag( start_agno = max(minimum_agno, XFS_FSB_TO_AGNO(mp, target)); error = xfs_alloc_vextent_iterate_ags(args, minimum_agno, start_agno, - XFS_FSB_TO_AGBNO(mp, target), 0); + XFS_FSB_TO_AGBNO(mp, target), alloc_flags); return xfs_alloc_vextent_finish(args, minimum_agno, error, true); } @@ -3606,6 +3692,7 @@ xfs_alloc_vextent_near_bno( struct xfs_mount *mp = args->mp; xfs_agnumber_t minimum_agno; bool needs_perag = args->pag == NULL; + uint32_t alloc_flags = 0; int error; if (!needs_perag) @@ -3626,9 +3713,9 @@ xfs_alloc_vextent_near_bno( if (needs_perag) args->pag = xfs_perag_grab(mp, args->agno); - error = xfs_alloc_vextent_prepare_ag(args, 0); + error = xfs_alloc_vextent_prepare_ag(args, alloc_flags); if (!error && args->agbp) - error = xfs_alloc_ag_vextent_near(args); + error = xfs_alloc_ag_vextent_near(args, alloc_flags); return xfs_alloc_vextent_finish(args, minimum_agno, error, needs_perag); } @@ -3756,15 +3843,11 @@ xfs_alloc_query_range( xfs_alloc_query_range_fn fn, void *priv) { - union xfs_btree_irec low_brec; - union xfs_btree_irec high_brec; - struct xfs_alloc_query_range_info query; + union xfs_btree_irec low_brec = { .a = *low_rec }; + union xfs_btree_irec high_brec = { .a = *high_rec }; + struct xfs_alloc_query_range_info query = { .priv = priv, .fn = fn }; ASSERT(cur->bc_btnum == XFS_BTNUM_BNO); - low_brec.a = *low_rec; - high_brec.a = *high_rec; - query.priv = priv; - query.fn = fn; return xfs_btree_query_range(cur, &low_brec, &high_brec, xfs_alloc_query_range_helper, &query); } diff --git a/fs/xfs/libxfs/xfs_alloc.h b/fs/xfs/libxfs/xfs_alloc.h index 85ac470be0da..6bb8d295c321 100644 --- a/fs/xfs/libxfs/xfs_alloc.h +++ b/fs/xfs/libxfs/xfs_alloc.h @@ -19,11 +19,12 @@ unsigned int xfs_agfl_size(struct xfs_mount *mp); /* * Flags for xfs_alloc_fix_freelist. */ -#define XFS_ALLOC_FLAG_TRYLOCK 0x00000001 /* use trylock for buffer locking */ -#define XFS_ALLOC_FLAG_FREEING 0x00000002 /* indicate caller is freeing extents*/ -#define XFS_ALLOC_FLAG_NORMAP 0x00000004 /* don't modify the rmapbt */ -#define XFS_ALLOC_FLAG_NOSHRINK 0x00000008 /* don't shrink the freelist */ -#define XFS_ALLOC_FLAG_CHECK 0x00000010 /* test only, don't modify args */ +#define XFS_ALLOC_FLAG_TRYLOCK (1U << 0) /* use trylock for buffer locking */ +#define XFS_ALLOC_FLAG_FREEING (1U << 1) /* indicate caller is freeing extents*/ +#define XFS_ALLOC_FLAG_NORMAP (1U << 2) /* don't modify the rmapbt */ +#define XFS_ALLOC_FLAG_NOSHRINK (1U << 3) /* don't shrink the freelist */ +#define XFS_ALLOC_FLAG_CHECK (1U << 4) /* test only, don't modify args */ +#define XFS_ALLOC_FLAG_TRYFLUSH (1U << 5) /* don't wait in busy extent flush */ /* * Argument structure for xfs_alloc routines. @@ -195,7 +196,7 @@ int xfs_alloc_read_agfl(struct xfs_perag *pag, struct xfs_trans *tp, struct xfs_buf **bpp); int xfs_free_agfl_block(struct xfs_trans *, xfs_agnumber_t, xfs_agblock_t, struct xfs_buf *, struct xfs_owner_info *); -int xfs_alloc_fix_freelist(struct xfs_alloc_arg *args, int flags); +int xfs_alloc_fix_freelist(struct xfs_alloc_arg *args, uint32_t alloc_flags); int xfs_free_extent_fix_freelist(struct xfs_trans *tp, struct xfs_perag *pag, struct xfs_buf **agbp); @@ -232,7 +233,7 @@ xfs_buf_to_agfl_bno( int __xfs_free_extent_later(struct xfs_trans *tp, xfs_fsblock_t bno, xfs_filblks_t len, const struct xfs_owner_info *oinfo, - bool skip_discard); + enum xfs_ag_resv_type type, bool skip_discard); /* * List of extents to be free "later". @@ -245,6 +246,7 @@ struct xfs_extent_free_item { xfs_extlen_t xefi_blockcount;/* number of blocks in extent */ struct xfs_perag *xefi_pag; unsigned int xefi_flags; + enum xfs_ag_resv_type xefi_agresv; }; void xfs_extent_free_get_group(struct xfs_mount *mp, @@ -259,9 +261,10 @@ xfs_free_extent_later( struct xfs_trans *tp, xfs_fsblock_t bno, xfs_filblks_t len, - const struct xfs_owner_info *oinfo) + const struct xfs_owner_info *oinfo, + enum xfs_ag_resv_type type) { - return __xfs_free_extent_later(tp, bno, len, oinfo, false); + return __xfs_free_extent_later(tp, bno, len, oinfo, type, false); } @@ -270,4 +273,7 @@ extern struct kmem_cache *xfs_extfree_item_cache; int __init xfs_extfree_intent_init_cache(void); void xfs_extfree_intent_destroy_cache(void); +xfs_failaddr_t xfs_validate_ag_length(struct xfs_buf *bp, uint32_t seqno, + uint32_t length); + #endif /* __XFS_ALLOC_H__ */ diff --git a/fs/xfs/libxfs/xfs_attr_leaf.c b/fs/xfs/libxfs/xfs_attr_leaf.c index beee51ad75ce..2580ae47209a 100644 --- a/fs/xfs/libxfs/xfs_attr_leaf.c +++ b/fs/xfs/libxfs/xfs_attr_leaf.c @@ -2293,8 +2293,6 @@ xfs_attr3_leaf_unbalance( trace_xfs_attr_leaf_unbalance(state->args); - drop_leaf = drop_blk->bp->b_addr; - save_leaf = save_blk->bp->b_addr; xfs_attr3_leaf_hdr_from_disk(state->args->geo, &drophdr, drop_leaf); xfs_attr3_leaf_hdr_from_disk(state->args->geo, &savehdr, save_leaf); entry = xfs_attr3_leaf_entryp(drop_leaf); diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c index fef35696adb7..30c931b38853 100644 --- a/fs/xfs/libxfs/xfs_bmap.c +++ b/fs/xfs/libxfs/xfs_bmap.c @@ -574,7 +574,8 @@ xfs_bmap_btree_to_extents( return error; xfs_rmap_ino_bmbt_owner(&oinfo, ip->i_ino, whichfork); - error = xfs_free_extent_later(cur->bc_tp, cbno, 1, &oinfo); + error = xfs_free_extent_later(cur->bc_tp, cbno, 1, &oinfo, + XFS_AG_RESV_NONE); if (error) return error; @@ -5236,8 +5237,9 @@ xfs_bmap_del_extent_real( } else { error = __xfs_free_extent_later(tp, del->br_startblock, del->br_blockcount, NULL, - (bflags & XFS_BMAPI_NODISCARD) || - del->br_state == XFS_EXT_UNWRITTEN); + XFS_AG_RESV_NONE, + ((bflags & XFS_BMAPI_NODISCARD) || + del->br_state == XFS_EXT_UNWRITTEN)); if (error) goto done; } diff --git a/fs/xfs/libxfs/xfs_bmap_btree.c b/fs/xfs/libxfs/xfs_bmap_btree.c index 36564ae3084f..bf3f1b36fdd2 100644 --- a/fs/xfs/libxfs/xfs_bmap_btree.c +++ b/fs/xfs/libxfs/xfs_bmap_btree.c @@ -271,7 +271,8 @@ xfs_bmbt_free_block( int error; xfs_rmap_ino_bmbt_owner(&oinfo, ip->i_ino, cur->bc_ino.whichfork); - error = xfs_free_extent_later(cur->bc_tp, fsbno, 1, &oinfo); + error = xfs_free_extent_later(cur->bc_tp, fsbno, 1, &oinfo, + XFS_AG_RESV_NONE); if (error) return error; diff --git a/fs/xfs/libxfs/xfs_da_format.h b/fs/xfs/libxfs/xfs_da_format.h index 25e2841084e1..f9015f88eca7 100644 --- a/fs/xfs/libxfs/xfs_da_format.h +++ b/fs/xfs/libxfs/xfs_da_format.h @@ -591,7 +591,7 @@ struct xfs_attr_shortform { uint8_t valuelen; /* actual length of value (no NULL) */ uint8_t flags; /* flags bits (see xfs_attr_leaf.h) */ uint8_t nameval[]; /* name & value bytes concatenated */ - } list[1]; /* variable sized array */ + } list[]; /* variable sized array */ }; typedef struct xfs_attr_leaf_map { /* RLE map of free bytes */ @@ -620,19 +620,29 @@ typedef struct xfs_attr_leaf_entry { /* sorted on key, not name */ typedef struct xfs_attr_leaf_name_local { __be16 valuelen; /* number of bytes in value */ __u8 namelen; /* length of name bytes */ - __u8 nameval[1]; /* name/value bytes */ + /* + * In Linux 6.5 this flex array was converted from nameval[1] to + * nameval[]. Be very careful here about extra padding at the end; + * see xfs_attr_leaf_entsize_local() for details. + */ + __u8 nameval[]; /* name/value bytes */ } xfs_attr_leaf_name_local_t; typedef struct xfs_attr_leaf_name_remote { __be32 valueblk; /* block number of value bytes */ __be32 valuelen; /* number of bytes in value */ __u8 namelen; /* length of name bytes */ - __u8 name[1]; /* name bytes */ + /* + * In Linux 6.5 this flex array was converted from name[1] to name[]. + * Be very careful here about extra padding at the end; see + * xfs_attr_leaf_entsize_remote() for details. + */ + __u8 name[]; /* name bytes */ } xfs_attr_leaf_name_remote_t; typedef struct xfs_attr_leafblock { xfs_attr_leaf_hdr_t hdr; /* constant-structure header block */ - xfs_attr_leaf_entry_t entries[1]; /* sorted on key, not name */ + xfs_attr_leaf_entry_t entries[]; /* sorted on key, not name */ /* * The rest of the block contains the following structures after the * leaf entries, growing from the bottom up. The variables are never @@ -664,7 +674,7 @@ struct xfs_attr3_leaf_hdr { struct xfs_attr3_leafblock { struct xfs_attr3_leaf_hdr hdr; - struct xfs_attr_leaf_entry entries[1]; + struct xfs_attr_leaf_entry entries[]; /* * The rest of the block contains the following structures after the @@ -747,14 +757,61 @@ xfs_attr3_leaf_name_local(xfs_attr_leafblock_t *leafp, int idx) */ static inline int xfs_attr_leaf_entsize_remote(int nlen) { - return round_up(sizeof(struct xfs_attr_leaf_name_remote) - 1 + - nlen, XFS_ATTR_LEAF_NAME_ALIGN); + /* + * Prior to Linux 6.5, struct xfs_attr_leaf_name_remote ended with + * name[1], which was used as a flexarray. The layout of this struct + * is 9 bytes of fixed-length fields followed by a __u8 flex array at + * offset 9. + * + * On most architectures, struct xfs_attr_leaf_name_remote had two + * bytes of implicit padding at the end of the struct to make the + * struct length 12. After converting name[1] to name[], there are + * three implicit padding bytes and the struct size remains 12. + * However, there are compiler configurations that do not add implicit + * padding at all (m68k) and have been broken for years. + * + * This entsize computation historically added (the xattr name length) + * to (the padded struct length - 1) and rounded that sum up to the + * nearest multiple of 4 (NAME_ALIGN). IOWs, round_up(11 + nlen, 4). + * This is encoded in the ondisk format, so we cannot change this. + * + * Compute the entsize from offsetof of the flexarray and manually + * adding bytes for the implicit padding. + */ + const size_t remotesize = + offsetof(struct xfs_attr_leaf_name_remote, name) + 2; + + return round_up(remotesize + nlen, XFS_ATTR_LEAF_NAME_ALIGN); } static inline int xfs_attr_leaf_entsize_local(int nlen, int vlen) { - return round_up(sizeof(struct xfs_attr_leaf_name_local) - 1 + - nlen + vlen, XFS_ATTR_LEAF_NAME_ALIGN); + /* + * Prior to Linux 6.5, struct xfs_attr_leaf_name_local ended with + * nameval[1], which was used as a flexarray. The layout of this + * struct is 3 bytes of fixed-length fields followed by a __u8 flex + * array at offset 3. + * + * struct xfs_attr_leaf_name_local had zero bytes of implicit padding + * at the end of the struct to make the struct length 4. On most + * architectures, after converting nameval[1] to nameval[], there is + * one implicit padding byte and the struct size remains 4. However, + * there are compiler configurations that do not add implicit padding + * at all (m68k) and would break. + * + * This entsize computation historically added (the xattr name and + * value length) to (the padded struct length - 1) and rounded that sum + * up to the nearest multiple of 4 (NAME_ALIGN). IOWs, the formula is + * round_up(3 + nlen + vlen, 4). This is encoded in the ondisk format, + * so we cannot change this. + * + * Compute the entsize from offsetof of the flexarray and manually + * adding bytes for the implicit padding. + */ + const size_t localsize = + offsetof(struct xfs_attr_leaf_name_local, nameval); + + return round_up(localsize + nlen + vlen, XFS_ATTR_LEAF_NAME_ALIGN); } static inline int xfs_attr_leaf_entsize_local_max(int bsize) diff --git a/fs/xfs/libxfs/xfs_fs.h b/fs/xfs/libxfs/xfs_fs.h index 9c60ebb328b4..2cbf9ea39b8c 100644 --- a/fs/xfs/libxfs/xfs_fs.h +++ b/fs/xfs/libxfs/xfs_fs.h @@ -592,12 +592,12 @@ typedef struct xfs_attrlist_cursor { struct xfs_attrlist { __s32 al_count; /* number of entries in attrlist */ __s32 al_more; /* T/F: more attrs (do call again) */ - __s32 al_offset[1]; /* byte offsets of attrs [var-sized] */ + __s32 al_offset[]; /* byte offsets of attrs [var-sized] */ }; struct xfs_attrlist_ent { /* data from attr_list() */ __u32 a_valuelen; /* number bytes in value of attr */ - char a_name[1]; /* attr name (NULL terminated) */ + char a_name[]; /* attr name (NULL terminated) */ }; typedef struct xfs_fsop_attrlist_handlereq { diff --git a/fs/xfs/libxfs/xfs_ialloc.c b/fs/xfs/libxfs/xfs_ialloc.c index 34600f94c2f4..b83e54c70906 100644 --- a/fs/xfs/libxfs/xfs_ialloc.c +++ b/fs/xfs/libxfs/xfs_ialloc.c @@ -1853,8 +1853,8 @@ xfs_difree_inode_chunk( /* not sparse, calculate extent info directly */ return xfs_free_extent_later(tp, XFS_AGB_TO_FSB(mp, agno, sagbno), - M_IGEO(mp)->ialloc_blks, - &XFS_RMAP_OINFO_INODES); + M_IGEO(mp)->ialloc_blks, &XFS_RMAP_OINFO_INODES, + XFS_AG_RESV_NONE); } /* holemask is only 16-bits (fits in an unsigned long) */ @@ -1899,8 +1899,8 @@ xfs_difree_inode_chunk( ASSERT(agbno % mp->m_sb.sb_spino_align == 0); ASSERT(contigblk % mp->m_sb.sb_spino_align == 0); error = xfs_free_extent_later(tp, - XFS_AGB_TO_FSB(mp, agno, agbno), - contigblk, &XFS_RMAP_OINFO_INODES); + XFS_AGB_TO_FSB(mp, agno, agbno), contigblk, + &XFS_RMAP_OINFO_INODES, XFS_AG_RESV_NONE); if (error) return error; @@ -2486,11 +2486,14 @@ xfs_ialloc_log_agi( static xfs_failaddr_t xfs_agi_verify( - struct xfs_buf *bp) + struct xfs_buf *bp) { - struct xfs_mount *mp = bp->b_mount; - struct xfs_agi *agi = bp->b_addr; - int i; + struct xfs_mount *mp = bp->b_mount; + struct xfs_agi *agi = bp->b_addr; + xfs_failaddr_t fa; + uint32_t agi_seqno = be32_to_cpu(agi->agi_seqno); + uint32_t agi_length = be32_to_cpu(agi->agi_length); + int i; if (xfs_has_crc(mp)) { if (!uuid_equal(&agi->agi_uuid, &mp->m_sb.sb_meta_uuid)) @@ -2507,6 +2510,10 @@ xfs_agi_verify( if (!XFS_AGI_GOOD_VERSION(be32_to_cpu(agi->agi_versionnum))) return __this_address; + fa = xfs_validate_ag_length(bp, agi_seqno, agi_length); + if (fa) + return fa; + if (be32_to_cpu(agi->agi_level) < 1 || be32_to_cpu(agi->agi_level) > M_IGEO(mp)->inobt_maxlevels) return __this_address; @@ -2516,15 +2523,6 @@ xfs_agi_verify( be32_to_cpu(agi->agi_free_level) > M_IGEO(mp)->inobt_maxlevels)) return __this_address; - /* - * during growfs operations, the perag is not fully initialised, - * so we can't use it for any useful checking. growfs ensures we can't - * use it by using uncached buffers that don't have the perag attached - * so we can detect and avoid this problem. - */ - if (bp->b_pag && be32_to_cpu(agi->agi_seqno) != bp->b_pag->pag_agno) - return __this_address; - for (i = 0; i < XFS_AGI_UNLINKED_BUCKETS; i++) { if (agi->agi_unlinked[i] == cpu_to_be32(NULLAGINO)) continue; diff --git a/fs/xfs/libxfs/xfs_ialloc_btree.c b/fs/xfs/libxfs/xfs_ialloc_btree.c index 5a945ae21b5d..9258f01c0015 100644 --- a/fs/xfs/libxfs/xfs_ialloc_btree.c +++ b/fs/xfs/libxfs/xfs_ialloc_btree.c @@ -160,8 +160,7 @@ __xfs_inobt_free_block( xfs_inobt_mod_blockcount(cur, -1); fsbno = XFS_DADDR_TO_FSB(cur->bc_mp, xfs_buf_daddr(bp)); - return xfs_free_extent(cur->bc_tp, cur->bc_ag.pag, - XFS_FSB_TO_AGBNO(cur->bc_mp, fsbno), 1, + return xfs_free_extent_later(cur->bc_tp, fsbno, 1, &XFS_RMAP_OINFO_INOBT, resv); } diff --git a/fs/xfs/libxfs/xfs_refcount.c b/fs/xfs/libxfs/xfs_refcount.c index b6e21433925c..646b3fa362ad 100644 --- a/fs/xfs/libxfs/xfs_refcount.c +++ b/fs/xfs/libxfs/xfs_refcount.c @@ -1152,7 +1152,8 @@ xfs_refcount_adjust_extents( cur->bc_ag.pag->pag_agno, tmp.rc_startblock); error = xfs_free_extent_later(cur->bc_tp, fsbno, - tmp.rc_blockcount, NULL); + tmp.rc_blockcount, NULL, + XFS_AG_RESV_NONE); if (error) goto out_error; } @@ -1213,7 +1214,8 @@ xfs_refcount_adjust_extents( cur->bc_ag.pag->pag_agno, ext.rc_startblock); error = xfs_free_extent_later(cur->bc_tp, fsbno, - ext.rc_blockcount, NULL); + ext.rc_blockcount, NULL, + XFS_AG_RESV_NONE); if (error) goto out_error; } @@ -1919,8 +1921,13 @@ xfs_refcount_recover_cow_leftovers( struct xfs_buf *agbp; struct xfs_refcount_recovery *rr, *n; struct list_head debris; - union xfs_btree_irec low; - union xfs_btree_irec high; + union xfs_btree_irec low = { + .rc.rc_domain = XFS_REFC_DOMAIN_COW, + }; + union xfs_btree_irec high = { + .rc.rc_domain = XFS_REFC_DOMAIN_COW, + .rc.rc_startblock = -1U, + }; xfs_fsblock_t fsb; int error; @@ -1951,10 +1958,6 @@ xfs_refcount_recover_cow_leftovers( cur = xfs_refcountbt_init_cursor(mp, tp, agbp, pag); /* Find all the leftover CoW staging extents. */ - memset(&low, 0, sizeof(low)); - memset(&high, 0, sizeof(high)); - low.rc.rc_domain = high.rc.rc_domain = XFS_REFC_DOMAIN_COW; - high.rc.rc_startblock = -1U; error = xfs_btree_query_range(cur, &low, &high, xfs_refcount_recover_extent, &debris); xfs_btree_del_cursor(cur, error); @@ -1981,7 +1984,8 @@ xfs_refcount_recover_cow_leftovers( /* Free the block. */ error = xfs_free_extent_later(tp, fsb, - rr->rr_rrec.rc_blockcount, NULL); + rr->rr_rrec.rc_blockcount, NULL, + XFS_AG_RESV_NONE); if (error) goto out_trans; diff --git a/fs/xfs/libxfs/xfs_refcount_btree.c b/fs/xfs/libxfs/xfs_refcount_btree.c index d4afc5f4e6a5..5c3987d8dc24 100644 --- a/fs/xfs/libxfs/xfs_refcount_btree.c +++ b/fs/xfs/libxfs/xfs_refcount_btree.c @@ -106,19 +106,13 @@ xfs_refcountbt_free_block( struct xfs_buf *agbp = cur->bc_ag.agbp; struct xfs_agf *agf = agbp->b_addr; xfs_fsblock_t fsbno = XFS_DADDR_TO_FSB(mp, xfs_buf_daddr(bp)); - int error; trace_xfs_refcountbt_free_block(cur->bc_mp, cur->bc_ag.pag->pag_agno, XFS_FSB_TO_AGBNO(cur->bc_mp, fsbno), 1); be32_add_cpu(&agf->agf_refcount_blocks, -1); xfs_alloc_log_agf(cur->bc_tp, agbp, XFS_AGF_REFCOUNT_BLOCKS); - error = xfs_free_extent(cur->bc_tp, cur->bc_ag.pag, - XFS_FSB_TO_AGBNO(cur->bc_mp, fsbno), 1, + return xfs_free_extent_later(cur->bc_tp, fsbno, 1, &XFS_RMAP_OINFO_REFC, XFS_AG_RESV_METADATA); - if (error) - return error; - - return error; } STATIC int diff --git a/fs/xfs/libxfs/xfs_rmap.c b/fs/xfs/libxfs/xfs_rmap.c index f4dc23b3b837..fbb0b2637463 100644 --- a/fs/xfs/libxfs/xfs_rmap.c +++ b/fs/xfs/libxfs/xfs_rmap.c @@ -2389,14 +2389,10 @@ xfs_rmap_query_range( xfs_rmap_query_range_fn fn, void *priv) { - union xfs_btree_irec low_brec; - union xfs_btree_irec high_brec; - struct xfs_rmap_query_range_info query; + union xfs_btree_irec low_brec = { .r = *low_rec }; + union xfs_btree_irec high_brec = { .r = *high_rec }; + struct xfs_rmap_query_range_info query = { .priv = priv, .fn = fn }; - low_brec.r = *low_rec; - high_brec.r = *high_rec; - query.priv = priv; - query.fn = fn; return xfs_btree_query_range(cur, &low_brec, &high_brec, xfs_rmap_query_range_helper, &query); } diff --git a/fs/xfs/libxfs/xfs_sb.c b/fs/xfs/libxfs/xfs_sb.c index ba0f17bc1dc0..5e174685a77c 100644 --- a/fs/xfs/libxfs/xfs_sb.c +++ b/fs/xfs/libxfs/xfs_sb.c @@ -412,7 +412,6 @@ xfs_validate_sb_common( sbp->sb_inodelog < XFS_DINODE_MIN_LOG || sbp->sb_inodelog > XFS_DINODE_MAX_LOG || sbp->sb_inodesize != (1 << sbp->sb_inodelog) || - sbp->sb_logsunit > XLOG_MAX_RECORD_BSIZE || sbp->sb_inopblock != howmany(sbp->sb_blocksize,sbp->sb_inodesize) || XFS_FSB_TO_B(mp, sbp->sb_agblocks) < XFS_MIN_AG_BYTES || XFS_FSB_TO_B(mp, sbp->sb_agblocks) > XFS_MAX_AG_BYTES || @@ -430,6 +429,61 @@ xfs_validate_sb_common( return -EFSCORRUPTED; } + /* + * Logs that are too large are not supported at all. Reject them + * outright. Logs that are too small are tolerated on v4 filesystems, + * but we can only check that when mounting the log. Hence we skip + * those checks here. + */ + if (sbp->sb_logblocks > XFS_MAX_LOG_BLOCKS) { + xfs_notice(mp, + "Log size 0x%x blocks too large, maximum size is 0x%llx blocks", + sbp->sb_logblocks, XFS_MAX_LOG_BLOCKS); + return -EFSCORRUPTED; + } + + if (XFS_FSB_TO_B(mp, sbp->sb_logblocks) > XFS_MAX_LOG_BYTES) { + xfs_warn(mp, + "log size 0x%llx bytes too large, maximum size is 0x%llx bytes", + XFS_FSB_TO_B(mp, sbp->sb_logblocks), + XFS_MAX_LOG_BYTES); + return -EFSCORRUPTED; + } + + /* + * Do not allow filesystems with corrupted log sector or stripe units to + * be mounted. We cannot safely size the iclogs or write to the log if + * the log stripe unit is not valid. + */ + if (sbp->sb_versionnum & XFS_SB_VERSION_SECTORBIT) { + if (sbp->sb_logsectsize != (1U << sbp->sb_logsectlog)) { + xfs_notice(mp, + "log sector size in bytes/log2 (0x%x/0x%x) must match", + sbp->sb_logsectsize, 1U << sbp->sb_logsectlog); + return -EFSCORRUPTED; + } + } else if (sbp->sb_logsectsize || sbp->sb_logsectlog) { + xfs_notice(mp, + "log sector size in bytes/log2 (0x%x/0x%x) are not zero", + sbp->sb_logsectsize, sbp->sb_logsectlog); + return -EFSCORRUPTED; + } + + if (sbp->sb_logsunit > 1) { + if (sbp->sb_logsunit % sbp->sb_blocksize) { + xfs_notice(mp, + "log stripe unit 0x%x bytes must be a multiple of block size", + sbp->sb_logsunit); + return -EFSCORRUPTED; + } + if (sbp->sb_logsunit > XLOG_MAX_RECORD_BSIZE) { + xfs_notice(mp, + "log stripe unit 0x%x bytes over maximum size (0x%x bytes)", + sbp->sb_logsunit, XLOG_MAX_RECORD_BSIZE); + return -EFSCORRUPTED; + } + } + /* Validate the realtime geometry; stolen from xfs_repair */ if (sbp->sb_rextsize * sbp->sb_blocksize > XFS_MAX_RTEXTSIZE || sbp->sb_rextsize * sbp->sb_blocksize < XFS_MIN_RTEXTSIZE) { diff --git a/fs/xfs/xfs_extent_busy.c b/fs/xfs/xfs_extent_busy.c index f3d328e4a440..7c2fdc71e42d 100644 --- a/fs/xfs/xfs_extent_busy.c +++ b/fs/xfs/xfs_extent_busy.c @@ -566,20 +566,45 @@ xfs_extent_busy_clear( /* * Flush out all busy extents for this AG. + * + * If the current transaction is holding busy extents, the caller may not want + * to wait for committed busy extents to resolve. If we are being told just to + * try a flush or progress has been made since we last skipped a busy extent, + * return immediately to allow the caller to try again. + * + * If we are freeing extents, we might actually be holding the only free extents + * in the transaction busy list and the log force won't resolve that situation. + * In this case, we must return -EAGAIN to avoid a deadlock by informing the + * caller it needs to commit the busy extents it holds before retrying the + * extent free operation. */ -void +int xfs_extent_busy_flush( - struct xfs_mount *mp, + struct xfs_trans *tp, struct xfs_perag *pag, - unsigned busy_gen) + unsigned busy_gen, + uint32_t alloc_flags) { DEFINE_WAIT (wait); int error; - error = xfs_log_force(mp, XFS_LOG_SYNC); + error = xfs_log_force(tp->t_mountp, XFS_LOG_SYNC); if (error) - return; + return error; + + /* Avoid deadlocks on uncommitted busy extents. */ + if (!list_empty(&tp->t_busy)) { + if (alloc_flags & XFS_ALLOC_FLAG_TRYFLUSH) + return 0; + + if (busy_gen != READ_ONCE(pag->pagb_gen)) + return 0; + + if (alloc_flags & XFS_ALLOC_FLAG_FREEING) + return -EAGAIN; + } + /* Wait for committed busy extents to resolve. */ do { prepare_to_wait(&pag->pagb_wait, &wait, TASK_KILLABLE); if (busy_gen != READ_ONCE(pag->pagb_gen)) @@ -588,6 +613,7 @@ xfs_extent_busy_flush( } while (1); finish_wait(&pag->pagb_wait, &wait); + return 0; } void diff --git a/fs/xfs/xfs_extent_busy.h b/fs/xfs/xfs_extent_busy.h index 4a118131059f..c37bf87e6781 100644 --- a/fs/xfs/xfs_extent_busy.h +++ b/fs/xfs/xfs_extent_busy.h @@ -51,9 +51,9 @@ bool xfs_extent_busy_trim(struct xfs_alloc_arg *args, xfs_agblock_t *bno, xfs_extlen_t *len, unsigned *busy_gen); -void -xfs_extent_busy_flush(struct xfs_mount *mp, struct xfs_perag *pag, - unsigned busy_gen); +int +xfs_extent_busy_flush(struct xfs_trans *tp, struct xfs_perag *pag, + unsigned busy_gen, uint32_t alloc_flags); void xfs_extent_busy_wait_all(struct xfs_mount *mp); diff --git a/fs/xfs/xfs_extfree_item.c b/fs/xfs/xfs_extfree_item.c index f9e36b810663..f1a5ecf099aa 100644 --- a/fs/xfs/xfs_extfree_item.c +++ b/fs/xfs/xfs_extfree_item.c @@ -337,6 +337,34 @@ xfs_trans_get_efd( } /* + * Fill the EFD with all extents from the EFI when we need to roll the + * transaction and continue with a new EFI. + * + * This simply copies all the extents in the EFI to the EFD rather than make + * assumptions about which extents in the EFI have already been processed. We + * currently keep the xefi list in the same order as the EFI extent list, but + * that may not always be the case. Copying everything avoids leaving a landmine + * were we fail to cancel all the extents in an EFI if the xefi list is + * processed in a different order to the extents in the EFI. + */ +static void +xfs_efd_from_efi( + struct xfs_efd_log_item *efdp) +{ + struct xfs_efi_log_item *efip = efdp->efd_efip; + uint i; + + ASSERT(efip->efi_format.efi_nextents > 0); + ASSERT(efdp->efd_next_extent < efip->efi_format.efi_nextents); + + for (i = 0; i < efip->efi_format.efi_nextents; i++) { + efdp->efd_format.efd_extents[i] = + efip->efi_format.efi_extents[i]; + } + efdp->efd_next_extent = efip->efi_format.efi_nextents; +} + +/* * Free an extent and log it to the EFD. Note that the transaction is marked * dirty regardless of whether the extent free succeeds or fails to support the * EFI/EFD lifecycle rules. @@ -365,7 +393,7 @@ xfs_trans_free_extent( agbno, xefi->xefi_blockcount); error = __xfs_free_extent(tp, xefi->xefi_pag, agbno, - xefi->xefi_blockcount, &oinfo, XFS_AG_RESV_NONE, + xefi->xefi_blockcount, &oinfo, xefi->xefi_agresv, xefi->xefi_flags & XFS_EFI_SKIP_DISCARD); /* @@ -378,6 +406,17 @@ xfs_trans_free_extent( tp->t_flags |= XFS_TRANS_DIRTY | XFS_TRANS_HAS_INTENT_DONE; set_bit(XFS_LI_DIRTY, &efdp->efd_item.li_flags); + /* + * If we need a new transaction to make progress, the caller will log a + * new EFI with the current contents. It will also log an EFD to cancel + * the existing EFI, and so we need to copy all the unprocessed extents + * in this EFI to the EFD so this works correctly. + */ + if (error == -EAGAIN) { + xfs_efd_from_efi(efdp); + return error; + } + next_extent = efdp->efd_next_extent; ASSERT(next_extent < efdp->efd_format.efd_nextents); extp = &(efdp->efd_format.efd_extents[next_extent]); @@ -495,6 +534,13 @@ xfs_extent_free_finish_item( error = xfs_trans_free_extent(tp, EFD_ITEM(done), xefi); + /* + * Don't free the XEFI if we need a new transaction to complete + * processing of it. + */ + if (error == -EAGAIN) + return error; + xfs_extent_free_put_group(xefi); kmem_cache_free(xfs_extfree_item_cache, xefi); return error; @@ -620,6 +666,7 @@ xfs_efi_item_recover( struct xfs_trans *tp; int i; int error = 0; + bool requeue_only = false; /* * First check the validity of the extents described by the @@ -644,6 +691,7 @@ xfs_efi_item_recover( for (i = 0; i < efip->efi_format.efi_nextents; i++) { struct xfs_extent_free_item fake = { .xefi_owner = XFS_RMAP_OWN_UNKNOWN, + .xefi_agresv = XFS_AG_RESV_NONE, }; struct xfs_extent *extp; @@ -652,9 +700,28 @@ xfs_efi_item_recover( fake.xefi_startblock = extp->ext_start; fake.xefi_blockcount = extp->ext_len; - xfs_extent_free_get_group(mp, &fake); - error = xfs_trans_free_extent(tp, efdp, &fake); - xfs_extent_free_put_group(&fake); + if (!requeue_only) { + xfs_extent_free_get_group(mp, &fake); + error = xfs_trans_free_extent(tp, efdp, &fake); + xfs_extent_free_put_group(&fake); + } + + /* + * If we can't free the extent without potentially deadlocking, + * requeue the rest of the extents to a new so that they get + * run again later with a new transaction context. + */ + if (error == -EAGAIN || requeue_only) { + error = xfs_free_extent_later(tp, fake.xefi_startblock, + fake.xefi_blockcount, + &XFS_RMAP_OINFO_ANY_OWNER, + fake.xefi_agresv); + if (!error) { + requeue_only = true; + continue; + } + } + if (error == -EFSCORRUPTED) XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, extp, sizeof(*extp)); diff --git a/fs/xfs/xfs_fsmap.c b/fs/xfs/xfs_fsmap.c index 59e7d1a14b67..10403ba9b58f 100644 --- a/fs/xfs/xfs_fsmap.c +++ b/fs/xfs/xfs_fsmap.c @@ -160,9 +160,18 @@ struct xfs_getfsmap_info { struct xfs_buf *agf_bp; /* AGF, for refcount queries */ struct xfs_perag *pag; /* AG info, if applicable */ xfs_daddr_t next_daddr; /* next daddr we expect */ + /* daddr of low fsmap key when we're using the rtbitmap */ + xfs_daddr_t low_daddr; u64 missing_owner; /* owner of holes */ u32 dev; /* device id */ - struct xfs_rmap_irec low; /* low rmap key */ + /* + * Low rmap key for the query. If low.rm_blockcount is nonzero, this + * is the second (or later) call to retrieve the recordset in pieces. + * xfs_getfsmap_rec_before_start will compare all records retrieved + * by the rmapbt query to filter out any records that start before + * the last record. + */ + struct xfs_rmap_irec low; struct xfs_rmap_irec high; /* high rmap key */ bool last; /* last extent? */ }; @@ -237,16 +246,31 @@ xfs_getfsmap_format( xfs_fsmap_from_internal(rec, xfm); } +static inline bool +xfs_getfsmap_rec_before_start( + struct xfs_getfsmap_info *info, + const struct xfs_rmap_irec *rec, + xfs_daddr_t rec_daddr) +{ + if (info->low_daddr != -1ULL) + return rec_daddr < info->low_daddr; + if (info->low.rm_blockcount) + return xfs_rmap_compare(rec, &info->low) < 0; + return false; +} + /* * Format a reverse mapping for getfsmap, having translated rm_startblock - * into the appropriate daddr units. + * into the appropriate daddr units. Pass in a nonzero @len_daddr if the + * length could be larger than rm_blockcount in struct xfs_rmap_irec. */ STATIC int xfs_getfsmap_helper( struct xfs_trans *tp, struct xfs_getfsmap_info *info, const struct xfs_rmap_irec *rec, - xfs_daddr_t rec_daddr) + xfs_daddr_t rec_daddr, + xfs_daddr_t len_daddr) { struct xfs_fsmap fmr; struct xfs_mount *mp = tp->t_mountp; @@ -256,12 +280,15 @@ xfs_getfsmap_helper( if (fatal_signal_pending(current)) return -EINTR; + if (len_daddr == 0) + len_daddr = XFS_FSB_TO_BB(mp, rec->rm_blockcount); + /* * Filter out records that start before our startpoint, if the * caller requested that. */ - if (xfs_rmap_compare(rec, &info->low) < 0) { - rec_daddr += XFS_FSB_TO_BB(mp, rec->rm_blockcount); + if (xfs_getfsmap_rec_before_start(info, rec, rec_daddr)) { + rec_daddr += len_daddr; if (info->next_daddr < rec_daddr) info->next_daddr = rec_daddr; return 0; @@ -280,7 +307,7 @@ xfs_getfsmap_helper( info->head->fmh_entries++; - rec_daddr += XFS_FSB_TO_BB(mp, rec->rm_blockcount); + rec_daddr += len_daddr; if (info->next_daddr < rec_daddr) info->next_daddr = rec_daddr; return 0; @@ -320,7 +347,7 @@ xfs_getfsmap_helper( if (error) return error; fmr.fmr_offset = XFS_FSB_TO_BB(mp, rec->rm_offset); - fmr.fmr_length = XFS_FSB_TO_BB(mp, rec->rm_blockcount); + fmr.fmr_length = len_daddr; if (rec->rm_flags & XFS_RMAP_UNWRITTEN) fmr.fmr_flags |= FMR_OF_PREALLOC; if (rec->rm_flags & XFS_RMAP_ATTR_FORK) @@ -337,7 +364,7 @@ xfs_getfsmap_helper( xfs_getfsmap_format(mp, &fmr, info); out: - rec_daddr += XFS_FSB_TO_BB(mp, rec->rm_blockcount); + rec_daddr += len_daddr; if (info->next_daddr < rec_daddr) info->next_daddr = rec_daddr; return 0; @@ -358,7 +385,7 @@ xfs_getfsmap_datadev_helper( fsb = XFS_AGB_TO_FSB(mp, cur->bc_ag.pag->pag_agno, rec->rm_startblock); rec_daddr = XFS_FSB_TO_DADDR(mp, fsb); - return xfs_getfsmap_helper(cur->bc_tp, info, rec, rec_daddr); + return xfs_getfsmap_helper(cur->bc_tp, info, rec, rec_daddr, 0); } /* Transform a bnobt irec into a fsmap */ @@ -382,7 +409,7 @@ xfs_getfsmap_datadev_bnobt_helper( irec.rm_offset = 0; irec.rm_flags = 0; - return xfs_getfsmap_helper(cur->bc_tp, info, &irec, rec_daddr); + return xfs_getfsmap_helper(cur->bc_tp, info, &irec, rec_daddr, 0); } /* Set rmap flags based on the getfsmap flags */ @@ -409,31 +436,25 @@ xfs_getfsmap_logdev( { struct xfs_mount *mp = tp->t_mountp; struct xfs_rmap_irec rmap; - int error; + xfs_daddr_t rec_daddr, len_daddr; + xfs_fsblock_t start_fsb, end_fsb; + uint64_t eofs; - /* Set up search keys */ - info->low.rm_startblock = XFS_BB_TO_FSBT(mp, keys[0].fmr_physical); - info->low.rm_offset = XFS_BB_TO_FSBT(mp, keys[0].fmr_offset); - error = xfs_fsmap_owner_to_rmap(&info->low, keys); - if (error) - return error; - info->low.rm_blockcount = 0; - xfs_getfsmap_set_irec_flags(&info->low, &keys[0]); + eofs = XFS_FSB_TO_BB(mp, mp->m_sb.sb_logblocks); + if (keys[0].fmr_physical >= eofs) + return 0; + start_fsb = XFS_BB_TO_FSBT(mp, + keys[0].fmr_physical + keys[0].fmr_length); + end_fsb = XFS_BB_TO_FSB(mp, min(eofs - 1, keys[1].fmr_physical)); - error = xfs_fsmap_owner_to_rmap(&info->high, keys + 1); - if (error) - return error; - info->high.rm_startblock = -1U; - info->high.rm_owner = ULLONG_MAX; - info->high.rm_offset = ULLONG_MAX; - info->high.rm_blockcount = 0; - info->high.rm_flags = XFS_RMAP_KEY_FLAGS | XFS_RMAP_REC_FLAGS; - info->missing_owner = XFS_FMR_OWN_FREE; + /* Adjust the low key if we are continuing from where we left off. */ + if (keys[0].fmr_length > 0) + info->low_daddr = XFS_FSB_TO_BB(mp, start_fsb); - trace_xfs_fsmap_low_key(mp, info->dev, NULLAGNUMBER, &info->low); - trace_xfs_fsmap_high_key(mp, info->dev, NULLAGNUMBER, &info->high); + trace_xfs_fsmap_low_key_linear(mp, info->dev, start_fsb); + trace_xfs_fsmap_high_key_linear(mp, info->dev, end_fsb); - if (keys[0].fmr_physical > 0) + if (start_fsb > 0) return 0; /* Fabricate an rmap entry for the external log device. */ @@ -443,7 +464,9 @@ xfs_getfsmap_logdev( rmap.rm_offset = 0; rmap.rm_flags = 0; - return xfs_getfsmap_helper(tp, info, &rmap, 0); + rec_daddr = XFS_FSB_TO_BB(mp, rmap.rm_startblock); + len_daddr = XFS_FSB_TO_BB(mp, rmap.rm_blockcount); + return xfs_getfsmap_helper(tp, info, &rmap, rec_daddr, len_daddr); } #ifdef CONFIG_XFS_RT @@ -457,72 +480,58 @@ xfs_getfsmap_rtdev_rtbitmap_helper( { struct xfs_getfsmap_info *info = priv; struct xfs_rmap_irec irec; - xfs_daddr_t rec_daddr; + xfs_rtblock_t rtbno; + xfs_daddr_t rec_daddr, len_daddr; + + rtbno = rec->ar_startext * mp->m_sb.sb_rextsize; + rec_daddr = XFS_FSB_TO_BB(mp, rtbno); + irec.rm_startblock = rtbno; + + rtbno = rec->ar_extcount * mp->m_sb.sb_rextsize; + len_daddr = XFS_FSB_TO_BB(mp, rtbno); + irec.rm_blockcount = rtbno; - irec.rm_startblock = rec->ar_startext * mp->m_sb.sb_rextsize; - rec_daddr = XFS_FSB_TO_BB(mp, irec.rm_startblock); - irec.rm_blockcount = rec->ar_extcount * mp->m_sb.sb_rextsize; irec.rm_owner = XFS_RMAP_OWN_NULL; /* "free" */ irec.rm_offset = 0; irec.rm_flags = 0; - return xfs_getfsmap_helper(tp, info, &irec, rec_daddr); + return xfs_getfsmap_helper(tp, info, &irec, rec_daddr, len_daddr); } -/* Execute a getfsmap query against the realtime device. */ +/* Execute a getfsmap query against the realtime device rtbitmap. */ STATIC int -__xfs_getfsmap_rtdev( +xfs_getfsmap_rtdev_rtbitmap( struct xfs_trans *tp, const struct xfs_fsmap *keys, - int (*query_fn)(struct xfs_trans *, - struct xfs_getfsmap_info *), struct xfs_getfsmap_info *info) { + + struct xfs_rtalloc_rec alow = { 0 }; + struct xfs_rtalloc_rec ahigh = { 0 }; struct xfs_mount *mp = tp->t_mountp; - xfs_fsblock_t start_fsb; - xfs_fsblock_t end_fsb; + xfs_rtblock_t start_rtb; + xfs_rtblock_t end_rtb; uint64_t eofs; - int error = 0; + int error; - eofs = XFS_FSB_TO_BB(mp, mp->m_sb.sb_rblocks); + eofs = XFS_FSB_TO_BB(mp, mp->m_sb.sb_rextents * mp->m_sb.sb_rextsize); if (keys[0].fmr_physical >= eofs) return 0; - start_fsb = XFS_BB_TO_FSBT(mp, keys[0].fmr_physical); - end_fsb = XFS_BB_TO_FSB(mp, min(eofs - 1, keys[1].fmr_physical)); + start_rtb = XFS_BB_TO_FSBT(mp, + keys[0].fmr_physical + keys[0].fmr_length); + end_rtb = XFS_BB_TO_FSB(mp, min(eofs - 1, keys[1].fmr_physical)); - /* Set up search keys */ - info->low.rm_startblock = start_fsb; - error = xfs_fsmap_owner_to_rmap(&info->low, &keys[0]); - if (error) - return error; - info->low.rm_offset = XFS_BB_TO_FSBT(mp, keys[0].fmr_offset); - info->low.rm_blockcount = 0; - xfs_getfsmap_set_irec_flags(&info->low, &keys[0]); - - info->high.rm_startblock = end_fsb; - error = xfs_fsmap_owner_to_rmap(&info->high, &keys[1]); - if (error) - return error; - info->high.rm_offset = XFS_BB_TO_FSBT(mp, keys[1].fmr_offset); - info->high.rm_blockcount = 0; - xfs_getfsmap_set_irec_flags(&info->high, &keys[1]); - - trace_xfs_fsmap_low_key(mp, info->dev, NULLAGNUMBER, &info->low); - trace_xfs_fsmap_high_key(mp, info->dev, NULLAGNUMBER, &info->high); + info->missing_owner = XFS_FMR_OWN_UNKNOWN; - return query_fn(tp, info); -} + /* Adjust the low key if we are continuing from where we left off. */ + if (keys[0].fmr_length > 0) { + info->low_daddr = XFS_FSB_TO_BB(mp, start_rtb); + if (info->low_daddr >= eofs) + return 0; + } -/* Actually query the realtime bitmap. */ -STATIC int -xfs_getfsmap_rtdev_rtbitmap_query( - struct xfs_trans *tp, - struct xfs_getfsmap_info *info) -{ - struct xfs_rtalloc_rec alow = { 0 }; - struct xfs_rtalloc_rec ahigh = { 0 }; - struct xfs_mount *mp = tp->t_mountp; - int error; + trace_xfs_fsmap_low_key_linear(mp, info->dev, start_rtb); + trace_xfs_fsmap_high_key_linear(mp, info->dev, end_rtb); xfs_ilock(mp->m_rbmip, XFS_ILOCK_SHARED | XFS_ILOCK_RTBITMAP); @@ -530,8 +539,8 @@ xfs_getfsmap_rtdev_rtbitmap_query( * Set up query parameters to return free rtextents covering the range * we want. */ - alow.ar_startext = info->low.rm_startblock; - ahigh.ar_startext = info->high.rm_startblock; + alow.ar_startext = start_rtb; + ahigh.ar_startext = end_rtb; do_div(alow.ar_startext, mp->m_sb.sb_rextsize); if (do_div(ahigh.ar_startext, mp->m_sb.sb_rextsize)) ahigh.ar_startext++; @@ -554,18 +563,6 @@ err: xfs_iunlock(mp->m_rbmip, XFS_ILOCK_SHARED | XFS_ILOCK_RTBITMAP); return error; } - -/* Execute a getfsmap query against the realtime device rtbitmap. */ -STATIC int -xfs_getfsmap_rtdev_rtbitmap( - struct xfs_trans *tp, - const struct xfs_fsmap *keys, - struct xfs_getfsmap_info *info) -{ - info->missing_owner = XFS_FMR_OWN_UNKNOWN; - return __xfs_getfsmap_rtdev(tp, keys, xfs_getfsmap_rtdev_rtbitmap_query, - info); -} #endif /* CONFIG_XFS_RT */ /* Execute a getfsmap query against the regular data device. */ @@ -606,9 +603,27 @@ __xfs_getfsmap_datadev( error = xfs_fsmap_owner_to_rmap(&info->low, &keys[0]); if (error) return error; - info->low.rm_blockcount = 0; + info->low.rm_blockcount = XFS_BB_TO_FSBT(mp, keys[0].fmr_length); xfs_getfsmap_set_irec_flags(&info->low, &keys[0]); + /* Adjust the low key if we are continuing from where we left off. */ + if (info->low.rm_blockcount == 0) { + /* empty */ + } else if (XFS_RMAP_NON_INODE_OWNER(info->low.rm_owner) || + (info->low.rm_flags & (XFS_RMAP_ATTR_FORK | + XFS_RMAP_BMBT_BLOCK | + XFS_RMAP_UNWRITTEN))) { + info->low.rm_startblock += info->low.rm_blockcount; + info->low.rm_owner = 0; + info->low.rm_offset = 0; + + start_fsb += info->low.rm_blockcount; + if (XFS_FSB_TO_DADDR(mp, start_fsb) >= eofs) + return 0; + } else { + info->low.rm_offset += info->low.rm_blockcount; + } + info->high.rm_startblock = -1U; info->high.rm_owner = ULLONG_MAX; info->high.rm_offset = ULLONG_MAX; @@ -659,12 +674,8 @@ __xfs_getfsmap_datadev( * Set the AG low key to the start of the AG prior to * moving on to the next AG. */ - if (pag->pag_agno == start_ag) { - info->low.rm_startblock = 0; - info->low.rm_owner = 0; - info->low.rm_offset = 0; - info->low.rm_flags = 0; - } + if (pag->pag_agno == start_ag) + memset(&info->low, 0, sizeof(info->low)); /* * If this is the last AG, report any gap at the end of it @@ -791,6 +802,19 @@ xfs_getfsmap_check_keys( struct xfs_fsmap *low_key, struct xfs_fsmap *high_key) { + if (low_key->fmr_flags & (FMR_OF_SPECIAL_OWNER | FMR_OF_EXTENT_MAP)) { + if (low_key->fmr_offset) + return false; + } + if (high_key->fmr_flags != -1U && + (high_key->fmr_flags & (FMR_OF_SPECIAL_OWNER | + FMR_OF_EXTENT_MAP))) { + if (high_key->fmr_offset && high_key->fmr_offset != -1ULL) + return false; + } + if (high_key->fmr_length && high_key->fmr_length != -1ULL) + return false; + if (low_key->fmr_device > high_key->fmr_device) return false; if (low_key->fmr_device < high_key->fmr_device) @@ -834,15 +858,15 @@ xfs_getfsmap_check_keys( * ---------------- * There are multiple levels of keys and counters at work here: * xfs_fsmap_head.fmh_keys -- low and high fsmap keys passed in; - * these reflect fs-wide sector addrs. + * these reflect fs-wide sector addrs. * dkeys -- fmh_keys used to query each device; - * these are fmh_keys but w/ the low key - * bumped up by fmr_length. + * these are fmh_keys but w/ the low key + * bumped up by fmr_length. * xfs_getfsmap_info.next_daddr -- next disk addr we expect to see; this * is how we detect gaps in the fsmap records and report them. * xfs_getfsmap_info.low/high -- per-AG low/high keys computed from - * dkeys; used to query the metadata. + * dkeys; used to query the metadata. */ int xfs_getfsmap( @@ -863,6 +887,8 @@ xfs_getfsmap( if (!xfs_getfsmap_is_valid_device(mp, &head->fmh_keys[0]) || !xfs_getfsmap_is_valid_device(mp, &head->fmh_keys[1])) return -EINVAL; + if (!xfs_getfsmap_check_keys(&head->fmh_keys[0], &head->fmh_keys[1])) + return -EINVAL; use_rmap = xfs_has_rmapbt(mp) && has_capability_noaudit(current, CAP_SYS_ADMIN); @@ -901,26 +927,15 @@ xfs_getfsmap( * blocks could be mapped to several other files/offsets. * According to rmapbt record ordering, the minimal next * possible record for the block range is the next starting - * offset in the same inode. Therefore, bump the file offset to - * continue the search appropriately. For all other low key - * mapping types (attr blocks, metadata), bump the physical - * offset as there can be no other mapping for the same physical - * block range. + * offset in the same inode. Therefore, each fsmap backend bumps + * the file offset to continue the search appropriately. For + * all other low key mapping types (attr blocks, metadata), each + * fsmap backend bumps the physical offset as there can be no + * other mapping for the same physical block range. */ dkeys[0] = head->fmh_keys[0]; - if (dkeys[0].fmr_flags & (FMR_OF_SPECIAL_OWNER | FMR_OF_EXTENT_MAP)) { - dkeys[0].fmr_physical += dkeys[0].fmr_length; - dkeys[0].fmr_owner = 0; - if (dkeys[0].fmr_offset) - return -EINVAL; - } else - dkeys[0].fmr_offset += dkeys[0].fmr_length; - dkeys[0].fmr_length = 0; memset(&dkeys[1], 0xFF, sizeof(struct xfs_fsmap)); - if (!xfs_getfsmap_check_keys(dkeys, &head->fmh_keys[1])) - return -EINVAL; - info.next_daddr = head->fmh_keys[0].fmr_physical + head->fmh_keys[0].fmr_length; info.fsmap_recs = fsmap_recs; @@ -960,6 +975,8 @@ xfs_getfsmap( info.dev = handlers[i].dev; info.last = false; info.pag = NULL; + info.low_daddr = -1ULL; + info.low.rm_blockcount = 0; error = handlers[i].fn(tp, dkeys, &info); if (error) break; diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c index 122b83488a05..7cb75cb6b8e9 100644 --- a/fs/xfs/xfs_fsops.c +++ b/fs/xfs/xfs_fsops.c @@ -93,7 +93,7 @@ xfs_growfs_data_private( xfs_agnumber_t nagimax = 0; xfs_rfsblock_t nb, nb_div, nb_mod; int64_t delta; - bool lastag_extended; + bool lastag_extended = false; xfs_agnumber_t oagcount; struct xfs_trans *tp; struct aghdr_init_data id = {}; diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c index fc61cc024023..79004d193e54 100644 --- a/fs/xfs/xfs_log.c +++ b/fs/xfs/xfs_log.c @@ -639,7 +639,6 @@ xfs_log_mount( int num_bblks) { struct xlog *log; - bool fatal = xfs_has_crc(mp); int error = 0; int min_logfsbs; @@ -663,53 +662,37 @@ xfs_log_mount( mp->m_log = log; /* - * Validate the given log space and drop a critical message via syslog - * if the log size is too small that would lead to some unexpected - * situations in transaction log space reservation stage. + * Now that we have set up the log and it's internal geometry + * parameters, we can validate the given log space and drop a critical + * message via syslog if the log size is too small. A log that is too + * small can lead to unexpected situations in transaction log space + * reservation stage. The superblock verifier has already validated all + * the other log geometry constraints, so we don't have to check those + * here. * - * Note: we can't just reject the mount if the validation fails. This - * would mean that people would have to downgrade their kernel just to - * remedy the situation as there is no way to grow the log (short of - * black magic surgery with xfs_db). + * Note: For v4 filesystems, we can't just reject the mount if the + * validation fails. This would mean that people would have to + * downgrade their kernel just to remedy the situation as there is no + * way to grow the log (short of black magic surgery with xfs_db). * - * We can, however, reject mounts for CRC format filesystems, as the + * We can, however, reject mounts for V5 format filesystems, as the * mkfs binary being used to make the filesystem should never create a * filesystem with a log that is too small. */ min_logfsbs = xfs_log_calc_minimum_size(mp); - if (mp->m_sb.sb_logblocks < min_logfsbs) { xfs_warn(mp, "Log size %d blocks too small, minimum size is %d blocks", mp->m_sb.sb_logblocks, min_logfsbs); - error = -EINVAL; - } else if (mp->m_sb.sb_logblocks > XFS_MAX_LOG_BLOCKS) { - xfs_warn(mp, - "Log size %d blocks too large, maximum size is %lld blocks", - mp->m_sb.sb_logblocks, XFS_MAX_LOG_BLOCKS); - error = -EINVAL; - } else if (XFS_FSB_TO_B(mp, mp->m_sb.sb_logblocks) > XFS_MAX_LOG_BYTES) { - xfs_warn(mp, - "log size %lld bytes too large, maximum size is %lld bytes", - XFS_FSB_TO_B(mp, mp->m_sb.sb_logblocks), - XFS_MAX_LOG_BYTES); - error = -EINVAL; - } else if (mp->m_sb.sb_logsunit > 1 && - mp->m_sb.sb_logsunit % mp->m_sb.sb_blocksize) { - xfs_warn(mp, - "log stripe unit %u bytes must be a multiple of block size", - mp->m_sb.sb_logsunit); - error = -EINVAL; - fatal = true; - } - if (error) { + /* * Log check errors are always fatal on v5; or whenever bad * metadata leads to a crash. */ - if (fatal) { + if (xfs_has_crc(mp)) { xfs_crit(mp, "AAIEEE! Log failed size checks. Abort!"); ASSERT(0); + error = -EINVAL; goto out_free_log; } xfs_crit(mp, "Log size out of supported range."); diff --git a/fs/xfs/xfs_notify_failure.c b/fs/xfs/xfs_notify_failure.c index c4078d0ec108..4a9bbd3fe120 100644 --- a/fs/xfs/xfs_notify_failure.c +++ b/fs/xfs/xfs_notify_failure.c @@ -114,7 +114,8 @@ xfs_dax_notify_ddev_failure( int error = 0; xfs_fsblock_t fsbno = XFS_DADDR_TO_FSB(mp, daddr); xfs_agnumber_t agno = XFS_FSB_TO_AGNO(mp, fsbno); - xfs_fsblock_t end_fsbno = XFS_DADDR_TO_FSB(mp, daddr + bblen); + xfs_fsblock_t end_fsbno = XFS_DADDR_TO_FSB(mp, + daddr + bblen - 1); xfs_agnumber_t end_agno = XFS_FSB_TO_AGNO(mp, end_fsbno); error = xfs_trans_alloc_empty(mp, &tp); @@ -210,7 +211,7 @@ xfs_dax_notify_failure( ddev_end = ddev_start + bdev_nr_bytes(mp->m_ddev_targp->bt_bdev) - 1; /* Ignore the range out of filesystem area */ - if (offset + len < ddev_start) + if (offset + len - 1 < ddev_start) return -ENXIO; if (offset > ddev_end) return -ENXIO; @@ -222,8 +223,8 @@ xfs_dax_notify_failure( len -= ddev_start - offset; offset = 0; } - if (offset + len > ddev_end) - len -= ddev_end - offset; + if (offset + len - 1 > ddev_end) + len = ddev_end - offset + 1; return xfs_dax_notify_ddev_failure(mp, BTOBB(offset), BTOBB(len), mf_flags); diff --git a/fs/xfs/xfs_ondisk.h b/fs/xfs/xfs_ondisk.h index 9737b5a9f405..c4cc99b70dd3 100644 --- a/fs/xfs/xfs_ondisk.h +++ b/fs/xfs/xfs_ondisk.h @@ -56,7 +56,7 @@ xfs_check_ondisk_structs(void) /* dir/attr trees */ XFS_CHECK_STRUCT_SIZE(struct xfs_attr3_leaf_hdr, 80); - XFS_CHECK_STRUCT_SIZE(struct xfs_attr3_leafblock, 88); + XFS_CHECK_STRUCT_SIZE(struct xfs_attr3_leafblock, 80); XFS_CHECK_STRUCT_SIZE(struct xfs_attr3_rmt_hdr, 56); XFS_CHECK_STRUCT_SIZE(struct xfs_da3_blkinfo, 56); XFS_CHECK_STRUCT_SIZE(struct xfs_da3_intnode, 64); @@ -88,7 +88,8 @@ xfs_check_ondisk_structs(void) XFS_CHECK_OFFSET(xfs_attr_leaf_name_remote_t, valuelen, 4); XFS_CHECK_OFFSET(xfs_attr_leaf_name_remote_t, namelen, 8); XFS_CHECK_OFFSET(xfs_attr_leaf_name_remote_t, name, 9); - XFS_CHECK_STRUCT_SIZE(xfs_attr_leafblock_t, 40); + XFS_CHECK_STRUCT_SIZE(xfs_attr_leafblock_t, 32); + XFS_CHECK_STRUCT_SIZE(struct xfs_attr_shortform, 4); XFS_CHECK_OFFSET(struct xfs_attr_shortform, hdr.totsize, 0); XFS_CHECK_OFFSET(struct xfs_attr_shortform, hdr.count, 2); XFS_CHECK_OFFSET(struct xfs_attr_shortform, list[0].namelen, 4); diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c index abcc559f3c64..eb9102453aff 100644 --- a/fs/xfs/xfs_reflink.c +++ b/fs/xfs/xfs_reflink.c @@ -617,7 +617,8 @@ xfs_reflink_cancel_cow_blocks( del.br_blockcount); error = xfs_free_extent_later(*tpp, del.br_startblock, - del.br_blockcount, NULL); + del.br_blockcount, NULL, + XFS_AG_RESV_NONE); if (error) break; diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h index 4db669203149..f3cc204bb4bf 100644 --- a/fs/xfs/xfs_trace.h +++ b/fs/xfs/xfs_trace.h @@ -3623,6 +3623,31 @@ DEFINE_FSMAP_EVENT(xfs_fsmap_low_key); DEFINE_FSMAP_EVENT(xfs_fsmap_high_key); DEFINE_FSMAP_EVENT(xfs_fsmap_mapping); +DECLARE_EVENT_CLASS(xfs_fsmap_linear_class, + TP_PROTO(struct xfs_mount *mp, u32 keydev, uint64_t bno), + TP_ARGS(mp, keydev, bno), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(dev_t, keydev) + __field(xfs_fsblock_t, bno) + ), + TP_fast_assign( + __entry->dev = mp->m_super->s_dev; + __entry->keydev = new_decode_dev(keydev); + __entry->bno = bno; + ), + TP_printk("dev %d:%d keydev %d:%d bno 0x%llx", + MAJOR(__entry->dev), MINOR(__entry->dev), + MAJOR(__entry->keydev), MINOR(__entry->keydev), + __entry->bno) +) +#define DEFINE_FSMAP_LINEAR_EVENT(name) \ +DEFINE_EVENT(xfs_fsmap_linear_class, name, \ + TP_PROTO(struct xfs_mount *mp, u32 keydev, uint64_t bno), \ + TP_ARGS(mp, keydev, bno)) +DEFINE_FSMAP_LINEAR_EVENT(xfs_fsmap_low_key_linear); +DEFINE_FSMAP_LINEAR_EVENT(xfs_fsmap_high_key_linear); + DECLARE_EVENT_CLASS(xfs_getfsmap_class, TP_PROTO(struct xfs_mount *mp, struct xfs_fsmap *fsmap), TP_ARGS(mp, fsmap), diff --git a/fs/xfs/xfs_trans_ail.c b/fs/xfs/xfs_trans_ail.c index 7d4109af193e..1098452e7f95 100644 --- a/fs/xfs/xfs_trans_ail.c +++ b/fs/xfs/xfs_trans_ail.c @@ -823,7 +823,7 @@ xfs_trans_ail_update_bulk( trace_xfs_ail_insert(lip, 0, lsn); } lip->li_lsn = lsn; - list_add(&lip->li_ail, &tmp); + list_add_tail(&lip->li_ail, &tmp); } if (!list_empty(&tmp)) |