Re: [6.2][regression] after commit 947a629988f191807d2d22ba63ae18259bb645c5 btrfs volume periodical forced switch to readonly after a lot of disk writes

From: Qu Wenruo
Date: Tue Dec 27 2022 - 06:04:12 EST




On 2022/12/27 18:19, Mikhail Gavrilov wrote:
On Tue, Dec 27, 2022 at 10:13 AM Qu Wenruo <quwenruo.btrfs@xxxxxxx> wrote:
The result doesn't make sense...

A lot of read_block_for_search() and btrfs_read_node_slot() are
triggering the warning.

But inside both functions, we have just set the numbers before the call:

In read_block_for_search() we got:

check.has_first_key = true;
check.level = parent_level - 1;
check.transid = gen;
check.owner_root = root->root_key.objectid;

Thus at least check.has_first_key is always true, and the WARN_ON()
should never get triggered.
The same applies to btrfs_read_node_slot().

It looks like something involved in memory barrier.

Anyway, the latest debug patch has extra mb to be sure.

And despite the possible fix, could you provide extra info of your
build? Include:
Kernel log is attached. All answers are below.

- Hardware (mostly CPU and RAM spec)
This is laptop ASUS ROG Strix G15 Advantage Edition G513QY-HQ007
with CPU AMD Ryzen 9 5900HX and GPU Radeon RX 6800M

I have a similar laptop (G14), only GPU is different (RTX3060), and I failed to reproduce this so far...

I upgraded RAM to Crucial CT32G4SFD832A DDR4 - 32GB x 2 (64GB) and SSD
to Seagate FireCuda 530 ZP4000GM3A013 4TB x 2 (8TB)
https://linux-hardware.org/?probe=0e063e5fd5

- Toolchain used to compile the kernel (include compiler and its version)
$ rpm -q binutils
binutils-2.39-6.fc38.x86_64

$ ld --version
GNU ld version 2.39-6.fc38

$ rpm -q gcc
gcc-12.2.1-4.fc38.x86_64

My gcc is only a small version behind (12.2.0).

Thus none of the hardware seems suspicious at all...

Anyway I have attached my last struggle for the weird problem.
For now, I have no idea why this can even happen...

Thanks,
Qu


$ gcc --version
gcc (GCC) 12.2.1 20221121 (Red Hat 12.2.1-4)

$ rpm -q make
make-4.3-11.fc37.x86_64

$ make --version
GNU Make 4.3

- Kernel config
Attached with debug logs.
From 586430c9a5ca9f842626baccdb5fcd1be6294bbf Mon Sep 17 00:00:00 2001
Message-Id: <586430c9a5ca9f842626baccdb5fcd1be6294bbf.1672138657.git.wqu@xxxxxxxx>
From: Qu Wenruo <wqu@xxxxxxxx>
Date: Mon, 26 Dec 2022 16:44:08 +0800
Subject: [PATCH v5] btrfs: add extra debug for level mismatch

Currently I assume there is some race or uninitialized value for
check::level.

The extra output are for two locations:

- validate_extent_buffer()
Output the error message for read error and the members of check.

- read_extent_buffer_pages()
This will dump the stack for us to catch the offender.

Signed-off-by: Qu Wenruo <wqu@xxxxxxxx>
---
Changelog:
v2:
- Extra submission time output
This would greately enlarge the dmesg size

- Extra warning when submitting a metadata bio
If we have an uninitialized check structure, do a warning and stack
dump to show the offending call trace.

v3:
- Fix a compiling error

v4:
- Add extra mb for test

v5:
- Initialize check structure early
- Move mb to the call site we initialize the check structure
- Use sizeof(struct btrfs_tree_parent_check) instead of sizeof(*size)
- Add extra debug output for sizeof(*check) just in case.

Overall I have no idea how this can go wrong...
---
fs/btrfs/ctree.c | 31 ++++++++++++++++++++-----------
fs/btrfs/disk-io.c | 19 +++++++++++++++++--
fs/btrfs/extent_io.c | 20 +++++++++++++++++---
3 files changed, 54 insertions(+), 16 deletions(-)

diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index 4754c9101a4c..d858219c8886 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -958,7 +958,12 @@ struct extent_buffer *btrfs_read_node_slot(struct extent_buffer *parent,
int slot)
{
int level = btrfs_header_level(parent);
- struct btrfs_tree_parent_check check = { 0 };
+ struct btrfs_tree_parent_check empty = { 0 };
+ struct btrfs_tree_parent_check check = {
+ .level = level - 1,
+ .transid = btrfs_node_ptr_generation(parent, slot),
+ .owner_root = btrfs_header_owner(parent),
+ };
struct extent_buffer *eb;

if (slot < 0 || slot >= btrfs_header_nritems(parent))
@@ -966,12 +971,12 @@ struct extent_buffer *btrfs_read_node_slot(struct extent_buffer *parent,

BUG_ON(level == 0);

- check.level = level - 1;
- check.transid = btrfs_node_ptr_generation(parent, slot);
- check.owner_root = btrfs_header_owner(parent);
check.has_first_key = true;
btrfs_node_key_to_cpu(parent, &check.first_key, slot);
+ smp_mb();

+ WARN_ON(!memcmp(&check, &empty,
+ sizeof(struct btrfs_tree_parent_check)));
eb = read_tree_block(parent->fs_info, btrfs_node_blockptr(parent, slot),
&check);
if (IS_ERR(eb))
@@ -1531,24 +1536,26 @@ read_block_for_search(struct btrfs_root *root, struct btrfs_path *p,
struct extent_buffer **eb_ret, int level, int slot,
const struct btrfs_key *key)
{
+ u64 gen = btrfs_node_ptr_generation(*eb_ret, slot);
+ u8 parent_level = btrfs_header_level(*eb_ret);
struct btrfs_fs_info *fs_info = root->fs_info;
- struct btrfs_tree_parent_check check = { 0 };
+ struct btrfs_tree_parent_check empty = { 0 };
+ struct btrfs_tree_parent_check check = {
+ .transid = gen,
+ .level = parent_level - 1,
+ .owner_root = root->root_key.objectid,
+ };
u64 blocknr;
- u64 gen;
struct extent_buffer *tmp;
int ret;
- int parent_level;
bool unlock_up;

unlock_up = ((level + 1 < BTRFS_MAX_LEVEL) && p->locks[level + 1]);
blocknr = btrfs_node_blockptr(*eb_ret, slot);
- gen = btrfs_node_ptr_generation(*eb_ret, slot);
- parent_level = btrfs_header_level(*eb_ret);
btrfs_node_key_to_cpu(*eb_ret, &check.first_key, slot);
check.has_first_key = true;
- check.level = parent_level - 1;
- check.transid = gen;
check.owner_root = root->root_key.objectid;
+ smp_mb();

/*
* If we need to read an extent buffer from disk and we are holding locks
@@ -1587,6 +1594,8 @@ read_block_for_search(struct btrfs_root *root, struct btrfs_path *p,
btrfs_unlock_up_safe(p, level + 1);

/* now we're allowed to do a blocking uptodate check */
+ WARN_ON(!memcmp(&check, &empty,
+ sizeof(struct btrfs_tree_parent_check)));
ret = btrfs_read_extent_buffer(tmp, &check);
if (ret) {
free_extent_buffer(tmp);
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index f8b5955f003f..49b077acf359 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -530,6 +530,10 @@ static int validate_extent_buffer(struct extent_buffer *eb,
}

if (found_level != check->level) {
+ btrfs_err(eb->fs_info,
+"level verify failed on logical %llu mirror %u wanted %u found %u",
+ eb->start, eb->read_mirror, check->level,
+ found_level);
ret = -EIO;
goto out;
}
@@ -581,13 +585,20 @@ static int validate_extent_buffer(struct extent_buffer *eb,
if (found_level > 0 && btrfs_check_node(eb))
ret = -EIO;

+out:
if (!ret)
set_extent_buffer_uptodate(eb);
- else
+ else {
btrfs_err(fs_info,
"read time tree block corruption detected on logical %llu mirror %u",
eb->start, eb->read_mirror);
-out:
+ btrfs_err(eb->fs_info,
+"check owner_root=%llu transid=%llu first_key=(%llu %u %llu) has_first_key=%d level=%u",
+ check->owner_root,
+ check->transid, check->first_key.objectid,
+ check->first_key.type, check->first_key.offset,
+ check->has_first_key, check->level);
+ }
return ret;
}

@@ -652,6 +663,7 @@ int btrfs_validate_metadata_buffer(struct btrfs_bio *bbio,
int reads_done;

ASSERT(page->private);
+ WARN_ON(!bbio->is_metadata);

if (btrfs_sb(page->mapping->host->i_sb)->nodesize < PAGE_SIZE)
return validate_subpage_buffer(page, start, end, mirror,
@@ -833,12 +845,15 @@ void btrfs_submit_metadata_bio(struct btrfs_inode *inode, struct bio *bio, int m
{
struct btrfs_fs_info *fs_info = inode->root->fs_info;
struct btrfs_bio *bbio = btrfs_bio(bio);
+ struct btrfs_tree_parent_check check = {0};
blk_status_t ret;

bio->bi_opf |= REQ_META;
bbio->is_metadata = 1;

if (btrfs_op(bio) != BTRFS_MAP_WRITE) {
+ WARN_ON(!memcmp(&check, &bbio->parent_check,
+ sizeof(struct btrfs_tree_parent_check)));
btrfs_submit_bio(fs_info, bio, mirror_num);
return;
}
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 83dd3aa59663..e1ee1940132c 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -4878,7 +4878,8 @@ static int read_extent_buffer_subpage(struct extent_buffer *eb, int wait,
*/
atomic_dec(&eb->io_pages);
}
- memcpy(&btrfs_bio(bio_ctrl.bio)->parent_check, check, sizeof(*check));
+ memcpy(&btrfs_bio(bio_ctrl.bio)->parent_check, check,
+ sizeof(struct btrfs_tree_parent_check));
submit_one_bio(&bio_ctrl);
if (ret || wait != WAIT_COMPLETE) {
free_extent_state(cached_state);
@@ -4996,7 +4997,8 @@ int read_extent_buffer_pages(struct extent_buffer *eb, int wait, int mirror_num,
}
}

- memcpy(&btrfs_bio(bio_ctrl.bio)->parent_check, check, sizeof(*check));
+ memcpy(&btrfs_bio(bio_ctrl.bio)->parent_check, check,
+ sizeof(struct btrfs_tree_parent_check));
submit_one_bio(&bio_ctrl);

if (ret || wait != WAIT_COMPLETE)
@@ -5005,8 +5007,20 @@ int read_extent_buffer_pages(struct extent_buffer *eb, int wait, int mirror_num,
for (i = 0; i < num_pages; i++) {
page = eb->pages[i];
wait_on_page_locked(page);
- if (!PageUptodate(page))
+ if (!PageUptodate(page)) {
ret = -EIO;
+ btrfs_err(eb->fs_info,
+"read failed, bytenr=%llu check owner_root=%llu transid=%llu has_first_key=%d first_key=(%llu %u %llu) level=%u size=%zu",
+ eb->start,
+ check->owner_root, check->transid,
+ check->has_first_key,
+ check->first_key.objectid,
+ check->first_key.type,
+ check->first_key.offset,
+ check->level,
+ sizeof(*check));
+ dump_stack();
+ }
}

return ret;
--
2.39.0