From 19a04e5ad21e1819b95c0ed748a4f128dc25508a Mon Sep 17 00:00:00 2001 From: Alexander Motin Date: Thu, 12 Dec 2024 13:22:41 -0500 Subject: [PATCH 01/43] BRT: Check bv_mos_entries in brt_entry_lookup() When vdev first sees some block cloning, there is a window when brt_maybe_exists() might already return true since something was cloned, but bv_mos_entries is still 0 since BRT ZAP was not yet created. In such case we should not try to look into the ZAP and dereference NULL bv_mos_entries_dnode. Reviewed-by: Brian Behlendorf Reviewed-by: Rob Norris Signed-off-by: Alexander Motin Sponsored by: iXsystems, Inc. Closes #16851 --- module/zfs/brt.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/module/zfs/brt.c b/module/zfs/brt.c index 7d94214143ea0..79748cd69bc07 100644 --- a/module/zfs/brt.c +++ b/module/zfs/brt.c @@ -859,6 +859,9 @@ brt_entry_lookup(brt_vdev_t *brtvd, brt_entry_t *bre) { uint64_t off = BRE_OFFSET(bre); + if (brtvd->bv_mos_entries == 0) + return (SET_ERROR(ENOENT)); + return (zap_lookup_uint64_by_dnode(brtvd->bv_mos_entries_dnode, &off, BRT_KEY_WORDS, 1, sizeof (bre->bre_count), &bre->bre_count)); } From 6c9b4f18d3e62d7e2562d2f101b9e1bc488abb63 Mon Sep 17 00:00:00 2001 From: Chunwei Chen Date: Thu, 12 Dec 2024 16:18:45 -0800 Subject: [PATCH 02/43] Fix DR_OVERRIDDEN use-after-free race in dbuf_sync_leaf In dbuf_sync_leaf, we clone the arc_buf in dr if we share it with db except for overridden case. However, this exception causes a race where dbuf_new_size could free the arc_buf after the last dereference of *datap and causes use-after-free. We fix this by cloning the buf regardless if it's overridden. 
The race: -- P0 P1 dbuf_hold_impl() // dbuf_hold_copy passed // because db_data_pending NULL dbuf_sync_leaf() // doesn't clone *datap // *datap derefed to db_buf dbuf_write(*datap) dbuf_new_size() dmu_buf_will_dirty() dbuf_fix_old_data() // alloc new buf for P0 dr // but can't change *datap arc_alloc_buf() arc_buf_destroy() // alloc new buf for db_buf // and destroy old buf dbuf_write() // continue abd_get_from_buf(data->b_data, arc_buf_size(data)) // use-after-free -- Here's an example when it happens: BUG: kernel NULL pointer dereference, address: 000000000000002e RIP: 0010:arc_buf_size+0x1c/0x30 [zfs] Call Trace: dbuf_write+0x3ff/0x580 [zfs] dbuf_sync_leaf+0x13c/0x530 [zfs] dbuf_sync_list+0xbf/0x120 [zfs] dnode_sync+0x3ea/0x7a0 [zfs] sync_dnodes_task+0x71/0xa0 [zfs] taskq_thread+0x2b8/0x4e0 [spl] kthread+0x112/0x130 ret_from_fork+0x1f/0x30 Reviewed-by: Brian Behlendorf Reviewed-by: Alexander Motin Signed-off-by: Chunwei Chen Co-authored-by: Chunwei Chen Closes #16854 --- module/zfs/dbuf.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/module/zfs/dbuf.c b/module/zfs/dbuf.c index 190d8ded39b0f..90395cad6e451 100644 --- a/module/zfs/dbuf.c +++ b/module/zfs/dbuf.c @@ -4779,8 +4779,7 @@ dbuf_sync_leaf(dbuf_dirty_record_t *dr, dmu_tx_t *tx) if (*datap != NULL && *datap == db->db_buf && dn->dn_object != DMU_META_DNODE_OBJECT && - zfs_refcount_count(&db->db_holds) > 1 && - dr->dt.dl.dr_override_state != DR_OVERRIDDEN) { + zfs_refcount_count(&db->db_holds) > 1) { /* * If this buffer is currently "in use" (i.e., there * are active holds and db_data still references it), From ecc0970e3e517b620ea63f0e35632c7d21d35da0 Mon Sep 17 00:00:00 2001 From: Rob Norris Date: Sat, 14 Dec 2024 05:12:14 +1100 Subject: [PATCH 03/43] backtrace: fix off-by-one on string output sizeof("foo") includes the trailing null byte, so all the output had nulls through it. Most terminals quietly ignore it, but it makes some tools misdetect file types and other annoyances. 
Easy fix: subtract 1. Sponsored-by: https://despairlabs.com/sponsor/ Reviewed-by: Brian Behlendorf Reviewed-by: Alexander Motin Signed-off-by: Rob Norris Closes #16862 --- lib/libspl/backtrace.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/libspl/backtrace.c b/lib/libspl/backtrace.c index 6e8b3b12122de..7f1ef596f8edd 100644 --- a/lib/libspl/backtrace.c +++ b/lib/libspl/backtrace.c @@ -38,7 +38,7 @@ */ #define spl_bt_write_n(fd, s, n) \ do { ssize_t r __maybe_unused = write(fd, s, n); } while (0) -#define spl_bt_write(fd, s) spl_bt_write_n(fd, s, sizeof (s)) +#define spl_bt_write(fd, s) spl_bt_write_n(fd, s, sizeof (s)-1) #if defined(HAVE_LIBUNWIND) #define UNW_LOCAL_ONLY From 76f57ab9f7483335ba28b7de7831eeaa2dad438d Mon Sep 17 00:00:00 2001 From: Poscat Date: Sat, 14 Dec 2024 04:06:40 +0800 Subject: [PATCH 04/43] build: use correct bashcompletiondir on arch Reviewed-by: Tino Reichardt Reviewed-by: Brian Behlendorf Signed-off-by: poscat Closes #16861 --- config/zfs-build.m4 | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/config/zfs-build.m4 b/config/zfs-build.m4 index 255813f46d19c..c44a893bbb8c3 100644 --- a/config/zfs-build.m4 +++ b/config/zfs-build.m4 @@ -627,7 +627,7 @@ AC_DEFUN([ZFS_AC_DEFAULT_PACKAGE], [ AC_MSG_CHECKING([default bash completion directory]) case "$VENDOR" in - alpine|artix|debian|gentoo|ubuntu) + alpine|arch|artix|debian|gentoo|ubuntu) bashcompletiondir=/usr/share/bash-completion/completions ;; freebsd) From fbea92432a4664786ff6a9f2f59bb82b6c5f0b84 Mon Sep 17 00:00:00 2001 From: Rob Norris Date: Mon, 1 Jul 2024 11:19:16 +1000 Subject: [PATCH 05/43] flush: don't report flush error when disabling flush support The first time a device returns ENOTSUP in response to a flush request, we set vdev_nowritecache so we don't issue flushes in the future and instead just pretend they succeeded. However, we still return an error for the initial flush, even though we just decided such errors are meaningless! 
So, when setting vdev_nowritecache in response to a flush error, also reset the error code to assume success. Sponsored-by: Klara, Inc. Sponsored-by: Wasabi Technology, Inc. Reviewed-by: Brian Behlendorf Reviewed-by: Alexander Motin Signed-off-by: Rob Norris Closes #16855 --- module/zfs/zio.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/module/zfs/zio.c b/module/zfs/zio.c index f4d7e57542a18..f13228051decb 100644 --- a/module/zfs/zio.c +++ b/module/zfs/zio.c @@ -4608,11 +4608,14 @@ zio_vdev_io_assess(zio_t *zio) /* * If a cache flush returns ENOTSUP or ENOTTY, we know that no future * attempts will ever succeed. In this case we set a persistent - * boolean flag so that we don't bother with it in the future. + * boolean flag so that we don't bother with it in the future, and + * then we act like the flush succeeded. */ if ((zio->io_error == ENOTSUP || zio->io_error == ENOTTY) && - zio->io_type == ZIO_TYPE_FLUSH && vd != NULL) + zio->io_type == ZIO_TYPE_FLUSH && vd != NULL) { vd->vdev_nowritecache = B_TRUE; + zio->io_error = 0; + } if (zio->io_error) zio->io_pipeline = ZIO_INTERLOCK_PIPELINE; From 46e06fededd42ff4d6fe2dfe0b60612f609edf0b Mon Sep 17 00:00:00 2001 From: Rob Norris Date: Mon, 1 Jul 2024 11:19:16 +1000 Subject: [PATCH 06/43] flush: only detect lack of flush support in one place It seems there's no good reason for vdev_disk & vdev_geom to explicitly detect no support for flush and set vdev_nowritecache. Instead, just signal it by setting the error to ENOTSUP, and let zio_vdev_io_assess() take care of it in one place. Sponsored-by: Klara, Inc. Sponsored-by: Wasabi Technology, Inc. 
Reviewed-by: Brian Behlendorf Reviewed-by: Alexander Motin Signed-off-by: Rob Norris Closes #16855 --- module/os/freebsd/zfs/vdev_geom.c | 15 --------------- module/os/linux/zfs/vdev_disk.c | 5 ++--- module/zfs/zio.c | 6 +++--- 3 files changed, 5 insertions(+), 21 deletions(-) diff --git a/module/os/freebsd/zfs/vdev_geom.c b/module/os/freebsd/zfs/vdev_geom.c index b7ff1063b0891..7aaa42bfb1a89 100644 --- a/module/os/freebsd/zfs/vdev_geom.c +++ b/module/os/freebsd/zfs/vdev_geom.c @@ -1014,21 +1014,6 @@ vdev_geom_io_intr(struct bio *bp) zio->io_error = SET_ERROR(EIO); switch (zio->io_error) { - case ENOTSUP: - /* - * If we get ENOTSUP for BIO_FLUSH or BIO_DELETE we know - * that future attempts will never succeed. In this case - * we set a persistent flag so that we don't bother with - * requests in the future. - */ - switch (bp->bio_cmd) { - case BIO_FLUSH: - vd->vdev_nowritecache = B_TRUE; - break; - case BIO_DELETE: - break; - } - break; case ENXIO: if (!vd->vdev_remove_wanted) { /* diff --git a/module/os/linux/zfs/vdev_disk.c b/module/os/linux/zfs/vdev_disk.c index 6a66a72b91a98..e8bd513e69098 100644 --- a/module/os/linux/zfs/vdev_disk.c +++ b/module/os/linux/zfs/vdev_disk.c @@ -1198,9 +1198,8 @@ vdev_disk_io_flush_completion(struct bio *bio) { zio_t *zio = bio->bi_private; zio->io_error = bi_status_to_errno(bio->bi_status); - - if (zio->io_error && (zio->io_error == EOPNOTSUPP)) - zio->io_vd->vdev_nowritecache = B_TRUE; + if (zio->io_error == EOPNOTSUPP || zio->io_error == ENOTTY) + zio->io_error = SET_ERROR(ENOTSUP); bio_put(bio); ASSERT3S(zio->io_error, >=, 0); diff --git a/module/zfs/zio.c b/module/zfs/zio.c index f13228051decb..bd6752f00ac56 100644 --- a/module/zfs/zio.c +++ b/module/zfs/zio.c @@ -4606,13 +4606,13 @@ zio_vdev_io_assess(zio_t *zio) } /* - * If a cache flush returns ENOTSUP or ENOTTY, we know that no future + * If a cache flush returns ENOTSUP we know that no future * attempts will ever succeed. 
In this case we set a persistent * boolean flag so that we don't bother with it in the future, and * then we act like the flush succeeded. */ - if ((zio->io_error == ENOTSUP || zio->io_error == ENOTTY) && - zio->io_type == ZIO_TYPE_FLUSH && vd != NULL) { + if (zio->io_error == ENOTSUP && zio->io_type == ZIO_TYPE_FLUSH && + vd != NULL) { vd->vdev_nowritecache = B_TRUE; zio->io_error = 0; } From 586304ac444f888aae584fdbea5fbeac2b76cb32 Mon Sep 17 00:00:00 2001 From: kotauskas Date: Sat, 14 Dec 2024 00:50:50 +0300 Subject: [PATCH 07/43] Remount datasets on soft-reboot The one-shot zfs-mount.service is incorrectly deemed active by Systemd after a systemctl soft-reboot. As such, soft-rebooting prevents zfs mount -a from being run automatically. This commit makes it so that zfs-mount.service is marked as being undone by the time umount.target is reached, so that zfs.target then pulls it in again and gets it restarted after a soft reboot. Reviewed by: Brian Behlendorf Signed-off-by: kotauskas Closes #16845 --- etc/systemd/system/zfs-mount.service.in | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/etc/systemd/system/zfs-mount.service.in b/etc/systemd/system/zfs-mount.service.in index 66d894923f4ac..41c02c8238ca5 100644 --- a/etc/systemd/system/zfs-mount.service.in +++ b/etc/systemd/system/zfs-mount.service.in @@ -8,6 +8,13 @@ After=systemd-remount-fs.service Before=local-fs.target ConditionPathIsDirectory=/sys/module/zfs +# This merely tells the service manager +# that unmounting everything undoes the +# effect of this service. No extra logic +# is ran as a result of these settings. +Conflicts=umount.target +Before=umount.target + [Service] Type=oneshot RemainAfterExit=yes From ff6266ee9bd0d6a59355b9980345819e6f68b50b Mon Sep 17 00:00:00 2001 From: Alexander Motin Date: Sat, 14 Dec 2024 17:02:11 -0500 Subject: [PATCH 08/43] Fix use-after-free regression in RAIDZ expansion We should not dereference rra after the last zio_nowait() is called. 
It seems very unlikely, but ASAN in ztest managed to catch it. Reviewed-by: Brian Behlendorf Signed-off-by: Alexander Motin Sponsored by: iXsystems, Inc. Closes #16868 --- module/zfs/vdev_raidz.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/module/zfs/vdev_raidz.c b/module/zfs/vdev_raidz.c index e4487c4850751..6103f780e6bc9 100644 --- a/module/zfs/vdev_raidz.c +++ b/module/zfs/vdev_raidz.c @@ -3914,8 +3914,8 @@ raidz_reflow_read_done(zio_t *zio) if (atomic_dec_32_nv(&rra->rra_tbd) > 0) return; - rra->rra_tbd = rra->rra_writes; - for (uint64_t i = 0; i < rra->rra_writes; i++) + uint32_t writes = rra->rra_tbd = rra->rra_writes; + for (uint64_t i = 0; i < writes; i++) zio_nowait(rra->rra_zio[i]); } From 22259fb24d6ef551d38ff19ceeb86c2da4ba6543 Mon Sep 17 00:00:00 2001 From: Brian Behlendorf Date: Sat, 14 Dec 2024 14:05:12 -0800 Subject: [PATCH 09/43] Fix stray "no" in configure output This is purely a cosmetic fix which removes a stray "no" from the configure output. Reviewed-by: Tino Reichardt Reviewed-by: Alexander Motin Signed-off-by: Brian Behlendorf Closes #16867 --- config/kernel-xattr-handler.m4 | 1 - 1 file changed, 1 deletion(-) diff --git a/config/kernel-xattr-handler.m4 b/config/kernel-xattr-handler.m4 index d933cff7a4b92..ea4466d83fcc4 100644 --- a/config/kernel-xattr-handler.m4 +++ b/config/kernel-xattr-handler.m4 @@ -54,7 +54,6 @@ AC_DEFUN([ZFS_AC_KERNEL_SRC_XATTR_HANDLER_GET_DENTRY_INODE_FLAGS], [ ]) AC_DEFUN([ZFS_AC_KERNEL_XATTR_HANDLER_GET_DENTRY_INODE_FLAGS], [ - AC_MSG_RESULT(no) AC_MSG_CHECKING( [whether xattr_handler->get() wants dentry and inode and flags]) ZFS_LINUX_TEST_RESULT([xattr_handler_get_dentry_inode_flags], [ From acda137d8c160885542bcfd6fa9ef8fd650e95d2 Mon Sep 17 00:00:00 2001 From: Shengqi Chen Date: Tue, 17 Dec 2024 01:40:41 +0800 Subject: [PATCH 10/43] simd_stat: fix undefined CONFIG_KERNEL_MODE_NEON error on armel CONFIG_KERNEL_MODE_NEON depends on CONFIG_NEON. Neither is defined on armel. 
Add a guard to avoid compilation errors. Reviewed-by: Brian Behlendorf Signed-off-by: Shengqi Chen Closes #16871 --- module/zcommon/simd_stat.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/module/zcommon/simd_stat.c b/module/zcommon/simd_stat.c index 33c15140cdb92..d82a88ca9343e 100644 --- a/module/zcommon/simd_stat.c +++ b/module/zcommon/simd_stat.c @@ -132,8 +132,10 @@ simd_stat_kstat_data(char *buf, size_t size, void *data) #if defined(__arm__) || defined(__aarch64__) off += SIMD_STAT_PRINT(simd_stat_kstat_payload, "kernel_neon", HAVE_KERNEL_NEON); +#if defined(CONFIG_KERNEL_MODE_NEON) off += SIMD_STAT_PRINT(simd_stat_kstat_payload, "kernel_mode_neon", CONFIG_KERNEL_MODE_NEON); +#endif /* CONFIG_KERNEL_MODE_NEON */ off += SIMD_STAT_PRINT(simd_stat_kstat_payload, "neon", zfs_neon_available()); off += SIMD_STAT_PRINT(simd_stat_kstat_payload, From c6442bd3b6430e5bd57dac365a359b41e7fbd54e Mon Sep 17 00:00:00 2001 From: Brian Atkinson Date: Mon, 2 Dec 2024 18:04:56 -0700 Subject: [PATCH 11/43] Removing old code outside of 4.18 kernels There were checks still in place to verify we could completely use iov_iter's on the Linux side. All interfaces are available as of kernel 4.18, so there is no reason to check whether we should use that interface at this point. This PR completely removes the UIO_USERSPACE type. It also removes the checks for the direct_IO interface. 
Reviewed-by: Alexander Motin Reviewed-by: Brian Behlendorf Signed-off-by: Brian Atkinson Closes #16856 --- config/kernel-vfs-direct_IO.m4 | 57 ----------- config/kernel-vfs-iov_iter.m4 | 39 +------ config/kernel.m4 | 2 - include/os/linux/spl/sys/uio.h | 17 +--- lib/libspl/include/sys/uio.h | 3 +- module/os/linux/zfs/zfs_uio.c | 180 ++------------------------------- module/os/linux/zfs/zpl_file.c | 52 ++-------- 7 files changed, 26 insertions(+), 324 deletions(-) delete mode 100644 config/kernel-vfs-direct_IO.m4 diff --git a/config/kernel-vfs-direct_IO.m4 b/config/kernel-vfs-direct_IO.m4 deleted file mode 100644 index 17605a13fdef5..0000000000000 --- a/config/kernel-vfs-direct_IO.m4 +++ /dev/null @@ -1,57 +0,0 @@ -dnl # -dnl # Check for Direct I/O interfaces. -dnl # -AC_DEFUN([ZFS_AC_KERNEL_SRC_VFS_DIRECT_IO], [ - ZFS_LINUX_TEST_SRC([direct_io_iter], [ - #include - - static ssize_t test_direct_IO(struct kiocb *kiocb, - struct iov_iter *iter) { return 0; } - - static const struct address_space_operations - aops __attribute__ ((unused)) = { - .direct_IO = test_direct_IO, - }; - ],[]) - - ZFS_LINUX_TEST_SRC([direct_io_iter_offset], [ - #include - - static ssize_t test_direct_IO(struct kiocb *kiocb, - struct iov_iter *iter, loff_t offset) { return 0; } - - static const struct address_space_operations - aops __attribute__ ((unused)) = { - .direct_IO = test_direct_IO, - }; - ],[]) -]) - -AC_DEFUN([ZFS_AC_KERNEL_VFS_DIRECT_IO], [ - dnl # - dnl # Linux 4.6.x API change - dnl # - AC_MSG_CHECKING([whether aops->direct_IO() uses iov_iter]) - ZFS_LINUX_TEST_RESULT([direct_io_iter], [ - AC_MSG_RESULT([yes]) - AC_DEFINE(HAVE_VFS_DIRECT_IO_ITER, 1, - [aops->direct_IO() uses iov_iter without rw]) - ],[ - AC_MSG_RESULT([no]) - - dnl # - dnl # Linux 4.1.x API change - dnl # - AC_MSG_CHECKING( - [whether aops->direct_IO() uses offset]) - ZFS_LINUX_TEST_RESULT([direct_io_iter_offset], [ - AC_MSG_RESULT([yes]) - AC_DEFINE(HAVE_VFS_DIRECT_IO_ITER_OFFSET, 1, - [aops->direct_IO() uses 
iov_iter with offset]) - - ],[ - AC_MSG_RESULT([no]) - ZFS_LINUX_TEST_ERROR([Direct I/O]) - ]) - ]) -]) diff --git a/config/kernel-vfs-iov_iter.m4 b/config/kernel-vfs-iov_iter.m4 index ed7961a9e9ddd..29e19acddbb1b 100644 --- a/config/kernel-vfs-iov_iter.m4 +++ b/config/kernel-vfs-iov_iter.m4 @@ -15,7 +15,7 @@ AC_DEFUN([ZFS_AC_KERNEL_SRC_VFS_IOV_ITER], [ ZFS_LINUX_TEST_SRC([iov_iter_get_pages2], [ #include - ], [ + ],[ struct iov_iter iter = { 0 }; struct page **pages = NULL; size_t maxsize = 4096; @@ -27,20 +27,6 @@ AC_DEFUN([ZFS_AC_KERNEL_SRC_VFS_IOV_ITER], [ &start); ]) - ZFS_LINUX_TEST_SRC([iov_iter_get_pages], [ - #include - ], [ - struct iov_iter iter = { 0 }; - struct page **pages = NULL; - size_t maxsize = 4096; - unsigned maxpages = 1; - size_t start; - size_t ret __attribute__ ((unused)); - - ret = iov_iter_get_pages(&iter, pages, maxsize, maxpages, - &start); - ]) - ZFS_LINUX_TEST_SRC([iov_iter_type], [ #include #include @@ -59,7 +45,6 @@ AC_DEFUN([ZFS_AC_KERNEL_SRC_VFS_IOV_ITER], [ ]) AC_DEFUN([ZFS_AC_KERNEL_VFS_IOV_ITER], [ - enable_vfs_iov_iter="yes" AC_MSG_CHECKING([whether fault_in_iov_iter_readable() is available]) ZFS_LINUX_TEST_RESULT([fault_in_iov_iter_readable], [ @@ -78,17 +63,8 @@ AC_DEFUN([ZFS_AC_KERNEL_VFS_IOV_ITER], [ AC_MSG_RESULT(yes) AC_DEFINE(HAVE_IOV_ITER_GET_PAGES2, 1, [iov_iter_get_pages2() is available]) - ], [ + ],[ AC_MSG_RESULT(no) - AC_MSG_CHECKING([whether iov_iter_get_pages() is available]) - ZFS_LINUX_TEST_RESULT([iov_iter_get_pages], [ - AC_MSG_RESULT(yes) - AC_DEFINE(HAVE_IOV_ITER_GET_PAGES, 1, - [iov_iter_get_pages() is available]) - ], [ - AC_MSG_RESULT(no) - enable_vfs_iov_iter="no" - ]) ]) dnl # @@ -105,17 +81,6 @@ AC_DEFUN([ZFS_AC_KERNEL_VFS_IOV_ITER], [ AC_MSG_RESULT(no) ]) - dnl # - dnl # As of the 4.9 kernel support is provided for iovecs, kvecs, - dnl # bvecs and pipes in the iov_iter structure. 
As long as the - dnl # other support interfaces are all available the iov_iter can - dnl # be correctly used in the uio structure. - dnl # - AS_IF([test "x$enable_vfs_iov_iter" = "xyes"], [ - AC_DEFINE(HAVE_VFS_IOV_ITER, 1, - [All required iov_iter interfaces are available]) - ]) - dnl # dnl # Kernel 6.5 introduces the iter_iov() function that returns the dnl # __iov member of an iov_iter*. The iov member was renamed to this diff --git a/config/kernel.m4 b/config/kernel.m4 index 78f178ff27acd..49ec6266e87af 100644 --- a/config/kernel.m4 +++ b/config/kernel.m4 @@ -79,7 +79,6 @@ AC_DEFUN([ZFS_AC_KERNEL_TEST_SRC], [ ZFS_AC_KERNEL_SRC_VFS_READ_FOLIO ZFS_AC_KERNEL_SRC_VFS_MIGRATE_FOLIO ZFS_AC_KERNEL_SRC_VFS_FSYNC_2ARGS - ZFS_AC_KERNEL_SRC_VFS_DIRECT_IO ZFS_AC_KERNEL_SRC_VFS_READPAGES ZFS_AC_KERNEL_SRC_VFS_SET_PAGE_DIRTY_NOBUFFERS ZFS_AC_KERNEL_SRC_VFS_IOV_ITER @@ -190,7 +189,6 @@ AC_DEFUN([ZFS_AC_KERNEL_TEST_RESULT], [ ZFS_AC_KERNEL_VFS_READ_FOLIO ZFS_AC_KERNEL_VFS_MIGRATE_FOLIO ZFS_AC_KERNEL_VFS_FSYNC_2ARGS - ZFS_AC_KERNEL_VFS_DIRECT_IO ZFS_AC_KERNEL_VFS_READPAGES ZFS_AC_KERNEL_VFS_SET_PAGE_DIRTY_NOBUFFERS ZFS_AC_KERNEL_VFS_IOV_ITER diff --git a/include/os/linux/spl/sys/uio.h b/include/os/linux/spl/sys/uio.h index 5d483685eb205..9e7afea2ab345 100644 --- a/include/os/linux/spl/sys/uio.h +++ b/include/os/linux/spl/sys/uio.h @@ -40,7 +40,7 @@ */ #define UIO_DIRECT 0x0001 /* Direct I/O request */ -#if defined(HAVE_VFS_IOV_ITER) && defined(HAVE_FAULT_IN_IOV_ITER_READABLE) +#if defined(HAVE_FAULT_IN_IOV_ITER_READABLE) #define iov_iter_fault_in_readable(a, b) fault_in_iov_iter_readable(a, b) #endif @@ -52,12 +52,9 @@ typedef enum zfs_uio_rw { } zfs_uio_rw_t; typedef enum zfs_uio_seg { - UIO_USERSPACE = 0, - UIO_SYSSPACE = 1, - UIO_BVEC = 2, -#if defined(HAVE_VFS_IOV_ITER) - UIO_ITER = 3, -#endif + UIO_SYSSPACE = 0, + UIO_BVEC = 1, + UIO_ITER = 2, } zfs_uio_seg_t; /* @@ -72,9 +69,7 @@ typedef struct zfs_uio { union { const struct iovec *uio_iov; const struct bio_vec *uio_bvec; 
-#if defined(HAVE_VFS_IOV_ITER) struct iov_iter *uio_iter; -#endif }; int uio_iovcnt; /* Number of iovecs */ offset_t uio_soffset; /* Starting logical offset */ @@ -129,7 +124,7 @@ zfs_uio_iovec_init(zfs_uio_t *uio, const struct iovec *iov, unsigned long nr_segs, offset_t offset, zfs_uio_seg_t seg, ssize_t resid, size_t skip) { - ASSERT(seg == UIO_USERSPACE || seg == UIO_SYSSPACE); + ASSERT(seg == UIO_SYSSPACE); uio->uio_iov = iov; uio->uio_iovcnt = nr_segs; @@ -175,7 +170,6 @@ zfs_uio_bvec_init(zfs_uio_t *uio, struct bio *bio, struct request *rq) memset(&uio->uio_dio, 0, sizeof (zfs_uio_dio_t)); } -#if defined(HAVE_VFS_IOV_ITER) static inline void zfs_uio_iov_iter_init(zfs_uio_t *uio, struct iov_iter *iter, offset_t offset, ssize_t resid, size_t skip) @@ -192,7 +186,6 @@ zfs_uio_iov_iter_init(zfs_uio_t *uio, struct iov_iter *iter, offset_t offset, uio->uio_soffset = uio->uio_loffset; memset(&uio->uio_dio, 0, sizeof (zfs_uio_dio_t)); } -#endif /* HAVE_VFS_IOV_ITER */ #if defined(HAVE_ITER_IOV) #define zfs_uio_iter_iov(iter) iter_iov((iter)) diff --git a/lib/libspl/include/sys/uio.h b/lib/libspl/include/sys/uio.h index 16749fa492e5f..f86be64ce7f3d 100644 --- a/lib/libspl/include/sys/uio.h +++ b/lib/libspl/include/sys/uio.h @@ -57,8 +57,7 @@ typedef enum zfs_uio_rw { } zfs_uio_rw_t; typedef enum zfs_uio_seg { - UIO_USERSPACE = 0, - UIO_SYSSPACE = 1, + UIO_SYSSPACE = 0, } zfs_uio_seg_t; #elif defined(__FreeBSD__) diff --git a/module/os/linux/zfs/zfs_uio.c b/module/os/linux/zfs/zfs_uio.c index f08415fdb2e31..ed11f8b63fbf8 100644 --- a/module/os/linux/zfs/zfs_uio.c +++ b/module/os/linux/zfs/zfs_uio.c @@ -68,55 +68,13 @@ zfs_uiomove_iov(void *p, size_t n, zfs_uio_rw_t rw, zfs_uio_t *uio) size_t skip = uio->uio_skip; ulong_t cnt; + ASSERT3S(uio->uio_segflg, ==, UIO_SYSSPACE); while (n && uio->uio_resid) { cnt = MIN(iov->iov_len - skip, n); - switch (uio->uio_segflg) { - case UIO_USERSPACE: - /* - * p = kernel data pointer - * iov->iov_base = user data pointer - */ - if 
(rw == UIO_READ) { - if (copy_to_user(iov->iov_base+skip, p, cnt)) - return (EFAULT); - } else { - unsigned long b_left = 0; - if (uio->uio_fault_disable) { - if (!zfs_access_ok(VERIFY_READ, - (iov->iov_base + skip), cnt)) { - return (EFAULT); - } - pagefault_disable(); - b_left = - __copy_from_user_inatomic(p, - (iov->iov_base + skip), cnt); - pagefault_enable(); - } else { - b_left = - copy_from_user(p, - (iov->iov_base + skip), cnt); - } - if (b_left > 0) { - unsigned long c_bytes = - cnt - b_left; - uio->uio_skip += c_bytes; - ASSERT3U(uio->uio_skip, <, - iov->iov_len); - uio->uio_resid -= c_bytes; - uio->uio_loffset += c_bytes; - return (EFAULT); - } - } - break; - case UIO_SYSSPACE: - if (rw == UIO_READ) - memcpy(iov->iov_base + skip, p, cnt); - else - memcpy(p, iov->iov_base + skip, cnt); - break; - default: - ASSERT(0); - } + if (rw == UIO_READ) + memcpy(iov->iov_base + skip, p, cnt); + else + memcpy(p, iov->iov_base + skip, cnt); skip += cnt; if (skip == iov->iov_len) { skip = 0; @@ -268,7 +226,6 @@ zfs_uiomove_bvec(void *p, size_t n, zfs_uio_rw_t rw, zfs_uio_t *uio) return (zfs_uiomove_bvec_impl(p, n, rw, uio)); } -#if defined(HAVE_VFS_IOV_ITER) static int zfs_uiomove_iter(void *p, size_t n, zfs_uio_rw_t rw, zfs_uio_t *uio, boolean_t revert) @@ -303,17 +260,14 @@ zfs_uiomove_iter(void *p, size_t n, zfs_uio_rw_t rw, zfs_uio_t *uio, return (0); } -#endif int zfs_uiomove(void *p, size_t n, zfs_uio_rw_t rw, zfs_uio_t *uio) { if (uio->uio_segflg == UIO_BVEC) return (zfs_uiomove_bvec(p, n, rw, uio)); -#if defined(HAVE_VFS_IOV_ITER) else if (uio->uio_segflg == UIO_ITER) return (zfs_uiomove_iter(p, n, rw, uio, B_FALSE)); -#endif else return (zfs_uiomove_iov(p, n, rw, uio)); } @@ -336,44 +290,14 @@ zfs_uio_prefaultpages(ssize_t n, zfs_uio_t *uio) * there is never a time for these pages a fault will occur. 
*/ return (0); -#if defined(HAVE_VFS_IOV_ITER) - } else if (uio->uio_segflg == UIO_ITER) { + } else { + ASSERT3S(uio->uio_segflg, ==, UIO_ITER); /* - * At least a Linux 4.9 kernel, iov_iter_fault_in_readable() + * At least a Linux 4.18 kernel, iov_iter_fault_in_readable() * can be relied on to fault in user pages when referenced. */ if (iov_iter_fault_in_readable(uio->uio_iter, n)) return (EFAULT); -#endif - } else { - /* Fault in all user pages */ - ASSERT3S(uio->uio_segflg, ==, UIO_USERSPACE); - const struct iovec *iov = uio->uio_iov; - int iovcnt = uio->uio_iovcnt; - size_t skip = uio->uio_skip; - uint8_t tmp; - caddr_t p; - - for (; n > 0 && iovcnt > 0; iov++, iovcnt--, skip = 0) { - ulong_t cnt = MIN(iov->iov_len - skip, n); - /* empty iov */ - if (cnt == 0) - continue; - n -= cnt; - /* touch each page in this segment. */ - p = iov->iov_base + skip; - while (cnt) { - if (copy_from_user(&tmp, p, 1)) - return (EFAULT); - ulong_t incr = MIN(cnt, PAGESIZE); - p += incr; - cnt -= incr; - } - /* touch the last byte in case it straddles a page. 
*/ - p--; - if (copy_from_user(&tmp, p, 1)) - return (EFAULT); - } } return (0); @@ -394,10 +318,8 @@ zfs_uiocopy(void *p, size_t n, zfs_uio_rw_t rw, zfs_uio_t *uio, size_t *cbytes) if (uio->uio_segflg == UIO_BVEC) ret = zfs_uiomove_bvec(p, n, rw, &uio_copy); -#if defined(HAVE_VFS_IOV_ITER) else if (uio->uio_segflg == UIO_ITER) ret = zfs_uiomove_iter(p, n, rw, &uio_copy, B_TRUE); -#endif else ret = zfs_uiomove_iov(p, n, rw, &uio_copy); @@ -430,11 +352,10 @@ zfs_uioskip(zfs_uio_t *uio, size_t n) uio->uio_bvec++; uio->uio_iovcnt--; } -#if defined(HAVE_VFS_IOV_ITER) } else if (uio->uio_segflg == UIO_ITER) { iov_iter_advance(uio->uio_iter, n); -#endif } else { + ASSERT3S(uio->uio_segflg, ==, UIO_SYSSPACE); uio->uio_skip += n; while (uio->uio_iovcnt && uio->uio_skip >= uio->uio_iov->iov_len) { @@ -457,8 +378,7 @@ zfs_uio_page_aligned(zfs_uio_t *uio) { boolean_t aligned = B_TRUE; - if (uio->uio_segflg == UIO_USERSPACE || - uio->uio_segflg == UIO_SYSSPACE) { + if (uio->uio_segflg == UIO_SYSSPACE) { const struct iovec *iov = uio->uio_iov; size_t skip = uio->uio_skip; @@ -472,12 +392,10 @@ zfs_uio_page_aligned(zfs_uio_t *uio) } skip = 0; } -#if defined(HAVE_VFS_IOV_ITER) } else if (uio->uio_segflg == UIO_ITER) { unsigned long alignment = iov_iter_alignment(uio->uio_iter); aligned = IS_P2ALIGNED(alignment, PAGE_SIZE); -#endif } else { /* Currently not supported */ aligned = B_FALSE; @@ -578,76 +496,6 @@ zfs_uio_free_dio_pages(zfs_uio_t *uio, zfs_uio_rw_t rw) uio->uio_dio.npages * sizeof (struct page *)); } -/* - * zfs_uio_iov_step() is just a modified version of the STEP function of Linux's - * iov_iter_get_pages(). 
- */ -static int -zfs_uio_iov_step(struct iovec v, zfs_uio_rw_t rw, zfs_uio_t *uio, - long *numpages) -{ - unsigned long addr = (unsigned long)(v.iov_base); - size_t len = v.iov_len; - unsigned long n = DIV_ROUND_UP(len, PAGE_SIZE); - - /* - * read returning FOLL_WRITE is due to the fact that we are stating - * that the kernel will have write access to the user pages. So, when a - * Direct I/O read request is issued, the kernel must write to the user - * pages. - */ - long res = get_user_pages_unlocked( - P2ALIGN_TYPED(addr, PAGE_SIZE, unsigned long), n, - &uio->uio_dio.pages[uio->uio_dio.npages], - rw == UIO_READ ? FOLL_WRITE : 0); - if (res < 0) { - return (SET_ERROR(-res)); - } else if (len != (res * PAGE_SIZE)) { - return (SET_ERROR(EFAULT)); - } - - ASSERT3S(len, ==, res * PAGE_SIZE); - *numpages = res; - return (0); -} - -static int -zfs_uio_get_dio_pages_iov(zfs_uio_t *uio, zfs_uio_rw_t rw) -{ - const struct iovec *iovp = uio->uio_iov; - size_t skip = uio->uio_skip; - size_t len = uio->uio_resid - skip; - - ASSERT(uio->uio_segflg != UIO_SYSSPACE); - - for (int i = 0; i < uio->uio_iovcnt; i++) { - struct iovec iov; - long numpages = 0; - - if (iovp->iov_len == 0) { - iovp++; - skip = 0; - continue; - } - iov.iov_len = MIN(len, iovp->iov_len - skip); - iov.iov_base = iovp->iov_base + skip; - int error = zfs_uio_iov_step(iov, rw, uio, &numpages); - - if (error) - return (error); - - uio->uio_dio.npages += numpages; - len -= iov.iov_len; - skip = 0; - iovp++; - } - - ASSERT0(len); - - return (0); -} - -#if defined(HAVE_VFS_IOV_ITER) static int zfs_uio_get_dio_pages_iov_iter(zfs_uio_t *uio, zfs_uio_rw_t rw) { @@ -688,7 +536,6 @@ zfs_uio_get_dio_pages_iov_iter(zfs_uio_t *uio, zfs_uio_rw_t rw) return (0); } -#endif /* HAVE_VFS_IOV_ITER */ /* * This function pins user pages. 
In the event that the user pages were not @@ -703,14 +550,9 @@ zfs_uio_get_dio_pages_alloc(zfs_uio_t *uio, zfs_uio_rw_t rw) long npages = DIV_ROUND_UP(uio->uio_resid, PAGE_SIZE); size_t size = npages * sizeof (struct page *); - if (uio->uio_segflg == UIO_USERSPACE) { - uio->uio_dio.pages = vmem_alloc(size, KM_SLEEP); - error = zfs_uio_get_dio_pages_iov(uio, rw); -#if defined(HAVE_VFS_IOV_ITER) - } else if (uio->uio_segflg == UIO_ITER) { + if (uio->uio_segflg == UIO_ITER) { uio->uio_dio.pages = vmem_alloc(size, KM_SLEEP); error = zfs_uio_get_dio_pages_iov_iter(uio, rw); -#endif } else { return (SET_ERROR(EOPNOTSUPP)); } diff --git a/module/os/linux/zfs/zpl_file.c b/module/os/linux/zfs/zpl_file.c index ff1370c543dc1..42dfddc2717b8 100644 --- a/module/os/linux/zfs/zpl_file.c +++ b/module/os/linux/zfs/zpl_file.c @@ -216,27 +216,6 @@ zpl_file_accessed(struct file *filp) } } -/* - * When HAVE_VFS_IOV_ITER is defined the iov_iter structure supports - * iovecs, kvevs, bvecs and pipes, plus all the required interfaces to - * manipulate the iov_iter are available. In which case the full iov_iter - * can be attached to the uio and correctly handled in the lower layers. - * Otherwise, for older kernels extract the iovec and pass it instead. - */ -static void -zpl_uio_init(zfs_uio_t *uio, struct kiocb *kiocb, struct iov_iter *to, - loff_t pos, ssize_t count, size_t skip) -{ -#if defined(HAVE_VFS_IOV_ITER) - zfs_uio_iov_iter_init(uio, to, pos, count, skip); -#else - zfs_uio_iovec_init(uio, zfs_uio_iter_iov(to), to->nr_segs, pos, - zfs_uio_iov_iter_type(to) & ITER_KVEC ? 
- UIO_SYSSPACE : UIO_USERSPACE, - count, skip); -#endif -} - static ssize_t zpl_iter_read(struct kiocb *kiocb, struct iov_iter *to) { @@ -246,7 +225,7 @@ zpl_iter_read(struct kiocb *kiocb, struct iov_iter *to) ssize_t count = iov_iter_count(to); zfs_uio_t uio; - zpl_uio_init(&uio, kiocb, to, kiocb->ki_pos, count, 0); + zfs_uio_iov_iter_init(&uio, to, kiocb->ki_pos, count, 0); crhold(cr); cookie = spl_fstrans_mark(); @@ -296,7 +275,8 @@ zpl_iter_write(struct kiocb *kiocb, struct iov_iter *from) if (ret) return (ret); - zpl_uio_init(&uio, kiocb, from, kiocb->ki_pos, count, from->iov_offset); + zfs_uio_iov_iter_init(&uio, from, kiocb->ki_pos, count, + from->iov_offset); crhold(cr); cookie = spl_fstrans_mark(); @@ -317,34 +297,18 @@ zpl_iter_write(struct kiocb *kiocb, struct iov_iter *from) } static ssize_t -zpl_direct_IO_impl(void) +zpl_direct_IO(struct kiocb *kiocb, struct iov_iter *iter) { /* * All O_DIRECT requests should be handled by - * zpl_{iter/aio}_{write/read}(). There is no way kernel generic code - * should call the direct_IO address_space_operations function. We set - * this code path to be fatal if it is executed. + * zpl_iter_write/read}(). There is no way kernel generic code should + * call the direct_IO address_space_operations function. We set this + * code path to be fatal if it is executed. 
*/ PANIC(0); return (0); } -#if defined(HAVE_VFS_DIRECT_IO_ITER) -static ssize_t -zpl_direct_IO(struct kiocb *kiocb, struct iov_iter *iter) -{ - return (zpl_direct_IO_impl()); -} -#elif defined(HAVE_VFS_DIRECT_IO_ITER_OFFSET) -static ssize_t -zpl_direct_IO(struct kiocb *kiocb, struct iov_iter *iter, loff_t pos) -{ - return (zpl_direct_IO_impl()); -} -#else -#error "Unknown Direct I/O interface" -#endif - static loff_t zpl_llseek(struct file *filp, loff_t offset, int whence) { @@ -1104,14 +1068,12 @@ const struct file_operations zpl_file_operations = { .llseek = zpl_llseek, .read_iter = zpl_iter_read, .write_iter = zpl_iter_write, -#ifdef HAVE_VFS_IOV_ITER #ifdef HAVE_COPY_SPLICE_READ .splice_read = copy_splice_read, #else .splice_read = generic_file_splice_read, #endif .splice_write = iter_file_splice_write, -#endif .mmap = zpl_mmap, .fsync = zpl_fsync, .fallocate = zpl_fallocate, From 882a80998379184da5b36e1ef3f1444e2cf1461a Mon Sep 17 00:00:00 2001 From: Brian Atkinson Date: Tue, 10 Dec 2024 10:21:06 -0700 Subject: [PATCH 12/43] Use pin_user_pages API for Direct I/O requests As of kernel v5.8, pin_user_pages* interfaces were introduced. These interfaces use the FOLL_PIN flag. This is the preferred interface now for Direct I/O requests in the kernel. The reasoning for using this new interface for Direct I/O requests is explained in the kernel documentation: Documentation/core-api/pin_user_pages.rst If pin_user_pages_unlocked is available, then all Direct I/O requests will use this new API to stay up to date with the kernel API requirements. 
Reviewed-by: Alexander Motin Reviewed-by: Brian Behlendorf Signed-off-by: Brian Atkinson Closes #16856 --- config/kernel-pin-user-pages.m4 | 33 ++++++++++ config/kernel-vfs-iov_iter.m4 | 43 ++++++------- config/kernel.m4 | 2 + module/os/linux/zfs/zfs_uio.c | 111 +++++++++++++++++++++++++++----- 4 files changed, 148 insertions(+), 41 deletions(-) create mode 100644 config/kernel-pin-user-pages.m4 diff --git a/config/kernel-pin-user-pages.m4 b/config/kernel-pin-user-pages.m4 new file mode 100644 index 0000000000000..fe7aff3752080 --- /dev/null +++ b/config/kernel-pin-user-pages.m4 @@ -0,0 +1,33 @@ +dnl # +dnl # Check for pin_user_pages_unlocked(). +dnl # +AC_DEFUN([ZFS_AC_KERNEL_SRC_PIN_USER_PAGES], [ + ZFS_LINUX_TEST_SRC([pin_user_pages_unlocked], [ + #include <linux/mm.h> + ],[ + unsigned long start = 0; + unsigned long nr_pages = 1; + struct page **pages = NULL; + unsigned int gup_flags = 0; + long ret __attribute__ ((unused)); + + ret = pin_user_pages_unlocked(start, nr_pages, pages, + gup_flags); + ]) +]) + +AC_DEFUN([ZFS_AC_KERNEL_PIN_USER_PAGES], [ + + dnl # + dnl # Kernel 5.8 introduced the pin_user_pages* interfaces which should + dnl # be used for Direct I/O requests. 
+ dnl # + AC_MSG_CHECKING([whether pin_user_pages_unlocked() is available]) + ZFS_LINUX_TEST_RESULT([pin_user_pages_unlocked], [ + AC_MSG_RESULT(yes) + AC_DEFINE(HAVE_PIN_USER_PAGES_UNLOCKED, 1, + [pin_user_pages_unlocked() is available]) + ],[ + AC_MSG_RESULT(no) + ]) +]) diff --git a/config/kernel-vfs-iov_iter.m4 b/config/kernel-vfs-iov_iter.m4 index 29e19acddbb1b..a223343030db4 100644 --- a/config/kernel-vfs-iov_iter.m4 +++ b/config/kernel-vfs-iov_iter.m4 @@ -13,26 +13,21 @@ AC_DEFUN([ZFS_AC_KERNEL_SRC_VFS_IOV_ITER], [ error = fault_in_iov_iter_readable(&iter, size); ]) - ZFS_LINUX_TEST_SRC([iov_iter_get_pages2], [ + ZFS_LINUX_TEST_SRC([iov_iter_type], [ + #include #include ],[ struct iov_iter iter = { 0 }; - struct page **pages = NULL; - size_t maxsize = 4096; - unsigned maxpages = 1; - size_t start; - size_t ret __attribute__ ((unused)); - - ret = iov_iter_get_pages2(&iter, pages, maxsize, maxpages, - &start); + __attribute__((unused)) enum iter_type i = iov_iter_type(&iter); ]) - ZFS_LINUX_TEST_SRC([iov_iter_type], [ - #include + ZFS_LINUX_TEST_SRC([iter_is_ubuf], [ #include ],[ struct iov_iter iter = { 0 }; - __attribute__((unused)) enum iter_type i = iov_iter_type(&iter); + bool ret __attribute__((unused)); + + ret = iter_is_ubuf(&iter); ]) ZFS_LINUX_TEST_SRC([iter_iov], [ @@ -55,18 +50,6 @@ AC_DEFUN([ZFS_AC_KERNEL_VFS_IOV_ITER], [ AC_MSG_RESULT(no) ]) - dnl # - dnl # Kernel 6.0 changed iov_iter_get_pages() to iov_iter_page_pages2(). - dnl # - AC_MSG_CHECKING([whether iov_iter_get_pages2() is available]) - ZFS_LINUX_TEST_RESULT([iov_iter_get_pages2], [ - AC_MSG_RESULT(yes) - AC_DEFINE(HAVE_IOV_ITER_GET_PAGES2, 1, - [iov_iter_get_pages2() is available]) - ],[ - AC_MSG_RESULT(no) - ]) - dnl # dnl # This checks for iov_iter_type() in linux/uio.h. 
It is not dnl # required, however, and the module will compiled without it @@ -81,6 +64,18 @@ AC_DEFUN([ZFS_AC_KERNEL_VFS_IOV_ITER], [ AC_MSG_RESULT(no) ]) + dnl # + dnl # Kernel 6.0 introduced the ITER_UBUF iov_iter type. iter_is_ubuf() + dnl # was also added to determine if the iov_iter is an ITER_UBUF. + dnl # + AC_MSG_CHECKING([whether iter_is_ubuf() is available]) + ZFS_LINUX_TEST_RESULT([iter_is_ubuf], [ + AC_MSG_RESULT(yes) + AC_DEFINE(HAVE_ITER_IS_UBUF, 1, [iter_is_ubuf() is available]) + ],[ + AC_MSG_RESULT(no) + ]) + dnl # dnl # Kernel 6.5 introduces the iter_iov() function that returns the dnl # __iov member of an iov_iter*. The iov member was renamed to this diff --git a/config/kernel.m4 b/config/kernel.m4 index 49ec6266e87af..ae66633907bf7 100644 --- a/config/kernel.m4 +++ b/config/kernel.m4 @@ -127,6 +127,7 @@ AC_DEFUN([ZFS_AC_KERNEL_TEST_SRC], [ ZFS_AC_KERNEL_SRC_MM_PAGE_SIZE ZFS_AC_KERNEL_SRC_MM_PAGE_MAPPING ZFS_AC_KERNEL_SRC_FILE + ZFS_AC_KERNEL_SRC_PIN_USER_PAGES case "$host_cpu" in powerpc*) ZFS_AC_KERNEL_SRC_CPU_HAS_FEATURE @@ -238,6 +239,7 @@ AC_DEFUN([ZFS_AC_KERNEL_TEST_RESULT], [ ZFS_AC_KERNEL_MM_PAGE_MAPPING ZFS_AC_KERNEL_1ARG_ASSIGN_STR ZFS_AC_KERNEL_FILE + ZFS_AC_KERNEL_PIN_USER_PAGES case "$host_cpu" in powerpc*) ZFS_AC_KERNEL_CPU_HAS_FEATURE diff --git a/module/os/linux/zfs/zfs_uio.c b/module/os/linux/zfs/zfs_uio.c index ed11f8b63fbf8..db85b626f12af 100644 --- a/module/os/linux/zfs/zfs_uio.c +++ b/module/os/linux/zfs/zfs_uio.c @@ -441,6 +441,7 @@ zfs_unmark_page(struct page *page) } #endif /* HAVE_ZERO_PAGE_GPL_ONLY || !_LP64 */ +#if !defined(HAVE_PIN_USER_PAGES_UNLOCKED) static void zfs_uio_dio_check_for_zero_page(zfs_uio_t *uio) { @@ -472,6 +473,7 @@ zfs_uio_dio_check_for_zero_page(zfs_uio_t *uio) } } } +#endif void zfs_uio_free_dio_pages(zfs_uio_t *uio, zfs_uio_rw_t rw) @@ -480,6 +482,9 @@ zfs_uio_free_dio_pages(zfs_uio_t *uio, zfs_uio_rw_t rw) ASSERT(uio->uio_extflg & UIO_DIRECT); ASSERT3P(uio->uio_dio.pages, !=, NULL); +#if 
defined(HAVE_PIN_USER_PAGES_UNLOCKED) + unpin_user_pages(uio->uio_dio.pages, uio->uio_dio.npages); +#else for (long i = 0; i < uio->uio_dio.npages; i++) { struct page *p = uio->uio_dio.pages[i]; @@ -491,44 +496,106 @@ zfs_uio_free_dio_pages(zfs_uio_t *uio, zfs_uio_rw_t rw) put_page(p); } - +#endif vmem_free(uio->uio_dio.pages, uio->uio_dio.npages * sizeof (struct page *)); } +#if defined(HAVE_PIN_USER_PAGES_UNLOCKED) static int -zfs_uio_get_dio_pages_iov_iter(zfs_uio_t *uio, zfs_uio_rw_t rw) +zfs_uio_pin_user_pages(zfs_uio_t *uio, zfs_uio_rw_t rw) { + long res; size_t skip = uio->uio_skip; + size_t len = uio->uio_resid - skip; + unsigned int gup_flags = 0; + unsigned long addr; + unsigned long nr_pages; + + /* + * Kernel 6.2 introduced the FOLL_PCI_P2PDMA flag. This flag could + * possibly be used here in the future to allow for P2P operations with + * user pages. + */ + if (rw == UIO_READ) + gup_flags = FOLL_WRITE; + + if (len == 0) + return (0); + +#if defined(HAVE_ITER_IS_UBUF) + if (iter_is_ubuf(uio->uio_iter)) { + nr_pages = DIV_ROUND_UP(len, PAGE_SIZE); + addr = (unsigned long)uio->uio_iter->ubuf + skip; + res = pin_user_pages_unlocked(addr, nr_pages, + &uio->uio_dio.pages[uio->uio_dio.npages], gup_flags); + if (res < 0) { + return (SET_ERROR(-res)); + } else if (len != (res * PAGE_SIZE)) { + uio->uio_dio.npages += res; + return (SET_ERROR(EFAULT)); + } + uio->uio_dio.npages += res; + return (0); + } +#endif + const struct iovec *iovp = zfs_uio_iter_iov(uio->uio_iter); + for (int i = 0; i < uio->uio_iovcnt; i++) { + size_t amt = iovp->iov_len - skip; + if (amt == 0) { + iovp++; + skip = 0; + continue; + } + + addr = (unsigned long)iovp->iov_base + skip; + nr_pages = DIV_ROUND_UP(amt, PAGE_SIZE); + res = pin_user_pages_unlocked(addr, nr_pages, + &uio->uio_dio.pages[uio->uio_dio.npages], gup_flags); + if (res < 0) { + return (SET_ERROR(-res)); + } else if (amt != (res * PAGE_SIZE)) { + uio->uio_dio.npages += res; + return (SET_ERROR(EFAULT)); + } + + len -= 
amt; + uio->uio_dio.npages += res; + skip = 0; + iovp++; + }; + + ASSERT0(len); + + return (0); +} + +#else +static int +zfs_uio_get_dio_pages_iov_iter(zfs_uio_t *uio, zfs_uio_rw_t rw) +{ + size_t start; size_t wanted = uio->uio_resid - uio->uio_skip; ssize_t rollback = 0; ssize_t cnt; unsigned maxpages = DIV_ROUND_UP(wanted, PAGE_SIZE); while (wanted) { -#if defined(HAVE_IOV_ITER_GET_PAGES2) - cnt = iov_iter_get_pages2(uio->uio_iter, - &uio->uio_dio.pages[uio->uio_dio.npages], - wanted, maxpages, &skip); -#else cnt = iov_iter_get_pages(uio->uio_iter, &uio->uio_dio.pages[uio->uio_dio.npages], - wanted, maxpages, &skip); -#endif + wanted, maxpages, &start); if (cnt < 0) { iov_iter_revert(uio->uio_iter, rollback); return (SET_ERROR(-cnt)); } + /* + * All Direct I/O operations must be page aligned. + */ + ASSERT(IS_P2ALIGNED(start, PAGE_SIZE)); uio->uio_dio.npages += DIV_ROUND_UP(cnt, PAGE_SIZE); rollback += cnt; wanted -= cnt; - skip = 0; -#if !defined(HAVE_IOV_ITER_GET_PAGES2) - /* - * iov_iter_get_pages2() advances the iov_iter on success. - */ iov_iter_advance(uio->uio_iter, cnt); -#endif } ASSERT3U(rollback, ==, uio->uio_resid - uio->uio_skip); @@ -536,6 +603,7 @@ zfs_uio_get_dio_pages_iov_iter(zfs_uio_t *uio, zfs_uio_rw_t rw) return (0); } +#endif /* HAVE_PIN_USER_PAGES_UNLOCKED */ /* * This function pins user pages. 
In the event that the user pages were not @@ -552,7 +620,11 @@ zfs_uio_get_dio_pages_alloc(zfs_uio_t *uio, zfs_uio_rw_t rw) if (uio->uio_segflg == UIO_ITER) { uio->uio_dio.pages = vmem_alloc(size, KM_SLEEP); +#if defined(HAVE_PIN_USER_PAGES_UNLOCKED) + error = zfs_uio_pin_user_pages(uio, rw); +#else error = zfs_uio_get_dio_pages_iov_iter(uio, rw); +#endif } else { return (SET_ERROR(EOPNOTSUPP)); } @@ -560,17 +632,22 @@ zfs_uio_get_dio_pages_alloc(zfs_uio_t *uio, zfs_uio_rw_t rw) ASSERT3S(uio->uio_dio.npages, >=, 0); if (error) { +#if defined(HAVE_PIN_USER_PAGES_UNLOCKED) + unpin_user_pages(uio->uio_dio.pages, uio->uio_dio.npages); +#else for (long i = 0; i < uio->uio_dio.npages; i++) put_page(uio->uio_dio.pages[i]); +#endif vmem_free(uio->uio_dio.pages, size); return (error); } else { ASSERT3S(uio->uio_dio.npages, ==, npages); } - if (rw == UIO_WRITE) { +#if !defined(HAVE_PIN_USER_PAGES_UNLOCKED) + if (rw == UIO_WRITE) zfs_uio_dio_check_for_zero_page(uio); - } +#endif uio->uio_extflg |= UIO_DIRECT; From 830a53124941cc1053b4a3f22c8bef18bf6eb942 Mon Sep 17 00:00:00 2001 From: Brian Behlendorf Date: Tue, 17 Dec 2024 08:58:33 -0800 Subject: [PATCH 13/43] CI: Add FreeBSD 14.2 RELEASE+STABLE builds Update the CI to include FreeBSD 14.2 as a regularly tested platform. 
Reviewed-by: Tino Reichardt Reviewed-by: Alexander Motin Signed-off-by: Brian Behlendorf Closes #16869 --- .github/workflows/scripts/qemu-2-start.sh | 20 ++++++++++---------- .github/workflows/zfs-qemu.yml | 9 +++++---- 2 files changed, 15 insertions(+), 14 deletions(-) diff --git a/.github/workflows/scripts/qemu-2-start.sh b/.github/workflows/scripts/qemu-2-start.sh index 39ac92107b71d..f0f505cac4b7a 100755 --- a/.github/workflows/scripts/qemu-2-start.sh +++ b/.github/workflows/scripts/qemu-2-start.sh @@ -14,7 +14,7 @@ OSv=$OS # compressed with .zst extension REPO="https://github.com/mcmilk/openzfs-freebsd-images" -FREEBSD="$REPO/releases/download/v2024-10-05" +FREEBSD="$REPO/releases/download/v2024-12-14" URLzs="" # Ubuntu mirrors @@ -76,28 +76,28 @@ case "$OS" in BASH="/usr/local/bin/bash" NIC="rtl8139" ;; - freebsd14-0r) - OSNAME="FreeBSD 14.0-RELEASE" - OSv="freebsd14.0" - URLzs="$FREEBSD/amd64-freebsd-14.0-RELEASE.qcow2.zst" - BASH="/usr/local/bin/bash" - ;; freebsd14-1r) OSNAME="FreeBSD 14.1-RELEASE" OSv="freebsd14.0" URLzs="$FREEBSD/amd64-freebsd-14.1-RELEASE.qcow2.zst" BASH="/usr/local/bin/bash" ;; + freebsd14-2r) + OSNAME="FreeBSD 14.2-RELEASE" + OSv="freebsd14.0" + URLzs="$FREEBSD/amd64-freebsd-14.2-RELEASE.qcow2.zst" + BASH="/usr/local/bin/bash" + ;; freebsd13-4s) OSNAME="FreeBSD 13.4-STABLE" OSv="freebsd13.0" URLzs="$FREEBSD/amd64-freebsd-13.4-STABLE.qcow2.zst" BASH="/usr/local/bin/bash" ;; - freebsd14-1s) - OSNAME="FreeBSD 14.1-STABLE" + freebsd14-2s) + OSNAME="FreeBSD 14.2-STABLE" OSv="freebsd14.0" - URLzs="$FREEBSD/amd64-freebsd-14.1-STABLE.qcow2.zst" + URLzs="$FREEBSD/amd64-freebsd-14.2-STABLE.qcow2.zst" BASH="/usr/local/bin/bash" ;; freebsd15-0c) diff --git a/.github/workflows/zfs-qemu.yml b/.github/workflows/zfs-qemu.yml index e90030f4c02eb..4748e90db50bf 100644 --- a/.github/workflows/zfs-qemu.yml +++ b/.github/workflows/zfs-qemu.yml @@ -22,8 +22,8 @@ jobs: - name: Generate OS config and CI type id: os run: | - FULL_OS='["almalinux8", 
"almalinux9", "centos-stream9", "debian11", "debian12", "fedora40", "fedora41", "freebsd13-4r", "freebsd14-0r", "freebsd14-1s", "ubuntu20", "ubuntu22", "ubuntu24"]' - QUICK_OS='["almalinux8", "almalinux9", "debian12", "fedora41", "freebsd13-3r", "freebsd14-1r", "ubuntu24"]' + FULL_OS='["almalinux8", "almalinux9", "centos-stream9", "debian11", "debian12", "fedora40", "fedora41", "freebsd13-3r", "freebsd13-4s", "freebsd14-1r", "freebsd14-2s", "freebsd15-0c", "ubuntu20", "ubuntu22", "ubuntu24"]' + QUICK_OS='["almalinux8", "almalinux9", "debian12", "fedora41", "freebsd13-3r", "freebsd14-2r", "ubuntu24"]' # determine CI type when running on PR ci_type="full" if ${{ github.event_name == 'pull_request' }}; then @@ -49,8 +49,9 @@ jobs: # rhl: almalinux8, almalinux9, centos-stream9, fedora40, fedora41 # debian: debian11, debian12, ubuntu20, ubuntu22, ubuntu24 # misc: archlinux, tumbleweed - # FreeBSD Release: freebsd13-3r, freebsd13-4r, freebsd14-0r, freebsd14-1r - # FreeBSD Stable: freebsd13-4s, freebsd14-1s + # FreeBSD variants of 2024-12: + # FreeBSD Release: freebsd13-3r, freebsd13-4r, freebsd14-1r, freebsd14-2r + # FreeBSD Stable: freebsd13-4s, freebsd14-2s # FreeBSD Current: freebsd15-0c os: ${{ fromJson(needs.test-config.outputs.test_os) }} runs-on: ubuntu-24.04 From ab7cbbe7890e98c8d7df93ddf179f7bed977ef8b Mon Sep 17 00:00:00 2001 From: Rob Norris Date: Thu, 19 Dec 2024 10:25:12 +1100 Subject: [PATCH 14/43] zprop: fix value help for ZPOOL_PROP_CAPACITY It's a percentage and documented as such, but we were showing it as . 
Sponsored-by: https://despairlabs.com/sponsor/ Reviewed-by: Brian Behlendorf Reviewed-by: Alexander Motin Reviewed-by: George Melikov Signed-off-by: Rob Norris Closes #16881 --- module/zcommon/zpool_prop.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/module/zcommon/zpool_prop.c b/module/zcommon/zpool_prop.c index a709679b90320..ea9eda4b316dc 100644 --- a/module/zcommon/zpool_prop.c +++ b/module/zcommon/zpool_prop.c @@ -105,7 +105,7 @@ zpool_prop_init(void) PROP_READONLY, ZFS_TYPE_POOL, "", "FRAG", B_FALSE, sfeatures); zprop_register_number(ZPOOL_PROP_CAPACITY, "capacity", 0, PROP_READONLY, - ZFS_TYPE_POOL, "", "CAP", B_FALSE, sfeatures); + ZFS_TYPE_POOL, "", "CAP", B_FALSE, sfeatures); zprop_register_number(ZPOOL_PROP_GUID, "guid", 0, PROP_READONLY, ZFS_TYPE_POOL, "", "GUID", B_TRUE, sfeatures); zprop_register_number(ZPOOL_PROP_LOAD_GUID, "load_guid", 0, From e5ac7786bda1db64778b51a80136fa2e3e2e93d0 Mon Sep 17 00:00:00 2001 From: Tino Reichardt Date: Thu, 19 Dec 2024 17:01:34 +0100 Subject: [PATCH 15/43] CI: Fix FreeBSD 13.4 STABLE build In #16869 we added FreeBSD 13.4 STABLE, but forget the special thing, that the virtio nic within FreeBSD 13.x is buggy. This fix adds the needed rtl8139 nic to the VM. 
Reviewed-by: George Melikov Reviewed-by: Alexander Motin Signed-off-by: Tino Reichardt Closes #16885 --- .github/workflows/scripts/qemu-2-start.sh | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/scripts/qemu-2-start.sh b/.github/workflows/scripts/qemu-2-start.sh index f0f505cac4b7a..0906e438ac0d8 100755 --- a/.github/workflows/scripts/qemu-2-start.sh +++ b/.github/workflows/scripts/qemu-2-start.sh @@ -93,6 +93,7 @@ case "$OS" in OSv="freebsd13.0" URLzs="$FREEBSD/amd64-freebsd-13.4-STABLE.qcow2.zst" BASH="/usr/local/bin/bash" + NIC="rtl8139" ;; freebsd14-2s) OSNAME="FreeBSD 14.2-STABLE" From f00a57a7868fe2fb0f96205f9f1fbf23f865cffb Mon Sep 17 00:00:00 2001 From: Rob Norris Date: Fri, 20 Dec 2024 03:04:56 +1100 Subject: [PATCH 16/43] zfs_main: fix alignment on props usage output I guess we've got some long property names since this was first set up! Sponsored-by: https://despairlabs.com/sponsor/ Reviewed-by: Brian Behlendorf Reviewed-by: Tony Hutter Reviewed-by: Alexander Motin Reviewed-by: George Melikov Signed-off-by: Rob Norris Closes #16883 --- cmd/zfs/zfs_main.c | 32 ++++++++++++++++---------------- 1 file changed, 16 insertions(+), 16 deletions(-) diff --git a/cmd/zfs/zfs_main.c b/cmd/zfs/zfs_main.c index 7836f5909f4a6..73ccf72d263c2 100644 --- a/cmd/zfs/zfs_main.c +++ b/cmd/zfs/zfs_main.c @@ -500,7 +500,7 @@ usage_prop_cb(int prop, void *cb) { FILE *fp = cb; - (void) fprintf(fp, "\t%-15s ", zfs_prop_to_name(prop)); + (void) fprintf(fp, "\t%-22s ", zfs_prop_to_name(prop)); if (zfs_prop_readonly(prop)) (void) fprintf(fp, " NO "); @@ -561,40 +561,40 @@ usage(boolean_t requested) (void) fprintf(fp, "%s", gettext("\nThe following properties are supported:\n")); - (void) fprintf(fp, "\n\t%-14s %s %s %s\n\n", + (void) fprintf(fp, "\n\t%-21s %s %s %s\n\n", "PROPERTY", "EDIT", "INHERIT", "VALUES"); /* Iterate over all properties */ (void) zprop_iter(usage_prop_cb, fp, B_FALSE, B_TRUE, ZFS_TYPE_DATASET); - (void) fprintf(fp, "\t%-15s ", 
"userused@..."); + (void) fprintf(fp, "\t%-22s ", "userused@..."); (void) fprintf(fp, " NO NO \n"); - (void) fprintf(fp, "\t%-15s ", "groupused@..."); + (void) fprintf(fp, "\t%-22s ", "groupused@..."); (void) fprintf(fp, " NO NO \n"); - (void) fprintf(fp, "\t%-15s ", "projectused@..."); + (void) fprintf(fp, "\t%-22s ", "projectused@..."); (void) fprintf(fp, " NO NO \n"); - (void) fprintf(fp, "\t%-15s ", "userobjused@..."); + (void) fprintf(fp, "\t%-22s ", "userobjused@..."); (void) fprintf(fp, " NO NO \n"); - (void) fprintf(fp, "\t%-15s ", "groupobjused@..."); + (void) fprintf(fp, "\t%-22s ", "groupobjused@..."); (void) fprintf(fp, " NO NO \n"); - (void) fprintf(fp, "\t%-15s ", "projectobjused@..."); + (void) fprintf(fp, "\t%-22s ", "projectobjused@..."); (void) fprintf(fp, " NO NO \n"); - (void) fprintf(fp, "\t%-15s ", "userquota@..."); + (void) fprintf(fp, "\t%-22s ", "userquota@..."); (void) fprintf(fp, "YES NO | none\n"); - (void) fprintf(fp, "\t%-15s ", "groupquota@..."); + (void) fprintf(fp, "\t%-22s ", "groupquota@..."); (void) fprintf(fp, "YES NO | none\n"); - (void) fprintf(fp, "\t%-15s ", "projectquota@..."); + (void) fprintf(fp, "\t%-22s ", "projectquota@..."); (void) fprintf(fp, "YES NO | none\n"); - (void) fprintf(fp, "\t%-15s ", "userobjquota@..."); + (void) fprintf(fp, "\t%-22s ", "userobjquota@..."); (void) fprintf(fp, "YES NO | none\n"); - (void) fprintf(fp, "\t%-15s ", "groupobjquota@..."); + (void) fprintf(fp, "\t%-22s ", "groupobjquota@..."); (void) fprintf(fp, "YES NO | none\n"); - (void) fprintf(fp, "\t%-15s ", "projectobjquota@..."); + (void) fprintf(fp, "\t%-22s ", "projectobjquota@..."); (void) fprintf(fp, "YES NO | none\n"); - (void) fprintf(fp, "\t%-15s ", "written@"); + (void) fprintf(fp, "\t%-22s ", "written@"); (void) fprintf(fp, " NO NO \n"); - (void) fprintf(fp, "\t%-15s ", "written#"); + (void) fprintf(fp, "\t%-22s ", "written#"); (void) fprintf(fp, " NO NO \n"); (void) fprintf(fp, gettext("\nSizes are specified in bytes " From 
219a89cbbfdcbdc30c3417b56ec3e81fd8370ee5 Mon Sep 17 00:00:00 2001 From: Umer Saleem Date: Fri, 20 Dec 2024 01:02:58 +0500 Subject: [PATCH 17/43] Skip iterating over snapshots for share properties Setting sharenfs and sharesmb properties on a dataset can become costly if there is a large number of snapshots, since setting the share properties iterates over all snapshots present for a dataset. If it is the root dataset for which we are trying to set the share property, snapshots for all child datasets and their children will also be iterated. There is no need to iterate over snapshots for share properties because we do not allow share properties, or any other property, to be set on a snapshot itself except for user properties. This commit skips iterating over snapshots for share properties, instead iterating over all child datasets and their children for share properties. Reviewed-by: Brian Behlendorf Reviewed-by: Alexander Motin Signed-off-by: Umer Saleem Closes #16877 --- lib/libzfs/libzfs_changelist.c | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) diff --git a/lib/libzfs/libzfs_changelist.c b/lib/libzfs/libzfs_changelist.c index 4db1cbce9568d..47df8663165e1 100644 --- a/lib/libzfs/libzfs_changelist.c +++ b/lib/libzfs/libzfs_changelist.c @@ -563,8 +563,15 @@ change_one(zfs_handle_t *zhp, void *data) cn = NULL; } - if (!clp->cl_alldependents) - ret = zfs_iter_children_v2(zhp, 0, change_one, data); + if (!clp->cl_alldependents) { + if (clp->cl_prop != ZFS_PROP_MOUNTPOINT) { + ret = zfs_iter_filesystems_v2(zhp, 0, + change_one, data); + } else { + ret = zfs_iter_children_v2(zhp, 0, change_one, + data); + } + } /* * If we added the handle to the changelist, we will re-use it @@ -738,6 +745,11 @@ changelist_gather(zfs_handle_t *zhp, zfs_prop_t prop, int gather_flags, changelist_free(clp); return (NULL); } + } else if (clp->cl_prop != ZFS_PROP_MOUNTPOINT) { + if (zfs_iter_filesystems_v2(zhp, 0, change_one, clp) != 0) { + changelist_free(clp); + return 
(NULL); + } } else if (zfs_iter_children_v2(zhp, 0, change_one, clp) != 0) { changelist_free(clp); return (NULL); From 1acd2469643f8543f2941cdc72f977d56521c4c9 Mon Sep 17 00:00:00 2001 From: Alexander Motin Date: Fri, 20 Dec 2024 17:25:35 -0500 Subject: [PATCH 18/43] Fix readonly check for vdev user properties VDEV_PROP_USERPROP is equal do VDEV_PROP_INVAL and so is not a real property. That's why vdev_prop_readonly() does not work right for it. In particular it may declare all vdev user properties readonly on FreeBSD. Reviewed-by: Brian Behlendorf Reviewed-by: Rob Norris Signed-off-by: Alexander Motin Sponsored by: iXsystems, Inc. Closes #16890 --- module/zfs/vdev.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/module/zfs/vdev.c b/module/zfs/vdev.c index 250590f062eaa..85b6ee32158d9 100644 --- a/module/zfs/vdev.c +++ b/module/zfs/vdev.c @@ -5969,7 +5969,7 @@ vdev_prop_set(vdev_t *vd, nvlist_t *innvl, nvlist_t *outnvl) goto end; } - if (vdev_prop_readonly(prop)) { + if (prop != VDEV_PROP_USERPROP && vdev_prop_readonly(prop)) { error = EROFS; goto end; } From c37a2ddaaabc203feaae987f62289b2cfe306818 Mon Sep 17 00:00:00 2001 From: Rob Norris Date: Fri, 27 Dec 2024 09:10:09 +1100 Subject: [PATCH 19/43] microzap: set hard upper limit of 1M The count of chunks in a microzap block is stored as an uint16_t (mze_chunkid). Each chunk is 64 bytes, and the first is used to store a header, so there are 32767 usable chunks, which is just under 2M. 1M is the largest power-2-rounded block size under 2M, so we must set the limit there. If it goes higher, the loop in mzap_addent can overflow and fall into the PANIC case. Sponsored-by: Klara, Inc. Sponsored-by: Wasabi Technology, Inc. 
Reviewed-by: Alexander Motin Signed-off-by: Rob Norris Closes #16888 --- module/zfs/zap_micro.c | 18 +++++++++++++++--- 1 file changed, 15 insertions(+), 3 deletions(-) diff --git a/module/zfs/zap_micro.c b/module/zfs/zap_micro.c index 55b60006e58ce..a9298d3e940e6 100644 --- a/module/zfs/zap_micro.c +++ b/module/zfs/zap_micro.c @@ -54,14 +54,25 @@ * machinery to understand not to try to split a microzap block). * * If large_microzap is enabled, this value will be clamped to - * spa_maxblocksize(). If not, it will be clamped to SPA_OLD_MAXBLOCKSIZE. + * spa_maxblocksize(), up to 1M. If not, it will be clamped to + * SPA_OLD_MAXBLOCKSIZE. */ static int zap_micro_max_size = SPA_OLD_MAXBLOCKSIZE; +/* + * The 1M upper limit is necessary because the count of chunks in a microzap + * block is stored as a uint16_t (mze_chunkid). Each chunk is 64 bytes, and the + * first is used to store a header, so there are 32767 usable chunks, which is + * just under 2M. 1M is the largest power-2-rounded block size under 2M, so we + * must set the limit there. 
+ */ +#define MZAP_MAX_SIZE (1048576) + uint64_t zap_get_micro_max_size(spa_t *spa) { - uint64_t maxsz = P2ROUNDUP(zap_micro_max_size, SPA_MINBLOCKSIZE); + uint64_t maxsz = MIN(MZAP_MAX_SIZE, + P2ROUNDUP(zap_micro_max_size, SPA_MINBLOCKSIZE)); if (maxsz <= SPA_OLD_MAXBLOCKSIZE) return (maxsz); if (spa_feature_is_enabled(spa, SPA_FEATURE_LARGE_MICROZAP)) @@ -2031,5 +2042,6 @@ EXPORT_SYMBOL(zap_cursor_init_serialized); EXPORT_SYMBOL(zap_get_stats); ZFS_MODULE_PARAM(zfs, , zap_micro_max_size, INT, ZMOD_RW, - "Maximum micro ZAP size, before converting to a fat ZAP, in bytes"); + "Maximum micro ZAP size before converting to a fat ZAP, " + "in bytes (max 1M)"); #endif From 89f796dec688ab706bb9905b57f7725dc5c2f3ee Mon Sep 17 00:00:00 2001 From: Alexander Motin Date: Fri, 27 Dec 2024 10:01:22 -0500 Subject: [PATCH 20/43] ZTS: Increase write sizes for RAIDZ/dRAID tests Many RAIDZ/dRAID tests filled files doing millions of 100 or even 10 byte writes. It makes very little sense since we are not micro-benchmarking syscalls or VFS layer here, while before the blocks reach the vdev layer absolute majority of the small writes will be aggregated. In some cases I see we spend almost as much time creating the test files as actually running the tests. And sometimes the tests even time out after that. Reviewed-by: Tony Hutter Reviewed-by: George Melikov Signed-off-by: Alexander Motin Sponsored by: iXsystems, Inc. 
Closes #16905 --- .../tests/functional/raidz/raidz_expand_001_pos.ksh | 6 +++--- .../tests/functional/raidz/raidz_expand_002_pos.ksh | 6 +++--- .../tests/functional/raidz/raidz_expand_003_neg.ksh | 2 +- .../tests/functional/raidz/raidz_expand_003_pos.ksh | 4 ++-- .../tests/functional/raidz/raidz_expand_004_pos.ksh | 4 ++-- .../tests/functional/raidz/raidz_expand_005_pos.ksh | 4 ++-- .../tests/functional/redundancy/redundancy_draid.ksh | 6 +++--- .../functional/redundancy/redundancy_draid_damaged1.ksh | 6 +++--- .../functional/redundancy/redundancy_draid_damaged2.ksh | 6 +++--- .../tests/functional/redundancy/redundancy_raidz.ksh | 6 +++--- 10 files changed, 25 insertions(+), 25 deletions(-) diff --git a/tests/zfs-tests/tests/functional/raidz/raidz_expand_001_pos.ksh b/tests/zfs-tests/tests/functional/raidz/raidz_expand_001_pos.ksh index d4923fdb67d9f..125b0e5411a3a 100755 --- a/tests/zfs-tests/tests/functional/raidz/raidz_expand_001_pos.ksh +++ b/tests/zfs-tests/tests/functional/raidz/raidz_expand_001_pos.ksh @@ -200,13 +200,13 @@ log_must zpool create -f -o cachefile=none $TESTPOOL $raid ${disks[@]} log_must zfs set primarycache=metadata $TESTPOOL log_must zfs create $TESTPOOL/fs -log_must fill_fs /$TESTPOOL/fs 1 512 100 1024 R +log_must fill_fs /$TESTPOOL/fs 1 512 102400 1 R log_must zfs create -o compress=on $TESTPOOL/fs2 -log_must fill_fs /$TESTPOOL/fs2 1 512 100 1024 R +log_must fill_fs /$TESTPOOL/fs2 1 512 102400 1 R log_must zfs create -o compress=on -o recordsize=8k $TESTPOOL/fs3 -log_must fill_fs /$TESTPOOL/fs3 1 512 100 1024 R +log_must fill_fs /$TESTPOOL/fs3 1 512 102400 1 R log_must check_pool_status $TESTPOOL "errors" "No known data errors" diff --git a/tests/zfs-tests/tests/functional/raidz/raidz_expand_002_pos.ksh b/tests/zfs-tests/tests/functional/raidz/raidz_expand_002_pos.ksh index 56810aca099f4..185316a7cb858 100755 --- a/tests/zfs-tests/tests/functional/raidz/raidz_expand_002_pos.ksh +++ 
b/tests/zfs-tests/tests/functional/raidz/raidz_expand_002_pos.ksh @@ -78,13 +78,13 @@ log_must zpool create -f $opts $pool $raid ${disks[1..$(($nparity+1))]} log_must zfs set primarycache=metadata $pool log_must zfs create $pool/fs -log_must fill_fs /$pool/fs 1 512 100 1024 R +log_must fill_fs /$pool/fs 1 512 102400 1 R log_must zfs create -o compress=on $pool/fs2 -log_must fill_fs /$pool/fs2 1 512 100 1024 R +log_must fill_fs /$pool/fs2 1 512 102400 1 R log_must zfs create -o compress=on -o recordsize=8k $pool/fs3 -log_must fill_fs /$pool/fs3 1 512 100 1024 R +log_must fill_fs /$pool/fs3 1 512 102400 1 R typeset pool_size=$(get_pool_prop size $pool) diff --git a/tests/zfs-tests/tests/functional/raidz/raidz_expand_003_neg.ksh b/tests/zfs-tests/tests/functional/raidz/raidz_expand_003_neg.ksh index 4d85c46897b89..a2eb87b1f7228 100755 --- a/tests/zfs-tests/tests/functional/raidz/raidz_expand_003_neg.ksh +++ b/tests/zfs-tests/tests/functional/raidz/raidz_expand_003_neg.ksh @@ -92,7 +92,7 @@ log_must zpool destroy $pool log_must zpool create -f $opts $pool $raid ${disks[1..$(($devs-1))]} log_must zfs set primarycache=metadata $pool log_must zfs create $pool/fs -log_must fill_fs /$pool/fs 1 512 100 1024 R +log_must fill_fs /$pool/fs 1 512 102400 1 R allocated=$(zpool list -Hp -o allocated $pool) log_must set_tunable64 RAIDZ_EXPAND_MAX_REFLOW_BYTES $((allocated / 4)) log_must zpool attach $pool ${raid}-0 ${disks[$devs]} diff --git a/tests/zfs-tests/tests/functional/raidz/raidz_expand_003_pos.ksh b/tests/zfs-tests/tests/functional/raidz/raidz_expand_003_pos.ksh index 712b252617738..6f852c516ca4c 100755 --- a/tests/zfs-tests/tests/functional/raidz/raidz_expand_003_pos.ksh +++ b/tests/zfs-tests/tests/functional/raidz/raidz_expand_003_pos.ksh @@ -94,10 +94,10 @@ opts="-o cachefile=none" log_must zpool create -f $opts $pool $raid ${disks[1..$(($nparity+1))]} log_must zfs create -o recordsize=8k $pool/fs -log_must fill_fs /$pool/fs 1 256 100 1024 R +log_must fill_fs /$pool/fs 1 
256 102400 1 R log_must zfs create -o recordsize=128k $pool/fs2 -log_must fill_fs /$pool/fs2 1 256 100 1024 R +log_must fill_fs /$pool/fs2 1 256 102400 1 R for disk in ${disks[$(($nparity+2))..$devs]}; do log_must mkfile -n 400m /$pool/fs/file diff --git a/tests/zfs-tests/tests/functional/raidz/raidz_expand_004_pos.ksh b/tests/zfs-tests/tests/functional/raidz/raidz_expand_004_pos.ksh index 2be55dae42541..5056e4e4b1fd5 100755 --- a/tests/zfs-tests/tests/functional/raidz/raidz_expand_004_pos.ksh +++ b/tests/zfs-tests/tests/functional/raidz/raidz_expand_004_pos.ksh @@ -81,10 +81,10 @@ log_must set_tunable32 SCRUB_AFTER_EXPAND 0 log_must zpool create -f $opts $pool $raid ${disks[1..$(($nparity+1))]} log_must zfs create -o recordsize=8k $pool/fs -log_must fill_fs /$pool/fs 1 128 100 1024 R +log_must fill_fs /$pool/fs 1 128 102400 1 R log_must zfs create -o recordsize=128k $pool/fs2 -log_must fill_fs /$pool/fs2 1 128 100 1024 R +log_must fill_fs /$pool/fs2 1 128 102400 1 R for disk in ${disks[$(($nparity+2))..$devs]}; do log_must zpool attach $pool ${raid}-0 $disk diff --git a/tests/zfs-tests/tests/functional/raidz/raidz_expand_005_pos.ksh b/tests/zfs-tests/tests/functional/raidz/raidz_expand_005_pos.ksh index 56ee3e9be67c6..49b9f6c1d353f 100755 --- a/tests/zfs-tests/tests/functional/raidz/raidz_expand_005_pos.ksh +++ b/tests/zfs-tests/tests/functional/raidz/raidz_expand_005_pos.ksh @@ -137,10 +137,10 @@ log_must zpool create -f $opts $pool $raid ${disks[1..$(($nparity+1))]} devices="${disks[1..$(($nparity+1))]}" log_must zfs create -o recordsize=8k $pool/fs -log_must fill_fs /$pool/fs 1 128 100 1024 R +log_must fill_fs /$pool/fs 1 128 102400 1 R log_must zfs create -o recordsize=128k $pool/fs2 -log_must fill_fs /$pool/fs2 1 128 100 1024 R +log_must fill_fs /$pool/fs2 1 128 102400 1 R for disk in ${disks[$(($nparity+2))..$devs]}; do # Set pause to some random value near halfway point diff --git a/tests/zfs-tests/tests/functional/redundancy/redundancy_draid.ksh 
b/tests/zfs-tests/tests/functional/redundancy/redundancy_draid.ksh index 8208d2b4a3981..df113a98aa3c8 100755 --- a/tests/zfs-tests/tests/functional/redundancy/redundancy_draid.ksh +++ b/tests/zfs-tests/tests/functional/redundancy/redundancy_draid.ksh @@ -223,13 +223,13 @@ for nparity in 1 2 3; do log_must zfs set primarycache=metadata $TESTPOOL log_must zfs create $TESTPOOL/fs - log_must fill_fs /$TESTPOOL/fs 1 512 100 1024 R + log_must fill_fs /$TESTPOOL/fs 1 512 102400 1 R log_must zfs create -o compress=on $TESTPOOL/fs2 - log_must fill_fs /$TESTPOOL/fs2 1 512 100 1024 R + log_must fill_fs /$TESTPOOL/fs2 1 512 102400 1 R log_must zfs create -o compress=on -o recordsize=8k $TESTPOOL/fs3 - log_must fill_fs /$TESTPOOL/fs3 1 512 100 1024 R + log_must fill_fs /$TESTPOOL/fs3 1 512 102400 1 R typeset pool_size=$(get_pool_prop size $TESTPOOL) diff --git a/tests/zfs-tests/tests/functional/redundancy/redundancy_draid_damaged1.ksh b/tests/zfs-tests/tests/functional/redundancy/redundancy_draid_damaged1.ksh index 110c69159eb1b..50d7358411dc0 100755 --- a/tests/zfs-tests/tests/functional/redundancy/redundancy_draid_damaged1.ksh +++ b/tests/zfs-tests/tests/functional/redundancy/redundancy_draid_damaged1.ksh @@ -119,13 +119,13 @@ for nparity in 1 2 3; do log_must zfs set primarycache=metadata $TESTPOOL log_must zfs create $TESTPOOL/fs - log_must fill_fs /$TESTPOOL/fs 1 512 100 1024 R + log_must fill_fs /$TESTPOOL/fs 1 512 102400 1 R log_must zfs create -o compress=on $TESTPOOL/fs2 - log_must fill_fs /$TESTPOOL/fs2 1 512 100 1024 R + log_must fill_fs /$TESTPOOL/fs2 1 512 102400 1 R log_must zfs create -o compress=on -o recordsize=8k $TESTPOOL/fs3 - log_must fill_fs /$TESTPOOL/fs3 1 512 100 1024 R + log_must fill_fs /$TESTPOOL/fs3 1 512 102400 1 R log_must zpool export $TESTPOOL log_must zpool import -o cachefile=none -d $dir $TESTPOOL diff --git a/tests/zfs-tests/tests/functional/redundancy/redundancy_draid_damaged2.ksh 
b/tests/zfs-tests/tests/functional/redundancy/redundancy_draid_damaged2.ksh index b0bb4ef841298..ad66f86339864 100755 --- a/tests/zfs-tests/tests/functional/redundancy/redundancy_draid_damaged2.ksh +++ b/tests/zfs-tests/tests/functional/redundancy/redundancy_draid_damaged2.ksh @@ -94,13 +94,13 @@ for nparity in 1 2 3; do # log_must zfs set primarycache=metadata $TESTPOOL log_must zfs create $TESTPOOL/fs - log_must fill_fs /$TESTPOOL/fs 1 256 10 1024 R + log_must fill_fs /$TESTPOOL/fs 1 256 10240 1 R log_must zfs create -o compress=on $TESTPOOL/fs2 - log_must fill_fs /$TESTPOOL/fs2 1 256 10 1024 R + log_must fill_fs /$TESTPOOL/fs2 1 256 10240 1 R log_must zfs create -o compress=on -o recordsize=8k $TESTPOOL/fs3 - log_must fill_fs /$TESTPOOL/fs3 1 256 10 1024 R + log_must fill_fs /$TESTPOOL/fs3 1 256 10240 1 R log_must zpool export $TESTPOOL log_must zpool import -o cachefile=none -d $dir $TESTPOOL diff --git a/tests/zfs-tests/tests/functional/redundancy/redundancy_raidz.ksh b/tests/zfs-tests/tests/functional/redundancy/redundancy_raidz.ksh index 83cacda84b09b..7de35c947fec9 100755 --- a/tests/zfs-tests/tests/functional/redundancy/redundancy_raidz.ksh +++ b/tests/zfs-tests/tests/functional/redundancy/redundancy_raidz.ksh @@ -223,13 +223,13 @@ for nparity in 1 2 3; do log_must zfs set primarycache=metadata $TESTPOOL log_must zfs create $TESTPOOL/fs - log_must fill_fs /$TESTPOOL/fs 1 512 100 1024 R + log_must fill_fs /$TESTPOOL/fs 1 512 102400 1 R log_must zfs create -o compress=on $TESTPOOL/fs2 - log_must fill_fs /$TESTPOOL/fs2 1 512 100 1024 R + log_must fill_fs /$TESTPOOL/fs2 1 512 102400 1 R log_must zfs create -o compress=on -o recordsize=8k $TESTPOOL/fs3 - log_must fill_fs /$TESTPOOL/fs3 1 512 100 1024 R + log_must fill_fs /$TESTPOOL/fs3 1 512 102400 1 R typeset pool_size=$(get_pool_prop size $TESTPOOL) From 779c5a5debf219b87cd335852ad060fa79acf3e8 Mon Sep 17 00:00:00 2001 From: Rob Norris Date: Thu, 19 Dec 2024 20:11:54 +1100 Subject: [PATCH 21/43] 
zpool_get_vdev_prop_value: show missing vdev userprops If a vdev userprop is not found, present it as value '-', default source, so it matches the output from pool userprops. Sponsored-by: Klara, Inc. Sponsored-by: Wasabi Technology, Inc. Reviewed-by: Brian Behlendorf Reviewed-by: Alexander Motin Signed-off-by: Rob Norris Closes #16887 --- lib/libzfs/libzfs_pool.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/lib/libzfs/libzfs_pool.c b/lib/libzfs/libzfs_pool.c index f256535e8ea04..64f9d1f6eb49b 100644 --- a/lib/libzfs/libzfs_pool.c +++ b/lib/libzfs/libzfs_pool.c @@ -5342,7 +5342,8 @@ zpool_get_vdev_prop_value(nvlist_t *nvprop, vdev_prop_t prop, char *prop_name, strval = fnvlist_lookup_string(nv, ZPROP_VALUE); } else { /* user prop not found */ - return (-1); + src = ZPROP_SRC_DEFAULT; + strval = "-"; } (void) strlcpy(buf, strval, len); if (srctype) From 03b7cfdef38aed0966fb69510b1ce1a4bc3f2bde Mon Sep 17 00:00:00 2001 From: Rob Norris Date: Thu, 19 Dec 2024 20:11:54 +1100 Subject: [PATCH 22/43] spa_sync_props: remove pool userprops by setting empty-string People have noted there's no way to remove a pool userprop, only zero it. Turns out vdev userprops had a method, by setting empty-string. So this makes pool userprops follow the same behaviour. Sponsored-by: Klara, Inc. Sponsored-by: Wasabi Technology, Inc. 
Reviewed-by: Brian Behlendorf Reviewed-by: Alexander Motin Signed-off-by: Rob Norris Closes #16887 --- module/zfs/spa.c | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/module/zfs/spa.c b/module/zfs/spa.c index b83c982c13fd6..c93c7945f192f 100644 --- a/module/zfs/spa.c +++ b/module/zfs/spa.c @@ -9683,9 +9683,17 @@ spa_sync_props(void *arg, dmu_tx_t *tx) if (nvpair_type(elem) == DATA_TYPE_STRING) { ASSERT(proptype == PROP_TYPE_STRING); strval = fnvpair_value_string(elem); - VERIFY0(zap_update(mos, - spa->spa_pool_props_object, propname, - 1, strlen(strval) + 1, strval, tx)); + if (strlen(strval) == 0) { + /* remove the property if value == "" */ + (void) zap_remove(mos, + spa->spa_pool_props_object, + propname, tx); + } else { + VERIFY0(zap_update(mos, + spa->spa_pool_props_object, + propname, 1, strlen(strval) + 1, + strval, tx)); + } spa_history_log_internal(spa, "set", tx, "%s=%s", elemname, strval); } else if (nvpair_type(elem) == DATA_TYPE_UINT64) { From c4e5fa5e175f19f44596cc43229f09ca58cd68ce Mon Sep 17 00:00:00 2001 From: Rob Norris Date: Thu, 19 Dec 2024 20:11:54 +1100 Subject: [PATCH 23/43] ZTS: test clearing pool and vdev userprops Confirming that clearing pool and vdev userprops produce the same result: an empty value, with default source. Sponsored-by: Klara, Inc. Sponsored-by: Wasabi Technology, Inc. 
Reviewed-by: Brian Behlendorf Reviewed-by: Alexander Motin Signed-off-by: Rob Norris Closes #16887 --- tests/runfiles/common.run | 3 +- tests/zfs-tests/tests/Makefile.am | 1 + .../zpool_set/zpool_set_clear_userprop.ksh | 44 +++++++++++++++++++ .../zpool_set/zpool_set_common.kshlib | 40 ++++++++++++++++- 4 files changed, 85 insertions(+), 3 deletions(-) create mode 100755 tests/zfs-tests/tests/functional/cli_root/zpool_set/zpool_set_clear_userprop.ksh diff --git a/tests/runfiles/common.run b/tests/runfiles/common.run index a69d36df2f984..6abb3b4213bb3 100644 --- a/tests/runfiles/common.run +++ b/tests/runfiles/common.run @@ -544,7 +544,8 @@ tags = ['functional', 'cli_root', 'zpool_scrub'] [tests/functional/cli_root/zpool_set] tests = ['zpool_set_001_pos', 'zpool_set_002_neg', 'zpool_set_003_neg', 'zpool_set_ashift', 'zpool_set_features', 'vdev_set_001_pos', - 'user_property_001_pos', 'user_property_002_neg'] + 'user_property_001_pos', 'user_property_002_neg', + 'zpool_set_clear_userprop'] tags = ['functional', 'cli_root', 'zpool_set'] [tests/functional/cli_root/zpool_split] diff --git a/tests/zfs-tests/tests/Makefile.am b/tests/zfs-tests/tests/Makefile.am index 67630cb564ae9..588249be45da7 100644 --- a/tests/zfs-tests/tests/Makefile.am +++ b/tests/zfs-tests/tests/Makefile.am @@ -1243,6 +1243,7 @@ nobase_dist_datadir_zfs_tests_tests_SCRIPTS += \ functional/cli_root/zpool_set/user_property_001_pos.ksh \ functional/cli_root/zpool_set/user_property_002_neg.ksh \ functional/cli_root/zpool_set/zpool_set_features.ksh \ + functional/cli_root/zpool_set/zpool_set_clear_userprop.ksh \ functional/cli_root/zpool_split/cleanup.ksh \ functional/cli_root/zpool_split/setup.ksh \ functional/cli_root/zpool_split/zpool_split_cliargs.ksh \ diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_set/zpool_set_clear_userprop.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_set/zpool_set_clear_userprop.ksh new file mode 100755 index 0000000000000..d9395ea8a15be --- /dev/null +++ 
b/tests/zfs-tests/tests/functional/cli_root/zpool_set/zpool_set_clear_userprop.ksh @@ -0,0 +1,44 @@ +#!/bin/ksh -p +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or https://opensource.org/licenses/CDDL-1.0. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright (c) 2024, Klara, Inc. +# + +. $STF_SUITE/tests/functional/cli_root/zpool_set/zpool_set_common.kshlib + +verify_runnable "both" + +log_assert "Setting a user-defined property to the empty string removes it." +log_onexit cleanup_user_prop $TESTPOOL + +log_must zpool set cool:pool=hello $TESTPOOL +log_must check_user_prop $TESTPOOL cool:pool hello local +log_must zpool set cool:pool= $TESTPOOL +log_must check_user_prop $TESTPOOL cool:pool '-' default + +log_must zpool set cool:vdev=goodbye $TESTPOOL root +log_must check_vdev_user_prop $TESTPOOL root cool:vdev goodbye local +log_must zpool set cool:vdev= $TESTPOOL root +log_must check_vdev_user_prop $TESTPOOL root cool:vdev '-' default + +log_pass "Setting a user-defined property to the empty string removes it." 
diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_set/zpool_set_common.kshlib b/tests/zfs-tests/tests/functional/cli_root/zpool_set/zpool_set_common.kshlib index 346e4a16b2ad9..e095d315c2b5b 100644 --- a/tests/zfs-tests/tests/functional/cli_root/zpool_set/zpool_set_common.kshlib +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_set/zpool_set_common.kshlib @@ -160,19 +160,55 @@ function user_property_value random_string ALL_CHAR $len } +function _check_user_prop +{ + typeset pool="$1" + typeset vdev="$2" + typeset user_prop="$3" + typeset expect_value="$4" + typeset expect_source="$5" + + typeset -a \ + v=($(zpool get -p -H -o value,source "$user_prop" $pool $vdev 2>&1)) + + [[ "$expect_value" == "${v[0]}" && \ + -z "$expect_source" || "$expect_source" == "${v[1]}" ]] +} + # # Check if the user-defined property is identical to the expected value. # # $1 pool # $2 user property # $3 expected value +# $4 expected source (optional) # function check_user_prop { typeset pool=$1 typeset user_prop="$2" typeset expect_value="$3" - typeset value=$(zpool get -p -H -o value "$user_prop" $pool 2>&1) + typeset expect_source="${4:-}" + + _check_user_prop $pool '' $user_prop $expect_value $expect_source +} + +# +# Check if the user-defined property is identical to the expected value. +# +# $1 pool +# $2 vdev +# $3 user property +# $4 expected value +# $5 expected source (optional) +# +function check_vdev_user_prop +{ + typeset pool="$1" + typeset vdev="$2" + typeset user_prop="$3" + typeset expect_value="$4" + typeset expect_source="${5:-}" - [ "$expect_value" = "$value" ] + _check_user_prop $pool $vdev $user_prop $expect_value $expect_source } From 8bf1e83eefa4f4ed74505f4a9ba4c3720c73185c Mon Sep 17 00:00:00 2001 From: Alexander Motin Date: Fri, 20 Dec 2024 10:54:35 -0500 Subject: [PATCH 24/43] ZTS: Remove non-standard awk hex numbers usage FreeBSD recently removed non-standard hex numbers support from awk. Neither it supports -n argument, enabling it in gawk. 
Instead of depending on those rewrite list_file_blocks() function to handle the hex math in shell instead of awk. Reviewed-by: Brian Behlendorf Reviewed-by: Tony Hutter Reviewed-by: Tino Reichardt Signed-off-by: Alexander Motin Sponsored by: iXsystems, Inc. Closes #11141 --- tests/zfs-tests/include/blkdev.shlib | 19 ++++--------------- 1 file changed, 4 insertions(+), 15 deletions(-) diff --git a/tests/zfs-tests/include/blkdev.shlib b/tests/zfs-tests/include/blkdev.shlib index 51eff3023e732..5b505f9252863 100644 --- a/tests/zfs-tests/include/blkdev.shlib +++ b/tests/zfs-tests/include/blkdev.shlib @@ -556,27 +556,15 @@ function list_file_blocks # input_file # 512B blocks for ease of use with dd. # typeset level vdev path offset length - if awk -n '' 2>/dev/null; then - # gawk needs -n to decode hex - AWK='awk -n' - else - AWK='awk' - fi sync_all_pools true - zdb -dddddd $ds $objnum | $AWK -v pad=$((4<<20)) -v bs=512 ' + zdb -dddddd $ds $objnum | awk ' /^$/ { looking = 0 } looking { level = $2 field = 3 while (split($field, dva, ":") == 3) { - # top level vdev id - vdev = int(dva[1]) - # offset + 4M label/boot pad in 512B blocks - offset = (int("0x"dva[2]) + pad) / bs - # length in 512B blocks - len = int("0x"dva[3]) / bs - print level, vdev, offset, len + print level, int(dva[1]), "0x"dva[2], "0x"dva[3] ++field } @@ -585,7 +573,8 @@ function list_file_blocks # input_file ' | \ while read level vdev offset length; do for path in ${VDEV_MAP[$vdev][@]}; do - echo "$level $path $offset $length" + echo "$level $path $(( ($offset + (4<<20)) / 512 ))" \ + "$(( $length / 512 ))" done done 2>/dev/null } From b66d910113921fbe6a9c67c2ba3e607820c1966b Mon Sep 17 00:00:00 2001 From: Alexander Motin Date: Wed, 25 Dec 2024 14:00:38 -0500 Subject: [PATCH 25/43] ZTS: Remove procfs use from zpool_import_status procfs might be not mounted on FreeBSD. Plus checking for specific PID might be not exactly reliable. Check for empty list of jobs instead. 
Premature loop exit can result in failed test and failed cleanup, failing also some following tests. Reviewed-by: Brian Behlendorf Reviewed-by: Tony Hutter Reviewed-by: Tino Reichardt Signed-off-by: Alexander Motin Sponsored by: iXsystems, Inc. Closes #11141 --- .../cli_root/zpool_import/zpool_import_status.ksh | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_import/zpool_import_status.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_import/zpool_import_status.ksh index c96961bf6419b..679362bbef505 100755 --- a/tests/zfs-tests/tests/functional/cli_root/zpool_import/zpool_import_status.ksh +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_import/zpool_import_status.ksh @@ -103,21 +103,16 @@ log_must zpool export $TESTPOOL1 log_must set_tunable64 METASLAB_DEBUG_LOAD 1 log_note "Starting zpool import in background at" $(date +'%H:%M:%S') zpool import -d $DEVICE_DIR -f $guid & -pid=$! # # capture progress until import is finished # -log_note waiting for pid $pid to exit kstat import_progress -while [[ -d /proc/"$pid" ]]; do +while [[ -n $(jobs) ]]; do line=$(kstat import_progress | grep -v pool_guid) if [[ -n $line ]]; then echo $line fi - if [[ -f /$TESTPOOL1/fs/00 ]]; then - break; - fi sleep 0.0001 done log_note "zpool import completed at" $(date +'%H:%M:%S') From a153397f414a1a22f2c54213e83afa092712faf3 Mon Sep 17 00:00:00 2001 From: Alexander Motin Date: Wed, 25 Dec 2024 19:42:44 -0500 Subject: [PATCH 26/43] ZTS: Reduce file size in redacted_panic to 1GB This test takes 3 minutes on RELEASE FreeBSD bots, but on CURRENT, probably due to debugging it has in kernel, it does not complete within 10 minutes, ending up killed. As I see all the redacting here happens within the first ~128MB of the file, so I hope it won't matter if there is 1GB of data instead of 2GB. 
Reviewed-by: Brian Behlendorf Reviewed-by: Tony Hutter Reviewed-by: Tino Reichardt Signed-off-by: Alexander Motin Sponsored by: iXsystems, Inc. Closes #11141 --- .../zfs-tests/tests/functional/redacted_send/redacted_panic.ksh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/zfs-tests/tests/functional/redacted_send/redacted_panic.ksh b/tests/zfs-tests/tests/functional/redacted_send/redacted_panic.ksh index 032d1fb91a2ee..a2438c2cd7316 100755 --- a/tests/zfs-tests/tests/functional/redacted_send/redacted_panic.ksh +++ b/tests/zfs-tests/tests/functional/redacted_send/redacted_panic.ksh @@ -39,7 +39,7 @@ function cleanup log_onexit cleanup log_must zfs create -o recsize=8k $sendfs -log_must dd if=/dev/urandom of=/$sendfs/file bs=1024k count=2048 +log_must dd if=/dev/urandom of=/$sendfs/file bs=1024k count=1024 log_must zfs snapshot $sendfs@init log_must zfs clone $sendfs@init $clone log_must stride_dd -i /dev/urandom -o /$clone/file -b 8192 -s 2 -c 7226 From 9dd5fe1095df246299df9a953a9c4994142636c2 Mon Sep 17 00:00:00 2001 From: Ameer Hamza Date: Mon, 30 Dec 2024 00:41:30 +0500 Subject: [PATCH 27/43] zvol: implement platform-independent part of block cloning In Linux, block devices currently lack support for `copy_file_range` API because the kernel does not provide the necessary functionality. However, there is an ongoing upstream effort to address this limitation: https://patchwork.kernel.org/project/dm-devel/cover/20240520102033.9361-1-nj.shetty@samsung.com/. We have adopted this upstream kernel patch into the TrueNAS kernel and made some additional modifications to enable block cloning specifically for the zvol block device. This patch implements the platform- independent portions of these changes for inclusion in OpenZFS. This patch does not introduce any new functionality directly into OpenZFS. The `TX_CLONE_RANGE` replay capability is only relevant when zvols are migrated to non-TrueNAS systems that support Clone Range replay in the ZIL. 
Reviewed-by: Brian Behlendorf Reviewed-by: Alexander Motin Signed-off-by: Ameer Hamza Closes #16901 --- include/sys/zvol_impl.h | 5 + module/zfs/zfs_vnops.c | 2 +- module/zfs/zvol.c | 284 +++++++++++++++++++++++++++++++++++++++- 3 files changed, 289 insertions(+), 2 deletions(-) diff --git a/include/sys/zvol_impl.h b/include/sys/zvol_impl.h index 3cd0d78c353d1..a8168850023a6 100644 --- a/include/sys/zvol_impl.h +++ b/include/sys/zvol_impl.h @@ -88,6 +88,11 @@ int zvol_get_data(void *arg, uint64_t arg2, lr_write_t *lr, char *buf, int zvol_init_impl(void); void zvol_fini_impl(void); void zvol_wait_close(zvol_state_t *zv); +int zvol_clone_range(zvol_state_handle_t *, uint64_t, + zvol_state_handle_t *, uint64_t, uint64_t); +void zvol_log_clone_range(zilog_t *zilog, dmu_tx_t *tx, int txtype, + uint64_t off, uint64_t len, uint64_t blksz, const blkptr_t *bps, + size_t nbps); /* * platform dependent functions exported to platform independent code diff --git a/module/zfs/zfs_vnops.c b/module/zfs/zfs_vnops.c index c01a9cf5d0b2e..b789d1ed52397 100644 --- a/module/zfs/zfs_vnops.c +++ b/module/zfs/zfs_vnops.c @@ -71,7 +71,7 @@ int zfs_bclone_enabled = 1; * a copy of the file and is therefore not the default. However, in certain * scenarios this behavior may be desirable so a tunable is provided. */ -static int zfs_bclone_wait_dirty = 0; +int zfs_bclone_wait_dirty = 0; /* * Enable Direct I/O. 
If this setting is 0, then all I/O requests will be diff --git a/module/zfs/zvol.c b/module/zfs/zvol.c index fec595b2c4c51..14a6219d19cd6 100644 --- a/module/zfs/zvol.c +++ b/module/zfs/zvol.c @@ -93,6 +93,7 @@ unsigned int zvol_volmode = ZFS_VOLMODE_GEOM; struct hlist_head *zvol_htable; static list_t zvol_state_list; krwlock_t zvol_state_lock; +extern int zfs_bclone_wait_dirty; typedef enum { ZVOL_ASYNC_REMOVE_MINORS, @@ -516,6 +517,285 @@ zvol_replay_write(void *arg1, void *arg2, boolean_t byteswap) return (error); } +/* + * Replay a TX_CLONE_RANGE ZIL transaction that didn't get committed + * after a system failure + */ +static int +zvol_replay_clone_range(void *arg1, void *arg2, boolean_t byteswap) +{ + zvol_state_t *zv = arg1; + lr_clone_range_t *lr = arg2; + objset_t *os = zv->zv_objset; + dmu_tx_t *tx; + int error; + uint64_t blksz; + uint64_t off; + uint64_t len; + + ASSERT3U(lr->lr_common.lrc_reclen, >=, sizeof (*lr)); + ASSERT3U(lr->lr_common.lrc_reclen, >=, offsetof(lr_clone_range_t, + lr_bps[lr->lr_nbps])); + + if (byteswap) + byteswap_uint64_array(lr, sizeof (*lr)); + + ASSERT(spa_feature_is_enabled(dmu_objset_spa(os), + SPA_FEATURE_BLOCK_CLONING)); + + off = lr->lr_offset; + len = lr->lr_length; + blksz = lr->lr_blksz; + + if ((off % blksz) != 0) { + return (SET_ERROR(EINVAL)); + } + + error = dnode_hold(os, ZVOL_OBJ, zv, &zv->zv_dn); + if (error != 0 || !zv->zv_dn) + return (error); + tx = dmu_tx_create(os); + dmu_tx_hold_clone_by_dnode(tx, zv->zv_dn, off, len); + error = dmu_tx_assign(tx, TXG_WAIT); + if (error != 0) { + dmu_tx_abort(tx); + goto out; + } + error = dmu_brt_clone(zv->zv_objset, ZVOL_OBJ, off, len, + tx, lr->lr_bps, lr->lr_nbps); + if (error != 0) { + dmu_tx_commit(tx); + goto out; + } + + /* + * zil_replaying() not only check if we are replaying ZIL, but also + * updates the ZIL header to record replay progress. 
+ */ + VERIFY(zil_replaying(zv->zv_zilog, tx)); + dmu_tx_commit(tx); + +out: + dnode_rele(zv->zv_dn, zv); + zv->zv_dn = NULL; + return (error); +} + +int +zvol_clone_range(zvol_state_t *zv_src, uint64_t inoff, zvol_state_t *zv_dst, + uint64_t outoff, uint64_t len) +{ + zilog_t *zilog_dst; + zfs_locked_range_t *inlr, *outlr; + objset_t *inos, *outos; + dmu_tx_t *tx; + blkptr_t *bps; + size_t maxblocks; + int error = EINVAL; + + rw_enter(&zv_dst->zv_suspend_lock, RW_READER); + if (zv_dst->zv_zilog == NULL) { + rw_exit(&zv_dst->zv_suspend_lock); + rw_enter(&zv_dst->zv_suspend_lock, RW_WRITER); + if (zv_dst->zv_zilog == NULL) { + zv_dst->zv_zilog = zil_open(zv_dst->zv_objset, + zvol_get_data, &zv_dst->zv_kstat.dk_zil_sums); + zv_dst->zv_flags |= ZVOL_WRITTEN_TO; + VERIFY0((zv_dst->zv_zilog->zl_header->zh_flags & + ZIL_REPLAY_NEEDED)); + } + rw_downgrade(&zv_dst->zv_suspend_lock); + } + if (zv_src != zv_dst) + rw_enter(&zv_src->zv_suspend_lock, RW_READER); + + inos = zv_src->zv_objset; + outos = zv_dst->zv_objset; + + /* + * Sanity checks + */ + if (!spa_feature_is_enabled(dmu_objset_spa(outos), + SPA_FEATURE_BLOCK_CLONING)) { + error = EOPNOTSUPP; + goto out; + } + if (dmu_objset_spa(inos) != dmu_objset_spa(outos)) { + error = EXDEV; + goto out; + } + if (inos->os_encrypted != outos->os_encrypted) { + error = EXDEV; + goto out; + } + if (zv_src->zv_volblocksize != zv_dst->zv_volblocksize) { + error = EINVAL; + goto out; + } + if (inoff >= zv_src->zv_volsize || outoff >= zv_dst->zv_volsize) { + error = 0; + goto out; + } + + /* + * Do not read beyond boundary + */ + if (len > zv_src->zv_volsize - inoff) + len = zv_src->zv_volsize - inoff; + if (len > zv_dst->zv_volsize - outoff) + len = zv_dst->zv_volsize - outoff; + if (len == 0) { + error = 0; + goto out; + } + + /* + * No overlapping if we are cloning within the same file + */ + if (zv_src == zv_dst) { + if (inoff < outoff + len && outoff < inoff + len) { + error = EINVAL; + goto out; + } + } + + /* + * Offsets and 
length must be at block boundaries + */ + if ((inoff % zv_src->zv_volblocksize) != 0 || + (outoff % zv_dst->zv_volblocksize) != 0) { + error = EINVAL; + goto out; + } + + /* + * Length must be multiple of block size + */ + if ((len % zv_src->zv_volblocksize) != 0) { + error = EINVAL; + goto out; + } + + zilog_dst = zv_dst->zv_zilog; + maxblocks = zil_max_log_data(zilog_dst, sizeof (lr_clone_range_t)) / + sizeof (bps[0]); + bps = vmem_alloc(sizeof (bps[0]) * maxblocks, KM_SLEEP); + /* + * Maintain predictable lock order. + */ + if (zv_src < zv_dst || (zv_src == zv_dst && inoff < outoff)) { + inlr = zfs_rangelock_enter(&zv_src->zv_rangelock, inoff, len, + RL_READER); + outlr = zfs_rangelock_enter(&zv_dst->zv_rangelock, outoff, len, + RL_WRITER); + } else { + outlr = zfs_rangelock_enter(&zv_dst->zv_rangelock, outoff, len, + RL_WRITER); + inlr = zfs_rangelock_enter(&zv_src->zv_rangelock, inoff, len, + RL_READER); + } + + while (len > 0) { + uint64_t size, last_synced_txg; + size_t nbps = maxblocks; + size = MIN(zv_src->zv_volblocksize * maxblocks, len); + last_synced_txg = spa_last_synced_txg( + dmu_objset_spa(zv_src->zv_objset)); + error = dmu_read_l0_bps(zv_src->zv_objset, ZVOL_OBJ, inoff, + size, bps, &nbps); + if (error != 0) { + /* + * If we are trying to clone a block that was created + * in the current transaction group, the error will be + * EAGAIN here. Based on zfs_bclone_wait_dirty either + * return a shortened range to the caller so it can + * fallback, or wait for the next TXG and check again. 
+ */ + if (error == EAGAIN && zfs_bclone_wait_dirty) { + txg_wait_synced(dmu_objset_pool + (zv_src->zv_objset), last_synced_txg + 1); + continue; + } + break; + } + + tx = dmu_tx_create(zv_dst->zv_objset); + dmu_tx_hold_clone_by_dnode(tx, zv_dst->zv_dn, outoff, size); + error = dmu_tx_assign(tx, TXG_WAIT); + if (error != 0) { + dmu_tx_abort(tx); + break; + } + error = dmu_brt_clone(zv_dst->zv_objset, ZVOL_OBJ, outoff, size, + tx, bps, nbps); + if (error != 0) { + dmu_tx_commit(tx); + break; + } + zvol_log_clone_range(zilog_dst, tx, TX_CLONE_RANGE, outoff, + size, zv_src->zv_volblocksize, bps, nbps); + dmu_tx_commit(tx); + inoff += size; + outoff += size; + len -= size; + } + vmem_free(bps, sizeof (bps[0]) * maxblocks); + zfs_rangelock_exit(outlr); + zfs_rangelock_exit(inlr); + if (error == 0 && zv_dst->zv_objset->os_sync == ZFS_SYNC_ALWAYS) { + zil_commit(zilog_dst, ZVOL_OBJ); + } +out: + if (zv_src != zv_dst) + rw_exit(&zv_src->zv_suspend_lock); + rw_exit(&zv_dst->zv_suspend_lock); + return (SET_ERROR(error)); +} + +/* + * Handles TX_CLONE_RANGE transactions. 
+ */ +void +zvol_log_clone_range(zilog_t *zilog, dmu_tx_t *tx, int txtype, uint64_t off, + uint64_t len, uint64_t blksz, const blkptr_t *bps, size_t nbps) +{ + itx_t *itx; + lr_clone_range_t *lr; + uint64_t partlen, max_log_data; + size_t partnbps; + + if (zil_replaying(zilog, tx)) + return; + + max_log_data = zil_max_log_data(zilog, sizeof (lr_clone_range_t)); + + while (nbps > 0) { + partnbps = MIN(nbps, max_log_data / sizeof (bps[0])); + partlen = partnbps * blksz; + ASSERT3U(partlen, <, len + blksz); + partlen = MIN(partlen, len); + + itx = zil_itx_create(txtype, + sizeof (*lr) + sizeof (bps[0]) * partnbps); + lr = (lr_clone_range_t *)&itx->itx_lr; + lr->lr_foid = ZVOL_OBJ; + lr->lr_offset = off; + lr->lr_length = partlen; + lr->lr_blksz = blksz; + lr->lr_nbps = partnbps; + memcpy(lr->lr_bps, bps, sizeof (bps[0]) * partnbps); + + zil_itx_assign(zilog, itx, tx); + + bps += partnbps; + ASSERT3U(nbps, >=, partnbps); + nbps -= partnbps; + off += partlen; + ASSERT3U(len, >=, partlen); + len -= partlen; + } +} + static int zvol_replay_err(void *arg1, void *arg2, boolean_t byteswap) { @@ -540,7 +820,9 @@ zil_replay_func_t *const zvol_replay_vector[TX_MAX_TYPE] = { zvol_replay_write, /* TX_WRITE */ zvol_replay_truncate, /* TX_TRUNCATE */ zvol_replay_err, /* TX_SETATTR */ + zvol_replay_err, /* TX_ACL_V0 */ zvol_replay_err, /* TX_ACL */ + zvol_replay_err, /* TX_CREATE_ACL */ zvol_replay_err, /* TX_CREATE_ATTR */ zvol_replay_err, /* TX_CREATE_ACL_ATTR */ zvol_replay_err, /* TX_MKDIR_ACL */ @@ -550,7 +832,7 @@ zil_replay_func_t *const zvol_replay_vector[TX_MAX_TYPE] = { zvol_replay_err, /* TX_SETSAXATTR */ zvol_replay_err, /* TX_RENAME_EXCHANGE */ zvol_replay_err, /* TX_RENAME_WHITEOUT */ - zvol_replay_err, /* TX_CLONE_RANGE */ + zvol_replay_clone_range, /* TX_CLONE_RANGE */ }; /* From 54126fdb5bfe4db7c95955549581a67a6a6581a9 Mon Sep 17 00:00:00 2001 From: shodanshok Date: Sun, 29 Dec 2024 20:50:19 +0100 Subject: [PATCH 28/43] set zfs_arc_shrinker_limit to 0 by default 
zfs_arc_shrinker_limit was introduced to avoid ARC collapse due to aggressive kernel reclaim. While useful, the current default (10000) is too prone to OOM especially when MGLRU-enabled kernels with default min_ttl_ms are used. Even when no OOM happens, it often causes too much swap usage. This patch sets zfs_arc_shrinker_limit=0 to not ignore kernel reclaim requests. ARC now plays better with both kernel shrinker and pagecache but, should ARC collapse happen again, MGLRU behavior can be tuned or even disabled. Anyway, zfs should not cause OOM when ARC can be released. Reviewed-by: Brian Behlendorf Reviewed-by: Alexander Motin Signed-off-by: Gionatan Danti Closes #16909 --- man/man4/zfs.4 | 4 ++-- module/os/linux/zfs/arc_os.c | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/man/man4/zfs.4 b/man/man4/zfs.4 index da027798f962b..7078a5ba83732 100644 --- a/man/man4/zfs.4 +++ b/man/man4/zfs.4 @@ -867,14 +867,14 @@ where that percent may exceed This only operates during memory pressure/reclaim. . -.It Sy zfs_arc_shrinker_limit Ns = Ns Sy 10000 Pq int +.It Sy zfs_arc_shrinker_limit Ns = Ns Sy 0 Pq int This is a limit on how many pages the ARC shrinker makes available for eviction in response to one page allocation attempt. Note that in practice, the kernel's shrinker can ask us to evict up to about four times this for one allocation attempt. To reduce OOM risk, this limit is applied for kswapd reclaims only. .Pp -The default limit of +For example a value of .Sy 10000 Pq in practice, Em 160 MiB No per allocation attempt with 4 KiB pages limits the amount of time spent attempting to reclaim ARC memory to less than 100 ms per allocation attempt, diff --git a/module/os/linux/zfs/arc_os.c b/module/os/linux/zfs/arc_os.c index b1e45b28743e6..3238977af6d1c 100644 --- a/module/os/linux/zfs/arc_os.c +++ b/module/os/linux/zfs/arc_os.c @@ -63,7 +63,7 @@ * practice, the kernel's shrinker can ask us to evict up to about 4x this * for one allocation attempt. 
* - * The default limit of 10,000 (in practice, 160MB per allocation attempt + * For example a value of 10,000 (in practice, 160MB per allocation attempt * with 4K pages) limits the amount of time spent attempting to reclaim ARC * memory to less than 100ms per allocation attempt, even with a small * average compressed block size of ~8KB. @@ -71,7 +71,7 @@ * See also the comment in arc_shrinker_count(). * Set to 0 to disable limit. */ -static int zfs_arc_shrinker_limit = 10000; +static int zfs_arc_shrinker_limit = 0; /* * Relative cost of ARC eviction, AKA number of seeks needed to restore evicted From 25238baad5c56ed2375fc6067ff691fb723614d1 Mon Sep 17 00:00:00 2001 From: Andrew Walker Date: Mon, 30 Dec 2024 19:06:48 -0600 Subject: [PATCH 29/43] Add missing zfs_exit() when snapdir is disabled (#16912) zfs_vget doesn't zfs_exit when erroring out due to snapdir being disabled. Signed-off-by: Andrew Walker Reviewed-by: @bmeagherix Reviewed-by: Alexander Motin Reviewed-by: Ameer Hamza Reviewed-by: Tony Hutter --- module/os/linux/zfs/zfs_vfsops.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/module/os/linux/zfs/zfs_vfsops.c b/module/os/linux/zfs/zfs_vfsops.c index 3c53a8a315c34..b226fca147a5b 100644 --- a/module/os/linux/zfs/zfs_vfsops.c +++ b/module/os/linux/zfs/zfs_vfsops.c @@ -1702,13 +1702,14 @@ zfs_vget(struct super_block *sb, struct inode **ipp, fid_t *fidp) /* A zero fid_gen means we are in the .zfs control directories */ if (fid_gen == 0 && (object == ZFSCTL_INO_ROOT || object == ZFSCTL_INO_SNAPDIR)) { - *ipp = zfsvfs->z_ctldir; - ASSERT(*ipp != NULL); - if (zfsvfs->z_show_ctldir == ZFS_SNAPDIR_DISABLED) { + zfs_exit(zfsvfs, FTAG); return (SET_ERROR(ENOENT)); } + *ipp = zfsvfs->z_ctldir; + ASSERT(*ipp != NULL); + if (object == ZFSCTL_INO_SNAPDIR) { VERIFY(zfsctl_root_lookup(*ipp, "snapshot", ipp, 0, kcred, NULL, NULL) == 0); From 3c2267a873989698576ec8feaba9494f1502b919 Mon Sep 17 00:00:00 2001 From: James Reilly Date: Thu, 2 Jan 2025 
22:58:56 +0530 Subject: [PATCH 30/43] ZTS: add centos stream10 (#16904) Added centos as optional runners via workflow_dispatch removed centos-stream9 from the FULL_OS runner list as CentOS is not officially support by ZFS. This commit will add preliminary support for EL10 and allow testing ZFS ahead of EL10 codebase solidifying in ~6 months Signed-off-by: James Reilly Reviewed-by: Tony Hutter Reviewed-by: Brian Behlendorf Reviewed-by: Tino Reichardt --- .github/workflows/scripts/qemu-2-start.sh | 6 +++++ .github/workflows/scripts/qemu-3-deps.sh | 2 +- .github/workflows/zfs-qemu.yml | 27 ++++++++++++++++++++++- 3 files changed, 33 insertions(+), 2 deletions(-) diff --git a/.github/workflows/scripts/qemu-2-start.sh b/.github/workflows/scripts/qemu-2-start.sh index 0906e438ac0d8..73496d4f3de66 100755 --- a/.github/workflows/scripts/qemu-2-start.sh +++ b/.github/workflows/scripts/qemu-2-start.sh @@ -40,6 +40,12 @@ case "$OS" in # dns sometimes fails with that url :/ echo "89.187.191.12 geo.mirror.pkgbuild.com" | sudo tee /etc/hosts > /dev/null ;; + centos-stream10) + OSNAME="CentOS Stream 10" + # TODO: #16903 Overwrite OSv to stream9 for virt-install until it's added to osinfo + OSv="centos-stream9" + URL="https://cloud.centos.org/centos/10-stream/x86_64/images/CentOS-Stream-GenericCloud-10-latest.x86_64.qcow2" + ;; centos-stream9) OSNAME="CentOS Stream 9" URL="https://cloud.centos.org/centos/9-stream/x86_64/images/CentOS-Stream-GenericCloud-9-latest.x86_64.qcow2" diff --git a/.github/workflows/scripts/qemu-3-deps.sh b/.github/workflows/scripts/qemu-3-deps.sh index 96979cd02e091..9b8957734277e 100755 --- a/.github/workflows/scripts/qemu-3-deps.sh +++ b/.github/workflows/scripts/qemu-3-deps.sh @@ -104,7 +104,7 @@ case "$1" in sudo dnf install -y kernel-abi-whitelists echo "##[endgroup]" ;; - almalinux9|centos-stream9) + almalinux9|centos-stream9|centos-stream10) echo "##[group]Enable epel and crb repositories" sudo dnf config-manager -y --set-enabled crb sudo dnf 
install -y epel-release diff --git a/.github/workflows/zfs-qemu.yml b/.github/workflows/zfs-qemu.yml index 4748e90db50bf..af26e135b91fc 100644 --- a/.github/workflows/zfs-qemu.yml +++ b/.github/workflows/zfs-qemu.yml @@ -3,6 +3,18 @@ name: zfs-qemu on: push: pull_request: + workflow_dispatch: + inputs: + include_stream9: + type: boolean + required: false + default: false + description: 'Test on CentOS 9 stream' + include_stream10: + type: boolean + required: false + default: false + description: 'Test on CentOS 10 stream' concurrency: group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }} @@ -22,7 +34,7 @@ jobs: - name: Generate OS config and CI type id: os run: | - FULL_OS='["almalinux8", "almalinux9", "centos-stream9", "debian11", "debian12", "fedora40", "fedora41", "freebsd13-3r", "freebsd13-4s", "freebsd14-1r", "freebsd14-2s", "freebsd15-0c", "ubuntu20", "ubuntu22", "ubuntu24"]' + FULL_OS='["almalinux8", "almalinux9", "debian11", "debian12", "fedora40", "fedora41", "freebsd13-3r", "freebsd13-4s", "freebsd14-1r", "freebsd14-2s", "freebsd15-0c", "ubuntu20", "ubuntu22", "ubuntu24"]' QUICK_OS='["almalinux8", "almalinux9", "debian12", "fedora41", "freebsd13-3r", "freebsd14-2r", "ubuntu24"]' # determine CI type when running on PR ci_type="full" @@ -37,9 +49,22 @@ jobs: os_selection="$FULL_OS" fi os_json=$(echo ${os_selection} | jq -c) + + # Add optional runners + if [ "${{ github.event.inputs.include_stream9 }}" == 'true' ]; then + os_json=$(echo $os_json | jq -c '. += ["centos-stream9"]') + fi + if [ "${{ github.event.inputs.include_stream10 }}" == 'true' ]; then + os_json=$(echo $os_json | jq -c '. 
+= ["centos-stream10"]') + fi + + echo $os_json echo "os=$os_json" >> $GITHUB_OUTPUT echo "ci_type=$ci_type" >> $GITHUB_OUTPUT + + + qemu-vm: name: qemu-x86 needs: [ test-config ] From 8dc15ef4b3fddfcf7d4d8d5e99bbf037dfb6037f Mon Sep 17 00:00:00 2001 From: Toomas Soome Date: Thu, 2 Jan 2025 23:29:12 +0200 Subject: [PATCH 31/43] ZTS: zfs_mount_all_fail leaves /var/tmp/testrootPIDNUM directory around Before we can remove test files, we need to unmount datasets used by test first. See also: zfs_mount_all_mountpoints.ksh Reviewed-by: Brian Behlendorf Reviewed-by: Alexander Motin Signed-off-by: Toomas Soome Closes #16914 --- .../cli_root/zfs_mount/zfs_mount_all_fail.ksh | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/tests/zfs-tests/tests/functional/cli_root/zfs_mount/zfs_mount_all_fail.ksh b/tests/zfs-tests/tests/functional/cli_root/zfs_mount/zfs_mount_all_fail.ksh index d1103bddccbd7..7b6c2ccdf6603 100755 --- a/tests/zfs-tests/tests/functional/cli_root/zfs_mount/zfs_mount_all_fail.ksh +++ b/tests/zfs-tests/tests/functional/cli_root/zfs_mount/zfs_mount_all_fail.ksh @@ -16,6 +16,7 @@ # # Copyright (c) 2017 by Delphix. All rights reserved. +# Copyright 2024 MNX Cloud, Inc. # . $STF_SUITE/include/libtest.shlib @@ -44,8 +45,9 @@ typeset fscount=10 function setup_all { # Create $fscount filesystems at the top level of $path - for ((i=0; i<$fscount; i++)); do + for ((i=0; i Date: Fri, 3 Jan 2025 01:53:53 +0200 Subject: [PATCH 32/43] ZTS: functional/mount scripts are not removing /var/tmp/testdir.X dirs cleanup.ksh is assuming we have TESTDIRS set. 
Reviewed-by: Brian Behlendorf Reviewed-by: Alexander Motin Signed-off-by: Toomas Soome Closes #16915 --- tests/zfs-tests/tests/functional/mount/cleanup.ksh | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/tests/zfs-tests/tests/functional/mount/cleanup.ksh b/tests/zfs-tests/tests/functional/mount/cleanup.ksh index bd6b0e435ed1c..0e88e2a1fc79a 100755 --- a/tests/zfs-tests/tests/functional/mount/cleanup.ksh +++ b/tests/zfs-tests/tests/functional/mount/cleanup.ksh @@ -27,12 +27,14 @@ # # Copyright (c) 2013, 2016 by Delphix. All rights reserved. +# Copyright 2025 MNX Cloud, Inc. # . $STF_SUITE/include/libtest.shlib log_must destroy_pool $TESTPOOL -for dir in $TESTDIRS; do +for i in 1 2 3; do + dir=$TESTDIR.$i rm -rf $dir done From d35f9f2e841ef8880df07e5e1dbf7612faa430db Mon Sep 17 00:00:00 2001 From: Toomas Soome Date: Fri, 3 Jan 2025 01:57:24 +0200 Subject: [PATCH 33/43] ZTS: checkpoint_discard_busy does not set 16M on cleanup Originally hex value is used as decimal. Reviewed-by: Brian Behlendorf Reviewed-by: Alexander Motin Signed-off-by: Toomas Soome Closes #16917 --- .../functional/pool_checkpoint/checkpoint_discard_busy.ksh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/zfs-tests/tests/functional/pool_checkpoint/checkpoint_discard_busy.ksh b/tests/zfs-tests/tests/functional/pool_checkpoint/checkpoint_discard_busy.ksh index 087aef9027eab..07b658641f654 100755 --- a/tests/zfs-tests/tests/functional/pool_checkpoint/checkpoint_discard_busy.ksh +++ b/tests/zfs-tests/tests/functional/pool_checkpoint/checkpoint_discard_busy.ksh @@ -43,7 +43,7 @@ log_unsupported "Skipping, issue https://github.com/openzfs/zfs/issues/12053" function test_cleanup { # reset memory limit to 16M - set_tunable64 SPA_DISCARD_MEMORY_LIMIT 1000000 + set_tunable64 SPA_DISCARD_MEMORY_LIMIT 16777216 cleanup_nested_pools } From 478b09577a4799893072ce8c3614b6aac30dc7c4 Mon Sep 17 00:00:00 2001 From: pstef Date: Fri, 3 Jan 2025 18:03:14 +0100 Subject: [PATCH 
34/43] zfs_vnops_os.c: fallocate is valid but not supported on FreeBSD This works around /usr/lib/go-1.18/pkg/tool/linux_amd64/link: mapping output file failed: invalid argument It's happened to me under a Linux jail, but it's also happened to other people, see https://bugs.freebsd.org/bugzilla/show_bug.cgi?id=270247#c4 Reviewed-by: Brian Behlendorf Reviewed-by: Alexander Motin Signed-off-by: pstef Closes #16918 --- module/os/freebsd/zfs/zfs_vnops_os.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/module/os/freebsd/zfs/zfs_vnops_os.c b/module/os/freebsd/zfs/zfs_vnops_os.c index b8c2c341dacee..5edd3fcc76e7f 100644 --- a/module/os/freebsd/zfs/zfs_vnops_os.c +++ b/module/os/freebsd/zfs/zfs_vnops_os.c @@ -6258,7 +6258,7 @@ struct vop_vector zfs_vnodeops = { .vop_fplookup_vexec = zfs_freebsd_fplookup_vexec, .vop_fplookup_symlink = zfs_freebsd_fplookup_symlink, .vop_access = zfs_freebsd_access, - .vop_allocate = VOP_EINVAL, + .vop_allocate = VOP_EOPNOTSUPP, #if __FreeBSD_version >= 1400032 .vop_deallocate = zfs_deallocate, #endif From e94549d868c52a32d843132ea7171486069a6c9f Mon Sep 17 00:00:00 2001 From: Toomas Soome Date: Sat, 4 Jan 2025 00:41:03 +0200 Subject: [PATCH 35/43] ZTS: remove unused TESTDIRS from pam/cleanup.ksh Remove TESTDIRS as it is not set for pam tests. 
Reviewed-by: Brian Behlendorf Reviewed-by: Alexander Motin Signed-off-by: Toomas Soome Closes #16920 --- tests/zfs-tests/tests/functional/pam/cleanup.ksh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/zfs-tests/tests/functional/pam/cleanup.ksh b/tests/zfs-tests/tests/functional/pam/cleanup.ksh index dbcb175ed069e..bfb98cd30707e 100755 --- a/tests/zfs-tests/tests/functional/pam/cleanup.ksh +++ b/tests/zfs-tests/tests/functional/pam/cleanup.ksh @@ -27,4 +27,4 @@ destroy_pool $TESTPOOL del_user ${username} del_user ${username}rec del_group pamtestgroup -log_must rm -rf "$runstatedir" $TESTDIRS +log_must rm -rf "$runstatedir" From c02e1cf055bb599d78a95b33da4ee9b4aba8918e Mon Sep 17 00:00:00 2001 From: Rob Norris Date: Sat, 4 Jan 2025 09:42:06 +1100 Subject: [PATCH 36/43] vdev_open: clear async remove flag after reopen It's possible for a vdev to be flagged for async remove after the pool has suspended. If the removed device has been returned when the pool is resumed, the ASYNC_REMOVE task will still run at the end of txg, and remove the device from the pool again. To fix, we clear the async remove flag at reopen, just as we did for the async fault flag in 5de3ac223. Sponsored-by: Klara, Inc. Sponsored-by: Wasabi Technology, Inc. 
Reviewed-by: Brian Behlendorf Reviewed-by: Alexander Motin Signed-off-by: Rob Norris Closes #16921 --- module/zfs/vdev.c | 1 + 1 file changed, 1 insertion(+) diff --git a/module/zfs/vdev.c b/module/zfs/vdev.c index 85b6ee32158d9..5df2f77e57806 100644 --- a/module/zfs/vdev.c +++ b/module/zfs/vdev.c @@ -2041,6 +2041,7 @@ vdev_open(vdev_t *vd) vd->vdev_cant_read = B_FALSE; vd->vdev_cant_write = B_FALSE; vd->vdev_fault_wanted = B_FALSE; + vd->vdev_remove_wanted = B_FALSE; vd->vdev_min_asize = vdev_get_min_asize(vd); /* From ee3bde9dadf3d4255ed710ac40dfed4fb8c26149 Mon Sep 17 00:00:00 2001 From: Toomas Soome Date: Sat, 4 Jan 2025 00:48:30 +0200 Subject: [PATCH 37/43] ZTS: checkpoint_discard_busy should use save_tunable/restore_tunable Instead of using hardwired value for SPA_DISCARD_MEMORY_LIMIT, use save_tunable and restore_tunable to restore the pre-test state. Reviewed-by: Brian Behlendorf Reviewed-by: Alexander Motin Signed-off-by: Toomas Soome Closes #16919 --- .../pool_checkpoint/checkpoint_discard_busy.ksh | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/tests/zfs-tests/tests/functional/pool_checkpoint/checkpoint_discard_busy.ksh b/tests/zfs-tests/tests/functional/pool_checkpoint/checkpoint_discard_busy.ksh index 07b658641f654..2bf5ab199e6ee 100755 --- a/tests/zfs-tests/tests/functional/pool_checkpoint/checkpoint_discard_busy.ksh +++ b/tests/zfs-tests/tests/functional/pool_checkpoint/checkpoint_discard_busy.ksh @@ -42,8 +42,8 @@ log_unsupported "Skipping, issue https://github.com/openzfs/zfs/issues/12053" function test_cleanup { - # reset memory limit to 16M - set_tunable64 SPA_DISCARD_MEMORY_LIMIT 16777216 + # reset to original value + log_must restore_tunable SPA_DISCARD_MEMORY_LIMIT cleanup_nested_pools } @@ -69,6 +69,7 @@ log_onexit test_cleanup # map, we should have even more time to # verify this. 
# +log_must save_tunable SPA_DISCARD_MEMORY_LIMIT set_tunable64 SPA_DISCARD_MEMORY_LIMIT 128 log_must zpool checkpoint $NESTEDPOOL @@ -101,8 +102,8 @@ log_mustnot zpool checkpoint -d $NESTEDPOOL log_mustnot zpool remove $NESTEDPOOL $FILEDISK1 log_mustnot zpool reguid $NESTEDPOOL -# reset memory limit to 16M -set_tunable64 SPA_DISCARD_MEMORY_LIMIT 16777216 +# reset to original value +log_must restore_tunable SPA_DISCARD_MEMORY_LIMIT nested_wait_discard_finish From 50cbb14641b32215c87825ab158f726541707888 Mon Sep 17 00:00:00 2001 From: Robert Evans Date: Fri, 3 Jan 2025 22:04:01 -0500 Subject: [PATCH 38/43] Add Makefile dependencies for scripts/zfs-tests.sh -c This updates the Makefile to be more correct for parallel make. Reviewed-by: Brian Behlendorf Reviewed-by: George Melikov Signed-off-by: Robert Evans Closes #16030 Closes #16922 --- scripts/Makefile.am | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/Makefile.am b/scripts/Makefile.am index 7d9cef83d2c60..ee8fb8717cece 100644 --- a/scripts/Makefile.am +++ b/scripts/Makefile.am @@ -79,7 +79,7 @@ CLEANFILES += %D%/common.sh -$(AM_V_at)echo "$$SCRIPTS_EXTRA_ENVIRONMENT" >>$@ ALL_LOCAL += scripts-all-local -scripts-all-local: %D%/common.sh +scripts-all-local: %D%/common.sh $(PROGRAMS) $(SCRIPTS) $(DATA) -SCRIPT_COMMON=$< $(srcdir)/%D%/zfs-tests.sh -c CLEAN_LOCAL += scripts-clean-local From 939e0237c5499abaefdddb21c280801766ad59ea Mon Sep 17 00:00:00 2001 From: Don Brady Date: Sat, 4 Jan 2025 11:28:33 -0700 Subject: [PATCH 39/43] Too many vdev probe errors should suspend pool Similar to what we saw in #16569, we need to consider that a replacing vdev should not be considered as fully contributing to the redundancy of a raidz vdev even though current IO has enough redundancy. When a failed vdev_probe() is faulting a disk, it now checks if that disk is required, and if so it suspends the pool until the admin can return the missing disks. Sponsored-by: Klara, Inc. 
Sponsored-by: Wasabi Technology, Inc. Reviewed-by: Brian Behlendorf Reviewed-by: Alexander Motin Reviewed-by: Allan Jude Reviewed-by: Tony Hutter Signed-off-by: Don Brady Closes #16864 --- module/zfs/spa.c | 25 ++- tests/runfiles/linux.run | 4 +- tests/zfs-tests/tests/Makefile.am | 1 + .../fault/suspend_on_probe_errors.ksh | 154 ++++++++++++++++++ 4 files changed, 176 insertions(+), 8 deletions(-) create mode 100755 tests/zfs-tests/tests/functional/fault/suspend_on_probe_errors.ksh diff --git a/module/zfs/spa.c b/module/zfs/spa.c index c93c7945f192f..956bae46ef1b5 100644 --- a/module/zfs/spa.c +++ b/module/zfs/spa.c @@ -8948,16 +8948,26 @@ spa_async_remove(spa_t *spa, vdev_t *vd) } static void -spa_async_fault_vdev(spa_t *spa, vdev_t *vd) +spa_async_fault_vdev(vdev_t *vd, boolean_t *suspend) { if (vd->vdev_fault_wanted) { + vdev_state_t newstate = VDEV_STATE_FAULTED; vd->vdev_fault_wanted = B_FALSE; - vdev_set_state(vd, B_TRUE, VDEV_STATE_FAULTED, - VDEV_AUX_ERR_EXCEEDED); - } + /* + * If this device has the only valid copy of the data, then + * back off and simply mark the vdev as degraded instead. 
+ */ + if (!vd->vdev_top->vdev_islog && vd->vdev_aux == NULL && + vdev_dtl_required(vd)) { + newstate = VDEV_STATE_DEGRADED; + /* A required disk is missing so suspend the pool */ + *suspend = B_TRUE; + } + vdev_set_state(vd, B_TRUE, newstate, VDEV_AUX_ERR_EXCEEDED); + } for (int c = 0; c < vd->vdev_children; c++) - spa_async_fault_vdev(spa, vd->vdev_child[c]); + spa_async_fault_vdev(vd->vdev_child[c], suspend); } static void @@ -9049,8 +9059,11 @@ spa_async_thread(void *arg) */ if (tasks & SPA_ASYNC_FAULT_VDEV) { spa_vdev_state_enter(spa, SCL_NONE); - spa_async_fault_vdev(spa, spa->spa_root_vdev); + boolean_t suspend = B_FALSE; + spa_async_fault_vdev(spa->spa_root_vdev, &suspend); (void) spa_vdev_state_exit(spa, NULL, 0); + if (suspend) + zio_suspend(spa, NULL, ZIO_SUSPEND_IOERR); } /* diff --git a/tests/runfiles/linux.run b/tests/runfiles/linux.run index 76d07a6cc9c1e..e55ec583d2cc3 100644 --- a/tests/runfiles/linux.run +++ b/tests/runfiles/linux.run @@ -125,8 +125,8 @@ tests = ['auto_offline_001_pos', 'auto_online_001_pos', 'auto_online_002_pos', 'auto_replace_001_pos', 'auto_replace_002_pos', 'auto_spare_001_pos', 'auto_spare_002_pos', 'auto_spare_multiple', 'auto_spare_ashift', 'auto_spare_shared', 'decrypt_fault', 'decompress_fault', - 'fault_limits', 'scrub_after_resilver', 'suspend_resume_single', - 'zpool_status_-s'] + 'fault_limits', 'scrub_after_resilver', 'suspend_on_probe_errors', + 'suspend_resume_single', 'zpool_status_-s'] tags = ['functional', 'fault'] [tests/functional/features/large_dnode:Linux] diff --git a/tests/zfs-tests/tests/Makefile.am b/tests/zfs-tests/tests/Makefile.am index 588249be45da7..df183825dc680 100644 --- a/tests/zfs-tests/tests/Makefile.am +++ b/tests/zfs-tests/tests/Makefile.am @@ -1532,6 +1532,7 @@ nobase_dist_datadir_zfs_tests_tests_SCRIPTS += \ functional/fault/decrypt_fault.ksh \ functional/fault/fault_limits.ksh \ functional/fault/scrub_after_resilver.ksh \ + functional/fault/suspend_on_probe_errors.ksh \ 
functional/fault/suspend_resume_single.ksh \ functional/fault/setup.ksh \ functional/fault/zpool_status_-s.ksh \ diff --git a/tests/zfs-tests/tests/functional/fault/suspend_on_probe_errors.ksh b/tests/zfs-tests/tests/functional/fault/suspend_on_probe_errors.ksh new file mode 100755 index 0000000000000..d9261bb5d274f --- /dev/null +++ b/tests/zfs-tests/tests/functional/fault/suspend_on_probe_errors.ksh @@ -0,0 +1,154 @@ +#!/bin/ksh -p +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or https://opensource.org/licenses/CDDL-1.0. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright (c) 2024, Klara Inc. +# + +. $STF_SUITE/include/libtest.shlib +. $STF_SUITE/include/blkdev.shlib + +# +# DESCRIPTION: Verify that 4 disks removed from a raidz3 will suspend the pool +# +# STRATEGY: +# 1. Disable ZED -- this test is focused on vdev_probe errors +# 2. Create a raidz3 pool where 4 disks can be removed (i.e., using scsi_debug) +# 3. Add some data to it for a resilver workload +# 4. Replace one of the child vdevs to start a replacing vdev +# 5. During the resilver, remove 4 disks including one from the replacing vdev +# 6. 
Verify that the pool is suspended (it used to remain online) +# + +DEV_SIZE_MB=1024 + +FILE_VDEV_CNT=8 +FILE_VDEV_SIZ=256M + +function cleanup +{ + destroy_pool $TESTPOOL + if [[ "$(cat /sys/block/$sd/device/state)" == "offline" ]]; then + log_must eval "echo running > /sys/block/$sd/device/state" + fi + unload_scsi_debug + rm -f $DATA_FILE + for i in {0..$((FILE_VDEV_CNT - 1))}; do + log_must rm -f "$TEST_BASE_DIR/dev-$i" + done + log_must set_tunable32 SCAN_SUSPEND_PROGRESS 0 + zed_start +} + +log_onexit cleanup + +log_assert "VDEV probe errors for more disks than parity should suspend a pool" + +log_note "Stoping ZED process" +zed_stop +zpool events -c + +# Make a debug device that we can "unplug" and lose 4 drives at once +unload_scsi_debug +load_scsi_debug $DEV_SIZE_MB 1 1 1 '512b' +sd=$(get_debug_device) + +# Create 4 partitions that match the FILE_VDEV_SIZ +parted "/dev/${sd}" --script mklabel gpt +parted "/dev/${sd}" --script mkpart primary 0% 25% +parted "/dev/${sd}" --script mkpart primary 25% 50% +parted "/dev/${sd}" --script mkpart primary 50% 75% +parted "/dev/${sd}" --script mkpart primary 75% 100% +block_device_wait "/dev/${sd}" +blkdevs="/dev/${sd}1 /dev/${sd}2 /dev/${sd}3 /dev/${sd}4" + +# Create 8 file vdevs +typeset -a filedevs +for i in {0..$((FILE_VDEV_CNT - 1))}; do + device=$TEST_BASE_DIR/dev-$i + log_must truncate -s $FILE_VDEV_SIZ $device + # Use all but the last one for pool create + if [[ $i -lt "7" ]]; then + filedevs[${#filedevs[*]}+1]=$device + fi +done + +# Create a raidz-3 pool that we can pull 4 disks from +log_must zpool create -f $TESTPOOL raidz3 ${filedevs[@]} $blkdevs +sync_pool $TESTPOOL + +# Add some data to the pool +log_must zfs create $TESTPOOL/fs +MNTPOINT="$(get_prop mountpoint $TESTPOOL/fs)" +SECONDS=0 +log_must fill_fs $MNTPOINT 1 200 4096 10 Z +log_note "fill_fs took $SECONDS seconds" +sync_pool $TESTPOOL + +# Start a replacing vdev, but suspend the resilver +log_must set_tunable32 SCAN_SUSPEND_PROGRESS 1 +log_must 
zpool replace -f $TESTPOOL /dev/${sd}4 $TEST_BASE_DIR/dev-7 + +# Remove 4 disks all at once +log_must eval "echo offline > /sys/block/${sd}/device/state" + +log_must set_tunable32 SCAN_SUSPEND_PROGRESS 0 + +# Add some writes to drive the vdev probe errors +log_must dd if=/dev/urandom of=$MNTPOINT/writes bs=1M count=1 + +# Wait until sync starts, and the pool suspends +log_note "waiting for pool to suspend" +typeset -i tries=30 +until [[ $(cat /proc/spl/kstat/zfs/$TESTPOOL/state) == "SUSPENDED" ]] ; do + if ((tries-- == 0)); then + zpool status -s + log_fail "UNEXPECTED -- pool did not suspend" + fi + sleep 1 +done +log_note $(cat /proc/spl/kstat/zfs/$TESTPOOL/state) + +# Put the missing disks back into service +log_must eval "echo running > /sys/block/$sd/device/state" + +# Clear the vdev error states, which will reopen the vdevs and resume the pool +log_must zpool clear $TESTPOOL + +# Wait until the pool resumes +log_note "waiting for pool to resume" +tries=30 +until [[ $(cat /proc/spl/kstat/zfs/$TESTPOOL/state) != "SUSPENDED" ]] ; do + if ((tries-- == 0)); then + log_fail "pool did not resume" + fi + sleep 1 +done +log_must zpool wait -t resilver $TESTPOOL +sync_pool $TESTPOOL + +# Make sure a pool scrub comes back clean +log_must zpool scrub -w $TESTPOOL +log_must zpool status -v $TESTPOOL +log_must check_pool_status $TESTPOOL "errors" "No known data errors" + +log_pass "VDEV probe errors for more disks than parity should suspend a pool" From dc0324bfa90011a5edf65bc0947aaf51ac9a8b61 Mon Sep 17 00:00:00 2001 From: Richard Kojedzinszky Date: Sat, 4 Jan 2025 19:33:27 +0100 Subject: [PATCH 40/43] fix: make zfs_strerror really thread-safe and portable #15793 wanted to make zfs_strerror threadsafe, unfortunately, it turned out that strerror_l() usage was wrong, and also, some libc implementations dont have strerror_l(). zfs_strerror() now simply calls original strerror() and copies the result to a thread-local buffer, then returns that. 
Reviewed-by: Brian Behlendorf Reviewed-by: Alexander Motin Signed-off-by: Richard Kojedzinszky Closes #15793 Closes #16640 Closes #16923 --- config/user.m4 | 2 +- include/libzutil.h | 15 +++++++++------ 2 files changed, 10 insertions(+), 7 deletions(-) diff --git a/config/user.m4 b/config/user.m4 index 4e31745a2abcd..badd920d2b8a1 100644 --- a/config/user.m4 +++ b/config/user.m4 @@ -33,7 +33,7 @@ AC_DEFUN([ZFS_AC_CONFIG_USER], [ ZFS_AC_CONFIG_USER_MAKEDEV_IN_MKDEV ZFS_AC_CONFIG_USER_ZFSEXEC - AC_CHECK_FUNCS([execvpe issetugid mlockall strerror_l strlcat strlcpy gettid]) + AC_CHECK_FUNCS([execvpe issetugid mlockall strlcat strlcpy gettid]) AC_SUBST(RM) ]) diff --git a/include/libzutil.h b/include/libzutil.h index f8712340cc5e7..bcfe2fcf7960f 100644 --- a/include/libzutil.h +++ b/include/libzutil.h @@ -27,7 +27,7 @@ #define _LIBZUTIL_H extern __attribute__((visibility("default"))) #include -#include +#include #include #include @@ -276,11 +276,14 @@ _LIBZUTIL_H void update_vdev_config_dev_sysfs_path(nvlist_t *nv, * Thread-safe strerror() for use in ZFS libraries */ static inline char *zfs_strerror(int errnum) { -#ifdef HAVE_STRERROR_L - return (strerror_l(errnum, uselocale(0))); -#else - return (strerror(errnum)); -#endif + static __thread char errbuf[512]; + static pthread_mutex_t zfs_strerror_lock = PTHREAD_MUTEX_INITIALIZER; + + (void) pthread_mutex_lock(&zfs_strerror_lock); + (void) strlcpy(errbuf, strerror(errnum), sizeof (errbuf)); + (void) pthread_mutex_unlock(&zfs_strerror_lock); + + return (errbuf); } #ifdef __cplusplus From 3a445f2ef5eff3bf33696282f64a40942d37a031 Mon Sep 17 00:00:00 2001 From: Robert Evans Date: Sun, 5 Jan 2025 20:25:22 -0500 Subject: [PATCH 41/43] Remove duplicate dedup_legacy_create in common.run Reviewed-by: Brian Behlendorf Reviewed-by: George Melikov Signed-off-by: Robert Evans Closes #16926 --- tests/runfiles/common.run | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/runfiles/common.run 
b/tests/runfiles/common.run index 6abb3b4213bb3..8a4a4b0f5cb81 100644 --- a/tests/runfiles/common.run +++ b/tests/runfiles/common.run @@ -677,8 +677,8 @@ post = tags = ['functional', 'deadman'] [tests/functional/dedup] -tests = ['dedup_legacy_create', 'dedup_fdt_create', 'dedup_fdt_import', - 'dedup_legacy_create', 'dedup_legacy_import', 'dedup_legacy_fdt_upgrade', +tests = ['dedup_fdt_create', 'dedup_fdt_import', 'dedup_legacy_create', + 'dedup_legacy_import', 'dedup_legacy_fdt_upgrade', 'dedup_legacy_fdt_mixed', 'dedup_quota'] pre = post = From 18c67d2418c571a0a85592401db793b5e6a9343a Mon Sep 17 00:00:00 2001 From: n0-1 Date: Mon, 6 Jan 2025 02:27:19 +0100 Subject: [PATCH 42/43] Support for cross-compiling kernel modules In order to correctly cross-compile, one has to pass ARCH and CROSS_COMPILE make flags to kernel module build calls. Facilitate this in the same way as for custom CC flag by recognizing KERNEL_-prefixed configure environment variables of same name. Reviewed-by: Brian Behlendorf Signed-off-by: Phil Sutter Closes #16924 --- config/kernel.m4 | 5 +++++ config/zfs-build.m4 | 2 ++ module/Makefile.in | 2 ++ rpm/generic/zfs-kmod.spec.in | 4 +++- rpm/redhat/zfs-kmod.spec.in | 4 +++- 5 files changed, 15 insertions(+), 2 deletions(-) diff --git a/config/kernel.m4 b/config/kernel.m4 index ae66633907bf7..9928ead1b6cea 100644 --- a/config/kernel.m4 +++ b/config/kernel.m4 @@ -681,11 +681,16 @@ AC_DEFUN([ZFS_LINUX_COMPILE], [ building kernel modules]) AC_ARG_VAR([KERNEL_LLVM], [Binary option to build kernel modules with LLVM/CLANG toolchain]) + AC_ARG_VAR([KERNEL_CROSS_COMPILE], [Cross compile prefix + for kernel module builds]) + AC_ARG_VAR([KERNEL_ARCH], [Architecture to build kernel modules for]) AC_TRY_COMMAND([ KBUILD_MODPOST_NOFINAL="$5" KBUILD_MODPOST_WARN="$6" make modules -k -j$TEST_JOBS ${KERNEL_CC:+CC=$KERNEL_CC} ${KERNEL_LD:+LD=$KERNEL_LD} ${KERNEL_LLVM:+LLVM=$KERNEL_LLVM} CONFIG_MODULES=y CFLAGS_MODULE=-DCONFIG_MODULES + 
${KERNEL_CROSS_COMPILE:+CROSS_COMPILE=$KERNEL_CROSS_COMPILE} + ${KERNEL_ARCH:+ARCH=$KERNEL_ARCH} -C $LINUX_OBJ $ARCH_UM M=$PWD/$1 >$1/build.log 2>&1]) AS_IF([AC_TRY_COMMAND([$2])], [$3], [$4]) ]) diff --git a/config/zfs-build.m4 b/config/zfs-build.m4 index c44a893bbb8c3..55fc029f08471 100644 --- a/config/zfs-build.m4 +++ b/config/zfs-build.m4 @@ -393,6 +393,8 @@ AC_DEFUN([ZFS_AC_RPM], [ RPM_DEFINE_KMOD=${RPM_DEFINE_KMOD}' --define "kernel_cc KERNEL_CC=$(KERNEL_CC)"' RPM_DEFINE_KMOD=${RPM_DEFINE_KMOD}' --define "kernel_ld KERNEL_LD=$(KERNEL_LD)"' RPM_DEFINE_KMOD=${RPM_DEFINE_KMOD}' --define "kernel_llvm KERNEL_LLVM=$(KERNEL_LLVM)"' + RPM_DEFINE_KMOD=${RPM_DEFINE_KMOD}' --define "kernel_cross_compile KERNEL_CROSS_COMPILE=$(KERNEL_CROSS_COMPILE)"' + RPM_DEFINE_KMOD=${RPM_DEFINE_KMOD}' --define "kernel_arch KERNEL_ARCH=$(KERNEL_ARCH)"' ]) RPM_DEFINE_DKMS='' diff --git a/module/Makefile.in b/module/Makefile.in index 9b34b3dfaec76..529ab81dcec59 100644 --- a/module/Makefile.in +++ b/module/Makefile.in @@ -55,6 +55,8 @@ modules-Linux: mkdir -p $(sort $(dir $(zfs-objs) $(zfs-))) $(MAKE) -C @LINUX_OBJ@ $(if @KERNEL_CC@,CC=@KERNEL_CC@) \ $(if @KERNEL_LD@,LD=@KERNEL_LD@) $(if @KERNEL_LLVM@,LLVM=@KERNEL_LLVM@) \ + $(if @KERNEL_CROSS_COMPILE@,CROSS_COMPILE=@KERNEL_CROSS_COMPILE@) \ + $(if @KERNEL_ARCH@,ARCH=@KERNEL_ARCH@) \ M="$$PWD" @KERNEL_MAKE@ CONFIG_ZFS=m modules modules-FreeBSD: diff --git a/rpm/generic/zfs-kmod.spec.in b/rpm/generic/zfs-kmod.spec.in index 30524474d1ace..7ed828bd0c9c4 100644 --- a/rpm/generic/zfs-kmod.spec.in +++ b/rpm/generic/zfs-kmod.spec.in @@ -144,7 +144,9 @@ for kernel_version in %{?kernel_versions}; do %{debuginfo} \ %{?kernel_cc} \ %{?kernel_ld} \ - %{?kernel_llvm} + %{?kernel_llvm} \ + %{?kernel_cross_compile} \ + %{?kernel_arch} # Pre-6.10 kernel builds didn't need to copy over the source files to the # build directory. 
However we do need to do it though post-6.10 due to diff --git a/rpm/redhat/zfs-kmod.spec.in b/rpm/redhat/zfs-kmod.spec.in index 876c198c64de0..a95bdf20f873e 100644 --- a/rpm/redhat/zfs-kmod.spec.in +++ b/rpm/redhat/zfs-kmod.spec.in @@ -69,7 +69,9 @@ fi %{debuginfo} \ %{?kernel_cc} \ %{?kernel_ld} \ - %{?kernel_llvm} + %{?kernel_llvm} \ + %{?kernel_cross_compile} \ + %{?kernel_arch} make %{?_smp_mflags} # Module signing (modsign) From b8e09c7007a0733a497aebe734cb8ed23a0415ae Mon Sep 17 00:00:00 2001 From: Rob Norris Date: Wed, 8 Jan 2025 10:43:01 +1100 Subject: [PATCH 43/43] ZTS: remove empty zpool_add--allow-ashift-mismatch test Added in b1e46f869, but empty, so no point keeping it around. Sponsored-by: https://despairlabs.com/sponsor/ Reviewed-by: Brian Behlendorf Reviewed-by: George Melikov Signed-off-by: Rob Norris Closes #16931 --- tests/runfiles/common.run | 3 +-- tests/zfs-tests/tests/Makefile.am | 1 - .../cli_root/zpool_add/zpool_add--allow-ashift-mismatch.ksh | 0 3 files changed, 1 insertion(+), 3 deletions(-) delete mode 100755 tests/zfs-tests/tests/functional/cli_root/zpool_add/zpool_add--allow-ashift-mismatch.ksh diff --git a/tests/runfiles/common.run b/tests/runfiles/common.run index 8a4a4b0f5cb81..3ec052f56b406 100644 --- a/tests/runfiles/common.run +++ b/tests/runfiles/common.run @@ -387,8 +387,7 @@ tags = ['functional', 'cli_root', 'zpool'] tests = ['zpool_add_001_pos', 'zpool_add_002_pos', 'zpool_add_003_pos', 'zpool_add_004_pos', 'zpool_add_006_pos', 'zpool_add_007_neg', 'zpool_add_008_neg', 'zpool_add_009_neg', 'zpool_add_010_pos', - 'add-o_ashift', 'add_prop_ashift', 'zpool_add_dryrun_output', - 'zpool_add--allow-ashift-mismatch'] + 'add-o_ashift', 'add_prop_ashift', 'zpool_add_dryrun_output'] tags = ['functional', 'cli_root', 'zpool_add'] [tests/functional/cli_root/zpool_attach] diff --git a/tests/zfs-tests/tests/Makefile.am b/tests/zfs-tests/tests/Makefile.am index df183825dc680..d62613d300359 100644 --- a/tests/zfs-tests/tests/Makefile.am +++ 
b/tests/zfs-tests/tests/Makefile.am @@ -998,7 +998,6 @@ nobase_dist_datadir_zfs_tests_tests_SCRIPTS += \ functional/cli_root/zpool_add/add_prop_ashift.ksh \ functional/cli_root/zpool_add/cleanup.ksh \ functional/cli_root/zpool_add/setup.ksh \ - functional/cli_root/zpool_add/zpool_add--allow-ashift-mismatch.ksh \ functional/cli_root/zpool_add/zpool_add_001_pos.ksh \ functional/cli_root/zpool_add/zpool_add_002_pos.ksh \ functional/cli_root/zpool_add/zpool_add_003_pos.ksh \ diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_add/zpool_add--allow-ashift-mismatch.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_add/zpool_add--allow-ashift-mismatch.ksh deleted file mode 100755 index e69de29bb2d1d..0000000000000