From 19a04e5ad21e1819b95c0ed748a4f128dc25508a Mon Sep 17 00:00:00 2001 From: Alexander Motin Date: Thu, 12 Dec 2024 13:22:41 -0500 Subject: [PATCH 01/43] BRT: Check bv_mos_entries in brt_entry_lookup() When vdev first sees some block cloning, there is a window when brt_maybe_exists() might already return true since something was cloned, but bv_mos_entries is still 0 since BRT ZAP was not yet created. In such case we should not try to look into the ZAP and dereference NULL bv_mos_entries_dnode. Reviewed-by: Brian Behlendorf Reviewed-by: Rob Norris Signed-off-by: Alexander Motin Sponsored by: iXsystems, Inc. Closes #16851 --- module/zfs/brt.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/module/zfs/brt.c b/module/zfs/brt.c index 7d94214143ea0..79748cd69bc07 100644 --- a/module/zfs/brt.c +++ b/module/zfs/brt.c @@ -859,6 +859,9 @@ brt_entry_lookup(brt_vdev_t *brtvd, brt_entry_t *bre) { uint64_t off = BRE_OFFSET(bre); + if (brtvd->bv_mos_entries == 0) + return (SET_ERROR(ENOENT)); + return (zap_lookup_uint64_by_dnode(brtvd->bv_mos_entries_dnode, &off, BRT_KEY_WORDS, 1, sizeof (bre->bre_count), &bre->bre_count)); } From 6c9b4f18d3e62d7e2562d2f101b9e1bc488abb63 Mon Sep 17 00:00:00 2001 From: Chunwei Chen Date: Thu, 12 Dec 2024 16:18:45 -0800 Subject: [PATCH 02/43] Fix DR_OVERRIDDEN use-after-free race in dbuf_sync_leaf In dbuf_sync_leaf, we clone the arc_buf in dr if we share it with db except for overridden case. However, this exception causes a race where dbuf_new_size could free the arc_buf after the last dereference of *datap and causes use-after-free. We fix this by cloning the buf regardless if it's overridden. 
The race: -- P0 P1 dbuf_hold_impl() // dbuf_hold_copy passed // because db_data_pending NULL dbuf_sync_leaf() // doesn't clone *datap // *datap derefed to db_buf dbuf_write(*datap) dbuf_new_size() dmu_buf_will_dirty() dbuf_fix_old_data() // alloc new buf for P0 dr // but can't change *datap arc_alloc_buf() arc_buf_destroy() // alloc new buf for db_buf // and destroy old buf dbuf_write() // continue abd_get_from_buf(data->b_data, arc_buf_size(data)) // use-after-free -- Here's an example when it happens: BUG: kernel NULL pointer dereference, address: 000000000000002e RIP: 0010:arc_buf_size+0x1c/0x30 [zfs] Call Trace: dbuf_write+0x3ff/0x580 [zfs] dbuf_sync_leaf+0x13c/0x530 [zfs] dbuf_sync_list+0xbf/0x120 [zfs] dnode_sync+0x3ea/0x7a0 [zfs] sync_dnodes_task+0x71/0xa0 [zfs] taskq_thread+0x2b8/0x4e0 [spl] kthread+0x112/0x130 ret_from_fork+0x1f/0x30 Reviewed-by: Brian Behlendorf Reviewed-by: Alexander Motin Signed-off-by: Chunwei Chen Co-authored-by: Chunwei Chen Closes #16854 --- module/zfs/dbuf.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/module/zfs/dbuf.c b/module/zfs/dbuf.c index 190d8ded39b0f..90395cad6e451 100644 --- a/module/zfs/dbuf.c +++ b/module/zfs/dbuf.c @@ -4779,8 +4779,7 @@ dbuf_sync_leaf(dbuf_dirty_record_t *dr, dmu_tx_t *tx) if (*datap != NULL && *datap == db->db_buf && dn->dn_object != DMU_META_DNODE_OBJECT && - zfs_refcount_count(&db->db_holds) > 1 && - dr->dt.dl.dr_override_state != DR_OVERRIDDEN) { + zfs_refcount_count(&db->db_holds) > 1) { /* * If this buffer is currently "in use" (i.e., there * are active holds and db_data still references it), From ecc0970e3e517b620ea63f0e35632c7d21d35da0 Mon Sep 17 00:00:00 2001 From: Rob Norris Date: Sat, 14 Dec 2024 05:12:14 +1100 Subject: [PATCH 03/43] backtrace: fix off-by-one on string output sizeof("foo") includes the trailing null byte, so all the output had nulls through it. Most terminals quietly ignore it, but it makes some tools misdetect file types and other annoyances. 
Easy fix: subtract 1. Sponsored-by: https://despairlabs.com/sponsor/ Reviewed-by: Brian Behlendorf Reviewed-by: Alexander Motin Signed-off-by: Rob Norris Closes #16862 --- lib/libspl/backtrace.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/libspl/backtrace.c b/lib/libspl/backtrace.c index 6e8b3b12122de..7f1ef596f8edd 100644 --- a/lib/libspl/backtrace.c +++ b/lib/libspl/backtrace.c @@ -38,7 +38,7 @@ */ #define spl_bt_write_n(fd, s, n) \ do { ssize_t r __maybe_unused = write(fd, s, n); } while (0) -#define spl_bt_write(fd, s) spl_bt_write_n(fd, s, sizeof (s)) +#define spl_bt_write(fd, s) spl_bt_write_n(fd, s, sizeof (s)-1) #if defined(HAVE_LIBUNWIND) #define UNW_LOCAL_ONLY From 76f57ab9f7483335ba28b7de7831eeaa2dad438d Mon Sep 17 00:00:00 2001 From: Poscat Date: Sat, 14 Dec 2024 04:06:40 +0800 Subject: [PATCH 04/43] build: use correct bashcompletiondir on arch Reviewed-by: Tino Reichardt Reviewed-by: Brian Behlendorf Signed-off-by: poscat Closes #16861 --- config/zfs-build.m4 | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/config/zfs-build.m4 b/config/zfs-build.m4 index 255813f46d19c..c44a893bbb8c3 100644 --- a/config/zfs-build.m4 +++ b/config/zfs-build.m4 @@ -627,7 +627,7 @@ AC_DEFUN([ZFS_AC_DEFAULT_PACKAGE], [ AC_MSG_CHECKING([default bash completion directory]) case "$VENDOR" in - alpine|artix|debian|gentoo|ubuntu) + alpine|arch|artix|debian|gentoo|ubuntu) bashcompletiondir=/usr/share/bash-completion/completions ;; freebsd) From fbea92432a4664786ff6a9f2f59bb82b6c5f0b84 Mon Sep 17 00:00:00 2001 From: Rob Norris Date: Mon, 1 Jul 2024 11:19:16 +1000 Subject: [PATCH 05/43] flush: don't report flush error when disabling flush support The first time a device returns ENOTSUP in response to a flush request, we set vdev_nowritecache so we don't issue flushes in the future and instead just pretend they succeeded. However, we still return an error for the initial flush, even though we just decided such errors are meaningless! 
So, when setting vdev_nowritecache in response to a flush error, also reset the error code to assume success. Sponsored-by: Klara, Inc. Sponsored-by: Wasabi Technology, Inc. Reviewed-by: Brian Behlendorf Reviewed-by: Alexander Motin Signed-off-by: Rob Norris Closes #16855 --- module/zfs/zio.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/module/zfs/zio.c b/module/zfs/zio.c index f4d7e57542a18..f13228051decb 100644 --- a/module/zfs/zio.c +++ b/module/zfs/zio.c @@ -4608,11 +4608,14 @@ zio_vdev_io_assess(zio_t *zio) /* * If a cache flush returns ENOTSUP or ENOTTY, we know that no future * attempts will ever succeed. In this case we set a persistent - * boolean flag so that we don't bother with it in the future. + * boolean flag so that we don't bother with it in the future, and + * then we act like the flush succeeded. */ if ((zio->io_error == ENOTSUP || zio->io_error == ENOTTY) && - zio->io_type == ZIO_TYPE_FLUSH && vd != NULL) + zio->io_type == ZIO_TYPE_FLUSH && vd != NULL) { vd->vdev_nowritecache = B_TRUE; + zio->io_error = 0; + } if (zio->io_error) zio->io_pipeline = ZIO_INTERLOCK_PIPELINE; From 46e06fededd42ff4d6fe2dfe0b60612f609edf0b Mon Sep 17 00:00:00 2001 From: Rob Norris Date: Mon, 1 Jul 2024 11:19:16 +1000 Subject: [PATCH 06/43] flush: only detect lack of flush support in one place It seems there's no good reason for vdev_disk & vdev_geom to explicitly detect no support for flush and set vdev_nowritecache. Instead, just signal it by setting the error to ENOTSUP, and let zio_vdev_io_assess() take care of it in one place. Sponsored-by: Klara, Inc. Sponsored-by: Wasabi Technology, Inc. 
Reviewed-by: Brian Behlendorf Reviewed-by: Alexander Motin Signed-off-by: Rob Norris Closes #16855 --- module/os/freebsd/zfs/vdev_geom.c | 15 --------------- module/os/linux/zfs/vdev_disk.c | 5 ++--- module/zfs/zio.c | 6 +++--- 3 files changed, 5 insertions(+), 21 deletions(-) diff --git a/module/os/freebsd/zfs/vdev_geom.c b/module/os/freebsd/zfs/vdev_geom.c index b7ff1063b0891..7aaa42bfb1a89 100644 --- a/module/os/freebsd/zfs/vdev_geom.c +++ b/module/os/freebsd/zfs/vdev_geom.c @@ -1014,21 +1014,6 @@ vdev_geom_io_intr(struct bio *bp) zio->io_error = SET_ERROR(EIO); switch (zio->io_error) { - case ENOTSUP: - /* - * If we get ENOTSUP for BIO_FLUSH or BIO_DELETE we know - * that future attempts will never succeed. In this case - * we set a persistent flag so that we don't bother with - * requests in the future. - */ - switch (bp->bio_cmd) { - case BIO_FLUSH: - vd->vdev_nowritecache = B_TRUE; - break; - case BIO_DELETE: - break; - } - break; case ENXIO: if (!vd->vdev_remove_wanted) { /* diff --git a/module/os/linux/zfs/vdev_disk.c b/module/os/linux/zfs/vdev_disk.c index 6a66a72b91a98..e8bd513e69098 100644 --- a/module/os/linux/zfs/vdev_disk.c +++ b/module/os/linux/zfs/vdev_disk.c @@ -1198,9 +1198,8 @@ vdev_disk_io_flush_completion(struct bio *bio) { zio_t *zio = bio->bi_private; zio->io_error = bi_status_to_errno(bio->bi_status); - - if (zio->io_error && (zio->io_error == EOPNOTSUPP)) - zio->io_vd->vdev_nowritecache = B_TRUE; + if (zio->io_error == EOPNOTSUPP || zio->io_error == ENOTTY) + zio->io_error = SET_ERROR(ENOTSUP); bio_put(bio); ASSERT3S(zio->io_error, >=, 0); diff --git a/module/zfs/zio.c b/module/zfs/zio.c index f13228051decb..bd6752f00ac56 100644 --- a/module/zfs/zio.c +++ b/module/zfs/zio.c @@ -4606,13 +4606,13 @@ zio_vdev_io_assess(zio_t *zio) } /* - * If a cache flush returns ENOTSUP or ENOTTY, we know that no future + * If a cache flush returns ENOTSUP we know that no future * attempts will ever succeed. 
In this case we set a persistent * boolean flag so that we don't bother with it in the future, and * then we act like the flush succeeded. */ - if ((zio->io_error == ENOTSUP || zio->io_error == ENOTTY) && - zio->io_type == ZIO_TYPE_FLUSH && vd != NULL) { + if (zio->io_error == ENOTSUP && zio->io_type == ZIO_TYPE_FLUSH && + vd != NULL) { vd->vdev_nowritecache = B_TRUE; zio->io_error = 0; } From 586304ac444f888aae584fdbea5fbeac2b76cb32 Mon Sep 17 00:00:00 2001 From: kotauskas Date: Sat, 14 Dec 2024 00:50:50 +0300 Subject: [PATCH 07/43] Remount datasets on soft-reboot The one-shot zfs-mount.service is incorrectly deemed active by Systemd after a systemctl soft-reboot. As such, soft-rebooting prevents zfs mount -a from being run automatically. This commit makes it so that zfs-mount.service is marked as being undone by the time umount.target is reached, so that zfs.target then pulls it in again and gets it restarted after a soft reboot. Reviewed by: Brian Behlendorf Signed-off-by: kotauskas Closes #16845 --- etc/systemd/system/zfs-mount.service.in | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/etc/systemd/system/zfs-mount.service.in b/etc/systemd/system/zfs-mount.service.in index 66d894923f4ac..41c02c8238ca5 100644 --- a/etc/systemd/system/zfs-mount.service.in +++ b/etc/systemd/system/zfs-mount.service.in @@ -8,6 +8,13 @@ After=systemd-remount-fs.service Before=local-fs.target ConditionPathIsDirectory=/sys/module/zfs +# This merely tells the service manager +# that unmounting everything undoes the +# effect of this service. No extra logic +# is ran as a result of these settings. +Conflicts=umount.target +Before=umount.target + [Service] Type=oneshot RemainAfterExit=yes From ff6266ee9bd0d6a59355b9980345819e6f68b50b Mon Sep 17 00:00:00 2001 From: Alexander Motin Date: Sat, 14 Dec 2024 17:02:11 -0500 Subject: [PATCH 08/43] Fix use-after-free regression in RAIDZ expansion We should not dereference rra after the last zio_nowait() is called. 
It seems very unlikely, but ASAN in ztest managed to catch it. Reviewed-by: Brian Behlendorf Signed-off-by: Alexander Motin Sponsored by: iXsystems, Inc. Closes #16868 --- module/zfs/vdev_raidz.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/module/zfs/vdev_raidz.c b/module/zfs/vdev_raidz.c index e4487c4850751..6103f780e6bc9 100644 --- a/module/zfs/vdev_raidz.c +++ b/module/zfs/vdev_raidz.c @@ -3914,8 +3914,8 @@ raidz_reflow_read_done(zio_t *zio) if (atomic_dec_32_nv(&rra->rra_tbd) > 0) return; - rra->rra_tbd = rra->rra_writes; - for (uint64_t i = 0; i < rra->rra_writes; i++) + uint32_t writes = rra->rra_tbd = rra->rra_writes; + for (uint64_t i = 0; i < writes; i++) zio_nowait(rra->rra_zio[i]); } From 22259fb24d6ef551d38ff19ceeb86c2da4ba6543 Mon Sep 17 00:00:00 2001 From: Brian Behlendorf Date: Sat, 14 Dec 2024 14:05:12 -0800 Subject: [PATCH 09/43] Fix stray "no" in configure output This is purely a cosmetic fix which removes a stray "no" from the configure output. Reviewed-by: Tino Reichardt Reviewed-by: Alexander Motin Signed-off-by: Brian Behlendorf Closes #16867 --- config/kernel-xattr-handler.m4 | 1 - 1 file changed, 1 deletion(-) diff --git a/config/kernel-xattr-handler.m4 b/config/kernel-xattr-handler.m4 index d933cff7a4b92..ea4466d83fcc4 100644 --- a/config/kernel-xattr-handler.m4 +++ b/config/kernel-xattr-handler.m4 @@ -54,7 +54,6 @@ AC_DEFUN([ZFS_AC_KERNEL_SRC_XATTR_HANDLER_GET_DENTRY_INODE_FLAGS], [ ]) AC_DEFUN([ZFS_AC_KERNEL_XATTR_HANDLER_GET_DENTRY_INODE_FLAGS], [ - AC_MSG_RESULT(no) AC_MSG_CHECKING( [whether xattr_handler->get() wants dentry and inode and flags]) ZFS_LINUX_TEST_RESULT([xattr_handler_get_dentry_inode_flags], [ From acda137d8c160885542bcfd6fa9ef8fd650e95d2 Mon Sep 17 00:00:00 2001 From: Shengqi Chen Date: Tue, 17 Dec 2024 01:40:41 +0800 Subject: [PATCH 10/43] simd_stat: fix undefined CONFIG_KERNEL_MODE_NEON error on armel CONFIG_KERNEL_MODE_NEON depends on CONFIG_NEON. Neither is defined on armel. 
Add a guard to avoid compilation errors. Reviewed-by: Brian Behlendorf Signed-off-by: Shengqi Chen Closes #16871 --- module/zcommon/simd_stat.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/module/zcommon/simd_stat.c b/module/zcommon/simd_stat.c index 33c15140cdb92..d82a88ca9343e 100644 --- a/module/zcommon/simd_stat.c +++ b/module/zcommon/simd_stat.c @@ -132,8 +132,10 @@ simd_stat_kstat_data(char *buf, size_t size, void *data) #if defined(__arm__) || defined(__aarch64__) off += SIMD_STAT_PRINT(simd_stat_kstat_payload, "kernel_neon", HAVE_KERNEL_NEON); +#if defined(CONFIG_KERNEL_MODE_NEON) off += SIMD_STAT_PRINT(simd_stat_kstat_payload, "kernel_mode_neon", CONFIG_KERNEL_MODE_NEON); +#endif /* CONFIG_KERNEL_MODE_NEON */ off += SIMD_STAT_PRINT(simd_stat_kstat_payload, "neon", zfs_neon_available()); off += SIMD_STAT_PRINT(simd_stat_kstat_payload, From c6442bd3b6430e5bd57dac365a359b41e7fbd54e Mon Sep 17 00:00:00 2001 From: Brian Atkinson Date: Mon, 2 Dec 2024 18:04:56 -0700 Subject: [PATCH 11/43] Removing old code outside of 4.18 kernels There were checks still in place to verify we could completely use iov_iter's on the Linux side. All interfaces are available as of kernel 4.18, so there is no reason to check whether we should use that interface at this point. This PR completely removes the UIO_USERSPACE type. It also removes the checks for the direct_IO interface. 
Reviewed-by: Alexander Motin Reviewed-by: Brian Behlendorf Signed-off-by: Brian Atkinson Closes #16856 --- config/kernel-vfs-direct_IO.m4 | 57 ----------- config/kernel-vfs-iov_iter.m4 | 39 +------ config/kernel.m4 | 2 - include/os/linux/spl/sys/uio.h | 17 +--- lib/libspl/include/sys/uio.h | 3 +- module/os/linux/zfs/zfs_uio.c | 180 ++------------------------------- module/os/linux/zfs/zpl_file.c | 52 ++-------- 7 files changed, 26 insertions(+), 324 deletions(-) delete mode 100644 config/kernel-vfs-direct_IO.m4 diff --git a/config/kernel-vfs-direct_IO.m4 b/config/kernel-vfs-direct_IO.m4 deleted file mode 100644 index 17605a13fdef5..0000000000000 --- a/config/kernel-vfs-direct_IO.m4 +++ /dev/null @@ -1,57 +0,0 @@ -dnl # -dnl # Check for Direct I/O interfaces. -dnl # -AC_DEFUN([ZFS_AC_KERNEL_SRC_VFS_DIRECT_IO], [ - ZFS_LINUX_TEST_SRC([direct_io_iter], [ - #include - - static ssize_t test_direct_IO(struct kiocb *kiocb, - struct iov_iter *iter) { return 0; } - - static const struct address_space_operations - aops __attribute__ ((unused)) = { - .direct_IO = test_direct_IO, - }; - ],[]) - - ZFS_LINUX_TEST_SRC([direct_io_iter_offset], [ - #include - - static ssize_t test_direct_IO(struct kiocb *kiocb, - struct iov_iter *iter, loff_t offset) { return 0; } - - static const struct address_space_operations - aops __attribute__ ((unused)) = { - .direct_IO = test_direct_IO, - }; - ],[]) -]) - -AC_DEFUN([ZFS_AC_KERNEL_VFS_DIRECT_IO], [ - dnl # - dnl # Linux 4.6.x API change - dnl # - AC_MSG_CHECKING([whether aops->direct_IO() uses iov_iter]) - ZFS_LINUX_TEST_RESULT([direct_io_iter], [ - AC_MSG_RESULT([yes]) - AC_DEFINE(HAVE_VFS_DIRECT_IO_ITER, 1, - [aops->direct_IO() uses iov_iter without rw]) - ],[ - AC_MSG_RESULT([no]) - - dnl # - dnl # Linux 4.1.x API change - dnl # - AC_MSG_CHECKING( - [whether aops->direct_IO() uses offset]) - ZFS_LINUX_TEST_RESULT([direct_io_iter_offset], [ - AC_MSG_RESULT([yes]) - AC_DEFINE(HAVE_VFS_DIRECT_IO_ITER_OFFSET, 1, - [aops->direct_IO() uses 
iov_iter with offset]) - - ],[ - AC_MSG_RESULT([no]) - ZFS_LINUX_TEST_ERROR([Direct I/O]) - ]) - ]) -]) diff --git a/config/kernel-vfs-iov_iter.m4 b/config/kernel-vfs-iov_iter.m4 index ed7961a9e9ddd..29e19acddbb1b 100644 --- a/config/kernel-vfs-iov_iter.m4 +++ b/config/kernel-vfs-iov_iter.m4 @@ -15,7 +15,7 @@ AC_DEFUN([ZFS_AC_KERNEL_SRC_VFS_IOV_ITER], [ ZFS_LINUX_TEST_SRC([iov_iter_get_pages2], [ #include - ], [ + ],[ struct iov_iter iter = { 0 }; struct page **pages = NULL; size_t maxsize = 4096; @@ -27,20 +27,6 @@ AC_DEFUN([ZFS_AC_KERNEL_SRC_VFS_IOV_ITER], [ &start); ]) - ZFS_LINUX_TEST_SRC([iov_iter_get_pages], [ - #include - ], [ - struct iov_iter iter = { 0 }; - struct page **pages = NULL; - size_t maxsize = 4096; - unsigned maxpages = 1; - size_t start; - size_t ret __attribute__ ((unused)); - - ret = iov_iter_get_pages(&iter, pages, maxsize, maxpages, - &start); - ]) - ZFS_LINUX_TEST_SRC([iov_iter_type], [ #include #include @@ -59,7 +45,6 @@ AC_DEFUN([ZFS_AC_KERNEL_SRC_VFS_IOV_ITER], [ ]) AC_DEFUN([ZFS_AC_KERNEL_VFS_IOV_ITER], [ - enable_vfs_iov_iter="yes" AC_MSG_CHECKING([whether fault_in_iov_iter_readable() is available]) ZFS_LINUX_TEST_RESULT([fault_in_iov_iter_readable], [ @@ -78,17 +63,8 @@ AC_DEFUN([ZFS_AC_KERNEL_VFS_IOV_ITER], [ AC_MSG_RESULT(yes) AC_DEFINE(HAVE_IOV_ITER_GET_PAGES2, 1, [iov_iter_get_pages2() is available]) - ], [ + ],[ AC_MSG_RESULT(no) - AC_MSG_CHECKING([whether iov_iter_get_pages() is available]) - ZFS_LINUX_TEST_RESULT([iov_iter_get_pages], [ - AC_MSG_RESULT(yes) - AC_DEFINE(HAVE_IOV_ITER_GET_PAGES, 1, - [iov_iter_get_pages() is available]) - ], [ - AC_MSG_RESULT(no) - enable_vfs_iov_iter="no" - ]) ]) dnl # @@ -105,17 +81,6 @@ AC_DEFUN([ZFS_AC_KERNEL_VFS_IOV_ITER], [ AC_MSG_RESULT(no) ]) - dnl # - dnl # As of the 4.9 kernel support is provided for iovecs, kvecs, - dnl # bvecs and pipes in the iov_iter structure. 
As long as the - dnl # other support interfaces are all available the iov_iter can - dnl # be correctly used in the uio structure. - dnl # - AS_IF([test "x$enable_vfs_iov_iter" = "xyes"], [ - AC_DEFINE(HAVE_VFS_IOV_ITER, 1, - [All required iov_iter interfaces are available]) - ]) - dnl # dnl # Kernel 6.5 introduces the iter_iov() function that returns the dnl # __iov member of an iov_iter*. The iov member was renamed to this diff --git a/config/kernel.m4 b/config/kernel.m4 index 78f178ff27acd..49ec6266e87af 100644 --- a/config/kernel.m4 +++ b/config/kernel.m4 @@ -79,7 +79,6 @@ AC_DEFUN([ZFS_AC_KERNEL_TEST_SRC], [ ZFS_AC_KERNEL_SRC_VFS_READ_FOLIO ZFS_AC_KERNEL_SRC_VFS_MIGRATE_FOLIO ZFS_AC_KERNEL_SRC_VFS_FSYNC_2ARGS - ZFS_AC_KERNEL_SRC_VFS_DIRECT_IO ZFS_AC_KERNEL_SRC_VFS_READPAGES ZFS_AC_KERNEL_SRC_VFS_SET_PAGE_DIRTY_NOBUFFERS ZFS_AC_KERNEL_SRC_VFS_IOV_ITER @@ -190,7 +189,6 @@ AC_DEFUN([ZFS_AC_KERNEL_TEST_RESULT], [ ZFS_AC_KERNEL_VFS_READ_FOLIO ZFS_AC_KERNEL_VFS_MIGRATE_FOLIO ZFS_AC_KERNEL_VFS_FSYNC_2ARGS - ZFS_AC_KERNEL_VFS_DIRECT_IO ZFS_AC_KERNEL_VFS_READPAGES ZFS_AC_KERNEL_VFS_SET_PAGE_DIRTY_NOBUFFERS ZFS_AC_KERNEL_VFS_IOV_ITER diff --git a/include/os/linux/spl/sys/uio.h b/include/os/linux/spl/sys/uio.h index 5d483685eb205..9e7afea2ab345 100644 --- a/include/os/linux/spl/sys/uio.h +++ b/include/os/linux/spl/sys/uio.h @@ -40,7 +40,7 @@ */ #define UIO_DIRECT 0x0001 /* Direct I/O request */ -#if defined(HAVE_VFS_IOV_ITER) && defined(HAVE_FAULT_IN_IOV_ITER_READABLE) +#if defined(HAVE_FAULT_IN_IOV_ITER_READABLE) #define iov_iter_fault_in_readable(a, b) fault_in_iov_iter_readable(a, b) #endif @@ -52,12 +52,9 @@ typedef enum zfs_uio_rw { } zfs_uio_rw_t; typedef enum zfs_uio_seg { - UIO_USERSPACE = 0, - UIO_SYSSPACE = 1, - UIO_BVEC = 2, -#if defined(HAVE_VFS_IOV_ITER) - UIO_ITER = 3, -#endif + UIO_SYSSPACE = 0, + UIO_BVEC = 1, + UIO_ITER = 2, } zfs_uio_seg_t; /* @@ -72,9 +69,7 @@ typedef struct zfs_uio { union { const struct iovec *uio_iov; const struct bio_vec *uio_bvec; 
-#if defined(HAVE_VFS_IOV_ITER) struct iov_iter *uio_iter; -#endif }; int uio_iovcnt; /* Number of iovecs */ offset_t uio_soffset; /* Starting logical offset */ @@ -129,7 +124,7 @@ zfs_uio_iovec_init(zfs_uio_t *uio, const struct iovec *iov, unsigned long nr_segs, offset_t offset, zfs_uio_seg_t seg, ssize_t resid, size_t skip) { - ASSERT(seg == UIO_USERSPACE || seg == UIO_SYSSPACE); + ASSERT(seg == UIO_SYSSPACE); uio->uio_iov = iov; uio->uio_iovcnt = nr_segs; @@ -175,7 +170,6 @@ zfs_uio_bvec_init(zfs_uio_t *uio, struct bio *bio, struct request *rq) memset(&uio->uio_dio, 0, sizeof (zfs_uio_dio_t)); } -#if defined(HAVE_VFS_IOV_ITER) static inline void zfs_uio_iov_iter_init(zfs_uio_t *uio, struct iov_iter *iter, offset_t offset, ssize_t resid, size_t skip) @@ -192,7 +186,6 @@ zfs_uio_iov_iter_init(zfs_uio_t *uio, struct iov_iter *iter, offset_t offset, uio->uio_soffset = uio->uio_loffset; memset(&uio->uio_dio, 0, sizeof (zfs_uio_dio_t)); } -#endif /* HAVE_VFS_IOV_ITER */ #if defined(HAVE_ITER_IOV) #define zfs_uio_iter_iov(iter) iter_iov((iter)) diff --git a/lib/libspl/include/sys/uio.h b/lib/libspl/include/sys/uio.h index 16749fa492e5f..f86be64ce7f3d 100644 --- a/lib/libspl/include/sys/uio.h +++ b/lib/libspl/include/sys/uio.h @@ -57,8 +57,7 @@ typedef enum zfs_uio_rw { } zfs_uio_rw_t; typedef enum zfs_uio_seg { - UIO_USERSPACE = 0, - UIO_SYSSPACE = 1, + UIO_SYSSPACE = 0, } zfs_uio_seg_t; #elif defined(__FreeBSD__) diff --git a/module/os/linux/zfs/zfs_uio.c b/module/os/linux/zfs/zfs_uio.c index f08415fdb2e31..ed11f8b63fbf8 100644 --- a/module/os/linux/zfs/zfs_uio.c +++ b/module/os/linux/zfs/zfs_uio.c @@ -68,55 +68,13 @@ zfs_uiomove_iov(void *p, size_t n, zfs_uio_rw_t rw, zfs_uio_t *uio) size_t skip = uio->uio_skip; ulong_t cnt; + ASSERT3S(uio->uio_segflg, ==, UIO_SYSSPACE); while (n && uio->uio_resid) { cnt = MIN(iov->iov_len - skip, n); - switch (uio->uio_segflg) { - case UIO_USERSPACE: - /* - * p = kernel data pointer - * iov->iov_base = user data pointer - */ - if 
(rw == UIO_READ) { - if (copy_to_user(iov->iov_base+skip, p, cnt)) - return (EFAULT); - } else { - unsigned long b_left = 0; - if (uio->uio_fault_disable) { - if (!zfs_access_ok(VERIFY_READ, - (iov->iov_base + skip), cnt)) { - return (EFAULT); - } - pagefault_disable(); - b_left = - __copy_from_user_inatomic(p, - (iov->iov_base + skip), cnt); - pagefault_enable(); - } else { - b_left = - copy_from_user(p, - (iov->iov_base + skip), cnt); - } - if (b_left > 0) { - unsigned long c_bytes = - cnt - b_left; - uio->uio_skip += c_bytes; - ASSERT3U(uio->uio_skip, <, - iov->iov_len); - uio->uio_resid -= c_bytes; - uio->uio_loffset += c_bytes; - return (EFAULT); - } - } - break; - case UIO_SYSSPACE: - if (rw == UIO_READ) - memcpy(iov->iov_base + skip, p, cnt); - else - memcpy(p, iov->iov_base + skip, cnt); - break; - default: - ASSERT(0); - } + if (rw == UIO_READ) + memcpy(iov->iov_base + skip, p, cnt); + else + memcpy(p, iov->iov_base + skip, cnt); skip += cnt; if (skip == iov->iov_len) { skip = 0; @@ -268,7 +226,6 @@ zfs_uiomove_bvec(void *p, size_t n, zfs_uio_rw_t rw, zfs_uio_t *uio) return (zfs_uiomove_bvec_impl(p, n, rw, uio)); } -#if defined(HAVE_VFS_IOV_ITER) static int zfs_uiomove_iter(void *p, size_t n, zfs_uio_rw_t rw, zfs_uio_t *uio, boolean_t revert) @@ -303,17 +260,14 @@ zfs_uiomove_iter(void *p, size_t n, zfs_uio_rw_t rw, zfs_uio_t *uio, return (0); } -#endif int zfs_uiomove(void *p, size_t n, zfs_uio_rw_t rw, zfs_uio_t *uio) { if (uio->uio_segflg == UIO_BVEC) return (zfs_uiomove_bvec(p, n, rw, uio)); -#if defined(HAVE_VFS_IOV_ITER) else if (uio->uio_segflg == UIO_ITER) return (zfs_uiomove_iter(p, n, rw, uio, B_FALSE)); -#endif else return (zfs_uiomove_iov(p, n, rw, uio)); } @@ -336,44 +290,14 @@ zfs_uio_prefaultpages(ssize_t n, zfs_uio_t *uio) * there is never a time for these pages a fault will occur. 
*/ return (0); -#if defined(HAVE_VFS_IOV_ITER) - } else if (uio->uio_segflg == UIO_ITER) { + } else { + ASSERT3S(uio->uio_segflg, ==, UIO_ITER); /* - * At least a Linux 4.9 kernel, iov_iter_fault_in_readable() + * At least a Linux 4.18 kernel, iov_iter_fault_in_readable() * can be relied on to fault in user pages when referenced. */ if (iov_iter_fault_in_readable(uio->uio_iter, n)) return (EFAULT); -#endif - } else { - /* Fault in all user pages */ - ASSERT3S(uio->uio_segflg, ==, UIO_USERSPACE); - const struct iovec *iov = uio->uio_iov; - int iovcnt = uio->uio_iovcnt; - size_t skip = uio->uio_skip; - uint8_t tmp; - caddr_t p; - - for (; n > 0 && iovcnt > 0; iov++, iovcnt--, skip = 0) { - ulong_t cnt = MIN(iov->iov_len - skip, n); - /* empty iov */ - if (cnt == 0) - continue; - n -= cnt; - /* touch each page in this segment. */ - p = iov->iov_base + skip; - while (cnt) { - if (copy_from_user(&tmp, p, 1)) - return (EFAULT); - ulong_t incr = MIN(cnt, PAGESIZE); - p += incr; - cnt -= incr; - } - /* touch the last byte in case it straddles a page. 
*/ - p--; - if (copy_from_user(&tmp, p, 1)) - return (EFAULT); - } } return (0); @@ -394,10 +318,8 @@ zfs_uiocopy(void *p, size_t n, zfs_uio_rw_t rw, zfs_uio_t *uio, size_t *cbytes) if (uio->uio_segflg == UIO_BVEC) ret = zfs_uiomove_bvec(p, n, rw, &uio_copy); -#if defined(HAVE_VFS_IOV_ITER) else if (uio->uio_segflg == UIO_ITER) ret = zfs_uiomove_iter(p, n, rw, &uio_copy, B_TRUE); -#endif else ret = zfs_uiomove_iov(p, n, rw, &uio_copy); @@ -430,11 +352,10 @@ zfs_uioskip(zfs_uio_t *uio, size_t n) uio->uio_bvec++; uio->uio_iovcnt--; } -#if defined(HAVE_VFS_IOV_ITER) } else if (uio->uio_segflg == UIO_ITER) { iov_iter_advance(uio->uio_iter, n); -#endif } else { + ASSERT3S(uio->uio_segflg, ==, UIO_SYSSPACE); uio->uio_skip += n; while (uio->uio_iovcnt && uio->uio_skip >= uio->uio_iov->iov_len) { @@ -457,8 +378,7 @@ zfs_uio_page_aligned(zfs_uio_t *uio) { boolean_t aligned = B_TRUE; - if (uio->uio_segflg == UIO_USERSPACE || - uio->uio_segflg == UIO_SYSSPACE) { + if (uio->uio_segflg == UIO_SYSSPACE) { const struct iovec *iov = uio->uio_iov; size_t skip = uio->uio_skip; @@ -472,12 +392,10 @@ zfs_uio_page_aligned(zfs_uio_t *uio) } skip = 0; } -#if defined(HAVE_VFS_IOV_ITER) } else if (uio->uio_segflg == UIO_ITER) { unsigned long alignment = iov_iter_alignment(uio->uio_iter); aligned = IS_P2ALIGNED(alignment, PAGE_SIZE); -#endif } else { /* Currently not supported */ aligned = B_FALSE; @@ -578,76 +496,6 @@ zfs_uio_free_dio_pages(zfs_uio_t *uio, zfs_uio_rw_t rw) uio->uio_dio.npages * sizeof (struct page *)); } -/* - * zfs_uio_iov_step() is just a modified version of the STEP function of Linux's - * iov_iter_get_pages(). 
- */ -static int -zfs_uio_iov_step(struct iovec v, zfs_uio_rw_t rw, zfs_uio_t *uio, - long *numpages) -{ - unsigned long addr = (unsigned long)(v.iov_base); - size_t len = v.iov_len; - unsigned long n = DIV_ROUND_UP(len, PAGE_SIZE); - - /* - * read returning FOLL_WRITE is due to the fact that we are stating - * that the kernel will have write access to the user pages. So, when a - * Direct I/O read request is issued, the kernel must write to the user - * pages. - */ - long res = get_user_pages_unlocked( - P2ALIGN_TYPED(addr, PAGE_SIZE, unsigned long), n, - &uio->uio_dio.pages[uio->uio_dio.npages], - rw == UIO_READ ? FOLL_WRITE : 0); - if (res < 0) { - return (SET_ERROR(-res)); - } else if (len != (res * PAGE_SIZE)) { - return (SET_ERROR(EFAULT)); - } - - ASSERT3S(len, ==, res * PAGE_SIZE); - *numpages = res; - return (0); -} - -static int -zfs_uio_get_dio_pages_iov(zfs_uio_t *uio, zfs_uio_rw_t rw) -{ - const struct iovec *iovp = uio->uio_iov; - size_t skip = uio->uio_skip; - size_t len = uio->uio_resid - skip; - - ASSERT(uio->uio_segflg != UIO_SYSSPACE); - - for (int i = 0; i < uio->uio_iovcnt; i++) { - struct iovec iov; - long numpages = 0; - - if (iovp->iov_len == 0) { - iovp++; - skip = 0; - continue; - } - iov.iov_len = MIN(len, iovp->iov_len - skip); - iov.iov_base = iovp->iov_base + skip; - int error = zfs_uio_iov_step(iov, rw, uio, &numpages); - - if (error) - return (error); - - uio->uio_dio.npages += numpages; - len -= iov.iov_len; - skip = 0; - iovp++; - } - - ASSERT0(len); - - return (0); -} - -#if defined(HAVE_VFS_IOV_ITER) static int zfs_uio_get_dio_pages_iov_iter(zfs_uio_t *uio, zfs_uio_rw_t rw) { @@ -688,7 +536,6 @@ zfs_uio_get_dio_pages_iov_iter(zfs_uio_t *uio, zfs_uio_rw_t rw) return (0); } -#endif /* HAVE_VFS_IOV_ITER */ /* * This function pins user pages. 
In the event that the user pages were not @@ -703,14 +550,9 @@ zfs_uio_get_dio_pages_alloc(zfs_uio_t *uio, zfs_uio_rw_t rw) long npages = DIV_ROUND_UP(uio->uio_resid, PAGE_SIZE); size_t size = npages * sizeof (struct page *); - if (uio->uio_segflg == UIO_USERSPACE) { - uio->uio_dio.pages = vmem_alloc(size, KM_SLEEP); - error = zfs_uio_get_dio_pages_iov(uio, rw); -#if defined(HAVE_VFS_IOV_ITER) - } else if (uio->uio_segflg == UIO_ITER) { + if (uio->uio_segflg == UIO_ITER) { uio->uio_dio.pages = vmem_alloc(size, KM_SLEEP); error = zfs_uio_get_dio_pages_iov_iter(uio, rw); -#endif } else { return (SET_ERROR(EOPNOTSUPP)); } diff --git a/module/os/linux/zfs/zpl_file.c b/module/os/linux/zfs/zpl_file.c index ff1370c543dc1..42dfddc2717b8 100644 --- a/module/os/linux/zfs/zpl_file.c +++ b/module/os/linux/zfs/zpl_file.c @@ -216,27 +216,6 @@ zpl_file_accessed(struct file *filp) } } -/* - * When HAVE_VFS_IOV_ITER is defined the iov_iter structure supports - * iovecs, kvevs, bvecs and pipes, plus all the required interfaces to - * manipulate the iov_iter are available. In which case the full iov_iter - * can be attached to the uio and correctly handled in the lower layers. - * Otherwise, for older kernels extract the iovec and pass it instead. - */ -static void -zpl_uio_init(zfs_uio_t *uio, struct kiocb *kiocb, struct iov_iter *to, - loff_t pos, ssize_t count, size_t skip) -{ -#if defined(HAVE_VFS_IOV_ITER) - zfs_uio_iov_iter_init(uio, to, pos, count, skip); -#else - zfs_uio_iovec_init(uio, zfs_uio_iter_iov(to), to->nr_segs, pos, - zfs_uio_iov_iter_type(to) & ITER_KVEC ? 
- UIO_SYSSPACE : UIO_USERSPACE, - count, skip); -#endif -} - static ssize_t zpl_iter_read(struct kiocb *kiocb, struct iov_iter *to) { @@ -246,7 +225,7 @@ zpl_iter_read(struct kiocb *kiocb, struct iov_iter *to) ssize_t count = iov_iter_count(to); zfs_uio_t uio; - zpl_uio_init(&uio, kiocb, to, kiocb->ki_pos, count, 0); + zfs_uio_iov_iter_init(&uio, to, kiocb->ki_pos, count, 0); crhold(cr); cookie = spl_fstrans_mark(); @@ -296,7 +275,8 @@ zpl_iter_write(struct kiocb *kiocb, struct iov_iter *from) if (ret) return (ret); - zpl_uio_init(&uio, kiocb, from, kiocb->ki_pos, count, from->iov_offset); + zfs_uio_iov_iter_init(&uio, from, kiocb->ki_pos, count, + from->iov_offset); crhold(cr); cookie = spl_fstrans_mark(); @@ -317,34 +297,18 @@ zpl_iter_write(struct kiocb *kiocb, struct iov_iter *from) } static ssize_t -zpl_direct_IO_impl(void) +zpl_direct_IO(struct kiocb *kiocb, struct iov_iter *iter) { /* * All O_DIRECT requests should be handled by - * zpl_{iter/aio}_{write/read}(). There is no way kernel generic code - * should call the direct_IO address_space_operations function. We set - * this code path to be fatal if it is executed. + * zpl_iter_write/read}(). There is no way kernel generic code should + * call the direct_IO address_space_operations function. We set this + * code path to be fatal if it is executed. 
*/ PANIC(0); return (0); } -#if defined(HAVE_VFS_DIRECT_IO_ITER) -static ssize_t -zpl_direct_IO(struct kiocb *kiocb, struct iov_iter *iter) -{ - return (zpl_direct_IO_impl()); -} -#elif defined(HAVE_VFS_DIRECT_IO_ITER_OFFSET) -static ssize_t -zpl_direct_IO(struct kiocb *kiocb, struct iov_iter *iter, loff_t pos) -{ - return (zpl_direct_IO_impl()); -} -#else -#error "Unknown Direct I/O interface" -#endif - static loff_t zpl_llseek(struct file *filp, loff_t offset, int whence) { @@ -1104,14 +1068,12 @@ const struct file_operations zpl_file_operations = { .llseek = zpl_llseek, .read_iter = zpl_iter_read, .write_iter = zpl_iter_write, -#ifdef HAVE_VFS_IOV_ITER #ifdef HAVE_COPY_SPLICE_READ .splice_read = copy_splice_read, #else .splice_read = generic_file_splice_read, #endif .splice_write = iter_file_splice_write, -#endif .mmap = zpl_mmap, .fsync = zpl_fsync, .fallocate = zpl_fallocate, From 882a80998379184da5b36e1ef3f1444e2cf1461a Mon Sep 17 00:00:00 2001 From: Brian Atkinson Date: Tue, 10 Dec 2024 10:21:06 -0700 Subject: [PATCH 12/43] Use pin_user_pages API for Direct I/O requests As of kernel v5.8, pin_user_pages* interfaces were introduced. These interfaces use the FOLL_PIN flag. This is the preferred interface now for Direct I/O requests in the kernel. The reasoning for using this new interface for Direct I/O requests is explained in the kernel documentation: Documentation/core-api/pin_user_pages.rst If pin_user_pages_unlocked is available, then all Direct I/O requests will use this new API to stay up to date with the kernel API requirements. 
Reviewed-by: Alexander Motin Reviewed-by: Brian Behlendorf Signed-off-by: Brian Atkinson Closes #16856 --- config/kernel-pin-user-pages.m4 | 33 ++++++++++ config/kernel-vfs-iov_iter.m4 | 43 ++++++------- config/kernel.m4 | 2 + module/os/linux/zfs/zfs_uio.c | 111 +++++++++++++++++++++++++++----- 4 files changed, 148 insertions(+), 41 deletions(-) create mode 100644 config/kernel-pin-user-pages.m4 diff --git a/config/kernel-pin-user-pages.m4 b/config/kernel-pin-user-pages.m4 new file mode 100644 index 0000000000000..fe7aff3752080 --- /dev/null +++ b/config/kernel-pin-user-pages.m4 @@ -0,0 +1,33 @@ +dnl # +dnl # Check for pin_user_pages_unlocked(). +dnl # +AC_DEFUN([ZFS_AC_KERNEL_SRC_PIN_USER_PAGES], [ + ZFS_LINUX_TEST_SRC([pin_user_pages_unlocked], [ + #include <linux/mm.h> + ],[ + unsigned long start = 0; + unsigned long nr_pages = 1; + struct page **pages = NULL; + unsigned int gup_flags = 0; + long ret __attribute__ ((unused)); + + ret = pin_user_pages_unlocked(start, nr_pages, pages, + gup_flags); + ]) +]) + +AC_DEFUN([ZFS_AC_KERNEL_PIN_USER_PAGES], [ + + dnl # + dnl # Kernel 5.8 introduced the pin_user_pages* interfaces which should + dnl # be used for Direct I/O requests. 
+ dnl # + AC_MSG_CHECKING([whether pin_user_pages_unlocked() is available]) + ZFS_LINUX_TEST_RESULT([pin_user_pages_unlocked], [ + AC_MSG_RESULT(yes) + AC_DEFINE(HAVE_PIN_USER_PAGES_UNLOCKED, 1, + [pin_user_pages_unlocked() is available]) + ],[ + AC_MSG_RESULT(no) + ]) +]) diff --git a/config/kernel-vfs-iov_iter.m4 b/config/kernel-vfs-iov_iter.m4 index 29e19acddbb1b..a223343030db4 100644 --- a/config/kernel-vfs-iov_iter.m4 +++ b/config/kernel-vfs-iov_iter.m4 @@ -13,26 +13,21 @@ AC_DEFUN([ZFS_AC_KERNEL_SRC_VFS_IOV_ITER], [ error = fault_in_iov_iter_readable(&iter, size); ]) - ZFS_LINUX_TEST_SRC([iov_iter_get_pages2], [ + ZFS_LINUX_TEST_SRC([iov_iter_type], [ + #include #include ],[ struct iov_iter iter = { 0 }; - struct page **pages = NULL; - size_t maxsize = 4096; - unsigned maxpages = 1; - size_t start; - size_t ret __attribute__ ((unused)); - - ret = iov_iter_get_pages2(&iter, pages, maxsize, maxpages, - &start); + __attribute__((unused)) enum iter_type i = iov_iter_type(&iter); ]) - ZFS_LINUX_TEST_SRC([iov_iter_type], [ - #include + ZFS_LINUX_TEST_SRC([iter_is_ubuf], [ #include ],[ struct iov_iter iter = { 0 }; - __attribute__((unused)) enum iter_type i = iov_iter_type(&iter); + bool ret __attribute__((unused)); + + ret = iter_is_ubuf(&iter); ]) ZFS_LINUX_TEST_SRC([iter_iov], [ @@ -55,18 +50,6 @@ AC_DEFUN([ZFS_AC_KERNEL_VFS_IOV_ITER], [ AC_MSG_RESULT(no) ]) - dnl # - dnl # Kernel 6.0 changed iov_iter_get_pages() to iov_iter_page_pages2(). - dnl # - AC_MSG_CHECKING([whether iov_iter_get_pages2() is available]) - ZFS_LINUX_TEST_RESULT([iov_iter_get_pages2], [ - AC_MSG_RESULT(yes) - AC_DEFINE(HAVE_IOV_ITER_GET_PAGES2, 1, - [iov_iter_get_pages2() is available]) - ],[ - AC_MSG_RESULT(no) - ]) - dnl # dnl # This checks for iov_iter_type() in linux/uio.h. 
It is not dnl # required, however, and the module will compiled without it @@ -81,6 +64,18 @@ AC_DEFUN([ZFS_AC_KERNEL_VFS_IOV_ITER], [ AC_MSG_RESULT(no) ]) + dnl # + dnl # Kernel 6.0 introduced the ITER_UBUF iov_iter type. iter_is_ubuf() + dnl # was also added to determine if the iov_iter is an ITER_UBUF. + dnl # + AC_MSG_CHECKING([whether iter_is_ubuf() is available]) + ZFS_LINUX_TEST_RESULT([iter_is_ubuf], [ + AC_MSG_RESULT(yes) + AC_DEFINE(HAVE_ITER_IS_UBUF, 1, [iter_is_ubuf() is available]) + ],[ + AC_MSG_RESULT(no) + ]) + dnl # dnl # Kernel 6.5 introduces the iter_iov() function that returns the dnl # __iov member of an iov_iter*. The iov member was renamed to this diff --git a/config/kernel.m4 b/config/kernel.m4 index 49ec6266e87af..ae66633907bf7 100644 --- a/config/kernel.m4 +++ b/config/kernel.m4 @@ -127,6 +127,7 @@ AC_DEFUN([ZFS_AC_KERNEL_TEST_SRC], [ ZFS_AC_KERNEL_SRC_MM_PAGE_SIZE ZFS_AC_KERNEL_SRC_MM_PAGE_MAPPING ZFS_AC_KERNEL_SRC_FILE + ZFS_AC_KERNEL_SRC_PIN_USER_PAGES case "$host_cpu" in powerpc*) ZFS_AC_KERNEL_SRC_CPU_HAS_FEATURE @@ -238,6 +239,7 @@ AC_DEFUN([ZFS_AC_KERNEL_TEST_RESULT], [ ZFS_AC_KERNEL_MM_PAGE_MAPPING ZFS_AC_KERNEL_1ARG_ASSIGN_STR ZFS_AC_KERNEL_FILE + ZFS_AC_KERNEL_PIN_USER_PAGES case "$host_cpu" in powerpc*) ZFS_AC_KERNEL_CPU_HAS_FEATURE diff --git a/module/os/linux/zfs/zfs_uio.c b/module/os/linux/zfs/zfs_uio.c index ed11f8b63fbf8..db85b626f12af 100644 --- a/module/os/linux/zfs/zfs_uio.c +++ b/module/os/linux/zfs/zfs_uio.c @@ -441,6 +441,7 @@ zfs_unmark_page(struct page *page) } #endif /* HAVE_ZERO_PAGE_GPL_ONLY || !_LP64 */ +#if !defined(HAVE_PIN_USER_PAGES_UNLOCKED) static void zfs_uio_dio_check_for_zero_page(zfs_uio_t *uio) { @@ -472,6 +473,7 @@ zfs_uio_dio_check_for_zero_page(zfs_uio_t *uio) } } } +#endif void zfs_uio_free_dio_pages(zfs_uio_t *uio, zfs_uio_rw_t rw) @@ -480,6 +482,9 @@ zfs_uio_free_dio_pages(zfs_uio_t *uio, zfs_uio_rw_t rw) ASSERT(uio->uio_extflg & UIO_DIRECT); ASSERT3P(uio->uio_dio.pages, !=, NULL); +#if 
defined(HAVE_PIN_USER_PAGES_UNLOCKED) + unpin_user_pages(uio->uio_dio.pages, uio->uio_dio.npages); +#else for (long i = 0; i < uio->uio_dio.npages; i++) { struct page *p = uio->uio_dio.pages[i]; @@ -491,44 +496,106 @@ zfs_uio_free_dio_pages(zfs_uio_t *uio, zfs_uio_rw_t rw) put_page(p); } - +#endif vmem_free(uio->uio_dio.pages, uio->uio_dio.npages * sizeof (struct page *)); } +#if defined(HAVE_PIN_USER_PAGES_UNLOCKED) static int -zfs_uio_get_dio_pages_iov_iter(zfs_uio_t *uio, zfs_uio_rw_t rw) +zfs_uio_pin_user_pages(zfs_uio_t *uio, zfs_uio_rw_t rw) { + long res; size_t skip = uio->uio_skip; + size_t len = uio->uio_resid - skip; + unsigned int gup_flags = 0; + unsigned long addr; + unsigned long nr_pages; + + /* + * Kernel 6.2 introduced the FOLL_PCI_P2PDMA flag. This flag could + * possibly be used here in the future to allow for P2P operations with + * user pages. + */ + if (rw == UIO_READ) + gup_flags = FOLL_WRITE; + + if (len == 0) + return (0); + +#if defined(HAVE_ITER_IS_UBUF) + if (iter_is_ubuf(uio->uio_iter)) { + nr_pages = DIV_ROUND_UP(len, PAGE_SIZE); + addr = (unsigned long)uio->uio_iter->ubuf + skip; + res = pin_user_pages_unlocked(addr, nr_pages, + &uio->uio_dio.pages[uio->uio_dio.npages], gup_flags); + if (res < 0) { + return (SET_ERROR(-res)); + } else if (len != (res * PAGE_SIZE)) { + uio->uio_dio.npages += res; + return (SET_ERROR(EFAULT)); + } + uio->uio_dio.npages += res; + return (0); + } +#endif + const struct iovec *iovp = zfs_uio_iter_iov(uio->uio_iter); + for (int i = 0; i < uio->uio_iovcnt; i++) { + size_t amt = iovp->iov_len - skip; + if (amt == 0) { + iovp++; + skip = 0; + continue; + } + + addr = (unsigned long)iovp->iov_base + skip; + nr_pages = DIV_ROUND_UP(amt, PAGE_SIZE); + res = pin_user_pages_unlocked(addr, nr_pages, + &uio->uio_dio.pages[uio->uio_dio.npages], gup_flags); + if (res < 0) { + return (SET_ERROR(-res)); + } else if (amt != (res * PAGE_SIZE)) { + uio->uio_dio.npages += res; + return (SET_ERROR(EFAULT)); + } + + len -= 
amt; + uio->uio_dio.npages += res; + skip = 0; + iovp++; + }; + + ASSERT0(len); + + return (0); +} + +#else +static int +zfs_uio_get_dio_pages_iov_iter(zfs_uio_t *uio, zfs_uio_rw_t rw) +{ + size_t start; size_t wanted = uio->uio_resid - uio->uio_skip; ssize_t rollback = 0; ssize_t cnt; unsigned maxpages = DIV_ROUND_UP(wanted, PAGE_SIZE); while (wanted) { -#if defined(HAVE_IOV_ITER_GET_PAGES2) - cnt = iov_iter_get_pages2(uio->uio_iter, - &uio->uio_dio.pages[uio->uio_dio.npages], - wanted, maxpages, &skip); -#else cnt = iov_iter_get_pages(uio->uio_iter, &uio->uio_dio.pages[uio->uio_dio.npages], - wanted, maxpages, &skip); -#endif + wanted, maxpages, &start); if (cnt < 0) { iov_iter_revert(uio->uio_iter, rollback); return (SET_ERROR(-cnt)); } + /* + * All Direct I/O operations must be page aligned. + */ + ASSERT(IS_P2ALIGNED(start, PAGE_SIZE)); uio->uio_dio.npages += DIV_ROUND_UP(cnt, PAGE_SIZE); rollback += cnt; wanted -= cnt; - skip = 0; -#if !defined(HAVE_IOV_ITER_GET_PAGES2) - /* - * iov_iter_get_pages2() advances the iov_iter on success. - */ iov_iter_advance(uio->uio_iter, cnt); -#endif } ASSERT3U(rollback, ==, uio->uio_resid - uio->uio_skip); @@ -536,6 +603,7 @@ zfs_uio_get_dio_pages_iov_iter(zfs_uio_t *uio, zfs_uio_rw_t rw) return (0); } +#endif /* HAVE_PIN_USER_PAGES_UNLOCKED */ /* * This function pins user pages. 
In the event that the user pages were not @@ -552,7 +620,11 @@ zfs_uio_get_dio_pages_alloc(zfs_uio_t *uio, zfs_uio_rw_t rw) if (uio->uio_segflg == UIO_ITER) { uio->uio_dio.pages = vmem_alloc(size, KM_SLEEP); +#if defined(HAVE_PIN_USER_PAGES_UNLOCKED) + error = zfs_uio_pin_user_pages(uio, rw); +#else error = zfs_uio_get_dio_pages_iov_iter(uio, rw); +#endif } else { return (SET_ERROR(EOPNOTSUPP)); } @@ -560,17 +632,22 @@ zfs_uio_get_dio_pages_alloc(zfs_uio_t *uio, zfs_uio_rw_t rw) ASSERT3S(uio->uio_dio.npages, >=, 0); if (error) { +#if defined(HAVE_PIN_USER_PAGES_UNLOCKED) + unpin_user_pages(uio->uio_dio.pages, uio->uio_dio.npages); +#else for (long i = 0; i < uio->uio_dio.npages; i++) put_page(uio->uio_dio.pages[i]); +#endif vmem_free(uio->uio_dio.pages, size); return (error); } else { ASSERT3S(uio->uio_dio.npages, ==, npages); } - if (rw == UIO_WRITE) { +#if !defined(HAVE_PIN_USER_PAGES_UNLOCKED) + if (rw == UIO_WRITE) zfs_uio_dio_check_for_zero_page(uio); - } +#endif uio->uio_extflg |= UIO_DIRECT; From 830a53124941cc1053b4a3f22c8bef18bf6eb942 Mon Sep 17 00:00:00 2001 From: Brian Behlendorf Date: Tue, 17 Dec 2024 08:58:33 -0800 Subject: [PATCH 13/43] CI: Add FreeBSD 14.2 RELEASE+STABLE builds Update the CI to include FreeBSD 14.2 as a regularly tested platform. 
Reviewed-by: Tino Reichardt Reviewed-by: Alexander Motin Signed-off-by: Brian Behlendorf Closes #16869 --- .github/workflows/scripts/qemu-2-start.sh | 20 ++++++++++---------- .github/workflows/zfs-qemu.yml | 9 +++++---- 2 files changed, 15 insertions(+), 14 deletions(-) diff --git a/.github/workflows/scripts/qemu-2-start.sh b/.github/workflows/scripts/qemu-2-start.sh index 39ac92107b71d..f0f505cac4b7a 100755 --- a/.github/workflows/scripts/qemu-2-start.sh +++ b/.github/workflows/scripts/qemu-2-start.sh @@ -14,7 +14,7 @@ OSv=$OS # compressed with .zst extension REPO="https://github.com/mcmilk/openzfs-freebsd-images" -FREEBSD="$REPO/releases/download/v2024-10-05" +FREEBSD="$REPO/releases/download/v2024-12-14" URLzs="" # Ubuntu mirrors @@ -76,28 +76,28 @@ case "$OS" in BASH="/usr/local/bin/bash" NIC="rtl8139" ;; - freebsd14-0r) - OSNAME="FreeBSD 14.0-RELEASE" - OSv="freebsd14.0" - URLzs="$FREEBSD/amd64-freebsd-14.0-RELEASE.qcow2.zst" - BASH="/usr/local/bin/bash" - ;; freebsd14-1r) OSNAME="FreeBSD 14.1-RELEASE" OSv="freebsd14.0" URLzs="$FREEBSD/amd64-freebsd-14.1-RELEASE.qcow2.zst" BASH="/usr/local/bin/bash" ;; + freebsd14-2r) + OSNAME="FreeBSD 14.2-RELEASE" + OSv="freebsd14.0" + URLzs="$FREEBSD/amd64-freebsd-14.2-RELEASE.qcow2.zst" + BASH="/usr/local/bin/bash" + ;; freebsd13-4s) OSNAME="FreeBSD 13.4-STABLE" OSv="freebsd13.0" URLzs="$FREEBSD/amd64-freebsd-13.4-STABLE.qcow2.zst" BASH="/usr/local/bin/bash" ;; - freebsd14-1s) - OSNAME="FreeBSD 14.1-STABLE" + freebsd14-2s) + OSNAME="FreeBSD 14.2-STABLE" OSv="freebsd14.0" - URLzs="$FREEBSD/amd64-freebsd-14.1-STABLE.qcow2.zst" + URLzs="$FREEBSD/amd64-freebsd-14.2-STABLE.qcow2.zst" BASH="/usr/local/bin/bash" ;; freebsd15-0c) diff --git a/.github/workflows/zfs-qemu.yml b/.github/workflows/zfs-qemu.yml index e90030f4c02eb..4748e90db50bf 100644 --- a/.github/workflows/zfs-qemu.yml +++ b/.github/workflows/zfs-qemu.yml @@ -22,8 +22,8 @@ jobs: - name: Generate OS config and CI type id: os run: | - FULL_OS='["almalinux8", 
"almalinux9", "centos-stream9", "debian11", "debian12", "fedora40", "fedora41", "freebsd13-4r", "freebsd14-0r", "freebsd14-1s", "ubuntu20", "ubuntu22", "ubuntu24"]' - QUICK_OS='["almalinux8", "almalinux9", "debian12", "fedora41", "freebsd13-3r", "freebsd14-1r", "ubuntu24"]' + FULL_OS='["almalinux8", "almalinux9", "centos-stream9", "debian11", "debian12", "fedora40", "fedora41", "freebsd13-3r", "freebsd13-4s", "freebsd14-1r", "freebsd14-2s", "freebsd15-0c", "ubuntu20", "ubuntu22", "ubuntu24"]' + QUICK_OS='["almalinux8", "almalinux9", "debian12", "fedora41", "freebsd13-3r", "freebsd14-2r", "ubuntu24"]' # determine CI type when running on PR ci_type="full" if ${{ github.event_name == 'pull_request' }}; then @@ -49,8 +49,9 @@ jobs: # rhl: almalinux8, almalinux9, centos-stream9, fedora40, fedora41 # debian: debian11, debian12, ubuntu20, ubuntu22, ubuntu24 # misc: archlinux, tumbleweed - # FreeBSD Release: freebsd13-3r, freebsd13-4r, freebsd14-0r, freebsd14-1r - # FreeBSD Stable: freebsd13-4s, freebsd14-1s + # FreeBSD variants of 2024-12: + # FreeBSD Release: freebsd13-3r, freebsd13-4r, freebsd14-1r, freebsd14-2r + # FreeBSD Stable: freebsd13-4s, freebsd14-2s # FreeBSD Current: freebsd15-0c os: ${{ fromJson(needs.test-config.outputs.test_os) }} runs-on: ubuntu-24.04 From ab7cbbe7890e98c8d7df93ddf179f7bed977ef8b Mon Sep 17 00:00:00 2001 From: Rob Norris Date: Thu, 19 Dec 2024 10:25:12 +1100 Subject: [PATCH 14/43] zprop: fix value help for ZPOOL_PROP_CAPACITY It's a percentage and documented as such, but we were showing it as . 
Sponsored-by: https://despairlabs.com/sponsor/ Reviewed-by: Brian Behlendorf Reviewed-by: Alexander Motin Reviewed-by: George Melikov Signed-off-by: Rob Norris Closes #16881 --- module/zcommon/zpool_prop.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/module/zcommon/zpool_prop.c b/module/zcommon/zpool_prop.c index a709679b90320..ea9eda4b316dc 100644 --- a/module/zcommon/zpool_prop.c +++ b/module/zcommon/zpool_prop.c @@ -105,7 +105,7 @@ zpool_prop_init(void) PROP_READONLY, ZFS_TYPE_POOL, "", "FRAG", B_FALSE, sfeatures); zprop_register_number(ZPOOL_PROP_CAPACITY, "capacity", 0, PROP_READONLY, - ZFS_TYPE_POOL, "", "CAP", B_FALSE, sfeatures); + ZFS_TYPE_POOL, "", "CAP", B_FALSE, sfeatures); zprop_register_number(ZPOOL_PROP_GUID, "guid", 0, PROP_READONLY, ZFS_TYPE_POOL, "", "GUID", B_TRUE, sfeatures); zprop_register_number(ZPOOL_PROP_LOAD_GUID, "load_guid", 0, From e5ac7786bda1db64778b51a80136fa2e3e2e93d0 Mon Sep 17 00:00:00 2001 From: Tino Reichardt Date: Thu, 19 Dec 2024 17:01:34 +0100 Subject: [PATCH 15/43] CI: Fix FreeBSD 13.4 STABLE build In #16869 we added FreeBSD 13.4 STABLE, but forget the special thing, that the virtio nic within FreeBSD 13.x is buggy. This fix adds the needed rtl8139 nic to the VM. 
Reviewed-by: George Melikov Reviewed-by: Alexander Motin Signed-off-by: Tino Reichardt Closes #16885 --- .github/workflows/scripts/qemu-2-start.sh | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/scripts/qemu-2-start.sh b/.github/workflows/scripts/qemu-2-start.sh index f0f505cac4b7a..0906e438ac0d8 100755 --- a/.github/workflows/scripts/qemu-2-start.sh +++ b/.github/workflows/scripts/qemu-2-start.sh @@ -93,6 +93,7 @@ case "$OS" in OSv="freebsd13.0" URLzs="$FREEBSD/amd64-freebsd-13.4-STABLE.qcow2.zst" BASH="/usr/local/bin/bash" + NIC="rtl8139" ;; freebsd14-2s) OSNAME="FreeBSD 14.2-STABLE" From f00a57a7868fe2fb0f96205f9f1fbf23f865cffb Mon Sep 17 00:00:00 2001 From: Rob Norris Date: Fri, 20 Dec 2024 03:04:56 +1100 Subject: [PATCH 16/43] zfs_main: fix alignment on props usage output I guess we've got some long property names since this was first set up! Sponsored-by: https://despairlabs.com/sponsor/ Reviewed-by: Brian Behlendorf Reviewed-by: Tony Hutter Reviewed-by: Alexander Motin Reviewed-by: George Melikov Signed-off-by: Rob Norris Closes #16883 --- cmd/zfs/zfs_main.c | 32 ++++++++++++++++---------------- 1 file changed, 16 insertions(+), 16 deletions(-) diff --git a/cmd/zfs/zfs_main.c b/cmd/zfs/zfs_main.c index 7836f5909f4a6..73ccf72d263c2 100644 --- a/cmd/zfs/zfs_main.c +++ b/cmd/zfs/zfs_main.c @@ -500,7 +500,7 @@ usage_prop_cb(int prop, void *cb) { FILE *fp = cb; - (void) fprintf(fp, "\t%-15s ", zfs_prop_to_name(prop)); + (void) fprintf(fp, "\t%-22s ", zfs_prop_to_name(prop)); if (zfs_prop_readonly(prop)) (void) fprintf(fp, " NO "); @@ -561,40 +561,40 @@ usage(boolean_t requested) (void) fprintf(fp, "%s", gettext("\nThe following properties are supported:\n")); - (void) fprintf(fp, "\n\t%-14s %s %s %s\n\n", + (void) fprintf(fp, "\n\t%-21s %s %s %s\n\n", "PROPERTY", "EDIT", "INHERIT", "VALUES"); /* Iterate over all properties */ (void) zprop_iter(usage_prop_cb, fp, B_FALSE, B_TRUE, ZFS_TYPE_DATASET); - (void) fprintf(fp, "\t%-15s ", 
"userused@..."); + (void) fprintf(fp, "\t%-22s ", "userused@..."); (void) fprintf(fp, " NO NO \n"); - (void) fprintf(fp, "\t%-15s ", "groupused@..."); + (void) fprintf(fp, "\t%-22s ", "groupused@..."); (void) fprintf(fp, " NO NO \n"); - (void) fprintf(fp, "\t%-15s ", "projectused@..."); + (void) fprintf(fp, "\t%-22s ", "projectused@..."); (void) fprintf(fp, " NO NO \n"); - (void) fprintf(fp, "\t%-15s ", "userobjused@..."); + (void) fprintf(fp, "\t%-22s ", "userobjused@..."); (void) fprintf(fp, " NO NO \n"); - (void) fprintf(fp, "\t%-15s ", "groupobjused@..."); + (void) fprintf(fp, "\t%-22s ", "groupobjused@..."); (void) fprintf(fp, " NO NO \n"); - (void) fprintf(fp, "\t%-15s ", "projectobjused@..."); + (void) fprintf(fp, "\t%-22s ", "projectobjused@..."); (void) fprintf(fp, " NO NO \n"); - (void) fprintf(fp, "\t%-15s ", "userquota@..."); + (void) fprintf(fp, "\t%-22s ", "userquota@..."); (void) fprintf(fp, "YES NO | none\n"); - (void) fprintf(fp, "\t%-15s ", "groupquota@..."); + (void) fprintf(fp, "\t%-22s ", "groupquota@..."); (void) fprintf(fp, "YES NO | none\n"); - (void) fprintf(fp, "\t%-15s ", "projectquota@..."); + (void) fprintf(fp, "\t%-22s ", "projectquota@..."); (void) fprintf(fp, "YES NO | none\n"); - (void) fprintf(fp, "\t%-15s ", "userobjquota@..."); + (void) fprintf(fp, "\t%-22s ", "userobjquota@..."); (void) fprintf(fp, "YES NO | none\n"); - (void) fprintf(fp, "\t%-15s ", "groupobjquota@..."); + (void) fprintf(fp, "\t%-22s ", "groupobjquota@..."); (void) fprintf(fp, "YES NO | none\n"); - (void) fprintf(fp, "\t%-15s ", "projectobjquota@..."); + (void) fprintf(fp, "\t%-22s ", "projectobjquota@..."); (void) fprintf(fp, "YES NO | none\n"); - (void) fprintf(fp, "\t%-15s ", "written@"); + (void) fprintf(fp, "\t%-22s ", "written@"); (void) fprintf(fp, " NO NO \n"); - (void) fprintf(fp, "\t%-15s ", "written#"); + (void) fprintf(fp, "\t%-22s ", "written#"); (void) fprintf(fp, " NO NO \n"); (void) fprintf(fp, gettext("\nSizes are specified in bytes " From 
219a89cbbfdcbdc30c3417b56ec3e81fd8370ee5 Mon Sep 17 00:00:00 2001 From: Umer Saleem Date: Fri, 20 Dec 2024 01:02:58 +0500 Subject: [PATCH 17/43] Skip iterating over snapshots for share properties Setting sharenfs and sharesmb properties on a dataset can become costly if there is a large number of snapshots, since setting the share properties iterates over all snapshots present for a dataset. If it is the root dataset for which we are trying to set the share property, snapshots for all child datasets and their children will also be iterated. There is no need to iterate over snapshots for share properties because we do not allow share properties, or any other property, to be set on a snapshot itself except for user properties. This commit skips iterating over snapshots for share properties, instead iterating over all child datasets and their children for share properties. Reviewed-by: Brian Behlendorf Reviewed-by: Alexander Motin Signed-off-by: Umer Saleem Closes #16877 --- lib/libzfs/libzfs_changelist.c | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) diff --git a/lib/libzfs/libzfs_changelist.c b/lib/libzfs/libzfs_changelist.c index 4db1cbce9568d..47df8663165e1 100644 --- a/lib/libzfs/libzfs_changelist.c +++ b/lib/libzfs/libzfs_changelist.c @@ -563,8 +563,15 @@ change_one(zfs_handle_t *zhp, void *data) cn = NULL; } - if (!clp->cl_alldependents) - ret = zfs_iter_children_v2(zhp, 0, change_one, data); + if (!clp->cl_alldependents) { + if (clp->cl_prop != ZFS_PROP_MOUNTPOINT) { + ret = zfs_iter_filesystems_v2(zhp, 0, + change_one, data); + } else { + ret = zfs_iter_children_v2(zhp, 0, change_one, + data); + } + } /* * If we added the handle to the changelist, we will re-use it @@ -738,6 +745,11 @@ changelist_gather(zfs_handle_t *zhp, zfs_prop_t prop, int gather_flags, changelist_free(clp); return (NULL); } + } else if (clp->cl_prop != ZFS_PROP_MOUNTPOINT) { + if (zfs_iter_filesystems_v2(zhp, 0, change_one, clp) != 0) { + changelist_free(clp); + return 
(NULL); + } } else if (zfs_iter_children_v2(zhp, 0, change_one, clp) != 0) { changelist_free(clp); return (NULL); From 1acd2469643f8543f2941cdc72f977d56521c4c9 Mon Sep 17 00:00:00 2001 From: Alexander Motin Date: Fri, 20 Dec 2024 17:25:35 -0500 Subject: [PATCH 18/43] Fix readonly check for vdev user properties VDEV_PROP_USERPROP is equal do VDEV_PROP_INVAL and so is not a real property. That's why vdev_prop_readonly() does not work right for it. In particular it may declare all vdev user properties readonly on FreeBSD. Reviewed-by: Brian Behlendorf Reviewed-by: Rob Norris Signed-off-by: Alexander Motin Sponsored by: iXsystems, Inc. Closes #16890 --- module/zfs/vdev.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/module/zfs/vdev.c b/module/zfs/vdev.c index 250590f062eaa..85b6ee32158d9 100644 --- a/module/zfs/vdev.c +++ b/module/zfs/vdev.c @@ -5969,7 +5969,7 @@ vdev_prop_set(vdev_t *vd, nvlist_t *innvl, nvlist_t *outnvl) goto end; } - if (vdev_prop_readonly(prop)) { + if (prop != VDEV_PROP_USERPROP && vdev_prop_readonly(prop)) { error = EROFS; goto end; } From c37a2ddaaabc203feaae987f62289b2cfe306818 Mon Sep 17 00:00:00 2001 From: Rob Norris Date: Fri, 27 Dec 2024 09:10:09 +1100 Subject: [PATCH 19/43] microzap: set hard upper limit of 1M The count of chunks in a microzap block is stored as an uint16_t (mze_chunkid). Each chunk is 64 bytes, and the first is used to store a header, so there are 32767 usable chunks, which is just under 2M. 1M is the largest power-2-rounded block size under 2M, so we must set the limit there. If it goes higher, the loop in mzap_addent can overflow and fall into the PANIC case. Sponsored-by: Klara, Inc. Sponsored-by: Wasabi Technology, Inc. 
Reviewed-by: Alexander Motin Signed-off-by: Rob Norris Closes #16888 --- module/zfs/zap_micro.c | 18 +++++++++++++++--- 1 file changed, 15 insertions(+), 3 deletions(-) diff --git a/module/zfs/zap_micro.c b/module/zfs/zap_micro.c index 55b60006e58ce..a9298d3e940e6 100644 --- a/module/zfs/zap_micro.c +++ b/module/zfs/zap_micro.c @@ -54,14 +54,25 @@ * machinery to understand not to try to split a microzap block). * * If large_microzap is enabled, this value will be clamped to - * spa_maxblocksize(). If not, it will be clamped to SPA_OLD_MAXBLOCKSIZE. + * spa_maxblocksize(), up to 1M. If not, it will be clamped to + * SPA_OLD_MAXBLOCKSIZE. */ static int zap_micro_max_size = SPA_OLD_MAXBLOCKSIZE; +/* + * The 1M upper limit is necessary because the count of chunks in a microzap + * block is stored as a uint16_t (mze_chunkid). Each chunk is 64 bytes, and the + * first is used to store a header, so there are 32767 usable chunks, which is + * just under 2M. 1M is the largest power-2-rounded block size under 2M, so we + * must set the limit there. 
+ */ +#define MZAP_MAX_SIZE (1048576) + uint64_t zap_get_micro_max_size(spa_t *spa) { - uint64_t maxsz = P2ROUNDUP(zap_micro_max_size, SPA_MINBLOCKSIZE); + uint64_t maxsz = MIN(MZAP_MAX_SIZE, + P2ROUNDUP(zap_micro_max_size, SPA_MINBLOCKSIZE)); if (maxsz <= SPA_OLD_MAXBLOCKSIZE) return (maxsz); if (spa_feature_is_enabled(spa, SPA_FEATURE_LARGE_MICROZAP)) @@ -2031,5 +2042,6 @@ EXPORT_SYMBOL(zap_cursor_init_serialized); EXPORT_SYMBOL(zap_get_stats); ZFS_MODULE_PARAM(zfs, , zap_micro_max_size, INT, ZMOD_RW, - "Maximum micro ZAP size, before converting to a fat ZAP, in bytes"); + "Maximum micro ZAP size before converting to a fat ZAP, " + "in bytes (max 1M)"); #endif From 89f796dec688ab706bb9905b57f7725dc5c2f3ee Mon Sep 17 00:00:00 2001 From: Alexander Motin Date: Fri, 27 Dec 2024 10:01:22 -0500 Subject: [PATCH 20/43] ZTS: Increase write sizes for RAIDZ/dRAID tests Many RAIDZ/dRAID tests filled files doing millions of 100 or even 10 byte writes. It makes very little sense since we are not micro-benchmarking syscalls or VFS layer here, while before the blocks reach the vdev layer absolute majority of the small writes will be aggregated. In some cases I see we spend almost as much time creating the test files as actually running the tests. And sometimes the tests even time out after that. Reviewed-by: Tony Hutter Reviewed-by: George Melikov Signed-off-by: Alexander Motin Sponsored by: iXsystems, Inc. 
Closes #16905 --- .../tests/functional/raidz/raidz_expand_001_pos.ksh | 6 +++--- .../tests/functional/raidz/raidz_expand_002_pos.ksh | 6 +++--- .../tests/functional/raidz/raidz_expand_003_neg.ksh | 2 +- .../tests/functional/raidz/raidz_expand_003_pos.ksh | 4 ++-- .../tests/functional/raidz/raidz_expand_004_pos.ksh | 4 ++-- .../tests/functional/raidz/raidz_expand_005_pos.ksh | 4 ++-- .../tests/functional/redundancy/redundancy_draid.ksh | 6 +++--- .../functional/redundancy/redundancy_draid_damaged1.ksh | 6 +++--- .../functional/redundancy/redundancy_draid_damaged2.ksh | 6 +++--- .../tests/functional/redundancy/redundancy_raidz.ksh | 6 +++--- 10 files changed, 25 insertions(+), 25 deletions(-) diff --git a/tests/zfs-tests/tests/functional/raidz/raidz_expand_001_pos.ksh b/tests/zfs-tests/tests/functional/raidz/raidz_expand_001_pos.ksh index d4923fdb67d9f..125b0e5411a3a 100755 --- a/tests/zfs-tests/tests/functional/raidz/raidz_expand_001_pos.ksh +++ b/tests/zfs-tests/tests/functional/raidz/raidz_expand_001_pos.ksh @@ -200,13 +200,13 @@ log_must zpool create -f -o cachefile=none $TESTPOOL $raid ${disks[@]} log_must zfs set primarycache=metadata $TESTPOOL log_must zfs create $TESTPOOL/fs -log_must fill_fs /$TESTPOOL/fs 1 512 100 1024 R +log_must fill_fs /$TESTPOOL/fs 1 512 102400 1 R log_must zfs create -o compress=on $TESTPOOL/fs2 -log_must fill_fs /$TESTPOOL/fs2 1 512 100 1024 R +log_must fill_fs /$TESTPOOL/fs2 1 512 102400 1 R log_must zfs create -o compress=on -o recordsize=8k $TESTPOOL/fs3 -log_must fill_fs /$TESTPOOL/fs3 1 512 100 1024 R +log_must fill_fs /$TESTPOOL/fs3 1 512 102400 1 R log_must check_pool_status $TESTPOOL "errors" "No known data errors" diff --git a/tests/zfs-tests/tests/functional/raidz/raidz_expand_002_pos.ksh b/tests/zfs-tests/tests/functional/raidz/raidz_expand_002_pos.ksh index 56810aca099f4..185316a7cb858 100755 --- a/tests/zfs-tests/tests/functional/raidz/raidz_expand_002_pos.ksh +++ 
b/tests/zfs-tests/tests/functional/raidz/raidz_expand_002_pos.ksh @@ -78,13 +78,13 @@ log_must zpool create -f $opts $pool $raid ${disks[1..$(($nparity+1))]} log_must zfs set primarycache=metadata $pool log_must zfs create $pool/fs -log_must fill_fs /$pool/fs 1 512 100 1024 R +log_must fill_fs /$pool/fs 1 512 102400 1 R log_must zfs create -o compress=on $pool/fs2 -log_must fill_fs /$pool/fs2 1 512 100 1024 R +log_must fill_fs /$pool/fs2 1 512 102400 1 R log_must zfs create -o compress=on -o recordsize=8k $pool/fs3 -log_must fill_fs /$pool/fs3 1 512 100 1024 R +log_must fill_fs /$pool/fs3 1 512 102400 1 R typeset pool_size=$(get_pool_prop size $pool) diff --git a/tests/zfs-tests/tests/functional/raidz/raidz_expand_003_neg.ksh b/tests/zfs-tests/tests/functional/raidz/raidz_expand_003_neg.ksh index 4d85c46897b89..a2eb87b1f7228 100755 --- a/tests/zfs-tests/tests/functional/raidz/raidz_expand_003_neg.ksh +++ b/tests/zfs-tests/tests/functional/raidz/raidz_expand_003_neg.ksh @@ -92,7 +92,7 @@ log_must zpool destroy $pool log_must zpool create -f $opts $pool $raid ${disks[1..$(($devs-1))]} log_must zfs set primarycache=metadata $pool log_must zfs create $pool/fs -log_must fill_fs /$pool/fs 1 512 100 1024 R +log_must fill_fs /$pool/fs 1 512 102400 1 R allocated=$(zpool list -Hp -o allocated $pool) log_must set_tunable64 RAIDZ_EXPAND_MAX_REFLOW_BYTES $((allocated / 4)) log_must zpool attach $pool ${raid}-0 ${disks[$devs]} diff --git a/tests/zfs-tests/tests/functional/raidz/raidz_expand_003_pos.ksh b/tests/zfs-tests/tests/functional/raidz/raidz_expand_003_pos.ksh index 712b252617738..6f852c516ca4c 100755 --- a/tests/zfs-tests/tests/functional/raidz/raidz_expand_003_pos.ksh +++ b/tests/zfs-tests/tests/functional/raidz/raidz_expand_003_pos.ksh @@ -94,10 +94,10 @@ opts="-o cachefile=none" log_must zpool create -f $opts $pool $raid ${disks[1..$(($nparity+1))]} log_must zfs create -o recordsize=8k $pool/fs -log_must fill_fs /$pool/fs 1 256 100 1024 R +log_must fill_fs /$pool/fs 1 
256 102400 1 R log_must zfs create -o recordsize=128k $pool/fs2 -log_must fill_fs /$pool/fs2 1 256 100 1024 R +log_must fill_fs /$pool/fs2 1 256 102400 1 R for disk in ${disks[$(($nparity+2))..$devs]}; do log_must mkfile -n 400m /$pool/fs/file diff --git a/tests/zfs-tests/tests/functional/raidz/raidz_expand_004_pos.ksh b/tests/zfs-tests/tests/functional/raidz/raidz_expand_004_pos.ksh index 2be55dae42541..5056e4e4b1fd5 100755 --- a/tests/zfs-tests/tests/functional/raidz/raidz_expand_004_pos.ksh +++ b/tests/zfs-tests/tests/functional/raidz/raidz_expand_004_pos.ksh @@ -81,10 +81,10 @@ log_must set_tunable32 SCRUB_AFTER_EXPAND 0 log_must zpool create -f $opts $pool $raid ${disks[1..$(($nparity+1))]} log_must zfs create -o recordsize=8k $pool/fs -log_must fill_fs /$pool/fs 1 128 100 1024 R +log_must fill_fs /$pool/fs 1 128 102400 1 R log_must zfs create -o recordsize=128k $pool/fs2 -log_must fill_fs /$pool/fs2 1 128 100 1024 R +log_must fill_fs /$pool/fs2 1 128 102400 1 R for disk in ${disks[$(($nparity+2))..$devs]}; do log_must zpool attach $pool ${raid}-0 $disk diff --git a/tests/zfs-tests/tests/functional/raidz/raidz_expand_005_pos.ksh b/tests/zfs-tests/tests/functional/raidz/raidz_expand_005_pos.ksh index 56ee3e9be67c6..49b9f6c1d353f 100755 --- a/tests/zfs-tests/tests/functional/raidz/raidz_expand_005_pos.ksh +++ b/tests/zfs-tests/tests/functional/raidz/raidz_expand_005_pos.ksh @@ -137,10 +137,10 @@ log_must zpool create -f $opts $pool $raid ${disks[1..$(($nparity+1))]} devices="${disks[1..$(($nparity+1))]}" log_must zfs create -o recordsize=8k $pool/fs -log_must fill_fs /$pool/fs 1 128 100 1024 R +log_must fill_fs /$pool/fs 1 128 102400 1 R log_must zfs create -o recordsize=128k $pool/fs2 -log_must fill_fs /$pool/fs2 1 128 100 1024 R +log_must fill_fs /$pool/fs2 1 128 102400 1 R for disk in ${disks[$(($nparity+2))..$devs]}; do # Set pause to some random value near halfway point diff --git a/tests/zfs-tests/tests/functional/redundancy/redundancy_draid.ksh 
b/tests/zfs-tests/tests/functional/redundancy/redundancy_draid.ksh index 8208d2b4a3981..df113a98aa3c8 100755 --- a/tests/zfs-tests/tests/functional/redundancy/redundancy_draid.ksh +++ b/tests/zfs-tests/tests/functional/redundancy/redundancy_draid.ksh @@ -223,13 +223,13 @@ for nparity in 1 2 3; do log_must zfs set primarycache=metadata $TESTPOOL log_must zfs create $TESTPOOL/fs - log_must fill_fs /$TESTPOOL/fs 1 512 100 1024 R + log_must fill_fs /$TESTPOOL/fs 1 512 102400 1 R log_must zfs create -o compress=on $TESTPOOL/fs2 - log_must fill_fs /$TESTPOOL/fs2 1 512 100 1024 R + log_must fill_fs /$TESTPOOL/fs2 1 512 102400 1 R log_must zfs create -o compress=on -o recordsize=8k $TESTPOOL/fs3 - log_must fill_fs /$TESTPOOL/fs3 1 512 100 1024 R + log_must fill_fs /$TESTPOOL/fs3 1 512 102400 1 R typeset pool_size=$(get_pool_prop size $TESTPOOL) diff --git a/tests/zfs-tests/tests/functional/redundancy/redundancy_draid_damaged1.ksh b/tests/zfs-tests/tests/functional/redundancy/redundancy_draid_damaged1.ksh index 110c69159eb1b..50d7358411dc0 100755 --- a/tests/zfs-tests/tests/functional/redundancy/redundancy_draid_damaged1.ksh +++ b/tests/zfs-tests/tests/functional/redundancy/redundancy_draid_damaged1.ksh @@ -119,13 +119,13 @@ for nparity in 1 2 3; do log_must zfs set primarycache=metadata $TESTPOOL log_must zfs create $TESTPOOL/fs - log_must fill_fs /$TESTPOOL/fs 1 512 100 1024 R + log_must fill_fs /$TESTPOOL/fs 1 512 102400 1 R log_must zfs create -o compress=on $TESTPOOL/fs2 - log_must fill_fs /$TESTPOOL/fs2 1 512 100 1024 R + log_must fill_fs /$TESTPOOL/fs2 1 512 102400 1 R log_must zfs create -o compress=on -o recordsize=8k $TESTPOOL/fs3 - log_must fill_fs /$TESTPOOL/fs3 1 512 100 1024 R + log_must fill_fs /$TESTPOOL/fs3 1 512 102400 1 R log_must zpool export $TESTPOOL log_must zpool import -o cachefile=none -d $dir $TESTPOOL diff --git a/tests/zfs-tests/tests/functional/redundancy/redundancy_draid_damaged2.ksh 
b/tests/zfs-tests/tests/functional/redundancy/redundancy_draid_damaged2.ksh index b0bb4ef841298..ad66f86339864 100755 --- a/tests/zfs-tests/tests/functional/redundancy/redundancy_draid_damaged2.ksh +++ b/tests/zfs-tests/tests/functional/redundancy/redundancy_draid_damaged2.ksh @@ -94,13 +94,13 @@ for nparity in 1 2 3; do # log_must zfs set primarycache=metadata $TESTPOOL log_must zfs create $TESTPOOL/fs - log_must fill_fs /$TESTPOOL/fs 1 256 10 1024 R + log_must fill_fs /$TESTPOOL/fs 1 256 10240 1 R log_must zfs create -o compress=on $TESTPOOL/fs2 - log_must fill_fs /$TESTPOOL/fs2 1 256 10 1024 R + log_must fill_fs /$TESTPOOL/fs2 1 256 10240 1 R log_must zfs create -o compress=on -o recordsize=8k $TESTPOOL/fs3 - log_must fill_fs /$TESTPOOL/fs3 1 256 10 1024 R + log_must fill_fs /$TESTPOOL/fs3 1 256 10240 1 R log_must zpool export $TESTPOOL log_must zpool import -o cachefile=none -d $dir $TESTPOOL diff --git a/tests/zfs-tests/tests/functional/redundancy/redundancy_raidz.ksh b/tests/zfs-tests/tests/functional/redundancy/redundancy_raidz.ksh index 83cacda84b09b..7de35c947fec9 100755 --- a/tests/zfs-tests/tests/functional/redundancy/redundancy_raidz.ksh +++ b/tests/zfs-tests/tests/functional/redundancy/redundancy_raidz.ksh @@ -223,13 +223,13 @@ for nparity in 1 2 3; do log_must zfs set primarycache=metadata $TESTPOOL log_must zfs create $TESTPOOL/fs - log_must fill_fs /$TESTPOOL/fs 1 512 100 1024 R + log_must fill_fs /$TESTPOOL/fs 1 512 102400 1 R log_must zfs create -o compress=on $TESTPOOL/fs2 - log_must fill_fs /$TESTPOOL/fs2 1 512 100 1024 R + log_must fill_fs /$TESTPOOL/fs2 1 512 102400 1 R log_must zfs create -o compress=on -o recordsize=8k $TESTPOOL/fs3 - log_must fill_fs /$TESTPOOL/fs3 1 512 100 1024 R + log_must fill_fs /$TESTPOOL/fs3 1 512 102400 1 R typeset pool_size=$(get_pool_prop size $TESTPOOL) From 779c5a5debf219b87cd335852ad060fa79acf3e8 Mon Sep 17 00:00:00 2001 From: Rob Norris Date: Thu, 19 Dec 2024 20:11:54 +1100 Subject: [PATCH 21/43] 
zpool_get_vdev_prop_value: show missing vdev userprops If a vdev userprop is not found, present it as value '-', default source, so it matches the output from pool userprops. Sponsored-by: Klara, Inc. Sponsored-by: Wasabi Technology, Inc. Reviewed-by: Brian Behlendorf Reviewed-by: Alexander Motin Signed-off-by: Rob Norris Closes #16887 --- lib/libzfs/libzfs_pool.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/lib/libzfs/libzfs_pool.c b/lib/libzfs/libzfs_pool.c index f256535e8ea04..64f9d1f6eb49b 100644 --- a/lib/libzfs/libzfs_pool.c +++ b/lib/libzfs/libzfs_pool.c @@ -5342,7 +5342,8 @@ zpool_get_vdev_prop_value(nvlist_t *nvprop, vdev_prop_t prop, char *prop_name, strval = fnvlist_lookup_string(nv, ZPROP_VALUE); } else { /* user prop not found */ - return (-1); + src = ZPROP_SRC_DEFAULT; + strval = "-"; } (void) strlcpy(buf, strval, len); if (srctype) From 03b7cfdef38aed0966fb69510b1ce1a4bc3f2bde Mon Sep 17 00:00:00 2001 From: Rob Norris Date: Thu, 19 Dec 2024 20:11:54 +1100 Subject: [PATCH 22/43] spa_sync_props: remove pool userprops by setting empty-string People have noted there's no way to remove a pool userprop, only zero it. Turns out vdev userprops had a method, by setting empty-string. So this makes pool userprops follow the same behaviour. Sponsored-by: Klara, Inc. Sponsored-by: Wasabi Technology, Inc. 
Reviewed-by: Brian Behlendorf Reviewed-by: Alexander Motin Signed-off-by: Rob Norris Closes #16887 --- module/zfs/spa.c | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/module/zfs/spa.c b/module/zfs/spa.c index b83c982c13fd6..c93c7945f192f 100644 --- a/module/zfs/spa.c +++ b/module/zfs/spa.c @@ -9683,9 +9683,17 @@ spa_sync_props(void *arg, dmu_tx_t *tx) if (nvpair_type(elem) == DATA_TYPE_STRING) { ASSERT(proptype == PROP_TYPE_STRING); strval = fnvpair_value_string(elem); - VERIFY0(zap_update(mos, - spa->spa_pool_props_object, propname, - 1, strlen(strval) + 1, strval, tx)); + if (strlen(strval) == 0) { + /* remove the property if value == "" */ + (void) zap_remove(mos, + spa->spa_pool_props_object, + propname, tx); + } else { + VERIFY0(zap_update(mos, + spa->spa_pool_props_object, + propname, 1, strlen(strval) + 1, + strval, tx)); + } spa_history_log_internal(spa, "set", tx, "%s=%s", elemname, strval); } else if (nvpair_type(elem) == DATA_TYPE_UINT64) { From c4e5fa5e175f19f44596cc43229f09ca58cd68ce Mon Sep 17 00:00:00 2001 From: Rob Norris Date: Thu, 19 Dec 2024 20:11:54 +1100 Subject: [PATCH 23/43] ZTS: test clearing pool and vdev userprops Confirming that clearing pool and vdev userprops produce the same result: an empty value, with default source. Sponsored-by: Klara, Inc. Sponsored-by: Wasabi Technology, Inc. 
Reviewed-by: Brian Behlendorf Reviewed-by: Alexander Motin Signed-off-by: Rob Norris Closes #16887 --- tests/runfiles/common.run | 3 +- tests/zfs-tests/tests/Makefile.am | 1 + .../zpool_set/zpool_set_clear_userprop.ksh | 44 +++++++++++++++++++ .../zpool_set/zpool_set_common.kshlib | 40 ++++++++++++++++- 4 files changed, 85 insertions(+), 3 deletions(-) create mode 100755 tests/zfs-tests/tests/functional/cli_root/zpool_set/zpool_set_clear_userprop.ksh diff --git a/tests/runfiles/common.run b/tests/runfiles/common.run index a69d36df2f984..6abb3b4213bb3 100644 --- a/tests/runfiles/common.run +++ b/tests/runfiles/common.run @@ -544,7 +544,8 @@ tags = ['functional', 'cli_root', 'zpool_scrub'] [tests/functional/cli_root/zpool_set] tests = ['zpool_set_001_pos', 'zpool_set_002_neg', 'zpool_set_003_neg', 'zpool_set_ashift', 'zpool_set_features', 'vdev_set_001_pos', - 'user_property_001_pos', 'user_property_002_neg'] + 'user_property_001_pos', 'user_property_002_neg', + 'zpool_set_clear_userprop'] tags = ['functional', 'cli_root', 'zpool_set'] [tests/functional/cli_root/zpool_split] diff --git a/tests/zfs-tests/tests/Makefile.am b/tests/zfs-tests/tests/Makefile.am index 67630cb564ae9..588249be45da7 100644 --- a/tests/zfs-tests/tests/Makefile.am +++ b/tests/zfs-tests/tests/Makefile.am @@ -1243,6 +1243,7 @@ nobase_dist_datadir_zfs_tests_tests_SCRIPTS += \ functional/cli_root/zpool_set/user_property_001_pos.ksh \ functional/cli_root/zpool_set/user_property_002_neg.ksh \ functional/cli_root/zpool_set/zpool_set_features.ksh \ + functional/cli_root/zpool_set/zpool_set_clear_userprop.ksh \ functional/cli_root/zpool_split/cleanup.ksh \ functional/cli_root/zpool_split/setup.ksh \ functional/cli_root/zpool_split/zpool_split_cliargs.ksh \ diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_set/zpool_set_clear_userprop.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_set/zpool_set_clear_userprop.ksh new file mode 100755 index 0000000000000..d9395ea8a15be --- /dev/null +++ 
b/tests/zfs-tests/tests/functional/cli_root/zpool_set/zpool_set_clear_userprop.ksh @@ -0,0 +1,44 @@ +#!/bin/ksh -p +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or https://opensource.org/licenses/CDDL-1.0. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright (c) 2024, Klara, Inc. +# + +. $STF_SUITE/tests/functional/cli_root/zpool_set/zpool_set_common.kshlib + +verify_runnable "both" + +log_assert "Setting a user-defined property to the empty string removes it." +log_onexit cleanup_user_prop $TESTPOOL + +log_must zpool set cool:pool=hello $TESTPOOL +log_must check_user_prop $TESTPOOL cool:pool hello local +log_must zpool set cool:pool= $TESTPOOL +log_must check_user_prop $TESTPOOL cool:pool '-' default + +log_must zpool set cool:vdev=goodbye $TESTPOOL root +log_must check_vdev_user_prop $TESTPOOL root cool:vdev goodbye local +log_must zpool set cool:vdev= $TESTPOOL root +log_must check_vdev_user_prop $TESTPOOL root cool:vdev '-' default + +log_pass "Setting a user-defined property to the empty string removes it." 
diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_set/zpool_set_common.kshlib b/tests/zfs-tests/tests/functional/cli_root/zpool_set/zpool_set_common.kshlib index 346e4a16b2ad9..e095d315c2b5b 100644 --- a/tests/zfs-tests/tests/functional/cli_root/zpool_set/zpool_set_common.kshlib +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_set/zpool_set_common.kshlib @@ -160,19 +160,55 @@ function user_property_value random_string ALL_CHAR $len } +function _check_user_prop +{ + typeset pool="$1" + typeset vdev="$2" + typeset user_prop="$3" + typeset expect_value="$4" + typeset expect_source="$5" + + typeset -a \ + v=($(zpool get -p -H -o value,source "$user_prop" $pool $vdev 2>&1)) + + [[ "$expect_value" == "${v[0]}" && \ + -z "$expect_source" || "$expect_source" == "${v[1]}" ]] +} + # # Check if the user-defined property is identical to the expected value. # # $1 pool # $2 user property # $3 expected value +# $4 expected source (optional) # function check_user_prop { typeset pool=$1 typeset user_prop="$2" typeset expect_value="$3" - typeset value=$(zpool get -p -H -o value "$user_prop" $pool 2>&1) + typeset expect_source="${4:-}" + + _check_user_prop $pool '' $user_prop $expect_value $expect_source +} + +# +# Check if the user-defined property is identical to the expected value. +# +# $1 pool +# $2 vdev +# $3 user property +# $4 expected value +# $5 expected source (optional) +# +function check_vdev_user_prop +{ + typeset pool="$1" + typeset vdev="$2" + typeset user_prop="$3" + typeset expect_value="$4" + typeset expect_source="${5:-}" - [ "$expect_value" = "$value" ] + _check_user_prop $pool $vdev $user_prop $expect_value $expect_source } From 8bf1e83eefa4f4ed74505f4a9ba4c3720c73185c Mon Sep 17 00:00:00 2001 From: Alexander Motin Date: Fri, 20 Dec 2024 10:54:35 -0500 Subject: [PATCH 24/43] ZTS: Remove non-standard awk hex numbers usage FreeBSD recently removed non-standard hex numbers support from awk. Neither it supports -n argument, enabling it in gawk. 
Instead of depending on those rewrite list_file_blocks() function to handle the hex math in shell instead of awk. Reviewed-by: Brian Behlendorf Reviewed-by: Tony Hutter Reviewed-by: Tino Reichardt Signed-off-by: Alexander Motin Sponsored by: iXsystems, Inc. Closes #11141 --- tests/zfs-tests/include/blkdev.shlib | 19 ++++--------------- 1 file changed, 4 insertions(+), 15 deletions(-) diff --git a/tests/zfs-tests/include/blkdev.shlib b/tests/zfs-tests/include/blkdev.shlib index 51eff3023e732..5b505f9252863 100644 --- a/tests/zfs-tests/include/blkdev.shlib +++ b/tests/zfs-tests/include/blkdev.shlib @@ -556,27 +556,15 @@ function list_file_blocks # input_file # 512B blocks for ease of use with dd. # typeset level vdev path offset length - if awk -n '' 2>/dev/null; then - # gawk needs -n to decode hex - AWK='awk -n' - else - AWK='awk' - fi sync_all_pools true - zdb -dddddd $ds $objnum | $AWK -v pad=$((4<<20)) -v bs=512 ' + zdb -dddddd $ds $objnum | awk ' /^$/ { looking = 0 } looking { level = $2 field = 3 while (split($field, dva, ":") == 3) { - # top level vdev id - vdev = int(dva[1]) - # offset + 4M label/boot pad in 512B blocks - offset = (int("0x"dva[2]) + pad) / bs - # length in 512B blocks - len = int("0x"dva[3]) / bs - print level, vdev, offset, len + print level, int(dva[1]), "0x"dva[2], "0x"dva[3] ++field } @@ -585,7 +573,8 @@ function list_file_blocks # input_file ' | \ while read level vdev offset length; do for path in ${VDEV_MAP[$vdev][@]}; do - echo "$level $path $offset $length" + echo "$level $path $(( ($offset + (4<<20)) / 512 ))" \ + "$(( $length / 512 ))" done done 2>/dev/null } From b66d910113921fbe6a9c67c2ba3e607820c1966b Mon Sep 17 00:00:00 2001 From: Alexander Motin Date: Wed, 25 Dec 2024 14:00:38 -0500 Subject: [PATCH 25/43] ZTS: Remove procfs use from zpool_import_status procfs might be not mounted on FreeBSD. Plus checking for specific PID might be not exactly reliable. Check for empty list of jobs instead. 
Premature loop exit can result in failed test and failed cleanup, failing also some following tests. Reviewed-by: Brian Behlendorf Reviewed-by: Tony Hutter Reviewed-by: Tino Reichardt Signed-off-by: Alexander Motin Sponsored by: iXsystems, Inc. Closes #11141 --- .../cli_root/zpool_import/zpool_import_status.ksh | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_import/zpool_import_status.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_import/zpool_import_status.ksh index c96961bf6419b..679362bbef505 100755 --- a/tests/zfs-tests/tests/functional/cli_root/zpool_import/zpool_import_status.ksh +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_import/zpool_import_status.ksh @@ -103,21 +103,16 @@ log_must zpool export $TESTPOOL1 log_must set_tunable64 METASLAB_DEBUG_LOAD 1 log_note "Starting zpool import in background at" $(date +'%H:%M:%S') zpool import -d $DEVICE_DIR -f $guid & -pid=$! # # capture progress until import is finished # -log_note waiting for pid $pid to exit kstat import_progress -while [[ -d /proc/"$pid" ]]; do +while [[ -n $(jobs) ]]; do line=$(kstat import_progress | grep -v pool_guid) if [[ -n $line ]]; then echo $line fi - if [[ -f /$TESTPOOL1/fs/00 ]]; then - break; - fi sleep 0.0001 done log_note "zpool import completed at" $(date +'%H:%M:%S') From a153397f414a1a22f2c54213e83afa092712faf3 Mon Sep 17 00:00:00 2001 From: Alexander Motin Date: Wed, 25 Dec 2024 19:42:44 -0500 Subject: [PATCH 26/43] ZTS: Reduce file size in redacted_panic to 1GB This test takes 3 minutes on RELEASE FreeBSD bots, but on CURRENT, probably due to debugging it has in kernel, it does not complete within 10 minutes, ending up killed. As I see all the redacting here happens within the first ~128MB of the file, so I hope it won't matter if there is 1GB of data instead of 2GB. 
Reviewed-by: Brian Behlendorf Reviewed-by: Tony Hutter Reviewed-by: Tino Reichardt Signed-off-by: Alexander Motin Sponsored by: iXsystems, Inc. Closes #11141 --- .../zfs-tests/tests/functional/redacted_send/redacted_panic.ksh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/zfs-tests/tests/functional/redacted_send/redacted_panic.ksh b/tests/zfs-tests/tests/functional/redacted_send/redacted_panic.ksh index 032d1fb91a2ee..a2438c2cd7316 100755 --- a/tests/zfs-tests/tests/functional/redacted_send/redacted_panic.ksh +++ b/tests/zfs-tests/tests/functional/redacted_send/redacted_panic.ksh @@ -39,7 +39,7 @@ function cleanup log_onexit cleanup log_must zfs create -o recsize=8k $sendfs -log_must dd if=/dev/urandom of=/$sendfs/file bs=1024k count=2048 +log_must dd if=/dev/urandom of=/$sendfs/file bs=1024k count=1024 log_must zfs snapshot $sendfs@init log_must zfs clone $sendfs@init $clone log_must stride_dd -i /dev/urandom -o /$clone/file -b 8192 -s 2 -c 7226 From 9dd5fe1095df246299df9a953a9c4994142636c2 Mon Sep 17 00:00:00 2001 From: Ameer Hamza Date: Mon, 30 Dec 2024 00:41:30 +0500 Subject: [PATCH 27/43] zvol: implement platform-independent part of block cloning In Linux, block devices currently lack support for `copy_file_range` API because the kernel does not provide the necessary functionality. However, there is an ongoing upstream effort to address this limitation: https://patchwork.kernel.org/project/dm-devel/cover/20240520102033.9361-1-nj.shetty@samsung.com/. We have adopted this upstream kernel patch into the TrueNAS kernel and made some additional modifications to enable block cloning specifically for the zvol block device. This patch implements the platform- independent portions of these changes for inclusion in OpenZFS. This patch does not introduce any new functionality directly into OpenZFS. The `TX_CLONE_RANGE` replay capability is only relevant when zvols are migrated to non-TrueNAS systems that support Clone Range replay in the ZIL. 
Reviewed-by: Brian Behlendorf Reviewed-by: Alexander Motin Signed-off-by: Ameer Hamza Closes #16901 --- include/sys/zvol_impl.h | 5 + module/zfs/zfs_vnops.c | 2 +- module/zfs/zvol.c | 284 +++++++++++++++++++++++++++++++++++++++- 3 files changed, 289 insertions(+), 2 deletions(-) diff --git a/include/sys/zvol_impl.h b/include/sys/zvol_impl.h index 3cd0d78c353d1..a8168850023a6 100644 --- a/include/sys/zvol_impl.h +++ b/include/sys/zvol_impl.h @@ -88,6 +88,11 @@ int zvol_get_data(void *arg, uint64_t arg2, lr_write_t *lr, char *buf, int zvol_init_impl(void); void zvol_fini_impl(void); void zvol_wait_close(zvol_state_t *zv); +int zvol_clone_range(zvol_state_handle_t *, uint64_t, + zvol_state_handle_t *, uint64_t, uint64_t); +void zvol_log_clone_range(zilog_t *zilog, dmu_tx_t *tx, int txtype, + uint64_t off, uint64_t len, uint64_t blksz, const blkptr_t *bps, + size_t nbps); /* * platform dependent functions exported to platform independent code diff --git a/module/zfs/zfs_vnops.c b/module/zfs/zfs_vnops.c index c01a9cf5d0b2e..b789d1ed52397 100644 --- a/module/zfs/zfs_vnops.c +++ b/module/zfs/zfs_vnops.c @@ -71,7 +71,7 @@ int zfs_bclone_enabled = 1; * a copy of the file and is therefore not the default. However, in certain * scenarios this behavior may be desirable so a tunable is provided. */ -static int zfs_bclone_wait_dirty = 0; +int zfs_bclone_wait_dirty = 0; /* * Enable Direct I/O. 
If this setting is 0, then all I/O requests will be diff --git a/module/zfs/zvol.c b/module/zfs/zvol.c index fec595b2c4c51..14a6219d19cd6 100644 --- a/module/zfs/zvol.c +++ b/module/zfs/zvol.c @@ -93,6 +93,7 @@ unsigned int zvol_volmode = ZFS_VOLMODE_GEOM; struct hlist_head *zvol_htable; static list_t zvol_state_list; krwlock_t zvol_state_lock; +extern int zfs_bclone_wait_dirty; typedef enum { ZVOL_ASYNC_REMOVE_MINORS, @@ -516,6 +517,285 @@ zvol_replay_write(void *arg1, void *arg2, boolean_t byteswap) return (error); } +/* + * Replay a TX_CLONE_RANGE ZIL transaction that didn't get committed + * after a system failure + */ +static int +zvol_replay_clone_range(void *arg1, void *arg2, boolean_t byteswap) +{ + zvol_state_t *zv = arg1; + lr_clone_range_t *lr = arg2; + objset_t *os = zv->zv_objset; + dmu_tx_t *tx; + int error; + uint64_t blksz; + uint64_t off; + uint64_t len; + + ASSERT3U(lr->lr_common.lrc_reclen, >=, sizeof (*lr)); + ASSERT3U(lr->lr_common.lrc_reclen, >=, offsetof(lr_clone_range_t, + lr_bps[lr->lr_nbps])); + + if (byteswap) + byteswap_uint64_array(lr, sizeof (*lr)); + + ASSERT(spa_feature_is_enabled(dmu_objset_spa(os), + SPA_FEATURE_BLOCK_CLONING)); + + off = lr->lr_offset; + len = lr->lr_length; + blksz = lr->lr_blksz; + + if ((off % blksz) != 0) { + return (SET_ERROR(EINVAL)); + } + + error = dnode_hold(os, ZVOL_OBJ, zv, &zv->zv_dn); + if (error != 0 || !zv->zv_dn) + return (error); + tx = dmu_tx_create(os); + dmu_tx_hold_clone_by_dnode(tx, zv->zv_dn, off, len); + error = dmu_tx_assign(tx, TXG_WAIT); + if (error != 0) { + dmu_tx_abort(tx); + goto out; + } + error = dmu_brt_clone(zv->zv_objset, ZVOL_OBJ, off, len, + tx, lr->lr_bps, lr->lr_nbps); + if (error != 0) { + dmu_tx_commit(tx); + goto out; + } + + /* + * zil_replaying() not only check if we are replaying ZIL, but also + * updates the ZIL header to record replay progress. 
+ */ + VERIFY(zil_replaying(zv->zv_zilog, tx)); + dmu_tx_commit(tx); + +out: + dnode_rele(zv->zv_dn, zv); + zv->zv_dn = NULL; + return (error); +} + +int +zvol_clone_range(zvol_state_t *zv_src, uint64_t inoff, zvol_state_t *zv_dst, + uint64_t outoff, uint64_t len) +{ + zilog_t *zilog_dst; + zfs_locked_range_t *inlr, *outlr; + objset_t *inos, *outos; + dmu_tx_t *tx; + blkptr_t *bps; + size_t maxblocks; + int error = EINVAL; + + rw_enter(&zv_dst->zv_suspend_lock, RW_READER); + if (zv_dst->zv_zilog == NULL) { + rw_exit(&zv_dst->zv_suspend_lock); + rw_enter(&zv_dst->zv_suspend_lock, RW_WRITER); + if (zv_dst->zv_zilog == NULL) { + zv_dst->zv_zilog = zil_open(zv_dst->zv_objset, + zvol_get_data, &zv_dst->zv_kstat.dk_zil_sums); + zv_dst->zv_flags |= ZVOL_WRITTEN_TO; + VERIFY0((zv_dst->zv_zilog->zl_header->zh_flags & + ZIL_REPLAY_NEEDED)); + } + rw_downgrade(&zv_dst->zv_suspend_lock); + } + if (zv_src != zv_dst) + rw_enter(&zv_src->zv_suspend_lock, RW_READER); + + inos = zv_src->zv_objset; + outos = zv_dst->zv_objset; + + /* + * Sanity checks + */ + if (!spa_feature_is_enabled(dmu_objset_spa(outos), + SPA_FEATURE_BLOCK_CLONING)) { + error = EOPNOTSUPP; + goto out; + } + if (dmu_objset_spa(inos) != dmu_objset_spa(outos)) { + error = EXDEV; + goto out; + } + if (inos->os_encrypted != outos->os_encrypted) { + error = EXDEV; + goto out; + } + if (zv_src->zv_volblocksize != zv_dst->zv_volblocksize) { + error = EINVAL; + goto out; + } + if (inoff >= zv_src->zv_volsize || outoff >= zv_dst->zv_volsize) { + error = 0; + goto out; + } + + /* + * Do not read beyond boundary + */ + if (len > zv_src->zv_volsize - inoff) + len = zv_src->zv_volsize - inoff; + if (len > zv_dst->zv_volsize - outoff) + len = zv_dst->zv_volsize - outoff; + if (len == 0) { + error = 0; + goto out; + } + + /* + * No overlapping if we are cloning within the same file + */ + if (zv_src == zv_dst) { + if (inoff < outoff + len && outoff < inoff + len) { + error = EINVAL; + goto out; + } + } + + /* + * Offsets and 
length must be at block boundaries + */ + if ((inoff % zv_src->zv_volblocksize) != 0 || + (outoff % zv_dst->zv_volblocksize) != 0) { + error = EINVAL; + goto out; + } + + /* + * Length must be multiple of block size + */ + if ((len % zv_src->zv_volblocksize) != 0) { + error = EINVAL; + goto out; + } + + zilog_dst = zv_dst->zv_zilog; + maxblocks = zil_max_log_data(zilog_dst, sizeof (lr_clone_range_t)) / + sizeof (bps[0]); + bps = vmem_alloc(sizeof (bps[0]) * maxblocks, KM_SLEEP); + /* + * Maintain predictable lock order. + */ + if (zv_src < zv_dst || (zv_src == zv_dst && inoff < outoff)) { + inlr = zfs_rangelock_enter(&zv_src->zv_rangelock, inoff, len, + RL_READER); + outlr = zfs_rangelock_enter(&zv_dst->zv_rangelock, outoff, len, + RL_WRITER); + } else { + outlr = zfs_rangelock_enter(&zv_dst->zv_rangelock, outoff, len, + RL_WRITER); + inlr = zfs_rangelock_enter(&zv_src->zv_rangelock, inoff, len, + RL_READER); + } + + while (len > 0) { + uint64_t size, last_synced_txg; + size_t nbps = maxblocks; + size = MIN(zv_src->zv_volblocksize * maxblocks, len); + last_synced_txg = spa_last_synced_txg( + dmu_objset_spa(zv_src->zv_objset)); + error = dmu_read_l0_bps(zv_src->zv_objset, ZVOL_OBJ, inoff, + size, bps, &nbps); + if (error != 0) { + /* + * If we are trying to clone a block that was created + * in the current transaction group, the error will be + * EAGAIN here. Based on zfs_bclone_wait_dirty either + * return a shortened range to the caller so it can + * fallback, or wait for the next TXG and check again. 
+ */ + if (error == EAGAIN && zfs_bclone_wait_dirty) { + txg_wait_synced(dmu_objset_pool + (zv_src->zv_objset), last_synced_txg + 1); + continue; + } + break; + } + + tx = dmu_tx_create(zv_dst->zv_objset); + dmu_tx_hold_clone_by_dnode(tx, zv_dst->zv_dn, outoff, size); + error = dmu_tx_assign(tx, TXG_WAIT); + if (error != 0) { + dmu_tx_abort(tx); + break; + } + error = dmu_brt_clone(zv_dst->zv_objset, ZVOL_OBJ, outoff, size, + tx, bps, nbps); + if (error != 0) { + dmu_tx_commit(tx); + break; + } + zvol_log_clone_range(zilog_dst, tx, TX_CLONE_RANGE, outoff, + size, zv_src->zv_volblocksize, bps, nbps); + dmu_tx_commit(tx); + inoff += size; + outoff += size; + len -= size; + } + vmem_free(bps, sizeof (bps[0]) * maxblocks); + zfs_rangelock_exit(outlr); + zfs_rangelock_exit(inlr); + if (error == 0 && zv_dst->zv_objset->os_sync == ZFS_SYNC_ALWAYS) { + zil_commit(zilog_dst, ZVOL_OBJ); + } +out: + if (zv_src != zv_dst) + rw_exit(&zv_src->zv_suspend_lock); + rw_exit(&zv_dst->zv_suspend_lock); + return (SET_ERROR(error)); +} + +/* + * Handles TX_CLONE_RANGE transactions. 
+ */ +void +zvol_log_clone_range(zilog_t *zilog, dmu_tx_t *tx, int txtype, uint64_t off, + uint64_t len, uint64_t blksz, const blkptr_t *bps, size_t nbps) +{ + itx_t *itx; + lr_clone_range_t *lr; + uint64_t partlen, max_log_data; + size_t partnbps; + + if (zil_replaying(zilog, tx)) + return; + + max_log_data = zil_max_log_data(zilog, sizeof (lr_clone_range_t)); + + while (nbps > 0) { + partnbps = MIN(nbps, max_log_data / sizeof (bps[0])); + partlen = partnbps * blksz; + ASSERT3U(partlen, <, len + blksz); + partlen = MIN(partlen, len); + + itx = zil_itx_create(txtype, + sizeof (*lr) + sizeof (bps[0]) * partnbps); + lr = (lr_clone_range_t *)&itx->itx_lr; + lr->lr_foid = ZVOL_OBJ; + lr->lr_offset = off; + lr->lr_length = partlen; + lr->lr_blksz = blksz; + lr->lr_nbps = partnbps; + memcpy(lr->lr_bps, bps, sizeof (bps[0]) * partnbps); + + zil_itx_assign(zilog, itx, tx); + + bps += partnbps; + ASSERT3U(nbps, >=, partnbps); + nbps -= partnbps; + off += partlen; + ASSERT3U(len, >=, partlen); + len -= partlen; + } +} + static int zvol_replay_err(void *arg1, void *arg2, boolean_t byteswap) { @@ -540,7 +820,9 @@ zil_replay_func_t *const zvol_replay_vector[TX_MAX_TYPE] = { zvol_replay_write, /* TX_WRITE */ zvol_replay_truncate, /* TX_TRUNCATE */ zvol_replay_err, /* TX_SETATTR */ + zvol_replay_err, /* TX_ACL_V0 */ zvol_replay_err, /* TX_ACL */ + zvol_replay_err, /* TX_CREATE_ACL */ zvol_replay_err, /* TX_CREATE_ATTR */ zvol_replay_err, /* TX_CREATE_ACL_ATTR */ zvol_replay_err, /* TX_MKDIR_ACL */ @@ -550,7 +832,7 @@ zil_replay_func_t *const zvol_replay_vector[TX_MAX_TYPE] = { zvol_replay_err, /* TX_SETSAXATTR */ zvol_replay_err, /* TX_RENAME_EXCHANGE */ zvol_replay_err, /* TX_RENAME_WHITEOUT */ - zvol_replay_err, /* TX_CLONE_RANGE */ + zvol_replay_clone_range, /* TX_CLONE_RANGE */ }; /* From 54126fdb5bfe4db7c95955549581a67a6a6581a9 Mon Sep 17 00:00:00 2001 From: shodanshok Date: Sun, 29 Dec 2024 20:50:19 +0100 Subject: [PATCH 28/43] set zfs_arc_shrinker_limit to 0 by default 
zfs_arc_shrinker_limit was introduced to avoid ARC collapse due to aggressive kernel reclaim. While useful, the current default (10000) is too prone to OOM especially when MGLRU-enabled kernels with default min_ttl_ms are used. Even when no OOM happens, it often causes too much swap usage. This patch sets zfs_arc_shrinker_limit=0 to not ignore kernel reclaim requests. ARC now plays better with both kernel shrinker and pagecache but, should ARC collapse happen again, MGLRU behavior can be tuned or even disabled. Anyway, zfs should not cause OOM when ARC can be released. Reviewed-by: Brian Behlendorf Reviewed-by: Alexander Motin Signed-off-by: Gionatan Danti Closes #16909 --- man/man4/zfs.4 | 4 ++-- module/os/linux/zfs/arc_os.c | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/man/man4/zfs.4 b/man/man4/zfs.4 index da027798f962b..7078a5ba83732 100644 --- a/man/man4/zfs.4 +++ b/man/man4/zfs.4 @@ -867,14 +867,14 @@ where that percent may exceed This only operates during memory pressure/reclaim. . -.It Sy zfs_arc_shrinker_limit Ns = Ns Sy 10000 Pq int +.It Sy zfs_arc_shrinker_limit Ns = Ns Sy 0 Pq int This is a limit on how many pages the ARC shrinker makes available for eviction in response to one page allocation attempt. Note that in practice, the kernel's shrinker can ask us to evict up to about four times this for one allocation attempt. To reduce OOM risk, this limit is applied for kswapd reclaims only. .Pp -The default limit of +For example a value of .Sy 10000 Pq in practice, Em 160 MiB No per allocation attempt with 4 KiB pages limits the amount of time spent attempting to reclaim ARC memory to less than 100 ms per allocation attempt, diff --git a/module/os/linux/zfs/arc_os.c b/module/os/linux/zfs/arc_os.c index b1e45b28743e6..3238977af6d1c 100644 --- a/module/os/linux/zfs/arc_os.c +++ b/module/os/linux/zfs/arc_os.c @@ -63,7 +63,7 @@ * practice, the kernel's shrinker can ask us to evict up to about 4x this * for one allocation attempt. 
* - * The default limit of 10,000 (in practice, 160MB per allocation attempt + * For example a value of 10,000 (in practice, 160MB per allocation attempt * with 4K pages) limits the amount of time spent attempting to reclaim ARC * memory to less than 100ms per allocation attempt, even with a small * average compressed block size of ~8KB. @@ -71,7 +71,7 @@ * See also the comment in arc_shrinker_count(). * Set to 0 to disable limit. */ -static int zfs_arc_shrinker_limit = 10000; +static int zfs_arc_shrinker_limit = 0; /* * Relative cost of ARC eviction, AKA number of seeks needed to restore evicted From 25238baad5c56ed2375fc6067ff691fb723614d1 Mon Sep 17 00:00:00 2001 From: Andrew Walker Date: Mon, 30 Dec 2024 19:06:48 -0600 Subject: [PATCH 29/43] Add missing zfs_exit() when snapdir is disabled (#16912) zfs_vget doesn't zfs_exit when erroring out due to snapdir being disabled. Signed-off-by: Andrew Walker Reviewed-by: @bmeagherix Reviewed-by: Alexander Motin Reviewed-by: Ameer Hamza Reviewed-by: Tony Hutter --- module/os/linux/zfs/zfs_vfsops.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/module/os/linux/zfs/zfs_vfsops.c b/module/os/linux/zfs/zfs_vfsops.c index 3c53a8a315c34..b226fca147a5b 100644 --- a/module/os/linux/zfs/zfs_vfsops.c +++ b/module/os/linux/zfs/zfs_vfsops.c @@ -1702,13 +1702,14 @@ zfs_vget(struct super_block *sb, struct inode **ipp, fid_t *fidp) /* A zero fid_gen means we are in the .zfs control directories */ if (fid_gen == 0 && (object == ZFSCTL_INO_ROOT || object == ZFSCTL_INO_SNAPDIR)) { - *ipp = zfsvfs->z_ctldir; - ASSERT(*ipp != NULL); - if (zfsvfs->z_show_ctldir == ZFS_SNAPDIR_DISABLED) { + zfs_exit(zfsvfs, FTAG); return (SET_ERROR(ENOENT)); } + *ipp = zfsvfs->z_ctldir; + ASSERT(*ipp != NULL); + if (object == ZFSCTL_INO_SNAPDIR) { VERIFY(zfsctl_root_lookup(*ipp, "snapshot", ipp, 0, kcred, NULL, NULL) == 0); From 3c2267a873989698576ec8feaba9494f1502b919 Mon Sep 17 00:00:00 2001 From: James Reilly Date: Thu, 2 Jan 2025 
22:58:56 +0530 Subject: [PATCH 30/43] ZTS: add centos stream10 (#16904) Added centos as optional runners via workflow_dispatch removed centos-stream9 from the FULL_OS runner list as CentOS is not officially support by ZFS. This commit will add preliminary support for EL10 and allow testing ZFS ahead of EL10 codebase solidifying in ~6 months Signed-off-by: James Reilly Reviewed-by: Tony Hutter Reviewed-by: Brian Behlendorf Reviewed-by: Tino Reichardt --- .github/workflows/scripts/qemu-2-start.sh | 6 +++++ .github/workflows/scripts/qemu-3-deps.sh | 2 +- .github/workflows/zfs-qemu.yml | 27 ++++++++++++++++++++++- 3 files changed, 33 insertions(+), 2 deletions(-) diff --git a/.github/workflows/scripts/qemu-2-start.sh b/.github/workflows/scripts/qemu-2-start.sh index 0906e438ac0d8..73496d4f3de66 100755 --- a/.github/workflows/scripts/qemu-2-start.sh +++ b/.github/workflows/scripts/qemu-2-start.sh @@ -40,6 +40,12 @@ case "$OS" in # dns sometimes fails with that url :/ echo "89.187.191.12 geo.mirror.pkgbuild.com" | sudo tee /etc/hosts > /dev/null ;; + centos-stream10) + OSNAME="CentOS Stream 10" + # TODO: #16903 Overwrite OSv to stream9 for virt-install until it's added to osinfo + OSv="centos-stream9" + URL="https://cloud.centos.org/centos/10-stream/x86_64/images/CentOS-Stream-GenericCloud-10-latest.x86_64.qcow2" + ;; centos-stream9) OSNAME="CentOS Stream 9" URL="https://cloud.centos.org/centos/9-stream/x86_64/images/CentOS-Stream-GenericCloud-9-latest.x86_64.qcow2" diff --git a/.github/workflows/scripts/qemu-3-deps.sh b/.github/workflows/scripts/qemu-3-deps.sh index 96979cd02e091..9b8957734277e 100755 --- a/.github/workflows/scripts/qemu-3-deps.sh +++ b/.github/workflows/scripts/qemu-3-deps.sh @@ -104,7 +104,7 @@ case "$1" in sudo dnf install -y kernel-abi-whitelists echo "##[endgroup]" ;; - almalinux9|centos-stream9) + almalinux9|centos-stream9|centos-stream10) echo "##[group]Enable epel and crb repositories" sudo dnf config-manager -y --set-enabled crb sudo dnf 
install -y epel-release diff --git a/.github/workflows/zfs-qemu.yml b/.github/workflows/zfs-qemu.yml index 4748e90db50bf..af26e135b91fc 100644 --- a/.github/workflows/zfs-qemu.yml +++ b/.github/workflows/zfs-qemu.yml @@ -3,6 +3,18 @@ name: zfs-qemu on: push: pull_request: + workflow_dispatch: + inputs: + include_stream9: + type: boolean + required: false + default: false + description: 'Test on CentOS 9 stream' + include_stream10: + type: boolean + required: false + default: false + description: 'Test on CentOS 10 stream' concurrency: group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }} @@ -22,7 +34,7 @@ jobs: - name: Generate OS config and CI type id: os run: | - FULL_OS='["almalinux8", "almalinux9", "centos-stream9", "debian11", "debian12", "fedora40", "fedora41", "freebsd13-3r", "freebsd13-4s", "freebsd14-1r", "freebsd14-2s", "freebsd15-0c", "ubuntu20", "ubuntu22", "ubuntu24"]' + FULL_OS='["almalinux8", "almalinux9", "debian11", "debian12", "fedora40", "fedora41", "freebsd13-3r", "freebsd13-4s", "freebsd14-1r", "freebsd14-2s", "freebsd15-0c", "ubuntu20", "ubuntu22", "ubuntu24"]' QUICK_OS='["almalinux8", "almalinux9", "debian12", "fedora41", "freebsd13-3r", "freebsd14-2r", "ubuntu24"]' # determine CI type when running on PR ci_type="full" @@ -37,9 +49,22 @@ jobs: os_selection="$FULL_OS" fi os_json=$(echo ${os_selection} | jq -c) + + # Add optional runners + if [ "${{ github.event.inputs.include_stream9 }}" == 'true' ]; then + os_json=$(echo $os_json | jq -c '. += ["centos-stream9"]') + fi + if [ "${{ github.event.inputs.include_stream10 }}" == 'true' ]; then + os_json=$(echo $os_json | jq -c '. 
+= ["centos-stream10"]') + fi + + echo $os_json echo "os=$os_json" >> $GITHUB_OUTPUT echo "ci_type=$ci_type" >> $GITHUB_OUTPUT + + + qemu-vm: name: qemu-x86 needs: [ test-config ] From 8dc15ef4b3fddfcf7d4d8d5e99bbf037dfb6037f Mon Sep 17 00:00:00 2001 From: Toomas Soome Date: Thu, 2 Jan 2025 23:29:12 +0200 Subject: [PATCH 31/43] ZTS: zfs_mount_all_fail leaves /var/tmp/testrootPIDNUM directory around Before we can remove test files, we need to unmount datasets used by test first. See also: zfs_mount_all_mountpoints.ksh Reviewed-by: Brian Behlendorf Reviewed-by: Alexander Motin Signed-off-by: Toomas Soome Closes #16914 --- .../cli_root/zfs_mount/zfs_mount_all_fail.ksh | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/tests/zfs-tests/tests/functional/cli_root/zfs_mount/zfs_mount_all_fail.ksh b/tests/zfs-tests/tests/functional/cli_root/zfs_mount/zfs_mount_all_fail.ksh index d1103bddccbd7..7b6c2ccdf6603 100755 --- a/tests/zfs-tests/tests/functional/cli_root/zfs_mount/zfs_mount_all_fail.ksh +++ b/tests/zfs-tests/tests/functional/cli_root/zfs_mount/zfs_mount_all_fail.ksh @@ -16,6 +16,7 @@ # # Copyright (c) 2017 by Delphix. All rights reserved. +# Copyright 2024 MNX Cloud, Inc. # . $STF_SUITE/include/libtest.shlib @@ -44,8 +45,9 @@ typeset fscount=10 function setup_all { # Create $fscount filesystems at the top level of $path - for ((i=0; i<$fscount; i++)); do + for ((i=0; i Date: Fri, 3 Jan 2025 01:53:53 +0200 Subject: [PATCH 32/43] ZTS: functional/mount scripts are not removing /var/tmp/testdir.X dirs cleanup.ksh is assuming we have TESTDIRS set. 
Reviewed-by: Brian Behlendorf Reviewed-by: Alexander Motin Signed-off-by: Toomas Soome Closes #16915 --- tests/zfs-tests/tests/functional/mount/cleanup.ksh | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/tests/zfs-tests/tests/functional/mount/cleanup.ksh b/tests/zfs-tests/tests/functional/mount/cleanup.ksh index bd6b0e435ed1c..0e88e2a1fc79a 100755 --- a/tests/zfs-tests/tests/functional/mount/cleanup.ksh +++ b/tests/zfs-tests/tests/functional/mount/cleanup.ksh @@ -27,12 +27,14 @@ # # Copyright (c) 2013, 2016 by Delphix. All rights reserved. +# Copyright 2025 MNX Cloud, Inc. # . $STF_SUITE/include/libtest.shlib log_must destroy_pool $TESTPOOL -for dir in $TESTDIRS; do +for i in 1 2 3; do + dir=$TESTDIR.$i rm -rf $dir done From d35f9f2e841ef8880df07e5e1dbf7612faa430db Mon Sep 17 00:00:00 2001 From: Toomas Soome Date: Fri, 3 Jan 2025 01:57:24 +0200 Subject: [PATCH 33/43] ZTS: checkpoint_discard_busy does not set 16M on cleanup Originally hex value is used as decimal. Reviewed-by: Brian Behlendorf Reviewed-by: Alexander Motin Signed-off-by: Toomas Soome Closes #16917 --- .../functional/pool_checkpoint/checkpoint_discard_busy.ksh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/zfs-tests/tests/functional/pool_checkpoint/checkpoint_discard_busy.ksh b/tests/zfs-tests/tests/functional/pool_checkpoint/checkpoint_discard_busy.ksh index 087aef9027eab..07b658641f654 100755 --- a/tests/zfs-tests/tests/functional/pool_checkpoint/checkpoint_discard_busy.ksh +++ b/tests/zfs-tests/tests/functional/pool_checkpoint/checkpoint_discard_busy.ksh @@ -43,7 +43,7 @@ log_unsupported "Skipping, issue https://github.com/openzfs/zfs/issues/12053" function test_cleanup { # reset memory limit to 16M - set_tunable64 SPA_DISCARD_MEMORY_LIMIT 1000000 + set_tunable64 SPA_DISCARD_MEMORY_LIMIT 16777216 cleanup_nested_pools } From 478b09577a4799893072ce8c3614b6aac30dc7c4 Mon Sep 17 00:00:00 2001 From: pstef Date: Fri, 3 Jan 2025 18:03:14 +0100 Subject: [PATCH 
34/43] zfs_vnops_os.c: fallocate is valid but not supported on FreeBSD This works around /usr/lib/go-1.18/pkg/tool/linux_amd64/link: mapping output file failed: invalid argument It's happened to me under a Linux jail, but it's also happened to other people, see https://bugs.freebsd.org/bugzilla/show_bug.cgi?id=270247#c4 Reviewed-by: Brian Behlendorf Reviewed-by: Alexander Motin Signed-off-by: pstef Closes #16918 --- module/os/freebsd/zfs/zfs_vnops_os.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/module/os/freebsd/zfs/zfs_vnops_os.c b/module/os/freebsd/zfs/zfs_vnops_os.c index b8c2c341dacee..5edd3fcc76e7f 100644 --- a/module/os/freebsd/zfs/zfs_vnops_os.c +++ b/module/os/freebsd/zfs/zfs_vnops_os.c @@ -6258,7 +6258,7 @@ struct vop_vector zfs_vnodeops = { .vop_fplookup_vexec = zfs_freebsd_fplookup_vexec, .vop_fplookup_symlink = zfs_freebsd_fplookup_symlink, .vop_access = zfs_freebsd_access, - .vop_allocate = VOP_EINVAL, + .vop_allocate = VOP_EOPNOTSUPP, #if __FreeBSD_version >= 1400032 .vop_deallocate = zfs_deallocate, #endif From e94549d868c52a32d843132ea7171486069a6c9f Mon Sep 17 00:00:00 2001 From: Toomas Soome Date: Sat, 4 Jan 2025 00:41:03 +0200 Subject: [PATCH 35/43] ZTS: remove unused TESTDIRS from pam/cleanup.ksh Remove TESTDIRS as it is not set for pam tests. 
Reviewed-by: Brian Behlendorf Reviewed-by: Alexander Motin Signed-off-by: Toomas Soome Closes #16920 --- tests/zfs-tests/tests/functional/pam/cleanup.ksh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/zfs-tests/tests/functional/pam/cleanup.ksh b/tests/zfs-tests/tests/functional/pam/cleanup.ksh index dbcb175ed069e..bfb98cd30707e 100755 --- a/tests/zfs-tests/tests/functional/pam/cleanup.ksh +++ b/tests/zfs-tests/tests/functional/pam/cleanup.ksh @@ -27,4 +27,4 @@ destroy_pool $TESTPOOL del_user ${username} del_user ${username}rec del_group pamtestgroup -log_must rm -rf "$runstatedir" $TESTDIRS +log_must rm -rf "$runstatedir" From c02e1cf055bb599d78a95b33da4ee9b4aba8918e Mon Sep 17 00:00:00 2001 From: Rob Norris Date: Sat, 4 Jan 2025 09:42:06 +1100 Subject: [PATCH 36/43] vdev_open: clear async remove flag after reopen It's possible for a vdev to be flagged for async remove after the pool has suspended. If the removed device has been returned when the pool is resumed, the ASYNC_REMOVE task will still run at the end of txg, and remove the device from the pool again. To fix, we clear the async remove flag at reopen, just as we did for the async fault flag in 5de3ac223. Sponsored-by: Klara, Inc. Sponsored-by: Wasabi Technology, Inc. 
Reviewed-by: Brian Behlendorf Reviewed-by: Alexander Motin Signed-off-by: Rob Norris Closes #16921 --- module/zfs/vdev.c | 1 + 1 file changed, 1 insertion(+) diff --git a/module/zfs/vdev.c b/module/zfs/vdev.c index 85b6ee32158d9..5df2f77e57806 100644 --- a/module/zfs/vdev.c +++ b/module/zfs/vdev.c @@ -2041,6 +2041,7 @@ vdev_open(vdev_t *vd) vd->vdev_cant_read = B_FALSE; vd->vdev_cant_write = B_FALSE; vd->vdev_fault_wanted = B_FALSE; + vd->vdev_remove_wanted = B_FALSE; vd->vdev_min_asize = vdev_get_min_asize(vd); /* From ee3bde9dadf3d4255ed710ac40dfed4fb8c26149 Mon Sep 17 00:00:00 2001 From: Toomas Soome Date: Sat, 4 Jan 2025 00:48:30 +0200 Subject: [PATCH 37/43] ZTS: checkpoint_discard_busy should use save_tunable/restore_tunable Instead of using hardwired value for SPA_DISCARD_MEMORY_LIMIT, use save_tunable and restore_tunable to restore the pre-test state. Reviewed-by: Brian Behlendorf Reviewed-by: Alexander Motin Signed-off-by: Toomas Soome Closes #16919 --- .../pool_checkpoint/checkpoint_discard_busy.ksh | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/tests/zfs-tests/tests/functional/pool_checkpoint/checkpoint_discard_busy.ksh b/tests/zfs-tests/tests/functional/pool_checkpoint/checkpoint_discard_busy.ksh index 07b658641f654..2bf5ab199e6ee 100755 --- a/tests/zfs-tests/tests/functional/pool_checkpoint/checkpoint_discard_busy.ksh +++ b/tests/zfs-tests/tests/functional/pool_checkpoint/checkpoint_discard_busy.ksh @@ -42,8 +42,8 @@ log_unsupported "Skipping, issue https://github.com/openzfs/zfs/issues/12053" function test_cleanup { - # reset memory limit to 16M - set_tunable64 SPA_DISCARD_MEMORY_LIMIT 16777216 + # reset to original value + log_must restore_tunable SPA_DISCARD_MEMORY_LIMIT cleanup_nested_pools } @@ -69,6 +69,7 @@ log_onexit test_cleanup # map, we should have even more time to # verify this. 
# +log_must save_tunable SPA_DISCARD_MEMORY_LIMIT set_tunable64 SPA_DISCARD_MEMORY_LIMIT 128 log_must zpool checkpoint $NESTEDPOOL @@ -101,8 +102,8 @@ log_mustnot zpool checkpoint -d $NESTEDPOOL log_mustnot zpool remove $NESTEDPOOL $FILEDISK1 log_mustnot zpool reguid $NESTEDPOOL -# reset memory limit to 16M -set_tunable64 SPA_DISCARD_MEMORY_LIMIT 16777216 +# reset to original value +log_must restore_tunable SPA_DISCARD_MEMORY_LIMIT nested_wait_discard_finish From 50cbb14641b32215c87825ab158f726541707888 Mon Sep 17 00:00:00 2001 From: Robert Evans Date: Fri, 3 Jan 2025 22:04:01 -0500 Subject: [PATCH 38/43] Add Makefile dependencies for scripts/zfs-tests.sh -c This updates the Makefile to be more correct for parallel make. Reviewed-by: Brian Behlendorf Reviewed-by: George Melikov Signed-off-by: Robert Evans Closes #16030 Closes #16922 --- scripts/Makefile.am | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/Makefile.am b/scripts/Makefile.am index 7d9cef83d2c60..ee8fb8717cece 100644 --- a/scripts/Makefile.am +++ b/scripts/Makefile.am @@ -79,7 +79,7 @@ CLEANFILES += %D%/common.sh -$(AM_V_at)echo "$$SCRIPTS_EXTRA_ENVIRONMENT" >>$@ ALL_LOCAL += scripts-all-local -scripts-all-local: %D%/common.sh +scripts-all-local: %D%/common.sh $(PROGRAMS) $(SCRIPTS) $(DATA) -SCRIPT_COMMON=$< $(srcdir)/%D%/zfs-tests.sh -c CLEAN_LOCAL += scripts-clean-local From 939e0237c5499abaefdddb21c280801766ad59ea Mon Sep 17 00:00:00 2001 From: Don Brady Date: Sat, 4 Jan 2025 11:28:33 -0700 Subject: [PATCH 39/43] Too many vdev probe errors should suspend pool Similar to what we saw in #16569, we need to consider that a replacing vdev should not be considered as fully contributing to the redundancy of a raidz vdev even though current IO has enough redundancy. When a failed vdev_probe() is faulting a disk, it now checks if that disk is required, and if so it suspends the pool until the admin can return the missing disks. Sponsored-by: Klara, Inc. 
Sponsored-by: Wasabi Technology, Inc. Reviewed-by: Brian Behlendorf Reviewed-by: Alexander Motin Reviewed-by: Allan Jude Reviewed-by: Tony Hutter Signed-off-by: Don Brady Closes #16864 --- module/zfs/spa.c | 25 ++- tests/runfiles/linux.run | 4 +- tests/zfs-tests/tests/Makefile.am | 1 + .../fault/suspend_on_probe_errors.ksh | 154 ++++++++++++++++++ 4 files changed, 176 insertions(+), 8 deletions(-) create mode 100755 tests/zfs-tests/tests/functional/fault/suspend_on_probe_errors.ksh diff --git a/module/zfs/spa.c b/module/zfs/spa.c index c93c7945f192f..956bae46ef1b5 100644 --- a/module/zfs/spa.c +++ b/module/zfs/spa.c @@ -8948,16 +8948,26 @@ spa_async_remove(spa_t *spa, vdev_t *vd) } static void -spa_async_fault_vdev(spa_t *spa, vdev_t *vd) +spa_async_fault_vdev(vdev_t *vd, boolean_t *suspend) { if (vd->vdev_fault_wanted) { + vdev_state_t newstate = VDEV_STATE_FAULTED; vd->vdev_fault_wanted = B_FALSE; - vdev_set_state(vd, B_TRUE, VDEV_STATE_FAULTED, - VDEV_AUX_ERR_EXCEEDED); - } + /* + * If this device has the only valid copy of the data, then + * back off and simply mark the vdev as degraded instead. 
+ */ + if (!vd->vdev_top->vdev_islog && vd->vdev_aux == NULL && + vdev_dtl_required(vd)) { + newstate = VDEV_STATE_DEGRADED; + /* A required disk is missing so suspend the pool */ + *suspend = B_TRUE; + } + vdev_set_state(vd, B_TRUE, newstate, VDEV_AUX_ERR_EXCEEDED); + } for (int c = 0; c < vd->vdev_children; c++) - spa_async_fault_vdev(spa, vd->vdev_child[c]); + spa_async_fault_vdev(vd->vdev_child[c], suspend); } static void @@ -9049,8 +9059,11 @@ spa_async_thread(void *arg) */ if (tasks & SPA_ASYNC_FAULT_VDEV) { spa_vdev_state_enter(spa, SCL_NONE); - spa_async_fault_vdev(spa, spa->spa_root_vdev); + boolean_t suspend = B_FALSE; + spa_async_fault_vdev(spa->spa_root_vdev, &suspend); (void) spa_vdev_state_exit(spa, NULL, 0); + if (suspend) + zio_suspend(spa, NULL, ZIO_SUSPEND_IOERR); } /* diff --git a/tests/runfiles/linux.run b/tests/runfiles/linux.run index 76d07a6cc9c1e..e55ec583d2cc3 100644 --- a/tests/runfiles/linux.run +++ b/tests/runfiles/linux.run @@ -125,8 +125,8 @@ tests = ['auto_offline_001_pos', 'auto_online_001_pos', 'auto_online_002_pos', 'auto_replace_001_pos', 'auto_replace_002_pos', 'auto_spare_001_pos', 'auto_spare_002_pos', 'auto_spare_multiple', 'auto_spare_ashift', 'auto_spare_shared', 'decrypt_fault', 'decompress_fault', - 'fault_limits', 'scrub_after_resilver', 'suspend_resume_single', - 'zpool_status_-s'] + 'fault_limits', 'scrub_after_resilver', 'suspend_on_probe_errors', + 'suspend_resume_single', 'zpool_status_-s'] tags = ['functional', 'fault'] [tests/functional/features/large_dnode:Linux] diff --git a/tests/zfs-tests/tests/Makefile.am b/tests/zfs-tests/tests/Makefile.am index 588249be45da7..df183825dc680 100644 --- a/tests/zfs-tests/tests/Makefile.am +++ b/tests/zfs-tests/tests/Makefile.am @@ -1532,6 +1532,7 @@ nobase_dist_datadir_zfs_tests_tests_SCRIPTS += \ functional/fault/decrypt_fault.ksh \ functional/fault/fault_limits.ksh \ functional/fault/scrub_after_resilver.ksh \ + functional/fault/suspend_on_probe_errors.ksh \ 
functional/fault/suspend_resume_single.ksh \ functional/fault/setup.ksh \ functional/fault/zpool_status_-s.ksh \ diff --git a/tests/zfs-tests/tests/functional/fault/suspend_on_probe_errors.ksh b/tests/zfs-tests/tests/functional/fault/suspend_on_probe_errors.ksh new file mode 100755 index 0000000000000..d9261bb5d274f --- /dev/null +++ b/tests/zfs-tests/tests/functional/fault/suspend_on_probe_errors.ksh @@ -0,0 +1,154 @@ +#!/bin/ksh -p +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or https://opensource.org/licenses/CDDL-1.0. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright (c) 2024, Klara Inc. +# + +. $STF_SUITE/include/libtest.shlib +. $STF_SUITE/include/blkdev.shlib + +# +# DESCRIPTION: Verify that 4 disks removed from a raidz3 will suspend the pool +# +# STRATEGY: +# 1. Disable ZED -- this test is focused on vdev_probe errors +# 2. Create a raidz3 pool where 4 disks can be removed (i.e., using scsi_debug) +# 3. Add some data to it for a resilver workload +# 4. Replace one of the child vdevs to start a replacing vdev +# 5. During the resilver, remove 4 disks including one from the replacing vdev +# 6. 
Verify that the pool is suspended (it used to remain online) +# + +DEV_SIZE_MB=1024 + +FILE_VDEV_CNT=8 +FILE_VDEV_SIZ=256M + +function cleanup +{ + destroy_pool $TESTPOOL + if [[ "$(cat /sys/block/$sd/device/state)" == "offline" ]]; then + log_must eval "echo running > /sys/block/$sd/device/state" + fi + unload_scsi_debug + rm -f $DATA_FILE + for i in {0..$((FILE_VDEV_CNT - 1))}; do + log_must rm -f "$TEST_BASE_DIR/dev-$i" + done + log_must set_tunable32 SCAN_SUSPEND_PROGRESS 0 + zed_start +} + +log_onexit cleanup + +log_assert "VDEV probe errors for more disks than parity should suspend a pool" + +log_note "Stoping ZED process" +zed_stop +zpool events -c + +# Make a debug device that we can "unplug" and lose 4 drives at once +unload_scsi_debug +load_scsi_debug $DEV_SIZE_MB 1 1 1 '512b' +sd=$(get_debug_device) + +# Create 4 partitions that match the FILE_VDEV_SIZ +parted "/dev/${sd}" --script mklabel gpt +parted "/dev/${sd}" --script mkpart primary 0% 25% +parted "/dev/${sd}" --script mkpart primary 25% 50% +parted "/dev/${sd}" --script mkpart primary 50% 75% +parted "/dev/${sd}" --script mkpart primary 75% 100% +block_device_wait "/dev/${sd}" +blkdevs="/dev/${sd}1 /dev/${sd}2 /dev/${sd}3 /dev/${sd}4" + +# Create 8 file vdevs +typeset -a filedevs +for i in {0..$((FILE_VDEV_CNT - 1))}; do + device=$TEST_BASE_DIR/dev-$i + log_must truncate -s $FILE_VDEV_SIZ $device + # Use all but the last one for pool create + if [[ $i -lt "7" ]]; then + filedevs[${#filedevs[*]}+1]=$device + fi +done + +# Create a raidz-3 pool that we can pull 4 disks from +log_must zpool create -f $TESTPOOL raidz3 ${filedevs[@]} $blkdevs +sync_pool $TESTPOOL + +# Add some data to the pool +log_must zfs create $TESTPOOL/fs +MNTPOINT="$(get_prop mountpoint $TESTPOOL/fs)" +SECONDS=0 +log_must fill_fs $MNTPOINT 1 200 4096 10 Z +log_note "fill_fs took $SECONDS seconds" +sync_pool $TESTPOOL + +# Start a replacing vdev, but suspend the resilver +log_must set_tunable32 SCAN_SUSPEND_PROGRESS 1 +log_must 
zpool replace -f $TESTPOOL /dev/${sd}4 $TEST_BASE_DIR/dev-7 + +# Remove 4 disks all at once +log_must eval "echo offline > /sys/block/${sd}/device/state" + +log_must set_tunable32 SCAN_SUSPEND_PROGRESS 0 + +# Add some writes to drive the vdev probe errors +log_must dd if=/dev/urandom of=$MNTPOINT/writes bs=1M count=1 + +# Wait until sync starts, and the pool suspends +log_note "waiting for pool to suspend" +typeset -i tries=30 +until [[ $(cat /proc/spl/kstat/zfs/$TESTPOOL/state) == "SUSPENDED" ]] ; do + if ((tries-- == 0)); then + zpool status -s + log_fail "UNEXPECTED -- pool did not suspend" + fi + sleep 1 +done +log_note $(cat /proc/spl/kstat/zfs/$TESTPOOL/state) + +# Put the missing disks back into service +log_must eval "echo running > /sys/block/$sd/device/state" + +# Clear the vdev error states, which will reopen the vdevs and resume the pool +log_must zpool clear $TESTPOOL + +# Wait until the pool resumes +log_note "waiting for pool to resume" +tries=30 +until [[ $(cat /proc/spl/kstat/zfs/$TESTPOOL/state) != "SUSPENDED" ]] ; do + if ((tries-- == 0)); then + log_fail "pool did not resume" + fi + sleep 1 +done +log_must zpool wait -t resilver $TESTPOOL +sync_pool $TESTPOOL + +# Make sure a pool scrub comes back clean +log_must zpool scrub -w $TESTPOOL +log_must zpool status -v $TESTPOOL +log_must check_pool_status $TESTPOOL "errors" "No known data errors" + +log_pass "VDEV probe errors for more disks than parity should suspend a pool" From dc0324bfa90011a5edf65bc0947aaf51ac9a8b61 Mon Sep 17 00:00:00 2001 From: Richard Kojedzinszky Date: Sat, 4 Jan 2025 19:33:27 +0100 Subject: [PATCH 40/43] fix: make zfs_strerror really thread-safe and portable #15793 wanted to make zfs_strerror threadsafe, unfortunately, it turned out that strerror_l() usage was wrong, and also, some libc implementations dont have strerror_l(). zfs_strerror() now simply calls original strerror() and copies the result to a thread-local buffer, then returns that. 
Reviewed-by: Brian Behlendorf Reviewed-by: Alexander Motin Signed-off-by: Richard Kojedzinszky Closes #15793 Closes #16640 Closes #16923 --- config/user.m4 | 2 +- include/libzutil.h | 15 +++++++++------ 2 files changed, 10 insertions(+), 7 deletions(-) diff --git a/config/user.m4 b/config/user.m4 index 4e31745a2abcd..badd920d2b8a1 100644 --- a/config/user.m4 +++ b/config/user.m4 @@ -33,7 +33,7 @@ AC_DEFUN([ZFS_AC_CONFIG_USER], [ ZFS_AC_CONFIG_USER_MAKEDEV_IN_MKDEV ZFS_AC_CONFIG_USER_ZFSEXEC - AC_CHECK_FUNCS([execvpe issetugid mlockall strerror_l strlcat strlcpy gettid]) + AC_CHECK_FUNCS([execvpe issetugid mlockall strlcat strlcpy gettid]) AC_SUBST(RM) ]) diff --git a/include/libzutil.h b/include/libzutil.h index f8712340cc5e7..bcfe2fcf7960f 100644 --- a/include/libzutil.h +++ b/include/libzutil.h @@ -27,7 +27,7 @@ #define _LIBZUTIL_H extern __attribute__((visibility("default"))) #include -#include +#include #include #include @@ -276,11 +276,14 @@ _LIBZUTIL_H void update_vdev_config_dev_sysfs_path(nvlist_t *nv, * Thread-safe strerror() for use in ZFS libraries */ static inline char *zfs_strerror(int errnum) { -#ifdef HAVE_STRERROR_L - return (strerror_l(errnum, uselocale(0))); -#else - return (strerror(errnum)); -#endif + static __thread char errbuf[512]; + static pthread_mutex_t zfs_strerror_lock = PTHREAD_MUTEX_INITIALIZER; + + (void) pthread_mutex_lock(&zfs_strerror_lock); + (void) strlcpy(errbuf, strerror(errnum), sizeof (errbuf)); + (void) pthread_mutex_unlock(&zfs_strerror_lock); + + return (errbuf); } #ifdef __cplusplus From 3a445f2ef5eff3bf33696282f64a40942d37a031 Mon Sep 17 00:00:00 2001 From: Robert Evans Date: Sun, 5 Jan 2025 20:25:22 -0500 Subject: [PATCH 41/43] Remove duplicate dedup_legacy_create in common.run Reviewed-by: Brian Behlendorf Reviewed-by: George Melikov Signed-off-by: Robert Evans Closes #16926 --- tests/runfiles/common.run | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/runfiles/common.run 
b/tests/runfiles/common.run index 6abb3b4213bb3..8a4a4b0f5cb81 100644 --- a/tests/runfiles/common.run +++ b/tests/runfiles/common.run @@ -677,8 +677,8 @@ post = tags = ['functional', 'deadman'] [tests/functional/dedup] -tests = ['dedup_legacy_create', 'dedup_fdt_create', 'dedup_fdt_import', - 'dedup_legacy_create', 'dedup_legacy_import', 'dedup_legacy_fdt_upgrade', +tests = ['dedup_fdt_create', 'dedup_fdt_import', 'dedup_legacy_create', + 'dedup_legacy_import', 'dedup_legacy_fdt_upgrade', 'dedup_legacy_fdt_mixed', 'dedup_quota'] pre = post = From 18c67d2418c571a0a85592401db793b5e6a9343a Mon Sep 17 00:00:00 2001 From: n0-1 Date: Mon, 6 Jan 2025 02:27:19 +0100 Subject: [PATCH 42/43] Support for cross-compiling kernel modules In order to correctly cross-compile, one has to pass ARCH and CROSS_COMPILE make flags to kernel module build calls. Facilitate this in the same way as for custom CC flag by recognizing KERNEL_-prefixed configure environment variables of same name. Reviewed-by: Brian Behlendorf Signed-off-by: Phil Sutter Closes #16924 --- config/kernel.m4 | 5 +++++ config/zfs-build.m4 | 2 ++ module/Makefile.in | 2 ++ rpm/generic/zfs-kmod.spec.in | 4 +++- rpm/redhat/zfs-kmod.spec.in | 4 +++- 5 files changed, 15 insertions(+), 2 deletions(-) diff --git a/config/kernel.m4 b/config/kernel.m4 index ae66633907bf7..9928ead1b6cea 100644 --- a/config/kernel.m4 +++ b/config/kernel.m4 @@ -681,11 +681,16 @@ AC_DEFUN([ZFS_LINUX_COMPILE], [ building kernel modules]) AC_ARG_VAR([KERNEL_LLVM], [Binary option to build kernel modules with LLVM/CLANG toolchain]) + AC_ARG_VAR([KERNEL_CROSS_COMPILE], [Cross compile prefix + for kernel module builds]) + AC_ARG_VAR([KERNEL_ARCH], [Architecture to build kernel modules for]) AC_TRY_COMMAND([ KBUILD_MODPOST_NOFINAL="$5" KBUILD_MODPOST_WARN="$6" make modules -k -j$TEST_JOBS ${KERNEL_CC:+CC=$KERNEL_CC} ${KERNEL_LD:+LD=$KERNEL_LD} ${KERNEL_LLVM:+LLVM=$KERNEL_LLVM} CONFIG_MODULES=y CFLAGS_MODULE=-DCONFIG_MODULES + 
${KERNEL_CROSS_COMPILE:+CROSS_COMPILE=$KERNEL_CROSS_COMPILE} + ${KERNEL_ARCH:+ARCH=$KERNEL_ARCH} -C $LINUX_OBJ $ARCH_UM M=$PWD/$1 >$1/build.log 2>&1]) AS_IF([AC_TRY_COMMAND([$2])], [$3], [$4]) ]) diff --git a/config/zfs-build.m4 b/config/zfs-build.m4 index c44a893bbb8c3..55fc029f08471 100644 --- a/config/zfs-build.m4 +++ b/config/zfs-build.m4 @@ -393,6 +393,8 @@ AC_DEFUN([ZFS_AC_RPM], [ RPM_DEFINE_KMOD=${RPM_DEFINE_KMOD}' --define "kernel_cc KERNEL_CC=$(KERNEL_CC)"' RPM_DEFINE_KMOD=${RPM_DEFINE_KMOD}' --define "kernel_ld KERNEL_LD=$(KERNEL_LD)"' RPM_DEFINE_KMOD=${RPM_DEFINE_KMOD}' --define "kernel_llvm KERNEL_LLVM=$(KERNEL_LLVM)"' + RPM_DEFINE_KMOD=${RPM_DEFINE_KMOD}' --define "kernel_cross_compile KERNEL_CROSS_COMPILE=$(KERNEL_CROSS_COMPILE)"' + RPM_DEFINE_KMOD=${RPM_DEFINE_KMOD}' --define "kernel_arch KERNEL_ARCH=$(KERNEL_ARCH)"' ]) RPM_DEFINE_DKMS='' diff --git a/module/Makefile.in b/module/Makefile.in index 9b34b3dfaec76..529ab81dcec59 100644 --- a/module/Makefile.in +++ b/module/Makefile.in @@ -55,6 +55,8 @@ modules-Linux: mkdir -p $(sort $(dir $(zfs-objs) $(zfs-))) $(MAKE) -C @LINUX_OBJ@ $(if @KERNEL_CC@,CC=@KERNEL_CC@) \ $(if @KERNEL_LD@,LD=@KERNEL_LD@) $(if @KERNEL_LLVM@,LLVM=@KERNEL_LLVM@) \ + $(if @KERNEL_CROSS_COMPILE@,CROSS_COMPILE=@KERNEL_CROSS_COMPILE@) \ + $(if @KERNEL_ARCH@,ARCH=@KERNEL_ARCH@) \ M="$$PWD" @KERNEL_MAKE@ CONFIG_ZFS=m modules modules-FreeBSD: diff --git a/rpm/generic/zfs-kmod.spec.in b/rpm/generic/zfs-kmod.spec.in index 30524474d1ace..7ed828bd0c9c4 100644 --- a/rpm/generic/zfs-kmod.spec.in +++ b/rpm/generic/zfs-kmod.spec.in @@ -144,7 +144,9 @@ for kernel_version in %{?kernel_versions}; do %{debuginfo} \ %{?kernel_cc} \ %{?kernel_ld} \ - %{?kernel_llvm} + %{?kernel_llvm} \ + %{?kernel_cross_compile} \ + %{?kernel_arch} # Pre-6.10 kernel builds didn't need to copy over the source files to the # build directory. 
However we do need to do it though post-6.10 due to diff --git a/rpm/redhat/zfs-kmod.spec.in b/rpm/redhat/zfs-kmod.spec.in index 876c198c64de0..a95bdf20f873e 100644 --- a/rpm/redhat/zfs-kmod.spec.in +++ b/rpm/redhat/zfs-kmod.spec.in @@ -69,7 +69,9 @@ fi %{debuginfo} \ %{?kernel_cc} \ %{?kernel_ld} \ - %{?kernel_llvm} + %{?kernel_llvm} \ + %{?kernel_cross_compile} \ + %{?kernel_arch} make %{?_smp_mflags} # Module signing (modsign) From b8e09c7007a0733a497aebe734cb8ed23a0415ae Mon Sep 17 00:00:00 2001 From: Rob Norris Date: Wed, 8 Jan 2025 10:43:01 +1100 Subject: [PATCH 43/43] ZTS: remove empty zpool_add--allow-ashift-mismatch test Added in b1e46f869, but empty, so no point keeping it around. Sponsored-by: https://despairlabs.com/sponsor/ Reviewed-by: Brian Behlendorf Reviewed-by: George Melikov Signed-off-by: Rob Norris Closes #16931 --- tests/runfiles/common.run | 3 +-- tests/zfs-tests/tests/Makefile.am | 1 - .../cli_root/zpool_add/zpool_add--allow-ashift-mismatch.ksh | 0 3 files changed, 1 insertion(+), 3 deletions(-) delete mode 100755 tests/zfs-tests/tests/functional/cli_root/zpool_add/zpool_add--allow-ashift-mismatch.ksh diff --git a/tests/runfiles/common.run b/tests/runfiles/common.run index 8a4a4b0f5cb81..3ec052f56b406 100644 --- a/tests/runfiles/common.run +++ b/tests/runfiles/common.run @@ -387,8 +387,7 @@ tags = ['functional', 'cli_root', 'zpool'] tests = ['zpool_add_001_pos', 'zpool_add_002_pos', 'zpool_add_003_pos', 'zpool_add_004_pos', 'zpool_add_006_pos', 'zpool_add_007_neg', 'zpool_add_008_neg', 'zpool_add_009_neg', 'zpool_add_010_pos', - 'add-o_ashift', 'add_prop_ashift', 'zpool_add_dryrun_output', - 'zpool_add--allow-ashift-mismatch'] + 'add-o_ashift', 'add_prop_ashift', 'zpool_add_dryrun_output'] tags = ['functional', 'cli_root', 'zpool_add'] [tests/functional/cli_root/zpool_attach] diff --git a/tests/zfs-tests/tests/Makefile.am b/tests/zfs-tests/tests/Makefile.am index df183825dc680..d62613d300359 100644 --- a/tests/zfs-tests/tests/Makefile.am +++ 
b/tests/zfs-tests/tests/Makefile.am @@ -998,7 +998,6 @@ nobase_dist_datadir_zfs_tests_tests_SCRIPTS += \ functional/cli_root/zpool_add/add_prop_ashift.ksh \ functional/cli_root/zpool_add/cleanup.ksh \ functional/cli_root/zpool_add/setup.ksh \ - functional/cli_root/zpool_add/zpool_add--allow-ashift-mismatch.ksh \ functional/cli_root/zpool_add/zpool_add_001_pos.ksh \ functional/cli_root/zpool_add/zpool_add_002_pos.ksh \ functional/cli_root/zpool_add/zpool_add_003_pos.ksh \ diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_add/zpool_add--allow-ashift-mismatch.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_add/zpool_add--allow-ashift-mismatch.ksh deleted file mode 100755 index e69de29bb2d1d..0000000000000